diff --git a/CHANGELOG.md b/CHANGELOG.md index 6dbf8dc96..4cfb08a92 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,8 @@ _When adding new entries to the changelog, please include issue/PR numbers where * Patches that create or delete datasets are now supported in Datasets V2 [#239](https://github.com/koordinates/sno/issues/239) * `apply`, `commit` and `merge` commands now optimise repositories after committing, to avoid poor repo performance. [#250](https://github.com/koordinates/sno/issues/250) * `commit` now checks that the diff to be committed matches the schema, and rejects diffs that do not - this is possible in working copy formats that have relatively lax type enforcement, ie GPKG [#300](https://github.com/koordinates/sno/pull/300) + * Added GPKG support for sno types that GPKG doesn't support - they are approximated as strings. [#304](https://github.com/koordinates/sno/pull/304) + * `schema.json` no longer stores attributes that are null - a missing attribute has the same meaning as that attribute being present and null. [#304](https://github.com/koordinates/sno/pull/304) * `data ls` now accepts an optional ref argument * `meta get` now accepts a `--ref=REF` option * `clone` now accepts a `--branch` option to clone a specific branch. diff --git a/sno/dataset2.py b/sno/dataset2.py index eb0fb01e3..5e568cb59 100644 --- a/sno/dataset2.py +++ b/sno/dataset2.py @@ -97,6 +97,10 @@ def get_meta_item(self, name): if rel_path.startswith(self.LEGEND_PATH): return data + if rel_path.endswith("schema.json"): + # Unfortunately, some schemas might be stored slightly differently to others - + # - eg with or without null attributes. This normalises them. 
+ return Schema.normalise_column_dicts(json_unpack(data)) if rel_path.endswith(".json"): return json_unpack(data) elif rel_path.endswith(".wkt"): diff --git a/sno/gpkg_adapter.py b/sno/gpkg_adapter.py index 3d4d75102..cdfb5d9ca 100644 --- a/sno/gpkg_adapter.py +++ b/sno/gpkg_adapter.py @@ -296,25 +296,7 @@ def _column_schema_to_gpkg(cid, column_schema, is_spatial): # TEXT{(max_len)}, BLOB{(max_len)}, DATE, DATETIME, -_GPKG_TYPE_TO_V2_TYPE = { - "BOOLEAN": "boolean", - "TINYINT": ("integer", {"size": 8}), - "SMALLINT": ("integer", {"size": 16}), - "MEDIUMINT": ("integer", {"size": 32}), - "INT": ("integer", {"size": 64}), - "INTEGER": ("integer", {"size": 64}), - "FLOAT": ("float", {"size": 32}), - "DOUBLE": ("float", {"size": 64}), - "REAL": ("float", {"size": 64}), - "TEXT": "text", - "BLOB": "blob", - "DATE": "date", - "DATETIME": "timestamp", - # GEOMETRY types handled differently -} - - -_V2_TYPE_TO_GPKG_TYPE = { +V2_TYPE_TO_GPKG_TYPE = { "boolean": "BOOLEAN", "integer": { 0: "INTEGER", @@ -328,22 +310,49 @@ def _column_schema_to_gpkg(cid, column_schema, is_spatial): "blob": "BLOB", "date": "DATE", "timestamp": "DATETIME", - # geometry types handled differently + "time": "TEXT", # Approximated + "numeric": "TEXT", # Approximated + "interval": "TEXT", # Approximated + "geometry": "GEOMETRY", +} + + +GPKG_TYPE_TO_V2_TYPE = { + "BOOLEAN": "boolean", + "TINYINT": ("integer", 8), + "SMALLINT": ("integer", 16), + "MEDIUMINT": ("integer", 32), + "INT": ("integer", 64), + "INTEGER": ("integer", 64), + "FLOAT": ("float", 32), + "DOUBLE": ("float", 64), + "REAL": ("float", 64), + "TEXT": "text", + "BLOB": "blob", + "DATE": "date", + "DATETIME": "timestamp", + "GEOMETRY": "geometry", } +# Types that can't be roundtrip perfectly in GPKG, and what they end up as. 
+APPROXIMATED_TYPES = {"interval": "text", "time": "text", "numeric": "text"} + + def gpkg_type_to_v2_type(gkpg_type): """Convert a gpkg type to v2 schema type.""" m = re.match(r"^(TEXT|BLOB)\(([0-9]+)\)$", gkpg_type) if m: return m.group(1).lower(), {"length": int(m.group(2))} - v2_type_info = _GPKG_TYPE_TO_V2_TYPE.get(gkpg_type) + v2_type_info = GPKG_TYPE_TO_V2_TYPE.get(gkpg_type) if v2_type_info is None: raise ValueError(f"Unrecognised GPKG type: {gkpg_type}") elif isinstance(v2_type_info, tuple): - v2_type, extra_type_info = v2_type_info + v2_type, size = v2_type_info + extra_type_info = {"size": size} else: - v2_type, extra_type_info = v2_type_info, {} + v2_type = v2_type_info + extra_type_info = {} return v2_type, extra_type_info @@ -379,7 +388,7 @@ def v2_type_to_gpkg_type(column_schema, is_spatial): if column_schema.data_type == "geometry": return extra_type_info["geometryType"].split(" ", 1)[0] - gpkg_type_info = _V2_TYPE_TO_GPKG_TYPE.get(v2_type) + gpkg_type_info = V2_TYPE_TO_GPKG_TYPE.get(v2_type) if gpkg_type_info is None: raise ValueError(f"Unrecognised data type: {v2_type}") diff --git a/sno/ogr_import_source.py b/sno/ogr_import_source.py index e9e2011e9..9282695b4 100644 --- a/sno/ogr_import_source.py +++ b/sno/ogr_import_source.py @@ -13,6 +13,7 @@ from .exceptions import ( InvalidOperation, NotFound, + NotYetImplemented, NO_IMPORT_SOURCE, NO_TABLE, ) @@ -43,38 +44,21 @@ class OgrImportSource(ImportSource): Imports from an OGR source, currently from a whitelist of formats. """ - OGR_TYPE_TO_SQLITE_TYPE = { - # NOTE: we don't handle OGR's *List (array) fields at all. - # If you write them to GPKG using OGR, you end up with TEXT. - # We also don't handle ogr's "Time" fields, because they end up as TEXT in GPKG, - # which we can't roundtrip. Tackle when we get someone actually using those types... 
- "Integer": "MEDIUMINT", - "Integer64": "INTEGER", - "Real": "REAL", - "String": "TEXT", - "Binary": "BLOB", - "Date": "DATE", - "DateTime": "DATETIME", - } - OGR_SUBTYPE_TO_SQLITE_TYPE = { - ogr.OFSTBoolean: "BOOLEAN", - ogr.OFSTInt16: "SMALLINT", - ogr.OFSTFloat32: "FLOAT", - } - + # NOTE: We don't support *List fields (eg IntegerList). OGR_TYPE_TO_V2_SCHEMA_TYPE = { - "Integer": ("integer", {"size": 32}), - "Integer64": ("integer", {"size": 64}), - "Real": ("float", {"size": 64}), - "String": ("text", {}), - "Binary": ("blob", {}), - "Date": ("date", {}), - "DateTime": ("timestamp", {}), + "Integer": ("integer", 32), + "Integer64": ("integer", 64), + "Real": ("float", 64), + "String": "text", + "Binary": "blob", + "Date": "date", + "DateTime": "timestamp", + "Time": "time", } OGR_SUBTYPE_TO_V2_SCHEMA_TYPE = { - ogr.OFSTBoolean: ("boolean", {}), - ogr.OFSTInt16: ("integer", {"size": 16}), - ogr.OFSTFloat32: ("float", {"size": 32}), + ogr.OFSTBoolean: "boolean", + ogr.OFSTInt16: ("integer", 16), + ogr.OFSTFloat32: ("float", 32), } @classmethod @@ -490,11 +474,24 @@ def _field_to_v2_column_schema(self, fd): ogr_type = fd.GetTypeName() ogr_subtype = fd.GetSubType() if ogr_subtype == ogr.OFSTNone: - data_type, extra_type_info = self.OGR_TYPE_TO_V2_SCHEMA_TYPE[ogr_type] + data_type_info = self.OGR_TYPE_TO_V2_SCHEMA_TYPE.get(ogr_type) + if data_type_info is None: + raise NotYetImplemented( + f"Unsupported column type for import: OGR type={ogr_type}" + ) else: - data_type, extra_type_info = self.OGR_SUBTYPE_TO_V2_SCHEMA_TYPE[ogr_subtype] + data_type_info = self.OGR_SUBTYPE_TO_V2_SCHEMA_TYPE.get(ogr_subtype) + if data_type_info is None: + raise NotYetImplemented( + f"Unsupported column type for import: OGR subtype={ogr_subtype}" + ) - extra_type_info = extra_type_info.copy() + if isinstance(data_type_info, tuple): + data_type, size = data_type_info + extra_type_info = {"size": size} + elif isinstance(data_type_info, str): + data_type = data_type_info + extra_type_info = 
{} if data_type in ("TEXT", "BLOB"): width = fd.GetWidth() diff --git a/sno/schema.py b/sno/schema.py index e449b9452..ba07556e8 100644 --- a/sno/schema.py +++ b/sno/schema.py @@ -152,22 +152,21 @@ def __new__(cls, id, name, data_type, pk_index, **extra_type_info): @classmethod def from_dict(cls, d): d = d.copy() - return cls( - d.pop("id"), - d.pop("name"), - d.pop("dataType"), - d.pop("primaryKeyIndex"), - **d, - ) + id_ = d.pop("id") + name = d.pop("name") + data_type = d.pop("dataType") + pk_index = d.pop("primaryKeyIndex", None) + extra_type_info = dict((k, v) for k, v in d.items() if v is not None) + return cls(id_, name, data_type, pk_index, **extra_type_info) def to_dict(self): - return { - "id": self.id, - "name": self.name, - "dataType": self.data_type, - "primaryKeyIndex": self.pk_index, - **self.extra_type_info, - } + result = {"id": self.id, "name": self.name, "dataType": self.data_type} + if self.pk_index is not None: + result["primaryKeyIndex"] = self.pk_index + for key, value in self.extra_type_info.items(): + if value is not None: + result[key] = value + return result def __eq__(self, other): if not isinstance(other, ColumnSchema): @@ -240,6 +239,10 @@ def from_column_dicts(cls, column_dicts): columns = [ColumnSchema.from_dict(d) for d in column_dicts] return cls(columns) + @classmethod + def normalise_column_dicts(cls, column_dicts): + return Schema.from_column_dicts(column_dicts).to_column_dicts() + @classmethod def loads(cls, data): """Load a schema from a bytestring""" @@ -346,12 +349,22 @@ def sanitise_pks(self, pk_values): pk_values[i] = int(pk_values[i]) return tuple(pk_values) - def align_to_self(self, new_schema): + def align_to_self(self, new_schema, approximated_types=None): """ Returns a new schema the same as the one given, except that some column IDs might be changed to match those in self. Column IDs are copied from self onto the resulting schema if the columns in the resulting schema are the "same" as columns in the previous schema. 
Uses a heuristic - columns are the same if they have the same name + type (handles reordering), or, if they have the same position and type (handles renames). + + approximated_types - if the new_schema has been roundtripped through a database that doesn't support all sno + types, then some of them will have had to be approximated. Aligning the schema also restores them to their + original type if they had been approximated as close as possible. approximated_types should be a dict + mapping type to approximated type, sometimes with an extra sub-dict for size subtypes eg: + { + "timestamp": "text", # timestamp is approximated as text + "integer": {8: ("integer", 32), 16: ("integer", 32)} # int8, int16 are approximated as int32 + } + """ # TODO - could prompt the user to help with more complex schema changes. old_cols = self.to_column_dicts() @@ -360,8 +373,8 @@ def align_to_self(self, new_schema): return Schema.from_column_dicts(new_cols) @classmethod - def align_schema_cols(cls, old_cols, new_cols): - """Same as align_to_self, but works on lists of column dicts, instead of Schema objects.""" + def align_schema_cols(cls, old_cols, new_cols, approximated_types=None): + """Same as align_to_self, but mutates new_cols list, instead of returning a new Schema object.""" for old_col in old_cols: old_col["done"] = False for new_col in new_cols: @@ -370,11 +383,13 @@ def align_schema_cols(cls, old_cols, new_cols): # Align columns by name + type - handles reordering. old_cols_by_name = {c["name"]: c for c in old_cols} for new_col in new_cols: - cls._try_align(old_cols_by_name.get(new_col["name"]), new_col) + cls._try_align( + old_cols_by_name.get(new_col["name"]), new_col, approximated_types + ) # Align columns by position + type - handles renames. 
for old_col, new_col in zip(old_cols, new_cols): - cls._try_align(old_col, new_col) + cls._try_align(old_col, new_col, approximated_types) for old_col in old_cols: del old_col["done"] @@ -383,25 +398,37 @@ def align_schema_cols(cls, old_cols, new_cols): return new_cols @classmethod - def _try_align(cls, old_col, new_col): + def _try_align(cls, old_col, new_col, approximated_types=None): if old_col is None or new_col is None: return False if old_col["done"] or new_col["done"]: return False - if old_col["dataType"] != new_col["dataType"]: - return False - if old_col["primaryKeyIndex"] != new_col["primaryKeyIndex"]: + if old_col.get("primaryKeyIndex") != new_col.get("primaryKeyIndex"): return False + + old_type, new_type = old_col["dataType"], new_col["dataType"] + if old_type != new_type: + if approximated_types and approximated_types[old_type] == new_type: + # new_col's type was the best approximation we could manage of old_col's type. + new_col["dataType"] = old_col["dataType"] + else: + return False + + # The two columns are similar enough that we can align their IDs. + new_col["id"] = old_col["id"] - # We can't roundtrip size properly for GPKG primary keys since they have to be of type INTEGER. - if ( - old_col["primaryKeyIndex"] is not None - and "size" in old_col - and "size" in new_col - and old_col["size"] != new_col["size"] - ): - new_col["size"] = old_col["size"] + old_size, new_size = old_col.get("size"), new_col.get("size") + if old_size != new_size: + data_type = old_type + approx_info = approximated_types and approximated_types.get(data_type) + + if old_col.get("primaryKeyIndex") is not None and new_size == 64: + # We can't roundtrip size properly for GPKG primary keys since they have to be of type INTEGER. + new_col["size"] = old_col.get("size") + elif approx_info and approx_info.get(old_size) == (data_type, new_size): + # new_col's size was the best approximation we could manage of old_col's size. 
+ new_col["size"] = old_col.get("size") old_col["done"] = True new_col["done"] = True @@ -453,18 +480,20 @@ def diff_types(self, new_schema): def diff_type_counts(self, new_schema): return {k: len(v) for k, v in self.diff_types(new_schema).items()} - EQUIVALENT_PYTHON_TYPES = { + # These are the types that are stored in datasets. + # Different types might be returned from the working copy DB driver, in which case, they must be adapted. + EQUIVALENT_MSGPACK_TYPES = { "boolean": (bool,), "blob": (bytes,), - "date": (str,), + "date": (str,), # might be datetime.date from DB "float": (float, int), "geometry": (Geometry,), "integer": (int,), - "interval": (str,), - "numeric": (str,), + "interval": (str,), # might be datetime.timedelta from DB + "numeric": (str,), # might be decimal.Decimal from DB "text": (str,), - "time": (str,), - "timestamp": (str,), + "time": (str,), # might be datetime.time from DB + "timestamp": (str,), # might be datetime.datetime from DB } def validate_feature(self, feature, col_violations=None): @@ -504,7 +533,7 @@ def _find_column_violation(self, col, feature): return None col_type = col.data_type - if type(value) not in self.EQUIVALENT_PYTHON_TYPES[col_type]: + if type(value) not in self.EQUIVALENT_MSGPACK_TYPES[col_type]: return f"In column '{col.name}' value {repr(value)} doesn't match schema type {col_type}" elif col_type == "integer" and "size" in col.extra_type_info: size = col.extra_type_info["size"] diff --git a/sno/working_copy/base.py b/sno/working_copy/base.py index f9280b2e6..3247f31af 100644 --- a/sno/working_copy/base.py +++ b/sno/working_copy/base.py @@ -253,6 +253,9 @@ def diff_db_to_tree_meta(self, dataset, raise_if_dirty=False): raise WorkingCopyDirty() return result + # Subclasses should override if there are certain types they cannot represent perfectly. 
+ _APPROXIMATED_TYPES = None + def _remove_hidden_meta_diffs(self, ds_meta_items, wc_meta_items): """ Remove any meta diffs that can't or shouldn't be committed, and so shouldn't be shown to the user. @@ -261,9 +264,9 @@ def _remove_hidden_meta_diffs(self, ds_meta_items, wc_meta_items): can't store the dataset description, then that should be removed from the diff. """ if "schema.json" in ds_meta_items and "schema.json" in wc_meta_items: - Schema.align_schema_cols( - ds_meta_items["schema.json"], wc_meta_items["schema.json"] - ) + ds_schema = ds_meta_items["schema.json"] + wc_schema = wc_meta_items["schema.json"] + Schema.align_schema_cols(ds_schema, wc_schema, self._APPROXIMATED_TYPES) def meta_items(self, dataset): """Returns all the meta items for the given dataset from the working copy DB.""" diff --git a/sno/working_copy/gpkg.py b/sno/working_copy/gpkg.py index 6be81291f..2b8fbdb49 100644 --- a/sno/working_copy/gpkg.py +++ b/sno/working_copy/gpkg.py @@ -338,6 +338,9 @@ def meta_items(self, dataset): yield from gpkg_adapter.all_v2_meta_items(gpkg_meta_items_obj, id_salt=id_salt) + # Some types are approximated as text in GPKG - see super()._remove_hidden_meta_diffs + _APPROXIMATED_TYPES = gpkg_adapter.APPROXIMATED_TYPES + def delete_meta(self, dataset): table_name = dataset.table_name with self.session() as db: diff --git a/sno/working_copy/postgis.py b/sno/working_copy/postgis.py index 428a806f8..6c8178db4 100644 --- a/sno/working_copy/postgis.py +++ b/sno/working_copy/postgis.py @@ -46,10 +46,14 @@ def adapt_geometry_from_db(g, dbcur): def adapt_timestamp_from_db(t, dbcur): # TODO - revisit timezones. - if isinstance(t, str): - # This makes postgis timestamps behave more like GPKG ones. - return t.replace(" ", "T") + "Z" - return t + if t is None: + return t + # This makes postgis timestamps behave more like GPKG ones. 
+ return str(t).replace(" ", "T") + "Z" + + +def adapt_to_string(v, dbcur): + return str(v) if v is not None else None # See https://github.com/psycopg/psycopg2/blob/master/psycopg/typecast_builtins.c @@ -57,6 +61,23 @@ def adapt_timestamp_from_db(t, dbcur): TIMESTAMP = new_type((TIMESTAMP_OID,), "TIMESTAMP", adapt_timestamp_from_db) psycopg2.extensions.register_type(TIMESTAMP) +# We mostly want data out of the DB as strings, just as happens in GPKG. +# Then we can serialise it using MessagePack. +# TODO - maybe move adapt-to-string logic to MessagePack? +ADAPT_TO_STR_TYPES = { + 1082: "DATE", + 1083: "TIME", + 1266: "TIME", + 1184: "TIMESTAMPTZ", + 704: "INTERVAL", + 1186: "INTERVAL", + 1700: "DECIMAL", +} + +for oid in ADAPT_TO_STR_TYPES: + t = new_type((oid,), ADAPT_TO_STR_TYPES[oid], adapt_to_string) + psycopg2.extensions.register_type(t) + class WorkingCopy_Postgis(WorkingCopy): def __init__(self, repo, uri): @@ -658,8 +679,12 @@ def meta_items(self, dataset): id_str = crs_util.get_identifier_str(wkt) yield f"crs/{id_str}.wkt", crs_util.normalise_wkt(wkt) + # Postgis has nowhere obvious to put this metadata. 
_UNSUPPORTED_META_ITEMS = ("description", "metadata/dataset.json") + # Postgis approximates an int8 as an int16 - see super()._remove_hidden_meta_diffs + _APPROXIMATED_TYPES = postgis_adapter.APPROXIMATED_TYPES + def _remove_hidden_meta_diffs(self, ds_meta_items, wc_meta_items): super()._remove_hidden_meta_diffs(ds_meta_items, wc_meta_items) diff --git a/sno/working_copy/postgis_adapter.py b/sno/working_copy/postgis_adapter.py index b37c88449..eab703eb4 100644 --- a/sno/working_copy/postgis_adapter.py +++ b/sno/working_copy/postgis_adapter.py @@ -6,7 +6,7 @@ from sno.schema import Schema, ColumnSchema -_V2_TYPE_TO_PG_TYPE = { +V2_TYPE_TO_PG_TYPE = { "boolean": "boolean", "blob": "bytea", "date": "date", @@ -14,6 +14,7 @@ "geometry": "geometry", "integer": { 0: "integer", + 8: "smallint", # Approximated as smallint (int16) 16: "smallint", 32: "integer", 64: "bigint", @@ -27,7 +28,7 @@ # Code for preserving these flavours in datasets and working copies needs more work. } -_PG_TYPE_TO_V2_TYPE = { +PG_TYPE_TO_V2_TYPE = { "boolean": "boolean", "smallint": ("integer", 16), "integer": ("integer", 32), @@ -46,6 +47,9 @@ "varchar": "text", } +# Types that can't be roundtrip perfectly in Postgis, and what they end up as. 
+APPROXIMATED_TYPES = {"integer": {8: ("integer", 16)}} + def v2_schema_to_postgis_spec(schema, v2_obj): """ @@ -70,7 +74,7 @@ def v2_type_to_pg_type(column_schema, v2_obj): v2_type = column_schema.data_type extra_type_info = column_schema.extra_type_info - pg_type_info = _V2_TYPE_TO_PG_TYPE.get(v2_type) + pg_type_info = V2_TYPE_TO_PG_TYPE.get(v2_type) if pg_type_info is None: raise ValueError(f"Unrecognised data type: {v2_type}") @@ -147,9 +151,9 @@ def _postgis_to_column_schema(pg_col_info, pg_spatial_ref_sys, id_salt): def _pg_type_to_v2_type(pg_col_info, pg_spatial_ref_sys): - v2_type_info = _PG_TYPE_TO_V2_TYPE.get(pg_col_info["data_type"]) + v2_type_info = PG_TYPE_TO_V2_TYPE.get(pg_col_info["data_type"]) if v2_type_info is None: - v2_type_info = _PG_TYPE_TO_V2_TYPE.get(pg_col_info["udt_name"]) + v2_type_info = PG_TYPE_TO_V2_TYPE.get(pg_col_info["udt_name"]) if isinstance(v2_type_info, tuple): v2_type = v2_type_info[0] diff --git a/tests/data/gpkg-types.tgz b/tests/data/gpkg-types.tgz new file mode 100644 index 000000000..b75992c61 Binary files /dev/null and b/tests/data/gpkg-types.tgz differ diff --git a/tests/data/patches/polygons.snopatch b/tests/data/patches/polygons.snopatch index 3cb34d474..ccb1e3a1b 100644 --- a/tests/data/patches/polygons.snopatch +++ b/tests/data/patches/polygons.snopatch @@ -28,28 +28,24 @@ "id": "c1d4dea1-c0ad-0255-7857-b5695e3ba2e9", "name": "geom", "dataType": "geometry", - "primaryKeyIndex": null, "geometryType": "MULTIPOLYGON", "geometryCRS": "EPSG:4167" }, { "id": "d3d4b64b-d48e-4069-4bb5-dfa943d91e6b", "name": "date_adjusted", - "dataType": "timestamp", - "primaryKeyIndex": null + "dataType": "timestamp" }, { "id": "dff34196-229d-f0b5-7fd4-b14ecf835b2c", "name": "survey_reference", "dataType": "text", - "primaryKeyIndex": null, "length": 50 }, { "id": "13dc4918-974e-978f-05ce-3b4321077c50", "name": "adjusted_nodes", "dataType": "integer", - "primaryKeyIndex": null, "size": 32 } ] diff --git a/tests/data/types.tgz 
b/tests/data/types.tgz deleted file mode 100644 index ef99b67e6..000000000 Binary files a/tests/data/types.tgz and /dev/null differ diff --git a/tests/data/types2.tgz b/tests/data/types2.tgz new file mode 100644 index 000000000..eda10b87c Binary files /dev/null and b/tests/data/types2.tgz differ diff --git a/tests/test_gpkg_adapter.py b/tests/test_gpkg_adapter.py index f4693bb5f..84785723e 100644 --- a/tests/test_gpkg_adapter.py +++ b/tests/test_gpkg_adapter.py @@ -15,27 +15,23 @@ "dataType": "geometry", "geometryType": "GEOMETRY", "geometryCRS": "EPSG:2193", - "primaryKeyIndex": None, }, { "id": "aa33d0d8-589d-4389-b2d7-b117358b4320", "name": "Ward", "dataType": "text", - "primaryKeyIndex": None, }, { "id": "9abe0a9f-6668-412a-83af-c3d6e44be647", "name": "Shape_Leng", "dataType": "float", "size": 64, - "primaryKeyIndex": None, }, { "id": "07f1f8b0-42f5-4805-b884-cb25d015a06f", "name": "Shape_Area", "dataType": "float", "size": 64, - "primaryKeyIndex": None, }, ] diff --git a/tests/test_init.py b/tests/test_init.py index 5ee8cd8c8..65ca9c8e3 100644 --- a/tests/test_init.py +++ b/tests/test_init.py @@ -1,6 +1,5 @@ import json import shutil -import subprocess import pytest import pygit2 @@ -412,7 +411,7 @@ def test_import_replace_existing_with_column_renames( def test_init_import_table_ogr_types( data_archive_readonly, tmp_path, cli_runner, repo_version ): - with data_archive_readonly("types") as data: + with data_archive_readonly("gpkg-types") as data: repo_path = tmp_path / "repo" r = cli_runner.invoke( [ diff --git a/tests/test_schema.py b/tests/test_schema.py index 56d0298b3..31d0c9dd4 100644 --- a/tests/test_schema.py +++ b/tests/test_schema.py @@ -275,7 +275,6 @@ def check_points_diff_output(r, output_format): ' "id": "f488ae9b-6e15-1fe3-0bda-e0d5d38ea69e",', ' "name": "geom",', ' "dataType": "geometry",', - ' "primaryKeyIndex": null,', ' "geometryType": "POINT",', ' "geometryCRS": "EPSG:4326"', " },", @@ -283,35 +282,30 @@ def check_points_diff_output(r, 
output_format): ' "id": "4a1c7a86-c425-ea77-7f1a-d74321a10edc",', ' "name": "t50_fid",', ' "dataType": "integer",', - ' "primaryKeyIndex": null,', ' "size": 32', " },", " {", ' "id": "d2a62351-a66d-bde2-ce3e-356fec9641e9",', ' "name": "name_ascii",', ' "dataType": "text",', - ' "primaryKeyIndex": null,', ' "length": 75', " },", " {", ' "id": "c3389414-a511-5385-7dcd-891c4ead1663",', ' "name": "macronated",', ' "dataType": "text",', - ' "primaryKeyIndex": null,', ' "length": 1', " },", " {", ' "id": "45b00eaa-5700-662d-8a21-9614e40c437b",', ' "name": "name",', ' "dataType": "text",', - ' "primaryKeyIndex": null,', ' "length": 75', " },", "+ {", '+ "id": "74853a9a-83b1-2b4d-5a70-fc678161d470",', '+ "name": "colour",', '+ "dataType": "text",', - '+ "primaryKeyIndex": null,', '+ "length": 32', "+ },", " ]", @@ -354,13 +348,11 @@ def check_points_diff_output(r, output_format): "geometryType": "POINT", "id": "f488ae9b-6e15-1fe3-0bda-e0d5d38ea69e", "name": "geom", - "primaryKeyIndex": None, }, { "dataType": "integer", "id": "4a1c7a86-c425-ea77-7f1a-d74321a10edc", "name": "t50_fid", - "primaryKeyIndex": None, "size": 32, }, { @@ -368,21 +360,18 @@ def check_points_diff_output(r, output_format): "id": "d2a62351-a66d-bde2-ce3e-356fec9641e9", "length": 75, "name": "name_ascii", - "primaryKeyIndex": None, }, { "dataType": "text", "id": "c3389414-a511-5385-7dcd-891c4ead1663", "length": 1, "name": "macronated", - "primaryKeyIndex": None, }, { "dataType": "text", "id": "45b00eaa-5700-662d-8a21-9614e40c437b", "length": 75, "name": "name", - "primaryKeyIndex": None, }, ], "+": [ @@ -399,13 +388,11 @@ def check_points_diff_output(r, output_format): "geometryType": "POINT", "id": "f488ae9b-6e15-1fe3-0bda-e0d5d38ea69e", "name": "geom", - "primaryKeyIndex": None, }, { "dataType": "integer", "id": "4a1c7a86-c425-ea77-7f1a-d74321a10edc", "name": "t50_fid", - "primaryKeyIndex": None, "size": 32, }, { @@ -413,28 +400,24 @@ def check_points_diff_output(r, output_format): "id": 
"d2a62351-a66d-bde2-ce3e-356fec9641e9", "length": 75, "name": "name_ascii", - "primaryKeyIndex": None, }, { "dataType": "text", "id": "c3389414-a511-5385-7dcd-891c4ead1663", "length": 1, "name": "macronated", - "primaryKeyIndex": None, }, { "dataType": "text", "id": "45b00eaa-5700-662d-8a21-9614e40c437b", "length": 75, "name": "name", - "primaryKeyIndex": None, }, { "dataType": "text", "id": "74853a9a-83b1-2b4d-5a70-fc678161d470", "length": 32, "name": "colour", - "primaryKeyIndex": None, }, ], } @@ -599,36 +582,31 @@ def check_polygons_diff_output(r, output_format): ' "id": "c1d4dea1-c0ad-0255-7857-b5695e3ba2e9",', ' "name": "geom",', ' "dataType": "geometry",', - ' "primaryKeyIndex": null,', ' "geometryType": "MULTIPOLYGON",', ' "geometryCRS": "EPSG:4167"', " },", " {", ' "id": "d3d4b64b-d48e-4069-4bb5-dfa943d91e6b",', ' "name": "date_adjusted",', - ' "dataType": "timestamp",', - ' "primaryKeyIndex": null', + ' "dataType": "timestamp"', " },", " {", ' "id": "dff34196-229d-f0b5-7fd4-b14ecf835b2c",', '- "name": "survey_reference",', '+ "name": "surv_ref",', ' "dataType": "text",', - ' "primaryKeyIndex": null,', ' "length": 50,', " },", " {", ' "id": "13dc4918-974e-978f-05ce-3b4321077c50",', ' "name": "adjusted_nodes",', ' "dataType": "integer",', - ' "primaryKeyIndex": null,', ' "size": 32', " },", "+ {", '+ "id": "4f3f4df1-c5a9-a768-f14d-9c3d001f5141",', '+ "name": "colour",', '+ "dataType": "text",', - '+ "primaryKeyIndex": null,', '+ "length": 32', "+ },", " ]", @@ -670,26 +648,22 @@ def check_polygons_diff_output(r, output_format): "geometryType": "MULTIPOLYGON", "id": "c1d4dea1-c0ad-0255-7857-b5695e3ba2e9", "name": "geom", - "primaryKeyIndex": None, }, { "dataType": "timestamp", "id": "d3d4b64b-d48e-4069-4bb5-dfa943d91e6b", "name": "date_adjusted", - "primaryKeyIndex": None, }, { "dataType": "text", "id": "dff34196-229d-f0b5-7fd4-b14ecf835b2c", "length": 50, "name": "survey_reference", - "primaryKeyIndex": None, }, { "dataType": "integer", "id": 
"13dc4918-974e-978f-05ce-3b4321077c50", "name": "adjusted_nodes", - "primaryKeyIndex": None, "size": 32, }, ], @@ -707,26 +681,22 @@ def check_polygons_diff_output(r, output_format): "geometryType": "MULTIPOLYGON", "id": "c1d4dea1-c0ad-0255-7857-b5695e3ba2e9", "name": "geom", - "primaryKeyIndex": None, }, { "dataType": "timestamp", "id": "d3d4b64b-d48e-4069-4bb5-dfa943d91e6b", "name": "date_adjusted", - "primaryKeyIndex": None, }, { "dataType": "text", "id": "dff34196-229d-f0b5-7fd4-b14ecf835b2c", "length": 50, "name": "surv_ref", - "primaryKeyIndex": None, }, { "dataType": "integer", "id": "13dc4918-974e-978f-05ce-3b4321077c50", "name": "adjusted_nodes", - "primaryKeyIndex": None, "size": 32, }, { @@ -734,7 +704,6 @@ def check_polygons_diff_output(r, output_format): "id": "4f3f4df1-c5a9-a768-f14d-9c3d001f5141", "length": 32, "name": "colour", - "primaryKeyIndex": None, }, ], } @@ -1030,7 +999,6 @@ def test_schema_diff_as_text(gen_uuid): '- "id": "0d167b8b-294f-c2be-4747-bc947672d3a0",', '- "name": "geom",', '- "dataType": "geometry",', - '- "primaryKeyIndex": null,', '- "geometryType": "MULTIPOLYGON",', '- "geometryCRS": "EPSG:2193"', "- },", @@ -1038,7 +1006,6 @@ def test_schema_diff_as_text(gen_uuid): ' "id": "0f28f35f-89d8-2b93-40d7-30abe42c69ea",', ' "name": "building_id",', ' "dataType": "integer",', - ' "primaryKeyIndex": null,', '- "size": 32,', '+ "size": 64,', " },", @@ -1046,59 +1013,50 @@ def test_schema_diff_as_text(gen_uuid): ' "id": "b5c69fa8-f48f-59bb-7aab-95225daf4774",', ' "name": "name",', ' "dataType": "text",', - ' "primaryKeyIndex": null,', '+ "size": 40,', " },", "+ {", '+ "id": "d087bf39-1c76-fdd9-1315-0e81c6bd360f",', '+ "name": "territorial_authority",', - '+ "dataType": "text",', - '+ "primaryKeyIndex": null', + '+ "dataType": "text"', "+ },", " {", ' "id": "9f1924ac-097a-fc0a-b168-a06e8db32af7",', ' "name": "use",', - ' "dataType": "text",', - ' "primaryKeyIndex": null', + ' "dataType": "text"', " },", "- {", '- "id": 
"1bcf7a4a-19e9-9752-6264-0fd1d387633b",', '- "name": "suburb_locality",', - '- "dataType": "text",', - '- "primaryKeyIndex": null', + '- "dataType": "text"', "- },", "+ {", '+ "id": "0f4e1e5b-9adb-edbe-6cbd-0ee0140448e6",', '+ "name": "colour",', '+ "dataType": "integer",', - '+ "primaryKeyIndex": null,', '+ "size": 32', "+ },", " {", ' "id": "1777c850-baa2-6d52-dfcd-309f1741ff51",', ' "name": "town_city",', - ' "dataType": "text",', - ' "primaryKeyIndex": null', + ' "dataType": "text"', " },", "- {", '- "id": "d087bf39-1c76-fdd9-1315-0e81c6bd360f",', '- "name": "territorial_authority",', - '- "dataType": "text",', - '- "primaryKeyIndex": null', + '- "dataType": "text"', "- },", "+ {", '+ "id": "0d167b8b-294f-c2be-4747-bc947672d3a0",', '+ "name": "geom",', '+ "dataType": "geometry",', - '+ "primaryKeyIndex": null,', '+ "geometryType": "MULTIPOLYGON",', '+ "geometryCRS": "EPSG:2193"', "+ },", " {", ' "id": "db82ba8c-c997-4bf1-87ef-b5108bdccde7",', ' "name": "last_modified",', - ' "dataType": "date",', - ' "primaryKeyIndex": null', + ' "dataType": "date"', " },", " ]", ] diff --git a/tests/test_structure.py b/tests/test_structure.py index 3cb678681..14a12658d 100644 --- a/tests/test_structure.py +++ b/tests/test_structure.py @@ -465,16 +465,14 @@ def test_shp_import_meta( { "name": "geom", "dataType": "geometry", - "primaryKeyIndex": None, "geometryType": "POLYGON", "geometryCRS": "EPSG:4167", }, - {"name": "date_adjus", "dataType": "date", "primaryKeyIndex": None}, - {"name": "survey_ref", "dataType": "text", "primaryKeyIndex": None}, + {"name": "date_adjus", "dataType": "date"}, + {"name": "survey_ref", "dataType": "text"}, { "name": "adjusted_n", "dataType": "integer", - "primaryKeyIndex": None, "size": 32, }, ] @@ -562,16 +560,14 @@ def _test_postgis_import( { "name": "geom", "dataType": "geometry", - "primaryKeyIndex": None, "geometryType": "MULTIPOLYGON", "geometryCRS": "EPSG:4167", }, - {"name": "date_adjusted", "dataType": "timestamp", "primaryKeyIndex": 
None}, - {"name": "survey_reference", "dataType": "text", "primaryKeyIndex": None}, + {"name": "date_adjusted", "dataType": "timestamp"}, + {"name": "survey_reference", "dataType": "text"}, { "name": "adjusted_nodes", "dataType": "integer", - "primaryKeyIndex": None, "size": 32, }, ] diff --git a/tests/test_upgrade.py b/tests/test_upgrade.py index 2a234d434..bc5cddf74 100644 --- a/tests/test_upgrade.py +++ b/tests/test_upgrade.py @@ -28,13 +28,13 @@ def test_upgrade_v0(archive, data_archive, cli_runner, tmp_path, chdir): if archive == "points0.snow": assert r.stdout.splitlines() == [ - "commit 98ad108336442d2447bec5997bb13192ca589a2c", + "commit e0f39729ffe37b9f858afe0783ff0a29c98d699d", "Author: Robert Coup ", "Date: Thu Jun 20 15:28:33 2019 +0100", "", " Improve naming on Coromandel East coast", "", - "commit f5ed4a3c537c9828ab392ddd08e3d4c56b460ed5", + "commit 31f8edfc3bfb660c36e0568d67722affd71813eb", "Author: Robert Coup ", "Date: Tue Jun 11 12:03:58 2019 +0100", "", @@ -67,13 +67,13 @@ def test_upgrade_v1(archive, layer, data_archive, cli_runner, tmp_path, chdir): if layer == H.POINTS.LAYER: assert r.stdout.splitlines() == [ - "commit 98ad108336442d2447bec5997bb13192ca589a2c", + "commit e0f39729ffe37b9f858afe0783ff0a29c98d699d", "Author: Robert Coup ", "Date: Thu Jun 20 15:28:33 2019 +0100", "", " Improve naming on Coromandel East coast", "", - "commit f5ed4a3c537c9828ab392ddd08e3d4c56b460ed5", + "commit 31f8edfc3bfb660c36e0568d67722affd71813eb", "Author: Robert Coup ", "Date: Tue Jun 11 12:03:58 2019 +0100", "", diff --git a/tests/test_working_copy.py b/tests/test_working_copy.py new file mode 100644 index 000000000..8d87a8e7e --- /dev/null +++ b/tests/test_working_copy.py @@ -0,0 +1,32 @@ +from sno.schema import ALL_DATA_TYPES + +_ALL_SIZE_VARIANTS = { + "integer": (8, 16, 32, 64), + "float": (32, 64), +} + + +def compute_approximated_types(v2_to_db, db_to_v2): + result = {} + for orig_type in ALL_DATA_TYPES: + size_variants = 
_ALL_SIZE_VARIANTS.get(orig_type, (None,)) + for orig_size in size_variants: + db_type = v2_to_db[orig_type] + if orig_size is not None: + db_type = db_type[orig_size] + + roundtripped_type = db_to_v2[db_type] + roundtripped_size = None + if isinstance(roundtripped_type, tuple): + roundtripped_type, roundtripped_size = roundtripped_type + + if (orig_type, orig_size) == (roundtripped_type, roundtripped_size): + continue + + if orig_size is not None: + result.setdefault(orig_type, {}) + result[orig_type][orig_size] = (roundtripped_type, roundtripped_size) + else: + result[orig_type] = roundtripped_type + + return result diff --git a/tests/test_working_copy_gpkg.py b/tests/test_working_copy_gpkg.py index eb12d6c11..6daa86772 100644 --- a/tests/test_working_copy_gpkg.py +++ b/tests/test_working_copy_gpkg.py @@ -7,11 +7,13 @@ import apsw import pygit2 +from sno import gpkg_adapter from sno.exceptions import INVALID_ARGUMENT, INVALID_OPERATION from sno.sno_repo import SnoRepo from sno.structure import RepositoryStructure from sno.working_copy import WorkingCopy from sno.working_copy.gpkg import WorkingCopy_GPKG_1 +from test_working_copy import compute_approximated_types H = pytest.helpers.helpers() @@ -895,3 +897,17 @@ def test_reset(data_working_copy, cli_runner, geopackage, edit_polygons): assert changes is None r = cli_runner.invoke(["diff", "--exit-code"]) assert r.exit_code == 0, r + + +def test_approximated_types(): + assert gpkg_adapter.APPROXIMATED_TYPES == compute_approximated_types( + gpkg_adapter.V2_TYPE_TO_GPKG_TYPE, gpkg_adapter.GPKG_TYPE_TO_V2_TYPE + ) + + +def test_types_roundtrip(data_working_copy, cli_runner): + # If type-approximation roundtrip code isn't working, + # we would get spurious diffs on types that GPKG doesn't support. 
+ with data_working_copy("types2") as (repo_path, wc): + r = cli_runner.invoke(["diff", "--exit-code"]) + assert r.exit_code == 0, r.stdout diff --git a/tests/test_working_copy_postgis.py b/tests/test_working_copy_postgis.py index 28abc03ed..1c7c9b87c 100644 --- a/tests/test_working_copy_postgis.py +++ b/tests/test_working_copy_postgis.py @@ -4,8 +4,9 @@ import pygit2 from sno.sno_repo import SnoRepo -from sno.working_copy import WorkingCopy +from sno.working_copy import WorkingCopy, postgis_adapter from sno.structure import RepositoryStructure +from test_working_copy import compute_approximated_types H = pytest.helpers.helpers() @@ -146,7 +147,7 @@ def test_commit_edits( assert repo.head.peel(pygit2.Commit).hex == orig_head -def test_edit_schema(data_archive, geopackage, cli_runner, new_postgis_db_schema): +def test_edit_schema(data_archive, cli_runner, new_postgis_db_schema): with data_archive("polygons2") as repo_path: repo = SnoRepo(repo_path) H.clear_working_copy() @@ -191,7 +192,7 @@ def test_edit_schema(data_archive, geopackage, cli_runner, new_postgis_db_schema diff = r.stdout.splitlines() assert "--- nz_waca_adjustments:meta:crs/EPSG:4167.wkt" in diff assert "+++ nz_waca_adjustments:meta:crs/EPSG:3857.wkt" in diff - assert diff[-51:] == [ + assert diff[-46:] == [ "--- nz_waca_adjustments:meta:schema.json", "+++ nz_waca_adjustments:meta:schema.json", " [", @@ -206,7 +207,6 @@ def test_edit_schema(data_archive, geopackage, cli_runner, new_postgis_db_schema ' "id": "c1d4dea1-c0ad-0255-7857-b5695e3ba2e9",', ' "name": "geom",', ' "dataType": "geometry",', - ' "primaryKeyIndex": null,', ' "geometryType": "MULTIPOLYGON",', '- "geometryCRS": "EPSG:4167",', '+ "geometryCRS": "EPSG:3857",', @@ -214,28 +214,24 @@ def test_edit_schema(data_archive, geopackage, cli_runner, new_postgis_db_schema " {", ' "id": "d3d4b64b-d48e-4069-4bb5-dfa943d91e6b",', ' "name": "date_adjusted",', - ' "dataType": "timestamp",', - ' "primaryKeyIndex": null', + ' "dataType": "timestamp"', " 
},", "- {", '- "id": "dff34196-229d-f0b5-7fd4-b14ecf835b2c",', '- "name": "survey_reference",', '- "dataType": "text",', - '- "primaryKeyIndex": null,', '- "length": 50', "- },", " {", ' "id": "13dc4918-974e-978f-05ce-3b4321077c50",', ' "name": "adjusted_nodes",', ' "dataType": "integer",', - ' "primaryKeyIndex": null,', ' "size": 32', " },", "+ {", '+ "id": "f14e9f9b-e47a-d309-6eff-fe498e0ee443",', '+ "name": "colour",', '+ "dataType": "text",', - '+ "primaryKeyIndex": null,', '+ "length": 32', "+ },", " ]", @@ -275,7 +271,7 @@ class SucceedAndRollback(Exception): pass -def test_edit_crs(data_archive, geopackage, cli_runner, new_postgis_db_schema): +def test_edit_crs(data_archive, cli_runner, new_postgis_db_schema): with data_archive("points2") as repo_path: repo = SnoRepo(repo_path) H.clear_working_copy() @@ -345,3 +341,24 @@ def test_edit_crs(data_archive, geopackage, cli_runner, new_postgis_db_schema): ) raise SucceedAndRollback() + + +def test_approximated_types(): + assert postgis_adapter.APPROXIMATED_TYPES == compute_approximated_types( + postgis_adapter.V2_TYPE_TO_PG_TYPE, postgis_adapter.PG_TYPE_TO_V2_TYPE + ) + + +def test_types_roundtrip(data_archive, cli_runner, new_postgis_db_schema): + with data_archive("types2") as repo_path: + repo = SnoRepo(repo_path) + H.clear_working_copy() + + with new_postgis_db_schema() as (postgres_url, postgres_schema): + repo.config["sno.workingcopy.path"] = postgres_url + r = cli_runner.invoke(["checkout"]) + + # If type-approximation roundtrip code isn't working, + # we would get spurious diffs on types that GPKG doesn't support. + r = cli_runner.invoke(["diff", "--exit-code"]) + assert r.exit_code == 0, r.stdout