Merge pull request #304 from koordinates/types
Tighten up type roundtrips ready for postgis launch
olsen232 authored Nov 13, 2020
2 parents c346056 + dd428dd commit 9ca24f1
Showing 21 changed files with 274 additions and 188 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -13,6 +13,8 @@ _When adding new entries to the changelog, please include issue/PR numbers where
 * Patches that create or delete datasets are now supported in Datasets V2 [#239](https://github.com/koordinates/sno/issues/239)
 * `apply`, `commit` and `merge` commands now optimise repositories after committing, to avoid poor repo performance. [#250](https://github.com/koordinates/sno/issues/250)
 * `commit` now checks that the diff to be committed matches the schema, and rejects diffs that do not - this is possible in working copy formats that have relatively lax type enforcement, ie GPKG [#300](https://github.com/koordinates/sno/pull/300)
+* Added GPKG support for sno types that GPKG doesn't natively support - they are approximated as strings. [#304](https://github.com/koordinates/sno/pull/304)
+* `schema.json` no longer stores attributes that are null - a missing attribute has the same meaning as that attribute being present and null. [#304](https://github.com/koordinates/sno/pull/304)
 * `data ls` now accepts an optional ref argument
 * `meta get` now accepts a `--ref=REF` option
 * `clone` now accepts a `--branch` option to clone a specific branch.

4 changes: 4 additions & 0 deletions sno/dataset2.py
@@ -97,6 +97,10 @@ def get_meta_item(self, name):
         if rel_path.startswith(self.LEGEND_PATH):
             return data
 
+        if rel_path.endswith("schema.json"):
+            # Unfortunately, some schemas might be stored slightly differently to
+            # others - eg with or without null attributes. This normalises them.
+            return Schema.normalise_column_dicts(json_unpack(data))
         if rel_path.endswith(".json"):
             return json_unpack(data)
         elif rel_path.endswith(".wkt"):

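A hedged sketch of what that normalisation buys, using hypothetical column dicts (it assumes `Schema` is importable from sno/schema.py, whose diff appears further down): two stored forms that differ only by an explicit null attribute normalise to the same thing.

from sno.schema import Schema

# Two stored forms of the same column - one carries an explicit null attribute.
a = [{"id": "c1", "name": "fid", "dataType": "integer", "primaryKeyIndex": 0, "size": 64}]
b = [{"id": "c1", "name": "fid", "dataType": "integer", "primaryKeyIndex": 0, "size": 64, "length": None}]

# from_dict drops null attributes and to_dict never writes them out,
# so the normalised forms compare equal.
assert Schema.normalise_column_dicts(a) == Schema.normalise_column_dicts(b)
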
57 changes: 33 additions & 24 deletions sno/gpkg_adapter.py
@@ -296,25 +296,7 @@ def _column_schema_to_gpkg(cid, column_schema, is_spatial):
 # TEXT{(max_len)}, BLOB{(max_len)}, DATE, DATETIME, <geometry_type_name>
 
 
-_GPKG_TYPE_TO_V2_TYPE = {
-    "BOOLEAN": "boolean",
-    "TINYINT": ("integer", {"size": 8}),
-    "SMALLINT": ("integer", {"size": 16}),
-    "MEDIUMINT": ("integer", {"size": 32}),
-    "INT": ("integer", {"size": 64}),
-    "INTEGER": ("integer", {"size": 64}),
-    "FLOAT": ("float", {"size": 32}),
-    "DOUBLE": ("float", {"size": 64}),
-    "REAL": ("float", {"size": 64}),
-    "TEXT": "text",
-    "BLOB": "blob",
-    "DATE": "date",
-    "DATETIME": "timestamp",
-    # GEOMETRY types handled differently
-}
-
-
-_V2_TYPE_TO_GPKG_TYPE = {
+V2_TYPE_TO_GPKG_TYPE = {
     "boolean": "BOOLEAN",
     "integer": {
         0: "INTEGER",
@@ -328,22 +310,49 @@ def _column_schema_to_gpkg(cid, column_schema, is_spatial):
     "blob": "BLOB",
     "date": "DATE",
     "timestamp": "DATETIME",
-    # geometry types handled differently
+    "time": "TEXT",  # Approximated
+    "numeric": "TEXT",  # Approximated
+    "interval": "TEXT",  # Approximated
+    "geometry": "GEOMETRY",
 }
 
 
+GPKG_TYPE_TO_V2_TYPE = {
+    "BOOLEAN": "boolean",
+    "TINYINT": ("integer", 8),
+    "SMALLINT": ("integer", 16),
+    "MEDIUMINT": ("integer", 32),
+    "INT": ("integer", 64),
+    "INTEGER": ("integer", 64),
+    "FLOAT": ("float", 32),
+    "DOUBLE": ("float", 64),
+    "REAL": ("float", 64),
+    "TEXT": "text",
+    "BLOB": "blob",
+    "DATE": "date",
+    "DATETIME": "timestamp",
+    "GEOMETRY": "geometry",
+}
+
+
+# Types that can't be roundtripped perfectly in GPKG, and what they end up as.
+APPROXIMATED_TYPES = {"interval": "text", "time": "text", "numeric": "text"}
+
+
 def gpkg_type_to_v2_type(gpkg_type):
     """Convert a gpkg type to v2 schema type."""
     m = re.match(r"^(TEXT|BLOB)\(([0-9]+)\)$", gpkg_type)
     if m:
         return m.group(1).lower(), {"length": int(m.group(2))}
-    v2_type_info = _GPKG_TYPE_TO_V2_TYPE.get(gpkg_type)
+    v2_type_info = GPKG_TYPE_TO_V2_TYPE.get(gpkg_type)
     if v2_type_info is None:
         raise ValueError(f"Unrecognised GPKG type: {gpkg_type}")
     elif isinstance(v2_type_info, tuple):
-        v2_type, extra_type_info = v2_type_info
+        v2_type, size = v2_type_info
+        extra_type_info = {"size": size}
     else:
-        v2_type, extra_type_info = v2_type_info, {}
+        v2_type = v2_type_info
+        extra_type_info = {}
     return v2_type, extra_type_info
 
 
@@ -379,7 +388,7 @@ def v2_type_to_gpkg_type(column_schema, is_spatial):
     if column_schema.data_type == "geometry":
         return extra_type_info["geometryType"].split(" ", 1)[0]
 
-    gpkg_type_info = _V2_TYPE_TO_GPKG_TYPE.get(v2_type)
+    gpkg_type_info = V2_TYPE_TO_GPKG_TYPE.get(v2_type)
     if gpkg_type_info is None:
         raise ValueError(f"Unrecognised data type: {v2_type}")

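A short sketch of the roundtrip behaviour these tables give, based on the code above (treat the exact return shapes as illustrations rather than documented API):

from sno import gpkg_adapter

# Sized and length-qualified types unpack into extra_type_info.
assert gpkg_adapter.gpkg_type_to_v2_type("SMALLINT") == ("integer", {"size": 16})
assert gpkg_adapter.gpkg_type_to_v2_type("TEXT(40)") == ("text", {"length": 40})

# sno types with no GPKG equivalent go out as TEXT, and APPROXIMATED_TYPES
# records what they come back as, so schema alignment can restore them later.
assert gpkg_adapter.V2_TYPE_TO_GPKG_TYPE["numeric"] == "TEXT"
assert gpkg_adapter.APPROXIMATED_TYPES["numeric"] == "text"
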
61 changes: 29 additions & 32 deletions sno/ogr_import_source.py
@@ -13,6 +13,7 @@
 from .exceptions import (
     InvalidOperation,
     NotFound,
+    NotYetImplemented,
     NO_IMPORT_SOURCE,
     NO_TABLE,
 )
@@ -43,38 +44,21 @@ class OgrImportSource(ImportSource):
     Imports from an OGR source, currently from a whitelist of formats.
     """
 
-    OGR_TYPE_TO_SQLITE_TYPE = {
-        # NOTE: we don't handle OGR's *List (array) fields at all.
-        # If you write them to GPKG using OGR, you end up with TEXT.
-        # We also don't handle ogr's "Time" fields, because they end up as TEXT in GPKG,
-        # which we can't roundtrip. Tackle when we get someone actually using those types...
-        "Integer": "MEDIUMINT",
-        "Integer64": "INTEGER",
-        "Real": "REAL",
-        "String": "TEXT",
-        "Binary": "BLOB",
-        "Date": "DATE",
-        "DateTime": "DATETIME",
-    }
-    OGR_SUBTYPE_TO_SQLITE_TYPE = {
-        ogr.OFSTBoolean: "BOOLEAN",
-        ogr.OFSTInt16: "SMALLINT",
-        ogr.OFSTFloat32: "FLOAT",
-    }
-
+    # NOTE: We don't support *List fields (eg IntegerList).
     OGR_TYPE_TO_V2_SCHEMA_TYPE = {
-        "Integer": ("integer", {"size": 32}),
-        "Integer64": ("integer", {"size": 64}),
-        "Real": ("float", {"size": 64}),
-        "String": ("text", {}),
-        "Binary": ("blob", {}),
-        "Date": ("date", {}),
-        "DateTime": ("timestamp", {}),
+        "Integer": ("integer", 32),
+        "Integer64": ("integer", 64),
+        "Real": ("float", 64),
+        "String": "text",
+        "Binary": "blob",
+        "Date": "date",
+        "DateTime": "timestamp",
+        "Time": "time",
     }
     OGR_SUBTYPE_TO_V2_SCHEMA_TYPE = {
-        ogr.OFSTBoolean: ("boolean", {}),
-        ogr.OFSTInt16: ("integer", {"size": 16}),
-        ogr.OFSTFloat32: ("float", {"size": 32}),
+        ogr.OFSTBoolean: "boolean",
+        ogr.OFSTInt16: ("integer", 16),
+        ogr.OFSTFloat32: ("float", 32),
     }
 
     @classmethod
@@ -490,11 +474,24 @@ def _field_to_v2_column_schema(self, fd):
         ogr_type = fd.GetTypeName()
         ogr_subtype = fd.GetSubType()
         if ogr_subtype == ogr.OFSTNone:
-            data_type, extra_type_info = self.OGR_TYPE_TO_V2_SCHEMA_TYPE[ogr_type]
+            data_type_info = self.OGR_TYPE_TO_V2_SCHEMA_TYPE.get(ogr_type)
+            if data_type_info is None:
+                raise NotYetImplemented(
+                    f"Unsupported column type for import: OGR type={ogr_type}"
+                )
         else:
-            data_type, extra_type_info = self.OGR_SUBTYPE_TO_V2_SCHEMA_TYPE[ogr_subtype]
+            data_type_info = self.OGR_SUBTYPE_TO_V2_SCHEMA_TYPE.get(ogr_subtype)
+            if data_type_info is None:
+                raise NotYetImplemented(
+                    f"Unsupported column type for import: OGR subtype={ogr_subtype}"
+                )
 
-        extra_type_info = extra_type_info.copy()
+        if isinstance(data_type_info, tuple):
+            data_type, size = data_type_info
+            extra_type_info = {"size": size}
+        elif isinstance(data_type_info, str):
+            data_type = data_type_info
+            extra_type_info = {}
 
         if data_type in ("TEXT", "BLOB"):
             width = fd.GetWidth()

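A small sketch of how the new mapping tables behave, with values read off the tables above (the NotYetImplemented path follows the code just shown):

from sno.ogr_import_source import OgrImportSource

# A plain string means no extra type info; a tuple carries the size.
assert OgrImportSource.OGR_TYPE_TO_V2_SCHEMA_TYPE["Time"] == "time"
assert OgrImportSource.OGR_TYPE_TO_V2_SCHEMA_TYPE["Integer"] == ("integer", 32)

# *List fields are deliberately absent, so _field_to_v2_column_schema raises
# NotYetImplemented for them instead of silently importing them as TEXT.
assert "IntegerList" not in OgrImportSource.OGR_TYPE_TO_V2_SCHEMA_TYPE
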
105 changes: 67 additions & 38 deletions sno/schema.py
@@ -152,22 +152,21 @@ def __new__(cls, id, name, data_type, pk_index, **extra_type_info):
     @classmethod
     def from_dict(cls, d):
         d = d.copy()
-        return cls(
-            d.pop("id"),
-            d.pop("name"),
-            d.pop("dataType"),
-            d.pop("primaryKeyIndex"),
-            **d,
-        )
+        id_ = d.pop("id")
+        name = d.pop("name")
+        data_type = d.pop("dataType")
+        pk_index = d.pop("primaryKeyIndex", None)
+        extra_type_info = dict((k, v) for k, v in d.items() if v is not None)
+        return cls(id_, name, data_type, pk_index, **extra_type_info)
 
     def to_dict(self):
-        return {
-            "id": self.id,
-            "name": self.name,
-            "dataType": self.data_type,
-            "primaryKeyIndex": self.pk_index,
-            **self.extra_type_info,
-        }
+        result = {"id": self.id, "name": self.name, "dataType": self.data_type}
+        if self.pk_index is not None:
+            result["primaryKeyIndex"] = self.pk_index
+        for key, value in self.extra_type_info.items():
+            if value is not None:
+                result[key] = value
+        return result
 
     def __eq__(self, other):
         if not isinstance(other, ColumnSchema):
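A minimal sketch of the new round-trip rule, with a hypothetical column (it assumes `ColumnSchema` is importable from sno/schema.py and that equality compares the parsed fields, per the `__eq__` shown above): a null attribute on the way in is indistinguishable from an absent one, and `to_dict` never writes nulls back out.

from sno.schema import ColumnSchema

c1 = ColumnSchema.from_dict({"id": "a1", "name": "notes", "dataType": "text"})
c2 = ColumnSchema.from_dict(
    {"id": "a1", "name": "notes", "dataType": "text", "primaryKeyIndex": None}
)
assert c1 == c2  # missing attribute == present-and-null
assert c2.to_dict() == {"id": "a1", "name": "notes", "dataType": "text"}
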
@@ -240,6 +239,10 @@ def from_column_dicts(cls, column_dicts):
         columns = [ColumnSchema.from_dict(d) for d in column_dicts]
         return cls(columns)
 
+    @classmethod
+    def normalise_column_dicts(cls, column_dicts):
+        return Schema.from_column_dicts(column_dicts).to_column_dicts()
+
     @classmethod
     def loads(cls, data):
         """Load a schema from a bytestring"""
@@ -346,12 +349,22 @@ def sanitise_pks(self, pk_values):
                 pk_values[i] = int(pk_values[i])
         return tuple(pk_values)
 
-    def align_to_self(self, new_schema):
+    def align_to_self(self, new_schema, approximated_types=None):
         """
         Returns a new schema the same as the one given, except that some column IDs might be changed to match those in
         self. Column IDs are copied from self onto the resulting schema if the columns in the resulting schema
         are the "same" as columns in the previous schema. Uses a heuristic - columns are the same if they have the
         same name + type (handles reordering), or, if they have the same position and type (handles renames).
+
+        approximated_types - if the new_schema has been roundtripped through a database that doesn't support all sno
+        types, some of them will have been approximated. Aligning the schema also restores approximated columns to
+        their original types, as closely as possible. approximated_types should be a dict mapping each type to its
+        approximation, sometimes with an extra sub-dict covering sizes, eg:
+        {
+            "timestamp": "text",  # timestamp is approximated as text
+            "integer": {8: ("integer", 32), 16: ("integer", 32)},  # int8, int16 are approximated as int32
+        }
+
         """
         # TODO - could prompt the user to help with more complex schema changes.
         old_cols = self.to_column_dicts()
@@ -360,8 +373,8 @@ def align_to_self(self, new_schema):
         return Schema.from_column_dicts(new_cols)
 
     @classmethod
-    def align_schema_cols(cls, old_cols, new_cols):
-        """Same as align_to_self, but works on lists of column dicts, instead of Schema objects."""
+    def align_schema_cols(cls, old_cols, new_cols, approximated_types=None):
+        """Same as align_to_self, but mutates the new_cols list instead of returning a new Schema object."""
         for old_col in old_cols:
             old_col["done"] = False
         for new_col in new_cols:
@@ -370,11 +383,13 @@ def align_schema_cols(cls, old_cols, new_cols):
         # Align columns by name + type - handles reordering.
         old_cols_by_name = {c["name"]: c for c in old_cols}
         for new_col in new_cols:
-            cls._try_align(old_cols_by_name.get(new_col["name"]), new_col)
+            cls._try_align(
+                old_cols_by_name.get(new_col["name"]), new_col, approximated_types
+            )
 
         # Align columns by position + type - handles renames.
         for old_col, new_col in zip(old_cols, new_cols):
-            cls._try_align(old_col, new_col)
+            cls._try_align(old_col, new_col, approximated_types)
 
         for old_col in old_cols:
             del old_col["done"]
@@ -383,25 +398,37 @@
         return new_cols
 
     @classmethod
-    def _try_align(cls, old_col, new_col):
+    def _try_align(cls, old_col, new_col, approximated_types=None):
         if old_col is None or new_col is None:
             return False
         if old_col["done"] or new_col["done"]:
             return False
-        if old_col["dataType"] != new_col["dataType"]:
-            return False
-        if old_col["primaryKeyIndex"] != new_col["primaryKeyIndex"]:
+        if old_col.get("primaryKeyIndex") != new_col.get("primaryKeyIndex"):
             return False
+
+        old_type, new_type = old_col["dataType"], new_col["dataType"]
+        if old_type != new_type:
+            if approximated_types and approximated_types.get(old_type) == new_type:
+                # new_col's type was the best approximation we could manage of old_col's type.
+                new_col["dataType"] = old_col["dataType"]
+            else:
+                return False
+
+        # The two columns are similar enough that we can align their IDs.
+
         new_col["id"] = old_col["id"]
 
-        # We can't roundtrip size properly for GPKG primary keys since they have to be of type INTEGER.
-        if (
-            old_col["primaryKeyIndex"] is not None
-            and "size" in old_col
-            and "size" in new_col
-            and old_col["size"] != new_col["size"]
-        ):
-            new_col["size"] = old_col["size"]
+        old_size, new_size = old_col.get("size"), new_col.get("size")
+        if old_size != new_size:
+            data_type = old_type
+            approx_info = approximated_types and approximated_types.get(data_type)
+
+            if old_col.get("primaryKeyIndex") is not None and new_size == 64:
+                # We can't roundtrip size properly for GPKG primary keys since they have to be of type INTEGER.
+                new_col["size"] = old_col.get("size")
+            elif approx_info and approx_info.get(old_size) == (data_type, new_size):
+                # new_col's size was the best approximation we could manage of old_col's size.
+                new_col["size"] = old_col.get("size")
 
         old_col["done"] = True
         new_col["done"] = True
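Putting _try_align together with the gpkg_adapter tables, a hedged sketch of the restore path (hypothetical column ids; it assumes the folded body of align_to_self forwards approximated_types to align_schema_cols, and that the folded lines also strip the temporary "done" markers from new_cols):

from sno.schema import Schema

old = Schema.from_column_dicts([{"id": "t1", "name": "start", "dataType": "time"}])
# Roundtripped through GPKG, "time" comes back as "text" with a fresh id.
new = Schema.from_column_dicts([{"id": "xyz", "name": "start", "dataType": "text"}])

aligned = old.align_to_self(new, approximated_types={"time": "text"})
# The id is re-aligned and the approximated type is restored.
assert aligned.to_column_dicts() == [{"id": "t1", "name": "start", "dataType": "time"}]
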
@@ -453,18 +480,20 @@ def diff_types(self, new_schema):
     def diff_type_counts(self, new_schema):
         return {k: len(v) for k, v in self.diff_types(new_schema).items()}
 
-    EQUIVALENT_PYTHON_TYPES = {
+    # These are the types that are stored in datasets.
+    # Different types might be returned from the working copy DB driver, in which case, they must be adapted.
+    EQUIVALENT_MSGPACK_TYPES = {
         "boolean": (bool,),
         "blob": (bytes,),
-        "date": (str,),
+        "date": (str,),  # might be datetime.date from DB
         "float": (float, int),
         "geometry": (Geometry,),
         "integer": (int,),
-        "interval": (str,),
-        "numeric": (str,),
+        "interval": (str,),  # might be datetime.timedelta from DB
+        "numeric": (str,),  # might be decimal.Decimal from DB
         "text": (str,),
-        "time": (str,),
-        "timestamp": (str,),
+        "time": (str,),  # might be datetime.time from DB
+        "timestamp": (str,),  # might be datetime.datetime from DB
     }
 
     def validate_feature(self, feature, col_violations=None):
@@ -504,7 +533,7 @@ def _find_column_violation(self, col, feature):
             return None
 
         col_type = col.data_type
-        if type(value) not in self.EQUIVALENT_PYTHON_TYPES[col_type]:
+        if type(value) not in self.EQUIVALENT_MSGPACK_TYPES[col_type]:
             return f"In column '{col.name}' value {repr(value)} doesn't match schema type {col_type}"
         elif col_type == "integer" and "size" in col.extra_type_info:
             size = col.extra_type_info["size"]

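The table's inline comments are the point of the rename: a working-copy driver may hand back rich Python types, but the dataset stores the msgpack-friendly ones, so validation compares against the stored types. A small self-contained sketch (table excerpted from above):

import datetime

EQUIVALENT_MSGPACK_TYPES = {"timestamp": (str,)}  # excerpt of the table above

value = datetime.datetime(2020, 11, 13, 9, 30)
assert type(value) not in EQUIVALENT_MSGPACK_TYPES["timestamp"]  # needs adapting
assert type(value.isoformat()) in EQUIVALENT_MSGPACK_TYPES["timestamp"]  # ISO text matches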