Merge pull request #304 from koordinates/types
Tighten up type roundtrips ready for postgis launch
olsen232 authored Nov 13, 2020
2 parents c346056 + dd428dd commit 9ca24f1
Showing 21 changed files with 274 additions and 188 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -13,6 +13,8 @@ _When adding new entries to the changelog, please include issue/PR numbers where
 * Patches that create or delete datasets are now supported in Datasets V2 [#239](https://github.com/koordinates/sno/issues/239)
 * `apply`, `commit` and `merge` commands now optimise repositories after committing, to avoid poor repo performance. [#250](https://github.com/koordinates/sno/issues/250)
 * `commit` now checks that the diff to be committed matches the schema, and rejects diffs that do not - this is possible in working copy formats that have relatively lax type enforcement, ie GPKG [#300](https://github.com/koordinates/sno/pull/300)
+* Added GPKG support for sno types that GPKG doesn't natively support - they are approximated as strings. [#304](https://github.com/koordinates/sno/pull/304)
+* `schema.json` no longer stores attributes that are null - a missing attribute has the same meaning as that attribute being present and null. [#304](https://github.com/koordinates/sno/pull/304)
 * `data ls` now accepts an optional ref argument
 * `meta get` now accepts a `--ref=REF` option
 * `clone` now accepts a `--branch` option to clone a specific branch.

4 changes: 4 additions & 0 deletions sno/dataset2.py
@@ -97,6 +97,10 @@ def get_meta_item(self, name):
         if rel_path.startswith(self.LEGEND_PATH):
             return data
 
+        if rel_path.endswith("schema.json"):
+            # Unfortunately, some schemas might be stored slightly differently to
+            # others - eg with or without null attributes. This normalises them.
+            return Schema.normalise_column_dicts(json_unpack(data))
         if rel_path.endswith(".json"):
             return json_unpack(data)
         elif rel_path.endswith(".wkt"):

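A hedged sketch of what that normalisation buys, using hypothetical column dicts (it assumes `Schema` is importable from sno/schema.py, whose diff appears further down): two stored forms that differ only by an explicit null attribute normalise to the same thing.

from sno.schema import Schema

# Two stored forms of the same column - one carries an explicit null attribute.
a = [{"id": "c1", "name": "fid", "dataType": "integer", "primaryKeyIndex": 0, "size": 64}]
b = [{"id": "c1", "name": "fid", "dataType": "integer", "primaryKeyIndex": 0, "size": 64, "length": None}]

# from_dict drops null attributes and to_dict never writes them out,
# so the normalised forms compare equal.
assert Schema.normalise_column_dicts(a) == Schema.normalise_column_dicts(b)
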
57 changes: 33 additions & 24 deletions sno/gpkg_adapter.py
@@ -296,25 +296,7 @@ def _column_schema_to_gpkg(cid, column_schema, is_spatial):
 # TEXT{(max_len)}, BLOB{(max_len)}, DATE, DATETIME, <geometry_type_name>
 
 
-_GPKG_TYPE_TO_V2_TYPE = {
-    "BOOLEAN": "boolean",
-    "TINYINT": ("integer", {"size": 8}),
-    "SMALLINT": ("integer", {"size": 16}),
-    "MEDIUMINT": ("integer", {"size": 32}),
-    "INT": ("integer", {"size": 64}),
-    "INTEGER": ("integer", {"size": 64}),
-    "FLOAT": ("float", {"size": 32}),
-    "DOUBLE": ("float", {"size": 64}),
-    "REAL": ("float", {"size": 64}),
-    "TEXT": "text",
-    "BLOB": "blob",
-    "DATE": "date",
-    "DATETIME": "timestamp",
-    # GEOMETRY types handled differently
-}
-
-
-_V2_TYPE_TO_GPKG_TYPE = {
+V2_TYPE_TO_GPKG_TYPE = {
     "boolean": "BOOLEAN",
     "integer": {
         0: "INTEGER",
@@ -328,22 +310,49 @@ def _column_schema_to_gpkg(cid, column_schema, is_spatial):
     "blob": "BLOB",
     "date": "DATE",
     "timestamp": "DATETIME",
-    # geometry types handled differently
+    "time": "TEXT",  # Approximated
+    "numeric": "TEXT",  # Approximated
+    "interval": "TEXT",  # Approximated
+    "geometry": "GEOMETRY",
 }
 
 
+GPKG_TYPE_TO_V2_TYPE = {
+    "BOOLEAN": "boolean",
+    "TINYINT": ("integer", 8),
+    "SMALLINT": ("integer", 16),
+    "MEDIUMINT": ("integer", 32),
+    "INT": ("integer", 64),
+    "INTEGER": ("integer", 64),
+    "FLOAT": ("float", 32),
+    "DOUBLE": ("float", 64),
+    "REAL": ("float", 64),
+    "TEXT": "text",
+    "BLOB": "blob",
+    "DATE": "date",
+    "DATETIME": "timestamp",
+    "GEOMETRY": "geometry",
+}
+
+
+# Types that can't be roundtripped perfectly in GPKG, and what they end up as.
+APPROXIMATED_TYPES = {"interval": "text", "time": "text", "numeric": "text"}
+
+
 def gpkg_type_to_v2_type(gpkg_type):
     """Convert a gpkg type to v2 schema type."""
     m = re.match(r"^(TEXT|BLOB)\(([0-9]+)\)$", gpkg_type)
     if m:
         return m.group(1).lower(), {"length": int(m.group(2))}
-    v2_type_info = _GPKG_TYPE_TO_V2_TYPE.get(gpkg_type)
+    v2_type_info = GPKG_TYPE_TO_V2_TYPE.get(gpkg_type)
     if v2_type_info is None:
         raise ValueError(f"Unrecognised GPKG type: {gpkg_type}")
     elif isinstance(v2_type_info, tuple):
-        v2_type, extra_type_info = v2_type_info
+        v2_type, size = v2_type_info
+        extra_type_info = {"size": size}
     else:
-        v2_type, extra_type_info = v2_type_info, {}
+        v2_type = v2_type_info
+        extra_type_info = {}
     return v2_type, extra_type_info
 
 
@@ -379,7 +388,7 @@ def v2_type_to_gpkg_type(column_schema, is_spatial):
     if column_schema.data_type == "geometry":
         return extra_type_info["geometryType"].split(" ", 1)[0]
 
-    gpkg_type_info = _V2_TYPE_TO_GPKG_TYPE.get(v2_type)
+    gpkg_type_info = V2_TYPE_TO_GPKG_TYPE.get(v2_type)
     if gpkg_type_info is None:
         raise ValueError(f"Unrecognised data type: {v2_type}")

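A short sketch of the roundtrip behaviour these tables give, based on the code above (treat the exact return shapes as illustrations rather than documented API):

from sno import gpkg_adapter

# Sized and length-qualified types unpack into extra_type_info.
assert gpkg_adapter.gpkg_type_to_v2_type("SMALLINT") == ("integer", {"size": 16})
assert gpkg_adapter.gpkg_type_to_v2_type("TEXT(40)") == ("text", {"length": 40})

# sno types with no GPKG equivalent go out as TEXT, and APPROXIMATED_TYPES
# records what they come back as, so schema alignment can restore them later.
assert gpkg_adapter.V2_TYPE_TO_GPKG_TYPE["numeric"] == "TEXT"
assert gpkg_adapter.APPROXIMATED_TYPES["numeric"] == "text"
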
61 changes: 29 additions & 32 deletions sno/ogr_import_source.py
@@ -13,6 +13,7 @@
 from .exceptions import (
     InvalidOperation,
     NotFound,
+    NotYetImplemented,
     NO_IMPORT_SOURCE,
     NO_TABLE,
 )
@@ -43,38 +44,21 @@ class OgrImportSource(ImportSource):
     Imports from an OGR source, currently from a whitelist of formats.
     """
 
-    OGR_TYPE_TO_SQLITE_TYPE = {
-        # NOTE: we don't handle OGR's *List (array) fields at all.
-        # If you write them to GPKG using OGR, you end up with TEXT.
-        # We also don't handle ogr's "Time" fields, because they end up as TEXT in GPKG,
-        # which we can't roundtrip. Tackle when we get someone actually using those types...
-        "Integer": "MEDIUMINT",
-        "Integer64": "INTEGER",
-        "Real": "REAL",
-        "String": "TEXT",
-        "Binary": "BLOB",
-        "Date": "DATE",
-        "DateTime": "DATETIME",
-    }
-    OGR_SUBTYPE_TO_SQLITE_TYPE = {
-        ogr.OFSTBoolean: "BOOLEAN",
-        ogr.OFSTInt16: "SMALLINT",
-        ogr.OFSTFloat32: "FLOAT",
-    }
-
+    # NOTE: We don't support *List fields (eg IntegerList).
     OGR_TYPE_TO_V2_SCHEMA_TYPE = {
-        "Integer": ("integer", {"size": 32}),
-        "Integer64": ("integer", {"size": 64}),
-        "Real": ("float", {"size": 64}),
-        "String": ("text", {}),
-        "Binary": ("blob", {}),
-        "Date": ("date", {}),
-        "DateTime": ("timestamp", {}),
+        "Integer": ("integer", 32),
+        "Integer64": ("integer", 64),
+        "Real": ("float", 64),
+        "String": "text",
+        "Binary": "blob",
+        "Date": "date",
+        "DateTime": "timestamp",
+        "Time": "time",
     }
     OGR_SUBTYPE_TO_V2_SCHEMA_TYPE = {
-        ogr.OFSTBoolean: ("boolean", {}),
-        ogr.OFSTInt16: ("integer", {"size": 16}),
-        ogr.OFSTFloat32: ("float", {"size": 32}),
+        ogr.OFSTBoolean: "boolean",
+        ogr.OFSTInt16: ("integer", 16),
+        ogr.OFSTFloat32: ("float", 32),
     }
 
     @classmethod
@@ -490,11 +474,24 @@ def _field_to_v2_column_schema(self, fd):
         ogr_type = fd.GetTypeName()
         ogr_subtype = fd.GetSubType()
         if ogr_subtype == ogr.OFSTNone:
-            data_type, extra_type_info = self.OGR_TYPE_TO_V2_SCHEMA_TYPE[ogr_type]
+            data_type_info = self.OGR_TYPE_TO_V2_SCHEMA_TYPE.get(ogr_type)
+            if data_type_info is None:
+                raise NotYetImplemented(
+                    f"Unsupported column type for import: OGR type={ogr_type}"
+                )
         else:
-            data_type, extra_type_info = self.OGR_SUBTYPE_TO_V2_SCHEMA_TYPE[ogr_subtype]
+            data_type_info = self.OGR_SUBTYPE_TO_V2_SCHEMA_TYPE.get(ogr_subtype)
+            if data_type_info is None:
+                raise NotYetImplemented(
+                    f"Unsupported column type for import: OGR subtype={ogr_subtype}"
+                )
 
-        extra_type_info = extra_type_info.copy()
+        if isinstance(data_type_info, tuple):
+            data_type, size = data_type_info
+            extra_type_info = {"size": size}
+        elif isinstance(data_type_info, str):
+            data_type = data_type_info
+            extra_type_info = {}
 
         if data_type in ("TEXT", "BLOB"):
             width = fd.GetWidth()

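A small sketch of how the new mapping tables behave, with values read off the tables above (the NotYetImplemented path follows the code just shown):

from sno.ogr_import_source import OgrImportSource

# A plain string means no extra type info; a tuple carries the size.
assert OgrImportSource.OGR_TYPE_TO_V2_SCHEMA_TYPE["Time"] == "time"
assert OgrImportSource.OGR_TYPE_TO_V2_SCHEMA_TYPE["Integer"] == ("integer", 32)

# *List fields are deliberately absent, so _field_to_v2_column_schema raises
# NotYetImplemented for them instead of silently importing them as TEXT.
assert "IntegerList" not in OgrImportSource.OGR_TYPE_TO_V2_SCHEMA_TYPE
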
105 changes: 67 additions & 38 deletions sno/schema.py
@@ -152,22 +152,21 @@ def __new__(cls, id, name, data_type, pk_index, **extra_type_info):
     @classmethod
     def from_dict(cls, d):
         d = d.copy()
-        return cls(
-            d.pop("id"),
-            d.pop("name"),
-            d.pop("dataType"),
-            d.pop("primaryKeyIndex"),
-            **d,
-        )
+        id_ = d.pop("id")
+        name = d.pop("name")
+        data_type = d.pop("dataType")
+        pk_index = d.pop("primaryKeyIndex", None)
+        extra_type_info = dict((k, v) for k, v in d.items() if v is not None)
+        return cls(id_, name, data_type, pk_index, **extra_type_info)
 
     def to_dict(self):
-        return {
-            "id": self.id,
-            "name": self.name,
-            "dataType": self.data_type,
-            "primaryKeyIndex": self.pk_index,
-            **self.extra_type_info,
-        }
+        result = {"id": self.id, "name": self.name, "dataType": self.data_type}
+        if self.pk_index is not None:
+            result["primaryKeyIndex"] = self.pk_index
+        for key, value in self.extra_type_info.items():
+            if value is not None:
+                result[key] = value
+        return result
 
     def __eq__(self, other):
         if not isinstance(other, ColumnSchema):
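A minimal sketch of the new round-trip rule, with a hypothetical column (it assumes `ColumnSchema` is importable from sno/schema.py and that equality compares the parsed fields, per the `__eq__` shown above): a null attribute on the way in is indistinguishable from an absent one, and `to_dict` never writes nulls back out.

from sno.schema import ColumnSchema

c1 = ColumnSchema.from_dict({"id": "a1", "name": "notes", "dataType": "text"})
c2 = ColumnSchema.from_dict(
    {"id": "a1", "name": "notes", "dataType": "text", "primaryKeyIndex": None}
)
assert c1 == c2  # missing attribute == present-and-null
assert c2.to_dict() == {"id": "a1", "name": "notes", "dataType": "text"}
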
@@ -240,6 +239,10 @@ def from_column_dicts(cls, column_dicts):
         columns = [ColumnSchema.from_dict(d) for d in column_dicts]
         return cls(columns)
 
+    @classmethod
+    def normalise_column_dicts(cls, column_dicts):
+        return Schema.from_column_dicts(column_dicts).to_column_dicts()
+
     @classmethod
     def loads(cls, data):
         """Load a schema from a bytestring"""
@@ -346,12 +349,22 @@ def sanitise_pks(self, pk_values):
                 pk_values[i] = int(pk_values[i])
         return tuple(pk_values)
 
-    def align_to_self(self, new_schema):
+    def align_to_self(self, new_schema, approximated_types=None):
         """
         Returns a new schema the same as the one given, except that some column IDs might be changed to match those in
         self. Column IDs are copied from self onto the resulting schema if the columns in the resulting schema
         are the "same" as columns in the previous schema. Uses a heuristic - columns are the same if they have the
         same name + type (handles reordering), or, if they have the same position and type (handles renames).
+
+        approximated_types - if the new_schema has been roundtripped through a database that doesn't support all sno
+        types, some of them will have been approximated. Aligning the schema also restores approximated columns to
+        their original types, as closely as possible. approximated_types should be a dict mapping each type to its
+        approximation, sometimes with an extra sub-dict covering sizes, eg:
+        {
+            "timestamp": "text",  # timestamp is approximated as text
+            "integer": {8: ("integer", 32), 16: ("integer", 32)},  # int8, int16 are approximated as int32
+        }
+
         """
         # TODO - could prompt the user to help with more complex schema changes.
         old_cols = self.to_column_dicts()
@@ -360,8 +373,8 @@ def align_to_self(self, new_schema):
         return Schema.from_column_dicts(new_cols)
 
     @classmethod
-    def align_schema_cols(cls, old_cols, new_cols):
-        """Same as align_to_self, but works on lists of column dicts, instead of Schema objects."""
+    def align_schema_cols(cls, old_cols, new_cols, approximated_types=None):
+        """Same as align_to_self, but mutates the new_cols list instead of returning a new Schema object."""
         for old_col in old_cols:
             old_col["done"] = False
         for new_col in new_cols:
@@ -370,11 +383,13 @@ def align_schema_cols(cls, old_cols, new_cols):
         # Align columns by name + type - handles reordering.
         old_cols_by_name = {c["name"]: c for c in old_cols}
         for new_col in new_cols:
-            cls._try_align(old_cols_by_name.get(new_col["name"]), new_col)
+            cls._try_align(
+                old_cols_by_name.get(new_col["name"]), new_col, approximated_types
+            )
 
         # Align columns by position + type - handles renames.
         for old_col, new_col in zip(old_cols, new_cols):
-            cls._try_align(old_col, new_col)
+            cls._try_align(old_col, new_col, approximated_types)
 
         for old_col in old_cols:
             del old_col["done"]
@@ -383,25 +398,37 @@
         return new_cols
 
     @classmethod
-    def _try_align(cls, old_col, new_col):
+    def _try_align(cls, old_col, new_col, approximated_types=None):
         if old_col is None or new_col is None:
             return False
         if old_col["done"] or new_col["done"]:
             return False
-        if old_col["dataType"] != new_col["dataType"]:
-            return False
-        if old_col["primaryKeyIndex"] != new_col["primaryKeyIndex"]:
+        if old_col.get("primaryKeyIndex") != new_col.get("primaryKeyIndex"):
             return False
+
+        old_type, new_type = old_col["dataType"], new_col["dataType"]
+        if old_type != new_type:
+            if approximated_types and approximated_types.get(old_type) == new_type:
+                # new_col's type was the best approximation we could manage of old_col's type.
+                new_col["dataType"] = old_col["dataType"]
+            else:
+                return False
+
+        # The two columns are similar enough that we can align their IDs.
+
         new_col["id"] = old_col["id"]
 
-        # We can't roundtrip size properly for GPKG primary keys since they have to be of type INTEGER.
-        if (
-            old_col["primaryKeyIndex"] is not None
-            and "size" in old_col
-            and "size" in new_col
-            and old_col["size"] != new_col["size"]
-        ):
-            new_col["size"] = old_col["size"]
+        old_size, new_size = old_col.get("size"), new_col.get("size")
+        if old_size != new_size:
+            data_type = old_type
+            approx_info = approximated_types and approximated_types.get(data_type)
+
+            if old_col.get("primaryKeyIndex") is not None and new_size == 64:
+                # We can't roundtrip size properly for GPKG primary keys since they have to be of type INTEGER.
+                new_col["size"] = old_col.get("size")
+            elif approx_info and approx_info.get(old_size) == (data_type, new_size):
+                # new_col's size was the best approximation we could manage of old_col's size.
+                new_col["size"] = old_col.get("size")
 
         old_col["done"] = True
         new_col["done"] = True
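Putting _try_align together with the gpkg_adapter tables, a hedged sketch of the restore path (hypothetical column ids; it assumes the folded body of align_to_self forwards approximated_types to align_schema_cols, and that the folded lines also strip the temporary "done" markers from new_cols):

from sno.schema import Schema

old = Schema.from_column_dicts([{"id": "t1", "name": "start", "dataType": "time"}])
# Roundtripped through GPKG, "time" comes back as "text" with a fresh id.
new = Schema.from_column_dicts([{"id": "xyz", "name": "start", "dataType": "text"}])

aligned = old.align_to_self(new, approximated_types={"time": "text"})
# The id is re-aligned and the approximated type is restored.
assert aligned.to_column_dicts() == [{"id": "t1", "name": "start", "dataType": "time"}]
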
@@ -453,18 +480,20 @@ def diff_types(self, new_schema):
     def diff_type_counts(self, new_schema):
         return {k: len(v) for k, v in self.diff_types(new_schema).items()}
 
-    EQUIVALENT_PYTHON_TYPES = {
+    # These are the types that are stored in datasets.
+    # Different types might be returned from the working copy DB driver, in which case, they must be adapted.
+    EQUIVALENT_MSGPACK_TYPES = {
         "boolean": (bool,),
         "blob": (bytes,),
-        "date": (str,),
+        "date": (str,),  # might be datetime.date from DB
         "float": (float, int),
         "geometry": (Geometry,),
         "integer": (int,),
-        "interval": (str,),
-        "numeric": (str,),
+        "interval": (str,),  # might be datetime.timedelta from DB
+        "numeric": (str,),  # might be decimal.Decimal from DB
         "text": (str,),
-        "time": (str,),
-        "timestamp": (str,),
+        "time": (str,),  # might be datetime.time from DB
+        "timestamp": (str,),  # might be datetime.datetime from DB
     }
 
     def validate_feature(self, feature, col_violations=None):
@@ -504,7 +533,7 @@ def _find_column_violation(self, col, feature):
             return None
 
         col_type = col.data_type
-        if type(value) not in self.EQUIVALENT_PYTHON_TYPES[col_type]:
+        if type(value) not in self.EQUIVALENT_MSGPACK_TYPES[col_type]:
             return f"In column '{col.name}' value {repr(value)} doesn't match schema type {col_type}"
         elif col_type == "integer" and "size" in col.extra_type_info:
             size = col.extra_type_info["size"]

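The table's inline comments are the point of the rename: a working-copy driver may hand back rich Python types, but the dataset stores the msgpack-friendly ones, so validation compares against the stored types. A small self-contained sketch (table excerpted from above):

import datetime

EQUIVALENT_MSGPACK_TYPES = {"timestamp": (str,)}  # excerpt of the table above

value = datetime.datetime(2020, 11, 13, 9, 30)
assert type(value) not in EQUIVALENT_MSGPACK_TYPES["timestamp"]  # needs adapting
assert type(value.isoformat()) in EQUIVALENT_MSGPACK_TYPES["timestamp"]  # ISO text matches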