Skip to content

Commit

Permalink
Validate features - keep ints in expected range
Browse files Browse the repository at this point in the history
  • Loading branch information
olsen232 committed Nov 5, 2020
1 parent 3db2e18 commit ce7d14e
Show file tree
Hide file tree
Showing 9 changed files with 145 additions and 16 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ _When adding new entries to the changelog, please include issue/PR numbers where
* `apply` can now apply patches to branches other than `HEAD` [#294](https://github.com/koordinates/sno/issues/294)
* Patches that create or delete datasets are now supported in Datasets V2 [#239](https://github.com/koordinates/sno/issues/239)
* `apply`, `commit` and `merge` commands now optimise repositories after committing, to avoid poor repo performance. [#250](https://github.com/koordinates/sno/issues/250)
* `commit` now checks that the diff to be committed matches the schema, and rejects diffs that do not - this is possible in working copy formats that have relatively lax type enforcement [#300](https://github.com/koordinates/sno/pull/300)
* `data ls` now accepts an optional ref argument
* `meta get` now accepts a `--ref=REF` option
* `clone` now accepts a `--branch` option to clone a specific branch.
Expand Down
16 changes: 9 additions & 7 deletions sno/apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ def unjson_feature(geom_column_name, f):
def apply_patch(
*,
repo,
commit,
do_commit,
patch_file,
allow_empty,
allow_missing_old_values=False,
Expand All @@ -108,7 +108,7 @@ def apply_patch(
raise click.FileError("Failed to parse JSON patch file") from e

if ref != "HEAD":
if not commit:
if not do_commit:
raise click.UsageError("--no-commit and --ref are incompatible")
if not ref.startswith("refs/heads/"):
ref = f"refs/heads/{ref}"
Expand All @@ -119,7 +119,7 @@ def apply_patch(

rs = RepositoryStructure.lookup(repo, ref)
wc = WorkingCopy.get(repo)
if not commit and not wc:
if not do_commit and not wc:
# TODO: might it be useful to apply without committing just to *check* if the patch applies?
raise NotFound("--no-commit requires a working copy", exit_code=NO_WORKING_COPY)

Expand All @@ -130,7 +130,9 @@ def apply_patch(
for ds_path, ds_diff_dict in json_diff.items():
dataset = rs.get(ds_path)
meta_change_type = _meta_change_type(ds_diff_dict)
check_change_supported(rs.version, dataset, ds_path, meta_change_type, commit)
check_change_supported(
rs.version, dataset, ds_path, meta_change_type, do_commit
)

meta_changes = ds_diff_dict.get("meta", {})

Expand Down Expand Up @@ -172,7 +174,7 @@ def parse_delta(change):
)
repo_diff.recursive_set([ds_path, "feature"], feature_diff)

if commit:
if do_commit:
try:
metadata = patch["sno.patch/v1"]
except KeyError:
Expand Down Expand Up @@ -225,14 +227,14 @@ def parse_delta(change):
# oid refers to either a commit or tree
wc_target = repo.get(oid)
click.echo(f"Updating {wc.path} ...")
wc.reset(wc_target, track_changes_as_dirty=not commit)
wc.reset(wc_target, track_changes_as_dirty=not do_commit)


@click.command()
@click.pass_context
@click.option(
"--commit/--no-commit",
"commit",
"do_commit",
default=True,
help="Commit changes",
)
Expand Down
1 change: 1 addition & 0 deletions sno/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
INVALID_OPERATION = 20
MERGE_CONFLICT = 21
PATCH_DOES_NOT_APPLY = 22
SCHEMA_VIOLATION = 23

NOT_YET_IMPLEMENTED = 30

Expand Down
2 changes: 1 addition & 1 deletion sno/meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ def meta_set(ctx, message, dataset, items):
patch_file.seek(0)
apply_patch(
repo=ctx.obj.repo,
commit=True,
do_commit=True,
patch_file=patch_file,
allow_empty=False,
allow_missing_old_values=True,
Expand Down
12 changes: 8 additions & 4 deletions sno/rich_base_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -334,7 +334,8 @@ def apply_feature_diff(
if delta.type == "delete" and old_path not in tree:
has_conflicts = True
click.echo(
f"{self.path}: Trying to delete nonexistent feature: {old_key}"
f"{self.path}: Trying to delete nonexistent feature: {old_key}",
err=True,
)
continue

Expand All @@ -345,14 +346,16 @@ def apply_feature_diff(
):
has_conflicts = True
click.echo(
f"{self.path}: Trying to create feature that already exists: {new_key}"
f"{self.path}: Trying to create feature that already exists: {new_key}",
err=True,
)
continue

if delta.type == "update" and old_path not in tree:
has_conflicts = True
click.echo(
f"{self.path}: Trying to update nonexistent feature: {old_key}"
f"{self.path}: Trying to update nonexistent feature: {old_key}",
err=True,
)
continue

Expand All @@ -361,7 +364,8 @@ def apply_feature_diff(
):
has_conflicts = True
click.echo(
f"{self.path}: Trying to update already-changed feature: {old_key}"
f"{self.path}: Trying to update already-changed feature: {old_key}",
err=True,
)
continue

Expand Down
48 changes: 48 additions & 0 deletions sno/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import functools
import uuid

from .geometry import Geometry
from .serialise_util import (
msg_pack,
msg_unpack,
Expand Down Expand Up @@ -451,3 +452,50 @@ def diff_types(self, new_schema):

def diff_type_counts(self, new_schema):
return {k: len(v) for k, v in self.diff_types(new_schema).items()}

EQUIVALENT_PYTHON_TYPES = {
"boolean": (bool,),
"blob": (bytes,),
"date": (str,),
"float": (float, int),
"geometry": (Geometry,),
"integer": (int,),
"interval": (str,),
"numeric": (str,),
"text": (str,),
"time": (str,),
"timestamp": (str,),
}

def validate_feature(self, feature, violation_info):
has_violation = False

for col in self.columns:
if col.name not in feature:
continue
value = feature[col.name]
if value is None:
continue
col_type = col.data_type
if type(value) not in self.EQUIVALENT_PYTHON_TYPES[col_type]:
has_violation = True
if col.name not in violation_info:
message = f"In column {col.name} value {repr(value)} doesn't match schema type {col_type}"
violation_info[col.name] = message
elif col_type == "integer" and "size" in col.extra_type_info:
size = col.extra_type_info["size"]
if self._signed_bit_length(value) > size:
has_violation = True
if col.name not in violation_info:
bounds = 2 ** (size - 1)
message = f"In column {col.name} value {repr(value)} does not fit into an int{size}: {-bounds} to {bounds-1}"
violation_info[col.name] = message
# TODO - more type validation - eg maximum string lengths

return has_violation

def _signed_bit_length(self, integer):
if integer < 0:
return (integer + 1).bit_length() + 1
else:
return integer.bit_length() + 1
50 changes: 50 additions & 0 deletions sno/structure.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,23 @@
import logging
from collections import deque

import click
import pygit2

from . import git_util
from .base_dataset import BaseDataset
from .exceptions import (
InvalidOperation,
NotFound,
NotYetImplemented,
NO_CHANGES,
NO_COMMIT,
PATCH_DOES_NOT_APPLY,
SCHEMA_VIOLATION,
)
from .rich_tree_builder import RichTreeBuilder
from .repository_version import get_repo_version, extra_blobs_for_version
from .schema import Schema


L = logging.getLogger("sno.structure")
Expand Down Expand Up @@ -242,6 +247,49 @@ def create_tree_from_diff(self, repo_diff, *, allow_missing_old_values=False):
L.info(f"Tree sha: {tree.oid}")
return tree.oid

def check_values_match_schema(self, repo_diff):
has_violations = False
violations = {}

for ds_path, ds_diff in repo_diff.items():
ds_violations = {}
violations[ds_path] = ds_violations

schema_delta = ds_diff.recursive_get(["meta", "schema.json"])
if schema_delta:
if self.version < 2:
# This should have been handled already, but just to be safe.
raise NotYetImplemented(
"Meta changes are not supported until datasets V2"
)
elif schema_delta.type == "delete":
new_schema = None
else:
new_schema = Schema.from_column_dicts(schema_delta.new_value)
else:
new_schema = self.get(ds_path).schema

feature_diff = ds_diff.get("feature") or {}
for feature_delta in feature_diff.values():
new_value = feature_delta.new_value
if new_value is None:
continue
if new_schema is None:
raise InvalidOperation(
f"Can't {feature_delta.type} feature {feature_delta.new_key} in deleted dataset {ds_path}",
exit_code=PATCH_DOES_NOT_APPLY,
)
has_violations |= new_schema.validate_feature(new_value, ds_violations)

if has_violations:
for ds_path, ds_violations in violations.items():
for message in ds_violations.values():
click.echo(f"{ds_path}: {message}", err=True)
raise InvalidOperation(
"Schema violation - values do not match schema",
exit_code=SCHEMA_VIOLATION,
)

def commit(
self,
wcdiff,
Expand All @@ -262,6 +310,8 @@ def commit(
`ref` should be a key that works with repo.references, i.e.
either "HEAD" or "refs/heads/{branchname}"
"""
self.check_values_match_schema(wcdiff)

old_tree_oid = self.tree.oid if self.tree is not None else None
new_tree_oid = self.create_tree_from_diff(
wcdiff,
Expand Down
6 changes: 3 additions & 3 deletions tests/test_apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,15 +66,15 @@ def test_apply_twice(data_archive, cli_runner):

assert (
"nz_pa_points_topo_150k: Trying to delete nonexistent feature: 1241"
in r.stdout
in r.stderr
)
assert (
"nz_pa_points_topo_150k: Trying to create feature that already exists: 9999"
in r.stdout
in r.stderr
)
assert (
"nz_pa_points_topo_150k: Trying to update already-changed feature: 1795"
in r.stdout
in r.stderr
)
assert "Patch does not apply" in r.stderr

Expand Down
25 changes: 24 additions & 1 deletion tests/test_commit.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,13 @@
import pygit2

from sno import repo_files
from sno.exceptions import INVALID_ARGUMENT, NO_CHANGES, NO_DATA, NO_REPOSITORY
from sno.exceptions import (
INVALID_ARGUMENT,
NO_CHANGES,
NO_DATA,
NO_REPOSITORY,
SCHEMA_VIOLATION,
)
from sno.repo_files import fallback_editor
from sno.sno_repo import SnoRepo
from sno.structure import RepositoryStructure
Expand Down Expand Up @@ -324,3 +330,20 @@ def test_commit_user_info(tmp_path, cli_runner, chdir, data_working_copy):
assert author.email == "user@example.com"
assert author.time == 1000000000
assert author.offset == 750


def test_commit_invalid_value(cli_runner, data_working_copy, geopackage):
with data_working_copy("points2") as (repo_dir, wc_path):
db = geopackage(wc_path)
with db:
dbcur = db.cursor()
dbcur.execute(
f"UPDATE {H.POINTS.LAYER} SET t50_fid=123456789012 WHERE fid=1;"
)

r = cli_runner.invoke(["commit", "-m", "test"])
assert r.exit_code == SCHEMA_VIOLATION, r.stderr
assert r.stderr.splitlines() == [
"nz_pa_points_topo_150k: In column t50_fid value 123456789012 does not fit into an int32: -2147483648 to 2147483647",
"Error: Schema violation - values do not match schema",
]

0 comments on commit ce7d14e

Please sign in to comment.