Diff: Add wildcard filters and meta filters
craigds committed Jan 11, 2022
1 parent 8c924ab commit 19fbca9
Showing 7 changed files with 242 additions and 42 deletions.
5 changes: 4 additions & 1 deletion kart/base_diff_writer.py
@@ -86,7 +86,10 @@ def __init__(
)

self.user_key_filters = user_key_filters
self.repo_key_filter = RepoKeyFilter.build_from_user_patterns(user_key_filters)
self.repo_key_filter = RepoKeyFilter.build_from_user_patterns(
user_key_filters,
implicit_meta=False,
)

self.spatial_filter = repo.spatial_filter

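
Editor's note: a minimal usage sketch (not part of the commit; the dataset name is invented) of what passing implicit_meta=False means for the diff writer — a feature-only pattern no longer implies a match-all meta filter, so meta items only appear when explicitly requested:

from kart.key_filters import RepoKeyFilter

# "some_dataset:123" selects one feature; with implicit_meta=False the filter
# does not add a match-all "meta" entry, so meta changes stay out of the diff
# unless asked for, e.g. via "some_dataset:meta" or "some_dataset:meta:title".
repo_key_filter = RepoKeyFilter.build_from_user_patterns(
    ["some_dataset:123"], implicit_meta=False
)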
2 changes: 1 addition & 1 deletion kart/diff_util.py
@@ -19,7 +19,7 @@ def get_all_ds_paths(base_rs, target_rs, repo_key_filter=RepoKeyFilter.MATCH_ALL
all_ds_paths = base_ds_paths | target_ds_paths

if not repo_key_filter.match_all:
all_ds_paths = all_ds_paths & repo_key_filter.keys()
all_ds_paths = repo_key_filter.filter_keys(all_ds_paths)

return sorted(list(all_ds_paths))

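
Editor's note: an illustrative sketch of what the new filter_keys() call changes (dataset names invented). A literal key still intersects directly, while a path matching a stored wildcard pattern is now kept as well, which the old set intersection could not do:

from kart.key_filters import RepoKeyFilter

repo_key_filter = RepoKeyFilter.build_from_user_patterns(
    ["second/*"], implicit_meta=False
)
all_ds_paths = {"nz_waca_adjustments", "second/dataset"}

# The old `all_ds_paths & repo_key_filter.keys()` would return an empty set here,
# because "second/*" is not a literal dataset path; filter_keys() also keeps
# keys that fnmatch one of the stored glob patterns.
assert repo_key_filter.filter_keys(all_ds_paths) == {"second/dataset"}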
141 changes: 112 additions & 29 deletions kart/key_filters.py
@@ -1,3 +1,4 @@
import fnmatch
import re

import click
@@ -105,11 +106,49 @@ class RepoKeyFilter(KeyFilterDict):
child_type = DatasetKeyFilter
child_that_matches_all = DatasetKeyFilter(match_all=True)

_ENTIRE_DATASET_PATTERN = re.compile(r"^[^:]+$")
_SINGLE_FEATURE_PATTERN = re.compile(
r"^(?P<dataset>[^:]+):(feature:)?(?P<pk>[^:]+)$"
# https://github.com/koordinates/kart/blob/master/docs/DATASETS_v3.md#valid-dataset-names
# note: we allow '*' here; it's not a valid dataset name character but it's used in the filter
# pattern.
FILTER_PATTERN = re.compile(
# dataset part
r'^(?P<dataset_glob>[^:<>"|?\x00-\x1f]+)'
# optional sub-dataset part. This is optional; if a PK is given and ':feature' isn't, we assume feature anyway.
# (i.e. 'datasetname:123' is equivalent to 'datasetname:feature:123'
r"(?::(?P<subdataset>feature|meta))?"
# The rest of the pattern is either a meta key or a PK
r"(?::(?P<rest>.*))?"
)

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self._dataset_glob_filters = {}

def _bad_pattern(self, user_pattern):
return click.UsageError(
f"Invalid filter format, should be '<dataset>' or '<dataset>:<primary_key>': got {user_pattern!r}"
)

def _parse_user_pattern(self, user_pattern):
match = self.FILTER_PATTERN.match(user_pattern)
if not match:
raise self._bad_pattern(user_pattern)
groups = match.groupdict()
dataset_glob = groups["dataset_glob"]
if (
dataset_glob.startswith(("/", "."))
or dataset_glob.endswith(("/", "."))
or "./" in dataset_glob
or "/." in dataset_glob
):
raise self._bad_pattern(user_pattern)

subdataset = groups["subdataset"]
if not subdataset:
if groups["rest"]:
subdataset = "feature"

return groups["dataset_glob"], subdataset, groups["rest"] or None

@classmethod
def build_from_user_patterns(cls, user_patterns, implicit_meta=True):
"""
@@ -124,32 +163,76 @@ def build_from_user_patterns(cls, user_patterns, implicit_meta=True):
return result if result else cls.MATCH_ALL

def add_user_pattern(self, user_pattern, implicit_meta=True):
for p in (self._ENTIRE_DATASET_PATTERN, self._SINGLE_FEATURE_PATTERN):
match = p.match(user_pattern)
if match:
break
else:
raise click.UsageError(
f"Invalid filter format, should be <dataset> or <dataset>:<primary_key> - {user_pattern}"
)

if p is self._ENTIRE_DATASET_PATTERN:
ds_path = user_pattern
self[ds_path] = DatasetKeyFilter.MATCH_ALL

if p is self._SINGLE_FEATURE_PATTERN:
ds_path = match.group("dataset")
pk = match.group("pk")

ds_filter = self.get(ds_path)
if not ds_filter:
ds_filter = DatasetKeyFilter()
if implicit_meta:
ds_filter["meta"] = UserStringKeyFilter.MATCH_ALL
ds_filter["feature"] = UserStringKeyFilter()
self[ds_path] = ds_filter

ds_filter["feature"].add(pk)
dataset_glob, subdataset, rest = self._parse_user_pattern(user_pattern)

if subdataset is None:
# whole dataset
self[dataset_glob] = DatasetKeyFilter.MATCH_ALL
return
# meta or feature filter
ds_filter = self.get(dataset_glob)
if not ds_filter:
ds_filter = DatasetKeyFilter()
# TODO: remove this? what is it for?
# if implicit_meta:
# ds_filter["meta"] = UserStringKeyFilter.MATCH_ALL
if rest:
# Specific feature or specific meta item
ds_filter[subdataset] = UserStringKeyFilter()
else:
# All features, or all meta items
ds_filter[subdataset] = UserStringKeyFilter.MATCH_ALL
self[dataset_glob] = ds_filter
ds_filter[subdataset].add(rest)

def _dataset_glob_pattern_matching_key(self, key):
if self._dataset_glob_filters:
for glob_pattern in self._dataset_glob_filters.keys():
if fnmatch.fnmatch(key, glob_pattern):
return glob_pattern
return False

def filter_keys(self, keys: set):
matched_keys = keys & self.keys()
matched_keys.update(
{
k
for k in keys - matched_keys
if self._dataset_glob_pattern_matching_key(k)
}
)
return matched_keys

def __contains__(self, key):
return super().__contains__(key) or self._dataset_glob_pattern_matching_key(key)

def __getitem__(self, key):
if self.match_all:
return self.child_that_matches_all
try:
return super().__getitem__(key)
except KeyError:
glob_pattern = self._dataset_glob_pattern_matching_key(key)
if not glob_pattern:
raise
return self._dataset_glob_filters[glob_pattern]

def get(self, key, default_value=None):
try:
return self[key]
except KeyError:
return default_value

def __setitem__(self, key, value):
if self.match_all:
return
if "*" in key:
# escape the glob for passing to fnmatch later.
# This is because fnmatch actually processes '*?[]' chars specially, but we only want to support '*' for now.
for char in "?[]":
key = key.replace(char, f"[{char}]")
self._dataset_glob_filters[key] = value
super().__setitem__(key, value)


RepoKeyFilter.MATCH_ALL = RepoKeyFilter(match_all=True)
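
Editor's note: to summarise the grammar this file now accepts, an illustrative sketch (dataset names are examples, not from the commit). Whole datasets, single features, single meta items and glob patterns all go through the same FILTER_PATTERN / add_user_pattern path, and wildcard keys are resolved via fnmatch in __contains__ and __getitem__:

from kart.key_filters import RepoKeyFilter

f = RepoKeyFilter.build_from_user_patterns(
    [
        "roads",               # whole dataset
        "buildings:123",       # one feature ("buildings:feature:123" is equivalent)
        "parcels:meta:title",  # one meta item
        "second/*:meta",       # all meta items of datasets matching the glob
    ],
    implicit_meta=False,
)

# Literal keys resolve as before; keys that only match a stored "*" pattern
# fall back to the glob filters kept in _dataset_glob_filters.
assert "buildings" in f and "second/dataset" in f
assert f["second/dataset"]["meta"].match_all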
10 changes: 7 additions & 3 deletions kart/log.py
@@ -205,10 +205,14 @@ def convert_user_patterns_to_raw_paths(paths, repo, commits):
finds the path encoding for the dataset they apply to and converts them to the feature's path, eg:
path/to/dataset/.table-dataset/feature/F/9/o/6/kc4F9o6L
"""
repo_filter = RepoKeyFilter.build_from_user_patterns(paths, implicit_meta=False)
# Specially handle raw paths, because we can and it's nice for Kart developers
result = [p for p in paths if "/.table-dataset/" in p]
normal_paths = [p for p in paths if "/.table-dataset/" not in p]
repo_filter = RepoKeyFilter.build_from_user_patterns(
normal_paths, implicit_meta=False
)
if repo_filter.match_all:
return []
result = []
return result
for ds_path, ds_filter in repo_filter.items():
if ds_filter.match_all:
result.append(ds_path)
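
Editor's note: a standalone sketch of the split applied above (the raw path is copied from the tests below). Anything containing "/.table-dataset/" is treated as an already-encoded path and passed through untouched; only the remaining patterns are parsed into a RepoKeyFilter:

paths = [
    "nz_pa_points_topo_150k/.table-dataset/feature/A/A/A/A/kQ0=",  # raw path, passed through
    "nz_pa_points_topo_150k:1182",  # user pattern, converted via the filter machinery
]
result = [p for p in paths if "/.table-dataset/" in p]
normal_paths = [p for p in paths if "/.table-dataset/" not in p]
assert result == ["nz_pa_points_topo_150k/.table-dataset/feature/A/A/A/A/kQ0="]
assert normal_paths == ["nz_pa_points_topo_150k:1182"]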
21 changes: 15 additions & 6 deletions kart/rich_base_dataset.py
@@ -14,7 +14,7 @@
NotYetImplemented,
PATCH_DOES_NOT_APPLY,
)
from .key_filters import DatasetKeyFilter, FeatureKeyFilter
from .key_filters import DatasetKeyFilter, FeatureKeyFilter, MetaKeyFilter
from .schema import Schema
from .spatial_filter import SpatialFilter
from .promisor_utils import object_is_promised, fetch_promised_blobs
@@ -173,14 +173,15 @@ def diff(self, other, ds_filter=DatasetKeyFilter.MATCH_ALL, reverse=False):
If reverse is true, generates a diff from other -> self.
"""
ds_diff = DatasetDiff()
ds_diff["meta"] = self.diff_meta(other, reverse=reverse)
meta_filter = ds_filter.get("meta", ds_filter.child_type())
ds_diff["meta"] = self.diff_meta(other, meta_filter, reverse=reverse)
feature_filter = ds_filter.get("feature", ds_filter.child_type())
ds_diff["feature"] = DeltaDiff(
self.diff_feature(other, feature_filter, reverse=reverse)
)
return ds_diff

def diff_meta(self, other, reverse=False):
def diff_meta(self, other, meta_filter=MetaKeyFilter.MATCH_ALL, reverse=False):
"""
Returns a diff from self -> other, but only for meta items.
If reverse is true, generates a diff from other -> self.
@@ -190,8 +191,16 @@ def diff_meta(self, other, reverse=False):
else:
old, new = self, other

meta_old = dict(old.meta_items()) if old else {}
meta_new = dict(new.meta_items()) if new else {}
meta_old = (
{k: v for k, v in old.meta_items().items() if k in meta_filter}
if old
else {}
)
meta_new = (
{k: v for k, v in new.meta_items().items() if k in meta_filter}
if new
else {}
)
return DeltaDiff.diff_dicts(meta_old, meta_new)

_INSERT_UPDATE_DELETE = (
@@ -522,7 +531,7 @@ def fetch_missing_dirty_features(self, working_copy):
"""Fetch all the promised features in this dataset that are marked as dirty in the working copy."""

# Attempting this more than once in a single kart invocation will waste time and have no effect.
if getattr(self, 'fetch_missing_dirty_features_attempted', False):
if getattr(self, "fetch_missing_dirty_features_attempted", False):
return False
self.fetch_missing_dirty_features_attempted = True

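
Editor's note: a rough sketch of the meta filtering added to diff_meta() (meta item values are invented). Only meta keys present in the supplied filter survive into the dict diff:

from kart.key_filters import RepoKeyFilter

repo_filter = RepoKeyFilter.build_from_user_patterns(
    ["some_dataset:meta:title"], implicit_meta=False
)
meta_filter = repo_filter["some_dataset"]["meta"]

meta_items = {"title": "Some title", "schema.json": "{...}"}
filtered = {k: v for k, v in meta_items.items() if k in meta_filter}
assert filtered == {"title": "Some title"}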
103 changes: 102 additions & 1 deletion tests/test_diff.py
@@ -1514,7 +1514,7 @@ def test_show_points_HEAD(output_format, data_archive_readonly, cli_runner):
}


def test_diff_filtered(data_archive_readonly, cli_runner):
def test_diff_filtered_text(data_archive_readonly, cli_runner):
with data_archive_readonly("points"):
r = cli_runner.invoke(["diff", "HEAD^...", "nz_pa_points_topo_150k:1182"])
assert r.exit_code == 0, r.stderr
@@ -1530,6 +1530,107 @@ def test_diff_filtered(data_archive_readonly, cli_runner):
]


def test_diff_wildcard_dataset_filters(data_archive, cli_runner):
with data_archive("polygons") as repo_path:
# Add another dataset at "second/dataset"
with data_archive("gpkg-polygons") as src:
src_gpkg_path = src / "nz-waca-adjustments.gpkg"
r = cli_runner.invoke(
[
"-C",
repo_path,
"import",
src_gpkg_path,
"nz_waca_adjustments:second/dataset",
]
)
assert r.exit_code == 0, r.stderr

# Find all meta changes for datasets matching a filter
r = cli_runner.invoke(["diff", "HEAD^^?...", "second/*:meta", "-o", "json"])
assert r.exit_code == 0, r.stderr
diff = json.loads(r.stdout)["kart.diff/v1+hexwkb"]
assert diff.keys() == {"second/dataset"}
assert diff["second/dataset"].keys() == {"meta"}
assert diff["second/dataset"]["meta"].keys() == {
"title",
"metadata.xml",
"crs/EPSG:4167.wkt",
"schema.json",
"description",
}

# Find title changes for all datasets
r = cli_runner.invoke(["diff", "HEAD^^?...", "*:meta:title", "-o", "json"])
assert r.exit_code == 0, r.stderr
assert json.loads(r.stdout) == {
"kart.diff/v1+hexwkb": {
"nz_waca_adjustments": {
"meta": {"title": {"+": "NZ WACA Adjustments"}}
},
"second/dataset": {"meta": {"title": {"+": "NZ WACA Adjustments"}}},
}
}

# Wildcard dataset filter for specific feature ID in all datasets.
# This mostly exists for consistency with the meta ones shown above, but might be useful for ... something?
r = cli_runner.invoke(["diff", "HEAD^^?...", "*:feature:4408145", "-o", "json"])
assert r.exit_code == 0, r.stderr
assert json.loads(r.stdout) == {
"kart.diff/v1+hexwkb": {
"nz_waca_adjustments": {
"feature": [
{
"+": {
"id": 4408145,
"geom": "0106000000010000000103000000010000000A000000D7D232C2528C6540224B1CB992CC45C035AC93FE308C6540AED61AE518CC45C077BF65A9308C65400CE8853B17CC45C0F188658E208C654079F5E0A49FCB45C03C3B141A248C654006E470019FCB45C0896DFC19278C6540671929E5A3CB45C0DF597160288C654000A7080BA6CB45C064B3C319648C65408C0F44B114CC45C0C885FE1E988C6540B64D609F81CC45C0D7D232C2528C6540224B1CB992CC45C0",
"date_adjusted": "2016-12-15T12:37:17",
"survey_reference": None,
"adjusted_nodes": 891,
}
}
]
},
"second/dataset": {
"feature": [
{
"+": {
"id": 4408145,
"geom": "0106000000010000000103000000010000000A000000D7D232C2528C6540224B1CB992CC45C035AC93FE308C6540AED61AE518CC45C077BF65A9308C65400CE8853B17CC45C0F188658E208C654079F5E0A49FCB45C03C3B141A248C654006E470019FCB45C0896DFC19278C6540671929E5A3CB45C0DF597160288C654000A7080BA6CB45C064B3C319648C65408C0F44B114CC45C0C885FE1E988C6540B64D609F81CC45C0D7D232C2528C6540224B1CB992CC45C0",
"date_adjusted": "2016-12-15T12:37:17",
"survey_reference": None,
"adjusted_nodes": 891,
}
}
]
},
}
}

# Filter for features in datasets whose name matches a pattern
r = cli_runner.invoke(
["diff", "HEAD^^?...", "*/dataset:feature:4408145", "-o", "json"]
)
assert r.exit_code == 0, r.stderr
assert json.loads(r.stdout) == {
"kart.diff/v1+hexwkb": {
"second/dataset": {
"feature": [
{
"+": {
"id": 4408145,
"geom": "0106000000010000000103000000010000000A000000D7D232C2528C6540224B1CB992CC45C035AC93FE308C6540AED61AE518CC45C077BF65A9308C65400CE8853B17CC45C0F188658E208C654079F5E0A49FCB45C03C3B141A248C654006E470019FCB45C0896DFC19278C6540671929E5A3CB45C0DF597160288C654000A7080BA6CB45C064B3C319648C65408C0F44B114CC45C0C885FE1E988C6540B64D609F81CC45C0D7D232C2528C6540224B1CB992CC45C0",
"date_adjusted": "2016-12-15T12:37:17",
"survey_reference": None,
"adjusted_nodes": 891,
}
}
]
}
}
}


@pytest.mark.parametrize("output_format", SHOW_OUTPUT_FORMATS)
def test_show_polygons_initial(output_format, data_archive_readonly, cli_runner):
"""Make sure we can show the initial commit"""
2 changes: 1 addition & 1 deletion tests/test_log.py
@@ -288,7 +288,7 @@ def num_commits(r):
)
assert r.exit_code == 0, r.stderr
assert num_commits(r) == 1
# Path syntax still works:
# Raw path syntax still works:
PK_1_PATH = "nz_pa_points_topo_150k/.table-dataset/feature/A/A/A/A/kQ0="
r = cli_runner.invoke(["log", "-o", output_format, "--", PK_1_PATH])
assert r.exit_code == 0, r.stderr