Support wildcards and meta items in diff/commit filters #532

Merged 4 commits on Jan 12, 2022
Changes from 3 commits
2 changes: 1 addition & 1 deletion kart/diff_util.py
@@ -19,7 +19,7 @@ def get_all_ds_paths(base_rs, target_rs, repo_key_filter=RepoKeyFilter.MATCH_ALL
all_ds_paths = base_ds_paths | target_ds_paths

if not repo_key_filter.match_all:
all_ds_paths = all_ds_paths & repo_key_filter.keys()
all_ds_paths = repo_key_filter.filter_keys(all_ds_paths)

return sorted(list(all_ds_paths))
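
For context: previously the candidate paths were intersected with the filter's literal keys, so a wildcard filter key could never match anything. The new filter_keys() also consults glob patterns. A minimal standalone sketch of that behaviour, using fnmatch directly rather than kart's RepoKeyFilter (the paths are made up):

import fnmatch

def filter_keys(keys, literal_keys, glob_patterns):
    # Exact matches first, then anything matching a glob pattern.
    matched = keys & literal_keys
    matched |= {
        k for k in keys - matched
        if any(fnmatch.fnmatch(k, g) for g in glob_patterns)
    }
    return matched

paths = {"roads/highways", "roads/bridges", "hydro/rivers"}
print(filter_keys(paths, {"hydro/rivers"}, {"roads/*"}))
# all three paths match: 'hydro/rivers' literally, the other two via 'roads/*'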

145 changes: 112 additions & 33 deletions kart/key_filters.py
@@ -1,3 +1,4 @@
import fnmatch
import re

import click
@@ -105,51 +106,129 @@ class RepoKeyFilter(KeyFilterDict):
child_type = DatasetKeyFilter
child_that_matches_all = DatasetKeyFilter(match_all=True)

_ENTIRE_DATASET_PATTERN = re.compile(r"^[^:]+$")
_SINGLE_FEATURE_PATTERN = re.compile(
r"^(?P<dataset>[^:]+):(feature:)?(?P<pk>[^:]+)$"
# https://github.com/koordinates/kart/blob/master/docs/DATASETS_v3.md#valid-dataset-names
# note: we allow '*' here; it's not a valid dataset name character but it's used in the filter
# pattern.
FILTER_PATTERN = re.compile(
# dataset part
r'^(?P<dataset_glob>[^:<>"|?\x00-\x1f]+)'
# optional sub-dataset part; if a PK is given without ':feature', we assume feature anyway
# (i.e. 'datasetname:123' is equivalent to 'datasetname:feature:123')
r"(?::(?P<subdataset>feature|meta))?"
# The rest of the pattern is either a meta key or a PK
r"(?::(?P<rest>.*))?"
)

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self._dataset_glob_filters = {}

def _bad_pattern(self, user_pattern):
return click.UsageError(
f"Invalid filter format, should be '<dataset>' or '<dataset>:<primary_key>': got {user_pattern!r}"
)

def _parse_user_pattern(self, user_pattern):
match = self.FILTER_PATTERN.match(user_pattern)
if not match:
raise self._bad_pattern(user_pattern)
groups = match.groupdict()
dataset_glob = groups["dataset_glob"]
if (
dataset_glob.startswith(("/", "."))
or dataset_glob.endswith(("/", "."))
or "./" in dataset_glob
or "/." in dataset_glob
):
raise self._bad_pattern(user_pattern)

subdataset = groups["subdataset"]
if not subdataset:
if groups["rest"]:
subdataset = "feature"

return groups["dataset_glob"], subdataset, groups["rest"] or None

@classmethod
def build_from_user_patterns(cls, user_patterns, implicit_meta=True):
def build_from_user_patterns(cls, user_patterns):
"""
Given a list of strings like ["datasetA:1", "datasetA:2", "datasetB"],
builds a RepoKeyFilter with the appropriate entries for "datasetA" and "datasetB".
If no patterns are specified, returns RepoKeyFilter.MATCH_ALL.
If implicit_meta is True, then all meta changes are matched as soon as any feature changes are requested.
"""
result = cls()
for user_pattern in user_patterns:
result.add_user_pattern(user_pattern, implicit_meta=implicit_meta)
result.add_user_pattern(user_pattern)
return result if result else cls.MATCH_ALL

def add_user_pattern(self, user_pattern, implicit_meta=True):
for p in (self._ENTIRE_DATASET_PATTERN, self._SINGLE_FEATURE_PATTERN):
match = p.match(user_pattern)
if match:
break
else:
raise click.UsageError(
f"Invalid filter format, should be <dataset> or <dataset>:<primary_key> - {user_pattern}"
)

if p is self._ENTIRE_DATASET_PATTERN:
ds_path = user_pattern
self[ds_path] = DatasetKeyFilter.MATCH_ALL

if p is self._SINGLE_FEATURE_PATTERN:
ds_path = match.group("dataset")
pk = match.group("pk")

ds_filter = self.get(ds_path)
if not ds_filter:
ds_filter = DatasetKeyFilter()
if implicit_meta:
ds_filter["meta"] = UserStringKeyFilter.MATCH_ALL
ds_filter["feature"] = UserStringKeyFilter()
self[ds_path] = ds_filter

ds_filter["feature"].add(pk)
def add_user_pattern(self, user_pattern):
dataset_glob, subdataset, rest = self._parse_user_pattern(user_pattern)

if subdataset is None:
# whole dataset
self[dataset_glob] = DatasetKeyFilter.MATCH_ALL
return
# Either a meta or feature filter
ds_filter = self.get(dataset_glob)
if not ds_filter:
ds_filter = DatasetKeyFilter()
if rest:
# Specific feature or meta item
ds_filter[subdataset] = UserStringKeyFilter()
else:
# All features, or all meta items
ds_filter[subdataset] = UserStringKeyFilter.MATCH_ALL
self[dataset_glob] = ds_filter
ds_filter[subdataset].add(rest)

def _dataset_glob_pattern_matching_key(self, key):
if self._dataset_glob_filters:
for glob_pattern in self._dataset_glob_filters.keys():
if fnmatch.fnmatch(key, glob_pattern):
return glob_pattern
return False

def filter_keys(self, keys: set):
matched_keys = keys & self.keys()
matched_keys.update(
{
k
for k in keys - matched_keys
if self._dataset_glob_pattern_matching_key(k)
}
)
return matched_keys

def __contains__(self, key):
return super().__contains__(key) or self._dataset_glob_pattern_matching_key(key)

def __getitem__(self, key):
if self.match_all:
return self.child_that_matches_all
try:
return super().__getitem__(key)
except KeyError:
glob_pattern = self._dataset_glob_pattern_matching_key(key)
if not glob_pattern:
raise
return self._dataset_glob_filters[glob_pattern]

def get(self, key, default_value=None):
try:
return self[key]
except KeyError:
return default_value

def __setitem__(self, key, value):
if self.match_all:
return
if "*" in key:
# escape the glob for passing to fnmatch later.
# This is because fnmatch actually processes '*?[]' chars specially, but we only want to support '*' for now.
for char in "?[]":
key = key.replace(char, f"[{char}]")
self._dataset_glob_filters[key] = value
super().__setitem__(key, value)


RepoKeyFilter.MATCH_ALL = RepoKeyFilter(match_all=True)
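
A standalone sketch of how the new FILTER_PATTERN decomposes user patterns (the regex is copied from the diff above; the example inputs are made up):

import re

FILTER_PATTERN = re.compile(
    r'^(?P<dataset_glob>[^:<>"|?\x00-\x1f]+)'
    r"(?::(?P<subdataset>feature|meta))?"
    r"(?::(?P<rest>.*))?"
)

for pattern in ("datasetA", "datasetA:123", "datasetA:feature:123", "data*:meta:title"):
    print(pattern, FILTER_PATTERN.match(pattern).groupdict())

# datasetA             {'dataset_glob': 'datasetA', 'subdataset': None, 'rest': None}
# datasetA:123         {'dataset_glob': 'datasetA', 'subdataset': None, 'rest': '123'}  ('feature' is assumed)
# datasetA:feature:123 {'dataset_glob': 'datasetA', 'subdataset': 'feature', 'rest': '123'}
# data*:meta:title     {'dataset_glob': 'data*', 'subdataset': 'meta', 'rest': 'title'}

When a stored key contains '*', __setitem__ also records it in _dataset_glob_filters, bracket-escaping '?', '[' and ']' so that only '*' keeps its wildcard meaning once the key reaches fnmatch.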
61 changes: 41 additions & 20 deletions kart/log.py
@@ -10,7 +10,7 @@

from .cli_util import tool_environment
from .exec import execvp
from .exceptions import SubprocessError
from .exceptions import NotYetImplemented, SubprocessError
from .key_filters import RepoKeyFilter
from .output_util import dump_json_output
from .repo import KartRepoState
@@ -205,30 +205,51 @@ def convert_user_patterns_to_raw_paths(paths, repo, commits):
finds the path encoding for the dataset they apply to and converts them to the feature's path, eg:
path/to/dataset/.table-dataset/feature/F/9/o/6/kc4F9o6L
"""
repo_filter = RepoKeyFilter.build_from_user_patterns(paths, implicit_meta=False)
DATASET_DIRNAME = repo.dataset_class.DATASET_DIRNAME
# Specially handle raw paths, because we can and it's nice for Kart developers
result = [p for p in paths if f"/{DATASET_DIRNAME}/" in p]
normal_paths = [p for p in paths if f"/{DATASET_DIRNAME}/" not in p]
repo_filter = RepoKeyFilter.build_from_user_patterns(normal_paths)
if repo_filter.match_all:
return []
result = []
return result
for ds_path, ds_filter in repo_filter.items():
if ds_filter.match_all:
result.append(ds_path)
continue
ds = find_dataset(ds_path, repo, commits)
if not ds:
result.append(ds_path)
continue
for ds_part, part_filter in ds_filter.items():
if part_filter.match_all:
result.append(f"{ds.inner_path}/{ds_part}")
continue

for item_key in part_filter:
if ds_part == "feature":
result.append(
ds.encode_pks_to_path(ds.schema.sanitise_pks(item_key))
)
else:
result.append(f"{ds.inner_path}/{ds_part}/{item_key}")

for char in "?[]":
# git pathspecs actually treat '*?[]' specially but we only want to support '*' for now
ds_path = ds_path.replace(char, f"[{char}]")

# NOTE: git's interpretation of '*' is pretty loose.
# It matches all characters in a path *including slashes*, so '*abc' will match 'foo/bar/abc'
# This is pretty much what we want though 👍
if ds_filter.match_all:
result.append(f"{ds_path}/*")
else:
for ds_part, part_filter in ds_filter.items():
if part_filter.match_all:
result.append(f"{ds_path}/{DATASET_DIRNAME}/{ds_part}/*")
continue

for item_key in part_filter:
if ds_part == "feature":
if "*" in ds_path:
raise NotYetImplemented(
"`kart log` doesn't currently support filters with both wildcards and feature IDs"
)
else:
ds = find_dataset(ds_path, repo, commits)
if not ds:
result.append(ds_path)
continue
result.append(
ds.encode_pks_to_path(ds.schema.sanitise_pks(item_key))
)
else:
result.append(
f"{ds_path}/{DATASET_DIRNAME}/{ds_part}/{item_key}"
)
return result
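
To make the mapping concrete, here is a rough sketch of the pathspecs the new code emits, with plain stand-ins for the filter objects. DATASET_DIRNAME is assumed to be '.table-dataset' here; feature PKs are omitted since they go through the dataset's own path encoding via encode_pks_to_path:

DATASET_DIRNAME = ".table-dataset"  # assumed value; kart takes this from repo.dataset_class

def pathspecs(ds_path, parts):
    # parts: "ALL" for a whole-dataset filter, else a dict of part -> keys (or "ALL")
    if parts == "ALL":
        return [f"{ds_path}/*"]
    specs = []
    for ds_part, keys in parts.items():
        if keys == "ALL":
            specs.append(f"{ds_path}/{DATASET_DIRNAME}/{ds_part}/*")
        else:
            specs += [f"{ds_path}/{DATASET_DIRNAME}/{ds_part}/{key}" for key in keys]
    return specs

print(pathspecs("road*", "ALL"))                # ['road*/*']
print(pathspecs("roads", {"meta": "ALL"}))      # ['roads/.table-dataset/meta/*']
print(pathspecs("roads", {"meta": ["title"]}))  # ['roads/.table-dataset/meta/title']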


21 changes: 15 additions & 6 deletions kart/rich_base_dataset.py
@@ -14,7 +14,7 @@
NotYetImplemented,
PATCH_DOES_NOT_APPLY,
)
from .key_filters import DatasetKeyFilter, FeatureKeyFilter
from .key_filters import DatasetKeyFilter, FeatureKeyFilter, MetaKeyFilter
from .schema import Schema
from .spatial_filter import SpatialFilter
from .promisor_utils import object_is_promised, fetch_promised_blobs
@@ -173,14 +173,15 @@ def diff(self, other, ds_filter=DatasetKeyFilter.MATCH_ALL, reverse=False):
If reverse is true, generates a diff from other -> self.
"""
ds_diff = DatasetDiff()
ds_diff["meta"] = self.diff_meta(other, reverse=reverse)
meta_filter = ds_filter.get("meta", ds_filter.child_type())
ds_diff["meta"] = self.diff_meta(other, meta_filter, reverse=reverse)
feature_filter = ds_filter.get("feature", ds_filter.child_type())
ds_diff["feature"] = DeltaDiff(
self.diff_feature(other, feature_filter, reverse=reverse)
)
return ds_diff

def diff_meta(self, other, reverse=False):
def diff_meta(self, other, meta_filter=MetaKeyFilter.MATCH_ALL, reverse=False):
"""
Returns a diff from self -> other, but only for meta items.
If reverse is true, generates a diff from other -> self.
@@ -190,8 +191,16 @@ def diff_meta(self, other, reverse=False):
else:
old, new = self, other

meta_old = dict(old.meta_items()) if old else {}
meta_new = dict(new.meta_items()) if new else {}
meta_old = (
{k: v for k, v in old.meta_items().items() if k in meta_filter}
if old
else {}
)
meta_new = (
{k: v for k, v in new.meta_items().items() if k in meta_filter}
if new
else {}
)
return DeltaDiff.diff_dicts(meta_old, meta_new)

_INSERT_UPDATE_DELETE = (
@@ -522,7 +531,7 @@ def fetch_missing_dirty_features(self, working_copy):
"""Fetch all the promised features in this dataset that are marked as dirty in the working copy."""

# Attempting this more than once in a single kart invocation will waste time and have no effect.
if getattr(self, 'fetch_missing_dirty_features_attempted', False):
if getattr(self, "fetch_missing_dirty_features_attempted", False):
return False
self.fetch_missing_dirty_features_attempted = True
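
Returning to the diff_meta() change above: the new meta_filter argument just gates which meta items enter the delta. A toy sketch of the same filtering, with a plain set standing in for MetaKeyFilter:

def filtered_meta(meta_items, meta_filter=None):
    # meta_filter=None stands in for MetaKeyFilter.MATCH_ALL
    if meta_filter is None:
        return dict(meta_items)
    return {k: v for k, v in meta_items.items() if k in meta_filter}

old_meta = {"title": "Roads (old)", "schema.json": "..."}
new_meta = {"title": "Roads (new)", "schema.json": "..."}

# With a filter for just 'title', only the title delta survives:
print(filtered_meta(old_meta, {"title"}))  # {'title': 'Roads (old)'}
print(filtered_meta(new_meta, {"title"}))  # {'title': 'Roads (new)'}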
