diff --git a/kart/base_diff_writer.py b/kart/base_diff_writer.py
index 1c5b76f86..03ad45569 100644
--- a/kart/base_diff_writer.py
+++ b/kart/base_diff_writer.py
@@ -86,7 +86,10 @@ def __init__(
         )

         self.user_key_filters = user_key_filters
-        self.repo_key_filter = RepoKeyFilter.build_from_user_patterns(user_key_filters)
+        self.repo_key_filter = RepoKeyFilter.build_from_user_patterns(
+            user_key_filters,
+            implicit_meta=False,
+        )

         self.spatial_filter = repo.spatial_filter

diff --git a/kart/diff_util.py b/kart/diff_util.py
index 7f450f496..28f9710f6 100644
--- a/kart/diff_util.py
+++ b/kart/diff_util.py
@@ -19,7 +19,7 @@ def get_all_ds_paths(base_rs, target_rs, repo_key_filter=RepoKeyFilter.MATCH_ALL
     all_ds_paths = base_ds_paths | target_ds_paths

     if not repo_key_filter.match_all:
-        all_ds_paths = all_ds_paths & repo_key_filter.keys()
+        all_ds_paths = repo_key_filter.filter_keys(all_ds_paths)

     return sorted(list(all_ds_paths))

diff --git a/kart/key_filters.py b/kart/key_filters.py
index aead01a4c..e51014358 100644
--- a/kart/key_filters.py
+++ b/kart/key_filters.py
@@ -1,3 +1,4 @@
+import fnmatch
 import re

 import click
@@ -105,11 +106,49 @@ class RepoKeyFilter(KeyFilterDict):
     child_type = DatasetKeyFilter
     child_that_matches_all = DatasetKeyFilter(match_all=True)

-    _ENTIRE_DATASET_PATTERN = re.compile(r"^[^:]+$")
-    _SINGLE_FEATURE_PATTERN = re.compile(
-        r"^(?P<dataset>[^:]+):(feature:)?(?P<pk>[^:]+)$"
+    # https://github.com/koordinates/kart/blob/master/docs/DATASETS_v3.md#valid-dataset-names
+    # note: we allow '*' here; it's not a valid dataset-name character, but it is used in filter patterns.
+    FILTER_PATTERN = re.compile(
+        # dataset part
+        r'^(?P<dataset_glob>[^:<>"|?\x00-\x1f]+)'
+        # optional sub-dataset part; if a PK is given and ':feature' isn't, we assume feature anyway
+        # (i.e. 'datasetname:123' is equivalent to 'datasetname:feature:123')
+        r"(?::(?P<subdataset>feature|meta))?"
+        # The rest of the pattern is either a meta key or a PK.
+        r"(?::(?P<rest>.*))?"
     )

+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._dataset_glob_filters = {}
+
+    def _bad_pattern(self, user_pattern):
+        return click.UsageError(
+            f"Invalid filter format, should be '<dataset>' or '<dataset>:<primary-key>': got {user_pattern!r}"
+        )
+
+    def _parse_user_pattern(self, user_pattern):
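+        """
+        Splits a user-supplied pattern into (dataset_glob, subdataset, rest), e.g.:
+            'mylayer'            -> ('mylayer', None, None)
+            'mylayer:123'        -> ('mylayer', 'feature', '123')
+            'mylayer:meta:title' -> ('mylayer', 'meta', 'title')
+            'second/*:meta'      -> ('second/*', 'meta', None)
+        """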
+        match = self.FILTER_PATTERN.match(user_pattern)
+        if not match:
+            raise self._bad_pattern(user_pattern)
+        groups = match.groupdict()
+        dataset_glob = groups["dataset_glob"]
+        if (
+            dataset_glob.startswith(("/", "."))
+            or dataset_glob.endswith(("/", "."))
+            or "./" in dataset_glob
+            or "/." in dataset_glob
+        ):
+            raise self._bad_pattern(user_pattern)
+
+        subdataset = groups["subdataset"]
+        if not subdataset:
+            if groups["rest"]:
+                subdataset = "feature"
+
+        return groups["dataset_glob"], subdataset, groups["rest"] or None
+
     @classmethod
     def build_from_user_patterns(cls, user_patterns, implicit_meta=True):
         """
@@ -124,32 +163,76 @@ def build_from_user_patterns(cls, user_patterns, implicit_meta=True):
         return result if result else cls.MATCH_ALL

     def add_user_pattern(self, user_pattern, implicit_meta=True):
-        for p in (self._ENTIRE_DATASET_PATTERN, self._SINGLE_FEATURE_PATTERN):
-            match = p.match(user_pattern)
-            if match:
-                break
-        else:
-            raise click.UsageError(
-                f"Invalid filter format, should be <dataset> or <dataset>:<primary-key> - {user_pattern}"
-            )
-
-        if p is self._ENTIRE_DATASET_PATTERN:
-            ds_path = user_pattern
-            self[ds_path] = DatasetKeyFilter.MATCH_ALL
-
-        if p is self._SINGLE_FEATURE_PATTERN:
-            ds_path = match.group("dataset")
-            pk = match.group("pk")
-
-            ds_filter = self.get(ds_path)
-            if not ds_filter:
-                ds_filter = DatasetKeyFilter()
-                if implicit_meta:
-                    ds_filter["meta"] = UserStringKeyFilter.MATCH_ALL
-                ds_filter["feature"] = UserStringKeyFilter()
-                self[ds_path] = ds_filter
-
-            ds_filter["feature"].add(pk)
+        dataset_glob, subdataset, rest = self._parse_user_pattern(user_pattern)
+
+        if subdataset is None:
+            # whole dataset
+            self[dataset_glob] = DatasetKeyFilter.MATCH_ALL
+            return
+
+        # meta or feature filter
+        ds_filter = self.get(dataset_glob)
+        if not ds_filter:
+            ds_filter = DatasetKeyFilter()
+            # TODO: remove this? what is it for?
+            # if implicit_meta:
+            #     ds_filter["meta"] = UserStringKeyFilter.MATCH_ALL
+            if rest:
+                # Specific feature or specific meta item
+                ds_filter[subdataset] = UserStringKeyFilter()
+            else:
+                # All features, or all meta items
+                ds_filter[subdataset] = UserStringKeyFilter.MATCH_ALL
+            self[dataset_glob] = ds_filter
+        ds_filter[subdataset].add(rest)
+
+    def _dataset_glob_pattern_matching_key(self, key):
+        if self._dataset_glob_filters:
+            for glob_pattern in self._dataset_glob_filters.keys():
+                if fnmatch.fnmatch(key, glob_pattern):
+                    return glob_pattern
+        return False
+
+    def filter_keys(self, keys: set):
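+        """
+        Returns the subset of `keys` that this filter matches, either exactly or
+        via a glob: e.g. a filter built from 'second/*:meta' gives
+        filter_keys({'first/ds', 'second/ds'}) == {'second/ds'}.
+        """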
+        matched_keys = keys & self.keys()
+        matched_keys.update(
+            {
+                k
+                for k in keys - matched_keys
+                if self._dataset_glob_pattern_matching_key(k)
+            }
+        )
+        return matched_keys
+
+    def __contains__(self, key):
+        return super().__contains__(key) or self._dataset_glob_pattern_matching_key(key)
+
+    def __getitem__(self, key):
+        if self.match_all:
+            return self.child_that_matches_all
+        try:
+            return super().__getitem__(key)
+        except KeyError:
+            glob_pattern = self._dataset_glob_pattern_matching_key(key)
+            if not glob_pattern:
+                raise
+            return self._dataset_glob_filters[glob_pattern]
+
+    def get(self, key, default_value=None):
+        try:
+            return self[key]
+        except KeyError:
+            return default_value
+
+    def __setitem__(self, key, value):
+        if self.match_all:
+            return
+        if "*" in key:
+            # Escape fnmatch's other special characters ('?', '[' and ']') so that
+            # only '*' keeps its glob meaning. This must be a single pass: chained
+            # str.replace() calls would re-escape the brackets added by earlier
+            # replacements.
+            key = re.sub(r"([?[\]])", r"[\1]", key)
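+            # e.g. 'data[v2]/*' becomes 'data[[]v2[]]/*': the brackets now match
+            # literally, while '*' still matches anything.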
+            self._dataset_glob_filters[key] = value
+        super().__setitem__(key, value)


 RepoKeyFilter.MATCH_ALL = RepoKeyFilter(match_all=True)

diff --git a/kart/log.py b/kart/log.py
index d8dfeb255..0520bd8f2 100644
--- a/kart/log.py
+++ b/kart/log.py
@@ -205,10 +205,14 @@ def convert_user_patterns_to_raw_paths(paths, repo, commits):
     finds the path encoding for the dataset they apply to and converts them to
     the feature's path, eg: path/to/dataset/.table-dataset/feature/F/9/o/6/kc4F9o6L
     """
-    repo_filter = RepoKeyFilter.build_from_user_patterns(paths, implicit_meta=False)
+    # Handle raw paths specially, because we can and it's nice for Kart developers.
+    result = [p for p in paths if "/.table-dataset/" in p]
+    normal_paths = [p for p in paths if "/.table-dataset/" not in p]
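+    # Paths that already contain '/.table-dataset/' (like the example in the
+    # docstring above) are passed through unchanged; only user-style patterns
+    # such as 'path/to/dataset:123' need converting.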
+ r = cli_runner.invoke(["diff", "HEAD^^?...", "*:feature:4408145", "-o", "json"]) + assert r.exit_code == 0, r.stderr + assert json.loads(r.stdout) == { + "kart.diff/v1+hexwkb": { + "nz_waca_adjustments": { + "feature": [ + { + "+": { + "id": 4408145, + "geom": "0106000000010000000103000000010000000A000000D7D232C2528C6540224B1CB992CC45C035AC93FE308C6540AED61AE518CC45C077BF65A9308C65400CE8853B17CC45C0F188658E208C654079F5E0A49FCB45C03C3B141A248C654006E470019FCB45C0896DFC19278C6540671929E5A3CB45C0DF597160288C654000A7080BA6CB45C064B3C319648C65408C0F44B114CC45C0C885FE1E988C6540B64D609F81CC45C0D7D232C2528C6540224B1CB992CC45C0", + "date_adjusted": "2016-12-15T12:37:17", + "survey_reference": None, + "adjusted_nodes": 891, + } + } + ] + }, + "second/dataset": { + "feature": [ + { + "+": { + "id": 4408145, + "geom": "0106000000010000000103000000010000000A000000D7D232C2528C6540224B1CB992CC45C035AC93FE308C6540AED61AE518CC45C077BF65A9308C65400CE8853B17CC45C0F188658E208C654079F5E0A49FCB45C03C3B141A248C654006E470019FCB45C0896DFC19278C6540671929E5A3CB45C0DF597160288C654000A7080BA6CB45C064B3C319648C65408C0F44B114CC45C0C885FE1E988C6540B64D609F81CC45C0D7D232C2528C6540224B1CB992CC45C0", + "date_adjusted": "2016-12-15T12:37:17", + "survey_reference": None, + "adjusted_nodes": 891, + } + } + ] + }, + } + } + + # Filter for features in datasets whose name matches a pattern + r = cli_runner.invoke( + ["diff", "HEAD^^?...", "*/dataset:feature:4408145", "-o", "json"] + ) + assert r.exit_code == 0, r.stderr + assert json.loads(r.stdout) == { + "kart.diff/v1+hexwkb": { + "second/dataset": { + "feature": [ + { + "+": { + "id": 4408145, + "geom": "0106000000010000000103000000010000000A000000D7D232C2528C6540224B1CB992CC45C035AC93FE308C6540AED61AE518CC45C077BF65A9308C65400CE8853B17CC45C0F188658E208C654079F5E0A49FCB45C03C3B141A248C654006E470019FCB45C0896DFC19278C6540671929E5A3CB45C0DF597160288C654000A7080BA6CB45C064B3C319648C65408C0F44B114CC45C0C885FE1E988C6540B64D609F81CC45C0D7D232C2528C6540224B1CB992CC45C0", + "date_adjusted": "2016-12-15T12:37:17", + "survey_reference": None, + "adjusted_nodes": 891, + } + } + ] + } + } + } + + @pytest.mark.parametrize("output_format", SHOW_OUTPUT_FORMATS) def test_show_polygons_initial(output_format, data_archive_readonly, cli_runner): """Make sure we can show the initial commit""" diff --git a/tests/test_log.py b/tests/test_log.py index b6fac27d9..3f5a8f12f 100644 --- a/tests/test_log.py +++ b/tests/test_log.py @@ -288,7 +288,7 @@ def num_commits(r): ) assert r.exit_code == 0, r.stderr assert num_commits(r) == 1 - # Path syntax still works: + # Raw path syntax still works: PK_1_PATH = "nz_pa_points_topo_150k/.table-dataset/feature/A/A/A/A/kQ0=" r = cli_runner.invoke(["log", "-o", output_format, "--", PK_1_PATH]) assert r.exit_code == 0, r.stderr