From 5745e45717682abd73461bc511a5879e261d0e16 Mon Sep 17 00:00:00 2001 From: Andrew Olsen Date: Fri, 15 May 2020 16:22:11 +1200 Subject: [PATCH] List conflicts - label + classify conflicts --- sno/conflicts.py | 341 ++++++++++++++++++++++++++++++++-------- sno/merge.py | 10 +- sno/structure.py | 38 +++-- tests/test_conflicts.py | 57 ++++++- 4 files changed, 366 insertions(+), 80 deletions(-) diff --git a/sno/conflicts.py b/sno/conflicts.py index 607acb7d7..49e646ed5 100644 --- a/sno/conflicts.py +++ b/sno/conflicts.py @@ -6,7 +6,7 @@ import click import pygit2 -from .exceptions import InvalidOperation +from .exceptions import InvalidOperation, NotYetImplemented from .repo_files import ( ORIG_HEAD, MERGE_HEAD, @@ -20,6 +20,7 @@ repo_file_exists, ) from .structs import AncestorOursTheirs +from .structure import RepositoryStructure L = logging.getLogger("sno.conflicts") @@ -171,58 +172,264 @@ def _ensure_conflict(self, conflict): ) -def summarise_conflicts_json(repo, conflict_index): - # Shortcut used often below - def aot(generator_or_tuple): - return AncestorOursTheirs(*generator_or_tuple) +class ConflictOutputFormat: + """Different ways of showing all the conflicts resulting from a merge.""" + # Only SHORT_SUMMARY is used so far, by `sno merge`. + + # Summaries: + SHORT_SUMMARY = 0 # Show counts of types of conflicts. + SUMMARY = 1 # List all the features that conflicted. + + # Full diffs: Show all versions of all the features that conflicted, in... + FULL_TEXT_DIFF = 2 # ... text format. + FULL_JSON_DIFF = 3 # ... JSON format. + FULL_GEOJSON_DIFF = 4 # ... GEOJSON format. + + _SUMMARY_FORMATS = (SHORT_SUMMARY, SUMMARY) + + +def list_conflicts( + repo, conflict_index, output_format, *, ancestor, ours, theirs, flat=False +): + """ + Lists all the conflicts in conflict_index, categorised into nested dicts. + Example: + { + "table_A": { + "featureConflicts": + "edit/edit": { + "table_A:fid=5": {"ancestor": "...", "ours": ..., "theirs": ...}, + "table_A:fid=11": {"ancestor": "...", "ours": ..., "theirs": ...}, + }, + "add/add": {...} + }, + "metaConflicts": { + "edit/edit": { + "table_A:meta:gpkg_spatial_ref_sys": {"ancestor": ..., "ours": ..., "theirs": ...} + } + } + }, + "table_B": {...} + } + Depending on the output_format, the conflicts themselves could be summarised as counts + or as lists of names, eg ["table_1:fid=5", "table_1:fid=11"] + + repo - the pygit2.Repository. + conflict_index - the ConflictIndex containing the conflicts found. + output_format - one of the constants in ConflictOutputFormat. + ancestor, ours, theirs - CommitWithReference objects. + flat - if True, don't categorise conflicts. Put them all at the top level. + """ + if output_format not in ConflictOutputFormat._SUMMARY_FORMATS: + raise NotYetImplemented( + "Sorry, Only SUMMARY and SHORT_SUMMARY are supported at present" + ) + + repo_structure3 = AncestorOursTheirs( + RepositoryStructure(repo, ancestor.commit), + RepositoryStructure(repo, ours.commit), + RepositoryStructure(repo, theirs.commit), + ) conflicts = {} for key, conflict3 in conflict_index.conflicts.items(): - paths3 = aot(c.path if c else None for c in conflict3) - if not any(paths3): - # Shouldn't happen - raise RuntimeError("Conflict has no paths") - - # Paths look like this: # path/to/table/.sno-table/feature_path for features - # Or path/to/table/.sno-table/meta/... for metadata - # (This will need updating if newer dataset versions don't follow this pattern.) - tables3 = aot(p.split("/.sno-table/", 1)[0] if p else None for p in paths3) - actual_tables = [t for t in tables3 if t] - all_same_table = all(a == actual_tables[0] for a in actual_tables) - if not all_same_table: - # This is a really bad conflict - it seems to involve multiple tables. - # Perhaps features were moved from one table to another, or perhaps - # a table was renamed. - conflicts.setdefault("", 0) - conflicts[""] += 1 - continue - - table = actual_tables[0] - conflicts.setdefault(table, {}) - conflicts_table = conflicts[table] - - meta_change = any("/.sno-table/meta/" in (p or "") for p in paths3) - if meta_change: - conflicts_table.setdefault("metaConflicts", 0) - conflicts_table["metaConflicts"] += 1 - continue - - conflicts_table.setdefault("featureConflicts", {}) - feature_conflicts = conflicts_table["featureConflicts"] - - all_same_path = all((p == paths3[0] for p in paths3)) - if all_same_path: - feature_conflicts.setdefault("edit/edit", 0) - feature_conflicts["edit/edit"] += 1 - continue - - feature_conflicts.setdefault("other", 0) - feature_conflicts["other"] += 1 + decoded_path3 = decode_conflict_paths(conflict3, repo_structure3) + conflict_dict = get_conflict_as_dict( + conflict3, repo_structure3, decoded_path3, output_format + ) + if flat: + conflicts.update(conflict_dict) + else: + conflict_category = get_conflict_category(decoded_path3) + add_conflict_dict_to_category(conflicts, conflict_category, conflict_dict) + + if output_format in ConflictOutputFormat._SUMMARY_FORMATS: + conflicts = summarise_conflicts(conflicts, output_format) return conflicts +def decode_conflict_paths(conflict3, repo_structure3): + """ + Given 3 versions of an IndexEntry, and 3 versions of a repository_structure, + return 3 versions of a decoded path - see RepositoryStructure.decode_path. + """ + return AncestorOursTheirs( + *( + rs.decode_path(c.path) if c else None + for c, rs, in zip(conflict3, repo_structure3) + ) + ) + + +def get_conflict_category(decoded_path3): + """ + Given 3 versions of the decoded path, tries to categorise the conflict, + so that similar conflicts can be grouped together. + For example, a returned category might be: + ["table_A", "featureConflicts", "edit/edit"] + Meaning conflicting edits were made to a feature in table_A. + """ + dpath3 = decoded_path3 + actual_dpaths = [p for p in dpath3 if p] + actual_tables = [p[0] for p in actual_dpaths] + all_same_table = len(set(actual_tables)) == 1 + + if not all_same_table: + return [""] + table = actual_tables[0] + + actual_tableparts = [p[1] for p in actual_dpaths] + all_same_tablepart = len(set(actual_tableparts)) == 1 + if all_same_tablepart: + tablepart = actual_tableparts[0] + "Conflicts" + else: + # Meta/feature conflict. Shouldn't really happen. + return [table, ""] + + # type currently includes everything involving renames. + conflict_type = "" + all_same_path = len(set(actual_dpaths)) == 1 + if all_same_path: + if not dpath3.ancestor: + if dpath3.ours and dpath3.theirs: + conflict_type = "add/add" + else: + if dpath3.ours and dpath3.theirs: + conflict_type = "edit/edit" + elif dpath3.ours or dpath3.theirs: + conflict_type = "edit/delete" + + return [table, tablepart, conflict_type] + + +# Stand in for a conflict if the conflict is going to be summarised anyway - +# this helps code re-use between summary and full-diff output modes. +_CONFLICT_PLACEHOLDER = object() + + +def get_conflict_as_dict(conflict3, repo_structure3, decoded_path3, output_format): + """ + Given 3 versions of an IndexEntry, 3 versions of the repository_structure, + and 3 versions of the decoded_path, returns a representation of the conflict + as a dict according to the output format. The outermost dict only contains + a single key, which is a unique name for the conflict. + For example: + {"table_A:fid=5": {"ancestor": ..., "ours": ..., "theirs": ...}} + """ + + label = get_conflict_label(decoded_path3) + if output_format in ConflictOutputFormat._SUMMARY_FORMATS: + # No need to output info about conflict itself - it will be summarised - + # so we just return a placeholder. + return {label: _CONFLICT_PLACEHOLDER} + else: + # TODO - return {label: {"ancestor": ..., "ours": ..., "theirs": ...}} + raise NotYetImplemented("Output of full conflict diffs is not supported") + + +def get_conflict_label(decoded_path3): + """ + Given 3 versions of the decoded path, returns a unique name for a conflict. + In simply cases, this will be something like: "table_A:fid=5" + But if renames have occurred, it could have multiple names, eg: + "ancestor=table_A:fid=5 ours=table_A:fid=6 theirs=table_A:fid=12" + """ + dpath3 = decoded_path3 + actual_dpaths = [p for p in dpath3 if p] + all_same_path = len(set(actual_dpaths)) == 1 + + if all_same_path: + return decoded_path_to_label(actual_dpaths[0]) + + label3 = AncestorOursTheirs( + *( + f"{v}={decoded_path_to_label(p)}" if p else None + for v, p, in zip(AncestorOursTheirs.NAMES, decoded_path3) + ) + ) + return " ".join([l for l in label3 if l]) + + +def decoded_path_to_label(decoded_path): + """ + Converts a decoded path to a unique name, eg: + ("table_A", "feature", "fid", 5) -> "table_A:fid=5" + """ + if decoded_path is None: + return None + if decoded_path[1] == "feature": + table, tablepart, pk_field, pk = decoded_path + return f"{table}:{pk_field}={pk}" + else: + return ":".join(decoded_path) + + +def add_conflict_dict_to_category(root_dict, conflict_category, conflict_dict): + """ + Ensures the given category of conflicts exists, and then adds + the given conflict dict to it. + """ + cur_dict = root_dict + for c in conflict_category: + cur_dict.setdefault(c, {}) + cur_dict = cur_dict[c] + + cur_dict.update(conflict_dict) + + +def summarise_conflicts(cur_dict, output_format): + """ + Recursively traverses the tree of categorised conflicts, + looking for a dict where the values are placeholders. + For example: + { + K1: _CONFLICT_PLACEHOLDER, + K2: _CONFLICT_PLACEHOLDER, + } + When found, it will be replaced with one of the following: + 1) SHORT_SUMMARY: 2 (the size of the dict) + 2) SUMMARY: [K1, K2] + """ + first_value = next(iter(cur_dict.values())) + if first_value == _CONFLICT_PLACEHOLDER: + if output_format == ConflictOutputFormat.SHORT_SUMMARY: + return len(cur_dict) + else: + + def sortkey(item): + tableybit, pk = item.split('=') + pk = int(pk) if pk.isdigit() else pk + return tableybit, pk + + return sorted([k for k in cur_dict.keys()], key=_label_sort_key) + + for k, v in cur_dict.items(): + cur_dict[k] = summarise_conflicts(v, output_format) + return cur_dict + + +def _label_sort_key(label): + """Sort labels of conflicts in a sensible way.""" + if ( + label.startswith("ancestor=") + or label.startswith("ours=") + or label.startswith("theirs=") + ): + # Put the complicated conflicts last. + return "Z multiple-path", label + + parts = label.split('=', 1) + if len(parts) == 2: + prefix, pk = parts + pk = int(pk) if pk.isdigit() else pk + return "Feature", prefix, pk + else: + # TODO: maybe meta conflicts should go before feature conflicts. + return "Meta", label + + def move_repo_to_merging_state( repo, conflict_index, merge_message, *, ancestor, ours, theirs ): @@ -269,21 +476,29 @@ def abort_merging_state(repo): # Not sure if it matters - we don't modify HEAD when we move into merging state. -def output_json_conflicts_as_text(jdict): - for table, table_conflicts in sorted(jdict.items()): - if table == "": - continue - click.secho(f"{table}:", bold=True) - meta_conflicts = table_conflicts.get("metaConflicts", 0) - if meta_conflicts: - click.echo(f" META conflicts: {meta_conflicts}") - feature_conflicts = table_conflicts.get("featureConflicts", {}) - if feature_conflicts: - click.echo(" Feature conflicts:") - for k, v in sorted(feature_conflicts.items()): - click.echo(f" {k}: {v}") - click.echo() - - non_table_conflicts = jdict.get("", 0) - if non_table_conflicts: - click.secho(f"Other conflicts: {non_table_conflicts}", bold=True) +_JSON_KEYS_TO_TEXT_HEADERS = { + "featureConflicts": "Feature conflicts", + "metaConflicts": "META conflicts", +} + + +def output_conflicts_as_text(jdict, level=0): + """Writes the JSON output of list_conflicts to stdout as text, using click.echo.""" + top_level = level == 0 + indent = " " * level + + for k, v in sorted(jdict.items()): + heading = _JSON_KEYS_TO_TEXT_HEADERS.get(k, k) + if isinstance(v, dict): + click.secho(f"{indent}{heading}:", bold=top_level) + output_conflicts_as_text(v, level + 1) + if top_level: + click.echo() + elif isinstance(v, list): + click.secho(f"{indent}{heading}:", bold=top_level) + for item in v: + click.echo(f"{indent} {item}") + if top_level: + click.echo() + else: + click.echo(f"{indent}{heading}: {v}") diff --git a/sno/merge.py b/sno/merge.py index 0f6331bd3..7495e85d4 100644 --- a/sno/merge.py +++ b/sno/merge.py @@ -8,8 +8,8 @@ ConflictIndex, abort_merging_state, move_repo_to_merging_state, - summarise_conflicts_json, - output_json_conflicts_as_text, + list_conflicts, + output_conflicts_as_text, ) from .exceptions import InvalidOperation from .output_util import dump_json_output @@ -150,7 +150,9 @@ def do_merge(repo, ff, ff_only, dry_run, commit): if merge_index.conflicts: conflict_index = ConflictIndex(merge_index) - merge_jdict["conflicts"] = summarise_conflicts_json(repo, conflict_index) + merge_jdict["conflicts"] = list_conflicts( + repo, conflict_index, 0, ancestor=ancestor, ours=ours, theirs=theirs + ) if not dry_run: move_repo_to_merging_state( repo, @@ -212,7 +214,7 @@ def output_merge_json_as_text(jdict): return click.echo("Conflicts found:\n") - output_json_conflicts_as_text(conflicts) + output_conflicts_as_text(conflicts) if dry_run: click.echo("(Not actually merging due to --dry-run)") diff --git a/sno/structure.py b/sno/structure.py index fb2f6e676..3408ff878 100644 --- a/sno/structure.py +++ b/sno/structure.py @@ -77,6 +77,16 @@ def __repr__(self): else: return f"RepoStructure<{self.repo.path} >" + def decode_path(self, path): + """ + Given a path in the sno repository - eg "table_A/.sno-table/49/3e/Bg==" - + returns a tuple in either of the following forms: + 1. (table, "feature", primary_key_field, primary_key) + 2. (table, "meta", metadata_file_path) + """ + table, table_path = path.split("/.sno-table/", 1) + return (table,) + self.get(table).decode_path(table_path) + def get(self, path): try: return self.get_at(path, self.tree) @@ -129,10 +139,6 @@ def iter_at(self, tree): # examine inside this directory to_examine.append((te_path, o)) - def get_for_index_entry(self, index_entry): - dataset_path = index_entry.path.split(r"/.sno-table/", maxsplit=1)[0] - return self.get(dataset_path) - @property def id(self): obj = self._commit or self._tree @@ -674,12 +680,6 @@ def primary_key_type(self): field = next(f for f in schema if f["name"] == self.primary_key) return field["type"] - def index_entry_to_pk(self, index_entry): - feature_path = index_entry.path.split("/.sno-table/", maxsplit=1)[1] - if feature_path.startswith("meta/"): - return 'META' - return self.decode_pk(os.path.basename(feature_path)) - def encode_pk(self, pk): pk_enc = msgpack.packb(pk, use_bin_type=True) # encode pk value via msgpack pk_str = base64.urlsafe_b64encode(pk_enc).decode("utf8") # filename safe @@ -696,6 +696,24 @@ def get_feature_path(self, pk): ).hexdigest() # hash to randomly spread filenames return "/".join([".sno-table", pk_hash[:2], pk_hash[2:4], pk_enc]) + _FEATURE_PATH_PATTERN = re.compile(r"^.sno-table/../../(?P.+)$") + + _FEATURE_PATH_PATTERN = re.compile(r"^.sno-table/meta/../(?P.+)$") + + def decode_path(self, path): + """ + Given a path in this layer of the sno repository - eg ".sno-table/49/3e/Bg==" - + returns a tuple in either of the following forms: + 1. ("feature", primary_key_field, primary_key) + 2. ("meta", metadata_file_path) + """ + if path.startswith(".sno-table/"): + path = path[len(".sno-table/") :] + if path.startswith("meta/"): + return ("meta", path[len("meta/") :]) + pk = self.decode_pk(os.path.basename(path)) + return ("feature", self.primary_key, pk) + def import_meta_items(self, source): """ path/to/layer/.sno-table/ diff --git a/tests/test_conflicts.py b/tests/test_conflicts.py index 052942b4b..19ba42818 100644 --- a/tests/test_conflicts.py +++ b/tests/test_conflicts.py @@ -4,7 +4,7 @@ import pygit2 -from sno.conflicts import ConflictIndex +from sno.conflicts import ConflictIndex, ConflictOutputFormat, list_conflicts from sno.structs import CommitWithReference from sno.repo_files import ( MERGE_HEAD, @@ -95,8 +95,8 @@ def test_merge_conflicts( "", f"{data.LAYER}:", " Feature conflicts:", + " add/add: 1", " edit/edit: 3", - " other: 1", "", ] + dry_run_message @@ -113,7 +113,9 @@ def test_merge_conflicts( "dryRun": dry_run, "message": "Merge branch \"theirs_branch\" into ours_branch", "conflicts": { - data.LAYER: {"featureConflicts": {"edit/edit": 3, "other": 1}}, + data.LAYER: { + "featureConflicts": {"add/add": 1, "edit/edit": 3} + }, }, }, } @@ -168,3 +170,52 @@ def test_conflict_index_roundtrip(create_conflicts, cli_runner): r1.write("test.conflict.index") r2 = ConflictIndex.read("test.conflict.index") assert r2 == r1 + + +def test_list_conflicts(create_conflicts, cli_runner): + f = ConflictOutputFormat + + # Difficult to create conflict indexes directly - easier to create them by doing a merge: + with create_conflicts(H.POLYGONS) as repo: + ancestor = CommitWithReference.resolve(repo, "ancestor_branch") + ours = CommitWithReference.resolve(repo, "ours_branch") + theirs = CommitWithReference.resolve(repo, "theirs_branch") + + merge_index = repo.merge_trees( + ancestor=ancestor.tree, ours=ours.tree, theirs=theirs.tree + ) + cindex = ConflictIndex(merge_index) + + kwargs = {"ancestor": ancestor, "ours": ours, "theirs": theirs} + short_summary = list_conflicts(repo, cindex, f.SHORT_SUMMARY, **kwargs) + + assert short_summary == { + "nz_waca_adjustments": {"featureConflicts": {"add/add": 1, "edit/edit": 3}}, + } + + flat_short_summary = list_conflicts( + repo, cindex, f.SHORT_SUMMARY, **kwargs, flat=True + ) + assert flat_short_summary == 4 + + summary = list_conflicts(repo, cindex, f.SUMMARY, **kwargs,) + assert summary == { + "nz_waca_adjustments": { + "featureConflicts": { + "add/add": ["nz_waca_adjustments:id=98001"], + "edit/edit": [ + "nz_waca_adjustments:id=1452332", + "nz_waca_adjustments:id=1456853", + "nz_waca_adjustments:id=1456912", + ], + } + }, + } + + flat_summary = list_conflicts(repo, cindex, f.SUMMARY, **kwargs, flat=True) + assert flat_summary == [ + "nz_waca_adjustments:id=98001", + "nz_waca_adjustments:id=1452332", + "nz_waca_adjustments:id=1456853", + "nz_waca_adjustments:id=1456912", + ]