From 5745e45717682abd73461bc511a5879e261d0e16 Mon Sep 17 00:00:00 2001
From: Andrew Olsen <andrew.olsen@koordinates.com>
Date: Fri, 15 May 2020 16:22:11 +1200
Subject: [PATCH] List conflicts - label + classify conflicts

---
 sno/conflicts.py        | 341 ++++++++++++++++++++++++++++++++--------
 sno/merge.py            |  10 +-
 sno/structure.py        |  38 +++--
 tests/test_conflicts.py |  57 ++++++-
 4 files changed, 366 insertions(+), 80 deletions(-)

diff --git a/sno/conflicts.py b/sno/conflicts.py
index 607acb7d7..49e646ed5 100644
--- a/sno/conflicts.py
+++ b/sno/conflicts.py
@@ -6,7 +6,7 @@
 import click
 import pygit2
 
-from .exceptions import InvalidOperation
+from .exceptions import InvalidOperation, NotYetImplemented
 from .repo_files import (
     ORIG_HEAD,
     MERGE_HEAD,
@@ -20,6 +20,7 @@
     repo_file_exists,
 )
 from .structs import AncestorOursTheirs
+from .structure import RepositoryStructure
 
 
 L = logging.getLogger("sno.conflicts")
@@ -171,58 +172,264 @@ def _ensure_conflict(self, conflict):
             )
 
 
-def summarise_conflicts_json(repo, conflict_index):
-    # Shortcut used often below
-    def aot(generator_or_tuple):
-        return AncestorOursTheirs(*generator_or_tuple)
+class ConflictOutputFormat:
+    """Different ways of showing all the conflicts resulting from a merge."""
 
+    # Only SHORT_SUMMARY is used so far, by `sno merge`.
+
+    # Summaries:
+    SHORT_SUMMARY = 0  # Show counts of types of conflicts.
+    SUMMARY = 1  # List all the features that conflicted.
+
+    # Full diffs: Show all versions of all the features that conflicted, in...
+    FULL_TEXT_DIFF = 2  # ... text format.
+    FULL_JSON_DIFF = 3  # ... JSON format.
+    FULL_GEOJSON_DIFF = 4  # ... GEOJSON format.
+
+    _SUMMARY_FORMATS = (SHORT_SUMMARY, SUMMARY)
+
+
+def list_conflicts(
+    repo, conflict_index, output_format, *, ancestor, ours, theirs, flat=False
+):
+    """
+        Lists all the conflicts in conflict_index, categorised into nested dicts.
+        Example:
+        {
+            "table_A": {
+                "featureConflicts":
+                    "edit/edit": {
+                        "table_A:fid=5": {"ancestor": "...", "ours": ..., "theirs": ...},
+                        "table_A:fid=11": {"ancestor": "...", "ours": ..., "theirs": ...},
+                    },
+                    "add/add": {...}
+                },
+                "metaConflicts": {
+                    "edit/edit": {
+                        "table_A:meta:gpkg_spatial_ref_sys": {"ancestor": ..., "ours": ..., "theirs": ...}
+                    }
+                }
+            },
+            "table_B": {...}
+        }
+        Depending on the output_format, the conflicts themselves could be summarised as counts
+        or as lists of names, eg ["table_1:fid=5", "table_1:fid=11"]
+
+        repo - the pygit2.Repository.
+        conflict_index - the ConflictIndex containing the conflicts found.
+        output_format - one of the constants in ConflictOutputFormat.
+        ancestor, ours, theirs - CommitWithReference objects.
+        flat - if True, don't categorise conflicts. Put them all at the top level.
+    """
+    if output_format not in ConflictOutputFormat._SUMMARY_FORMATS:
+        raise NotYetImplemented(
+            "Sorry, Only SUMMARY and SHORT_SUMMARY are supported at present"
+        )
+
+    repo_structure3 = AncestorOursTheirs(
+        RepositoryStructure(repo, ancestor.commit),
+        RepositoryStructure(repo, ours.commit),
+        RepositoryStructure(repo, theirs.commit),
+    )
     conflicts = {}
 
     for key, conflict3 in conflict_index.conflicts.items():
-        paths3 = aot(c.path if c else None for c in conflict3)
-        if not any(paths3):
-            # Shouldn't happen
-            raise RuntimeError("Conflict has no paths")
-
-        # Paths look like this: # path/to/table/.sno-table/feature_path for features
-        # Or path/to/table/.sno-table/meta/... for metadata
-        # (This will need updating if newer dataset versions don't follow this pattern.)
-        tables3 = aot(p.split("/.sno-table/", 1)[0] if p else None for p in paths3)
-        actual_tables = [t for t in tables3 if t]
-        all_same_table = all(a == actual_tables[0] for a in actual_tables)
-        if not all_same_table:
-            # This is a really bad conflict - it seems to involve multiple tables.
-            # Perhaps features were moved from one table to another, or perhaps
-            # a table was renamed.
-            conflicts.setdefault("<other>", 0)
-            conflicts["<other>"] += 1
-            continue
-
-        table = actual_tables[0]
-        conflicts.setdefault(table, {})
-        conflicts_table = conflicts[table]
-
-        meta_change = any("/.sno-table/meta/" in (p or "") for p in paths3)
-        if meta_change:
-            conflicts_table.setdefault("metaConflicts", 0)
-            conflicts_table["metaConflicts"] += 1
-            continue
-
-        conflicts_table.setdefault("featureConflicts", {})
-        feature_conflicts = conflicts_table["featureConflicts"]
-
-        all_same_path = all((p == paths3[0] for p in paths3))
-        if all_same_path:
-            feature_conflicts.setdefault("edit/edit", 0)
-            feature_conflicts["edit/edit"] += 1
-            continue
-
-        feature_conflicts.setdefault("other", 0)
-        feature_conflicts["other"] += 1
+        decoded_path3 = decode_conflict_paths(conflict3, repo_structure3)
+        conflict_dict = get_conflict_as_dict(
+            conflict3, repo_structure3, decoded_path3, output_format
+        )
+        if flat:
+            conflicts.update(conflict_dict)
+        else:
+            conflict_category = get_conflict_category(decoded_path3)
+            add_conflict_dict_to_category(conflicts, conflict_category, conflict_dict)
+
+    if output_format in ConflictOutputFormat._SUMMARY_FORMATS:
+        conflicts = summarise_conflicts(conflicts, output_format)
 
     return conflicts
 
 
+def decode_conflict_paths(conflict3, repo_structure3):
+    """
+    Given 3 versions of an IndexEntry, and 3 versions of a repository_structure,
+    return 3 versions of a decoded path - see RepositoryStructure.decode_path.
+    """
+    return AncestorOursTheirs(
+        *(
+            rs.decode_path(c.path) if c else None
+            for c, rs, in zip(conflict3, repo_structure3)
+        )
+    )
+
+
+def get_conflict_category(decoded_path3):
+    """
+    Given 3 versions of the decoded path, tries to categorise the conflict,
+    so that similar conflicts can be grouped together.
+    For example, a returned category might be:
+    ["table_A", "featureConflicts", "edit/edit"]
+    Meaning conflicting edits were made to a feature in table_A.
+    """
+    dpath3 = decoded_path3
+    actual_dpaths = [p for p in dpath3 if p]
+    actual_tables = [p[0] for p in actual_dpaths]
+    all_same_table = len(set(actual_tables)) == 1
+
+    if not all_same_table:
+        return ["<uncategorised>"]
+    table = actual_tables[0]
+
+    actual_tableparts = [p[1] for p in actual_dpaths]
+    all_same_tablepart = len(set(actual_tableparts)) == 1
+    if all_same_tablepart:
+        tablepart = actual_tableparts[0] + "Conflicts"
+    else:
+        # Meta/feature conflict. Shouldn't really happen.
+        return [table, "<uncategorised>"]
+
+    # <uncategorised> type currently includes everything involving renames.
+    conflict_type = "<uncategorised>"
+    all_same_path = len(set(actual_dpaths)) == 1
+    if all_same_path:
+        if not dpath3.ancestor:
+            if dpath3.ours and dpath3.theirs:
+                conflict_type = "add/add"
+        else:
+            if dpath3.ours and dpath3.theirs:
+                conflict_type = "edit/edit"
+            elif dpath3.ours or dpath3.theirs:
+                conflict_type = "edit/delete"
+
+    return [table, tablepart, conflict_type]
+
+
+# Stand in for a conflict if the conflict is going to be summarised anyway -
+# this helps code re-use between summary and full-diff output modes.
+_CONFLICT_PLACEHOLDER = object()
+
+
+def get_conflict_as_dict(conflict3, repo_structure3, decoded_path3, output_format):
+    """
+    Given 3 versions of an IndexEntry, 3 versions of the repository_structure,
+    and 3 versions of the decoded_path, returns a representation of the conflict
+    as a dict according to the output format. The outermost dict only contains
+    a single key, which is a unique name for the conflict.
+    For example:
+    {"table_A:fid=5": {"ancestor": ..., "ours": ..., "theirs": ...}}
+    """
+
+    label = get_conflict_label(decoded_path3)
+    if output_format in ConflictOutputFormat._SUMMARY_FORMATS:
+        # No need to output info about conflict itself - it will be summarised -
+        # so we just return a placeholder.
+        return {label: _CONFLICT_PLACEHOLDER}
+    else:
+        # TODO - return {label: {"ancestor": ..., "ours": ..., "theirs": ...}}
+        raise NotYetImplemented("Output of full conflict diffs is not supported")
+
+
+def get_conflict_label(decoded_path3):
+    """
+    Given 3 versions of the decoded path, returns a unique name for a conflict.
+    In simply cases, this will be something like: "table_A:fid=5"
+    But if renames have occurred, it could have multiple names, eg:
+    "ancestor=table_A:fid=5 ours=table_A:fid=6 theirs=table_A:fid=12"
+    """
+    dpath3 = decoded_path3
+    actual_dpaths = [p for p in dpath3 if p]
+    all_same_path = len(set(actual_dpaths)) == 1
+
+    if all_same_path:
+        return decoded_path_to_label(actual_dpaths[0])
+
+    label3 = AncestorOursTheirs(
+        *(
+            f"{v}={decoded_path_to_label(p)}" if p else None
+            for v, p, in zip(AncestorOursTheirs.NAMES, decoded_path3)
+        )
+    )
+    return " ".join([l for l in label3 if l])
+
+
+def decoded_path_to_label(decoded_path):
+    """
+    Converts a decoded path to a unique name, eg:
+    ("table_A", "feature", "fid", 5) -> "table_A:fid=5"
+    """
+    if decoded_path is None:
+        return None
+    if decoded_path[1] == "feature":
+        table, tablepart, pk_field, pk = decoded_path
+        return f"{table}:{pk_field}={pk}"
+    else:
+        return ":".join(decoded_path)
+
+
+def add_conflict_dict_to_category(root_dict, conflict_category, conflict_dict):
+    """
+    Ensures the given category of conflicts exists, and then adds
+    the given conflict dict to it.
+    """
+    cur_dict = root_dict
+    for c in conflict_category:
+        cur_dict.setdefault(c, {})
+        cur_dict = cur_dict[c]
+
+    cur_dict.update(conflict_dict)
+
+
+def summarise_conflicts(cur_dict, output_format):
+    """
+    Recursively traverses the tree of categorised conflicts,
+    looking for a dict where the values are placeholders.
+    For example:
+    {
+        K1: _CONFLICT_PLACEHOLDER,
+        K2: _CONFLICT_PLACEHOLDER,
+    }
+    When found, it will be replaced with one of the following:
+    1) SHORT_SUMMARY: 2 (the size of the dict)
+    2) SUMMARY: [K1, K2]
+    """
+    first_value = next(iter(cur_dict.values()))
+    if first_value == _CONFLICT_PLACEHOLDER:
+        if output_format == ConflictOutputFormat.SHORT_SUMMARY:
+            return len(cur_dict)
+        else:
+
+            def sortkey(item):
+                tableybit, pk = item.split('=')
+                pk = int(pk) if pk.isdigit() else pk
+                return tableybit, pk
+
+            return sorted([k for k in cur_dict.keys()], key=_label_sort_key)
+
+    for k, v in cur_dict.items():
+        cur_dict[k] = summarise_conflicts(v, output_format)
+    return cur_dict
+
+
+def _label_sort_key(label):
+    """Sort labels of conflicts in a sensible way."""
+    if (
+        label.startswith("ancestor=")
+        or label.startswith("ours=")
+        or label.startswith("theirs=")
+    ):
+        # Put the complicated conflicts last.
+        return "Z multiple-path", label
+
+    parts = label.split('=', 1)
+    if len(parts) == 2:
+        prefix, pk = parts
+        pk = int(pk) if pk.isdigit() else pk
+        return "Feature", prefix, pk
+    else:
+        # TODO: maybe meta conflicts should go before feature conflicts.
+        return "Meta", label
+
+
 def move_repo_to_merging_state(
     repo, conflict_index, merge_message, *, ancestor, ours, theirs
 ):
@@ -269,21 +476,29 @@ def abort_merging_state(repo):
     # Not sure if it matters - we don't modify HEAD when we move into merging state.
 
 
-def output_json_conflicts_as_text(jdict):
-    for table, table_conflicts in sorted(jdict.items()):
-        if table == "<other>":
-            continue
-        click.secho(f"{table}:", bold=True)
-        meta_conflicts = table_conflicts.get("metaConflicts", 0)
-        if meta_conflicts:
-            click.echo(f"  META conflicts: {meta_conflicts}")
-        feature_conflicts = table_conflicts.get("featureConflicts", {})
-        if feature_conflicts:
-            click.echo("  Feature conflicts:")
-            for k, v in sorted(feature_conflicts.items()):
-                click.echo(f"    {k}: {v}")
-        click.echo()
-
-    non_table_conflicts = jdict.get("<other>", 0)
-    if non_table_conflicts:
-        click.secho(f"Other conflicts: {non_table_conflicts}", bold=True)
+_JSON_KEYS_TO_TEXT_HEADERS = {
+    "featureConflicts": "Feature conflicts",
+    "metaConflicts": "META conflicts",
+}
+
+
+def output_conflicts_as_text(jdict, level=0):
+    """Writes the JSON output of list_conflicts to stdout as text, using click.echo."""
+    top_level = level == 0
+    indent = "  " * level
+
+    for k, v in sorted(jdict.items()):
+        heading = _JSON_KEYS_TO_TEXT_HEADERS.get(k, k)
+        if isinstance(v, dict):
+            click.secho(f"{indent}{heading}:", bold=top_level)
+            output_conflicts_as_text(v, level + 1)
+            if top_level:
+                click.echo()
+        elif isinstance(v, list):
+            click.secho(f"{indent}{heading}:", bold=top_level)
+            for item in v:
+                click.echo(f"{indent}  {item}")
+            if top_level:
+                click.echo()
+        else:
+            click.echo(f"{indent}{heading}: {v}")
diff --git a/sno/merge.py b/sno/merge.py
index 0f6331bd3..7495e85d4 100644
--- a/sno/merge.py
+++ b/sno/merge.py
@@ -8,8 +8,8 @@
     ConflictIndex,
     abort_merging_state,
     move_repo_to_merging_state,
-    summarise_conflicts_json,
-    output_json_conflicts_as_text,
+    list_conflicts,
+    output_conflicts_as_text,
 )
 from .exceptions import InvalidOperation
 from .output_util import dump_json_output
@@ -150,7 +150,9 @@ def do_merge(repo, ff, ff_only, dry_run, commit):
 
     if merge_index.conflicts:
         conflict_index = ConflictIndex(merge_index)
-        merge_jdict["conflicts"] = summarise_conflicts_json(repo, conflict_index)
+        merge_jdict["conflicts"] = list_conflicts(
+            repo, conflict_index, 0, ancestor=ancestor, ours=ours, theirs=theirs
+        )
         if not dry_run:
             move_repo_to_merging_state(
                 repo,
@@ -212,7 +214,7 @@ def output_merge_json_as_text(jdict):
         return
 
     click.echo("Conflicts found:\n")
-    output_json_conflicts_as_text(conflicts)
+    output_conflicts_as_text(conflicts)
 
     if dry_run:
         click.echo("(Not actually merging due to --dry-run)")
diff --git a/sno/structure.py b/sno/structure.py
index fb2f6e676..3408ff878 100644
--- a/sno/structure.py
+++ b/sno/structure.py
@@ -77,6 +77,16 @@ def __repr__(self):
         else:
             return f"RepoStructure<{self.repo.path} <empty>>"
 
+    def decode_path(self, path):
+        """
+        Given a path in the sno repository - eg "table_A/.sno-table/49/3e/Bg==" -
+        returns a tuple in either of the following forms:
+        1. (table, "feature", primary_key_field, primary_key)
+        2. (table, "meta", metadata_file_path)
+        """
+        table, table_path = path.split("/.sno-table/", 1)
+        return (table,) + self.get(table).decode_path(table_path)
+
     def get(self, path):
         try:
             return self.get_at(path, self.tree)
@@ -129,10 +139,6 @@ def iter_at(self, tree):
                         # examine inside this directory
                         to_examine.append((te_path, o))
 
-    def get_for_index_entry(self, index_entry):
-        dataset_path = index_entry.path.split(r"/.sno-table/", maxsplit=1)[0]
-        return self.get(dataset_path)
-
     @property
     def id(self):
         obj = self._commit or self._tree
@@ -674,12 +680,6 @@ def primary_key_type(self):
         field = next(f for f in schema if f["name"] == self.primary_key)
         return field["type"]
 
-    def index_entry_to_pk(self, index_entry):
-        feature_path = index_entry.path.split("/.sno-table/", maxsplit=1)[1]
-        if feature_path.startswith("meta/"):
-            return 'META'
-        return self.decode_pk(os.path.basename(feature_path))
-
     def encode_pk(self, pk):
         pk_enc = msgpack.packb(pk, use_bin_type=True)  # encode pk value via msgpack
         pk_str = base64.urlsafe_b64encode(pk_enc).decode("utf8")  # filename safe
@@ -696,6 +696,24 @@ def get_feature_path(self, pk):
         ).hexdigest()  # hash to randomly spread filenames
         return "/".join([".sno-table", pk_hash[:2], pk_hash[2:4], pk_enc])
 
+    _FEATURE_PATH_PATTERN = re.compile(r"^.sno-table/../../(?P<encoded>.+)$")
+
+    _FEATURE_PATH_PATTERN = re.compile(r"^.sno-table/meta/../(?P<encoded>.+)$")
+
+    def decode_path(self, path):
+        """
+        Given a path in this layer of the sno repository - eg ".sno-table/49/3e/Bg==" -
+        returns a tuple in either of the following forms:
+        1. ("feature", primary_key_field, primary_key)
+        2. ("meta", metadata_file_path)
+        """
+        if path.startswith(".sno-table/"):
+            path = path[len(".sno-table/") :]
+        if path.startswith("meta/"):
+            return ("meta", path[len("meta/") :])
+        pk = self.decode_pk(os.path.basename(path))
+        return ("feature", self.primary_key, pk)
+
     def import_meta_items(self, source):
         """
             path/to/layer/.sno-table/
diff --git a/tests/test_conflicts.py b/tests/test_conflicts.py
index 052942b4b..19ba42818 100644
--- a/tests/test_conflicts.py
+++ b/tests/test_conflicts.py
@@ -4,7 +4,7 @@
 
 import pygit2
 
-from sno.conflicts import ConflictIndex
+from sno.conflicts import ConflictIndex, ConflictOutputFormat, list_conflicts
 from sno.structs import CommitWithReference
 from sno.repo_files import (
     MERGE_HEAD,
@@ -95,8 +95,8 @@ def test_merge_conflicts(
                     "",
                     f"{data.LAYER}:",
                     "  Feature conflicts:",
+                    "    add/add: 1",
                     "    edit/edit: 3",
-                    "    other: 1",
                     "",
                 ]
                 + dry_run_message
@@ -113,7 +113,9 @@ def test_merge_conflicts(
                     "dryRun": dry_run,
                     "message": "Merge branch \"theirs_branch\" into ours_branch",
                     "conflicts": {
-                        data.LAYER: {"featureConflicts": {"edit/edit": 3, "other": 1}},
+                        data.LAYER: {
+                            "featureConflicts": {"add/add": 1, "edit/edit": 3}
+                        },
                     },
                 },
             }
@@ -168,3 +170,52 @@ def test_conflict_index_roundtrip(create_conflicts, cli_runner):
         r1.write("test.conflict.index")
         r2 = ConflictIndex.read("test.conflict.index")
         assert r2 == r1
+
+
+def test_list_conflicts(create_conflicts, cli_runner):
+    f = ConflictOutputFormat
+
+    # Difficult to create conflict indexes directly - easier to create them by doing a merge:
+    with create_conflicts(H.POLYGONS) as repo:
+        ancestor = CommitWithReference.resolve(repo, "ancestor_branch")
+        ours = CommitWithReference.resolve(repo, "ours_branch")
+        theirs = CommitWithReference.resolve(repo, "theirs_branch")
+
+        merge_index = repo.merge_trees(
+            ancestor=ancestor.tree, ours=ours.tree, theirs=theirs.tree
+        )
+        cindex = ConflictIndex(merge_index)
+
+        kwargs = {"ancestor": ancestor, "ours": ours, "theirs": theirs}
+        short_summary = list_conflicts(repo, cindex, f.SHORT_SUMMARY, **kwargs)
+
+        assert short_summary == {
+            "nz_waca_adjustments": {"featureConflicts": {"add/add": 1, "edit/edit": 3}},
+        }
+
+        flat_short_summary = list_conflicts(
+            repo, cindex, f.SHORT_SUMMARY, **kwargs, flat=True
+        )
+        assert flat_short_summary == 4
+
+        summary = list_conflicts(repo, cindex, f.SUMMARY, **kwargs,)
+        assert summary == {
+            "nz_waca_adjustments": {
+                "featureConflicts": {
+                    "add/add": ["nz_waca_adjustments:id=98001"],
+                    "edit/edit": [
+                        "nz_waca_adjustments:id=1452332",
+                        "nz_waca_adjustments:id=1456853",
+                        "nz_waca_adjustments:id=1456912",
+                    ],
+                }
+            },
+        }
+
+        flat_summary = list_conflicts(repo, cindex, f.SUMMARY, **kwargs, flat=True)
+        assert flat_summary == [
+            "nz_waca_adjustments:id=98001",
+            "nz_waca_adjustments:id=1452332",
+            "nz_waca_adjustments:id=1456853",
+            "nz_waca_adjustments:id=1456912",
+        ]