diff --git a/helpers/pathmap.py b/helpers/pathmap.py index 314ef5e8d..de0303881 100644 --- a/helpers/pathmap.py +++ b/helpers/pathmap.py @@ -1,7 +1,7 @@ -import collections -import operator +from dataclasses import dataclass, field from difflib import SequenceMatcher from os.path import relpath +from typing import Sequence def _clean_path(path): @@ -28,19 +28,79 @@ def _check_ancestors(path, match, ancestors): return ml.endswith("/".join(pl.split("/")[(ancestors + 1) * -1 :])) +def _get_best_match(path: str, possibilities: list[str]) -> str: + """ + Given a `path`, return the most similar one out of `possibilities`. + """ + + best_match = (-1, "") + for possibility in possibilities: + match = SequenceMatcher(None, path, possibility).ratio() + if match > best_match[0]: + best_match = (match, possibility) + + return best_match[1] + + +@dataclass +class Node: + terminals: list[str] = field(default_factory=list) + """ + A list of paths terminating in this node. + """ + + children: dict[str, "Node"] = field(default_factory=dict) + """ + Child nodes, keyed by path component. + """ + + class Tree: - def __init__(self, *args, **kwargs): - self.instance = {} + """ + This tree maintains a list of files and allows matching on them. + + It internally organizes the list of files (called `paths`) as a tree of `Node`s. + The paths are split into path components in reverse order. + Lookups in the tree also happen in reverse path-component order. + + For example, the following list of files: + - `src/foo/mod.rs` + - `src/foo/bar/mod.rs` + + ... are organized in a tree that looks like this: + - mod.rs + - foo + - src => src/foo/mod.rs + - bar + - foo + - src => src/foo/bar/mod.rs + + Using this tree, it is possible to look up paths like: + - `C:\\Users\\ci\\repo\\src\\foo\\mod.rs` + + Matching / lookup again happens in reverse path-component order, from right to left. + In this particular case, the tree traversal would walk the tree `Node`s `mod.rs`, `foo`, `src` + before it hits the `src/foo/mod.rs` "terminal", which is the result of the lookup. + """ - # Sequence end indicator - self._END = "\\*__ends__*//" + def __init__(self, paths: Sequence[str]): + self.root = Node() + for path in paths: + self.insert(path) + + def insert(self, path: str): + # the path components, in reverse order + components = reversed(path.split("/")) - # Original value indicator - self._ORIG = "\\*__orig__*//" + node = self.root + for component in components: + component = component.lower() + node = node.children.setdefault(component, Node()) + + node.terminals.append(path) def resolve_path(self, path: str, ancestors: int | None = None) -> str | None: path = _clean_path(path) - new_path = self.lookup(path, ancestors) if new_path: @@ -53,105 +113,60 @@ def resolve_path(self, path: str, ancestors: int | None = None) -> str | None: # path was not resolved return None - def _list_to_nested_dict(self, lis): - """ - Turns a list into a nested dict - - E.g.: - ['a','b','c'] => { 'c' : { 'b' : { 'a' : {} } } } - """ - d = {} - for i in range(0, len(lis)): - d[self._END] = True if i == 0 else False - d[self._ORIG] = ["/".join(lis[i:])] - d = {lis[i].lower(): d} - return d - - def _get_best_match(self, path, possibilities): - """ - Given a path find how similar it is to all paths in possibilities - - :str: path - A path part E.g.: a/b.py => a - :list: possibilities - Collected possibilities - """ - - # Map out similarity of possible paths with the path being looked up - similarity = list( - map(lambda x: SequenceMatcher(None, path, x).ratio(), possibilities) - ) - - # Get the index, value of the most similar path - index, value = max(enumerate(similarity), key=operator.itemgetter(1)) - - return possibilities[index] - - def _drill(self, d, results): + def _drill(self, node: Node) -> list[str] | None: """ - Drill down a branch of a tree. - Collects results until a ._END is reached. - - :returns - A list containing a possible path or None + "Drill down" a straight branch of a tree, returning the first terminal. """ - root_keys = [x for x in d.keys() if x != self._ORIG and x != self._END] - - if len(root_keys) > 1 or not root_keys: - return None - - root_key = root_keys[0] - root = d.get(root_key) + while len(node.children) == 1: + node = next(iter(node.children.values())) + if len(node.terminals): + return node.terminals - if root.get(self._END): - return root.get(self._ORIG) - else: - return self._drill(root, results) + return None - def _recursive_lookup(self, d, lis, results, i=0, end=False, match=False): + def _recursive_lookup( + self, + node: Node, + components: list[str], + results: list[str], + i=0, + end=False, + match=False, + ): """ Performs a lookup in tree recursively - :dict: d - tree branch - :list: lis - list of strings to search for - :list: results - Collected hit results - :int: i - Index of lis :bool: end - Indicates if last lookup was the end of a sequence :bool: match - Indicates if filename has any match in tree - - :returns a list of hit results if path is found in the tree """ - key = None - - if i < len(lis): - key = lis[i].lower() - root = d.get(key) - if root: - if root.get(self._END): - results = root.get(self._ORIG) + child_node = ( + node.children.get(components[i].lower()) if i < len(components) else None + ) + if child_node: + is_end = len(child_node.terminals) > 0 + if is_end: + results = child_node.terminals return self._recursive_lookup( - root, lis, results, i + 1, root.get(self._END), True + child_node, components, results, i + 1, is_end, True ) else: if not end and match: - next_path = self._drill(d, results) + next_path = self._drill(node) if next_path: results.extend(next_path) return results - def lookup(self, path, ancestors=None): + def lookup(self, path: str, ancestors=None) -> str | None: """ - Lookup a path in the tree - - :str: path - The path to search for - - :returns The closest matching path in the tree if present else None + Lookup a path in the tree, returning the closest matching path + in the tree if found. """ path_hit = None - path_split = list(reversed(path.split("/"))) - results = self._recursive_lookup(self.instance, path_split, []) - + components = list(reversed(path.split("/"))) + results = self._recursive_lookup(self.root, components, []) if not results: return None - if len(results) == 1: path_hit = results[0] else: @@ -160,54 +175,5 @@ def lookup(self, path, ancestors=None): closest_length = min(path_lengths, key=lambda x: abs(x - ancestors)) path_hit = next(x for x in results if len(x) == closest_length) else: - path_hit = self._get_best_match(path, list(reversed(results))) - + path_hit = _get_best_match(path, list(reversed(results))) return path_hit - - def update(self, d, u): - """ - Update a dictionary - :dict: d - Dictionary being updated - :dict: u - Dictionary being merged - """ - for k, v in u.items(): - if isinstance(v, collections.abc.Mapping): - r = self.update(d.get(k, {}), v) - d[k] = r - else: - if k == self._END and d.get(k) is True: - pass - elif k == self._ORIG and d.get(k) and u.get(k): - if d[k] != u[k]: - d[k] = d[k] + u[k] - else: - d[k] = u[k] - return d - - def insert(self, path): - """ - Insert a path into the tree - - :str: path - The path to insert - """ - - path_split = path.split("/") - root_key = path_split[-1].lower() - root = self.instance.get(root_key) - - if not root: - u = self._list_to_nested_dict(path_split) - self.instance.update(u) - else: - u = self._list_to_nested_dict(path_split) - self.instance = self.update(self.instance, u) - - def construct_tree(self, toc): - """ - Constructs a tree - - :list: toc - The table of contents - """ - - for path in toc: - self.insert(path) diff --git a/helpers/tests/pathmap/test_pathmap.py b/helpers/tests/pathmap/test_pathmap.py index b92c93307..9fa1b76b3 100644 --- a/helpers/tests/pathmap/test_pathmap.py +++ b/helpers/tests/pathmap/test_pathmap.py @@ -13,16 +13,13 @@ def test_clean_path(): def test_resolve_path(): - expected_path = "src/components/login.js" - tree = Tree() - tree.construct_tree([expected_path]) + tree = Tree(["src/components/login.js"]) - assert tree.resolve_path("Src/components/login.js") == expected_path + assert tree.resolve_path("Src/components/login.js") == "src/components/login.js" def test_resolve_case(): - tree = Tree() - tree.construct_tree(["Aa/Bb/cc", "Aa/Bb/Cc"]) + tree = Tree(["Aa/Bb/cc", "Aa/Bb/Cc"]) assert tree.resolve_path("aa/bb/cc") == "Aa/Bb/cc" assert tree.resolve_path("aa/bb/Cc") == "Aa/Bb/Cc" @@ -44,24 +41,19 @@ def test_resolve_paths(): "a/Path With Space", ] - tree = Tree() - tree.construct_tree([path for path in after if path]) + tree = Tree([path for path in after if path]) for path, expected in zip(before, after): assert tree.resolve_path(path) == expected def test_resolve_path_when_to_short(): - toc = ["a/b/c"] - tree = Tree() - tree.construct_tree(toc) + tree = Tree(["a/b/c"]) assert tree.resolve_path("b/c", 0) == "a/b/c" assert tree.resolve_path("b/c", 1) == "a/b/c" def test_resolve_path_when_to_long(): - toc = ["a/b/c"] - tree = Tree() - tree.construct_tree(toc) + tree = Tree(["a/b/c"]) assert tree.resolve_path("z/y/b/c", 1) == "a/b/c" @@ -80,9 +72,7 @@ def test_check_ancestors(): def test_resolve_paths_with_ancestors(): - toc = ["x/y/z"] - tree = Tree() - tree.construct_tree(toc) + tree = Tree(["x/y/z"]) # default, no ancestors ============================ paths = ["z", "R/z", "R/y/z", "x/y/z", "w/x/y/z"] @@ -104,71 +94,51 @@ def test_resolve_paths_with_ancestors(): def test_resolving(): - toc = ["a/b/c", "a/r/c", "c"] - tree = Tree() - tree.construct_tree(toc) + tree = Tree(["a/b/c", "a/r/c", "c"]) assert tree.resolve_path("r/c", 1) == "a/r/c" assert tree.resolve_path("r/c") == "a/r/c" - toc = ["a/b", "a/b/c/d", "x/y"] - tree = Tree() - tree.construct_tree(toc) + tree = Tree(["a/b", "a/b/c/d", "x/y"]) assert tree.resolve_path("c/d", 1) == "a/b/c/d" def test_with_plus(): - toc = ["b+c"] - tree = Tree() - tree.construct_tree(toc) + tree = Tree(["b+c"]) assert tree.resolve_path("b+c") == "b+c" - toc = ["a/b+c"] - tree = Tree() - tree.construct_tree(toc) + tree = Tree(["a/b+c"]) assert tree.resolve_path("b+c") == "a/b+c" def test_case_sensitive_ancestors(): - toc = ["src/HeapDump/GCHeapDump.cs"] - tree = Tree() - tree.construct_tree(toc) + tree = Tree(["src/HeapDump/GCHeapDump.cs"]) path = "C:/projects/perfview/src/heapDump/GCHeapDump.cs" new_path = tree.resolve_path(path, 1) assert new_path == "src/HeapDump/GCHeapDump.cs" def test_path_should_not_resolve(): - toc = ["four/six/three.py"] - path = "four/six/seven.py" - tree = Tree() - tree.construct_tree(toc) - path = tree.resolve_path(path) - assert path is None + tree = Tree(["four/six/three.py"]) + assert tree.resolve_path("four/six/seven.py") is None def test_path_should_not_resolve_case_insensative(): - toc = ["a/b/C"] - path = "a/B/c" - tree = Tree() - tree.construct_tree(toc) - path = tree.resolve_path(path) - assert path == "a/b/C" + tree = Tree(["a/b/C"]) + assert tree.resolve_path("a/B/c") == "a/b/C" def test_ancestors_original_missing(): - toc = ["shorter.h"] - tree = Tree() - tree.construct_tree(toc) + tree = Tree(["shorter.h"]) assert tree.resolve_path("a/long/path/shorter.h", 1) == "shorter.h" def test_ancestors_absolute_path(): - toc = [ - "examples/ChurchNumerals.scala", - "tests/src/test/scala/at/logic/gapt/examples/ChurchNumerals.scala", - ] - tree = Tree() - tree.construct_tree(toc) + tree = Tree( + [ + "examples/ChurchNumerals.scala", + "tests/src/test/scala/at/logic/gapt/examples/ChurchNumerals.scala", + ] + ) path = "/home/travis/build/gapt/gapt/examples/ChurchNumerals.scala" assert tree.resolve_path(path, 1) == "examples/ChurchNumerals.scala" diff --git a/helpers/tests/pathmap/test_tree.py b/helpers/tests/pathmap/test_tree.py index f48bfb5cd..ec22b88ad 100644 --- a/helpers/tests/pathmap/test_tree.py +++ b/helpers/tests/pathmap/test_tree.py @@ -1,88 +1,43 @@ -from helpers.pathmap import Tree +from helpers.pathmap import Tree, _get_best_match -class TestTree(object): - @classmethod - def setup_class(cls): - cls.tree = Tree() +def test_get_best_match(): + path = "a/bB.py" + possibilities = ["c/bB.py", "d/Bb.py"] - def setup_method(self, method): - self.tree.instance = {} + assert _get_best_match(path, possibilities) == "c/bB.py" - def test_list_to_nested_dict(self): - keys = ["a", "b", "c"] - nested_dict = self.tree._list_to_nested_dict(keys) - leaf = nested_dict.get("c").get("b").get("a") +def test_drill(): + tree = Tree(["a/b/c"]) + assert tree._drill(tree.root) == ["a/b/c"] - assert leaf - assert leaf.get(self.tree._END) - assert leaf.get(self.tree._ORIG) == ["a/b/c"] - def test_get_best_match(self): - path = "a/bB.py" - possibilities = ["c/bB.py", "d/Bb.py"] +def test_drill_multiple_possible_paths(): + tree = Tree(["src/list.rs", "benches/list.rs"]) - match = self.tree._get_best_match(path, possibilities) + branch = tree.root.children.get("list.rs") + assert tree._drill(branch) is None - assert match == "c/bB.py" - def test_drill(self): - """ - Test drilling a branch of tree - """ +def test_recursive_lookup(): + path = "one/two/three.py" - nested = self.tree._list_to_nested_dict(["a", "b", "c"]) - assert self.tree._drill(nested, []) == ["a/b/c"] + tree = Tree([path]) - def test_drill_multiple_possible_paths(self): - toc = ["src/list.rs", "benches/list.rs"] - self.tree.construct_tree(toc) + path_split = list(reversed(path.split("/"))) + match = tree._recursive_lookup(tree.root, path_split, []) - branch = self.tree.instance.get("list.rs") - results = [] - assert self.tree._drill(branch, results) is None + assert match == ["one/two/three.py"] - def test_recursive_lookup(self): - path = "one/two/three.py" + path = "four/five/three.py" + path_split = list(reversed(path.split("/"))) + match = tree._recursive_lookup(tree.root, path_split, []) - self.tree.construct_tree([path]) + assert match == ["one/two/three.py"] - path_split = list(reversed(path.split("/"))) - match = self.tree._recursive_lookup(self.tree.instance, path_split, []) - assert match == ["one/two/three.py"] +def test_lookup(): + tree = Tree(["one/two/three.py"]) - path = "four/five/three.py" - path_split = list(reversed(path.split("/"))) - match = self.tree._recursive_lookup(self.tree.instance, path_split, []) - - assert match == ["one/two/three.py"] - - def test_lookup(self): - toc = ["one/two/three.py"] - path = "two/one/three.py" - self.tree.construct_tree(toc) - - assert self.tree.lookup(path) == "one/two/three.py" - - def test_update(self): - dict1 = self.tree._list_to_nested_dict(["a", "b", "c"]) - dict2 = self.tree._list_to_nested_dict(["e", "g", "c"]) - - updated = self.tree.update(dict1, dict2) - - assert updated.get("c").get("b").get("a") - assert updated.get("c").get("g").get("e") - - def test_insert(self): - path = "a/b/c.py" - self.tree.insert(path) - - assert self.tree.instance.get("c.py").get("b").get("a") - - def test_construct_tree(self): - toc = ["a/b/c"] - - self.tree.construct_tree(toc) - assert self.tree.instance.get("c").get("b").get("a") + assert tree.lookup("two/one/three.py") == "one/two/three.py" diff --git a/services/path_fixer/__init__.py b/services/path_fixer/__init__.py index 57acc9cdc..3c1682044 100644 --- a/services/path_fixer/__init__.py +++ b/services/path_fixer/__init__.py @@ -76,8 +76,7 @@ def __init__( self.path_matcher = UserPathIncludes(self.path_patterns) if self.toc and not should_disable_default_pathfixes: - self.tree = Tree() - self.tree.construct_tree(self.toc) + self.tree = Tree(self.toc) else: self.tree = None