fea: improve error messages (#2806)

Use Levenshtein distance to improve error messages when a similar word can be found in the namespace
vyperlang · Apr 17, 2022 · 857bc86 · 857bc86
1 parent ead39ce
commit 857bc86
Show file tree

Hide file tree

Showing 7 changed files with 130 additions and 6 deletions.
diff --git a/vyper/semantics/namespace.py b/vyper/semantics/namespace.py
@@ -7,6 +7,7 @@
     StructureException,
     UndeclaredDefinition,
 )
+from vyper.semantics.validation.levenshtein_utils import get_levenshtein_error_suggestions
 
 
 class Namespace(dict):
@@ -42,7 +43,8 @@ def __setitem__(self, attr, obj):
 
     def __getitem__(self, key):
         if key not in self:
-            raise UndeclaredDefinition(f"'{key}' has not been declared")
+            suggestions_str = get_levenshtein_error_suggestions(key, self, 0.2)
+            raise UndeclaredDefinition(f"'{key}' has not been declared. {suggestions_str}")
         return super().__getitem__(key)
 
     def __enter__(self):

diff --git a/vyper/semantics/types/bases.py b/vyper/semantics/types/bases.py
@@ -18,6 +18,7 @@
     UnknownAttribute,
 )
 from vyper.semantics.types.abstract import AbstractDataType
+from vyper.semantics.validation.levenshtein_utils import get_levenshtein_error_suggestions
 
 
 class DataLocation(Enum):
@@ -585,7 +586,8 @@ def get_member(self, key: str, node: vy_ast.VyperNode) -> BaseTypeDefinition:
             type_.location = self.location
             type_.is_constant = self.is_constant
             return type_
-        raise UnknownAttribute(f"{self} has no member '{key}'", node)
+        suggestions_str = get_levenshtein_error_suggestions(key, self.members, 0.3)
+        raise UnknownAttribute(f"{self} has no member '{key}'. {suggestions_str}", node)
 
     def __repr__(self):
         return f"{self._id}"

diff --git a/vyper/semantics/types/user/struct.py b/vyper/semantics/types/user/struct.py
@@ -14,6 +14,7 @@
 from vyper.semantics.types.bases import DataLocation, MemberTypeDefinition, ValueTypeDefinition
 from vyper.semantics.types.indexable.mapping import MappingDefinition
 from vyper.semantics.types.utils import get_type_from_annotation
+from vyper.semantics.validation.levenshtein_utils import get_levenshtein_error_suggestions
 from vyper.semantics.validation.utils import validate_expected_type
 
 
@@ -97,7 +98,11 @@ def fetch_call_return(self, node: vy_ast.Call) -> StructDefinition:
         keys = list(self.members.keys())
         for i, (key, value) in enumerate(zip(node.args[0].keys, node.args[0].values)):
             if key is None or key.get("id") not in members:
-                raise UnknownAttribute("Unknown or duplicate struct member", key or value)
+                suggestions_str = get_levenshtein_error_suggestions(key.get("id"), members, 1.0)
+                raise UnknownAttribute(
+                    f"Unknown or duplicate struct member. {suggestions_str}",
+                    key or value,
+                )
             expected_key = keys[i]
             if key.id != expected_key:
                 raise InvalidAttribute(

diff --git a/vyper/semantics/types/utils.py b/vyper/semantics/types/utils.py
@@ -13,6 +13,7 @@
 from vyper.semantics.namespace import get_namespace
 from vyper.semantics.types.bases import BaseTypeDefinition, DataLocation
 from vyper.semantics.types.indexable.sequence import ArrayDefinition, TupleDefinition
+from vyper.semantics.validation.levenshtein_utils import get_levenshtein_error_suggestions
 from vyper.semantics.validation.utils import get_exact_type_from_node, get_index_value
 
 
@@ -154,7 +155,11 @@ def get_type_from_annotation(
     try:
         type_obj = namespace[type_name]
     except UndeclaredDefinition:
-        raise UnknownType(f"No builtin or user-defined type named '{type_name}'", node) from None
+        suggestions_str = get_levenshtein_error_suggestions(type_name, namespace, 0.3)
+        raise UnknownType(
+            f"No builtin or user-defined type named '{type_name}'. {suggestions_str}",
+            node,
+        ) from None
 
     if getattr(type_obj, "_as_array", False) and isinstance(node, vy_ast.Subscript):
         # TODO: handle `is_immutable` for arrays

diff --git a/vyper/semantics/validation/levenshtein_utils.py b/vyper/semantics/validation/levenshtein_utils.py
@@ -0,0 +1,104 @@
+from typing import Any, Dict
+
+
+def levenshtein_norm(source: str, target: str) -> float:
+    """Return the Levenshtein distance between `source` and `target`, normalized
+    to the range [0.0, 1.0] by the length of the longer string. 0.0 means the
+    strings are identical (including two empty strings); 1.0 is the largest
+    distance possible for strings of these lengths.
+
+    From jazzband/docopt-ng
+    https://github.com/jazzband/docopt-ng/blob/bbed40a2335686d2e14ac0e6c3188374dc4784da/docopt.py
+    """
+    longest = max(len(source), len(target))
+    # Two empty strings are identical; returning early avoids dividing by zero.
+    if longest == 0:
+        return 0.0
+    return levenshtein(source, target) / longest
+
+
+def levenshtein(source: str, target: str) -> int:
+    """Computes the Levenshtein distance
+    (https://en.wikipedia.org/wiki/Levenshtein_distance)
+    between two Unicode strings using the Wagner-Fischer algorithm
+    (https://en.wikipedia.org/wiki/Wagner%E2%80%93Fischer_algorithm).
+
+    Only single-character insertions, deletions and substitutions are
+    counted, each with a cost of 1. Transpositions of adjacent characters
+    are NOT treated as a single edit (i.e. this is not the Damerau-Levenshtein
+    distance), so the distance between "ab" and "ba" is 2.
+
+    The distance is defined recursively: the distance between two strings is
+    the cost of adjusting the last character plus the distance between the
+    prefixes that exclude it (e.g. the distance between "tester" and "tested"
+    is 1 + the distance between "teste" and "teste"). The Wagner-Fischer
+    algorithm retains this idea but eliminates redundant computations by
+    filling in a table of prefix distances iteratively.
+
+    Adapted from jazzband/docopt-ng
+    https://github.com/jazzband/docopt-ng/blob/bbed40a2335686d2e14ac0e6c3188374dc4784da/docopt.py
+
+    :param source: The string to transform
+    :param target: The string to transform `source` into
+
+    :return: The minimum number of single-character edits needed
+    """
+
+    # Only the previous row of the Wagner-Fischer table is needed to compute
+    # the next one, so keep two rows instead of the full matrix. `previous`
+    # starts as the top row: transforming the empty source prefix into each
+    # target prefix, which takes one insertion per target character
+    previous = list(range(len(target) + 1))
+
+    for i, source_char in enumerate(source, start=1):
+        # The leftmost column is the cost of transforming the first `i` source
+        # characters into the empty string, i.e. `i` deletions
+        current = [i]
+
+        for j, target_char in enumerate(target, start=1):
+            # The options for the last pair of characters are deletion,
+            # insertion, and substitution, which amount to dropping the source
+            # character, the target character, or both, and then reusing the
+            # distance for the resulting prefix combo. If the characters are
+            # the same, the substitution is free
+            del_dist = previous[j] + 1
+            ins_dist = current[j - 1] + 1
+            sub_dist = previous[j - 1] + (0 if source_char == target_char else 1)
+            current.append(min(del_dist, ins_dist, sub_dist))
+        # The finished row becomes the reference for the next source character
+        previous = current
+
+    # `previous` now holds the distances from the whole source to every target
+    # prefix, so its last entry is the desired distance
+    return previous[-1]
+
+
+def get_levenshtein_error_suggestions(key: str, namespace: Dict[str, Any], threshold: float) -> str:
+    """
+    Build a "Did you mean ...?" hint for an identifier that could not be found.
+
+    Every name in `namespace` is ranked by its normalized Levenshtein distance
+    from `key`. Up to two of the closest names are suggested, provided their
+    distance does not exceed `threshold`; if none qualify, no hint is produced.
+
+    The threshold should shrink as the namespace grows. A small namespace (e.g. struct
+    members) can use the maximum of 1.0, while a large one (e.g. types, builtin functions
+    and state variables) needs a lower value so that only relevant names are suggested.
+
+    :param key: A string of the identifier being accessed
+    :param namespace: A dictionary of the possible identifiers
+    :param threshold: A floating value between 0.0 and 1.0
+
+    :return: The hint, or an empty string if `key` is empty or `None`, or if
+        no name in `namespace` is within `threshold` of `key`.
+    """
+
+    if key in (None, ""):
+        return ""
+
+    scored = [(name, levenshtein_norm(key, name)) for name in namespace]
+    scored.sort(key=lambda pair: pair[1])
+    close = [name for name, dist in scored[:2] if dist <= threshold]
+    if len(close) == 2:
+        return f"Did you mean '{close[0]}', or maybe '{close[1]}'?"
+    return f"Did you mean '{close[0]}'?" if close else ""
diff --git a/vyper/semantics/validation/module.py b/vyper/semantics/validation/module.py
@@ -25,6 +25,7 @@
 from vyper.semantics.types.user.event import Event
 from vyper.semantics.types.utils import check_constant, get_type_from_annotation
 from vyper.semantics.validation.base import VyperNodeVisitorBase
+from vyper.semantics.validation.levenshtein_utils import get_levenshtein_error_suggestions
 from vyper.semantics.validation.utils import validate_expected_type, validate_unique_method_ids
 from vyper.typing import InterfaceDict
 
@@ -305,7 +306,8 @@ def _add_import(
     if module == "vyper.interfaces":
         interface_codes = _get_builtin_interfaces()
     if name not in interface_codes:
-        raise UndeclaredDefinition(f"Unknown interface: {name}", node)
+        suggestions_str = get_levenshtein_error_suggestions(name, _get_builtin_interfaces(), 1.0)
+        raise UndeclaredDefinition(f"Unknown interface: {name}. {suggestions_str}", node)
 
     if interface_codes[name]["type"] == "vyper":
         interface_ast = vy_ast.parse_to_ast(interface_codes[name]["code"], contract_name=name)

diff --git a/vyper/semantics/validation/utils.py b/vyper/semantics/validation/utils.py
@@ -27,6 +27,7 @@
 )
 from vyper.semantics.types.value.array_value import BytesArrayDefinition, StringDefinition
 from vyper.semantics.types.value.boolean import BoolDefinition
+from vyper.semantics.validation.levenshtein_utils import get_levenshtein_error_suggestions
 
 
 def _validate_op(node, types_list, validation_fn_name):
@@ -151,8 +152,11 @@ def types_from_Attribute(self, node):
                     f"'{name}' is not a storage variable, it should not be prepended with self",
                     node,
                 ) from None
+
+            suggestions_str = get_levenshtein_error_suggestions(name, var.members, 0.4)
             raise UndeclaredDefinition(
-                f"Storage variable '{name}' has not been declared", node
+                f"Storage variable '{name}' has not been declared. {suggestions_str}",
+                node,
             ) from None
 
     def types_from_BinOp(self, node):