Merge pull request #109 from maxbachmann-forks/master

use rapidfuzz instead of fuzzywuzzy
conda · Apr 14, 2020 · b37b01f · b37b01f
2 parents 537f6b6 + e755850
commit b37b01f
Show file tree

Hide file tree

Showing 4 changed files with 18 additions and 12 deletions.
diff --git a/.isort.cfg b/.isort.cfg
@@ -1,6 +1,6 @@
 [settings]
 line_length=88
-known_third_party=requests,ruamel,yaml,pytest,fuzzywuzzy,opensource,colorama,progressbar,progressbar2
+known_third_party=requests,ruamel,yaml,pytest,rapidfuzz,opensource,colorama,progressbar,progressbar2
 multi_line_output=3
 include_trailing_comma=True
 force_grid_wrap=0

diff --git a/azure-pipelines.yml b/azure-pipelines.yml
@@ -7,7 +7,7 @@ pr:
 variables:
 - group: Codecov
 - name: grayskull_deps
-  value: pytest pytest-azurepipelines pytest-xdist pytest-cov pytest-forked requests ruamel.yaml codecov ruamel.yaml.jinja2 "coverage<5.0" stdlib-list pip setuptools mock fuzzywuzzy git colorama progressbar2
+  value: pytest pytest-azurepipelines pytest-xdist pytest-cov pytest-forked requests ruamel.yaml codecov ruamel.yaml.jinja2 "coverage<5.0" stdlib-list pip setuptools mock rapidfuzz git colorama progressbar2
 
 jobs:
   - job:

diff --git a/grayskull/license/discovery.py b/grayskull/license/discovery.py
@@ -12,8 +12,8 @@
 
 import requests
 from colorama import Fore
-from fuzzywuzzy import process
-from fuzzywuzzy.fuzz import token_set_ratio, token_sort_ratio
+from rapidfuzz import process
+from rapidfuzz.fuzz import token_set_ratio, token_sort_ratio
 from requests import HTTPError
 
 from grayskull.license.data import get_all_licenses  # noqa
@@ -61,14 +61,20 @@ def match_license(name: str) -> dict:
     all_licenses = get_all_licenses_from_spdx()
     name = re.sub(r"\s+license\s*", "", name.strip(), flags=re.IGNORECASE)
 
-    best_matches = process.extractBests(name, _get_all_license_choice(all_licenses))
+    best_matches = process.extract(name, _get_all_license_choice(all_licenses))
     spdx_license = best_matches[0]
     if spdx_license[1] != 100:
         best_matches = [l[0] for l in best_matches if not l[0].endswith("-only")]
+
         if best_matches:
-            spdx_license = process.extractOne(
-                name, best_matches, scorer=token_sort_ratio
-            )
+            best_matches = process.extract(name, best_matches, scorer=token_set_ratio)
+            spdx_license = best_matches[0]
+            best_matches = [l[0] for l in best_matches if l[1] >= spdx_license[1]]
+            if len(best_matches) > 1:
+                spdx_license = process.extractOne(
+                    name, best_matches, scorer=token_sort_ratio
+                )
+
     log.info(
         f"Best match for license {name} was {spdx_license}.\n"
         f"Best matches: {best_matches}"
@@ -318,15 +324,16 @@ def get_license_type(path_license: str, default: Optional[str] = None) -> Option
     print(f"{Fore.LIGHTBLACK_EX}Matching license file with database from Grayskull...")
     all_licenses = get_all_licenses()
     licenses_text = list(map(itemgetter(1), all_licenses))
-    best_match = process.extractBests(
+    best_match = process.extract(
         license_content, licenses_text, scorer=token_sort_ratio
     )
 
     if default and best_match[0][1] < 51:
         log.info(f"Match too low for recipe {best_match}, using the default {default}")
         return default
+
     higher_match = best_match[0]
-    equal_values = [val[0] for val in best_match if higher_match[1] == val[1]]
+    equal_values = [val[0] for val in best_match if val[1] > (higher_match[1] - 3)]
     if len(equal_values) > 1:
         higher_match = process.extractOne(
             license_content, equal_values, scorer=token_set_ratio

diff --git a/setup.py b/setup.py
@@ -25,8 +25,7 @@
         "stdlib-list",
         "pip",
         "setuptools>=30.3.0",
-        "fuzzywuzzy",
-        "python-Levenshtein",
+        "rapidfuzz",
         "progressbar2",
         "colorama",
     ],