Skip to content

Commit

Permalink
Merge pull request #109 from maxbachmann-forks/master
Browse files (browse the repository at this point in the history)
use rapidfuzz instead of fuzzywuzzy
  • Loading branch information
marcelotrevisani authored Apr 14, 2020
2 parents 537f6b6 + e755850 commit b37b01f
Show file tree
Hide file tree
Showing 4 changed files with 18 additions and 12 deletions.
2 changes: 1 addition & 1 deletion .isort.cfg
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
[settings]
line_length=88
known_third_party=requests,ruamel,yaml,pytest,fuzzywuzzy,opensource,colorama,progressbar,progressbar2
known_third_party=requests,ruamel,yaml,pytest,rapidfuzz,opensource,colorama,progressbar,progressbar2
multi_line_output=3
include_trailing_comma=True
force_grid_wrap=0
Expand Down
2 changes: 1 addition & 1 deletion azure-pipelines.yml
Original file line number | Diff line number | Diff line change
Expand Up @@ -7,7 +7,7 @@ pr:
variables:
- group: Codecov
- name: grayskull_deps
value: pytest pytest-azurepipelines pytest-xdist pytest-cov pytest-forked requests ruamel.yaml codecov ruamel.yaml.jinja2 "coverage<5.0" stdlib-list pip setuptools mock fuzzywuzzy git colorama progressbar2
value: pytest pytest-azurepipelines pytest-xdist pytest-cov pytest-forked requests ruamel.yaml codecov ruamel.yaml.jinja2 "coverage<5.0" stdlib-list pip setuptools mock rapidfuzz git colorama progressbar2

jobs:
- job:
Expand Down
23 changes: 15 additions & 8 deletions grayskull/license/discovery.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -12,8 +12,8 @@

import requests
from colorama import Fore
from fuzzywuzzy import process
from fuzzywuzzy.fuzz import token_set_ratio, token_sort_ratio
from rapidfuzz import process
from rapidfuzz.fuzz import token_set_ratio, token_sort_ratio
from requests import HTTPError

from grayskull.license.data import get_all_licenses # noqa
Expand Down Expand Up @@ -61,14 +61,20 @@ def match_license(name: str) -> dict:
all_licenses = get_all_licenses_from_spdx()
name = re.sub(r"\s+license\s*", "", name.strip(), flags=re.IGNORECASE)

best_matches = process.extractBests(name, _get_all_license_choice(all_licenses))
best_matches = process.extract(name, _get_all_license_choice(all_licenses))
spdx_license = best_matches[0]
if spdx_license[1] != 100:
best_matches = [l[0] for l in best_matches if not l[0].endswith("-only")]

if best_matches:
spdx_license = process.extractOne(
name, best_matches, scorer=token_sort_ratio
)
best_matches = process.extract(name, best_matches, scorer=token_set_ratio)
spdx_license = best_matches[0]
best_matches = [l[0] for l in best_matches if l[1] >= spdx_license[1]]
if len(best_matches) > 1:
spdx_license = process.extractOne(
name, best_matches, scorer=token_sort_ratio
)

log.info(
f"Best match for license {name} was {spdx_license}.\n"
f"Best matches: {best_matches}"
Expand Down Expand Up @@ -318,15 +324,16 @@ def get_license_type(path_license: str, default: Optional[str] = None) -> Option
print(f"{Fore.LIGHTBLACK_EX}Matching license file with database from Grayskull...")
all_licenses = get_all_licenses()
licenses_text = list(map(itemgetter(1), all_licenses))
best_match = process.extractBests(
best_match = process.extract(
license_content, licenses_text, scorer=token_sort_ratio
)

if default and best_match[0][1] < 51:
log.info(f"Match too low for recipe {best_match}, using the default {default}")
return default

higher_match = best_match[0]
equal_values = [val[0] for val in best_match if higher_match[1] == val[1]]
equal_values = [val[0] for val in best_match if val[1] > (higher_match[1] - 3)]
if len(equal_values) > 1:
higher_match = process.extractOne(
license_content, equal_values, scorer=token_set_ratio
Expand Down
3 changes: 1 addition & 2 deletions setup.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -25,8 +25,7 @@
"stdlib-list",
"pip",
"setuptools>=30.3.0",
"fuzzywuzzy",
"python-Levenshtein",
"rapidfuzz",
"progressbar2",
"colorama",
],
Expand Down

0 comments on commit b37b01f

Please sign in to comment.