Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

make TermNormIsSubStringMappingStrategy handle multi-word substrings #24

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import itertools
from abc import ABC
from collections.abc import Iterable
import re
from typing import Optional

from kazu.data import (
Expand Down Expand Up @@ -336,8 +337,9 @@ def filter_candidates(


class SynNormIsSubStringMappingStrategy(MappingStrategy):
"""For a :data:`~.CandidatesToMetrics`, see if any of their .synonym_norm are string
matches of the match_norm tokens based on whitespace tokenisation.
"""For a :data:`~.CandidatesToMetrics`, see if any of their .synonym_norm are a
substring of the match_norm tokens, with a word boundary immediately before and
after.

If exactly one element of :data:`~.CandidatesToMetrics` matches, prefer it.

Expand Down Expand Up @@ -373,7 +375,6 @@ def filter_candidates(
candidates: CandidatesToMetrics,
parser_name: str,
) -> CandidatesToMetrics:
norm_tokens = set(ent_match_norm.split(" "))

filtered_candidates_and_len = [
(
Expand All @@ -384,7 +385,7 @@ def filter_candidates(
len(candidate.synonym_norm),
)
for candidate, metrics in candidates.items()
if candidate.synonym_norm in norm_tokens
if self._regex_check_substring_words(candidate.synonym_norm, ent_match_norm)
and len(candidate.synonym_norm) >= self.min_syn_norm_len_to_consider
]
filtered_candidates_and_len.sort(key=lambda x: x[1], reverse=True)
Expand All @@ -398,6 +399,12 @@ def filter_candidates(
return {candidate: metrics}
return {}

@staticmethod
def _regex_check_substring_words(possible_substring: str, full_string: str) -> bool:
    """Check whether ``possible_substring`` occurs in ``full_string`` as whole word(s).

    The candidate is matched literally (it may span several whitespace-separated
    words) and must not be directly adjacent to a word character on either side.

    :param possible_substring: the literal text to look for.
    :param full_string: the text to search in.
    :return: ``True`` if a whole-word occurrence exists, otherwise ``False``.
        An empty ``possible_substring`` never matches.
    """
    if not possible_substring:
        # an empty candidate carries no information, so never treat it as a hit
        return False
    # re.escape: the candidate is data, not a pattern - characters such as
    # "+", "(", "." or "[" must be matched literally rather than interpreted
    # as regex syntax (which could raise re.error or match the wrong text).
    # Lookarounds are used instead of r"\b" because r"\b" requires a word
    # character on one side, so it fails when the candidate itself starts or
    # ends with punctuation (e.g. "(+)-catechin" preceded by a space).
    regexp = r"(?<!\w)" + re.escape(possible_substring) + r"(?!\w)"
    return re.search(regexp, full_string) is not None


class StrongMatchMappingStrategy(MappingStrategy):
"""
Expand Down
Loading