From bb86cc0c391d0a73687384e57793a46bbc77f214 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Perceval=20Wajsb=C3=BCrt?= Date: Fri, 17 Mar 2023 14:31:57 +0100 Subject: [PATCH] feat: add ignore_space_tokens to relevant components and update docs & tests --- changelog.md | 1 + docs/pipelines/core/normalisation.md | 4 +- docs/pipelines/core/terminology.md | 10 +-- .../contextual_matcher/contextual_matcher.py | 19 ++++-- .../core/contextual_matcher/factory.py | 3 + edsnlp/pipelines/core/endlines/endlines.py | 1 + edsnlp/pipelines/core/matcher/factory.py | 47 +++++++++++-- edsnlp/pipelines/core/matcher/matcher.py | 9 ++- edsnlp/pipelines/core/terminology/factory.py | 59 ++++++++++++---- .../pipelines/core/terminology/terminology.py | 6 ++ edsnlp/pipelines/ner/cim10/factory.py | 5 ++ edsnlp/pipelines/ner/covid/factory.py | 5 ++ edsnlp/pipelines/ner/drugs/factory.py | 5 ++ edsnlp/pipelines/ner/scores/base_score.py | 68 ++++++++++--------- .../pipelines/ner/scores/charlson/factory.py | 3 + .../ner/scores/elstonellis/factory.py | 48 +++++++++++-- .../ner/scores/emergency/ccmu/factory.py | 49 ++++++++++--- .../ner/scores/emergency/gemsa/factory.py | 49 ++++++++++--- .../ner/scores/emergency/priority/factory.py | 49 ++++++++++--- edsnlp/pipelines/ner/scores/factory.py | 52 +++++++++++--- edsnlp/pipelines/ner/scores/sofa/factory.py | 49 +++++++++++-- edsnlp/pipelines/ner/scores/sofa/sofa.py | 17 +++-- edsnlp/pipelines/ner/umls/factory.py | 5 ++ tests/pipelines/core/test_terminology.py | 5 +- tests/pipelines/ner/test_score.py | 1 + 25 files changed, 443 insertions(+), 126 deletions(-) diff --git a/changelog.md b/changelog.md index f4f57abc8..0e156fb04 100644 --- a/changelog.md +++ b/changelog.md @@ -5,6 +5,7 @@ ### Added - Add `eds.spaces` (or `eds.normalizer` with `spaces=True`) to detect space tokens, and add `ignore_space_tokens` to `EDSPhraseMatcher` and `SimstringMatcher` to skip them +- Add `ignore_space_tokens` option in most components ## v0.8.0 (2023-03-09) diff --git 
a/docs/pipelines/core/normalisation.md b/docs/pipelines/core/normalisation.md index 6d22ea766..bd7c0ebfd 100644 --- a/docs/pipelines/core/normalisation.md +++ b/docs/pipelines/core/normalisation.md @@ -62,8 +62,8 @@ Moreover, every span exposes a `normalized_variant` extension getter, which comp The pipeline can be configured using the following parameters : ::: edsnlp.pipelines.core.normalizer.factory.create_component - options: - only_parameters: true + options: + only_parameters: true ## Pipelines diff --git a/docs/pipelines/core/terminology.md b/docs/pipelines/core/terminology.md index 69e3c7950..9b14d1d30 100644 --- a/docs/pipelines/core/terminology.md +++ b/docs/pipelines/core/terminology.md @@ -55,13 +55,9 @@ This snippet is complete, and should run as is. The pipeline can be configured using the following parameters : -| Parameter | Explanation | Default | -| ----------------- | ------------------------------------------------ | ----------------------- | -| `label` | Top-level label. | Required | -| `terms` | Terms patterns. Expects a dictionary. | `None` (use regex only) | -| `regex` | RegExp patterns. Expects a dictionary. | `None` (use terms only) | -| `attr` | spaCy attribute to match on (eg `NORM`, `LOWER`) | `"TEXT"` | -| `ignore_excluded` | Whether to skip excluded tokens during matching | `False` | +::: edsnlp.pipelines.core.terminology.factory.create_component + options: + only_parameters: true Patterns, be they `terms` or `regex`, are defined as dictionaries where keys become the `kb_id_` of the extracted entities. Dictionary values are a either a single expression or a list of expressions that match the concept (see [example](#usage)). 
diff --git a/edsnlp/pipelines/core/contextual_matcher/contextual_matcher.py b/edsnlp/pipelines/core/contextual_matcher/contextual_matcher.py index 3ca7b551b..a23073e07 100644 --- a/edsnlp/pipelines/core/contextual_matcher/contextual_matcher.py +++ b/edsnlp/pipelines/core/contextual_matcher/contextual_matcher.py @@ -51,6 +51,8 @@ class ContextualMatcher(BaseComponent): Attribute to match on, eg `TEXT`, `NORM`, etc. ignore_excluded : bool Whether to skip excluded tokens during matching. + ignore_space_tokens: bool + Whether to skip space tokens during matching. alignment_mode : str Overwrite alignment mode. regex_flags : Union[re.RegexFlag, int] @@ -65,12 +67,13 @@ def __init__( nlp: Language, name: str, patterns: Union[Dict[str, Any], List[Dict[str, Any]]], - assign_as_span: bool, - alignment_mode: str, - attr: str, - regex_flags: Union[re.RegexFlag, int], - ignore_excluded: bool, - include_assigned: bool, + assign_as_span: bool = False, + alignment_mode: str = "expand", + attr: str = "NORM", + regex_flags: Union[re.RegexFlag, int] = 0, + ignore_excluded: bool = False, + ignore_space_tokens: bool = False, + include_assigned: bool = False, ): self.name = name self.nlp = nlp @@ -160,6 +163,7 @@ def __init__( attr=p["regex_attr"] or self.attr, flags=p["regex_flags"] or self.regex_flags, ignore_excluded=ignore_excluded, + ignore_space_tokens=ignore_space_tokens, alignment_mode=alignment_mode, span_from_group=True, ) @@ -290,8 +294,9 @@ def assign_one(self, span: Span) -> Span: end_char=match.end(0), key=matcher["matcher"].regex[0][0], attr=matcher["matcher"].regex[0][2], - alignment_mode=matcher["matcher"].regex[0][4], + alignment_mode=matcher["matcher"].regex[0][5], ignore_excluded=matcher["matcher"].regex[0][3], + ignore_space_tokens=matcher["matcher"].regex[0][4], ), ) for (span, match) in assigned_list diff --git a/edsnlp/pipelines/core/contextual_matcher/factory.py b/edsnlp/pipelines/core/contextual_matcher/factory.py index a41107d98..9f0f1fea2 100644 --- 
a/edsnlp/pipelines/core/contextual_matcher/factory.py +++ b/edsnlp/pipelines/core/contextual_matcher/factory.py @@ -9,6 +9,7 @@ DEFAULT_CONFIG = dict( attr="NORM", ignore_excluded=False, + ignore_space_tokens=False, regex_flags=0, alignment_mode="expand", assign_as_span=False, @@ -28,6 +29,7 @@ def create_component( alignment_mode: str, attr: str, ignore_excluded: bool, + ignore_space_tokens: bool, regex_flags: Union[re.RegexFlag, int], include_assigned: bool, ): @@ -68,6 +70,7 @@ def create_component( alignment_mode, attr=attr, ignore_excluded=ignore_excluded, + ignore_space_tokens=ignore_space_tokens, regex_flags=regex_flags, include_assigned=include_assigned, ) diff --git a/edsnlp/pipelines/core/endlines/endlines.py b/edsnlp/pipelines/core/endlines/endlines.py index c51b4205a..9cb1ff1bf 100644 --- a/edsnlp/pipelines/core/endlines/endlines.py +++ b/edsnlp/pipelines/core/endlines/endlines.py @@ -49,6 +49,7 @@ def __init__( new_line=r"\n+", ), ignore_excluded=False, + ignore_space_tokens=False, **kwargs, ) diff --git a/edsnlp/pipelines/core/matcher/factory.py b/edsnlp/pipelines/core/matcher/factory.py index 88593144c..b34147219 100644 --- a/edsnlp/pipelines/core/matcher/factory.py +++ b/edsnlp/pipelines/core/matcher/factory.py @@ -11,6 +11,7 @@ regex=None, attr="TEXT", ignore_excluded=False, + ignore_space_tokens=False, term_matcher=GenericTermMatcher.exact, term_matcher_config={}, ) @@ -27,14 +28,45 @@ ) def create_component( nlp: Language, - name: str, - terms: Optional[Dict[str, Union[str, List[str]]]], - attr: Union[str, Dict[str, str]], - regex: Optional[Dict[str, Union[str, List[str]]]], - ignore_excluded: bool, - term_matcher: GenericTermMatcher, - term_matcher_config: Dict[str, Any], + name: str = "eds.matcher", + terms: Optional[Dict[str, Union[str, List[str]]]] = None, + attr: Union[str, Dict[str, str]] = None, + regex: Optional[Dict[str, Union[str, List[str]]]] = "TEXT", + ignore_excluded: bool = False, + ignore_space_tokens: bool = False, + 
term_matcher: GenericTermMatcher = GenericTermMatcher.exact, + term_matcher_config: Dict[str, Any] = {}, ): + """ + Provides a generic matcher component. + + Parameters + ---------- + nlp : Language + The spaCy object. + name: str + The name of the component. + terms : Optional[Patterns] + A dictionary of terms. + regex : Optional[Patterns] + A dictionary of regular expressions. + attr : str + The default attribute to use for matching. + Can be overridden using the `terms` and `regex` configurations. + ignore_excluded : bool + Whether to skip excluded tokens (requires an upstream + pipeline to mark excluded tokens). + ignore_space_tokens: bool + Whether to skip space tokens during matching. + + You won't be able to match on newlines if this is enabled and + the "spaces"/"newline" option of `eds.normalizer` is enabled (by default). + term_matcher: GenericTermMatcher + The matcher to use for matching phrases ? + One of (exact, simstring) + term_matcher_config: Dict[str,Any] + Parameters of the matcher class + """ assert not (terms is None and regex is None) if terms is None: @@ -48,6 +80,7 @@ def create_component( attr=attr, regex=regex, ignore_excluded=ignore_excluded, + ignore_space_tokens=ignore_space_tokens, term_matcher=term_matcher, term_matcher_config=term_matcher_config, ) diff --git a/edsnlp/pipelines/core/matcher/matcher.py b/edsnlp/pipelines/core/matcher/matcher.py index b3fb15d55..8fc552bbb 100644 --- a/edsnlp/pipelines/core/matcher/matcher.py +++ b/edsnlp/pipelines/core/matcher/matcher.py @@ -35,9 +35,11 @@ class GenericMatcher(BaseComponent): ignore_excluded : bool Whether to skip excluded tokens (requires an upstream pipeline to mark excluded tokens). - ignore_excluded : bool - Whether to skip space tokens (requires an upstream - pipeline to mark space tokens). + ignore_space_tokens: bool + Whether to skip space tokens during matching. 
+ + You won't be able to match on newlines if this is enabled and + the "spaces"/"newline" option of `eds.normalizer` is enabled (by default). term_matcher: GenericTermMatcher The matcher to use for matching phrases ? One of (exact, simstring) @@ -86,6 +88,7 @@ def __init__( self.regex_matcher = RegexMatcher( attr=attr, ignore_excluded=ignore_excluded, + ignore_space_tokens=ignore_space_tokens, ) self.phrase_matcher.build_patterns(nlp=nlp, terms=terms) diff --git a/edsnlp/pipelines/core/terminology/factory.py b/edsnlp/pipelines/core/terminology/factory.py index 3da4adbc0..28362b5a0 100644 --- a/edsnlp/pipelines/core/terminology/factory.py +++ b/edsnlp/pipelines/core/terminology/factory.py @@ -6,9 +6,10 @@ DEFAULT_CONFIG = dict( terms=None, - regex=None, attr="TEXT", + regex=None, ignore_excluded=False, + ignore_space_tokens=False, term_matcher="exact", term_matcher_config={}, ) @@ -21,29 +22,59 @@ ) def create_component( nlp: Language, - name: str, label: str, terms: Optional[Dict[str, Union[str, List[str]]]], - attr: Union[str, Dict[str, str]], - regex: Optional[Dict[str, Union[str, List[str]]]], - ignore_excluded: bool, - term_matcher: TerminologyTermMatcher, - term_matcher_config: Dict[str, Any], + name: str = "eds.terminology", + attr: Union[str, Dict[str, str]] = "TEXT", + regex: Optional[Dict[str, Union[str, List[str]]]] = None, + ignore_excluded: bool = False, + ignore_space_tokens: bool = False, + term_matcher: TerminologyTermMatcher = "exact", + term_matcher_config: Dict[str, Any] = {}, ): - assert not (terms is None and regex is None) + """ + Provides a terminology matching component. - if terms is None: - terms = dict() - if regex is None: - regex = dict() + The terminology matching component differs from the simple matcher component in that + the `regex` and `terms` keys are used as spaCy's `kb_id`. All matched entities + have the same label, defined in the top-level constructor (argument `label`). 
+ + Parameters + ---------- + nlp : Language + The spaCy object. + name: str + The name of the component. + label : str + Top-level label + terms : Optional[Patterns] + A dictionary of terms. + regex : Optional[Patterns] + A dictionary of regular expressions. + attr : str + The default attribute to use for matching. + Can be overridden using the `terms` and `regex` configurations. + ignore_excluded : bool + Whether to skip excluded tokens (requires an upstream + pipeline to mark excluded tokens). + ignore_space_tokens: bool + Whether to skip space tokens during matching. + term_matcher: TerminologyTermMatcher + The matcher to use for matching phrases ? + One of (exact, simstring) + term_matcher_config: Dict[str,Any] + Parameters of the matcher class + """ + assert not (terms is None and regex is None) return TerminologyMatcher( nlp, label=label, - terms=terms, + terms=terms or dict(), attr=attr, - regex=regex, + regex=regex or dict(), ignore_excluded=ignore_excluded, + ignore_space_tokens=ignore_space_tokens, term_matcher=term_matcher, term_matcher_config=term_matcher_config, ) diff --git a/edsnlp/pipelines/core/terminology/terminology.py b/edsnlp/pipelines/core/terminology/terminology.py index e79f09171..ac1735338 100644 --- a/edsnlp/pipelines/core/terminology/terminology.py +++ b/edsnlp/pipelines/core/terminology/terminology.py @@ -42,6 +42,8 @@ class TerminologyMatcher(BaseComponent): ignore_excluded : bool Whether to skip excluded tokens (requires an upstream pipeline to mark excluded tokens). + ignore_space_tokens: bool + Whether to skip space tokens during matching. term_matcher: TerminologyTermMatcher The matcher to use for matching phrases ? 
One of (exact, simstring) @@ -57,6 +59,7 @@ def __init__( regex: Optional[Patterns], attr: str, ignore_excluded: bool, + ignore_space_tokens: bool = False, term_matcher: TerminologyTermMatcher = TerminologyTermMatcher.exact, term_matcher_config=None, ): @@ -72,6 +75,7 @@ def __init__( self.nlp.vocab, attr=attr, ignore_excluded=ignore_excluded, + ignore_space_tokens=ignore_space_tokens, **(term_matcher_config or {}), ) elif term_matcher == TerminologyTermMatcher.simstring: @@ -79,6 +83,7 @@ def __init__( vocab=self.nlp.vocab, attr=attr, ignore_excluded=ignore_excluded, + ignore_space_tokens=ignore_space_tokens, **(term_matcher_config or {}), ) else: @@ -90,6 +95,7 @@ def __init__( self.regex_matcher = RegexMatcher( attr=attr, ignore_excluded=ignore_excluded, + ignore_space_tokens=ignore_space_tokens, ) self.phrase_matcher.build_patterns(nlp=nlp, terms=terms, progress=True) diff --git a/edsnlp/pipelines/ner/cim10/factory.py b/edsnlp/pipelines/ner/cim10/factory.py index 38011e92d..985ddef0c 100644 --- a/edsnlp/pipelines/ner/cim10/factory.py +++ b/edsnlp/pipelines/ner/cim10/factory.py @@ -9,6 +9,7 @@ DEFAULT_CONFIG = dict( attr="NORM", ignore_excluded=False, + ignore_space_tokens=False, term_matcher=TerminologyTermMatcher.exact, term_matcher_config={}, ) @@ -22,6 +23,7 @@ def create_component( name: str = "eds.cim10", attr: Union[str, Dict[str, str]] = "NORM", ignore_excluded: bool = False, + ignore_space_tokens: bool = False, term_matcher: TerminologyTermMatcher = TerminologyTermMatcher.exact, term_matcher_config: Dict[str, Any] = {}, ): @@ -39,6 +41,8 @@ def create_component( Attribute to match on, eg `TEXT`, `NORM`, etc. ignore_excluded: bool Whether to skip excluded tokens during matching. + ignore_space_tokens: bool + Whether to skip space tokens during matching. 
term_matcher: TerminologyTermMatcher The term matcher to use, either `TerminologyTermMatcher.exact` or `TerminologyTermMatcher.simstring` @@ -57,6 +61,7 @@ def create_component( terms=patterns.get_patterns(), attr=attr, ignore_excluded=ignore_excluded, + ignore_space_tokens=ignore_space_tokens, term_matcher=term_matcher, term_matcher_config=term_matcher_config, ) diff --git a/edsnlp/pipelines/ner/covid/factory.py b/edsnlp/pipelines/ner/covid/factory.py index f4f8a4650..b59861232 100644 --- a/edsnlp/pipelines/ner/covid/factory.py +++ b/edsnlp/pipelines/ner/covid/factory.py @@ -9,6 +9,7 @@ DEFAULT_CONFIG = dict( attr="LOWER", ignore_excluded=False, + ignore_space_tokens=False, ) @@ -22,6 +23,7 @@ def create_component( name: str = "eds.covid", attr: Union[str, Dict[str, str]] = "LOWER", ignore_excluded: bool = False, + ignore_space_tokens: bool = False, ): """ Create a factory that returns new GenericMatcher with patterns for covid @@ -36,6 +38,8 @@ def create_component( Attribute to match on, eg `TEXT`, `NORM`, etc. ignore_excluded: bool Whether to skip excluded tokens during matching. + ignore_space_tokens: bool + Whether to skip space tokens during matching. 
Returns ------- @@ -48,4 +52,5 @@ def create_component( regex=dict(covid=patterns.pattern), attr=attr, ignore_excluded=ignore_excluded, + ignore_space_tokens=ignore_space_tokens, ) diff --git a/edsnlp/pipelines/ner/drugs/factory.py b/edsnlp/pipelines/ner/drugs/factory.py index 77e80a53f..f55656400 100644 --- a/edsnlp/pipelines/ner/drugs/factory.py +++ b/edsnlp/pipelines/ner/drugs/factory.py @@ -9,6 +9,7 @@ DEFAULT_CONFIG = dict( attr="NORM", ignore_excluded=False, + ignore_space_tokens=False, term_matcher=TerminologyTermMatcher.exact, term_matcher_config={}, ) @@ -24,6 +25,7 @@ def create_component( name: str = "eds.drugs", attr: str = "NORM", ignore_excluded: bool = False, + ignore_space_tokens: bool = False, term_matcher: TerminologyTermMatcher = TerminologyTermMatcher.exact, term_matcher_config: Dict[str, Any] = {}, ): @@ -42,6 +44,8 @@ def create_component( Attribute to match on, eg `TEXT`, `NORM`, etc. ignore_excluded: bool Whether to skip excluded tokens during matching. + ignore_space_tokens: bool + Whether to skip space tokens during matching. term_matcher: TerminologyTermMatcher The term matcher to use, either `TerminologyTermMatcher.exact` or `TerminologyTermMatcher.simstring` @@ -59,6 +63,7 @@ def create_component( regex=dict(), attr=attr, ignore_excluded=ignore_excluded, + ignore_space_tokens=ignore_space_tokens, term_matcher=term_matcher, term_matcher_config=term_matcher_config, ) diff --git a/edsnlp/pipelines/ner/scores/base_score.py b/edsnlp/pipelines/ner/scores/base_score.py index d1d724805..451dfc6c9 100644 --- a/edsnlp/pipelines/ner/scores/base_score.py +++ b/edsnlp/pipelines/ner/scores/base_score.py @@ -10,30 +10,7 @@ class Score(ContextualMatcher): - """ - Matcher component to extract a numeric score - - Parameters - ---------- - nlp : Language - The spaCy object. 
- score_name : str - The name of the extracted score - regex : List[str] - A list of regexes to identify the score - attr : str - Wether to match on the text ('TEXT') or on the normalized text ('NORM') - value_extract : str - Regex with capturing group to get the score value - score_normalization : Callable[[Union[str,None]], Any] - Function that takes the "raw" value extracted from the `value_extract` regex, - and should return - - None if no score could be extracted - - The desired score value else - window : int - Number of token to include after the score's mention to find the - score's value - """ + """Matcher component to extract a numeric score""" def __init__( self, @@ -45,16 +22,44 @@ def __init__( score_normalization: Union[str, Callable[[Union[str, None]], Any]], window: int, ignore_excluded: bool, + ignore_space_tokens: bool, flags: Union[re.RegexFlag, int], ): + """ + Parameters + ---------- + nlp : Language + The spaCy object. + score_name : str + The name of the extracted score + regex : List[str] + A list of regexes to identify the score + attr : str + Whether to match on the text ('TEXT') or on the normalized text ('NORM') + value_extract : str + Regex with capturing group to get the score value + score_normalization : Callable[[Union[str,None]], Any] + Function that takes the "raw" value extracted from the `value_extract` + regex and should return: + + - None if no score could be extracted + - The desired score value else + window : int + Number of token to include after the score's mention to find the + score's value + ignore_excluded : bool + Whether to ignore excluded spans when matching + ignore_space_tokens : bool + Whether to ignore space tokens when matching + flags : Union[re.RegexFlag, int] + Regex flags to use when matching + """ if isinstance(value_extract, str): - value_extract = [ - dict( - name="value", - regex=value_extract, - window=window, - ) - ] + value_extract = dict( + name="value", + regex=value_extract, + window=window, + ) 
if isinstance(value_extract, dict): value_extract = [value_extract] @@ -83,6 +88,7 @@ def __init__( assign_as_span=False, alignment_mode="expand", ignore_excluded=ignore_excluded, + ignore_space_tokens=ignore_space_tokens, attr=attr, regex_flags=flags, include_assigned=False, diff --git a/edsnlp/pipelines/ner/scores/charlson/factory.py b/edsnlp/pipelines/ner/scores/charlson/factory.py index df37e998f..5ab820eec 100644 --- a/edsnlp/pipelines/ner/scores/charlson/factory.py +++ b/edsnlp/pipelines/ner/scores/charlson/factory.py @@ -14,6 +14,7 @@ attr="NORM", window=7, ignore_excluded=False, + ignore_space_tokens=False, flags=0, ) @@ -38,6 +39,7 @@ def create_component( attr: str, window: int, ignore_excluded: bool, + ignore_space_tokens: bool, flags: Union[re.RegexFlag, int], ): return Score( @@ -49,5 +51,6 @@ def create_component( attr=attr, window=window, ignore_excluded=ignore_excluded, + ignore_space_tokens=ignore_space_tokens, flags=flags, ) diff --git a/edsnlp/pipelines/ner/scores/elstonellis/factory.py b/edsnlp/pipelines/ner/scores/elstonellis/factory.py index 289480894..448afc099 100644 --- a/edsnlp/pipelines/ner/scores/elstonellis/factory.py +++ b/edsnlp/pipelines/ner/scores/elstonellis/factory.py @@ -13,6 +13,7 @@ attr="TEXT", window=20, ignore_excluded=False, + ignore_space_tokens=False, flags=0, ) @@ -25,14 +26,46 @@ def create_component( nlp: Language, name: str, - regex: List[str], - value_extract: str, - score_normalization: Union[str, Callable[[Union[str, None]], Any]], - attr: str, - window: int, - ignore_excluded: bool, - flags: Union[re.RegexFlag, int], + regex: List[str] = patterns.regex, + value_extract: str = patterns.value_extract, + score_normalization: Union[ + str, Callable[[Union[str, None]], Any] + ] = patterns.score_normalization_str, + attr: str = "TEXT", + window: int = 20, + ignore_excluded: bool = False, + ignore_space_tokens: bool = False, + flags: Union[re.RegexFlag, int] = 0, ): + """ + Matcher for the Elston-Ellis score. 
+ + Parameters + ---------- + nlp: Language + The spaCy Language object + name: str + The name of the component + regex: List[str] + The regex patterns to match + value_extract: str + The regex pattern to extract the value from the matched text + score_normalization: Union[str, Callable[[Union[str, None]], Any]] + The normalization function to apply to the extracted value + attr: str + The token attribute to match on (e.g. "TEXT" or "NORM") + window: int + The window size to search for the regex pattern + ignore_excluded: bool + Whether to ignore excluded tokens + ignore_space_tokens: bool + Whether to ignore space tokens + flags: Union[re.RegexFlag, int] + The regex flags to use + Returns + ------- + Score + """ return Score( nlp, score_name=name, @@ -42,5 +75,6 @@ def create_component( attr=attr, window=window, ignore_excluded=ignore_excluded, + ignore_space_tokens=ignore_space_tokens, flags=flags, ) diff --git a/edsnlp/pipelines/ner/scores/emergency/ccmu/factory.py b/edsnlp/pipelines/ner/scores/emergency/ccmu/factory.py index 22917c173..e1f181562 100644 --- a/edsnlp/pipelines/ner/scores/emergency/ccmu/factory.py +++ b/edsnlp/pipelines/ner/scores/emergency/ccmu/factory.py @@ -31,15 +31,47 @@ ) def create_component( nlp: Language, - name: str, - regex: List[str], - value_extract: str, - score_normalization: Union[str, Callable[[Union[str, None]], Any]], - attr: str, - window: int, - ignore_excluded: bool, - flags: Union[re.RegexFlag, int], + name: str = "eds.emergency.ccmu", + regex: List[str] = patterns.regex, + value_extract: str = patterns.value_extract, + score_normalization: Union[ + str, Callable[[Union[str, None]], Any] + ] = patterns.score_normalization_str, + attr: str = "NORM", + window: int = 20, + ignore_excluded: bool = False, + ignore_space_tokens: bool = False, + flags: Union[re.RegexFlag, int] = 0, ): + """ + Matcher for the Emergency CCMU score. 
+ + Parameters + ---------- + nlp: Language + The spaCy Language object + name: str + The name of the component + regex: List[str] + The regex patterns to match + value_extract: str + The regex pattern to extract the value from the matched text + score_normalization: Union[str, Callable[[Union[str, None]], Any]] + The normalization function to apply to the extracted value + attr: str + The token attribute to match on (e.g. "TEXT" or "NORM") + window: int + The window size to search for the regex pattern + ignore_excluded: bool + Whether to ignore excluded tokens + ignore_space_tokens: bool + Whether to ignore space tokens + flags: Union[re.RegexFlag, int] + The regex flags to use + Returns + ------- + Score + """ return Score( nlp, score_name=name, @@ -49,5 +81,6 @@ def create_component( attr=attr, window=window, ignore_excluded=ignore_excluded, + ignore_space_tokens=ignore_space_tokens, flags=flags, ) diff --git a/edsnlp/pipelines/ner/scores/emergency/gemsa/factory.py b/edsnlp/pipelines/ner/scores/emergency/gemsa/factory.py index a6bf8ba38..f8e263100 100644 --- a/edsnlp/pipelines/ner/scores/emergency/gemsa/factory.py +++ b/edsnlp/pipelines/ner/scores/emergency/gemsa/factory.py @@ -31,15 +31,47 @@ ) def create_component( nlp: Language, - name: str, - regex: List[str], - value_extract: str, - score_normalization: Union[str, Callable[[Union[str, None]], Any]], - attr: str, - window: int, - ignore_excluded: bool, - flags: Union[re.RegexFlag, int], + name: str = "eds.emergency.gemsa", + regex: List[str] = patterns.regex, + value_extract: str = patterns.value_extract, + score_normalization: Union[ + str, Callable[[Union[str, None]], Any] + ] = patterns.score_normalization_str, + attr: str = "NORM", + window: int = 20, + ignore_excluded: bool = False, + ignore_space_tokens: bool = False, + flags: Union[re.RegexFlag, int] = 0, ): + """ + Matcher for the Emergency GEMSA score. 
+ + Parameters + ---------- + nlp: Language + The spaCy Language object + name: str + The name of the component + regex: List[str] + The regex patterns to match + value_extract: str + The regex pattern to extract the value from the matched text + score_normalization: Union[str, Callable[[Union[str, None]], Any]] + The normalization function to apply to the extracted value + attr: str + The token attribute to match on (e.g. "TEXT" or "NORM") + window: int + The window size to search for the regex pattern + ignore_excluded: bool + Whether to ignore excluded tokens + ignore_space_tokens: bool + Whether to ignore space tokens + flags: Union[re.RegexFlag, int] + The regex flags to use + Returns + ------- + Score + """ return Score( nlp, score_name=name, @@ -49,5 +81,6 @@ def create_component( attr=attr, window=window, ignore_excluded=ignore_excluded, + ignore_space_tokens=ignore_space_tokens, flags=flags, ) diff --git a/edsnlp/pipelines/ner/scores/emergency/priority/factory.py b/edsnlp/pipelines/ner/scores/emergency/priority/factory.py index d49252c71..500261e63 100644 --- a/edsnlp/pipelines/ner/scores/emergency/priority/factory.py +++ b/edsnlp/pipelines/ner/scores/emergency/priority/factory.py @@ -31,15 +31,47 @@ ) def create_component( nlp: Language, - name: str, - regex: List[str], - value_extract: str, - score_normalization: Union[str, Callable[[Union[str, None]], Any]], - attr: str, - window: int, - ignore_excluded: bool, - flags: Union[re.RegexFlag, int], + name: str = "emergency.priority", + regex: List[str] = patterns.regex, + value_extract: str = patterns.value_extract, + score_normalization: Union[ + str, Callable[[Union[str, None]], Any] + ] = patterns.score_normalization_str, + attr: str = "NORM", + window: int = 7, + ignore_excluded: bool = False, + ignore_space_tokens: bool = False, + flags: Union[re.RegexFlag, int] = 0, ): + """ + Matcher for the Emergency Priority score. 
+ + Parameters + ---------- + nlp: Language + The spaCy Language object + name: str + The name of the component + regex: List[str] + The regex patterns to match + value_extract: str + The regex pattern to extract the value from the matched text + score_normalization: Union[str, Callable[[Union[str, None]], Any]] + The normalization function to apply to the extracted value + attr: str + The token attribute to match on (e.g. "TEXT" or "NORM") + window: int + The window size to search for the regex pattern + ignore_excluded: bool + Whether to ignore excluded tokens + ignore_space_tokens: bool + Whether to ignore space tokens + flags: Union[re.RegexFlag, int] + The regex flags to use + Returns + ------- + Score + """ return Score( nlp, score_name=name, @@ -49,5 +81,6 @@ def create_component( attr=attr, window=window, ignore_excluded=ignore_excluded, + ignore_space_tokens=ignore_space_tokens, flags=flags, ) diff --git a/edsnlp/pipelines/ner/scores/factory.py b/edsnlp/pipelines/ner/scores/factory.py index fe637fcd5..23b7fc484 100644 --- a/edsnlp/pipelines/ner/scores/factory.py +++ b/edsnlp/pipelines/ner/scores/factory.py @@ -10,6 +10,7 @@ attr="NORM", window=7, ignore_excluded=False, + ignore_space_tokens=False, flags=0, ) @@ -27,16 +28,48 @@ ) def create_component( nlp: Language, - name: str, - score_name: str, - regex: List[str], - value_extract: str, - score_normalization: Union[str, Callable[[Union[str, None]], Any]], - attr: str, - window: int, - flags: Union[re.RegexFlag, int], - ignore_excluded: bool, + name: str = "eds.score", + score_name: str = None, + regex: List[str] = None, + value_extract: str = None, + score_normalization: Union[str, Callable[[Union[str, None]], Any]] = None, + attr: str = "NORM", + window: int = 7, + flags: Union[re.RegexFlag, int] = 0, + ignore_excluded: bool = False, + ignore_space_tokens: bool = False, ): + """ + Parameters + ---------- + nlp : Language + The spaCy object. + name : str + The name of the component. 
+ score_name : str + The name of the extracted score + regex : List[str] + A list of regexes to identify the score + attr : str + Whether to match on the text ('TEXT') or on the normalized text ('NORM') + value_extract : str + Regex with capturing group to get the score value + score_normalization : Callable[[Union[str,None]], Any] + Function that takes the "raw" value extracted from the `value_extract` regex, + and should return: + + - None if no score could be extracted + - The desired score value else + window : int + Number of token to include after the score's mention to find the + score's value + ignore_excluded : bool + Whether to ignore excluded spans when matching + ignore_space_tokens : bool + Whether to ignore space tokens when matching + flags : Union[re.RegexFlag, int] + Regex flags to use when matching + """ return Score( nlp, score_name=score_name, @@ -47,4 +80,5 @@ def create_component( flags=flags, window=window, ignore_excluded=ignore_excluded, + ignore_space_tokens=ignore_space_tokens, ) diff --git a/edsnlp/pipelines/ner/scores/sofa/factory.py b/edsnlp/pipelines/ner/scores/sofa/factory.py index e565262ac..8c9b1fb97 100644 --- a/edsnlp/pipelines/ner/scores/sofa/factory.py +++ b/edsnlp/pipelines/ner/scores/sofa/factory.py @@ -13,6 +13,7 @@ attr="NORM", window=10, ignore_excluded=False, + ignore_space_tokens=False, flags=0, ) @@ -31,14 +32,47 @@ def create_component( nlp: Language, name: str, - regex: List[str], - value_extract: List[Dict[str, str]], - score_normalization: Union[str, Callable[[Union[str, None]], Any]], - attr: str, - window: int, - ignore_excluded: bool, - flags: Union[re.RegexFlag, int], + regex: List[str] = patterns.regex, + value_extract: List[Dict[str, str]] = patterns.value_extract, + score_normalization: Union[ + str, Callable[[Union[str, None]], Any] + ] = patterns.score_normalization_str, + attr: str = "NORM", + window: int = 10, + ignore_excluded: bool = False, + ignore_space_tokens: bool = False, + flags: 
Union[re.RegexFlag, int] = 0, ): + """ + Matcher component to extract the SOFA score + + Parameters + ---------- + nlp : Language + The spaCy object. + name : str + The name of the extracted score + regex : List[str] + A list of regexes to identify the SOFA score + attr : str + Whether to match on the text ('TEXT') or on the normalized text ('CUSTOM_NORM') + value_extract : Dict[str, str] + Regex to extract the score value + score_normalization : Callable[[Union[str,None]], Any] + Function that takes the "raw" value extracted from the `value_extract` regex, + and should return + - None if no score could be extracted + - The desired score value else + window : int + Number of token to include after the score's mention to find the + score's value + ignore_excluded : bool + Whether to ignore excluded spans + ignore_space_tokens : bool + Whether to ignore space tokens + flags : Union[re.RegexFlag, int] + Flags to pass to the regex + """ return Sofa( nlp, score_name=name, @@ -48,5 +82,6 @@ def create_component( attr=attr, window=window, ignore_excluded=ignore_excluded, + ignore_space_tokens=ignore_space_tokens, flags=flags, ) diff --git a/edsnlp/pipelines/ner/scores/sofa/sofa.py b/edsnlp/pipelines/ner/scores/sofa/sofa.py index d3d5e65f3..0ea49962e 100644 --- a/edsnlp/pipelines/ner/scores/sofa/sofa.py +++ b/edsnlp/pipelines/ner/scores/sofa/sofa.py @@ -15,16 +15,11 @@ class Sofa(Score): ---------- nlp : Language The spaCy object. - score_name : str - The name of the extracted score regex : List[str] A list of regexes to identify the SOFA score attr : str - Wether to match on the text ('TEXT') or on the normalized text ('CUSTOM_NORM') - method_regex : str - Regex with capturing group to get the score extraction method - (e.g. 
"à l'admission", "à 24H", "Maximum") - value_regex : str + Whether to match on the text ('TEXT') or on the normalized text ('CUSTOM_NORM') + value_extract : Dict[str, str] Regex to extract the score value score_normalization : Callable[[Union[str,None]], Any] Function that takes the "raw" value extracted from the `value_extract` regex, @@ -34,6 +29,12 @@ class Sofa(Score): window : int Number of token to include after the score's mention to find the score's value + ignore_excluded : bool + Whether to ignore excluded spans + ignore_space_tokens : bool + Whether to ignore space tokens + flags : Union[re.RegexFlag, int] + Flags to pass to the regex """ def __init__( @@ -47,6 +48,7 @@ def __init__( window: int, flags: Union[re.RegexFlag, int], ignore_excluded: bool, + ignore_space_tokens: bool, ): super().__init__( @@ -59,6 +61,7 @@ def __init__( window=window, flags=flags, ignore_excluded=ignore_excluded, + ignore_space_tokens=ignore_space_tokens, ) self.set_extensions() diff --git a/edsnlp/pipelines/ner/umls/factory.py b/edsnlp/pipelines/ner/umls/factory.py index b32b38b63..11264e259 100644 --- a/edsnlp/pipelines/ner/umls/factory.py +++ b/edsnlp/pipelines/ner/umls/factory.py @@ -9,6 +9,7 @@ DEFAULT_CONFIG = dict( attr="NORM", ignore_excluded=False, + ignore_space_tokens=False, term_matcher=TerminologyTermMatcher.exact, term_matcher_config={}, pattern_config=dict( @@ -26,6 +27,7 @@ def create_component( name: str = "eds.umls", attr: Union[str, Dict[str, str]] = "NORM", ignore_excluded: bool = False, + ignore_space_tokens: bool = False, term_matcher: TerminologyTermMatcher = TerminologyTermMatcher.exact, term_matcher_config: Dict[str, Any] = {}, pattern_config: Dict[str, Any] = dict( @@ -47,6 +49,8 @@ def create_component( Attribute to match on, eg `TEXT`, `NORM`, etc. ignore_excluded: bool Whether to skip excluded tokens during matching. + ignore_space_tokens: bool + Whether to skip space tokens during matching. 
term_matcher: TerminologyTermMatcher The term matcher to use, either `TerminologyTermMatcher.exact` or `TerminologyTermMatcher.simstring` @@ -67,6 +71,7 @@ def create_component( terms=patterns.get_patterns(pattern_config), attr=attr, ignore_excluded=ignore_excluded, + ignore_space_tokens=ignore_space_tokens, term_matcher=term_matcher, term_matcher_config=term_matcher_config, ) diff --git a/tests/pipelines/core/test_terminology.py b/tests/pipelines/core/test_terminology.py index 4e7429f5c..b385f9cc3 100644 --- a/tests/pipelines/core/test_terminology.py +++ b/tests/pipelines/core/test_terminology.py @@ -1,3 +1,4 @@ +import pytest from spacy.language import Language from edsnlp.utils.examples import parse_example @@ -5,13 +6,15 @@ example = "1g de doliprane" -def test_terminology(blank_nlp: Language): +@pytest.mark.parametrize("term_matcher", ["exact", "simstring"]) +def test_terminology(blank_nlp: Language, term_matcher: str): blank_nlp.add_pipe( "eds.terminology", config=dict( label="drugs", terms=dict(paracetamol=["doliprane", "tylenol", "paracetamol"]), attr="NORM", + term_matcher=term_matcher, ), ) diff --git a/tests/pipelines/ner/test_score.py b/tests/pipelines/ner/test_score.py index 36db1a53c..cad10cfcf 100644 --- a/tests/pipelines/ner/test_score.py +++ b/tests/pipelines/ner/test_score.py @@ -51,6 +51,7 @@ def testscore_normalization(raw_score: str): regex=[r"test+score"], attr="NORM", ignore_excluded=True, + ignore_space_tokens=False, value_extract=r"(\d+)", score_normalization=testscore_normalization, window=4,