
Commit

feat: add ignore_space_tokens to relevant components and update docs & tests
percevalw committed Mar 20, 2023
1 parent 1186444 commit bb86cc0
Showing 25 changed files with 443 additions and 126 deletions.
1 change: 1 addition & 0 deletions changelog.md
@@ -5,6 +5,7 @@
### Added

- Add `eds.spaces` (or `eds.normalizer` with `spaces=True`) to detect space tokens, and add `ignore_space_tokens` to `EDSPhraseMatcher` and `SimstringMatcher` to skip them
- Add `ignore_space_tokens` option in most components (see the sketch below)
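For instance, the two additions can be combined along these lines (a minimal, hypothetical sketch; the component names and config keys are the ones touched by this commit, the example text is made up):

```python
import spacy

nlp = spacy.blank("eds")
# Tag spaces and newlines as space tokens
nlp.add_pipe("eds.normalizer", config={"spaces": True})
# The matcher can now skip those tokens while matching
nlp.add_pipe(
    "eds.matcher",
    config={
        "terms": {"diabetes": ["diabète", "diabète de type 2"]},
        "attr": "NORM",
        "ignore_space_tokens": True,
    },
)

doc = nlp("Antécédent de diabète  de\ntype 2.")
print([(ent.text, ent.label_) for ent in doc.ents])
```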

## v0.8.0 (2023-03-09)

4 changes: 2 additions & 2 deletions docs/pipelines/core/normalisation.md
@@ -62,8 +62,8 @@ Moreover, every span exposes a `normalized_variant` extension getter, which comp
The pipeline can be configured using the following parameters:

::: edsnlp.pipelines.core.normalizer.factory.create_component
options:
only_parameters: true

## Pipelines

10 changes: 3 additions & 7 deletions docs/pipelines/core/terminology.md
@@ -55,13 +55,9 @@ This snippet is complete, and should run as is.

The pipeline can be configured using the following parameters:

::: edsnlp.pipelines.core.terminology.factory.create_component
options:
only_parameters: true

Patterns, be they `terms` or `regex`, are defined as dictionaries where keys become the `kb_id_` of the extracted entities.
Dictionary values are either a single expression or a list of expressions that match the concept (see [example](#usage)).
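For example, a sketch of such a dictionary (hypothetical snippet; the drug names and expressions are purely illustrative):

```python
import spacy

nlp = spacy.blank("eds")
nlp.add_pipe("eds.normalizer")
nlp.add_pipe(
    "eds.terminology",
    config={
        "label": "drug",
        "terms": {
            # keys become the `kb_id_` of the extracted entities
            "paracetamol": ["paracétamol", "doliprane"],
            # a single expression is also accepted
            "aspirin": "aspirine",
        },
        "attr": "NORM",
    },
)

doc = nlp("Prescription de doliprane et d'aspirine.")
print([(ent.text, ent.label_, ent.kb_id_) for ent in doc.ents])
```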
19 changes: 12 additions & 7 deletions edsnlp/pipelines/core/contextual_matcher/contextual_matcher.py
@@ -51,6 +51,8 @@ class ContextualMatcher(BaseComponent):
Attribute to match on, eg `TEXT`, `NORM`, etc.
ignore_excluded : bool
Whether to skip excluded tokens during matching.
ignore_space_tokens : bool
Whether to skip space tokens during matching.
alignment_mode : str
Overwrite alignment mode.
regex_flags : Union[re.RegexFlag, int]
@@ -65,12 +67,13 @@ def __init__(
nlp: Language,
name: str,
patterns: Union[Dict[str, Any], List[Dict[str, Any]]],
assign_as_span: bool = False,
alignment_mode: str = "expand",
attr: str = "NORM",
regex_flags: Union[re.RegexFlag, int] = 0,
ignore_excluded: bool = False,
ignore_space_tokens: bool = False,
include_assigned: bool = False,
):
self.name = name
self.nlp = nlp
@@ -160,6 +163,7 @@ def __init__(
attr=p["regex_attr"] or self.attr,
flags=p["regex_flags"] or self.regex_flags,
ignore_excluded=ignore_excluded,
ignore_space_tokens=ignore_space_tokens,
alignment_mode=alignment_mode,
span_from_group=True,
)
@@ -290,8 +294,9 @@ def assign_one(self, span: Span) -> Span:
end_char=match.end(0),
key=matcher["matcher"].regex[0][0],
attr=matcher["matcher"].regex[0][2],
alignment_mode=matcher["matcher"].regex[0][5],
ignore_excluded=matcher["matcher"].regex[0][3],
ignore_space_tokens=matcher["matcher"].regex[0][4],
),
)
for (span, match) in assigned_list
3 changes: 3 additions & 0 deletions edsnlp/pipelines/core/contextual_matcher/factory.py
@@ -9,6 +9,7 @@
DEFAULT_CONFIG = dict(
attr="NORM",
ignore_excluded=False,
ignore_space_tokens=False,
regex_flags=0,
alignment_mode="expand",
assign_as_span=False,
@@ -28,6 +29,7 @@ def create_component(
alignment_mode: str,
attr: str,
ignore_excluded: bool,
ignore_space_tokens: bool,
regex_flags: Union[re.RegexFlag, int],
include_assigned: bool,
):
@@ -68,6 +70,7 @@
alignment_mode,
attr=attr,
ignore_excluded=ignore_excluded,
ignore_space_tokens=ignore_space_tokens,
regex_flags=regex_flags,
include_assigned=include_assigned,
)
1 change: 1 addition & 0 deletions edsnlp/pipelines/core/endlines/endlines.py
@@ -49,6 +49,7 @@ def __init__(
new_line=r"\n+",
),
ignore_excluded=False,
ignore_space_tokens=False,
**kwargs,
)

47 changes: 40 additions & 7 deletions edsnlp/pipelines/core/matcher/factory.py
@@ -11,6 +11,7 @@
regex=None,
attr="TEXT",
ignore_excluded=False,
ignore_space_tokens=False,
term_matcher=GenericTermMatcher.exact,
term_matcher_config={},
)
@@ -27,14 +28,45 @@
)
def create_component(
nlp: Language,
name: str = "eds.matcher",
terms: Optional[Dict[str, Union[str, List[str]]]] = None,
attr: Union[str, Dict[str, str]] = "TEXT",
regex: Optional[Dict[str, Union[str, List[str]]]] = None,
ignore_excluded: bool = False,
ignore_space_tokens: bool = False,
term_matcher: GenericTermMatcher = GenericTermMatcher.exact,
term_matcher_config: Dict[str, Any] = {},
):
"""
Provides a generic matcher component.
Parameters
----------
nlp : Language
The spaCy object.
name: str
The name of the component.
terms : Optional[Patterns]
A dictionary of terms.
regex : Optional[Patterns]
A dictionary of regular expressions.
attr : str
The default attribute to use for matching.
Can be overridden using the `terms` and `regex` configurations.
ignore_excluded : bool
Whether to skip excluded tokens (requires an upstream
pipeline to mark excluded tokens).
ignore_space_tokens : bool
Whether to skip space tokens during matching. Note that matching on newlines
will not be possible if this option is enabled together with the `spaces`/`newline`
option of `eds.normalizer` (which is enabled by default).
term_matcher : GenericTermMatcher
The matcher to use for matching phrases: one of `exact` or `simstring`.
term_matcher_config : Dict[str, Any]
Parameters of the matcher class.
"""
assert not (terms is None and regex is None)

if terms is None:
@@ -48,6 +80,7 @@ def create_component(
attr=attr,
regex=regex,
ignore_excluded=ignore_excluded,
ignore_space_tokens=ignore_space_tokens,
term_matcher=term_matcher,
term_matcher_config=term_matcher_config,
)
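The newline caveat above can be made concrete with a small sketch (hypothetical snippet; it explicitly enables the normalizer's space tagging so the interaction is visible):

```python
import spacy

nlp = spacy.blank("eds")
# Mark spaces/newlines as space tokens
nlp.add_pipe("eds.normalizer", config={"spaces": True})
nlp.add_pipe(
    "eds.matcher",
    config={
        "regex": {"new_line": r"\n+"},   # tries to match on newlines...
        "ignore_space_tokens": True,     # ...but space tokens are skipped during matching
    },
)

doc = nlp("Première ligne\nDeuxième ligne")
print(doc.ents)  # expected to be empty with this configuration
```

This is also why the `eds.endlines` matcher (see its diff above) keeps `ignore_space_tokens=False` for its `new_line` pattern.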
9 changes: 6 additions & 3 deletions edsnlp/pipelines/core/matcher/matcher.py
@@ -35,9 +35,11 @@ class GenericMatcher(BaseComponent):
ignore_excluded : bool
Whether to skip excluded tokens (requires an upstream
pipeline to mark excluded tokens).
ignore_space_tokens : bool
Whether to skip space tokens during matching. Note that matching on newlines
will not be possible if this option is enabled together with the `spaces`/`newline`
option of `eds.normalizer` (which is enabled by default).
term_matcher : GenericTermMatcher
The matcher to use for matching phrases: one of `exact` or `simstring`.
@@ -86,6 +88,7 @@ def __init__(
self.regex_matcher = RegexMatcher(
attr=attr,
ignore_excluded=ignore_excluded,
ignore_space_tokens=ignore_space_tokens,
)

self.phrase_matcher.build_patterns(nlp=nlp, terms=terms)
59 changes: 45 additions & 14 deletions edsnlp/pipelines/core/terminology/factory.py
@@ -6,9 +6,10 @@

DEFAULT_CONFIG = dict(
terms=None,
regex=None,
attr="TEXT",
ignore_excluded=False,
ignore_space_tokens=False,
term_matcher="exact",
term_matcher_config={},
)
@@ -21,29 +22,59 @@
)
def create_component(
nlp: Language,
label: str,
terms: Optional[Dict[str, Union[str, List[str]]]],
name: str = "eds.terminology",
attr: Union[str, Dict[str, str]] = "TEXT",
regex: Optional[Dict[str, Union[str, List[str]]]] = None,
ignore_excluded: bool = False,
ignore_space_tokens: bool = False,
term_matcher: TerminologyTermMatcher = "exact",
term_matcher_config: Dict[str, Any] = {},
):
"""
Provides a terminology matching component.
The terminology matching component differs from the simple matcher component in that
the `regex` and `terms` keys are used as spaCy's `kb_id`. All matched entities
have the same label, defined in the top-level constructor (argument `label`).
Parameters
----------
nlp : Language
The spaCy object.
name: str
The name of the component.
label : str
Top-level label.
terms : Optional[Patterns]
A dictionary of terms.
regex : Optional[Patterns]
A dictionary of regular expressions.
attr : str
The default attribute to use for matching.
Can be overridden using the `terms` and `regex` configurations.
ignore_excluded : bool
Whether to skip excluded tokens (requires an upstream
pipeline to mark excluded tokens).
ignore_space_tokens: bool
Whether to skip space tokens during matching.
term_matcher : TerminologyTermMatcher
The matcher to use for matching phrases: one of `exact` or `simstring`.
term_matcher_config : Dict[str, Any]
Parameters of the matcher class.
"""
assert not (terms is None and regex is None)

return TerminologyMatcher(
nlp,
label=label,
terms=terms or dict(),
attr=attr,
regex=regex or dict(),
ignore_excluded=ignore_excluded,
ignore_space_tokens=ignore_space_tokens,
term_matcher=term_matcher,
term_matcher_config=term_matcher_config,
)
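A hypothetical call through this factory, using the approximate `simstring` matcher (the terms are illustrative; `term_matcher_config` is forwarded to the underlying `SimstringMatcher`):

```python
import spacy

nlp = spacy.blank("eds")
nlp.add_pipe("eds.normalizer")
nlp.add_pipe(
    "eds.terminology",
    config={
        "label": "diagnosis",
        "terms": {"diabetes": ["diabète", "diabète de type 2"]},
        "attr": "NORM",
        "term_matcher": "simstring",   # approximate matching instead of exact
        "term_matcher_config": {},     # keep the SimstringMatcher defaults
        "ignore_space_tokens": True,
    },
)
```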
6 changes: 6 additions & 0 deletions edsnlp/pipelines/core/terminology/terminology.py
@@ -42,6 +42,8 @@ class TerminologyMatcher(BaseComponent):
ignore_excluded : bool
Whether to skip excluded tokens (requires an upstream
pipeline to mark excluded tokens).
ignore_space_tokens : bool
Whether to skip space tokens during matching.
term_matcher: TerminologyTermMatcher
The matcher to use for matching phrases: one of `exact` or `simstring`.
@@ -57,6 +59,7 @@ def __init__(
regex: Optional[Patterns],
attr: str,
ignore_excluded: bool,
ignore_space_tokens: bool = False,
term_matcher: TerminologyTermMatcher = TerminologyTermMatcher.exact,
term_matcher_config=None,
):
@@ -72,13 +75,15 @@
self.nlp.vocab,
attr=attr,
ignore_excluded=ignore_excluded,
ignore_space_tokens=ignore_space_tokens,
**(term_matcher_config or {}),
)
elif term_matcher == TerminologyTermMatcher.simstring:
self.phrase_matcher = SimstringMatcher(
vocab=self.nlp.vocab,
attr=attr,
ignore_excluded=ignore_excluded,
ignore_space_tokens=ignore_space_tokens,
**(term_matcher_config or {}),
)
else:
@@ -90,6 +95,7 @@ def __init__(
self.regex_matcher = RegexMatcher(
attr=attr,
ignore_excluded=ignore_excluded,
ignore_space_tokens=ignore_space_tokens,
)

self.phrase_matcher.build_patterns(nlp=nlp, terms=terms, progress=True)
5 changes: 5 additions & 0 deletions edsnlp/pipelines/ner/cim10/factory.py
@@ -9,6 +9,7 @@
DEFAULT_CONFIG = dict(
attr="NORM",
ignore_excluded=False,
ignore_space_tokens=False,
term_matcher=TerminologyTermMatcher.exact,
term_matcher_config={},
)
@@ -22,6 +23,7 @@ def create_component(
name: str = "eds.cim10",
attr: Union[str, Dict[str, str]] = "NORM",
ignore_excluded: bool = False,
ignore_space_tokens: bool = False,
term_matcher: TerminologyTermMatcher = TerminologyTermMatcher.exact,
term_matcher_config: Dict[str, Any] = {},
):
@@ -39,6 +41,8 @@
Attribute to match on, eg `TEXT`, `NORM`, etc.
ignore_excluded: bool
Whether to skip excluded tokens during matching.
ignore_space_tokens: bool
Whether to skip space tokens during matching.
term_matcher: TerminologyTermMatcher
The term matcher to use, either `TerminologyTermMatcher.exact` or
`TerminologyTermMatcher.simstring`
@@ -57,6 +61,7 @@ def create_component(
terms=patterns.get_patterns(),
attr=attr,
ignore_excluded=ignore_excluded,
ignore_space_tokens=ignore_space_tokens,
term_matcher=term_matcher,
term_matcher_config=term_matcher_config,
)
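A usage sketch for this factory (hypothetical snippet; the clinical text is made up and the matched codes depend on the bundled CIM-10 patterns):

```python
import spacy

nlp = spacy.blank("eds")
nlp.add_pipe("eds.normalizer")
nlp.add_pipe("eds.cim10", config={"ignore_space_tokens": True})

doc = nlp("Prise en charge d'un diabète insulino-dépendant.")
# Each match carries its CIM-10 code in `ent.kb_id_`
print([(ent.text, ent.kb_id_) for ent in doc.ents])
```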
5 changes: 5 additions & 0 deletions edsnlp/pipelines/ner/covid/factory.py
@@ -9,6 +9,7 @@
DEFAULT_CONFIG = dict(
attr="LOWER",
ignore_excluded=False,
ignore_space_tokens=False,
)


@@ -22,6 +23,7 @@ def create_component(
name: str = "eds.covid",
attr: Union[str, Dict[str, str]] = "LOWER",
ignore_excluded: bool = False,
ignore_space_tokens: bool = False,
):
"""
Create a factory that returns a new GenericMatcher with patterns for covid.
@@ -36,6 +38,8 @@
Attribute to match on, eg `TEXT`, `NORM`, etc.
ignore_excluded: bool
Whether to skip excluded tokens during matching.
ignore_space_tokens: bool
Whether to skip space tokens during matching.
Returns
-------
@@ -48,4 +52,5 @@
regex=dict(covid=patterns.pattern),
attr=attr,
ignore_excluded=ignore_excluded,
ignore_space_tokens=ignore_space_tokens,
)
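And similarly for the covid component (a minimal sketch):

```python
import spacy

nlp = spacy.blank("eds")
nlp.add_pipe("eds.normalizer")
nlp.add_pipe("eds.covid", config={"ignore_space_tokens": True})

doc = nlp("Patient admis pour suspicion de  Covid-19.")
print([(ent.text, ent.label_) for ent in doc.ents])
```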