From bb86cc0c391d0a73687384e57793a46bbc77f214 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Perceval=20Wajsb=C3=BCrt?= <perceval.wajsburt-ext@aphp.fr>
Date: Fri, 17 Mar 2023 14:31:57 +0100
Subject: [PATCH] feat: add ignore_space_tokens to relevant components and
 update docs & tests

---
 changelog.md                                  |  1 +
 docs/pipelines/core/normalisation.md          |  4 +-
 docs/pipelines/core/terminology.md            | 10 +--
 .../contextual_matcher/contextual_matcher.py  | 19 ++++--
 .../core/contextual_matcher/factory.py        |  3 +
 edsnlp/pipelines/core/endlines/endlines.py    |  1 +
 edsnlp/pipelines/core/matcher/factory.py      | 47 +++++++++++--
 edsnlp/pipelines/core/matcher/matcher.py      |  9 ++-
 edsnlp/pipelines/core/terminology/factory.py  | 59 ++++++++++++----
 .../pipelines/core/terminology/terminology.py |  6 ++
 edsnlp/pipelines/ner/cim10/factory.py         |  5 ++
 edsnlp/pipelines/ner/covid/factory.py         |  5 ++
 edsnlp/pipelines/ner/drugs/factory.py         |  5 ++
 edsnlp/pipelines/ner/scores/base_score.py     | 68 ++++++++++---------
 .../pipelines/ner/scores/charlson/factory.py  |  3 +
 .../ner/scores/elstonellis/factory.py         | 48 +++++++++++--
 .../ner/scores/emergency/ccmu/factory.py      | 49 ++++++++++---
 .../ner/scores/emergency/gemsa/factory.py     | 49 ++++++++++---
 .../ner/scores/emergency/priority/factory.py  | 49 ++++++++++---
 edsnlp/pipelines/ner/scores/factory.py        | 52 +++++++++++---
 edsnlp/pipelines/ner/scores/sofa/factory.py   | 49 +++++++++++--
 edsnlp/pipelines/ner/scores/sofa/sofa.py      | 17 +++--
 edsnlp/pipelines/ner/umls/factory.py          |  5 ++
 tests/pipelines/core/test_terminology.py      |  5 +-
 tests/pipelines/ner/test_score.py             |  1 +
 25 files changed, 443 insertions(+), 126 deletions(-)

diff --git a/changelog.md b/changelog.md
index f4f57abc8..0e156fb04 100644
--- a/changelog.md
+++ b/changelog.md
@@ -5,6 +5,7 @@
 ### Added
 
 - Add `eds.spaces` (or `eds.normalizer` with `spaces=True`) to detect space tokens, and add `ignore_space_tokens` to `EDSPhraseMatcher` and `SimstringMatcher` to skip them
+- Add `ignore_space_tokens` option in most components
 
 ## v0.8.0 (2023-03-09)
 
diff --git a/docs/pipelines/core/normalisation.md b/docs/pipelines/core/normalisation.md
index 6d22ea766..bd7c0ebfd 100644
--- a/docs/pipelines/core/normalisation.md
+++ b/docs/pipelines/core/normalisation.md
@@ -62,8 +62,8 @@ Moreover, every span exposes a `normalized_variant` extension getter, which comp
 The pipeline can be configured using the following parameters :
 
 ::: edsnlp.pipelines.core.normalizer.factory.create_component
-   options:
-      only_parameters: true
+    options:
+       only_parameters: true
 
 ## Pipelines
 
diff --git a/docs/pipelines/core/terminology.md b/docs/pipelines/core/terminology.md
index 69e3c7950..9b14d1d30 100644
--- a/docs/pipelines/core/terminology.md
+++ b/docs/pipelines/core/terminology.md
@@ -55,13 +55,9 @@ This snippet is complete, and should run as is.
 
 The pipeline can be configured using the following parameters :
 
-| Parameter         | Explanation                                      | Default                 |
-| ----------------- | ------------------------------------------------ | ----------------------- |
-| `label`           | Top-level label.                                 | Required                |
-| `terms`           | Terms patterns. Expects a dictionary.            | `None` (use regex only) |
-| `regex`           | RegExp patterns. Expects a dictionary.           | `None` (use terms only) |
-| `attr`            | spaCy attribute to match on (eg `NORM`, `LOWER`) | `"TEXT"`                |
-| `ignore_excluded` | Whether to skip excluded tokens during matching  | `False`                 |
+::: edsnlp.pipelines.core.terminology.factory.create_component
+    options:
+       only_parameters: true
 
 Patterns, be they `terms` or `regex`, are defined as dictionaries where keys become the `kb_id_` of the extracted entities.
 Dictionary values are a either a single expression or a list of expressions that match the concept (see [example](#usage)).
diff --git a/edsnlp/pipelines/core/contextual_matcher/contextual_matcher.py b/edsnlp/pipelines/core/contextual_matcher/contextual_matcher.py
index 3ca7b551b..a23073e07 100644
--- a/edsnlp/pipelines/core/contextual_matcher/contextual_matcher.py
+++ b/edsnlp/pipelines/core/contextual_matcher/contextual_matcher.py
@@ -51,6 +51,8 @@ class ContextualMatcher(BaseComponent):
         Attribute to match on, eg `TEXT`, `NORM`, etc.
     ignore_excluded : bool
         Whether to skip excluded tokens during matching.
+    ignore_space_tokens: bool
+        Whether to skip space tokens during matching.
     alignment_mode : str
         Overwrite alignment mode.
     regex_flags : Union[re.RegexFlag, int]
@@ -65,12 +67,13 @@ def __init__(
         nlp: Language,
         name: str,
         patterns: Union[Dict[str, Any], List[Dict[str, Any]]],
-        assign_as_span: bool,
-        alignment_mode: str,
-        attr: str,
-        regex_flags: Union[re.RegexFlag, int],
-        ignore_excluded: bool,
-        include_assigned: bool,
+        assign_as_span: bool = False,
+        alignment_mode: str = "expand",
+        attr: str = "NORM",
+        regex_flags: Union[re.RegexFlag, int] = 0,
+        ignore_excluded: bool = False,
+        ignore_space_tokens: bool = False,
+        include_assigned: bool = False,
     ):
         self.name = name
         self.nlp = nlp
@@ -160,6 +163,7 @@ def __init__(
                     attr=p["regex_attr"] or self.attr,
                     flags=p["regex_flags"] or self.regex_flags,
                     ignore_excluded=ignore_excluded,
+                    ignore_space_tokens=ignore_space_tokens,
                     alignment_mode=alignment_mode,
                     span_from_group=True,
                 )
@@ -290,8 +294,9 @@ def assign_one(self, span: Span) -> Span:
                         end_char=match.end(0),
                         key=matcher["matcher"].regex[0][0],
                         attr=matcher["matcher"].regex[0][2],
-                        alignment_mode=matcher["matcher"].regex[0][4],
+                        alignment_mode=matcher["matcher"].regex[0][5],
                         ignore_excluded=matcher["matcher"].regex[0][3],
+                        ignore_space_tokens=matcher["matcher"].regex[0][4],
                     ),
                 )
                 for (span, match) in assigned_list
diff --git a/edsnlp/pipelines/core/contextual_matcher/factory.py b/edsnlp/pipelines/core/contextual_matcher/factory.py
index a41107d98..9f0f1fea2 100644
--- a/edsnlp/pipelines/core/contextual_matcher/factory.py
+++ b/edsnlp/pipelines/core/contextual_matcher/factory.py
@@ -9,6 +9,7 @@
 DEFAULT_CONFIG = dict(
     attr="NORM",
     ignore_excluded=False,
+    ignore_space_tokens=False,
     regex_flags=0,
     alignment_mode="expand",
     assign_as_span=False,
@@ -28,6 +29,7 @@ def create_component(
     alignment_mode: str,
     attr: str,
     ignore_excluded: bool,
+    ignore_space_tokens: bool,
     regex_flags: Union[re.RegexFlag, int],
     include_assigned: bool,
 ):
@@ -68,6 +70,7 @@ def create_component(
         alignment_mode,
         attr=attr,
         ignore_excluded=ignore_excluded,
+        ignore_space_tokens=ignore_space_tokens,
         regex_flags=regex_flags,
         include_assigned=include_assigned,
     )
diff --git a/edsnlp/pipelines/core/endlines/endlines.py b/edsnlp/pipelines/core/endlines/endlines.py
index c51b4205a..9cb1ff1bf 100644
--- a/edsnlp/pipelines/core/endlines/endlines.py
+++ b/edsnlp/pipelines/core/endlines/endlines.py
@@ -49,6 +49,7 @@ def __init__(
                 new_line=r"\n+",
             ),
             ignore_excluded=False,
+            ignore_space_tokens=False,
             **kwargs,
         )
 
diff --git a/edsnlp/pipelines/core/matcher/factory.py b/edsnlp/pipelines/core/matcher/factory.py
index 88593144c..b34147219 100644
--- a/edsnlp/pipelines/core/matcher/factory.py
+++ b/edsnlp/pipelines/core/matcher/factory.py
@@ -11,6 +11,7 @@
     regex=None,
     attr="TEXT",
     ignore_excluded=False,
+    ignore_space_tokens=False,
     term_matcher=GenericTermMatcher.exact,
     term_matcher_config={},
 )
@@ -27,14 +28,45 @@
 )
 def create_component(
     nlp: Language,
-    name: str,
-    terms: Optional[Dict[str, Union[str, List[str]]]],
-    attr: Union[str, Dict[str, str]],
-    regex: Optional[Dict[str, Union[str, List[str]]]],
-    ignore_excluded: bool,
-    term_matcher: GenericTermMatcher,
-    term_matcher_config: Dict[str, Any],
+    name: str = "eds.matcher",
+    terms: Optional[Dict[str, Union[str, List[str]]]] = None,
+    attr: Union[str, Dict[str, str]] = "TEXT",
+    regex: Optional[Dict[str, Union[str, List[str]]]] = None,
+    ignore_excluded: bool = False,
+    ignore_space_tokens: bool = False,
+    term_matcher: GenericTermMatcher = GenericTermMatcher.exact,
+    term_matcher_config: Dict[str, Any] = {},
 ):
+    """
+    Provides a generic matcher component.
+
+    Parameters
+    ----------
+    nlp : Language
+        The spaCy object.
+    name: str
+        The name of the component.
+    terms : Optional[Patterns]
+        A dictionary of terms.
+    regex : Optional[Patterns]
+        A dictionary of regular expressions.
+    attr : str
+        The default attribute to use for matching.
+        Can be overridden using the `terms` and `regex` configurations.
+    ignore_excluded : bool
+        Whether to skip excluded tokens (requires an upstream
+        pipeline to mark excluded tokens).
+    ignore_space_tokens: bool
+        Whether to skip space tokens during matching.
+
+        You won't be able to match on newlines if this is enabled and
+        the "spaces"/"newline" option of `eds.normalizer` is enabled (by default).
+    term_matcher: GenericTermMatcher
+        The matcher to use for matching phrases.
+        One of (exact, simstring)
+    term_matcher_config: Dict[str,Any]
+        Parameters of the matcher class
+    """
     assert not (terms is None and regex is None)
 
     if terms is None:
@@ -48,6 +80,7 @@ def create_component(
         attr=attr,
         regex=regex,
         ignore_excluded=ignore_excluded,
+        ignore_space_tokens=ignore_space_tokens,
         term_matcher=term_matcher,
         term_matcher_config=term_matcher_config,
     )
diff --git a/edsnlp/pipelines/core/matcher/matcher.py b/edsnlp/pipelines/core/matcher/matcher.py
index b3fb15d55..8fc552bbb 100644
--- a/edsnlp/pipelines/core/matcher/matcher.py
+++ b/edsnlp/pipelines/core/matcher/matcher.py
@@ -35,9 +35,11 @@ class GenericMatcher(BaseComponent):
     ignore_excluded : bool
         Whether to skip excluded tokens (requires an upstream
         pipeline to mark excluded tokens).
-    ignore_excluded : bool
-        Whether to skip space tokens (requires an upstream
-        pipeline to mark space tokens).
+    ignore_space_tokens: bool
+        Whether to skip space tokens during matching.
+
+        You won't be able to match on newlines if this is enabled and
+        the "spaces"/"newline" option of `eds.normalizer` is enabled (by default).
     term_matcher: GenericTermMatcher
         The matcher to use for matching phrases ?
         One of (exact, simstring)
@@ -86,6 +88,7 @@ def __init__(
         self.regex_matcher = RegexMatcher(
             attr=attr,
             ignore_excluded=ignore_excluded,
+            ignore_space_tokens=ignore_space_tokens,
         )
 
         self.phrase_matcher.build_patterns(nlp=nlp, terms=terms)
diff --git a/edsnlp/pipelines/core/terminology/factory.py b/edsnlp/pipelines/core/terminology/factory.py
index 3da4adbc0..28362b5a0 100644
--- a/edsnlp/pipelines/core/terminology/factory.py
+++ b/edsnlp/pipelines/core/terminology/factory.py
@@ -6,9 +6,10 @@
 
 DEFAULT_CONFIG = dict(
     terms=None,
-    regex=None,
     attr="TEXT",
+    regex=None,
     ignore_excluded=False,
+    ignore_space_tokens=False,
     term_matcher="exact",
     term_matcher_config={},
 )
@@ -21,29 +22,59 @@
 )
 def create_component(
     nlp: Language,
-    name: str,
     label: str,
     terms: Optional[Dict[str, Union[str, List[str]]]],
-    attr: Union[str, Dict[str, str]],
-    regex: Optional[Dict[str, Union[str, List[str]]]],
-    ignore_excluded: bool,
-    term_matcher: TerminologyTermMatcher,
-    term_matcher_config: Dict[str, Any],
+    name: str = "eds.terminology",
+    attr: Union[str, Dict[str, str]] = "TEXT",
+    regex: Optional[Dict[str, Union[str, List[str]]]] = None,
+    ignore_excluded: bool = False,
+    ignore_space_tokens: bool = False,
+    term_matcher: TerminologyTermMatcher = "exact",
+    term_matcher_config: Dict[str, Any] = {},
 ):
-    assert not (terms is None and regex is None)
+    """
+    Provides a terminology matching component.
 
-    if terms is None:
-        terms = dict()
-    if regex is None:
-        regex = dict()
+    The terminology matching component differs from the simple matcher component in that
+    the `regex` and `terms` keys are used as spaCy's `kb_id`. All matched entities
+    have the same label, defined in the top-level constructor (argument `label`).
+
+    Parameters
+    ----------
+    nlp : Language
+        The spaCy object.
+    name: str
+        The name of the component.
+    label : str
+        Top-level label
+    terms : Optional[Patterns]
+        A dictionary of terms.
+    regex : Optional[Patterns]
+        A dictionary of regular expressions.
+    attr : str
+        The default attribute to use for matching.
+        Can be overridden using the `terms` and `regex` configurations.
+    ignore_excluded : bool
+        Whether to skip excluded tokens (requires an upstream
+        pipeline to mark excluded tokens).
+    ignore_space_tokens: bool
+        Whether to skip space tokens during matching.
+    term_matcher: TerminologyTermMatcher
+        The matcher to use for matching phrases.
+        One of (exact, simstring)
+    term_matcher_config: Dict[str,Any]
+        Parameters of the matcher class
+    """
+    assert not (terms is None and regex is None)
 
     return TerminologyMatcher(
         nlp,
         label=label,
-        terms=terms,
+        terms=terms or dict(),
         attr=attr,
-        regex=regex,
+        regex=regex or dict(),
         ignore_excluded=ignore_excluded,
+        ignore_space_tokens=ignore_space_tokens,
         term_matcher=term_matcher,
         term_matcher_config=term_matcher_config,
     )
diff --git a/edsnlp/pipelines/core/terminology/terminology.py b/edsnlp/pipelines/core/terminology/terminology.py
index e79f09171..ac1735338 100644
--- a/edsnlp/pipelines/core/terminology/terminology.py
+++ b/edsnlp/pipelines/core/terminology/terminology.py
@@ -42,6 +42,8 @@ class TerminologyMatcher(BaseComponent):
     ignore_excluded : bool
         Whether to skip excluded tokens (requires an upstream
         pipeline to mark excluded tokens).
+    ignore_space_tokens: bool
+        Whether to skip space tokens during matching.
     term_matcher: TerminologyTermMatcher
         The matcher to use for matching phrases ?
         One of (exact, simstring)
@@ -57,6 +59,7 @@ def __init__(
         regex: Optional[Patterns],
         attr: str,
         ignore_excluded: bool,
+        ignore_space_tokens: bool = False,
         term_matcher: TerminologyTermMatcher = TerminologyTermMatcher.exact,
         term_matcher_config=None,
     ):
@@ -72,6 +75,7 @@ def __init__(
                 self.nlp.vocab,
                 attr=attr,
                 ignore_excluded=ignore_excluded,
+                ignore_space_tokens=ignore_space_tokens,
                 **(term_matcher_config or {}),
             )
         elif term_matcher == TerminologyTermMatcher.simstring:
@@ -79,6 +83,7 @@ def __init__(
                 vocab=self.nlp.vocab,
                 attr=attr,
                 ignore_excluded=ignore_excluded,
+                ignore_space_tokens=ignore_space_tokens,
                 **(term_matcher_config or {}),
             )
         else:
@@ -90,6 +95,7 @@ def __init__(
         self.regex_matcher = RegexMatcher(
             attr=attr,
             ignore_excluded=ignore_excluded,
+            ignore_space_tokens=ignore_space_tokens,
         )
 
         self.phrase_matcher.build_patterns(nlp=nlp, terms=terms, progress=True)
diff --git a/edsnlp/pipelines/ner/cim10/factory.py b/edsnlp/pipelines/ner/cim10/factory.py
index 38011e92d..985ddef0c 100644
--- a/edsnlp/pipelines/ner/cim10/factory.py
+++ b/edsnlp/pipelines/ner/cim10/factory.py
@@ -9,6 +9,7 @@
 DEFAULT_CONFIG = dict(
     attr="NORM",
     ignore_excluded=False,
+    ignore_space_tokens=False,
     term_matcher=TerminologyTermMatcher.exact,
     term_matcher_config={},
 )
@@ -22,6 +23,7 @@ def create_component(
     name: str = "eds.cim10",
     attr: Union[str, Dict[str, str]] = "NORM",
     ignore_excluded: bool = False,
+    ignore_space_tokens: bool = False,
     term_matcher: TerminologyTermMatcher = TerminologyTermMatcher.exact,
     term_matcher_config: Dict[str, Any] = {},
 ):
@@ -39,6 +41,8 @@ def create_component(
         Attribute to match on, eg `TEXT`, `NORM`, etc.
     ignore_excluded: bool
         Whether to skip excluded tokens during matching.
+    ignore_space_tokens: bool
+        Whether to skip space tokens during matching.
     term_matcher: TerminologyTermMatcher
         The term matcher to use, either `TerminologyTermMatcher.exact` or
         `TerminologyTermMatcher.simstring`
@@ -57,6 +61,7 @@ def create_component(
         terms=patterns.get_patterns(),
         attr=attr,
         ignore_excluded=ignore_excluded,
+        ignore_space_tokens=ignore_space_tokens,
         term_matcher=term_matcher,
         term_matcher_config=term_matcher_config,
     )
diff --git a/edsnlp/pipelines/ner/covid/factory.py b/edsnlp/pipelines/ner/covid/factory.py
index f4f8a4650..b59861232 100644
--- a/edsnlp/pipelines/ner/covid/factory.py
+++ b/edsnlp/pipelines/ner/covid/factory.py
@@ -9,6 +9,7 @@
 DEFAULT_CONFIG = dict(
     attr="LOWER",
     ignore_excluded=False,
+    ignore_space_tokens=False,
 )
 
 
@@ -22,6 +23,7 @@ def create_component(
     name: str = "eds.covid",
     attr: Union[str, Dict[str, str]] = "LOWER",
     ignore_excluded: bool = False,
+    ignore_space_tokens: bool = False,
 ):
     """
     Create a factory that returns new GenericMatcher with patterns for covid
@@ -36,6 +38,8 @@ def create_component(
         Attribute to match on, eg `TEXT`, `NORM`, etc.
     ignore_excluded: bool
         Whether to skip excluded tokens during matching.
+    ignore_space_tokens: bool
+        Whether to skip space tokens during matching.
 
     Returns
     -------
@@ -48,4 +52,5 @@ def create_component(
         regex=dict(covid=patterns.pattern),
         attr=attr,
         ignore_excluded=ignore_excluded,
+        ignore_space_tokens=ignore_space_tokens,
     )
diff --git a/edsnlp/pipelines/ner/drugs/factory.py b/edsnlp/pipelines/ner/drugs/factory.py
index 77e80a53f..f55656400 100644
--- a/edsnlp/pipelines/ner/drugs/factory.py
+++ b/edsnlp/pipelines/ner/drugs/factory.py
@@ -9,6 +9,7 @@
 DEFAULT_CONFIG = dict(
     attr="NORM",
     ignore_excluded=False,
+    ignore_space_tokens=False,
     term_matcher=TerminologyTermMatcher.exact,
     term_matcher_config={},
 )
@@ -24,6 +25,7 @@ def create_component(
     name: str = "eds.drugs",
     attr: str = "NORM",
     ignore_excluded: bool = False,
+    ignore_space_tokens: bool = False,
     term_matcher: TerminologyTermMatcher = TerminologyTermMatcher.exact,
     term_matcher_config: Dict[str, Any] = {},
 ):
@@ -42,6 +44,8 @@ def create_component(
         Attribute to match on, eg `TEXT`, `NORM`, etc.
     ignore_excluded: bool
         Whether to skip excluded tokens during matching.
+    ignore_space_tokens: bool
+        Whether to skip space tokens during matching.
     term_matcher: TerminologyTermMatcher
         The term matcher to use, either `TerminologyTermMatcher.exact` or
         `TerminologyTermMatcher.simstring`
@@ -59,6 +63,7 @@ def create_component(
         regex=dict(),
         attr=attr,
         ignore_excluded=ignore_excluded,
+        ignore_space_tokens=ignore_space_tokens,
         term_matcher=term_matcher,
         term_matcher_config=term_matcher_config,
     )
diff --git a/edsnlp/pipelines/ner/scores/base_score.py b/edsnlp/pipelines/ner/scores/base_score.py
index d1d724805..451dfc6c9 100644
--- a/edsnlp/pipelines/ner/scores/base_score.py
+++ b/edsnlp/pipelines/ner/scores/base_score.py
@@ -10,30 +10,7 @@
 
 
 class Score(ContextualMatcher):
-    """
-    Matcher component to extract a numeric score
-
-    Parameters
-    ----------
-    nlp : Language
-        The spaCy object.
-    score_name : str
-        The name of the extracted score
-    regex : List[str]
-        A list of regexes to identify the score
-    attr : str
-        Wether to match on the text ('TEXT') or on the normalized text ('NORM')
-    value_extract : str
-        Regex with capturing group to get the score value
-    score_normalization : Callable[[Union[str,None]], Any]
-        Function that takes the "raw" value extracted from the `value_extract` regex,
-        and should return
-        - None if no score could be extracted
-        - The desired score value else
-    window : int
-        Number of token to include after the score's mention to find the
-        score's value
-    """
+    """Matcher component to extract a numeric score"""
 
     def __init__(
         self,
@@ -45,16 +22,44 @@ def __init__(
         score_normalization: Union[str, Callable[[Union[str, None]], Any]],
         window: int,
         ignore_excluded: bool,
+        ignore_space_tokens: bool,
         flags: Union[re.RegexFlag, int],
     ):
+        """
+        Parameters
+        ----------
+        nlp : Language
+            The spaCy object.
+        score_name : str
+            The name of the extracted score
+        regex : List[str]
+            A list of regexes to identify the score
+        attr : str
+            Whether to match on the text ('TEXT') or on the normalized text ('NORM')
+        value_extract : str
+            Regex with capturing group to get the score value
+        score_normalization : Callable[[Union[str,None]], Any]
+            Function that takes the "raw" value extracted from the `value_extract`
+            regex and should return:
+
+            - None if no score could be extracted
+            - The desired score value else
+        window : int
+            Number of tokens to include after the score's mention to find the
+            score's value
+        ignore_excluded : bool
+            Whether to ignore excluded tokens when matching
+        ignore_space_tokens : bool
+            Whether to ignore space tokens when matching
+        flags : Union[re.RegexFlag, int]
+            Regex flags to use when matching
+        """
         if isinstance(value_extract, str):
-            value_extract = [
-                dict(
-                    name="value",
-                    regex=value_extract,
-                    window=window,
-                )
-            ]
+            value_extract = dict(
+                name="value",
+                regex=value_extract,
+                window=window,
+            )
 
         if isinstance(value_extract, dict):
             value_extract = [value_extract]
@@ -83,6 +88,7 @@ def __init__(
             assign_as_span=False,
             alignment_mode="expand",
             ignore_excluded=ignore_excluded,
+            ignore_space_tokens=ignore_space_tokens,
             attr=attr,
             regex_flags=flags,
             include_assigned=False,
diff --git a/edsnlp/pipelines/ner/scores/charlson/factory.py b/edsnlp/pipelines/ner/scores/charlson/factory.py
index df37e998f..5ab820eec 100644
--- a/edsnlp/pipelines/ner/scores/charlson/factory.py
+++ b/edsnlp/pipelines/ner/scores/charlson/factory.py
@@ -14,6 +14,7 @@
     attr="NORM",
     window=7,
     ignore_excluded=False,
+    ignore_space_tokens=False,
     flags=0,
 )
 
@@ -38,6 +39,7 @@ def create_component(
     attr: str,
     window: int,
     ignore_excluded: bool,
+    ignore_space_tokens: bool,
     flags: Union[re.RegexFlag, int],
 ):
     return Score(
@@ -49,5 +51,6 @@ def create_component(
         attr=attr,
         window=window,
         ignore_excluded=ignore_excluded,
+        ignore_space_tokens=ignore_space_tokens,
         flags=flags,
     )
diff --git a/edsnlp/pipelines/ner/scores/elstonellis/factory.py b/edsnlp/pipelines/ner/scores/elstonellis/factory.py
index 289480894..448afc099 100644
--- a/edsnlp/pipelines/ner/scores/elstonellis/factory.py
+++ b/edsnlp/pipelines/ner/scores/elstonellis/factory.py
@@ -13,6 +13,7 @@
     attr="TEXT",
     window=20,
     ignore_excluded=False,
+    ignore_space_tokens=False,
     flags=0,
 )
 
@@ -25,14 +26,46 @@
 def create_component(
     nlp: Language,
     name: str,
-    regex: List[str],
-    value_extract: str,
-    score_normalization: Union[str, Callable[[Union[str, None]], Any]],
-    attr: str,
-    window: int,
-    ignore_excluded: bool,
-    flags: Union[re.RegexFlag, int],
+    regex: List[str] = patterns.regex,
+    value_extract: str = patterns.value_extract,
+    score_normalization: Union[
+        str, Callable[[Union[str, None]], Any]
+    ] = patterns.score_normalization_str,
+    attr: str = "TEXT",
+    window: int = 20,
+    ignore_excluded: bool = False,
+    ignore_space_tokens: bool = False,
+    flags: Union[re.RegexFlag, int] = 0,
 ):
+    """
+    Matcher for the Elston-Ellis score.
+
+    Parameters
+    ----------
+    nlp: Language
+        The spaCy Language object
+    name: str
+        The name of the component
+    regex: List[str]
+        The regex patterns to match
+    value_extract: str
+        The regex pattern to extract the value from the matched text
+    score_normalization: Union[str, Callable[[Union[str, None]], Any]]
+        The normalization function to apply to the extracted value
+    attr: str
+        The token attribute to match on (e.g. "TEXT" or "NORM")
+    window: int
+        The window size to search for the regex pattern
+    ignore_excluded: bool
+        Whether to ignore excluded tokens
+    ignore_space_tokens: bool
+        Whether to ignore space tokens
+    flags: Union[re.RegexFlag, int]
+        The regex flags to use
+    Returns
+    -------
+    Score
+    """
     return Score(
         nlp,
         score_name=name,
@@ -42,5 +75,6 @@ def create_component(
         attr=attr,
         window=window,
         ignore_excluded=ignore_excluded,
+        ignore_space_tokens=ignore_space_tokens,
         flags=flags,
     )
diff --git a/edsnlp/pipelines/ner/scores/emergency/ccmu/factory.py b/edsnlp/pipelines/ner/scores/emergency/ccmu/factory.py
index 22917c173..e1f181562 100644
--- a/edsnlp/pipelines/ner/scores/emergency/ccmu/factory.py
+++ b/edsnlp/pipelines/ner/scores/emergency/ccmu/factory.py
@@ -31,15 +31,47 @@
 )
 def create_component(
     nlp: Language,
-    name: str,
-    regex: List[str],
-    value_extract: str,
-    score_normalization: Union[str, Callable[[Union[str, None]], Any]],
-    attr: str,
-    window: int,
-    ignore_excluded: bool,
-    flags: Union[re.RegexFlag, int],
+    name: str = "eds.emergency.ccmu",
+    regex: List[str] = patterns.regex,
+    value_extract: str = patterns.value_extract,
+    score_normalization: Union[
+        str, Callable[[Union[str, None]], Any]
+    ] = patterns.score_normalization_str,
+    attr: str = "NORM",
+    window: int = 20,
+    ignore_excluded: bool = False,
+    ignore_space_tokens: bool = False,
+    flags: Union[re.RegexFlag, int] = 0,
 ):
+    """
+    Matcher for the Emergency CCMU score.
+
+    Parameters
+    ----------
+    nlp: Language
+        The spaCy Language object
+    name: str
+        The name of the component
+    regex: List[str]
+        The regex patterns to match
+    value_extract: str
+        The regex pattern to extract the value from the matched text
+    score_normalization: Union[str, Callable[[Union[str, None]], Any]]
+        The normalization function to apply to the extracted value
+    attr: str
+        The token attribute to match on (e.g. "TEXT" or "NORM")
+    window: int
+        The window size to search for the regex pattern
+    ignore_excluded: bool
+        Whether to ignore excluded tokens
+    ignore_space_tokens: bool
+        Whether to ignore space tokens
+    flags: Union[re.RegexFlag, int]
+        The regex flags to use
+    Returns
+    -------
+    Score
+    """
     return Score(
         nlp,
         score_name=name,
@@ -49,5 +81,6 @@ def create_component(
         attr=attr,
         window=window,
         ignore_excluded=ignore_excluded,
+        ignore_space_tokens=ignore_space_tokens,
         flags=flags,
     )
diff --git a/edsnlp/pipelines/ner/scores/emergency/gemsa/factory.py b/edsnlp/pipelines/ner/scores/emergency/gemsa/factory.py
index a6bf8ba38..f8e263100 100644
--- a/edsnlp/pipelines/ner/scores/emergency/gemsa/factory.py
+++ b/edsnlp/pipelines/ner/scores/emergency/gemsa/factory.py
@@ -31,15 +31,47 @@
 )
 def create_component(
     nlp: Language,
-    name: str,
-    regex: List[str],
-    value_extract: str,
-    score_normalization: Union[str, Callable[[Union[str, None]], Any]],
-    attr: str,
-    window: int,
-    ignore_excluded: bool,
-    flags: Union[re.RegexFlag, int],
+    name: str = "eds.emergency.gemsa",
+    regex: List[str] = patterns.regex,
+    value_extract: str = patterns.value_extract,
+    score_normalization: Union[
+        str, Callable[[Union[str, None]], Any]
+    ] = patterns.score_normalization_str,
+    attr: str = "NORM",
+    window: int = 20,
+    ignore_excluded: bool = False,
+    ignore_space_tokens: bool = False,
+    flags: Union[re.RegexFlag, int] = 0,
 ):
+    """
+    Matcher for the Emergency GEMSA score.
+
+    Parameters
+    ----------
+    nlp: Language
+        The spaCy Language object
+    name: str
+        The name of the component
+    regex: List[str]
+        The regex patterns to match
+    value_extract: str
+        The regex pattern to extract the value from the matched text
+    score_normalization: Union[str, Callable[[Union[str, None]], Any]]
+        The normalization function to apply to the extracted value
+    attr: str
+        The token attribute to match on (e.g. "TEXT" or "NORM")
+    window: int
+        The window size to search for the regex pattern
+    ignore_excluded: bool
+        Whether to ignore excluded tokens
+    ignore_space_tokens: bool
+        Whether to ignore space tokens
+    flags: Union[re.RegexFlag, int]
+        The regex flags to use
+    Returns
+    -------
+    Score
+    """
     return Score(
         nlp,
         score_name=name,
@@ -49,5 +81,6 @@ def create_component(
         attr=attr,
         window=window,
         ignore_excluded=ignore_excluded,
+        ignore_space_tokens=ignore_space_tokens,
         flags=flags,
     )
diff --git a/edsnlp/pipelines/ner/scores/emergency/priority/factory.py b/edsnlp/pipelines/ner/scores/emergency/priority/factory.py
index d49252c71..500261e63 100644
--- a/edsnlp/pipelines/ner/scores/emergency/priority/factory.py
+++ b/edsnlp/pipelines/ner/scores/emergency/priority/factory.py
@@ -31,15 +31,47 @@
 )
 def create_component(
     nlp: Language,
-    name: str,
-    regex: List[str],
-    value_extract: str,
-    score_normalization: Union[str, Callable[[Union[str, None]], Any]],
-    attr: str,
-    window: int,
-    ignore_excluded: bool,
-    flags: Union[re.RegexFlag, int],
+    name: str = "emergency.priority",
+    regex: List[str] = patterns.regex,
+    value_extract: str = patterns.value_extract,
+    score_normalization: Union[
+        str, Callable[[Union[str, None]], Any]
+    ] = patterns.score_normalization_str,
+    attr: str = "NORM",
+    window: int = 7,
+    ignore_excluded: bool = False,
+    ignore_space_tokens: bool = False,
+    flags: Union[re.RegexFlag, int] = 0,
 ):
+    """
+    Matcher for the Emergency Priority score.
+
+    Parameters
+    ----------
+    nlp: Language
+        The spaCy Language object
+    name: str
+        The name of the component
+    regex: List[str]
+        The regex patterns to match
+    value_extract: str
+        The regex pattern to extract the value from the matched text
+    score_normalization: Union[str, Callable[[Union[str, None]], Any]]
+        The normalization function to apply to the extracted value
+    attr: str
+        The token attribute to match on (e.g. "TEXT" or "NORM")
+    window: int
+        The window size to search for the regex pattern
+    ignore_excluded: bool
+        Whether to ignore excluded tokens
+    ignore_space_tokens: bool
+        Whether to ignore space tokens
+    flags: Union[re.RegexFlag, int]
+        The regex flags to use
+    Returns
+    -------
+    Score
+    """
     return Score(
         nlp,
         score_name=name,
@@ -49,5 +81,6 @@ def create_component(
         attr=attr,
         window=window,
         ignore_excluded=ignore_excluded,
+        ignore_space_tokens=ignore_space_tokens,
         flags=flags,
     )
diff --git a/edsnlp/pipelines/ner/scores/factory.py b/edsnlp/pipelines/ner/scores/factory.py
index fe637fcd5..23b7fc484 100644
--- a/edsnlp/pipelines/ner/scores/factory.py
+++ b/edsnlp/pipelines/ner/scores/factory.py
@@ -10,6 +10,7 @@
     attr="NORM",
     window=7,
     ignore_excluded=False,
+    ignore_space_tokens=False,
     flags=0,
 )
 
@@ -27,16 +28,48 @@
 )
 def create_component(
     nlp: Language,
-    name: str,
-    score_name: str,
-    regex: List[str],
-    value_extract: str,
-    score_normalization: Union[str, Callable[[Union[str, None]], Any]],
-    attr: str,
-    window: int,
-    flags: Union[re.RegexFlag, int],
-    ignore_excluded: bool,
+    name: str = "eds.score",
+    score_name: str = None,
+    regex: List[str] = None,
+    value_extract: str = None,
+    score_normalization: Union[str, Callable[[Union[str, None]], Any]] = None,
+    attr: str = "NORM",
+    window: int = 7,
+    flags: Union[re.RegexFlag, int] = 0,
+    ignore_excluded: bool = False,
+    ignore_space_tokens: bool = False,
 ):
+    """
+    Parameters
+    ----------
+    nlp : Language
+        The spaCy object.
+    name : str
+        The name of the component.
+    score_name : str
+        The name of the extracted score
+    regex : List[str]
+        A list of regexes to identify the score
+    attr : str
+        Whether to match on the text ('TEXT') or on the normalized text ('NORM')
+    value_extract : str
+        Regex with capturing group to get the score value
+    score_normalization : Callable[[Union[str,None]], Any]
+        Function that takes the "raw" value extracted from the `value_extract` regex,
+        and should return:
+
+        - None if no score could be extracted
+        - The desired score value else
+    window : int
+        Number of tokens to include after the score's mention to find the
+        score's value
+    ignore_excluded : bool
+        Whether to ignore excluded spans when matching
+    ignore_space_tokens : bool
+        Whether to ignore space tokens when matching
+    flags : Union[re.RegexFlag, int]
+        Regex flags to use when matching
+    """
     return Score(
         nlp,
         score_name=score_name,
@@ -47,4 +80,5 @@ def create_component(
         flags=flags,
         window=window,
         ignore_excluded=ignore_excluded,
+        ignore_space_tokens=ignore_space_tokens,
     )
diff --git a/edsnlp/pipelines/ner/scores/sofa/factory.py b/edsnlp/pipelines/ner/scores/sofa/factory.py
index e565262ac..8c9b1fb97 100644
--- a/edsnlp/pipelines/ner/scores/sofa/factory.py
+++ b/edsnlp/pipelines/ner/scores/sofa/factory.py
@@ -13,6 +13,7 @@
     attr="NORM",
     window=10,
     ignore_excluded=False,
+    ignore_space_tokens=False,
     flags=0,
 )
 
@@ -31,14 +32,47 @@
 def create_component(
     nlp: Language,
     name: str,
-    regex: List[str],
-    value_extract: List[Dict[str, str]],
-    score_normalization: Union[str, Callable[[Union[str, None]], Any]],
-    attr: str,
-    window: int,
-    ignore_excluded: bool,
-    flags: Union[re.RegexFlag, int],
+    regex: List[str] = patterns.regex,
+    value_extract: List[Dict[str, str]] = patterns.value_extract,
+    score_normalization: Union[
+        str, Callable[[Union[str, None]], Any]
+    ] = patterns.score_normalization_str,
+    attr: str = "NORM",
+    window: int = 10,
+    ignore_excluded: bool = False,
+    ignore_space_tokens: bool = False,
+    flags: Union[re.RegexFlag, int] = 0,
 ):
+    """
+    Matcher component to extract the SOFA score
+
+    Parameters
+    ----------
+    nlp : Language
+        The spaCy object.
+    name : str
+        The name of the extracted score
+    regex : List[str]
+        A list of regexes to identify the SOFA score
+    attr : str
+        Whether to match on the text ('TEXT') or on the normalized text ('NORM')
+    value_extract : List[Dict[str, str]]
+        Regex to extract the score value
+    score_normalization : Callable[[Union[str,None]], Any]
+        Function that takes the "raw" value extracted from the `value_extract` regex,
+        and should return:
+        - None if no score could be extracted
+        - The desired score value else
+    window : int
+        Number of tokens to include after the score's mention to find the
+        score's value
+    ignore_excluded : bool
+        Whether to ignore excluded spans
+    ignore_space_tokens : bool
+        Whether to ignore space tokens
+    flags : Union[re.RegexFlag, int]
+        Flags to pass to the regex
+    """
     return Sofa(
         nlp,
         score_name=name,
@@ -48,5 +82,6 @@ def create_component(
         attr=attr,
         window=window,
         ignore_excluded=ignore_excluded,
+        ignore_space_tokens=ignore_space_tokens,
         flags=flags,
     )
diff --git a/edsnlp/pipelines/ner/scores/sofa/sofa.py b/edsnlp/pipelines/ner/scores/sofa/sofa.py
index d3d5e65f3..0ea49962e 100644
--- a/edsnlp/pipelines/ner/scores/sofa/sofa.py
+++ b/edsnlp/pipelines/ner/scores/sofa/sofa.py
@@ -15,16 +15,11 @@ class Sofa(Score):
     ----------
     nlp : Language
         The spaCy object.
-    score_name : str
-        The name of the extracted score
     regex : List[str]
         A list of regexes to identify the SOFA score
     attr : str
-        Wether to match on the text ('TEXT') or on the normalized text ('CUSTOM_NORM')
-    method_regex : str
-        Regex with capturing group to get the score extraction method
-        (e.g. "à l'admission", "à 24H", "Maximum")
-    value_regex : str
+        Whether to match on the text ('TEXT') or on the normalized text ('CUSTOM_NORM')
+    value_extract : Dict[str, str]
         Regex to extract the score value
     score_normalization : Callable[[Union[str,None]], Any]
         Function that takes the "raw" value extracted from the `value_extract` regex,
@@ -34,6 +29,12 @@ class Sofa(Score):
     window : int
         Number of token to include after the score's mention to find the
         score's value
+    ignore_excluded : bool
+        Whether to ignore excluded spans
+    ignore_space_tokens : bool
+        Whether to ignore space tokens
+    flags : Union[re.RegexFlag, int]
+        Flags to pass to the regex
     """
 
     def __init__(
@@ -47,6 +48,7 @@ def __init__(
         window: int,
         flags: Union[re.RegexFlag, int],
         ignore_excluded: bool,
+        ignore_space_tokens: bool,
     ):
 
         super().__init__(
@@ -59,6 +61,7 @@ def __init__(
             window=window,
             flags=flags,
             ignore_excluded=ignore_excluded,
+            ignore_space_tokens=ignore_space_tokens,
         )
 
         self.set_extensions()
diff --git a/edsnlp/pipelines/ner/umls/factory.py b/edsnlp/pipelines/ner/umls/factory.py
index b32b38b63..11264e259 100644
--- a/edsnlp/pipelines/ner/umls/factory.py
+++ b/edsnlp/pipelines/ner/umls/factory.py
@@ -9,6 +9,7 @@
 DEFAULT_CONFIG = dict(
     attr="NORM",
     ignore_excluded=False,
+    ignore_space_tokens=False,
     term_matcher=TerminologyTermMatcher.exact,
     term_matcher_config={},
     pattern_config=dict(
@@ -26,6 +27,7 @@ def create_component(
     name: str = "eds.umls",
     attr: Union[str, Dict[str, str]] = "NORM",
     ignore_excluded: bool = False,
+    ignore_space_tokens: bool = False,
     term_matcher: TerminologyTermMatcher = TerminologyTermMatcher.exact,
     term_matcher_config: Dict[str, Any] = {},
     pattern_config: Dict[str, Any] = dict(
@@ -47,6 +49,8 @@ def create_component(
         Attribute to match on, eg `TEXT`, `NORM`, etc.
     ignore_excluded: bool
         Whether to skip excluded tokens during matching.
+    ignore_space_tokens: bool
+        Whether to skip space tokens during matching.
     term_matcher: TerminologyTermMatcher
         The term matcher to use, either `TerminologyTermMatcher.exact` or
         `TerminologyTermMatcher.simstring`
@@ -67,6 +71,7 @@ def create_component(
         terms=patterns.get_patterns(pattern_config),
         attr=attr,
         ignore_excluded=ignore_excluded,
+        ignore_space_tokens=ignore_space_tokens,
         term_matcher=term_matcher,
         term_matcher_config=term_matcher_config,
     )
diff --git a/tests/pipelines/core/test_terminology.py b/tests/pipelines/core/test_terminology.py
index 4e7429f5c..b385f9cc3 100644
--- a/tests/pipelines/core/test_terminology.py
+++ b/tests/pipelines/core/test_terminology.py
@@ -1,3 +1,4 @@
+import pytest
 from spacy.language import Language
 
 from edsnlp.utils.examples import parse_example
@@ -5,13 +6,15 @@
 example = "1g de <ent kb_id=paracetamol>doliprane</ent>"
 
 
-def test_terminology(blank_nlp: Language):
+@pytest.mark.parametrize("term_matcher", ["exact", "simstring"])
+def test_terminology(blank_nlp: Language, term_matcher: str):
     blank_nlp.add_pipe(
         "eds.terminology",
         config=dict(
             label="drugs",
             terms=dict(paracetamol=["doliprane", "tylenol", "paracetamol"]),
             attr="NORM",
+            term_matcher=term_matcher,
         ),
     )
 
diff --git a/tests/pipelines/ner/test_score.py b/tests/pipelines/ner/test_score.py
index 36db1a53c..cad10cfcf 100644
--- a/tests/pipelines/ner/test_score.py
+++ b/tests/pipelines/ner/test_score.py
@@ -51,6 +51,7 @@ def testscore_normalization(raw_score: str):
         regex=[r"test+score"],
         attr="NORM",
         ignore_excluded=True,
+        ignore_space_tokens=False,
         value_extract=r"(\d+)",
         score_normalization=testscore_normalization,
         window=4,