
Commit

feat: add ignore_space_tokens to relevant components and update docs & tests
percevalw committed Mar 20, 2023
1 parent 1186444 commit bb86cc0
Showing 25 changed files with 443 additions and 126 deletions.
1 change: 1 addition & 0 deletions changelog.md
@@ -5,6 +5,7 @@
### Added

- Add `eds.spaces` (or `eds.normalizer` with `spaces=True`) to detect space tokens, and add `ignore_space_tokens` to `EDSPhraseMatcher` and `SimstringMatcher` to skip them
- Add `ignore_space_tokens` option in most components (see the sketch below)
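For instance, the two additions can be combined along these lines (a minimal, hypothetical sketch; the component names and config keys are the ones touched by this commit, the example text is made up):

```python
import spacy

nlp = spacy.blank("eds")
# Tag spaces and newlines as space tokens
nlp.add_pipe("eds.normalizer", config={"spaces": True})
# The matcher can now skip those tokens while matching
nlp.add_pipe(
    "eds.matcher",
    config={
        "terms": {"diabetes": ["diabète", "diabète de type 2"]},
        "attr": "NORM",
        "ignore_space_tokens": True,
    },
)

doc = nlp("Antécédent de diabète  de\ntype 2.")
print([(ent.text, ent.label_) for ent in doc.ents])
```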

## v0.8.0 (2023-03-09)

4 changes: 2 additions & 2 deletions docs/pipelines/core/normalisation.md
@@ -62,8 +62,8 @@ Moreover, every span exposes a `normalized_variant` extension getter, which comp
The pipeline can be configured using the following parameters:

::: edsnlp.pipelines.core.normalizer.factory.create_component
options:
only_parameters: true

## Pipelines

10 changes: 3 additions & 7 deletions docs/pipelines/core/terminology.md
@@ -55,13 +55,9 @@ This snippet is complete, and should run as is.

The pipeline can be configured using the following parameters:

::: edsnlp.pipelines.core.terminology.factory.create_component
options:
only_parameters: true

Patterns, be they `terms` or `regex`, are defined as dictionaries where keys become the `kb_id_` of the extracted entities.
Dictionary values are either a single expression or a list of expressions that match the concept (see [example](#usage)).
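For example, a sketch of such a dictionary (hypothetical snippet; the drug names and expressions are purely illustrative):

```python
import spacy

nlp = spacy.blank("eds")
nlp.add_pipe("eds.normalizer")
nlp.add_pipe(
    "eds.terminology",
    config={
        "label": "drug",
        "terms": {
            # keys become the `kb_id_` of the extracted entities
            "paracetamol": ["paracétamol", "doliprane"],
            # a single expression is also accepted
            "aspirin": "aspirine",
        },
        "attr": "NORM",
    },
)

doc = nlp("Prescription de doliprane et d'aspirine.")
print([(ent.text, ent.label_, ent.kb_id_) for ent in doc.ents])
```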
19 changes: 12 additions & 7 deletions edsnlp/pipelines/core/contextual_matcher/contextual_matcher.py
@@ -51,6 +51,8 @@ class ContextualMatcher(BaseComponent):
Attribute to match on, eg `TEXT`, `NORM`, etc.
ignore_excluded : bool
Whether to skip excluded tokens during matching.
ignore_space_tokens : bool
Whether to skip space tokens during matching.
alignment_mode : str
Overwrite alignment mode.
regex_flags : Union[re.RegexFlag, int]
@@ -65,12 +67,13 @@ def __init__(
nlp: Language,
name: str,
patterns: Union[Dict[str, Any], List[Dict[str, Any]]],
assign_as_span: bool = False,
alignment_mode: str = "expand",
attr: str = "NORM",
regex_flags: Union[re.RegexFlag, int] = 0,
ignore_excluded: bool = False,
ignore_space_tokens: bool = False,
include_assigned: bool = False,
):
self.name = name
self.nlp = nlp
@@ -160,6 +163,7 @@ def __init__(
attr=p["regex_attr"] or self.attr,
flags=p["regex_flags"] or self.regex_flags,
ignore_excluded=ignore_excluded,
ignore_space_tokens=ignore_space_tokens,
alignment_mode=alignment_mode,
span_from_group=True,
)
@@ -290,8 +294,9 @@ def assign_one(self, span: Span) -> Span:
end_char=match.end(0),
key=matcher["matcher"].regex[0][0],
attr=matcher["matcher"].regex[0][2],
alignment_mode=matcher["matcher"].regex[0][5],
ignore_excluded=matcher["matcher"].regex[0][3],
ignore_space_tokens=matcher["matcher"].regex[0][4],
),
)
for (span, match) in assigned_list
3 changes: 3 additions & 0 deletions edsnlp/pipelines/core/contextual_matcher/factory.py
@@ -9,6 +9,7 @@
DEFAULT_CONFIG = dict(
attr="NORM",
ignore_excluded=False,
ignore_space_tokens=False,
regex_flags=0,
alignment_mode="expand",
assign_as_span=False,
@@ -28,6 +29,7 @@ def create_component(
alignment_mode: str,
attr: str,
ignore_excluded: bool,
ignore_space_tokens: bool,
regex_flags: Union[re.RegexFlag, int],
include_assigned: bool,
):
@@ -68,6 +70,7 @@
alignment_mode,
attr=attr,
ignore_excluded=ignore_excluded,
ignore_space_tokens=ignore_space_tokens,
regex_flags=regex_flags,
include_assigned=include_assigned,
)
1 change: 1 addition & 0 deletions edsnlp/pipelines/core/endlines/endlines.py
@@ -49,6 +49,7 @@ def __init__(
new_line=r"\n+",
),
ignore_excluded=False,
ignore_space_tokens=False,
**kwargs,
)

47 changes: 40 additions & 7 deletions edsnlp/pipelines/core/matcher/factory.py
@@ -11,6 +11,7 @@
regex=None,
attr="TEXT",
ignore_excluded=False,
ignore_space_tokens=False,
term_matcher=GenericTermMatcher.exact,
term_matcher_config={},
)
@@ -27,14 +28,45 @@
)
def create_component(
nlp: Language,
name: str = "eds.matcher",
terms: Optional[Dict[str, Union[str, List[str]]]] = None,
attr: Union[str, Dict[str, str]] = "TEXT",
regex: Optional[Dict[str, Union[str, List[str]]]] = None,
ignore_excluded: bool = False,
ignore_space_tokens: bool = False,
term_matcher: GenericTermMatcher = GenericTermMatcher.exact,
term_matcher_config: Dict[str, Any] = {},
):
"""
Provides a generic matcher component.
Parameters
----------
nlp : Language
The spaCy object.
name: str
The name of the component.
terms : Optional[Patterns]
A dictionary of terms.
regex : Optional[Patterns]
A dictionary of regular expressions.
attr : str
The default attribute to use for matching.
Can be overridden using the `terms` and `regex` configurations.
ignore_excluded : bool
Whether to skip excluded tokens (requires an upstream
pipeline to mark excluded tokens).
ignore_space_tokens : bool
Whether to skip space tokens during matching. Note that matching on newlines
will not be possible if this option is enabled together with the `spaces`/`newline`
option of `eds.normalizer` (which is enabled by default).
term_matcher : GenericTermMatcher
The matcher to use for matching phrases: one of `exact` or `simstring`.
term_matcher_config : Dict[str, Any]
Parameters of the matcher class.
"""
assert not (terms is None and regex is None)

if terms is None:
@@ -48,6 +80,7 @@ def create_component(
attr=attr,
regex=regex,
ignore_excluded=ignore_excluded,
ignore_space_tokens=ignore_space_tokens,
term_matcher=term_matcher,
term_matcher_config=term_matcher_config,
)
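The newline caveat above can be made concrete with a small sketch (hypothetical snippet; it explicitly enables the normalizer's space tagging so the interaction is visible):

```python
import spacy

nlp = spacy.blank("eds")
# Mark spaces/newlines as space tokens
nlp.add_pipe("eds.normalizer", config={"spaces": True})
nlp.add_pipe(
    "eds.matcher",
    config={
        "regex": {"new_line": r"\n+"},   # tries to match on newlines...
        "ignore_space_tokens": True,     # ...but space tokens are skipped during matching
    },
)

doc = nlp("Première ligne\nDeuxième ligne")
print(doc.ents)  # expected to be empty with this configuration
```

This is also why the `eds.endlines` matcher (see its diff above) keeps `ignore_space_tokens=False` for its `new_line` pattern.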
9 changes: 6 additions & 3 deletions edsnlp/pipelines/core/matcher/matcher.py
@@ -35,9 +35,11 @@ class GenericMatcher(BaseComponent):
ignore_excluded : bool
Whether to skip excluded tokens (requires an upstream
pipeline to mark excluded tokens).
ignore_space_tokens : bool
Whether to skip space tokens during matching. Note that matching on newlines
will not be possible if this option is enabled together with the `spaces`/`newline`
option of `eds.normalizer` (which is enabled by default).
term_matcher : GenericTermMatcher
The matcher to use for matching phrases: one of `exact` or `simstring`.
@@ -86,6 +88,7 @@ def __init__(
self.regex_matcher = RegexMatcher(
attr=attr,
ignore_excluded=ignore_excluded,
ignore_space_tokens=ignore_space_tokens,
)

self.phrase_matcher.build_patterns(nlp=nlp, terms=terms)
59 changes: 45 additions & 14 deletions edsnlp/pipelines/core/terminology/factory.py
@@ -6,9 +6,10 @@

DEFAULT_CONFIG = dict(
terms=None,
regex=None,
attr="TEXT",
ignore_excluded=False,
ignore_space_tokens=False,
term_matcher="exact",
term_matcher_config={},
)
@@ -21,29 +22,59 @@
)
def create_component(
nlp: Language,
label: str,
terms: Optional[Dict[str, Union[str, List[str]]]],
name: str = "eds.terminology",
attr: Union[str, Dict[str, str]] = "TEXT",
regex: Optional[Dict[str, Union[str, List[str]]]] = None,
ignore_excluded: bool = False,
ignore_space_tokens: bool = False,
term_matcher: TerminologyTermMatcher = "exact",
term_matcher_config: Dict[str, Any] = {},
):
"""
Provides a terminology matching component.
The terminology matching component differs from the simple matcher component in that
the `regex` and `terms` keys are used as spaCy's `kb_id`. All matched entities
have the same label, defined in the top-level constructor (argument `label`).
Parameters
----------
nlp : Language
The spaCy object.
name: str
The name of the component.
label : str
Top-level label.
terms : Optional[Patterns]
A dictionary of terms.
regex : Optional[Patterns]
A dictionary of regular expressions.
attr : str
The default attribute to use for matching.
Can be overridden using the `terms` and `regex` configurations.
ignore_excluded : bool
Whether to skip excluded tokens (requires an upstream
pipeline to mark excluded tokens).
ignore_space_tokens: bool
Whether to skip space tokens during matching.
term_matcher : TerminologyTermMatcher
The matcher to use for matching phrases: one of `exact` or `simstring`.
term_matcher_config : Dict[str, Any]
Parameters of the matcher class.
"""
assert not (terms is None and regex is None)

return TerminologyMatcher(
nlp,
label=label,
terms=terms or dict(),
attr=attr,
regex=regex or dict(),
ignore_excluded=ignore_excluded,
ignore_space_tokens=ignore_space_tokens,
term_matcher=term_matcher,
term_matcher_config=term_matcher_config,
)
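A hypothetical call through this factory, using the approximate `simstring` matcher (the terms are illustrative; `term_matcher_config` is forwarded to the underlying `SimstringMatcher`):

```python
import spacy

nlp = spacy.blank("eds")
nlp.add_pipe("eds.normalizer")
nlp.add_pipe(
    "eds.terminology",
    config={
        "label": "diagnosis",
        "terms": {"diabetes": ["diabète", "diabète de type 2"]},
        "attr": "NORM",
        "term_matcher": "simstring",   # approximate matching instead of exact
        "term_matcher_config": {},     # keep the SimstringMatcher defaults
        "ignore_space_tokens": True,
    },
)
```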
6 changes: 6 additions & 0 deletions edsnlp/pipelines/core/terminology/terminology.py
@@ -42,6 +42,8 @@ class TerminologyMatcher(BaseComponent):
ignore_excluded : bool
Whether to skip excluded tokens (requires an upstream
pipeline to mark excluded tokens).
ignore_space_tokens : bool
Whether to skip space tokens during matching.
term_matcher: TerminologyTermMatcher
The matcher to use for matching phrases: one of `exact` or `simstring`.
@@ -57,6 +59,7 @@ def __init__(
regex: Optional[Patterns],
attr: str,
ignore_excluded: bool,
ignore_space_tokens: bool = False,
term_matcher: TerminologyTermMatcher = TerminologyTermMatcher.exact,
term_matcher_config=None,
):
@@ -72,13 +75,15 @@
self.nlp.vocab,
attr=attr,
ignore_excluded=ignore_excluded,
ignore_space_tokens=ignore_space_tokens,
**(term_matcher_config or {}),
)
elif term_matcher == TerminologyTermMatcher.simstring:
self.phrase_matcher = SimstringMatcher(
vocab=self.nlp.vocab,
attr=attr,
ignore_excluded=ignore_excluded,
ignore_space_tokens=ignore_space_tokens,
**(term_matcher_config or {}),
)
else:
@@ -90,6 +95,7 @@ def __init__(
self.regex_matcher = RegexMatcher(
attr=attr,
ignore_excluded=ignore_excluded,
ignore_space_tokens=ignore_space_tokens,
)

self.phrase_matcher.build_patterns(nlp=nlp, terms=terms, progress=True)
5 changes: 5 additions & 0 deletions edsnlp/pipelines/ner/cim10/factory.py
@@ -9,6 +9,7 @@
DEFAULT_CONFIG = dict(
attr="NORM",
ignore_excluded=False,
ignore_space_tokens=False,
term_matcher=TerminologyTermMatcher.exact,
term_matcher_config={},
)
@@ -22,6 +23,7 @@ def create_component(
name: str = "eds.cim10",
attr: Union[str, Dict[str, str]] = "NORM",
ignore_excluded: bool = False,
ignore_space_tokens: bool = False,
term_matcher: TerminologyTermMatcher = TerminologyTermMatcher.exact,
term_matcher_config: Dict[str, Any] = {},
):
@@ -39,6 +41,8 @@
Attribute to match on, eg `TEXT`, `NORM`, etc.
ignore_excluded: bool
Whether to skip excluded tokens during matching.
ignore_space_tokens: bool
Whether to skip space tokens during matching.
term_matcher: TerminologyTermMatcher
The term matcher to use, either `TerminologyTermMatcher.exact` or
`TerminologyTermMatcher.simstring`
@@ -57,6 +61,7 @@ def create_component(
terms=patterns.get_patterns(),
attr=attr,
ignore_excluded=ignore_excluded,
ignore_space_tokens=ignore_space_tokens,
term_matcher=term_matcher,
term_matcher_config=term_matcher_config,
)
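A usage sketch for this factory (hypothetical snippet; the clinical text is made up and the matched codes depend on the bundled CIM-10 patterns):

```python
import spacy

nlp = spacy.blank("eds")
nlp.add_pipe("eds.normalizer")
nlp.add_pipe("eds.cim10", config={"ignore_space_tokens": True})

doc = nlp("Prise en charge d'un diabète insulino-dépendant.")
# Each match carries its CIM-10 code in `ent.kb_id_`
print([(ent.text, ent.kb_id_) for ent in doc.ents])
```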
5 changes: 5 additions & 0 deletions edsnlp/pipelines/ner/covid/factory.py
@@ -9,6 +9,7 @@
DEFAULT_CONFIG = dict(
attr="LOWER",
ignore_excluded=False,
ignore_space_tokens=False,
)


@@ -22,6 +23,7 @@ def create_component(
name: str = "eds.covid",
attr: Union[str, Dict[str, str]] = "LOWER",
ignore_excluded: bool = False,
ignore_space_tokens: bool = False,
):
"""
Create a factory that returns a new GenericMatcher with patterns for covid.
@@ -36,6 +38,8 @@
Attribute to match on, eg `TEXT`, `NORM`, etc.
ignore_excluded: bool
Whether to skip excluded tokens during matching.
ignore_space_tokens: bool
Whether to skip space tokens during matching.
Returns
-------
@@ -48,4 +52,5 @@
regex=dict(covid=patterns.pattern),
attr=attr,
ignore_excluded=ignore_excluded,
ignore_space_tokens=ignore_space_tokens,
)
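And similarly for the covid component (a minimal sketch):

```python
import spacy

nlp = spacy.blank("eds")
nlp.add_pipe("eds.normalizer")
nlp.add_pipe("eds.covid", config={"ignore_space_tokens": True})

doc = nlp("Patient admis pour suspicion de  Covid-19.")
print([(ent.text, ent.label_) for ent in doc.ents])
```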