From db55239be272fe82ac109b9d942a333a26e294f3 Mon Sep 17 00:00:00 2001
From: Perceval Wajsburt
Date: Fri, 3 May 2024 08:53:23 +0000
Subject: [PATCH] feat: add required & include parameter & support for span_getter in eds.contextual_matcher
---
 changelog.md                                 |   2 +
 docs/assets/stylesheets/extra.css            |  17 ++
 docs/pipes/core/contextual-matcher.md        |  70 +----
 edsnlp/matchers/regex.py                     |   4 +-
 .../contextual_matcher/contextual_matcher.py | 286 ++++++++++--------
 .../pipes/core/contextual_matcher/models.py  | 231 +++++++++++---
 edsnlp/utils/typing.py                       |   3 +-
 pyproject.toml                               |   4 +-
 8 files changed, 381 insertions(+), 236 deletions(-)

diff --git a/changelog.md b/changelog.md
index 2ff94fc8b..f1ad038bb 100644
--- a/changelog.md
+++ b/changelog.md
@@ -6,6 +6,8 @@

 - Expose the default patterns of `eds.negation`, `eds.hypothesis`, `eds.family`, `eds.history` and `eds.reported_speech` under a `eds.negation.default_patterns` attribute
 - Added a `context_getter` SpanGetter argument to the `eds.matcher` class to only retrieve entities inside the spans returned by the getter
+- Added a `filter_expr` parameter to scorers to filter the documents to score
+- Added a new `required` field to `eds.contextual_matcher` assign patterns: when set, an extraction is kept only if the corresponding assign pattern was found. Also added an `include` parameter (the counterpart of `exclude`) to require patterns in the context without assigning them to the entity

 ## v0.11.2

diff --git a/docs/assets/stylesheets/extra.css b/docs/assets/stylesheets/extra.css
index 0193f144b..e856dba66 100644
--- a/docs/assets/stylesheets/extra.css
+++ b/docs/assets/stylesheets/extra.css
@@ -166,3 +166,20 @@ body, input {
 .md-typeset code a:not(.md-annotation__index) {
   border-bottom: 1px dashed var(--md-typeset-a-color);
 }
+
+.doc-param-details .subdoc {
+  padding: 0;
+  box-shadow: none;
+  border-color: var(--md-typeset-table-color);
+}
+
+.doc-param-details .subdoc > div > div > div > table {
+  padding: 0;
+  box-shadow: none;
+  border: none;
+}
+
+.doc-param-details .subdoc > summary {
+  margin: 0;
+  font-weight: normal;
+}

diff --git a/docs/pipes/core/contextual-matcher.md b/docs/pipes/core/contextual-matcher.md
index 8f7e21c29..7a5697eda 100644
--- a/docs/pipes/core/contextual-matcher.md
+++ b/docs/pipes/core/contextual-matcher.md
@@ -206,74 +206,6 @@ Let us see what we can get from this pipeline with a few examples

 However, most of the configuration is provided in the `patterns` key, as a **pattern dictionary** or a **list of pattern dictionaries**

-## The pattern dictionary
-
-### Description
-
-A pattern is a nested dictionary with the following keys:
-
-=== "`source`"
-
-    A label describing the pattern
-
-=== "`regex`"
-
-    A single Regex or a list of Regexes
-
-=== "`regex_attr`"
-
-    An attribute to overwrite the given `attr` when matching with Regexes.
-
-=== "`terms`"
-
-    A single term or a list of terms (for exact matches)
-
-=== "`exclude`"
-
-    A dictionary (or list of dictionaries) to define exclusion rules. Exclusion rules are given as Regexes, and if a
-    match is found in the surrounding context of an extraction, the extraction is removed. Each dictionary should have the following keys:
-
-    === "`window`"
-
-        Size of the context to use (in number of words).
You can provide the window as:
-
-        - A positive integer, in this case the used context will be taken **after** the extraction
-        - A negative integer, in this case the used context will be taken **before** the extraction
-        - A tuple of integers `(start, end)`, in this case the used context will be the snippet from `start` tokens before the extraction to `end` tokens after the extraction
-
-    === "`regex`"
-
-        A single Regex or a list of Regexes.
-
-=== "`assign`"
-
-    A dictionary to refine the extraction. Similarly to the `exclude` key, you can provide a dictionary to
-    use on the context **before** and **after** the extraction.
-
-    === "`name`"
-
-        A name (string)
-
-    === "`window`"
-
-        Size of the context to use (in number of words). You can provide the window as:
-
-        - A positive integer, in this case the used context will be taken **after** the extraction
-        - A negative integer, in this case the used context will be taken **before** the extraction
-        - A tuple of integers `(start, end)`, in this case the used context will be the snippet from `start` tokens before the extraction to `end` tokens after the extraction
-
-    === "`regex`"
-
-        A dictionary where keys are labels and values are **Regexes with a single capturing group**
-
-    === "`replace_entity`"
-
-        If set to `True`, the match from the corresponding assign key will be used as the entity, instead of the main match. See [this paragraph][the-replace_entity-parameter]
-
-    === "`reduce_mode`"
-
-        Set how multiple assign matches are handled. See the documentation of the [`reduce_mode` parameter][the-reduce_mode-parameter]
-
 ### A full pattern dictionary example

 ```python
 dict(
@@ -300,6 +232,8 @@ dict(
             regex=r"(neonatal)",
             expand_entity=True,
             window=3,
+            # keep the extraction only if neonatal is found
+            required=True,
         ),
         dict(
             name="trans",
diff --git a/edsnlp/matchers/regex.py b/edsnlp/matchers/regex.py
index 681788535..4c1921238 100644
--- a/edsnlp/matchers/regex.py
+++ b/edsnlp/matchers/regex.py
@@ -1,6 +1,6 @@
 import re
 from bisect import bisect_left, bisect_right
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import Any, Dict, Iterator, List, Optional, Tuple, Union

 from loguru import logger
 from spacy.tokens import Doc, Span
@@ -465,7 +465,7 @@ def __call__(
         doclike: Union[Doc, Span],
         as_spans=False,
         return_groupdict=False,
-    ) -> Union[Span, Tuple[Span, Dict[str, Any]]]:
+    ) -> Iterator[Union[Span, Tuple[Span, Dict[str, Any]]]]:
         """
         Performs matching. Yields matches.
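Taken together, the documentation and matcher changes in this patch enable configurations like the one sketched below. This is a minimal, illustrative usage sketch and not part of the patch: the pipeline assembly follows the existing `eds.contextual_matcher` docs, while the source label, regexes, windows and sample sentence are invented for the example.

```python
import edsnlp

nlp = edsnlp.blank("eds")
nlp.add_pipe("eds.sentences")  # sentence boundaries, used by limit_to_sentence
nlp.add_pipe(
    "eds.contextual_matcher",
    config=dict(
        patterns=dict(
            source="metastasis",
            regex=[r"m[ée]tastase"],
            regex_attr="TEXT",
            # New `include` key: keep the anchor only if this regex
            # matches somewhere in the surrounding context window
            include=dict(regex=r"[Cc]ancer", window=(-10, 10)),
            assign=[
                dict(
                    name="stage",
                    regex=r"stade (I{1,3}V?)",
                    window=5,
                    # New `required` field: discard the anchor entirely
                    # if no stage is found in the window
                    required=True,
                ),
            ],
        ),
    ),
)

doc = nlp("Cancer du sein avec métastase de stade II.")
for ent in doc.ents:
    print(ent.text, ent._.assigned)
```

Note how a single pattern dictionary is accepted for `patterns` and `include`: the `AsList`-based models introduced in `models.py` coerce it to a one-element list.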
diff --git a/edsnlp/pipes/core/contextual_matcher/contextual_matcher.py b/edsnlp/pipes/core/contextual_matcher/contextual_matcher.py
index a68f343e7..868cac49d 100644
--- a/edsnlp/pipes/core/contextual_matcher/contextual_matcher.py
+++ b/edsnlp/pipes/core/contextual_matcher/contextual_matcher.py
@@ -1,11 +1,9 @@
+import copy
 import re
 import warnings
-from collections import defaultdict
 from functools import lru_cache
-from operator import attrgetter
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import Dict, List, Optional, Tuple, Union

-import pydantic
 from confit import VisibleDeprecationWarning
 from loguru import logger
 from spacy.tokens import Doc, Span
@@ -13,22 +11,36 @@
 from edsnlp.core import PipelineProtocol
 from edsnlp.matchers.phrase import EDSPhraseMatcher
 from edsnlp.matchers.regex import RegexMatcher, create_span
-from edsnlp.matchers.utils import get_text
 from edsnlp.pipes.base import BaseNERComponent, SpanSetterArg
 from edsnlp.utils.collections import flatten_once
+from edsnlp.utils.doc_to_text import get_text
+from edsnlp.utils.span_getters import get_spans
+from edsnlp.utils.typing import AsList  # noqa: F401

 from . import models
+from .models import FullConfig, SingleAssignModel, SingleConfig


 @lru_cache(64)
 def get_window(
     doclike: Union[Doc, Span], window: Tuple[int, int], limit_to_sentence: bool
 ):
+    """
+    Generate a token window around the given span or doc
+    """
     start_limit = doclike.sent.start if limit_to_sentence else 0
     end_limit = doclike.sent.end if limit_to_sentence else len(doclike.doc)

-    start = max(doclike.start + window[0], start_limit)
-    end = min(doclike.end + window[1], end_limit)
+    start = (
+        max(doclike.start + window[0], start_limit)
+        if window and window[0] is not None
+        else start_limit
+    )
+    end = (
+        min(doclike.end + window[1], end_limit)
+        if window and window[1] is not None
+        else end_limit
+    )

     return doclike.doc[start:end]

@@ -44,8 +56,13 @@ class ContextualMatcher(BaseNERComponent):
     spaCy `Language` object.
     name : Optional[str]
         The name of the pipe
-    patterns : Union[Dict[str, Any], List[Dict[str, Any]]]
-        The configuration dictionary
+    patterns : AsList[SingleConfig]
+        ???
subdoc "The patterns to match" + + ::: edsnlp.pipes.core.contextual_matcher.models.SingleConfig + options: + only_parameters: "no-header" + show_toc: false assign_as_span : bool Whether to store eventual extractions defined via the `assign` key as Spans or as string @@ -75,7 +92,7 @@ def __init__( nlp: Optional[PipelineProtocol], name: Optional[str] = "contextual_matcher", *, - patterns: Union[Dict[str, Any], List[Dict[str, Any]]], + patterns: FullConfig, assign_as_span: bool = False, alignment_mode: str = "expand", attr: str = "NORM", @@ -104,12 +121,13 @@ def __init__( self.ignore_excluded = ignore_excluded self.ignore_space_tokens = ignore_space_tokens self.alignment_mode = alignment_mode - self.regex_flags = regex_flags + self.regex_flags: Union[re.RegexFlag, int] = regex_flags self.include_assigned = include_assigned # Configuration parsing - patterns = pydantic.parse_obj_as(models.FullConfig, patterns) - self.patterns = {pattern.source: pattern for pattern in patterns} + self.patterns: Dict[str, SingleConfig] = copy.deepcopy( + {pattern.source: pattern for pattern in patterns} + ) # Matchers for the anchors self.phrase_matcher = EDSPhraseMatcher( @@ -146,11 +164,6 @@ def __init__( } ) - self.exclude_matchers = defaultdict( - list - ) # Will contain all the exclusion matchers - self.assign_matchers = defaultdict(list) # Will contain all the assign matchers - # Will contain the reduce mode (for each source and assign matcher) self.reduce_mode = {} @@ -159,71 +172,62 @@ def __init__( self.replace_key = {} for source, p in self.patterns.items(): - p = p.dict() - - for exclude in p["exclude"]: - exclude_matcher = RegexMatcher( - attr=exclude["regex_attr"] or p["regex_attr"] or self.attr, - flags=exclude["regex_flags"] - or p["regex_flags"] - or self.regex_flags, + p: SingleConfig + + for exclude in p.exclude: + exclude.matcher = RegexMatcher( + attr=exclude.regex_attr or p.regex_attr or self.attr, + flags=exclude.regex_flags or p.regex_flags or self.regex_flags, ignore_excluded=ignore_excluded, ignore_space_tokens=ignore_space_tokens, alignment_mode="expand", ) - exclude_matcher.build_patterns(regex={"exclude": exclude["regex"]}) + exclude.matcher.build_patterns(regex={"exclude": exclude.regex}) - self.exclude_matchers[source].append( - dict( - matcher=exclude_matcher, - window=exclude["window"], - limit_to_sentence=exclude["limit_to_sentence"], - ) - ) - - replace_key = None - - for assign in p["assign"]: - assign_matcher = RegexMatcher( - attr=assign["regex_attr"] or p["regex_attr"] or self.attr, - flags=assign["regex_flags"] or p["regex_flags"] or self.regex_flags, + for include in p.include: + include.matcher = RegexMatcher( + attr=include.regex_attr or p.regex_attr or self.attr, + flags=include.regex_flags or p.regex_flags or self.regex_flags, ignore_excluded=ignore_excluded, ignore_space_tokens=ignore_space_tokens, - alignment_mode=alignment_mode, - span_from_group=True, + alignment_mode="expand", ) - assign_matcher.build_patterns( - regex={assign["name"]: assign["regex"]}, - ) + include.matcher.build_patterns(regex={"include": include.regex}) + + replace_key = None - self.assign_matchers[source].append( - dict( - name=assign["name"], - matcher=assign_matcher, - window=assign["window"], - limit_to_sentence=assign["limit_to_sentence"], - replace_entity=assign["replace_entity"], - reduce_mode=assign["reduce_mode"], + for assign in p.assign: + assign.matcher = None + if assign.regex: + assign.matcher = RegexMatcher( + attr=assign.regex_attr or p.regex_attr or self.attr, + 
flags=assign.regex_flags or p.regex_flags or self.regex_flags,
+                    ignore_excluded=ignore_excluded,
+                    ignore_space_tokens=ignore_space_tokens,
+                    alignment_mode=alignment_mode,
+                    span_from_group=True,
                 )
-            )
-            if assign["replace_entity"]:
+                assign.matcher.build_patterns(
+                    regex={assign.name: assign.regex},
+                )
+                assign.regex = assign.matcher
+
+            if assign.replace_entity:
                 # We know that there is only one assign name
                 # with `replace_entity==True`
                 # from Pydantic validation
-                replace_key = assign["name"]
+                replace_key = assign.name

+            self.reduce_mode[source] = {d.name: d.reduce_mode for d in p.assign}
             self.replace_key[source] = replace_key

-            self.reduce_mode[source] = {
-                d["name"]: d["reduce_mode"] for d in self.assign_matchers[source]
-            }
-
-        self.set_extensions()
-
     def set_extensions(self) -> None:
+        """
+        Define the extensions used by the component
+        """
         super().set_extensions()
         if not Span.has_extension("assigned"):
             Span.set_extension("assigned", default=dict())
@@ -232,8 +236,8 @@ def set_extensions(self) -> None:

     def filter_one(self, span: Span) -> Span:
         """
-        Filter extracted entity based on the "exclusion filter" mentioned
-        in the configuration
+        Filter extracted entity based on the exclusion and inclusion filters of
+        the configuration.

         Parameters
         ----------
@@ -247,22 +251,26 @@ def filter_one(self, span: Span) -> Span:
         """
         source = span.label_
         to_keep = True
-        for matcher in self.exclude_matchers[source]:
-            window = matcher["window"]
-            limit_to_sentence = matcher["limit_to_sentence"]
+        for exclude in self.patterns[source].exclude:
             snippet = get_window(
                 doclike=span,
-                window=window,
-                limit_to_sentence=limit_to_sentence,
+                window=exclude.window,
+                limit_to_sentence=exclude.limit_to_sentence,
             )

-            if (
-                next(
-                    matcher["matcher"](snippet, as_spans=True),
-                    None,
-                )
-                is not None
-            ):
+            if next(exclude.matcher(snippet, as_spans=True), None) is not None:
+                to_keep = False
+                logger.trace(f"Entity {span} was filtered out")
+                break
+
+        for include in self.patterns[source].include:
+            snippet = get_window(
+                doclike=span,
+                window=include.window,
+                limit_to_sentence=include.limit_to_sentence,
+            )
+
+            if next(include.matcher(snippet, as_spans=True), None) is None:
                 to_keep = False
                 logger.trace(f"Entity {span} was filtered out")
                 break
@@ -290,18 +298,17 @@ def assign_one(self, span: Span) -> Span:
         """

         if span is None:
-            yield from []
             return

         source = span.label_
         assigned_dict = models.AssignDict(reduce_mode=self.reduce_mode[source])
         replace_key = None
-        for matcher in self.assign_matchers[source]:
-            attr = self.patterns[source].regex_attr or matcher["matcher"].default_attr
-            window = matcher["window"]
-            limit_to_sentence = matcher["limit_to_sentence"]
-            replace_entity = matcher["replace_entity"]  # Boolean
+
+        all_assigned_list = []
+        for assign in self.patterns[source].assign:
+            assign: SingleAssignModel
+            window = assign.window
+            limit_to_sentence = assign.limit_to_sentence

             snippet = get_window(
                 doclike=span,
@@ -309,53 +316,67 @@ def assign_one(self, span: Span) -> Span:
                 limit_to_sentence=limit_to_sentence,
             )

-            # Getting the matches
-            assigned_list = list(matcher["matcher"].match(snippet))
-
-            assigned_list = [
-                (span, span, matcher["matcher"].regex[0][0])
-                if not match.groups()
-                else (
-                    span,
-                    create_span(
-                        doclike=snippet,
-                        start_char=match.start(0),
-                        end_char=match.end(0),
-                        key=matcher["matcher"].regex[0][0],
-                        attr=matcher["matcher"].regex[0][2],
-                        alignment_mode=matcher["matcher"].regex[0][5],
-                        ignore_excluded=matcher["matcher"].regex[0][3],
ignore_space_tokens=matcher["matcher"].regex[0][4],
-                    ),
-                    matcher["matcher"].regex[0][0],
-                )
-                for (span, match) in assigned_list
-            ]
+            matcher: RegexMatcher = assign.matcher
+            if matcher is not None:
+                # Getting the matches
+                assigned_list = list(matcher.match(snippet))
+                assigned_list = [
+                    (matched_span, matched_span, matcher.regex[0][0], assign)
+                    if not re_match.groups()
+                    else (
+                        matched_span,
+                        create_span(
+                            doclike=snippet,
+                            start_char=re_match.start(0),
+                            end_char=re_match.end(0),
+                            key=matcher.regex[0][0],
+                            attr=matcher.regex[0][2],
+                            alignment_mode=matcher.regex[0][5],
+                            ignore_excluded=matcher.regex[0][3],
+                            ignore_space_tokens=matcher.regex[0][4],
+                        ),
+                        matcher.regex[0][0],
+                        assign,
+                    )
+                    for (matched_span, re_match) in assigned_list
+                ]
+            else:
+                assigned_list = [
+                    (matched_span, matched_span, assign.name, assign)
+                    for matched_span in get_spans(snippet.doc, assign.span_getter)
+                    if matched_span.start >= snippet.start
+                    and matched_span.end <= snippet.end
+                ]

             # assigned_list now contains tuples with
             # - the first element being the span extracted from the group
             # - the second element being the full match
+            # - the third element being the assign key (name)
+            # - the fourth element being the assign config itself

-            if not assigned_list:  # No match was found
+            if assign.required and not assigned_list:
+                logger.trace(f"Entity {span} was filtered out")
+                return
+
+            all_assigned_list.extend(assigned_list)
+
+        for assigned in all_assigned_list:
+            if assigned is None:
                 continue
+            group_span, full_match_span, value_key, assign = assigned
+            if assign.replace_entity:
+                replace_key = value_key
+
+            # Using the overridden `__setitem__` method from AssignDict here:
+            assigned_dict[value_key] = {
+                "span": full_match_span,  # Full span
+                "value_span": group_span,  # Span of the group
+                "value_text": get_text(
+                    group_span,
+                    attr=self.patterns[source].regex_attr,
+                    ignore_excluded=self.ignore_excluded,
+                ),  # Text of the group
+            }
+            logger.trace(f"Assign key {value_key} matched on entity {span}")

-            for assigned in assigned_list:
-                if assigned is None:
-                    continue
-                if replace_entity:
-                    replace_key = assigned[2]
-
-                # Using he overrid `__setitem__` method from AssignDict here:
-                assigned_dict[assigned[2]] = {
-                    "span": assigned[1],  # Full span
-                    "value_span": assigned[0],  # Span of the group
-                    "value_text": get_text(
-                        assigned[0],
-                        attr=attr,
-                        ignore_excluded=self.ignore_excluded,
-                    ),  # Text of the group
-                }
-                logger.trace(f"Assign key {matcher['name']} matched on entity {span}")
+
         if replace_key is None and self.replace_key[source] is not None:
             # There should have been a replacement, but none was found
             # So we discard the entity
@@ -388,13 +409,13 @@ def assign_one(self, span: Span) -> Span:

             closest = Span(
                 span.doc,
-                min(expandables, key=attrgetter("start")).start,
-                max(expandables, key=attrgetter("end")).end,
+                min(e.start for e in expandables if e is not None),
+                max(e.end for e in expandables if e is not None),
                 span.label_,
             )
             kept_ents.append(closest)

-        kept_ents.sort(key=attrgetter("start"))
+        kept_ents.sort(key=lambda e: e.start)

         for replaced in kept_ents:
             # Propagating attributes from the anchor
@@ -434,6 +455,19 @@ def assign_one(self, span: Span) -> Span:
         yield from kept_ents

     def process_one(self, span):
+        """
+        Processes one span, applying both the filters and the assignments
+
+        Parameters
+        ----------
+        span:
+            spaCy Span object
+
+        Yields
+        ------
+        span:
+            Filtered spans, with optional assignments
+        """
         filtered = self.filter_one(span)
         yield from self.assign_one(filtered)

diff --git a/edsnlp/pipes/core/contextual_matcher/models.py
b/edsnlp/pipes/core/contextual_matcher/models.py index 4e684189c..f4aa44edb 100644 --- a/edsnlp/pipes/core/contextual_matcher/models.py +++ b/edsnlp/pipes/core/contextual_matcher/models.py @@ -1,11 +1,12 @@ import re -from typing import List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Any, List, Optional, Tuple, Union -import pydantic import regex from pydantic import BaseModel, Extra, validator from edsnlp.matchers.utils import ListOrStr +from edsnlp.utils.span_getters import SpanGetterArg +from edsnlp.utils.typing import AsList Flags = Union[re.RegexFlag, int] Window = Union[ @@ -16,6 +17,8 @@ def normalize_window(cls, v): + if v is None: + return v if isinstance(v, list): assert ( len(v) == 2 @@ -89,13 +92,42 @@ def keep_last(key, value): class SingleExcludeModel(BaseModel): + """ + A dictionary to define exclusion rules. Exclusion rules are given as Regexes, and + if a match is found in the surrounding context of an extraction, the extraction is + removed. Each dictionary should have the following keys: + + Parameters + ---------- + regex: ListOrStr + A single Regex or a list of Regexes + window: Optional[Window] + Size of the context to use (in number of words). You can provide the window as: + + - A positive integer, in this case the used context will be taken **after** + the extraction + - A negative integer, in this case the used context will be taken **before** + the extraction + - A tuple of integers `(start, end)`, in this case the used context will be + the snippet from `start` tokens before the extraction to `end` tokens + after the extraction + limit_to_sentence: Optional[bool] + If set to `True`, the exclusion will be limited to the sentence containing the + extraction + regex_flags: Optional[Flags] + Flags to use when compiling the Regexes + regex_attr: Optional[str] + An attribute to overwrite the given `attr` when matching with Regexes. + """ + regex: ListOrStr = [] - window: Window + window: Optional[Window] = None limit_to_sentence: Optional[bool] = True regex_flags: Optional[Flags] = None regex_attr: Optional[str] = None + matcher: Optional[Any] = None - @validator("regex") + @validator("regex", allow_reuse=True) def exclude_regex_validation(cls, v): if isinstance(v, str): v = [v] @@ -104,52 +136,138 @@ def exclude_regex_validation(cls, v): _normalize_window = validator("window", allow_reuse=True)(normalize_window) -class ExcludeModel: - @classmethod - def item_to_list(cls, v, config): - if not isinstance(v, list): +class SingleIncludeModel(BaseModel): + """ + A dictionary to define inclusion rules. Inclusion rules are given as Regexes, and + if a match isn't found in the surrounding context of an extraction, the extraction + is removed. Each dictionary should have the following keys: + + Parameters + ---------- + regex: ListOrStr + A single Regex or a list of Regexes + window: Optional[Window] + Size of the context to use (in number of words). 
You can provide the window as:
+
+        - A positive integer, in this case the used context will be taken **after**
+          the extraction
+        - A negative integer, in this case the used context will be taken **before**
+          the extraction
+        - A tuple of integers `(start, end)`, in this case the used context will be
+          the snippet from `start` tokens before the extraction to `end` tokens
+          after the extraction
+    limit_to_sentence: Optional[bool]
+        If set to `True`, the inclusion check will be limited to the sentence
+        containing the extraction
+    regex_flags: Optional[Flags]
+        Flags to use when compiling the Regexes
+    regex_attr: Optional[str]
+        An attribute to overwrite the given `attr` when matching with Regexes.
+    """
+
+    regex: ListOrStr = []
+    window: Optional[Window] = None
+    limit_to_sentence: Optional[bool] = True
+    regex_flags: Optional[Flags] = None
+    regex_attr: Optional[str] = None
+    matcher: Optional[Any] = None
+
+    @validator("regex", allow_reuse=True)
+    def include_regex_validation(cls, v):
+        if isinstance(v, str):
             v = [v]
-        return [pydantic.parse_obj_as(SingleExcludeModel, x) for x in v]
+        return v

-    @classmethod
-    def __get_validators__(cls):
-        yield cls.item_to_list
+    _normalize_window = validator("window", allow_reuse=True)(normalize_window)
+
+
+class ExcludeModel(AsList[SingleExcludeModel]):
+    """
+    A list of `SingleExcludeModel` objects. If a single config is passed,
+    it will be automatically converted to a list of a single element.
+    """
+
+
+class IncludeModel(AsList[SingleIncludeModel]):
+    """
+    A list of `SingleIncludeModel` objects. If a single config is passed,
+    it will be automatically converted to a list of a single element.
+    """


 class SingleAssignModel(BaseModel):
+    """
+    A dictionary to refine the extraction. Similarly to the `exclude` key, you can
+    provide a dictionary to use on the context **before** and **after** the extraction.
+
+    Parameters
+    ----------
+    name: str
+        A name (string) used as the key under which the assigned match is stored
+    window: Optional[Window]
+        Size of the context to use (in number of words). You can provide the window as:
+
+        - A positive integer, in this case the used context will be taken **after**
+          the extraction
+        - A negative integer, in this case the used context will be taken **before**
+          the extraction
+        - A tuple of integers `(start, end)`, in this case the used context will be the
+          snippet from `start` tokens before the extraction to `end` tokens after the
+          extraction
+    span_getter: Optional[SpanGetterArg]
+        A span getter to pick the assigned spans from already extracted entities
+        in the doc.
+    regex: Optional[str]
+        A **Regex with a single capturing group**, whose match will be assigned
+        under the given `name`
+    replace_entity: Optional[bool]
+        If set to `True`, the match from the corresponding assign key will be used as
+        the entity, instead of the main match.
+        See [this paragraph][the-replace_entity-parameter]
+    reduce_mode: Optional[str]
+        Set how multiple assign matches are handled. See the documentation of the
+        [`reduce_mode` parameter][the-reduce_mode-parameter]
+    required: Optional[bool]
+        If set to `True`, the assign key must match for the extraction to be kept. If
+        it does not match, the extraction is discarded.
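+    limit_to_sentence: Optional[bool]
+        If set to `True`, the assign pattern will only be searched within the
+        sentence containing the extraction
+    regex_flags: Optional[Flags]
+        Flags to use when compiling the Regexes
+    regex_attr: Optional[str]
+        An attribute to overwrite the given `attr` when matching with Regexes.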
+    """
+
     name: str
-    regex: str
-    window: Window
+    regex: Optional[str] = None
+    span_getter: Optional[SpanGetterArg] = None
+    window: Optional[Window] = None
     limit_to_sentence: Optional[bool] = True
     regex_flags: Optional[Flags] = None
     regex_attr: Optional[str] = None
     replace_entity: bool = False
     reduce_mode: Optional[str] = None
+    required: Optional[bool] = False
+
+    matcher: Optional[Any] = None

-    @validator("regex")
+    @validator("regex", allow_reuse=True)
     def check_single_regex_group(cls, pat):
+        if pat is None:
+            return pat
         compiled_pat = regex.compile(
             pat
         )  # Using regex to allow multiple groups with same name
         n_groups = compiled_pat.groups
-        assert n_groups == 1, (
-            "The pattern {pat} should have only one capturing group, not {n_groups}"
-        ).format(
-            pat=pat,
-            n_groups=n_groups,
-        )
+        assert (
+            n_groups == 1
+        ), f"The pattern {pat} should have exactly one capturing group, not {n_groups}"

         return pat

     _normalize_window = validator("window", allow_reuse=True)(normalize_window)


-class AssignModel:
-    @classmethod
-    def item_to_list(cls, v, config):
-        if not isinstance(v, list):
-            v = [v]
-        return [pydantic.parse_obj_as(SingleAssignModel, x) for x in v]
+class AssignModel(AsList[SingleAssignModel]):
+    """
+    A list of `SingleAssignModel` objects that should have at most
+    one element with `replace_entity=True`. If a single config is passed,
+    it will be automatically converted to a list of a single element.
+    """

     @classmethod
     def name_uniqueness(cls, v, config):
@@ -167,27 +285,62 @@ def replace_uniqueness(cls, v, config):

     @classmethod
     def __get_validators__(cls):
-        yield cls.item_to_list
+        yield cls.validate
         yield cls.name_uniqueness
         yield cls.replace_uniqueness


+if TYPE_CHECKING:
+    ExcludeModel = List[SingleExcludeModel]  # noqa: F811
+    IncludeModel = List[SingleIncludeModel]  # noqa: F811
+    AssignModel = List[SingleAssignModel]  # noqa: F811
+
+
 class SingleConfig(BaseModel, extra=Extra.forbid):
+    """
+    A single configuration for the contextual matcher.
+
+    Parameters
+    ----------
+    source : str
+        A label describing the pattern
+    regex : ListOrStr
+        A single Regex or a list of Regexes
+    regex_attr : Optional[str]
+        An attribute to overwrite the given `attr` when matching with Regexes.
+    terms : ListOrStr
+        A single term or a list of terms (for exact matches)
+    exclude : AsList[SingleExcludeModel]
+        ??? subdoc "One or more exclusion patterns"
+
+            ::: edsnlp.pipes.core.contextual_matcher.models.SingleExcludeModel
+                options:
+                    only_parameters: "no-header"
+    include : AsList[SingleIncludeModel]
+        ??? subdoc "One or more inclusion patterns"
+
+            ::: edsnlp.pipes.core.contextual_matcher.models.SingleIncludeModel
+                options:
+                    only_parameters: "no-header"
+    assign : AsList[SingleAssignModel]
+        ??? subdoc "One or more assignment patterns"
+
+            ::: edsnlp.pipes.core.contextual_matcher.models.SingleAssignModel
+                options:
+                    only_parameters: "no-header"
+
+    """
+
     source: str
     terms: ListOrStr = []
     regex: ListOrStr = []
     regex_attr: Optional[str] = None
     regex_flags: Union[re.RegexFlag, int] = None
-    exclude: Optional[ExcludeModel] = []
-    assign: Optional[AssignModel] = []
+    exclude: ExcludeModel = []
+    include: IncludeModel = []
+    assign: AssignModel = []


-class FullConfig:
-    @classmethod
-    def pattern_to_list(cls, v, config):
-        if not isinstance(v, list):
-            v = [v]
-        return [pydantic.parse_obj_as(SingleConfig, item) for item in v]
+class FullConfig(AsList[SingleConfig]):
+    """
+    A list of `SingleConfig` objects that should have distinct `source` fields.
+    If a single config is passed, it will be automatically converted to a list of
+    a single element.
+ """ @classmethod def source_uniqueness(cls, v, config): @@ -197,5 +350,9 @@ def source_uniqueness(cls, v, config): @classmethod def __get_validators__(cls): - yield cls.pattern_to_list + yield cls.validate yield cls.source_uniqueness + + +if TYPE_CHECKING: + FullConfig = List[SingleConfig] # noqa: F811 diff --git a/edsnlp/utils/typing.py b/edsnlp/utils/typing.py index 6562642dc..68b1fcbdf 100644 --- a/edsnlp/utils/typing.py +++ b/edsnlp/utils/typing.py @@ -9,7 +9,8 @@ class MetaAsList(type): def __init__(cls, name, bases, dct): super().__init__(name, bases, dct) - cls.item = Any + item = next((base.item for base in bases if hasattr(base, "item")), Any) + cls.item = item def __getitem__(self, item): new_type = MetaAsList(self.__name__, (self,), {}) diff --git a/pyproject.toml b/pyproject.toml index 9b347117a..36ac8fbe6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -338,9 +338,9 @@ ignore-property-decorators = false ignore-module = true ignore-nested-functions = true ignore-nested-classes = true -ignore-setters = false +ignore-setters = true fail-under = 40 -exclude = ["setup.py", "docs", "build", "tests"] +exclude = ["setup.py", "docs", "build", "tests", "edsnlp/pipes/core/contextual_matcher/models.py"] verbose = 0 quiet = false whitelist-regex = []
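To complement the `models.py` and `edsnlp/utils/typing.py` changes above, here is a short sketch of how the reworked `AsList`-based configs behave. It is illustrative only and not part of the patch: it assumes pydantic v1 semantics (matching the `validator`/`Extra` imports used in `models.py`), and the pattern values are invented.

```python
import pydantic

from edsnlp.pipes.core.contextual_matcher.models import (
    FullConfig,
    SingleAssignModel,
)

# A single pattern dictionary is coerced into a one-element list of SingleConfig
patterns = pydantic.parse_obj_as(
    FullConfig,
    dict(source="covid", regex=[r"covid[-\s]?19"]),
)
print(len(patterns), patterns[0].source)  # -> 1 covid

# Assign regexes must carry exactly one capturing group, checked at parse time
try:
    SingleAssignModel(name="stage", regex=r"stade I{1,3}V?", window=5)
except pydantic.ValidationError as err:
    # the validator asks for exactly one capturing group, this pattern has none
    print(err)
```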