Skip to content

Commit

Permalink
feat: added context string selectors
Browse files Browse the repository at this point in the history
  • Loading branch information
percevalw committed May 19, 2024
1 parent b756307 commit 465ba39
Show file tree
Hide file tree
Showing 5 changed files with 200 additions and 86 deletions.
1 change: 1 addition & 0 deletions changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
- Added a `context_getter` SpanGetter argument to the `eds.matcher` class to only retrieve entities inside the spans returned by the getter
- Added a `filter_expr` parameter to scorers to filter the documents to score
- Added a new `required` field to `eds.contextual_matcher` assign patterns to only match if the required field has been found, and an `include` parameter (similar to `exclude`) to search for required patterns without assigning them to the entity
- Added context strings (e.g., "words[0:5] | sent[0:1]") to the `eds.contextual_matcher` component to allow for more complex patterns in the selection of the window around the trigger spans

### Changed

Expand Down
20 changes: 3 additions & 17 deletions edsnlp/pipes/core/contextual_matcher/contextual_matcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,23 +252,15 @@ def filter_one(self, span: Span) -> Span:
source = span.label_
to_keep = True
for exclude in self.patterns[source].exclude:
snippet = get_window(
doclike=span,
window=exclude.window,
limit_to_sentence=exclude.limit_to_sentence,
)
snippet = exclude.window(span)

if next(exclude.matcher(snippet, as_spans=True), None) is not None:
to_keep = False
logger.trace(f"Entity {span} was filtered out")
break

for include in self.patterns[source].include:
snippet = get_window(
doclike=span,
window=include.window,
limit_to_sentence=include.limit_to_sentence,
)
snippet = include.window(span)

if next(include.matcher(snippet, as_spans=True), None) is None:
to_keep = False
Expand Down Expand Up @@ -308,13 +300,7 @@ def assign_one(self, span: Span) -> Span:
for assign in self.patterns[source].assign:
assign: SingleAssignModel
window = assign.window
limit_to_sentence = assign.limit_to_sentence

snippet = get_window(
doclike=span,
window=window,
limit_to_sentence=limit_to_sentence,
)
snippet = window(span)

matcher: RegexMatcher = assign.matcher
if matcher is not None:
Expand Down
97 changes: 32 additions & 65 deletions edsnlp/pipes/core/contextual_matcher/models.py
Original file line number Diff line number Diff line change
@@ -1,37 +1,27 @@
import re
from typing import TYPE_CHECKING, Any, List, Optional, Tuple, Union
from typing import TYPE_CHECKING, Any, List, Optional, Union

import regex
from pydantic import BaseModel, Extra, validator
from pydantic import BaseModel, Extra, root_validator

from edsnlp.matchers.utils import ListOrStr
from edsnlp.utils.span_getters import SpanGetterArg
from edsnlp.utils.span_getters import Context, SentenceContext, SpanGetterArg
from edsnlp.utils.typing import AsList

Flags = Union[re.RegexFlag, int]
Window = Union[
Tuple[int, int],
List[int],
int,
]


def normalize_window(cls, v):
    """
    Normalize a user-provided window specification into a ``(start, end)``
    pair of token offsets relative to the anchor span.

    Parameters
    ----------
    v : Optional[Window]
        ``None`` (no window), a single non-zero integer (negative = tokens
        **before** the span, positive = tokens **after**), or a list/tuple
        of two integers ``(start, end)``.

    Returns
    -------
    Optional[Tuple[int, int]]
        ``None`` when no window was given, otherwise a ``(start, end)``
        tuple with ``start < end``.

    Raises
    ------
    AssertionError
        If the window is 0, a list of the wrong length, or an empty range.
        (Pydantic v1 converts AssertionError raised in validators into a
        ValidationError, so the exception type seen by callers is unchanged.)
    """
    if v is None:
        return v
    if isinstance(v, list):
        assert (
            len(v) == 2
        ), "`window` should be a tuple/list of two integers, or a single integer"
        v = tuple(v)
    if isinstance(v, int):
        assert v != 0, "The provided `window` should not be 0"
        # A single integer means a one-sided window around the span:
        # n < 0 -> n tokens before, n > 0 -> n tokens after. Since v != 0,
        # exactly one branch applies; return directly instead of falling
        # through to the tuple check (which was unreachable for ints).
        if v < 0:
            return (v, 0)
        return (0, v)
    assert v[0] < v[1], "The provided `window` should contain at least 1 token"
    return v
def validate_window(cls, values):
    """
    Pre-root-validator shared by the exclude/include/assign pattern models.

    - Wraps a lone ``regex`` string into a one-element list.
    - Turns ``limit_to_sentence`` on when the window uses a legacy format
      (``None``, int, tuple or list). NOTE(review): this also overwrites an
      explicit ``limit_to_sentence=False`` supplied alongside a legacy
      window — confirm this is intended.
    - Converts a provided ``window`` into a ``Context`` and, when sentence
      limiting is active, intersects it with the current sentence.
    """
    regex = values.get("regex")
    if isinstance(regex, str):
        values["regex"] = [regex]

    window = values.get("window")
    is_legacy = window is None or isinstance(window, (int, tuple, list))
    if is_legacy:
        values["limit_to_sentence"] = True

    if window is not None:
        values["window"] = Context.validate(window)

    if values.get("limit_to_sentence"):
        # NOTE(review): with no window given this evaluates
        # ``None & SentenceContext(0, 0)`` — relies on the context class
        # providing a reflected ``__rand__`` that tolerates None; confirm.
        values["window"] = values.get("window") & SentenceContext(0, 0)

    return values


class AssignDict(dict):
Expand Down Expand Up @@ -101,9 +91,10 @@ class SingleExcludeModel(BaseModel):
----------
regex: ListOrStr
A single Regex or a list of Regexes
window: Optional[Window]
window: Optional[Context]
Size of the context to use (in number of words). You can provide the window as:
- A [context string][context-string]
- A positive integer, in this case the used context will be taken **after**
the extraction
- A negative integer, in this case the used context will be taken **before**
Expand All @@ -121,19 +112,13 @@ class SingleExcludeModel(BaseModel):
"""

regex: ListOrStr = []
window: Optional[Window] = None
limit_to_sentence: Optional[bool] = True
limit_to_sentence: Optional[bool] = None
window: Optional[Context] = None
regex_flags: Optional[Flags] = None
regex_attr: Optional[str] = None
matcher: Optional[Any] = None

@validator("regex", allow_reuse=True)
def exclude_regex_validation(cls, v):
if isinstance(v, str):
v = [v]
return v

_normalize_window = validator("window", allow_reuse=True)(normalize_window)
validate_window = root_validator(pre=True, allow_reuse=True)(validate_window)


class SingleIncludeModel(BaseModel):
Expand All @@ -146,9 +131,10 @@ class SingleIncludeModel(BaseModel):
----------
regex: ListOrStr
A single Regex or a list of Regexes
window: Optional[Window]
window: Optional[Context]
Size of the context to use (in number of words). You can provide the window as:
- A [context string][context-string]
- A positive integer, in this case the used context will be taken **after**
the extraction
- A negative integer, in this case the used context will be taken **before**
Expand All @@ -166,19 +152,13 @@ class SingleIncludeModel(BaseModel):
"""

regex: ListOrStr = []
window: Optional[Window] = None
limit_to_sentence: Optional[bool] = True
limit_to_sentence: Optional[bool] = None
window: Optional[Context] = None
regex_flags: Optional[Flags] = None
regex_attr: Optional[str] = None
matcher: Optional[Any] = None

@validator("regex", allow_reuse=True)
def exclude_regex_validation(cls, v):
if isinstance(v, str):
v = [v]
return v

_normalize_window = validator("window", allow_reuse=True)(normalize_window)
validate_window = root_validator(pre=True, allow_reuse=True)(validate_window)


class ExcludeModel(AsList[SingleExcludeModel]):
Expand All @@ -204,9 +184,10 @@ class SingleAssignModel(BaseModel):
----------
name: ListOrStr
A name (string)
window: Optional[Window]
window: Optional[Context]
Size of the context to use (in number of words). You can provide the window as:
- A [context string][context-string]
- A positive integer, in this case the used context will be taken **after**
the extraction
- A negative integer, in this case the used context will be taken **before**
Expand All @@ -217,7 +198,7 @@ class SingleAssignModel(BaseModel):
span_getter: Optional[SpanGetterArg]
A span getter to pick the assigned spans from already extracted entities
in the doc.
regex: Optional[Window]
regex: ListOrStr
    A single Regex or a list of Regexes, each with **a single capturing
    group** from which the assigned span is extracted
replace_entity: Optional[bool]
Expand All @@ -233,10 +214,10 @@ class SingleAssignModel(BaseModel):
"""

name: str
regex: Optional[str] = None
regex: ListOrStr = []
span_getter: Optional[SpanGetterArg] = None
window: Optional[Window] = None
limit_to_sentence: Optional[bool] = True
limit_to_sentence: Optional[bool] = None
window: Optional[Context] = None
regex_flags: Optional[Flags] = None
regex_attr: Optional[str] = None
replace_entity: bool = False
Expand All @@ -245,21 +226,7 @@ class SingleAssignModel(BaseModel):

matcher: Optional[Any] = None

@validator("regex", allow_reuse=True)
def check_single_regex_group(cls, pat):
if pat is None:
return pat
compiled_pat = regex.compile(
pat
) # Using regex to allow multiple fgroups with same name
n_groups = compiled_pat.groups
assert (
n_groups == 1
), f"The pattern {pat} should have exactly one capturing group, not {n_groups}"

return pat

_normalize_window = validator("window", allow_reuse=True)(normalize_window)
validate_window = root_validator(pre=True, allow_reuse=True)(validate_window)


class AssignModel(AsList[SingleAssignModel]):
Expand Down
Loading

0 comments on commit 465ba39

Please sign in to comment.