Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rule based relation #304

Open
wants to merge 20 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ _build/
*.tar.gz
*.tsv
*.ann
!text.ann

# Editors
.idea
Expand Down
8 changes: 8 additions & 0 deletions changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,14 @@

- `eds.tables` accepts a minimum_table_size (default 2) argument to reduce pollution
- `RuleBasedQualifier` now exposes a `process` method that only returns qualified entities and tokens without actually tagging them, deferring this task to the `__call__` method.
- Relation implementation in `doc.spans["<label>"][i]._.rel = [{'type':'rel_type', 'target': <span>},]`
- Relation connector with brat2docs and docs2brat in `edsnlp.connectors.brat` compatible with `edsnlp.data.read_*` and `edsnlp.data.write_*` (modified files : `edsnlp.data.converters`, `edsnlp.data.standoff`)
- Rule-based relation model using proximity and/or sentence in `edsnlp.pipes.misc.relations` registered as `eds.relations`
- Documentation using Mkdocs for relations `docs.pipes.misc.relations.md` and `docs.pipes.misc.index.md`
- Tests for relations `tests.pipelines.misc.test_relations` and resources `ressources.relations`
- `data.set_processing(...)` now exposes an `autocast` parameter to disable or tweak the automatic casting of tensors
during processing. Autocasting should result in a slight speedup, but may lead to numerical instability.
- Use `torch.inference_mode` to disable view tracking and version counter bumps during inference.

### Fixed

Expand Down
1 change: 1 addition & 0 deletions docs/pipes/misc/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,5 +16,6 @@ For instance, the date detection and normalisation pipeline falls in this catego
| `eds.sections` | Section detection |
| `eds.reason` | Rule-based hospitalisation reason detection |
| `eds.tables` | Tables detection |
| `eds.relations` | Relation extraction |

<!-- --8<-- [end:components] -->
8 changes: 8 additions & 0 deletions docs/pipes/misc/relations.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Relations {: #edsnlp.pipes.misc.relations.factory.create_component }

::: edsnlp.pipes.misc.relations.factory.create_component
options:
heading_level: 2
show_bases: false
show_source: false
only_class_level: true
164 changes: 144 additions & 20 deletions edsnlp/data/converters.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,9 @@ class StandoffDict2DocConverter:
span_attributes : Optional[AttributesMappingArg]
Mapping from BRAT attributes to Span extensions (can be a list too).
By default, all attributes are imported as Span extensions with the same name.
span_rel : Optional[AttributesMappingArg]
Mapping from BRAT relations to Span extensions (can be a list too).
By default, all relations are imported as Span extensions with the name rel.
keep_raw_attribute_values : bool
Whether to keep the raw attribute values (as strings) or to convert them to
Python objects (e.g. booleans).
Expand All @@ -214,6 +217,7 @@ def __init__(
tokenizer: Optional[Tokenizer] = None,
span_setter: SpanSetterArg = {"ents": True, "*": True},
span_attributes: Optional[AttributesMappingArg] = None,
span_rel: Optional[AttributesMappingArg] = None, # to keep ?
keep_raw_attribute_values: bool = False,
bool_attributes: SequenceStr = [],
default_attributes: AttributesMappingArg = {},
Expand All @@ -223,6 +227,7 @@ def __init__(
self.tokenizer = tokenizer or (nlp.tokenizer if nlp is not None else None)
self.span_setter = span_setter
self.span_attributes = span_attributes # type: ignore
self.span_rel = span_rel # to keep ?
self.keep_raw_attribute_values = keep_raw_attribute_values
self.default_attributes = default_attributes
self.notes_as_span_attribute = notes_as_span_attribute
Expand All @@ -244,12 +249,19 @@ def __call__(self, obj):
if not Span.has_extension(dst):
Span.set_extension(dst, default=None)

############## Modifications for relations ###############
dict_entities = {} ## dict for entity storage
for ent in obj.get("entities") or ():
begin = min(f["begin"] for f in ent["fragments"]) # start of the entity
end = max(f["end"] for f in ent["fragments"]) # end of the entity
dict_entities[ent["entity_id"]] = (
ent["label"] + ";" + str(begin) + ";" + str(end)
)
fragments = (
[
{
"begin": min(f["begin"] for f in ent["fragments"]),
"end": max(f["end"] for f in ent["fragments"]),
"begin": begin,
"end": end,
}
]
if not self.split_fragments
Expand All @@ -267,6 +279,11 @@ def __call__(self, obj):
if isinstance(ent["attributes"], list)
else ent["attributes"]
)
attributes = (
{a["label"]: a["value"] for a in ent["attributes"]}
if isinstance(ent["attributes"], list)
else ent["attributes"]
)
if self.notes_as_span_attribute and ent["notes"]:
ent["attributes"][self.notes_as_span_attribute] = "|".join(
note["value"] for note in ent["notes"]
Expand Down Expand Up @@ -302,6 +319,67 @@ def __call__(self, obj):
if span._.get(attr) is None:
span._.set(attr, value)

############## Modifications for relations ###############
# add relations in spans
if self.span_rel is None and not Span.has_extension("rel"):
Span.set_extension("rel", default=[])

for rel in obj.get("relations") or (): # iterates relations
for label in doc.spans: # iterates source labels
for i, spa in enumerate(doc.spans[label]): # iterates source spans
bo = False

# relations
if dict_entities[rel["from_entity_id"]].split(";") == [
label,
str(spa.start_char),
str(spa.end_char),
]: # if the source entity is the same as the span
for label2 in doc.spans: # iterates target labels
for j, spa2 in enumerate(
doc.spans[label2]
): # iterates target label
if dict_entities[rel["to_entity_id"]].split(";") == [
label2,
str(spa2.start_char),
str(spa2.end_char),
]: # if target entity is the same as the span
relation = {
"type": rel["relation_label"],
"target": doc.spans[label2][j],
} # create the relation
doc.spans[label][i]._.rel.append(
relation
) # add the relation to the span
bo = True
break
if bo:
break
bo = False

# inverse relations
if dict_entities[rel["to_entity_id"]].split(";") == [
label,
str(spa.start_char),
str(spa.end_char),
]:
for label2 in doc.spans:
for j, spa2 in enumerate(doc.spans[label2]):
if dict_entities[rel["from_entity_id"]].split(";") == [
label2,
str(spa2.start_char),
str(spa2.end_char),
]:
relation = {
"type": "inv_" + rel["relation_label"],
"target": doc.spans[label2][j],
}
doc.spans[label][i]._.rel.append(relation)
bo = True
break
if bo:
break

return doc


Expand Down Expand Up @@ -346,29 +424,75 @@ def __init__(

def __call__(self, doc):
spans = get_spans(doc, self.span_getter)
entities = [
{
"entity_id": i,
"fragments": [
{
"begin": ent.start_char,
"end": ent.end_char,
}
],
"attributes": {
obj_name: getattr(ent._, ext_name)
for ext_name, obj_name in self.span_attributes.items()
if ent._.has(ext_name)
},
"label": ent.label_,
}
for i, ent in enumerate(sorted(dict.fromkeys(spans)))
]

# mapping between entities and their `entity_id`
entity_map = {
(
ent["fragments"][0]["begin"],
ent["fragments"][0]["end"],
ent["label"],
): ent["entity_id"]
for ent in entities
}

# doesn't include 'inv_' relations
relations = []
relation_idx = 1
for span_label, span_list in doc.spans.items():
for spa in span_list:
source_entity_id = entity_map.get(
(spa.start_char, spa.end_char, spa.label_)
)
for rel in spa._.rel:
if not rel["type"].startswith("inv_"):
target_entity_id = entity_map.get(
(
rel["target"].start_char,
rel["target"].end_char,
rel["target"].label_,
)
)
if (
source_entity_id is not None
and target_entity_id is not None
):
relations.append(
{
"rel_id": relation_idx,
"from_entity_id": source_entity_id,
"relation_type": rel["type"],
"to_entity_id": target_entity_id,
}
)
relation_idx += 1

# final object
obj = {
FILENAME: doc._.note_id,
"doc_id": doc._.note_id,
"text": doc.text,
"entities": [
{
"entity_id": i,
"fragments": [
{
"begin": ent.start_char,
"end": ent.end_char,
}
],
"attributes": {
obj_name: getattr(ent._, ext_name)
for ext_name, obj_name in self.span_attributes.items()
if ent._.has(ext_name)
},
"label": ent.label_,
}
for i, ent in enumerate(sorted(dict.fromkeys(spans)))
],
"entities": entities,
"relations": relations,
}

return obj


Expand Down
28 changes: 14 additions & 14 deletions edsnlp/data/standoff.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,20 +264,20 @@ def dump_standoff_file(
file=f,
)
attribute_idx += 1

# fmt: off
# if "relations" in doc:
# for i, relation in enumerate(doc["relations"]):
# entity_from = entities_ids[relation["from_entity_id"]]
# entity_to = entities_ids[relation["to_entity_id"]]
# print(
# "R{}\t{} Arg1:{} Arg2:{}\t".format(
# i + 1, str(relation["label"]), entity_from,
# entity_to
# ),
# file=f,
# )
# fmt: on
# Write relation lines (BRAT standoff "R" entries)
relation_idx = 1
if "relations" in doc:
for relation in doc["relations"]:
print(
"R{}\t{} Arg1:{} Arg2:{}".format(
relation_idx,
relation["relation_type"],
entities_ids[relation["from_entity_id"]],
entities_ids[relation["to_entity_id"]],
),
file=f,
)
relation_idx += 1


class StandoffReader(BaseReader):
Expand Down
1 change: 1 addition & 0 deletions edsnlp/pipes/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from .misc.dates.factory import create_component as dates
from .misc.quantities.factory import create_component as quantities
from .misc.reason.factory import create_component as reason
from .misc.relations.factory import create_component as relations
from .misc.sections.factory import create_component as sections
from .misc.tables.factory import create_component as tables
from .ner.adicap.factory import create_component as adicap
Expand Down
1 change: 1 addition & 0 deletions edsnlp/pipes/misc/relations/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .relations import RelationsMatcher
17 changes: 17 additions & 0 deletions edsnlp/pipes/misc/relations/factory.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from edsnlp.core import registry

from .relations import RelationsMatcher

# Default keyword arguments for the `eds.relations` pipe (RelationsMatcher).
# NOTE(review): `scheme=None` presumably falls back to the package-level
# default scheme — confirm against RelationsMatcher.__init__.
DEFAULT_CONFIG = {
    "scheme": None,
    "use_sentences": False,
    "clean_rel": False,
    "proximity_method": "right",
    "max_dist": 45,
}

# Register the matcher with the EDS-NLP component factory under its
# canonical name; the bare "relations" alias is kept but deprecated.
create_component = registry.factory.register(
    "eds.relations",
    assigns=["doc.spans"],
    deprecated=["relations"],
)(RelationsMatcher)
17 changes: 17 additions & 0 deletions edsnlp/pipes/misc/relations/patterns.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Default relation scheme for the rule-based relation matcher (`eds.relations`).
# Each entry declares, for one relation type, which source spans may be linked
# to which target spans, filtered by span label and by span-attribute values.
# NOTE(review): a `None` entry in an attribute's value list appears to mean
# "attribute unset" — confirm against RelationsMatcher.
scheme = [
    {
        # Source side: drug mentions whose `Tech` attribute is unset.
        "source": [{"label": "Chemical_and_drugs", "attr": {"Tech": [None]}}],
        # Target side: temporal mentions, or drug mentions carrying a
        # technical attribute (dosage, route, strength, form).
        "target": [
            {
                "label": "Temporal",
                "attr": {"AttTemp": [None, "Duration", "Date", "Frequency"]},
            },
            {
                "label": "Chemical_and_drugs",
                "attr": {"Tech": ["dosage", "route", "strength", "form"]},
            },
        ],
        # Relation label, and the label used for the inverse direction.
        "type": "Depend",
        "inv_type": "inv_Depend",
    },
]
Loading
Loading