Skip to content

Commit

Permalink
Merge pull request #6 from CompNet/dev
Browse files Browse the repository at this point in the history
fix co_occurrences_dist typo
  • Loading branch information
Aethor authored Nov 8, 2023
2 parents 263c346 + 1e2eb38 commit 04416d2
Show file tree
Hide file tree
Showing 4 changed files with 51 additions and 29 deletions.
8 changes: 4 additions & 4 deletions docs/pipeline.rst
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ document. Here is a simple example:
[
NLTKTokenizer(),
NLTKNamedEntityRecognizer(),
GraphRulesCharacterUnifier(min_appearances=10),
CoOccurrencesGraphExtractor(co_occurences_dist=25)
NaiveCharactersExtractor(min_appearance=10),
CoOccurrencesGraphExtractor(co_occurrences_dist=25)
]
)
Expand Down Expand Up @@ -55,8 +55,8 @@ to compute them yourself :
pipeline = Pipeline(
[
NLTKNamedEntityRecognizer(),
GraphRulesCharacterUnifier(min_appearances=10),
CoOccurrencesGraphExtractor(co_occurences_dist=25)
NaiveCharactersExtractor(min_appearance=10),
CoOccurrencesGraphExtractor(co_occurrences_dist=25)
]
)
Expand Down
54 changes: 34 additions & 20 deletions renard/pipeline/graph_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,13 +81,18 @@ class CoOccurrencesGraphExtractor(PipelineStep):

def __init__(
self,
co_occurences_dist: Union[int, Tuple[int, Literal["tokens", "sentences"]]],
co_occurrences_dist: Optional[
Union[int, Tuple[int, Literal["tokens", "sentences"]]]
],
dynamic: bool = False,
dynamic_window: Optional[int] = None,
dynamic_overlap: int = 0,
co_occurences_dist: Optional[
Union[int, Tuple[int, Literal["tokens", "sentences"]]]
] = None,
) -> None:
"""
:param co_occurences_dist: max accepted distance between two
:param co_occurrences_dist: max accepted distance between two
character appearances to form a co-occurrence interaction.
- if an ``int`` is given, the distance is in number of
Expand All @@ -99,28 +104,37 @@ def __init__(
:param dynamic:
- if ``False`` (the default), a static ``nx.graph`` is
extracted
- if ``False`` (the default), a static ``nx.graph`` is
extracted
- if ``True``, several ``nx.graph`` are extracted. In
that case, ``dynamic_window`` and
``dynamic_overlap``*can* be specified. If
``dynamic_window`` is not specified, this step is
expecting the text to be cut into chapters', and a graph
will be extracted for each 'chapter'. In that case,
``chapters`` must be passed to the pipeline as a
``List[str]`` at runtime.
- if ``True``, several ``nx.graph`` are extracted. In
that case, ``dynamic_window`` and
``dynamic_overlap`` *can* be specified. If
``dynamic_window`` is not specified, this step is
expecting the text to be cut into 'chapters', and a
graph will be extracted for each 'chapter'. In that
case, ``chapters`` must be passed to the pipeline as
a ``List[str]`` at runtime.
:param dynamic_window: dynamic window, in number of
interactions. A dynamic window of `n` means that each
returned graph will be formed by `n` interactions.
:param dynamic_overlap: overlap, in number of interactions.
:param co_occurences_dist: same as ``co_occurrences_dist``.
Included because of retro-compatibility, as it was a
previously included typo.
"""
# typo retrocompatibility
if not co_occurences_dist is None:
co_occurrences_dist = co_occurences_dist
if co_occurrences_dist is None and co_occurences_dist is None:
raise ValueError()

if isinstance(co_occurences_dist, int):
co_occurences_dist = (co_occurences_dist, "tokens")
self.co_occurences_dist = co_occurences_dist
if isinstance(co_occurrences_dist, int):
co_occurrences_dist = (co_occurrences_dist, "tokens")
self.co_occurrences_dist = co_occurrences_dist

if dynamic:
if not dynamic_window is None:
Expand Down Expand Up @@ -181,25 +195,25 @@ def _mentions_interact(
.. note::
the attribute ``self.co_occurences_dist`` is used to know wether mentions are in co_occurences
the attribute ``self.co_occurrences_dist`` is used to know whether mentions are in co-occurrence
:param mention_1:
:param mention_2:
:param sentences:
:return: a boolean indicating whether the two mentions are co-occurring
"""
if self.co_occurences_dist[1] == "tokens":
if self.co_occurrences_dist[1] == "tokens":
return (
abs(mention_2.start_idx - mention_1.start_idx)
<= self.co_occurences_dist[0]
<= self.co_occurrences_dist[0]
)
elif self.co_occurences_dist[1] == "sentences":
elif self.co_occurrences_dist[1] == "sentences":
assert not sentences is None
mention_1_sent = sent_index_for_token_index(mention_1.start_idx, sentences)
mention_2_sent = sent_index_for_token_index(
mention_2.end_idx - 1, sentences
)
return abs(mention_2_sent - mention_1_sent) <= self.co_occurences_dist[0]
return abs(mention_2_sent - mention_1_sent) <= self.co_occurrences_dist[0]
else:
raise NotImplementedError

Expand Down
8 changes: 8 additions & 0 deletions renard/pipeline/preconfigured.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,10 @@ def nltk_pipeline(
**pipeline_kwargs
)
else:

if not "co_occurrences_dist" in graph_extractor_kwargs:
graph_extractor_kwargs["co_occurrences_dist"] = (1, "sentences")

return Pipeline(
[
NLTKTokenizer(**tokenizer_kwargs),
Expand Down Expand Up @@ -113,6 +117,10 @@ def bert_pipeline(
**pipeline_kwargs
)
else:

if not "co_occurrences_dist" in graph_extractor_kwargs:
graph_extractor_kwargs["co_occurrences_dist"] = (1, "sentences")

return Pipeline(
[
NLTKTokenizer(**tokenizer_kwargs),
Expand Down
10 changes: 5 additions & 5 deletions renard_tutorial.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,8 +60,8 @@
# [
# NLTKTokenizer(), # tokenization
# NLTKNamedEntityRecognizer(), # named entity recognition
# GraphRulesCharacterUnifier(), # characters extraction
# CoOccurrencesGraphExtractor(co_occurences_dist=(1, "sentences")) # graph extraction
# GraphRulesCharactersExtractor(), # characters extraction
# CoOccurrencesGraphExtractor(co_occurrences_dist=(1, "sentences")) # graph extraction
# ]
# )
# ```
Expand Down Expand Up @@ -105,7 +105,7 @@
GraphRulesCharacterUnifier(),
# an interaction will be a co-occurrence in a range of 3
# sentences or less
CoOccurrencesGraphExtractor(co_occurences_dist=(3, "sentences")),
CoOccurrencesGraphExtractor(co_occurrences_dist=(3, "sentences")),
],
lang="fra",
)
Expand Down Expand Up @@ -143,7 +143,7 @@
GraphRulesCharacterUnifier(min_appearances=3),
# A co-occurrence between two characters is counted if its
# range is lower or equal to 10 sentences
CoOccurrencesGraphExtractor(co_occurences_dist=(10, "sentences")),
CoOccurrencesGraphExtractor(co_occurrences_dist=(10, "sentences")),
],
lang="fra",
)
Expand Down Expand Up @@ -180,7 +180,7 @@
[
GraphRulesCharacterUnifier(min_appearances=3),
CoOccurrencesGraphExtractor(
co_occurences_dist=(20, "sentences"),
co_occurrences_dist=(20, "sentences"),
dynamic=True, # we want to extract a dynamic graph (i.e. a list of sequential graphs)
dynamic_window=20, # the size, in interaction, of each graph
dynamic_overlap=0, # overlap between windows
Expand Down

0 comments on commit 04416d2

Please sign in to comment.