From 66a46dc1e4d076e9a5610a28da48750a5b64eb8f Mon Sep 17 00:00:00 2001 From: Aethor Date: Fri, 29 Sep 2023 15:23:36 +0200 Subject: [PATCH 1/3] fix a typo in CoOccurrencesGraphExtractor --- docs/pipeline.rst | 6 ++-- renard/pipeline/graph_extraction.py | 50 +++++++++++++++++------------ renard/pipeline/preconfigured.py | 6 ++-- renard_tutorial.py | 8 ++--- 4 files changed, 40 insertions(+), 30 deletions(-) diff --git a/docs/pipeline.rst b/docs/pipeline.rst index 7991447..b479cc2 100644 --- a/docs/pipeline.rst +++ b/docs/pipeline.rst @@ -23,7 +23,7 @@ document. Here is a simple example: NLTKTokenizer(), NLTKNamedEntityRecognizer(), NaiveCharactersExtractor(min_appearance=10), - CoOccurrencesGraphExtractor(co_occurences_dist=25) + CoOccurrencesGraphExtractor(co_occurrences_dist=25) ] ) @@ -56,7 +56,7 @@ to compute them yourself : [ NLTKNamedEntityRecognizer(), NaiveCharactersExtractor(min_appearance=10), - CoOccurrencesGraphExtractor(co_occurences_dist=25) + CoOccurrencesGraphExtractor(co_occurrences_dist=25) ] ) @@ -201,7 +201,7 @@ time. In Renard, such graphs are representend by a ``List`` of NLTKNamedEntityRecognizer(), NaiveCharactersExtractor(min_appearance=10), CoOccurrencesGraphExtractor( - co_occurences_dist=25, + co_occurrences_dist=25, dynamic=True, # note the 'dynamic' dynamic_window=20 # and the 'dynamic_window' argument ) diff --git a/renard/pipeline/graph_extraction.py b/renard/pipeline/graph_extraction.py index bdcca23..b13a1ed 100644 --- a/renard/pipeline/graph_extraction.py +++ b/renard/pipeline/graph_extraction.py @@ -80,13 +80,16 @@ class CoOccurrencesGraphExtractor(PipelineStep): def __init__( self, - co_occurences_dist: Union[int, Tuple[int, Literal["tokens", "sentences"]]], + co_occurrences_dist: Union[int, Tuple[int, Literal["tokens", "sentences"]]], dynamic: bool = False, dynamic_window: Optional[int] = None, dynamic_overlap: int = 0, + co_occurences_dist: Optional[ + Union[int, Tuple[int, Literal["tokens", "sentences"]]] + ] = None, ) -> None: """ - :param co_occurences_dist: max accepted distance between two + :param co_occurrences_dist: max accepted distance between two character appearances to form a co-occurence interaction. - if an ``int`` is given, the distance is in number of @@ -98,28 +101,35 @@ def __init__( :param dynamic: - - if ``False`` (the default), a static ``nx.graph`` is - extracted + - if ``False`` (the default), a static ``nx.graph`` is + extracted - - if ``True``, several ``nx.graph`` are extracted. In - that case, ``dynamic_window`` and - ``dynamic_overlap``*can* be specified. If - ``dynamic_window`` is not specified, this step is - expecting the text to be cut into chapters', and a graph - will be extracted for each 'chapter'. In that case, - ``chapters`` must be passed to the pipeline as a - ``List[str]`` at runtime. + - if ``True``, several ``nx.graph`` are extracted. In + that case, ``dynamic_window`` and + ``dynamic_overlap``*can* be specified. If + ``dynamic_window`` is not specified, this step is + expecting the text to be cut into chapters', and a + graph will be extracted for each 'chapter'. In that + case, ``chapters`` must be passed to the pipeline as + a ``List[str]`` at runtime. :param dynamic_window: dynamic window, in number of interactions. a dynamic window of `n` means that each returned graph will be formed by `n` interactions. :param dynamic_overlap: overlap, in number of interactions. + + :param co_occurences_dist: same as ``co_occurrences_dist``. + Included because of retro-compatibility, as it was a + previously included typo. """ + # typo retrocompatibility + if not co_occurences_dist is None: + co_occurrences_dist = co_occurences_dist - if isinstance(co_occurences_dist, int): - co_occurences_dist = (co_occurences_dist, "tokens") - self.co_occurences_dist = co_occurences_dist + if isinstance(co_occurrences_dist, int): + co_occurrences_dist = (co_occurrences_dist, "tokens") + self.co_occurrences_dist = co_occurrences_dist if dynamic: if not dynamic_window is None: @@ -181,25 +191,25 @@ def _mentions_interact( .. note:: - the attribute ``self.co_occurences_dist`` is used to know wether mentions are in co_occurences + the attribute ``self.co_occurrences_dist`` is used to know wether mentions are in co_occurences :param mention_1: :param mention_2: :param sentences: :return: a boolean indicating wether the two mentions are co-occuring """ - if self.co_occurences_dist[1] == "tokens": + if self.co_occurrences_dist[1] == "tokens": return ( abs(mention_2.start_idx - mention_1.start_idx) - <= self.co_occurences_dist[0] + <= self.co_occurrences_dist[0] ) - elif self.co_occurences_dist[1] == "sentences": + elif self.co_occurrences_dist[1] == "sentences": assert not sentences is None mention_1_sent = sent_index_for_token_index(mention_1.start_idx, sentences) mention_2_sent = sent_index_for_token_index( mention_2.end_idx - 1, sentences ) - return abs(mention_2_sent - mention_1_sent) <= self.co_occurences_dist[0] + return abs(mention_2_sent - mention_1_sent) <= self.co_occurrences_dist[0] else: raise NotImplementedError diff --git a/renard/pipeline/preconfigured.py b/renard/pipeline/preconfigured.py index 994e4b5..30a54dd 100644 --- a/renard/pipeline/preconfigured.py +++ b/renard/pipeline/preconfigured.py @@ -27,8 +27,8 @@ def nltk_pipeline( characters_extractor_kwargs = characters_extractor_kwargs or {} graph_extractor_kwargs = graph_extractor_kwargs or {} - if not "co_occurences_dist" in graph_extractor_kwargs: - graph_extractor_kwargs["co_occurences_dist"] = (1, "sentences") + if not "co_occurrences_dist" in graph_extractor_kwargs: + graph_extractor_kwargs["co_occurrences_dist"] = (1, "sentences") return Pipeline( [ @@ -69,7 +69,7 @@ def bert_pipeline( NLTKTokenizer(), BertNamedEntityRecognizer(), GraphRulesCharactersExtractor(), - CoOccurrencesGraphExtractor(co_occurences_dist=(1, "sentences")), + CoOccurrencesGraphExtractor(co_occurrences_dist=(1, "sentences")), ], **pipeline_kwargs ) diff --git a/renard_tutorial.py b/renard_tutorial.py index b4c6541..dd09c23 100644 --- a/renard_tutorial.py +++ b/renard_tutorial.py @@ -61,7 +61,7 @@ # NLTKTokenizer(), # tokenization # NLTKNamedEntityRecognizer(), # named entity recognition # GraphRulesCharactersExtractor(), # characters extraction -# CoOccurrencesGraphExtractor(co_occurences_dist=(1, "sentences")) # graph extraction +# CoOccurrencesGraphExtractor(co_occurrences_dist=(1, "sentences")) # graph extraction # ] # ) # ``` @@ -105,7 +105,7 @@ GraphRulesCharactersExtractor(), # an interaction will be a co-occurence in a range of 3 # sentences or less - CoOccurrencesGraphExtractor(co_occurences_dist=(3, "sentences")), + CoOccurrencesGraphExtractor(co_occurrences_dist=(3, "sentences")), ], lang="fra", ) @@ -143,7 +143,7 @@ GraphRulesCharactersExtractor(min_appearances=3), # A co-occurence between two characters is counted if its # range is lower or equal to 10 sentences - CoOccurrencesGraphExtractor(co_occurences_dist=(10, "sentences")), + CoOccurrencesGraphExtractor(co_occurrences_dist=(10, "sentences")), ], lang="fra", ) @@ -180,7 +180,7 @@ [ GraphRulesCharactersExtractor(min_appearances=3), CoOccurrencesGraphExtractor( - co_occurences_dist=(20, "sentences"), + co_occurrences_dist=(20, "sentences"), dynamic=True, # we want to extract a dynamic graph (i.e. a list of sequential graphs) dynamic_window=20, # the size, in interaction, of each graph dynamic_overlap=0, # overlap between windows From 58ceac7e1fd78247158e634bd2b42bd55386d08c Mon Sep 17 00:00:00 2001 From: Aethor Date: Wed, 8 Nov 2023 10:39:07 +0100 Subject: [PATCH 2/3] fix a typo in CoOccurrencesGraphExtractor --- docs/pipeline.rst | 9 ++++-- renard/pipeline/graph_extraction.py | 50 +++++++++++++++++------------ renard_tutorial.py | 10 +++--- 3 files changed, 42 insertions(+), 27 deletions(-) diff --git a/docs/pipeline.rst b/docs/pipeline.rst index 092b791..134b9c9 100644 --- a/docs/pipeline.rst +++ b/docs/pipeline.rst @@ -22,8 +22,8 @@ document. Here is a simple example: [ NLTKTokenizer(), NLTKNamedEntityRecognizer(), - GraphRulesCharacterUnifier(min_appearances=10), - CoOccurrencesGraphExtractor(co_occurences_dist=25) + NaiveCharactersExtractor(min_appearance=10), + CoOccurrencesGraphExtractor(co_occurrences_dist=25) ] ) @@ -55,8 +55,13 @@ to compute them yourself : pipeline = Pipeline( [ NLTKNamedEntityRecognizer(), +<<<<<<< HEAD GraphRulesCharacterUnifier(min_appearances=10), CoOccurrencesGraphExtractor(co_occurences_dist=25) +======= + NaiveCharactersExtractor(min_appearance=10), + CoOccurrencesGraphExtractor(co_occurrences_dist=25) +>>>>>>> 66a46dc (fix a typo in CoOccurrencesGraphExtractor) ] ) diff --git a/renard/pipeline/graph_extraction.py b/renard/pipeline/graph_extraction.py index e70e957..b4cf5d3 100644 --- a/renard/pipeline/graph_extraction.py +++ b/renard/pipeline/graph_extraction.py @@ -81,13 +81,16 @@ class CoOccurrencesGraphExtractor(PipelineStep): def __init__( self, - co_occurences_dist: Union[int, Tuple[int, Literal["tokens", "sentences"]]], + co_occurrences_dist: Union[int, Tuple[int, Literal["tokens", "sentences"]]], dynamic: bool = False, dynamic_window: Optional[int] = None, dynamic_overlap: int = 0, + co_occurences_dist: Optional[ + Union[int, Tuple[int, Literal["tokens", "sentences"]]] + ] = None, ) -> None: """ - :param co_occurences_dist: max accepted distance between two + :param co_occurrences_dist: max accepted distance between two character appearances to form a co-occurence interaction. - if an ``int`` is given, the distance is in number of @@ -99,28 +102,35 @@ def __init__( :param dynamic: - - if ``False`` (the default), a static ``nx.graph`` is - extracted + - if ``False`` (the default), a static ``nx.graph`` is + extracted - - if ``True``, several ``nx.graph`` are extracted. In - that case, ``dynamic_window`` and - ``dynamic_overlap``*can* be specified. If - ``dynamic_window`` is not specified, this step is - expecting the text to be cut into chapters', and a graph - will be extracted for each 'chapter'. In that case, - ``chapters`` must be passed to the pipeline as a - ``List[str]`` at runtime. + - if ``True``, several ``nx.graph`` are extracted. In + that case, ``dynamic_window`` and + ``dynamic_overlap``*can* be specified. If + ``dynamic_window`` is not specified, this step is + expecting the text to be cut into chapters', and a + graph will be extracted for each 'chapter'. In that + case, ``chapters`` must be passed to the pipeline as + a ``List[str]`` at runtime. :param dynamic_window: dynamic window, in number of interactions. a dynamic window of `n` means that each returned graph will be formed by `n` interactions. :param dynamic_overlap: overlap, in number of interactions. + + :param co_occurences_dist: same as ``co_occurrences_dist``. + Included because of retro-compatibility, as it was a + previously included typo. """ + # typo retrocompatibility + if not co_occurences_dist is None: + co_occurrences_dist = co_occurences_dist - if isinstance(co_occurences_dist, int): - co_occurences_dist = (co_occurences_dist, "tokens") - self.co_occurences_dist = co_occurences_dist + if isinstance(co_occurrences_dist, int): + co_occurrences_dist = (co_occurrences_dist, "tokens") + self.co_occurrences_dist = co_occurrences_dist if dynamic: if not dynamic_window is None: @@ -181,25 +191,25 @@ def _mentions_interact( .. note:: - the attribute ``self.co_occurences_dist`` is used to know wether mentions are in co_occurences + the attribute ``self.co_occurrences_dist`` is used to know wether mentions are in co_occurences :param mention_1: :param mention_2: :param sentences: :return: a boolean indicating wether the two mentions are co-occuring """ - if self.co_occurences_dist[1] == "tokens": + if self.co_occurrences_dist[1] == "tokens": return ( abs(mention_2.start_idx - mention_1.start_idx) - <= self.co_occurences_dist[0] + <= self.co_occurrences_dist[0] ) - elif self.co_occurences_dist[1] == "sentences": + elif self.co_occurrences_dist[1] == "sentences": assert not sentences is None mention_1_sent = sent_index_for_token_index(mention_1.start_idx, sentences) mention_2_sent = sent_index_for_token_index( mention_2.end_idx - 1, sentences ) - return abs(mention_2_sent - mention_1_sent) <= self.co_occurences_dist[0] + return abs(mention_2_sent - mention_1_sent) <= self.co_occurrences_dist[0] else: raise NotImplementedError diff --git a/renard_tutorial.py b/renard_tutorial.py index caa2bcb..4596042 100644 --- a/renard_tutorial.py +++ b/renard_tutorial.py @@ -60,8 +60,8 @@ # [ # NLTKTokenizer(), # tokenization # NLTKNamedEntityRecognizer(), # named entity recognition -# GraphRulesCharacterUnifier(), # characters extraction -# CoOccurrencesGraphExtractor(co_occurences_dist=(1, "sentences")) # graph extraction +# GraphRulesCharactersExtractor(), # characters extraction +# CoOccurrencesGraphExtractor(co_occurrences_dist=(1, "sentences")) # graph extraction # ] # ) # ``` @@ -105,7 +105,7 @@ GraphRulesCharacterUnifier(), # an interaction will be a co-occurence in a range of 3 # sentences or less - CoOccurrencesGraphExtractor(co_occurences_dist=(3, "sentences")), + CoOccurrencesGraphExtractor(co_occurrences_dist=(3, "sentences")), ], lang="fra", ) @@ -143,7 +143,7 @@ GraphRulesCharacterUnifier(min_appearances=3), # A co-occurence between two characters is counted if its # range is lower or equal to 10 sentences - CoOccurrencesGraphExtractor(co_occurences_dist=(10, "sentences")), + CoOccurrencesGraphExtractor(co_occurrences_dist=(10, "sentences")), ], lang="fra", ) @@ -180,7 +180,7 @@ [ GraphRulesCharacterUnifier(min_appearances=3), CoOccurrencesGraphExtractor( - co_occurences_dist=(20, "sentences"), + co_occurrences_dist=(20, "sentences"), dynamic=True, # we want to extract a dynamic graph (i.e. a list of sequential graphs) dynamic_window=20, # the size, in interaction, of each graph dynamic_overlap=0, # overlap between windows From 1e2eb384497e7aa61afe13165fb2058f899248d1 Mon Sep 17 00:00:00 2001 From: Aethor Date: Wed, 8 Nov 2023 10:45:00 +0100 Subject: [PATCH 3/3] fix remaining issues with co_occurrences_dist typo --- renard/pipeline/graph_extraction.py | 6 +++++- renard/pipeline/preconfigured.py | 8 ++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/renard/pipeline/graph_extraction.py b/renard/pipeline/graph_extraction.py index b4cf5d3..8498041 100644 --- a/renard/pipeline/graph_extraction.py +++ b/renard/pipeline/graph_extraction.py @@ -81,7 +81,9 @@ class CoOccurrencesGraphExtractor(PipelineStep): def __init__( self, - co_occurrences_dist: Union[int, Tuple[int, Literal["tokens", "sentences"]]], + co_occurrences_dist: Optional[ + Union[int, Tuple[int, Literal["tokens", "sentences"]]] + ], dynamic: bool = False, dynamic_window: Optional[int] = None, dynamic_overlap: int = 0, @@ -127,6 +129,8 @@ def __init__( # typo retrocompatibility if not co_occurences_dist is None: co_occurrences_dist = co_occurences_dist + if co_occurrences_dist is None and co_occurences_dist is None: + raise ValueError() if isinstance(co_occurrences_dist, int): co_occurrences_dist = (co_occurrences_dist, "tokens") diff --git a/renard/pipeline/preconfigured.py b/renard/pipeline/preconfigured.py index 00b227a..d797231 100644 --- a/renard/pipeline/preconfigured.py +++ b/renard/pipeline/preconfigured.py @@ -56,6 +56,10 @@ def nltk_pipeline( **pipeline_kwargs ) else: + + if not "co_occurrences_dist" in graph_extractor_kwargs: + graph_extractor_kwargs["co_occurrences_dist"] = (1, "sentences") + return Pipeline( [ NLTKTokenizer(**tokenizer_kwargs), @@ -113,6 +117,10 @@ def bert_pipeline( **pipeline_kwargs ) else: + + if not "co_occurrences_dist" in graph_extractor_kwargs: + graph_extractor_kwargs["co_occurrences_dist"] = (1, "sentences") + return Pipeline( [ NLTKTokenizer(**tokenizer_kwargs),