From 7b20245ef0c260d8bbc53def8cef8caee29371d0 Mon Sep 17 00:00:00 2001
From: Lorenzo Ampil
Date: Sun, 14 Jun 2020 21:32:10 +0800
Subject: [PATCH 01/22] Add B/I handling to grouping

---
 src/transformers/pipelines.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/transformers/pipelines.py b/src/transformers/pipelines.py
index 1d91c21c267dfc..c764948a52ebb9 100755
--- a/src/transformers/pipelines.py
+++ b/src/transformers/pipelines.py
@@ -1021,21 +1021,24 @@ def __call__(self, *args, **kwargs):
                     "index": idx,
                 }
                 last_idx, _ = filtered_labels_idx[-1]
+                is_last_idx = idx == last_idx
+
                 if self.grouped_entities:
                     if not entity_group_disagg:
                         entity_group_disagg += [entity]
-                        if idx == last_idx:
+                        if is_last_idx:
                             entity_groups += [self.group_entities(entity_group_disagg)]
                         continue
 
                     # If the current entity is similar and adjacent to the previous entity, append it to the disaggregated entity group
+                    # The split is meant to account for the "B" and "I" prefixes
                     if (
-                        entity["entity"] == entity_group_disagg[-1]["entity"]
+                        entity["entity"].split("-")[-1] == entity_group_disagg[-1]["entity"].split("-")[-1]
                         and entity["index"] == entity_group_disagg[-1]["index"] + 1
                     ):
                         entity_group_disagg += [entity]
                         # Group the entities at the last entity
-                        if idx == last_idx:
+                        if is_last_idx:
                             entity_groups += [self.group_entities(entity_group_disagg)]
                     # If the current entity is different from the previous entity, aggregate the disaggregated entity group
                     else:

From 562bd7c61b1d6c2edefa1a0a0238dcee914a33c0 Mon Sep 17 00:00:00 2001
From: Lorenzo Ampil
Date: Sun, 14 Jun 2020 21:41:00 +0800
Subject: [PATCH 02/22] Add fix to include separate entity as last token

---
 src/transformers/pipelines.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/transformers/pipelines.py b/src/transformers/pipelines.py
index c764948a52ebb9..e1bce42c58ee16 100755
--- a/src/transformers/pipelines.py
+++ b/src/transformers/pipelines.py
@@ -1044,6 +1044,9 @@ def __call__(self, *args, **kwargs):
                     else:
                         entity_groups += [self.group_entities(entity_group_disagg)]
                         entity_group_disagg = [entity]
+                        # If it's the last entity, add it to the entity groups
+                        if is_last_idx:
+                            entity_groups += [self.group_entities(entity_group_disagg)]
 
                 entities += [entity]

From 9f0936c98c964937251e23750b9e52bf4779f6f1 Mon Sep 17 00:00:00 2001
From: Lorenzo Ampil
Date: Sun, 14 Jun 2020 21:43:41 +0800
Subject: [PATCH 03/22] Move last_idx definition outside loop

---
 src/transformers/pipelines.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/pipelines.py b/src/transformers/pipelines.py
index e1bce42c58ee16..78fec72e02bb92 100755
--- a/src/transformers/pipelines.py
+++ b/src/transformers/pipelines.py
@@ -1011,6 +1011,7 @@ def __call__(self, *args, **kwargs):
                 for idx, label_idx in enumerate(labels_idx)
                 if self.model.config.id2label[label_idx] not in self.ignore_labels
             ]
+            last_idx, _ = filtered_labels_idx[-1]
 
             for idx, label_idx in filtered_labels_idx:
 
@@ -1020,7 +1021,6 @@ def __call__(self, *args, **kwargs):
                     "entity": self.model.config.id2label[label_idx],
                     "index": idx,
                 }
-                last_idx, _ = filtered_labels_idx[-1]
                 is_last_idx = idx == last_idx
 
                 if self.grouped_entities:
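The rule patches 01-03 implement rests on IOB-style tags: "B-PER" begins a person entity, "I-PER" continues it, so "B-"/"I-" are prefixes on the entity type. A minimal sketch of the adjacency check follows, using hypothetical token data rather than real pipeline output:

    # Two predictions are merged into one entity group when the type after the
    # "B-"/"I-" prefix matches and their token indices are consecutive.
    prev = {"entity": "B-PER", "index": 1}
    curr = {"entity": "I-PER", "index": 2}

    same_type = curr["entity"].split("-")[-1] == prev["entity"].split("-")[-1]  # "PER" == "PER"
    adjacent = curr["index"] == prev["index"] + 1
    assert same_type and adjacent  # curr extends prev's entity group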
From d3c48387cab02c117b40e91b77b954b1efa7e905 Mon Sep 17 00:00:00 2001
From: Lorenzo Ampil
Date: Sun, 14 Jun 2020 21:47:44 +0800
Subject: [PATCH 04/22] Use first entity in entity group as reference for entity type

---
 src/transformers/pipelines.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/pipelines.py b/src/transformers/pipelines.py
index 78fec72e02bb92..96cacaa1afd75d 100755
--- a/src/transformers/pipelines.py
+++ b/src/transformers/pipelines.py
@@ -1069,7 +1069,7 @@ def group_entities(self, entities):
         """
         Returns grouped entities
         """
         # Get the last entity in the entity group
-        entity = entities[-1]["entity"]
+        entity = entities[0]["entity"]
         scores = np.mean([entity["score"] for entity in entities])
         tokens = [entity["word"] for entity in entities]
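Patch 04 matters because, under IOB tagging, only an entity's first sub-token carries the "B-" form, so the first prediction is the more faithful label for the whole group. A small illustration with hypothetical values:

    group = [{"entity": "B-PER", "word": "Ara"}, {"entity": "I-PER", "word": "##újo"}]
    group[0]["entity"]   # "B-PER": marks where the entity begins
    group[-1]["entity"]  # "I-PER": merely a continuation tag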
From 9a182eae076d29a33d33993c13a13ee440152dda Mon Sep 17 00:00:00 2001
From: Lorenzo Ampil
Date: Wed, 17 Jun 2020 22:10:34 +0800
Subject: [PATCH 05/22] Add test cases

---
 tests/test_pipelines.py | 86 +++++++++++++++++++++++++++++++++++++++--
 1 file changed, 82 insertions(+), 4 deletions(-)

diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py
index 1b978f5afd90ca..df337d9286bfab 100644
--- a/tests/test_pipelines.py
+++ b/tests/test_pipelines.py
@@ -10,6 +10,7 @@
 VALID_INPUTS = ["A simple string", ["list of strings"]]
 
 NER_FINETUNED_MODELS = ["sshleifer/tiny-dbmdz-bert-large-cased-finetuned-conll03-english"]
+GROUPED_NER_FINETUNED_MODELS = ["mrm8488/bert-spanish-cased-finetuned-ner"]
 
 # xlnet-base-cased disabled for now, since it crashes TF2
 FEATURE_EXTRACT_FINETUNED_MODELS = ["sshleifer/tiny-distilbert-base-cased"]
@@ -38,6 +39,15 @@
     ],
 ]
 
+expected_grouped_ner_result = [
+    [
+        {"entity_group": "B-PER", "score": 0.9710702640669686, "word": "Consuelo Araújo Noguera"},
+        {"entity_group": "B-PER", "score": 0.9997273534536362, "word": "Andrés Pastrana"},
+        {"entity_group": "B-ORG", "score": 0.8589080572128296, "word": "Farc"},
+    ]
+]
+
 SUMMARIZATION_KWARGS = dict(num_beams=2, min_length=2, max_length=5)
@@ -166,9 +176,19 @@ def test_torch_ner(self):
     @require_torch
     def test_ner_grouped(self):
         mandatory_keys = {"entity_group", "word", "score"}
-        for model_name in NER_FINETUNED_MODELS:
+        valid_inputs = [
+            "Consuelo Araújo Noguera, ministra de cultura del presidente Andrés Pastrana (1998.2002) fue asesinada por las Farc luego de haber permanecido secuestrada por algunos meses."
+        ]
+        expected_check_keys = ["entity_group", "word"]
+        for model_name in GROUPED_NER_FINETUNED_MODELS:
             nlp = pipeline(task="ner", model=model_name, tokenizer=model_name, grouped_entities=True)
-            self._test_mono_column_pipeline(nlp, VALID_INPUTS, mandatory_keys)
+            self._test_mono_column_pipeline(
+                nlp,
+                valid_inputs,
+                mandatory_keys,
+                expected_multi_result=expected_grouped_ner_result,
+                expected_check_keys=expected_check_keys,
+            )
 
     @require_tf
     def test_tf_ner(self):
@@ -180,9 +200,19 @@ def test_tf_ner(self):
     @require_tf
     def test_tf_ner_grouped(self):
         mandatory_keys = {"entity_group", "word", "score"}
-        for model_name in NER_FINETUNED_MODELS:
+        valid_inputs = [
+            "Consuelo Araújo Noguera, ministra de cultura del presidente Andrés Pastrana (1998.2002) fue asesinada por las Farc luego de haber permanecido secuestrada por algunos meses."
+        ]
+        expected_check_keys = ["entity_group", "word"]
+        for model_name in GROUPED_NER_FINETUNED_MODELS:
             nlp = pipeline(task="ner", model=model_name, tokenizer=model_name, framework="tf", grouped_entities=True)
-            self._test_mono_column_pipeline(nlp, VALID_INPUTS, mandatory_keys)
+            self._test_mono_column_pipeline(
+                nlp,
+                valid_inputs,
+                mandatory_keys,
+                expected_multi_result=expected_grouped_ner_result,
+                expected_check_keys=expected_check_keys,
+            )
 
     @require_torch
     def test_torch_sentiment_analysis(self):
@@ -345,6 +375,54 @@ def test_tf_text_generation(self):
 QA_FINETUNED_MODELS = ["sshleifer/tiny-distilbert-base-cased-distilled-squad"]
 
 
+class QAPipelineTests(unittest.TestCase):
+    def _test_qa_pipeline(self, nlp):
+        output_keys = {"score", "answer", "start", "end"}
+        valid_inputs = [
+            {"question": "Where was HuggingFace founded ?", "context": "HuggingFace was founded in Paris."},
+            {
+                "question": "In what field is HuggingFace working ?",
+                "context": "HuggingFace is a startup based in New-York founded in Paris which is trying to solve NLP.",
+            },
+        ]
+        invalid_inputs = [
+            {"question": "", "context": "This is a test to try empty question edge case"},
+            {"question": None, "context": "This is a test to try empty question edge case"},
+            {"question": "What is does with empty context ?", "context": ""},
+            {"question": "What is does with empty context ?", "context": None},
+        ]
+        self.assertIsNotNone(nlp)
+
+        mono_result = nlp(valid_inputs[0])
+        self.assertIsInstance(mono_result, dict)
+
+        for key in output_keys:
+            self.assertIn(key, mono_result)
+
+        multi_result = nlp(valid_inputs)
+        self.assertIsInstance(multi_result, list)
+        self.assertIsInstance(multi_result[0], dict)
+
+        for result in multi_result:
+            for key in output_keys:
+                self.assertIn(key, result)
+        for bad_input in invalid_inputs:
+            self.assertRaises(Exception, nlp, bad_input)
+        self.assertRaises(Exception, nlp, invalid_inputs)
+
+    @require_torch
+    def test_torch_question_answering(self):
+        for model_name in QA_FINETUNED_MODELS:
+            nlp = pipeline(task="question-answering", model=model_name, tokenizer=model_name)
+            self._test_qa_pipeline(nlp)
+
+    @require_tf
+    def test_tf_question_answering(self):
+        for model_name in QA_FINETUNED_MODELS:
+            nlp = pipeline(task="question-answering", model=model_name, tokenizer=model_name, framework="tf")
+            self._test_qa_pipeline(nlp)
+
+
 class QAPipelineTests(unittest.TestCase):
     def _test_qa_pipeline(self, nlp):
         output_keys = {"score", "answer", "start", "end"}
"", "context": "This is a test to try empty question edge case"}, - {"question": None, "context": "This is a test to try empty question edge case"}, - {"question": "What is does with empty context ?", "context": ""}, - {"question": "What is does with empty context ?", "context": None}, - ] - self.assertIsNotNone(nlp) - - mono_result = nlp(valid_inputs[0]) - self.assertIsInstance(mono_result, dict) - - for key in output_keys: - self.assertIn(key, mono_result) - - multi_result = nlp(valid_inputs) - self.assertIsInstance(multi_result, list) - self.assertIsInstance(multi_result[0], dict) - - for result in multi_result: - for key in output_keys: - self.assertIn(key, result) - for bad_input in invalid_inputs: - self.assertRaises(Exception, nlp, bad_input) - self.assertRaises(Exception, nlp, invalid_inputs) - - @require_torch - def test_torch_question_answering(self): - for model_name in QA_FINETUNED_MODELS: - nlp = pipeline(task="question-answering", model=model_name, tokenizer=model_name) - self._test_qa_pipeline(nlp) - - @require_tf - def test_tf_question_answering(self): - for model_name in QA_FINETUNED_MODELS: - nlp = pipeline(task="question-answering", model=model_name, tokenizer=model_name, framework="tf") - self._test_qa_pipeline(nlp) - - class QAPipelineTests(unittest.TestCase): def _test_qa_pipeline(self, nlp): output_keys = {"score", "answer", "start", "end"} From 4a7a4839d0a44940b746c7bd25c48e3e660cace3 Mon Sep 17 00:00:00 2001 From: Lorenzo Ampil Date: Wed, 17 Jun 2020 22:42:24 +0800 Subject: [PATCH 07/22] Return tf ner grouped test to original --- tests/test_pipelines.py | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py index eac8325b91eb1e..43a4d4c3188a09 100644 --- a/tests/test_pipelines.py +++ b/tests/test_pipelines.py @@ -200,19 +200,9 @@ def test_tf_ner(self): @require_tf def test_tf_ner_grouped(self): mandatory_keys = {"entity_group", "word", "score"} - valid_inputs = [ - "Consuelo Araújo Noguera, ministra de cultura del presidente Andrés Pastrana (1998.2002) fue asesinada por las Farc luego de haber permanecido secuestrada por algunos meses." 
From 4a7a4839d0a44940b746c7bd25c48e3e660cace3 Mon Sep 17 00:00:00 2001
From: Lorenzo Ampil
Date: Wed, 17 Jun 2020 22:42:24 +0800
Subject: [PATCH 07/22] Return tf ner grouped test to original

---
 tests/test_pipelines.py | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py
index eac8325b91eb1e..43a4d4c3188a09 100644
--- a/tests/test_pipelines.py
+++ b/tests/test_pipelines.py
@@ -200,19 +200,9 @@ def test_tf_ner(self):
     @require_tf
     def test_tf_ner_grouped(self):
         mandatory_keys = {"entity_group", "word", "score"}
-        valid_inputs = [
-            "Consuelo Araújo Noguera, ministra de cultura del presidente Andrés Pastrana (1998.2002) fue asesinada por las Farc luego de haber permanecido secuestrada por algunos meses."
-        ]
-        expected_check_keys = ["entity_group", "word"]
-        for model_name in GROUPED_NER_FINETUNED_MODELS:
+        for model_name in NER_FINETUNED_MODELS:
             nlp = pipeline(task="ner", model=model_name, tokenizer=model_name, framework="tf", grouped_entities=True)
-            self._test_mono_column_pipeline(
-                nlp,
-                valid_inputs,
-                mandatory_keys,
-                expected_multi_result=expected_grouped_ner_result,
-                expected_check_keys=expected_check_keys,
-            )
+            self._test_mono_column_pipeline(nlp, VALID_INPUTS, mandatory_keys)
 
     @require_torch
     def test_torch_sentiment_analysis(self):

From 010b7848d697372694912f1c4a75444168ed2fe2 Mon Sep 17 00:00:00 2001
From: Lorenzo Ampil
Date: Sat, 4 Jul 2020 17:26:46 +0800
Subject: [PATCH 08/22] Take out redundant last entity

---
 src/transformers/pipelines.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/transformers/pipelines.py b/src/transformers/pipelines.py
index 96cacaa1afd75d..81639f721ddcfe 100755
--- a/src/transformers/pipelines.py
+++ b/src/transformers/pipelines.py
@@ -1044,9 +1044,6 @@ def __call__(self, *args, **kwargs):
                     else:
                         entity_groups += [self.group_entities(entity_group_disagg)]
                         entity_group_disagg = [entity]
-                        # If it's the last entity, add it to the entity groups
-                        if is_last_idx:
-                            entity_groups += [self.group_entities(entity_group_disagg)]
 
                 entities += [entity]

From e1b2d38dc2ab78138802818c35cfe8dd61c13702 Mon Sep 17 00:00:00 2001
From: Lorenzo Ampil
Date: Sat, 4 Jul 2020 17:31:05 +0800
Subject: [PATCH 09/22] Get last_idx safely

Co-authored-by: ColleterVi <36503688+ColleterVi@users.noreply.github.com>
---
 src/transformers/pipelines.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/transformers/pipelines.py b/src/transformers/pipelines.py
index 81639f721ddcfe..0644233481be41 100755
--- a/src/transformers/pipelines.py
+++ b/src/transformers/pipelines.py
@@ -1011,7 +1011,8 @@ def __call__(self, *args, **kwargs):
                 for idx, label_idx in enumerate(labels_idx)
                 if self.model.config.id2label[label_idx] not in self.ignore_labels
             ]
-            last_idx, _ = filtered_labels_idx[-1]
+            if filtered_labels_idx:
+                last_idx, _ = filtered_labels_idx[-1]
 
             for idx, label_idx in filtered_labels_idx:

From 0775ef5a9be329ebc378fb3f8333fe8cd5bf0347 Mon Sep 17 00:00:00 2001
From: Lorenzo Ampil
Date: Sat, 4 Jul 2020 17:34:48 +0800
Subject: [PATCH 10/22] Fix first entity comment

---
 src/transformers/pipelines.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/pipelines.py b/src/transformers/pipelines.py
index 0644233481be41..f59883aefbc7eb 100755
--- a/src/transformers/pipelines.py
+++ b/src/transformers/pipelines.py
@@ -1066,7 +1066,7 @@ def group_entities(self, entities):
         """
         Returns grouped entities
         """
-        # Get the last entity in the entity group
+        # Get the first entity in the entity group
         entity = entities[0]["entity"]
         scores = np.mean([entity["score"] for entity in entities])
         tokens = [entity["word"] for entity in entities]
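The guard added in patch 09 covers inputs where every prediction falls under ignore_labels (by default the "O" tag), leaving nothing to unpack. A minimal illustration of the edge case it prevents, with hypothetical data:

    filtered_labels_idx = []                  # e.g. a sentence with no detected entities
    # last_idx, _ = filtered_labels_idx[-1]   # unguarded unpacking would raise IndexError
    if filtered_labels_idx:                   # the guarded form simply skips the lookup
        last_idx, _ = filtered_labels_idx[-1]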
From 1b097fbd9a5962ee001290f016634bee236c555b Mon Sep 17 00:00:00 2001
From: Lorenzo Ampil
Date: Sat, 4 Jul 2020 22:19:08 +0800
Subject: [PATCH 11/22] Create separate functions for group_sub_entities and group_entities (splitting call method to testable functions)

---
 src/transformers/pipelines.py | 78 ++++++++++++++++++++---------------
 1 file changed, 45 insertions(+), 33 deletions(-)

diff --git a/src/transformers/pipelines.py b/src/transformers/pipelines.py
index f59883aefbc7eb..59d528d54ce680 100755
--- a/src/transformers/pipelines.py
+++ b/src/transformers/pipelines.py
@@ -1003,8 +1003,6 @@ def __call__(self, *args, **kwargs):
             labels_idx = score.argmax(axis=-1)
 
             entities = []
-            entity_groups = []
-            entity_group_disagg = []
             # Filter to labels not in `self.ignore_labels`
             filtered_labels_idx = [
                 (idx, label_idx)
@@ -1022,39 +1020,13 @@ def __call__(self, *args, **kwargs):
                     "entity": self.model.config.id2label[label_idx],
                     "index": idx,
                 }
-                is_last_idx = idx == last_idx
-
-                if self.grouped_entities:
-                    if not entity_group_disagg:
-                        entity_group_disagg += [entity]
-                        if is_last_idx:
-                            entity_groups += [self.group_entities(entity_group_disagg)]
-                        continue
-
-                    # If the current entity is similar and adjacent to the previous entity, append it to the disaggregated entity group
-                    # The split is meant to account for the "B" and "I" prefixes
-                    if (
-                        entity["entity"].split("-")[-1] == entity_group_disagg[-1]["entity"].split("-")[-1]
-                        and entity["index"] == entity_group_disagg[-1]["index"] + 1
-                    ):
-                        entity_group_disagg += [entity]
-                        # Group the entities at the last entity
-                        if is_last_idx:
-                            entity_groups += [self.group_entities(entity_group_disagg)]
-                    # If the current entity is different from the previous entity, aggregate the disaggregated entity group
-                    else:
-                        entity_groups += [self.group_entities(entity_group_disagg)]
-                        entity_group_disagg = [entity]
 
                 entities += [entity]
 
-            # Ensure if an entity is the latest one in the sequence it gets appended to the output
-            if len(entity_group_disagg) > 0:
-                entity_groups.append(self.group_entities(entity_group_disagg))
-
-            # Append
+            # Append grouped entities
             if self.grouped_entities:
-                answers += [entity_groups]
+                answers += [self.group_entities(entities)]
+            # Append ungrouped entities
             else:
                 answers += [entities]
@@ -1062,9 +1034,9 @@ def __call__(self, *args, **kwargs):
             return answers[0]
         return answers
 
-    def group_entities(self, entities):
+    def group_sub_entities(self, entities):
         """
-        Returns grouped entities
+        Returns grouped sub entities
         """
         # Get the first entity in the entity group
         entity = entities[0]["entity"]
         scores = np.mean([entity["score"] for entity in entities])
         tokens = [entity["word"] for entity in entities]
@@ -1078,6 +1050,46 @@ def group_sub_entities(self, entities):
         }
         return entity_group
 
+    def group_entities(self, entities):
+        """
+        Returns grouped entities
+        """
+
+        entity_groups = []
+        entity_group_disagg = []
+
+        if entities:
+            last_idx = entities[-1]["index"]
+
+        for entity in entities:
+            is_last_idx = entity["index"] == last_idx
+            if not entity_group_disagg:
+                entity_group_disagg += [entity]
+                if is_last_idx:
+                    entity_groups += [self.group_sub_entities(entity_group_disagg)]
+                continue
+
+            # If the current entity is similar and adjacent to the previous entity, append it to the disaggregated entity group
+            # The split is meant to account for the "B" and "I" prefixes
+            if (
+                entity["entity"].split("-")[-1] == entity_group_disagg[-1]["entity"].split("-")[-1]
+                and entity["index"] == entity_group_disagg[-1]["index"] + 1
+            ):
+                entity_group_disagg += [entity]
+                # Group the entities at the last entity
+                if is_last_idx:
+                    entity_groups += [self.group_sub_entities(entity_group_disagg)]
+            # If the current entity is different from the previous entity, aggregate the disaggregated entity group
+            else:
+                entity_groups += [self.group_sub_entities(entity_group_disagg)]
+                entity_group_disagg = [entity]
+
+        # Ensure if an entity is the latest one in the sequence it gets appended to the output
+        if len(entity_group_disagg) > 0:
+            entity_groups.append(self.group_sub_entities(entity_group_disagg))
+
+        return entity_groups
+
 
 NerPipeline = TokenClassificationPipeline
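The grouping logic that patch 11 factors out is easier to follow without the is_last_idx bookkeeping. The following is a condensed, self-contained sketch of the same idea, for illustration only: the real group_sub_entities also converts word pieces back to a string with the model's tokenizer, and the function names below merely mirror the patch.

    import numpy as np

    def group_sub_entities(sub: list) -> dict:
        return {
            "entity_group": sub[0]["entity"],                    # type taken from the first sub-token
            "score": float(np.mean([e["score"] for e in sub])),  # scores averaged across sub-tokens
            "word": " ".join(e["word"] for e in sub),            # simplified join of word pieces
        }

    def group_entities(entities: list) -> list:
        groups, run = [], []
        for e in entities:
            same_type = run and e["entity"].split("-")[-1] == run[-1]["entity"].split("-")[-1]
            adjacent = run and e["index"] == run[-1]["index"] + 1
            if same_type and adjacent:
                run.append(e)                       # extend the current entity group
            else:
                if run:
                    groups.append(group_sub_entities(run))
                run = [e]                           # start a new group
        if run:
            groups.append(group_sub_entities(run))  # flush the final group
        return groups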
From 1eb4989bf8295b063a5ce78436753f382b0bdee9 Mon Sep 17 00:00:00 2001
From: Lorenzo Ampil
Date: Sat, 4 Jul 2020 22:50:10 +0800
Subject: [PATCH 12/22] Take out unnecessary last_idx

---
 src/transformers/pipelines.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/transformers/pipelines.py b/src/transformers/pipelines.py
index 59d528d54ce680..b2ad4c1b140444 100755
--- a/src/transformers/pipelines.py
+++ b/src/transformers/pipelines.py
@@ -1009,8 +1009,6 @@ def __call__(self, *args, **kwargs):
                 for idx, label_idx in enumerate(labels_idx)
                 if self.model.config.id2label[label_idx] not in self.ignore_labels
             ]
-            if filtered_labels_idx:
-                last_idx, _ = filtered_labels_idx[-1]
 
             for idx, label_idx in filtered_labels_idx:

From f3cc9a4c611c07e2b7a938bb9aea2ab8461622db Mon Sep 17 00:00:00 2001
From: Lorenzo Ampil
Date: Sat, 4 Jul 2020 22:58:39 +0800
Subject: [PATCH 13/22] Remove additional forward pass test

---
 tests/test_pipelines.py | 15 ++-------------
 1 file changed, 2 insertions(+), 13 deletions(-)

diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py
index 43a4d4c3188a09..b972c824695dd3 100644
--- a/tests/test_pipelines.py
+++ b/tests/test_pipelines.py
@@ -10,7 +10,6 @@
 VALID_INPUTS = ["A simple string", ["list of strings"]]
 
 NER_FINETUNED_MODELS = ["sshleifer/tiny-dbmdz-bert-large-cased-finetuned-conll03-english"]
-GROUPED_NER_FINETUNED_MODELS = ["mrm8488/bert-spanish-cased-finetuned-ner"]
 
 # xlnet-base-cased disabled for now, since it crashes TF2
 FEATURE_EXTRACT_FINETUNED_MODELS = ["sshleifer/tiny-distilbert-base-cased"]
@@ -176,19 +175,9 @@ def test_torch_ner(self):
     @require_torch
     def test_ner_grouped(self):
         mandatory_keys = {"entity_group", "word", "score"}
-        valid_inputs = [
-            "Consuelo Araújo Noguera, ministra de cultura del presidente Andrés Pastrana (1998.2002) fue asesinada por las Farc luego de haber permanecido secuestrada por algunos meses."
-        ]
-        expected_check_keys = ["entity_group", "word"]
-        for model_name in GROUPED_NER_FINETUNED_MODELS:
+        for model_name in NER_FINETUNED_MODELS:
             nlp = pipeline(task="ner", model=model_name, tokenizer=model_name, grouped_entities=True)
-            self._test_mono_column_pipeline(
-                nlp,
-                valid_inputs,
-                mandatory_keys,
-                expected_multi_result=expected_grouped_ner_result,
-                expected_check_keys=expected_check_keys,
-            )
+            self._test_mono_column_pipeline(nlp, VALID_INPUTS, mandatory_keys)
 
     @require_tf
     def test_tf_ner(self):
From b500617aa3e4691c65948b2278a5703c170efb02 Mon Sep 17 00:00:00 2001
From: Lorenzo Ampil
Date: Sat, 4 Jul 2020 23:01:00 +0800
Subject: [PATCH 14/22] Move token classification basic tests to separate class

---
 tests/test_pipelines.py | 58 +++++++++++++++++++++--------------------
 1 file changed, 30 insertions(+), 28 deletions(-)

diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py
index b972c824695dd3..b564d15a52bfbe 100644
--- a/tests/test_pipelines.py
+++ b/tests/test_pipelines.py
@@ -165,34 +165,6 @@ def _test_mono_column_pipeline(
 
         self.assertRaises(Exception, nlp, invalid_inputs)
 
-    @require_torch
-    def test_torch_ner(self):
-        mandatory_keys = {"entity", "word", "score"}
-        for model_name in NER_FINETUNED_MODELS:
-            nlp = pipeline(task="ner", model=model_name, tokenizer=model_name)
-            self._test_mono_column_pipeline(nlp, VALID_INPUTS, mandatory_keys)
-
-    @require_torch
-    def test_ner_grouped(self):
-        mandatory_keys = {"entity_group", "word", "score"}
-        for model_name in NER_FINETUNED_MODELS:
-            nlp = pipeline(task="ner", model=model_name, tokenizer=model_name, grouped_entities=True)
-            self._test_mono_column_pipeline(nlp, VALID_INPUTS, mandatory_keys)
-
-    @require_tf
-    def test_tf_ner(self):
-        mandatory_keys = {"entity", "word", "score"}
-        for model_name in NER_FINETUNED_MODELS:
-            nlp = pipeline(task="ner", model=model_name, tokenizer=model_name, framework="tf")
-            self._test_mono_column_pipeline(nlp, VALID_INPUTS, mandatory_keys)
-
-    @require_tf
-    def test_tf_ner_grouped(self):
-        mandatory_keys = {"entity_group", "word", "score"}
-        for model_name in NER_FINETUNED_MODELS:
-            nlp = pipeline(task="ner", model=model_name, tokenizer=model_name, framework="tf", grouped_entities=True)
-            self._test_mono_column_pipeline(nlp, VALID_INPUTS, mandatory_keys)
-
     @require_torch
     def test_torch_sentiment_analysis(self):
         mandatory_keys = {"label", "score"}
@@ -374,6 +346,36 @@ def test_tf_question_answering(self):
             self._test_qa_pipeline(nlp)
 
 
+class TokenClassificationPipelineTests(unittest.TestCase):
+    @require_torch
+    def test_torch_ner(self):
+        mandatory_keys = {"entity", "word", "score"}
+        for model_name in NER_FINETUNED_MODELS:
+            nlp = pipeline(task="ner", model=model_name, tokenizer=model_name)
+            self._test_mono_column_pipeline(nlp, VALID_INPUTS, mandatory_keys)
+
+    @require_torch
+    def test_ner_grouped(self):
+        mandatory_keys = {"entity_group", "word", "score"}
+        for model_name in NER_FINETUNED_MODELS:
+            nlp = pipeline(task="ner", model=model_name, tokenizer=model_name, grouped_entities=True)
+            self._test_mono_column_pipeline(nlp, VALID_INPUTS, mandatory_keys)
+
+    @require_tf
+    def test_tf_ner(self):
+        mandatory_keys = {"entity", "word", "score"}
+        for model_name in NER_FINETUNED_MODELS:
+            nlp = pipeline(task="ner", model=model_name, tokenizer=model_name, framework="tf")
+            self._test_mono_column_pipeline(nlp, VALID_INPUTS, mandatory_keys)
+
+    @require_tf
+    def test_tf_ner_grouped(self):
+        mandatory_keys = {"entity_group", "word", "score"}
+        for model_name in NER_FINETUNED_MODELS:
+            nlp = pipeline(task="ner", model=model_name, tokenizer=model_name, framework="tf", grouped_entities=True)
+            self._test_mono_column_pipeline(nlp, VALID_INPUTS, mandatory_keys)
+
+
 class PipelineCommonTests(unittest.TestCase):
     pipelines = SUPPORTED_TASKS.keys()
{"entity_group", "word", "score"} + for model_name in NER_FINETUNED_MODELS: + nlp = pipeline(task="ner", model=model_name, tokenizer=model_name, framework="tf", grouped_entities=True) + self._test_mono_column_pipeline(nlp, VALID_INPUTS, mandatory_keys) + @require_torch def test_torch_sentiment_analysis(self): mandatory_keys = {"label", "score"} @@ -374,36 +402,6 @@ def test_tf_question_answering(self): self._test_qa_pipeline(nlp) -class TokenClassificationPipelineTests(unittest.TestCase): - @require_torch - def test_torch_ner(self): - mandatory_keys = {"entity", "word", "score"} - for model_name in NER_FINETUNED_MODELS: - nlp = pipeline(task="ner", model=model_name, tokenizer=model_name) - self._test_mono_column_pipeline(nlp, VALID_INPUTS, mandatory_keys) - - @require_torch - def test_ner_grouped(self): - mandatory_keys = {"entity_group", "word", "score"} - for model_name in NER_FINETUNED_MODELS: - nlp = pipeline(task="ner", model=model_name, tokenizer=model_name, grouped_entities=True) - self._test_mono_column_pipeline(nlp, VALID_INPUTS, mandatory_keys) - - @require_tf - def test_tf_ner(self): - mandatory_keys = {"entity", "word", "score"} - for model_name in NER_FINETUNED_MODELS: - nlp = pipeline(task="ner", model=model_name, tokenizer=model_name, framework="tf") - self._test_mono_column_pipeline(nlp, VALID_INPUTS, mandatory_keys) - - @require_tf - def test_tf_ner_grouped(self): - mandatory_keys = {"entity_group", "word", "score"} - for model_name in NER_FINETUNED_MODELS: - nlp = pipeline(task="ner", model=model_name, tokenizer=model_name, framework="tf", grouped_entities=True) - self._test_mono_column_pipeline(nlp, VALID_INPUTS, mandatory_keys) - - class PipelineCommonTests(unittest.TestCase): pipelines = SUPPORTED_TASKS.keys() From 9533bf76f4ad61f8276e02c2d2a21c32b5df36ac Mon Sep 17 00:00:00 2001 From: Lorenzo Ampil Date: Sun, 5 Jul 2020 00:00:11 +0800 Subject: [PATCH 16/22] Move base ner tests to nerpipelinetests --- tests/test_pipelines.py | 82 +++++++++++++++++++++++++++-------------- 1 file changed, 54 insertions(+), 28 deletions(-) diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py index b972c824695dd3..41ac47c262d1eb 100644 --- a/tests/test_pipelines.py +++ b/tests/test_pipelines.py @@ -165,34 +165,6 @@ def _test_mono_column_pipeline( self.assertRaises(Exception, nlp, invalid_inputs) - @require_torch - def test_torch_ner(self): - mandatory_keys = {"entity", "word", "score"} - for model_name in NER_FINETUNED_MODELS: - nlp = pipeline(task="ner", model=model_name, tokenizer=model_name) - self._test_mono_column_pipeline(nlp, VALID_INPUTS, mandatory_keys) - - @require_torch - def test_ner_grouped(self): - mandatory_keys = {"entity_group", "word", "score"} - for model_name in NER_FINETUNED_MODELS: - nlp = pipeline(task="ner", model=model_name, tokenizer=model_name, grouped_entities=True) - self._test_mono_column_pipeline(nlp, VALID_INPUTS, mandatory_keys) - - @require_tf - def test_tf_ner(self): - mandatory_keys = {"entity", "word", "score"} - for model_name in NER_FINETUNED_MODELS: - nlp = pipeline(task="ner", model=model_name, tokenizer=model_name, framework="tf") - self._test_mono_column_pipeline(nlp, VALID_INPUTS, mandatory_keys) - - @require_tf - def test_tf_ner_grouped(self): - mandatory_keys = {"entity_group", "word", "score"} - for model_name in NER_FINETUNED_MODELS: - nlp = pipeline(task="ner", model=model_name, tokenizer=model_name, framework="tf", grouped_entities=True) - self._test_mono_column_pipeline(nlp, VALID_INPUTS, mandatory_keys) - @require_torch def 
From 9533bf76f4ad61f8276e02c2d2a21c32b5df36ac Mon Sep 17 00:00:00 2001
From: Lorenzo Ampil
Date: Sun, 5 Jul 2020 00:00:11 +0800
Subject: [PATCH 16/22] Move base NER tests to NerPipelineTests

---
 tests/test_pipelines.py | 82 +++++++++++++++++++++++++++--------------
 1 file changed, 54 insertions(+), 28 deletions(-)

diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py
index b972c824695dd3..41ac47c262d1eb 100644
--- a/tests/test_pipelines.py
+++ b/tests/test_pipelines.py
@@ -165,34 +165,6 @@ def _test_mono_column_pipeline(
 
         self.assertRaises(Exception, nlp, invalid_inputs)
 
-    @require_torch
-    def test_torch_ner(self):
-        mandatory_keys = {"entity", "word", "score"}
-        for model_name in NER_FINETUNED_MODELS:
-            nlp = pipeline(task="ner", model=model_name, tokenizer=model_name)
-            self._test_mono_column_pipeline(nlp, VALID_INPUTS, mandatory_keys)
-
-    @require_torch
-    def test_ner_grouped(self):
-        mandatory_keys = {"entity_group", "word", "score"}
-        for model_name in NER_FINETUNED_MODELS:
-            nlp = pipeline(task="ner", model=model_name, tokenizer=model_name, grouped_entities=True)
-            self._test_mono_column_pipeline(nlp, VALID_INPUTS, mandatory_keys)
-
-    @require_tf
-    def test_tf_ner(self):
-        mandatory_keys = {"entity", "word", "score"}
-        for model_name in NER_FINETUNED_MODELS:
-            nlp = pipeline(task="ner", model=model_name, tokenizer=model_name, framework="tf")
-            self._test_mono_column_pipeline(nlp, VALID_INPUTS, mandatory_keys)
-
-    @require_tf
-    def test_tf_ner_grouped(self):
-        mandatory_keys = {"entity_group", "word", "score"}
-        for model_name in NER_FINETUNED_MODELS:
-            nlp = pipeline(task="ner", model=model_name, tokenizer=model_name, framework="tf", grouped_entities=True)
-            self._test_mono_column_pipeline(nlp, VALID_INPUTS, mandatory_keys)
-
     @require_torch
     def test_torch_sentiment_analysis(self):
         mandatory_keys = {"label", "score"}
@@ -402,6 +374,60 @@ def test_tf_question_answering(self):
             self._test_qa_pipeline(nlp)
 
 
+class NerPipelineTests(unittest.TestCase):
+    def _test_ner_pipeline(
+        self, nlp: Pipeline,
+    ):
+        output_keys = {"entity", "word", "score"}
+
+        self.assertIsNotNone(nlp)
+
+        mono_result = nlp(VALID_INPUTS[0], **kwargs)
+        self.assertIsInstance(mono_result, list)
+        self.assertIsInstance(mono_result[0], (dict, list))
+
+        if isinstance(mono_result[0], list):
+            mono_result = mono_result[0]
+
+        for key in output_keys:
+            self.assertIn(key, mono_result[0])
+
+        multi_result = [nlp(input) for input in VALID_INPUTS]
+        self.assertIsInstance(multi_result, list)
+        self.assertIsInstance(multi_result[0], (dict, list))
+
+        if isinstance(multi_result[0], list):
+            multi_result = multi_result[0]
+
+        for result in multi_result:
+            for key in output_keys:
+                self.assertIn(key, result)
+
+    @require_torch
+    def test_torch_ner(self):
+        for model_name in NER_FINETUNED_MODELS:
+            nlp = pipeline(task="ner", model=model_name, tokenizer=model_name)
+            self._test_ner_pipeline(nlp)
+
+    @require_torch
+    def test_ner_grouped(self):
+        for model_name in NER_FINETUNED_MODELS:
+            nlp = pipeline(task="ner", model=model_name, tokenizer=model_name, grouped_entities=True)
+            self._test_ner_pipeline(nlp)
+
+    @require_tf
+    def test_tf_ner(self):
+        for model_name in NER_FINETUNED_MODELS:
+            nlp = pipeline(task="ner", model=model_name, tokenizer=model_name, framework="tf")
+            self._test_ner_pipeline(nlp)
+
+    @require_tf
+    def test_tf_ner_grouped(self):
+        for model_name in NER_FINETUNED_MODELS:
+            nlp = pipeline(task="ner", model=model_name, tokenizer=model_name, framework="tf", grouped_entities=True)
+            self._test_ner_pipeline(nlp)
+
+
 class PipelineCommonTests(unittest.TestCase):
     pipelines = SUPPORTED_TASKS.keys()

From e719f811f765ff236bacab3ffaabc67bb9707bd0 Mon Sep 17 00:00:00 2001
From: Lorenzo Ampil
Date: Sun, 5 Jul 2020 00:03:31 +0800
Subject: [PATCH 17/22] Take out unused kwargs

---
 tests/test_pipelines.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py
index 41ac47c262d1eb..a1b3ed937f9589 100644
--- a/tests/test_pipelines.py
+++ b/tests/test_pipelines.py
@@ -382,7 +382,7 @@ def _test_ner_pipeline(
 
         self.assertIsNotNone(nlp)
 
-        mono_result = nlp(VALID_INPUTS[0], **kwargs)
+        mono_result = nlp(VALID_INPUTS[0])
         self.assertIsInstance(mono_result, list)
         self.assertIsInstance(mono_result[0], (dict, list))
From f8d0a76c0c0a9afa0294e767d32f288878c4194c Mon Sep 17 00:00:00 2001
From: Lorenzo Ampil
Date: Sun, 5 Jul 2020 00:14:26 +0800
Subject: [PATCH 18/22] Add back mandatory_keys argument

---
 tests/test_pipelines.py | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py
index a1b3ed937f9589..d7fb41e5e1286e 100644
--- a/tests/test_pipelines.py
+++ b/tests/test_pipelines.py
@@ -377,9 +377,8 @@ def test_tf_question_answering(self):
 
 class NerPipelineTests(unittest.TestCase):
     def _test_ner_pipeline(
         self, nlp: Pipeline,
+        output_keys: Iterable[str],
     ):
-        output_keys = {"entity", "word", "score"}
-
         self.assertIsNotNone(nlp)
 
         mono_result = nlp(VALID_INPUTS[0])
@@ -404,27 +403,31 @@ def _test_ner_pipeline(
     @require_torch
     def test_torch_ner(self):
+        mandatory_keys = {"entity", "word", "score"}
         for model_name in NER_FINETUNED_MODELS:
             nlp = pipeline(task="ner", model=model_name, tokenizer=model_name)
-            self._test_ner_pipeline(nlp)
+            self._test_ner_pipeline(nlp, mandatory_keys)
 
     @require_torch
     def test_ner_grouped(self):
+        mandatory_keys = {"entity_group", "word", "score"}
         for model_name in NER_FINETUNED_MODELS:
             nlp = pipeline(task="ner", model=model_name, tokenizer=model_name, grouped_entities=True)
-            self._test_ner_pipeline(nlp)
+            self._test_ner_pipeline(nlp, mandatory_keys)
 
     @require_tf
     def test_tf_ner(self):
+        mandatory_keys = {"entity", "word", "score"}
         for model_name in NER_FINETUNED_MODELS:
             nlp = pipeline(task="ner", model=model_name, tokenizer=model_name, framework="tf")
-            self._test_ner_pipeline(nlp)
+            self._test_ner_pipeline(nlp, mandatory_keys)
 
     @require_tf
     def test_tf_ner_grouped(self):
+        mandatory_keys = {"entity_group", "word", "score"}
         for model_name in NER_FINETUNED_MODELS:
             nlp = pipeline(task="ner", model=model_name, tokenizer=model_name, framework="tf", grouped_entities=True)
-            self._test_ner_pipeline(nlp)
+            self._test_ner_pipeline(nlp, mandatory_keys)
"word": "Consuelo Araújo Noguera"}, + {"entity_group": "B-PER", "score": 0.9997273534536362, "word": "Andrés Pastrana"}, + {"entity_group": "B-ORG", "score": 0.8589080572128296, "word": "Farc"}, + ], + [ + {"entity_group": "I-PER", "score": 0.9962901175022125, "word": "Enzo"}, + {"entity_group": "I-ORG", "score": 0.9986497163772583, "word": "UN"}, + ], + ] + self.assertIsNotNone(nlp) mono_result = nlp(VALID_INPUTS[0]) @@ -402,6 +428,9 @@ def _test_ner_pipeline( for key in output_keys: self.assertIn(key, result) + for ungrouped_input, grouped_result in zip(ungrouped_ner_inputs, expected_grouped_ner_results): + self.assertEqual(nlp.group_entities(ungrouped_input), grouped_result) + @require_torch def test_torch_ner(self): mandatory_keys = {"entity", "word", "score"} From 4a98747a0473aaf2ebce529e309b0ca6996b91a7 Mon Sep 17 00:00:00 2001 From: Lorenzo Ampil Date: Sun, 5 Jul 2020 02:04:41 +0800 Subject: [PATCH 20/22] Fix last entity handling --- src/transformers/pipelines.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/transformers/pipelines.py b/src/transformers/pipelines.py index b2ad4c1b140444..b812d4ddbb8977 100755 --- a/src/transformers/pipelines.py +++ b/src/transformers/pipelines.py @@ -1081,10 +1081,9 @@ def group_entities(self, entities): else: entity_groups += [self.group_sub_entities(entity_group_disagg)] entity_group_disagg = [entity] - - # Ensure if an entity is the latest one in the sequence it gets appended to the output - if len(entity_group_disagg) > 0: - entity_groups.append(self.group_sub_entities(entity_group_disagg)) + # If it's the last entity, add it to the entity groups + if is_last_idx: + entity_groups += [self.group_entities(entity_group_disagg)] return entity_groups From 8f29ef9eee1a11241cc3467803a8e238d7e0c9ef Mon Sep 17 00:00:00 2001 From: Lorenzo Ampil Date: Sun, 5 Jul 2020 02:21:33 +0800 Subject: [PATCH 21/22] Fix grouping fucntion used --- src/transformers/pipelines.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/pipelines.py b/src/transformers/pipelines.py index b812d4ddbb8977..fbac21cec936d6 100755 --- a/src/transformers/pipelines.py +++ b/src/transformers/pipelines.py @@ -1083,7 +1083,7 @@ def group_entities(self, entities): entity_group_disagg = [entity] # If it's the last entity, add it to the entity groups if is_last_idx: - entity_groups += [self.group_entities(entity_group_disagg)] + entity_groups += [self.group_sub_entities(entity_group_disagg)] return entity_groups From 05f50d9d58932e13f55a6c3ade51e43a7e1ff390 Mon Sep 17 00:00:00 2001 From: Lorenzo Ampil Date: Wed, 8 Jul 2020 07:25:27 +0800 Subject: [PATCH 22/22] Add typing to group_sub_entities and group_entities --- src/transformers/pipelines.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/pipelines.py b/src/transformers/pipelines.py index fbac21cec936d6..7b7a56bfbd16c1 100755 --- a/src/transformers/pipelines.py +++ b/src/transformers/pipelines.py @@ -1032,7 +1032,7 @@ def __call__(self, *args, **kwargs): return answers[0] return answers - def group_sub_entities(self, entities): + def group_sub_entities(self, entities: List[dict]) -> dict: """ Returns grouped sub entities """ @@ -1048,7 +1048,7 @@ def group_sub_entities(self, entities): } return entity_group - def group_entities(self, entities): + def group_entities(self, entities: List[dict]) -> List[dict]: """ Returns grouped entities """