Revert "Patch token classification pipeline (huggingface#8364)"

This reverts commit 9b63735.
fabiocapsouza · Nov 15, 2020 · caeb163 · caeb163
1 parent a4901c5
commit caeb163
Show file tree

Hide file tree

Showing 2 changed files with 48 additions and 104 deletions.
diff --git a/src/transformers/pipelines.py b/src/transformers/pipelines.py
@@ -1333,17 +1333,18 @@ class TokenClassificationArgumentHandler(ArgumentHandler):
     def __call__(self, *args, **kwargs):
 
         if args is not None and len(args) > 0:
-            inputs = list(args)
+            if isinstance(args, str):
+                inputs = [args]
+            else:
+                inputs = args
             batch_size = len(inputs)
-        else:
-            raise ValueError("At least one input is required.")
 
-        offset_mapping = kwargs.get("offset_mapping")
+        offset_mapping = kwargs.get("offset_mapping", None)
         if offset_mapping:
             if isinstance(offset_mapping, list) and isinstance(offset_mapping[0], tuple):
                 offset_mapping = [offset_mapping]
             if len(offset_mapping) != batch_size:
-                raise ValueError("offset_mapping should have the same batch size as the input")
+                raise ("offset_mapping should have the same batch size as the input")
         return inputs, offset_mapping
 
 
@@ -1378,19 +1379,20 @@ def __init__(
         tokenizer: PreTrainedTokenizer,
         modelcard: Optional[ModelCard] = None,
         framework: Optional[str] = None,
-        args_parser: ArgumentHandler = TokenClassificationArgumentHandler(),
+        args_parser: ArgumentHandler = None,
         device: int = -1,
         binary_output: bool = False,
         ignore_labels=["O"],
         task: str = "",
         grouped_entities: bool = False,
-        ignore_subwords: bool = False,
+        ignore_subwords: bool = True,
     ):
         super().__init__(
             model=model,
             tokenizer=tokenizer,
             modelcard=modelcard,
             framework=framework,
+            args_parser=TokenClassificationArgumentHandler(),
             device=device,
             binary_output=binary_output,
             task=task,
@@ -1403,17 +1405,10 @@ def __init__(
         )
 
         self._basic_tokenizer = BasicTokenizer(do_lower_case=False)
-        self._args_parser = args_parser
         self.ignore_labels = ignore_labels
         self.grouped_entities = grouped_entities
         self.ignore_subwords = ignore_subwords
 
-        if self.ignore_subwords and not self.tokenizer.is_fast:
-            raise ValueError(
-                "Slow tokenizers cannot ignore subwords. Please set the `ignore_subwords` option"
-                "to `False` or use a fast tokenizer."
-            )
-
     def __call__(self, inputs: Union[str, List[str]], **kwargs):
         """
         Classify each token of the text(s) given as inputs.
@@ -1434,7 +1429,10 @@ def __call__(self, inputs: Union[str, List[str]], **kwargs):
               corresponding token in the sentence.
         """
 
-        inputs, offset_mappings = self._args_parser(inputs, **kwargs)
+        if isinstance(inputs, str):
+            inputs = [inputs]
+
+        offset_mappings = kwargs.get("offset_mappings")
 
         answers = []
 
@@ -1452,13 +1450,14 @@ def __call__(self, inputs: Union[str, List[str]], **kwargs):
                     return_offsets_mapping=self.tokenizer.is_fast,
                 )
                 if self.tokenizer.is_fast:
-                    offset_mapping = tokens.pop("offset_mapping").cpu().numpy()[0]
+                    offset_mapping = tokens["offset_mapping"].cpu().numpy()[0]
+                    del tokens["offset_mapping"]
                 elif offset_mappings:
                     offset_mapping = offset_mappings[i]
                 else:
-                    offset_mapping = None
-
-                special_tokens_mask = tokens.pop("special_tokens_mask").cpu().numpy()[0]
+                    raise Exception("To decode [UNK] tokens use a fast tokenizer or provide offset_mapping parameter")
+                special_tokens_mask = tokens["special_tokens_mask"].cpu().numpy()[0]
+                del tokens["special_tokens_mask"]
 
                 # Forward
                 if self.framework == "tf":
@@ -1483,17 +1482,14 @@ def __call__(self, inputs: Union[str, List[str]], **kwargs):
             ]
 
             for idx, label_idx in filtered_labels_idx:
-                if offset_mapping is not None:
-                    start_ind, end_ind = offset_mapping[idx]
-                    word_ref = sentence[start_ind:end_ind]
-                    word = self.tokenizer.convert_ids_to_tokens([int(input_ids[idx])])[0]
-                    is_subword = len(word_ref) != len(word)
-
-                    if int(input_ids[idx]) == self.tokenizer.unk_token_id:
-                        word = word_ref
-                        is_subword = False
-                else:
-                    word = self.tokenizer.convert_ids_to_tokens(int(input_ids[idx]))
+                start_ind, end_ind = offset_mapping[idx]
+                word_ref = sentence[start_ind:end_ind]
+                word = self.tokenizer.convert_ids_to_tokens([int(input_ids[idx])])[0]
+                is_subword = len(word_ref) != len(word)
+
+                if int(input_ids[idx]) == self.tokenizer.unk_token_id:
+                    word = word_ref
+                    is_subword = False
 
                 entity = {
                     "word": word,

diff --git a/tests/test_pipelines_ner.py b/tests/test_pipelines_ner.py
@@ -1,7 +1,7 @@
 import unittest
 
 from transformers import AutoTokenizer, pipeline
-from transformers.pipelines import Pipeline, TokenClassificationArgumentHandler
+from transformers.pipelines import Pipeline
 from transformers.testing_utils import require_tf, require_torch
 
 from .test_pipelines_common import CustomInputPipelineCommonMixin
@@ -107,9 +107,13 @@ def _test_pipeline(self, nlp: Pipeline):
     def test_tf_only(self):
         model_name = "Narsil/small"  # This model only has a TensorFlow version
         # We test that if we don't specificy framework='tf', it gets detected automatically
-        nlp = pipeline(task="ner", model=model_name)
+        tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
+        nlp = pipeline(task="ner", model=model_name, tokenizer=tokenizer)
         self._test_pipeline(nlp)
 
+    #         offset=tokenizer(VALID_INPUTS[0],return_offsets_mapping=True)['offset_mapping']
+    #         pipeline_running_kwargs = {"offset_mapping"}  # Additional kwargs to run the pipeline with
+
     @require_tf
     def test_tf_defaults(self):
         for model_name in self.small_models:
@@ -118,8 +122,9 @@ def test_tf_defaults(self):
         self._test_pipeline(nlp)
 
     @require_tf
-    def test_tf_small_ignore_subwords_available_for_fast_tokenizers(self):
+    def test_tf_small(self):
         for model_name in self.small_models:
+            print(model_name)
             tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
             nlp = pipeline(
                 task="ner",
@@ -131,41 +136,27 @@ def test_tf_small_ignore_subwords_available_for_fast_tokenizers(self):
             )
             self._test_pipeline(nlp)
 
-        for model_name in self.small_models:
-            tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
-            nlp = pipeline(
-                task="ner",
-                model=model_name,
-                tokenizer=tokenizer,
-                framework="tf",
-                grouped_entities=True,
-                ignore_subwords=False,
-            )
-            self._test_pipeline(nlp)
-
-    @require_torch
-    def test_pt_ignore_subwords_slow_tokenizer_raises(self):
-        for model_name in self.small_models:
-            tokenizer = AutoTokenizer.from_pretrained(model_name)
-
-            with self.assertRaises(ValueError):
-                pipeline(task="ner", model=model_name, tokenizer=tokenizer, ignore_subwords=True)
-
-    @require_torch
-    def test_pt_defaults_slow_tokenizer(self):
-        for model_name in self.small_models:
-            tokenizer = AutoTokenizer.from_pretrained(model_name)
-            nlp = pipeline(task="ner", model=model_name, tokenizer=tokenizer)
-            self._test_pipeline(nlp)
+            for model_name in self.small_models:
+                tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
+                nlp = pipeline(
+                    task="ner",
+                    model=model_name,
+                    tokenizer=tokenizer,
+                    framework="tf",
+                    grouped_entities=True,
+                    ignore_subwords=False,
+                )
+                self._test_pipeline(nlp)
 
     @require_torch
     def test_pt_defaults(self):
         for model_name in self.small_models:
-            nlp = pipeline(task="ner", model=model_name)
+            tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
+            nlp = pipeline(task="ner", model=model_name, tokenizer=tokenizer)
             self._test_pipeline(nlp)
 
     @require_torch
-    def test_pt_small_ignore_subwords_available_for_fast_tokenizers(self):
+    def test_torch_small(self):
         for model_name in self.small_models:
             tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
             nlp = pipeline(
@@ -179,46 +170,3 @@ def test_pt_small_ignore_subwords_available_for_fast_tokenizers(self):
                 task="ner", model=model_name, tokenizer=tokenizer, grouped_entities=True, ignore_subwords=False
             )
             self._test_pipeline(nlp)
-
-
-class TokenClassificationArgumentHandlerTestCase(unittest.TestCase):
-    def setUp(self):
-        self.args_parser = TokenClassificationArgumentHandler()
-
-    def test_simple(self):
-        string = "This is a simple input"
-
-        inputs, offset_mapping = self.args_parser(string)
-        self.assertEqual(inputs, [string])
-        self.assertEqual(offset_mapping, None)
-
-        inputs, offset_mapping = self.args_parser(string, string)
-        self.assertEqual(inputs, [string, string])
-        self.assertEqual(offset_mapping, None)
-
-        inputs, offset_mapping = self.args_parser(string, offset_mapping=[(0, 1), (1, 2)])
-        self.assertEqual(inputs, [string])
-        self.assertEqual(offset_mapping, [[(0, 1), (1, 2)]])
-
-        inputs, offset_mapping = self.args_parser(string, string, offset_mapping=[[(0, 1), (1, 2)], [(0, 2), (2, 3)]])
-        self.assertEqual(inputs, [string, string])
-        self.assertEqual(offset_mapping, [[(0, 1), (1, 2)], [(0, 2), (2, 3)]])
-
-    def test_errors(self):
-        string = "This is a simple input"
-
-        # 2 sentences, 1 offset_mapping
-        with self.assertRaises(ValueError):
-            self.args_parser(string, string, offset_mapping=[[(0, 1), (1, 2)]])
-
-        # 2 sentences, 1 offset_mapping
-        with self.assertRaises(ValueError):
-            self.args_parser(string, string, offset_mapping=[(0, 1), (1, 2)])
-
-        # 1 sentences, 2 offset_mapping
-        with self.assertRaises(ValueError):
-            self.args_parser(string, offset_mapping=[[(0, 1), (1, 2)], [(0, 2), (2, 3)]])
-
-        # 0 sentences, 1 offset_mapping
-        with self.assertRaises(ValueError):
-            self.args_parser(offset_mapping=[[(0, 1), (1, 2)]])