Skip to content

Commit

Permalink
Revert "Patch token classification pipeline (huggingface#8364)"
Browse files Browse the repository at this point in the history
This reverts commit 9b63735.
  • Loading branch information
fabiocapsouza authored Nov 15, 2020
1 parent a4901c5 commit caeb163
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 104 deletions.
56 changes: 26 additions & 30 deletions src/transformers/pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -1333,17 +1333,18 @@ class TokenClassificationArgumentHandler(ArgumentHandler):
def __call__(self, *args, **kwargs):

if args is not None and len(args) > 0:
inputs = list(args)
if isinstance(args, str):
inputs = [args]
else:
inputs = args
batch_size = len(inputs)
else:
raise ValueError("At least one input is required.")

offset_mapping = kwargs.get("offset_mapping")
offset_mapping = kwargs.get("offset_mapping", None)
if offset_mapping:
if isinstance(offset_mapping, list) and isinstance(offset_mapping[0], tuple):
offset_mapping = [offset_mapping]
if len(offset_mapping) != batch_size:
raise ValueError("offset_mapping should have the same batch size as the input")
raise ("offset_mapping should have the same batch size as the input")
return inputs, offset_mapping


Expand Down Expand Up @@ -1378,19 +1379,20 @@ def __init__(
tokenizer: PreTrainedTokenizer,
modelcard: Optional[ModelCard] = None,
framework: Optional[str] = None,
args_parser: ArgumentHandler = TokenClassificationArgumentHandler(),
args_parser: ArgumentHandler = None,
device: int = -1,
binary_output: bool = False,
ignore_labels=["O"],
task: str = "",
grouped_entities: bool = False,
ignore_subwords: bool = False,
ignore_subwords: bool = True,
):
super().__init__(
model=model,
tokenizer=tokenizer,
modelcard=modelcard,
framework=framework,
args_parser=TokenClassificationArgumentHandler(),
device=device,
binary_output=binary_output,
task=task,
Expand All @@ -1403,17 +1405,10 @@ def __init__(
)

self._basic_tokenizer = BasicTokenizer(do_lower_case=False)
self._args_parser = args_parser
self.ignore_labels = ignore_labels
self.grouped_entities = grouped_entities
self.ignore_subwords = ignore_subwords

if self.ignore_subwords and not self.tokenizer.is_fast:
raise ValueError(
"Slow tokenizers cannot ignore subwords. Please set the `ignore_subwords` option"
"to `False` or use a fast tokenizer."
)

def __call__(self, inputs: Union[str, List[str]], **kwargs):
"""
Classify each token of the text(s) given as inputs.
Expand All @@ -1434,7 +1429,10 @@ def __call__(self, inputs: Union[str, List[str]], **kwargs):
corresponding token in the sentence.
"""

inputs, offset_mappings = self._args_parser(inputs, **kwargs)
if isinstance(inputs, str):
inputs = [inputs]

offset_mappings = kwargs.get("offset_mappings")

answers = []

Expand All @@ -1452,13 +1450,14 @@ def __call__(self, inputs: Union[str, List[str]], **kwargs):
return_offsets_mapping=self.tokenizer.is_fast,
)
if self.tokenizer.is_fast:
offset_mapping = tokens.pop("offset_mapping").cpu().numpy()[0]
offset_mapping = tokens["offset_mapping"].cpu().numpy()[0]
del tokens["offset_mapping"]
elif offset_mappings:
offset_mapping = offset_mappings[i]
else:
offset_mapping = None

special_tokens_mask = tokens.pop("special_tokens_mask").cpu().numpy()[0]
raise Exception("To decode [UNK] tokens use a fast tokenizer or provide offset_mapping parameter")
special_tokens_mask = tokens["special_tokens_mask"].cpu().numpy()[0]
del tokens["special_tokens_mask"]

# Forward
if self.framework == "tf":
Expand All @@ -1483,17 +1482,14 @@ def __call__(self, inputs: Union[str, List[str]], **kwargs):
]

for idx, label_idx in filtered_labels_idx:
if offset_mapping is not None:
start_ind, end_ind = offset_mapping[idx]
word_ref = sentence[start_ind:end_ind]
word = self.tokenizer.convert_ids_to_tokens([int(input_ids[idx])])[0]
is_subword = len(word_ref) != len(word)

if int(input_ids[idx]) == self.tokenizer.unk_token_id:
word = word_ref
is_subword = False
else:
word = self.tokenizer.convert_ids_to_tokens(int(input_ids[idx]))
start_ind, end_ind = offset_mapping[idx]
word_ref = sentence[start_ind:end_ind]
word = self.tokenizer.convert_ids_to_tokens([int(input_ids[idx])])[0]
is_subword = len(word_ref) != len(word)

if int(input_ids[idx]) == self.tokenizer.unk_token_id:
word = word_ref
is_subword = False

entity = {
"word": word,
Expand Down
96 changes: 22 additions & 74 deletions tests/test_pipelines_ner.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import unittest

from transformers import AutoTokenizer, pipeline
from transformers.pipelines import Pipeline, TokenClassificationArgumentHandler
from transformers.pipelines import Pipeline
from transformers.testing_utils import require_tf, require_torch

from .test_pipelines_common import CustomInputPipelineCommonMixin
Expand Down Expand Up @@ -107,9 +107,13 @@ def _test_pipeline(self, nlp: Pipeline):
def test_tf_only(self):
model_name = "Narsil/small" # This model only has a TensorFlow version
# We test that if we don't specify framework='tf', it gets detected automatically
nlp = pipeline(task="ner", model=model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
nlp = pipeline(task="ner", model=model_name, tokenizer=tokenizer)
self._test_pipeline(nlp)

# offset=tokenizer(VALID_INPUTS[0],return_offsets_mapping=True)['offset_mapping']
# pipeline_running_kwargs = {"offset_mapping"} # Additional kwargs to run the pipeline with

@require_tf
def test_tf_defaults(self):
for model_name in self.small_models:
Expand All @@ -118,8 +122,9 @@ def test_tf_defaults(self):
self._test_pipeline(nlp)

@require_tf
def test_tf_small_ignore_subwords_available_for_fast_tokenizers(self):
def test_tf_small(self):
for model_name in self.small_models:
print(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
nlp = pipeline(
task="ner",
Expand All @@ -131,41 +136,27 @@ def test_tf_small_ignore_subwords_available_for_fast_tokenizers(self):
)
self._test_pipeline(nlp)

for model_name in self.small_models:
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
nlp = pipeline(
task="ner",
model=model_name,
tokenizer=tokenizer,
framework="tf",
grouped_entities=True,
ignore_subwords=False,
)
self._test_pipeline(nlp)

@require_torch
def test_pt_ignore_subwords_slow_tokenizer_raises(self):
for model_name in self.small_models:
tokenizer = AutoTokenizer.from_pretrained(model_name)

with self.assertRaises(ValueError):
pipeline(task="ner", model=model_name, tokenizer=tokenizer, ignore_subwords=True)

@require_torch
def test_pt_defaults_slow_tokenizer(self):
for model_name in self.small_models:
tokenizer = AutoTokenizer.from_pretrained(model_name)
nlp = pipeline(task="ner", model=model_name, tokenizer=tokenizer)
self._test_pipeline(nlp)
for model_name in self.small_models:
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
nlp = pipeline(
task="ner",
model=model_name,
tokenizer=tokenizer,
framework="tf",
grouped_entities=True,
ignore_subwords=False,
)
self._test_pipeline(nlp)

@require_torch
def test_pt_defaults(self):
for model_name in self.small_models:
nlp = pipeline(task="ner", model=model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
nlp = pipeline(task="ner", model=model_name, tokenizer=tokenizer)
self._test_pipeline(nlp)

@require_torch
def test_pt_small_ignore_subwords_available_for_fast_tokenizers(self):
def test_torch_small(self):
for model_name in self.small_models:
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
nlp = pipeline(
Expand All @@ -179,46 +170,3 @@ def test_pt_small_ignore_subwords_available_for_fast_tokenizers(self):
task="ner", model=model_name, tokenizer=tokenizer, grouped_entities=True, ignore_subwords=False
)
self._test_pipeline(nlp)


class TokenClassificationArgumentHandlerTestCase(unittest.TestCase):
    """Exercise the argument normalization done by ``TokenClassificationArgumentHandler``."""

    def setUp(self):
        # Every test gets its own handler instance.
        self.args_parser = TokenClassificationArgumentHandler()

    def test_simple(self):
        """Well-formed calls yield a list of inputs and a batched offset mapping (or None)."""
        sentence = "This is a simple input"

        # One sentence, no offsets supplied.
        parsed_inputs, parsed_offsets = self.args_parser(sentence)
        self.assertEqual(parsed_inputs, [sentence])
        self.assertEqual(parsed_offsets, None)

        # Two sentences, no offsets supplied.
        parsed_inputs, parsed_offsets = self.args_parser(sentence, sentence)
        self.assertEqual(parsed_inputs, [sentence, sentence])
        self.assertEqual(parsed_offsets, None)

        # One sentence with a flat list of offsets: expected back wrapped into a batch of one.
        parsed_inputs, parsed_offsets = self.args_parser(sentence, offset_mapping=[(0, 1), (1, 2)])
        self.assertEqual(parsed_inputs, [sentence])
        self.assertEqual(parsed_offsets, [[(0, 1), (1, 2)]])

        # Two sentences with an already-batched mapping: expected back unchanged.
        parsed_inputs, parsed_offsets = self.args_parser(
            sentence, sentence, offset_mapping=[[(0, 1), (1, 2)], [(0, 2), (2, 3)]]
        )
        self.assertEqual(parsed_inputs, [sentence, sentence])
        self.assertEqual(parsed_offsets, [[(0, 1), (1, 2)], [(0, 2), (2, 3)]])

    def test_errors(self):
        """Mismatched batch sizes and missing inputs must raise ``ValueError``."""
        sentence = "This is a simple input"

        # Each entry is (positional inputs, offset_mapping keyword value).
        failing_calls = [
            # 2 sentences, 1 offset_mapping
            ((sentence, sentence), [[(0, 1), (1, 2)]]),
            # 2 sentences, 1 flat (un-batched) offset_mapping
            ((sentence, sentence), [(0, 1), (1, 2)]),
            # 1 sentence, 2 offset_mappings
            ((sentence,), [[(0, 1), (1, 2)], [(0, 2), (2, 3)]]),
            # 0 sentences, 1 offset_mapping
            ((), [[(0, 1), (1, 2)]]),
        ]

        for positional, mapping in failing_calls:
            with self.assertRaises(ValueError):
                self.args_parser(*positional, offset_mapping=mapping)

0 comments on commit caeb163

Please sign in to comment.