diff --git a/ark_nlp/processor/tokenizer/transfomer.py b/ark_nlp/processor/tokenizer/transfomer.py
index 75b3e8f..37aeabe 100644
--- a/ark_nlp/processor/tokenizer/transfomer.py
+++ b/ark_nlp/processor/tokenizer/transfomer.py
@@ -229,8 +229,13 @@ class TokenTokenizer(TransfomerTokenizer):
 
     def tokenize(self, text, **kwargs):
         tokens = []
-        text = ' '.join([token_ for token_ in text])
-        tokens = self.vocab.tokenize(text)
+        for token_ in text:
+            tokenized_token_ = self.vocab.tokenize(token_)
+            if tokenized_token_ == []:
+                tokens.extend([token_])
+            else:
+                tokens.extend(tokenized_token_)
+
         return tokens
 
     def sequence_to_ids(self, sequence, **kwargs):
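
Note for reviewers: the old ' '.join(...) approach tokenized the whole string in one pass, so any character the vocab's text cleaner strips (control or zero-width characters, for example) silently disappeared and the token sequence fell out of alignment with the input characters, which breaks token-level tasks such as sequence labeling. The new loop tokenizes one character at a time and keeps the raw character whenever the vocab returns an empty list. The sketch below reproduces the before/after behavior outside the class; it assumes the backing vocab behaves like a HuggingFace BertTokenizer (an assumption, not confirmed by this diff), whose tokenize() returns [] for characters its cleaner removes.

    # Hedged sketch: standalone reproduction of the old vs. new tokenize
    # logic. Assumes a HuggingFace BertTokenizer as the backing vocab; this
    # mirrors, but is not literally, ark_nlp's TokenTokenizer.
    from transformers import BertTokenizer

    vocab = BertTokenizer.from_pretrained('bert-base-chinese')

    def tokenize_old(text):
        # Old behavior: join characters with spaces and tokenize once.
        # Characters stripped by the tokenizer's cleaner vanish, so the
        # output can be shorter than the input text.
        return vocab.tokenize(' '.join(text))

    def tokenize_new(text):
        # New behavior: tokenize character by character; if the vocab
        # returns nothing for a character, keep the raw character so the
        # output stays aligned 1:1 with the input.
        tokens = []
        for char in text:
            pieces = vocab.tokenize(char)
            tokens.extend(pieces if pieces else [char])
        return tokens

    text = '零宽\u200b字符'  # contains a zero-width space (U+200B)
    print(len(text), tokenize_old(text))  # 5 ['零', '宽', '字', '符'] -- 4 tokens, misaligned
    print(len(text), tokenize_new(text))  # 5 ['零', '宽', '\u200b', '字', '符'] -- aligned

Keeping the raw character (rather than dropping it or substituting [UNK] here) defers the unknown-token decision to sequence_to_ids, where the vocab can map it as it sees fit.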