Skip to content

Commit

Permalink
fix(tokenizer): 修复TokenTokenizer无法处理空格的问题
Browse files Browse the repository at this point in the history
Closes #37
  • Loading branch information
王翔 committed Mar 26, 2022
1 parent 72391f8 commit 4956c88
Showing 1 changed file with 7 additions and 2 deletions.
9 changes: 7 additions & 2 deletions ark_nlp/processor/tokenizer/transfomer.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,8 +229,13 @@ class TokenTokenizer(TransfomerTokenizer):

def tokenize(self, text, **kwargs):
    """Tokenize *text* one character at a time via the wrapped vocab.

    Characters that the underlying vocab tokenizer cannot handle and
    drops (returns an empty list for) — e.g. a plain space — are kept
    as-is so that no input position is lost (fixes #37).

    Args:
        text: input string, iterated character by character.
        **kwargs: accepted for interface compatibility; unused here.

    Returns:
        list of sub-token strings.
    """
    tokens = []
    for char_ in text:
        sub_tokens = self.vocab.tokenize(char_)
        if sub_tokens:
            tokens.extend(sub_tokens)
        else:
            # vocab.tokenize returned [] — preserve the raw character
            # (typically whitespace) instead of silently dropping it.
            tokens.append(char_)

    return tokens

def sequence_to_ids(self, sequence, **kwargs):
Expand Down

0 comments on commit 4956c88

Please sign in to comment.