word_tokenizer.py
"""WordpieceTokenizer classes."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import unicodedata
from tokenizer_utils import (load_vocab,
convert_to_unicode,
clean_text,
split_on_whitespace,
convert_by_vocab,
tokenize_chinese_chars)
class WordTokenizer(object):
"""Runs WordPiece tokenziation."""
def __init__(self, vocab = None, unk_token="[UNK]"):
self.vocab = load_vocab(vocab)
self.inv_vocab = {v: k for k, v in self.vocab.items()}
self.unk_token = unk_token
def tokenize(self, text):
""" Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform tokenization
using the given vocabulary.
For example:
input = "unaffable"
output = ["un", "##aff", "##able"]
Args:
text: A single token or whitespace separated tokens. This should have already been passed through `BasicTokenizer`.
Returns:
output_tokens: A list of wordpiece tokens.
current_positions: A list of the current positions for the original words in text .
"""
text = convert_to_unicode(text)
text = clean_text(text)
text = tokenize_chinese_chars(text)
# output_tokens = []
token_list = split_on_whitespace(text)
# for chars in token_list:
# # current_positions.append([])
# if chars in self.vocab:
# output_tokens.append(chars)
# else:
# output_tokens.append(self.unk_token)
return token_list
def convert_tokens_to_ids(self, tokens, max_seq_length = None, blank_id = 0, unk_id = 1, uncased = True):
return convert_by_vocab(self.vocab, tokens, max_seq_length, blank_id, unk_id, uncased=uncased)
def convert_ids_to_tokens(self, ids):
return convert_by_vocab(self.inv_vocab, ids)
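
The docstring of `tokenize` describes a greedy longest-match-first WordPiece step that this file never actually applies (the lookup loop is commented out). The sketch below illustrates that algorithm on the docstring's own example; the `wordpiece` helper and the three-entry vocabulary are illustrative only and are not part of word_tokenizer.py or tokenizer_utils.

# Illustrative sketch of greedy longest-match-first WordPiece tokenization.
def wordpiece(token, vocab, unk_token="[UNK]", max_chars=200):
    if len(token) > max_chars:
        return [unk_token]
    pieces = []
    start = 0
    while start < len(token):
        end = len(token)
        cur_piece = None
        # Try the longest remaining substring first, shrinking until a vocab hit.
        while start < end:
            piece = token[start:end]
            if start > 0:
                piece = "##" + piece  # continuation pieces carry the "##" prefix
            if piece in vocab:
                cur_piece = piece
                break
            end -= 1
        if cur_piece is None:
            return [unk_token]  # no substring matched: the whole token is unknown
        pieces.append(cur_piece)
        start = end
    return pieces

vocab = {"un", "##aff", "##able"}  # toy vocabulary for the docstring example
print(wordpiece("unaffable", vocab))  # ['un', '##aff', '##able']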
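
A minimal usage sketch for the class as written, assuming `tokenizer_utils` is importable and that "vocab.txt" is a hypothetical one-token-per-line vocabulary file; the exact ids produced depend on that file and on `convert_by_vocab`, whose implementation is not shown here.

# Hypothetical usage; "vocab.txt" is a placeholder path.
from word_tokenizer import WordTokenizer

tokenizer = WordTokenizer(vocab="vocab.txt", unk_token="[UNK]")

# tokenize() cleans the text, splits out CJK characters, and returns the
# whitespace-split tokens (the vocabulary lookup is commented out above).
tokens = tokenizer.tokenize("the quick brown fox")
print(tokens)  # e.g. ['the', 'quick', 'brown', 'fox']

# Round-trip through the loaded vocabulary; id values depend on vocab.txt.
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)
print(tokenizer.convert_ids_to_tokens(ids))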