Add tokenization examples in ptr4tr book (#491)
lintool authored Apr 21, 2021
1 parent 0b35e5b commit de3e715
Showing 1 changed file with 35 additions and 0 deletions.
tests/test_tokenization.py (+35 −0)
@@ -28,11 +28,46 @@ def test_bert_base_uncased(self):
        tokens = tokenizer.tokenize('I have a new GPU!')
        self.assertEqual(['i', 'have', 'a', 'new', 'gp', '##u', '!'], tokens)

    def test_bert_ptr4tr_examples(self):
        # These are examples used in the ptr4tr book
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        tokens = tokenizer.tokenize('walking talking balking biking hiking rolling scrolling')
        self.assertEqual(['walking', 'talking', 'bal', '##king', 'biking', 'hiking', 'rolling', 'scrolling'], tokens)

        tokens = tokenizer.tokenize('biostatistics')
        self.assertEqual(['bio', '##sta', '##tist', '##ics'], tokens)

        tokens = tokenizer.tokenize('adversarial')
        self.assertEqual(['ad', '##vers', '##aria', '##l'], tokens)

        tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
        tokens = tokenizer.tokenize('walking talking balking biking hiking')
        self.assertEqual(['walking', 'talking', 'b', '##alk', '##ing', 'bi', '##king', 'hiking'], tokens)

        tokens = tokenizer.tokenize('rolling scrolling')
        self.assertEqual(['rolling', 'scroll', '##ing'], tokens)

        tokens = tokenizer.tokenize('biostatistics')
        self.assertEqual(['bio', '##sta', '##tist', '##ics'], tokens)

        tokens = tokenizer.tokenize('adversarial')
        self.assertEqual(['ad', '##vers', '##aria', '##l'], tokens)

    def test_doc2query(self):
        tokenizer = T5Tokenizer.from_pretrained('castorini/doc2query-t5-base-msmarco')
        tokens = tokenizer.tokenize('I have a new GPU!')
        self.assertEqual(['▁I', '▁have', '▁', 'a', '▁new', '▁GPU', '!'], tokens)

        tokenizer = T5Tokenizer.from_pretrained('castorini/doc2query-t5-base-msmarco')
        tokens = tokenizer.tokenize('walking talking biking scrolling')
        self.assertEqual(['▁walking', '▁talking', '▁biking', '▁scroll', 'ing'], tokens)

        tokens = tokenizer.tokenize('biostatistics')
        self.assertEqual(['▁bio', 'stat', 'istic', 's'], tokens)

        tokens = tokenizer.tokenize('adversarial')
        self.assertEqual(['▁adversar', 'i', 'al'], tokens)

    def tearDown(self):
        pass
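
For readers who want to reproduce the book's examples outside the test harness, here is a minimal standalone sketch (not part of this commit). It assumes the transformers package is installed and that the models can be downloaded from the Hugging Face Hub:

from transformers import BertTokenizer, T5Tokenizer

# WordPiece (BERT) marks word-internal continuation pieces with '##'.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
print(tokenizer.tokenize('biostatistics'))  # ['bio', '##sta', '##tist', '##ics']

# SentencePiece (T5) instead marks word-initial pieces with '▁'.
tokenizer = T5Tokenizer.from_pretrained('castorini/doc2query-t5-base-msmarco')
print(tokenizer.tokenize('biostatistics'))  # ['▁bio', 'stat', 'istic', 's']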

