Adding tokenizer alignment function (#953)
* Copying configs from superglue

* adding senteval probing config commands

* adding meta-script for transfer and probing exps

* Adding meta bash script fixed

* give_permissions script

* small fix transfer_analysis.sh (#946)

model_*.th might match several model files; fixed to model_*.best.th

* lr_patience fix

* target_task training -> pretrain training

* adding edgeprobing configs and command

* adding edge probing conf

* fix load_target_train bug

* add hyperparameter sweeping

* val_interval change

* adding sweep function

* Task specific val_intervals

* add reload_vocab to hyperparameter sweep

* adding batch_size specification

* fixing senteval-word-content

* fixing senteval preprocess script

* revert extra delete

* remove extra files

* black format

* black formatting trainer.py

* remove load_data()

* removing extra changes

* adding alignment mapping function

* fix comment nits

* comment nit

* adding example of token_alignment
Yada Pruksachatkun authored Nov 6, 2019
1 parent bf98de4 commit 300251b
Showing 2 changed files with 49 additions and 0 deletions.
29 changes: 29 additions & 0 deletions jiant/utils/retokenize.py
@@ -95,6 +95,35 @@ def _mat_from_spans_sparse(spans: Sequence[Tuple[int, int]], n_chars: int) -> Ma
    return sparse.csr_matrix((data, (ridxs, cidxs)), shape=(len(spans), n_chars))


def create_tokenization_alignment(
    tokens: Sequence[str], tokenizer_name: str
) -> Sequence[Tuple[str, Sequence[str]]]:
    """
    Builds an alignment mapping between a space tokenization and the tokenization
    of choice.

    Example:
        Input: ['Larger', 'than', 'life.']
        Output: [('Larger', ['ĠL', 'arger']), ('than', ['Ġthan']), ('life.', ['Ġlife', '.'])]

    Parameters
    ----------
    tokens: Sequence[str]
        List of space-separated tokens.
    tokenizer_name: str
        Name of the tokenizer to align against.

    Returns
    -------
    tokenization_mapping: list of (orig_token, subword_tokens) tuples, where
        subword_tokens is the list of pieces produced for orig_token.
    """
    tokenizer = get_tokenizer(tokenizer_name)
    tokenization_mapping = []
    for tok in tokens:
        aligned_tok = tokenizer.tokenize(tok)
        tokenization_mapping.append((tok, aligned_tok))
    return tokenization_mapping


def realign_spans(record, tokenizer_name):
    """
    Builds the indices alignment while also tokenizing the input
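For reference, a minimal usage sketch of the new helper (assuming an environment where jiant and the named pretrained tokenizer are available; the sentence is the one from the docstring example):

from jiant.utils import retokenize

# Align a whitespace tokenization against RoBERTa's byte-level BPE;
# any tokenizer name resolvable by get_tokenizer() would work the same way.
tokens = ["Larger", "than", "life."]
mapping = retokenize.create_tokenization_alignment(tokens, "roberta-base")
for orig, pieces in mapping:
    print(orig, "->", pieces)
# Expected output, per the docstring example above:
# Larger -> ['ĠL', 'arger']
# than -> ['Ġthan']
# life. -> ['Ġlife', '.']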
20 changes: 20 additions & 0 deletions tests/test_retokenize.py
@@ -136,6 +136,11 @@ def test_wpm(self):
            [ta.project_span(start, end) for (start, end) in span_idxs]
            for ta, span_idxs in zip(tas, self.span_index_src)
        ]
        orig_tokens = self.text[0].split()
        alignment_map = retokenize.create_tokenization_alignment(orig_tokens, "bert-base-cased")
        wpm_tokens = self.tokens[0]
        for i, v in enumerate(alignment_map):
            assert v[0] == orig_tokens[i] and ",".join(v[1]) == wpm_tokens[i]
        assert self.tokens == tokens
        assert self.token_index_tgt == token_index_tgt
        assert self.span_index_tgt == span_index_tgt
@@ -209,6 +214,11 @@ def test_bpe(self):
            [ta.project_span(start, end) for (start, end) in span_idxs]
            for ta, span_idxs in zip(tas, self.span_index_src)
        ]
        orig_tokens = self.text[0].split()
        alignment_map = retokenize.create_tokenization_alignment(orig_tokens, "openai-gpt")
        bpe_tokens = self.tokens[0]
        for i, v in enumerate(alignment_map):
            assert v[0] == orig_tokens[i] and ",".join(v[1]) == bpe_tokens[i]
        assert self.tokens == tokens
        assert self.token_index_tgt == token_index_tgt
        assert self.span_index_tgt == span_index_tgt
@@ -292,6 +302,11 @@ def test_sentencepiece(self):
            [ta.project_span(start, end) for (start, end) in span_idxs]
            for ta, span_idxs in zip(tas, self.span_index_src)
        ]
        orig_tokens = self.text[0].split()
        alignment_map = retokenize.create_tokenization_alignment(orig_tokens, "xlnet-base-cased")
        se_tokens = self.tokens[0]
        for i, v in enumerate(alignment_map):
            assert v[0] == orig_tokens[i] and ",".join(v[1]) == se_tokens[i]
        assert self.tokens == tokens
        assert self.token_index_tgt == token_index_tgt
        assert self.span_index_tgt == span_index_tgt
@@ -344,6 +359,11 @@ def test_bytebpe(self):
            [ta.project_span(start, end) for (start, end) in span_idxs]
            for ta, span_idxs in zip(tas, self.span_index_src)
        ]
        orig_tokens = self.text[0].split()
        alignment_map = retokenize.create_tokenization_alignment(orig_tokens, "roberta-base")
        bytebpe_tokens = ["ĠMembers", "Ġof", "Ġthe", "ĠHouse", "Ġcl,apped", "Ġtheir", "Ġhands"]
        for i, v in enumerate(alignment_map):
            assert v[0] == orig_tokens[i] and ",".join(v[1]) == bytebpe_tokens[i]
        assert self.tokens == tokens
        assert self.token_index_tgt == token_index_tgt
        assert self.span_index_tgt == span_index_tgt
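A note on the assertion style in these tests: each expected entry stores the subword pieces as a single comma-joined string, which the loop reproduces with ",".join(v[1]). For instance, the hard-coded entry "Ġcl,apped" above encodes a two-piece tokenization of "clapped" (illustrative check only):

pieces = ["Ġcl", "apped"]  # the pieces that the expected entry "Ġcl,apped" encodes
assert ",".join(pieces) == "Ġcl,apped"  # matches the comma-joined entry in bytebpe_tokens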
