Adding tokenizer alignment function (#953)
* Copying configs from superglue

* adding senteval probing config commands

* adding meta-script for transfer and probing exps

* Adding meta bash script fixed

* give_permissions script

* small fix transfer_analysis.sh (#946)

model_*.th might match several model files; fixed to model_*.best.th

* lr_patience fix

* target_task training -> pretrain training

* adding edgeprobing configs and command

* adding edge probing conf

* fix load_target_train bug

* add hyperparameter sweeping

* val_interval change

* adding sweep function

* Task specific val_intervals

* add reload_vocab to hyperparameter sweep

* adding batch_size specification

* fixing senteval-word-content

* fixing senteval preprocess script

* revert extra delete

* remove extra files

* black format

* black formatting trainer.py

* remove load_data()

* removing extra changes

* adding alignment mapping function

* fix comment nits

* comment nit

* adding example of token_alignment
Yada Pruksachatkun authored Nov 6, 2019
1 parent bf98de4 commit 300251b
Showing 2 changed files with 49 additions and 0 deletions.
29 changes: 29 additions & 0 deletions jiant/utils/retokenize.py
@@ -95,6 +95,35 @@ def _mat_from_spans_sparse(spans: Sequence[Tuple[int, int]], n_chars: int) -> Ma
    return sparse.csr_matrix((data, (ridxs, cidxs)), shape=(len(spans), n_chars))


def create_tokenization_alignment(
    tokens: Sequence[str], tokenizer_name: str
) -> Sequence[Tuple[str, Sequence[str]]]:
    """
    Builds an alignment mapping between a space tokenization and the tokenization
    of choice.

    Example:
        Input: ['Larger', 'than', 'life.']
        Output: [('Larger', ['ĠL', 'arger']), ('than', ['Ġthan']), ('life.', ['Ġlife', '.'])]

    Parameters
    ----------
    tokens: Sequence[str]
        List of space-separated tokens.
    tokenizer_name: str
        Name of the tokenizer to align against.

    Returns
    -------
    tokenization_mapping: list of (orig_token, subword_tokens) tuples, where
        subword_tokens is the list of pieces produced for orig_token.
    """
    tokenizer = get_tokenizer(tokenizer_name)
    tokenization_mapping = []
    for tok in tokens:
        aligned_tok = tokenizer.tokenize(tok)
        tokenization_mapping.append((tok, aligned_tok))
    return tokenization_mapping


def realign_spans(record, tokenizer_name):
    """
    Builds the indices alignment while also tokenizing the input
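For reference, a minimal usage sketch of the new helper (assuming an environment where jiant and the named pretrained tokenizer are available; the sentence is the one from the docstring example):

from jiant.utils import retokenize

# Align a whitespace tokenization against RoBERTa's byte-level BPE;
# any tokenizer name resolvable by get_tokenizer() would work the same way.
tokens = ["Larger", "than", "life."]
mapping = retokenize.create_tokenization_alignment(tokens, "roberta-base")
for orig, pieces in mapping:
    print(orig, "->", pieces)
# Expected output, per the docstring example above:
# Larger -> ['ĠL', 'arger']
# than -> ['Ġthan']
# life. -> ['Ġlife', '.']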
20 changes: 20 additions & 0 deletions tests/test_retokenize.py
@@ -136,6 +136,11 @@ def test_wpm(self):
            [ta.project_span(start, end) for (start, end) in span_idxs]
            for ta, span_idxs in zip(tas, self.span_index_src)
        ]
        orig_tokens = self.text[0].split()
        alignment_map = retokenize.create_tokenization_alignment(orig_tokens, "bert-base-cased")
        wpm_tokens = self.tokens[0]
        for i, v in enumerate(alignment_map):
            assert v[0] == orig_tokens[i] and ",".join(v[1]) == wpm_tokens[i]
        assert self.tokens == tokens
        assert self.token_index_tgt == token_index_tgt
        assert self.span_index_tgt == span_index_tgt
@@ -209,6 +214,11 @@ def test_bpe(self):
            [ta.project_span(start, end) for (start, end) in span_idxs]
            for ta, span_idxs in zip(tas, self.span_index_src)
        ]
        orig_tokens = self.text[0].split()
        alignment_map = retokenize.create_tokenization_alignment(orig_tokens, "openai-gpt")
        bpe_tokens = self.tokens[0]
        for i, v in enumerate(alignment_map):
            assert v[0] == orig_tokens[i] and ",".join(v[1]) == bpe_tokens[i]
        assert self.tokens == tokens
        assert self.token_index_tgt == token_index_tgt
        assert self.span_index_tgt == span_index_tgt
@@ -292,6 +302,11 @@ def test_sentencepiece(self):
            [ta.project_span(start, end) for (start, end) in span_idxs]
            for ta, span_idxs in zip(tas, self.span_index_src)
        ]
        orig_tokens = self.text[0].split()
        alignment_map = retokenize.create_tokenization_alignment(orig_tokens, "xlnet-base-cased")
        se_tokens = self.tokens[0]
        for i, v in enumerate(alignment_map):
            assert v[0] == orig_tokens[i] and ",".join(v[1]) == se_tokens[i]
        assert self.tokens == tokens
        assert self.token_index_tgt == token_index_tgt
        assert self.span_index_tgt == span_index_tgt
@@ -344,6 +359,11 @@ def test_bytebpe(self):
            [ta.project_span(start, end) for (start, end) in span_idxs]
            for ta, span_idxs in zip(tas, self.span_index_src)
        ]
        orig_tokens = self.text[0].split()
        alignment_map = retokenize.create_tokenization_alignment(orig_tokens, "roberta-base")
        bytebpe_tokens = ["ĠMembers", "Ġof", "Ġthe", "ĠHouse", "Ġcl,apped", "Ġtheir", "Ġhands"]
        for i, v in enumerate(alignment_map):
            assert v[0] == orig_tokens[i] and ",".join(v[1]) == bytebpe_tokens[i]
        assert self.tokens == tokens
        assert self.token_index_tgt == token_index_tgt
        assert self.span_index_tgt == span_index_tgt
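A note on the assertion style in these tests: each expected entry stores the subword pieces as a single comma-joined string, which the loop reproduces with ",".join(v[1]). For instance, the hard-coded entry "Ġcl,apped" above encodes a two-piece tokenization of "clapped" (illustrative check only):

pieces = ["Ġcl", "apped"]  # the pieces that the expected entry "Ġcl,apped" encodes
assert ",".join(pieces) == "Ġcl,apped"  # matches the comma-joined entry in bytebpe_tokens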
