From c18840d48e160701bcbdcdee62ffd227aadcb23b Mon Sep 17 00:00:00 2001
From: Haokun Liu
Date: Fri, 27 Dec 2019 21:57:54 -0500
Subject: [PATCH 1/2] fix roberta tokenization error

---
 jiant/utils/retokenize.py | 2 +-
 tests/test_retokenize.py  | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/jiant/utils/retokenize.py b/jiant/utils/retokenize.py
index 45570fce5..b77ad24b8 100644
--- a/jiant/utils/retokenize.py
+++ b/jiant/utils/retokenize.py
@@ -336,7 +336,7 @@ def process_sentencepiece_for_alignment(t):
 
 def process_bytebpe_for_alignment(t):
     """Add <w> markers to ensure word-boundary alignment."""
-    if t.startswith("▁"):
+    if t.startswith("Ġ"):
         return "<w>" + re.sub(r"^Ġ", "", t)
     else:
         return t
diff --git a/tests/test_retokenize.py b/tests/test_retokenize.py
index d54ef6f52..daffb92e6 100644
--- a/tests/test_retokenize.py
+++ b/tests/test_retokenize.py
@@ -337,13 +337,13 @@ def test_bytebpe(self):
         ]
         self.token_index_tgt = [
             [[0], [1], [2], [3], [4, 5], [6], [7]],
-            [[0], [1], [2], [3, 4], [5], [6, 7], [8], [9, 10, 11]],
-            [[0], [1, 2, 3], [4], [5], [6], [7], [8], [9, 10, 11], [12], [13], [14, 15]],
+            [[0], [1], [2], [3, 4], [5, 6], [7], [8], [9, 10, 11]],
+            [[0, 1], [2, 3], [4], [5], [6], [7], [8], [9, 10, 11], [12], [13], [14, 15]],
             [[0, 1]],
         ]
         self.span_index_tgt = [
             [(0, 4), (6, 8)],
-            [(0, 1), (3, 6)],
+            [(0, 1), (3, 7)],
             [(0, 4), (8, 16), (8, 12), (9, 16)],
             [(0, 2)],
         ]

From 77fcd2fbdf835227cb80f10d37e3dc7a00ecead7 Mon Sep 17 00:00:00 2001
From: Haokun Liu
Date: Fri, 27 Dec 2019 22:06:54 -0500
Subject: [PATCH 2/2] format

---
 jiant/utils/retokenize.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/jiant/utils/retokenize.py b/jiant/utils/retokenize.py
index b77ad24b8..6cde76df5 100644
--- a/jiant/utils/retokenize.py
+++ b/jiant/utils/retokenize.py
@@ -99,16 +99,16 @@ def create_tokenization_alignment(
     tokens: Sequence[str], tokenizer_name: str
 ) -> Sequence[Tuple[str, str]]:
     """
-    Builds alignment mapping between space tokenization and tokenization of 
-    choice. 
-    
+    Builds alignment mapping between space tokenization and tokenization of
+    choice.
+
     Example:
         Input: ['Larger', 'than', 'life.']
         Output: [('Larger', ['ĠL', 'arger']), ('than', ['Ġthan']), ('life.', ['Ġlife', '.'])]
 
     Parameters
     -----------------------
     tokens: list[(str)]. list of tokens,
     tokenizer_name: str
 
     Returns
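
Context for the first patch: RoBERTa uses GPT-2-style byte-level BPE, which marks pieces that begin a new space-delimited word with "Ġ", whereas SentencePiece-based models use "▁"; the fix swaps the startswith check accordingly. The sketch below is illustrative only, not part of the patches: the function body mirrors the patched process_bytebpe_for_alignment, while the sample `pieces` list (taken from the docstring example above) and the printed result are assumptions added here for demonstration.

import re


def process_bytebpe_for_alignment(t):
    """Add <w> markers to ensure word-boundary alignment (as in the patched code)."""
    # Byte-level BPE (GPT-2/RoBERTa) prefixes word-initial pieces with "Ġ",
    # not the SentencePiece marker "▁" -- checking for "Ġ" is the actual fix.
    if t.startswith("Ġ"):
        return "<w>" + re.sub(r"^Ġ", "", t)
    else:
        return t


# Hypothetical byte-level BPE pieces for "Larger than life." (mirroring the docstring example):
pieces = ["ĠL", "arger", "Ġthan", "Ġlife", "."]
print([process_bytebpe_for_alignment(t) for t in pieces])
# Expected output: ['<w>L', 'arger', '<w>than', '<w>life', '.']

With the old "▁" check, none of these pieces would receive the <w> word-boundary marker, which is why the expected token and span indices in test_retokenize.py change in the first patch.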