From c18840d48e160701bcbdcdee62ffd227aadcb23b Mon Sep 17 00:00:00 2001
From: Haokun Liu
Date: Fri, 27 Dec 2019 21:57:54 -0500
Subject: [PATCH 1/2] fix roberta tokenization error

---
 jiant/utils/retokenize.py | 2 +-
 tests/test_retokenize.py  | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/jiant/utils/retokenize.py b/jiant/utils/retokenize.py
index 45570fce5..b77ad24b8 100644
--- a/jiant/utils/retokenize.py
+++ b/jiant/utils/retokenize.py
@@ -336,7 +336,7 @@ def process_sentencepiece_for_alignment(t):
 
 def process_bytebpe_for_alignment(t):
     """Add <w> markers to ensure word-boundary alignment."""
-    if t.startswith("▁"):
+    if t.startswith("Ġ"):
         return "<w>" + re.sub(r"^Ġ", "", t)
     else:
         return t
diff --git a/tests/test_retokenize.py b/tests/test_retokenize.py
index d54ef6f52..daffb92e6 100644
--- a/tests/test_retokenize.py
+++ b/tests/test_retokenize.py
@@ -337,13 +337,13 @@ def test_bytebpe(self):
         ]
         self.token_index_tgt = [
             [[0], [1], [2], [3], [4, 5], [6], [7]],
-            [[0], [1], [2], [3, 4], [5], [6, 7], [8], [9, 10, 11]],
-            [[0], [1, 2, 3], [4], [5], [6], [7], [8], [9, 10, 11], [12], [13], [14, 15]],
+            [[0], [1], [2], [3, 4], [5, 6], [7], [8], [9, 10, 11]],
+            [[0, 1], [2, 3], [4], [5], [6], [7], [8], [9, 10, 11], [12], [13], [14, 15]],
             [[0, 1]],
         ]
         self.span_index_tgt = [
             [(0, 4), (6, 8)],
-            [(0, 1), (3, 6)],
+            [(0, 1), (3, 7)],
             [(0, 4), (8, 16), (8, 12), (9, 16)],
             [(0, 2)],
         ]

From 77fcd2fbdf835227cb80f10d37e3dc7a00ecead7 Mon Sep 17 00:00:00 2001
From: Haokun Liu
Date: Fri, 27 Dec 2019 22:06:54 -0500
Subject: [PATCH 2/2] format

---
 jiant/utils/retokenize.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/jiant/utils/retokenize.py b/jiant/utils/retokenize.py
index b77ad24b8..6cde76df5 100644
--- a/jiant/utils/retokenize.py
+++ b/jiant/utils/retokenize.py
@@ -99,16 +99,16 @@ def create_tokenization_alignment(
     tokens: Sequence[str], tokenizer_name: str
 ) -> Sequence[Tuple[str, str]]:
     """
-    Builds alignment mapping between space tokenization and tokenization of 
-    choice. 
-    
+    Builds alignment mapping between space tokenization and tokenization of
+    choice.
+
     Example:
         Input: ['Larger', 'than', 'life.']
         Output: [('Larger', ['ĠL', 'arger']), ('than', ['Ġthan']), ('life.', ['Ġlife', '.'])]
 
     Parameters
     -----------------------
     tokens: list[(str)]. list of tokens,
     tokenizer_name: str
 
     Returns
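
Context for the first patch: RoBERTa uses GPT-2-style byte-level BPE, which marks pieces that begin a new space-delimited word with "Ġ", whereas SentencePiece-based models use "▁"; the fix swaps the startswith check accordingly. The sketch below is illustrative only, not part of the patches: the function body mirrors the patched process_bytebpe_for_alignment, while the sample `pieces` list (taken from the docstring example above) and the printed result are assumptions added here for demonstration.

import re


def process_bytebpe_for_alignment(t):
    """Add <w> markers to ensure word-boundary alignment (as in the patched code)."""
    # Byte-level BPE (GPT-2/RoBERTa) prefixes word-initial pieces with "Ġ",
    # not the SentencePiece marker "▁" -- checking for "Ġ" is the actual fix.
    if t.startswith("Ġ"):
        return "<w>" + re.sub(r"^Ġ", "", t)
    else:
        return t


# Hypothetical byte-level BPE pieces for "Larger than life." (mirroring the docstring example):
pieces = ["ĠL", "arger", "Ġthan", "Ġlife", "."]
print([process_bytebpe_for_alignment(t) for t in pieces])
# Expected output: ['<w>L', 'arger', '<w>than', '<w>life', '.']

With the old "▁" check, none of these pieces would receive the <w> word-boundary marker, which is why the expected token and span indices in test_retokenize.py change in the first patch.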