nyu-mll · pruksmhc · Jan 8, 2020 · Dec 28, 2019 · Dec 28, 2019 · Jan 3, 2020
@@ -99,16 +99,16 @@ def create_tokenization_alignment(
     tokens: Sequence[str], tokenizer_name: str
 ) -> Sequence[Tuple[str, str]]:
     """
-    Builds alignment mapping between space tokenization and tokenization of 
-    choice. 
-    
+    Builds alignment mapping between space tokenization and tokenization of
+    choice.
+
     Example:
         Input: ['Larger', 'than', 'life.']
         Output: [('Larger', ['ĠL', 'arger']), ('than', ['Ġthan']), ('life.', ['Ġlife', '.'])]
 
     Parameters
     -----------------------
-        tokens: list[(str)]. list of tokens, 
+        tokens: list[(str)]. list of tokens,
         tokenizer_name: str
 
     Returns
@@ -336,7 +336,7 @@ def process_sentencepiece_for_alignment(t):
 
 def process_bytebpe_for_alignment(t):
     """Add <w> markers to ensure word-boundary alignment."""
-    if t.startswith("▁"):
+    if t.startswith("Ġ"):
         return "<w>" + re.sub(r"^Ġ", "", t)
     else:
         return t

@@ -337,13 +337,13 @@ def test_bytebpe(self):
         ]
         self.token_index_tgt = [
             [[0], [1], [2], [3], [4, 5], [6], [7]],
-            [[0], [1], [2], [3, 4], [5], [6, 7], [8], [9, 10, 11]],
-            [[0], [1, 2, 3], [4], [5], [6], [7], [8], [9, 10, 11], [12], [13], [14, 15]],
+            [[0], [1], [2], [3, 4], [5, 6], [7], [8], [9, 10, 11]],
+            [[0, 1], [2, 3], [4], [5], [6], [7], [8], [9, 10, 11], [12], [13], [14, 15]],
             [[0, 1]],
         ]
         self.span_index_tgt = [
             [(0, 4), (6, 8)],
-            [(0, 1), (3, 6)],
+            [(0, 1), (3, 7)],
             [(0, 4), (8, 16), (8, 12), (9, 16)],
             [(0, 2)],
         ]