
Add unigram bytefallback #1217

Merged 52 commits on Jun 26, 2023

Commits
044fb41
current updates will go red
ArthurZucker Apr 12, 2023
f26b0b7
cargo fmt
ArthurZucker Apr 14, 2023
f8c6c47
npm install
ArthurZucker Apr 14, 2023
ac7529a
Merge branch 'main' of https://github.com/huggingface/tokenizers into…
ArthurZucker May 30, 2023
ce61a40
refactor train for unigram to allow bytefallbakc (breaking)
ArthurZucker Jun 6, 2023
b327540
fmt
ArthurZucker Jun 6, 2023
5e13667
nits
ArthurZucker Jun 6, 2023
e9e42e8
Merge branch 'main' of https://github.com/huggingface/tokenizers into…
ArthurZucker Jun 20, 2023
92b6490
update
ArthurZucker Jun 20, 2023
0fef053
add a proper test
ArthurZucker Jun 20, 2023
dfd36ff
fix encode optimised fallback + add trainer arg
ArthurZucker Jun 21, 2023
c72eac1
fixes
ArthurZucker Jun 21, 2023
3323956
fixes
ArthurZucker Jun 21, 2023
03b5be5
fix tests
ArthurZucker Jun 21, 2023
c97fd26
add test
ArthurZucker Jun 21, 2023
4375f07
fmt
ArthurZucker Jun 21, 2023
cc2f12d
fix rust test
ArthurZucker Jun 21, 2023
00e3a3d
update python bindings
ArthurZucker Jun 21, 2023
8834796
update
ArthurZucker Jun 21, 2023
005698a
pub is okay and needed
ArthurZucker Jun 21, 2023
474d31e
more fix
ArthurZucker Jun 21, 2023
c2881b2
cleanup
ArthurZucker Jun 21, 2023
ad6f524
remove useles id
ArthurZucker Jun 21, 2023
29601fb
MissingUnkId error
ArthurZucker Jun 21, 2023
739790d
nits
ArthurZucker Jun 21, 2023
fa5b6a6
fix offset
ArthurZucker Jun 21, 2023
0684008
add a test in python
ArthurZucker Jun 21, 2023
d5f37bd
update src bindings
ArthurZucker Jun 21, 2023
263376f
remove bytefallback from trainer
ArthurZucker Jun 21, 2023
60c2dc0
styling
ArthurZucker Jun 21, 2023
e01fbae
update pckg
ArthurZucker Jun 21, 2023
112d423
lint
ArthurZucker Jun 21, 2023
9320717
fmt
ArthurZucker Jun 21, 2023
6476410
stup with dev
ArthurZucker Jun 21, 2023
dc7cced
update code based on review
ArthurZucker Jun 22, 2023
41f2a7e
remove unused function
ArthurZucker Jun 22, 2023
1cd282b
udpate python test to compare ids
ArthurZucker Jun 22, 2023
949bcb4
fix option bool issues
ArthurZucker Jun 22, 2023
03aacd8
final fix
ArthurZucker Jun 22, 2023
e44b03a
clippy
ArthurZucker Jun 22, 2023
ea02db0
fix npm isntall
ArthurZucker Jun 22, 2023
5ad1d63
update
ArthurZucker Jun 22, 2023
8bf84cf
update test
ArthurZucker Jun 22, 2023
6c3ea53
more in depth testing
ArthurZucker Jun 22, 2023
a04408c
Lint
ArthurZucker Jun 22, 2023
7fc68a3
last attempt to fix node
ArthurZucker Jun 22, 2023
e1b7a33
update node bindings
ArthurZucker Jun 22, 2023
16f9619
fmt
ArthurZucker Jun 22, 2023
003d284
Update tokenizers/src/models/unigram/model.rs
ArthurZucker Jun 23, 2023
2451f8c
update based on review
ArthurZucker Jun 26, 2023
4b90431
simpler test
ArthurZucker Jun 26, 2023
58912da
lint
ArthurZucker Jun 26, 2023
5 changes: 5 additions & 0 deletions bindings/node/lib/bindings/models.d.ts
@@ -170,6 +170,11 @@ export interface UnigramOptions {
* @default undefined
*/
unkId?: number;
/**
* Whether or not bytefallback support should be enabled.
* @default false
*/
byte_fallback?: boolean;
}

export namespace Unigram {
1 change: 1 addition & 0 deletions bindings/node/lib/bindings/models.test.ts
@@ -124,6 +124,7 @@ describe("Unigram", () => {
],
{
unkId: 0,
byte_fallback: false,
}
);
expect(unigram.constructor.name).toEqual("Model");
7 changes: 4 additions & 3 deletions bindings/node/native/src/models.rs
@@ -191,6 +191,7 @@ fn bpe_init(mut cx: FunctionContext) -> JsResult<JsModel> {
/// unkToken?: string,
/// continuingSubwordPrefix?: string,
/// endOfWordSuffix?: string
/// byteFallback?: bool
/// }, callback)
fn bpe_from_file(mut cx: FunctionContext) -> JsResult<JsUndefined> {
let (options, callback) = match cx.extract_opt::<BpeOptions>(2) {
@@ -369,16 +370,16 @@ fn wordlevel_empty(mut cx: FunctionContext) -> JsResult<JsModel> {
#[serde(rename_all = "camelCase")]
struct UnigramOptions {
unk_id: Option<usize>,
byte_fallback: Option<bool>,
}

/// unigram_init(vocab: [string, number][], options?: {
/// unkId?: number
/// })
fn unigram_init(mut cx: FunctionContext) -> JsResult<JsModel> {
let vocab = cx.extract::<Vec<(String, f64)>>(0)?;
let options = cx.extract_opt::<UnigramOptions>(1)?.unwrap_or_default();

let unigram = tk::models::unigram::Unigram::from(vocab, options.unk_id)
let byte_fallback = options.byte_fallback.unwrap_or(false);
let unigram = tk::models::unigram::Unigram::from(vocab, options.unk_id, byte_fallback)
.map_err(|e| Error(e.to_string()))?;

let mut js_model = JsModel::new::<_, JsModel, _>(&mut cx, vec![])?;
13,594 changes: 7,108 additions & 6,486 deletions bindings/node/package-lock.json

Large diffs are not rendered by default.

4 changes: 3 additions & 1 deletion bindings/node/package.json
@@ -16,7 +16,9 @@
"license": "Apache-2.0",
"dependencies": {
"@types/node": "^13.13.52",
"node-pre-gyp": "^0.14.0"
"native": "^0.3.3",
"node-pre-gyp": "^0.14.0",
"package.json": "^2.0.1"
},
"devDependencies": {
"@types/jest": "^26.0.24",
@@ -162,6 +162,7 @@ def from_spm(filename: str):
vocab = [(piece.piece, piece.score) for piece in m.pieces]
unk_id = m.trainer_spec.unk_id
model_type = m.trainer_spec.model_type
byte_fallback = m.trainer_spec.byte_fallback
if model_type != 1:
raise Exception(
"You're trying to run a `Unigram` model but you're file was trained with a different algorithm"
@@ -170,7 +171,7 @@
replacement = "▁"
add_prefix_space = True

tokenizer = Tokenizer(Unigram(vocab, unk_id))
tokenizer = Tokenizer(Unigram(vocab, unk_id, byte_fallback))

tokenizer.normalizer = normalizers.Sequence(
[
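As context for the conversion change above, here is a minimal sketch of the resulting constructor call. The vocabulary, score values, and flag below are made up for illustration; in `from_spm` they come from the SentencePiece model's `pieces` and `trainer_spec`:

```python
from tokenizers import Tokenizer
from tokenizers.models import Unigram

# Hypothetical stand-ins for values read from the .model file in from_spm.
vocab = [("<unk>", 0.0), ("▁", -1.0), ("hello", -2.0)]
unk_id = 0
byte_fallback = True  # m.trainer_spec.byte_fallback in the real code path

# The Unigram constructor now forwards the byte_fallback flag.
tokenizer = Tokenizer(Unigram(vocab, unk_id, byte_fallback))
```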
4 changes: 2 additions & 2 deletions bindings/python/py_src/tokenizers/models/__init__.pyi
@@ -242,11 +242,11 @@ class Unigram(Model):
An implementation of the Unigram algorithm

Args:
vocab (:obj:`List[Tuple[str, float]]`, `optional`):
vocab (:obj:`List[Tuple[str, float]]`, `optional`, `optional`):
A list of vocabulary items and their relative score [("am", -0.2442),...]
"""

def __init__(self, vocab):
def __init__(self, vocab, unk_id, byte_fallback):
pass
def get_trainer(self):
"""
26 changes: 17 additions & 9 deletions bindings/python/src/models.rs
@@ -804,24 +804,32 @@ impl PyWordLevel {
/// An implementation of the Unigram algorithm
///
/// Args:
/// vocab (:obj:`List[Tuple[str, float]]`, `optional`):
/// vocab (:obj:`List[Tuple[str, float]]`, `optional`, `optional`):
/// A list of vocabulary items and their relative score [("am", -0.2442),...]
#[pyclass(extends=PyModel, module = "tokenizers.models", name = "Unigram")]
#[pyo3(text_signature = "(self, vocab)")]
#[pyo3(text_signature = "(self, vocab, unk_id, byte_fallback)")]
pub struct PyUnigram {}

#[pymethods]
impl PyUnigram {
#[new]
fn new(vocab: Option<Vec<(String, f64)>>, unk_id: Option<usize>) -> PyResult<(Self, PyModel)> {
match (vocab, unk_id) {
(Some(vocab), unk_id) => {
let model = Unigram::from(vocab, unk_id).map_err(|e| {
exceptions::PyException::new_err(format!("Error while loading Unigram: {}", e))
})?;
fn new(
vocab: Option<Vec<(String, f64)>>,
unk_id: Option<usize>,
byte_fallback: Option<bool>,
) -> PyResult<(Self, PyModel)> {
match (vocab, unk_id, byte_fallback) {
(Some(vocab), unk_id, byte_fallback) => {
let model =
Unigram::from(vocab, unk_id, byte_fallback.unwrap_or(false)).map_err(|e| {
exceptions::PyException::new_err(format!(
"Error while loading Unigram: {}",
e
))
})?;
Ok((PyUnigram {}, model.into()))
}
(None, None) => Ok((PyUnigram {}, Unigram::default().into())),
(None, None, _) => Ok((PyUnigram {}, Unigram::default().into())),
_ => Err(exceptions::PyValueError::new_err(
"`vocab` and `unk_id` must be both specified",
)),
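A short sketch of how the three match arms above surface on the Python side; the behavior is inferred from the diff alone, so treat it as illustrative:

```python
from tokenizers.models import Unigram

# First arm: vocab given, unk_id and byte_fallback optional
# (byte_fallback falls back to False when omitted, via unwrap_or(false)).
model = Unigram([("<unk>", 0.0), ("a", -0.5)], 0, True)

# Second arm: nothing given at all uses Unigram::default(),
# a one-piece <unk> vocabulary.
default_model = Unigram()

# Final arm: an unk_id without a vocab raises a ValueError.
try:
    Unigram(None, 0)
except ValueError as err:
    print(err)  # `vocab` and `unk_id` must be both specified
```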
28 changes: 27 additions & 1 deletion bindings/python/tests/bindings/test_tokenizer.py
@@ -5,7 +5,7 @@

from tokenizers import AddedToken, Encoding, Tokenizer
from tokenizers.implementations import BertWordPieceTokenizer
from tokenizers.models import BPE, Model, WordPiece
from tokenizers.models import BPE, Model, WordPiece, Unigram
from tokenizers.normalizers import Lowercase
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.processors import BertProcessing, RobertaProcessing
@@ -412,3 +412,29 @@ def test_from_pretrained_revision(self):
tokenizer = Tokenizer.from_pretrained("anthony/tokenizers-test", revision="gpt-2")
output = tokenizer.encode("Hey there dear friend!", add_special_tokens=False)
assert output.tokens == ["Hey", "Ġthere", "Ġdear", "Ġfriend", "!"]

def test_unigram_byte_fallback(self):
vocab = [
("<unk>", 0.0),
("A", -0.01),
("sen", -0.02),
("te", -0.03),
("n", -0.04),
("ce", -0.05),
("<0xF0>", -0.06),
("<0x9F>", -0.06),
("<0xA4>", -0.06),
("<0x97>", -0.06),
(" ", -0.4),
]
tokenizer = Tokenizer(Unigram(vocab, 0, byte_fallback=False))

output = tokenizer.encode("A sentence 🤗")
assert output.ids == [1, 10, 2, 3, 4, 5, 10, 0]
assert output.tokens == ["A", " ", "sen", "te", "n", "ce", " ", "🤗"]
Collaborator: Let's remove the tokens, please, only the ids matter.


tokenizer = Tokenizer(Unigram(vocab, 0, byte_fallback=True))

output = tokenizer.encode("A sentence 🤗")
assert output.ids == [1, 10, 2, 3, 4, 5, 10, 6, 7, 8, 9]
assert output.tokens == ["A", " ", "sen", "te", "n", "ce", " ", "<0xF0>", "<0x9F>", "<0xA4>", "<0x97>"]
Collaborator: Same here, tokens are unimportant, only ids are.

Collaborator (author): I'm keeping them to understand what's going on in terms of fallback 😉

Collaborator: Good point.
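For readers checking the asserted ids: the four byte pieces are exactly the UTF-8 encoding of the emoji, which a one-liner confirms:

```python
# U+1F917 (🤗) encodes to four UTF-8 bytes, matching the <0xF0>, <0x9F>,
# <0xA4>, <0x97> entries placed in the test vocabulary above.
print([f"<0x{b:02X}>" for b in "🤗".encode("utf-8")])
# ['<0xF0>', '<0x9F>', '<0xA4>', '<0x97>']
```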

83 changes: 71 additions & 12 deletions tokenizers/src/models/unigram/model.rs
@@ -27,6 +27,7 @@ pub struct Unigram {

fuse_unk: bool,
is_optimized: bool,
byte_fallback: bool,
}
impl PartialEq for Unigram {
fn eq(&self, other: &Self) -> bool {
@@ -50,6 +51,7 @@ impl Clone for Unigram {
eos_id: self.eos_id,
fuse_unk: self.fuse_unk,
is_optimized: self.is_optimized,
byte_fallback: self.byte_fallback,
}
}
}
@@ -59,6 +61,7 @@ impl std::fmt::Debug for Unigram {
fmt.debug_struct("Unigram")
.field("vocab", &self.vocab.len())
.field("unk_id", &self.unk_id)
.field("byte_fallback", &self.byte_fallback)
.finish()
}
}
@@ -78,7 +81,7 @@ pub enum UnigramError {
impl Default for Unigram {
fn default() -> Self {
let vocab = vec![("<unk>".to_string(), 0.0)];
Self::from(vocab, Some(0)).unwrap()
Self::from(vocab, Some(0), false).unwrap()
}
}

@@ -89,7 +92,11 @@ impl Unigram {
/// unk_id, is the index within the vocabulary.
/// For now `Unigram` *requires* at least `unk` because we might find a never seen char.
/// Further versions might allow that part to be hidden.
pub fn from(vocab: Vec<(String, f64)>, unk_id: Option<usize>) -> Result<Self> {
pub fn from(
vocab: Vec<(String, f64)>,
unk_id: Option<usize>,
byte_fallback: bool,
) -> Result<Self> {
let n = vocab.len();
let mut token_to_ids: TokenMap = HashMap::new();
let mut builder = TrieBuilder::default();
@@ -102,7 +109,6 @@ impl Unigram {
return Err(Box::new(UnigramError::UnkIdNotInVocabulary));
}
}

let bos_id = n + 1;
let eos_id = n + 2;

@@ -130,6 +136,7 @@
fuse_unk,
cache: Cache::default(),
is_optimized,
byte_fallback,
})
}

@@ -143,7 +150,9 @@ impl Unigram {
pub(super) fn set_optimized(&mut self, is_optimized: bool) {
self.is_optimized = is_optimized;
}

pub fn byte_fallback(&self) -> bool {
self.byte_fallback
}
pub(super) fn len(&self) -> usize {
self.vocab.len()
}
@@ -205,7 +214,7 @@
/// ("abc".to_string(), 5.0),
/// ("abcd".to_string(), 10.0),
/// ];
/// let model = Unigram::from(pieces, Some(0)).unwrap();
/// let model = Unigram::from(pieces, Some(0), false).unwrap();
/// let result = model.encode("abcdacdxx").unwrap();
/// assert_eq!(result, vec!["abcd", "a", "cd", "xx"]);
/// ```
@@ -407,12 +416,31 @@ impl Model for Unigram {
let mut offset = 0;
let mut tokens = Vec::with_capacity(str_tokens.len());
for string in str_tokens {
let len = string.len();
let offsets = (offset, offset + len);
let id: u32 = match self.token_to_ids.get(&string) {
Some(id) => *id,
None => self.unk_id.ok_or(UnigramError::MissingUnkId)? as u32,
None => {
if self.byte_fallback {
let byte_tokens: Option<Vec<_>> = string
.bytes()
.map(|byte| -> Option<Token> {
let byte_string = format!("<0x{:02X}>", byte);
let id = self.token_to_ids.get(&byte_string);
id.map(|id| Token::new(*id, byte_string, (offset, offset + len)))
})
.collect();
if let Some(byte_tokens) = byte_tokens {
for token in byte_tokens {
tokens.push(token);
}
offset += len;
continue;
}
}
self.unk_id.ok_or(UnigramError::MissingUnkId)? as u32
}
};
let len = string.len();
let offsets = (offset, offset + len);
offset += len;
tokens.push(Token::new(id, string, offsets));
}
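A Python rendering of the fallback branch above, as a rough sketch; the names are illustrative, with `token_to_ids` standing in for the model's vocabulary map, and the all-or-nothing check mirroring the `Option<Vec<_>>` collect in the Rust code:

```python
def fallback_or_unk(string: str, token_to_ids: dict, unk_id: int) -> list:
    """Try one <0xXX> piece per UTF-8 byte of an unknown token; emit them
    only if every byte has a piece in the vocabulary, else a single unk."""
    byte_ids = [token_to_ids.get(f"<0x{b:02X}>") for b in string.encode("utf-8")]
    if all(i is not None for i in byte_ids):
        return byte_ids
    return [unk_id]
```

Note that every byte token inherits the offsets of the whole unknown string, `(offset, offset + len)`, which is why the Rust test further down asserts `(0, 2)` for both pieces of `é`.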
@@ -452,7 +480,7 @@ mod tests {
#[test]
fn test_populate_nodes_unk() {
let pieces = vec![("<unk>".to_string(), 0.0)];
let model = Unigram::from(pieces, Some(0)).unwrap();
let model = Unigram::from(pieces, Some(0), false).unwrap();

let mut lattice = Lattice::from("abc", model.bos_id, model.eos_id);
model.populate_nodes(&mut lattice);
@@ -477,7 +505,7 @@
("ab".to_string(), 0.3),
("bc".to_string(), 0.4),
];
let model = Unigram::from(pieces, Some(0)).unwrap();
let model = Unigram::from(pieces, Some(0), false).unwrap();

let mut lattice = Lattice::from("abc", model.bos_id, model.eos_id);
model.populate_nodes(&mut lattice);
@@ -514,7 +542,7 @@
("abcd".to_string(), 10.0),
];

let model = Unigram::from(sentencepieces, Some(0)).unwrap();
let model = Unigram::from(sentencepieces, Some(0), false).unwrap();
let result = model.encode("abcd").unwrap();
assert_eq!(result, vec!["abcd"]);
}
@@ -536,7 +564,7 @@
("qr".to_string(), -0.5),
];

let mut model = Unigram::from(sentencepieces, Some(0)).unwrap();
let mut model = Unigram::from(sentencepieces, Some(0), false).unwrap();

for is_optimized in &[true, false] {
model.set_optimized(*is_optimized);
@@ -573,4 +601,35 @@
assert_eq!(model.encode("abqrcd").unwrap(), vec!["ab", "q", "r", "cd"]);
}
}

#[test]
fn test_unigram_bytefallback() {
// In [97]: processor.encode_as_pieces("⅐⅛⅑ ")
// Out[97]: ['▁', '<0xE2>', '<0x85>', '<0x90>', '⅛', '<0xE2>', '<0x85>', '<0x91>', '▁']
let sentencepieces = vec![
("<unk>".to_string(), 0.0),
("<0xC3>".to_string(), -0.01),
("<0xA9>".to_string(), -0.03),
];
let unigram = Unigram::from(sentencepieces, Some(0), true).unwrap();
let tokens: Vec<Token> = unigram.tokenize("é").unwrap();
assert_eq!(
tokens,
[
Token {
id: 1,
value: "<0xC3>".to_string(),
offsets: (0, 2)
},
Token {
id: 2,
value: "<0xA9>".to_string(),
offsets: (0, 2)
}
]
);

let tokens = unigram.tokenize("?é").unwrap();
assert_eq!(tokens[0].id, 0);
}
}
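One detail of the final assertion worth spelling out, under the three-piece vocabulary defined in this test: `?` maps to id 0 not because byte fallback is off, but because its byte piece `<0x3F>` is absent, so the all-or-nothing check falls back to `<unk>`. A quick sketch:

```python
vocab = {"<unk>": 0, "<0xC3>": 1, "<0xA9>": 2}

for ch in ["?", "é"]:
    pieces = [f"<0x{b:02X}>" for b in ch.encode("utf-8")]
    covered = all(p in vocab for p in pieces)
    print(ch, pieces, "-> byte pieces" if covered else "-> <unk>")
# ? ['<0x3F>'] -> <unk>
# é ['<0xC3>', '<0xA9>'] -> byte pieces
```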