guillaume-be · guillaume-be · Oct 1, 2023 · Sep 30, 2023 · Sep 30, 2023 · Sep 30, 2023
diff --git a/main/Cargo.lock b/main/Cargo.lock
diff --git a/main/Cargo.toml b/main/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "rust_tokenizers"
-version = "8.1.0"
+version = "8.1.1"
 authors = ["Guillaume Becquin <guillaume.becquin@gmail.com>"]
 edition = "2018"
 description = "High performance tokenizers for Rust"

diff --git a/main/src/tokenizer/roberta_tokenizer.rs b/main/src/tokenizer/roberta_tokenizer.rs
@@ -292,7 +292,8 @@ impl Tokenizer<RobertaVocab> for RobertaTokenizer {
             special_tokens_mask.extend(vec![0; length]);
             special_tokens_mask.push(1);
             token_segment_ids.push(0);
-            token_segment_ids.extend(vec![1; length + 1]);
+            // RoBERTa does not use segment ids; the entire sequence is set to zeros.
+            token_segment_ids.extend(vec![0; length + 1]);
             output.push(self.vocab.token_to_id(self.vocab.get_sep_value()));
             output.extend(tokens_ids_with_offsets_2_value.ids);
             output.push(self.vocab.token_to_id(self.vocab.get_sep_value()));

diff --git a/python-bindings/tests/test_tokenization_sst2.py b/python-bindings/tests/test_tokenization_sst2.py
@@ -123,7 +123,7 @@ def test_tokenization_distilbert(self):
 
     def test_tokenization_ctrl(self):
         # Given
-        self.base_tokenizer = CTRLTokenizer.from_pretrained('ctrl',
+        self.base_tokenizer = CTRLTokenizer.from_pretrained('Salesforce/ctrl',
                                                             do_lower_case=True,
                                                             cache_dir=self.test_dir)
         self.rust_tokenizer = PyCtrlTokenizer(