Return tokens in roberta sequence labeling

hezarai · Aug 16, 2023 · 5a08577
1 parent 330a91a
commit 5a08577
Showing 1 changed file with 10 additions and 1 deletion.
diff --git a/hezar/models/sequence_labeling/roberta/roberta_sequence_labeling.py b/hezar/models/sequence_labeling/roberta/roberta_sequence_labeling.py
@@ -74,7 +74,16 @@ def preprocess(self, inputs: Union[str, List[str]], **kwargs):
             normalizer = self.preprocessor["text_normalizer"]
             inputs = normalizer(inputs)
         tokenizer = self.preprocessor[self.tokenizer_name]
-        inputs = tokenizer(inputs, return_tensors="pt", device=self.device)
+        inputs = tokenizer(
+            inputs,
+            return_word_ids=True,
+            return_tokens=True,
+            return_offsets_mapping=True,
+            padding=True,
+            truncation=True,
+            return_tensors="pt",
+            device=self.device,
+        )
         return inputs
 
     def post_process(self, inputs, **kwargs):