[RoFormer] Fix some issues #12397

Merged · 18 commits · Jul 6, 2021

Changes from 13 commits
2 changes: 1 addition & 1 deletion docs/source/model_doc/roformer.rst
@@ -56,7 +56,7 @@ RoFormerTokenizer
create_token_type_ids_from_sequences, save_vocabulary


RobertaTokenizerFast
RoFormerTokenizerFast
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.RoFormerTokenizerFast
22 changes: 11 additions & 11 deletions examples/tensorflow/question-answering/utils_qa.py
@@ -38,7 +38,7 @@ def postprocess_qa_predictions(
null_score_diff_threshold: float = 0.0,
output_dir: Optional[str] = None,
prefix: Optional[str] = None,
is_world_process_zero: bool = True,
log_level: Optional[int] = logging.WARNING,
NielsRogge (Contributor), Jul 1, 2021:

Is there a reason you updated this? Same question for the lines below

I don't think this RoFormer PR needs to update the utils_qa.py file of the TensorFlow examples.

Contributor (PR author):

I forked transformers before 276bc14, and that PR did not run the `fix-copies` command. When I ran `fix-copies`, this file (examples/tensorflow/question-answering/utils_qa.py) was updated as well.

):
"""
Post-processes the predictions of a question-answering model to convert them to answers that are substrings of the
@@ -70,8 +70,8 @@ def postprocess_qa_predictions(
answers, are saved in `output_dir`.
prefix (:obj:`str`, `optional`):
If provided, the dictionaries mentioned above are saved with `prefix` added to their names.
is_world_process_zero (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether this process is the main process or not (used to determine if logging/saves should be done).
log_level (:obj:`int`, `optional`, defaults to ``logging.WARNING``):
``logging`` log level (e.g., ``logging.WARNING``)
Contributor: Same comment here.

"""
assert len(predictions) == 2, "`predictions` should be a tuple with two elements (start_logits, end_logits)."
all_start_logits, all_end_logits = predictions
@@ -91,7 +91,7 @@ def postprocess_qa_predictions(
scores_diff_json = collections.OrderedDict()

# Logging.
logger.setLevel(logging.INFO if is_world_process_zero else logging.WARN)
logger.setLevel(log_level)
Contributor: Same comment here.

logger.info(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")

# Let's loop over all the examples!
@@ -250,7 +250,7 @@ def postprocess_qa_predictions_with_beam_search(
end_n_top: int = 5,
output_dir: Optional[str] = None,
prefix: Optional[str] = None,
is_world_process_zero: bool = True,
log_level: Optional[int] = logging.WARNING,
Contributor: Same comment here.

):
"""
Post-processes the predictions of a question-answering model with beam search to convert them to answers that are substrings of the
@@ -280,8 +280,8 @@ def postprocess_qa_predictions_with_beam_search(
answers, are saved in `output_dir`.
prefix (:obj:`str`, `optional`):
If provided, the dictionaries mentioned above are saved with `prefix` added to their names.
is_world_process_zero (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether this process is the main process or not (used to determine if logging/saves should be done).
log_level (:obj:`int`, `optional`, defaults to ``logging.WARNING``):
``logging`` log level (e.g., ``logging.WARNING``)
Contributor: Same comment here.

"""
assert len(predictions) == 5, "`predictions` should be a tuple with five elements."
start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits = predictions
@@ -302,7 +302,7 @@ def postprocess_qa_predictions_with_beam_search(
scores_diff_json = collections.OrderedDict() if version_2_with_negative else None

# Logging.
logger.setLevel(logging.INFO if is_world_process_zero else logging.WARN)
logger.setLevel(log_level)
Contributor: Same comment here.

logger.info(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")

# Let's loop over all the examples!
@@ -413,14 +413,14 @@ def postprocess_qa_predictions_with_beam_search(
output_dir, "null_odds.json" if prefix is None else f"{prefix}_null_odds.json"
)

print(f"Saving predictions to {prediction_file}.")
logger.info(f"Saving predictions to {prediction_file}.")
with open(prediction_file, "w") as writer:
writer.write(json.dumps(all_predictions, indent=4) + "\n")
print(f"Saving nbest_preds to {nbest_file}.")
logger.info(f"Saving nbest_preds to {nbest_file}.")
with open(nbest_file, "w") as writer:
writer.write(json.dumps(all_nbest_json, indent=4) + "\n")
if version_2_with_negative:
print(f"Saving null_odds to {null_odds_file}.")
logger.info(f"Saving null_odds to {null_odds_file}.")
Contributor: Same comment here.

with open(null_odds_file, "w") as writer:
writer.write(json.dumps(scores_diff_json, indent=4) + "\n")

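For context, the new log_level argument lets each caller pass a logging verbosity directly instead of a boolean is_world_process_zero flag. Below is a minimal, self-contained sketch of that pattern; the function and messages are illustrative only, not the actual utils_qa.py code.

import logging

logger = logging.getLogger(__name__)
logging.basicConfig()

def postprocess(log_level: int = logging.WARNING) -> None:
    # The caller decides how verbose this helper is: the main process typically
    # passes logging.INFO while other ranks keep logging.WARNING, so progress
    # messages are printed only once.
    logger.setLevel(log_level)
    logger.info("Post-processing predictions ...")

postprocess(log_level=logging.INFO)      # main process: message is printed
postprocess(log_level=logging.WARNING)   # other processes: message is suppressed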
4 changes: 3 additions & 1 deletion src/transformers/models/auto/tokenization_auto.py
@@ -196,6 +196,7 @@
from ..reformer.tokenization_reformer_fast import ReformerTokenizerFast
from ..retribert.tokenization_retribert_fast import RetriBertTokenizerFast
from ..roberta.tokenization_roberta_fast import RobertaTokenizerFast
from ..roformer.tokenization_roformer_fast import RoFormerTokenizerFast
from ..squeezebert.tokenization_squeezebert_fast import SqueezeBertTokenizerFast
from ..t5.tokenization_t5_fast import T5TokenizerFast
from ..xlm_roberta.tokenization_xlm_roberta_fast import XLMRobertaTokenizerFast
@@ -230,6 +231,7 @@
ReformerTokenizerFast = None
RetriBertTokenizerFast = None
RobertaTokenizerFast = None
RoFormerTokenizerFast = None
SqueezeBertTokenizerFast = None
T5TokenizerFast = None
XLMRobertaTokenizerFast = None
@@ -243,7 +245,7 @@
TOKENIZER_MAPPING = OrderedDict(
[
(RetriBertConfig, (RetriBertTokenizer, RetriBertTokenizerFast)),
(RoFormerConfig, (RoFormerTokenizer, None)),
(RoFormerConfig, (RoFormerTokenizer, RoFormerTokenizerFast)),
(T5Config, (T5Tokenizer, T5TokenizerFast)),
(MT5Config, (MT5Tokenizer, MT5TokenizerFast)),
(MobileBertConfig, (MobileBertTokenizer, MobileBertTokenizerFast)),
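With RoFormerTokenizerFast registered in TOKENIZER_MAPPING, AutoTokenizer can now resolve the fast tokenizer for RoFormer checkpoints. A short usage sketch, assuming the patched library is installed and the checkpoint is available on the Hub:

from transformers import AutoTokenizer

# After this change, AutoTokenizer can return the fast tokenizer for RoFormer.
tokenizer = AutoTokenizer.from_pretrained("junnyu/roformer_chinese_base", use_fast=True)
print(type(tokenizer).__name__)  # expected: RoFormerTokenizerFast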
12 changes: 8 additions & 4 deletions src/transformers/models/roformer/configuration_roformer.py
@@ -22,7 +22,11 @@

ROFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"junnyu/roformer_chinese_small": "https://huggingface.co/junnyu/roformer_chinese_small/resolve/main/config.json",
"junnyu/roformer_chinese_base": "https://huggingface.co/junnyu/roformer_chinese_base/resolve/main/config.json"
"junnyu/roformer_chinese_base": "https://huggingface.co/junnyu/roformer_chinese_base/resolve/main/config.json",
"junnyu/roformer_chinese_char_small": "https://huggingface.co/junnyu/roformer_chinese_char_small/resolve/main/config.json",
"junnyu/roformer_chinese_char_base": "https://huggingface.co/junnyu/roformer_chinese_char_base/resolve/main/config.json",
"junnyu/roformer_small_discriminator": "https://huggingface.co/junnyu/roformer_small_discriminator/resolve/main/config.json",
"junnyu/roformer_small_generator": "https://huggingface.co/junnyu/roformer_small_generator/resolve/main/config.json",
# See all RoFormer models at https://huggingface.co/models?filter=roformer
}

@@ -43,7 +47,7 @@ class RoFormerConfig(PretrainedConfig):
Vocabulary size of the RoFormer model. Defines the number of different tokens that can be represented by
the :obj:`inputs_ids` passed when calling :class:`~transformers.RoFormerModel` or
:class:`~transformers.TFRoFormerModel`.
embedding_size (:obj:`int`, `optional`, defaults to 768):
embedding_size (:obj:`int`, `optional`, defaults to None):
Dimensionality of the encoder layers and the pooler layer.
JunnYu marked this conversation as resolved.
hidden_size (:obj:`int`, `optional`, defaults to 768):
Dimension of the encoder layers and the pooler layer.
@@ -96,7 +100,7 @@ class RoFormerConfig(PretrainedConfig):
def __init__(
self,
vocab_size=50000,
embedding_size=768,
embedding_size=None,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
Expand All @@ -117,7 +121,7 @@ def __init__(
super().__init__(pad_token_id=pad_token_id, **kwargs)

self.vocab_size = vocab_size
self.embedding_size = embedding_size
self.embedding_size = hidden_size if embedding_size is None else embedding_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
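The practical effect of the new default is that embedding_size falls back to hidden_size unless set explicitly. A small sketch of the resulting behavior, using the patched config and illustrative values:

from transformers import RoFormerConfig

# Leaving embedding_size unset makes it default to hidden_size ...
config = RoFormerConfig(hidden_size=384)
assert config.embedding_size == 384

# ... while an explicit value is still honored.
config_small = RoFormerConfig(hidden_size=256, embedding_size=128)
assert config_small.embedding_size == 128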
10 changes: 7 additions & 3 deletions src/transformers/models/roformer/modeling_roformer.py
@@ -60,7 +60,11 @@

ROFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
"junnyu/roformer_chinese_small",
"junnyu/roformer_chinese_base"
"junnyu/roformer_chinese_base",
"junnyu/roformer_chinese_char_small",
"junnyu/roformer_chinese_char_base",
"junnyu/roformer_small_discriminator",
"junnyu/roformer_small_generator"
# See all RoFormer models at https://huggingface.co/models?filter=roformer
]

@@ -327,9 +331,9 @@ def apply_rotary_position_embeddings(sinusoidal_pos, query_layer, key_layer, val
# cos [batch_size, num_heads, sequence_length, embed_size_per_head//2]
sin, cos = sinusoidal_pos.chunk(2, dim=-1)
# sin [θ0,θ1,θ2......θd/2-1] -> sin_pos [θ0,θ0,θ1,θ1,θ2,θ2......θd/2-1,θd/2-1]
sin_pos = torch.repeat_interleave(sin, 2, dim=-1)
sin_pos = torch.stack([sin, sin], dim=-1).reshape_as(sinusoidal_pos)
# cos [θ0,θ1,θ2......θd/2-1] -> cos_pos [θ0,θ0,θ1,θ1,θ2,θ2......θd/2-1,θd/2-1]
cos_pos = torch.repeat_interleave(cos, 2, dim=-1)
cos_pos = torch.stack([cos, cos], dim=-1).reshape_as(sinusoidal_pos)
# rotate_half_query_layer [-q1,q0,-q3,q2......,-qd-1,qd-2]
rotate_half_query_layer = torch.stack([-query_layer[..., 1::2], query_layer[..., ::2]], dim=-1).reshape_as(
query_layer
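A standalone check (not the model code itself) that the new stack + reshape_as formulation produces the same interleaved [θ0, θ0, θ1, θ1, ...] pattern as the previous torch.repeat_interleave call:

import torch

# Fake sinusoidal positions with shape (batch, heads, seq_len, head_dim).
sinusoidal_pos = torch.randn(1, 2, 8, 16)
sin, cos = sinusoidal_pos.chunk(2, dim=-1)  # each (..., head_dim // 2)

sin_pos_old = torch.repeat_interleave(sin, 2, dim=-1)
sin_pos_new = torch.stack([sin, sin], dim=-1).reshape_as(sinusoidal_pos)

# Both repeat every value twice along the last dimension.
assert torch.equal(sin_pos_old, sin_pos_new)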
6 changes: 5 additions & 1 deletion src/transformers/models/roformer/modeling_tf_roformer.py
@@ -65,7 +65,11 @@

TF_ROFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
"junnyu/roformer_chinese_small",
"junnyu/roformer_chinese_base"
"junnyu/roformer_chinese_base",
"junnyu/roformer_chinese_char_small",
"junnyu/roformer_chinese_char_base",
"junnyu/roformer_small_discriminator",
"junnyu/roformer_small_generator"
# See all RoFormer models at https://huggingface.co/models?filter=roformer
]

37 changes: 26 additions & 11 deletions src/transformers/models/roformer/tokenization_roformer.py
@@ -31,21 +31,36 @@
"vocab_file": {
"junnyu/roformer_chinese_small": "https://huggingface.co/junnyu/roformer_chinese_small/resolve/main/vocab.txt",
"junnyu/roformer_chinese_base": "https://huggingface.co/junnyu/roformer_chinese_base/resolve/main/vocab.txt",
"junnyu/roformer_chinese_char_small": "https://huggingface.co/junnyu/roformer_chinese_char_small/resolve/main/vocab.txt",
"junnyu/roformer_chinese_char_base": "https://huggingface.co/junnyu/roformer_chinese_char_base/resolve/main/vocab.txt",
"junnyu/roformer_small_discriminator": "https://huggingface.co/junnyu/roformer_small_discriminator/resolve/main/vocab.txt",
"junnyu/roformer_small_generator": "https://huggingface.co/junnyu/roformer_small_generator/resolve/main/vocab.txt",
}
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"junnyu/roformer_chinese_small": 1536, "junnyu/roformer_chinese_base": 1536}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"junnyu/roformer_chinese_small": 1536,
"junnyu/roformer_chinese_base": 1536,
"junnyu/roformer_chinese_char_small": 512,
"junnyu/roformer_chinese_char_base": 512,
"junnyu/roformer_small_discriminator": 128,
"junnyu/roformer_small_generator": 128,
}


PRETRAINED_INIT_CONFIGURATION = {
"junnyu/roformer_chinese_small": {"do_lower_case": True},
"junnyu/roformer_chinese_base": {"do_lower_case": True},
"junnyu/roformer_chinese_char_small": {"do_lower_case": True},
"junnyu/roformer_chinese_char_base": {"do_lower_case": True},
"junnyu/roformer_small_discriminator": {"do_lower_case": True},
"junnyu/roformer_small_generator": {"do_lower_case": True},
}


class RoFormerTokenizer(PreTrainedTokenizer):
r"""
Construct a RoFormer tokenizer. Based on `Rust Jieba <https://pypi.org/project/rjieba/>`.
Construct a RoFormer tokenizer. Based on `Jieba <https://pypi.org/project/jieba/>`.

This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
Users should refer to this superclass for more information regarding those methods.
@@ -143,13 +158,13 @@ def __init__(
)
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
try:
import rjieba
import jieba
except ImportError:
raise ImportError(
"You need to install rjieba to use RoFormerTokenizer."
"See https://pypi.org/project/rjieba/ for installation."
"You need to install jieba to use RoFormerTokenizer."
"See https://pypi.org/project/jieba/ for installation."
)
self.jieba = rjieba
self.jieba = jieba
NielsRogge (Contributor), Jul 2, 2021:

Is this the correct way of handling the jieba dependency @LysandreJik?

Member:
We decided to handle it this way as it's the only model that requires it - and if other models arrive, then to upstream it like it is done for the other models.


@property
def do_lower_case(self):
@@ -167,21 +182,21 @@ def __getstate__(self):
def __setstate__(self, d):
self.__dict__ = d
try:
import rjieba
import jieba
except ImportError:
raise ImportError(
"You need to install rjieba to use RoFormerTokenizer."
"See https://pypi.org/project/rjieba/ for installation."
"You need to install jieba to use RoFormerTokenizer."
"See https://pypi.org/project/jieba/ for installation."
)
self.jieba = rjieba
self.jieba = jieba
NielsRogge (Contributor), Jul 2, 2021:

If you already have the try: except block at the init of the tokenizer, is this required here?


def get_vocab(self):
return dict(self.vocab, **self.added_tokens_encoder)

def _tokenize(self, text, use_jieba=True):
split_tokens = []
if use_jieba:
for wholword in self.jieba.cut(text, False):
for wholword in self.jieba.cut(text, HMM=False):
if wholword in self.vocab:
split_tokens.append(wholword)
else:
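For readers unfamiliar with the dependency, a minimal illustration of the pre-tokenization step the slow tokenizer relies on; it assumes jieba is installed and uses an arbitrary sentence:

import jieba

# jieba.cut with HMM disabled splits Chinese text into whole words; _tokenize
# then keeps a word as-is if it is in the vocab, or hands it to WordPiece.
text = "今天天气非常好。"
print(list(jieba.cut(text, HMM=False)))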
17 changes: 16 additions & 1 deletion src/transformers/models/roformer/tokenization_roformer_fast.py
@@ -33,15 +33,30 @@
"vocab_file": {
"junnyu/roformer_chinese_small": "https://huggingface.co/junnyu/roformer_chinese_small/resolve/main/vocab.txt",
"junnyu/roformer_chinese_base": "https://huggingface.co/junnyu/roformer_chinese_base/resolve/main/vocab.txt",
"junnyu/roformer_chinese_char_small": "https://huggingface.co/junnyu/roformer_chinese_char_small/resolve/main/vocab.txt",
"junnyu/roformer_chinese_char_base": "https://huggingface.co/junnyu/roformer_chinese_char_base/resolve/main/vocab.txt",
"junnyu/roformer_small_discriminator": "https://huggingface.co/junnyu/roformer_small_discriminator/resolve/main/vocab.txt",
"junnyu/roformer_small_generator": "https://huggingface.co/junnyu/roformer_small_generator/resolve/main/vocab.txt",
}
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"junnyu/roformer_chinese_small": 1536, "junnyu/roformer_chinese_base": 1536}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"junnyu/roformer_chinese_small": 1536,
"junnyu/roformer_chinese_base": 1536,
"junnyu/roformer_chinese_char_small": 512,
"junnyu/roformer_chinese_char_base": 512,
"junnyu/roformer_small_discriminator": 128,
"junnyu/roformer_small_generator": 128,
}


PRETRAINED_INIT_CONFIGURATION = {
"junnyu/roformer_chinese_small": {"do_lower_case": True},
"junnyu/roformer_chinese_base": {"do_lower_case": True},
"junnyu/roformer_chinese_char_small": {"do_lower_case": True},
"junnyu/roformer_chinese_char_base": {"do_lower_case": True},
"junnyu/roformer_small_discriminator": {"do_lower_case": True},
"junnyu/roformer_small_generator": {"do_lower_case": True},
}


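The entries added to PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES feed the tokenizer's model_max_length. A hedged usage sketch (requires network access; the expected value follows the sizes declared above):

from transformers import RoFormerTokenizerFast

tokenizer = RoFormerTokenizerFast.from_pretrained("junnyu/roformer_chinese_char_base")
print(tokenizer.model_max_length)  # expected: 512, per the table above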
38 changes: 19 additions & 19 deletions src/transformers/models/roformer/tokenization_utils.py
@@ -29,38 +29,38 @@ def __init__(self, vocab) -> None:
lowercase=False,
)
try:
import rjieba
import jieba
except ImportError:
raise ImportError(
"You need to install rjieba to use RoFormerTokenizer."
"See https://pypi.org/project/rjieba/ for installation."
"You need to install jieba to use RoFormerTokenizer."
"See https://pypi.org/project/jieba/ for installation."
)
self.jieba = rjieba
self.jieba = jieba

def jieba_split(self, i: int, normalized_string: NormalizedString) -> List[NormalizedString]:
splits = []

# this code slice normalized_string is too slow (6s) but test_alignement_methods can pass
# for token, start, end in self.jieba.tokenize(str(normalized_string), hmm=False):
# if token in self.vocab:
# splits.append(normalized_string.slice((start, end)))
# else:
# token_list = self.normalizers.normalize_str(token).split()
# for token in token_list:
# if token:
# end = start + len(token)
# splits.append(normalized_string.slice((start, end)))
# start = end

# this code test_alignement_methods can't pass but fast (300ms)
for token in self.jieba.cut(str(normalized_string), False):
for token, start, end in self.jieba.tokenize(str(normalized_string), HMM=False):
if token in self.vocab:
splits.append(NormalizedString(token))
splits.append(normalized_string[start:end])
else:
token_list = self.normalizers.normalize_str(token).split()
for token in token_list:
if token:
splits.append(NormalizedString(token))
end = start + len(token)
splits.append(normalized_string[start:end])
start = end

# this code test_alignement_methods can't pass but fast (300ms)
# for token in self.jieba.cut(str(normalized_string), HMM=False):
# if token in self.vocab:
# splits.append(NormalizedString(token))
# else:
# token_list = self.normalizers.normalize_str(token).split()
# for token in token_list:
# if token:
# splits.append(NormalizedString(token))

return splits

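The key difference in the new jieba_split is that jieba.tokenize yields (token, start, end) offsets into the original string, so the pre-tokenizer can slice the NormalizedString and preserve character alignment, whereas jieba.cut loses offsets. A small standalone sketch of those offsets (assumes jieba is installed):

import jieba

for token, start, end in jieba.tokenize("今天天气非常好。", HMM=False):
    # Each token comes with its character span in the original string.
    print(token, start, end)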
16 changes: 6 additions & 10 deletions tests/test_tokenization_roformer.py
@@ -22,21 +22,21 @@
from .test_tokenization_common import TokenizerTesterMixin


def is_rjieba_available():
return importlib.util.find_spec("rjieba") is not None
def is_jieba_available():
return importlib.util.find_spec("jieba") is not None


def require_rjieba(test_case):
def require_jieba(test_case):
"""
Decorator marking a test that requires Jieba. These tests are skipped when Jieba isn't installed.
"""
if not is_rjieba_available():
return unittest.skip("test requires rjieba")(test_case)
if not is_jieba_available():
return unittest.skip("test requires jieba")(test_case)
else:
return test_case
Contributor:
is_jieba_available() should not be defined within the test file. Rather, it should be defined in the file_utils.py file, as can be seen here.

require_jieba should be defined in testing_utils.py instead of here. You can check how this was done for the timm library as an example here.

Contributor (PR author):
Thanks, I have changed this :)



@require_rjieba
@require_jieba
@require_tokenizers
class RoFormerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

@@ -78,7 +78,3 @@ def test_rust_tokenizer(self):
input_tokens = tokens + [tokenizer.unk_token]
exp_tokens = [22943, 21332, 34431, 45904, 117, 306, 1231, 1231, 2653, 33994, 1266, 100]
self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), exp_tokens)

# due to custom pre_tokenize , char_to_token may be error
def test_alignement_methods(self):
pass
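Following the reviewer's suggestion above, a sketch of where these helpers would live; the names follow existing transformers conventions, but the final location and implementation may differ:

import importlib.util
import unittest

# file_utils.py
def is_jieba_available() -> bool:
    return importlib.util.find_spec("jieba") is not None

# testing_utils.py
def require_jieba(test_case):
    """Decorator marking a test that requires jieba; skipped when jieba isn't installed."""
    if not is_jieba_available():
        return unittest.skip("test requires jieba")(test_case)
    return test_case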