From c86b1840bdebf43cc91359e43c2972599a5567f8 Mon Sep 17 00:00:00 2001 From: ahmetustun Date: Wed, 13 Mar 2024 20:37:05 +0000 Subject: [PATCH 1/2] tokenization tests are fixed --- .../models/auto/tokenization_auto.py | 5 +- .../models/cohere/tokenization_cohere_fast.py | 67 +++++++++++++++---- .../models/cohere/test_tokenization_cohere.py | 16 ++++- 3 files changed, 71 insertions(+), 17 deletions(-) diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index a0725452a1c4..10948d4ef7b3 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -137,10 +137,7 @@ ), ), ("codegen", ("CodeGenTokenizer", "CodeGenTokenizerFast" if is_tokenizers_available() else None)), - ( - "cohere", - (None, "CohereTokenizerFast" if is_tokenizers_available() else None), - ), + ("cohere", (None, "CohereTokenizerFast" if is_tokenizers_available() else None)), ("convbert", ("ConvBertTokenizer", "ConvBertTokenizerFast" if is_tokenizers_available() else None)), ( "cpm", diff --git a/src/transformers/models/cohere/tokenization_cohere_fast.py b/src/transformers/models/cohere/tokenization_cohere_fast.py index b066218b81b4..15278ae22ae6 100644 --- a/src/transformers/models/cohere/tokenization_cohere_fast.py +++ b/src/transformers/models/cohere/tokenization_cohere_fast.py @@ -15,13 +15,13 @@ # This file is based on the tokenization_llama_fast.py file in transformers - +import pickle from typing import Dict, List, Literal, Optional, Union from tokenizers import processors from ...pipelines.conversational import Conversation -from ...tokenization_utils_base import TensorType +from ...tokenization_utils_base import TensorType, BatchEncoding from ...tokenization_utils_fast import PreTrainedTokenizerFast from ...utils import logging from ...utils.versions import require_version @@ -30,11 +30,11 @@ require_version("tokenizers>=0.13.3") logger = logging.get_logger(__name__) -VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.json"} +VOCAB_FILES_NAMES = {"tokenizer_file": "tokenizer.json"} PRETRAINED_VOCAB_FILES_MAP = { - "vocab_file": { - "cohere-tokenizer": "https://huggingface.co/Cohere/Command-nightly/blob/main/tokenizer.json", + "tokenizer_file": { + "Cohere/Command-nightly": "https://huggingface.co/Cohere/Command-nightly/blob/main/tokenizer.json", }, } @@ -67,6 +67,14 @@ class CohereTokenizerFast(PreTrainedTokenizerFast): values of the first token and final token of an encoded sequence will not be correct). For more details, checkout [post-processors] (https://huggingface.co/docs/tokenizers/api/post-processors) documentation. + You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer, but since + the model was not pretrained this way, it might yield a decrease in performance. + + + + When used with `is_split_into_words=True`, this tokenizer needs to be instantiated with `add_prefix_space=True`. + + This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should refer to this superclass for more information regarding those methods. @@ -94,17 +102,21 @@ class CohereTokenizerFast(PreTrainedTokenizerFast): Whether or not to add an `eos_token` at the end of sequences. use_default_system_prompt (`bool`, *optional*, defaults to `False`): Whether or not the default system prompt for Cohere tokenizer should be used. - add_prefix_space (`bool`, *optional*): + add_prefix_space (`bool`, *optional*, defaults to `False`):): Whether or not the tokenizer should automatically add a prefix space """ vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP padding_side = "left" model_input_names = ["input_ids", "attention_mask"] + slow_tokenizer_class = None + # No `max_model_input_sizes` def __init__( self, vocab_file=None, + merges_file=None, tokenizer_file=None, clean_up_tokenization_spaces=False, unk_token="", @@ -113,16 +125,12 @@ def __init__( add_bos_token=True, add_eos_token=False, use_default_system_prompt=False, - add_prefix_space=None, + add_prefix_space=False, **kwargs, ): - if add_prefix_space is not None: - logger.warning_once( - "You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers but Cohere tokenizer does not have a slow tokenizer. The `add_prefix_space` argument will be ignored." - ) - super().__init__( vocab_file=vocab_file, + merges_file=merges_file, tokenizer_file=tokenizer_file, clean_up_tokenization_spaces=clean_up_tokenization_spaces, unk_token=unk_token, @@ -131,6 +139,7 @@ def __init__( add_bos_token=add_bos_token, add_eos_token=add_eos_token, use_default_system_prompt=use_default_system_prompt, + add_prefix_space=add_prefix_space, **kwargs, ) self._add_bos_token = add_bos_token @@ -141,6 +150,40 @@ def __init__( self.grounded_generation_template = kwargs.pop("grounded_generation_template", None) self.tool_use_template = kwargs.pop("tool_use_template", None) + # TODO @ArthurZucker this can only work one way for now, to update later-on. Tests should also properly + # check this as they were green before. + pre_tok_state = pickle.dumps(self.backend_tokenizer.pre_tokenizer) + decoder_state = pickle.dumps(self.backend_tokenizer.decoder) + + if add_prefix_space: + pre_tok_state = pre_tok_state.replace(b'"add_prefix_space":false', b'"add_prefix_space": true') + decoder_state = decoder_state.replace(b'"add_prefix_space":false', b'"add_prefix_space": true') + self.backend_tokenizer.pre_tokenizer = pickle.loads(pre_tok_state) + self.backend_tokenizer.decoder = pickle.loads(decoder_state) + + self.add_prefix_space = add_prefix_space + + def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding: + is_split_into_words = kwargs.get("is_split_into_words", False) + if not (self.add_prefix_space or not is_split_into_words): + raise Exception( + f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True to use it with" + " pretokenized inputs." + ) + + return super()._batch_encode_plus(*args, **kwargs) + + def _encode_plus(self, *args, **kwargs) -> BatchEncoding: + is_split_into_words = kwargs.get("is_split_into_words", False) + + if not (self.add_prefix_space or not is_split_into_words): + raise Exception( + f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True to use it with" + " pretokenized inputs." + ) + + return super()._encode_plus(*args, **kwargs) + def update_post_processor(self): """ Updates the underlying post processor with the current `bos_token` and `eos_token`. diff --git a/tests/models/cohere/test_tokenization_cohere.py b/tests/models/cohere/test_tokenization_cohere.py index 8115e1c1e850..716ff4a200fb 100644 --- a/tests/models/cohere/test_tokenization_cohere.py +++ b/tests/models/cohere/test_tokenization_cohere.py @@ -29,6 +29,7 @@ class CohereTokenizationTest(TokenizerTesterMixin, unittest.TestCase): test_rust_tokenizer = True test_slow_tokenizer = False from_pretrained_vocab_key = "tokenizer_file" + from_pretrained_id = "CohereForAI/c4ai-command-r-v01" special_tokens_map = { "bos_token": "", "eos_token": "<|END_OF_TURN_TOKEN|>", @@ -61,8 +62,9 @@ def test_encodings_from_sample_data(self): computed_tokens = tokenizer.batch_encode_plus(INPUT_SENTENCES)["input_ids"] self.assertListEqual(TARGET_TOKENS, computed_tokens) + INPUT_SENTENCES_W_BOS = ["The quick brown fox<|END_OF_TURN_TOKEN|>", "jumps over the lazy dog<|END_OF_TURN_TOKEN|>"] decoded_tokens = tokenizer.batch_decode(computed_tokens) - self.assertListEqual(decoded_tokens, INPUT_SENTENCES) + self.assertListEqual(decoded_tokens, INPUT_SENTENCES_W_BOS) def test_padding(self, max_length=10): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: @@ -119,6 +121,11 @@ def test_padding(self, max_length=10): padding="max_length", ) + def test_pretrained_model_lists(self): + # No `max_model_input_sizes` for Cohere model + self.assertGreaterEqual(len(self.tokenizer_class.pretrained_vocab_files_map), 1) + self.assertGreaterEqual(len(list(self.tokenizer_class.pretrained_vocab_files_map.values())[0]), 1) + @require_jinja def test_tokenization_for_chat(self): tokenizer = self.get_rust_tokenizer() @@ -278,3 +285,10 @@ def test_tokenization_for_grounded_generation(self): Finally, Write 'Grounded answer:' followed by a response to the user's last input in high quality natural english. Use the symbols and to indicate when a fact comes from a document in the search result, e.g my fact for a fact from document 0.<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>""" self.assertEqual(grounded_generation_prompt, expected_prompt) + + def test_add_prefix_space_fast(self): + tokenizer_w_prefix = self.get_rust_tokenizer(add_prefix_space=True) + tokenizer_wo_prefix = self.get_rust_tokenizer(add_prefix_space=False) + tokens_w_prefix = tokenizer_w_prefix.tokenize("Hey") + tokens_wo_prefix = tokenizer_wo_prefix.tokenize("Hey") + self.assertNotEqual(tokens_w_prefix, tokens_wo_prefix) \ No newline at end of file From e7567ca70a2f51269c9af75e35ce0bead7f8596e Mon Sep 17 00:00:00 2001 From: ahmetustun Date: Wed, 13 Mar 2024 20:56:42 +0000 Subject: [PATCH 2/2] format fixes --- README_de.md | 1 + README_pt-br.md | 1 + README_ru.md | 1 + README_te.md | 1 + README_vi.md | 1 + .../models/cohere/tokenization_cohere_fast.py | 9 +++++---- tests/models/cohere/test_tokenization_cohere.py | 7 +++++-- 7 files changed, 15 insertions(+), 6 deletions(-) diff --git a/README_de.md b/README_de.md index 7b9c98f94821..f00338c1939b 100644 --- a/README_de.md +++ b/README_de.md @@ -464,6 +464,7 @@ Aktuelle Anzahl der Checkpoints: ![](https://img.shields.io/endpoint?url=https:/ 1. **[Pop2Piano](https://huggingface.co/docs/transformers/model_doc/pop2piano)** released with the paper [Pop2Piano : Pop Audio-based Piano Cover Generation](https://arxiv.org/abs/2211.00895) by Jongho Choi and Kyogu Lee. 1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. 1. **[PVT](https://huggingface.co/docs/transformers/model_doc/pvt)** (from Nanjing University, The University of Hong Kong etc.) released with the paper [Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions](https://arxiv.org/pdf/2102.12122.pdf) by Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao. +1. **[PVTv2](https://huggingface.co/docs/transformers/main/model_doc/pvt_v2)** (from Shanghai AI Laboratory, Nanjing University, The University of Hong Kong etc.) released with the paper [PVT v2: Improved Baselines with Pyramid Vision Transformer](https://arxiv.org/abs/2106.13797) by Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao. 1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (from NVIDIA) released with the paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius. 1. **[Qwen2](https://huggingface.co/docs/transformers/model_doc/qwen2)** (from the Qwen team, Alibaba Group) released with the paper [Qwen Technical Report](https://arxiv.org/abs/2309.16609) by Jinze Bai, Shuai Bai, Yunfei Chu, Zeyu Cui, Kai Dang, Xiaodong Deng, Yang Fan, Wenbin Ge, Yu Han, Fei Huang, Binyuan Hui, Luo Ji, Mei Li, Junyang Lin, Runji Lin, Dayiheng Liu, Gao Liu, Chengqiang Lu, Keming Lu, Jianxin Ma, Rui Men, Xingzhang Ren, Xuancheng Ren, Chuanqi Tan, Sinan Tan, Jianhong Tu, Peng Wang, Shijie Wang, Wei Wang, Shengguang Wu, Benfeng Xu, Jin Xu, An Yang, Hao Yang, Jian Yang, Shusheng Yang, Yang Yao, Bowen Yu, Hongyi Yuan, Zheng Yuan, Jianwei Zhang, Xingxuan Zhang, Yichang Zhang, Zhenru Zhang, Chang Zhou, Jingren Zhou, Xiaohuan Zhou and Tianhang Zhu. 1. **[RAG](https://huggingface.co/docs/transformers/model_doc/rag)** (from Facebook) released with the paper [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401) by Patrick Lewis, Ethan Perez, Aleksandara Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela. diff --git a/README_pt-br.md b/README_pt-br.md index 039578faecbf..0066a8850145 100644 --- a/README_pt-br.md +++ b/README_pt-br.md @@ -473,6 +473,7 @@ Número atual de pontos de verificação: ![](https://img.shields.io/endpoint?ur 1. **[Pop2Piano](https://huggingface.co/docs/transformers/model_doc/pop2piano)** released with the paper [Pop2Piano : Pop Audio-based Piano Cover Generation](https://arxiv.org/abs/2211.00895) by Jongho Choi and Kyogu Lee. 1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. 1. **[PVT](https://huggingface.co/docs/transformers/model_doc/pvt)** (from Nanjing University, The University of Hong Kong etc.) released with the paper [Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions](https://arxiv.org/pdf/2102.12122.pdf) by Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao. +1. **[PVTv2](https://huggingface.co/docs/transformers/main/model_doc/pvt_v2)** (from Shanghai AI Laboratory, Nanjing University, The University of Hong Kong etc.) released with the paper [PVT v2: Improved Baselines with Pyramid Vision Transformer](https://arxiv.org/abs/2106.13797) by Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao. 1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (from NVIDIA) released with the paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius. 1. **[Qwen2](https://huggingface.co/docs/transformers/model_doc/qwen2)** (from the Qwen team, Alibaba Group) released with the paper [Qwen Technical Report](https://arxiv.org/abs/2309.16609) by Jinze Bai, Shuai Bai, Yunfei Chu, Zeyu Cui, Kai Dang, Xiaodong Deng, Yang Fan, Wenbin Ge, Yu Han, Fei Huang, Binyuan Hui, Luo Ji, Mei Li, Junyang Lin, Runji Lin, Dayiheng Liu, Gao Liu, Chengqiang Lu, Keming Lu, Jianxin Ma, Rui Men, Xingzhang Ren, Xuancheng Ren, Chuanqi Tan, Sinan Tan, Jianhong Tu, Peng Wang, Shijie Wang, Wei Wang, Shengguang Wu, Benfeng Xu, Jin Xu, An Yang, Hao Yang, Jian Yang, Shusheng Yang, Yang Yao, Bowen Yu, Hongyi Yuan, Zheng Yuan, Jianwei Zhang, Xingxuan Zhang, Yichang Zhang, Zhenru Zhang, Chang Zhou, Jingren Zhou, Xiaohuan Zhou and Tianhang Zhu. 1. **[RAG](https://huggingface.co/docs/transformers/model_doc/rag)** (from Facebook) released with the paper [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401) by Patrick Lewis, Ethan Perez, Aleksandara Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela. diff --git a/README_ru.md b/README_ru.md index a3770cd2e2ee..9a486bc7f5d9 100644 --- a/README_ru.md +++ b/README_ru.md @@ -463,6 +463,7 @@ conda install conda-forge::transformers 1. **[Pop2Piano](https://huggingface.co/docs/transformers/model_doc/pop2piano)** released with the paper [Pop2Piano : Pop Audio-based Piano Cover Generation](https://arxiv.org/abs/2211.00895) by Jongho Choi and Kyogu Lee. 1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. 1. **[PVT](https://huggingface.co/docs/transformers/model_doc/pvt)** (from Nanjing University, The University of Hong Kong etc.) released with the paper [Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions](https://arxiv.org/pdf/2102.12122.pdf) by Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao. +1. **[PVTv2](https://huggingface.co/docs/transformers/main/model_doc/pvt_v2)** (from Shanghai AI Laboratory, Nanjing University, The University of Hong Kong etc.) released with the paper [PVT v2: Improved Baselines with Pyramid Vision Transformer](https://arxiv.org/abs/2106.13797) by Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao. 1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (from NVIDIA) released with the paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius. 1. **[Qwen2](https://huggingface.co/docs/transformers/model_doc/qwen2)** (from the Qwen team, Alibaba Group) released with the paper [Qwen Technical Report](https://arxiv.org/abs/2309.16609) by Jinze Bai, Shuai Bai, Yunfei Chu, Zeyu Cui, Kai Dang, Xiaodong Deng, Yang Fan, Wenbin Ge, Yu Han, Fei Huang, Binyuan Hui, Luo Ji, Mei Li, Junyang Lin, Runji Lin, Dayiheng Liu, Gao Liu, Chengqiang Lu, Keming Lu, Jianxin Ma, Rui Men, Xingzhang Ren, Xuancheng Ren, Chuanqi Tan, Sinan Tan, Jianhong Tu, Peng Wang, Shijie Wang, Wei Wang, Shengguang Wu, Benfeng Xu, Jin Xu, An Yang, Hao Yang, Jian Yang, Shusheng Yang, Yang Yao, Bowen Yu, Hongyi Yuan, Zheng Yuan, Jianwei Zhang, Xingxuan Zhang, Yichang Zhang, Zhenru Zhang, Chang Zhou, Jingren Zhou, Xiaohuan Zhou and Tianhang Zhu. 1. **[RAG](https://huggingface.co/docs/transformers/model_doc/rag)** (from Facebook) released with the paper [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401) by Patrick Lewis, Ethan Perez, Aleksandara Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela. diff --git a/README_te.md b/README_te.md index 2bf9f1b8a4dd..522539713c56 100644 --- a/README_te.md +++ b/README_te.md @@ -465,6 +465,7 @@ Flax, PyTorch లేదా TensorFlow యొక్క ఇన్‌స్టా 1. **[Pop2Piano](https://huggingface.co/docs/transformers/model_doc/pop2piano)** released with the paper [Pop2Piano : Pop Audio-based Piano Cover Generation](https://arxiv.org/abs/2211.00895) by Jongho Choi and Kyogu Lee. 1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. 1. **[PVT](https://huggingface.co/docs/transformers/model_doc/pvt)** (from Nanjing University, The University of Hong Kong etc.) released with the paper [Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions](https://arxiv.org/pdf/2102.12122.pdf) by Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao. +1. **[PVTv2](https://huggingface.co/docs/transformers/main/model_doc/pvt_v2)** (from Shanghai AI Laboratory, Nanjing University, The University of Hong Kong etc.) released with the paper [PVT v2: Improved Baselines with Pyramid Vision Transformer](https://arxiv.org/abs/2106.13797) by Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao. 1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (from NVIDIA) released with the paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius. 1. **[Qwen2](https://huggingface.co/docs/transformers/model_doc/qwen2)** (from the Qwen team, Alibaba Group) released with the paper [Qwen Technical Report](https://arxiv.org/abs/2309.16609) by Jinze Bai, Shuai Bai, Yunfei Chu, Zeyu Cui, Kai Dang, Xiaodong Deng, Yang Fan, Wenbin Ge, Yu Han, Fei Huang, Binyuan Hui, Luo Ji, Mei Li, Junyang Lin, Runji Lin, Dayiheng Liu, Gao Liu, Chengqiang Lu, Keming Lu, Jianxin Ma, Rui Men, Xingzhang Ren, Xuancheng Ren, Chuanqi Tan, Sinan Tan, Jianhong Tu, Peng Wang, Shijie Wang, Wei Wang, Shengguang Wu, Benfeng Xu, Jin Xu, An Yang, Hao Yang, Jian Yang, Shusheng Yang, Yang Yao, Bowen Yu, Hongyi Yuan, Zheng Yuan, Jianwei Zhang, Xingxuan Zhang, Yichang Zhang, Zhenru Zhang, Chang Zhou, Jingren Zhou, Xiaohuan Zhou and Tianhang Zhu. 1. **[RAG](https://huggingface.co/docs/transformers/model_doc/rag)** (from Facebook) released with the paper [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401) by Patrick Lewis, Ethan Perez, Aleksandara Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela. diff --git a/README_vi.md b/README_vi.md index 635802d19ffc..293ce00ad793 100644 --- a/README_vi.md +++ b/README_vi.md @@ -464,6 +464,7 @@ Số lượng điểm kiểm tra hiện tại: ![](https://img.shields.io/endpoi 1. **[Pop2Piano](https://huggingface.co/docs/transformers/model_doc/pop2piano)** được phát hành với bài báo [Pop2Piano : Pop Audio-based Piano Cover Generation](https://arxiv.org/abs/2211.00895) by Jongho Choi and Kyogu Lee. 1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (từ Microsoft Research) được phát hành với bài báo [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. 1. **[PVT](https://huggingface.co/docs/transformers/model_doc/pvt)** (từ Nanjing University, The University of Hong Kong etc.) được phát hành với bài báo [Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions](https://arxiv.org/pdf/2102.12122.pdf) by Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao. +1. **[PVTv2](https://huggingface.co/docs/transformers/main/model_doc/pvt_v2)** (từ Shanghai AI Laboratory, Nanjing University, The University of Hong Kong etc.) được phát hành với bài báo [PVT v2: Improved Baselines with Pyramid Vision Transformer](https://arxiv.org/abs/2106.13797) by Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao. 1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (từ NVIDIA) được phát hành với bài báo [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius. 1. **[Qwen2](https://huggingface.co/docs/transformers/model_doc/qwen2)** (từ the Qwen team, Alibaba Group) được phát hành với bài báo [Qwen Technical Report](https://arxiv.org/abs/2309.16609) by Jinze Bai, Shuai Bai, Yunfei Chu, Zeyu Cui, Kai Dang, Xiaodong Deng, Yang Fan, Wenbin Ge, Yu Han, Fei Huang, Binyuan Hui, Luo Ji, Mei Li, Junyang Lin, Runji Lin, Dayiheng Liu, Gao Liu, Chengqiang Lu, Keming Lu, Jianxin Ma, Rui Men, Xingzhang Ren, Xuancheng Ren, Chuanqi Tan, Sinan Tan, Jianhong Tu, Peng Wang, Shijie Wang, Wei Wang, Shengguang Wu, Benfeng Xu, Jin Xu, An Yang, Hao Yang, Jian Yang, Shusheng Yang, Yang Yao, Bowen Yu, Hongyi Yuan, Zheng Yuan, Jianwei Zhang, Xingxuan Zhang, Yichang Zhang, Zhenru Zhang, Chang Zhou, Jingren Zhou, Xiaohuan Zhou and Tianhang Zhu. 1. **[RAG](https://huggingface.co/docs/transformers/model_doc/rag)** (từ Facebook) được phát hành với bài báo [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401) by Patrick Lewis, Ethan Perez, Aleksandara Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela. diff --git a/src/transformers/models/cohere/tokenization_cohere_fast.py b/src/transformers/models/cohere/tokenization_cohere_fast.py index 505e0d79d4e5..98379d290a69 100644 --- a/src/transformers/models/cohere/tokenization_cohere_fast.py +++ b/src/transformers/models/cohere/tokenization_cohere_fast.py @@ -83,6 +83,7 @@ class CohereTokenizerFast(PreTrainedTokenizerFast): vocab_file (`str`, *optional*): [SentencePiece](https://github.com/google/sentencepiece) file (generally has a .model extension) that contains the vocabulary necessary to instantiate a tokenizer. + merges_file (``, *optional*): tokenizer_file (`str`, *optional*): [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that contains everything needed to load the tokenizer. @@ -102,7 +103,7 @@ class CohereTokenizerFast(PreTrainedTokenizerFast): Whether or not to add an `eos_token` at the end of sequences. use_default_system_prompt (`bool`, *optional*, defaults to `False`): Whether or not the default system prompt for Cohere tokenizer should be used. - add_prefix_space (`bool`, *optional*, defaults to `False`):): + add_prefix_space (`bool`, *optional*, defaults to `False`): Whether or not the tokenizer should automatically add a prefix space """ @@ -111,7 +112,7 @@ class CohereTokenizerFast(PreTrainedTokenizerFast): padding_side = "left" model_input_names = ["input_ids", "attention_mask"] slow_tokenizer_class = None - # No `max_model_input_sizes` + # No `max_model_input_sizes` def __init__( self, @@ -326,7 +327,7 @@ def default_tool_use_template(self): "{{ param_fields.type }}" "{% endif %}" "{% endfor %}" - "{{ ') -> List[Dict]:\n \"\"\"'}}" + '{{ \') -> List[Dict]:\n """\'}}' "{{ tool.description }}" "{% if tool.parameter_definitions|length != 0 %}" "{{ '\n\n Args:\n '}}" @@ -343,7 +344,7 @@ def default_tool_use_template(self): "{{ '): ' + param_fields.description }}" "{% endfor %}" "{% endif %}" - "{{ '\n \"\"\"\n pass\n```' }}" + '{{ \'\n """\n pass\n```\' }}' "{% endfor %}" "{{ '<|END_OF_TURN_TOKEN|>'}}" "{% for message in loop_messages %}" diff --git a/tests/models/cohere/test_tokenization_cohere.py b/tests/models/cohere/test_tokenization_cohere.py index 716ff4a200fb..62e679e34fc5 100644 --- a/tests/models/cohere/test_tokenization_cohere.py +++ b/tests/models/cohere/test_tokenization_cohere.py @@ -62,7 +62,10 @@ def test_encodings_from_sample_data(self): computed_tokens = tokenizer.batch_encode_plus(INPUT_SENTENCES)["input_ids"] self.assertListEqual(TARGET_TOKENS, computed_tokens) - INPUT_SENTENCES_W_BOS = ["The quick brown fox<|END_OF_TURN_TOKEN|>", "jumps over the lazy dog<|END_OF_TURN_TOKEN|>"] + INPUT_SENTENCES_W_BOS = [ + "The quick brown fox<|END_OF_TURN_TOKEN|>", + "jumps over the lazy dog<|END_OF_TURN_TOKEN|>", + ] decoded_tokens = tokenizer.batch_decode(computed_tokens) self.assertListEqual(decoded_tokens, INPUT_SENTENCES_W_BOS) @@ -291,4 +294,4 @@ def test_add_prefix_space_fast(self): tokenizer_wo_prefix = self.get_rust_tokenizer(add_prefix_space=False) tokens_w_prefix = tokenizer_w_prefix.tokenize("Hey") tokens_wo_prefix = tokenizer_wo_prefix.tokenize("Hey") - self.assertNotEqual(tokens_w_prefix, tokens_wo_prefix) \ No newline at end of file + self.assertNotEqual(tokens_w_prefix, tokens_wo_prefix)