Add musicgen melody #3

Merged — 127 commits merged on Mar 22, 2024

Changes from all commits (127 commits)
ed74d97
DeformableDETR support bfloat16 (#29232)
DonggeunYu Mar 4, 2024
836921f
Add UDOP (#22940)
NielsRogge Mar 4, 2024
e947683
[Docs] Spanish Translation -Torchscript md & Trainer md (#29310)
njackman-2344 Mar 4, 2024
81c8191
FIX [`Generation`] Fix some issues when running the MaxLength criteri…
younesbelkada Mar 5, 2024
4fc708f
Exllama kernels support for AWQ models (#28634)
IlyasMoutawwakil Mar 5, 2024
bd891ae
Fix max length for BLIP generation (#29296)
zucchini-nlp Mar 5, 2024
ebccb09
[docs] Update starcoder2 paper link (#29418)
xenova Mar 5, 2024
fa7f3cf
[tests] enable test_pipeline_accelerate_top_p on XPU (#29309)
faaany Mar 5, 2024
1328522
[`UdopTokenizer`] Fix post merge imports (#29451)
ArthurZucker Mar 5, 2024
b434f8a
fix md bug and add md to not_tested
ylacombe Mar 5, 2024
0d52f9f
more fix
ArthurZucker Mar 5, 2024
57d007b
Revert-commit 0d52f9f582efb82a12e8d9162b43a01b1aa0200f (#29455)
ArthurZucker Mar 5, 2024
4d892b7
[`Udop imports`] Processor tests were not run. (#29456)
ArthurZucker Mar 5, 2024
87a0783
Generate: inner decoding methods are no longer public (#29437)
gante Mar 5, 2024
fb1c62e
[`Add Mamba`] Adds support for the `Mamba` models (#28094)
ArthurZucker Mar 5, 2024
8f3f8e6
Fix bug with passing capture_* args to neptune callback (#29041)
AleksanderWWW Mar 5, 2024
9c5e560
Update pytest `import_path` location (#29154)
loadams Mar 5, 2024
a69cbf4
Automatic safetensors conversion when lacking these files (#29390)
LysandreJik Mar 5, 2024
638c423
[i18n-zh] Translate add_new_pipeline.md into Chinese (#29432)
windsonsea Mar 5, 2024
7b01579
🌐 [i18n-KO] Translated generation_strategies.md to Korean (#29086)
AI4Harmony Mar 5, 2024
00bf442
[FIX] `offload_weight()` takes from 3 to 4 positional arguments but 5…
faaany Mar 6, 2024
2a002d0
[`Docs` / `Awq`] Add docs on exllamav2 + AWQ (#29474)
younesbelkada Mar 6, 2024
b27aa20
[`docs`] Add starcoder2 docs (#29454)
younesbelkada Mar 6, 2024
2890116
Fix TrainingArguments regression with torch <2.0.0 for dataloader_pre…
ringohoffman Mar 6, 2024
41f7b7a
Generate: add tests for caches with `pad_to_multiple_of` (#29462)
gante Mar 6, 2024
700d48f
Generate: get generation mode from the generation config instance 🧼 (…
gante Mar 6, 2024
0a5b051
Avoid dummy token in PLD to optimize performance (#29445)
ofirzaf Mar 6, 2024
9322576
Fix test failure on DeepSpeed (#29444)
muellerzr Mar 6, 2024
ddb4fda
Generate: torch.compile-ready generation config preparation (#29443)
gante Mar 6, 2024
19fb1e2
added the max_matching_ngram_size to GenerationConfig (#29131)
mosheber Mar 6, 2024
965cf67
Fix `TextGenerationPipeline.__call__` docstring (#29491)
alvarobartt Mar 6, 2024
2a939f2
Substantially reduce memory usage in _update_causal_mask for large ba…
nqgl Mar 6, 2024
d45f47a
Fix: Disable torch.autocast in RotaryEmbedding of Gemma and LLaMa for…
currybab Mar 6, 2024
979fccc
Enable BLIP for auto VQA (#29499)
regisss Mar 7, 2024
ffe60fd
v4.39 deprecations 🧼 (#29492)
gante Mar 7, 2024
f6133d7
Revert "Automatic safetensors conversion when lacking these files (#2…
LysandreJik Mar 7, 2024
9288e75
fix: Avoid error when fsdp_config is missing xla_fsdp_v2 (#29480)
ashokponkumar Mar 7, 2024
923733c
Flava multimodal add attention mask (#29446)
zucchini-nlp Mar 7, 2024
45c0651
Add support for metadata format MLX (#29335)
alexweberk Mar 7, 2024
4ed9ae6
test_generation_config_is_loaded_with_model - fall back to pytorch m…
amyeroberts Mar 7, 2024
ddf177e
Set `inputs` as kwarg in `TextClassificationPipeline` (#29495)
alvarobartt Mar 7, 2024
b338a6c
Fix `VisionEncoderDecoder` Positional Arg (#29497)
nickthegroot Mar 7, 2024
631fa7b
Typo in mlx tensor support (#29509)
pcuenca Mar 8, 2024
bc764f4
Generate: left-padding test, revisited (#29515)
gante Mar 8, 2024
8e589c8
[tests] add the missing `require_sacremoses` decorator (#29504)
faaany Mar 8, 2024
8ee1d47
fix image-to-text batch incorrect output issue (#29342)
sywangyi Mar 8, 2024
14536c3
Typo fix in error message (#29535)
clefourrier Mar 8, 2024
1ea3ad1
[tests] use `torch_device` instead of `auto` for model testing (#29531)
faaany Mar 8, 2024
f386c51
StableLM: Fix dropout argument type error (#29236)
liangjs Mar 8, 2024
608fa54
Make sliding window size inclusive in eager attention (#29519)
jonatanklosko Mar 8, 2024
697f05b
fix typos in FSDP config parsing logic in `TrainingArguments` (#29189)
yundai424 Mar 8, 2024
1ba89dc
Fix WhisperNoSpeechDetection when input is full silence (#29065)
ylacombe Mar 8, 2024
3f6973d
[tests] use the correct `n_gpu` in `TrainerIntegrationTest::test_trai…
faaany Mar 8, 2024
469c132
Fix eval thread fork bomb (#29538)
muellerzr Mar 8, 2024
0290ec1
feat: use `warning_advice` for tensorflow warning (#29540)
winstxnhdw Mar 8, 2024
4f27ee9
[`Mamba doc`] Post merge updates (#29472)
ArthurZucker Mar 11, 2024
d80c9a3
[`Docs`] fixed minor typo (#29555)
j-gc Mar 11, 2024
6d67837
Add Fill-in-the-middle training objective example - PyTorch (#27464)
tanaymeh Mar 11, 2024
9a3f4d4
Bark model Flash Attention 2 Enabling to pass on check_device_map par…
damithsenanayake Mar 11, 2024
873d9bb
Make torch xla available on GPU (#29334)
yitongh Mar 11, 2024
dd1c905
[Docs] Fix FastSpeech2Conformer model doc links (#29574)
khipp Mar 11, 2024
e5eb55b
Don't use a subset in test fetcher if on `main` branch (#28816)
ydshieh Mar 11, 2024
47c9570
fix error: TypeError: Object of type Tensor is not JSON serializable …
yuanzhoulvpi2017 Mar 11, 2024
c1e478a
Add missing localized READMEs to the copies check (#29575)
khipp Mar 11, 2024
b45c0f5
Fixed broken link (#29558)
amritgupta98 Mar 11, 2024
73a2734
Tiny improvement for doc (#29581)
fzyzcjy Mar 11, 2024
b382a09
Experimental loading of MLX files (#29511)
pcuenca Mar 11, 2024
6cc5411
Fix Fuyu doc typos (#29601)
zucchini-nlp Mar 12, 2024
73efe89
Fix minor typo: softare => software (#29602)
DriesVerachtert Mar 12, 2024
81ec802
Stop passing None to compile() in TF examples (#29597)
Rocketknight1 Mar 12, 2024
50ec493
Fix typo (determine) (#29606)
koayon Mar 12, 2024
f1a565a
Implemented add_pooling_layer arg to TFBertModel (#29603)
tomigee Mar 12, 2024
b640486
Update legacy Repository usage in various example files (#29085)
Hvanderwilk Mar 12, 2024
df15425
Set env var to hold Keras at Keras 2 (#29598)
Rocketknight1 Mar 12, 2024
a15bd3a
Update flava tests (#29611)
ydshieh Mar 12, 2024
11163ff
Fix typo ; Update quantization.md (#29615)
furkanakkurt1335 Mar 12, 2024
8e64ba2
Add tests for batching support (#29297)
zucchini-nlp Mar 12, 2024
6b660d5
Fix: handle logging of scalars in Weights & Biases summary (#29612)
parambharat Mar 12, 2024
d479665
Examples: check `max_position_embeddings` in the translation example …
gante Mar 12, 2024
d522afe
[`Gemma`] Supports converting directly in half-precision (#29529)
younesbelkada Mar 12, 2024
be3fd8a
[Flash Attention 2] Add flash attention 2 for GPT-J (#28295)
bytebarde Mar 13, 2024
9acce7d
Core: Fix copies on main (#29624)
younesbelkada Mar 13, 2024
4afead8
[Whisper] Deprecate forced ids for v4.39 (#29485)
sanchit-gandhi Mar 13, 2024
38bff8c
Warn about tool use (#29628)
LysandreJik Mar 13, 2024
11bbb50
Adds pretrained IDs directly in the tests (#29534)
LysandreJik Mar 13, 2024
fafe909
[generate] deprecate forced ids processor (#29487)
sanchit-gandhi Mar 13, 2024
6247885
Fix minor typo: infenrece => inference (#29621)
DriesVerachtert Mar 13, 2024
88a4f68
[`MaskFormer`, `Mask2Former`] Use einsum where possible (#29544)
amyeroberts Mar 13, 2024
1e21c4f
Llama: allow custom 4d masks (#29618)
gante Mar 13, 2024
b340d90
[PyTorch/XLA] Fix extra TPU compilations introduced by recent changes…
alanwaketan Mar 13, 2024
d3801aa
[docs] Spanish translate chat_templating.md & yml addition (#29559)
njackman-2344 Mar 13, 2024
350c5d1
Add support for FSDP+QLoRA and DeepSpeed ZeRO3+QLoRA (#29587)
pacman100 Mar 13, 2024
3b6e95e
[`Mask2Former`] Move normalization for numerical stability (#29542)
amyeroberts Mar 13, 2024
a7e5e15
[tests] make `test_trainer_log_level_replica` to run on accelerators …
faaany Mar 13, 2024
31d0115
Refactor TFP call to just sigmoid() (#29641)
Rocketknight1 Mar 13, 2024
5ac264d
Fix batching tests for new models (Mamba and SegGPT) (#29633)
zucchini-nlp Mar 13, 2024
fe08556
Fix `multi_gpu_data_parallel_forward` for `MusicgenTest` (#29632)
ydshieh Mar 13, 2024
1fc505b
Add PvT-v2 Model (#26812)
FoamoftheSea Mar 13, 2024
f738ab3
[docs] Remove broken ChatML format link from chat_templating.md (#29643)
aaronjimv Mar 13, 2024
b4b9625
Add newly added PVTv2 model to all README files. (#29647)
robinverduijn Mar 14, 2024
c9e3c0b
[`PEFT`] Fix `save_pretrained` to make sure adapters weights are also…
shub-kris Mar 14, 2024
956f44f
Fix TPU checkpointing inside Trainer (#29657)
shub-kris Mar 14, 2024
2cc3cc8
Add `dataset_revision` argument to `RagConfig` (#29610)
ydshieh Mar 14, 2024
7b87ecb
Fix PVT v2 tests (#29660)
ydshieh Mar 14, 2024
23db187
Generate: handle `cache_position` update in `generate` (#29467)
gante Mar 14, 2024
48fbab7
Allow apply_chat_template to pass kwargs to the template and support …
Rocketknight1 Mar 14, 2024
56b64bf
Inaccurate code example within inline code-documentation (#29661)
MysteryManav Mar 14, 2024
f62407f
Extend import utils to cover "editable" torch versions (#29000)
bhack Mar 15, 2024
c47fcd0
Trainer: fail early in the presence of an unsavable `generation_confi…
gante Mar 15, 2024
53d8912
Pipeline: use tokenizer pad token at generation time if the model pad…
gante Mar 15, 2024
0e4a1c3
Cohere Model Release (#29622)
saurabhdash2512 Mar 15, 2024
c1993e6
[tests] remove deprecated tests for model loading (#29450)
faaany Mar 15, 2024
8a3cfaa
Fix AutoformerForPrediction example code (#29639)
m-torhan Mar 15, 2024
272f48e
[tests] ensure device-required software is available in the testing e…
faaany Mar 15, 2024
174aecd
Fix wrong condition used in `filter_models` (#29673)
ydshieh Mar 15, 2024
03847ef
fix: typos (#29653)
testwill Mar 15, 2024
f02aea2
Rename `glue` to `nyu-mll/glue` (#29679)
lhoestq Mar 15, 2024
28de2f4
[Quantization] Quanto quantizer (#29023)
SunMarc Mar 15, 2024
9e4df7c
Generate: replace breaks by a loop condition (#29662)
gante Mar 15, 2024
4e98d59
[FIX] Fix speech2test modeling tests (#29672)
ylacombe Mar 15, 2024
5011908
Revert "Fix wrong condition used in `filter_models`" (#29682)
ydshieh Mar 15, 2024
00c1d87
[docs] Spanish translation of attention.md (#29681)
aaronjimv Mar 15, 2024
ebeca43
Merge branch 'main' into add-musicgen-melody
ylacombe Mar 18, 2024
604a4c8
make fix-copies
ylacombe Mar 18, 2024
bf3dfd1
CI / generate: batch size computation compatible with all models (#29…
gante Mar 18, 2024
7bda3c3
Merge branch 'huggingface:main' into add-musicgen-melody
ylacombe Mar 18, 2024
5863cf9
fix hidden states test and batching
ylacombe Mar 18, 2024
2 changes: 2 additions & 0 deletions .circleci/create_circleci_config.py
@@ -475,6 +475,7 @@ def job_name(self):
"pip install -U --upgrade-strategy eager 'git+https://github.com/facebookresearch/detectron2.git'",
"sudo apt install tesseract-ocr",
"pip install -U --upgrade-strategy eager pytesseract",
"pip install --upgrade-strategy eager sentencepiece",
"pip install -U --upgrade-strategy eager natten==0.15.1+torch210cpu -f https://shi-labs.com/natten/wheels",
"pip install -U --upgrade-strategy eager python-Levenshtein",
"pip install -U --upgrade-strategy eager opencv-python",
@@ -485,6 +486,7 @@
"tests/models/*layoutlmv*",
"tests/models/*nat",
"tests/models/deta",
"tests/models/udop",
"tests/models/nougat",
],
pytest_num_workers=1,
The following README translations have large diffs that are not rendered by default:

8 changes: 6 additions & 2 deletions README.md
13 changes: 11 additions & 2 deletions README_de.md
8 changes: 6 additions & 2 deletions README_es.md
8 changes: 6 additions & 2 deletions README_fr.md
8 changes: 6 additions & 2 deletions README_hd.md
8 changes: 6 additions & 2 deletions README_ja.md
8 changes: 6 additions & 2 deletions README_ko.md
31 changes: 29 additions & 2 deletions README_pt-br.md
34 changes: 31 additions & 3 deletions README_ru.md
29 changes: 27 additions & 2 deletions README_te.md
9 changes: 8 additions & 1 deletion README_vi.md
8 changes: 6 additions & 2 deletions README_zh-hans.md
8 changes: 6 additions & 2 deletions README_zh-hant.md

5 changes: 4 additions & 1 deletion docker/transformers-quantization-latest-gpu/Dockerfile
@@ -43,7 +43,10 @@ RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/opt
RUN python3 -m pip install --no-cache-dir aqlm[gpu]==1.0.2

# Add autoawq for quantization testing
RUN python3 -m pip install --no-cache-dir https://github.com/casper-hansen/AutoAWQ/releases/download/v0.1.8/autoawq-0.1.8+cu118-cp38-cp38-linux_x86_64.whl
RUN python3 -m pip install --no-cache-dir https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.0/autoawq-0.2.0+cu118-cp38-cp38-linux_x86_64.whl

# Add quanto for quantization testing
RUN python3 -m pip install --no-cache-dir quanto

# When installing in editable mode, `transformers` is not recognized as a package.
# this line must be added in order for python to be aware of transformers.
2 changes: 1 addition & 1 deletion docs/source/de/testing.md
@@ -452,7 +452,7 @@ Dekorateure werden verwendet, um die Anforderungen von Tests in Bezug auf CPU/GP
- `require_torch_multi_gpu` - wie `require_torch` und zusätzlich mindestens 2 GPUs erforderlich
- `require_torch_non_multi_gpu` - wie `require_torch` plus benötigt 0 oder 1 GPUs
- `require_torch_up_to_2_gpus` - wie `require_torch` plus erfordert 0 oder 1 oder 2 GPUs
- `require_torch_tpu` - wie `require_torch` plus erfordert mindestens 1 TPU
- `require_torch_xla` - wie `require_torch` plus erfordert mindestens 1 TPU

Lassen Sie uns die GPU-Anforderungen in der folgenden Tabelle darstellen:

8 changes: 8 additions & 0 deletions docs/source/en/_toctree.yml
@@ -310,6 +310,8 @@
title: CodeGen
- local: model_doc/code_llama
title: CodeLlama
- local: model_doc/cohere
title: Cohere
- local: model_doc/convbert
title: ConvBERT
- local: model_doc/cpm
@@ -398,6 +400,8 @@
title: M2M100
- local: model_doc/madlad-400
title: MADLAD-400
- local: model_doc/mamba
title: Mamba
- local: model_doc/marian
title: MarianMT
- local: model_doc/markuplm
@@ -581,6 +585,8 @@
title: PoolFormer
- local: model_doc/pvt
title: Pyramid Vision Transformer (PVT)
- local: model_doc/pvt_v2
title: Pyramid Vision Transformer v2 (PVTv2)
- local: model_doc/regnet
title: RegNet
- local: model_doc/resnet
@@ -772,6 +778,8 @@
title: TVLT
- local: model_doc/tvp
title: TVP
- local: model_doc/udop
title: UDOP
- local: model_doc/vilt
title: ViLT
- local: model_doc/vipllava
4 changes: 2 additions & 2 deletions docs/source/en/add_new_model.md
@@ -89,8 +89,8 @@ model.config # model has access to its config
Similar to the model, the configuration inherits basic serialization and deserialization functionalities from
[`PretrainedConfig`]. Note that the configuration and the model are always serialized into two
different formats - the model to a *pytorch_model.bin* file and the configuration to a *config.json* file. Calling
[`~PreTrainedModel.save_pretrained`] will automatically call
[`~PretrainedConfig.save_pretrained`], so that both model and configuration are saved.
the model's [`~PreTrainedModel.save_pretrained`] will automatically call
the config's [`~PretrainedConfig.save_pretrained`], so that both model and configuration are saved.
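As a hypothetical miniature of this delegation pattern (illustrative names only, not the real Transformers classes), a model's `save_pretrained` can call its config's `save_pretrained` so that one call writes both files:

```python
import json
import os
import tempfile

class ToyConfig:
    """Minimal stand-in for PretrainedConfig: serializes itself to config.json."""
    def __init__(self, hidden_size=16):
        self.hidden_size = hidden_size

    def save_pretrained(self, save_directory):
        with open(os.path.join(save_directory, "config.json"), "w") as f:
            json.dump({"hidden_size": self.hidden_size}, f)

class ToyModel:
    """Minimal stand-in for PreTrainedModel: saving the model also saves its config."""
    def __init__(self, config):
        self.config = config
        self.weights = [0.0] * config.hidden_size

    def save_pretrained(self, save_directory):
        # Write the "weights" file (a real model writes pytorch_model.bin or safetensors).
        with open(os.path.join(save_directory, "weights.json"), "w") as f:
            json.dump(self.weights, f)
        # Saving the model automatically saves the configuration too.
        self.config.save_pretrained(save_directory)

with tempfile.TemporaryDirectory() as tmp:
    ToyModel(ToyConfig(hidden_size=4)).save_pretrained(tmp)
    print(sorted(os.listdir(tmp)))  # ['config.json', 'weights.json']
```

In the real library the weights go to *pytorch_model.bin* rather than JSON, but the delegation — one `save_pretrained` call producing both files — is the same.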


### Code style
2 changes: 1 addition & 1 deletion docs/source/en/chat_templating.md
@@ -375,7 +375,7 @@ best performance for inference or fine-tuning when you precisely match the token
If you're training a model from scratch, or fine-tuning a base language model for chat, on the other hand,
you have a lot of freedom to choose an appropriate template! LLMs are smart enough to learn to handle lots of different
input formats. Our default template for models that don't have a class-specific template follows the
[ChatML format](https://github.com/openai/openai-python/blob/main/chatml.md), and this is a good, flexible choice for many use-cases. It looks like this:
`ChatML` format, and this is a good, flexible choice for many use-cases. It looks like this:

```
{% for message in messages %}
```
9 changes: 9 additions & 0 deletions docs/source/en/custom_tools.md
@@ -427,6 +427,15 @@ To upload your custom prompt on a repo on the Hub and share it with the communit

## Using custom tools

<Tip warning={true}>

Using custom tools in your local runtime means that you'll download code to run on your machine.

ALWAYS inspect the tool you're downloading before loading it within your runtime, as you would do when
installing a package using pip/npm/apt.

</Tip>

In this section, we'll be leveraging two existing custom tools that are specific to image generation:

- We replace [huggingface-tools/image-transformation](https://huggingface.co/spaces/huggingface-tools/image-transformation),
3 changes: 3 additions & 0 deletions docs/source/en/generation_strategies.md
@@ -389,3 +389,6 @@ just like in multinomial sampling. However, in assisted decoding, reducing the t
>>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
['Alice and Bob are going to the same party. It is a small party, in a small']
```

Alternatively, you can also set `prompt_lookup_num_tokens` to trigger n-gram-based assisted decoding, as opposed
to model-based assisted decoding. You can read more about it [here](https://twitter.com/joao_gante/status/1747322413006643259).
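The n-gram lookup behind `prompt_lookup_num_tokens` can be sketched in a few lines of plain Python. This is an illustrative toy, not the library's implementation: find the most recent earlier occurrence of the trailing n-gram, and propose the tokens that followed it as draft candidates for the main model to verify.

```python
def prompt_lookup_candidates(input_ids, max_ngram_size=3, num_candidates=5):
    """Propose draft tokens by matching the trailing n-gram earlier in the sequence.

    If the last n tokens also appear earlier in `input_ids`, the tokens that
    followed that earlier occurrence are likely continuations; the main model
    then verifies them in a single forward pass (assisted decoding).
    """
    for n in range(max_ngram_size, 0, -1):
        if len(input_ids) < n + 1:
            continue
        tail = input_ids[-n:]
        # Scan backwards for the most recent earlier match of the trailing n-gram.
        for start in range(len(input_ids) - n - 1, -1, -1):
            if input_ids[start:start + n] == tail:
                follow = input_ids[start + n:start + n + num_candidates]
                if follow:
                    return follow
    return []  # no match: fall back to normal decoding

# [1, 2, 3] occurred earlier, so the tokens that followed it are proposed.
print(prompt_lookup_candidates([1, 2, 3, 7, 8, 1, 2, 3], num_candidates=2))  # [7, 8]
```

Longer n-grams are tried first, since a longer match is stronger evidence that the continuation will repeat.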
4 changes: 4 additions & 0 deletions docs/source/en/index.md
@@ -95,6 +95,7 @@ Flax), PyTorch, and/or TensorFlow.
| [CLVP](model_doc/clvp) | ✅ | ❌ | ❌ |
| [CodeGen](model_doc/codegen) | ✅ | ❌ | ❌ |
| [CodeLlama](model_doc/code_llama) | ✅ | ❌ | ✅ |
| [Cohere](model_doc/cohere) | ✅ | ❌ | ❌ |
| [Conditional DETR](model_doc/conditional_detr) | ✅ | ❌ | ❌ |
| [ConvBERT](model_doc/convbert) | ✅ | ✅ | ❌ |
| [ConvNeXT](model_doc/convnext) | ✅ | ✅ | ❌ |
@@ -180,6 +181,7 @@ Flax), PyTorch, and/or TensorFlow.
| [M-CTC-T](model_doc/mctct) | ✅ | ❌ | ❌ |
| [M2M100](model_doc/m2m_100) | ✅ | ❌ | ❌ |
| [MADLAD-400](model_doc/madlad-400) | ✅ | ✅ | ✅ |
| [Mamba](model_doc/mamba) | ✅ | ❌ | ❌ |
| [Marian](model_doc/marian) | ✅ | ✅ | ✅ |
| [MarkupLM](model_doc/markuplm) | ✅ | ❌ | ❌ |
| [Mask2Former](model_doc/mask2former) | ✅ | ❌ | ❌ |
@@ -234,6 +236,7 @@ Flax), PyTorch, and/or TensorFlow.
| [Pop2Piano](model_doc/pop2piano) | ✅ | ❌ | ❌ |
| [ProphetNet](model_doc/prophetnet) | ✅ | ❌ | ❌ |
| [PVT](model_doc/pvt) | ✅ | ❌ | ❌ |
| [PVTv2](model_doc/pvt_v2) | ✅ | ❌ | ❌ |
| [QDQBert](model_doc/qdqbert) | ✅ | ❌ | ❌ |
| [Qwen2](model_doc/qwen2) | ✅ | ❌ | ❌ |
| [RAG](model_doc/rag) | ✅ | ✅ | ❌ |
@@ -280,6 +283,7 @@ Flax), PyTorch, and/or TensorFlow.
| [TrOCR](model_doc/trocr) | ✅ | ❌ | ❌ |
| [TVLT](model_doc/tvlt) | ✅ | ❌ | ❌ |
| [TVP](model_doc/tvp) | ✅ | ❌ | ❌ |
| [UDOP](model_doc/udop) | ✅ | ❌ | ❌ |
| [UL2](model_doc/ul2) | ✅ | ✅ | ✅ |
| [UMT5](model_doc/umt5) | ✅ | ❌ | ❌ |
| [UniSpeech](model_doc/unispeech) | ✅ | ❌ | ❌ |
19 changes: 2 additions & 17 deletions docs/source/en/internal/generation_utils.md
@@ -16,16 +16,7 @@ rendered properly in your Markdown viewer.

# Utilities for Generation

This page lists all the utility functions used by [`~generation.GenerationMixin.generate`],
[`~generation.GenerationMixin.greedy_search`],
[`~generation.GenerationMixin.contrastive_search`],
[`~generation.GenerationMixin.sample`],
[`~generation.GenerationMixin.beam_search`],
[`~generation.GenerationMixin.beam_sample`],
[`~generation.GenerationMixin.group_beam_search`], and
[`~generation.GenerationMixin.constrained_beam_search`].

Most of those are only useful if you are studying the code of the generate methods in the library.
This page lists all the utility functions used by [`~generation.GenerationMixin.generate`].

## Generate Outputs

@@ -345,12 +336,6 @@ A [`Constraint`] can be used to force the generation to include specific tokens
- process
- finalize

## Utilities

[[autodoc]] top_k_top_p_filtering

[[autodoc]] tf_top_k_top_p_filtering

## Streamers

[[autodoc]] TextStreamer
@@ -376,4 +361,4 @@

[[autodoc]] StaticCache
- update
- get_seq_length
- get_seq_length
6 changes: 5 additions & 1 deletion docs/source/en/main_classes/quantization.md
@@ -16,7 +16,7 @@ rendered properly in your Markdown viewer.

# Quantization

Quantization techniques reduces memory and computational costs by representing weights and activations with lower-precision data types like 8-bit integers (int8). This enables loading larger models you normally wouldn't be able to fit into memory, and speeding up inference. Transformers supports the AWQ and GPTQ quantization algorithms and it supports 8-bit and 4-bit quantization with bitsandbytes.
Quantization techniques reduce memory and computational costs by representing weights and activations with lower-precision data types like 8-bit integers (int8). This enables loading larger models you normally wouldn't be able to fit into memory, and speeding up inference. Transformers supports the AWQ and GPTQ quantization algorithms and it supports 8-bit and 4-bit quantization with bitsandbytes.

Quantization techniques that aren't supported in Transformers can be added with the [`HfQuantizer`] class.

@@ -26,6 +26,10 @@ Learn how to quantize models in the [Quantization](../quantization) guide.

</Tip>

## QuantoConfig

[[autodoc]] QuantoConfig
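As a rough sketch of what int8 weight quantization does under the hood — a toy absmax scheme in plain Python, not the actual quanto or bitsandbytes kernels — pick one scale so the largest weight magnitude maps to 127, round to integers, and rescale on use:

```python
def quantize_int8(weights):
    """Absmax int8 quantization: map floats into [-127, 127] with one scale."""
    scale = max(abs(w) for w in weights) / 127 or 1.0  # guard against all-zero weights
    q = [round(w / scale) for w in weights]
    return q, scale

def dequantize(q, scale):
    """Recover approximate float weights from the int8 values."""
    return [v * scale for v in q]

weights = [0.5, -1.27, 0.0, 0.31]
q, scale = quantize_int8(weights)
print(q)  # [50, -127, 0, 31]

# Reconstruction error is bounded by half a quantization step (scale / 2).
approx = dequantize(q, scale)
print(max(abs(a - b) for a, b in zip(weights, approx)) <= scale / 2)  # True
```

Storing one int8 per weight plus one scale is what cuts memory roughly 4x versus float32; real schemes refine this with per-channel or per-block scales.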

## AqlmConfig

[[autodoc]] AqlmConfig
10 changes: 3 additions & 7 deletions docs/source/en/main_classes/text_generation.md
@@ -37,19 +37,15 @@ like token streaming.
- from_pretrained
- from_model_config
- save_pretrained
- update
- validate
- get_generation_mode

## GenerationMixin

[[autodoc]] generation.GenerationMixin
- generate
- compute_transition_scores
- greedy_search
- sample
- beam_search
- beam_sample
- contrastive_search
- group_beam_search
- constrained_beam_search

## TFGenerationMixin

2 changes: 1 addition & 1 deletion docs/source/en/model_doc/bert.md
@@ -79,7 +79,7 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h
<PipelineTag pipeline="token-classification"/>

- A blog post on how to use [Hugging Face Transformers with Keras: Fine-tune a non-English BERT for Named Entity Recognition](https://www.philschmid.de/huggingface-transformers-keras-tf).
- A notebook for [Finetuning BERT for named-entity recognition](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/Custom_Named_Entity_Recognition_with_BERT_only_first_wordpiece.ipynb) using only the first wordpiece of each word in the word label during tokenization. To propagate the label of the word to all wordpieces, see this [version](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/BERT/Custom_Named_Entity_Recognition_with_BERT.ipynb) of the notebook instead.
- A notebook for [Finetuning BERT for named-entity recognition](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/BERT/Custom_Named_Entity_Recognition_with_BERT_only_first_wordpiece.ipynb) using only the first wordpiece of each word in the word label during tokenization. To propagate the label of the word to all wordpieces, see this [version](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/BERT/Custom_Named_Entity_Recognition_with_BERT.ipynb) of the notebook instead.
- [`BertForTokenClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/token-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification.ipynb).
- [`TFBertForTokenClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/token-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification-tf.ipynb).
- [`FlaxBertForTokenClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/flax/token-classification).