diff --git a/README.md b/README.md
index d3a0c5c48..708f8521b 100644
--- a/README.md
+++ b/README.md
@@ -47,7 +47,7 @@ npm i @huggingface/transformers
Alternatively, you can use it in vanilla JS, without any bundler, by using a CDN or static hosting. For example, using [ES Modules](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Modules), you can import the library with:
```html
<script type="module">
-    import { pipeline } from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.7.1';
+    import { pipeline } from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.7.4';
</script>
```
@@ -155,7 +155,7 @@ Check out the Transformers.js [template](https://huggingface.co/new-space?templa
-By default, Transformers.js uses [hosted pretrained models](https://huggingface.co/models?library=transformers.js) and [precompiled WASM binaries](https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.7.1/dist/), which should work out-of-the-box. You can customize this as follows:
+By default, Transformers.js uses [hosted pretrained models](https://huggingface.co/models?library=transformers.js) and [precompiled WASM binaries](https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.7.4/dist/), which should work out-of-the-box. You can customize this as follows:
### Settings
@@ -318,6 +318,7 @@ You can refine your search by selecting the task you're interested in (e.g., [te
1. **[DETR](https://huggingface.co/docs/transformers/model_doc/detr)** (from Facebook) released with the paper [End-to-End Object Detection with Transformers](https://huggingface.co/papers/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko.
1. **[DINOv2](https://huggingface.co/docs/transformers/model_doc/dinov2)** (from Meta AI) released with the paper [DINOv2: Learning Robust Visual Features without Supervision](https://huggingface.co/papers/2304.07193) by Maxime Oquab, Timothée Darcet, Théo Moutakanni, Huy Vo, Marc Szafraniec, Vasil Khalidov, Pierre Fernandez, Daniel Haziza, Francisco Massa, Alaaeldin El-Nouby, Mahmoud Assran, Nicolas Ballas, Wojciech Galuba, Russell Howes, Po-Yao Huang, Shang-Wen Li, Ishan Misra, Michael Rabbat, Vasu Sharma, Gabriel Synnaeve, Hu Xu, Hervé Jegou, Julien Mairal, Patrick Labatut, Armand Joulin, Piotr Bojanowski.
1. **[DINOv2 with Registers](https://huggingface.co/docs/transformers/model_doc/dinov2_with_registers)** (from Meta AI) released with the paper [DINOv2 with Registers](https://huggingface.co/papers/2309.16588) by Timothée Darcet, Maxime Oquab, Julien Mairal, Piotr Bojanowski.
+1. **[DINOv3](https://huggingface.co/docs/transformers/model_doc/dinov3)** (from Meta AI) released with the paper [DINOv3](https://huggingface.co/papers/2508.10104) by Oriane Siméoni, Huy V. Vo, Maximilian Seitzer, Federico Baldassarre, Maxime Oquab, Cijo Jose, Vasil Khalidov, Marc Szafraniec, Seungeun Yi, Michaël Ramamonjisoa, Francisco Massa, Daniel Haziza, Luca Wehrstedt, Jianyuan Wang, Timothée Darcet, Théo Moutakanni, Leonel Sentana, Claire Roberts, Andrea Vedaldi, Jamie Tolan, John Brandt, Camille Couprie, Julien Mairal, Hervé Jégou, Patrick Labatut, Piotr Bojanowski.
1. **[DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://huggingface.co/papers/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation) and a German version of DistilBERT.
1. **[DiT](https://huggingface.co/docs/transformers/model_doc/dit)** (from Microsoft Research) released with the paper [DiT: Self-supervised Pre-training for Document Image Transformer](https://huggingface.co/papers/2203.02378) by Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei.
1. **[Donut](https://huggingface.co/docs/transformers/model_doc/donut)** (from NAVER), released together with the paper [OCR-free Document Understanding Transformer](https://huggingface.co/papers/2111.15664) by Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park.
@@ -358,7 +359,9 @@ You can refine your search by selecting the task you're interested in (e.g., [te
1. **[LongT5](https://huggingface.co/docs/transformers/model_doc/longt5)** (from Google AI) released with the paper [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://huggingface.co/papers/2112.07916) by Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang.
1. **[LFM2](https://huggingface.co/docs/transformers/model_doc/lfm2)** (from Liquid AI) released with the blog post [Introducing LFM2: The Fastest On-Device Foundation Models on the Market](https://www.liquid.ai/blog/liquid-foundation-models-v2-our-second-series-of-generative-ai-models) by the Liquid AI Team.
1. **[LLaMA](https://huggingface.co/docs/transformers/model_doc/llama)** (from The FAIR team of Meta AI) released with the paper [LLaMA: Open and Efficient Foundation Language Models](https://huggingface.co/papers/2302.13971) by Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample.
-1. **[Llama2](https://huggingface.co/docs/transformers/model_doc/llama2)** (from The FAIR team of Meta AI) released with the paper [Llama2: Open Foundation and Fine-Tuned Chat Models](https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/XXX) by Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, Dan Bikel, Lukas Blecher, Cristian Canton Ferrer, Moya Chen, Guillem Cucurull, David Esiobu, Jude Fernandes, Jeremy Fu, Wenyin Fu, Brian Fuller, Cynthia Gao, Vedanuj Goswami, Naman Goyal, Anthony Hartshorn, Saghar Hosseini, Rui Hou, Hakan Inan, Marcin Kardas, Viktor Kerkez Madian Khabsa, Isabel Kloumann, Artem Korenev, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Jenya Lee, Diana Liskovich, Yinghai Lu, Yuning Mao, Xavier Martinet, Todor Mihaylov, Pushka rMishra, Igor Molybog, Yixin Nie, Andrew Poulton, Jeremy Reizenstein, Rashi Rungta, Kalyan Saladi, Alan Schelten, Ruan Silva, Eric Michael Smith, Ranjan Subramanian, Xiaoqing EllenTan, Binh Tang, Ross Taylor, Adina Williams, Jian Xiang Kuan, Puxin Xu, Zheng Yan, Iliyan Zarov, Yuchen Zhang, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Sergey Edunov, Thomas Scialom.
+1. **[Llama2](https://huggingface.co/docs/transformers/model_doc/llama2)** (from The FAIR team of Meta AI) released with the paper [Llama2: Open Foundation and Fine-Tuned Chat Models](https://huggingface.co/papers/2307.09288) by Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, Dan Bikel, Lukas Blecher, Cristian Canton Ferrer, Moya Chen, Guillem Cucurull, David Esiobu, Jude Fernandes, Jeremy Fu, Wenyin Fu, Brian Fuller, Cynthia Gao, Vedanuj Goswami, Naman Goyal, Anthony Hartshorn, Saghar Hosseini, Rui Hou, Hakan Inan, Marcin Kardas, Viktor Kerkez, Madian Khabsa, Isabel Kloumann, Artem Korenev, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Jenya Lee, Diana Liskovich, Yinghai Lu, Yuning Mao, Xavier Martinet, Todor Mihaylov, Pushkar Mishra, Igor Molybog, Yixin Nie, Andrew Poulton, Jeremy Reizenstein, Rashi Rungta, Kalyan Saladi, Alan Schelten, Ruan Silva, Eric Michael Smith, Ranjan Subramanian, Xiaoqing Ellen Tan, Binh Tang, Ross Taylor, Adina Williams, Jian Xiang Kuan, Puxin Xu, Zheng Yan, Iliyan Zarov, Yuchen Zhang, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Sergey Edunov, Thomas Scialom.
+1. **[Llama3](https://huggingface.co/docs/transformers/model_doc/llama3)** (from The FAIR team of Meta AI) released with the paper [The Llama 3 Herd of Models](https://huggingface.co/papers/2407.21783) by the Llama Team at Meta.
+1. **[Llama4](https://huggingface.co/docs/transformers/model_doc/llama4)** (from The FAIR team of Meta AI) released with the blog post [The Llama 4 herd: The beginning of a new era of natively multimodal AI innovation](https://ai.meta.com/blog/llama-4-multimodal-intelligence/) by the Llama Team at Meta.
1. **[LLaVa](https://huggingface.co/docs/transformers/model_doc/llava)** (from Microsoft Research & University of Wisconsin-Madison) released with the paper [Visual Instruction Tuning](https://huggingface.co/papers/2304.08485) by Haotian Liu, Chunyuan Li, Yuheng Li and Yong Jae Lee.
1. **[LLaVA-OneVision](https://huggingface.co/docs/transformers/model_doc/llava_onevision)** (from ByteDance & NTU & CUHK & HKUST) released with the paper [LLaVA-OneVision: Easy Visual Task Transfer](https://huggingface.co/papers/2408.03326) by Bo Li, Yuanhan Zhang, Dong Guo, Renrui Zhang, Feng Li, Hao Zhang, Kaichen Zhang, Yanwei Li, Ziwei Liu, Chunyuan Li
1. **[M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://huggingface.co/papers/2010.11125) by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
@@ -436,6 +439,7 @@ You can refine your search by selecting the task you're interested in (e.g., [te
1. **Ultravox** (from Fixie.ai) released with the repository [fixie-ai/ultravox](https://github.com/fixie-ai/ultravox) by the Fixie.ai team.
1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://huggingface.co/papers/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang.
1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://huggingface.co/papers/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu.
+1. **[VaultGemma](https://huggingface.co/docs/transformers/main/model_doc/vaultgemma)** (from Google) released with the technical report [VaultGemma: A Differentially Private Gemma Model](https://services.google.com/fh/files/blogs/vaultgemma_tech_report.pdf) by the VaultGemma Google team.
1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://huggingface.co/papers/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://huggingface.co/papers/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick.
1. **[ViTMatte](https://huggingface.co/docs/transformers/model_doc/vitmatte)** (from HUST-VL) released with the paper [ViTMatte: Boosting Image Matting with Pretrained Plain Vision Transformers](https://huggingface.co/papers/2305.15272) by Jingfeng Yao, Xinggang Wang, Shusheng Yang, Baoyuan Wang.
diff --git a/docs/snippets/2_installation.snippet b/docs/snippets/2_installation.snippet
index f96d4381c..a66b16774 100644
--- a/docs/snippets/2_installation.snippet
+++ b/docs/snippets/2_installation.snippet
@@ -7,6 +7,6 @@ npm i @huggingface/transformers
Alternatively, you can use it in vanilla JS, without any bundler, by using a CDN or static hosting. For example, using [ES Modules](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Modules), you can import the library with:
```html
<script type="module">
-    import { pipeline } from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.7.1';
+    import { pipeline } from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.7.4';
</script>
```
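For readers skimming this diff, the snippet above is the vanilla-JS path the version bump affects. A minimal sketch of what that CDN import looks like in practice, assuming a commonly used Hub checkpoint that is not part of this PR:

```js
// Runs inside a <script type="module"> tag; no bundler required.
// The model ID below is an illustrative Hub checkpoint, not something this PR changes.
import { pipeline } from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.7.4';

const classifier = await pipeline(
  'sentiment-analysis',
  'Xenova/distilbert-base-uncased-finetuned-sst-2-english',
);
console.log(await classifier('Transformers.js works right in the browser!'));
// e.g. [{ label: 'POSITIVE', score: 0.99... }]
```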
diff --git a/docs/snippets/4_custom-usage.snippet b/docs/snippets/4_custom-usage.snippet
index 781caa943..06a111960 100644
--- a/docs/snippets/4_custom-usage.snippet
+++ b/docs/snippets/4_custom-usage.snippet
@@ -1,6 +1,6 @@
-By default, Transformers.js uses [hosted pretrained models](https://huggingface.co/models?library=transformers.js) and [precompiled WASM binaries](https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.7.1/dist/), which should work out-of-the-box. You can customize this as follows:
+By default, Transformers.js uses [hosted pretrained models](https://huggingface.co/models?library=transformers.js) and [precompiled WASM binaries](https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.7.4/dist/), which should work out-of-the-box. You can customize this as follows:
### Settings
diff --git a/docs/snippets/6_supported-models.snippet b/docs/snippets/6_supported-models.snippet
index d0459db74..0aa76c28c 100644
--- a/docs/snippets/6_supported-models.snippet
+++ b/docs/snippets/6_supported-models.snippet
@@ -32,6 +32,7 @@
1. **[DETR](https://huggingface.co/docs/transformers/model_doc/detr)** (from Facebook) released with the paper [End-to-End Object Detection with Transformers](https://huggingface.co/papers/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko.
1. **[DINOv2](https://huggingface.co/docs/transformers/model_doc/dinov2)** (from Meta AI) released with the paper [DINOv2: Learning Robust Visual Features without Supervision](https://huggingface.co/papers/2304.07193) by Maxime Oquab, Timothée Darcet, Théo Moutakanni, Huy Vo, Marc Szafraniec, Vasil Khalidov, Pierre Fernandez, Daniel Haziza, Francisco Massa, Alaaeldin El-Nouby, Mahmoud Assran, Nicolas Ballas, Wojciech Galuba, Russell Howes, Po-Yao Huang, Shang-Wen Li, Ishan Misra, Michael Rabbat, Vasu Sharma, Gabriel Synnaeve, Hu Xu, Hervé Jegou, Julien Mairal, Patrick Labatut, Armand Joulin, Piotr Bojanowski.
1. **[DINOv2 with Registers](https://huggingface.co/docs/transformers/model_doc/dinov2_with_registers)** (from Meta AI) released with the paper [DINOv2 with Registers](https://huggingface.co/papers/2309.16588) by Timothée Darcet, Maxime Oquab, Julien Mairal, Piotr Bojanowski.
+1. **[DINOv3](https://huggingface.co/docs/transformers/model_doc/dinov3)** (from Meta AI) released with the paper [DINOv3](https://huggingface.co/papers/2508.10104) by Oriane Siméoni, Huy V. Vo, Maximilian Seitzer, Federico Baldassarre, Maxime Oquab, Cijo Jose, Vasil Khalidov, Marc Szafraniec, Seungeun Yi, Michaël Ramamonjisoa, Francisco Massa, Daniel Haziza, Luca Wehrstedt, Jianyuan Wang, Timothée Darcet, Théo Moutakanni, Leonel Sentana, Claire Roberts, Andrea Vedaldi, Jamie Tolan, John Brandt, Camille Couprie, Julien Mairal, Hervé Jégou, Patrick Labatut, Piotr Bojanowski.
1. **[DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://huggingface.co/papers/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation) and a German version of DistilBERT.
1. **[DiT](https://huggingface.co/docs/transformers/model_doc/dit)** (from Microsoft Research) released with the paper [DiT: Self-supervised Pre-training for Document Image Transformer](https://huggingface.co/papers/2203.02378) by Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei.
1. **[Donut](https://huggingface.co/docs/transformers/model_doc/donut)** (from NAVER), released together with the paper [OCR-free Document Understanding Transformer](https://huggingface.co/papers/2111.15664) by Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park.
@@ -72,7 +73,9 @@
1. **[LongT5](https://huggingface.co/docs/transformers/model_doc/longt5)** (from Google AI) released with the paper [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://huggingface.co/papers/2112.07916) by Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang.
1. **[LFM2](https://huggingface.co/docs/transformers/model_doc/lfm2)** (from Liquid AI) released with the blog post [Introducing LFM2: The Fastest On-Device Foundation Models on the Market](https://www.liquid.ai/blog/liquid-foundation-models-v2-our-second-series-of-generative-ai-models) by the Liquid AI Team.
1. **[LLaMA](https://huggingface.co/docs/transformers/model_doc/llama)** (from The FAIR team of Meta AI) released with the paper [LLaMA: Open and Efficient Foundation Language Models](https://huggingface.co/papers/2302.13971) by Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample.
-1. **[Llama2](https://huggingface.co/docs/transformers/model_doc/llama2)** (from The FAIR team of Meta AI) released with the paper [Llama2: Open Foundation and Fine-Tuned Chat Models](https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/XXX) by Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, Dan Bikel, Lukas Blecher, Cristian Canton Ferrer, Moya Chen, Guillem Cucurull, David Esiobu, Jude Fernandes, Jeremy Fu, Wenyin Fu, Brian Fuller, Cynthia Gao, Vedanuj Goswami, Naman Goyal, Anthony Hartshorn, Saghar Hosseini, Rui Hou, Hakan Inan, Marcin Kardas, Viktor Kerkez Madian Khabsa, Isabel Kloumann, Artem Korenev, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Jenya Lee, Diana Liskovich, Yinghai Lu, Yuning Mao, Xavier Martinet, Todor Mihaylov, Pushka rMishra, Igor Molybog, Yixin Nie, Andrew Poulton, Jeremy Reizenstein, Rashi Rungta, Kalyan Saladi, Alan Schelten, Ruan Silva, Eric Michael Smith, Ranjan Subramanian, Xiaoqing EllenTan, Binh Tang, Ross Taylor, Adina Williams, Jian Xiang Kuan, Puxin Xu, Zheng Yan, Iliyan Zarov, Yuchen Zhang, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Sergey Edunov, Thomas Scialom.
+1. **[Llama2](https://huggingface.co/docs/transformers/model_doc/llama2)** (from The FAIR team of Meta AI) released with the paper [Llama2: Open Foundation and Fine-Tuned Chat Models](https://huggingface.co/papers/2307.09288) by Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, Dan Bikel, Lukas Blecher, Cristian Canton Ferrer, Moya Chen, Guillem Cucurull, David Esiobu, Jude Fernandes, Jeremy Fu, Wenyin Fu, Brian Fuller, Cynthia Gao, Vedanuj Goswami, Naman Goyal, Anthony Hartshorn, Saghar Hosseini, Rui Hou, Hakan Inan, Marcin Kardas, Viktor Kerkez, Madian Khabsa, Isabel Kloumann, Artem Korenev, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Jenya Lee, Diana Liskovich, Yinghai Lu, Yuning Mao, Xavier Martinet, Todor Mihaylov, Pushkar Mishra, Igor Molybog, Yixin Nie, Andrew Poulton, Jeremy Reizenstein, Rashi Rungta, Kalyan Saladi, Alan Schelten, Ruan Silva, Eric Michael Smith, Ranjan Subramanian, Xiaoqing Ellen Tan, Binh Tang, Ross Taylor, Adina Williams, Jian Xiang Kuan, Puxin Xu, Zheng Yan, Iliyan Zarov, Yuchen Zhang, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Sergey Edunov, Thomas Scialom.
+1. **[Llama3](https://huggingface.co/docs/transformers/model_doc/llama3)** (from The FAIR team of Meta AI) released with the paper [The Llama 3 Herd of Models](https://huggingface.co/papers/2407.21783) by the Llama Team at Meta.
+1. **[Llama4](https://huggingface.co/docs/transformers/model_doc/llama4)** (from The FAIR team of Meta AI) released with the blog post [The Llama 4 herd: The beginning of a new era of natively multimodal AI innovation](https://ai.meta.com/blog/llama-4-multimodal-intelligence/) by the Llama Team at Meta.
1. **[LLaVa](https://huggingface.co/docs/transformers/model_doc/llava)** (from Microsoft Research & University of Wisconsin-Madison) released with the paper [Visual Instruction Tuning](https://huggingface.co/papers/2304.08485) by Haotian Liu, Chunyuan Li, Yuheng Li and Yong Jae Lee.
1. **[LLaVA-OneVision](https://huggingface.co/docs/transformers/model_doc/llava_onevision)** (from ByteDance & NTU & CUHK & HKUST) released with the paper [LLaVA-OneVision: Easy Visual Task Transfer](https://huggingface.co/papers/2408.03326) by Bo Li, Yuanhan Zhang, Dong Guo, Renrui Zhang, Feng Li, Hao Zhang, Kaichen Zhang, Yanwei Li, Ziwei Liu, Chunyuan Li
1. **[M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://huggingface.co/papers/2010.11125) by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
@@ -150,6 +153,7 @@
1. **Ultravox** (from Fixie.ai) released with the repository [fixie-ai/ultravox](https://github.com/fixie-ai/ultravox) by the Fixie.ai team.
1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://huggingface.co/papers/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang.
1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://huggingface.co/papers/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu.
+1. **[VaultGemma](https://huggingface.co/docs/transformers/main/model_doc/vaultgemma)** (from Google) released with the technical report [VaultGemma: A Differentially Private Gemma Model](https://services.google.com/fh/files/blogs/vaultgemma_tech_report.pdf) by the VaultGemma Google team.
1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://huggingface.co/papers/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://huggingface.co/papers/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick.
1. **[ViTMatte](https://huggingface.co/docs/transformers/model_doc/vitmatte)** (from HUST-VL) released with the paper [ViTMatte: Boosting Image Matting with Pretrained Plain Vision Transformers](https://huggingface.co/papers/2305.15272) by Jingfeng Yao, Xinggang Wang, Shusheng Yang, Baoyuan Wang.
diff --git a/package-lock.json b/package-lock.json
index 84ce26ecb..bf46bb136 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -1,12 +1,12 @@
{
"name": "@huggingface/transformers",
- "version": "3.7.1",
+ "version": "3.7.4",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
"name": "@huggingface/transformers",
- "version": "3.7.1",
+ "version": "3.7.4",
"license": "Apache-2.0",
"dependencies": {
"@huggingface/jinja": "^0.5.1",
diff --git a/package.json b/package.json
index 9ef449ab1..2b2320444 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
{
"name": "@huggingface/transformers",
- "version": "3.7.1",
+ "version": "3.7.4",
"description": "State-of-the-art Machine Learning for the web. Run 🤗 Transformers directly in your browser, with no need for a server!",
"main": "./src/transformers.js",
"types": "./types/transformers.d.ts",
diff --git a/src/backends/onnx.js b/src/backends/onnx.js
index a64f9d160..3e49ece23 100644
--- a/src/backends/onnx.js
+++ b/src/backends/onnx.js
@@ -160,6 +160,32 @@ export async function createInferenceSession(buffer_or_path, session_options, se
return session;
}
+
+/**
+ * Currently, Transformers.js doesn't support simultaneous execution of sessions in WASM/WebGPU.
+ * For this reason, we need to chain the inference calls (otherwise we get "Error: Session already started").
+ * @type {Promise<any>}
+ */
+let webInferenceChain = Promise.resolve();
+
+const IS_WEB_ENV = apis.IS_BROWSER_ENV || apis.IS_WEBWORKER_ENV;
+
+/**
+ * Run an inference session.
+ * @param {import('onnxruntime-common').InferenceSession} session The ONNX inference session.
+ * @param {Record<string, import('onnxruntime-common').Tensor>} ortFeed The input tensors.
+ * @returns {Promise<Record<string, import('onnxruntime-common').Tensor>>} The output tensors.
+ */
+export async function runInferenceSession(session, ortFeed) {
+ const run = () => session.run(ortFeed);
+ const output = await (IS_WEB_ENV
+ ? (webInferenceChain = webInferenceChain.then(run))
+ : run()
+ );
+ return output;
+}
+
+
/**
* Check if an object is an ONNX tensor.
* @param {any} x The object to check
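The `webInferenceChain` above centralizes the call-serialization trick that `models.js` and `ops/registry.js` previously each implemented locally. Sketched in isolation below, with a stand-in async task instead of a real `session.run` (purely for illustration), reassigning the chain guarantees the runs execute one at a time even when callers fire them concurrently:

```js
// Standalone illustration of the promise-chaining pattern; not library code.
let chain = Promise.resolve();

// `task` stands in for () => session.run(ortFeed), which must not overlap in WASM/WebGPU.
const runSerialized = (task) => (chain = chain.then(task));

const sleep = (ms, label) =>
  new Promise((resolve) => setTimeout(() => {
    console.log('finished:', label);
    resolve(label);
  }, ms));

// Both calls are issued at once, but 'first' always completes before 'second' starts.
const results = await Promise.all([
  runSerialized(() => sleep(50, 'first')),
  runSerialized(() => sleep(10, 'second')),
]);
console.log(results); // ['first', 'second']
```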
diff --git a/src/configs.js b/src/configs.js
index e32594fe6..95cedadfb 100644
--- a/src/configs.js
+++ b/src/configs.js
@@ -111,6 +111,7 @@ function getNormalizedConfig(config) {
mapping['hidden_size'] = 'hidden_size';
break;
case 'llama':
+ case 'llama4_text':
case 'arcee':
case 'lfm2':
case 'smollm3':
@@ -136,6 +137,7 @@ function getNormalizedConfig(config) {
case 'qwen3':
case 'gemma':
case 'gemma2':
+ case 'vaultgemma':
case 'gemma3_text':
case 'gemma3n_text':
case 'glm':
diff --git a/src/env.js b/src/env.js
index 13eb0f2f1..232d9a56d 100644
--- a/src/env.js
+++ b/src/env.js
@@ -26,7 +26,7 @@ import fs from 'node:fs';
import path from 'node:path';
import url from 'node:url';
-const VERSION = '3.7.1';
+const VERSION = '3.7.4';
// Check if various APIs are available (depends on environment)
const IS_BROWSER_ENV = typeof window !== "undefined" && typeof window.document !== "undefined";
diff --git a/src/models.js b/src/models.js
index 98e6b796f..5f0f6d359 100644
--- a/src/models.js
+++ b/src/models.js
@@ -48,6 +48,7 @@ import {
createInferenceSession,
isONNXTensor,
isONNXProxy,
+ runInferenceSession,
} from './backends/onnx.js';
import {
DATA_TYPES,
@@ -88,8 +89,6 @@ import {
MinNewTokensLengthLogitsProcessor,
TemperatureLogitsWarper,
- TopKLogitsWarper,
- TopPLogitsWarper,
ClassifierFreeGuidanceLogitsProcessor,
} from './generation/logits_process.js';
@@ -419,10 +418,6 @@ function validateInputs(session, inputs) {
return checkedInputs;
}
-// Currently, Transformers.js doesn't support simultaneous execution of sessions in WASM/WebGPU.
-// For this reason, we need to chain the inference calls (otherwise we get "Error: Session already started").
-let webInferenceChain = Promise.resolve();
-
/**
* Executes an InferenceSession using the specified inputs.
* NOTE: `inputs` must contain at least the input names of the model.
@@ -439,10 +434,7 @@ async function sessionRun(session, inputs) {
try {
// pass the original ort tensor
const ortFeed = Object.fromEntries(Object.entries(checkedInputs).map(([k, v]) => [k, v.ort_tensor]));
- const run = () => session.run(ortFeed);
- const output = await ((apis.IS_BROWSER_ENV || apis.IS_WEBWORKER_ENV)
- ? (webInferenceChain = webInferenceChain.then(run))
- : run());
+ const output = await runInferenceSession(session, ortFeed);
return replaceTensors(output);
} catch (e) {
// Error messages can be long (nested) and uninformative. For this reason,
@@ -1316,32 +1308,6 @@ export class PreTrainedModel extends Callable {
return this.configs?.generation_config ?? null;
}
- /**
- * This function returns a [`LogitsProcessorList`] list object that contains all relevant [`LogitsWarper`]
- * instances used for multinomial sampling.
- * @param {GenerationConfig} generation_config The generation config.
- * @returns {LogitsProcessorList} generation_config
- */
- _get_logits_warper(generation_config) {
-
- // instantiate warpers list
- const warpers = new LogitsProcessorList();
-
- if (generation_config.temperature !== null && generation_config.temperature !== 1.0) {
- warpers.push(new TemperatureLogitsWarper(generation_config.temperature));
- }
- if (generation_config.top_k !== null && generation_config.top_k !== 0) {
- // TODO: add min_tokens_to_keep
- warpers.push(new TopKLogitsWarper(generation_config.top_k));
- }
- if (generation_config.top_p !== null && generation_config.top_p < 1.0) {
- // TODO: add min_tokens_to_keep
- warpers.push(new TopPLogitsWarper(generation_config.top_p));
- }
-
- return warpers;
- }
-
/**
* @param {GenerationConfig} generation_config
* @param {number} input_ids_seq_length The starting sequence length for the input ids.
@@ -1461,6 +1427,19 @@ export class PreTrainedModel extends Callable {
processors.push(new ClassifierFreeGuidanceLogitsProcessor(generation_config.guidance_scale));
}
+ if (generation_config.do_sample) {
+ if (generation_config.temperature !== null && generation_config.temperature !== 1.0) {
+ processors.push(new TemperatureLogitsWarper(generation_config.temperature));
+ }
+ // TODO: Add TopPLogitsWarper and TopKLogitsWarper
+ // if (generation_config.top_k !== null && generation_config.top_k !== 0) {
+ // processors.push(new TopKLogitsWarper(generation_config.top_k));
+ // }
+ // if (generation_config.top_p !== null && generation_config.top_p < 1.0) {
+ // processors.push(new TopPLogitsWarper(generation_config.top_p));
+ // }
+ }
+
if (logits_processor !== null) {
processors.extend(logits_processor)
}
@@ -4594,6 +4573,13 @@ export class LlamaModel extends LlamaPreTrainedModel { }
export class LlamaForCausalLM extends LlamaPreTrainedModel { }
//////////////////////////////////////////////////
+
+//////////////////////////////////////////////////
+export class Llama4PreTrainedModel extends PreTrainedModel { }
+export class Llama4ForCausalLM extends Llama4PreTrainedModel { }
+//////////////////////////////////////////////////
+
+
//////////////////////////////////////////////////
// Arcee models
export class ArceePreTrainedModel extends PreTrainedModel { }
@@ -4710,6 +4696,12 @@ export class Gemma2Model extends Gemma2PreTrainedModel { }
export class Gemma2ForCausalLM extends Gemma2PreTrainedModel { }
//////////////////////////////////////////////////
+//////////////////////////////////////////////////
+// VaultGemma models
+export class VaultGemmaPreTrainedModel extends PreTrainedModel { }
+export class VaultGemmaModel extends VaultGemmaPreTrainedModel { }
+export class VaultGemmaForCausalLM extends VaultGemmaPreTrainedModel { }
+//////////////////////////////////////////////////
//////////////////////////////////////////////////
// Gemma3 models
@@ -5867,6 +5859,18 @@ export class Dinov2WithRegistersForImageClassification extends Dinov2WithRegiste
return new SequenceClassifierOutput(await super._call(model_inputs));
}
}
+//////////////////////////////////////////////////
+
+//////////////////////////////////////////////////
+export class DINOv3ViTPreTrainedModel extends PreTrainedModel { }
+export class DINOv3ViTModel extends DINOv3ViTPreTrainedModel { }
+//////////////////////////////////////////////////
+
+//////////////////////////////////////////////////
+export class DINOv3ConvNextPreTrainedModel extends PreTrainedModel { }
+export class DINOv3ConvNextModel extends DINOv3ConvNextPreTrainedModel { }
+//////////////////////////////////////////////////
+
//////////////////////////////////////////////////
export class GroundingDinoPreTrainedModel extends PreTrainedModel { }
export class GroundingDinoForObjectDetection extends GroundingDinoPreTrainedModel { }
@@ -7772,6 +7776,8 @@ const MODEL_MAPPING_NAMES_ENCODER_ONLY = new Map([
['convnextv2', ['ConvNextV2Model', ConvNextV2Model]],
['dinov2', ['Dinov2Model', Dinov2Model]],
['dinov2_with_registers', ['Dinov2WithRegistersModel', Dinov2WithRegistersModel]],
+ ['dinov3_vit', ['DINOv3ViTModel', DINOv3ViTModel]],
+ ['dinov3_convnext', ['DINOv3ConvNextModel', DINOv3ConvNextModel]],
['resnet', ['ResNetModel', ResNetModel]],
['swin', ['SwinModel', SwinModel]],
['swin2sr', ['Swin2SRModel', Swin2SRModel]],
@@ -7838,6 +7844,7 @@ const MODEL_MAPPING_NAMES_DECODER_ONLY = new Map([
['cohere', ['CohereModel', CohereModel]],
['gemma', ['GemmaModel', GemmaModel]],
['gemma2', ['Gemma2Model', Gemma2Model]],
+ ['vaultgemma', ['VaultGemmaModel', VaultGemmaModel]],
['gemma3_text', ['Gemma3Model', Gemma3Model]],
['helium', ['HeliumModel', HeliumModel]],
['glm', ['GlmModel', GlmModel]],
@@ -7935,6 +7942,7 @@ const MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = new Map([
['gpt_neox', ['GPTNeoXForCausalLM', GPTNeoXForCausalLM]],
['codegen', ['CodeGenForCausalLM', CodeGenForCausalLM]],
['llama', ['LlamaForCausalLM', LlamaForCausalLM]],
+ ['llama4_text', ['Llama4ForCausalLM', Llama4ForCausalLM]],
['arcee', ['ArceeForCausalLM', ArceeForCausalLM]],
['lfm2', ['Lfm2ForCausalLM', Lfm2ForCausalLM]],
['smollm3', ['SmolLM3ForCausalLM', SmolLM3ForCausalLM]],
@@ -7946,6 +7954,7 @@ const MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = new Map([
['cohere', ['CohereForCausalLM', CohereForCausalLM]],
['gemma', ['GemmaForCausalLM', GemmaForCausalLM]],
['gemma2', ['Gemma2ForCausalLM', Gemma2ForCausalLM]],
+ ['vaultgemma', ['VaultGemmaForCausalLM', VaultGemmaForCausalLM]],
['gemma3_text', ['Gemma3ForCausalLM', Gemma3ForCausalLM]],
['helium', ['HeliumForCausalLM', HeliumForCausalLM]],
['glm', ['GlmForCausalLM', GlmForCausalLM]],
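With `_get_logits_warper` removed, temperature scaling now only applies when `do_sample` is enabled, since the warper is appended inside `_get_logits_processor`. A hedged sketch of how that surfaces through the high-level API (the checkpoint below is an illustrative ONNX model from the Hub, not pinned by this PR):

```js
import { pipeline } from '@huggingface/transformers';

// Illustrative ONNX checkpoint; any Transformers.js-compatible text-generation model works.
const generator = await pipeline('text-generation', 'HuggingFaceTB/SmolLM2-135M-Instruct');

// do_sample: true activates the TemperatureLogitsWarper added above;
// top_k / top_p warpers remain TODO, as noted in the commented-out block.
const output = await generator('Once upon a time,', {
  max_new_tokens: 30,
  do_sample: true,
  temperature: 0.7,
});
console.log(output[0].generated_text);
```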
diff --git a/src/models/dinov3_vit/image_processing_dinov3_vit.js b/src/models/dinov3_vit/image_processing_dinov3_vit.js
new file mode 100644
index 000000000..d907f810b
--- /dev/null
+++ b/src/models/dinov3_vit/image_processing_dinov3_vit.js
@@ -0,0 +1,6 @@
+
+import {
+ ImageProcessor,
+} from "../../base/image_processors_utils.js";
+
+export class DINOv3ViTImageProcessor extends ImageProcessor { }
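Since the new DINOv3 classes are backbone-only (no task heads are wired up in this PR), the natural use is feature extraction. A sketch under the assumption that an ONNX-converted DINOv3 checkpoint is available on the Hub; the repo ID below is a placeholder:

```js
import { AutoImageProcessor, AutoModel, RawImage } from '@huggingface/transformers';

// Placeholder repo ID; substitute a real ONNX-converted DINOv3 checkpoint.
const model_id = 'onnx-community/dinov3-vits16-pretrain-ONNX';

// DINOv3ViTImageProcessor and DINOv3ViTModel are resolved via the auto classes.
const processor = await AutoImageProcessor.from_pretrained(model_id);
const model = await AutoModel.from_pretrained(model_id);

// Load an image from a local path or URL and preprocess it.
const image = await RawImage.read('path/to/image.jpg');
const inputs = await processor(image);

// The ViT backbone returns per-patch embeddings in last_hidden_state.
const { last_hidden_state } = await model(inputs);
console.log(last_hidden_state.dims);
```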
diff --git a/src/models/image_processors.js b/src/models/image_processors.js
index 95f275893..57c4b158a 100644
--- a/src/models/image_processors.js
+++ b/src/models/image_processors.js
@@ -6,6 +6,7 @@ export * from './clip/image_processing_clip.js'
export * from './convnext/image_processing_convnext.js'
export * from './deit/image_processing_deit.js'
export * from './detr/image_processing_detr.js'
+export * from './dinov3_vit/image_processing_dinov3_vit.js'
export * from './donut/image_processing_donut.js'
export * from './dpt/image_processing_dpt.js'
export * from './efficientnet/image_processing_efficientnet.js'
diff --git a/src/ops/registry.js b/src/ops/registry.js
index 4f2179bec..1188f486f 100644
--- a/src/ops/registry.js
+++ b/src/ops/registry.js
@@ -1,8 +1,6 @@
-import { createInferenceSession, isONNXProxy } from "../backends/onnx.js";
+import { createInferenceSession, runInferenceSession, isONNXProxy } from "../backends/onnx.js";
import { Tensor } from "../utils/tensor.js";
-import { apis } from "../env.js";
-const IS_WEB_ENV = apis.IS_BROWSER_ENV || apis.IS_WEBWORKER_ENV;
/**
* Asynchronously creates a wrapper function for running an ONNX inference session.
*
@@ -19,16 +17,10 @@ const wrap = async (session_bytes, session_options, names) => {
new Uint8Array(session_bytes), session_options,
);
- /** @type {Promise<any>} */
- let chain = Promise.resolve();
-
return /** @type {any} */(async (/** @type {Record<string, Tensor>} */ inputs) => {
const proxied = isONNXProxy();
const ortFeed = Object.fromEntries(Object.entries(inputs).map(([k, v]) => [k, (proxied ? v.clone() : v).ort_tensor]));
-
- // When running in-browser via WASM, we need to chain calls to session.run to avoid "Error: Session already started"
- const outputs = await (chain = IS_WEB_ENV ? chain.then(() => session.run(ortFeed)) : session.run(ortFeed));
-
+ const outputs = await runInferenceSession(session, ortFeed);
if (Array.isArray(names)) {
return names.map((n) => new Tensor(outputs[n]));
} else {
diff --git a/src/utils/hub.js b/src/utils/hub.js
index 56c2a643c..a56d1bf7a 100755
--- a/src/utils/hub.js
+++ b/src/utils/hub.js
@@ -625,7 +625,16 @@ export async function getModelFile(path_or_repo_id, filename, fatal = true, opti
) {
if (!result) {
// We haven't yet read the response body, so we need to do so now.
- await cache.put(cacheKey, /** @type {Response} */(response), options.progress_callback);
+ // Ensure progress updates include consistent metadata.
+ const wrapped_progress = options.progress_callback
+ ? (data) => dispatchCallback(options.progress_callback, {
+ status: 'progress',
+ name: path_or_repo_id,
+ file: filename,
+ ...data,
+ })
+ : undefined;
+ await cache.put(cacheKey, /** @type {Response} */(response), wrapped_progress);
} else {
// NOTE: We use `new Response(buffer, ...)` instead of `response.clone()` to handle LFS files
await cache.put(cacheKey, new Response(result, {
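The wrapped callback above means progress events fired while the response body is written into the browser cache now carry the same `status`, `name`, and `file` fields as the events emitted during the network read, so a single callback can handle both. A small sketch of a consumer under that assumption:

```js
import { pipeline } from '@huggingface/transformers';

// Every 'progress' event now consistently reports which repo and file it belongs to.
const progress_callback = (data) => {
  if (data.status === 'progress') {
    console.log(`${data.name} / ${data.file}: ${Math.round(data.progress)}%`);
  }
};

// Illustrative model ID; any Hub model supported by Transformers.js works here.
const extractor = await pipeline('feature-extraction', 'Xenova/all-MiniLM-L6-v2', {
  progress_callback,
});
```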