From 9c68e9e3ce174e9efa27556e2f2bdb632a686df9 Mon Sep 17 00:00:00 2001 From: Yoach Lacombe <52246514+ylacombe@users.noreply.github.com> Date: Thu, 18 Jan 2024 13:37:34 +0000 Subject: [PATCH] Add new meta w2v2-conformer BERT-like model (#28165) * first commit * correct default value non causal * update config and modeling code * update converting checkpoint * clean modeling and fix tests * make style * add new config parameters to docstring * fix copied from statements * Apply suggestions from code review Co-authored-by: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com> * make position_embeddings_type docstrings clearer * clean converting script * remove function not used * clean modeling file * apply suggestion for test file + add convert script to not_doctested * modify tests according to review - cleaner logic and more tests * Apply nit suggestions from code review Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * add checker of valid position embeddings type * instantiate new layer norm layer with the right eps * fix freeze_feature_encoder since it can be None in some cases * add test same output in convert script * restore wav2vec2conformer and add new model * create processor and FE + clean * add new model code * fix convert script and set default config parameters * correct model id paths * make style * make fix-copies and cleaning files * fix copied from statements * complete .md and fixe copies * clean convert script argument defaults * fix config parameters docstrings * fix config docstring * add copied from and enrich FE tests * fix copied from and repo-consistency * add autotokenizer * make test input length shorter and change docstring code * fix docstrings and copied from * add add_adapter to ASR training example * make testing of adapters more robust * adapt to multi adapter layers * refactor input_values->input_features and remove w2v2-bert feature extractor * remove pretraining model * remove depreciated features and useless lines * add copied from and ignore statements to modeling tests * remove pretraining model #2 * change import in convert script * change default in convert script * update readme and remove useless line * Update tests/models/wav2vec2_bert/test_processor_wav2vec2_bert.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * refactor BERT to Bert for consistency * remove useless ignore copy statement * add persistent to buffer in rotary * add eps in LayerNorm init and remove copied from * add adapter activation parameters and add copied from statements * Fix copied statements and add unitest.skip reasons * add copied statement in test_processor * refactor processor * make style * replace numpy random by torch rand * remove expected output CTC * improve converting script with processor class * Apply suggestions from code review Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * remove gumbel class * remove tests related to previously deleted class * Update src/transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * correct typos * remove uused parameters * update processor to takes both text and audio * update checkpoints * update expected output and add ctc expected output * add label_attention_mask * replace pt with np in processor tests * fix typo * revert to behaviour with labels_attention_mask --------- Co-authored-by: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com> Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> --- README.md | 1 + README_es.md | 1 + README_hd.md | 1 + README_ja.md | 1 + README_ko.md | 1 + README_zh-hans.md | 1 + README_zh-hant.md | 1 + docs/source/en/_toctree.yml | 2 + docs/source/en/index.md | 1 + docs/source/en/model_doc/wav2vec2-bert.md | 90 + docs/source/en/tasks/asr.md | 2 +- docs/source/en/tasks/audio_classification.md | 2 +- .../run_speech_recognition_ctc.py | 8 + src/transformers/__init__.py | 30 + src/transformers/models/__init__.py | 1 + .../models/auto/configuration_auto.py | 3 + .../models/auto/feature_extraction_auto.py | 1 + src/transformers/models/auto/modeling_auto.py | 5 + .../models/auto/processing_auto.py | 1 + .../models/auto/tokenization_auto.py | 1 + .../models/wav2vec2_bert/__init__.py | 70 + .../configuration_wav2vec2_bert.py | 314 ++++ .../convert_wav2vec2_seamless_checkpoint.py | 218 +++ .../wav2vec2_bert/modeling_wav2vec2_bert.py | 1667 +++++++++++++++++ .../wav2vec2_bert/processing_wav2vec2_bert.py | 145 ++ src/transformers/utils/dummy_pt_objects.py | 45 + tests/models/wav2vec2_bert/__init__.py | 0 .../test_modeling_wav2vec2_bert.py | 913 +++++++++ .../test_processor_wav2vec2_bert.py | 156 ++ utils/check_docstrings.py | 1 + utils/not_doctested.txt | 1 + 31 files changed, 3682 insertions(+), 2 deletions(-) create mode 100644 docs/source/en/model_doc/wav2vec2-bert.md create mode 100644 src/transformers/models/wav2vec2_bert/__init__.py create mode 100644 src/transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py create mode 100644 src/transformers/models/wav2vec2_bert/convert_wav2vec2_seamless_checkpoint.py create mode 100644 src/transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py create mode 100644 src/transformers/models/wav2vec2_bert/processing_wav2vec2_bert.py create mode 100644 tests/models/wav2vec2_bert/__init__.py create mode 100644 tests/models/wav2vec2_bert/test_modeling_wav2vec2_bert.py create mode 100644 tests/models/wav2vec2_bert/test_processor_wav2vec2_bert.py diff --git a/README.md b/README.md index 806738d64edc87..b67706aee9bd6d 100644 --- a/README.md +++ b/README.md @@ -519,6 +519,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[VITS](https://huggingface.co/docs/transformers/model_doc/vits)** (from Kakao Enterprise) released with the paper [Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech](https://arxiv.org/abs/2106.06103) by Jaehyeon Kim, Jungil Kong, Juhee Son. 1. **[ViViT](https://huggingface.co/docs/transformers/model_doc/vivit)** (from Google Research) released with the paper [ViViT: A Video Vision Transformer](https://arxiv.org/abs/2103.15691) by Anurag Arnab, Mostafa Dehghani, Georg Heigold, Chen Sun, Mario Lučić, Cordelia Schmid. 1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli. +1. **[Wav2Vec2-BERT](https://huggingface.co/docs/transformers/main/model_doc/wav2vec2-bert)** (from Meta AI) released with the paper [Seamless: Multilingual Expressive and Streaming Speech Translation](https://ai.meta.com/research/publications/seamless-multilingual-expressive-and-streaming-speech-translation/) by the Seamless Communication team. 1. **[Wav2Vec2-Conformer](https://huggingface.co/docs/transformers/model_doc/wav2vec2-conformer)** (from Facebook AI) released with the paper [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino. 1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme)** (from Facebook AI) released with the paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli. 1. **[WavLM](https://huggingface.co/docs/transformers/model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei. diff --git a/README_es.md b/README_es.md index c5aa57f226ea66..6ac2c7e0b238d3 100644 --- a/README_es.md +++ b/README_es.md @@ -494,6 +494,7 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[VITS](https://huggingface.co/docs/transformers/model_doc/vits)** (from Kakao Enterprise) released with the paper [Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech](https://arxiv.org/abs/2106.06103) by Jaehyeon Kim, Jungil Kong, Juhee Son. 1. **[ViViT](https://huggingface.co/docs/transformers/model_doc/vivit)** (from Google Research) released with the paper [ViViT: A Video Vision Transformer](https://arxiv.org/abs/2103.15691) by Anurag Arnab, Mostafa Dehghani, Georg Heigold, Chen Sun, Mario Lučić, Cordelia Schmid. 1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli. +1. **[Wav2Vec2-BERT](https://huggingface.co/docs/transformers/main/model_doc/wav2vec2-bert)** (from Meta AI) released with the paper [Seamless: Multilingual Expressive and Streaming Speech Translation](https://ai.meta.com/research/publications/seamless-multilingual-expressive-and-streaming-speech-translation/) by the Seamless Communication team. 1. **[Wav2Vec2-Conformer](https://huggingface.co/docs/transformers/model_doc/wav2vec2-conformer)** (from Facebook AI) released with the paper [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino. 1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme)** (from Facebook AI) released with the paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli. 1. **[WavLM](https://huggingface.co/docs/transformers/model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei. diff --git a/README_hd.md b/README_hd.md index 5924c31627879d..c78dbae1afae7e 100644 --- a/README_hd.md +++ b/README_hd.md @@ -468,6 +468,7 @@ conda install conda-forge::transformers 1. **[VITS](https://huggingface.co/docs/transformers/model_doc/vits)** (Kakao Enterprise से) Jaehyeon Kim, Jungil Kong, Juhee Son. द्वाराअनुसंधान पत्र [Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech](https://arxiv.org/abs/2106.06103) के साथ जारी किया गया 1. **[ViViT](https://huggingface.co/docs/transformers/model_doc/vivit)** (from Google Research) released with the paper [ViViT: A Video Vision Transformer](https://arxiv.org/abs/2103.15691) by Anurag Arnab, Mostafa Dehghani, Georg Heigold, Chen Sun, Mario Lučić, Cordelia Schmid. 1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (फेसबुक एआई से) साथ में पेपर [wav2vec 2.0: ए फ्रेमवर्क फॉर सेल्फ-सुपरवाइज्ड लर्निंग ऑफ स्पीच रिप्रेजेंटेशन](https://arxiv.org/abs/2006.11477) एलेक्सी बेवस्की, हेनरी झोउ, अब्देलरहमान मोहम्मद, माइकल औली द्वारा। +1. **[Wav2Vec2-BERT](https://huggingface.co/docs/transformers/main/model_doc/wav2vec2-bert)** (from Meta AI) released with the paper [Seamless: Multilingual Expressive and Streaming Speech Translation](https://ai.meta.com/research/publications/seamless-multilingual-expressive-and-streaming-speech-translation/) by the Seamless Communication team. 1. **[Wav2Vec2-Conformer](https://huggingface.co/docs/transformers/model_doc/wav2vec2-conformer)** (Facebook AI से) साथ वाला पेपर [FAIRSEQ S2T: FAIRSEQ के साथ फास्ट स्पीच-टू-टेक्स्ट मॉडलिंग ](https://arxiv.org/abs/2010.05171) चांगहान वांग, यूं तांग, जुताई मा, ऐनी वू, सरव्या पोपुरी, दिमित्रो ओखोनको, जुआन पिनो द्वारा पोस्ट किया गया। 1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme)** (Facebook AI से) साथ वाला पेपर [सरल और प्रभावी जीरो-शॉट क्रॉस-लिंगुअल फोनेम रिकॉग्निशन](https://arxiv.org/abs/2109.11680) कियानटोंग जू, एलेक्सी बाएव्स्की, माइकल औली द्वारा। 1. **[WavLM](https://huggingface.co/docs/transformers/model_doc/wavlm)** (माइक्रोसॉफ्ट रिसर्च से) पेपर के साथ जारी किया गया [WavLM: फुल स्टैक के लिए बड़े पैमाने पर स्व-पर्यवेक्षित पूर्व-प्रशिक्षण स्पीच प्रोसेसिंग](https://arxiv.org/abs/2110.13900) सानयुआन चेन, चेंगयी वांग, झेंगयांग चेन, यू वू, शुजी लियू, ज़ुओ चेन, जिन्यु ली, नाओयुकी कांडा, ताकुया योशियोका, ज़िओंग जिओ, जियान वू, लॉन्ग झोउ, शुओ रेन, यानमिन कियान, याओ कियान, जियान वू, माइकल ज़ेंग, फुरु वेई। diff --git a/README_ja.md b/README_ja.md index e54ba07359971c..2aed27e7c2d9ca 100644 --- a/README_ja.md +++ b/README_ja.md @@ -528,6 +528,7 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 1. **[VITS](https://huggingface.co/docs/transformers/model_doc/vits)** (Kakao Enterprise から) Jaehyeon Kim, Jungil Kong, Juhee Son. から公開された研究論文 [Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech](https://arxiv.org/abs/2106.06103) 1. **[ViViT](https://huggingface.co/docs/transformers/model_doc/vivit)** (from Google Research) released with the paper [ViViT: A Video Vision Transformer](https://arxiv.org/abs/2103.15691) by Anurag Arnab, Mostafa Dehghani, Georg Heigold, Chen Sun, Mario Lučić, Cordelia Schmid. 1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (Facebook AI から) Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli から公開された研究論文: [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) +1. **[Wav2Vec2-BERT](https://huggingface.co/docs/transformers/main/model_doc/wav2vec2-bert)** (from Meta AI) released with the paper [Seamless: Multilingual Expressive and Streaming Speech Translation](https://ai.meta.com/research/publications/seamless-multilingual-expressive-and-streaming-speech-translation/) by the Seamless Communication team. 1. **[Wav2Vec2-Conformer](https://huggingface.co/docs/transformers/model_doc/wav2vec2-conformer)** (Facebook AI から) Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino から公開された研究論文: [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) 1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme)** (Facebook AI から) Qiantong Xu, Alexei Baevski, Michael Auli から公開された研究論文: [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) 1. **[WavLM](https://huggingface.co/docs/transformers/model_doc/wavlm)** (Microsoft Research から) Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei から公開された研究論文: [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) diff --git a/README_ko.md b/README_ko.md index 35d838346c7087..33ea7c68f05437 100644 --- a/README_ko.md +++ b/README_ko.md @@ -443,6 +443,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[VITS](https://huggingface.co/docs/transformers/model_doc/vits)** (Kakao Enterprise 에서 제공)은 Jaehyeon Kim, Jungil Kong, Juhee Son.의 [Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech](https://arxiv.org/abs/2106.06103)논문과 함께 발표했습니다. 1. **[ViViT](https://huggingface.co/docs/transformers/model_doc/vivit)** (from Google Research) released with the paper [ViViT: A Video Vision Transformer](https://arxiv.org/abs/2103.15691) by Anurag Arnab, Mostafa Dehghani, Georg Heigold, Chen Sun, Mario Lučić, Cordelia Schmid. 1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (Facebook AI 에서) Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli 의 [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) 논문과 함께 발표했습니다. +1. **[Wav2Vec2-BERT](https://huggingface.co/docs/transformers/main/model_doc/wav2vec2-bert)** (from Meta AI) released with the paper [Seamless: Multilingual Expressive and Streaming Speech Translation](https://ai.meta.com/research/publications/seamless-multilingual-expressive-and-streaming-speech-translation/) by the Seamless Communication team. 1. **[Wav2Vec2-Conformer](https://huggingface.co/docs/transformers/model_doc/wav2vec2-conformer)** (Facebook AI 에서) Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino 의 [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) 논문과 함께 발표했습니다. 1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme)** (Facebook AI 에서) Qiantong Xu, Alexei Baevski, Michael Auli 의 [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) 논문과 함께 발표했습니다. 1. **[WavLM](https://huggingface.co/docs/transformers/model_doc/wavlm)** (Microsoft Research 에서) Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei 의 [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) 논문과 함께 발표했습니다. diff --git a/README_zh-hans.md b/README_zh-hans.md index 3999da2bc15d73..a0acb24e59b274 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -467,6 +467,7 @@ conda install conda-forge::transformers 1. **[VITS](https://huggingface.co/docs/transformers/model_doc/vits)** (来自 Kakao Enterprise) 伴随论文 [Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech](https://arxiv.org/abs/2106.06103) 由 Jaehyeon Kim, Jungil Kong, Juhee Son 发布。 1. **[ViViT](https://huggingface.co/docs/transformers/model_doc/vivit)** (来自 Google Research) released with the paper [ViViT: A Video Vision Transformer](https://arxiv.org/abs/2103.15691) 由 Anurag Arnab, Mostafa Dehghani, Georg Heigold, Chen Sun, Mario Lučić, Cordelia Schmid. 1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (来自 Facebook AI) 伴随论文 [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) 由 Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli 发布。 +1. **[Wav2Vec2-BERT](https://huggingface.co/docs/transformers/main/model_doc/wav2vec2-bert)** (from Meta AI) released with the paper [Seamless: Multilingual Expressive and Streaming Speech Translation](https://ai.meta.com/research/publications/seamless-multilingual-expressive-and-streaming-speech-translation/) by the Seamless Communication team. 1. **[Wav2Vec2-Conformer](https://huggingface.co/docs/transformers/model_doc/wav2vec2-conformer)** (来自 Facebook AI) 伴随论文 [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) 由 Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino 发布。 1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme)** (来自 Facebook AI) 伴随论文 [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) 由 Qiantong Xu, Alexei Baevski, Michael Auli 发布。 1. **[WavLM](https://huggingface.co/docs/transformers/model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei. diff --git a/README_zh-hant.md b/README_zh-hant.md index 83ff0b153273c6..aa0eaa174a2b30 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -479,6 +479,7 @@ conda install conda-forge::transformers 1. **[VITS](https://huggingface.co/docs/transformers/model_doc/vits)** (from Kakao Enterprise) released with the paper [Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech](https://arxiv.org/abs/2106.06103) by Jaehyeon Kim, Jungil Kong, Juhee Son. 1. **[ViViT](https://huggingface.co/docs/transformers/model_doc/vivit)** (from Google Research) released with the paper [ViViT: A Video Vision Transformer](https://arxiv.org/abs/2103.15691) by Anurag Arnab, Mostafa Dehghani, Georg Heigold, Chen Sun, Mario Lučić, Cordelia Schmid. 1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli. +1. **[Wav2Vec2-BERT](https://huggingface.co/docs/transformers/main/model_doc/wav2vec2-bert)** (from Meta AI) released with the paper [Seamless: Multilingual Expressive and Streaming Speech Translation](https://ai.meta.com/research/publications/seamless-multilingual-expressive-and-streaming-speech-translation/) by the Seamless Communication team. 1. **[Wav2Vec2-Conformer](https://huggingface.co/docs/transformers/model_doc/wav2vec2-conformer)** (from Facebook AI) released with the paper [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino. 1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme)** (from Facebook AI) released with the paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli. 1. **[WavLM](https://huggingface.co/docs/transformers/model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei. diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 02c13c277645ae..2f973b4c436a09 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -650,6 +650,8 @@ title: VITS - local: model_doc/wav2vec2 title: Wav2Vec2 + - local: model_doc/wav2vec2-bert + title: Wav2Vec2-BERT - local: model_doc/wav2vec2-conformer title: Wav2Vec2-Conformer - local: model_doc/wav2vec2_phoneme diff --git a/docs/source/en/index.md b/docs/source/en/index.md index 1f43ec7d54027d..6fc472cc040451 100644 --- a/docs/source/en/index.md +++ b/docs/source/en/index.md @@ -296,6 +296,7 @@ Flax), PyTorch, and/or TensorFlow. | [VITS](model_doc/vits) | ✅ | ❌ | ❌ | | [ViViT](model_doc/vivit) | ✅ | ❌ | ❌ | | [Wav2Vec2](model_doc/wav2vec2) | ✅ | ✅ | ✅ | +| [Wav2Vec2-BERT](model_doc/wav2vec2-bert) | ✅ | ❌ | ❌ | | [Wav2Vec2-Conformer](model_doc/wav2vec2-conformer) | ✅ | ❌ | ❌ | | [Wav2Vec2Phoneme](model_doc/wav2vec2_phoneme) | ✅ | ✅ | ✅ | | [WavLM](model_doc/wavlm) | ✅ | ❌ | ❌ | diff --git a/docs/source/en/model_doc/wav2vec2-bert.md b/docs/source/en/model_doc/wav2vec2-bert.md new file mode 100644 index 00000000000000..6514133330a9d4 --- /dev/null +++ b/docs/source/en/model_doc/wav2vec2-bert.md @@ -0,0 +1,90 @@ + + +# Wav2Vec2-BERT + +## Overview + +The Wav2Vec2-BERT model was proposed in [Seamless: Multilingual Expressive and Streaming Speech Translation](https://ai.meta.com/research/publications/seamless-multilingual-expressive-and-streaming-speech-translation/) by the Seamless Communication team from Meta AI. + +This model was pre-trained on 4.5M hours of unlabeled audio data covering more than 143 languages. It requires finetuning to be used for downstream tasks such as Automatic Speech Recognition (ASR), or Audio Classification. + +The official results of the model can be found in Section 3.2.1 of the paper. + +The abstract from the paper is the following: + +*Recent advancements in automatic speech translation have dramatically expanded language coverage, improved multimodal capabilities, and enabled a wide range of tasks and functionalities. That said, large-scale automatic speech translation systems today lack key features that help machine-mediated communication feel seamless when compared to human-to-human dialogue. In this work, we introduce a family of models that enable end-to-end expressive and multilingual translations in a streaming fashion. First, we contribute an improved version of the massively multilingual and multimodal SeamlessM4T model—SeamlessM4T v2. This newer model, incorporating an updated UnitY2 framework, was trained on more low-resource language data. The expanded version of SeamlessAlign adds 114,800 hours of automatically aligned data for a total of 76 languages. SeamlessM4T v2 provides the foundation on which our two newest models, SeamlessExpressive and SeamlessStreaming, are initiated. SeamlessExpressive enables translation that preserves vocal styles and prosody. Compared to previous efforts in expressive speech research, our work addresses certain underexplored aspects of prosody, such as speech rate and pauses, while also preserving the style of one’s voice. As for SeamlessStreaming, our model leverages the Efficient Monotonic Multihead Attention (EMMA) mechanism to generate low-latency target translations without waiting for complete source utterances. As the first of its kind, SeamlessStreaming enables simultaneous speech-to-speech/text translation for multiple source and target languages. To understand the performance of these models, we combined novel and modified versions of existing automatic metrics to evaluate prosody, latency, and robustness. For human evaluations, we adapted existing protocols tailored for measuring the most relevant attributes in the preservation of meaning, naturalness, and expressivity. To ensure that our models can be used safely and responsibly, we implemented the first known red-teaming effort for multimodal machine translation, a system for the detection and mitigation of added toxicity, a systematic evaluation of gender bias, and an inaudible localized watermarking mechanism designed to dampen the impact of deepfakes. Consequently, we bring major components from SeamlessExpressive and SeamlessStreaming together to form Seamless, the first publicly available system that unlocks expressive cross-lingual communication in real-time. In sum, Seamless gives us a pivotal look at the technical foundation needed to turn the Universal Speech Translator from a science fiction concept into a real-world technology. Finally, contributions in this work—including models, code, and a watermark detector—are publicly released and accessible at the link below.* + +This model was contributed by [ylacombe](https://huggingface.co/ylacombe). The original code can be found [here](https://github.com/facebookresearch/seamless_communication). + +## Usage tips + +- Wav2Vec2-BERT follows the same architecture as Wav2Vec2-Conformer, but employs a causal depthwise convolutional layer and uses as input a mel-spectrogram representation of the audio instead of the raw waveform. +- Wav2Vec2-BERT can use either no relative position embeddings, Shaw-like position embeddings, Transformer-XL-like position embeddings, or + rotary position embeddings by setting the correct `config.position_embeddings_type`. +- Wav2Vec2-BERT also introduces a Conformer-based adapter network instead of a simple convolutional network. + +## Resources + + + +- [`Wav2Vec2BertForCTC`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/speech-recognition). +- You can also adapt these notebooks on [how to finetune a speech recognition model in English](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/speech_recognition.ipynb), and [how to finetune a speech recognition model in any language](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/multi_lingual_speech_recognition.ipynb). + + + +- [`Wav2Vec2BertForSequenceClassification`] can be used by adapting this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/audio-classification). +- See also: [Audio classification task guide](../tasks/audio_classification) + + +## Wav2Vec2BertConfig + +[[autodoc]] Wav2Vec2BertConfig + +## Wav2Vec2BertProcessor + +[[autodoc]] Wav2Vec2BertProcessor + - __call__ + - pad + - from_pretrained + - save_pretrained + - batch_decode + - decode + +## Wav2Vec2BertModel + +[[autodoc]] Wav2Vec2BertModel + - forward + +## Wav2Vec2BertForCTC + +[[autodoc]] Wav2Vec2BertForCTC + - forward + +## Wav2Vec2BertForSequenceClassification + +[[autodoc]] Wav2Vec2BertForSequenceClassification + - forward + +## Wav2Vec2BertForAudioFrameClassification + +[[autodoc]] Wav2Vec2BertForAudioFrameClassification + - forward + +## Wav2Vec2BertForXVector + +[[autodoc]] Wav2Vec2BertForXVector + - forward diff --git a/docs/source/en/tasks/asr.md b/docs/source/en/tasks/asr.md index d01269ba60a696..737460ed297bcf 100644 --- a/docs/source/en/tasks/asr.md +++ b/docs/source/en/tasks/asr.md @@ -32,7 +32,7 @@ The task illustrated in this tutorial is supported by the following model archit -[Data2VecAudio](../model_doc/data2vec-audio), [Hubert](../model_doc/hubert), [M-CTC-T](../model_doc/mctct), [SEW](../model_doc/sew), [SEW-D](../model_doc/sew-d), [UniSpeech](../model_doc/unispeech), [UniSpeechSat](../model_doc/unispeech-sat), [Wav2Vec2](../model_doc/wav2vec2), [Wav2Vec2-Conformer](../model_doc/wav2vec2-conformer), [WavLM](../model_doc/wavlm) +[Data2VecAudio](../model_doc/data2vec-audio), [Hubert](../model_doc/hubert), [M-CTC-T](../model_doc/mctct), [SEW](../model_doc/sew), [SEW-D](../model_doc/sew-d), [UniSpeech](../model_doc/unispeech), [UniSpeechSat](../model_doc/unispeech-sat), [Wav2Vec2](../model_doc/wav2vec2), [Wav2Vec2-BERT](../model_doc/wav2vec2-bert), [Wav2Vec2-Conformer](../model_doc/wav2vec2-conformer), [WavLM](../model_doc/wavlm) diff --git a/docs/source/en/tasks/audio_classification.md b/docs/source/en/tasks/audio_classification.md index 743a797fc53fa8..678af90c4fa079 100644 --- a/docs/source/en/tasks/audio_classification.md +++ b/docs/source/en/tasks/audio_classification.md @@ -32,7 +32,7 @@ The task illustrated in this tutorial is supported by the following model archit -[Audio Spectrogram Transformer](../model_doc/audio-spectrogram-transformer), [Data2VecAudio](../model_doc/data2vec-audio), [Hubert](../model_doc/hubert), [SEW](../model_doc/sew), [SEW-D](../model_doc/sew-d), [UniSpeech](../model_doc/unispeech), [UniSpeechSat](../model_doc/unispeech-sat), [Wav2Vec2](../model_doc/wav2vec2), [Wav2Vec2-Conformer](../model_doc/wav2vec2-conformer), [WavLM](../model_doc/wavlm), [Whisper](../model_doc/whisper) +[Audio Spectrogram Transformer](../model_doc/audio-spectrogram-transformer), [Data2VecAudio](../model_doc/data2vec-audio), [Hubert](../model_doc/hubert), [SEW](../model_doc/sew), [SEW-D](../model_doc/sew-d), [UniSpeech](../model_doc/unispeech), [UniSpeechSat](../model_doc/unispeech-sat), [Wav2Vec2](../model_doc/wav2vec2), [Wav2Vec2-BERT](../model_doc/wav2vec2-bert), [Wav2Vec2-Conformer](../model_doc/wav2vec2-conformer), [WavLM](../model_doc/wavlm), [Whisper](../model_doc/whisper) diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py b/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py index 1c658904e71e30..3ca9a2c6f44d3e 100755 --- a/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py +++ b/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py @@ -132,6 +132,13 @@ class ModelArguments: ctc_loss_reduction: Optional[str] = field( default="mean", metadata={"help": "The way the ctc loss should be reduced. Should be one of 'mean' or 'sum'."} ) + add_adapter: Optional[bool] = field( + default=False, + metadata={ + "help": "Whether a convolutional attention network should be stacked on top of the Wav2Vec2BERT Encoder. Can be very" + "useful to downsample the output length." + }, + ) @dataclass @@ -602,6 +609,7 @@ def remove_special_characters(batch): "pad_token_id": tokenizer.pad_token_id, "vocab_size": len(tokenizer), "activation_dropout": model_args.activation_dropout, + "add_adapter": model_args.add_adapter, } ) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 78cef80fea6f42..8a8609b28f17a6 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -914,6 +914,11 @@ "Wav2Vec2Processor", "Wav2Vec2Tokenizer", ], + "models.wav2vec2_bert": [ + "WAV2VEC2_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP", + "Wav2Vec2BertConfig", + "Wav2Vec2BertProcessor", + ], "models.wav2vec2_conformer": [ "WAV2VEC2_CONFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "Wav2Vec2ConformerConfig", @@ -3515,6 +3520,17 @@ "Wav2Vec2PreTrainedModel", ] ) + _import_structure["models.wav2vec2_bert"].extend( + [ + "WAV2VEC2_BERT_PRETRAINED_MODEL_ARCHIVE_LIST", + "Wav2Vec2BertForAudioFrameClassification", + "Wav2Vec2BertForCTC", + "Wav2Vec2BertForSequenceClassification", + "Wav2Vec2BertForXVector", + "Wav2Vec2BertModel", + "Wav2Vec2BertPreTrainedModel", + ] + ) _import_structure["models.wav2vec2_conformer"].extend( [ "WAV2VEC2_CONFORMER_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -5617,6 +5633,11 @@ Wav2Vec2Processor, Wav2Vec2Tokenizer, ) + from .models.wav2vec2_bert import ( + WAV2VEC2_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, + Wav2Vec2BertConfig, + Wav2Vec2BertProcessor, + ) from .models.wav2vec2_conformer import ( WAV2VEC2_CONFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, Wav2Vec2ConformerConfig, @@ -7821,6 +7842,15 @@ Wav2Vec2Model, Wav2Vec2PreTrainedModel, ) + from .models.wav2vec2_bert import ( + WAV2VEC2_BERT_PRETRAINED_MODEL_ARCHIVE_LIST, + Wav2Vec2BertForAudioFrameClassification, + Wav2Vec2BertForCTC, + Wav2Vec2BertForSequenceClassification, + Wav2Vec2BertForXVector, + Wav2Vec2BertModel, + Wav2Vec2BertPreTrainedModel, + ) from .models.wav2vec2_conformer import ( WAV2VEC2_CONFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, Wav2Vec2ConformerForAudioFrameClassification, diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 3fea0edb9a921e..09f5bb543ab043 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -235,6 +235,7 @@ vits, vivit, wav2vec2, + wav2vec2_bert, wav2vec2_conformer, wav2vec2_phoneme, wav2vec2_with_lm, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index d439e5bb611718..060d3057f518fa 100755 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -246,6 +246,7 @@ ("vits", "VitsConfig"), ("vivit", "VivitConfig"), ("wav2vec2", "Wav2Vec2Config"), + ("wav2vec2-bert", "Wav2Vec2BertConfig"), ("wav2vec2-conformer", "Wav2Vec2ConformerConfig"), ("wavlm", "WavLMConfig"), ("whisper", "WhisperConfig"), @@ -459,6 +460,7 @@ ("vits", "VITS_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("vivit", "VIVIT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("wav2vec2", "WAV_2_VEC_2_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("wav2vec2-bert", "WAV2VEC2_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("wav2vec2-conformer", "WAV2VEC2_CONFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("whisper", "WHISPER_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("xclip", "XCLIP_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -718,6 +720,7 @@ ("vits", "VITS"), ("vivit", "ViViT"), ("wav2vec2", "Wav2Vec2"), + ("wav2vec2-bert", "Wav2Vec2-BERT"), ("wav2vec2-conformer", "Wav2Vec2-Conformer"), ("wav2vec2_phoneme", "Wav2Vec2Phoneme"), ("wavlm", "WavLM"), diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py index 457217566e7cfa..b3461e8b56a7a9 100644 --- a/src/transformers/models/auto/feature_extraction_auto.py +++ b/src/transformers/models/auto/feature_extraction_auto.py @@ -100,6 +100,7 @@ ("vit_mae", "ViTFeatureExtractor"), ("vit_msn", "ViTFeatureExtractor"), ("wav2vec2", "Wav2Vec2FeatureExtractor"), + ("wav2vec2-bert", "Wav2Vec2FeatureExtractor"), ("wav2vec2-conformer", "Wav2Vec2FeatureExtractor"), ("wavlm", "Wav2Vec2FeatureExtractor"), ("whisper", "WhisperFeatureExtractor"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 25e569920125e6..9332497732a4ce 100755 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -232,6 +232,7 @@ ("vits", "VitsModel"), ("vivit", "VivitModel"), ("wav2vec2", "Wav2Vec2Model"), + ("wav2vec2-bert", "Wav2Vec2BertModel"), ("wav2vec2-conformer", "Wav2Vec2ConformerModel"), ("wavlm", "WavLMModel"), ("whisper", "WhisperModel"), @@ -1034,6 +1035,7 @@ ("unispeech", "UniSpeechForSequenceClassification"), ("unispeech-sat", "UniSpeechSatForSequenceClassification"), ("wav2vec2", "Wav2Vec2ForSequenceClassification"), + ("wav2vec2-bert", "Wav2Vec2BertForSequenceClassification"), ("wav2vec2-conformer", "Wav2Vec2ConformerForSequenceClassification"), ("wavlm", "WavLMForSequenceClassification"), ("whisper", "WhisperForAudioClassification"), @@ -1051,6 +1053,7 @@ ("unispeech", "UniSpeechForCTC"), ("unispeech-sat", "UniSpeechSatForCTC"), ("wav2vec2", "Wav2Vec2ForCTC"), + ("wav2vec2-bert", "Wav2Vec2BertForCTC"), ("wav2vec2-conformer", "Wav2Vec2ConformerForCTC"), ("wavlm", "WavLMForCTC"), ] @@ -1062,6 +1065,7 @@ ("data2vec-audio", "Data2VecAudioForAudioFrameClassification"), ("unispeech-sat", "UniSpeechSatForAudioFrameClassification"), ("wav2vec2", "Wav2Vec2ForAudioFrameClassification"), + ("wav2vec2-bert", "Wav2Vec2BertForAudioFrameClassification"), ("wav2vec2-conformer", "Wav2Vec2ConformerForAudioFrameClassification"), ("wavlm", "WavLMForAudioFrameClassification"), ] @@ -1073,6 +1077,7 @@ ("data2vec-audio", "Data2VecAudioForXVector"), ("unispeech-sat", "UniSpeechSatForXVector"), ("wav2vec2", "Wav2Vec2ForXVector"), + ("wav2vec2-bert", "Wav2Vec2BertForXVector"), ("wav2vec2-conformer", "Wav2Vec2ConformerForXVector"), ("wavlm", "WavLMForXVector"), ] diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py index 208cd53ac25667..2a8823fea7c0ee 100644 --- a/src/transformers/models/auto/processing_auto.py +++ b/src/transformers/models/auto/processing_auto.py @@ -91,6 +91,7 @@ ("vipllava", "LlavaProcessor"), ("vision-text-dual-encoder", "VisionTextDualEncoderProcessor"), ("wav2vec2", "Wav2Vec2Processor"), + ("wav2vec2-bert", "Wav2Vec2Processor"), ("wav2vec2-conformer", "Wav2Vec2Processor"), ("wavlm", "Wav2Vec2Processor"), ("whisper", "WhisperProcessor"), diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index 27998e6c0f05f0..7390409d5144b7 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -418,6 +418,7 @@ ("visual_bert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), ("vits", ("VitsTokenizer", None)), ("wav2vec2", ("Wav2Vec2CTCTokenizer", None)), + ("wav2vec2-bert", ("Wav2Vec2CTCTokenizer", None)), ("wav2vec2-conformer", ("Wav2Vec2CTCTokenizer", None)), ("wav2vec2_phoneme", ("Wav2Vec2PhonemeCTCTokenizer", None)), ("whisper", ("WhisperTokenizer", "WhisperTokenizerFast" if is_tokenizers_available() else None)), diff --git a/src/transformers/models/wav2vec2_bert/__init__.py b/src/transformers/models/wav2vec2_bert/__init__.py new file mode 100644 index 00000000000000..594f108bcaad96 --- /dev/null +++ b/src/transformers/models/wav2vec2_bert/__init__.py @@ -0,0 +1,70 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available + + +_import_structure = { + "configuration_wav2vec2_bert": [ + "WAV2VEC2_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP", + "Wav2Vec2BertConfig", + ], + "processing_wav2vec2_bert": ["Wav2Vec2BertProcessor"], +} + + +try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_wav2vec2_bert"] = [ + "WAV2VEC2_BERT_PRETRAINED_MODEL_ARCHIVE_LIST", + "Wav2Vec2BertForAudioFrameClassification", + "Wav2Vec2BertForCTC", + "Wav2Vec2BertForSequenceClassification", + "Wav2Vec2BertForXVector", + "Wav2Vec2BertModel", + "Wav2Vec2BertPreTrainedModel", + ] + +if TYPE_CHECKING: + from .configuration_wav2vec2_bert import ( + WAV2VEC2_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, + Wav2Vec2BertConfig, + ) + from .processing_wav2vec2_bert import Wav2Vec2BertProcessor + + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_wav2vec2_bert import ( + WAV2VEC2_BERT_PRETRAINED_MODEL_ARCHIVE_LIST, + Wav2Vec2BertForAudioFrameClassification, + Wav2Vec2BertForCTC, + Wav2Vec2BertForSequenceClassification, + Wav2Vec2BertForXVector, + Wav2Vec2BertModel, + Wav2Vec2BertPreTrainedModel, + ) + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/src/transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py b/src/transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py new file mode 100644 index 00000000000000..12593107ef939d --- /dev/null +++ b/src/transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py @@ -0,0 +1,314 @@ +# coding=utf-8 +# Copyright 2024 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Wav2Vec2Bert model configuration""" + +import functools +import operator + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + +WAV2VEC2_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "facebook/w2v-bert-2.0": "https://huggingface.co/facebook/w2v-bert-2.0/resolve/main/config.json", +} + + +class Wav2Vec2BertConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Wav2Vec2BertModel`]. It is used to + instantiate an Wav2Vec2Bert model according to the specified arguments, defining the model architecture. + Instantiating a configuration with the defaults will yield a similar configuration to that of the Wav2Vec2Bert + [facebook/wav2vec2-bert-rel-pos-large](https://huggingface.co/facebook/wav2vec2-bert-rel-pos-large) + architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*): + Vocabulary size of the Wav2Vec2Bert model. Defines the number of different tokens that can be + represented by the `inputs_ids` passed when calling [`Wav2Vec2BertModel`]. Vocabulary size of the + model. Defines the different tokens that can be represented by the *inputs_ids* passed to the forward + method of [`Wav2Vec2BertModel`]. + hidden_size (`int`, *optional*, defaults to 1024): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 24): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 16): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 4096): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + feature_projection_input_dim (`int`, *optional*, defaults to 160): + Input dimension of this model, i.e the dimension after processing input audios with [`SeamlessM4TFeatureExtractor`] or [`Wav2Vec2BertProcessor`]. + hidden_act (`str` or `function`, *optional*, defaults to `"swish"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"`, `"swish"` and `"gelu_new"` are supported. + hidden_dropout (`float`, *optional*, defaults to 0.0): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + activation_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for activations inside the fully connected layer. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + feat_proj_dropout (`float`, *optional*, defaults to 0.0): + The dropout probabilitiy for the feature projection. + final_dropout (`float`, *optional*, defaults to 0.1): + The dropout probability for the final projection layer of [`Wav2Vec2BertForCTC`]. + layerdrop (`float`, *optional*, defaults to 0.1): + The LayerDrop probability. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556) for more + details. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the layer normalization layers. + apply_spec_augment (`bool`, *optional*, defaults to `True`): + Whether to apply *SpecAugment* data augmentation to the outputs of the feature encoder. For reference see + [SpecAugment: A Simple Data Augmentation Method for Automatic Speech + Recognition](https://arxiv.org/abs/1904.08779). + mask_time_prob (`float`, *optional*, defaults to 0.05): + Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking + procecure generates `mask_time_prob*len(time_axis)/mask_time_length ``independent masks over the axis. If + reasoning from the propability of each feature vector to be chosen as the start of the vector span to be + masked, *mask_time_prob* should be `prob_vector_start*mask_time_length`. Note that overlap may decrease the + actual percentage of masked vectors. This is only relevant if `apply_spec_augment is True`. + mask_time_length (`int`, *optional*, defaults to 10): + Length of vector span along the time axis. + mask_time_min_masks (`int`, *optional*, defaults to 2): + The minimum number of masks of length `mask_feature_length` generated along the time axis, each time step, + irrespectively of `mask_feature_prob`. Only relevant if `mask_time_prob*len(time_axis)/mask_time_length < + mask_time_min_masks`. + mask_feature_prob (`float`, *optional*, defaults to 0.0): + Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The + masking procecure generates `mask_feature_prob*len(feature_axis)/mask_time_length` independent masks over + the axis. If reasoning from the propability of each feature vector to be chosen as the start of the vector + span to be masked, *mask_feature_prob* should be `prob_vector_start*mask_feature_length`. Note that overlap + may decrease the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is + True`. + mask_feature_length (`int`, *optional*, defaults to 10): + Length of vector span along the feature axis. + mask_feature_min_masks (`int`, *optional*, defaults to 0): + The minimum number of masks of length `mask_feature_length` generated along the feature axis, each time + step, irrespectively of `mask_feature_prob`. Only relevant if + `mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks`. + ctc_loss_reduction (`str`, *optional*, defaults to `"sum"`): + Specifies the reduction to apply to the output of `torch.nn.CTCLoss`. Only relevant when training an + instance of [`Wav2Vec2BertForCTC`]. + ctc_zero_infinity (`bool`, *optional*, defaults to `False`): + Whether to zero infinite losses and the associated gradients of `torch.nn.CTCLoss`. Infinite losses mainly + occur when the inputs are too short to be aligned to the targets. Only relevant when training an instance + of [`Wav2Vec2BertForCTC`]. + use_weighted_layer_sum (`bool`, *optional*, defaults to `False`): + Whether to use a weighted average of layer outputs with learned weights. Only relevant when using an + instance of [`Wav2Vec2BertForSequenceClassification`]. + classifier_proj_size (`int`, *optional*, defaults to 768): + Dimensionality of the projection before token mean-pooling for classification. + tdnn_dim (`Tuple[int]` or `List[int]`, *optional*, defaults to `(512, 512, 512, 512, 1500)`): + A tuple of integers defining the number of output channels of each 1D convolutional layer in the *TDNN* + module of the *XVector* model. The length of *tdnn_dim* defines the number of *TDNN* layers. + tdnn_kernel (`Tuple[int]` or `List[int]`, *optional*, defaults to `(5, 3, 3, 1, 1)`): + A tuple of integers defining the kernel size of each 1D convolutional layer in the *TDNN* module of the + *XVector* model. The length of *tdnn_kernel* has to match the length of *tdnn_dim*. + tdnn_dilation (`Tuple[int]` or `List[int]`, *optional*, defaults to `(1, 2, 3, 1, 1)`): + A tuple of integers defining the dilation factor of each 1D convolutional layer in *TDNN* module of the + *XVector* model. The length of *tdnn_dilation* has to match the length of *tdnn_dim*. + xvector_output_dim (`int`, *optional*, defaults to 512): + Dimensionality of the *XVector* embedding vectors. + pad_token_id (`int`, *optional*, defaults to 0): The id of the _beginning-of-stream_ token. + bos_token_id (`int`, *optional*, defaults to 1): The id of the _padding_ token. + eos_token_id (`int`, *optional*, defaults to 2): The id of the _end-of-stream_ token. + add_adapter (`bool`, *optional*, defaults to `False`): + Whether a convolutional attention network should be stacked on top of the Wav2Vec2Bert Encoder. Can be very + useful for warm-starting Wav2Vec2Bert for SpeechEncoderDecoder models. + adapter_kernel_size (`int`, *optional*, defaults to 3): + Kernel size of the convolutional layers in the adapter network. Only relevant if `add_adapter is True`. + adapter_stride (`int`, *optional*, defaults to 2): + Stride of the convolutional layers in the adapter network. Only relevant if `add_adapter is True`. + num_adapter_layers (`int`, *optional*, defaults to 1): + Number of convolutional layers that should be used in the adapter network. Only relevant if `add_adapter is + True`. + adapter_act (`str` or `function`, *optional*, defaults to `"relu"`): + The non-linear activation function (function or string) in the adapter layers. If string, `"gelu"`, + `"relu"`, `"selu"`, `"swish"` and `"gelu_new"` are supported. + use_intermediate_ffn_before_adapter (`bool`, *optional*, defaults to `False`): + Whether an intermediate feed-forward block should be stacked on top of the Wav2Vec2Bert Encoder and before the adapter network. + Only relevant if `add_adapter is True`. + output_hidden_size (`int`, *optional*): + Dimensionality of the encoder output layer. If not defined, this defaults to *hidden-size*. Only relevant + if `add_adapter is True`. + position_embeddings_type (`str`, *optional*, defaults to `"relative_key"`): + Can be specified to : + - `rotary`, for rotary position embeddings. + - `relative`, for relative position embeddings. + - `relative_key`, for relative position embeddings as defined by Shaw in [Self-Attention + with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). + If left to `None`, no relative position embeddings is applied. + rotary_embedding_base (`int`, *optional*, defaults to 10000): + If `"rotary"` position embeddings are used, defines the size of the embedding base. + max_source_positions (`int`, *optional*, defaults to 5000): + if `"relative"` position embeddings are used, defines the maximum source input positions. + left_max_position_embeddings (`int`, *optional*, defaults to 64): + If `"relative_key"` (aka Shaw) position embeddings are used, defines the left clipping value for relative positions. + right_max_position_embeddings (`int`, *optional*, defaults to 8): + If `"relative_key"` (aka Shaw) position embeddings are used, defines the right clipping value for relative positions. + conv_depthwise_kernel_size (`int`, *optional*, defaults to 31): + Kernel size of convolutional depthwise 1D layer in Conformer blocks. + conformer_conv_dropout (`float`, *optional*, defaults to 0.1): + The dropout probability for all convolutional layers in Conformer blocks. + Example: + + ```python + >>> from transformers import Wav2Vec2BertConfig, Wav2Vec2BertModel + + >>> # Initializing a Wav2Vec2Bert facebook/wav2vec2-bert-rel-pos-large style configuration + >>> configuration = Wav2Vec2BertConfig() + + >>> # Initializing a model (with random weights) from the facebook/wav2vec2-bert-rel-pos-large style configuration + >>> model = Wav2Vec2BertModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "wav2vec2-bert" + + def __init__( + self, + vocab_size=None, + hidden_size=1024, + num_hidden_layers=24, + num_attention_heads=16, + intermediate_size=4096, + feature_projection_input_dim=160, + hidden_act="swish", + hidden_dropout=0.0, + activation_dropout=0.0, + attention_dropout=0.0, + feat_proj_dropout=0.0, + final_dropout=0.1, + layerdrop=0.1, + initializer_range=0.02, + layer_norm_eps=1e-5, + apply_spec_augment=True, + mask_time_prob=0.05, + mask_time_length=10, + mask_time_min_masks=2, + mask_feature_prob=0.0, + mask_feature_length=10, + mask_feature_min_masks=0, + ctc_loss_reduction="sum", + ctc_zero_infinity=False, + use_weighted_layer_sum=False, + classifier_proj_size=768, + tdnn_dim=(512, 512, 512, 512, 1500), + tdnn_kernel=(5, 3, 3, 1, 1), + tdnn_dilation=(1, 2, 3, 1, 1), + xvector_output_dim=512, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + add_adapter=False, + adapter_kernel_size=3, + adapter_stride=2, + num_adapter_layers=1, + adapter_act="relu", + use_intermediate_ffn_before_adapter=False, + output_hidden_size=None, + position_embeddings_type="relative_key", + rotary_embedding_base=10000, + max_source_positions=5000, + left_max_position_embeddings=64, + right_max_position_embeddings=8, + conv_depthwise_kernel_size=31, + conformer_conv_dropout=0.1, + **kwargs, + ): + super().__init__(**kwargs, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id) + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.num_attention_heads = num_attention_heads + self.feature_projection_input_dim = feature_projection_input_dim + self.hidden_dropout = hidden_dropout + self.attention_dropout = attention_dropout + self.activation_dropout = activation_dropout + self.feat_proj_dropout = feat_proj_dropout + self.final_dropout = final_dropout + self.layerdrop = layerdrop + self.layer_norm_eps = layer_norm_eps + self.initializer_range = initializer_range + self.vocab_size = vocab_size + self.use_weighted_layer_sum = use_weighted_layer_sum + self.max_source_positions = max_source_positions + + if position_embeddings_type is not None and position_embeddings_type not in [ + "rotary", + "relative", + "relative_key", + ]: + raise ValueError( + """ + `position_embeddings_type` is not valid. It must be one of the following values: + `["rotary", "relative", "relative_key"]` or left as `None`. + """ + ) + self.position_embeddings_type = position_embeddings_type + self.rotary_embedding_base = rotary_embedding_base + self.left_max_position_embeddings = left_max_position_embeddings + self.right_max_position_embeddings = right_max_position_embeddings + + # Conformer-block related + self.conv_depthwise_kernel_size = conv_depthwise_kernel_size + self.conformer_conv_dropout = conformer_conv_dropout + + # fine-tuning config parameters for SpecAugment: https://arxiv.org/abs/1904.08779 + self.apply_spec_augment = apply_spec_augment + self.mask_time_prob = mask_time_prob + self.mask_time_length = mask_time_length + self.mask_time_min_masks = mask_time_min_masks + self.mask_feature_prob = mask_feature_prob + self.mask_feature_length = mask_feature_length + self.mask_feature_min_masks = mask_feature_min_masks + + # ctc loss + self.ctc_loss_reduction = ctc_loss_reduction + self.ctc_zero_infinity = ctc_zero_infinity + + # adapter + self.add_adapter = add_adapter + self.adapter_kernel_size = adapter_kernel_size + self.adapter_stride = adapter_stride + self.num_adapter_layers = num_adapter_layers + self.adapter_act = adapter_act + self.output_hidden_size = output_hidden_size if output_hidden_size is not None else hidden_size + if use_intermediate_ffn_before_adapter and not add_adapter: + raise ValueError("`use_intermediate_ffn_before_adapter` is `True` but `add_adapter` is `False`.") + self.use_intermediate_ffn_before_adapter = use_intermediate_ffn_before_adapter + + # SequenceClassification-specific parameter. Feel free to ignore for other classes. + self.classifier_proj_size = classifier_proj_size + + # XVector-specific parameters. Feel free to ignore for other classes. + self.tdnn_dim = list(tdnn_dim) + self.tdnn_kernel = list(tdnn_kernel) + self.tdnn_dilation = list(tdnn_dilation) + self.xvector_output_dim = xvector_output_dim + + @property + def inputs_to_logits_ratio(self): + return functools.reduce(operator.mul, self.conv_stride, 1) diff --git a/src/transformers/models/wav2vec2_bert/convert_wav2vec2_seamless_checkpoint.py b/src/transformers/models/wav2vec2_bert/convert_wav2vec2_seamless_checkpoint.py new file mode 100644 index 00000000000000..8b77cd71f7f7e0 --- /dev/null +++ b/src/transformers/models/wav2vec2_bert/convert_wav2vec2_seamless_checkpoint.py @@ -0,0 +1,218 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert Wav2Vec2Bert BERT checkpoint.""" + + +import argparse + +import torch +import torchaudio +from fairseq2.data import Collater +from fairseq2.data.audio import WaveformToFbankConverter +from fairseq2.nn.padding import get_seqs_and_padding_mask +from seamless_communication.models.conformer_shaw import load_conformer_shaw_model + +from transformers import ( + SeamlessM4TFeatureExtractor, + Wav2Vec2BertConfig, + Wav2Vec2BertModel, + logging, +) + + +logging.set_verbosity_info() +logger = logging.get_logger(__name__) + + +wav2vec_convert_list = [ + ("encoder_frontend.model_dim_proj", "feature_projection.projection"), + ("encoder_frontend.post_extract_layer_norm", "feature_projection.layer_norm"), + ("encoder_frontend.pos_encoder.conv", "encoder.pos_conv_embed.conv"), + ("encoder.inner.layers", "encoder.layers"), + ("encoder.inner_layer_norm", "encoder.layer_norm"), + ("encoder.adaptor_layers", "adapter.layers"), + ("inner_proj", "intermediate_dense"), + ("self_attn.output_proj", "self_attn.linear_out"), + ("output_proj", "output_dense"), + ("self_attn.k_proj", "self_attn.linear_k"), + ("self_attn.v_proj", "self_attn.linear_v"), + ("self_attn.q_proj", "self_attn.linear_q"), + ("self_attn.sdpa.u_bias", "self_attn.pos_bias_u"), + ("self_attn.sdpa.v_bias", "self_attn.pos_bias_v"), + ("self_attn.sdpa.rel_k_embed", "self_attn.distance_embedding"), + ("self_attn.sdpa.r_proj", "self_attn.linear_pos"), + ("conv.pointwise_conv1", "conv_module.pointwise_conv1"), + ("conv.pointwise_conv2", "conv_module.pointwise_conv2"), + ("conv.depthwise_conv", "conv_module.depthwise_conv"), + ("conv.layer_norm", "conv_module.depthwise_layer_norm"), + ("conv_layer_norm", "conv_module.layer_norm"), + ("encoder.proj1", "intermediate_ffn.intermediate_dense"), + ("encoder.proj2", "intermediate_ffn.output_dense"), + ("encoder.layer_norm", "inner_layer_norm"), + ("masker.temporal_mask_embed", "masked_spec_embed"), +] + +keys_to_remove = { + "quantizer.entry_proj", + "final_proj", + "final_target_proj", + "quantizer.entries", + "quantizer.num_updates", +} + + +def param_count(model): + return sum(p[1].numel() for p in model.named_parameters() if "final_proj" not in p[0]) + + +def _convert_model( + original_model, + hf_model, + convert_list, +): + state_dict = original_model.state_dict() + + for k, v in list(state_dict.items()): + new_key = k + for old_layer_name, new_layer_name in convert_list: + if old_layer_name in new_key: + new_key = new_key.replace(old_layer_name, new_layer_name) + + # must do it by hand + if ".layer_norm" in new_key and new_key.split(".layer_norm")[0][-1].isnumeric(): + new_key = new_key.replace("layer_norm", "final_layer_norm") + + add_key = True + for key in keys_to_remove: + if key in new_key: + state_dict.pop(k) + add_key = False + break + + if add_key: + state_dict[new_key] = state_dict.pop(k) + + extra_keys = set(state_dict.keys()) - set(hf_model.state_dict().keys()) + extra_keys = set({k for k in extra_keys if "num_updates" not in k}) # filter unecessary param + missing_keys = set(hf_model.state_dict().keys()) - set(state_dict.keys()) + if len(extra_keys) != 0: + raise ValueError(f"extra keys found: {extra_keys}") + if len(missing_keys) != 0: + raise ValueError(f"missing keys: {missing_keys}") + hf_model.load_state_dict(state_dict, strict=True) + n_params = param_count(hf_model) + + logger.info(f"model loaded: {round(n_params/1e6,1)}M params") + + hf_model.eval() + del state_dict + + return hf_model + + +@torch.no_grad() +def convert_wav2vec2_bert_checkpoint( + checkpoint_path, + pytorch_dump_folder_path, + config_path=None, + repo_id=None, +): + """ + Copy/paste/tweak model's weights to transformers design. + """ + if config_path is not None: + config = Wav2Vec2BertConfig.from_pretrained(config_path, hidden_act="swish") + else: + config = Wav2Vec2BertConfig(apply_spec_augment=False) + + hf_wav2vec = Wav2Vec2BertModel(config) + + model = load_conformer_shaw_model(checkpoint_path, dtype=torch.float32) + model.eval() + + hf_wav2vec = _convert_model(model, hf_wav2vec, wav2vec_convert_list) + + hf_wav2vec.save_pretrained(pytorch_dump_folder_path) + + if repo_id: + hf_wav2vec.push_to_hub(repo_id, create_pr=True) + + # save feature extractor + fe = SeamlessM4TFeatureExtractor(padding_value=1) + fe._set_processor_class("Wav2Vec2BertProcessor") + fe.save_pretrained(pytorch_dump_folder_path) + + if repo_id: + fe.push_to_hub(repo_id, create_pr=True) + + if args.audio_path: + waveform, sample_rate = torchaudio.load(args.audio_path) + waveform = torchaudio.functional.resample(waveform, sample_rate, fe.sampling_rate) + + fbank_converter = WaveformToFbankConverter( + num_mel_bins=80, + waveform_scale=2**15, + channel_last=True, + standardize=True, + dtype=torch.float32, + ) + collater = Collater(pad_value=1) + + decoded_audio = {"waveform": waveform.T, "sample_rate": fe.sampling_rate, "format": -1} + src = collater(fbank_converter(decoded_audio))["fbank"] + seqs, padding_mask = get_seqs_and_padding_mask(src) + + with torch.inference_mode(): + seqs, padding_mask = model.encoder_frontend(seqs, padding_mask) + original_output, padding_mask = model.encoder(seqs, padding_mask) + + hf_wav2vec.eval() + + inputs = fe(waveform, return_tensors="pt", padding=True) + with torch.no_grad(): + outputs = hf_wav2vec(**inputs) + + torch.testing.assert_close(original_output, outputs.last_hidden_state, atol=5e-3, rtol=5e-3) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--pytorch_dump_folder_path", + default=None, + type=str, + help="Path to the output PyTorch model.", + ) + parser.add_argument( + "--checkpoint_path", default="conformer_shaw", type=str, help="Path to seamless communication checkpoint" + ) + parser.add_argument( + "--config_path", + default=None, + type=str, + help="Path to hf config.json of model to convert", + ) + parser.add_argument("--repo_id", default=None, type=str, help="Push to this repo id if precised.") + parser.add_argument( + "--audio_path", + default=None, + type=str, + help="If specified, check that the original model and the converted model produce the same outputs.", + ) + + args = parser.parse_args() + convert_wav2vec2_bert_checkpoint( + args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path, args.repo_id + ) diff --git a/src/transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py b/src/transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py new file mode 100644 index 00000000000000..034da900ee8ab3 --- /dev/null +++ b/src/transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py @@ -0,0 +1,1667 @@ +# coding=utf-8 +# Copyright 2024 The Seamless Authors and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch Wav2Vec2-BERT model.""" + +import math +from typing import Optional, Tuple, Union + +import numpy as np +import torch +import torch.utils.checkpoint +from torch import nn +from torch.nn import CrossEntropyLoss + +from ...activations import ACT2FN +from ...integrations.deepspeed import is_deepspeed_zero3_enabled +from ...modeling_attn_mask_utils import _prepare_4d_attention_mask +from ...modeling_outputs import ( + BaseModelOutput, + CausalLMOutput, + SequenceClassifierOutput, + TokenClassifierOutput, + Wav2Vec2BaseModelOutput, + XVectorOutput, +) +from ...modeling_utils import PreTrainedModel +from ...utils import ( + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, +) +from .configuration_wav2vec2_bert import Wav2Vec2BertConfig + + +logger = logging.get_logger(__name__) + + +_HIDDEN_STATES_START_POSITION = 2 + +# General docstring +_CONFIG_FOR_DOC = "Wav2Vec2BertConfig" + +# Base docstring +_BASE_CHECKPOINT_FOR_DOC = "facebook/w2v-bert-2.0" +_PRETRAINED_CHECKPOINT_FOR_DOC = "hf-audio/wav2vec2-bert-CV16-en" +_EXPECTED_OUTPUT_SHAPE = [1, 146, 1024] + +# CTC docstring +_CTC_EXPECTED_OUTPUT = "'mr quilter is the apostle of the middle classes and we are glad to welcome his gospel'" +_CTC_EXPECTED_LOSS = 17.04 + + +WAV2VEC2_BERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "facebook/w2v-bert-2.0", + # See all Wav2Vec2-BERT models at https://huggingface.co/models?filter=wav2vec2-bert +] + + +# Copied from transformers.models.seamless_m4t_v2.modeling_seamless_m4t_v2._compute_new_attention_mask +def _compute_new_attention_mask(hidden_states: torch.Tensor, seq_lens: torch.Tensor): + """ + Computes an attention mask of the form `(batch, seq_len)` with an attention for each element in the batch that + stops at the corresponding element in `seq_lens`. + Args: + hidden_states (`torch.FloatTensor` of shape `(batch, seq_len, *)`): + The sequences to mask, where `*` is any number of sequence-specific dimensions including none. + seq_lens (`torch.Tensor` of shape `(batch)`: + Each element represents the length of the sequence at the same index in `hidden_states` + Returns: + `torch.FloatTensor`: The float attention mask of shape `(batch, seq_len)` + """ + batch_size, mask_seq_len = hidden_states.shape[:2] + + indices = torch.arange(mask_seq_len, device=seq_lens.device).expand(batch_size, -1) + + bool_mask = indices >= seq_lens.unsqueeze(1).expand(-1, mask_seq_len) + + mask = hidden_states.new_ones((batch_size, mask_seq_len)) + + mask = mask.masked_fill(bool_mask, 0) + + return mask + + +# Copied from transformers.models.wav2vec2.modeling_wav2vec2._compute_mask_indices +def _compute_mask_indices( + shape: Tuple[int, int], + mask_prob: float, + mask_length: int, + attention_mask: Optional[torch.LongTensor] = None, + min_masks: int = 0, +) -> np.ndarray: + """ + Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for + ASR](https://arxiv.org/abs/1904.08779). Note that this method is not optimized to run on TPU and should be run on + CPU as part of the preprocessing during training. + + Args: + shape: The shape for which to compute masks. This should be of a tuple of size 2 where + the first element is the batch size and the second element is the length of the axis to span. + mask_prob: The percentage of the whole axis (between 0 and 1) which will be masked. The number of + independently generated mask spans of length `mask_length` is computed by + `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the + actual percentage will be smaller. + mask_length: size of the mask + min_masks: minimum number of masked spans + attention_mask: A (right-padded) attention mask which independently shortens the feature axis of + each batch dimension. + """ + batch_size, sequence_length = shape + + if mask_length < 1: + raise ValueError("`mask_length` has to be bigger than 0.") + + if mask_length > sequence_length: + raise ValueError( + f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}" + f" and `sequence_length`: {sequence_length}`" + ) + + # epsilon is used for probabilistic rounding + epsilon = np.random.rand(1).item() + + def compute_num_masked_span(input_length): + """Given input length, compute how many spans should be masked""" + num_masked_span = int(mask_prob * input_length / mask_length + epsilon) + num_masked_span = max(num_masked_span, min_masks) + + # make sure num masked span <= sequence_length + if num_masked_span * mask_length > sequence_length: + num_masked_span = sequence_length // mask_length + + # make sure num_masked span is also <= input_length - (mask_length - 1) + if input_length - (mask_length - 1) < num_masked_span: + num_masked_span = max(input_length - (mask_length - 1), 0) + + return num_masked_span + + # compute number of masked spans in batch + input_lengths = ( + attention_mask.sum(-1).detach().tolist() + if attention_mask is not None + else [sequence_length for _ in range(batch_size)] + ) + + # SpecAugment mask to fill + spec_aug_mask = np.zeros((batch_size, sequence_length), dtype=bool) + spec_aug_mask_idxs = [] + + max_num_masked_span = compute_num_masked_span(sequence_length) + + if max_num_masked_span == 0: + return spec_aug_mask + + for input_length in input_lengths: + # compute num of masked spans for this input + num_masked_span = compute_num_masked_span(input_length) + + # get random indices to mask + spec_aug_mask_idx = np.random.choice( + np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False + ) + + # pick first sampled index that will serve as a dummy index to pad vector + # to ensure same dimension for all batches due to probabilistic rounding + # Picking first sample just pads those vectors twice. + if len(spec_aug_mask_idx) == 0: + # this case can only happen if `input_length` is strictly smaller then + # `sequence_length` in which case the last token has to be a padding + # token which we can use as a dummy mask id + dummy_mask_idx = sequence_length - 1 + else: + dummy_mask_idx = spec_aug_mask_idx[0] + + spec_aug_mask_idx = np.concatenate( + [spec_aug_mask_idx, np.ones(max_num_masked_span - num_masked_span, dtype=np.int32) * dummy_mask_idx] + ) + spec_aug_mask_idxs.append(spec_aug_mask_idx) + + spec_aug_mask_idxs = np.array(spec_aug_mask_idxs) + + # expand masked indices to masked spans + spec_aug_mask_idxs = np.broadcast_to( + spec_aug_mask_idxs[:, :, None], (batch_size, max_num_masked_span, mask_length) + ) + spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length) + + # add offset to the starting indexes so that indexes now create a span + offsets = np.arange(mask_length)[None, None, :] + offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape( + batch_size, max_num_masked_span * mask_length + ) + spec_aug_mask_idxs = spec_aug_mask_idxs + offsets + + # ensure that we cannot have indices larger than sequence_length + if spec_aug_mask_idxs.max() > sequence_length - 1: + spec_aug_mask_idxs[spec_aug_mask_idxs > sequence_length - 1] = sequence_length - 1 + + # scatter indices to mask + np.put_along_axis(spec_aug_mask, spec_aug_mask_idxs, 1, -1) + + return spec_aug_mask + + +# Copied from transformers.models.wav2vec2.modeling_wav2vec2._sample_negative_indices +def _sample_negative_indices( + features_shape: Tuple, num_negatives: int, mask_time_indices: Optional[np.ndarray] = None +): + """ + Sample `num_negatives` vectors from feature vectors. + """ + batch_size, sequence_length = features_shape + + # generate indices of the positive vectors themselves, repeat them `num_negatives` times + sequence_length_range = np.arange(sequence_length) + + # get `num_negatives` random vector indices from the same utterance + sampled_negative_indices = np.zeros(shape=(batch_size, sequence_length, num_negatives), dtype=np.int32) + + mask_time_indices = ( + mask_time_indices.astype(bool) if mask_time_indices is not None else np.ones(features_shape, dtype=bool) + ) + + for batch_idx in range(batch_size): + high = mask_time_indices[batch_idx].sum() - 1 + mapped_masked_indices = sequence_length_range[mask_time_indices[batch_idx]] + + feature_indices = np.broadcast_to(np.arange(high + 1)[:, None], (high + 1, num_negatives)) + sampled_indices = np.random.randint(0, high, size=(high + 1, num_negatives)) + # avoid sampling the same positive vector, but keep the distribution uniform + sampled_indices[sampled_indices >= feature_indices] += 1 + + # remap to actual indices + sampled_negative_indices[batch_idx][mask_time_indices[batch_idx]] = mapped_masked_indices[sampled_indices] + + # correct for batch size + sampled_negative_indices[batch_idx] += batch_idx * sequence_length + + return sampled_negative_indices + + +# Copied from transformers.models.wav2vec2_conformer.modeling_wav2vec2_conformer.Wav2Vec2ConformerRotaryPositionalEmbedding with Wav2Vec2Conformer->Wav2Vec2Bert +class Wav2Vec2BertRotaryPositionalEmbedding(nn.Module): + """Rotary positional embedding + Reference : https://blog.eleuther.ai/rotary-embeddings/ Paper: https://arxiv.org/pdf/2104.09864.pdf + """ + + def __init__(self, config): + super().__init__() + dim = config.hidden_size // config.num_attention_heads + base = config.rotary_embedding_base + + inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim)) + # Ignore copy + self.register_buffer("inv_freq", inv_freq, persistent=False) + self.cached_sequence_length = None + self.cached_rotary_positional_embedding = None + + def forward(self, hidden_states): + sequence_length = hidden_states.shape[1] + + if sequence_length == self.cached_sequence_length and self.cached_rotary_positional_embedding is not None: + return self.cached_rotary_positional_embedding + + self.cached_sequence_length = sequence_length + # Embeddings are computed in the dtype of the inv_freq constant + time_stamps = torch.arange(sequence_length).type_as(self.inv_freq) + freqs = torch.einsum("i,j->ij", time_stamps, self.inv_freq) + embeddings = torch.cat((freqs, freqs), dim=-1) + + cos_embeddings = embeddings.cos()[:, None, None, :] + sin_embeddings = embeddings.sin()[:, None, None, :] + # Computed embeddings are cast to the dtype of the hidden state inputs + self.cached_rotary_positional_embedding = torch.stack([cos_embeddings, sin_embeddings]).type_as(hidden_states) + return self.cached_rotary_positional_embedding + + +# Copied from transformers.models.wav2vec2_conformer.modeling_wav2vec2_conformer.Wav2Vec2ConformerRelPositionalEmbedding with Wav2Vec2Conformer->Wav2Vec2Bert +class Wav2Vec2BertRelPositionalEmbedding(nn.Module): + """Relative positional encoding module.""" + + def __init__(self, config): + super().__init__() + self.max_len = config.max_source_positions + self.d_model = config.hidden_size + self.pe = None + self.extend_pe(torch.tensor(0.0).expand(1, self.max_len)) + + def extend_pe(self, x): + # Reset the positional encodings + if self.pe is not None: + # self.pe contains both positive and negative parts + # the length of self.pe is 2 * input_len - 1 + if self.pe.size(1) >= x.size(1) * 2 - 1: + if self.pe.dtype != x.dtype or self.pe.device != x.device: + self.pe = self.pe.to(dtype=x.dtype, device=x.device) + return + # Suppose `i` is the position of query vector and `j` is the + # position of key vector. We use positive relative positions when keys + # are to the left (i>j) and negative relative positions otherwise (i