From 0b4c246b6e1f3e9889e2f7a3c1025846613d8e4e Mon Sep 17 00:00:00 2001
From: Farzad Abdolhosseini
Date: Tue, 23 Jul 2024 13:54:55 -0700
Subject: [PATCH 1/3] add covost2 dataset for validation & test

---
 ultravox/data/datasets.py | 80 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 80 insertions(+)

diff --git a/ultravox/data/datasets.py b/ultravox/data/datasets.py
index 8e0dc021..1688beda 100644
--- a/ultravox/data/datasets.py
+++ b/ultravox/data/datasets.py
@@ -805,6 +805,85 @@ def _get_sample(self, row) -> VoiceSample:
         return self._get_transcribe_sample(row, tcol="sentence")
 
 
+class CoVoST2Dataset(VoiceDataset):
+    """
+    CoVoST 2 is a large-scale multilingual speech translation corpus covering translations from 21 languages into English
+    and from English into 15 languages. The dataset is created using Mozilla's open-source Common Voice 4 database of
+    crowdsourced voice recordings. There are 2,900 hours of speech represented in the corpus.
+
+    The original Hugging Face dataset link: https://huggingface.co/datasets/facebook/covost2
+    Since this dataset requires audio files to be downloaded separately, a new dataset is created with the audio files:
+    https://huggingface.co/datasets/fixie-ai/covost2
+
+    Due to the scale of the dataset and the audio files being repeated, only a portion of the dataset was converted.
+    See [this issue](https://github.com/fixie-ai/ultravox/issues/50) for more information.
+
+    Supported subsets (En -> X):
+        'en_de', 'en_tr', 'en_fa', 'en_sv-SE', 'en_mn', 'en_zh-CN', 'en_cy',
+        'en_ca', 'en_sl', 'en_et', 'en_id', 'en_ar', 'en_ta', 'en_lv', 'en_ja'
+    Supported subsets (X -> En):
+        'fr_en', 'zh-CN_en', 'es_en'
+    """
+
+    CODE_TO_LANG = {
+        "en": "English",
+        "de": "German",
+        "tr": "Turkish",
+        "fa": "Persian",
+        "sv-SE": "Swedish",
+        "mn": "Mongolian",
+        "zh-CN": "Chinese",
+        "cy": "Welsh",
+        "ca": "Catalan",
+        "sl": "Slovenian",
+        "et": "Estonian",
+        "id": "Indonesian",
+        "ar": "Arabic",
+        "ta": "Tamil",
+        "lv": "Latvian",
+        "ja": "Japanese",
+        "fr": "French",
+        "es": "Spanish",
+    }
+
+    # We currently don't use this dataset for training, so mainly the first prompt is ever used.
+    TRANSLATE_PROMPTS = [
+        "Translate the following into {target} language: <|audio|>",
+        "Translate the following into {target}: <|audio|>",
+        "Please convert the following into {target}.\n<|audio|>",
+        "Could you translate this to {target} language?\n<|audio|>",
+        "Translate the text below to {target}. <|audio|>",
+        "Translate the subsequent text into {target} language. <|audio|>",
+        "Can you translate this into the {target} language?\n<|audio|>",
+        "Transform the following to {target}: <|audio|>",
+    ]
+
+    def __init__(self, args: VoiceDatasetArgs, subset: str) -> None:
+        super().__init__(args)
+        dataset = self._load_audio_dataset(
+            "fixie-ai/covost2", subset, split=args.split.value
+        )
+        langs = subset.split("_")
+        assert len(langs) == 2, f"Invalid subset: {subset}"
+        self.source_lang = self.CODE_TO_LANG[langs[0]]
+        self.target_lang = self.CODE_TO_LANG[langs[1]]
+        self._init_dataset(dataset)
+
+    def _get_sample(self, row) -> VoiceSample:
+        prompt = self._choice(self.TRANSLATE_PROMPTS).format(target=self.target_lang)
+
+        transcript = row["sentence"]
+        translation = row["translation"]
+        if not self._args.include_audio:
+            prompt.replace("<|audio|>", transcript)
+
+        return self._make_sample(
+            _get_messages(prompt, translation),
+            self._get_audio(row),
+            audio_transcript=transcript,
+        )
+
+
 class PeopleSpeechDataset(VoiceDataset):
     """
     The People's Speech Dataset is among the world's largest English speech
@@ -882,6 +961,7 @@ def create_dataset(name: str, args: VoiceDatasetArgs) -> data.IterableDataset:
         "librispeech": LibriSpeechDataset,
         "voxpopuli": VoxPopuliDataset,
         "commonvoice": CommonVoiceDataset,
+        "covost2": CoVoST2Dataset,
         "peoplespeech": PeopleSpeechDataset,
         "soda": SodaDataset,
         "dummy": LibriSpeechDummyDataset,

From 657d58db4dbafe8bfc60460a8e774f59fe7d3acf Mon Sep 17 00:00:00 2001
From: Farzad Abdolhosseini
Date: Tue, 23 Jul 2024 15:19:30 -0700
Subject: [PATCH 2/3] text-only fix

---
 ultravox/data/datasets.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ultravox/data/datasets.py b/ultravox/data/datasets.py
index 1688beda..b7ded5b3 100644
--- a/ultravox/data/datasets.py
+++ b/ultravox/data/datasets.py
@@ -875,7 +875,7 @@ def _get_sample(self, row) -> VoiceSample:
         transcript = row["sentence"]
         translation = row["translation"]
         if not self._args.include_audio:
-            prompt.replace("<|audio|>", transcript)
+            prompt = prompt.replace("<|audio|>", transcript)
 
         return self._make_sample(
             _get_messages(prompt, translation),
             self._get_audio(row),

From 4a1eb00dabcc79a068745f9ed6c17876d52b08aa Mon Sep 17 00:00:00 2001
From: Farzad Abdolhosseini
Date: Wed, 24 Jul 2024 09:22:00 -0700
Subject: [PATCH 3/3] minor prompt changes

---
 ultravox/data/datasets.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ultravox/data/datasets.py b/ultravox/data/datasets.py
index b7ded5b3..d9bf6262 100644
--- a/ultravox/data/datasets.py
+++ b/ultravox/data/datasets.py
@@ -848,11 +848,11 @@ class CoVoST2Dataset(VoiceDataset):
 
     # We currently don't use this dataset for training, so mainly the first prompt is ever used.
     TRANSLATE_PROMPTS = [
-        "Translate the following into {target} language: <|audio|>",
         "Translate the following into {target}: <|audio|>",
+        "Translate the following into {target} language: <|audio|>",
         "Please convert the following into {target}.\n<|audio|>",
         "Could you translate this to {target} language?\n<|audio|>",
-        "Translate the text below to {target}. <|audio|>",
+        "Translate the text below to {target}.\n<|audio|>",
         "Translate the subsequent text into {target} language. <|audio|>",
         "Can you translate this into the {target} language?\n<|audio|>",
         "Transform the following to {target}: <|audio|>",