From 09a6fca18a9de33bb8e3c365d171bc82bbc5ee2b Mon Sep 17 00:00:00 2001 From: Vincent QB Date: Thu, 3 Dec 2020 14:02:12 -0500 Subject: [PATCH] no longer download CommonVoice directly (#1018) no longer allow downloading the dataset directly. deprecate: download and url. add language. --- .../datasets/utils_test.py | 4 +- torchaudio/datasets/commonvoice.py | 177 ++++++++++-------- 2 files changed, 97 insertions(+), 84 deletions(-) diff --git a/test/torchaudio_unittest/datasets/utils_test.py b/test/torchaudio_unittest/datasets/utils_test.py index f75ae2319c..a9e13a1e38 100644 --- a/test/torchaudio_unittest/datasets/utils_test.py +++ b/test/torchaudio_unittest/datasets/utils_test.py @@ -54,7 +54,7 @@ class TestIterator(TorchaudioTestCase): path = get_asset_path() def test_disckcache_iterator(self): - data = COMMONVOICE(self.path, url="tatar") + data = COMMONVOICE(self.path, version="cv-corpus-4-2019-12-10", language="tatar") data = dataset_utils.diskcache_iterator(data) # Save data[0] @@ -62,7 +62,7 @@ def test_disckcache_iterator(self): data[0] def test_bg_iterator(self): - data = COMMONVOICE(self.path, url="tatar") + data = COMMONVOICE(self.path, version="cv-corpus-4-2019-12-10", language="tatar") data = dataset_utils.bg_iterator(data, 5) for _ in data: pass diff --git a/torchaudio/datasets/commonvoice.py b/torchaudio/datasets/commonvoice.py index 963777cf38..5899e3a4ac 100644 --- a/torchaudio/datasets/commonvoice.py +++ b/torchaudio/datasets/commonvoice.py @@ -1,9 +1,10 @@ import os +import warnings from pathlib import Path -from typing import List, Dict, Tuple, Union +from typing import List, Dict, Tuple, Optional, Union import torchaudio -from torchaudio.datasets.utils import download_url, extract_archive, unicode_csv_reader +from torchaudio.datasets.utils import extract_archive, unicode_csv_reader, validate_file from torch import Tensor from torch.utils.data import Dataset @@ -16,68 +17,39 @@ # validated.tsv FOLDER_IN_ARCHIVE = "CommonVoice" -URL = "english"
-VERSION = "cv-corpus-4-2019-12-10" +LANGUAGE = "english" +VERSION = "cv-corpus-5.1-2020-06-22" TSV = "train.tsv" _CHECKSUMS = { - "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/tt.tar.gz": - None, - "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/en.tar.gz": - None, - "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/de.tar.gz": - None, - "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/fr.tar.gz": - None, - "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/cy.tar.gz": - None, - "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/br.tar.gz": - None, - "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/cv.tar.gz": - None, - "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/tr.tar.gz": - None, - "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/ky.tar.gz": - None, - "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/ga-IE.tar.gz": - None, - "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/kab.tar.gz": - None, - "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/ca.tar.gz": - None, - "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/zh-TW.tar.gz": - None, - "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/sl.tar.gz": - None, - "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/it.tar.gz": - None, - 
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/nl.tar.gz": - None, - "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/cnh.tar.gz": - None, - "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/eo.tar.gz": - None, - "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/et.tar.gz": - None, - "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/fa.tar.gz": - None, - "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/eu.tar.gz": - None, - "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/es.tar.gz": - None, - "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/zh-CN.tar.gz": - None, - "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/mn.tar.gz": - None, - "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/sah.tar.gz": - None, - "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/dv.tar.gz": - None, - "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/rw.tar.gz": - None, - "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/sv-SE.tar.gz": - None, - "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/ru.tar.gz": - None + "cv-corpus-5.1-2020-06-22/tt.tar.gz": None, + "cv-corpus-5.1-2020-06-22/en.tar.gz": None, + "cv-corpus-5.1-2020-06-22/de.tar.gz": None, + "cv-corpus-5.1-2020-06-22/fr.tar.gz": None, + "cv-corpus-5.1-2020-06-22/cy.tar.gz": None, + "cv-corpus-5.1-2020-06-22/br.tar.gz": None, + 
"cv-corpus-5.1-2020-06-22/cv.tar.gz": None, + "cv-corpus-5.1-2020-06-22/tr.tar.gz": None, + "cv-corpus-5.1-2020-06-22/ky.tar.gz": None, + "cv-corpus-5.1-2020-06-22/ga-IE.tar.gz": None, + "cv-corpus-5.1-2020-06-22/kab.tar.gz": None, + "cv-corpus-5.1-2020-06-22/ca.tar.gz": None, + "cv-corpus-5.1-2020-06-22/zh-TW.tar.gz": None, + "cv-corpus-5.1-2020-06-22/sl.tar.gz": None, + "cv-corpus-5.1-2020-06-22/it.tar.gz": None, + "cv-corpus-5.1-2020-06-22/nl.tar.gz": None, + "cv-corpus-5.1-2020-06-22/cnh.tar.gz": None, + "cv-corpus-5.1-2020-06-22/eo.tar.gz": None, + "cv-corpus-5.1-2020-06-22/et.tar.gz": None, + "cv-corpus-5.1-2020-06-22/fa.tar.gz": None, + "cv-corpus-5.1-2020-06-22/eu.tar.gz": None, + "cv-corpus-5.1-2020-06-22/es.tar.gz": None, + "cv-corpus-5.1-2020-06-22/zh-CN.tar.gz": None, + "cv-corpus-5.1-2020-06-22/mn.tar.gz": None, + "cv-corpus-5.1-2020-06-22/sah.tar.gz": None, + "cv-corpus-5.1-2020-06-22/dv.tar.gz": None, + "cv-corpus-5.1-2020-06-22/rw.tar.gz": None, + "cv-corpus-5.1-2020-06-22/sv-SE.tar.gz": None, + "cv-corpus-5.1-2020-06-22/ru.tar.gz": None, } @@ -101,15 +73,18 @@ def load_commonvoice_item(line: List[str], class COMMONVOICE(Dataset): - """Create a Dataset for CommonVoice. + """Create a Dataset for `CommonVoice `_. Args: root (str or Path): Path to the directory where the dataset is found or downloaded. tsv (str, optional): The name of the tsv file used to construct the metadata. (default: ``"train.tsv"``) - url (str, optional): The URL to download the dataset from, or the language of - the dataset to download. (default: ``"english"``). - Allowed language values are ``"tatar"``, ``"english"``, ``"german"``, + url (str, optional): Deprecated. + folder_in_archive (str, optional): The top-level directory of the dataset. + version (str): Version string. (default: ``"cv-corpus-5.1-2020-06-22"``) + language (str, optional): Language of the dataset. 
(default: ``"english"``) + The following values are mapped to their corresponding shortened version: + ``"tatar"``, ``"english"``, ``"german"``, ``"french"``, ``"welsh"``, ``"breton"``, ``"chuvash"``, ``"turkish"``, ``"kyrgyz"``, ``"irish"``, ``"kabyle"``, ``"catalan"``, ``"taiwanese"``, ``"slovenian"``, ``"italian"``, ``"dutch"``, ``"hakha chin"``, ``"esperanto"``, ``"estonian"``, @@ -118,11 +93,8 @@ class COMMONVOICE(Dataset): ``"russian"``, ``"indonesian"``, ``"arabic"``, ``"tamil"``, ``"interlingua"``, ``"latvian"``, ``"japanese"``, ``"votic"``, ``"abkhaz"``, ``"cantonese"`` and ``"romansh sursilvan"``. - folder_in_archive (str, optional): The top-level directory of the dataset. - version (str): Version string. (default: ``"cv-corpus-4-2019-12-10"``) For the other allowed values, Please checkout https://commonvoice.mozilla.org/en/datasets. - download (bool, optional): - Whether to download the dataset if it is not found at root path. (default: ``False``). + download (bool, optional): Deprecated. """ _ext_txt = ".txt" @@ -132,10 +104,30 @@ class COMMONVOICE(Dataset): def __init__(self, root: Union[str, Path], tsv: str = TSV, - url: str = URL, + url: Optional[str] = None, folder_in_archive: str = FOLDER_IN_ARCHIVE, version: str = VERSION, - download: bool = False) -> None: + language: str = LANGUAGE, + download: Optional[bool] = False) -> None: + + if download is True: + raise RuntimeError( + "The dataset is no longer publicly accessible. You need to " + "download the archives externally and place them in the root " + "directory." + ) + elif download is False: + warnings.warn( + "The use of the download flag is deprecated, since the dataset " + "is no longer directly accessible.", RuntimeWarning + ) + + if url is not None: + warnings.warn( + "The use of the url flag is deprecated, since the dataset " + "is no longer publicly accessible.
To specify the language of the dataset, " + "please use the language parameter instead.", RuntimeWarning + ) languages = { "tatar": "tt", @@ -180,12 +172,22 @@ def __init__(self, "romansh sursilvan": "rm-sursilv" } - if url in languages: + if language in languages: ext_archive = ".tar.gz" - language = languages[url] - - base_url = "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com" - url = os.path.join(base_url, version, language + ext_archive) + language = languages[language] + url = os.path.join(version, language + ext_archive) + else: + raise ValueError( + 'Allowed language values are ``"tatar"``, ``"english"``, ``"german"``,' + '``"french"``, ``"welsh"``, ``"breton"``, ``"chuvash"``, ``"turkish"``, ``"kyrgyz"``,' + '``"irish"``, ``"kabyle"``, ``"catalan"``, ``"taiwanese"``, ``"slovenian"``,' + '``"italian"``, ``"dutch"``, ``"hakha chin"``, ``"esperanto"``, ``"estonian"``,' + '``"persian"``, ``"portuguese"``, ``"basque"``, ``"spanish"``, ``"chinese"``,' + '``"mongolian"``, ``"sakha"``, ``"dhivehi"``, ``"kinyarwanda"``, ``"swedish"``,' + '``"russian"``, ``"indonesian"``, ``"arabic"``, ``"tamil"``, ``"interlingua"``,' + '``"latvian"``, ``"japanese"``, ``"votic"``, ``"abkhaz"``, ``"cantonese"`` and' + '``"romansh sursilvan"``.' + ) # Get string representation of 'root' in case Path object is passed root = os.fspath(root) @@ -198,12 +200,23 @@ def __init__(self, self._path = os.path.join(root, folder_in_archive) - if download: - if not os.path.isdir(self._path): - if not os.path.isfile(archive): - checksum = _CHECKSUMS.get(url, None) - download_url(url, root, hash_value=checksum) + if not os.path.isdir(self._path): + if os.path.isfile(archive): + checksum = _CHECKSUMS.get(url, None) + if checksum: + filepath = os.path.basename(url) + with open(filepath, "rb") as file_obj: + if not validate_file(file_obj, checksum, "sha256"): + raise RuntimeError( + f"The hash of {filepath} does not match. Delete the file manually and retry." 
+ ) extract_archive(archive) + else: + raise RuntimeError( + "The dataset is no longer publicly accessible. You need to " + "download the archives externally and place them in the root " + "directory." + ) self._tsv = os.path.join(root, folder_in_archive, tsv)