Skip to content

Commit

Permalink
no longer download CommonVoice directly (#1018)
Browse files Browse the repository at this point in the history
No longer allow downloading the dataset directly. Deprecate the `download` and `url` parameters; add a `language` parameter.
  • Loading branch information
vincentqb authored Dec 3, 2020
1 parent 9ad450a commit 09a6fca
Show file tree
Hide file tree
Showing 2 changed files with 97 additions and 84 deletions.
4 changes: 2 additions & 2 deletions test/torchaudio_unittest/datasets/utils_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,15 +54,15 @@ class TestIterator(TorchaudioTestCase):
path = get_asset_path()

def test_disckcache_iterator(self):
data = COMMONVOICE(self.path, url="tatar")
data = COMMONVOICE(self.path, version="cv-corpus-4-2019-12-10", language="tatar")
data = dataset_utils.diskcache_iterator(data)
# Save
data[0]
# Load
data[0]

def test_bg_iterator(self):
data = COMMONVOICE(self.path, url="tatar")
data = COMMONVOICE(self.path, version="cv-corpus-4-2019-12-10", language="tatar")
data = dataset_utils.bg_iterator(data, 5)
for _ in data:
pass
177 changes: 95 additions & 82 deletions torchaudio/datasets/commonvoice.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import os
import warnings
from pathlib import Path
from typing import List, Dict, Tuple, Union
from typing import List, Dict, Tuple, Optional, Union

import torchaudio
from torchaudio.datasets.utils import download_url, extract_archive, unicode_csv_reader
from torchaudio.datasets.utils import extract_archive, unicode_csv_reader, validate_file
from torch import Tensor
from torch.utils.data import Dataset

Expand All @@ -16,68 +17,39 @@
# validated.tsv

FOLDER_IN_ARCHIVE = "CommonVoice"
URL = "english"
VERSION = "cv-corpus-4-2019-12-10"
LANGUAGE = "english"
VERSION = "cv-corpus-5.1-2020-06-22"
TSV = "train.tsv"
_CHECKSUMS = {
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/tt.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/en.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/de.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/fr.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/cy.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/br.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/cv.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/tr.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/ky.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/ga-IE.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/kab.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/ca.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/zh-TW.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/sl.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/it.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/nl.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/cnh.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/eo.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/et.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/fa.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/eu.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/es.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/zh-CN.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/mn.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/sah.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/dv.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/rw.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/sv-SE.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/ru.tar.gz":
None
"cv-corpus-5.1-2020-06-22/tt.tar.gz": None,
"cv-corpus-5.1-2020-06-22/en.tar.gz": None,
"cv-corpus-5.1-2020-06-22/de.tar.gz": None,
"cv-corpus-5.1-2020-06-22/fr.tar.gz": None,
"cv-corpus-5.1-2020-06-22/cy.tar.gz": None,
"cv-corpus-5.1-2020-06-22/br.tar.gz": None,
"cv-corpus-5.1-2020-06-22/cv.tar.gz": None,
"cv-corpus-5.1-2020-06-22/tr.tar.gz": None,
"cv-corpus-5.1-2020-06-22/ky.tar.gz": None,
"cv-corpus-5.1-2020-06-22/ga-IE.tar.gz": None,
"cv-corpus-5.1-2020-06-22/kab.tar.gz": None,
"cv-corpus-5.1-2020-06-22/ca.tar.gz": None,
"cv-corpus-5.1-2020-06-22/zh-TW.tar.gz": None,
"cv-corpus-5.1-2020-06-22/sl.tar.gz": None,
"cv-corpus-5.1-2020-06-22/it.tar.gz": None,
"cv-corpus-5.1-2020-06-22/nl.tar.gz": None,
"cv-corpus-5.1-2020-06-22/cnh.tar.gz": None,
"cv-corpus-5.1-2020-06-22/eo.tar.gz": None,
"cv-corpus-5.1-2020-06-22/et.tar.gz": None,
"cv-corpus-5.1-2020-06-22/fa.tar.gz": None,
"cv-corpus-5.1-2020-06-22/eu.tar.gz": None,
"cv-corpus-5.1-2020-06-22/es.tar.gz": None,
"cv-corpus-5.1-2020-06-22/zh-CN.tar.gz": None,
"cv-corpus-5.1-2020-06-22/mn.tar.gz": None,
"cv-corpus-5.1-2020-06-22/sah.tar.gz": None,
"cv-corpus-5.1-2020-06-22/dv.tar.gz": None,
"cv-corpus-5.1-2020-06-22/rw.tar.gz": None,
"cv-corpus-5.1-2020-06-22/sv-SE.tar.gz": None,
"cv-corpus-5.1-2020-06-22/ru.tar.gz": None,
}


Expand All @@ -101,15 +73,18 @@ def load_commonvoice_item(line: List[str],


class COMMONVOICE(Dataset):
"""Create a Dataset for CommonVoice.
"""Create a Dataset for `CommonVoice <https://commonvoice.mozilla.org/>`_.
Args:
root (str or Path): Path to the directory where the dataset is found or downloaded.
tsv (str, optional): The name of the tsv file used to construct the metadata.
(default: ``"train.tsv"``)
url (str, optional): The URL to download the dataset from, or the language of
the dataset to download. (default: ``"english"``).
Allowed language values are ``"tatar"``, ``"english"``, ``"german"``,
url (str, optional): Deprecated.
folder_in_archive (str, optional): The top-level directory of the dataset.
version (str): Version string. (default: ``"cv-corpus-5.1-2020-06-22"``)
language (str, optional): Language of the dataset. (default: ``"english"``)
The following values are mapped to their corresponding shortened version:
``"tatar"``, ``"english"``, ``"german"``,
``"french"``, ``"welsh"``, ``"breton"``, ``"chuvash"``, ``"turkish"``, ``"kyrgyz"``,
``"irish"``, ``"kabyle"``, ``"catalan"``, ``"taiwanese"``, ``"slovenian"``,
``"italian"``, ``"dutch"``, ``"hakha chin"``, ``"esperanto"``, ``"estonian"``,
Expand All @@ -118,11 +93,8 @@ class COMMONVOICE(Dataset):
``"russian"``, ``"indonesian"``, ``"arabic"``, ``"tamil"``, ``"interlingua"``,
``"latvian"``, ``"japanese"``, ``"votic"``, ``"abkhaz"``, ``"cantonese"`` and
``"romansh sursilvan"``.
folder_in_archive (str, optional): The top-level directory of the dataset.
version (str): Version string. (default: ``"cv-corpus-4-2019-12-10"``)
For the other allowed values, please check out https://commonvoice.mozilla.org/en/datasets.
download (bool, optional):
Whether to download the dataset if it is not found at root path. (default: ``False``).
download (bool, optional): Deprecated.
"""

_ext_txt = ".txt"
Expand All @@ -132,10 +104,30 @@ class COMMONVOICE(Dataset):
def __init__(self,
root: Union[str, Path],
tsv: str = TSV,
url: str = URL,
url: Optional[str] = None,
folder_in_archive: str = FOLDER_IN_ARCHIVE,
version: str = VERSION,
download: bool = False) -> None:
language: str = LANGUAGE,
download: Optional[bool] = False) -> None:

if download is True:
raise RuntimeError(
"The dataset is no longer publicly accessible. You need to "
"download the archives externally and place them in the root "
"directory."
)
elif download is False:
warnings.warn(
"The use of the download flag is deprecated, since the dataset "
"is no longer directly accessible.", RuntimeWarning
)

if url is not None:
warnings.warn(
"The use of the url flag is deprecated, since the dataset "
"is no longer publicly accessible. To specify the language of the dataset, "
"please use the language parameter instead.", RuntimeWarning
)

languages = {
"tatar": "tt",
Expand Down Expand Up @@ -180,12 +172,22 @@ def __init__(self,
"romansh sursilvan": "rm-sursilv"
}

if url in languages:
if language in languages:
ext_archive = ".tar.gz"
language = languages[url]

base_url = "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com"
url = os.path.join(base_url, version, language + ext_archive)
language = languages[language]
url = os.path.join(version, language + ext_archive)
else:
raise ValueError(
'Allowed language values are ``"tatar"``, ``"english"``, ``"german"``,'
'``"french"``, ``"welsh"``, ``"breton"``, ``"chuvash"``, ``"turkish"``, ``"kyrgyz"``,'
'``"irish"``, ``"kabyle"``, ``"catalan"``, ``"taiwanese"``, ``"slovenian"``,'
'``"italian"``, ``"dutch"``, ``"hakha chin"``, ``"esperanto"``, ``"estonian"``,'
'``"persian"``, ``"portuguese"``, ``"basque"``, ``"spanish"``, ``"chinese"``,'
'``"mongolian"``, ``"sakha"``, ``"dhivehi"``, ``"kinyarwanda"``, ``"swedish"``,'
'``"russian"``, ``"indonesian"``, ``"arabic"``, ``"tamil"``, ``"interlingua"``,'
'``"latvian"``, ``"japanese"``, ``"votic"``, ``"abkhaz"``, ``"cantonese"`` and'
'``"romansh sursilvan"``.'
)

# Get string representation of 'root' in case Path object is passed
root = os.fspath(root)
Expand All @@ -198,12 +200,23 @@ def __init__(self,

self._path = os.path.join(root, folder_in_archive)

if download:
if not os.path.isdir(self._path):
if not os.path.isfile(archive):
checksum = _CHECKSUMS.get(url, None)
download_url(url, root, hash_value=checksum)
if not os.path.isdir(self._path):
if os.path.isfile(archive):
checksum = _CHECKSUMS.get(url, None)
if checksum:
filepath = os.path.basename(url)
with open(filepath, "rb") as file_obj:
if not validate_file(file_obj, checksum, "sha256"):
raise RuntimeError(
f"The hash of {filepath} does not match. Delete the file manually and retry."
)
extract_archive(archive)
else:
raise RuntimeError(
"The dataset is no longer publicly accessible. You need to "
"download the archives externally and place them in the root "
"directory."
)

self._tsv = os.path.join(root, folder_in_archive, tsv)

Expand Down

0 comments on commit 09a6fca

Please sign in to comment.