Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix common voice #1071

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
141 changes: 96 additions & 45 deletions test/torchaudio_unittest/datasets/commonvoice_test.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import os
import csv
import random
import tarfile
from pathlib import Path

from torchaudio.datasets import commonvoice
Expand All @@ -12,64 +12,115 @@
normalize_wav,
)

_HEADERS = [
"client_ids",
"path",
"sentence",
"up_votes",
"down_votes",
"age",
"gender",
"accent",
]

# Note: extension is changed to wav for the sake of test
# Note: the first content is missing values for `age`, `gender` and `accent` as in the original data.
_TRAIN_CSV_CONTENTS = [
[
"9d16c5d980247861130e0480e2719f448be73d86a496c36d01a477cbdecd8cfd1399403d7a77bf458d211a70711b2da0845c",
"common_voice_en_18885784.wav",
"He was accorded a State funeral, and was buried in Drayton and Toowoomba Cemetery.",
"2",
"0",
"",
"",
""
],
[
"c82eb9291328620f06025a1f8112b909099e447e485e99236cb87df008650250e79fea5ca772061fb6a370830847b9c44d20",
"common_voice_en_556542.wav",
"Once more into the breach",
"2",
"0",
"thirties",
"male",
"us",
],
[
"f74d880c5ad4c5917f314a604d3fc4805159d255796fb9f8defca35333ecc002bdf53dc463503c12674ea840b21b4a507b7c",
"common_voice_en_18607573.wav",
"Caddy, show Miss Clare and Miss Summerson their rooms.",
"2",
"0",
"twenties",
"male",
"canada",
],
]


def _make_dataset(root_dir, sample_rate=48000):
# The path convention commonvoice uses
base_dir = os.path.join(root_dir, "CommonVoice", "cv-corpus-4-2019-12-10", "en")
audio_dir = os.path.join(base_dir, "clips")
tsv_path = os.path.join(base_dir, "train.tsv")

os.makedirs(base_dir, exist_ok=True)
os.makedirs(audio_dir, exist_ok=True)

# Tsv file name difference does not mean different subset, testing as a whole dataset here
print(tsv_path)
with open(tsv_path, "w", newline='') as tsv:
writer = csv.writer(tsv, delimiter='\t')
writer.writerow(_HEADERS)
for content in _TRAIN_CSV_CONTENTS:
writer.writerow(content)

# Generate audio files
expected = []
for i, content in enumerate(_TRAIN_CSV_CONTENTS):
audio_path = os.path.join(audio_dir, content[1])
data = get_whitenoise(
sample_rate=sample_rate, duration=1, n_channels=1, seed=i, dtype='float32')
save_wav(audio_path, data, sample_rate)
print(audio_path)
expected.append((normalize_wav(data), sample_rate, dict(zip(_HEADERS, content))))
return expected


def _make_tarfile(output_filename, source_dir):
with tarfile.open(output_filename, "w:gz") as tar:
tar.add(source_dir, arcname=os.path.basename(source_dir))


class TestCommonVoice(TempDirMixin, TorchaudioTestCase):
backend = 'default'

root_dir = None
data = []
_headers = [u"client_ids", u"path", u"sentence", u"up_votes", u"down_votes", u"age", u"gender", u"accent"]
# Note: extension is changed to wav for the sake of test
# Note: the first content is missing values for `age`, `gender` and `accent` as in the original data.
_train_csv_contents = [
["9d16c5d980247861130e0480e2719f448be73d86a496c36d01a477cbdecd8cfd1399403d7a77bf458d211a70711b2da0845c",
"common_voice_en_18885784.wav",
"He was accorded a State funeral, and was buried in Drayton and Toowoomba Cemetery.", "2", "0", "", "", ""],
["c82eb9291328620f06025a1f8112b909099e447e485e99236cb87df008650250e79fea5ca772061fb6a370830847b9c44d20",
"common_voice_en_556542.wav", "Once more into the breach", "2", "0", "thirties", "male", "us"],
["f74d880c5ad4c5917f314a604d3fc4805159d255796fb9f8defca35333ecc002bdf53dc463503c12674ea840b21b4a507b7c",
"common_voice_en_18607573.wav",
"Caddy, show Miss Clare and Miss Summerson their rooms.", "2", "0", "twenties", "male", "canada"],
]
_folder_audio = "clips"
sample_rate = 48000
root_dir = ""
expected = []

@classmethod
def setUpClass(cls):
cls.root_dir = cls.get_base_temp_dir()
# The path convention commonvoice uses
base_dir = os.path.join(cls.root_dir, commonvoice.FOLDER_IN_ARCHIVE, commonvoice.VERSION, "en")
os.makedirs(base_dir, exist_ok=True)

# Tsv file name difference does not mean different subset, testing as a whole dataset here
tsv_filename = os.path.join(base_dir, commonvoice.TSV)
with open(tsv_filename, "w", newline='') as tsv:
writer = csv.writer(tsv, delimiter='\t')
writer.writerow(cls._headers)
for i, content in enumerate(cls._train_csv_contents):
audio_filename = audio_filename = content[1]
writer.writerow(content)

# Generate and store audio
audio_base_path = os.path.join(base_dir, cls._folder_audio)
os.makedirs(audio_base_path, exist_ok=True)
audio_path = os.path.join(audio_base_path, audio_filename)
data = get_whitenoise(sample_rate=cls.sample_rate, duration=1, n_channels=1, seed=i, dtype='float32')
save_wav(audio_path, data, cls.sample_rate)

# Append data entry
cls.data.append((normalize_wav(data), cls.sample_rate, dict(zip(cls._headers, content))))
root_dir = cls.get_base_temp_dir()
tmp_dir = os.path.join(root_dir, 'tmp')
expected = _make_dataset(tmp_dir)
source_dir = os.path.join(tmp_dir, 'CommonVoice')
arch_path = os.path.join(root_dir, 'en.tar.gz')
_make_tarfile(arch_path, source_dir)

cls.root_dir = root_dir
cls.expected = expected

def _test_commonvoice(self, dataset):
n_ite = 0
for i, (waveform, sample_rate, dictionary) in enumerate(dataset):
expected_dictionary = self.data[i][2]
expected_data = self.data[i][0]
expected_dictionary = self.expected[i][2]
expected_data = self.expected[i][0]
self.assertEqual(expected_data, waveform, atol=5e-5, rtol=1e-8)
assert sample_rate == TestCommonVoice.sample_rate
assert sample_rate == 48000
assert dictionary == expected_dictionary
n_ite += 1
assert n_ite == len(self.data)
assert n_ite == len(self.expected)

def test_commonvoice_str(self):
dataset = commonvoice.COMMONVOICE(self.root_dir)
Expand Down
4 changes: 2 additions & 2 deletions test/torchaudio_unittest/datasets/utils_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,15 +54,15 @@ class TestIterator(TorchaudioTestCase):
path = get_asset_path()

def test_disckcache_iterator(self):
data = COMMONVOICE(self.path, version="cv-corpus-4-2019-12-10", language="tatar")
data = COMMONVOICE(self.path, url="tatar")
data = dataset_utils.diskcache_iterator(data)
# Save
data[0]
# Load
data[0]

def test_bg_iterator(self):
data = COMMONVOICE(self.path, version="cv-corpus-4-2019-12-10", language="tatar")
data = COMMONVOICE(self.path, url="tatar")
data = dataset_utils.bg_iterator(data, 5)
for _ in data:
pass
Loading