Add synthesizer preprocessing support for other datasets #441

Merged · 4 commits · Jul 23, 2020
Changes from 2 commits
95 changes: 63 additions & 32 deletions synthesizer/preprocess.py
@@ -10,12 +10,13 @@
 import librosa


-def preprocess_librispeech(datasets_root: Path, out_dir: Path, n_processes: int,
-                           skip_existing: bool, hparams):
+def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
+                       skip_existing: bool, hparams, no_alignments: bool,
+                       datasets_name: str, subfolders: str):
     # Gather the input directories
-    dataset_root = datasets_root.joinpath("LibriSpeech")
-    input_dirs = [dataset_root.joinpath("train-clean-100"),
-                  dataset_root.joinpath("train-clean-360")]
+    dataset_root = datasets_root.joinpath(datasets_name)
+    input_dirs = []
+    input_dirs.extend([dataset_root.joinpath(subfolder.strip()) for subfolder in subfolders.split(",")])
     print("\n    ".join(map(str, ["Using data from:"] + input_dirs)))
     assert all(input_dir.exists() for input_dir in input_dirs)
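
Because each piece of the subfolders string is stripped after the split, the default value "train-clean-100, train-clean-360" (note the space after the comma) still resolves to clean paths. A minimal sketch of that expansion, with a hypothetical datasets root and dataset name:

    from pathlib import Path

    # Hypothetical values mirroring the new arguments
    datasets_root = Path("datasets")
    datasets_name = "LibriTTS"
    subfolders = "train-clean-100, train-clean-360"

    dataset_root = datasets_root.joinpath(datasets_name)
    input_dirs = [dataset_root.joinpath(s.strip()) for s in subfolders.split(",")]
    print(input_dirs)
    # [PosixPath('datasets/LibriTTS/train-clean-100'), PosixPath('datasets/LibriTTS/train-clean-360')]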

@@ -30,9 +31,9 @@ def preprocess_librispeech(datasets_root: Path, out_dir: Path, n_processes: int,
     # Preprocess the dataset
     speaker_dirs = list(chain.from_iterable(input_dir.glob("*") for input_dir in input_dirs))
     func = partial(preprocess_speaker, out_dir=out_dir, skip_existing=skip_existing,
-                   hparams=hparams)
+                   hparams=hparams, no_alignments=no_alignments)
     job = Pool(n_processes).imap(func, speaker_dirs)
-    for speaker_metadata in tqdm(job, "LibriSpeech", len(speaker_dirs), unit="speakers"):
+    for speaker_metadata in tqdm(job, datasets_name, len(speaker_dirs), unit="speakers"):
         for metadatum in speaker_metadata:
             metadata_file.write("|".join(str(x) for x in metadatum) + "\n")
     metadata_file.close()
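
Pool.imap hands each worker a single positional argument (the speaker directory), which is why the new no_alignments flag is bound ahead of time with functools.partial rather than passed per call. A self-contained sketch of the pattern, with a stub worker standing in for the real preprocess_speaker:

    from functools import partial
    from multiprocessing import Pool

    from tqdm import tqdm

    def preprocess_speaker_stub(speaker_dir, hparams=None, no_alignments=False):
        # Stand-in for the real worker: one metadata row per "speaker"
        return [(speaker_dir, no_alignments)]

    if __name__ == "__main__":
        speaker_dirs = ["speaker_00", "speaker_01", "speaker_02"]
        func = partial(preprocess_speaker_stub, hparams=None, no_alignments=True)
        job = Pool(2).imap(func, speaker_dirs)
        for speaker_metadata in tqdm(job, "Demo", len(speaker_dirs), unit="speakers"):
            print(speaker_metadata)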
@@ -51,32 +52,62 @@ def preprocess_librispeech(datasets_root: Path, out_dir: Path, n_processes: int,
     print("Max audio timesteps length: %d" % max(int(m[3]) for m in metadata))


-def preprocess_speaker(speaker_dir, out_dir: Path, skip_existing: bool, hparams):
+def preprocess_speaker(speaker_dir, out_dir: Path, skip_existing: bool, hparams, no_alignments: bool):
     metadata = []
     for book_dir in speaker_dir.glob("*"):
-        # Gather the utterance audios and texts
-        try:
-            alignments_fpath = next(book_dir.glob("*.alignment.txt"))
-            with alignments_fpath.open("r") as alignments_file:
-                alignments = [line.rstrip().split(" ") for line in alignments_file]
-        except StopIteration:
-            # A few alignment files will be missing
-            continue
-
-        # Iterate over each entry in the alignments file
-        for wav_fname, words, end_times in alignments:
-            wav_fpath = book_dir.joinpath(wav_fname + ".flac")
-            assert wav_fpath.exists()
-            words = words.replace("\"", "").split(",")
-            end_times = list(map(float, end_times.replace("\"", "").split(",")))
-
-            # Process each sub-utterance
-            wavs, texts = split_on_silences(wav_fpath, words, end_times, hparams)
-            for i, (wav, text) in enumerate(zip(wavs, texts)):
-                sub_basename = "%s_%02d" % (wav_fname, i)
-                metadata.append(process_utterance(wav, text, out_dir, sub_basename,
-                                                  skip_existing, hparams))
+        if no_alignments:
+            # Gather the utterance audios and texts
+            # LibriTTS uses .wav but we will include extensions for compatibility with other datasets
+            extensions = ["*.wav", "*.flac", "*.mp3"]
+            for extension in extensions:
+                wav_fpaths = book_dir.glob(extension)
+
+                for wav_fpath in wav_fpaths:
+                    # Load the audio waveform
+                    wav, _ = librosa.load(str(wav_fpath), hparams.sample_rate)
+                    if hparams.rescale:
+                        wav = wav / np.abs(wav).max() * hparams.rescaling_max
+
+                    # Get the corresponding text
+                    # Check for .txt (for compatibility with other datasets)
+                    text_fpath = wav_fpath.with_suffix("").with_suffix(".txt")
+                    if not text_fpath.exists():
+                        # Check for .normalized.txt (LibriTTS)
+                        text_fpath = wav_fpath.with_suffix("").with_suffix(".normalized.txt")
+                        assert text_fpath.exists()
+                    with text_fpath.open("r") as text_file:
+                        text = "".join([line for line in text_file])
+                        text = text.replace("\"", "")
+                        text = text.strip()
+
+                    # Process the utterance
+                    metadata.append(process_utterance(wav, text, out_dir, str(wav_fpath.with_suffix("").name),
+                                                      skip_existing, hparams))
+        else:
+            # Process alignment file (LibriSpeech support)
+            # Gather the utterance audios and texts
+            try:
+                alignments_fpath = next(book_dir.glob("*.alignment.txt"))
+                with alignments_fpath.open("r") as alignments_file:
+                    alignments = [line.rstrip().split(" ") for line in alignments_file]
+            except StopIteration:
+                # A few alignment files will be missing
+                continue
+
+            # Iterate over each entry in the alignments file
+            for wav_fname, words, end_times in alignments:
+                wav_fpath = book_dir.joinpath(wav_fname + ".flac")
+                assert wav_fpath.exists()
+                words = words.replace("\"", "").split(",")
+                end_times = list(map(float, end_times.replace("\"", "").split(",")))
+
+                # Process each sub-utterance
+                wavs, texts = split_on_silences(wav_fpath, words, end_times, hparams)
+                for i, (wav, text) in enumerate(zip(wavs, texts)):
+                    sub_basename = "%s_%02d" % (wav_fname, i)
+                    metadata.append(process_utterance(wav, text, out_dir, sub_basename,
+                                                      skip_existing, hparams))

     return [m for m in metadata if m is not None]
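
The chained with_suffix("") calls in the no_alignments branch are easy to misread: the first strips the audio extension, the second appends the transcript extension. A standalone sketch of the fallback lookup, using a hypothetical LibriTTS-style file name:

    from pathlib import Path

    wav_fpath = Path("19_198_000000_000000.wav")  # hypothetical utterance file

    # Strip ".wav", then try the generic transcript name first
    text_fpath = wav_fpath.with_suffix("").with_suffix(".txt")
    if not text_fpath.exists():
        # Fall back to LibriTTS's .normalized.txt convention
        text_fpath = wav_fpath.with_suffix("").with_suffix(".normalized.txt")

    print(text_fpath)  # 19_198_000000_000000.normalized.txt when only that file exists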


@@ -222,4 +253,4 @@ def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_processes
     func = partial(embed_utterance, encoder_model_fpath=encoder_model_fpath)
     job = Pool(n_processes).imap(func, fpaths)
     list(tqdm(job, "Embedding", len(fpaths), unit="utterances"))


11 changes: 9 additions & 2 deletions synthesizer_preprocess_audio.py
@@ -1,4 +1,4 @@
-from synthesizer.preprocess import preprocess_librispeech
+from synthesizer.preprocess import preprocess_dataset
 from synthesizer.hparams import hparams
 from utils.argutils import print_args
 from pathlib import Path
@@ -26,6 +26,13 @@
         "Hyperparameter overrides as a comma-separated list of name-value pairs")
     parser.add_argument("--no_trim", action="store_true", help=\
         "Preprocess audio without trimming silences (not recommended).")
+    parser.add_argument("--no_alignments", action="store_true", help=\
+        "Use this option when the dataset does not include alignments "
+        "(these are used to split long audio files into sub-utterances).")
+    parser.add_argument("--datasets_name", type=str, default="LibriSpeech", help=\
+        "Name of the dataset directory to process.")
+    parser.add_argument("--subfolders", type=str, default="train-clean-100, train-clean-360", help=\
+        "Comma-separated list of subfolders to process inside your dataset directory.")
     args = parser.parse_args()

     # Process the arguments
@@ -49,4 +56,4 @@
     # Preprocess the dataset
     print_args(args, parser)
     args.hparams = hparams.parse(args.hparams)
-    preprocess_librispeech(**vars(args))
+    preprocess_dataset(**vars(args))
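
To see how the new arguments thread through end to end, here is a hedged sketch of calling the renamed entry point directly; the paths and values are hypothetical and mirror what argparse assembles from the command line:

    from pathlib import Path
    from synthesizer.preprocess import preprocess_dataset
    from synthesizer.hparams import hparams

    preprocess_dataset(datasets_root=Path("datasets"),              # hypothetical root
                       out_dir=Path("datasets/SV2TTS/synthesizer"),  # hypothetical output dir
                       n_processes=4,
                       skip_existing=True,
                       hparams=hparams,
                       no_alignments=True,   # LibriTTS ships transcripts, not alignment files
                       datasets_name="LibriTTS",
                       subfolders="train-clean-100")

The equivalent shell invocation would be along the lines of "python synthesizer_preprocess_audio.py datasets --no_alignments --datasets_name LibriTTS --subfolders train-clean-100", assuming "datasets" is the positional datasets_root argument the script already takes.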