diff --git a/synthesizer/preprocess.py b/synthesizer/preprocess.py
index 6eb11d9d2..aadde9d00 100644
--- a/synthesizer/preprocess.py
+++ b/synthesizer/preprocess.py
@@ -10,12 +10,12 @@ import librosa
 
 
-def preprocess_librispeech(datasets_root: Path, out_dir: Path, n_processes: int,
-                           skip_existing: bool, hparams):
+def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
+                       skip_existing: bool, hparams, no_alignments: bool,
+                       datasets_name: str, subfolders: str):
     # Gather the input directories
-    dataset_root = datasets_root.joinpath("LibriSpeech")
-    input_dirs = [dataset_root.joinpath("train-clean-100"),
-                  dataset_root.joinpath("train-clean-360")]
+    dataset_root = datasets_root.joinpath(datasets_name)
+    input_dirs = [dataset_root.joinpath(subfolder.strip()) for subfolder in subfolders.split(",")]
     print("\n    ".join(map(str, ["Using data from:"] + input_dirs)))
     assert all(input_dir.exists() for input_dir in input_dirs)
 
 
@@ -30,9 +30,9 @@ def preprocess_librispeech(datasets_root: Path, out_dir: Path, n_processes: int,
     # Preprocess the dataset
     speaker_dirs = list(chain.from_iterable(input_dir.glob("*") for input_dir in input_dirs))
     func = partial(preprocess_speaker, out_dir=out_dir, skip_existing=skip_existing,
-                   hparams=hparams)
+                   hparams=hparams, no_alignments=no_alignments)
     job = Pool(n_processes).imap(func, speaker_dirs)
-    for speaker_metadata in tqdm(job, "LibriSpeech", len(speaker_dirs), unit="speakers"):
+    for speaker_metadata in tqdm(job, datasets_name, len(speaker_dirs), unit="speakers"):
         for metadatum in speaker_metadata:
             metadata_file.write("|".join(str(x) for x in metadatum) + "\n")
     metadata_file.close()
@@ -51,32 +51,62 @@ def preprocess_librispeech(datasets_root: Path, out_dir: Path, n_processes: int,
     print("Max audio timesteps length: %d" % max(int(m[3]) for m in metadata))
 
 
-def preprocess_speaker(speaker_dir, out_dir: Path, skip_existing: bool, hparams):
+def preprocess_speaker(speaker_dir, out_dir: Path, skip_existing: bool, hparams, no_alignments: bool):
     metadata = []
     for book_dir in speaker_dir.glob("*"):
-        # Gather the utterance audios and texts
-        try:
-            alignments_fpath = next(book_dir.glob("*.alignment.txt"))
-            with alignments_fpath.open("r") as alignments_file:
-                alignments = [line.rstrip().split(" ") for line in alignments_file]
-        except StopIteration:
-            # A few alignment files will be missing
-            continue
-
-        # Iterate over each entry in the alignments file
-        for wav_fname, words, end_times in alignments:
-            wav_fpath = book_dir.joinpath(wav_fname + ".flac")
-            assert wav_fpath.exists()
-            words = words.replace("\"", "").split(",")
-            end_times = list(map(float, end_times.replace("\"", "").split(",")))
-
-            # Process each sub-utterance
-            wavs, texts = split_on_silences(wav_fpath, words, end_times, hparams)
-            for i, (wav, text) in enumerate(zip(wavs, texts)):
-                sub_basename = "%s_%02d" % (wav_fname, i)
-                metadata.append(process_utterance(wav, text, out_dir, sub_basename,
-                                                  skip_existing, hparams))
-
+        if no_alignments:
+            # Gather the utterance audios and texts
+            # LibriTTS uses .wav but we will include extensions for compatibility with other datasets
+            extensions = ["*.wav", "*.flac", "*.mp3"]
+            for extension in extensions:
+                wav_fpaths = book_dir.glob(extension)
+
+                for wav_fpath in wav_fpaths:
+                    # Load the audio waveform
+                    wav, _ = librosa.load(str(wav_fpath), hparams.sample_rate)
+                    if hparams.rescale:
+                        wav = wav / np.abs(wav).max() * hparams.rescaling_max
+
+                    # Get the corresponding text
+                    # Check for .txt (for compatibility with other datasets)
+                    text_fpath = wav_fpath.with_suffix(".txt")
+                    if not text_fpath.exists():
+                        # Check for .normalized.txt (LibriTTS)
+                        text_fpath = wav_fpath.with_suffix(".normalized.txt")
+                        assert text_fpath.exists()
+                    with text_fpath.open("r") as text_file:
+                        text = "".join([line for line in text_file])
+                        text = text.replace("\"", "")
+                        text = text.strip()
+
+                    # Process the utterance
+                    metadata.append(process_utterance(wav, text, out_dir, str(wav_fpath.with_suffix("").name),
+                                                      skip_existing, hparams))
+        else:
+            # Process alignment file (LibriSpeech support)
+            # Gather the utterance audios and texts
+            try:
+                alignments_fpath = next(book_dir.glob("*.alignment.txt"))
+                with alignments_fpath.open("r") as alignments_file:
+                    alignments = [line.rstrip().split(" ") for line in alignments_file]
+            except StopIteration:
+                # A few alignment files will be missing
+                continue
+
+            # Iterate over each entry in the alignments file
+            for wav_fname, words, end_times in alignments:
+                wav_fpath = book_dir.joinpath(wav_fname + ".flac")
+                assert wav_fpath.exists()
+                words = words.replace("\"", "").split(",")
+                end_times = list(map(float, end_times.replace("\"", "").split(",")))
+
+                # Process each sub-utterance
+                wavs, texts = split_on_silences(wav_fpath, words, end_times, hparams)
+                for i, (wav, text) in enumerate(zip(wavs, texts)):
+                    sub_basename = "%s_%02d" % (wav_fname, i)
+                    metadata.append(process_utterance(wav, text, out_dir, sub_basename,
+                                                      skip_existing, hparams))
+
     return [m for m in metadata if m is not None]
 
 
@@ -222,4 +252,4 @@ def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_proce
     func = partial(embed_utterance, encoder_model_fpath=encoder_model_fpath)
     job = Pool(n_processes).imap(func, fpaths)
     list(tqdm(job, "Embedding", len(fpaths), unit="utterances"))
-    
\ No newline at end of file
+    
diff --git a/synthesizer_preprocess_audio.py b/synthesizer_preprocess_audio.py
index aa7842bcf..fd4d01d47 100644
--- a/synthesizer_preprocess_audio.py
+++ b/synthesizer_preprocess_audio.py
@@ -1,4 +1,4 @@
-from synthesizer.preprocess import preprocess_librispeech
+from synthesizer.preprocess import preprocess_dataset
 from synthesizer.hparams import hparams
 from utils.argutils import print_args
 from pathlib import Path
@@ -26,6 +26,13 @@
         "Hyperparameter overrides as a comma-separated list of name-value pairs")
     parser.add_argument("--no_trim", action="store_true", help=\
         "Preprocess audio without trimming silences (not recommended).")
+    parser.add_argument("--no_alignments", action="store_true", help=\
+        "Use this option when dataset does not include alignments\
+        (these are used to split long audio files into sub-utterances.)")
+    parser.add_argument("--datasets_name", type=str, default="LibriSpeech", help=\
+        "Name of the dataset directory to process.")
+    parser.add_argument("--subfolders", type=str, default="train-clean-100, train-clean-360", help=\
+        "Comma-separated list of subfolders to process inside your dataset directory")
     args = parser.parse_args()
 
     # Process the arguments
@@ -49,4 +56,4 @@
     # Preprocess the dataset
     print_args(args, parser)
     args.hparams = hparams.parse(args.hparams)
-    preprocess_librispeech(**vars(args))
+    preprocess_dataset(**vars(args))
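
For reviewers, a minimal usage sketch (not part of the patch) of how the refactored preprocess_dataset entry point could be called directly for an alignment-free dataset such as LibriTTS. The dataset path, output directory, and process count below are illustrative assumptions, not values taken from the repository:

from pathlib import Path

from synthesizer.hparams import hparams
from synthesizer.preprocess import preprocess_dataset

# Assumed example layout: datasets/LibriTTS/train-clean-100/<speaker>/<book>/*.wav
# with matching *.normalized.txt transcripts, hence no_alignments=True.
out_dir = Path("datasets/SV2TTS/synthesizer")   # example output location
out_dir.mkdir(parents=True, exist_ok=True)      # ensure the output directory exists

preprocess_dataset(datasets_root=Path("datasets"),
                   out_dir=out_dir,
                   n_processes=4,
                   skip_existing=True,
                   hparams=hparams,
                   no_alignments=True,
                   datasets_name="LibriTTS",
                   subfolders="train-clean-100, train-clean-360")

The same configuration is reachable from the command line through the new --no_alignments, --datasets_name, and --subfolders flags added in synthesizer_preprocess_audio.py.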