Add synthesizer preprocessing support for other datasets #441

Merged · 4 commits · Jul 23, 2020
Changes from 2 commits
95 changes: 63 additions & 32 deletions synthesizer/preprocess.py
@@ -10,12 +10,13 @@
 import librosa


-def preprocess_librispeech(datasets_root: Path, out_dir: Path, n_processes: int,
-                           skip_existing: bool, hparams):
+def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
+                       skip_existing: bool, hparams, no_alignments: bool,
+                       datasets_name: str, subfolders: str):
     # Gather the input directories
-    dataset_root = datasets_root.joinpath("LibriSpeech")
-    input_dirs = [dataset_root.joinpath("train-clean-100"),
-                  dataset_root.joinpath("train-clean-360")]
+    dataset_root = datasets_root.joinpath(datasets_name)
+    input_dirs = []
+    input_dirs.extend([dataset_root.joinpath(subfolder.strip()) for subfolder in subfolders.split(",")])
     print("\n    ".join(map(str, ["Using data from:"] + input_dirs)))
     assert all(input_dir.exists() for input_dir in input_dirs)
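
Because each piece of the subfolders string is stripped after the split, the default value "train-clean-100, train-clean-360" (note the space after the comma) still resolves to clean paths. A minimal sketch of that expansion, with a hypothetical datasets root and dataset name:

    from pathlib import Path

    # Hypothetical values mirroring the new arguments
    datasets_root = Path("datasets")
    datasets_name = "LibriTTS"
    subfolders = "train-clean-100, train-clean-360"

    dataset_root = datasets_root.joinpath(datasets_name)
    input_dirs = [dataset_root.joinpath(s.strip()) for s in subfolders.split(",")]
    print(input_dirs)
    # [PosixPath('datasets/LibriTTS/train-clean-100'), PosixPath('datasets/LibriTTS/train-clean-360')]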

@@ -30,9 +31,9 @@ def preprocess_librispeech(datasets_root: Path, out_dir: Path, n_processes: int,
     # Preprocess the dataset
     speaker_dirs = list(chain.from_iterable(input_dir.glob("*") for input_dir in input_dirs))
     func = partial(preprocess_speaker, out_dir=out_dir, skip_existing=skip_existing,
-                   hparams=hparams)
+                   hparams=hparams, no_alignments=no_alignments)
     job = Pool(n_processes).imap(func, speaker_dirs)
-    for speaker_metadata in tqdm(job, "LibriSpeech", len(speaker_dirs), unit="speakers"):
+    for speaker_metadata in tqdm(job, datasets_name, len(speaker_dirs), unit="speakers"):
         for metadatum in speaker_metadata:
             metadata_file.write("|".join(str(x) for x in metadatum) + "\n")
     metadata_file.close()
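
Pool.imap hands each worker a single positional argument (the speaker directory), which is why the new no_alignments flag is bound ahead of time with functools.partial rather than passed per call. A self-contained sketch of the pattern, with a stub worker standing in for the real preprocess_speaker:

    from functools import partial
    from multiprocessing import Pool

    from tqdm import tqdm

    def preprocess_speaker_stub(speaker_dir, hparams=None, no_alignments=False):
        # Stand-in for the real worker: one metadata row per "speaker"
        return [(speaker_dir, no_alignments)]

    if __name__ == "__main__":
        speaker_dirs = ["speaker_00", "speaker_01", "speaker_02"]
        func = partial(preprocess_speaker_stub, hparams=None, no_alignments=True)
        job = Pool(2).imap(func, speaker_dirs)
        for speaker_metadata in tqdm(job, "Demo", len(speaker_dirs), unit="speakers"):
            print(speaker_metadata)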
@@ -51,32 +52,62 @@ def preprocess_librispeech(datasets_root: Path, out_dir: Path, n_processes: int,
     print("Max audio timesteps length: %d" % max(int(m[3]) for m in metadata))


-def preprocess_speaker(speaker_dir, out_dir: Path, skip_existing: bool, hparams):
+def preprocess_speaker(speaker_dir, out_dir: Path, skip_existing: bool, hparams, no_alignments: bool):
     metadata = []
     for book_dir in speaker_dir.glob("*"):
-        # Gather the utterance audios and texts
-        try:
-            alignments_fpath = next(book_dir.glob("*.alignment.txt"))
-            with alignments_fpath.open("r") as alignments_file:
-                alignments = [line.rstrip().split(" ") for line in alignments_file]
-        except StopIteration:
-            # A few alignment files will be missing
-            continue
-
-        # Iterate over each entry in the alignments file
-        for wav_fname, words, end_times in alignments:
-            wav_fpath = book_dir.joinpath(wav_fname + ".flac")
-            assert wav_fpath.exists()
-            words = words.replace("\"", "").split(",")
-            end_times = list(map(float, end_times.replace("\"", "").split(",")))
-
-            # Process each sub-utterance
-            wavs, texts = split_on_silences(wav_fpath, words, end_times, hparams)
-            for i, (wav, text) in enumerate(zip(wavs, texts)):
-                sub_basename = "%s_%02d" % (wav_fname, i)
-                metadata.append(process_utterance(wav, text, out_dir, sub_basename,
-                                                  skip_existing, hparams))
+        if no_alignments:
+            # Gather the utterance audios and texts
+            # LibriTTS uses .wav but we will include extensions for compatibility with other datasets
+            extensions = ["*.wav", "*.flac", "*.mp3"]
+            for extension in extensions:
+                wav_fpaths = book_dir.glob(extension)
+
+                for wav_fpath in wav_fpaths:
+                    # Load the audio waveform
+                    wav, _ = librosa.load(str(wav_fpath), hparams.sample_rate)
+                    if hparams.rescale:
+                        wav = wav / np.abs(wav).max() * hparams.rescaling_max
+
+                    # Get the corresponding text
+                    # Check for .txt (for compatibility with other datasets)
+                    text_fpath = wav_fpath.with_suffix("").with_suffix(".txt")
+                    if not text_fpath.exists():
+                        # Check for .normalized.txt (LibriTTS)
+                        text_fpath = wav_fpath.with_suffix("").with_suffix(".normalized.txt")
+                        assert text_fpath.exists()
+                    with text_fpath.open("r") as text_file:
+                        text = "".join([line for line in text_file])
+                        text = text.replace("\"", "")
+                        text = text.strip()
+
+                    # Process the utterance
+                    metadata.append(process_utterance(wav, text, out_dir, str(wav_fpath.with_suffix("").name),
+                                                      skip_existing, hparams))
+        else:
+            # Process alignment file (LibriSpeech support)
+            # Gather the utterance audios and texts
+            try:
+                alignments_fpath = next(book_dir.glob("*.alignment.txt"))
+                with alignments_fpath.open("r") as alignments_file:
+                    alignments = [line.rstrip().split(" ") for line in alignments_file]
+            except StopIteration:
+                # A few alignment files will be missing
+                continue
+
+            # Iterate over each entry in the alignments file
+            for wav_fname, words, end_times in alignments:
+                wav_fpath = book_dir.joinpath(wav_fname + ".flac")
+                assert wav_fpath.exists()
+                words = words.replace("\"", "").split(",")
+                end_times = list(map(float, end_times.replace("\"", "").split(",")))
+
+                # Process each sub-utterance
+                wavs, texts = split_on_silences(wav_fpath, words, end_times, hparams)
+                for i, (wav, text) in enumerate(zip(wavs, texts)):
+                    sub_basename = "%s_%02d" % (wav_fname, i)
+                    metadata.append(process_utterance(wav, text, out_dir, sub_basename,
+                                                      skip_existing, hparams))

     return [m for m in metadata if m is not None]
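
The chained with_suffix("") calls in the no_alignments branch are easy to misread: the first strips the audio extension, the second appends the transcript extension. A standalone sketch of the fallback lookup, using a hypothetical LibriTTS-style file name:

    from pathlib import Path

    wav_fpath = Path("19_198_000000_000000.wav")  # hypothetical utterance file

    # Strip ".wav", then try the generic transcript name first
    text_fpath = wav_fpath.with_suffix("").with_suffix(".txt")
    if not text_fpath.exists():
        # Fall back to LibriTTS's .normalized.txt convention
        text_fpath = wav_fpath.with_suffix("").with_suffix(".normalized.txt")

    print(text_fpath)  # 19_198_000000_000000.normalized.txt when only that file exists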


@@ -222,4 +253,4 @@ def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_processes
     func = partial(embed_utterance, encoder_model_fpath=encoder_model_fpath)
     job = Pool(n_processes).imap(func, fpaths)
     list(tqdm(job, "Embedding", len(fpaths), unit="utterances"))


11 changes: 9 additions & 2 deletions synthesizer_preprocess_audio.py
@@ -1,4 +1,4 @@
-from synthesizer.preprocess import preprocess_librispeech
+from synthesizer.preprocess import preprocess_dataset
 from synthesizer.hparams import hparams
 from utils.argutils import print_args
 from pathlib import Path
@@ -26,6 +26,13 @@
         "Hyperparameter overrides as a comma-separated list of name-value pairs")
     parser.add_argument("--no_trim", action="store_true", help=\
         "Preprocess audio without trimming silences (not recommended).")
+    parser.add_argument("--no_alignments", action="store_true", help=\
+        "Use this option when the dataset does not include alignments "
+        "(these are used to split long audio files into sub-utterances).")
+    parser.add_argument("--datasets_name", type=str, default="LibriSpeech", help=\
+        "Name of the dataset directory to process.")
+    parser.add_argument("--subfolders", type=str, default="train-clean-100, train-clean-360", help=\
+        "Comma-separated list of subfolders to process inside your dataset directory.")
     args = parser.parse_args()

     # Process the arguments
@@ -49,4 +56,4 @@
     # Preprocess the dataset
     print_args(args, parser)
     args.hparams = hparams.parse(args.hparams)
-    preprocess_librispeech(**vars(args))
+    preprocess_dataset(**vars(args))
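
To see how the new arguments thread through end to end, here is a hedged sketch of calling the renamed entry point directly; the paths and values are hypothetical and mirror what argparse assembles from the command line:

    from pathlib import Path
    from synthesizer.preprocess import preprocess_dataset
    from synthesizer.hparams import hparams

    preprocess_dataset(datasets_root=Path("datasets"),              # hypothetical root
                       out_dir=Path("datasets/SV2TTS/synthesizer"),  # hypothetical output dir
                       n_processes=4,
                       skip_existing=True,
                       hparams=hparams,
                       no_alignments=True,   # LibriTTS ships transcripts, not alignment files
                       datasets_name="LibriTTS",
                       subfolders="train-clean-100")

The equivalent shell invocation would be along the lines of "python synthesizer_preprocess_audio.py datasets --no_alignments --datasets_name LibriTTS --subfolders train-clean-100", assuming "datasets" is the positional datasets_root argument the script already takes.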