coqui-ai · erogol · Dec 14, 2022 · Dec 13, 2022 · Dec 14, 2022 · Dec 14, 2022
diff --git a/TTS/.models.json b/TTS/.models.json
@@ -141,7 +141,7 @@
  "license": "bsd-3-clause",
  "contact": null,
  "commit": null
- }, 
+ },
  "fast_pitch": {
  "description": "FastPitch model trained on LJSpeech using the Aligner Network",
  "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--fast_pitch.zip",
@@ -150,6 +150,15 @@
  "author": "Eren Gölge @erogol",
  "license": "apache 2.0",
  "contact": "egolge@coqui.com"
+ },
+ "overflow": {
+ "description": "Overflow model trained on LJSpeech",
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.0_models/tts_models--en--ljspeech--overflow.zip",
+ "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
+ "commit": "3b1a28f",
+ "author": "Eren Gölge @erogol",
+ "license": "apache 2.0",
+ "contact": "egolge@coqui.ai"
  }
  },
  "vctk": {
@@ -223,7 +232,7 @@
  "author": "@NeonGeckoCom",
  "license": "bsd-3-clause"
  }
- } 
+ }
  },
  "fr": {
  "mai": {

diff --git a/TTS/tts/datasets/__init__.py b/TTS/tts/datasets/__init__.py
@@ -129,7 +129,8 @@ def load_tts_samples(
  meta_data_eval = formatter(root_path, meta_file_val, ignored_speakers=ignored_speakers)
  meta_data_eval = add_extra_keys(meta_data_eval, language, dataset_name)
  else:
- meta_data_eval, meta_data_train = split_dataset(meta_data_train, eval_split_max_size, eval_split_size)
+ eval_size_per_dataset = eval_split_max_size // len(datasets) if eval_split_max_size else None
+ meta_data_eval, meta_data_train = split_dataset(meta_data_train, eval_size_per_dataset, eval_split_size)
  meta_data_eval_all += meta_data_eval
  meta_data_train_all += meta_data_train
  # load attention masks for the duration predictor training

diff --git a/TTS/tts/datasets/dataset.py b/TTS/tts/datasets/dataset.py
@@ -520,6 +520,7 @@ def collate_fn(self, batch):
  "raw_text": batch["raw_text"],
  "pitch": pitch,
  "language_ids": language_ids,
+ "audio_unique_names": batch["audio_unique_name"],
  }
 
  raise TypeError(

diff --git a/TTS/tts/layers/overflow/neural_hmm.py b/TTS/tts/layers/overflow/neural_hmm.py
@@ -311,7 +311,7 @@ def get_absorption_state_scaling_factor(self, mels_len, log_alpha_scaled, inputs
 
  # If the length of the mel is less than the number of states, it will select the -inf values, leading to NaN gradients.
  # Ideally, the dataset should be cleaned of such samples; as a small hack, the clamp below replaces -inf with the smallest finite value.
- # final_log_c = final_log_c.clamp(min=torch.finfo(final_log_c.dtype).min)
+ final_log_c = final_log_c.clamp(min=torch.finfo(final_log_c.dtype).min)
 
  sum_final_log_c = torch.logsumexp(final_log_c, dim=1)
  return sum_final_log_c

diff --git a/TTS/tts/models/base_tts.py b/TTS/tts/models/base_tts.py
@@ -232,6 +232,7 @@ def format_batch(self, batch: Dict) -> Dict:
  "waveform": waveform,
  "pitch": pitch,
  "language_ids": language_ids,
+ "audio_unique_names": batch["audio_unique_names"],
  }
 
  def get_sampler(self, config: Coqpit, dataset: TTSDataset, num_gpus=1):
@@ -388,6 +389,9 @@ def test_run(self, assets: Dict) -> Tuple[Dict, Dict]:
  test_sentences = self.config.test_sentences
  aux_inputs = self._get_test_aux_input()
  for idx, sen in enumerate(test_sentences):
+ if isinstance(sen, list):
+ aux_inputs = self.get_aux_input_from_test_sentences(sen)
+ sen = aux_inputs["text"]
  outputs_dict = synthesis(
  self,
  sen,

diff --git a/TTS/utils/manage.py b/TTS/utils/manage.py
@@ -366,7 +366,8 @@ def _download_zip_file(file_url, output_folder, progress_bar):
  for file_path in z.namelist()[1:]:
  src_path = os.path.join(output_folder, file_path)
  dst_path = os.path.join(output_folder, os.path.basename(file_path))
- copyfile(src_path, dst_path)
+ if src_path != dst_path:
+ copyfile(src_path, dst_path)
  # remove the extracted folder
  rmtree(os.path.join(output_folder, z.namelist()[0]))