From f6eaa61afe5f1103d953e7dcc3b37e7ccc2f542f Mon Sep 17 00:00:00 2001 From: Fred Date: Sun, 2 Jul 2023 18:55:50 -0300 Subject: [PATCH 1/3] Adding checkpoint model --- TTS/.models.json | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/TTS/.models.json b/TTS/.models.json index c97c6a383a..13da3503bd 100644 --- a/TTS/.models.json +++ b/TTS/.models.json @@ -10,6 +10,15 @@ "license": "CC BY-NC-ND 4.0", "contact": "egolge@coqui.ai" }, + "your_tts": { + "description": "Your TTS model accompanying the paper https://arxiv.org/abs/2306.10097", + "github_rls_url": "https://drive.google.com/u/2/uc?id=1yDCSJ1pFZQTHhL09GMbOrdjcPULApa0p", + "default_vocoder": null, + "commit": null, + "author": "@freds0", + "license": "CC-BY 4.0", + "contact": "fred.santos.oliveira@gmail.com" + }, "bark": { "description": "🐶 Bark TTS model released by suno-ai. You can find the original implementation in https://github.com/suno-ai/bark.", "hf_url": [ @@ -879,4 +888,4 @@ } } } -} \ No newline at end of file +} From bcd500fa7b216ab0021a63ef1221af4607b310b2 Mon Sep 17 00:00:00 2001 From: "Frederico S. Oliveira" Date: Thu, 30 Nov 2023 17:27:05 -0300 Subject: [PATCH 2/3] Fixing bug: correction in training the Fastspeech/Fastspeech2/FastPitch/SpeedySpeech models using external speaker embedding.
--- TTS/tts/models/forward_tts.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/TTS/tts/models/forward_tts.py b/TTS/tts/models/forward_tts.py index 9e1b1c4097..b6e9ac8a14 100644 --- a/TTS/tts/models/forward_tts.py +++ b/TTS/tts/models/forward_tts.py @@ -241,7 +241,7 @@ def __init__( ) self.duration_predictor = DurationPredictor( - self.args.hidden_channels + self.embedded_speaker_dim, + self.args.hidden_channels, self.args.duration_predictor_hidden_channels, self.args.duration_predictor_kernel_size, self.args.duration_predictor_dropout_p, @@ -249,7 +249,7 @@ def __init__( if self.args.use_pitch: self.pitch_predictor = DurationPredictor( - self.args.hidden_channels + self.embedded_speaker_dim, + self.args.hidden_channels, self.args.pitch_predictor_hidden_channels, self.args.pitch_predictor_kernel_size, self.args.pitch_predictor_dropout_p, @@ -263,7 +263,7 @@ def __init__( if self.args.use_energy: self.energy_predictor = DurationPredictor( - self.args.hidden_channels + self.embedded_speaker_dim, + self.args.hidden_channels, self.args.energy_predictor_hidden_channels, self.args.energy_predictor_kernel_size, self.args.energy_predictor_dropout_p, @@ -299,7 +299,8 @@ def init_multispeaker(self, config: Coqpit): if config.use_d_vector_file: self.embedded_speaker_dim = config.d_vector_dim if self.args.d_vector_dim != self.args.hidden_channels: - self.proj_g = nn.Conv1d(self.args.d_vector_dim, self.args.hidden_channels, 1) + #self.proj_g = nn.Conv1d(self.args.d_vector_dim, self.args.hidden_channels, 1) + self.proj_g = nn.Linear(in_features=self.args.d_vector_dim, out_features=self.args.hidden_channels) # init speaker embedding layer if config.use_speaker_embedding and not config.use_d_vector_file: print(" > Init speaker_embedding layer.") @@ -403,10 +404,13 @@ def _forward_encoder( # [B, T, C] x_emb = self.emb(x) # encoder pass - o_en = self.encoder(torch.transpose(x_emb, 1, -1), x_mask) + #o_en = self.encoder(torch.transpose(x_emb, 1, 
-1), x_mask) + o_en = self.encoder(torch.transpose(x_emb, 1, -1), x_mask, g) # speaker conditioning # TODO: try different ways of conditioning - if g is not None: + if g is not None: + if hasattr(self, "proj_g"): + g = self.proj_g(g.view(g.shape[0], -1)).unsqueeze(-1) o_en = o_en + g return o_en, x_mask, g, x_emb From f9117918fe6403c4afa5a442198c47d8e76469e1 Mon Sep 17 00:00:00 2001 From: "Frederico S. Oliveira" Date: Mon, 11 Dec 2023 10:47:31 -0300 Subject: [PATCH 3/3] Update .models.json --- TTS/.models.json | 9 --------- 1 file changed, 9 deletions(-) diff --git a/TTS/.models.json b/TTS/.models.json index c586daebce..1957d78adb 100644 --- a/TTS/.models.json +++ b/TTS/.models.json @@ -40,15 +40,6 @@ "license": "CC BY-NC-ND 4.0", "contact": "egolge@coqui.ai" }, - "your_tts": { - "description": "Your TTS model accompanying the paper https://arxiv.org/abs/2306.10097", - "github_rls_url": "https://drive.google.com/u/2/uc?id=1yDCSJ1pFZQTHhL09GMbOrdjcPULApa0p", - "default_vocoder": null, - "commit": null, - "author": "@freds0", - "license": "CC-BY 4.0", - "contact": "fred.santos.oliveira@gmail.com" - }, "bark": { "description": "🐶 Bark TTS model released by suno-ai. You can find the original implementation in https://github.com/suno-ai/bark.", "hf_url": [