From fc8336fffd40c39bdb225c1b041ab4dd15fac4e9 Mon Sep 17 00:00:00 2001
From: ylzz1997 <ylzz1997@outlook.com>
Date: Wed, 2 Aug 2023 01:09:46 +0800
Subject: [PATCH] Updata VITS2 part (Transformer Flow)

---
 configs_template/config_template.json      |  4 +-
 configs_template/config_tiny_template.json |  4 +-
 models.py                                  | 48 ++++++++++++++++++++-
 modules/attentions.py                      | 22 ++++++++--
 modules/modules.py                         | 50 ++++++++++++++++++++++
 5 files changed, 122 insertions(+), 6 deletions(-)

diff --git a/configs_template/config_template.json b/configs_template/config_template.json
index 70a74a68..4b1b3232 100644
--- a/configs_template/config_template.json
+++ b/configs_template/config_template.json
@@ -54,6 +54,7 @@
     "upsample_initial_channel": 512,
     "upsample_kernel_sizes": [16,16, 4, 4, 4],
     "n_layers_q": 3,
+    "n_layers_trans_flow": 3,
     "n_flow_layer": 4,
     "use_spectral_norm": false,
     "gin_channels": 768,
@@ -65,7 +66,8 @@
     "vol_embedding":false,
     "use_depthwise_conv":false,
     "flow_share_parameter": false,
-    "use_automatic_f0_prediction": true
+    "use_automatic_f0_prediction": true,
+    "use_transformer_flow": false
   },
   "spk": {
     "nyaru": 0,
diff --git a/configs_template/config_tiny_template.json b/configs_template/config_tiny_template.json
index 4865ec5b..d0a4381e 100644
--- a/configs_template/config_tiny_template.json
+++ b/configs_template/config_tiny_template.json
@@ -54,6 +54,7 @@
     "upsample_initial_channel": 400,
     "upsample_kernel_sizes": [16,16, 4, 4, 4],
     "n_layers_q": 3,
+    "n_layers_trans_flow": 3,
     "n_flow_layer": 4,
     "use_spectral_norm": false,
     "gin_channels": 768,
@@ -65,7 +66,8 @@
     "vol_embedding":false,
     "use_depthwise_conv":true,
     "flow_share_parameter": true,
-    "use_automatic_f0_prediction": true
+    "use_automatic_f0_prediction": true,
+    "use_transformer_flow": false
   },
   "spk": {
     "nyaru": 0,
diff --git a/models.py b/models.py
index 69749809..24338fa2 100644
--- a/models.py
+++ b/models.py
@@ -51,6 +51,46 @@ def forward(self, x, x_mask, g=None, reverse=False):
                 x = flow(x, x_mask, g=g, reverse=reverse)
         return x
 
+class TransformerCouplingBlock(nn.Module):
+    def __init__(self,
+                 channels,
+                 hidden_channels,
+                 filter_channels,
+                 n_heads,
+                 n_layers,
+                 kernel_size,
+                 p_dropout,
+                 n_flows=4,
+                 gin_channels=0,
+                 share_parameter=False
+                 ):
+            
+        super().__init__()
+        self.channels = channels
+        self.hidden_channels = hidden_channels
+        self.kernel_size = kernel_size
+        self.n_layers = n_layers
+        self.n_flows = n_flows
+        self.gin_channels = gin_channels
+
+        self.flows = nn.ModuleList()
+
+        self.wn = attentions.FFT(hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout, isflow = True, gin_channels = self.gin_channels) if share_parameter else None
+
+        for i in range(n_flows):
+            self.flows.append(
+                modules.TransformerCouplingLayer(channels, hidden_channels, kernel_size, n_layers, n_heads, p_dropout, filter_channels, mean_only=True, wn_sharing_parameter=self.wn, gin_channels = self.gin_channels))
+            self.flows.append(modules.Flip())
+
+    def forward(self, x, x_mask, g=None, reverse=False):
+        if not reverse:
+            for flow in self.flows:
+                x, _ = flow(x, x_mask, g=g, reverse=reverse)
+        else:
+            for flow in reversed(self.flows):
+                x = flow(x, x_mask, g=g, reverse=reverse)
+        return x
+
 
 class Encoder(nn.Module):
     def __init__(self,
@@ -327,6 +367,8 @@ def __init__(self,
                  use_automatic_f0_prediction = True,
                  flow_share_parameter = False,
                  n_flow_layer = 4,
+                 n_layers_trans_flow = 3,
+                 use_transformer_flow = False,
                  **kwargs):
 
         super().__init__()
@@ -351,6 +393,7 @@ def __init__(self,
         self.emb_g = nn.Embedding(n_speakers, gin_channels)
         self.use_depthwise_conv = use_depthwise_conv
         self.use_automatic_f0_prediction = use_automatic_f0_prediction
+        self.n_layers_trans_flow = n_layers_trans_flow
         if vol_embedding:
            self.emb_vol = nn.Linear(1, hidden_channels)
 
@@ -392,7 +435,10 @@ def __init__(self,
             self.dec = Generator(h=hps)
 
         self.enc_q = Encoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels)
-        self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, n_flow_layer, gin_channels=gin_channels, share_parameter= flow_share_parameter)
+        if use_transformer_flow:
+            self.flow = TransformerCouplingBlock(inter_channels, hidden_channels, filter_channels, n_heads, n_layers_trans_flow, 5, p_dropout, n_flow_layer,  gin_channels=gin_channels, share_parameter= flow_share_parameter)
+        else:
+            self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, n_flow_layer, gin_channels=gin_channels, share_parameter= flow_share_parameter)
         if self.use_automatic_f0_prediction:
             self.f0_decoder = F0Decoder(
                 1,
diff --git a/modules/attentions.py b/modules/attentions.py
index 9086e0ed..f9d75bc6 100644
--- a/modules/attentions.py
+++ b/modules/attentions.py
@@ -5,12 +5,13 @@
 from torch.nn import functional as F
 
 import modules.commons as commons
+from modules.DSConv import weight_norm_modules
 from modules.modules import LayerNorm
 
 
 class FFT(nn.Module):
   def __init__(self, hidden_channels, filter_channels, n_heads, n_layers=1, kernel_size=1, p_dropout=0.,
-               proximal_bias=False, proximal_init=True, **kwargs):
+               proximal_bias=False, proximal_init=True, isflow = False, **kwargs):
     super().__init__()
     self.hidden_channels = hidden_channels
     self.filter_channels = filter_channels
@@ -20,7 +21,11 @@ def __init__(self, hidden_channels, filter_channels, n_heads, n_layers=1, kernel
     self.p_dropout = p_dropout
     self.proximal_bias = proximal_bias
     self.proximal_init = proximal_init
-
+    if isflow:
+      cond_layer = torch.nn.Conv1d(kwargs["gin_channels"], 2*hidden_channels*n_layers, 1)
+      self.cond_pre = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, 1)
+      self.cond_layer = weight_norm_modules(cond_layer, name='weight')
+      self.gin_channels = kwargs["gin_channels"]
     self.drop = nn.Dropout(p_dropout)
     self.self_attn_layers = nn.ModuleList()
     self.norm_layers_0 = nn.ModuleList()
@@ -35,14 +40,25 @@ def __init__(self, hidden_channels, filter_channels, n_heads, n_layers=1, kernel
         FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout, causal=True))
       self.norm_layers_1.append(LayerNorm(hidden_channels))
 
-  def forward(self, x, x_mask):
+  def forward(self, x, x_mask, g = None):
     """
     x: decoder input
     h: encoder output
     """
+    if g is not None:
+      g = self.cond_layer(g)
+
     self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(device=x.device, dtype=x.dtype)
     x = x * x_mask
     for i in range(self.n_layers):
+      if g is not None:
+        x = self.cond_pre(x)
+        cond_offset = i * 2 * self.hidden_channels
+        g_l = g[:,cond_offset:cond_offset+2*self.hidden_channels,:]
+        x = commons.fused_add_tanh_sigmoid_multiply(
+          x,
+          g_l,
+          torch.IntTensor([self.hidden_channels]))
       y = self.self_attn_layers[i](x, x, self_attn_mask)
       y = self.drop(y)
       x = self.norm_layers_0[i](x + y)
diff --git a/modules/modules.py b/modules/modules.py
index 2b9ad903..a622d4f2 100644
--- a/modules/modules.py
+++ b/modules/modules.py
@@ -2,6 +2,7 @@
 from torch import nn
 from torch.nn import functional as F
 
+import modules.attentions as attentions
 import modules.commons as commons
 from modules.commons import get_padding, init_weights
 from modules.DSConv import (
@@ -304,3 +305,52 @@ def forward(self, x, x_mask, g=None, reverse=False):
       x1 = (x1 - m) * torch.exp(-logs) * x_mask
       x = torch.cat([x0, x1], 1)
       return x
+
+class TransformerCouplingLayer(nn.Module):
+  def __init__(self,
+      channels,
+      hidden_channels,
+      kernel_size,
+      n_layers,
+      n_heads,
+      p_dropout=0,
+      filter_channels=0,
+      mean_only=False,
+      wn_sharing_parameter=None,
+      gin_channels = 0
+      ):
+    assert channels % 2 == 0, "channels should be divisible by 2"
+    super().__init__()
+    self.channels = channels
+    self.hidden_channels = hidden_channels
+    self.kernel_size = kernel_size
+    self.n_layers = n_layers
+    self.half_channels = channels // 2
+    self.mean_only = mean_only
+
+    self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
+    self.enc = attentions.FFT(hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout, isflow = True, gin_channels = gin_channels) if wn_sharing_parameter is None else wn_sharing_parameter
+    self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
+    self.post.weight.data.zero_()
+    self.post.bias.data.zero_()
+
+  def forward(self, x, x_mask, g=None, reverse=False):
+    x0, x1 = torch.split(x, [self.half_channels]*2, 1)
+    h = self.pre(x0) * x_mask
+    h = self.enc(h, x_mask, g=g)
+    stats = self.post(h) * x_mask
+    if not self.mean_only:
+      m, logs = torch.split(stats, [self.half_channels]*2, 1)
+    else:
+      m = stats
+      logs = torch.zeros_like(m)
+
+    if not reverse:
+      x1 = m + x1 * torch.exp(logs) * x_mask
+      x = torch.cat([x0, x1], 1)
+      logdet = torch.sum(logs, [1,2])
+      return x, logdet
+    else:
+      x1 = (x1 - m) * torch.exp(-logs) * x_mask
+      x = torch.cat([x0, x1], 1)
+      return x