From 0e3a84289de9d86cc902e53bc96674b059c366f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9E=97=E5=B0=91=E9=9B=84?= Date: Mon, 21 Feb 2022 21:20:45 +0800 Subject: [PATCH 01/34] Add DCCRN separator, modified espnet2/tasks/enh.py, add a configuration file of dccrn for egs2/dns_ins20 --- .../enh1/conf/tuning/train_enh_dccrn.yaml | 41 ++ espnet2/enh/layers/complexnn.py | 480 ++++++++++++++++++ espnet2/enh/separator/dccrn_separator.py | 302 +++++++++++ espnet2/tasks/enh.py | 2 + 4 files changed, 825 insertions(+) create mode 100644 egs2/dns_ins20/enh1/conf/tuning/train_enh_dccrn.yaml create mode 100644 espnet2/enh/layers/complexnn.py create mode 100644 espnet2/enh/separator/dccrn_separator.py diff --git a/egs2/dns_ins20/enh1/conf/tuning/train_enh_dccrn.yaml b/egs2/dns_ins20/enh1/conf/tuning/train_enh_dccrn.yaml new file mode 100644 index 00000000000..26dcbbeadbc --- /dev/null +++ b/egs2/dns_ins20/enh1/conf/tuning/train_enh_dccrn.yaml @@ -0,0 +1,41 @@ +optim: adam +max_epoch: 100 +batch_type: folded +batch_size: 32 +iterator_type: chunk +chunk_length: 64000 +num_workers: 4 +optim_conf: + lr: 1.0e-03 + eps: 1.0e-08 + weight_decay: 1.0e-7 +patience: 10 +val_scheduler_criterion: +- valid +- loss +best_model_criterion: +- - valid + - si_snr + - max +- - valid + - loss + - min +keep_nbest_models: 1 +scheduler: reducelronplateau +scheduler_conf: + mode: min + factor: 0.7 + patience: 1 +model_conf: + loss_type: si_snr +encoder: stft +encoder_conf: + n_fft: 512 + win_length: 400 + hop_length: 100 +decoder: stft +decoder_conf: + n_fft: 512 + win_length: 400 + hop_length: 100 +separator: dccrn diff --git a/espnet2/enh/layers/complexnn.py b/espnet2/enh/layers/complexnn.py new file mode 100644 index 00000000000..dd1941c5a5d --- /dev/null +++ b/espnet2/enh/layers/complexnn.py @@ -0,0 +1,480 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + + +def get_casual_padding1d(): + pass + + +def get_casual_padding2d(): + pass + + +class cPReLU(nn.Module): + def __init__(self, complex_axis=1): + super(cPReLU, self).__init__() + self.r_prelu = nn.PReLU() + self.i_prelu = nn.PReLU() + self.complex_axis = complex_axis + + def forward(self, inputs): + real, imag = torch.chunk(inputs, 2, self.complex_axis) + real = self.r_prelu(real) + imag = self.i_prelu(imag) + return torch.cat([real, imag], self.complex_axis) + + +class NavieComplexLSTM(nn.Module): + def __init__( + self, + input_size, + hidden_size, + projection_dim=None, + bidirectional=False, + batch_first=False, + ): + super(NavieComplexLSTM, self).__init__() + + self.input_dim = input_size // 2 + self.rnn_units = hidden_size // 2 + self.real_lstm = nn.LSTM( + self.input_dim, + self.rnn_units, + num_layers=1, + bidirectional=bidirectional, + batch_first=False, + ) + self.imag_lstm = nn.LSTM( + self.input_dim, + self.rnn_units, + num_layers=1, + bidirectional=bidirectional, + batch_first=False, + ) + if bidirectional: + bidirectional = 2 + else: + bidirectional = 1 + if projection_dim is not None: + self.projection_dim = projection_dim // 2 + self.r_trans = nn.Linear( + self.rnn_units * bidirectional, self.projection_dim + ) + self.i_trans = nn.Linear( + self.rnn_units * bidirectional, self.projection_dim + ) + else: + self.projection_dim = None + + def forward(self, inputs): + if isinstance(inputs, list): + real, imag = inputs + elif isinstance(inputs, torch.Tensor): + real, imag = torch.chunk(inputs, -1) + r2r_out = self.real_lstm(real)[0] + r2i_out = self.imag_lstm(real)[0] + i2r_out = self.real_lstm(imag)[0] + i2i_out = 
self.imag_lstm(imag)[0] + real_out = r2r_out - i2i_out + imag_out = i2r_out + r2i_out + if self.projection_dim is not None: + real_out = self.r_trans(real_out) + imag_out = self.i_trans(imag_out) + # print(real_out.shape,imag_out.shape) + return [real_out, imag_out] + + def flatten_parameters(self): + self.imag_lstm.flatten_parameters() + self.real_lstm.flatten_parameters() + + +def complex_cat(inputs, axis): + + real, imag = [], [] + for idx, data in enumerate(inputs): + r, i = torch.chunk(data, 2, axis) + real.append(r) + imag.append(i) + real = torch.cat(real, axis) + imag = torch.cat(imag, axis) + outputs = torch.cat([real, imag], axis) + return outputs + + +class ComplexConv2d(nn.Module): + def __init__( + self, + in_channels, + out_channels, + kernel_size=(1, 1), + stride=(1, 1), + padding=(0, 0), + dilation=1, + groups=1, + causal=True, + complex_axis=1, + ): + """ + in_channels: real+imag + out_channels: real+imag + kernel_size : input [B,C,D,T] kernel size in [D,T] + padding : input [B,C,D,T] padding in [D,T] + causal: if causal, will padding time dimension's left side, + otherwise both + """ + super(ComplexConv2d, self).__init__() + self.in_channels = in_channels // 2 + self.out_channels = out_channels // 2 + self.kernel_size = kernel_size + self.stride = stride + self.padding = padding + self.causal = causal + self.groups = groups + self.dilation = dilation + self.complex_axis = complex_axis + self.real_conv = nn.Conv2d( + self.in_channels, + self.out_channels, + kernel_size, + self.stride, + padding=[self.padding[0], 0], + dilation=self.dilation, + groups=self.groups, + ) + self.imag_conv = nn.Conv2d( + self.in_channels, + self.out_channels, + kernel_size, + self.stride, + padding=[self.padding[0], 0], + dilation=self.dilation, + groups=self.groups, + ) + + nn.init.normal_(self.real_conv.weight.data, std=0.05) + nn.init.normal_(self.imag_conv.weight.data, std=0.05) + nn.init.constant_(self.real_conv.bias, 0.0) + nn.init.constant_(self.imag_conv.bias, 0.0) + + def forward(self, inputs): + if self.padding[1] != 0 and self.causal: + inputs = F.pad(inputs, [self.padding[1], 0, 0, 0]) + else: + inputs = F.pad(inputs, [self.padding[1], self.padding[1], 0, 0]) + + if self.complex_axis == 0: + real = self.real_conv(inputs) + imag = self.imag_conv(inputs) + real2real, imag2real = torch.chunk(real, 2, self.complex_axis) + real2imag, imag2imag = torch.chunk(imag, 2, self.complex_axis) + + else: + if isinstance(inputs, torch.Tensor): + real, imag = torch.chunk(inputs, 2, self.complex_axis) + + real2real = self.real_conv(real,) + imag2imag = self.imag_conv(imag,) + + real2imag = self.imag_conv(real) + imag2real = self.real_conv(imag) + + real = real2real - imag2imag + imag = real2imag + imag2real + out = torch.cat([real, imag], self.complex_axis) + + return out + + +class ComplexConvTranspose2d(nn.Module): + def __init__( + self, + in_channels, + out_channels, + kernel_size=(1, 1), + stride=(1, 1), + padding=(0, 0), + output_padding=(0, 0), + causal=False, + complex_axis=1, + groups=1, + ): + """ + in_channels: real+imag + out_channels: real+imag + """ + super(ComplexConvTranspose2d, self).__init__() + self.in_channels = in_channels // 2 + self.out_channels = out_channels // 2 + self.kernel_size = kernel_size + self.stride = stride + self.padding = padding + self.output_padding = output_padding + self.groups = groups + + self.real_conv = nn.ConvTranspose2d( + self.in_channels, + self.out_channels, + kernel_size, + self.stride, + padding=self.padding, + output_padding=output_padding, + 
groups=self.groups, + ) + self.imag_conv = nn.ConvTranspose2d( + self.in_channels, + self.out_channels, + kernel_size, + self.stride, + padding=self.padding, + output_padding=output_padding, + groups=self.groups, + ) + self.complex_axis = complex_axis + + nn.init.normal_(self.real_conv.weight, std=0.05) + nn.init.normal_(self.imag_conv.weight, std=0.05) + nn.init.constant_(self.real_conv.bias, 0.0) + nn.init.constant_(self.imag_conv.bias, 0.0) + + def forward(self, inputs): + + if isinstance(inputs, torch.Tensor): + real, imag = torch.chunk(inputs, 2, self.complex_axis) + elif isinstance(inputs, tuple) or isinstance(inputs, list): + real = inputs[0] + imag = inputs[1] + if self.complex_axis == 0: + real = self.real_conv(inputs) + imag = self.imag_conv(inputs) + real2real, imag2real = torch.chunk(real, 2, self.complex_axis) + real2imag, imag2imag = torch.chunk(imag, 2, self.complex_axis) + + else: + if isinstance(inputs, torch.Tensor): + real, imag = torch.chunk(inputs, 2, self.complex_axis) + + real2real = self.real_conv(real,) + imag2imag = self.imag_conv(imag,) + + real2imag = self.imag_conv(real) + imag2real = self.real_conv(imag) + + real = real2real - imag2imag + imag = real2imag + imag2real + out = torch.cat([real, imag], self.complex_axis) + + return out + + +# Source: https://github.com/ChihebTrabelsi/deep_complex_networks/tree/pytorch +# from https://github.com/IMLHF/SE_DCUNet/blob/f28bf1661121c8901ad38149ea827693f1830715/models/layers/complexnn.py#L55 + + +class ComplexBatchNorm(torch.nn.Module): + def __init__( + self, + num_features, + eps=1e-5, + momentum=0.1, + affine=True, + track_running_stats=True, + complex_axis=1, + ): + super(ComplexBatchNorm, self).__init__() + self.num_features = num_features // 2 + self.eps = eps + self.momentum = momentum + self.affine = affine + self.track_running_stats = track_running_stats + + self.complex_axis = complex_axis + + if self.affine: + self.Wrr = torch.nn.Parameter(torch.Tensor(self.num_features)) + self.Wri = torch.nn.Parameter(torch.Tensor(self.num_features)) + self.Wii = torch.nn.Parameter(torch.Tensor(self.num_features)) + self.Br = torch.nn.Parameter(torch.Tensor(self.num_features)) + self.Bi = torch.nn.Parameter(torch.Tensor(self.num_features)) + else: + self.register_parameter("Wrr", None) + self.register_parameter("Wri", None) + self.register_parameter("Wii", None) + self.register_parameter("Br", None) + self.register_parameter("Bi", None) + + if self.track_running_stats: + self.register_buffer("RMr", torch.zeros(self.num_features)) + self.register_buffer("RMi", torch.zeros(self.num_features)) + self.register_buffer("RVrr", torch.ones(self.num_features)) + self.register_buffer("RVri", torch.zeros(self.num_features)) + self.register_buffer("RVii", torch.ones(self.num_features)) + self.register_buffer( + "num_batches_tracked", torch.tensor(0, dtype=torch.long) + ) + else: + self.register_parameter("RMr", None) + self.register_parameter("RMi", None) + self.register_parameter("RVrr", None) + self.register_parameter("RVri", None) + self.register_parameter("RVii", None) + self.register_parameter("num_batches_tracked", None) + self.reset_parameters() + + def reset_running_stats(self): + if self.track_running_stats: + self.RMr.zero_() + self.RMi.zero_() + self.RVrr.fill_(1) + self.RVri.zero_() + self.RVii.fill_(1) + self.num_batches_tracked.zero_() + + def reset_parameters(self): + self.reset_running_stats() + if self.affine: + self.Br.data.zero_() + self.Bi.data.zero_() + self.Wrr.data.fill_(1) + self.Wri.data.uniform_(-0.9, +0.9) # 
W will be positive-definite + self.Wii.data.fill_(1) + + def _check_input_dim(self, xr, xi): + assert xr.shape == xi.shape + assert xr.size(1) == self.num_features + + def forward(self, inputs): + # self._check_input_dim(xr, xi) + + xr, xi = torch.chunk(inputs, 2, axis=self.complex_axis) + exponential_average_factor = 0.0 + + if self.training and self.track_running_stats: + self.num_batches_tracked += 1 + if self.momentum is None: # use cumulative moving average + exponential_average_factor = 1.0 / self.num_batches_tracked.item() + else: # use exponential moving average + exponential_average_factor = self.momentum + + # + # NOTE: The precise meaning of the "training flag" is: + # True: Normalize using batch statistics, update running statistics + # if they are being collected. + # False: Normalize using running statistics, ignore batch statistics. + # + training = self.training or not self.track_running_stats + redux = [i for i in reversed(range(xr.dim())) if i != 1] + vdim = [1] * xr.dim() + vdim[1] = xr.size(1) + + # + # Mean M Computation and Centering + # + # Includes running mean update if training and running. + # + if training: + Mr, Mi = xr, xi + for d in redux: + Mr = Mr.mean(d, keepdim=True) + Mi = Mi.mean(d, keepdim=True) + if self.track_running_stats: + self.RMr.lerp_(Mr.squeeze(), exponential_average_factor) + self.RMi.lerp_(Mi.squeeze(), exponential_average_factor) + else: + Mr = self.RMr.view(vdim) + Mi = self.RMi.view(vdim) + xr, xi = xr - Mr, xi - Mi + + # + # Variance Matrix V Computation + # + # Includes epsilon numerical stabilizer/Tikhonov regularizer. + # Includes running variance update if training and running. + # + if training: + Vrr = xr * xr + Vri = xr * xi + Vii = xi * xi + for d in redux: + Vrr = Vrr.mean(d, keepdim=True) + Vri = Vri.mean(d, keepdim=True) + Vii = Vii.mean(d, keepdim=True) + if self.track_running_stats: + self.RVrr.lerp_(Vrr.squeeze(), exponential_average_factor) + self.RVri.lerp_(Vri.squeeze(), exponential_average_factor) + self.RVii.lerp_(Vii.squeeze(), exponential_average_factor) + else: + Vrr = self.RVrr.view(vdim) + Vri = self.RVri.view(vdim) + Vii = self.RVii.view(vdim) + Vrr = Vrr + self.eps + Vri = Vri + Vii = Vii + self.eps + + # + # Matrix Inverse Square Root U = V^-0.5 + # + # sqrt of a 2x2 matrix, + # - https://en.wikipedia.org/wiki/Square_root_of_a_2_by_2_matrix + tau = Vrr + Vii + delta = torch.addcmul(Vrr * Vii, -1, Vri, Vri) + s = delta.sqrt() + t = (tau + 2 * s).sqrt() + + # matrix inverse, http://mathworld.wolfram.com/MatrixInverse.html + rst = (s * t).reciprocal() + Urr = (s + Vii) * rst + Uii = (s + Vrr) * rst + Uri = (-Vri) * rst + + # + # Optionally left-multiply U by affine weights W to produce combined + # weights Z, left-multiply the inputs by Z, then optionally bias them. 
+ # + # y = Zx + B + # y = WUx + B + # y = [Wrr Wri][Urr Uri] [xr] + [Br] + # [Wir Wii][Uir Uii] [xi] [Bi] + # + if self.affine: + Wrr, Wri, Wii = ( + self.Wrr.view(vdim), + self.Wri.view(vdim), + self.Wii.view(vdim), + ) + Zrr = (Wrr * Urr) + (Wri * Uri) + Zri = (Wrr * Uri) + (Wri * Uii) + Zir = (Wri * Urr) + (Wii * Uri) + Zii = (Wri * Uri) + (Wii * Uii) + else: + Zrr, Zri, Zir, Zii = Urr, Uri, Uri, Uii + + yr = (Zrr * xr) + (Zri * xi) + yi = (Zir * xr) + (Zii * xi) + + if self.affine: + yr = yr + self.Br.view(vdim) + yi = yi + self.Bi.view(vdim) + + outputs = torch.cat([yr, yi], self.complex_axis) + return outputs + + def extra_repr(self): + return ( + "{num_features}, eps={eps}, momentum={momentum}, affine={affine}, " + "track_running_stats={track_running_stats}".format(**self.__dict__) + ) + + +if __name__ == "__main__": + import dc_crn7 + + torch.manual_seed(20) + onet1 = dc_crn7.ComplexConv2d(12, 12, kernel_size=(3, 2), padding=(2, 1)) + onet2 = dc_crn7.ComplexConvTranspose2d(12, 12, kernel_size=(3, 2), padding=(2, 1)) + inputs = torch.randn([1, 12, 12, 10]) + # print(onet1.real_kernel[0,0,0,0]) + nnet1 = ComplexConv2d(12, 12, kernel_size=(3, 2), padding=(2, 1), causal=True) + # print(nnet1.real_conv.weight[0,0,0,0]) + nnet2 = ComplexConvTranspose2d(12, 12, kernel_size=(3, 2), padding=(2, 1)) + print(torch.mean(nnet1(inputs) - onet1(inputs))) diff --git a/espnet2/enh/separator/dccrn_separator.py b/espnet2/enh/separator/dccrn_separator.py new file mode 100644 index 00000000000..515a50c4369 --- /dev/null +++ b/espnet2/enh/separator/dccrn_separator.py @@ -0,0 +1,302 @@ +from collections import OrderedDict +from distutils.version import LooseVersion +from typing import List, Tuple, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +from espnet2.enh.layers.complexnn import ( + ComplexBatchNorm, + ComplexConv2d, + ComplexConvTranspose2d, + NavieComplexLSTM, + complex_cat, +) +from espnet2.enh.separator.abs_separator import AbsSeparator +from torch_complex.tensor import ComplexTensor + +is_torch_1_9_plus = LooseVersion(torch.__version__) >= LooseVersion("1.9.0") +EPS = torch.finfo(torch.double).eps + + +class DCCRNSeparator(AbsSeparator): + def __init__( + self, + idim, + rnn_layer=2, + rnn_units=256, + fft_len=512, + masking_mode="E", + use_clstm=True, + bidirectional=False, + use_cbn=False, + kernel_size=5, + kernel_num=[32, 64, 128, 256, 256, 256], + use_builtin_complex: bool = True, + ): + """DCCRN separator + + Args: + num_spk (int, optional): number of speakers. Defaults to 1. + rnn_layer (int, optional): number of lstm layers in the crn. Defaults to 2. + rnn_units (int, optional): number of features in the hidden state, + for complex-lstm, rnn_units = real+imag. Defaults to 128. + fft_len (int, optional): n_fft. Defaults to 512. + masking_mode (str, optional): decide how to use the estimated mask. Defaults to "E". + use_clstm (bool, optional): whether to use complex LSTM or not. Defaults to False. + bidirectional (bool, optional): whether to use bidirectional LSTM or not. Defaults to False. + use_cbn (bool, optional): whether to use complex batch normalization. Defaults to False. + kernel_size (int, optional): convolution kernel size. Defaults to 5. + kernel_num (list, optional): output dimension of each convolution layer of the encoder(decoder). + Defaults to [16, 32, 64, 128, 256, 256]. + use_builtin_complex (bool, optional): use torch.complex if True, else use ComplexTensor. 
+ + References + - [1] : "DCCRN: Deep Complex Convolution Recurrent Network for Phase-Aware Speech Enhancement", + Yanxin Hu et al. https://arxiv.org/abs/2008.00264 + - [2] : https://github.com/huyanxin/DeepComplexCRN + """ + super().__init__() + self.use_builtin_complex = use_builtin_complex + + # Network config + self.rnn_units = rnn_units + self.fft_len = fft_len + self.hidden_layers = rnn_layer + self.kernel_size = kernel_size + self.kernel_num = [2] + kernel_num + self.masking_mode = masking_mode + self.use_clstm = use_clstm + + fac = 2 if bidirectional else 1 + + self.encoder = nn.ModuleList() + self.decoder = nn.ModuleList() + for idx in range(len(self.kernel_num) - 1): + self.encoder.append( + nn.Sequential( + ComplexConv2d( + self.kernel_num[idx], + self.kernel_num[idx + 1], + kernel_size=(self.kernel_size, 2), + stride=(2, 1), + padding=(2, 1), + ), + nn.BatchNorm2d(self.kernel_num[idx + 1]) + if not use_cbn + else ComplexBatchNorm(self.kernel_num[idx + 1]), + nn.PReLU(), + ) + ) + hidden_dim = self.fft_len // (2 ** (len(self.kernel_num))) + + if self.use_clstm: + rnns = [] + for idx in range(rnn_layer): + rnns.append( + NavieComplexLSTM( + input_size=hidden_dim * self.kernel_num[-1] + if idx == 0 + else self.rnn_units, + hidden_size=self.rnn_units, + bidirectional=bidirectional, + batch_first=False, + projection_dim=hidden_dim * self.kernel_num[-1] + if idx == rnn_layer - 1 + else None, + ) + ) + self.enhance = nn.Sequential(*rnns) + else: + self.enhance = nn.LSTM( + input_size=hidden_dim * self.kernel_num[-1], + hidden_size=self.rnn_units, + num_layers=2, + dropout=0.0, + bidirectional=bidirectional, + batch_first=False, + ) + self.tranform = nn.Linear( + self.rnn_units * fac, hidden_dim * self.kernel_num[-1] + ) + + for idx in range(len(self.kernel_num) - 1, 0, -1): + if idx != 1: + self.decoder.append( + nn.Sequential( + ComplexConvTranspose2d( + self.kernel_num[idx] * 2, + self.kernel_num[idx - 1], + kernel_size=(self.kernel_size, 2), + stride=(2, 1), + padding=(2, 0), + output_padding=(1, 0), + ), + nn.BatchNorm2d(self.kernel_num[idx - 1]) + if not use_cbn + else ComplexBatchNorm(self.kernel_num[idx - 1]), + nn.PReLU(), + ) + ) + else: + self.decoder.append( + nn.Sequential( + ComplexConvTranspose2d( + self.kernel_num[idx] * 2, + self.kernel_num[idx - 1], + kernel_size=(self.kernel_size, 2), + stride=(2, 1), + padding=(2, 0), + output_padding=(1, 0), + ), + ) + ) + + self.flatten_parameters() + + def forward( + self, input: Union[torch.Tensor, ComplexTensor], ilens: torch.Tensor + ) -> Tuple[List[Union[torch.Tensor, ComplexTensor]], torch.Tensor, OrderedDict]: + """Forward. + + Args: + input (torch.Tensor or ComplexTensor): Encoded feature [B, T, F] + ilens (torch.Tensor): input lengths [Batch] + + Returns: + masked (List[Union(torch.Tensor, ComplexTensor)]): [(B, T, F), ...] + ilens (torch.Tensor): (B,) + others predicted data, e.g. masks: OrderedDict[ + 'mask_spk1': torch.Tensor(Batch, Frames, Freq), + 'mask_spk2': torch.Tensor(Batch, Frames, Freq), + ... 
+ 'mask_spkn': torch.Tensor(Batch, Frames, Freq), + ] + """ + # shape (B, T, F) --> (B, F, T) + specs = input.permute(0, 2, 1) + real, imag = specs.real, specs.imag + # shape (B, F, T) + spec_mags = torch.sqrt(real ** 2 + imag ** 2 + 1e-8) + # shape (B, F, T) + spec_phase = torch.atan2(imag, real) + # shape (B, 2*F, T) + cspecs = torch.stack([real, imag], 1) + # shape (B, 2*F, T-1) + cspecs = cspecs[:, :, 1:] + + out = cspecs + encoder_out = [] + + for idx, layer in enumerate(self.encoder): + out = layer(out) + encoder_out.append(out) + # shape (B, C, F, T) + batch_size, channels, dims, lengths = out.size() + # shape (T, B, C, F) + out = out.permute(3, 0, 1, 2) + if self.use_clstm: + # shape (T, B, C // 2, F) + r_rnn_in = out[:, :, : channels // 2] + # shape (T, B, C // 2, F) + i_rnn_in = out[:, :, channels // 2 :] + # shape (T, B, C // 2 * F) + r_rnn_in = torch.reshape( + r_rnn_in, [lengths, batch_size, channels // 2 * dims] + ) + # shape (T, B, C // 2 * F) + i_rnn_in = torch.reshape( + i_rnn_in, [lengths, batch_size, channels // 2 * dims] + ) + + r_rnn_in, i_rnn_in = self.enhance([r_rnn_in, i_rnn_in]) + # shape (T, B, C // 2, F) + r_rnn_in = torch.reshape( + r_rnn_in, [lengths, batch_size, channels // 2, dims] + ) + # shape (T, B, C // 2, F) + i_rnn_in = torch.reshape( + i_rnn_in, [lengths, batch_size, channels // 2, dims] + ) + # shape (T, B, C, F) + out = torch.cat([r_rnn_in, i_rnn_in], 2) + + else: + # shape (T, B, C*F) + out = torch.reshape(out, [lengths, batch_size, channels * dims]) + out, _ = self.enhance(out) + out = self.tranform(out) + # shape (T, B, C, F) + out = torch.reshape(out, [lengths, batch_size, channels, dims]) + # shape (B, C, F, T) + out = out.permute(1, 2, 3, 0) + + for idx in range(len(self.decoder)): + # skip connection + out = complex_cat([out, encoder_out[-1 - idx]], 1) + out = self.decoder[idx](out) + out = out[..., 1:] + # out shape (B, 2, F, T) + + # shape (B, F-1, T) + mask_real = out[:, 0] + # shape (B, F-1, T) + mask_imag = out[:, 1] + # shape (B, F, T) + mask_real = F.pad(mask_real, [0, 0, 1, 0]) + # shape (B, F, T) + mask_imag = F.pad(mask_imag, [0, 0, 1, 0]) + + # mask shape (B, T, F) + if is_torch_1_9_plus and self.use_builtin_complex: + complex_mask = torch.complex( + mask_real.permute(0, 2, 1), mask_imag.permute(0, 2, 1) + ) + else: + complex_mask = ComplexTensor( + mask_real.permute(0, 2, 1), mask_imag.permute(0, 2, 1) + ) + + others = OrderedDict( + zip( + ["mask_spk{}".format(i + 1) for i in range(self.num_spk)], + [complex_mask], + ) + ) + + if self.masking_mode == "E": + mask_mags = (mask_real ** 2 + mask_imag ** 2) ** 0.5 + # mask_mags = (mask_real ** 2 + mask_imag ** 2 + EPS) ** 0.5 + real_phase = mask_real / (mask_mags + EPS) + imag_phase = mask_imag / (mask_mags + EPS) + # mask_phase = torch.atan2(imag_phase + EPS, real_phase + EPS) + mask_phase = torch.atan2(imag_phase, real_phase) + mask_mags = torch.tanh(mask_mags) + est_mags = mask_mags * spec_mags + est_phase = spec_phase + mask_phase + real = est_mags * torch.cos(est_phase) + imag = est_mags * torch.sin(est_phase) + elif self.masking_mode == "C": + real, imag = ( + real * mask_real - imag * mask_imag, + real * mask_imag + imag * mask_real, + ) + elif self.masking_mode == "R": + real, imag = real * mask_real, imag * mask_imag + + masked = [] + # shape (B, T, F) + if is_torch_1_9_plus and self.use_builtin_complex: + masked.append(torch.complex(real.permute(0, 2, 1), imag.permute(0, 2, 1))) + else: + masked.append(ComplexTensor(real.permute(0, 2, 1), imag.permute(0, 2, 1))) + + return 
(masked, ilens, others) + + def flatten_parameters(self): + if isinstance(self.enhance, nn.LSTM): + self.enhance.flatten_parameters() + + @property + def num_spk(self): + return 1 diff --git a/espnet2/tasks/enh.py b/espnet2/tasks/enh.py index 2a722cba554..30678034d55 100644 --- a/espnet2/tasks/enh.py +++ b/espnet2/tasks/enh.py @@ -33,6 +33,7 @@ from espnet2.enh.separator.asteroid_models import AsteroidModel_Converter from espnet2.enh.separator.conformer_separator import ConformerSeparator from espnet2.enh.separator.dprnn_separator import DPRNNSeparator +from espnet2.enh.separator.dccrn_separator import DCCRNSeparator from espnet2.enh.separator.neural_beamformer import NeuralBeamformer from espnet2.enh.separator.rnn_separator import RNNSeparator from espnet2.enh.separator.tcn_separator import TCNSeparator @@ -60,6 +61,7 @@ rnn=RNNSeparator, tcn=TCNSeparator, dprnn=DPRNNSeparator, + dccrn=DCCRNSeparator, transformer=TransformerSeparator, conformer=ConformerSeparator, wpe_beamformer=NeuralBeamformer, From a046497eb7d186aa8fe9de6cb9e865526b0df10b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9E=97=E5=B0=91=E9=9B=84?= Date: Mon, 21 Feb 2022 22:07:00 +0800 Subject: [PATCH 02/34] Add DCCRN separator, modified espnet2/tasks/enh.py, add a configuration file of dccrn for egs2/dns_ins20 --- espnet2/enh/layers/complexnn.py | 13 +++++++++++++ espnet2/enh/separator/dccrn_separator.py | 1 + 2 files changed, 14 insertions(+) diff --git a/espnet2/enh/layers/complexnn.py b/espnet2/enh/layers/complexnn.py index dd1941c5a5d..07e14d8c39a 100644 --- a/espnet2/enh/layers/complexnn.py +++ b/espnet2/enh/layers/complexnn.py @@ -1,3 +1,16 @@ +# Copyright 2020 huyanxin + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
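+
+# Complex-valued building blocks (complex 2-D convolution and transposed
+# convolution, a naive complex LSTM, and complex batch normalization) used
+# by the DCCRN separator. (Descriptive summary added for readability.)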
import torch import torch.nn as nn import torch.nn.functional as F diff --git a/espnet2/enh/separator/dccrn_separator.py b/espnet2/enh/separator/dccrn_separator.py index 515a50c4369..3ec12c53a2d 100644 --- a/espnet2/enh/separator/dccrn_separator.py +++ b/espnet2/enh/separator/dccrn_separator.py @@ -1,3 +1,4 @@ +# this is modified from https://github.com/huyanxin/DeepComplexCRN/blob/master/dc_crn.py from collections import OrderedDict from distutils.version import LooseVersion from typing import List, Tuple, Union From b4a9ebaa7c4b3407424ca01496ee9e8da4720f2d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9E=97=E5=B0=91=E9=9B=84?= Date: Tue, 22 Feb 2022 20:29:21 +0800 Subject: [PATCH 03/34] Modified espnet2/enh/separator/dccrn_separator.py to support variable num_spk and use_noise_mask option --- espnet2/enh/separator/dccrn_separator.py | 191 ++++++++++++++--------- 1 file changed, 119 insertions(+), 72 deletions(-) diff --git a/espnet2/enh/separator/dccrn_separator.py b/espnet2/enh/separator/dccrn_separator.py index 3ec12c53a2d..392e5904de9 100644 --- a/espnet2/enh/separator/dccrn_separator.py +++ b/espnet2/enh/separator/dccrn_separator.py @@ -1,4 +1,3 @@ -# this is modified from https://github.com/huyanxin/DeepComplexCRN/blob/master/dc_crn.py from collections import OrderedDict from distutils.version import LooseVersion from typing import List, Tuple, Union @@ -20,49 +19,50 @@ EPS = torch.finfo(torch.double).eps -class DCCRNSeparator(AbsSeparator): +class DCCRNSeparatorTest(AbsSeparator): def __init__( self, - idim, - rnn_layer=2, - rnn_units=256, - fft_len=512, - masking_mode="E", - use_clstm=True, - bidirectional=False, - use_cbn=False, - kernel_size=5, - kernel_num=[32, 64, 128, 256, 256, 256], + idim: int, + num_spk: int = 1, + rnn_layer: int = 2, + rnn_units: int = 256, + masking_mode: str = "E", + use_clstm: bool = True, + bidirectional: bool = False, + use_cbn: bool = False, + kernel_size: int = 5, + kernel_num: List[int] = [32, 64, 128, 256, 256, 256], use_builtin_complex: bool = True, + use_noise_mask: bool = False, ): """DCCRN separator Args: + idim (int): input dimension。 num_spk (int, optional): number of speakers. Defaults to 1. rnn_layer (int, optional): number of lstm layers in the crn. Defaults to 2. - rnn_units (int, optional): number of features in the hidden state, - for complex-lstm, rnn_units = real+imag. Defaults to 128. - fft_len (int, optional): n_fft. Defaults to 512. + rnn_units (int, optional): number of features in the hidden state, for complex-lstm, rnn_units = real+imag. Defaults to 128. masking_mode (str, optional): decide how to use the estimated mask. Defaults to "E". use_clstm (bool, optional): whether to use complex LSTM or not. Defaults to False. bidirectional (bool, optional): whether to use bidirectional LSTM or not. Defaults to False. use_cbn (bool, optional): whether to use complex batch normalization. Defaults to False. kernel_size (int, optional): convolution kernel size. Defaults to 5. - kernel_num (list, optional): output dimension of each convolution layer of the encoder(decoder). - Defaults to [16, 32, 64, 128, 256, 256]. + kernel_num (list, optional): output dimension of each convolution layer of the encoder(decoder). Defaults to [16, 32, 64, 128, 256, 256]. use_builtin_complex (bool, optional): use torch.complex if True, else use ComplexTensor. + use_noise_mask (bool, optional): whether to estimate the mask of noise signal. 
References - [1] : "DCCRN: Deep Complex Convolution Recurrent Network for Phase-Aware Speech Enhancement", - Yanxin Hu et al. https://arxiv.org/abs/2008.00264 + Yanxin Hu et al. https://arxiv.org/abs/2008.00264 - [2] : https://github.com/huyanxin/DeepComplexCRN """ super().__init__() self.use_builtin_complex = use_builtin_complex + self._num_spk = num_spk + self.use_noise_mask = use_noise_mask # Network config self.rnn_units = rnn_units - self.fft_len = fft_len self.hidden_layers = rnn_layer self.kernel_size = kernel_size self.kernel_num = [2] + kernel_num @@ -89,7 +89,7 @@ def __init__( nn.PReLU(), ) ) - hidden_dim = self.fft_len // (2 ** (len(self.kernel_num))) + hidden_dim = (idim - 1) // (2 ** (len(self.kernel_num) - 1)) if self.use_clstm: rnns = [] @@ -144,7 +144,9 @@ def __init__( nn.Sequential( ComplexConvTranspose2d( self.kernel_num[idx] * 2, - self.kernel_num[idx - 1], + self.kernel_num[idx - 1] * (self._num_spk + 1) + if self.use_noise_mask + else self.kernel_num[idx - 1] * self._num_spk, kernel_size=(self.kernel_size, 2), stride=(2, 1), padding=(2, 0), @@ -237,60 +239,17 @@ def forward( out = complex_cat([out, encoder_out[-1 - idx]], 1) out = self.decoder[idx](out) out = out[..., 1:] - # out shape (B, 2, F, T) - - # shape (B, F-1, T) - mask_real = out[:, 0] - # shape (B, F-1, T) - mask_imag = out[:, 1] - # shape (B, F, T) - mask_real = F.pad(mask_real, [0, 0, 1, 0]) - # shape (B, F, T) - mask_imag = F.pad(mask_imag, [0, 0, 1, 0]) - - # mask shape (B, T, F) - if is_torch_1_9_plus and self.use_builtin_complex: - complex_mask = torch.complex( - mask_real.permute(0, 2, 1), mask_imag.permute(0, 2, 1) - ) - else: - complex_mask = ComplexTensor( - mask_real.permute(0, 2, 1), mask_imag.permute(0, 2, 1) - ) + # out shape = (B, 2*num_spk, F-1, T) if self.use_noise_mask == False else (B, 2*(num_spk+1), F-1, T) + masks = self.create_masks(out) + masked = self.apply_masks(masks, spec_mags, spec_phase) others = OrderedDict( - zip( - ["mask_spk{}".format(i + 1) for i in range(self.num_spk)], - [complex_mask], - ) + zip(["mask_spk{}".format(i + 1) for i in range(self.num_spk)], masks,) ) - if self.masking_mode == "E": - mask_mags = (mask_real ** 2 + mask_imag ** 2) ** 0.5 - # mask_mags = (mask_real ** 2 + mask_imag ** 2 + EPS) ** 0.5 - real_phase = mask_real / (mask_mags + EPS) - imag_phase = mask_imag / (mask_mags + EPS) - # mask_phase = torch.atan2(imag_phase + EPS, real_phase + EPS) - mask_phase = torch.atan2(imag_phase, real_phase) - mask_mags = torch.tanh(mask_mags) - est_mags = mask_mags * spec_mags - est_phase = spec_phase + mask_phase - real = est_mags * torch.cos(est_phase) - imag = est_mags * torch.sin(est_phase) - elif self.masking_mode == "C": - real, imag = ( - real * mask_real - imag * mask_imag, - real * mask_imag + imag * mask_real, - ) - elif self.masking_mode == "R": - real, imag = real * mask_real, imag * mask_imag - - masked = [] - # shape (B, T, F) - if is_torch_1_9_plus and self.use_builtin_complex: - masked.append(torch.complex(real.permute(0, 2, 1), imag.permute(0, 2, 1))) - else: - masked.append(ComplexTensor(real.permute(0, 2, 1), imag.permute(0, 2, 1))) + if self.use_noise_mask: + others["mask_noise1"] = masks[-1] + others["noise1"] = masked.pop(-1) return (masked, ilens, others) @@ -298,6 +257,94 @@ def flatten_parameters(self): if isinstance(self.enhance, nn.LSTM): self.enhance.flatten_parameters() + def create_masks(self, mask_tensor: torch.Tensor): + """create estimated mask for each speaker + + Args: + mask_tensor (torch.Tensor): the output of the decoder, 
shape(B, 2*num_spk, F-1, T) + """ + if self.use_noise_mask: + assert mask_tensor.shape[1] == 2 * (self._num_spk + 1), mask_tensor.shape[1] + else: + assert mask_tensor.shape[1] == 2 * self._num_spk, mask_tensor.shape[1] + + masks = [] + for idx in range(mask_tensor.shape[1] // 2): + # shape (B, F-1, T) + mask_real = mask_tensor[:, idx * 2] + # shape (B, F-1, T) + mask_imag = mask_tensor[:, idx * 2 + 1] + # shape (B, F, T) + mask_real = F.pad(mask_real, [0, 0, 1, 0]) + # shape (B, F, T) + mask_imag = F.pad(mask_imag, [0, 0, 1, 0]) + + # mask shape (B, T, F) + if is_torch_1_9_plus and self.use_builtin_complex: + complex_mask = torch.complex( + mask_real.permute(0, 2, 1), mask_imag.permute(0, 2, 1) + ) + else: + complex_mask = ComplexTensor( + mask_real.permute(0, 2, 1), mask_imag.permute(0, 2, 1) + ) + + masks.append(complex_mask) + + return masks + + def apply_masks( + self, + masks: List[Union[torch.Tensor, ComplexTensor]], + spec_mags: torch.Tensor, + spec_phase: torch.Tensor, + ): + """apply masks + + Args: + masks (List[Union[torch.Tensor, ComplexTensor]]): estimated masks, [(B, T, F), ...] + spec_mags (torch.Tensor): magnitude of the noisy spectrum, (B, F, T) + spec_phase (torch.Tensor): phase of the noisy spectrum, (B, F, T) + + Returns: + masked (List[Union(torch.Tensor, ComplexTensor)]): [(B, T, F), ...] + """ + masked = [] + for i in range(len(masks)): + # shape (B, T, F) --> (B, F, T) + mask_real = masks[i].real.permute(0, 2, 1) + mask_imag = masks[i].imag.permute(0, 2, 1) + if self.masking_mode == "E": + mask_mags = (mask_real ** 2 + mask_imag ** 2) ** 0.5 + # mask_mags = (mask_real ** 2 + mask_imag ** 2 + EPS) ** 0.5 + real_phase = mask_real / (mask_mags + EPS) + imag_phase = mask_imag / (mask_mags + EPS) + # mask_phase = torch.atan2(imag_phase + EPS, real_phase + EPS) + mask_phase = torch.atan2(imag_phase, real_phase) + mask_mags = torch.tanh(mask_mags) + est_mags = mask_mags * spec_mags + est_phase = spec_phase + mask_phase + real = est_mags * torch.cos(est_phase) + imag = est_mags * torch.sin(est_phase) + elif self.masking_mode == "C": + real, imag = ( + real * mask_real - imag * mask_imag, + real * mask_imag + imag * mask_real, + ) + elif self.masking_mode == "R": + real, imag = real * mask_real, imag * mask_imag + + # shape (B, F, T) --> (B, T, F) + if is_torch_1_9_plus and self.use_builtin_complex: + masked.append( + torch.complex(real.permute(0, 2, 1), imag.permute(0, 2, 1)) + ) + else: + masked.append( + ComplexTensor(real.permute(0, 2, 1), imag.permute(0, 2, 1)) + ) + return masked + @property def num_spk(self): - return 1 + return self._num_spk From cf1046684136d58cdbeeb25b67f86fecb27fd87a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9E=97=E5=B0=91=E9=9B=84?= Date: Tue, 22 Feb 2022 20:33:24 +0800 Subject: [PATCH 04/34] Modified espnet2/enh/separator/dccrn_separator.py to support variable num_spk and use_noise_mask option --- espnet2/enh/separator/dccrn_separator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/espnet2/enh/separator/dccrn_separator.py b/espnet2/enh/separator/dccrn_separator.py index 392e5904de9..f269a9cd3f4 100644 --- a/espnet2/enh/separator/dccrn_separator.py +++ b/espnet2/enh/separator/dccrn_separator.py @@ -19,7 +19,7 @@ EPS = torch.finfo(torch.double).eps -class DCCRNSeparatorTest(AbsSeparator): +class DCCRNSeparator(AbsSeparator): def __init__( self, idim: int, From 7de0d5bb9f4297e66a5a4d2b9dcca2f4808ef16b Mon Sep 17 00:00:00 2001 From: LinShaoxiong Date: Wed, 23 Feb 2022 22:38:37 +0800 Subject: [PATCH 05/34] Modified the code 
to pass the test --- espnet2/enh/layers/complexnn.py | 46 ++++---- espnet2/enh/separator/dccrn_separator.py | 78 ++++++++------ .../enh/separator/test_dccrn_separator.py | 101 ++++++++++++++++++ test/espnet2/enh/test_espnet_model.py | 66 ++++++++---- 4 files changed, 215 insertions(+), 76 deletions(-) create mode 100644 test/espnet2/enh/separator/test_dccrn_separator.py diff --git a/espnet2/enh/layers/complexnn.py b/espnet2/enh/layers/complexnn.py index 07e14d8c39a..518282e2aed 100644 --- a/espnet2/enh/layers/complexnn.py +++ b/espnet2/enh/layers/complexnn.py @@ -48,7 +48,7 @@ def __init__( batch_first=False, ): super(NavieComplexLSTM, self).__init__() - + self.bidirectional = bidirectional self.input_dim = input_size // 2 self.rnn_units = hidden_size // 2 self.real_lstm = nn.LSTM( @@ -94,7 +94,6 @@ def forward(self, inputs): if self.projection_dim is not None: real_out = self.r_trans(real_out) imag_out = self.i_trans(imag_out) - # print(real_out.shape,imag_out.shape) return [real_out, imag_out] def flatten_parameters(self): @@ -129,12 +128,12 @@ def __init__( complex_axis=1, ): """ - in_channels: real+imag - out_channels: real+imag - kernel_size : input [B,C,D,T] kernel size in [D,T] - padding : input [B,C,D,T] padding in [D,T] - causal: if causal, will padding time dimension's left side, - otherwise both + in_channels: real+imag + out_channels: real+imag + kernel_size : input [B,C,D,T] kernel size in [D,T] + padding : input [B,C,D,T] padding in [D,T] + causal: if causal, will padding time dimension's left side, + otherwise both """ super(ComplexConv2d, self).__init__() self.in_channels = in_channels // 2 @@ -186,8 +185,12 @@ def forward(self, inputs): if isinstance(inputs, torch.Tensor): real, imag = torch.chunk(inputs, 2, self.complex_axis) - real2real = self.real_conv(real,) - imag2imag = self.imag_conv(imag,) + real2real = self.real_conv( + real, + ) + imag2imag = self.imag_conv( + imag, + ) real2imag = self.imag_conv(real) imag2real = self.real_conv(imag) @@ -213,8 +216,8 @@ def __init__( groups=1, ): """ - in_channels: real+imag - out_channels: real+imag + in_channels: real+imag + out_channels: real+imag """ super(ComplexConvTranspose2d, self).__init__() self.in_channels = in_channels // 2 @@ -251,7 +254,6 @@ def __init__( nn.init.constant_(self.imag_conv.bias, 0.0) def forward(self, inputs): - if isinstance(inputs, torch.Tensor): real, imag = torch.chunk(inputs, 2, self.complex_axis) elif isinstance(inputs, tuple) or isinstance(inputs, list): @@ -267,23 +269,24 @@ def forward(self, inputs): if isinstance(inputs, torch.Tensor): real, imag = torch.chunk(inputs, 2, self.complex_axis) - real2real = self.real_conv(real,) - imag2imag = self.imag_conv(imag,) + real2real = self.real_conv( + real, + ) + imag2imag = self.imag_conv( + imag, + ) real2imag = self.imag_conv(real) imag2real = self.real_conv(imag) real = real2real - imag2imag imag = real2imag + imag2real + out = torch.cat([real, imag], self.complex_axis) return out -# Source: https://github.com/ChihebTrabelsi/deep_complex_networks/tree/pytorch -# from https://github.com/IMLHF/SE_DCUNet/blob/f28bf1661121c8901ad38149ea827693f1830715/models/layers/complexnn.py#L55 - - class ComplexBatchNorm(torch.nn.Module): def __init__( self, @@ -486,8 +489,7 @@ def extra_repr(self): onet1 = dc_crn7.ComplexConv2d(12, 12, kernel_size=(3, 2), padding=(2, 1)) onet2 = dc_crn7.ComplexConvTranspose2d(12, 12, kernel_size=(3, 2), padding=(2, 1)) inputs = torch.randn([1, 12, 12, 10]) - # print(onet1.real_kernel[0,0,0,0]) + nnet1 = ComplexConv2d(12, 12, 
kernel_size=(3, 2), padding=(2, 1), causal=True)
-    # print(nnet1.real_conv.weight[0,0,0,0])
+
     nnet2 = ComplexConvTranspose2d(12, 12, kernel_size=(3, 2), padding=(2, 1))
-    print(torch.mean(nnet1(inputs) - onet1(inputs)))
diff --git a/espnet2/enh/separator/dccrn_separator.py b/espnet2/enh/separator/dccrn_separator.py
index f269a9cd3f4..c7d571123f1 100644
--- a/espnet2/enh/separator/dccrn_separator.py
+++ b/espnet2/enh/separator/dccrn_separator.py
@@ -22,7 +22,7 @@ class DCCRNSeparator(AbsSeparator):
     def __init__(
         self,
-        idim: int,
+        input_dim: int,
         num_spk: int = 1,
         rnn_layer: int = 2,
         rnn_units: int = 256,
@@ -38,29 +38,26 @@ def __init__(
        """DCCRN separator

        Args:
-            idim (int): input dimension。
+            input_dim (int): input dimension.
             num_spk (int, optional): number of speakers. Defaults to 1.
             rnn_layer (int, optional): number of lstm layers in the crn. Defaults to 2.
-            rnn_units (int, optional): number of features in the hidden state, for complex-lstm, rnn_units = real+imag. Defaults to 128.
-            masking_mode (str, optional): decide how to use the estimated mask. Defaults to "E".
-            use_clstm (bool, optional): whether to use complex LSTM or not. Defaults to False.
-            bidirectional (bool, optional): whether to use bidirectional LSTM or not. Defaults to False.
-            use_cbn (bool, optional): whether to use complex batch normalization. Defaults to False.
+            rnn_units (int, optional): number of RNN units; for the complex LSTM, rnn_units = real + imag. Defaults to 256.
+            masking_mode (str, optional): how the estimated mask is applied. Defaults to "E".
+            use_clstm (bool, optional): whether to use the complex LSTM. Defaults to True.
+            bidirectional (bool, optional): whether to use a bidirectional LSTM. Defaults to False.
+            use_cbn (bool, optional): whether to use complex batch normalization. Defaults to False.
             kernel_size (int, optional): convolution kernel size. Defaults to 5.
-            kernel_num (list, optional): output dimension of each convolution layer of the encoder(decoder). Defaults to [16, 32, 64, 128, 256, 256].
-            use_builtin_complex (bool, optional): use torch.complex if True,
-                else ComplexTensor.
-            use_noise_mask (bool, optional): whether to estimate the mask of noise signal.
+            kernel_num (list, optional): output channels of each encoder (decoder) layer.
+            use_builtin_complex (bool, optional): use torch.complex if True,
+                else ComplexTensor.
+            use_noise_mask (bool, optional): whether to estimate a separate mask for the noise signal.
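+
+        Example (an illustrative sketch, not part of the original patch:
+        input_dim=257 matches an STFT encoder with n_fft=512, and all
+        other arguments keep their defaults):
+
+            >>> import torch
+            >>> from torch_complex.tensor import ComplexTensor
+            >>> separator = DCCRNSeparator(input_dim=257)
+            >>> spec = ComplexTensor(torch.rand(2, 100, 257), torch.rand(2, 100, 257))
+            >>> ilens = torch.tensor([100, 98], dtype=torch.long)
+            >>> masked, ilens, others = separator(spec, ilens)
+            >>> len(masked)  # one estimated spectrum per speaker
+            1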
""" super().__init__() self.use_builtin_complex = use_builtin_complex self._num_spk = num_spk self.use_noise_mask = use_noise_mask - + if masking_mode not in ["C", "E", "R"]: + raise ValueError("Unsupported masking mode: %s" % masking_mode) # Network config self.rnn_units = rnn_units self.hidden_layers = rnn_layer @@ -89,7 +86,10 @@ def __init__( nn.PReLU(), ) ) - hidden_dim = (idim - 1) // (2 ** (len(self.kernel_num) - 1)) + hidden_dim = (input_dim - 1 + 2 ** (len(self.kernel_num) - 1) - 1) // ( + 2 ** (len(self.kernel_num) - 1) + ) + hidden_dim = hidden_dim if hidden_dim > 0 else 1 if self.use_clstm: rnns = [] @@ -98,7 +98,7 @@ def __init__( NavieComplexLSTM( input_size=hidden_dim * self.kernel_num[-1] if idx == 0 - else self.rnn_units, + else self.rnn_units * fac, hidden_size=self.rnn_units, bidirectional=bidirectional, batch_first=False, @@ -179,13 +179,14 @@ def forward( # shape (B, T, F) --> (B, F, T) specs = input.permute(0, 2, 1) real, imag = specs.real, specs.imag - # shape (B, F, T) - spec_mags = torch.sqrt(real ** 2 + imag ** 2 + 1e-8) - # shape (B, F, T) - spec_phase = torch.atan2(imag, real) - # shape (B, 2*F, T) + + # # shape (B, F, T) + # spec_mags = torch.sqrt(real**2 + imag**2 + 1e-8) + # # shape (B, F, T) + # spec_phase = torch.atan2(imag, real) + # shape (B, 2, F, T) cspecs = torch.stack([real, imag], 1) - # shape (B, 2*F, T-1) + # shape (B, 2, F-1, T) cspecs = cspecs[:, :, 1:] out = cspecs @@ -211,7 +212,6 @@ def forward( i_rnn_in = torch.reshape( i_rnn_in, [lengths, batch_size, channels // 2 * dims] ) - r_rnn_in, i_rnn_in = self.enhance([r_rnn_in, i_rnn_in]) # shape (T, B, C // 2, F) r_rnn_in = torch.reshape( @@ -239,12 +239,16 @@ def forward( out = complex_cat([out, encoder_out[-1 - idx]], 1) out = self.decoder[idx](out) out = out[..., 1:] - # out shape = (B, 2*num_spk, F-1, T) if self.use_noise_mask == False else (B, 2*(num_spk+1), F-1, T) + # out shape = (B, 2*num_spk, F-1, T) if self.use_noise_mask == False + # else (B, 2*(num_spk+1), F-1, T) masks = self.create_masks(out) - masked = self.apply_masks(masks, spec_mags, spec_phase) + masked = self.apply_masks(masks, real, imag) others = OrderedDict( - zip(["mask_spk{}".format(i + 1) for i in range(self.num_spk)], masks,) + zip( + ["mask_spk{}".format(i + 1) for i in range(self.num_spk)], + masks, + ) ) if self.use_noise_mask: @@ -261,7 +265,7 @@ def create_masks(self, mask_tensor: torch.Tensor): """create estimated mask for each speaker Args: - mask_tensor (torch.Tensor): the output of the decoder, shape(B, 2*num_spk, F-1, T) + mask_tensor (torch.Tensor): output of decoder, shape(B, 2*num_spk, F-1, T) """ if self.use_noise_mask: assert mask_tensor.shape[1] == 2 * (self._num_spk + 1), mask_tensor.shape[1] @@ -296,15 +300,15 @@ def create_masks(self, mask_tensor: torch.Tensor): def apply_masks( self, masks: List[Union[torch.Tensor, ComplexTensor]], - spec_mags: torch.Tensor, - spec_phase: torch.Tensor, + real: torch.Tensor, + imag: torch.Tensor, ): """apply masks Args: - masks (List[Union[torch.Tensor, ComplexTensor]]): estimated masks, [(B, T, F), ...] - spec_mags (torch.Tensor): magnitude of the noisy spectrum, (B, F, T) - spec_phase (torch.Tensor): phase of the noisy spectrum, (B, F, T) + masks : est_masks, [(B, T, F), ...] + real (torch.Tensor): real part of the noisy spectrum, (B, F, T) + imag (torch.Tensor): imag part of the noisy spectrum, (B, F, T) Returns: masked (List[Union(torch.Tensor, ComplexTensor)]): [(B, T, F), ...] 
@@ -315,7 +319,11 @@ def apply_masks( mask_real = masks[i].real.permute(0, 2, 1) mask_imag = masks[i].imag.permute(0, 2, 1) if self.masking_mode == "E": - mask_mags = (mask_real ** 2 + mask_imag ** 2) ** 0.5 + # shape (B, F, T) + spec_mags = torch.sqrt(real**2 + imag**2 + 1e-8) + # shape (B, F, T) + spec_phase = torch.atan2(imag, real) + mask_mags = (mask_real**2 + mask_imag**2) ** 0.5 # mask_mags = (mask_real ** 2 + mask_imag ** 2 + EPS) ** 0.5 real_phase = mask_real / (mask_mags + EPS) imag_phase = mask_imag / (mask_mags + EPS) diff --git a/test/espnet2/enh/separator/test_dccrn_separator.py b/test/espnet2/enh/separator/test_dccrn_separator.py new file mode 100644 index 00000000000..4e5ea5fe65f --- /dev/null +++ b/test/espnet2/enh/separator/test_dccrn_separator.py @@ -0,0 +1,101 @@ +import pytest + +import torch +from torch_complex import ComplexTensor + +from espnet2.enh.separator.dccrn_separator import DCCRNSeparator + + +@pytest.mark.parametrize("input_dim", [9]) +@pytest.mark.parametrize("num_spk", [1, 2]) +@pytest.mark.parametrize("rnn_layer", [2, 3]) +@pytest.mark.parametrize("rnn_units", [256]) +@pytest.mark.parametrize("masking_mode", ["E", "C", "R"]) +@pytest.mark.parametrize("use_clstm", [True, False]) +@pytest.mark.parametrize("bidirectional", [True, False]) +@pytest.mark.parametrize("use_cbn", [True, False]) +@pytest.mark.parametrize("kernel_size", [5]) +@pytest.mark.parametrize("use_builtin_complex", [True, False]) +@pytest.mark.parametrize("use_noise_mask", [True, False]) +def test_dccrn_separator_forward_backward_complex( + input_dim, + num_spk, + rnn_layer, + rnn_units, + masking_mode, + use_clstm, + bidirectional, + use_cbn, + kernel_size, + use_builtin_complex, + use_noise_mask, +): + model = DCCRNSeparator( + input_dim=input_dim, + num_spk=num_spk, + rnn_layer=rnn_layer, + rnn_units=rnn_units, + masking_mode=masking_mode, + use_clstm=use_clstm, + bidirectional=bidirectional, + use_cbn=use_cbn, + kernel_size=kernel_size, + kernel_num=[ + 32, + 64, + 128, + ], + use_builtin_complex=use_builtin_complex, + use_noise_mask=use_noise_mask, + ) + model.train() + + real = torch.rand(2, 10, input_dim) + imag = torch.rand(2, 10, input_dim) + x = ComplexTensor(real, imag) + x_lens = torch.tensor([10, 8], dtype=torch.long) + + masked, flens, others = model(x, ilens=x_lens) + print("type masked[0]: ", type(masked[0])) + print("masked[0]: ", masked[0]) + + if use_builtin_complex: + assert isinstance(masked[0], torch.Tensor) + else: + assert isinstance(masked[0], ComplexTensor) + assert len(masked) == num_spk + + masked[0].abs().mean().backward() + + +def test_dccrn_separator_invalid_type(): + with pytest.raises(ValueError): + DCCRNSeparator( + input_dim=10, + masking_mode="fff", + ) + + +def test_rnn_separator_output(): + real = torch.rand(2, 10, 9) + imag = torch.rand(2, 10, 9) + x = ComplexTensor(real, imag) + x_lens = torch.tensor([10, 8], dtype=torch.long) + + for num_spk in range(1, 3): + model = DCCRNSeparator( + input_dim=9, + num_spk=num_spk, + kernel_num=[ + 32, + 64, + 128, + ], + ) + model.eval() + specs, _, others = model(x, x_lens) + assert isinstance(specs, list) + assert isinstance(others, dict) + for n in range(num_spk): + assert "mask_spk{}".format(n + 1) in others + assert specs[n].shape == others["mask_spk{}".format(n + 1)].shape diff --git a/test/espnet2/enh/test_espnet_model.py b/test/espnet2/enh/test_espnet_model.py index b51ea67fd7d..ada6fe55297 100644 --- a/test/espnet2/enh/test_espnet_model.py +++ b/test/espnet2/enh/test_espnet_model.py @@ -14,52 +14,54 @@ 
from espnet2.enh.loss.wrappers.fixed_order import FixedOrderSolver
 from espnet2.enh.loss.wrappers.pit_solver import PITSolver
 from espnet2.enh.separator.dprnn_separator import DPRNNSeparator
+from espnet2.enh.separator.dccrn_separator import DCCRNSeparator
 from espnet2.enh.separator.neural_beamformer import NeuralBeamformer
 from espnet2.enh.separator.rnn_separator import RNNSeparator
 from espnet2.enh.separator.tcn_separator import TCNSeparator
 from espnet2.enh.separator.transformer_separator import TransformerSeparator
 
+
 is_torch_1_9_plus = LooseVersion(torch.__version__) >= LooseVersion("1.9.0")
 
 stft_encoder = STFTEncoder(
-    n_fft=28,
-    hop_length=16,
+    n_fft=16,
+    hop_length=8,
 )
 
 stft_encoder_bultin_complex = STFTEncoder(
-    n_fft=28,
-    hop_length=16,
+    n_fft=16,
+    hop_length=8,
     use_builtin_complex=True,
 )
 
 stft_decoder = STFTDecoder(
-    n_fft=28,
-    hop_length=16,
+    n_fft=16,
+    hop_length=8,
 )
 
 conv_encoder = ConvEncoder(
-    channel=15,
-    kernel_size=32,
-    stride=16,
+    channel=9,
+    kernel_size=20,
+    stride=10,
 )
 
 conv_decoder = ConvDecoder(
-    channel=15,
-    kernel_size=32,
-    stride=16,
+    channel=9,
+    kernel_size=20,
+    stride=10,
 )
 
 rnn_separator = RNNSeparator(
-    input_dim=15,
+    input_dim=9,
     layer=1,
     unit=10,
 )
 
-dprnn_separator = DPRNNSeparator(input_dim=15, layer=1, unit=10, segment_size=4)
+dprnn_separator = DPRNNSeparator(input_dim=9, layer=1, unit=10, segment_size=4)
 
 tcn_separator = TCNSeparator(
-    input_dim=15,
+    input_dim=9,
     layer=2,
     stack=1,
     bottleneck_dim=10,
@@ -68,14 +70,31 @@
 )
 
 transformer_separator = TransformerSeparator(
-    input_dim=15,
+    input_dim=9,
     adim=8,
     aheads=2,
     layers=2,
     linear_units=10,
 )
 
-
+dccrn_separator = DCCRNSeparator(
+    input_dim=9,
+    num_spk=1,
+    rnn_layer=2,
+    rnn_units=256,
+    masking_mode="E",
+    use_clstm=True,
+    bidirectional=False,
+    use_cbn=False,
+    kernel_size=5,
+    kernel_num=[
+        32,
+        64,
+        128,
+    ],
+    use_builtin_complex=True,
+    use_noise_mask=False,
+)
 si_snr_loss = SISNRLoss()
 tf_mse_loss = FrequencyDomainMSE()
 tf_l1_loss = FrequencyDomainL1()
@@ -93,15 +112,24 @@
     ],
 )
 @pytest.mark.parametrize(
-    "separator", [rnn_separator, dprnn_separator, tcn_separator, transformer_separator]
+    "separator",
+    [
+        rnn_separator,
+        dprnn_separator,
+        dccrn_separator,
+        tcn_separator,
+        transformer_separator,
+    ],
 )
 @pytest.mark.parametrize("training", [True, False])
 @pytest.mark.parametrize("loss_wrappers", [[pit_wrapper, fix_order_solver]])
 def test_single_channel_model(encoder, decoder, separator, training, loss_wrappers):
+    # DCCRN separator does not support ConvEncoder and ConvDecoder
+    if isinstance(encoder, ConvEncoder) and isinstance(separator, DCCRNSeparator):
+        return
     inputs = torch.randn(2, 300)
     ilens = torch.LongTensor([300, 200])
     speech_refs = [torch.randn(2, 300).float(), torch.randn(2, 300).float()]
-
     enh_model = ESPnetEnhancementModel(
         encoder=encoder,
         separator=separator,

From 4c845283862b33052872ce45e2d7a5f233950f39 Mon Sep 17 00:00:00 2001
From: LinShaoxiong
Date: Wed, 23 Feb 2022 22:48:44 +0800
Subject: [PATCH 06/34] Removed some unused code

---
 espnet2/enh/layers/complexnn.py | 22 ----------------------
 1 file changed, 22 deletions(-)

diff --git a/espnet2/enh/layers/complexnn.py b/espnet2/enh/layers/complexnn.py
index 518282e2aed..6c60c7536da 100644
--- a/espnet2/enh/layers/complexnn.py
+++ b/espnet2/enh/layers/complexnn.py
@@ -16,28 +16,6 @@
 import torch.nn.functional as F
 
 
-def get_casual_padding1d():
-    pass
-
-
-def get_casual_padding2d():
-    pass
-
-
-class cPReLU(nn.Module):
-    def __init__(self, complex_axis=1):
-        super(cPReLU, self).__init__()
self.r_prelu = nn.PReLU() - self.i_prelu = nn.PReLU() - self.complex_axis = complex_axis - - def forward(self, inputs): - real, imag = torch.chunk(inputs, 2, self.complex_axis) - real = self.r_prelu(real) - imag = self.i_prelu(imag) - return torch.cat([real, imag], self.complex_axis) - - class NavieComplexLSTM(nn.Module): def __init__( self, From d36483a5a9c7cdbccf071b19f493d6009e11bb78 Mon Sep 17 00:00:00 2001 From: LinShaoxiong Date: Thu, 24 Feb 2022 15:07:09 +0800 Subject: [PATCH 07/34] Modified docstrings and import order --- espnet2/enh/layers/complexnn.py | 45 +++--------------------- espnet2/enh/separator/dccrn_separator.py | 21 +++++------ espnet2/tasks/enh.py | 2 +- test/espnet2/enh/test_espnet_model.py | 2 +- 4 files changed, 17 insertions(+), 53 deletions(-) diff --git a/espnet2/enh/layers/complexnn.py b/espnet2/enh/layers/complexnn.py index 6c60c7536da..5b458ca0ac2 100644 --- a/espnet2/enh/layers/complexnn.py +++ b/espnet2/enh/layers/complexnn.py @@ -1,16 +1,3 @@ -# Copyright 2020 huyanxin - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at - -# http://www.apache.org/licenses/LICENSE-2.0 - -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. import torch import torch.nn as nn import torch.nn.functional as F @@ -105,7 +92,8 @@ def __init__( causal=True, complex_axis=1, ): - """ + """ComplexConv2d. + in_channels: real+imag out_channels: real+imag kernel_size : input [B,C,D,T] kernel size in [D,T] @@ -193,7 +181,8 @@ def __init__( complex_axis=1, groups=1, ): - """ + """ComplexConvTranspose2d. + in_channels: real+imag out_channels: real+imag """ @@ -338,7 +327,6 @@ def _check_input_dim(self, xr, xi): assert xr.size(1) == self.num_features def forward(self, inputs): - # self._check_input_dim(xr, xi) xr, xi = torch.chunk(inputs, 2, axis=self.complex_axis) exponential_average_factor = 0.0 @@ -350,22 +338,17 @@ def forward(self, inputs): else: # use exponential moving average exponential_average_factor = self.momentum - # # NOTE: The precise meaning of the "training flag" is: # True: Normalize using batch statistics, update running statistics # if they are being collected. # False: Normalize using running statistics, ignore batch statistics. - # training = self.training or not self.track_running_stats redux = [i for i in reversed(range(xr.dim())) if i != 1] vdim = [1] * xr.dim() vdim[1] = xr.size(1) - # # Mean M Computation and Centering - # # Includes running mean update if training and running. - # if training: Mr, Mi = xr, xi for d in redux: @@ -379,12 +362,9 @@ def forward(self, inputs): Mi = self.RMi.view(vdim) xr, xi = xr - Mr, xi - Mi - # # Variance Matrix V Computation - # # Includes epsilon numerical stabilizer/Tikhonov regularizer. # Includes running variance update if training and running. 
- # if training: Vrr = xr * xr Vri = xr * xi @@ -405,9 +385,7 @@ def forward(self, inputs): Vri = Vri Vii = Vii + self.eps - # # Matrix Inverse Square Root U = V^-0.5 - # # sqrt of a 2x2 matrix, # - https://en.wikipedia.org/wiki/Square_root_of_a_2_by_2_matrix tau = Vrr + Vii @@ -421,7 +399,6 @@ def forward(self, inputs): Uii = (s + Vrr) * rst Uri = (-Vri) * rst - # # Optionally left-multiply U by affine weights W to produce combined # weights Z, left-multiply the inputs by Z, then optionally bias them. # @@ -429,7 +406,6 @@ def forward(self, inputs): # y = WUx + B # y = [Wrr Wri][Urr Uri] [xr] + [Br] # [Wir Wii][Uir Uii] [xi] [Bi] - # if self.affine: Wrr, Wri, Wii = ( self.Wrr.view(vdim), @@ -458,16 +434,3 @@ def extra_repr(self): "{num_features}, eps={eps}, momentum={momentum}, affine={affine}, " "track_running_stats={track_running_stats}".format(**self.__dict__) ) - - -if __name__ == "__main__": - import dc_crn7 - - torch.manual_seed(20) - onet1 = dc_crn7.ComplexConv2d(12, 12, kernel_size=(3, 2), padding=(2, 1)) - onet2 = dc_crn7.ComplexConvTranspose2d(12, 12, kernel_size=(3, 2), padding=(2, 1)) - inputs = torch.randn([1, 12, 12, 10]) - - nnet1 = ComplexConv2d(12, 12, kernel_size=(3, 2), padding=(2, 1), causal=True) - - nnet2 = ComplexConvTranspose2d(12, 12, kernel_size=(3, 2), padding=(2, 1)) diff --git a/espnet2/enh/separator/dccrn_separator.py b/espnet2/enh/separator/dccrn_separator.py index c7d571123f1..ac0e13b0217 100644 --- a/espnet2/enh/separator/dccrn_separator.py +++ b/espnet2/enh/separator/dccrn_separator.py @@ -1,20 +1,21 @@ from collections import OrderedDict from distutils.version import LooseVersion -from typing import List, Tuple, Union +from typing import List +from typing import Tuple +from typing import Union import torch import torch.nn as nn import torch.nn.functional as F -from espnet2.enh.layers.complexnn import ( - ComplexBatchNorm, - ComplexConv2d, - ComplexConvTranspose2d, - NavieComplexLSTM, - complex_cat, -) -from espnet2.enh.separator.abs_separator import AbsSeparator from torch_complex.tensor import ComplexTensor +from espnet2.enh.layers.complexnn import complex_cat +from espnet2.enh.layers.complexnn import ComplexBatchNorm +from espnet2.enh.layers.complexnn import ComplexConv2d +from espnet2.enh.layers.complexnn import ComplexConvTranspose2d +from espnet2.enh.layers.complexnn import NavieComplexLSTM +from espnet2.enh.separator.abs_separator import AbsSeparator + is_torch_1_9_plus = LooseVersion(torch.__version__) >= LooseVersion("1.9.0") EPS = torch.finfo(torch.double).eps @@ -35,7 +36,7 @@ def __init__( use_builtin_complex: bool = True, use_noise_mask: bool = False, ): - """DCCRN separator + """DCCRN separator. 
diff --git a/espnet2/enh/separator/dccrn_separator.py b/espnet2/enh/separator/dccrn_separator.py
index c7d571123f1..ac0e13b0217 100644
--- a/espnet2/enh/separator/dccrn_separator.py
+++ b/espnet2/enh/separator/dccrn_separator.py
@@ -1,20 +1,21 @@
 from collections import OrderedDict
 from distutils.version import LooseVersion
-from typing import List, Tuple, Union
+from typing import List
+from typing import Tuple
+from typing import Union

 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from espnet2.enh.layers.complexnn import (
-    ComplexBatchNorm,
-    ComplexConv2d,
-    ComplexConvTranspose2d,
-    NavieComplexLSTM,
-    complex_cat,
-)
-from espnet2.enh.separator.abs_separator import AbsSeparator
 from torch_complex.tensor import ComplexTensor

+from espnet2.enh.layers.complexnn import complex_cat
+from espnet2.enh.layers.complexnn import ComplexBatchNorm
+from espnet2.enh.layers.complexnn import ComplexConv2d
+from espnet2.enh.layers.complexnn import ComplexConvTranspose2d
+from espnet2.enh.layers.complexnn import NavieComplexLSTM
+from espnet2.enh.separator.abs_separator import AbsSeparator
+
 is_torch_1_9_plus = LooseVersion(torch.__version__) >= LooseVersion("1.9.0")
 EPS = torch.finfo(torch.double).eps
@@ -35,7 +36,7 @@ def __init__(
         use_builtin_complex: bool = True,
         use_noise_mask: bool = False,
     ):
-        """DCCRN separator
+        """DCCRN separator.

         Args:
             input_dim (int): input dimension.
diff --git a/espnet2/tasks/enh.py b/espnet2/tasks/enh.py
index 30678034d55..56eec0199dc 100644
--- a/espnet2/tasks/enh.py
+++ b/espnet2/tasks/enh.py
@@ -32,8 +32,8 @@
 from espnet2.enh.separator.abs_separator import AbsSeparator
 from espnet2.enh.separator.asteroid_models import AsteroidModel_Converter
 from espnet2.enh.separator.conformer_separator import ConformerSeparator
-from espnet2.enh.separator.dprnn_separator import DPRNNSeparator
 from espnet2.enh.separator.dccrn_separator import DCCRNSeparator
+from espnet2.enh.separator.dprnn_separator import DPRNNSeparator
 from espnet2.enh.separator.neural_beamformer import NeuralBeamformer
 from espnet2.enh.separator.rnn_separator import RNNSeparator
 from espnet2.enh.separator.tcn_separator import TCNSeparator
diff --git a/test/espnet2/enh/test_espnet_model.py b/test/espnet2/enh/test_espnet_model.py
index ada6fe55297..cb2a864900f 100644
--- a/test/espnet2/enh/test_espnet_model.py
+++ b/test/espnet2/enh/test_espnet_model.py
@@ -13,8 +13,8 @@
 from espnet2.enh.loss.criterions.time_domain import SISNRLoss
 from espnet2.enh.loss.wrappers.fixed_order import FixedOrderSolver
 from espnet2.enh.loss.wrappers.pit_solver import PITSolver
-from espnet2.enh.separator.dprnn_separator import DPRNNSeparator
 from espnet2.enh.separator.dccrn_separator import DCCRNSeparator
+from espnet2.enh.separator.dprnn_separator import DPRNNSeparator
 from espnet2.enh.separator.neural_beamformer import NeuralBeamformer
 from espnet2.enh.separator.rnn_separator import RNNSeparator
 from espnet2.enh.separator.tcn_separator import TCNSeparator
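After this patch, DCCRNSeparator is registered in espnet2.tasks.enh alongside the other separators. For orientation, a rough instantiation sketch follows; it is standalone and not part of the patch, it assumes the remaining constructor arguments keep their defaults (including the hedged assumption that a num_spk argument exists, as for the other separators), that the class follows the usual ESPnet2 separator calling convention (spectrum, ilens) -> (separated, ilens, others), and input_dim=257 is a made-up value matching a hypothetical 512-point STFT:

    import torch
    from torch_complex.tensor import ComplexTensor

    from espnet2.enh.separator.dccrn_separator import DCCRNSeparator

    # 257 frequency bins = n_fft // 2 + 1 for an assumed 512-point STFT.
    separator = DCCRNSeparator(input_dim=257, num_spk=1)

    # Complex input spectrum [batch, frames, bins] and per-utterance frame counts.
    spectrum = ComplexTensor(torch.randn(2, 100, 257), torch.randn(2, 100, 257))
    ilens = torch.tensor([100, 98])

    # Assumed AbsSeparator convention: a list of per-speaker spectra,
    # the (possibly unchanged) lengths, and a dict of auxiliary outputs.
    separated, ilens, others = separator(spectrum, ilens)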
From 3b53aedc654fd30a828689c2139a1e130adac077 Mon Sep 17 00:00:00 2001
From: YushiUeda
Date: Fri, 25 Feb 2022 00:13:16 -0500
Subject: [PATCH 08/34] initial commit

---
 egs2/swbd_sentiment/asr1/asr.sh              |   1 +
 egs2/swbd_sentiment/asr1/cmd.sh              | 110 ++++++++++++++++
 egs2/swbd_sentiment/asr1/conf/fbank.conf     |   2 +
 egs2/swbd_sentiment/asr1/conf/pbs.conf       |  11 ++
 egs2/swbd_sentiment/asr1/conf/pitch.conf     |   1 +
 egs2/swbd_sentiment/asr1/conf/queue.conf     |  12 ++
 egs2/swbd_sentiment/asr1/conf/slurm.conf     |  14 ++
 egs2/swbd_sentiment/asr1/db.sh               |   1 +
 .../asr1/local/MSU_single_letter.txt         |   1 +
 egs2/swbd_sentiment/asr1/local/data.sh       |  83 ++++++++++++
 egs2/swbd_sentiment/asr1/local/dict.patch    |   1 +
 .../asr1/local/extend_segments.pl            |   1 +
 .../asr1/local/format_acronyms_dict.py       |   1 +
 .../asr1/local/map_acronyms_transcripts.py   |   1 +
 egs2/swbd_sentiment/asr1/local/path.sh       |   0
 .../asr1/local/prepare_sentiment.py          | 122 ++++++++++++++++++
 egs2/swbd_sentiment/asr1/local/score.sh      | 116 +++++++++++++++++
 .../asr1/local/swbd1_data_download.sh        |   1 +
 .../asr1/local/swbd1_data_prep.sh            |   1 +
 .../asr1/local/swbd1_fix_speakerid.pl        |   1 +
 .../asr1/local/swbd1_map_words.pl            |   1 +
 .../asr1/local/swbd1_prepare_dict.sh         |   1 +
 egs2/swbd_sentiment/asr1/path.sh             |   1 +
 egs2/swbd_sentiment/asr1/pyscripts           |   1 +
 egs2/swbd_sentiment/asr1/scripts             |   1 +
 egs2/swbd_sentiment/asr1/steps               |   1 +
 egs2/swbd_sentiment/asr1/utils               |   1 +
 27 files changed, 488 insertions(+)
 create mode 120000 egs2/swbd_sentiment/asr1/asr.sh
 create mode 100644 egs2/swbd_sentiment/asr1/cmd.sh
 create mode 100644 egs2/swbd_sentiment/asr1/conf/fbank.conf
 create mode 100644 egs2/swbd_sentiment/asr1/conf/pbs.conf
 create mode 100644 egs2/swbd_sentiment/asr1/conf/pitch.conf
 create mode 100644 egs2/swbd_sentiment/asr1/conf/queue.conf
 create mode 100644 egs2/swbd_sentiment/asr1/conf/slurm.conf
 create mode 120000 egs2/swbd_sentiment/asr1/db.sh
 create mode 120000 egs2/swbd_sentiment/asr1/local/MSU_single_letter.txt
 create mode 100755 egs2/swbd_sentiment/asr1/local/data.sh
 create mode 120000 egs2/swbd_sentiment/asr1/local/dict.patch
 create mode 120000 egs2/swbd_sentiment/asr1/local/extend_segments.pl
 create mode 120000 egs2/swbd_sentiment/asr1/local/format_acronyms_dict.py
 create mode 120000 egs2/swbd_sentiment/asr1/local/map_acronyms_transcripts.py
 create mode 100644 egs2/swbd_sentiment/asr1/local/path.sh
 create mode 100755 egs2/swbd_sentiment/asr1/local/prepare_sentiment.py
 create mode 100755 egs2/swbd_sentiment/asr1/local/score.sh
 create mode 120000 egs2/swbd_sentiment/asr1/local/swbd1_data_download.sh
 create mode 120000 egs2/swbd_sentiment/asr1/local/swbd1_data_prep.sh
 create mode 120000 egs2/swbd_sentiment/asr1/local/swbd1_fix_speakerid.pl
 create mode 120000 egs2/swbd_sentiment/asr1/local/swbd1_map_words.pl
 create mode 120000 egs2/swbd_sentiment/asr1/local/swbd1_prepare_dict.sh
 create mode 120000 egs2/swbd_sentiment/asr1/path.sh
 create mode 120000 egs2/swbd_sentiment/asr1/pyscripts
 create mode 120000 egs2/swbd_sentiment/asr1/scripts
 create mode 120000 egs2/swbd_sentiment/asr1/steps
 create mode 120000 egs2/swbd_sentiment/asr1/utils

diff --git a/egs2/swbd_sentiment/asr1/asr.sh b/egs2/swbd_sentiment/asr1/asr.sh
new file mode 120000
index 00000000000..60b05122cfd
--- /dev/null
+++ b/egs2/swbd_sentiment/asr1/asr.sh
@@ -0,0 +1 @@
+../../TEMPLATE/asr1/asr.sh
\ No newline at end of file
diff --git a/egs2/swbd_sentiment/asr1/cmd.sh b/egs2/swbd_sentiment/asr1/cmd.sh
new file mode 100644
index 00000000000..2aae6919fef
--- /dev/null
+++ b/egs2/swbd_sentiment/asr1/cmd.sh
@@ -0,0 +1,110 @@
+# ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
+# Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
+# e.g.
+#     run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
+#
+# Options:
+#     --time