Skip to content

Compute deltas #268

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 22 commits into from
Sep 19, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions test/test_compliance_kaldi.py
Original file line number Diff line number Diff line change
Expand Up @@ -319,5 +319,6 @@ def test_resample_waveform_multi_channel(self):
single_channel_sampled = kaldi.resample_waveform(single_channel, sample_rate, sample_rate // 2)
self.assertTrue(torch.allclose(multi_sound_sampled[i, :], single_channel_sampled, rtol=1e-4))


if __name__ == '__main__':
unittest.main()
26 changes: 26 additions & 0 deletions test/test_functional.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,32 @@
class TestFunctional(unittest.TestCase):
data_sizes = [(2, 20), (3, 15), (4, 10)]
number_of_trials = 100
specgram = torch.tensor([1., 2., 3., 4.])

def _test_compute_deltas(self, specgram, expected, win_length=3, atol=1e-6, rtol=1e-8):
    """Run ``F.compute_deltas`` on ``specgram`` and compare it with ``expected``.

    The shapes must be equal and the values must agree within ``atol`` / ``rtol``.
    """
    deltas = F.compute_deltas(specgram, win_length=win_length)

    # Check the shape first; the shape pair is the failure message.
    shape_pair = (deltas.shape, expected.shape)
    self.assertTrue(shape_pair[0] == shape_pair[1], shape_pair)

    torch.testing.assert_allclose(deltas, expected, atol=atol, rtol=rtol)

def test_compute_deltas_onechannel(self):
    """Deltas of the shared 1-D ramp, viewed as a single (1, 1, time) input."""
    # Add two leading singleton dimensions: (time,) -> (1, 1, time).
    single = self.specgram[None, None, :]
    target = torch.tensor([0.5, 1.0, 1.0, 0.5]).reshape(1, 1, 4)
    self._test_compute_deltas(single, target)

def test_compute_deltas_twochannel(self):
    """Deltas of the shared ramp tiled into a (1, 2, time) input."""
    tiled = self.specgram.repeat(1, 2, 1)
    # Both rows hold the same ramp, so both rows expect the same deltas.
    row = [0.5, 1.0, 1.0, 0.5]
    target = torch.tensor([[row, row]])
    self._test_compute_deltas(tiled, target)

def test_compute_deltas_randn(self):
    """``F.compute_deltas`` keeps the input shape on a random (13, 39, 1021) tensor."""
    channel, time = 13, 1021
    n_mfcc = 3 * channel
    half_window = 7

    features = torch.randn(channel, n_mfcc, time)
    deltas = F.compute_deltas(features, win_length=2 * half_window + 1)

    shape_pair = (deltas.shape, features.shape)
    self.assertTrue(shape_pair[0] == shape_pair[1], shape_pair)

def _compare_estimate(self, sound, estimate, atol=1e-6, rtol=1e-8):
# trim sound for case when constructed signal is shorter than original
Expand Down
35 changes: 34 additions & 1 deletion test/test_transforms.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,9 @@

import torch
import torchaudio
from torchaudio.common_utils import IMPORT_LIBROSA, IMPORT_SCIPY
import torchaudio.transforms as transforms
import torchaudio.functional as F
from torchaudio.common_utils import IMPORT_LIBROSA, IMPORT_SCIPY
import unittest
import common_utils

Expand Down Expand Up @@ -281,5 +282,37 @@ def test_resample_size(self):
# we expect the downsampled signal to have half as many samples
self.assertTrue(down_sampled.size(-1) == waveform.size(-1) // 2)

def test_compute_deltas(self):
    """``transforms.ComputeDeltas`` keeps the input shape on a random tensor."""
    channel, time = 13, 1021
    n_mfcc = 3 * channel
    half_window = 7

    features = torch.randn(channel, n_mfcc, time)
    delta_transform = transforms.ComputeDeltas(win_length=2 * half_window + 1)
    deltas = delta_transform(features)

    shape_pair = (deltas.shape, features.shape)
    self.assertTrue(shape_pair[0] == shape_pair[1], shape_pair)

def test_compute_deltas_transform_same_as_functional(self, atol=1e-6, rtol=1e-8):
    """The ``ComputeDeltas`` module and ``F.compute_deltas`` agree on random input."""
    channel, time = 13, 1021
    n_mfcc = 3 * channel
    win_length = 2 * 7 + 1
    features = torch.randn(channel, n_mfcc, time)

    # Same input and window length through both entry points.
    via_transform = transforms.ComputeDeltas(win_length=win_length)(features)
    via_functional = F.compute_deltas(features, win_length=win_length)

    torch.testing.assert_allclose(via_functional, via_transform, atol=atol, rtol=rtol)

def test_compute_deltas_twochannel(self):
    """``ComputeDeltas`` reproduces hand-computed deltas on a (1, 2, time) ramp.

    The expected values are the deltas of ``[1, 2, 3, 4]`` for a 3-frame
    window with replicate padding, so the transform is built with
    ``win_length=3`` rather than the default of 5.
    """
    specgram = torch.tensor([1., 2., 3., 4.]).repeat(1, 2, 1)
    expected = torch.tensor([[[0.5, 1.0, 1.0, 0.5],
                              [0.5, 1.0, 1.0, 0.5]]])
    transform = transforms.ComputeDeltas(win_length=3)
    computed = transform(specgram)
    self.assertTrue(computed.shape == specgram.shape, (computed.shape, specgram.shape))
    # Compare the values too; a shape check alone would pass for any output.
    torch.testing.assert_allclose(computed, expected, atol=1e-6, rtol=1e-8)


if __name__ == '__main__':
unittest.main()
48 changes: 48 additions & 0 deletions torchaudio/functional.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
"biquad",
]


# TODO: remove this once https://github.com/pytorch/pytorch/issues/21478 gets solved
@torch.jit.ignore
def _stft(
Expand Down Expand Up @@ -652,3 +653,50 @@ def lowpass_biquad(waveform, sample_rate, cutoff_freq, Q=0.707):
a1 = -2 * math.cos(w0)
a2 = 1 - alpha
return biquad(waveform, b0, b1, b2, a0, a1, a2)


def compute_deltas(specgram, win_length=5, mode="replicate"):
    # type: (Tensor, int, str) -> Tensor
    r"""Compute delta coefficients of a tensor, usually a spectrogram:

    .. math::
        d_t = \frac{\sum_{n=1}^{\text{N}} n (c_{t+n} - c_{t-n})}{2 \sum_{n=1}^{\text{N}} n^2}

    where :math:`d_t` is the deltas at time :math:`t`,
    :math:`c_t` is the spectrogram coefficients at time :math:`t`,
    :math:`N` is (`win_length`-1)//2.

    Every row along the last (time) dimension is processed independently of
    all other rows.

    Args:
        specgram (torch.Tensor): Tensor of audio of dimension (..., time),
            typically (channel, n_mfcc, time)
        win_length (int): The window length used for computing delta; must be
            at least 3
        mode (str): Mode parameter passed to padding

    Returns:
        deltas (torch.Tensor): Tensor of the same dimension as ``specgram``

    Raises:
        AssertionError: If ``win_length`` is smaller than 3 or ``specgram``
            has no time dimension.

    Example
        >>> specgram = torch.randn(1, 40, 1000)
        >>> delta = compute_deltas(specgram)
        >>> delta2 = compute_deltas(delta)
    """

    assert win_length >= 3, "win_length must be at least 3"
    assert specgram.dim() >= 1, "specgram must have a trailing time dimension"

    device = specgram.device
    dtype = specgram.dtype

    # Fold all leading dimensions into the channel axis of a single-item
    # batch so that each row can be filtered on its own below.
    shape = specgram.size()
    packed = specgram.reshape(1, -1, shape[-1])
    num_rows = packed.size(1)

    n = (win_length - 1) // 2

    # twice sum of integer squared
    denom = n * (n + 1) * (2 * n + 1) / 3

    # Pad the time axis so the output has as many frames as the input.
    packed = torch.nn.functional.pad(packed, (n, n), mode=mode)

    # One identical [-n, ..., n] filter per row. Using groups == num_rows makes
    # the convolution depthwise, so no row is summed with any other row.
    kernel = (
        torch
        .arange(-n, n + 1, 1, device=device, dtype=dtype)
        .repeat(num_rows, 1, 1)
    )

    deltas = torch.nn.functional.conv1d(packed, kernel, groups=num_rows) / denom

    # Restore the caller's original layout.
    return deltas.reshape(shape)
27 changes: 27 additions & 0 deletions torchaudio/transforms.py
Original file line number Diff line number Diff line change
Expand Up @@ -365,3 +365,30 @@ def forward(self, waveform):
return kaldi.resample_waveform(waveform, self.orig_freq, self.new_freq)

raise ValueError('Invalid resampling method: %s' % (self.resampling_method))


class ComputeDeltas(torch.jit.ScriptModule):
Copy link

@zhangguanheng66 zhangguanheng66 Sep 9, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if this depends on the jit mode, it's better to add a jit test to make sure that it works properly.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you have an example of what you have in mind?

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You just need to add a test case to ensure every operator is traceable.

Copy link
Contributor Author

@vincentqb vincentqb Sep 11, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There was already a test verifying that the shape of the output is correct. I added a test to verify that the output matches the functional, see here. Thanks for pointing out this link, but could you clarify what you would like to verify? If the module were not to compile, the test would not run at all.

r"""Compute delta coefficients of a tensor, usually a spectrogram.

See `torchaudio.functional.compute_deltas` for more details.

Args:
win_length (int): The window length used for computing delta.
"""
__constants__ = ['win_length']

def __init__(self, win_length=5, mode="replicate"):
    """Store the configuration later used by ``forward``.

    Args:
        win_length (int): The window length used for computing delta.
        mode (str): Mode parameter passed to padding.
    """
    super(ComputeDeltas, self).__init__()
    # ``mode`` is wrapped as a typed (str) TorchScript attribute.
    self.mode = torch.jit.Attribute(mode, str)
    self.win_length = win_length

@torch.jit.script_method
def forward(self, specgram):
    r"""Compute the deltas of ``specgram`` with the stored window length and padding mode.

    Args:
        specgram (torch.Tensor): Tensor of audio of dimension (channel, n_mfcc, time)

    Returns:
        deltas (torch.Tensor): Tensor of audio of dimension (channel, n_mfcc, time)
    """
    # Delegate to the functional implementation.
    deltas = F.compute_deltas(specgram, win_length=self.win_length, mode=self.mode)
    return deltas