Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

update audio api examples #46938

Merged
merged 14 commits into from
Oct 18, 2022
1 change: 1 addition & 0 deletions python/paddle/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@
import paddle.reader # noqa: F401
import paddle.static # noqa: F401
import paddle.vision # noqa: F401
import paddle.audio # noqa: F401
import paddle.geometric # noqa: F401

from .tensor.attribute import is_complex # noqa: F401
Expand Down
15 changes: 11 additions & 4 deletions python/paddle/audio/features/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,14 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .layers import LogMelSpectrogram
from .layers import MelSpectrogram
from .layers import MFCC
from .layers import Spectrogram
from .layers import LogMelSpectrogram # noqa: F401
from .layers import MelSpectrogram # noqa: F401
from .layers import MFCC # noqa: F401
from .layers import Spectrogram # noqa: F401

__all__ = [ # noqa
'LogMelSpectrogram',
'MelSpectrogram',
'MFCC',
'Spectrogram',
]
85 changes: 78 additions & 7 deletions python/paddle/audio/features/layers.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,6 @@
from ..functional import power_to_db
from ..functional.window import get_window

__all__ = [
'Spectrogram',
'MelSpectrogram',
'LogMelSpectrogram',
'MFCC',
]


class Spectrogram(nn.Layer):
"""Compute spectrogram of given signals, typically audio waveforms.
Expand All @@ -45,6 +38,27 @@ class Spectrogram(nn.Layer):
center (bool, optional): Whether to pad `x` so that sample :math:`t \times hop\\_length` is at the center of the `t`-th frame. Defaults to True.
pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'.
dtype (str, optional): Data type of input and window. Defaults to 'float32'.
Returns:
:ref:`api_paddle_nn_Layer`. An instance of Spectrogram.
Examples:
.. code-block:: python
import paddle
from paddle.audio.features import Spectrogram
sample_rate = 16000
wav_duration = 0.5
num_channels = 1
num_frames = int(sample_rate * wav_duration)
wav_data = paddle.linspace(-1.0, 1.0, num_frames) * 0.1
waveform = wav_data.tile([num_channels, 1])
feature_extractor = Spectrogram(n_fft=512, window = 'hann', power = 1.0)
feats = feature_extractor(waveform)
"""

def __init__(self,
Expand Down Expand Up @@ -108,6 +122,25 @@ class MelSpectrogram(nn.Layer):
htk (bool, optional): Use HTK formula in computing fbank matrix. Defaults to False.
norm (Union[str, float], optional): Type of normalization in computing fbank matrix. Slaney-style is used by default. You can specify norm=1.0/2.0 to use customized p-norm normalization. Defaults to 'slaney'.
dtype (str, optional): Data type of input and window. Defaults to 'float32'.
Returns:
:ref:`api_paddle_nn_Layer`. An instance of MelSpectrogram.
Examples:
.. code-block:: python
import paddle
from paddle.audio.features import MelSpectrogram
sample_rate = 16000
wav_duration = 0.5
num_channels = 1
num_frames = int(sample_rate * wav_duration)
wav_data = paddle.linspace(-1.0, 1.0, num_frames) * 0.1
waveform = wav_data.tile([num_channels, 1])
feature_extractor = MelSpectrogram(sr=sample_rate, n_fft=512, window = 'hann', power = 1.0)
feats = feature_extractor(waveform)
"""

def __init__(self,
Expand Down Expand Up @@ -186,6 +219,25 @@ class LogMelSpectrogram(nn.Layer):
amin (float, optional): The minimum value of input magnitude. Defaults to 1e-10.
top_db (Optional[float], optional): The maximum db value of spectrogram. Defaults to None.
dtype (str, optional): Data type of input and window. Defaults to 'float32'.
Returns:
:ref:`api_paddle_nn_Layer`. An instance of LogMelSpectrogram.
Examples:
.. code-block:: python
import paddle
from paddle.audio.features import LogMelSpectrogram
sample_rate = 16000
wav_duration = 0.5
num_channels = 1
num_frames = int(sample_rate * wav_duration)
wav_data = paddle.linspace(-1.0, 1.0, num_frames) * 0.1
waveform = wav_data.tile([num_channels, 1])
feature_extractor = LogMelSpectrogram(sr=sample_rate, n_fft=512, window = 'hann', power = 1.0)
feats = feature_extractor(waveform)
"""

def __init__(self,
Expand Down Expand Up @@ -265,6 +317,25 @@ class MFCC(nn.Layer):
amin (float, optional): The minimum value of input magnitude. Defaults to 1e-10.
top_db (Optional[float], optional): The maximum db value of spectrogram. Defaults to None.
dtype (str, optional): Data type of input and window. Defaults to 'float32'.
Returns:
:ref:`api_paddle_nn_Layer`. An instance of MFCC.
Examples:
.. code-block:: python
import paddle
from paddle.audio.features import MFCC
sample_rate = 16000
wav_duration = 0.5
num_channels = 1
num_frames = int(sample_rate * wav_duration)
wav_data = paddle.linspace(-1.0, 1.0, num_frames) * 0.1
waveform = wav_data.tile([num_channels, 1])
feature_extractor = MFCC(sr=sample_rate, n_fft=512, window = 'hann')
feats = feature_extractor(waveform)
"""

def __init__(self,
Expand Down
27 changes: 19 additions & 8 deletions python/paddle/audio/functional/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,22 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .functional import compute_fbank_matrix
from .functional import create_dct
from .functional import fft_frequencies
from .functional import hz_to_mel
from .functional import mel_frequencies
from .functional import mel_to_hz
from .functional import power_to_db
from .window import get_window
from .functional import compute_fbank_matrix # noqa: F401
from .functional import create_dct # noqa: F401
from .functional import fft_frequencies # noqa: F401
from .functional import hz_to_mel # noqa: F401
from .functional import mel_frequencies # noqa: F401
from .functional import mel_to_hz # noqa: F401
from .functional import power_to_db # noqa: F401
from .window import get_window # noqa: F401

__all__ = [ # noqa
'compute_fbank_matrix',
'create_dct',
'fft_frequencies',
'hz_to_mel',
'mel_frequencies',
'mel_to_hz',
'power_to_db',
'get_window',
]
79 changes: 69 additions & 10 deletions python/paddle/audio/functional/functional.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,16 +19,6 @@
import paddle
from paddle import Tensor

__all__ = [
'hz_to_mel',
'mel_to_hz',
'mel_frequencies',
'fft_frequencies',
'compute_fbank_matrix',
'power_to_db',
'create_dct',
]


def hz_to_mel(freq: Union[Tensor, float],
htk: bool = False) -> Union[Tensor, float]:
Expand All @@ -40,6 +30,16 @@ def hz_to_mel(freq: Union[Tensor, float],

Returns:
Union[Tensor, float]: Frequency in mels.

Examples:
.. code-block:: python

import paddle

val = 3.0
htk_flag = True
mel_paddle_tensor = paddle.audio.functional.hz_to_mel(
paddle.to_tensor(val), htk_flag)
"""

if htk:
Expand Down Expand Up @@ -83,6 +83,17 @@ def mel_to_hz(mel: Union[float, Tensor],

Returns:
Union[float, Tensor]: Frequencies in Hz.

Examples:
.. code-block:: python

import paddle

val = 3.0
htk_flag = True
hz_paddle_tensor = paddle.audio.functional.mel_to_hz(
paddle.to_tensor(val), htk_flag)

"""
if htk:
return 700.0 * (10.0**(mel / 2595.0) - 1.0)
Expand Down Expand Up @@ -121,6 +132,19 @@ def mel_frequencies(n_mels: int = 64,

Returns:
Tensor: Tensor of n_mels frequencies in Hz with shape `(n_mels,)`.

Examples:
.. code-block:: python

import paddle

n_mels = 64
f_min = 0.5
f_max = 10000
htk_flag = True

paddle_mel_freq = paddle.audio.functional.mel_frequencies(
n_mels, f_min, f_max, htk_flag, 'float64')
"""
# 'Center freqs' of mel bands - uniformly spaced between limits
min_mel = hz_to_mel(f_min, htk=htk)
Expand All @@ -140,6 +164,15 @@ def fft_frequencies(sr: int, n_fft: int, dtype: str = 'float32') -> Tensor:

Returns:
Tensor: FFT frequencies in Hz with shape `(n_fft//2 + 1,)`.

Examples:
.. code-block:: python

import paddle

sr = 16000
n_fft = 128
fft_freq = paddle.audio.functional.fft_frequencies(sr, n_fft)
"""
return paddle.linspace(0, float(sr) / 2, int(1 + n_fft // 2), dtype=dtype)

Expand All @@ -166,6 +199,15 @@ def compute_fbank_matrix(sr: int,

Returns:
Tensor: Mel transform matrix with shape `(n_mels, n_fft//2 + 1)`.

Examples:
.. code-block:: python

import paddle

sr = 16000
n_fft = 512
fbank = paddle.audio.functional.compute_fbank_matrix(sr=sr, n_fft=n_fft)
"""

if f_max is None:
Expand Down Expand Up @@ -221,6 +263,15 @@ def power_to_db(spect: Tensor,

Returns:
Tensor: Power spectrogram in db scale.

Examples:
.. code-block:: python

import paddle

val = 3.0
decibel_paddle = paddle.audio.functional.power_to_db(
paddle.to_tensor(val))
"""
if amin <= 0:
raise Exception("amin must be strictly positive")
Expand Down Expand Up @@ -254,6 +305,14 @@ def create_dct(n_mfcc: int,

Returns:
Tensor: The DCT matrix with shape `(n_mels, n_mfcc)`.

Examples:
.. code-block:: python

import paddle
n_mfcc = 23
n_mels = 257
dct = paddle.audio.functional.create_dct(n_mfcc, n_mels)
"""
n = paddle.arange(n_mels, dtype=dtype)
k = paddle.arange(n_mfcc, dtype=dtype).unsqueeze(1)
Expand Down
15 changes: 11 additions & 4 deletions python/paddle/audio/functional/window.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,6 @@
import paddle
from paddle import Tensor

__all__ = [
'get_window',
]


def _cat(x: List[Tensor], data_type: str) -> Tensor:
l = [paddle.to_tensor(_, data_type) for _ in x]
Expand Down Expand Up @@ -323,6 +319,17 @@ def get_window(window: Union[str, Tuple[str, float]],
Returns:
Tensor: The window represented as a tensor.
Examples:
.. code-block:: python
import paddle
n_fft = 512
cosine_window = paddle.audio.functional.get_window('cosine', n_fft)
std = 7
gaussian_window = paddle.audio.functional.get_window(('gaussian', std), n_fft)
"""
sym = not fftbins

Expand Down
2 changes: 1 addition & 1 deletion python/paddle/tests/test_audio_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ def test_window(self, window_type: str, n_fft: int):
decimal=5)

@parameterize([1, 512])
def test_gussian_window_and_exception(self, n_fft: int):
def test_gaussian_window_and_exception(self, n_fft: int):
window_scipy_gaussain = signal.windows.gaussian(n_fft, std=7)
window_paddle_gaussian = paddle.audio.functional.get_window(
('gaussian', 7), n_fft, False)
Expand Down