Skip to content

Commit

Permalink
update audio api examples (#46938) (#47126)
Browse files Browse the repository at this point in the history
* update audio api examples

* fix format

* format

* fix

* test api

* fix format

* fix static check error

* fix doc error

* fix ci

* fix api error

* update api.spec

* fix ci

* fix typo in window gaussian
  • Loading branch information
SmileGoat authored Oct 19, 2022
1 parent fcb9c0b commit f08c104
Show file tree
Hide file tree
Showing 7 changed files with 190 additions and 34 deletions.
1 change: 1 addition & 0 deletions python/paddle/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@
import paddle.reader # noqa: F401
import paddle.static # noqa: F401
import paddle.vision # noqa: F401
import paddle.audio # noqa: F401
import paddle.geometric # noqa: F401

from .tensor.attribute import is_complex # noqa: F401
Expand Down
15 changes: 11 additions & 4 deletions python/paddle/audio/features/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,14 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .layers import LogMelSpectrogram
from .layers import MelSpectrogram
from .layers import MFCC
from .layers import Spectrogram
from .layers import LogMelSpectrogram # noqa: F401
from .layers import MelSpectrogram # noqa: F401
from .layers import MFCC # noqa: F401
from .layers import Spectrogram # noqa: F401

__all__ = [ # noqa
'LogMelSpectrogram',
'MelSpectrogram',
'MFCC',
'Spectrogram',
]
85 changes: 78 additions & 7 deletions python/paddle/audio/features/layers.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,6 @@
from ..functional import power_to_db
from ..functional.window import get_window

__all__ = [
'Spectrogram',
'MelSpectrogram',
'LogMelSpectrogram',
'MFCC',
]


class Spectrogram(nn.Layer):
"""Compute spectrogram of given signals, typically audio waveforms.
Expand All @@ -45,6 +38,27 @@ class Spectrogram(nn.Layer):
center (bool, optional): Whether to pad `x` so that the :math:`t \times hop\\_length`-th sample is at the center of the `t`-th frame. Defaults to True.
pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'.
dtype (str, optional): Data type of input and window. Defaults to 'float32'.
Returns:
:ref:`api_paddle_nn_Layer`. An instance of Spectrogram.
Examples:
.. code-block:: python
import paddle
from paddle.audio.features import Spectrogram
sample_rate = 16000
wav_duration = 0.5
num_channels = 1
num_frames = int(sample_rate * wav_duration)
wav_data = paddle.linspace(-1.0, 1.0, num_frames) * 0.1
waveform = wav_data.tile([num_channels, 1])
feature_extractor = Spectrogram(n_fft=512, window='hann', power=1.0)
feats = feature_extractor(waveform)
"""

def __init__(self,
Expand Down Expand Up @@ -108,6 +122,25 @@ class MelSpectrogram(nn.Layer):
htk (bool, optional): Use HTK formula in computing fbank matrix. Defaults to False.
norm (Union[str, float], optional): Type of normalization in computing fbank matrix. Slaney-style is used by default. You can specify norm=1.0/2.0 to use customized p-norm normalization. Defaults to 'slaney'.
dtype (str, optional): Data type of input and window. Defaults to 'float32'.
Returns:
:ref:`api_paddle_nn_Layer`. An instance of MelSpectrogram.
Examples:
.. code-block:: python
import paddle
from paddle.audio.features import MelSpectrogram
sample_rate = 16000
wav_duration = 0.5
num_channels = 1
num_frames = int(sample_rate * wav_duration)
wav_data = paddle.linspace(-1.0, 1.0, num_frames) * 0.1
waveform = wav_data.tile([num_channels, 1])
feature_extractor = MelSpectrogram(sr=sample_rate, n_fft=512, window='hann', power=1.0)
feats = feature_extractor(waveform)
"""

def __init__(self,
Expand Down Expand Up @@ -186,6 +219,25 @@ class LogMelSpectrogram(nn.Layer):
amin (float, optional): The minimum value of input magnitude. Defaults to 1e-10.
top_db (Optional[float], optional): The maximum db value of spectrogram. Defaults to None.
dtype (str, optional): Data type of input and window. Defaults to 'float32'.
Returns:
:ref:`api_paddle_nn_Layer`. An instance of LogMelSpectrogram.
Examples:
.. code-block:: python
import paddle
from paddle.audio.features import LogMelSpectrogram
sample_rate = 16000
wav_duration = 0.5
num_channels = 1
num_frames = int(sample_rate * wav_duration)
wav_data = paddle.linspace(-1.0, 1.0, num_frames) * 0.1
waveform = wav_data.tile([num_channels, 1])
feature_extractor = LogMelSpectrogram(sr=sample_rate, n_fft=512, window='hann', power=1.0)
feats = feature_extractor(waveform)
"""

def __init__(self,
Expand Down Expand Up @@ -265,6 +317,25 @@ class MFCC(nn.Layer):
amin (float, optional): The minimum value of input magnitude. Defaults to 1e-10.
top_db (Optional[float], optional): The maximum db value of spectrogram. Defaults to None.
dtype (str, optional): Data type of input and window. Defaults to 'float32'.
Returns:
:ref:`api_paddle_nn_Layer`. An instance of MFCC.
Examples:
.. code-block:: python
import paddle
from paddle.audio.features import MFCC
sample_rate = 16000
wav_duration = 0.5
num_channels = 1
num_frames = int(sample_rate * wav_duration)
wav_data = paddle.linspace(-1.0, 1.0, num_frames) * 0.1
waveform = wav_data.tile([num_channels, 1])
feature_extractor = MFCC(sr=sample_rate, n_fft=512, window='hann')
feats = feature_extractor(waveform)
"""

def __init__(self,
Expand Down
27 changes: 19 additions & 8 deletions python/paddle/audio/functional/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,22 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .functional import compute_fbank_matrix
from .functional import create_dct
from .functional import fft_frequencies
from .functional import hz_to_mel
from .functional import mel_frequencies
from .functional import mel_to_hz
from .functional import power_to_db
from .window import get_window
from .functional import compute_fbank_matrix # noqa: F401
from .functional import create_dct # noqa: F401
from .functional import fft_frequencies # noqa: F401
from .functional import hz_to_mel # noqa: F401
from .functional import mel_frequencies # noqa: F401
from .functional import mel_to_hz # noqa: F401
from .functional import power_to_db # noqa: F401
from .window import get_window # noqa: F401

__all__ = [ # noqa
'compute_fbank_matrix',
'create_dct',
'fft_frequencies',
'hz_to_mel',
'mel_frequencies',
'mel_to_hz',
'power_to_db',
'get_window',
]
79 changes: 69 additions & 10 deletions python/paddle/audio/functional/functional.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,16 +19,6 @@
import paddle
from paddle import Tensor

__all__ = [
'hz_to_mel',
'mel_to_hz',
'mel_frequencies',
'fft_frequencies',
'compute_fbank_matrix',
'power_to_db',
'create_dct',
]


def hz_to_mel(freq: Union[Tensor, float],
htk: bool = False) -> Union[Tensor, float]:
Expand All @@ -40,6 +30,16 @@ def hz_to_mel(freq: Union[Tensor, float],
Returns:
Union[Tensor, float]: Frequency in mels.
Examples:
.. code-block:: python
import paddle
val = 3.0
htk_flag = True
mel_paddle_tensor = paddle.audio.functional.hz_to_mel(
paddle.to_tensor(val), htk_flag)
"""

if htk:
Expand Down Expand Up @@ -83,6 +83,17 @@ def mel_to_hz(mel: Union[float, Tensor],
Returns:
Union[float, Tensor]: Frequencies in Hz.
Examples:
.. code-block:: python
import paddle
val = 3.0
htk_flag = True
hz_paddle_tensor = paddle.audio.functional.mel_to_hz(
paddle.to_tensor(val), htk_flag)
"""
if htk:
return 700.0 * (10.0**(mel / 2595.0) - 1.0)
Expand Down Expand Up @@ -121,6 +132,19 @@ def mel_frequencies(n_mels: int = 64,
Returns:
Tensor: Tensor of n_mels frequencies in Hz with shape `(n_mels,)`.
Examples:
.. code-block:: python
import paddle
n_mels = 64
f_min = 0.5
f_max = 10000
htk_flag = True
paddle_mel_freq = paddle.audio.functional.mel_frequencies(
n_mels, f_min, f_max, htk_flag, 'float64')
"""
# 'Center freqs' of mel bands - uniformly spaced between limits
min_mel = hz_to_mel(f_min, htk=htk)
Expand All @@ -140,6 +164,15 @@ def fft_frequencies(sr: int, n_fft: int, dtype: str = 'float32') -> Tensor:
Returns:
Tensor: FFT frequencies in Hz with shape `(n_fft//2 + 1,)`.
Examples:
.. code-block:: python
import paddle
sr = 16000
n_fft = 128
fft_freq = paddle.audio.functional.fft_frequencies(sr, n_fft)
"""
return paddle.linspace(0, float(sr) / 2, int(1 + n_fft // 2), dtype=dtype)

Expand All @@ -166,6 +199,15 @@ def compute_fbank_matrix(sr: int,
Returns:
Tensor: Mel transform matrix with shape `(n_mels, n_fft//2 + 1)`.
Examples:
.. code-block:: python
import paddle
sr = 16000
n_fft = 512
fbank = paddle.audio.functional.compute_fbank_matrix(sr=sr, n_fft=n_fft)
"""

if f_max is None:
Expand Down Expand Up @@ -221,6 +263,15 @@ def power_to_db(spect: Tensor,
Returns:
Tensor: Power spectrogram in db scale.
Examples:
.. code-block:: python
import paddle
val = 3.0
decibel_paddle = paddle.audio.functional.power_to_db(
paddle.to_tensor(val))
"""
if amin <= 0:
raise Exception("amin must be strictly positive")
Expand Down Expand Up @@ -254,6 +305,14 @@ def create_dct(n_mfcc: int,
Returns:
Tensor: The DCT matrix with shape `(n_mels, n_mfcc)`.
Examples:
.. code-block:: python
import paddle
n_mfcc = 23
n_mels = 257
dct = paddle.audio.functional.create_dct(n_mfcc, n_mels)
"""
n = paddle.arange(n_mels, dtype=dtype)
k = paddle.arange(n_mfcc, dtype=dtype).unsqueeze(1)
Expand Down
15 changes: 11 additions & 4 deletions python/paddle/audio/functional/window.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,6 @@
import paddle
from paddle import Tensor

__all__ = [
'get_window',
]


def _cat(x: List[Tensor], data_type: str) -> Tensor:
l = [paddle.to_tensor(_, data_type) for _ in x]
Expand Down Expand Up @@ -323,6 +319,17 @@ def get_window(window: Union[str, Tuple[str, float]],
Returns:
Tensor: The window represented as a tensor.
Examples:
.. code-block:: python
import paddle
n_fft = 512
cosine_window = paddle.audio.functional.get_window('cosine', n_fft)
std = 7
gaussian_window = paddle.audio.functional.get_window(('gaussian', std), n_fft)
"""
sym = not fftbins

Expand Down
2 changes: 1 addition & 1 deletion python/paddle/tests/test_audio_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ def test_window(self, window_type: str, n_fft: int):
decimal=5)

@parameterize([1, 512])
def test_gussian_window_and_exception(self, n_fft: int):
def test_gaussian_window_and_exception(self, n_fft: int):
window_scipy_gaussain = signal.windows.gaussian(n_fft, std=7)
window_paddle_gaussian = paddle.audio.functional.get_window(
('gaussian', 7), n_fft, False)
Expand Down

0 comments on commit f08c104

Please sign in to comment.