diff --git a/beginner_source/audio_data_augmentation_tutorial.py b/beginner_source/audio_data_augmentation_tutorial.py index f3ac7917da1..4c0ff996696 100644 --- a/beginner_source/audio_data_augmentation_tutorial.py +++ b/beginner_source/audio_data_augmentation_tutorial.py @@ -1,16 +1,15 @@ # -*- coding: utf-8 -*- """ Audio Data Augmentation -======================= +======================= ``torchaudio`` provides a variety of ways to augment audio data. - -In this tutorial, we look into a way to apply effects, filters, -RIR (room impulse response) and codecs. - -At the end, we synthesize noisy speech over phone from clean speech. """ +# When running this tutorial in Google Colab, install the required packages +# with the following. +# !pip install torchaudio + import torch import torchaudio import torchaudio.functional as F @@ -19,37 +18,167 @@ print(torchaudio.__version__) ###################################################################### -# Preparation -# ----------- -# -# First, we import the modules and download the audio assets we use in this tutorial. +# Preparing data and utility functions (skip this section) +# -------------------------------------------------------- # +#@title Prepare data and utility functions. {display-mode: "form"} +#@markdown +#@markdown You do not need to look into this cell. +#@markdown Just execute once and you are good to go. +#@markdown +#@markdown In this tutorial, we will use speech data from the [VOiCES dataset](https://iqtlabs.github.io/voices/), which is licensed under Creative Commons BY 4.0. + +#------------------------------------------------------------------------------- +# Preparation of data and helper functions. 
+#------------------------------------------------------------------------------- + import math +import os +import requests -from IPython.display import Audio import matplotlib.pyplot as plt +from IPython.display import Audio, display + + +_SAMPLE_DIR = "_sample_data" + +SAMPLE_WAV_URL = "https://pytorch-tutorial-assets.s3.amazonaws.com/steam-train-whistle-daniel_simon.wav" +SAMPLE_WAV_PATH = os.path.join(_SAMPLE_DIR, "steam.wav") + +SAMPLE_RIR_URL = "https://pytorch-tutorial-assets.s3.amazonaws.com/VOiCES_devkit/distant-16k/room-response/rm1/impulse/Lab41-SRI-VOiCES-rm1-impulse-mc01-stu-clo.wav" +SAMPLE_RIR_PATH = os.path.join(_SAMPLE_DIR, "rir.wav") + +SAMPLE_WAV_SPEECH_URL = "https://pytorch-tutorial-assets.s3.amazonaws.com/VOiCES_devkit/source-16k/train/sp0307/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav" +SAMPLE_WAV_SPEECH_PATH = os.path.join(_SAMPLE_DIR, "speech.wav") + +SAMPLE_NOISE_URL = "https://pytorch-tutorial-assets.s3.amazonaws.com/VOiCES_devkit/distant-16k/distractors/rm1/babb/Lab41-SRI-VOiCES-rm1-babb-mc01-stu-clo.wav" +SAMPLE_NOISE_PATH = os.path.join(_SAMPLE_DIR, "bg.wav") + +os.makedirs(_SAMPLE_DIR, exist_ok=True) + +def _fetch_data(): + uri = [ + (SAMPLE_WAV_URL, SAMPLE_WAV_PATH), + (SAMPLE_RIR_URL, SAMPLE_RIR_PATH), + (SAMPLE_WAV_SPEECH_URL, SAMPLE_WAV_SPEECH_PATH), + (SAMPLE_NOISE_URL, SAMPLE_NOISE_PATH), + ] + for url, path in uri: + with open(path, 'wb') as file_: + file_.write(requests.get(url).content) + +_fetch_data() + +def _get_sample(path, resample=None): + effects = [ + ["remix", "1"] + ] + if resample: + effects.extend([ + ["lowpass", f"{resample // 2}"], + ["rate", f'{resample}'], + ]) + return torchaudio.sox_effects.apply_effects_file(path, effects=effects) + +def get_sample(*, resample=None): + return _get_sample(SAMPLE_WAV_PATH, resample=resample) + +def get_speech_sample(*, resample=None): + return _get_sample(SAMPLE_WAV_SPEECH_PATH, resample=resample) + +def plot_waveform(waveform, sample_rate, title="Waveform", xlim=None, 
ylim=None): + waveform = waveform.numpy() + + num_channels, num_frames = waveform.shape + time_axis = torch.arange(0, num_frames) / sample_rate + + figure, axes = plt.subplots(num_channels, 1) + if num_channels == 1: + axes = [axes] + for c in range(num_channels): + axes[c].plot(time_axis, waveform[c], linewidth=1) + axes[c].grid(True) + if num_channels > 1: + axes[c].set_ylabel(f'Channel {c+1}') + if xlim: + axes[c].set_xlim(xlim) + if ylim: + axes[c].set_ylim(ylim) + figure.suptitle(title) + plt.show(block=False) + +def print_stats(waveform, sample_rate=None, src=None): + if src: + print("-" * 10) + print("Source:", src) + print("-" * 10) + if sample_rate: + print("Sample Rate:", sample_rate) + print("Shape:", tuple(waveform.shape)) + print("Dtype:", waveform.dtype) + print(f" - Max: {waveform.max().item():6.3f}") + print(f" - Min: {waveform.min().item():6.3f}") + print(f" - Mean: {waveform.mean().item():6.3f}") + print(f" - Std Dev: {waveform.std().item():6.3f}") + print() + print(waveform) + print() -from torchaudio.utils import download_asset - -SAMPLE_WAV = download_asset("tutorial-assets/steam-train-whistle-daniel_simon.wav") -SAMPLE_RIR = download_asset("tutorial-assets/Lab41-SRI-VOiCES-rm1-impulse-mc01-stu-clo-8000hz.wav") -SAMPLE_SPEECH = download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042-8000hz.wav") -SAMPLE_NOISE = download_asset("tutorial-assets/Lab41-SRI-VOiCES-rm1-babb-mc01-stu-clo-8000hz.wav") +def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None): + waveform = waveform.numpy() + + num_channels, num_frames = waveform.shape + time_axis = torch.arange(0, num_frames) / sample_rate + + figure, axes = plt.subplots(num_channels, 1) + if num_channels == 1: + axes = [axes] + for c in range(num_channels): + axes[c].specgram(waveform[c], Fs=sample_rate) + if num_channels > 1: + axes[c].set_ylabel(f'Channel {c+1}') + if xlim: + axes[c].set_xlim(xlim) + figure.suptitle(title) + plt.show(block=False) + +def 
play_audio(waveform, sample_rate): + waveform = waveform.numpy() + + num_channels, num_frames = waveform.shape + if num_channels == 1: + display(Audio(waveform[0], rate=sample_rate)) + elif num_channels == 2: + display(Audio((waveform[0], waveform[1]), rate=sample_rate)) + else: + raise ValueError("Waveform with more than 2 channels are not supported.") + +def get_rir_sample(*, resample=None, processed=False): + rir_raw, sample_rate = _get_sample(SAMPLE_RIR_PATH, resample=resample) + if not processed: + return rir_raw, sample_rate + rir = rir_raw[:, int(sample_rate*1.01):int(sample_rate*1.3)] + rir = rir / torch.norm(rir, p=2) + rir = torch.flip(rir, [1]) + return rir, sample_rate + +def get_noise_sample(*, resample=None): + return _get_sample(SAMPLE_NOISE_PATH, resample=resample) ###################################################################### # Applying effects and filtering # ------------------------------ # -# :py:func:`torchaudio.sox_effects` allows for directly applying filters similar to +# ``torchaudio.sox_effects`` allows for directly applying filters similar to # those available in ``sox`` to Tensor objects and file object audio sources. # # There are two functions for this: # -# - :py:func:`torchaudio.sox_effects.apply_effects_tensor` for applying effects +# - ``torchaudio.sox_effects.apply_effects_tensor`` for applying effects # to Tensor. -# - :py:func:`torchaudio.sox_effects.apply_effects_file` for applying effects to +# - ``torchaudio.sox_effects.apply_effects_file`` for applying effects to # other audio sources. # # Both functions accept effect definitions in the form @@ -62,107 +191,55 @@ # documentation `__. # # **Tip** If you need to load and resample your audio data on the fly, -# then you can use :py:func:`torchaudio.sox_effects.apply_effects_file` -# with effect ``"rate"``. +# then you can use ``torchaudio.sox_effects.apply_effects_file`` with +# effect ``"rate"``. 
# -# **Note** :py:func:`torchaudio.sox_effects.apply_effects_file` accepts a -# file-like object or path-like object. -# Similar to :py:func:`torchaudio.load`, when the audio format cannot be +# **Note** ``apply_effects_file`` accepts a file-like object or path-like +# object. Similar to ``torchaudio.load``, when the audio format cannot be # inferred from either the file extension or header, you can provide # argument ``format`` to specify the format of the audio source. # # **Note** This process is not differentiable. # + # Load the data -waveform1, sample_rate1 = torchaudio.load(SAMPLE_WAV) +waveform1, sample_rate1 = get_sample(resample=16000) # Define effects effects = [ - ["lowpass", "-1", "300"], # apply single-pole lowpass filter - ["speed", "0.8"], # reduce the speed - # This only changes sample rate, so it is necessary to - # add `rate` effect with original sample rate after this. - ["rate", f"{sample_rate1}"], - ["reverb", "-w"], # Reverbration gives some dramatic feeling + ["lowpass", "-1", "300"], # apply single-pole lowpass filter + ["speed", "0.8"], # reduce the speed + # This only changes sample rate, so it is necessary to + # add `rate` effect with original sample rate after this. 
+ ["rate", f"{sample_rate1}"], + ["reverb", "-w"], # Reverbration gives some dramatic feeling ] # Apply effects -waveform2, sample_rate2 = torchaudio.sox_effects.apply_effects_tensor(waveform1, sample_rate1, effects) +waveform2, sample_rate2 = torchaudio.sox_effects.apply_effects_tensor( + waveform1, sample_rate1, effects) -print(waveform1.shape, sample_rate1) -print(waveform2.shape, sample_rate2) +plot_waveform(waveform1, sample_rate1, title="Original", xlim=(-.1, 3.2)) +plot_waveform(waveform2, sample_rate2, title="Effects Applied", xlim=(-.1, 3.2)) +print_stats(waveform1, sample_rate=sample_rate1, src="Original") +print_stats(waveform2, sample_rate=sample_rate2, src="Effects Applied") ###################################################################### # Note that the number of frames and number of channels are different from # those of the original after the effects are applied. Let’s listen to the -# audio. +# audio. Doesn’t it sound more dramatic? # -def plot_waveform(waveform, sample_rate, title="Waveform", xlim=None): - waveform = waveform.numpy() - - num_channels, num_frames = waveform.shape - time_axis = torch.arange(0, num_frames) / sample_rate - - figure, axes = plt.subplots(num_channels, 1) - if num_channels == 1: - axes = [axes] - for c in range(num_channels): - axes[c].plot(time_axis, waveform[c], linewidth=1) - axes[c].grid(True) - if num_channels > 1: - axes[c].set_ylabel(f"Channel {c+1}") - if xlim: - axes[c].set_xlim(xlim) - figure.suptitle(title) - plt.show(block=False) - -###################################################################### -# - -def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None): - waveform = waveform.numpy() - - num_channels, _ = waveform.shape - - figure, axes = plt.subplots(num_channels, 1) - if num_channels == 1: - axes = [axes] - for c in range(num_channels): - axes[c].specgram(waveform[c], Fs=sample_rate) - if num_channels > 1: - axes[c].set_ylabel(f"Channel {c+1}") - if xlim: - 
axes[c].set_xlim(xlim) - figure.suptitle(title) - plt.show(block=False) - -###################################################################### -# Original: -# ~~~~~~~~~ -# - -plot_waveform(waveform1, sample_rate1, title="Original", xlim=(-0.1, 3.2)) plot_specgram(waveform1, sample_rate1, title="Original", xlim=(0, 3.04)) -Audio(waveform1, rate=sample_rate1) - -###################################################################### -# Effects applied: -# ~~~~~~~~~~~~~~~~ -# - -plot_waveform(waveform2, sample_rate2, title="Effects Applied", xlim=(-0.1, 3.2)) +play_audio(waveform1, sample_rate1) plot_specgram(waveform2, sample_rate2, title="Effects Applied", xlim=(0, 3.04)) -Audio(waveform2, rate=sample_rate2) +play_audio(waveform2, sample_rate2) -###################################################################### -# Doesn’t it sound more dramatic? -# ###################################################################### # Simulating room reverberation -# ----------------------------- +# ----------------------------- # # `Convolution # reverb <https://en.wikipedia.org/wiki/Convolution_reverb>`__ is a @@ -177,48 +254,44 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None): # and clap your hands. # -rir_raw, sample_rate = torchaudio.load(SAMPLE_RIR) -plot_waveform(rir_raw, sample_rate, title="Room Impulse Response (raw)") + +sample_rate = 8000 + +rir_raw, _ = get_rir_sample(resample=sample_rate) + +plot_waveform(rir_raw, sample_rate, title="Room Impulse Response (raw)", ylim=None) plot_specgram(rir_raw, sample_rate, title="Room Impulse Response (raw)") -Audio(rir_raw, rate=sample_rate) +play_audio(rir_raw, sample_rate) ###################################################################### # First, we need to clean up the RIR. We extract the main impulse, normalize # the signal power, then flip along the time axis. 
# -rir = rir_raw[:, int(sample_rate * 1.01) : int(sample_rate * 1.3)] +rir = rir_raw[:, int(sample_rate*1.01):int(sample_rate*1.3)] rir = rir / torch.norm(rir, p=2) -RIR = torch.flip(rir, [1]) +rir = torch.flip(rir, [1]) -plot_waveform(rir, sample_rate, title="Room Impulse Response") +print_stats(rir) +plot_waveform(rir, sample_rate, title="Room Impulse Response", ylim=None) ###################################################################### # Then, we convolve the speech signal with the RIR filter. # -speech, _ = torchaudio.load(SAMPLE_SPEECH) +speech, _ = get_speech_sample(resample=sample_rate) -speech_ = torch.nn.functional.pad(speech, (RIR.shape[1] - 1, 0)) -augmented = torch.nn.functional.conv1d(speech_[None, ...], RIR[None, ...])[0] +speech_ = torch.nn.functional.pad(speech, (rir.shape[1]-1, 0)) +augmented = torch.nn.functional.conv1d(speech_[None, ...], rir[None, ...])[0] -###################################################################### -# Original: -# ~~~~~~~~~ -# +plot_waveform(speech, sample_rate, title="Original", ylim=None) +plot_waveform(augmented, sample_rate, title="RIR Applied", ylim=None) -plot_waveform(speech, sample_rate, title="Original") plot_specgram(speech, sample_rate, title="Original") -Audio(speech, rate=sample_rate) +play_audio(speech, sample_rate) -###################################################################### -# RIR applied: -# ~~~~~~~~~~~~ -# - -plot_waveform(augmented, sample_rate, title="RIR Applied") plot_specgram(augmented, sample_rate, title="RIR Applied") -Audio(augmented, rate=sample_rate) +play_audio(augmented, sample_rate) ###################################################################### @@ -230,123 +303,58 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None): # intensity of noise is changing the Signal-to-Noise Ratio (SNR). 
# [`wikipedia <https://en.wikipedia.org/wiki/Signal-to-noise_ratio>`__] # -# $$ \\mathrm{SNR} = \\frac{P_{signal}}{P_{noise}} $$ +# :math:`\mathrm{SNR} = \frac{P_\mathrm{signal}}{P_\mathrm{noise}}` # -# $$ \\mathrm{SNR_{dB}} = 10 \\log _{{10}} \\mathrm {SNR} $$ +# :math:`\mathrm{SNR_{dB}} = 10 \log_{10} \mathrm{SNR}` # -speech, _ = torchaudio.load(SAMPLE_SPEECH) -noise, _ = torchaudio.load(SAMPLE_NOISE) -noise = noise[:, : speech.shape[1]] - -speech_power = speech.norm(p=2) -noise_power = noise.norm(p=2) -snr_dbs = [20, 10, 3] -noisy_speeches = [] -for snr_db in snr_dbs: - snr = 10 ** (snr_db / 20) - scale = snr * noise_power / speech_power - noisy_speeches.append((scale * speech + noise) / 2) - -###################################################################### -# Background noise: -# ~~~~~~~~~~~~~~~~~ -# +sample_rate = 8000 +speech, _ = get_speech_sample(resample=sample_rate) +noise, _ = get_noise_sample(resample=sample_rate) +noise = noise[:, :speech.shape[1]] plot_waveform(noise, sample_rate, title="Background noise") plot_specgram(noise, sample_rate, title="Background noise") -Audio(noise, rate=sample_rate) - -###################################################################### -# SNR 20 dB: -# ~~~~~~~~~~ -# - -snr_db, noisy_speech = snr_dbs[0], noisy_speeches[0] -plot_waveform(noisy_speech, sample_rate, title=f"SNR: {snr_db} [dB]") -plot_specgram(noisy_speech, sample_rate, title=f"SNR: {snr_db} [dB]") -Audio(noisy_speech, rate=sample_rate) - -###################################################################### -# SNR 10 dB: -# ~~~~~~~~~~ -# - -snr_db, noisy_speech = snr_dbs[1], noisy_speeches[1] -plot_waveform(noisy_speech, sample_rate, title=f"SNR: {snr_db} [dB]") -plot_specgram(noisy_speech, sample_rate, title=f"SNR: {snr_db} [dB]") -Audio(noisy_speech, rate=sample_rate) +play_audio(noise, sample_rate) -###################################################################### -# SNR 3 dB: -# ~~~~~~~~~ -# +speech_power = speech.norm(p=2) +noise_power = 
noise.norm(p=2) -snr_db, noisy_speech = snr_dbs[2], noisy_speeches[2] -plot_waveform(noisy_speech, sample_rate, title=f"SNR: {snr_db} [dB]") -plot_specgram(noisy_speech, sample_rate, title=f"SNR: {snr_db} [dB]") -Audio(noisy_speech, rate=sample_rate) +for snr_db in [20, 10, 3]: + snr = math.exp(snr_db / 10) + scale = snr * noise_power / speech_power + noisy_speech = (scale * speech + noise) / 2 + plot_waveform(noisy_speech, sample_rate, title=f"SNR: {snr_db} [dB]") + plot_specgram(noisy_speech, sample_rate, title=f"SNR: {snr_db} [dB]") + play_audio(noisy_speech, sample_rate) ###################################################################### # Applying codec to Tensor object # ------------------------------- # -# :py:func:`torchaudio.functional.apply_codec` can apply codecs to -# a Tensor object. +# ``torchaudio.functional.apply_codec`` can apply codecs to a Tensor object. # # **Note** This process is not differentiable. # -waveform, sample_rate = torchaudio.load(SAMPLE_SPEECH) - -configs = [ - {"format": "wav", "encoding": "ULAW", "bits_per_sample": 8}, - {"format": "gsm"}, - {"format": "vorbis", "compression": -1}, -] -waveforms = [] -for param in configs: - augmented = F.apply_codec(waveform, sample_rate, **param) - waveforms.append(augmented) - -###################################################################### -# Original: -# ~~~~~~~~~ -# +waveform, sample_rate = get_speech_sample(resample=8000) -plot_waveform(waveform, sample_rate, title="Original") plot_specgram(waveform, sample_rate, title="Original") -Audio(waveform, rate=sample_rate) +play_audio(waveform, sample_rate) -###################################################################### -# 8 bit mu-law: -# ~~~~~~~~~~~~~ -# - -plot_waveform(waveforms[0], sample_rate, title="8 bit mu-law") -plot_specgram(waveforms[0], sample_rate, title="8 bit mu-law") -Audio(waveforms[0], rate=sample_rate) - -###################################################################### -# GSM-FR: -# ~~~~~~~ -# - 
-plot_waveform(waveforms[1], sample_rate, title="GSM-FR") -plot_specgram(waveforms[1], sample_rate, title="GSM-FR") -Audio(waveforms[1], rate=sample_rate) - -###################################################################### -# Vorbis: -# ~~~~~~~ -# - -plot_waveform(waveforms[2], sample_rate, title="Vorbis") -plot_specgram(waveforms[2], sample_rate, title="Vorbis") -Audio(waveforms[2], rate=sample_rate) +configs = [ + ({"format": "wav", "encoding": 'ULAW', "bits_per_sample": 8}, "8 bit mu-law"), + ({"format": "gsm"}, "GSM-FR"), + ({"format": "mp3", "compression": -9}, "MP3"), + ({"format": "vorbis", "compression": -1}, "Vorbis"), +] +for param, title in configs: + augmented = F.apply_codec(waveform, sample_rate, **param) + plot_specgram(augmented, sample_rate, title=title) + play_audio(augmented, sample_rate) ###################################################################### # Simulating a phone recoding @@ -358,86 +366,49 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None): # sample_rate = 16000 -original_speech, sample_rate = torchaudio.load(SAMPLE_SPEECH) +speech, _ = get_speech_sample(resample=sample_rate) -plot_specgram(original_speech, sample_rate, title="Original") +plot_specgram(speech, sample_rate, title="Original") +play_audio(speech, sample_rate) # Apply RIR -speech_ = torch.nn.functional.pad(original_speech, (RIR.shape[1] - 1, 0)) -rir_applied = torch.nn.functional.conv1d(speech_[None, ...], RIR[None, ...])[0] +rir, _ = get_rir_sample(resample=sample_rate, processed=True) +speech_ = torch.nn.functional.pad(speech, (rir.shape[1]-1, 0)) +speech = torch.nn.functional.conv1d(speech_[None, ...], rir[None, ...])[0] -plot_specgram(rir_applied, sample_rate, title="RIR Applied") +plot_specgram(speech, sample_rate, title="RIR Applied") +play_audio(speech, sample_rate) # Add background noise # Because the noise is recorded in the actual environment, we consider that # the noise contains the acoustic feature of the environment. 
Therefore, we add # the noise after RIR application. -noise, _ = torchaudio.load(SAMPLE_NOISE) -noise = noise[:, : rir_applied.shape[1]] +noise, _ = get_noise_sample(resample=sample_rate) +noise = noise[:, :speech.shape[1]] snr_db = 8 -scale = math.exp(snr_db / 10) * noise.norm(p=2) / rir_applied.norm(p=2) -bg_added = (scale * rir_applied + noise) / 2 +scale = math.exp(snr_db / 10) * noise.norm(p=2) / speech.norm(p=2) +speech = (scale * speech + noise) / 2 -plot_specgram(bg_added, sample_rate, title="BG noise added") +plot_specgram(speech, sample_rate, title="BG noise added") +play_audio(speech, sample_rate) # Apply filtering and change sample rate -filtered, sample_rate2 = torchaudio.sox_effects.apply_effects_tensor( - bg_added, - sample_rate, - effects=[ - ["lowpass", "4000"], - [ - "compand", - "0.02,0.05", - "-60,-60,-30,-10,-20,-8,-5,-8,-2,-8", - "-8", - "-7", - "0.05", - ], - ["rate", "8000"], - ], +speech, sample_rate = torchaudio.sox_effects.apply_effects_tensor( + speech, + sample_rate, + effects=[ + ["lowpass", "4000"], + ["compand", "0.02,0.05", "-60,-60,-30,-10,-20,-8,-5,-8,-2,-8", "-8", "-7", "0.05"], + ["rate", "8000"], + ], ) -plot_specgram(filtered, sample_rate2, title="Filtered") +plot_specgram(speech, sample_rate, title="Filtered") +play_audio(speech, sample_rate) # Apply telephony codec -codec_applied = F.apply_codec(filtered, sample_rate2, format="gsm") - -plot_specgram(codec_applied, sample_rate2, title="GSM Codec Applied") - - -###################################################################### -# Original speech: -# ~~~~~~~~~~~~~~~~ -# - -Audio(original_speech, rate=sample_rate) - -###################################################################### -# RIR applied: -# ~~~~~~~~~~~~ -# - -Audio(rir_applied, rate=sample_rate) - -###################################################################### -# Background noise added: -# ~~~~~~~~~~~~~~~~~~~~~~~ -# - -Audio(bg_added, rate=sample_rate) - 
-###################################################################### -# Filtered: -# ~~~~~~~~~ -# - -Audio(filtered, rate=sample_rate2) - -###################################################################### -# Codec aplied: -# ~~~~~~~~~~~~~ -# +speech = F.apply_codec(speech, sample_rate, format="gsm") -Audio(codec_applied, rate=sample_rate2) +plot_specgram(speech, sample_rate, title="GSM Codec Applied") +play_audio(speech, sample_rate) diff --git a/beginner_source/audio_datasets_tutorial.py b/beginner_source/audio_datasets_tutorial.py index f08ed99e0db..4b0e48f881a 100644 --- a/beginner_source/audio_datasets_tutorial.py +++ b/beginner_source/audio_datasets_tutorial.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- """ Audio Datasets -============== +============== ``torchaudio`` provides easy access to common, publicly accessible datasets. Please refer to the official documentation for the list of @@ -23,14 +23,14 @@ # -------------------------------------------------------- # -# @title Prepare data and utility functions. {display-mode: "form"} -# @markdown -# @markdown You do not need to look into this cell. -# @markdown Just execute once and you are good to go. +#@title Prepare data and utility functions. {display-mode: "form"} +#@markdown +#@markdown You do not need to look into this cell. +#@markdown Just execute once and you are good to go. -# ------------------------------------------------------------------------------- +#------------------------------------------------------------------------------- # Preparation of data and helper functions. 
-# ------------------------------------------------------------------------------- +#------------------------------------------------------------------------------- import multiprocessing import os @@ -38,50 +38,56 @@ from IPython.display import Audio, display -_SAMPLE_DIR = "_assets" +_SAMPLE_DIR = "_sample_data" YESNO_DATASET_PATH = os.path.join(_SAMPLE_DIR, "yes_no") os.makedirs(YESNO_DATASET_PATH, exist_ok=True) +def _download_yesno(): + if os.path.exists(os.path.join(YESNO_DATASET_PATH, "waves_yesno.tar.gz")): + return + torchaudio.datasets.YESNO(root=YESNO_DATASET_PATH, download=True) -def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None): - waveform = waveform.numpy() - - num_channels, num_frames = waveform.shape - - figure, axes = plt.subplots(num_channels, 1) - if num_channels == 1: - axes = [axes] - for c in range(num_channels): - axes[c].specgram(waveform[c], Fs=sample_rate) - if num_channels > 1: - axes[c].set_ylabel(f"Channel {c+1}") - if xlim: - axes[c].set_xlim(xlim) - figure.suptitle(title) - plt.show(block=False) +YESNO_DOWNLOAD_PROCESS = multiprocessing.Process(target=_download_yesno) +YESNO_DOWNLOAD_PROCESS.start() +def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None): + waveform = waveform.numpy() + + num_channels, num_frames = waveform.shape + time_axis = torch.arange(0, num_frames) / sample_rate + + figure, axes = plt.subplots(num_channels, 1) + if num_channels == 1: + axes = [axes] + for c in range(num_channels): + axes[c].specgram(waveform[c], Fs=sample_rate) + if num_channels > 1: + axes[c].set_ylabel(f'Channel {c+1}') + if xlim: + axes[c].set_xlim(xlim) + figure.suptitle(title) + plt.show(block=False) def play_audio(waveform, sample_rate): - waveform = waveform.numpy() - - num_channels, num_frames = waveform.shape - if num_channels == 1: - display(Audio(waveform[0], rate=sample_rate)) - elif num_channels == 2: - display(Audio((waveform[0], waveform[1]), rate=sample_rate)) - else: - raise 
ValueError("Waveform with more than 2 channels are not supported.") + waveform = waveform.numpy() + num_channels, num_frames = waveform.shape + if num_channels == 1: + display(Audio(waveform[0], rate=sample_rate)) + elif num_channels == 2: + display(Audio((waveform[0], waveform[1]), rate=sample_rate)) + else: + raise ValueError("Waveform with more than 2 channels are not supported.") ###################################################################### -# Here, we show how to use the -# :py:func:`torchaudio.datasets.YESNO` dataset. +# Here, we show how to use the ``YESNO`` dataset. # +YESNO_DOWNLOAD_PROCESS.join() dataset = torchaudio.datasets.YESNO(YESNO_DATASET_PATH, download=True) for i in [1, 3, 5]: - waveform, sample_rate, label = dataset[i] - plot_specgram(waveform, sample_rate, title=f"Sample {i}: {label}") - play_audio(waveform, sample_rate) + waveform, sample_rate, label = dataset[i] + plot_specgram(waveform, sample_rate, title=f"Sample {i}: {label}") + play_audio(waveform, sample_rate) diff --git a/beginner_source/audio_feature_augmentation_tutorial.py b/beginner_source/audio_feature_augmentation_tutorial.py index 3961dafbc74..c11696de311 100644 --- a/beginner_source/audio_feature_augmentation_tutorial.py +++ b/beginner_source/audio_feature_augmentation_tutorial.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- """ Audio Feature Augmentation -========================== +========================== """ # When running this tutorial in Google Colab, install the required packages @@ -20,90 +20,82 @@ # -------------------------------------------------------- # -# @title Prepare data and utility functions. {display-mode: "form"} -# @markdown -# @markdown You do not need to look into this cell. -# @markdown Just execute once and you are good to go. -# @markdown -# @markdown In this tutorial, we will use a speech data from [VOiCES dataset](https://iqtlabs.github.io/voices/), -# @markdown which is licensed under Creative Commos BY 4.0. 
+#@title Prepare data and utility functions. {display-mode: "form"} +#@markdown +#@markdown You do not need to look into this cell. +#@markdown Just execute once and you are good to go. +#@markdown +#@markdown In this tutorial, we will use speech data from the [VOiCES dataset](https://iqtlabs.github.io/voices/), which is licensed under Creative Commons BY 4.0. -# ------------------------------------------------------------------------------- +#------------------------------------------------------------------------------- # Preparation of data and helper functions. -# ------------------------------------------------------------------------------- +#------------------------------------------------------------------------------- import os +import requests import librosa import matplotlib.pyplot as plt -import requests -_SAMPLE_DIR = "_assets" +_SAMPLE_DIR = "_sample_data" -SAMPLE_WAV_SPEECH_URL = "https://pytorch-tutorial-assets.s3.amazonaws.com/VOiCES_devkit/source-16k/train/sp0307/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav" # noqa: E501 +SAMPLE_WAV_SPEECH_URL = "https://pytorch-tutorial-assets.s3.amazonaws.com/VOiCES_devkit/source-16k/train/sp0307/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav" SAMPLE_WAV_SPEECH_PATH = os.path.join(_SAMPLE_DIR, "speech.wav") os.makedirs(_SAMPLE_DIR, exist_ok=True) - def _fetch_data(): - uri = [ - (SAMPLE_WAV_SPEECH_URL, SAMPLE_WAV_SPEECH_PATH), - ] - for url, path in uri: - with open(path, "wb") as file_: - file_.write(requests.get(url).content) - + uri = [ + (SAMPLE_WAV_SPEECH_URL, SAMPLE_WAV_SPEECH_PATH), + ] + for url, path in uri: + with open(path, 'wb') as file_: + file_.write(requests.get(url).content) _fetch_data() - def _get_sample(path, resample=None): - effects = [["remix", "1"]] - if resample: - effects.extend( - [ - ["lowpass", f"{resample // 2}"], - ["rate", f"{resample}"], - ] - ) - return torchaudio.sox_effects.apply_effects_file(path, effects=effects) - + effects = [ + ["remix", "1"] + ] + if resample: + 
effects.extend([ + ["lowpass", f"{resample // 2}"], + ["rate", f'{resample}'], + ]) + return torchaudio.sox_effects.apply_effects_file(path, effects=effects) def get_speech_sample(*, resample=None): - return _get_sample(SAMPLE_WAV_SPEECH_PATH, resample=resample) - + return _get_sample(SAMPLE_WAV_SPEECH_PATH, resample=resample) def get_spectrogram( - n_fft=400, - win_len=None, - hop_len=None, - power=2.0, + n_fft = 400, + win_len = None, + hop_len = None, + power = 2.0, ): - waveform, _ = get_speech_sample() - spectrogram = T.Spectrogram( - n_fft=n_fft, - win_length=win_len, - hop_length=hop_len, - center=True, - pad_mode="reflect", - power=power, - ) - return spectrogram(waveform) - - -def plot_spectrogram(spec, title=None, ylabel="freq_bin", aspect="auto", xmax=None): - fig, axs = plt.subplots(1, 1) - axs.set_title(title or "Spectrogram (db)") - axs.set_ylabel(ylabel) - axs.set_xlabel("frame") - im = axs.imshow(librosa.power_to_db(spec), origin="lower", aspect=aspect) - if xmax: - axs.set_xlim((0, xmax)) - fig.colorbar(im, ax=axs) - plt.show(block=False) - + waveform, _ = get_speech_sample() + spectrogram = T.Spectrogram( + n_fft=n_fft, + win_length=win_len, + hop_length=hop_len, + center=True, + pad_mode="reflect", + power=power, + ) + return spectrogram(waveform) + +def plot_spectrogram(spec, title=None, ylabel='freq_bin', aspect='auto', xmax=None): + fig, axs = plt.subplots(1, 1) + axs.set_title(title or 'Spectrogram (db)') + axs.set_ylabel(ylabel) + axs.set_xlabel('frame') + im = axs.imshow(librosa.power_to_db(spec), origin='lower', aspect=aspect) + if xmax: + axs.set_xlim((0, xmax)) + fig.colorbar(im, ax=axs) + plt.show(block=False) ###################################################################### # SpecAugment @@ -112,33 +104,29 @@ def plot_spectrogram(spec, title=None, ylabel="freq_bin", aspect="auto", xmax=No # `SpecAugment `__ # is a popular spectrogram augmentation technique. 
# -# ``torchaudio`` implements :py:func:`torchaudio.transforms.TimeStretch`, -# :py:func:`torchaudio.transforms.TimeMasking` and -# :py:func:`torchaudio.transforms.FrequencyMasking`. +# ``torchaudio`` implements ``TimeStretch``, ``TimeMasking`` and +# ``FrequencyMasking``. # - -###################################################################### # TimeStretch -# ----------- +# ~~~~~~~~~~~ # - spec = get_spectrogram(power=None) stretch = T.TimeStretch() rate = 1.2 spec_ = stretch(spec, rate) -plot_spectrogram(torch.abs(spec_[0]), title=f"Stretched x{rate}", aspect="equal", xmax=304) +plot_spectrogram(torch.abs(spec_[0]), title=f"Stretched x{rate}", aspect='equal', xmax=304) -plot_spectrogram(torch.abs(spec[0]), title="Original", aspect="equal", xmax=304) +plot_spectrogram(torch.abs(spec[0]), title="Original", aspect='equal', xmax=304) rate = 0.9 spec_ = stretch(spec, rate) -plot_spectrogram(torch.abs(spec_[0]), title=f"Stretched x{rate}", aspect="equal", xmax=304) +plot_spectrogram(torch.abs(spec_[0]), title=f"Stretched x{rate}", aspect='equal', xmax=304) ###################################################################### # TimeMasking -# ----------- +# ~~~~~~~~~~~ # torch.random.manual_seed(4) @@ -153,7 +141,7 @@ def plot_spectrogram(spec, title=None, ylabel="freq_bin", aspect="auto", xmax=No ###################################################################### # FrequencyMasking -# ---------------- +# ~~~~~~~~~~~~~~~~ # diff --git a/beginner_source/audio_feature_extractions_tutorial.py b/beginner_source/audio_feature_extractions_tutorial.py index 822c00d97ba..d2b3b858588 100644 --- a/beginner_source/audio_feature_extractions_tutorial.py +++ b/beginner_source/audio_feature_extractions_tutorial.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- """ Audio Feature Extractions -========================= +========================= ``torchaudio`` implements feature extractions commonly used in the audio domain. 
They are available in ``torchaudio.functional`` and @@ -11,10 +11,20 @@ They are stateless. ``transforms`` implements features as objects, -using implementations from ``functional`` and ``torch.nn.Module``. -They can be serialized using TorchScript. +using implementations from ``functional`` and ``torch.nn.Module``. Because all +transforms are subclasses of ``torch.nn.Module``, they can be serialized +using TorchScript. + +For the complete list of available features, please refer to the +documentation. In this tutorial, we will look into converting between the +time domain and frequency domain (``Spectrogram``, ``GriffinLim``, +``MelSpectrogram``). """ +# When running this tutorial in Google Colab, install the required packages +# with the following. +# !pip install torchaudio librosa + import torch import torchaudio import torchaudio.functional as F @@ -24,95 +34,186 @@ print(torchaudio.__version__) ###################################################################### -# Preparation -# ----------- -# -# .. note:: -# -# When running this tutorial in Google Colab, install the required packages +# Preparing data and utility functions (skip this section) +# -------------------------------------------------------- # -# .. code:: -# -# !pip install librosa -# -from IPython.display import Audio -import librosa -import matplotlib.pyplot as plt -from torchaudio.utils import download_asset - -torch.random.manual_seed(0) - -SAMPLE_SPEECH = download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav") - - -def plot_waveform(waveform, sr, title="Waveform"): - waveform = waveform.numpy() - - num_channels, num_frames = waveform.shape - time_axis = torch.arange(0, num_frames) / sr - - figure, axes = plt.subplots(num_channels, 1) - axes.plot(time_axis, waveform[0], linewidth=1) - axes.grid(True) - figure.suptitle(title) - plt.show(block=False) +#@title Prepare data and utility functions. 
{display-mode: "form"} +#@markdown +#@markdown You do not need to look into this cell. +#@markdown Just execute once and you are good to go. +#@markdown +#@markdown In this tutorial, we will use speech data from [VOiCES dataset](https://iqtlabs.github.io/voices/), which is licensed under Creative Commons BY 4.0. -def plot_spectrogram(specgram, title=None, ylabel="freq_bin"): - fig, axs = plt.subplots(1, 1) - axs.set_title(title or "Spectrogram (db)") - axs.set_ylabel(ylabel) - axs.set_xlabel("frame") - im = axs.imshow(librosa.power_to_db(specgram), origin="lower", aspect="auto") - fig.colorbar(im, ax=axs) - plt.show(block=False) +#------------------------------------------------------------------------------- +# Preparation of data and helper functions. +#------------------------------------------------------------------------------- +import os +import requests -def plot_fbank(fbank, title=None): - fig, axs = plt.subplots(1, 1) - axs.set_title(title or "Filter bank") - axs.imshow(fbank, aspect="auto") - axs.set_ylabel("frequency bin") - axs.set_xlabel("mel bin") - plt.show(block=False) - - -###################################################################### -# Overview of audio features -# -------------------------- -# -# The following diagram shows the relationship between common audio features -# and torchaudio APIs to generate them. -# -# .. image:: https://download.pytorch.org/torchaudio/tutorial-assets/torchaudio_feature_extractions.png -# -# For the complete list of available features, please refer to the -# documentation.
-# - +import librosa +import matplotlib.pyplot as plt +from IPython.display import Audio, display + + +_SAMPLE_DIR = "_sample_data" + +SAMPLE_WAV_SPEECH_URL = "https://pytorch-tutorial-assets.s3.amazonaws.com/VOiCES_devkit/source-16k/train/sp0307/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav" +SAMPLE_WAV_SPEECH_PATH = os.path.join(_SAMPLE_DIR, "speech.wav") + +os.makedirs(_SAMPLE_DIR, exist_ok=True) + + +def _fetch_data(): + uri = [ + (SAMPLE_WAV_SPEECH_URL, SAMPLE_WAV_SPEECH_PATH), + ] + for url, path in uri: + with open(path, 'wb') as file_: + file_.write(requests.get(url).content) + +_fetch_data() + +def _get_sample(path, resample=None): + effects = [ + ["remix", "1"] + ] + if resample: + effects.extend([ + ["lowpass", f"{resample // 2}"], + ["rate", f'{resample}'], + ]) + return torchaudio.sox_effects.apply_effects_file(path, effects=effects) + +def get_speech_sample(*, resample=None): + return _get_sample(SAMPLE_WAV_SPEECH_PATH, resample=resample) + +def print_stats(waveform, sample_rate=None, src=None): + if src: + print("-" * 10) + print("Source:", src) + print("-" * 10) + if sample_rate: + print("Sample Rate:", sample_rate) + print("Shape:", tuple(waveform.shape)) + print("Dtype:", waveform.dtype) + print(f" - Max: {waveform.max().item():6.3f}") + print(f" - Min: {waveform.min().item():6.3f}") + print(f" - Mean: {waveform.mean().item():6.3f}") + print(f" - Std Dev: {waveform.std().item():6.3f}") + print() + print(waveform) + print() + +def plot_spectrogram(spec, title=None, ylabel='freq_bin', aspect='auto', xmax=None): + fig, axs = plt.subplots(1, 1) + axs.set_title(title or 'Spectrogram (db)') + axs.set_ylabel(ylabel) + axs.set_xlabel('frame') + im = axs.imshow(librosa.power_to_db(spec), origin='lower', aspect=aspect) + if xmax: + axs.set_xlim((0, xmax)) + fig.colorbar(im, ax=axs) + plt.show(block=False) + +def plot_waveform(waveform, sample_rate, title="Waveform", xlim=None, ylim=None): + waveform = waveform.numpy() + + num_channels, num_frames = 
waveform.shape + time_axis = torch.arange(0, num_frames) / sample_rate + + figure, axes = plt.subplots(num_channels, 1) + if num_channels == 1: + axes = [axes] + for c in range(num_channels): + axes[c].plot(time_axis, waveform[c], linewidth=1) + axes[c].grid(True) + if num_channels > 1: + axes[c].set_ylabel(f'Channel {c+1}') + if xlim: + axes[c].set_xlim(xlim) + if ylim: + axes[c].set_ylim(ylim) + figure.suptitle(title) + plt.show(block=False) + +def play_audio(waveform, sample_rate): + waveform = waveform.numpy() + + num_channels, num_frames = waveform.shape + if num_channels == 1: + display(Audio(waveform[0], rate=sample_rate)) + elif num_channels == 2: + display(Audio((waveform[0], waveform[1]), rate=sample_rate)) + else: + raise ValueError("Waveform with more than 2 channels are not supported.") + +def plot_mel_fbank(fbank, title=None): + fig, axs = plt.subplots(1, 1) + axs.set_title(title or 'Filter bank') + axs.imshow(fbank, aspect='auto') + axs.set_ylabel('frequency bin') + axs.set_xlabel('mel bin') + plt.show(block=False) + +def plot_pitch(waveform, sample_rate, pitch): + figure, axis = plt.subplots(1, 1) + axis.set_title("Pitch Feature") + axis.grid(True) + + end_time = waveform.shape[1] / sample_rate + time_axis = torch.linspace(0, end_time, waveform.shape[1]) + axis.plot(time_axis, waveform[0], linewidth=1, color='gray', alpha=0.3) + + axis2 = axis.twinx() + time_axis = torch.linspace(0, end_time, pitch.shape[1]) + ln2 = axis2.plot( + time_axis, pitch[0], linewidth=2, label='Pitch', color='green') + + axis2.legend(loc=0) + plt.show(block=False) + +def plot_kaldi_pitch(waveform, sample_rate, pitch, nfcc): + figure, axis = plt.subplots(1, 1) + axis.set_title("Kaldi Pitch Feature") + axis.grid(True) + + end_time = waveform.shape[1] / sample_rate + time_axis = torch.linspace(0, end_time, waveform.shape[1]) + axis.plot(time_axis, waveform[0], linewidth=1, color='gray', alpha=0.3) + + time_axis = torch.linspace(0, end_time, pitch.shape[1]) + ln1 = 
axis.plot(time_axis, pitch[0], linewidth=2, label='Pitch', color='green') + axis.set_ylim((-1.3, 1.3)) + + axis2 = axis.twinx() + time_axis = torch.linspace(0, end_time, nfcc.shape[1]) + ln2 = axis2.plot( + time_axis, nfcc[0], linewidth=2, label='NFCC', color='blue', linestyle='--') + + lns = ln1 + ln2 + labels = [l.get_label() for l in lns] + axis.legend(lns, labels, loc=0) + plt.show(block=False) ###################################################################### # Spectrogram # ----------- # # To get the frequency make-up of an audio signal as it varies with time, -# you can use :py:func:`torchaudio.transforms.Spectrogram`. +# you can use ``Spectrogram``. # -SPEECH_WAVEFORM, SAMPLE_RATE = torchaudio.load(SAMPLE_SPEECH) -plot_waveform(SPEECH_WAVEFORM, SAMPLE_RATE, title="Original waveform") -Audio(SPEECH_WAVEFORM.numpy(), rate=SAMPLE_RATE) - -###################################################################### -# +waveform, sample_rate = get_speech_sample() n_fft = 1024 win_length = None hop_length = 512 -# Define transform +# define transformation spectrogram = T.Spectrogram( n_fft=n_fft, win_length=win_length, @@ -121,17 +222,11 @@ def plot_fbank(fbank, title=None): pad_mode="reflect", power=2.0, ) +# Perform transformation +spec = spectrogram(waveform) -###################################################################### -# - -# Perform transform -spec = spectrogram(SPEECH_WAVEFORM) - -###################################################################### -# - -plot_spectrogram(spec[0], title="torchaudio") +print_stats(spec) +plot_spectrogram(spec[0], title='torchaudio') ###################################################################### # GriffinLim @@ -140,7 +235,11 @@ def plot_fbank(fbank, title=None): # To recover a waveform from a spectrogram, you can use ``GriffinLim``. 
# + torch.random.manual_seed(0) +waveform, sample_rate = get_speech_sample() +plot_waveform(waveform, sample_rate, title="Original") +play_audio(waveform, sample_rate) n_fft = 1024 win_length = None @@ -150,39 +249,30 @@ def plot_fbank(fbank, title=None): n_fft=n_fft, win_length=win_length, hop_length=hop_length, -)(SPEECH_WAVEFORM) - -###################################################################### -# +)(waveform) griffin_lim = T.GriffinLim( n_fft=n_fft, win_length=win_length, hop_length=hop_length, ) +waveform = griffin_lim(spec) -###################################################################### -# - -reconstructed_waveform = griffin_lim(spec) - -###################################################################### -# - -plot_waveform(reconstructed_waveform, SAMPLE_RATE, title="Reconstructed") -Audio(reconstructed_waveform, rate=SAMPLE_RATE) +plot_waveform(waveform, sample_rate, title="Reconstructed") +play_audio(waveform, sample_rate) ###################################################################### # Mel Filter Bank # --------------- # -# :py:func:`torchaudio.functional.melscale_fbanks` generates the filter bank +# ``torchaudio.functional.melscale_fbanks`` generates the filter bank # for converting frequency bins to mel-scale bins. # # Since this function does not require input audio/features, there is no -# equivalent transform in :py:func:`torchaudio.transforms`. +# equivalent transform in ``torchaudio.transforms``. 
# + n_fft = 256 n_mels = 64 sample_rate = 6000 @@ -190,16 +280,12 @@ def plot_fbank(fbank, title=None): mel_filters = F.melscale_fbanks( int(n_fft // 2 + 1), n_mels=n_mels, - f_min=0.0, - f_max=sample_rate / 2.0, + f_min=0., + f_max=sample_rate/2., sample_rate=sample_rate, - norm="slaney", + norm='slaney' ) - -###################################################################### -# - -plot_fbank(mel_filters, "Mel Filter Bank - torchaudio") +plot_mel_fbank(mel_filters, "Mel Filter Bank - torchaudio") ###################################################################### # Comparison against librosa @@ -209,34 +295,34 @@ def plot_fbank(fbank, title=None): # with ``librosa``. # + mel_filters_librosa = librosa.filters.mel( - sr=sample_rate, - n_fft=n_fft, + sample_rate, + n_fft, n_mels=n_mels, - fmin=0.0, - fmax=sample_rate / 2.0, - norm="slaney", + fmin=0., + fmax=sample_rate/2., + norm='slaney', htk=True, ).T -###################################################################### -# - -plot_fbank(mel_filters_librosa, "Mel Filter Bank - librosa") +plot_mel_fbank(mel_filters_librosa, "Mel Filter Bank - librosa") mse = torch.square(mel_filters - mel_filters_librosa).mean().item() -print("Mean Square Difference: ", mse) +print('Mean Square Difference: ', mse) ###################################################################### # MelSpectrogram # -------------- # # Generating a mel-scale spectrogram involves generating a spectrogram -# and performing mel-scale conversion. In ``torchaudio``, -# :py:func:`torchaudio.transforms.MelSpectrogram` provides +# and performing mel-scale conversion. In ``torchaudio``, ``MelSpectrogram`` provides # this functionality. 
# + +waveform, sample_rate = get_speech_sample() + n_fft = 1024 win_length = None hop_length = 512 @@ -250,18 +336,15 @@ def plot_fbank(fbank, title=None): center=True, pad_mode="reflect", power=2.0, - norm="slaney", + norm='slaney', onesided=True, n_mels=n_mels, mel_scale="htk", ) -melspec = mel_spectrogram(SPEECH_WAVEFORM) - -###################################################################### -# - -plot_spectrogram(melspec[0], title="MelSpectrogram - torchaudio", ylabel="mel freq") +melspec = mel_spectrogram(waveform) +plot_spectrogram( + melspec[0], title="MelSpectrogram - torchaudio", ylabel='mel freq') ###################################################################### # Comparison against librosa @@ -271,8 +354,9 @@ def plot_fbank(fbank, title=None): # spectrograms with ``librosa``. # + melspec_librosa = librosa.feature.melspectrogram( - y=SPEECH_WAVEFORM.numpy()[0], + waveform.numpy()[0], sr=sample_rate, n_fft=n_fft, hop_length=hop_length, @@ -281,23 +365,22 @@ def plot_fbank(fbank, title=None): pad_mode="reflect", power=2.0, n_mels=n_mels, - norm="slaney", + norm='slaney', htk=True, ) - -###################################################################### -# - -plot_spectrogram(melspec_librosa, title="MelSpectrogram - librosa", ylabel="mel freq") +plot_spectrogram( + melspec_librosa, title="MelSpectrogram - librosa", ylabel='mel freq') mse = torch.square(melspec - melspec_librosa).mean().item() -print("Mean Square Difference: ", mse) +print('Mean Square Difference: ', mse) ###################################################################### # MFCC # ---- # +waveform, sample_rate = get_speech_sample() + n_fft = 2048 win_length = None hop_length = 512 @@ -308,102 +391,48 @@ def plot_fbank(fbank, title=None): sample_rate=sample_rate, n_mfcc=n_mfcc, melkwargs={ - "n_fft": n_fft, - "n_mels": n_mels, - "hop_length": hop_length, - "mel_scale": "htk", - }, + 'n_fft': n_fft, + 'n_mels': n_mels, + 'hop_length': hop_length, + 'mel_scale': 'htk', + } ) -mfcc 
= mfcc_transform(SPEECH_WAVEFORM) - -###################################################################### -# +mfcc = mfcc_transform(waveform) plot_spectrogram(mfcc[0]) ###################################################################### -# Comparison against librosa -# ~~~~~~~~~~~~~~~~~~~~~~~~~~ +# Comparing against librosa +# ~~~~~~~~~~~~~~~~~~~~~~~~~ # + melspec = librosa.feature.melspectrogram( - y=SPEECH_WAVEFORM.numpy()[0], - sr=sample_rate, - n_fft=n_fft, - win_length=win_length, - hop_length=hop_length, - n_mels=n_mels, - htk=True, - norm=None, -) + y=waveform.numpy()[0], sr=sample_rate, n_fft=n_fft, + win_length=win_length, hop_length=hop_length, + n_mels=n_mels, htk=True, norm=None) mfcc_librosa = librosa.feature.mfcc( - S=librosa.core.spectrum.power_to_db(melspec), - n_mfcc=n_mfcc, - dct_type=2, - norm="ortho", -) - -###################################################################### -# + S=librosa.core.spectrum.power_to_db(melspec), + n_mfcc=n_mfcc, dct_type=2, norm='ortho') plot_spectrogram(mfcc_librosa) mse = torch.square(mfcc - mfcc_librosa).mean().item() -print("Mean Square Difference: ", mse) - -###################################################################### -# LFCC -# ---- -# - -n_fft = 2048 -win_length = None -hop_length = 512 -n_lfcc = 256 - -lfcc_transform = T.LFCC( - sample_rate=sample_rate, - n_lfcc=n_lfcc, - speckwargs={ - "n_fft": n_fft, - "win_length": win_length, - "hop_length": hop_length, - }, -) - -lfcc = lfcc_transform(SPEECH_WAVEFORM) -plot_spectrogram(lfcc[0]) +print('Mean Square Difference: ', mse) ###################################################################### # Pitch # ----- # -pitch = F.detect_pitch_frequency(SPEECH_WAVEFORM, SAMPLE_RATE) - -###################################################################### -# - -def plot_pitch(waveform, sr, pitch): - figure, axis = plt.subplots(1, 1) - axis.set_title("Pitch Feature") - axis.grid(True) - - end_time = waveform.shape[1] / sr - time_axis = torch.linspace(0, 
end_time, waveform.shape[1]) - axis.plot(time_axis, waveform[0], linewidth=1, color="gray", alpha=0.3) - - axis2 = axis.twinx() - time_axis = torch.linspace(0, end_time, pitch.shape[1]) - axis2.plot(time_axis, pitch[0], linewidth=2, label="Pitch", color="green") - axis2.legend(loc=0) - plt.show(block=False) +waveform, sample_rate = get_speech_sample() - -plot_pitch(SPEECH_WAVEFORM, SAMPLE_RATE, pitch) +pitch = F.detect_pitch_frequency(waveform, sample_rate) +plot_pitch(waveform, sample_rate, pitch) +play_audio(waveform, sample_rate) ###################################################################### # Kaldi Pitch (beta) @@ -411,7 +440,7 @@ def plot_pitch(waveform, sr, pitch): # # Kaldi Pitch feature [1] is a pitch detection mechanism tuned for automatic # speech recognition (ASR) applications. This is a beta feature in ``torchaudio``, -# and it is available as :py:func:`torchaudio.functional.compute_kaldi_pitch`. +# and it is available only in ``functional``. # # 1. A pitch extraction algorithm tuned for automatic speech recognition # @@ -425,33 +454,11 @@ def plot_pitch(waveform, sr, pitch): # [`paper `__] # -pitch_feature = F.compute_kaldi_pitch(SPEECH_WAVEFORM, SAMPLE_RATE) -pitch, nfcc = pitch_feature[..., 0], pitch_feature[..., 1] - -###################################################################### -# - -def plot_kaldi_pitch(waveform, sr, pitch, nfcc): - _, axis = plt.subplots(1, 1) - axis.set_title("Kaldi Pitch Feature") - axis.grid(True) - - end_time = waveform.shape[1] / sr - time_axis = torch.linspace(0, end_time, waveform.shape[1]) - axis.plot(time_axis, waveform[0], linewidth=1, color="gray", alpha=0.3) - time_axis = torch.linspace(0, end_time, pitch.shape[1]) - ln1 = axis.plot(time_axis, pitch[0], linewidth=2, label="Pitch", color="green") - axis.set_ylim((-1.3, 1.3)) - - axis2 = axis.twinx() - time_axis = torch.linspace(0, end_time, nfcc.shape[1]) - ln2 = axis2.plot(time_axis, nfcc[0], linewidth=2, label="NFCC", color="blue", linestyle="--") - 
- lns = ln1 + ln2 - labels = [l.get_label() for l in lns] - axis.legend(lns, labels, loc=0) - plt.show(block=False) +waveform, sample_rate = get_speech_sample(resample=16000) +pitch_feature = F.compute_kaldi_pitch(waveform, sample_rate) +pitch, nfcc = pitch_feature[..., 0], pitch_feature[..., 1] -plot_kaldi_pitch(SPEECH_WAVEFORM, SAMPLE_RATE, pitch, nfcc) +plot_kaldi_pitch(waveform, sample_rate, pitch, nfcc) +play_audio(waveform, sample_rate) diff --git a/beginner_source/audio_io_tutorial.py b/beginner_source/audio_io_tutorial.py index 4917f1b1025..9fa23bbad55 100644 --- a/beginner_source/audio_io_tutorial.py +++ b/beginner_source/audio_io_tutorial.py @@ -3,10 +3,13 @@ Audio I/O ========= -This tutorial shows how to use TorchAudio's basic I/O API to load audio files -into PyTorch's Tensor object, and save Tensor objects to audio files. +``torchaudio`` integrates ``libsox`` and provides a rich set of audio I/O. """ +# When running this tutorial in Google Colab, install the required packages +# with the following. +# !pip install torchaudio boto3 + import torch import torchaudio @@ -14,47 +17,163 @@ print(torchaudio.__version__) ###################################################################### -# Preparation -# ----------- -# -# First, we import the modules and download the audio assets we use in this tutorial. -# -# .. note:: -# When running this tutorial in Google Colab, install the required packages -# with the following: +# Preparing data and utility functions (skip this section) +# -------------------------------------------------------- # -# .. code:: -# -# !pip install boto3 + +#@title Prepare data and utility functions. {display-mode: "form"} +#@markdown +#@markdown You do not need to look into this cell. +#@markdown Just execute once and you are good to go. +#@markdown +#@markdown In this tutorial, we will use speech data from [VOiCES dataset](https://iqtlabs.github.io/voices/), which is licensed under Creative Commons BY 4.0.
+ import io import os +import requests import tarfile -import tempfile import boto3 -import matplotlib.pyplot as plt -import requests from botocore import UNSIGNED from botocore.config import Config -from IPython.display import Audio -from torchaudio.utils import download_asset +import matplotlib.pyplot as plt +from IPython.display import Audio, display -SAMPLE_GSM = download_asset("tutorial-assets/steam-train-whistle-daniel_simon.gsm") -SAMPLE_WAV = download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav") -SAMPLE_WAV_8000 = download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042-8000hz.wav") +_SAMPLE_DIR = "_sample_data" +SAMPLE_WAV_URL = "https://pytorch-tutorial-assets.s3.amazonaws.com/steam-train-whistle-daniel_simon.wav" +SAMPLE_WAV_PATH = os.path.join(_SAMPLE_DIR, "steam.wav") +SAMPLE_MP3_URL = "https://pytorch-tutorial-assets.s3.amazonaws.com/steam-train-whistle-daniel_simon.mp3" +SAMPLE_MP3_PATH = os.path.join(_SAMPLE_DIR, "steam.mp3") + +SAMPLE_GSM_URL = "https://pytorch-tutorial-assets.s3.amazonaws.com/steam-train-whistle-daniel_simon.gsm" +SAMPLE_GSM_PATH = os.path.join(_SAMPLE_DIR, "steam.gsm") + +SAMPLE_WAV_SPEECH_URL = "https://pytorch-tutorial-assets.s3.amazonaws.com/VOiCES_devkit/source-16k/train/sp0307/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav" +SAMPLE_WAV_SPEECH_PATH = os.path.join(_SAMPLE_DIR, "speech.wav") + +SAMPLE_TAR_URL = "https://pytorch-tutorial-assets.s3.amazonaws.com/VOiCES_devkit.tar.gz" +SAMPLE_TAR_PATH = os.path.join(_SAMPLE_DIR, "sample.tar.gz") +SAMPLE_TAR_ITEM = "VOiCES_devkit/source-16k/train/sp0307/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav" + +S3_BUCKET = "pytorch-tutorial-assets" +S3_KEY = "VOiCES_devkit/source-16k/train/sp0307/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav" + + +def _fetch_data(): + os.makedirs(_SAMPLE_DIR, exist_ok=True) + uri = [ + (SAMPLE_WAV_URL, SAMPLE_WAV_PATH), + (SAMPLE_MP3_URL, SAMPLE_MP3_PATH), + (SAMPLE_GSM_URL, SAMPLE_GSM_PATH), + 
(SAMPLE_WAV_SPEECH_URL, SAMPLE_WAV_SPEECH_PATH), + (SAMPLE_TAR_URL, SAMPLE_TAR_PATH), + ] + for url, path in uri: + with open(path, 'wb') as file_: + file_.write(requests.get(url).content) + +_fetch_data() + +def print_stats(waveform, sample_rate=None, src=None): + if src: + print("-" * 10) + print("Source:", src) + print("-" * 10) + if sample_rate: + print("Sample Rate:", sample_rate) + print("Shape:", tuple(waveform.shape)) + print("Dtype:", waveform.dtype) + print(f" - Max: {waveform.max().item():6.3f}") + print(f" - Min: {waveform.min().item():6.3f}") + print(f" - Mean: {waveform.mean().item():6.3f}") + print(f" - Std Dev: {waveform.std().item():6.3f}") + print() + print(waveform) + print() + +def plot_waveform(waveform, sample_rate, title="Waveform", xlim=None, ylim=None): + waveform = waveform.numpy() + + num_channels, num_frames = waveform.shape + time_axis = torch.arange(0, num_frames) / sample_rate + + figure, axes = plt.subplots(num_channels, 1) + if num_channels == 1: + axes = [axes] + for c in range(num_channels): + axes[c].plot(time_axis, waveform[c], linewidth=1) + axes[c].grid(True) + if num_channels > 1: + axes[c].set_ylabel(f'Channel {c+1}') + if xlim: + axes[c].set_xlim(xlim) + if ylim: + axes[c].set_ylim(ylim) + figure.suptitle(title) + plt.show(block=False) + +def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None): + waveform = waveform.numpy() + + num_channels, num_frames = waveform.shape + time_axis = torch.arange(0, num_frames) / sample_rate + + figure, axes = plt.subplots(num_channels, 1) + if num_channels == 1: + axes = [axes] + for c in range(num_channels): + axes[c].specgram(waveform[c], Fs=sample_rate) + if num_channels > 1: + axes[c].set_ylabel(f'Channel {c+1}') + if xlim: + axes[c].set_xlim(xlim) + figure.suptitle(title) + plt.show(block=False) + +def play_audio(waveform, sample_rate): + waveform = waveform.numpy() + + num_channels, num_frames = waveform.shape + if num_channels == 1: + display(Audio(waveform[0], 
rate=sample_rate)) + elif num_channels == 2: + display(Audio((waveform[0], waveform[1]), rate=sample_rate)) + else: + raise ValueError("Waveform with more than 2 channels are not supported.") + +def _get_sample(path, resample=None): + effects = [ + ["remix", "1"] + ] + if resample: + effects.extend([ + ["lowpass", f"{resample // 2}"], + ["rate", f'{resample}'], + ]) + return torchaudio.sox_effects.apply_effects_file(path, effects=effects) + +def get_sample(*, resample=None): + return _get_sample(SAMPLE_WAV_PATH, resample=resample) + +def inspect_file(path): + print("-" * 10) + print("Source:", path) + print("-" * 10) + print(f" - File size: {os.path.getsize(path)} bytes") + print(f" - {torchaudio.info(path)}") ###################################################################### -# Querying audio metadata -# ----------------------- +# Querying audio metadata +# ----------------------- # -# Function :py:func:`torchaudio.info` fetches audio metadata. -# You can provide a path-like object or file-like object. +# Function ``torchaudio.info`` fetches audio metadata. You can provide +# a path-like object or file-like object. # -metadata = torchaudio.info(SAMPLE_WAV) +metadata = torchaudio.info(SAMPLE_WAV_PATH) print(metadata) ###################################################################### @@ -86,7 +205,6 @@ # - ``"OPUS"``: Opus [`opus-codec.org `__] # - ``"GSM"``: GSM-FR # [`wikipedia `__] -# - ``"HTK"``: Single channel 16-bit PCM # - ``"UNKNOWN"`` None of above # @@ -98,37 +216,49 @@ # - ``num_frames`` can be ``0`` for GSM-FR format. # -metadata = torchaudio.info(SAMPLE_GSM) +metadata = torchaudio.info(SAMPLE_MP3_PATH) +print(metadata) + +metadata = torchaudio.info(SAMPLE_GSM_PATH) print(metadata) ###################################################################### # Querying file-like object -# ------------------------- +# ~~~~~~~~~~~~~~~~~~~~~~~~~ # -# :py:func:`torchaudio.info` works on file-like objects. +# ``info`` works on file-like objects.
# -url = "https://download.pytorch.org/torchaudio/tutorial-assets/steam-train-whistle-daniel_simon.wav" -with requests.get(url, stream=True) as response: - metadata = torchaudio.info(response.raw) +print("Source:", SAMPLE_WAV_URL) +with requests.get(SAMPLE_WAV_URL, stream=True) as response: + metadata = torchaudio.info(response.raw) print(metadata) ###################################################################### -# .. note:: +# **Note** When passing a file-like object, ``info`` does not read +# all of the underlying data; rather, it reads only a portion +# of the data from the beginning. +# Therefore, for a given audio format, it may not be able to retrieve the +# correct metadata, including the format itself. +# The following example illustrates this. # -# When passing a file-like object, ``info`` does not read -# all of the underlying data; rather, it reads only a portion -# of the data from the beginning. -# Therefore, for a given audio format, it may not be able to retrieve the -# correct metadata, including the format itself. In such case, you -# can pass ``format`` argument to specify the format of the audio. +# - Use argument ``format`` to specify the audio format of the input. +# - The returned metadata has ``num_frames = 0`` +# + +print("Source:", SAMPLE_MP3_URL) +with requests.get(SAMPLE_MP3_URL, stream=True) as response: + metadata = torchaudio.info(response.raw, format="mp3") + + print(f"Fetched {response.raw.tell()} bytes.") +print(metadata) ###################################################################### -# Loading audio data -# ------------------ +# Loading audio data into Tensor +# ------------------------------ # -# To load audio data, you can use :py:func:`torchaudio.load`. +# To load audio data, you can use ``torchaudio.load``. # # This function accepts a path-like object or file-like object as input. # @@ -136,112 +266,51 @@ # (``int``). 
# # By default, the resulting tensor object has ``dtype=torch.float32`` and -# its value range is ``[-1.0, 1.0]``. +# its value range is normalized within ``[-1.0, 1.0]``. # # For the list of supported format, please refer to `the torchaudio # documentation `__. # -waveform, sample_rate = torchaudio.load(SAMPLE_WAV) - +waveform, sample_rate = torchaudio.load(SAMPLE_WAV_SPEECH_PATH) -###################################################################### -# -def plot_waveform(waveform, sample_rate): - waveform = waveform.numpy() - - num_channels, num_frames = waveform.shape - time_axis = torch.arange(0, num_frames) / sample_rate - - figure, axes = plt.subplots(num_channels, 1) - if num_channels == 1: - axes = [axes] - for c in range(num_channels): - axes[c].plot(time_axis, waveform[c], linewidth=1) - axes[c].grid(True) - if num_channels > 1: - axes[c].set_ylabel(f"Channel {c+1}") - figure.suptitle("waveform") - plt.show(block=False) - - -###################################################################### -# +print_stats(waveform, sample_rate=sample_rate) plot_waveform(waveform, sample_rate) - - -###################################################################### -# -def plot_specgram(waveform, sample_rate, title="Spectrogram"): - waveform = waveform.numpy() - - num_channels, num_frames = waveform.shape - - figure, axes = plt.subplots(num_channels, 1) - if num_channels == 1: - axes = [axes] - for c in range(num_channels): - axes[c].specgram(waveform[c], Fs=sample_rate) - if num_channels > 1: - axes[c].set_ylabel(f"Channel {c+1}") - figure.suptitle(title) - plt.show(block=False) - - -###################################################################### -# plot_specgram(waveform, sample_rate) +play_audio(waveform, sample_rate) -###################################################################### -# -Audio(waveform.numpy()[0], rate=sample_rate) - ###################################################################### # Loading from file-like object -# 
----------------------------- +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # -# The I/O functions support file-like objects. -# This allows for fetching and decoding audio data from locations +# ``torchaudio``\ ’s I/O functions now support file-like objects. This +# allows for fetching and decoding audio data from locations # within and beyond the local file system. # The following examples illustrate this. # -###################################################################### -# - # Load audio data as HTTP request -url = "https://download.pytorch.org/torchaudio/tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav" -with requests.get(url, stream=True) as response: - waveform, sample_rate = torchaudio.load(response.raw) +with requests.get(SAMPLE_WAV_SPEECH_URL, stream=True) as response: + waveform, sample_rate = torchaudio.load(response.raw) plot_specgram(waveform, sample_rate, title="HTTP datasource") -###################################################################### -# - # Load audio from tar file -tar_path = download_asset("tutorial-assets/VOiCES_devkit.tar.gz") -tar_item = "VOiCES_devkit/source-16k/train/sp0307/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav" -with tarfile.open(tar_path, mode="r") as tarfile_: - fileobj = tarfile_.extractfile(tar_item) - waveform, sample_rate = torchaudio.load(fileobj) +with tarfile.open(SAMPLE_TAR_PATH, mode='r') as tarfile_: + fileobj = tarfile_.extractfile(SAMPLE_TAR_ITEM) + waveform, sample_rate = torchaudio.load(fileobj) plot_specgram(waveform, sample_rate, title="TAR file") -###################################################################### -# - # Load audio from S3 -bucket = "pytorch-tutorial-assets" -key = "VOiCES_devkit/source-16k/train/sp0307/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav" -client = boto3.client("s3", config=Config(signature_version=UNSIGNED)) -response = client.get_object(Bucket=bucket, Key=key) -waveform, sample_rate = torchaudio.load(response["Body"]) +client = boto3.client('s3', 
config=Config(signature_version=UNSIGNED)) +response = client.get_object(Bucket=S3_BUCKET, Key=S3_KEY) +waveform, sample_rate = torchaudio.load(response['Body']) plot_specgram(waveform, sample_rate, title="From S3") ###################################################################### # Tips on slicing -# --------------- +# ~~~~~~~~~~~~~~~ # # Providing ``num_frames`` and ``frame_offset`` arguments restricts # decoding to the corresponding segment of the input. @@ -266,28 +335,29 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram"): frame_offset, num_frames = 16000, 16000 # Fetch and decode the 1 - 2 seconds -url = "https://download.pytorch.org/torchaudio/tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav" print("Fetching all the data...") -with requests.get(url, stream=True) as response: - waveform1, sample_rate1 = torchaudio.load(response.raw) - waveform1 = waveform1[:, frame_offset : frame_offset + num_frames] - print(f" - Fetched {response.raw.tell()} bytes") +with requests.get(SAMPLE_WAV_SPEECH_URL, stream=True) as response: + waveform1, sample_rate1 = torchaudio.load(response.raw) + waveform1 = waveform1[:, frame_offset:frame_offset+num_frames] + print(f" - Fetched {response.raw.tell()} bytes") print("Fetching until the requested frames are available...") -with requests.get(url, stream=True) as response: - waveform2, sample_rate2 = torchaudio.load(response.raw, frame_offset=frame_offset, num_frames=num_frames) - print(f" - Fetched {response.raw.tell()} bytes") +with requests.get(SAMPLE_WAV_SPEECH_URL, stream=True) as response: + waveform2, sample_rate2 = torchaudio.load( + response.raw, frame_offset=frame_offset, num_frames=num_frames) + print(f" - Fetched {response.raw.tell()} bytes") print("Checking the resulting waveform ... 
", end="") assert (waveform1 == waveform2).all() print("matched!") + ###################################################################### # Saving audio to file # -------------------- # # To save audio data in formats interpretable by common applications, -# you can use :py:func:`torchaudio.save`. +# you can use ``torchaudio.save``. # # This function accepts a path-like object or file-like object. # @@ -302,72 +372,55 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram"): # ``bits_per_sample`` to change this behavior. For example, to save data # in 16-bit signed integer PCM, you can do the following. # -# .. note:: -# -# Saving data in encodings with a lower bit depth reduces the +# **Note** Saving data in encodings with lower bit depth reduces the # resulting file size but also precision. # -waveform, sample_rate = torchaudio.load(SAMPLE_WAV) - - -###################################################################### -# -def inspect_file(path): - print("-" * 10) - print("Source:", path) - print("-" * 10) - print(f" - File size: {os.path.getsize(path)} bytes") - print(f" - {torchaudio.info(path)}") - print() +waveform, sample_rate = get_sample() +print_stats(waveform, sample_rate=sample_rate) -###################################################################### -# # Save without any encoding option. 
# The function will pick up the encoding which # the provided data fit -with tempfile.TemporaryDirectory() as tempdir: - path = f"{tempdir}/save_example_default.wav" - torchaudio.save(path, waveform, sample_rate) - inspect_file(path) +path = "save_example_default.wav" +torchaudio.save(path, waveform, sample_rate) +inspect_file(path) -###################################################################### -# # Save as 16-bit signed integer Linear PCM # The resulting file occupies half the storage but loses precision -with tempfile.TemporaryDirectory() as tempdir: - path = f"{tempdir}/save_example_PCM_S16.wav" - torchaudio.save(path, waveform, sample_rate, encoding="PCM_S", bits_per_sample=16) - inspect_file(path) +path = "save_example_PCM_S16.wav" +torchaudio.save( + path, waveform, sample_rate, + encoding="PCM_S", bits_per_sample=16) +inspect_file(path) ###################################################################### -# :py:func:`torchaudio.save` can also handle other formats. -# To name a few: +# ``torchaudio.save`` can also handle other formats. 
To name a few: # +waveform, sample_rate = get_sample(resample=8000) + formats = [ - "flac", - "vorbis", - "sph", - "amb", - "amr-nb", - "gsm", + "mp3", + "flac", + "vorbis", + "sph", + "amb", + "amr-nb", + "gsm", ] -###################################################################### -# -waveform, sample_rate = torchaudio.load(SAMPLE_WAV_8000) -with tempfile.TemporaryDirectory() as tempdir: - for format in formats: - path = f"{tempdir}/save_example.{format}" - torchaudio.save(path, waveform, sample_rate, format=format) - inspect_file(path) +for format in formats: + path = f"save_example.{format}" + torchaudio.save(path, waveform, sample_rate, format=format) + inspect_file(path) + ###################################################################### # Saving to file-like object -# -------------------------- +# ~~~~~~~~~~~~~~~~~~~~~~~~~~ # # Similar to the other I/O functions, you can save audio to file-like # objects. When saving to a file-like object, argument ``format`` is @@ -375,7 +428,7 @@ def inspect_file(path): # -waveform, sample_rate = torchaudio.load(SAMPLE_WAV) +waveform, sample_rate = get_sample() # Saving to bytes buffer buffer_ = io.BytesIO() @@ -383,3 +436,4 @@ def inspect_file(path): buffer_.seek(0) print(buffer_.read(16)) + diff --git a/beginner_source/audio_resampling_tutorial.py b/beginner_source/audio_resampling_tutorial.py index 3ffd739980c..99e691cfa14 100644 --- a/beginner_source/audio_resampling_tutorial.py +++ b/beginner_source/audio_resampling_tutorial.py @@ -1,11 +1,16 @@ # -*- coding: utf-8 -*- """ Audio Resampling -================ +========== + +Here, we will walk through resampling audio waveforms using ``torchaudio``. -This tutorial shows how to use torchaudio's resampling API. """ +# When running this tutorial in Google Colab, install the required packages +# with the following. 
+# !pip install torchaudio librosa + import torch import torchaudio import torchaudio.functional as F @@ -15,126 +20,174 @@ print(torchaudio.__version__) ###################################################################### -# Preparation -# ----------- -# -# First, we import the modules and define the helper functions. -# -# .. note:: -# When running this tutorial in Google Colab, install the required packages -# with the following. -# -# .. code:: +# Preparing data and utility functions (skip this section) +# -------------------------------------------------------- # -# !pip install librosa + +#@title Prepare data and utility functions. {display-mode: "form"} +#@markdown +#@markdown You do not need to look into this cell. +#@markdown Just execute once and you are good to go. + +#------------------------------------------------------------------------------- +# Preparation of data and helper functions. +#------------------------------------------------------------------------------- import math import time import librosa import matplotlib.pyplot as plt -import pandas as pd from IPython.display import Audio, display +import pandas as pd -pd.set_option('display.max_rows', None) -pd.set_option('display.max_columns', None) DEFAULT_OFFSET = 201 +SWEEP_MAX_SAMPLE_RATE = 48000 +DEFAULT_LOWPASS_FILTER_WIDTH = 6 +DEFAULT_ROLLOFF = 0.99 +DEFAULT_RESAMPLING_METHOD = 'sinc_interpolation' def _get_log_freq(sample_rate, max_sweep_rate, offset): - """Get freqs evenly spaced out in log-scale, between [0, max_sweep_rate // 2] - - offset is used to avoid negative infinity `log(offset + x)`. + """Get freqs evenly spaced out in log-scale, between [0, max_sweep_rate // 2] - """ - start, stop = math.log(offset), math.log(offset + max_sweep_rate // 2) - return torch.exp(torch.linspace(start, stop, sample_rate, dtype=torch.double)) - offset + offset is used to avoid negative infinity `log(offset + x)`. 
+ """ + half = sample_rate // 2 + start, stop = math.log(offset), math.log(offset + max_sweep_rate // 2) + return torch.exp(torch.linspace(start, stop, sample_rate, dtype=torch.double)) - offset def _get_inverse_log_freq(freq, sample_rate, offset): - """Find the time where the given frequency is given by _get_log_freq""" - half = sample_rate // 2 - return sample_rate * (math.log(1 + freq / offset) / math.log(1 + half / offset)) - + """Find the time where the given frequency is given by _get_log_freq""" + half = sample_rate // 2 + return sample_rate * (math.log(1 + freq / offset) / math.log(1 + half / offset)) def _get_freq_ticks(sample_rate, offset, f_max): - # Given the original sample rate used for generating the sweep, - # find the x-axis value where the log-scale major frequency values fall in - time, freq = [], [] - for exp in range(2, 5): - for v in range(1, 10): - f = v * 10**exp - if f < sample_rate // 2: - t = _get_inverse_log_freq(f, sample_rate, offset) / sample_rate - time.append(t) - freq.append(f) - t_max = _get_inverse_log_freq(f_max, sample_rate, offset) / sample_rate - time.append(t_max) - freq.append(f_max) - return time, freq - + # Given the original sample rate used for generating the sweep, + # find the x-axis value where the log-scale major frequency values fall in + time, freq = [], [] + for exp in range(2, 5): + for v in range(1, 10): + f = v * 10 ** exp + if f < sample_rate // 2: + t = _get_inverse_log_freq(f, sample_rate, offset) / sample_rate + time.append(t) + freq.append(f) + t_max = _get_inverse_log_freq(f_max, sample_rate, offset) / sample_rate + time.append(t_max) + freq.append(f_max) + return time, freq def get_sine_sweep(sample_rate, offset=DEFAULT_OFFSET): - max_sweep_rate = sample_rate - freq = _get_log_freq(sample_rate, max_sweep_rate, offset) - delta = 2 * math.pi * freq / sample_rate - cummulative = torch.cumsum(delta, dim=0) - signal = torch.sin(cummulative).unsqueeze(dim=0) - return signal - + max_sweep_rate = sample_rate + 
freq = _get_log_freq(sample_rate, max_sweep_rate, offset) + delta = 2 * math.pi * freq / sample_rate + cummulative = torch.cumsum(delta, dim=0) + signal = torch.sin(cummulative).unsqueeze(dim=0) + return signal + +def plot_sweep(waveform, sample_rate, title, max_sweep_rate=SWEEP_MAX_SAMPLE_RATE, offset=DEFAULT_OFFSET): + x_ticks = [100, 500, 1000, 5000, 10000, 20000, max_sweep_rate // 2] + y_ticks = [1000, 5000, 10000, 20000, sample_rate//2] + + time, freq = _get_freq_ticks(max_sweep_rate, offset, sample_rate // 2) + freq_x = [f if f in x_ticks and f <= max_sweep_rate // 2 else None for f in freq] + freq_y = [f for f in freq if f >= 1000 and f in y_ticks and f <= sample_rate // 2] + + figure, axis = plt.subplots(1, 1) + axis.specgram(waveform[0].numpy(), Fs=sample_rate) + plt.xticks(time, freq_x) + plt.yticks(freq_y, freq_y) + axis.set_xlabel('Original Signal Frequency (Hz, log scale)') + axis.set_ylabel('Waveform Frequency (Hz)') + axis.xaxis.grid(True, alpha=0.67) + axis.yaxis.grid(True, alpha=0.67) + figure.suptitle(f'{title} (sample rate: {sample_rate} Hz)') + plt.show(block=True) + +def play_audio(waveform, sample_rate): + waveform = waveform.numpy() + + num_channels, num_frames = waveform.shape + if num_channels == 1: + display(Audio(waveform[0], rate=sample_rate)) + elif num_channels == 2: + display(Audio((waveform[0], waveform[1]), rate=sample_rate)) + else: + raise ValueError("Waveform with more than 2 channels are not supported.") + +def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None): + waveform = waveform.numpy() + + num_channels, num_frames = waveform.shape + time_axis = torch.arange(0, num_frames) / sample_rate + + figure, axes = plt.subplots(num_channels, 1) + if num_channels == 1: + axes = [axes] + for c in range(num_channels): + axes[c].specgram(waveform[c], Fs=sample_rate) + if num_channels > 1: + axes[c].set_ylabel(f'Channel {c+1}') + if xlim: + axes[c].set_xlim(xlim) + figure.suptitle(title) + plt.show(block=False) -def 
plot_sweep( +def benchmark_resample( + method, waveform, sample_rate, - title, - max_sweep_rate=48000, - offset=DEFAULT_OFFSET, + resample_rate, + lowpass_filter_width=DEFAULT_LOWPASS_FILTER_WIDTH, + rolloff=DEFAULT_ROLLOFF, + resampling_method=DEFAULT_RESAMPLING_METHOD, + beta=None, + librosa_type=None, + iters=5 ): - x_ticks = [100, 500, 1000, 5000, 10000, 20000, max_sweep_rate // 2] - y_ticks = [1000, 5000, 10000, 20000, sample_rate // 2] - - time, freq = _get_freq_ticks(max_sweep_rate, offset, sample_rate // 2) - freq_x = [f if f in x_ticks and f <= max_sweep_rate // 2 else None for f in freq] - freq_y = [f for f in freq if f in y_ticks and 1000 <= f <= sample_rate // 2] - - figure, axis = plt.subplots(1, 1) - _, _, _, cax = axis.specgram(waveform[0].numpy(), Fs=sample_rate) - plt.xticks(time, freq_x) - plt.yticks(freq_y, freq_y) - axis.set_xlabel("Original Signal Frequency (Hz, log scale)") - axis.set_ylabel("Waveform Frequency (Hz)") - axis.xaxis.grid(True, alpha=0.67) - axis.yaxis.grid(True, alpha=0.67) - figure.suptitle(f"{title} (sample rate: {sample_rate} Hz)") - plt.colorbar(cax) - plt.show(block=True) - + if method == "functional": + begin = time.time() + for _ in range(iters): + F.resample(waveform, sample_rate, resample_rate, lowpass_filter_width=lowpass_filter_width, + rolloff=rolloff, resampling_method=resampling_method) + elapsed = time.time() - begin + return elapsed / iters + elif method == "transforms": + resampler = T.Resample(sample_rate, resample_rate, lowpass_filter_width=lowpass_filter_width, + rolloff=rolloff, resampling_method=resampling_method, dtype=waveform.dtype) + begin = time.time() + for _ in range(iters): + resampler(waveform) + elapsed = time.time() - begin + return elapsed / iters + elif method == "librosa": + waveform_np = waveform.squeeze().numpy() + begin = time.time() + for _ in range(iters): + librosa.resample(waveform_np, sample_rate, resample_rate, res_type=librosa_type) + elapsed = time.time() - begin + return elapsed / 
iters ###################################################################### -# Resampling Overview -# ------------------- -# # To resample an audio waveform from one freqeuncy to another, you can use -# :py:func:`torchaudio.transforms.Resample` or -# :py:func:`torchaudio.functional.resample`. -# ``transforms.Resample`` precomputes and caches the kernel used for resampling, -# while ``functional.resample`` computes it on the fly, so using -# ``torchaudio.transforms.Resample`` will result in a speedup when resampling +# ``transforms.Resample`` or ``functional.resample``. +# ``transforms.Resample`` precomputes and caches the kernel used for +# resampling, while ``functional.resample`` computes it on the fly, so +# using ``transforms.Resample`` will result in a speedup when resampling # multiple waveforms using the same parameters (see Benchmarking section). # # Both resampling methods use `bandlimited sinc # interpolation `__ to compute # signal values at arbitrary time steps. The implementation involves # convolution, so we can take advantage of GPU / multithreading for -# performance improvements. -# -# .. note:: -# -# When using resampling in multiple subprocesses, such as data loading -# with multiple worker processes, your application might create more -# threads than your system can handle efficiently. -# Setting ``torch.set_num_threads(1)`` might help in this case. +# performance improvements. When using resampling in multiple +# subprocesses, such as data loading with multiple worker processes, your +# application might create more threads than your system can handle +# efficiently. Setting ``torch.set_num_threads(1)`` might help in this +# case. 
# # Because a finite number of samples can only represent a finite number of # frequencies, resampling does not produce perfect results, and a variety @@ -150,24 +203,17 @@ def plot_sweep( # sample_rate = 48000 -waveform = get_sine_sweep(sample_rate) +resample_rate = 32000 +waveform = get_sine_sweep(sample_rate) plot_sweep(waveform, sample_rate, title="Original Waveform") -Audio(waveform.numpy()[0], rate=sample_rate) - -###################################################################### -# -# Now we resample (downsample) it. -# -# We see that in the spectrogram of the resampled waveform, there is an -# artifact, which was not present in the original waveform. +play_audio(waveform, sample_rate) -resample_rate = 32000 resampler = T.Resample(sample_rate, resample_rate, dtype=waveform.dtype) resampled_waveform = resampler(waveform) - plot_sweep(resampled_waveform, resample_rate, title="Resampled Waveform") -Audio(resampled_waveform.numpy()[0], rate=resample_rate) +play_audio(waveform, sample_rate) + ###################################################################### # Controling resampling quality with parameters @@ -185,18 +231,17 @@ def plot_sweep( # expensive. 
# + sample_rate = 48000 resample_rate = 32000 resampled_waveform = F.resample(waveform, sample_rate, resample_rate, lowpass_filter_width=6) plot_sweep(resampled_waveform, resample_rate, title="lowpass_filter_width=6") -###################################################################### -# - resampled_waveform = F.resample(waveform, sample_rate, resample_rate, lowpass_filter_width=128) plot_sweep(resampled_waveform, resample_rate, title="lowpass_filter_width=128") + ###################################################################### # Rolloff # ~~~~~~~ @@ -217,9 +262,6 @@ def plot_sweep( resampled_waveform = F.resample(waveform, sample_rate, resample_rate, rolloff=0.99) plot_sweep(resampled_waveform, resample_rate, title="rolloff=0.99") -###################################################################### -# - resampled_waveform = F.resample(waveform, sample_rate, resample_rate, rolloff=0.8) plot_sweep(resampled_waveform, resample_rate, title="rolloff=0.8") @@ -243,9 +285,6 @@ def plot_sweep( resampled_waveform = F.resample(waveform, sample_rate, resample_rate, resampling_method="sinc_interpolation") plot_sweep(resampled_waveform, resample_rate, title="Hann Window Default") -###################################################################### -# - resampled_waveform = F.resample(waveform, sample_rate, resample_rate, resampling_method="kaiser_window") plot_sweep(resampled_waveform, resample_rate, title="Kaiser Window Default") @@ -258,13 +297,11 @@ def plot_sweep( # that of librosa (resampy)’s kaiser window resampling, with some noise # + sample_rate = 48000 resample_rate = 32000 -###################################################################### -# kaiser_best -# ~~~~~~~~~~~ -# +### kaiser_best resampled_waveform = F.resample( waveform, sample_rate, @@ -272,28 +309,18 @@ def plot_sweep( lowpass_filter_width=64, rolloff=0.9475937167399596, resampling_method="kaiser_window", - beta=14.769656459379492, + beta=14.769656459379492 ) 
plot_sweep(resampled_waveform, resample_rate, title="Kaiser Window Best (torchaudio)") -###################################################################### -# - librosa_resampled_waveform = torch.from_numpy( - librosa.resample(waveform.squeeze().numpy(), orig_sr=sample_rate, target_sr=resample_rate, res_type="kaiser_best") -).unsqueeze(0) + librosa.resample(waveform.squeeze().numpy(), sample_rate, resample_rate, res_type='kaiser_best')).unsqueeze(0) plot_sweep(librosa_resampled_waveform, resample_rate, title="Kaiser Window Best (librosa)") -###################################################################### -# - mse = torch.square(resampled_waveform - librosa_resampled_waveform).mean().item() print("torchaudio and librosa kaiser best MSE:", mse) -###################################################################### -# kaiser_fast -# ~~~~~~~~~~~ -# +### kaiser_fast resampled_waveform = F.resample( waveform, sample_rate, @@ -301,24 +328,18 @@ def plot_sweep( lowpass_filter_width=16, rolloff=0.85, resampling_method="kaiser_window", - beta=8.555504641634386, + beta=8.555504641634386 ) -plot_sweep(resampled_waveform, resample_rate, title="Kaiser Window Fast (torchaudio)") - -###################################################################### -# +plot_specgram(resampled_waveform, resample_rate, title="Kaiser Window Fast (torchaudio)") librosa_resampled_waveform = torch.from_numpy( - librosa.resample(waveform.squeeze().numpy(), orig_sr=sample_rate, target_sr=resample_rate, res_type="kaiser_fast") -).unsqueeze(0) + librosa.resample(waveform.squeeze().numpy(), sample_rate, resample_rate, res_type='kaiser_fast')).unsqueeze(0) plot_sweep(librosa_resampled_waveform, resample_rate, title="Kaiser Window Fast (librosa)") -###################################################################### -# - mse = torch.square(resampled_waveform - librosa_resampled_waveform).mean().item() print("torchaudio and librosa kaiser fast MSE:", mse) + 
###################################################################### # Performance Benchmarking # ------------------------ @@ -342,57 +363,6 @@ def plot_sweep( # -def benchmark_resample( - method, - waveform, - sample_rate, - resample_rate, - lowpass_filter_width=6, - rolloff=0.99, - resampling_method="sinc_interpolation", - beta=None, - librosa_type=None, - iters=5, -): - if method == "functional": - begin = time.monotonic() - for _ in range(iters): - F.resample( - waveform, - sample_rate, - resample_rate, - lowpass_filter_width=lowpass_filter_width, - rolloff=rolloff, - resampling_method=resampling_method, - ) - elapsed = time.monotonic() - begin - return elapsed / iters - elif method == "transforms": - resampler = T.Resample( - sample_rate, - resample_rate, - lowpass_filter_width=lowpass_filter_width, - rolloff=rolloff, - resampling_method=resampling_method, - dtype=waveform.dtype, - ) - begin = time.monotonic() - for _ in range(iters): - resampler(waveform) - elapsed = time.monotonic() - begin - return elapsed / iters - elif method == "librosa": - waveform_np = waveform.squeeze().numpy() - begin = time.monotonic() - for _ in range(iters): - librosa.resample(waveform_np, orig_sr=sample_rate, target_sr=resample_rate, res_type=librosa_type) - elapsed = time.monotonic() - begin - return elapsed / iters - - -###################################################################### -# - configs = { "downsample (48 -> 44.1 kHz)": [48000, 44100], "downsample (16 -> 8 kHz)": [16000, 8000], @@ -401,76 +371,71 @@ def benchmark_resample( } for label in configs: - times, rows = [], [] - sample_rate = configs[label][0] - resample_rate = configs[label][1] - waveform = get_sine_sweep(sample_rate) - - # sinc 64 zero-crossings - f_time = benchmark_resample("functional", waveform, sample_rate, resample_rate, lowpass_filter_width=64) - t_time = benchmark_resample("transforms", waveform, sample_rate, resample_rate, lowpass_filter_width=64) - times.append([None, 1000 * f_time, 1000 * 
t_time]) - rows.append("sinc (width 64)") - - # sinc 6 zero-crossings - f_time = benchmark_resample("functional", waveform, sample_rate, resample_rate, lowpass_filter_width=16) - t_time = benchmark_resample("transforms", waveform, sample_rate, resample_rate, lowpass_filter_width=16) - times.append([None, 1000 * f_time, 1000 * t_time]) - rows.append("sinc (width 16)") - - # kaiser best - lib_time = benchmark_resample("librosa", waveform, sample_rate, resample_rate, librosa_type="kaiser_best") - f_time = benchmark_resample( - "functional", - waveform, - sample_rate, - resample_rate, - lowpass_filter_width=64, - rolloff=0.9475937167399596, - resampling_method="kaiser_window", - beta=14.769656459379492, - ) - t_time = benchmark_resample( - "transforms", - waveform, - sample_rate, - resample_rate, - lowpass_filter_width=64, - rolloff=0.9475937167399596, - resampling_method="kaiser_window", - beta=14.769656459379492, - ) - times.append([1000 * lib_time, 1000 * f_time, 1000 * t_time]) - rows.append("kaiser_best") - - # kaiser fast - lib_time = benchmark_resample("librosa", waveform, sample_rate, resample_rate, librosa_type="kaiser_fast") - f_time = benchmark_resample( - "functional", - waveform, - sample_rate, - resample_rate, - lowpass_filter_width=16, - rolloff=0.85, - resampling_method="kaiser_window", - beta=8.555504641634386, - ) - t_time = benchmark_resample( - "transforms", - waveform, - sample_rate, - resample_rate, - lowpass_filter_width=16, - rolloff=0.85, - resampling_method="kaiser_window", - beta=8.555504641634386, - ) - times.append([1000 * lib_time, 1000 * f_time, 1000 * t_time]) - rows.append("kaiser_fast") - - df = pd.DataFrame(times, columns=["librosa", "functional", "transforms"], index=rows) - df.columns = pd.MultiIndex.from_product([[f"{label} time (ms)"], df.columns]) - - print(f"torchaudio: {torchaudio.__version__}") - print(f"librosa: {librosa.__version__}") - display(df.round(2)) + times, rows = [], [] + sample_rate = configs[label][0] + 
resample_rate = configs[label][1] + waveform = get_sine_sweep(sample_rate) + + # sinc 64 zero-crossings + f_time = benchmark_resample("functional", waveform, sample_rate, resample_rate, lowpass_filter_width=64) + t_time = benchmark_resample("transforms", waveform, sample_rate, resample_rate, lowpass_filter_width=64) + times.append([None, 1000 * f_time, 1000 * t_time]) + rows.append(f"sinc (width 64)") + + # sinc 6 zero-crossings + f_time = benchmark_resample("functional", waveform, sample_rate, resample_rate, lowpass_filter_width=16) + t_time = benchmark_resample("transforms", waveform, sample_rate, resample_rate, lowpass_filter_width=16) + times.append([None, 1000 * f_time, 1000 * t_time]) + rows.append(f"sinc (width 16)") + + # kaiser best + lib_time = benchmark_resample("librosa", waveform, sample_rate, resample_rate, librosa_type="kaiser_best") + f_time = benchmark_resample( + "functional", + waveform, + sample_rate, + resample_rate, + lowpass_filter_width=64, + rolloff=0.9475937167399596, + resampling_method="kaiser_window", + beta=14.769656459379492) + t_time = benchmark_resample( + "transforms", + waveform, + sample_rate, + resample_rate, + lowpass_filter_width=64, + rolloff=0.9475937167399596, + resampling_method="kaiser_window", + beta=14.769656459379492) + times.append([1000 * lib_time, 1000 * f_time, 1000 * t_time]) + rows.append(f"kaiser_best") + + # kaiser fast + lib_time = benchmark_resample("librosa", waveform, sample_rate, resample_rate, librosa_type="kaiser_fast") + f_time = benchmark_resample( + "functional", + waveform, + sample_rate, + resample_rate, + lowpass_filter_width=16, + rolloff=0.85, + resampling_method="kaiser_window", + beta=8.555504641634386) + t_time = benchmark_resample( + "transforms", + waveform, + sample_rate, + resample_rate, + lowpass_filter_width=16, + rolloff=0.85, + resampling_method="kaiser_window", + beta=8.555504641634386) + times.append([1000 * lib_time, 1000 * f_time, 1000 * t_time]) + rows.append(f"kaiser_fast") + + 
df = pd.DataFrame(times, + columns=["librosa", "functional", "transforms"], + index=rows) + df.columns = pd.MultiIndex.from_product([[f"{label} time (ms)"],df.columns]) + display(df.round(2))