Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/volume perturbation #382

Merged
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 30 additions & 1 deletion lhotse/audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
import numpy as np
from tqdm.auto import tqdm

from lhotse.augmentation import AudioTransform, Resample, Speed, Tempo
from lhotse.augmentation import AudioTransform, Resample, Speed, Tempo, Vol
from lhotse.serialization import Serializable
from lhotse.utils import (Decibels, NonPositiveEnergyError, Pathlike, Seconds, SetContainingAnything, SmartOpen,
asdict_nonull, compute_num_samples, exactly_one_not_null, fastcopy, ifnone,
Expand Down Expand Up @@ -403,6 +403,23 @@ def perturb_tempo(self, factor: float, affix_id: bool = True) -> 'Recording':
duration=new_duration,
transforms=transforms
)

def perturb_vol(self, factor: float, affix_id: bool = True) -> 'Recording':
    """
    Return a new ``Recording`` that will lazily perturb the volume while loading audio.

    No audio is touched here: a serialized ``Vol`` transform is appended to the
    recording's list of transforms.

    :param factor: The volume scale to be applied (e.g. factor=1.1 means 1.1x louder).
    :param affix_id: When true, we will modify the ``Recording.id`` field
        by affixing it with "_vp{factor}".
    :return: a modified copy of the current ``Recording``.
    """
    # Work on a copy of the transform list so the current Recording is left untouched.
    transforms = self.transforms.copy() if self.transforms is not None else []
    transforms.append(Vol(factor=factor).to_dict())
    return fastcopy(
        self,
        id=f'{self.id}_vp{factor}' if affix_id else self.id,
        transforms=transforms
    )

def resample(self, sampling_rate: int) -> 'Recording':
"""
Expand Down Expand Up @@ -490,6 +507,7 @@ class RecordingSet(Serializable, Sequence[Recording]):
and executed upon reading the audio::

>>> recs_sp = recs.perturb_speed(factor=1.1)
>>> recs_vp = recs.perturb_vol(factor=2.)
>>> recs_24k = recs.resample(24000)
"""

Expand Down Expand Up @@ -688,6 +706,17 @@ def perturb_tempo(self, factor: float, affix_id: bool = True) -> 'RecordingSet':
:return: a ``RecordingSet`` containing the perturbed ``Recording`` objects.
"""
return RecordingSet.from_recordings(r.perturb_tempo(factor=factor, affix_id=affix_id) for r in self)

def perturb_vol(self, factor: float, affix_id: bool = True) -> 'RecordingSet':
    """
    Return a new ``RecordingSet`` that will lazily perturb the volume while loading audio.

    Each contained ``Recording`` is replaced by the result of its own ``perturb_vol``.

    :param factor: The volume scale to be applied (e.g. factor=1.1 means 1.1x louder).
    :param affix_id: When true, we will modify the ``Recording.id`` field
        by affixing it with "_vp{factor}".
    :return: a ``RecordingSet`` containing the perturbed ``Recording`` objects.
    """
    return RecordingSet.from_recordings(
        r.perturb_vol(factor=factor, affix_id=affix_id) for r in self
    )

def resample(self, sampling_rate: int) -> 'RecordingSet':
"""
Expand Down
37 changes: 37 additions & 0 deletions lhotse/augmentation/torchaudio.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ class SoxEffectTransform:
>>> augment_fn = SoxEffectTransform(effects=[
>>> ['reverb', 50, 50, RandomValue(0, 100)],
>>> ['speed', RandomValue(0.9, 1.1)],
>>> ['vol', RandomValue(0.125, 2.)],
>>> ['rate', 16000],
>>> ])
>>> augmented = augment_fn(audio, 16000)
Expand Down Expand Up @@ -294,6 +295,38 @@ def reverse_timestamps(
start_sample / sampling_rate,
num_samples / sampling_rate if num_samples is not None else None,
)


@dataclass
class Vol(AudioTransform):
    """
    Volume perturbation effect, the same one as invoked with `sox vol` in the command line.

    It changes the amplitude of the original samples, so the absolute values of output
    samples will be smaller or greater, depending on the vol factor.
    """
    factor: float

    def __call__(self, samples: np.ndarray, sampling_rate: int) -> np.ndarray:
        """
        Apply the ``vol`` sox effect to ``samples`` and return the result as a numpy array.

        :param samples: the audio samples; a numpy array is converted to a torch tensor first.
        :param sampling_rate: the sampling rate of ``samples`` (coerced to ``int``).
        :return: the volume-perturbed samples.
        """
        rate = int(sampling_rate)  # paranoia mode
        sox_effects = [['vol', str(self.factor)]]
        tensor = torch.from_numpy(samples) if isinstance(samples, np.ndarray) else samples
        # The second returned value is discarded; only the processed tensor is needed.
        perturbed, _ = torchaudio.sox_effects.apply_effects_tensor(tensor, rate, sox_effects)
        return perturbed.numpy()

    def reverse_timestamps(
            self,
            offset: Seconds,
            duration: Optional[Seconds],
            sampling_rate: Optional[int]  # Not used, made for compatibility purposes
    ) -> Tuple[Seconds, Optional[Seconds]]:
        """
        Return the original ``offset`` and ``duration`` unchanged, as volume
        perturbation doesn't change any of these audio properties.
        """
        return offset, duration


def speed(sampling_rate: int) -> List[List[str]]:
Expand All @@ -310,6 +343,10 @@ def reverb(sampling_rate: int) -> List[List[str]]:
]


def vol(sampling_rate: int) -> List[List[str]]:
    """
    Return a sox effect chain with a single ``vol`` effect whose gain is a
    ``RandomValue(0.125, 2.)``.

    ``sampling_rate`` is not used by this function.
    """
    vol_effect = ['vol', RandomValue(0.125, 2.)]
    return [vol_effect]


def pitch(sampling_rate: int) -> List[List[str]]:
return [
# The returned values are 1/100ths of a semitone, meaning the default is up to a minor third shift up or down.
Expand Down
88 changes: 87 additions & 1 deletion lhotse/cut.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ class Cut:
>>> cut_append = cut.append(other_cut)
>>> cut_24k = cut.resample(24000)
>>> cut_sp = cut.perturb_speed(1.1)
>>> cut_vp = cut.perturb_vol(2.)

.. note::
All cut transformations are performed lazily, on-the-fly, upon calling ``load_audio`` or ``load_features``.
Expand Down Expand Up @@ -184,6 +185,7 @@ class Cut:
resample: Callable
perturb_speed: Callable
perturb_tempo: Callable
perturb_vol: Callable
map_supervisions: Callable
filter_supervisions: Callable
with_features_path_prefix: Callable
Expand Down Expand Up @@ -932,6 +934,36 @@ def perturb_tempo(self, factor: float, affix_id: bool = True) -> 'MonoCut':
duration=new_duration,
start=new_start
)

def perturb_vol(self, factor: float, affix_id: bool = True) -> 'MonoCut':
    """
    Return a new ``MonoCut`` that will lazily perturb the volume while loading audio.

    The current cut is not modified: when it references pre-computed features,
    they are detached only on the returned copy (a warning is logged).

    :param factor: The volume will be adjusted this many times (e.g. factor=1.1 means 1.1x louder).
    :param affix_id: When true, we will modify the ``MonoCut.id`` field
        by affixing it with "_vp{factor}".
    :return: a modified copy of the current ``MonoCut``.
    """
    # Pre-conditions
    assert self.has_recording, 'Cannot perturb volume on a MonoCut without Recording.'
    features = self.features
    if self.has_features:
        logging.warning(
            'Attempting to perturb volume on a MonoCut that references pre-computed features. '
            'The feature manifest will be detached, as we do not support feature-domain '
            'volume perturbation.'
        )
        # Detach on the copy only; assigning to ``self.features`` would silently
        # strip the features from the caller's original cut.
        features = None
    # Actual audio perturbation.
    recording_vp = self.recording.perturb_vol(factor=factor, affix_id=affix_id)
    # Match the supervision's id (and its underlying recording id).
    supervisions_vp = [s.perturb_vol(factor=factor, affix_id=affix_id) for s in self.supervisions]

    return fastcopy(
        self,
        id=f'{self.id}_vp{factor}' if affix_id else self.id,
        recording=recording_vp,
        supervisions=supervisions_vp,
        features=features
    )

def map_supervisions(self, transform_fn: Callable[[SupervisionSegment], SupervisionSegment]) -> Cut:
"""
Expand Down Expand Up @@ -1197,6 +1229,19 @@ def perturb_tempo(self, factor: float, affix_id: bool = True) -> 'PaddingCut':
num_features=new_num_features,
frame_shift=new_frame_shift
)

def perturb_vol(self, factor: float, affix_id: bool = True) -> 'PaddingCut':
    """
    Return a copy of this ``PaddingCut`` that "mimics" volume perturbation.

    Only the ID can change; every other field is carried over as-is.

    :param factor: The vol will be adjusted this many times (e.g. factor=1.1 means 1.1x louder).
    :param affix_id: When true, we will modify the ``PaddingCut.id`` field
        by affixing it with "_vp{factor}".
    :return: a modified copy of the current ``PaddingCut``.
    """
    if affix_id:
        new_id = f'{self.id}_vp{factor}'
    else:
        new_id = self.id
    return fastcopy(self, id=new_id)

def drop_features(self) -> 'PaddingCut':
"""Return a copy of the current :class:`.PaddingCut`, detached from ``features``."""
Expand Down Expand Up @@ -1582,6 +1627,32 @@ def perturb_tempo(self, factor: float, affix_id: bool = True) -> 'MixedCut':
for track in self.tracks
]
)

def perturb_vol(self, factor: float, affix_id: bool = True) -> 'MixedCut':
    """
    Return a new ``MixedCut`` that will lazily perturb the volume while loading audio.
    Recordings of the underlying Cuts are updated to reflect volume change.

    The perturbation is delegated to every track's cut; the result is a freshly
    built ``MixedCut`` wrapping copies of the tracks.

    :param factor: The volume will be adjusted this many times (e.g. factor=1.1 means 1.1x louder).
    :param affix_id: When true, we will modify the ``MixedCut.id`` field
        by affixing it with "_vp{factor}".
    :return: a modified copy of the current ``MixedCut``.
    """
    # Pre-conditions
    assert self.has_recording, 'Cannot perturb volume on a MixedCut without Recording.'
    if self.has_features:
        logging.warning(
            'Attempting to perturb volume on a MixedCut that references pre-computed features. '
            'The feature manifest(s) will be detached, as we do not support feature-domain '
            'volume perturbation.'
        )
    return MixedCut(
        id=f'{self.id}_vp{factor}' if affix_id else self.id,
        tracks=[
            fastcopy(track, cut=track.cut.perturb_vol(factor=factor, affix_id=affix_id))
            for track in self.tracks
        ]
    )

def load_features(self, mixed: bool = True) -> Optional[np.ndarray]:
"""
Expand Down Expand Up @@ -1973,11 +2044,12 @@ class CutSet(Serializable, Sequence[Cut]):
and executed upon reading the audio::

>>> cuts_sp = cuts.perturb_speed(factor=1.1)
>>> cuts_vp = cuts.perturb_vol(factor=2.)
>>> cuts_24k = cuts.resample(24000)

.. caution::
If the :class:`.CutSet` contained :class:`~lhotse.features.base.Features` manifests, they will be
detached after performing audio augmentations such as :meth:`.CutSet.perturb_speed` or :meth:`.CutSet.resample`.
detached after performing audio augmentations such as :meth:`.CutSet.perturb_speed`, :meth:`.CutSet.resample`, or :meth:`.CutSet.perturb_vol`.

:class:`~lhotse.cut.CutSet` offers parallel feature extraction capabilities
(see :meth:`.CutSet.compute_and_store_features` for details),
Expand Down Expand Up @@ -2614,6 +2686,20 @@ def perturb_tempo(self, factor: float, affix_id: bool = True) -> 'CutSet':
:return: a modified copy of the ``CutSet``.
"""
return self.map(lambda cut: cut.perturb_tempo(factor=factor, affix_id=affix_id))

def perturb_vol(self, factor: float, affix_id: bool = True) -> 'CutSet':
    """
    Return a new :class:`~lhotse.cut.CutSet` whose cuts are volume perturbed
    with a factor of ``factor``. It requires the recording manifests to be present.
    If the feature manifests are attached, they are dropped.
    The supervisions are kept, except that their IDs are affixed when ``affix_id`` is set.

    :param factor: The resulting playback volume is ``factor`` times the original one.
    :param affix_id: Should we modify the ID (useful if both versions of the same
        cut are going to be present in a single manifest).
    :return: a modified copy of the ``CutSet``.
    """
    def _perturb(cut):
        # Delegate to the cut's own implementation of volume perturbation.
        return cut.perturb_vol(factor=factor, affix_id=affix_id)

    return self.map(_perturb)

def mix(
self,
Expand Down
4 changes: 3 additions & 1 deletion lhotse/dataset/cut_transforms/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,12 @@
from .extra_padding import ExtraPadding
from .mix import CutMix
from .perturb_speed import PerturbSpeed
from .perturb_vol import PerturbVol

__all__ = [
'CutConcatenate',
'CutMix',
'ExtraPadding',
'PerturbSpeed'
'PerturbSpeed',
'PerturbVol'
]
34 changes: 34 additions & 0 deletions lhotse/dataset/cut_transforms/perturb_vol.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import random
from typing import List, Sequence, Union

from lhotse import CutSet


class PerturbVol:
    """
    A transform on batch of cuts (``CutSet``) that perturbs the volume of the recordings
    with a given probability :attr:`p`.

    If the effect is applied, then one of the perturbation factors from the constructor's
    :attr:`factors` parameter is sampled with uniform probability.
    """

    def __init__(
            self,
            factors: Union[float, Sequence[float]],
            p: float,
            randgen: random.Random = None
    ) -> None:
        """
        :param factors: a single volume factor, or a sequence of factors to sample from.
        :param p: the probability of perturbing each cut.
        :param randgen: an optional random generator; when ``None``, the global
            ``random`` module is used starting from the first call.
        """
        # A lone factor is wrapped in a list so that ``choice`` always gets a sequence.
        self.factors = factors if isinstance(factors, Sequence) else [factors]
        self.p = p
        self.random = randgen

    def _should_perturb(self) -> bool:
        """Draw one number from the generator and tell whether the effect applies."""
        # ``random()`` is in [0, 1), so p=1 always perturbs and p=0 never does.
        return self.random.random() < self.p

    def __call__(self, cuts: 'CutSet') -> 'CutSet':
        """
        Return a ``CutSet`` in which each cut is volume-perturbed with probability
        :attr:`p` and passed through unchanged otherwise.
        """
        if self.random is None:
            self.random = random
        return CutSet.from_cuts(
            cut.perturb_vol(factor=self.random.choice(self.factors))
            if self._should_perturb()
            else cut
            for cut in cuts
        )
24 changes: 22 additions & 2 deletions lhotse/supervision.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@ def perturb_speed(
and duration (going through the sample counts).
:param affix_id: When true, we will modify the ``id`` and ``recording_id`` fields
by affixing it with "_sp{factor}".
:return: a modified copy of the current ``Recording``.
:return: a modified copy of the current ``SupervisionSegment``.
"""
start_sample = compute_num_samples(self.start, sampling_rate)
num_samples = compute_num_samples(self.duration, sampling_rate)
Expand Down Expand Up @@ -230,7 +230,7 @@ def perturb_tempo(
and duration (going through the sample counts).
:param affix_id: When true, we will modify the ``id`` and ``recording_id`` fields
by affixing it with "_tp{factor}".
:return: a modified copy of the current ``Recording``.
:return: a modified copy of the current ``SupervisionSegment``.
"""

# speed and tempo perturbation have the same effect on supervisions
Expand All @@ -240,6 +240,26 @@ def perturb_tempo(
id=f'{self.id}_tp{factor}' if affix_id else self.id,
recording_id=f'{self.recording_id}_tp{factor}' if affix_id else self.id,
)

def perturb_vol(
        self,
        factor: float,
        affix_id: bool = True
) -> 'SupervisionSegment':
    """
    Return a ``SupervisionSegment`` with modified ids.

    Only ``id`` and ``recording_id`` can change; with ``affix_id=False``
    both are carried over unchanged.

    :param factor: The volume will be adjusted this many times (e.g. factor=1.1 means 1.1x louder).
    :param affix_id: When true, we will modify the ``id`` and ``recording_id`` fields
        by affixing it with "_vp{factor}".
    :return: a modified copy of the current ``SupervisionSegment``.
    """
    return fastcopy(
        self,
        id=f'{self.id}_vp{factor}' if affix_id else self.id,
        # Fall back to the recording's own id, not the supervision id.
        recording_id=f'{self.recording_id}_vp{factor}' if affix_id else self.recording_id
    )

def trim(self, end: Seconds, start: Seconds = 0) -> 'SupervisionSegment':
"""
Expand Down
Loading