Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/volume perturbation #382

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 30 additions & 1 deletion lhotse/audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
import numpy as np
from tqdm.auto import tqdm

from lhotse.augmentation import AudioTransform, Resample, Speed, Tempo
from lhotse.augmentation import AudioTransform, Resample, Speed, Tempo, Volume
from lhotse.serialization import Serializable
from lhotse.utils import (Decibels, NonPositiveEnergyError, Pathlike, Seconds, SetContainingAnything, SmartOpen,
asdict_nonull, compute_num_samples, exactly_one_not_null, fastcopy, ifnone,
Expand Down Expand Up @@ -404,6 +404,23 @@ def perturb_tempo(self, factor: float, affix_id: bool = True) -> 'Recording':
transforms=transforms
)

def perturb_volume(self, factor: float, affix_id: bool = True) -> 'Recording':
    """
    Return a new ``Recording`` that will lazily perturb the volume while loading audio.

    This method only appends a serialized ``Volume`` transform to the copy's
    ``transforms`` list; no audio is read or modified here.

    :param factor: The volume scale to be applied (e.g. factor=1.1 means 1.1x louder).
    :param affix_id: When true, we will modify the ``Recording.id`` field
        by affixing it with "_vp{factor}".
    :return: a modified copy of the current ``Recording``.
    """
    # Work on a copy of the list so the current Recording's transforms stay untouched.
    transforms = self.transforms.copy() if self.transforms is not None else []
    transforms.append(Volume(factor=factor).to_dict())
    return fastcopy(
        self,
        id=f'{self.id}_vp{factor}' if affix_id else self.id,
        transforms=transforms
    )

def resample(self, sampling_rate: int) -> 'Recording':
"""
Return a new ``Recording`` that will be lazily resampled while loading audio.
Expand Down Expand Up @@ -490,6 +507,7 @@ class RecordingSet(Serializable, Sequence[Recording]):
and executed upon reading the audio::

>>> recs_sp = recs.perturb_speed(factor=1.1)
>>> recs_vp = recs.perturb_volume(factor=2.)
>>> recs_24k = recs.resample(24000)
"""

Expand Down Expand Up @@ -689,6 +707,17 @@ def perturb_tempo(self, factor: float, affix_id: bool = True) -> 'RecordingSet':
"""
return RecordingSet.from_recordings(r.perturb_tempo(factor=factor, affix_id=affix_id) for r in self)

def perturb_volume(self, factor: float, affix_id: bool = True) -> 'RecordingSet':
    """
    Return a new ``RecordingSet`` that will lazily perturb the volume while loading audio.

    Every recording in this set is replaced by the result of its own
    ``perturb_volume`` call with the same arguments.

    :param factor: The volume scale to be applied (e.g. factor=1.1 means 1.1x louder).
    :param affix_id: When true, we will modify the ``Recording.id`` field
        by affixing it with "_vp{factor}".
    :return: a ``RecordingSet`` containing the perturbed ``Recording`` objects.
    """
    return RecordingSet.from_recordings(
        r.perturb_volume(factor=factor, affix_id=affix_id) for r in self
    )

def resample(self, sampling_rate: int) -> 'RecordingSet':
"""
Apply resampling to all recordings in the ``RecordingSet`` and return a new ``RecordingSet``.
Expand Down
37 changes: 37 additions & 0 deletions lhotse/augmentation/torchaudio.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ class SoxEffectTransform:
>>> augment_fn = SoxEffectTransform(effects=[
>>> ['reverb', 50, 50, RandomValue(0, 100)],
>>> ['speed', RandomValue(0.9, 1.1)],
>>> ['vol', RandomValue(0.125, 2.)],
>>> ['rate', 16000],
>>> ])
>>> augmented = augment_fn(audio, 16000)
Expand Down Expand Up @@ -296,6 +297,38 @@ def reverse_timestamps(
)


@dataclass
class Volume(AudioTransform):
    """
    Volume perturbation effect, the same one as invoked with `sox vol` in the command line.

    The amplitude of the input samples is scaled by ``factor``, so the absolute values
    of the output samples become smaller or greater depending on that factor.
    """
    # Scale passed (as a string) to the SoX ``vol`` effect.
    factor: float

    def __call__(self, samples: np.ndarray, sampling_rate: int) -> np.ndarray:
        """
        Apply the ``vol`` effect to ``samples`` and return the result as a numpy array.

        :param samples: audio samples; a numpy array is converted to a torch tensor first,
            any other input is passed to torchaudio unchanged.
        :param sampling_rate: sampling rate of ``samples``; coerced to ``int``.
        :return: the perturbed samples.
        """
        rate = int(sampling_rate)  # paranoia mode
        sox_chain = [['vol', str(self.factor)]]
        tensor = torch.from_numpy(samples) if isinstance(samples, np.ndarray) else samples
        # The second returned value (the output sampling rate) is not needed here.
        perturbed, _ = torchaudio.sox_effects.apply_effects_tensor(tensor, rate, sox_chain)
        return perturbed.numpy()

    def reverse_timestamps(
            self,
            offset: Seconds,
            duration: Optional[Seconds],
            sampling_rate: Optional[int]  # Not used, made for compatibility purposes
    ) -> Tuple[Seconds, Optional[Seconds]]:
        """
        Return ``offset`` and ``duration`` unchanged, because volume perturbation
        does not change any of these audio properties.
        """
        return offset, duration


def speed(sampling_rate: int) -> List[List[str]]:
return [
['speed', RandomValue(0.9, 1.1)],
Expand All @@ -310,6 +343,10 @@ def reverb(sampling_rate: int) -> List[List[str]]:
]


def volume(sampling_rate: int) -> List[List[str]]:
    """
    Return a SoX effect chain holding a single ``vol`` effect.

    The effect's argument is ``RandomValue(0.125, 2.)``; presumably it is resolved to a
    concrete number when the chain is applied -- confirm against ``RandomValue``.
    ``sampling_rate`` is not used by this function.
    """
    return [
        ['vol', RandomValue(0.125, 2.)],
    ]


def pitch(sampling_rate: int) -> List[List[str]]:
return [
# The returned values are 1/100ths of a semitone, meaning the default is up to a minor third shift up or down.
Expand Down
88 changes: 87 additions & 1 deletion lhotse/cut.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ class Cut:
>>> cut_append = cut.append(other_cut)
>>> cut_24k = cut.resample(24000)
>>> cut_sp = cut.perturb_speed(1.1)
>>> cut_vp = cut.perturb_volume(2.)

.. note::
All cut transformations are performed lazily, on-the-fly, upon calling ``load_audio`` or ``load_features``.
Expand Down Expand Up @@ -184,6 +185,7 @@ class Cut:
resample: Callable
perturb_speed: Callable
perturb_tempo: Callable
perturb_volume: Callable
map_supervisions: Callable
filter_supervisions: Callable
with_features_path_prefix: Callable
Expand Down Expand Up @@ -933,6 +935,36 @@ def perturb_tempo(self, factor: float, affix_id: bool = True) -> 'MonoCut':
start=new_start
)

def perturb_volume(self, factor: float, affix_id: bool = True) -> 'MonoCut':
    """
    Return a new ``MonoCut`` that will lazily perturb the volume while loading audio.

    The current cut is left unmodified: if it references pre-computed features,
    they are detached only on the returned copy (with a warning), since
    feature-domain volume perturbation is not supported.

    :param factor: The volume will be adjusted this many times (e.g. factor=1.1 means 1.1x louder).
    :param affix_id: When true, we will modify the ``MonoCut.id`` field
        by affixing it with "_vp{factor}".
    :return: a modified copy of the current ``MonoCut``.
    :raises AssertionError: when the cut has no recording attached.
    """
    # Pre-conditions
    assert self.has_recording, 'Cannot perturb volume on a MonoCut without Recording.'
    if self.has_features:
        logging.warning(
            'Attempting to perturb volume on a MonoCut that references pre-computed features. '
            'The feature manifest will be detached, as we do not support feature-domain '
            'volume perturbation.'
        )
    # Actual audio perturbation.
    recording_vp = self.recording.perturb_volume(factor=factor, affix_id=affix_id)
    # Match the supervision's id (and its underlying recording id).
    supervisions_vp = [s.perturb_volume(factor=factor, affix_id=affix_id) for s in self.supervisions]

    return fastcopy(
        self,
        id=f'{self.id}_vp{factor}' if affix_id else self.id,
        recording=recording_vp,
        supervisions=supervisions_vp,
        # Detach the features on the copy only, instead of mutating ``self``.
        features=None
    )

def map_supervisions(self, transform_fn: Callable[[SupervisionSegment], SupervisionSegment]) -> Cut:
"""
Modify the SupervisionSegments by `transform_fn` of this MonoCut.
Expand Down Expand Up @@ -1198,6 +1230,19 @@ def perturb_tempo(self, factor: float, affix_id: bool = True) -> 'PaddingCut':
frame_shift=new_frame_shift
)

def perturb_volume(self, factor: float, affix_id: bool = True) -> 'PaddingCut':
    """
    Return a new ``PaddingCut`` that "mimics" the effect of volume perturbation.

    Only the id can change; every other field is copied as is.

    :param factor: The volume will be adjusted this many times (e.g. factor=1.1 means 1.1x louder).
        Here it is only used to build the id suffix.
    :param affix_id: When true, we will modify the ``PaddingCut.id`` field
        by affixing it with "_vp{factor}".
    :return: a modified copy of the current ``PaddingCut``.
    """
    if affix_id:
        new_id = f'{self.id}_vp{factor}'
    else:
        new_id = self.id
    return fastcopy(self, id=new_id)

def drop_features(self) -> 'PaddingCut':
"""Return a copy of the current :class:`.PaddingCut`, detached from ``features``."""
assert self.has_recording, f"Cannot detach features from a MonoCut with no Recording (cut ID = {self.id})."
Expand Down Expand Up @@ -1583,6 +1628,32 @@ def perturb_tempo(self, factor: float, affix_id: bool = True) -> 'MixedCut':
]
)

def perturb_volume(self, factor: float, affix_id: bool = True) -> 'MixedCut':
    """
    Return a new ``MixedCut`` that will lazily perturb the volume while loading audio.
    Recordings of the underlying Cuts are updated to reflect volume change.

    Each track is copied with its cut replaced by the result of that cut's own
    ``perturb_volume`` call, using the same ``factor`` and ``affix_id``.

    :param factor: The volume will be adjusted this many times (e.g. factor=1.1 means 1.1x louder).
    :param affix_id: When true, we will modify the ``MixedCut.id`` field
        by affixing it with "_vp{factor}".
    :return: a modified copy of the current ``MixedCut``.
    :raises AssertionError: when the cut has no recording attached.
    """
    # Pre-conditions
    assert self.has_recording, 'Cannot perturb volume on a MixedCut without Recording.'
    if self.has_features:
        logging.warning(
            'Attempting to perturb volume on a MixedCut that references pre-computed features. '
            'The feature manifest(s) will be detached, as we do not support feature-domain '
            'volume perturbation.'
        )
    perturbed_tracks = [
        fastcopy(track, cut=track.cut.perturb_volume(factor=factor, affix_id=affix_id))
        for track in self.tracks
    ]
    return MixedCut(
        id=f'{self.id}_vp{factor}' if affix_id else self.id,
        tracks=perturbed_tracks
    )

def load_features(self, mixed: bool = True) -> Optional[np.ndarray]:
"""
Loads the features of the source cuts and mixes them on-the-fly.
Expand Down Expand Up @@ -1973,11 +2044,12 @@ class CutSet(Serializable, Sequence[Cut]):
and executed upon reading the audio::

>>> cuts_sp = cuts.perturb_speed(factor=1.1)
>>> cuts_vp = cuts.perturb_volume(factor=2.)
>>> cuts_24k = cuts.resample(24000)

.. caution::
If the :class:`.CutSet` contained :class:`~lhotse.features.base.Features` manifests, they will be
detached after performing audio augmentations such as :meth:`.CutSet.perturb_speed` or :meth:`.CutSet.resample`.
detached after performing audio augmentations such as :meth:`.CutSet.perturb_speed` or :meth:`.CutSet.resample` or :meth:`.CutSet.perturb_volume`.

:class:`~lhotse.cut.CutSet` offers parallel feature extraction capabilities
(see `meth`:.CutSet.compute_and_store_features: for details),
Expand Down Expand Up @@ -2615,6 +2687,20 @@ def perturb_tempo(self, factor: float, affix_id: bool = True) -> 'CutSet':
"""
return self.map(lambda cut: cut.perturb_tempo(factor=factor, affix_id=affix_id))

def perturb_volume(self, factor: float, affix_id: bool = True) -> 'CutSet':
    """
    Return a new :class:`~lhotse.cut.CutSet` whose cuts are volume perturbed
    with a factor of ``factor``.

    The work is delegated to each cut's own ``perturb_volume`` method through
    :meth:`.CutSet.map`.

    :param factor: The volume scale forwarded to every cut.
    :param affix_id: Should we modify the ID (useful if both versions of the same
        cut are going to be present in a single manifest).
    :return: a modified copy of the ``CutSet``.
    """
    def _perturb_single(cut):
        return cut.perturb_volume(factor=factor, affix_id=affix_id)

    return self.map(_perturb_single)

def mix(
self,
cuts: 'CutSet',
Expand Down
4 changes: 3 additions & 1 deletion lhotse/dataset/cut_transforms/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,12 @@
from .extra_padding import ExtraPadding
from .mix import CutMix
from .perturb_speed import PerturbSpeed
from .perturb_volume import PerturbVolume

__all__ = [
'CutConcatenate',
'CutMix',
'ExtraPadding',
'PerturbSpeed'
'PerturbSpeed',
'PerturbVolume'
]
2 changes: 1 addition & 1 deletion lhotse/dataset/cut_transforms/perturb_speed.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ class PerturbSpeed:

def __init__(
self,
factors: Union[float, List[float]],
factors: Union[float, Sequence[float]],
p: float,
randgen: random.Random = None
) -> None:
Expand Down
34 changes: 34 additions & 0 deletions lhotse/dataset/cut_transforms/perturb_volume.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import random
from typing import List, Sequence, Union

from lhotse import CutSet


class PerturbVolume:
    """
    A transform on batch of cuts (``CutSet``) that perturbs the volume of the recordings
    with a given probability :attr:`p`.

    If the effect is applied, then one of the perturbation factors from the constructor's
    :attr:`factors` parameter is sampled with uniform probability.
    """

    def __init__(
            self,
            factors: Union[float, Sequence[float]],
            p: float,
            randgen: random.Random = None
    ) -> None:
        """
        :param factors: a single volume factor or a sequence of factors to choose from.
        :param p: probability of perturbing each individual cut.
        :param randgen: optional random generator; when ``None``, the global ``random``
            module is used (it is bound lazily on the first call).
        """
        # A lone factor is wrapped in a list so that ``choice`` can always be used.
        self.factors = factors if isinstance(factors, Sequence) else [factors]
        self.p = p
        self.random = randgen

    def __call__(self, cuts: CutSet) -> CutSet:
        """
        Return a ``CutSet`` in which each cut is volume perturbed with probability ``p``
        and passed through unchanged otherwise.
        """
        if self.random is None:
            self.random = random
        return CutSet.from_cuts(
            cut.perturb_volume(factor=self.random.choice(self.factors))
            # ``random()`` is in [0, 1), so ``< p`` is true with probability ``p``:
            # p=0 never perturbs, p=1 always does.
            if self.random.random() < self.p
            else cut
            for cut in cuts
        )
28 changes: 24 additions & 4 deletions lhotse/supervision.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@ def perturb_speed(
and duration (going through the sample counts).
:param affix_id: When true, we will modify the ``id`` and ``recording_id`` fields
by affixing it with "_sp{factor}".
:return: a modified copy of the current ``Recording``.
:return: a modified copy of the current ``SupervisionSegment``.
"""
start_sample = compute_num_samples(self.start, sampling_rate)
num_samples = compute_num_samples(self.duration, sampling_rate)
Expand All @@ -203,7 +203,7 @@ def perturb_speed(
return fastcopy(
self,
id=f'{self.id}_sp{factor}' if affix_id else self.id,
recording_id=f'{self.recording_id}_sp{factor}' if affix_id else self.id,
recording_id=f'{self.recording_id}_sp{factor}' if affix_id else self.recording_id,
start=new_start,
duration=new_duration,
alignment={
Expand All @@ -230,15 +230,35 @@ def perturb_tempo(
and duration (going through the sample counts).
:param affix_id: When true, we will modify the ``id`` and ``recording_id`` fields
by affixing it with "_tp{factor}".
:return: a modified copy of the current ``Recording``.
:return: a modified copy of the current ``SupervisionSegment``.
"""

# speed and tempo perturbation have the same effect on supervisions
perturbed = self.perturb_speed(factor, sampling_rate, affix_id=False)
return fastcopy(
perturbed,
id=f'{self.id}_tp{factor}' if affix_id else self.id,
recording_id=f'{self.recording_id}_tp{factor}' if affix_id else self.id,
recording_id=f'{self.recording_id}_tp{factor}' if affix_id else self.recording_id,
)

def perturb_volume(
        self,
        factor: float,
        affix_id: bool = True
) -> 'SupervisionSegment':
    """
    Return a ``SupervisionSegment`` with modified ids.

    Only ``id`` and ``recording_id`` can change; every other field is copied as is.

    :param factor: The volume will be adjusted this many times (e.g. factor=1.1 means 1.1x louder).
        Here it is only used to build the id suffix.
    :param affix_id: When true, we will modify the ``id`` and ``recording_id`` fields
        by affixing it with "_vp{factor}".
    :return: a modified copy of the current ``SupervisionSegment``.
    """
    if affix_id:
        new_id = f'{self.id}_vp{factor}'
        new_recording_id = f'{self.recording_id}_vp{factor}'
    else:
        new_id = self.id
        new_recording_id = self.recording_id
    return fastcopy(self, id=new_id, recording_id=new_recording_id)

def trim(self, end: Seconds, start: Seconds = 0) -> 'SupervisionSegment':
Expand Down
Loading