lhotse-speech · pzelasko · Aug 23, 2021 · Aug 22, 2021 · Aug 22, 2021 · Aug 23, 2021
diff --git a/lhotse/audio.py b/lhotse/audio.py
@@ -17,7 +17,7 @@
 import numpy as np
 from tqdm.auto import tqdm
 
-from lhotse.augmentation import AudioTransform, Resample, Speed, Tempo
+from lhotse.augmentation import AudioTransform, Resample, Speed, Tempo, Volume
 from lhotse.serialization import Serializable
 from lhotse.utils import (Decibels, NonPositiveEnergyError, Pathlike, Seconds, SetContainingAnything, SmartOpen,
                           asdict_nonull, compute_num_samples, exactly_one_not_null, fastcopy, ifnone,
@@ -404,6 +404,23 @@ def perturb_tempo(self, factor: float, affix_id: bool = True) -> 'Recording':
             transforms=transforms
         )
 
+    def perturb_volume(self, factor: float, affix_id: bool = True) -> 'Recording':
+        """
+        Return a new ``Recording`` that will lazily perturb the volume while loading audio.
+
+        :param factor: The volume scale to be applied (e.g. factor=1.1 means 1.1x louder).
+        :param affix_id: When true, we will modify the ``Recording.id`` field
+            by affixing it with "_vp{factor}".
+        :return: a modified copy of the current ``Recording``.
+        """
+        transforms = self.transforms.copy() if self.transforms is not None else []  # copy: do not mutate self
+        transforms.append(Volume(factor=factor).to_dict())
+        return fastcopy(
+            self,
+            id=f'{self.id}_vp{factor}' if affix_id else self.id,
+            transforms=transforms
+        )
+
     def resample(self, sampling_rate: int) -> 'Recording':
         """
         Return a new ``Recording`` that will be lazily resampled while loading audio.
@@ -490,6 +507,7 @@ class RecordingSet(Serializable, Sequence[Recording]):
         and executed upon reading the audio::
 
             >>> recs_sp = recs.perturb_speed(factor=1.1)
+            >>> recs_vp = recs.perturb_volume(factor=2.)
             >>> recs_24k = recs.resample(24000)
     """
 
@@ -689,6 +707,17 @@ def perturb_tempo(self, factor: float, affix_id: bool = True) -> 'RecordingSet':
         """
         return RecordingSet.from_recordings(r.perturb_tempo(factor=factor, affix_id=affix_id) for r in self)
 
+    def perturb_volume(self, factor: float, affix_id: bool = True) -> 'RecordingSet':
+        """
+        Return a new ``RecordingSet`` that will lazily perturb the volume while loading audio.
+
+        :param factor: The volume scale to be applied (e.g. factor=1.1 means 1.1x louder).
+        :param affix_id: When true, we will modify the ``Recording.id`` field
+            by affixing it with "_vp{factor}".
+        :return: a ``RecordingSet`` containing the perturbed ``Recording`` objects.
+        """
+        return RecordingSet.from_recordings(r.perturb_volume(factor=factor, affix_id=affix_id) for r in self)
+
     def resample(self, sampling_rate: int) -> 'RecordingSet':
         """
         Apply resampling to all recordings in the ``RecordingSet`` and return a new ``RecordingSet``.

diff --git a/lhotse/augmentation/torchaudio.py b/lhotse/augmentation/torchaudio.py
@@ -44,6 +44,7 @@ class SoxEffectTransform:
         >>> augment_fn = SoxEffectTransform(effects=[
         >>>    ['reverb', 50, 50, RandomValue(0, 100)],
         >>>    ['speed', RandomValue(0.9, 1.1)],
+        >>>    ['vol', RandomValue(0.125, 2.)],
         >>>    ['rate', 16000],
         >>> ])
         >>> augmented = augment_fn(audio, 16000)
@@ -296,6 +297,38 @@ def reverse_timestamps(
         )
 
 
+@dataclass
+class Volume(AudioTransform):
+    """
+    Volume perturbation effect, the same one as invoked with `sox vol` in the command line.
+
+    It changes the amplitude of the original samples, so the absolute values of output samples will
+    be smaller or greater, depending on the value of ``factor``.
+    """
+    factor: float  # passed (as a string) to the sox ``vol`` effect
+
+    def __call__(self, samples: np.ndarray, sampling_rate: int) -> np.ndarray:
+        sampling_rate = int(sampling_rate)  # paranoia mode
+        effect = [['vol', str(self.factor)]]  # a single-effect sox chain
+        if isinstance(samples, np.ndarray):
+            samples = torch.from_numpy(samples)
+        augmented, _ = torchaudio.sox_effects.apply_effects_tensor(samples, sampling_rate, effect)  # 2nd value unused
+        return augmented.numpy()
+
+    def reverse_timestamps(
+            self,
+            offset: Seconds,
+            duration: Optional[Seconds],
+            sampling_rate: Optional[int]  # Not used, kept for compatibility purposes
+    ) -> Tuple[Seconds, Optional[Seconds]]:
+        """
+        This method just returns the original offset and duration, as volume perturbation
+        doesn't change any of these audio properties.
+        """
+
+        return offset, duration
+
+
 def speed(sampling_rate: int) -> List[List[str]]:
     return [
         ['speed', RandomValue(0.9, 1.1)],
@@ -310,6 +343,10 @@ def reverb(sampling_rate: int) -> List[List[str]]:
     ]
 
 
+def volume(sampling_rate: int) -> List[List[str]]:  # ``sampling_rate`` is not used by this effect chain
+    return [['vol', RandomValue(0.125, 2.)]]  # single sox ``vol`` effect with a RandomValue(0.125, 2.) argument
+
+
 def pitch(sampling_rate: int) -> List[List[str]]:
     return [
         # The returned values are 1/100ths of a semitone, meaning the default is up to a minor third shift up or down.

diff --git a/lhotse/cut.py b/lhotse/cut.py
@@ -122,6 +122,7 @@ class Cut:
         >>> cut_append = cut.append(other_cut)
         >>> cut_24k = cut.resample(24000)
         >>> cut_sp = cut.perturb_speed(1.1)
+        >>> cut_vp = cut.perturb_volume(2.)
 
     .. note::
         All cut transformations are performed lazily, on-the-fly, upon calling ``load_audio`` or ``load_features``.
@@ -184,6 +185,7 @@ class Cut:
     resample: Callable
     perturb_speed: Callable
     perturb_tempo: Callable
+    perturb_volume: Callable
     map_supervisions: Callable
     filter_supervisions: Callable
     with_features_path_prefix: Callable
@@ -933,6 +935,36 @@ def perturb_tempo(self, factor: float, affix_id: bool = True) -> 'MonoCut':
             start=new_start
         )
 
+    def perturb_volume(self, factor: float, affix_id: bool = True) -> 'MonoCut':
+        """
+        Return a new ``MonoCut`` that will lazily perturb the volume while loading audio.
+        Pre-computed features (if any) are detached on the copy; ``self`` is not modified.
+
+        :param factor: The volume will be adjusted this many times (e.g. factor=1.1 means 1.1x louder).
+        :param affix_id: When true, we will modify the ``MonoCut.id`` field
+            by affixing it with "_vp{factor}".
+        :return: a modified copy of the current ``MonoCut``.
+        """
+        # Pre-conditions
+        assert self.has_recording, 'Cannot perturb volume on a MonoCut without Recording.'
+        if self.has_features:
+            logging.warning(
+                'Attempting to perturb volume on a MonoCut that references pre-computed features. '
+                'The feature manifest will be detached, as we do not support feature-domain '
+                'volume perturbation.'
+            )
+        # Actual audio perturbation.
+        recording_vp = self.recording.perturb_volume(factor=factor, affix_id=affix_id)
+        # Match the supervision's id (and its underlying recording id).
+        supervisions_vp = [s.perturb_volume(factor=factor, affix_id=affix_id) for s in self.supervisions]
+
+        return fastcopy(
+            self,
+            id=f'{self.id}_vp{factor}' if affix_id else self.id,
+            recording=recording_vp,
+            features=None,  # detach features on the copy only, instead of mutating ``self``
+            supervisions=supervisions_vp
+        )
+
     def map_supervisions(self, transform_fn: Callable[[SupervisionSegment], SupervisionSegment]) -> Cut:
         """
         Modify the SupervisionSegments by `transform_fn` of this MonoCut.
@@ -1198,6 +1230,19 @@ def perturb_tempo(self, factor: float, affix_id: bool = True) -> 'PaddingCut':
             frame_shift=new_frame_shift
         )
 
+    def perturb_volume(self, factor: float, affix_id: bool = True) -> 'PaddingCut':
+        """
+        Return a new ``PaddingCut`` that "mimics" volume perturbation: only the id may change,
+        and ``factor`` is used solely to build the id suffix.
+
+        :param factor: The volume will be adjusted this many times (e.g. factor=1.1 means 1.1x louder).
+        :param affix_id: When true, we will modify the ``PaddingCut.id`` field
+            by affixing it with "_vp{factor}".
+        :return: a modified copy of the current ``PaddingCut``.
+        """
+
+        return fastcopy(self, id=f'{self.id}_vp{factor}' if affix_id else self.id)
+
     def drop_features(self) -> 'PaddingCut':
         """Return a copy of the current :class:`.PaddingCut`, detached from ``features``."""
         assert self.has_recording, f"Cannot detach features from a MonoCut with no Recording (cut ID = {self.id})."
@@ -1583,6 +1628,32 @@ def perturb_tempo(self, factor: float, affix_id: bool = True) -> 'MixedCut':
             ]
         )
 
+    def perturb_volume(self, factor: float, affix_id: bool = True) -> 'MixedCut':
+        """
+        Return a new ``MixedCut`` that will lazily perturb the volume while loading audio.
+        Every track's cut is replaced by its own ``perturb_volume`` result (same arguments).
+
+        :param factor: The volume will be adjusted this many times (e.g. factor=1.1 means 1.1x louder).
+        :param affix_id: When true, we will modify the ``MixedCut.id`` field
+            by affixing it with "_vp{factor}".
+        :return: a modified copy of the current ``MixedCut``.
+        """
+        # Pre-conditions
+        assert self.has_recording, 'Cannot perturb volume on a MixedCut without Recording.'
+        if self.has_features:
+            logging.warning(
+                'Attempting to perturb volume on a MixedCut that references pre-computed features. '
+                'The feature manifest(s) will be detached, as we do not support feature-domain '
+                'volume perturbation.'
+            )
+        return MixedCut(
+            id=f'{self.id}_vp{factor}' if affix_id else self.id,
+            tracks=[
+                fastcopy(track, cut=track.cut.perturb_volume(factor=factor, affix_id=affix_id))
+                for track in self.tracks
+            ]
+        )
+
     def load_features(self, mixed: bool = True) -> Optional[np.ndarray]:
         """
         Loads the features of the source cuts and mixes them on-the-fly.
@@ -1973,11 +2044,12 @@ class CutSet(Serializable, Sequence[Cut]):
     and executed upon reading the audio::
 
         >>> cuts_sp = cuts.perturb_speed(factor=1.1)
+        >>> cuts_vp = cuts.perturb_volume(factor=2.)
         >>> cuts_24k = cuts.resample(24000)
 
     .. caution::
         If the :class:`.CutSet` contained :class:`~lhotse.features.base.Features` manifests, they will be
-        detached after performing audio augmentations such as :meth:`.CutSet.perturb_speed` or :meth:`.CutSet.resample`.
+        detached after performing audio augmentations such as :meth:`.CutSet.perturb_speed`, :meth:`.CutSet.resample`, or :meth:`.CutSet.perturb_volume`.
 
     :class:`~lhotse.cut.CutSet` offers parallel feature extraction capabilities
     (see `meth`:.CutSet.compute_and_store_features: for details),
@@ -2615,6 +2687,20 @@ def perturb_tempo(self, factor: float, affix_id: bool = True) -> 'CutSet':
         """
         return self.map(lambda cut: cut.perturb_tempo(factor=factor, affix_id=affix_id))
 
+    def perturb_volume(self, factor: float, affix_id: bool = True) -> 'CutSet':
+        """
+        Return a new :class:`~lhotse.cut.CutSet` that contains volume perturbed cuts
+        with a factor of ``factor``. It requires the recording manifests to be present.
+        If the feature manifests are attached, they are dropped.
+        The supervision manifests remain the same, apart from their ids when ``affix_id`` is true.
+
+        :param factor: The resulting playback volume is ``factor`` times the original one.
+        :param affix_id: Should we modify the ID (useful if both versions of the same
+            cut are going to be present in a single manifest).
+        :return: a modified copy of the ``CutSet``.
+        """
+        return self.map(lambda cut: cut.perturb_volume(factor=factor, affix_id=affix_id))
+
     def mix(
             self,
             cuts: 'CutSet',

diff --git a/lhotse/dataset/cut_transforms/__init__.py b/lhotse/dataset/cut_transforms/__init__.py
@@ -2,10 +2,12 @@
 from .extra_padding import ExtraPadding
 from .mix import CutMix
 from .perturb_speed import PerturbSpeed
+from .perturb_volume import PerturbVolume
 
 __all__ = [
     'CutConcatenate',
     'CutMix',
     'ExtraPadding',
-    'PerturbSpeed'
+    'PerturbSpeed',
+    'PerturbVolume'
 ]
diff --git a/lhotse/dataset/cut_transforms/perturb_speed.py b/lhotse/dataset/cut_transforms/perturb_speed.py
@@ -15,7 +15,7 @@ class PerturbSpeed:
 
     def __init__(
             self,
-            factors: Union[float, List[float]],
+            factors: Union[float, Sequence[float]],
             p: float,
             randgen: random.Random = None
     ) -> None:

diff --git a/lhotse/dataset/cut_transforms/perturb_volume.py b/lhotse/dataset/cut_transforms/perturb_volume.py
@@ -0,0 +1,34 @@
+import random
+from typing import List, Sequence, Union
+
+from lhotse import CutSet
+
+
+class PerturbVolume:
+    """
+    A transform on a batch of cuts (``CutSet``) that perturbs the volume of each cut
+    independently with probability :attr:`p` (``p=0`` never perturbs, ``p=1`` always does).
+
+    When a cut is perturbed, its factor is drawn uniformly from :attr:`factors`
+    (a single float is wrapped in a list); ``randgen`` defaults to the ``random`` module.
+    """
+
+    def __init__(
+            self,
+            factors: Union[float, Sequence[float]],
+            p: float,
+            randgen: random.Random = None
+    ) -> None:
+        self.factors = factors if isinstance(factors, Sequence) else [factors]
+        self.p = p
+        self.random = randgen
+
+    def __call__(self, cuts: CutSet) -> CutSet:
+        if self.random is None:
+            self.random = random
+        return CutSet.from_cuts(
+            cut.perturb_volume(factor=self.random.choice(self.factors))
+            if self.random.random() < self.p  # random() is in [0, 1), so this holds with probability p
+            else cut
+            for cut in cuts
+        )
diff --git a/lhotse/supervision.py b/lhotse/supervision.py
@@ -194,7 +194,7 @@ def perturb_speed(
             and duration (going through the sample counts).
         :param affix_id: When true, we will modify the ``id`` and ``recording_id`` fields
             by affixing it with "_sp{factor}".
-        :return: a modified copy of the current ``Recording``.
+        :return: a modified copy of the current ``SupervisionSegment``.
         """
         start_sample = compute_num_samples(self.start, sampling_rate)
         num_samples = compute_num_samples(self.duration, sampling_rate)
@@ -203,7 +203,7 @@ def perturb_speed(
         return fastcopy(
             self,
             id=f'{self.id}_sp{factor}' if affix_id else self.id,
-            recording_id=f'{self.recording_id}_sp{factor}' if affix_id else self.id,
+            recording_id=f'{self.recording_id}_sp{factor}' if affix_id else self.recording_id,
             start=new_start,
             duration=new_duration,
             alignment={
@@ -230,15 +230,35 @@ def perturb_tempo(
             and duration (going through the sample counts).
         :param affix_id: When true, we will modify the ``id`` and ``recording_id`` fields
             by affixing it with "_tp{factor}".
-        :return: a modified copy of the current ``Recording``.
+        :return: a modified copy of the current ``SupervisionSegment``.
         """
 
         # speed and tempo perturbation have the same effect on supervisions
         perturbed = self.perturb_speed(factor, sampling_rate, affix_id=False)
         return fastcopy(
             perturbed,
             id=f'{self.id}_tp{factor}' if affix_id else self.id,
-            recording_id=f'{self.recording_id}_tp{factor}' if affix_id else self.id,
+            recording_id=f'{self.recording_id}_tp{factor}' if affix_id else self.recording_id,
+        )
+
+    def perturb_volume(
+            self,
+            factor: float,
+            affix_id: bool = True
+    ) -> 'SupervisionSegment':
+        """
+        Return a ``SupervisionSegment`` with modified ids; all other fields are copied unchanged.
+
+        :param factor: The volume will be adjusted this many times (e.g. factor=1.1 means 1.1x louder).
+        :param affix_id: When true, we will modify the ``id`` and ``recording_id`` fields
+            by affixing them with "_vp{factor}".
+        :return: a modified copy of the current ``SupervisionSegment``.
+        """
+        # ``factor`` is only used to build the id suffixes below.
+        return fastcopy(
+            self,
+            id=f'{self.id}_vp{factor}' if affix_id else self.id,
+            recording_id=f'{self.recording_id}_vp{factor}' if affix_id else self.recording_id
         )
 
     def trim(self, end: Seconds, start: Seconds = 0) -> 'SupervisionSegment':