diff --git a/lhotse/audio.py b/lhotse/audio.py
index e2ff3ec56..8a6db8866 100644
--- a/lhotse/audio.py
+++ b/lhotse/audio.py
@@ -17,7 +17,7 @@ import numpy as np
 from tqdm.auto import tqdm
 
-from lhotse.augmentation import AudioTransform, Resample, Speed, Tempo
+from lhotse.augmentation import AudioTransform, Resample, Speed, Tempo, Volume
 from lhotse.serialization import Serializable
 from lhotse.utils import (Decibels, NonPositiveEnergyError, Pathlike, Seconds, SetContainingAnything,
                           SmartOpen, asdict_nonull, compute_num_samples, exactly_one_not_null, fastcopy, ifnone,
@@ -404,6 +404,23 @@ def perturb_tempo(self, factor: float, affix_id: bool = True) -> 'Recording':
             transforms=transforms
         )
 
+    def perturb_volume(self, factor: float, affix_id: bool = True) -> 'Recording':
+        """
+        Return a new ``Recording`` that will lazily perturb the volume while loading audio.
+
+        :param factor: The volume scale to be applied (e.g. factor=1.1 means 1.1x louder).
+        :param affix_id: When true, we will modify the ``Recording.id`` field
+            by affixing it with "_vp{factor}".
+        :return: a modified copy of the current ``Recording``.
+        """
+        transforms = self.transforms.copy() if self.transforms is not None else []
+        transforms.append(Volume(factor=factor).to_dict())
+        return fastcopy(
+            self,
+            id=f'{self.id}_vp{factor}' if affix_id else self.id,
+            transforms=transforms
+        )
+
     def resample(self, sampling_rate: int) -> 'Recording':
         """
         Return a new ``Recording`` that will be lazily resampled while loading audio.
@@ -490,6 +507,7 @@ class RecordingSet(Serializable, Sequence[Recording]):
     and executed upon reading the audio::
 
         >>> recs_sp = recs.perturb_speed(factor=1.1)
+        >>> recs_vp = recs.perturb_volume(factor=2.)
         >>> recs_24k = recs.resample(24000)
 
     """
@@ -689,6 +707,17 @@ def perturb_tempo(self, factor: float, affix_id: bool = True) -> 'RecordingSet':
         """
         return RecordingSet.from_recordings(r.perturb_tempo(factor=factor, affix_id=affix_id) for r in self)
 
+    def perturb_volume(self, factor: float, affix_id: bool = True) -> 'RecordingSet':
+        """
+        Return a new ``RecordingSet`` that will lazily perturb the volume while loading audio.
+
+        :param factor: The volume scale to be applied (e.g. factor=1.1 means 1.1x louder).
+        :param affix_id: When true, we will modify the ``Recording.id`` field
+            by affixing it with "_vp{factor}".
+        :return: a ``RecordingSet`` containing the perturbed ``Recording`` objects.
+        """
+        return RecordingSet.from_recordings(r.perturb_volume(factor=factor, affix_id=affix_id) for r in self)
+
     def resample(self, sampling_rate: int) -> 'RecordingSet':
         """
         Apply resampling to all recordings in the ``RecordingSet`` and return a new ``RecordingSet``.
diff --git a/lhotse/augmentation/torchaudio.py b/lhotse/augmentation/torchaudio.py
index 11a7de078..a89449a51 100644
--- a/lhotse/augmentation/torchaudio.py
+++ b/lhotse/augmentation/torchaudio.py
@@ -44,6 +44,7 @@ class SoxEffectTransform:
     >>> augment_fn = SoxEffectTransform(effects=[
     >>>     ['reverb', 50, 50, RandomValue(0, 100)],
     >>>     ['speed', RandomValue(0.9, 1.1)],
+    >>>     ['volume', RandomValue(0.125, 2.)],
     >>>     ['rate', 16000],
     >>> ])
     >>> augmented = augment_fn(audio, 16000)
@@ -296,6 +297,38 @@ def reverse_timestamps(
         )
 
 
+@dataclass
+class Volume(AudioTransform):
+    """
+    Volume perturbation effect, the same one as invoked with `sox vol` in the command line.
+
+    It changes the amplitude of the original samples, so the absolute values of the output samples
+    will be smaller or greater, depending on the vol factor.
+    """
+    factor: float
+
+    def __call__(self, samples: np.ndarray, sampling_rate: int) -> np.ndarray:
+        sampling_rate = int(sampling_rate)  # paranoia mode
+        effect = [['vol', str(self.factor)]]
+        if isinstance(samples, np.ndarray):
+            samples = torch.from_numpy(samples)
+        augmented, _ = torchaudio.sox_effects.apply_effects_tensor(samples, sampling_rate, effect)
+        return augmented.numpy()
+
+    def reverse_timestamps(
+            self,
+            offset: Seconds,
+            duration: Optional[Seconds],
+            sampling_rate: Optional[int]  # Not used, kept for compatibility purposes
+    ) -> Tuple[Seconds, Optional[Seconds]]:
+        """
+        This method just returns the original offset and duration, as volume perturbation
+        doesn't change any of these audio properties.
+        """
+
+        return offset, duration
+
+
 def speed(sampling_rate: int) -> List[List[str]]:
     return [
         ['speed', RandomValue(0.9, 1.1)],
@@ -310,6 +343,10 @@ def reverb(sampling_rate: int) -> List[List[str]]:
     ]
 
 
+def volume(sampling_rate: int) -> List[List[str]]:
+    return [['vol', RandomValue(0.125, 2.)]]
+
+
 def pitch(sampling_rate: int) -> List[List[str]]:
     return [
         # The returned values are 1/100ths of a semitone, meaning the default is up to a minor third shift up or down.
diff --git a/lhotse/cut.py b/lhotse/cut.py
index 36062e5e0..1407fc57b 100644
--- a/lhotse/cut.py
+++ b/lhotse/cut.py
@@ -122,6 +122,7 @@ class Cut:
         >>> cut_append = cut.append(other_cut)
         >>> cut_24k = cut.resample(24000)
         >>> cut_sp = cut.perturb_speed(1.1)
+        >>> cut_vp = cut.perturb_volume(2.)
 
     .. note:: All cut transformations are performed lazily, on-the-fly, upon calling ``load_audio``
         or ``load_features``.
@@ -184,6 +185,7 @@ class Cut:
     resample: Callable
     perturb_speed: Callable
     perturb_tempo: Callable
+    perturb_volume: Callable
     map_supervisions: Callable
     filter_supervisions: Callable
     with_features_path_prefix: Callable
@@ -933,6 +935,36 @@ def perturb_tempo(self, factor: float, affix_id: bool = True) -> 'MonoCut':
             start=new_start
         )
 
+    def perturb_volume(self, factor: float, affix_id: bool = True) -> 'MonoCut':
+        """
+        Return a new ``MonoCut`` that will lazily perturb the volume while loading audio.
+
+        :param factor: The volume will be adjusted this many times (e.g. factor=1.1 means 1.1x louder).
+        :param affix_id: When true, we will modify the ``MonoCut.id`` field
+            by affixing it with "_vp{factor}".
+        :return: a modified copy of the current ``MonoCut``.
+        """
+        # Pre-conditions
+        assert self.has_recording, 'Cannot perturb volume on a MonoCut without Recording.'
+        if self.has_features:
+            logging.warning(
+                'Attempting to perturb volume on a MonoCut that references pre-computed features. '
+                'The feature manifest will be detached, as we do not support feature-domain '
+                'volume perturbation.'
+            )
+            self.features = None
+        # Actual audio perturbation.
+        recording_vp = self.recording.perturb_volume(factor=factor, affix_id=affix_id)
+        # Match the supervisions' ids (and their underlying recording ids).
+        supervisions_vp = [s.perturb_volume(factor=factor, affix_id=affix_id) for s in self.supervisions]
+
+        return fastcopy(
+            self,
+            id=f'{self.id}_vp{factor}' if affix_id else self.id,
+            recording=recording_vp,
+            supervisions=supervisions_vp
+        )
+
     def map_supervisions(self, transform_fn: Callable[[SupervisionSegment], SupervisionSegment]) -> Cut:
         """
         Modify the SupervisionSegments by `transform_fn` of this MonoCut.
@@ -1198,6 +1230,19 @@ def perturb_tempo(self, factor: float, affix_id: bool = True) -> 'PaddingCut':
             frame_shift=new_frame_shift
         )
 
+    def perturb_volume(self, factor: float, affix_id: bool = True) -> 'PaddingCut':
+        """
+        Return a new ``PaddingCut`` that will "mimic" the effect of volume perturbation
+        on the amplitude of its samples.
+
+        :param factor: The volume will be adjusted this many times (e.g. factor=1.1 means 1.1x louder).
+        :param affix_id: When true, we will modify the ``PaddingCut.id`` field
+            by affixing it with "_vp{factor}".
+        :return: a modified copy of the current ``PaddingCut``.
+        """
+
+        return fastcopy(self, id=f'{self.id}_vp{factor}' if affix_id else self.id)
+
     def drop_features(self) -> 'PaddingCut':
         """Return a copy of the current :class:`.PaddingCut`, detached from ``features``."""
         assert self.has_recording, f"Cannot detach features from a MonoCut with no Recording (cut ID = {self.id})."
@@ -1583,6 +1628,32 @@ def perturb_tempo(self, factor: float, affix_id: bool = True) -> 'MixedCut':
             ]
         )
 
+    def perturb_volume(self, factor: float, affix_id: bool = True) -> 'MixedCut':
+        """
+        Return a new ``MixedCut`` that will lazily perturb the volume while loading audio.
+        Recordings of the underlying Cuts are updated to reflect the volume change.
+
+        :param factor: The volume will be adjusted this many times (e.g. factor=1.1 means 1.1x louder).
+        :param affix_id: When true, we will modify the ``MixedCut.id`` field
+            by affixing it with "_vp{factor}".
+        :return: a modified copy of the current ``MixedCut``.
+        """
+        # Pre-conditions
+        assert self.has_recording, 'Cannot perturb volume on a MixedCut without Recording.'
+        if self.has_features:
+            logging.warning(
+                'Attempting to perturb volume on a MixedCut that references pre-computed features. '
+                'The feature manifest(s) will be detached, as we do not support feature-domain '
+                'volume perturbation.'
+            )
+        return MixedCut(
+            id=f'{self.id}_vp{factor}' if affix_id else self.id,
+            tracks=[
+                fastcopy(track, cut=track.cut.perturb_volume(factor=factor, affix_id=affix_id))
+                for track in self.tracks
+            ]
+        )
+
     def load_features(self, mixed: bool = True) -> Optional[np.ndarray]:
         """
         Loads the features of the source cuts and mixes them on-the-fly.
@@ -1973,11 +2044,12 @@ class CutSet(Serializable, Sequence[Cut]):
     and executed upon reading the audio::
 
         >>> cuts_sp = cuts.perturb_speed(factor=1.1)
+        >>> cuts_vp = cuts.perturb_volume(factor=2.)
         >>> cuts_24k = cuts.resample(24000)
 
     .. caution::
         If the :class:`.CutSet` contained :class:`~lhotse.features.base.Features` manifests, they will be
-        detached after performing audio augmentations such as :meth:`.CutSet.perturb_speed` or :meth:`.CutSet.resample`.
+        detached after performing audio augmentations such as :meth:`.CutSet.perturb_speed`, :meth:`.CutSet.perturb_volume`, or :meth:`.CutSet.resample`.
 
     :class:`~lhotse.cut.CutSet` offers parallel feature extraction capabilities
     (see `meth`:.CutSet.compute_and_store_features: for details),
@@ -2615,6 +2687,20 @@ def perturb_tempo(self, factor: float, affix_id: bool = True) -> 'CutSet':
         """
         return self.map(lambda cut: cut.perturb_tempo(factor=factor, affix_id=affix_id))
 
+    def perturb_volume(self, factor: float, affix_id: bool = True) -> 'CutSet':
+        """
+        Return a new :class:`~lhotse.cut.CutSet` that contains volume perturbed cuts
+        with a factor of ``factor``. It requires the recording manifests to be present.
+        If the feature manifests are attached, they are dropped.
+        The supervision manifests remain unchanged.
+
+        :param factor: The resulting playback volume is ``factor`` times the original one.
+        :param affix_id: Should we modify the ID (useful if both versions of the same
+            cut are going to be present in a single manifest).
+        :return: a modified copy of the ``CutSet``.
+        """
+        return self.map(lambda cut: cut.perturb_volume(factor=factor, affix_id=affix_id))
+
     def mix(
             self,
             cuts: 'CutSet',
diff --git a/lhotse/dataset/cut_transforms/__init__.py b/lhotse/dataset/cut_transforms/__init__.py
index dd18f4ad7..da4d8f02e 100644
--- a/lhotse/dataset/cut_transforms/__init__.py
+++ b/lhotse/dataset/cut_transforms/__init__.py
@@ -2,10 +2,12 @@
 from .extra_padding import ExtraPadding
 from .mix import CutMix
 from .perturb_speed import PerturbSpeed
+from .perturb_volume import PerturbVolume
 
 __all__ = [
     'CutConcatenate',
     'CutMix',
     'ExtraPadding',
-    'PerturbSpeed'
+    'PerturbSpeed',
+    'PerturbVolume'
 ]
diff --git a/lhotse/dataset/cut_transforms/perturb_speed.py b/lhotse/dataset/cut_transforms/perturb_speed.py
index dde701276..b4333dfad 100644
--- a/lhotse/dataset/cut_transforms/perturb_speed.py
+++ b/lhotse/dataset/cut_transforms/perturb_speed.py
@@ -15,7 +15,7 @@ class PerturbSpeed:
     def __init__(
             self,
-            factors: Union[float, List[float]],
+            factors: Union[float, Sequence[float]],
             p: float,
             randgen: random.Random = None
     ) -> None:
diff --git a/lhotse/dataset/cut_transforms/perturb_volume.py b/lhotse/dataset/cut_transforms/perturb_volume.py
new file mode 100644
index 000000000..ee68f6ea7
--- /dev/null
+++ b/lhotse/dataset/cut_transforms/perturb_volume.py
@@ -0,0 +1,34 @@
+import random
+from typing import Sequence, Union
+
+from lhotse import CutSet
+
+
+class PerturbVolume:
+    """
+    A transform on a batch of cuts (``CutSet``) that perturbs the volume of the recordings
+    with a given probability :attr:`p`.
+
+    If the effect is applied, then one of the perturbation factors from the constructor's
+    :attr:`factors` parameter is sampled with uniform probability.
+    """
+
+    def __init__(
+            self,
+            factors: Union[float, Sequence[float]],
+            p: float,
+            randgen: random.Random = None
+    ) -> None:
+        self.factors = factors if isinstance(factors, Sequence) else [factors]
+        self.p = p
+        self.random = randgen
+
+    def __call__(self, cuts: CutSet) -> CutSet:
+        if self.random is None:
+            self.random = random
+        return CutSet.from_cuts(
+            cut.perturb_volume(factor=self.random.choice(self.factors))
+            if self.random.random() <= self.p
+            else cut
+            for cut in cuts
+        )
diff --git a/lhotse/supervision.py b/lhotse/supervision.py
index ec23d3228..bafb75322 100644
--- a/lhotse/supervision.py
+++ b/lhotse/supervision.py
@@ -194,7 +194,7 @@ def perturb_speed(
             and duration (going through the sample counts).
         :param affix_id: When true, we will modify the ``id`` and ``recording_id`` fields
             by affixing it with "_sp{factor}".
-        :return: a modified copy of the current ``Recording``.
+        :return: a modified copy of the current ``SupervisionSegment``.
         """
         start_sample = compute_num_samples(self.start, sampling_rate)
         num_samples = compute_num_samples(self.duration, sampling_rate)
@@ -203,7 +203,7 @@ def perturb_speed(
         return fastcopy(
             self,
             id=f'{self.id}_sp{factor}' if affix_id else self.id,
-            recording_id=f'{self.recording_id}_sp{factor}' if affix_id else self.id,
+            recording_id=f'{self.recording_id}_sp{factor}' if affix_id else self.recording_id,
             start=new_start,
             duration=new_duration,
             alignment={
@@ -230,7 +230,7 @@ def perturb_tempo(
             and duration (going through the sample counts).
         :param affix_id: When true, we will modify the ``id`` and ``recording_id`` fields
             by affixing it with "_tp{factor}".
-        :return: a modified copy of the current ``Recording``.
+        :return: a modified copy of the current ``SupervisionSegment``.
         """
 
         # speed and tempo perturbation have the same effect on supervisions
@@ -238,7 +238,27 @@ def perturb_tempo(
         return fastcopy(
             perturbed,
             id=f'{self.id}_tp{factor}' if affix_id else self.id,
-            recording_id=f'{self.recording_id}_tp{factor}' if affix_id else self.id,
+            recording_id=f'{self.recording_id}_tp{factor}' if affix_id else self.recording_id,
         )
 
+    def perturb_volume(
+            self,
+            factor: float,
+            affix_id: bool = True
+    ) -> 'SupervisionSegment':
+        """
+        Return a ``SupervisionSegment`` with modified ids.
+
+        :param factor: The volume will be adjusted this many times (e.g. factor=1.1 means 1.1x louder).
+        :param affix_id: When true, we will modify the ``id`` and ``recording_id`` fields
+            by affixing it with "_vp{factor}".
+        :return: a modified copy of the current ``SupervisionSegment``.
+        """
+
+        return fastcopy(
+            self,
+            id=f'{self.id}_vp{factor}' if affix_id else self.id,
+            recording_id=f'{self.recording_id}_vp{factor}' if affix_id else self.recording_id
+        )
+
     def trim(self, end: Seconds, start: Seconds = 0) -> 'SupervisionSegment':
diff --git a/test/augmentation/test_torchaudio.py b/test/augmentation/test_torchaudio.py
index 8cb917756..e3f155c6e 100644
--- a/test/augmentation/test_torchaudio.py
+++ b/test/augmentation/test_torchaudio.py
@@ -4,10 +4,11 @@ import torch
 from hypothesis import given, settings
 from hypothesis import strategies as st
+from numpy.testing import assert_array_almost_equal
 
 torchaudio = pytest.importorskip('torchaudio', minversion='0.6')
 
-from lhotse.augmentation import SoxEffectTransform, Tempo, pitch, reverb, speed, Speed
+from lhotse.augmentation import SoxEffectTransform, pitch, reverb, speed, volume, Speed, Tempo, Volume
 from lhotse import AudioTransform, MonoCut, Recording, Resample, Seconds
 
 SAMPLING_RATE = 16000
 
@@ -18,7 +19,13 @@ def audio():
     return torch.sin(2 * math.pi * torch.linspace(0, 1, 16000)).unsqueeze(0).numpy()
 
 
-@pytest.mark.parametrize('effect', [reverb, pitch, speed])
+# to avoid clipping during the volume perturbation test
+@pytest.fixture
+def audio_volume():
+    return torch.sin(2 * math.pi * torch.linspace(0, 1, 16000)).unsqueeze(0).numpy() / 3.
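# Sketch (illustrative, not part of the patch): why the dedicated audio_volume fixture above is
# scaled down by 3. The assumption is that the sox 'vol' effect clips its output to [-1.0, 1.0]
# like command-line sox, so a full-scale sine amplified by factor 2. would saturate and the
# assert_array_almost_equal(perturbed, scale * audio) checks below could not hold.
import numpy as np
full_scale = np.sin(2 * np.pi * np.linspace(0, 1, 16000))[None, :]
headroom = full_scale / 3.                   # what the audio_volume fixture returns
assert np.abs(headroom * 2.).max() <= 1.0    # 2x louder still fits in [-1, 1], so no clipping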
+
+
+@pytest.mark.parametrize('effect', [reverb, pitch, speed, volume])
 def test_example_augmentation(audio, effect):
     augment_fn = SoxEffectTransform(effects=effect(SAMPLING_RATE))
     augmented_audio = augment_fn(audio, sampling_rate=SAMPLING_RATE)
@@ -38,31 +45,68 @@ def test_speed_does_not_change_num_samples(audio):
         assert augmented_audio != audio
 
 
+def test_volume_does_not_change_num_samples(audio):
+    augment_fn = SoxEffectTransform(effects=volume(SAMPLING_RATE))
+    for _ in range(10):
+        augmented_audio = augment_fn(audio, sampling_rate=SAMPLING_RATE)
+        assert augmented_audio.shape == audio.shape
+        assert (augmented_audio != audio).any()
+
+
 def test_speed(audio):
     speed = Speed(factor=1.1)
     perturbed = speed(audio, SAMPLING_RATE)
     assert perturbed.shape == (1, 14545)
 
 
-def test_deserialize_transform(audio):
+@pytest.mark.parametrize('scale', [0.125, 1., 2.])
+def test_volume(audio_volume, scale):
+    volume = Volume(factor=scale)
+    audio_perturbed = volume(audio_volume, SAMPLING_RATE)
+
+    assert audio_perturbed.shape == audio_volume.shape
+    assert_array_almost_equal(audio_perturbed, scale * audio_volume)
+
+
+def test_deserialize_transform_speed(audio):
     speed = AudioTransform.from_dict({'name': 'Speed', 'kwargs': {'factor': 1.1}})
-    perturbed = speed(audio, SAMPLING_RATE)
-    assert perturbed.shape == (1, 14545)
+    perturbed_speed = speed(audio, SAMPLING_RATE)
+
+    assert perturbed_speed.shape == (1, 14545)
+
+
+def test_deserialize_transform_volume(audio):
+    volume = AudioTransform.from_dict({'name': 'Volume', 'kwargs': {'factor': 0.5}})
+    perturbed_volume = volume(audio, SAMPLING_RATE)
 
-def test_serialize_deserialize_transform(audio):
+    assert perturbed_volume.shape == audio.shape
+    assert_array_almost_equal(perturbed_volume, audio * 0.5)
+
+
+def test_serialize_deserialize_transform_speed(audio):
     speed_orig = Speed(factor=1.1)
-    data = speed_orig.to_dict()
-    speed = AudioTransform.from_dict(data)
-    perturbed = speed(audio, SAMPLING_RATE)
-    assert perturbed.shape == (1, 14545)
+    data_speed = speed_orig.to_dict()
+    speed = AudioTransform.from_dict(data_speed)
+    perturbed_speed = speed(audio, SAMPLING_RATE)
+
+    assert perturbed_speed.shape == (1, 14545)
+
+
+def test_serialize_deserialize_transform_volume(audio):
+    volume_orig = Volume(factor=0.5)
+    data_volume = volume_orig.to_dict()
+    volume = AudioTransform.from_dict(data_volume)
+    perturbed_volume = volume(audio, SAMPLING_RATE)
+
+    assert perturbed_volume.shape == audio.shape
+    assert_array_almost_equal(perturbed_volume, audio * 0.5)
 
 
 @pytest.mark.parametrize('sampling_rate', [8000, 16000, 22050, 32000, 44100, 48000])
 def test_resample(audio, sampling_rate):
-    speed = Resample(source_sampling_rate=16000, target_sampling_rate=sampling_rate)
-    perturbed = speed(audio, SAMPLING_RATE)
-    assert perturbed.shape == (1, sampling_rate)
+    resample = Resample(source_sampling_rate=16000, target_sampling_rate=sampling_rate)
+    resampled = resample(audio, SAMPLING_RATE)
+    assert resampled.shape == (1, sampling_rate)
 
 
 def test_tempo(audio):
@@ -77,6 +121,8 @@ def test_tempo(audio):
     st.one_of([st.just(v) for v in [8000, 22050, 32000, 44100, 48000]]),
     # Speed perturbation values
     st.one_of([st.just(v) for v in [0.9, 0.95, 1.05, 1.1]]),
+    # Volume perturbation values
+    st.one_of([st.just(v) for v in [0.125, 0.5, 1.5, 2.]]),
     # Resampling first?
     st.booleans(),
     # Cut duration (full recording has 16.04s)
     st.floats(min_value=1.1, max_value=12.0))
 def test_augmentation_chain_randomized(
         target_sampling_rate: int,
         sp_factor: float,
+        vp_factor: float,
         resample_first: bool,
         cut_duration: Seconds
 ):
     recording = Recording.from_file('test/fixtures/libri/libri-1088-134315-0000.wav')
     if resample_first:
-        recording_aug = recording.resample(target_sampling_rate).perturb_speed(sp_factor)
+        recording_aug = recording.resample(target_sampling_rate).perturb_speed(sp_factor).perturb_volume(vp_factor)
     else:
-        recording_aug = recording.perturb_speed(sp_factor).resample(target_sampling_rate)
+        recording_aug = recording.perturb_speed(sp_factor).resample(target_sampling_rate).perturb_volume(vp_factor)
 
     audio_aug = recording_aug.load_audio()
     assert audio_aug.shape[1] == recording_aug.num_samples
diff --git a/test/cut/test_cut_augmentation.py b/test/cut/test_cut_augmentation.py
index 17c6174ec..d7210c464 100644
--- a/test/cut/test_cut_augmentation.py
+++ b/test/cut/test_cut_augmentation.py
@@ -1,5 +1,7 @@
 import pytest
 
+import numpy as np
+
 from lhotse import AudioSource, CutSet, MonoCut, Recording, SupervisionSegment
 from lhotse.cut import PaddingCut
 
@@ -134,10 +136,19 @@ def test_cut_set_perturb_speed_doesnt_duplicate_transforms(cut_with_supervision):
         assert len(cut.recording.transforms) == 1
 
 
+def test_cut_set_perturb_volume_doesnt_duplicate_transforms(cut_with_supervision):
+    cuts = CutSet.from_cuts([cut_with_supervision, cut_with_supervision.with_id('other-id')])
+    cuts_vp = cuts.perturb_volume(2.)
+    for cut in cuts_vp:
+        # This prevents a bug regression where multiple cuts referencing the same recording would
+        # attach transforms to the same manifest
+        assert len(cut.recording.transforms) == 1
+
+
 def test_cut_set_resample_doesnt_duplicate_transforms(cut_with_supervision):
     cuts = CutSet.from_cuts([cut_with_supervision, cut_with_supervision.with_id('other-id')])
-    cuts_sp = cuts.resample(44100)
-    for cut in cuts_sp:
+    cuts_res = cuts.resample(44100)
+    for cut in cuts_res:
         # This prevents a bug regression where multiple cuts referencing the same recording would
         # attach transforms to the same manifest
         assert len(cut.recording.transforms) == 1
@@ -208,7 +219,7 @@ def test_cut_start01_perturb_speed09(cut_with_supervision_start01):
     assert recording_samples.shape[1] == 4444
 
 
-def test_mixed_cut_start01_perturb(cut_with_supervision_start01):
+def test_mixed_cut_start01_perturb_speed(cut_with_supervision_start01):
     mixed_sp = (
         cut_with_supervision_start01
         .append(cut_with_supervision_start01)
@@ -231,14 +242,53 @@ def test_mixed_cut_start01_perturb(cut_with_supervision_start01):
     assert cut_samples.shape[1] == 2909 * 2
 
 
-def test_padding_cut_perturb():
+def test_mixed_cut_start01_perturb_volume(cut_with_supervision_start01):
+    mixed_vp = (
+        cut_with_supervision_start01
+        .append(cut_with_supervision_start01)
+        .perturb_volume(0.125)
+    )
+    assert mixed_vp.start == 0  # MixedCut always starts at 0
+    assert mixed_vp.duration == cut_with_supervision_start01.duration * 2
+    assert mixed_vp.end == cut_with_supervision_start01.duration * 2
+    assert mixed_vp.num_samples == cut_with_supervision_start01.num_samples * 2
+
+    assert mixed_vp.supervisions[0].start == cut_with_supervision_start01.supervisions[0].start
+    assert mixed_vp.supervisions[0].duration == cut_with_supervision_start01.supervisions[0].duration
+    assert mixed_vp.supervisions[0].end == cut_with_supervision_start01.supervisions[0].end
+    assert mixed_vp.supervisions[1].start == (cut_with_supervision_start01.duration +
+                                              cut_with_supervision_start01.supervisions[0].start)
+    assert mixed_vp.supervisions[1].duration == cut_with_supervision_start01.supervisions[0].duration
+    assert mixed_vp.supervisions[1].end == (cut_with_supervision_start01.duration +
+                                            cut_with_supervision_start01.supervisions[0].end)
+
+    cut_samples = mixed_vp.load_audio()
+    cut_with_supervision_start01_samples = cut_with_supervision_start01.load_audio()
+    assert (cut_samples.shape[0] == cut_with_supervision_start01_samples.shape[0] and
+            cut_samples.shape[1] == cut_with_supervision_start01_samples.shape[1] * 2)
+    np.testing.assert_array_almost_equal(
+        cut_samples,
+        np.hstack((cut_with_supervision_start01_samples, cut_with_supervision_start01_samples)) * 0.125
+    )
+
+
+def test_padding_cut_perturb_speed():
     cut = PaddingCut(id='cut', duration=5.75, sampling_rate=16000, feat_value=1e-10, num_samples=92000)
     cut_sp = cut.perturb_speed(1.1)
     assert cut_sp.num_samples == 83636
     assert cut_sp.duration == 5.22725
 
 
-def test_cut_set_perturb(cut_with_supervision, cut_with_supervision_start01):
+def test_padding_cut_perturb_volume():
+    cut = PaddingCut(id='cut', duration=5.75, sampling_rate=16000, feat_value=1e-10, num_samples=92000)
+    cut_vp = cut.perturb_volume(0.125)
+    assert cut_vp.num_samples == cut.num_samples
+    assert cut_vp.duration == cut.duration
+    np.testing.assert_array_almost_equal(cut_vp.load_audio(), cut.load_audio())
+
+
+def test_cut_set_perturb_speed(cut_with_supervision, cut_with_supervision_start01):
     cut_set = CutSet.from_cuts([cut_with_supervision, cut_with_supervision_start01])
     cs_sp = cut_set.perturb_speed(1.1)
     for cut_sp, cut in zip(cs_sp, cut_set):
@@ -248,13 +298,13 @@ def test_cut_set_perturb(cut_with_supervision, cut_with_supervision_start01):
 
 
 @pytest.fixture()
-def resampling_cuts(cut_with_supervision, cut_with_supervision_start01):
+def cut_set(cut_with_supervision, cut_with_supervision_start01):
     return CutSet.from_cuts([cut_with_supervision, cut_with_supervision_start01])
 
 
 @pytest.mark.parametrize('cut_id', ['cut', 'cut_start01'])
-def test_resample_cut(resampling_cuts, cut_id):
-    original = resampling_cuts[cut_id]
+def test_resample_cut(cut_set, cut_id):
+    original = cut_set[cut_id]
     resampled = original.resample(16000)
     assert original.sampling_rate == 8000
     assert resampled.sampling_rate == 16000
@@ -263,6 +313,31 @@ def test_resample_cut(resampling_cuts, cut_id):
     assert samples.shape[1] == resampled.num_samples
 
 
+@pytest.mark.parametrize('cut_id', ['cut', 'cut_start01'])
+@pytest.mark.parametrize('scale', [0.125, 2.])
+def test_cut_perturb_volume(cut_set, cut_id, scale):
+    cut = cut_set[cut_id]
+    cut_vp = cut.perturb_volume(scale)
+    assert cut_vp.start == cut.start
+    assert cut_vp.duration == cut.duration
+    assert cut_vp.end == cut.end
+    assert cut_vp.num_samples == cut.num_samples
+
+    assert cut_vp.recording.duration == cut.recording.duration
+    assert cut_vp.recording.num_samples == cut.recording.num_samples
+
+    assert cut_vp.supervisions[0].start == cut.supervisions[0].start
+    assert cut_vp.supervisions[0].duration == cut.supervisions[0].duration
+    assert cut_vp.supervisions[0].end == cut.supervisions[0].end
+
+    assert cut_vp.load_audio().shape == cut.load_audio().shape
+    assert cut_vp.recording.load_audio().shape == cut.recording.load_audio().shape
+
+    np.testing.assert_array_almost_equal(cut_vp.load_audio(), cut.load_audio() * scale)
+    np.testing.assert_array_almost_equal(cut_vp.recording.load_audio(), cut.recording.load_audio() * scale)
+
+
 def test_resample_padding_cut():
     original = PaddingCut(id='cut', duration=5.75, sampling_rate=16000, feat_value=1e-10, num_samples=92000)
     resampled = original.resample(8000)
@@ -283,9 +358,9 @@ def test_resample_mixed_cut(cut_with_supervision_start01):
 
 
 @pytest.mark.parametrize('affix_id', [True, False])
-def test_resample_cut_set(resampling_cuts, affix_id):
-    resampled_cs = resampling_cuts.resample(16000, affix_id=affix_id)
-    for original, resampled in zip(resampling_cuts, resampled_cs):
+def test_resample_cut_set(cut_set, affix_id):
+    resampled_cs = cut_set.resample(16000, affix_id=affix_id)
+    for original, resampled in zip(cut_set, resampled_cs):
         if affix_id:
             assert original.id != resampled.id
             assert resampled.id.endswith('_rs16000')
@@ -296,3 +371,19 @@ def test_resample_cut_set(resampling_cuts, affix_id):
             assert resampled.num_samples == 2 * original.num_samples
         samples = resampled.load_audio()
         assert samples.shape[1] == resampled.num_samples
+
+
+@pytest.mark.parametrize('scale', [0.125, 2.])
+@pytest.mark.parametrize('affix_id', [True, False])
+def test_cut_set_perturb_volume(cut_set, affix_id, scale):
+    perturbed_vp_cs = cut_set.perturb_volume(scale, affix_id=affix_id)
+    for original, perturbed_vp in zip(cut_set, perturbed_vp_cs):
+        if affix_id:
+            assert original.id != perturbed_vp.id
+            assert perturbed_vp.id.endswith(f'_vp{scale}')
+        else:
+            assert original.id == perturbed_vp.id
+        assert original.sampling_rate == perturbed_vp.sampling_rate
+        assert original.num_samples == perturbed_vp.num_samples
+        assert original.load_audio().shape == perturbed_vp.load_audio().shape
+        np.testing.assert_array_almost_equal(perturbed_vp.load_audio(), original.load_audio() * scale)
diff --git a/test/dataset/test_cut_transforms.py b/test/dataset/test_cut_transforms.py
index f875cedc9..7eba1c46a 100644
--- a/test/dataset/test_cut_transforms.py
+++ b/test/dataset/test_cut_transforms.py
@@ -6,7 +6,7 @@ from lhotse import CutSet
 from lhotse.cut import MixedCut
 from lhotse.dataset import CutMix, ExtraPadding
-from lhotse.dataset import PerturbSpeed
+from lhotse.dataset import PerturbSpeed, PerturbVolume
 from lhotse.testing.dummies import DummyManifest
 
 
@@ -14,7 +14,7 @@ def test_perturb_speed():
     tfnm = PerturbSpeed(factors=[0.9, 1.1], p=0.5, randgen=random.Random(42))
     cuts = DummyManifest(CutSet, begin_id=0, end_id=10)
     cuts_sp = tfnm(cuts)
-    print(set(c.duration for c in cuts_sp))
+
     assert all(
         # The duration will not be exactly 0.9 and 1.1 because perturb speed
         # will round to a physically-viable duration based on the sampling_rate
@@ -24,6 +24,20 @@ def test_perturb_speed():
     )
 
 
+def test_perturb_volume():
+    tfnm = PerturbVolume(factors=[0.125, 2.], p=0.5, randgen=random.Random(42))
+    cuts = DummyManifest(CutSet, begin_id=0, end_id=10)
+    cuts_vp = tfnm(cuts)
+
+    assert all(
+        cut.duration == 1. and
+        cut.start == 0. and
+        cut.recording.sampling_rate == 16000 and
+        cut.recording.num_samples == 16000 and
+        cut.recording.duration == 1.0 for cut in cuts_vp
+    )
+
+
 def test_cutmix():
     speech_cuts = DummyManifest(CutSet, begin_id=0, end_id=10)
     for c in speech_cuts:
diff --git a/test/known_issues/test_augment_with_executor.py b/test/known_issues/test_augment_with_executor.py
index d38b97d68..e0b074dfe 100644
--- a/test/known_issues/test_augment_with_executor.py
+++ b/test/known_issues/test_augment_with_executor.py
@@ -32,11 +32,40 @@ def test_wav_augment_with_executor(self, exec_type):
         cut = self.with_cut(sampling_rate=16000, num_samples=16000)
         with TemporaryDirectory() as d, \
                 exec_type(max_workers=4) as ex:
-            cut_set = CutSet.from_cuts(
+            cut_set_speed = CutSet.from_cuts(
                 cut.with_id(str(i)) for i in range(100)
             ).perturb_speed(1.1)  # perturb_speed uses torchaudio SoX effect that could hang
             # Just test that it runs and does not hang.
-            cut_set_feats = cut_set.compute_and_store_features(
+            cut_set_speed_feats = cut_set_speed.compute_and_store_features(
                 extractor=Fbank(),
                 storage_path=d,
                 executor=ex
             )
+
+    @pytest.mark.parametrize(
+        'exec_type',
+        [
+            # Multithreading works
+            ThreadPoolExecutor,
+            # Multiprocessing works, but only when using the "spawn" context (in testing)
+            pytest.param(
+                partial(ProcessPoolExecutor, mp_context=multiprocessing.get_context("spawn")),
+                marks=pytest.mark.skipif(
+                    sys.version_info[0] == 3 and sys.version_info[1] < 7,
+                    reason="The mp_context argument is introduced in Python 3.7"
+                )
+            ),
+        ]
+    )
+    def test_wav_augment_with_executor_volume(self, exec_type):
+        cut = self.with_cut(sampling_rate=16000, num_samples=16000)
+        with TemporaryDirectory() as d, \
+                exec_type(max_workers=4) as ex:
+            cut_set_volume = CutSet.from_cuts(
+                cut.with_id(str(i)) for i in range(100)
+            ).perturb_volume(0.125)  # perturb_volume uses torchaudio SoX effect that could hang
+            # Just test that it runs and does not hang.
+            cut_set_volume_feats = cut_set_volume.compute_and_store_features(
+                extractor=Fbank(),
+                storage_path=d,
+                executor=ex
diff --git a/test/test_recording_set.py b/test/test_recording_set.py
index f8ec5ab5f..8eddd8629 100644
--- a/test/test_recording_set.py
+++ b/test/test_recording_set.py
@@ -209,6 +209,28 @@ def test_recording_perturb_tempo(recording, factor, affix_id):
     assert samples.shape[1] == rec_sp.num_samples
 
 
+@pytest.mark.parametrize(
+    ['factor', 'affix_id'],
+    [
+        (1.0, True),
+        (1.0, False),
+        (0.125, True),
+        (0.125, False),
+        (2., True),
+        (2., False),
+    ]
+)
+def test_recording_perturb_volume(recording, factor, affix_id):
+    rec_vp = recording.perturb_volume(factor=factor, affix_id=affix_id)
+    if affix_id:
+        assert rec_vp.id == f'{recording.id}_vp{factor}'
+    else:
+        assert rec_vp.id == recording.id
+    samples = rec_vp.load_audio()
+    assert samples.shape[0] == rec_vp.num_channels
+    assert samples.shape[1] == rec_vp.num_samples
+
+
 def test_recording_set_perturb_speed(recording_set):
     recs_sp = recording_set.perturb_speed(factor=1.1)
     for r, r_sp in zip(recording_set, recs_sp):
@@ -223,6 +245,13 @@ def test_recording_set_perturb_tempo(recording_set):
         assert r.sampling_rate == r_tp.sampling_rate
 
 
+def test_recording_set_perturb_volume(recording_set):
+    recs_vp = recording_set.perturb_volume(factor=2.)
+    for r, r_vp in zip(recording_set, recs_vp):
+        assert r.duration == r_vp.duration
+        assert r.sampling_rate == r_vp.sampling_rate
+
+
 @pytest.mark.parametrize('sampling_rate', [8000, 16000, 22050, 32000, 44100, 48000])
 def test_recording_resample(recording, sampling_rate):
     rec_sp = recording.resample(sampling_rate)
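# Usage sketch (illustrative, not part of the patch): combining the new PerturbVolume batch
# transform with the existing PerturbSpeed one. All names below (PerturbSpeed, PerturbVolume,
# DummyManifest, CutSet) appear in this diff or in lhotse; only the composition itself is
# an assumption about how a training pipeline would wire them together.
import random

from lhotse import CutSet
from lhotse.dataset import PerturbSpeed, PerturbVolume
from lhotse.testing.dummies import DummyManifest

cuts = DummyManifest(CutSet, begin_id=0, end_id=10)
transforms = [
    PerturbSpeed(factors=[0.9, 1.1], p=0.5, randgen=random.Random(0)),
    PerturbVolume(factors=[0.125, 2.], p=0.5, randgen=random.Random(0)),
]

# Each transform returns a new CutSet whose recordings carry an extra lazy sox effect;
# the audio itself is only modified later, when load_audio() is eventually called.
for tfnm in transforms:
    cuts = tfnm(cuts)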