From 9c1330a8523a2cc28c5d983a1d59ae4d4b05f117 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BD=AD=E9=9C=87=E4=B8=9C?= <275331498@qq.com> Date: Sat, 23 Nov 2024 08:41:47 +0800 Subject: [PATCH] change max_frames to max_duration in docs (#1419) * change max_frames to max_duration in docs * minor fix --- docs/datasets.rst | 8 ++++---- lhotse/cut/data.py | 2 +- lhotse/cut/mixed.py | 2 +- lhotse/cut/padding.py | 2 +- lhotse/cut/set.py | 2 +- lhotse/dataset/audio_tagging.py | 2 +- lhotse/dataset/sampling/bucketing.py | 4 ++-- lhotse/dataset/sampling/cut_pairs.py | 8 ++++---- lhotse/dataset/sampling/dynamic.py | 2 +- lhotse/dataset/sampling/simple.py | 12 ++++++------ lhotse/dataset/sampling/weighted_simple.py | 2 +- lhotse/dataset/speech_recognition.py | 2 +- lhotse/dataset/speech_translation.py | 2 +- lhotse/dataset/surt.py | 2 +- 14 files changed, 26 insertions(+), 26 deletions(-) diff --git a/docs/datasets.rst b/docs/datasets.rst index 03df15ea1..d3609e955 100644 --- a/docs/datasets.rst +++ b/docs/datasets.rst @@ -28,7 +28,7 @@ It allows for interesting collation methods - e.g. **padding the speech with noi The items for mini-batch creation are selected by the ``Sampler``. Lhotse defines ``Sampler`` classes that are initialized with :class:`~lhotse.cut.CutSet`'s, so that they can look up specific properties of an utterance to stratify the sampling. -For example, :class:`~lhotse.dataset.sampling.SimpleCutSampler` has a defined ``max_frames`` attribute, and it will keep sampling cuts for a batch until they do not exceed the specified number of frames. +For example, :class:`~lhotse.dataset.sampling.SimpleCutSampler` has a defined ``max_duration`` attribute, and it will keep sampling cuts for a batch until they do not exceed the specified number of seconds. Another strategy — used in :class:`~lhotse.dataset.sampling.BucketingSampler` — will first group the cuts of similar durations into buckets, and then randomly select a bucket to draw the whole batch from. For tasks where both input and output of the model are speech utterances, we can use the :class:`~lhotse.dataset.sampling.CutPairsSampler`, which accepts two :class:`~lhotse.cut.CutSet`'s and will match the cuts in them by their IDs. @@ -38,11 +38,11 @@ A typical Lhotse's dataset API usage might look like this: .. code-block:: from torch.utils.data import DataLoader - from lhotse.dataset import SpeechRecognitionDataset, SimpleCutSampler + from lhotse.dataset import K2SpeechRecognitionDataset, SimpleCutSampler cuts = CutSet(...) - dset = SpeechRecognitionDataset(cuts) - sampler = SimpleCutSampler(cuts, max_frames=50000) + dset = K2SpeechRecognitionDataset(cuts) + sampler = SimpleCutSampler(cuts, max_duration=500) # Dataset performs batching by itself, so we have to indicate that # to the DataLoader with batch_size=None dloader = DataLoader(dset, sampler=sampler, batch_size=None, num_workers=1) diff --git a/lhotse/cut/data.py b/lhotse/cut/data.py index ad47ca381..a939db5a2 100644 --- a/lhotse/cut/data.py +++ b/lhotse/cut/data.py @@ -723,7 +723,7 @@ def pad( """ Return a new MixedCut, padded with zeros in the recording, and ``pad_feat_value`` in each feature bin. - The user can choose to pad either to a specific `duration`; a specific number of frames `max_frames`; + The user can choose to pad either to a specific `duration`; a specific number of frames `num_frames`; or a specific number of samples `num_samples`. The three arguments are mutually exclusive. :param duration: The cut's minimal duration after padding. 
diff --git a/lhotse/cut/mixed.py b/lhotse/cut/mixed.py index 01acf248d..cd83d29e0 100644 --- a/lhotse/cut/mixed.py +++ b/lhotse/cut/mixed.py @@ -622,7 +622,7 @@ def pad( """ Return a new MixedCut, padded with zeros in the recording, and ``pad_feat_value`` in each feature bin. - The user can choose to pad either to a specific `duration`; a specific number of frames `max_frames`; + The user can choose to pad either to a specific `duration`; a specific number of frames `num_frames`; or a specific number of samples `num_samples`. The three arguments are mutually exclusive. :param duration: The cut's minimal duration after padding. diff --git a/lhotse/cut/padding.py b/lhotse/cut/padding.py index c535bde2b..a95be6062 100644 --- a/lhotse/cut/padding.py +++ b/lhotse/cut/padding.py @@ -236,7 +236,7 @@ def pad( """ Return a new MixedCut, padded with zeros in the recording, and ``pad_feat_value`` in each feature bin. - The user can choose to pad either to a specific `duration`; a specific number of frames `max_frames`; + The user can choose to pad either to a specific `duration`; a specific number of frames `num_frames`; or a specific number of samples `num_samples`. The three arguments are mutually exclusive. :param duration: The cut's minimal duration after padding. diff --git a/lhotse/cut/set.py b/lhotse/cut/set.py index 2a7afd16c..5a62ba21c 100644 --- a/lhotse/cut/set.py +++ b/lhotse/cut/set.py @@ -2821,7 +2821,7 @@ def pad( """ Return a new MixedCut, padded with zeros in the recording, and ``pad_feat_value`` in each feature bin. - The user can choose to pad either to a specific `duration`; a specific number of frames `max_frames`; + The user can choose to pad either to a specific `duration`; a specific number of frames `num_frames`; or a specific number of samples `num_samples`. The three arguments are mutually exclusive. :param cut: DataCut to be padded. diff --git a/lhotse/dataset/audio_tagging.py b/lhotse/dataset/audio_tagging.py index 0ca44a687..fbf370fd6 100644 --- a/lhotse/dataset/audio_tagging.py +++ b/lhotse/dataset/audio_tagging.py @@ -78,7 +78,7 @@ def __init__( def __getitem__(self, cuts: CutSet) -> Dict[str, Union[torch.Tensor, List[str]]]: """ Return a new batch, with the batch size automatically determined using the constraints - of max_frames and max_cuts. + of max_duration and max_cuts. """ self.hdf5_fix.update() diff --git a/lhotse/dataset/sampling/bucketing.py b/lhotse/dataset/sampling/bucketing.py index dd53551cc..b869185b6 100644 --- a/lhotse/dataset/sampling/bucketing.py +++ b/lhotse/dataset/sampling/bucketing.py @@ -30,7 +30,7 @@ class BucketingSampler(CutSampler): ... # BucketingSampler specific args ... sampler_type=SimpleCutSampler, num_buckets=20, ... # Args passed into SimpleCutSampler - ... max_frames=20000 + ... max_duration=200 ... ) Bucketing sampler with 20 buckets, sampling pairs of source-target cuts:: @@ -40,7 +40,7 @@ class BucketingSampler(CutSampler): ... # BucketingSampler specific args ... sampler_type=CutPairsSampler, num_buckets=20, ... # Args passed into CutPairsSampler - ... max_source_frames=20000, max_target_frames=15000 + ... max_source_duration=200, max_target_duration=150 ... ) """ diff --git a/lhotse/dataset/sampling/cut_pairs.py b/lhotse/dataset/sampling/cut_pairs.py index 1582158d2..cd13353d8 100644 --- a/lhotse/dataset/sampling/cut_pairs.py +++ b/lhotse/dataset/sampling/cut_pairs.py @@ -12,10 +12,10 @@ class CutPairsSampler(CutSampler): It expects that both CutSet's strictly consist of Cuts with corresponding IDs. 
It behaves like an iterable that yields lists of strings (cut IDs). - When one of :attr:`max_frames`, :attr:`max_samples`, or :attr:`max_duration` is specified, + When one of :attr:`max_source_duration`, :attr:`max_target_duration`, or :attr:`max_cuts` is specified, the batch size is dynamic. Exactly zero or one of those constraints can be specified. - Padding required to collate the batch does not contribute to max frames/samples/duration. + Padding required to collate the batch does not contribute to max source_duration/target_duration. """ def __init__( @@ -229,7 +229,7 @@ def _next_batch(self) -> Tuple[CutSet, CutSet]: self.source_constraints.add(next_source_cut) self.target_constraints.add(next_target_cut) - # Did we exceed the max_source_frames and max_cuts constraints? + # Did we exceed the max_source_duration and max_cuts constraints? if ( not self.source_constraints.exceeded() and not self.target_constraints.exceeded() @@ -249,7 +249,7 @@ def _next_batch(self) -> Tuple[CutSet, CutSet]: # and return the cut anyway. warnings.warn( "The first cut drawn in batch collection violates one of the max_... constraints" - "we'll return it anyway. Consider increasing max_source_frames/max_cuts/etc." + "we'll return it anyway. Consider increasing max_source_duration/max_cuts/etc." ) source_cuts.append(next_source_cut) target_cuts.append(next_target_cut) diff --git a/lhotse/dataset/sampling/dynamic.py b/lhotse/dataset/sampling/dynamic.py index 2d36b4130..dc5858010 100644 --- a/lhotse/dataset/sampling/dynamic.py +++ b/lhotse/dataset/sampling/dynamic.py @@ -335,7 +335,7 @@ def detuplify( else next_cut_or_tpl ) - # Did we exceed the max_frames and max_cuts constraints? + # Did we exceed the max_duration and max_cuts constraints? if self.constraint.close_to_exceeding(): # Yes. Finish sampling this batch. if self.constraint.exceeded() and len(cuts) == 1: diff --git a/lhotse/dataset/sampling/simple.py b/lhotse/dataset/sampling/simple.py index 66b56dae2..a8ca079c4 100644 --- a/lhotse/dataset/sampling/simple.py +++ b/lhotse/dataset/sampling/simple.py @@ -11,10 +11,10 @@ class SimpleCutSampler(CutSampler): Samples cuts from a CutSet to satisfy the input constraints. It behaves like an iterable that yields lists of strings (cut IDs). - When one of :attr:`max_frames`, :attr:`max_samples`, or :attr:`max_duration` is specified, + When one of :attr:`max_duration`, or :attr:`max_cuts` is specified, the batch size is dynamic. Exactly zero or one of those constraints can be specified. - Padding required to collate the batch does not contribute to max frames/samples/duration. + Padding required to collate the batch does not contribute to max duration. Example usage:: @@ -197,10 +197,10 @@ def _next_batch(self) -> CutSet: self.diagnostics.discard_single(next_cut) continue - # Track the duration/frames/etc. constraints. + # Track the duration/etc. constraints. self.time_constraint.add(next_cut) - # Did we exceed the max_frames and max_cuts constraints? + # Did we exceed the max_duration and max_cuts constraints? if not self.time_constraint.exceeded(): # No - add the next cut to the batch, and keep trying. cuts.append(next_cut) @@ -215,9 +215,9 @@ def _next_batch(self) -> CutSet: # and return the cut anyway. warnings.warn( "The first cut drawn in batch collection violates " - "the max_frames, max_cuts, or max_duration constraints - " + "the max_duration, or max_cuts constraints - " "we'll return it anyway. " - "Consider increasing max_frames/max_cuts/max_duration." + "Consider increasing max_duration/max_cuts." 
) cuts.append(next_cut) diff --git a/lhotse/dataset/sampling/weighted_simple.py b/lhotse/dataset/sampling/weighted_simple.py index 7c3f76034..4a3191b02 100644 --- a/lhotse/dataset/sampling/weighted_simple.py +++ b/lhotse/dataset/sampling/weighted_simple.py @@ -15,7 +15,7 @@ class WeightedSimpleCutSampler(SimpleCutSampler): When performing sampling, it avoids having duplicated cuts in the same batch. The sampler terminates if the number of sampled cuts reach :attr:`num_samples` - When one of :attr:`max_frames`, :attr:`max_samples`, or :attr:`max_duration` is specified, + When one of :attr:`max_duration`, or :attr:`max_cuts` is specified, the batch size is dynamic. Example usage: diff --git a/lhotse/dataset/speech_recognition.py b/lhotse/dataset/speech_recognition.py index 4c9919f99..4a3520b37 100644 --- a/lhotse/dataset/speech_recognition.py +++ b/lhotse/dataset/speech_recognition.py @@ -94,7 +94,7 @@ def __init__( def __getitem__(self, cuts: CutSet) -> Dict[str, Union[torch.Tensor, List[str]]]: """ Return a new batch, with the batch size automatically determined using the constraints - of max_frames and max_cuts. + of max_duration and max_cuts. """ validate_for_asr(cuts) diff --git a/lhotse/dataset/speech_translation.py b/lhotse/dataset/speech_translation.py index 672d27069..1def4475b 100644 --- a/lhotse/dataset/speech_translation.py +++ b/lhotse/dataset/speech_translation.py @@ -97,7 +97,7 @@ def __init__( def __getitem__(self, cuts: CutSet) -> Dict[str, Union[torch.Tensor, List[str]]]: """ Return a new batch, with the batch size automatically determined using the constraints - of max_frames and max_cuts. + of max_duration and max_cuts. """ validate_for_asr(cuts) self.hdf5_fix.update() diff --git a/lhotse/dataset/surt.py b/lhotse/dataset/surt.py index 8eda83b5f..5e424353c 100644 --- a/lhotse/dataset/surt.py +++ b/lhotse/dataset/surt.py @@ -170,7 +170,7 @@ def __init__( def __getitem__(self, cuts: CutSet) -> Dict[str, Union[torch.Tensor, List[str]]]: """ Return a new batch, with the batch size automatically determined using the constraints - of max_frames and max_cuts. + of max_duration and max_cuts. """ validate_for_asr(cuts)
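
As a usage reference for the renamed constraint, the sketch below constructs the samplers touched by this patch with their duration-based limits. It follows the examples already shown in docs/datasets.rst and the BucketingSampler docstring; the manifest paths and numeric limits are placeholders rather than values taken from the patch::

    from lhotse import CutSet
    from lhotse.dataset.sampling import (
        BucketingSampler,
        CutPairsSampler,
        SimpleCutSampler,
    )

    # Placeholder manifests -- substitute your own CutSets.
    cuts = CutSet.from_file("data/train_cuts.jsonl.gz")
    source_cuts = CutSet.from_file("data/train_source_cuts.jsonl.gz")
    target_cuts = CutSet.from_file("data/train_target_cuts.jsonl.gz")

    # Dynamic batch size: cuts are accumulated until adding another one
    # would exceed max_duration (total seconds of audio in the batch).
    simple = SimpleCutSampler(cuts, max_duration=200.0, shuffle=True)

    # Bucketing first groups cuts of similar duration, then draws each
    # batch from a single bucket; max_duration is forwarded to the
    # per-bucket SimpleCutSampler.
    bucketing = BucketingSampler(
        cuts,
        sampler_type=SimpleCutSampler,
        num_buckets=20,
        max_duration=200.0,
    )

    # Paired source/target cuts matched by ID (e.g. for tasks where both
    # input and output are speech), with a separate limit for each side.
    pairs = CutPairsSampler(
        source_cuts,
        target_cuts,
        max_source_duration=200.0,
        max_target_duration=150.0,
    )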
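
The pad() docstring fixes above replace max_frames with num_frames; for reference, this is how the three mutually exclusive padding targets are selected (the manifest path is a placeholder, and padding by num_frames or num_samples assumes the cut has precomputed features or a recording, respectively)::

    from lhotse import CutSet

    cuts = CutSet.from_file("data/train_cuts.jsonl.gz")  # placeholder path
    cut = next(iter(cuts))

    # Exactly one of `duration`, `num_frames`, `num_samples` may be given;
    # the cut is returned unchanged if it already meets the requested size.
    padded_by_time = cut.pad(duration=10.0)           # at least 10 seconds
    padded_by_frames = cut.pad(num_frames=1000)       # at least 1000 feature frames
    padded_by_samples = cut.pad(num_samples=160_000)  # at least 160k audio samples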