Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add chunked sph file processing #367

Merged
Merged
52 changes: 49 additions & 3 deletions lhotse/audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from typing import Any, Callable, Dict, Iterable, List, Mapping, NamedTuple, Optional, Sequence, Tuple, Union

import numpy as np
import soundfile as sf
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please stick to a local import, otherwise building the docs is going to fail (the import tries to load libsndfile.so into memory which is not available on read-the-docs servers).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, sorry, will move it back

from tqdm.auto import tqdm

from lhotse.augmentation import AudioTransform, Resample, Speed
Expand Down Expand Up @@ -231,8 +232,7 @@ def from_file(
else:
try:
# Try to parse the file using pysoundfile first.
import soundfile
info = soundfile.info(str(path))
info = sf.info(str(path))
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a missing case for using `sph_info`?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right, good catch.

except:
# Try to parse the file using audioread as a fallback.
info = audioread_info(str(path))
Expand Down Expand Up @@ -815,8 +815,13 @@ def read_audio(
duration=duration,
force_opus_sampling_rate=force_opus_sampling_rate,
)
elif isinstance(path_or_fd, (str, Path)) and str(path_or_fd).lower().endswith('.sph'):
return read_sph(
path_or_fd,
offset=offset,
duration=duration
)
try:
import soundfile as sf
with sf.SoundFile(path_or_fd) as sf_desc:
sampling_rate = sf_desc.samplerate
if offset > 0:
Expand Down Expand Up @@ -1102,3 +1107,44 @@ def parse_channel_from_ffmpeg_output(ffmpeg_stderr: bytes) -> str:
f"Could not determine the number of channels for OPUS file from the following ffmpeg output "
f"(shown as bytestring due to avoid possible encoding issues):\n{str(ffmpeg_stderr)}"
)


def sph_info(path: Pathlike) -> LibsndfileCompatibleAudioInfo:
    """
    Describe an SPH file with the same fields that libsndfile's ``info`` provides.

    The metadata is obtained by decoding the whole file with :func:`read_sph`
    and inspecting the resulting array, so the cost grows with the recording length.

    :param path: path to the SPH file.
    :return: the channel count, frame count, sampling rate, and duration in seconds.
    """
    audio, rate = read_sph(path)
    # ``read_sph`` returns the samples with channels on the first axis.
    num_channels = audio.shape[0]
    num_frames = audio.shape[1]
    return LibsndfileCompatibleAudioInfo(
        channels=num_channels,
        frames=num_frames,
        samplerate=rate,
        duration=num_frames / rate,
    )


def read_sph(
    sph_path: Pathlike,
    offset: Seconds = 0.0,
    duration: Optional[Seconds] = None
) -> Tuple[np.ndarray, int]:
    """
    Reads SPH files using sph2pipe in a subprocess.
    Unlike audioread, correctly supports offsets and durations for reading short chunks.

    :param sph_path: path to the SPH file.
    :param offset: where the chunk starts, in seconds from the beginning of the file.
    :param duration: length of the chunk in seconds; ``None`` reads until the end of the file.
    :return: a tuple of audio samples and the sampling rate. The samples are a float32
        array with channels on the first axis, i.e. ``(num_channels, num_samples)``.
    :raises RuntimeError: when sph2pipe cannot be started or exits with a non-zero status.
    """
    # Keep the import local: a module-level import loads libsndfile as soon as
    # this module is imported, which fails where the library is not installed
    # (e.g. when building the docs).
    import soundfile as sf

    sph_path = Path(sph_path)

    # The time range is "<start>:<end>"; leaving <end> empty reads to the end of the file.
    time_range = f'{offset}:'
    if duration is not None:
        time_range += f'{round(offset + duration, 5)}'

    # Pass the arguments as a list (no shell) so that paths containing spaces
    # or shell metacharacters reach sph2pipe unchanged.
    cmd = ['sph2pipe', '-f', 'wav', '-p', '-t', time_range, str(sph_path)]

    # Actual audio reading.
    try:
        proc = run(cmd, stdout=PIPE, stderr=PIPE)
    except FileNotFoundError as e:
        raise RuntimeError(
            "Could not run sph2pipe: the executable was not found on PATH."
        ) from e
    if proc.returncode != 0:
        # Surface the tool's own message instead of letting an empty stdout
        # fail later as an unrelated-looking decoding error.
        raise RuntimeError(
            f"sph2pipe failed with exit code {proc.returncode} for file {sph_path}: "
            f"{proc.stderr.decode(errors='replace').strip()}"
        )

    with sf.SoundFile(BytesIO(proc.stdout)) as sf_desc:
        audio, sampling_rate = sf_desc.read(dtype=np.float32), sf_desc.samplerate
        # Mono is returned as a flat array and multi-channel as (samples, channels);
        # normalise both to (channels, samples).
        audio = audio.reshape(1, -1) if sf_desc.channels == 1 else audio.T

    return audio, sampling_rate