Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/fisher spanish #376

Merged
merged 3 commits into from
Aug 16, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/corpus.rst
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@ a CLI tool that create the manifests given a corpus directory.
- :func:`lhotse.recipes.prepare_broadcast_news`
* - Fisher English Part 1, 2
- :func:`lhotse.recipes.prepare_fisher_english`
* - Fisher Spanish
- :func:`lhotse.recipes.prepare_fisher_spanish`
* - GALE Arabic Broadcast Speech
- :func:`lhotse.recipes.prepare_gale_arabic`
* - GALE Mandarin Broadcast Speech
Expand Down
1 change: 1 addition & 0 deletions lhotse/bin/modes/recipes/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from .cslu_kids import *
from .dihard3 import *
from .fisher_english import *
from .fisher_spanish import *
from .gale_arabic import *
from .gale_mandarin import *
from .gigaspeech import *
Expand Down
34 changes: 34 additions & 0 deletions lhotse/bin/modes/recipes/fisher_spanish.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import click

from lhotse.bin.modes import prepare
from lhotse.recipes import prepare_fisher_spanish
from lhotse.utils import Pathlike


@prepare.command(context_settings=dict(show_default=True))
@click.argument('audio-dir', type=click.Path(exists=True, file_okay=False))
@click.argument('transcript-dir', type=click.Path(exists=True, file_okay=False))
@click.argument('output-dir', type=click.Path())
@click.option('--absolute-paths', default=False,
              help='Whether to return absolute or relative (to the corpus dir) paths for recordings.')
def fisher_spanish(
        audio_dir: Pathlike,
        transcript_dir: Pathlike,
        output_dir: Pathlike,
        absolute_paths: bool
):
    """
    The Fisher Spanish corpus preparation.

    \b
    This is conversational telephone speech collected as 2-channel μ-law, 8kHz-sampled data.
    The catalog number LDC2010S01 for audio corpus and LDC2010T04 for transcripts.

    This data is not available for free - your institution needs to have an LDC subscription.
    """
    # NOTE: click derives the sub-command name from the function name, so this
    # function must be called ``fisher_spanish`` (-> ``fisher-spanish``).
    # Naming it ``fisher_english`` would register a second ``fisher-english``
    # command on the ``prepare`` group and clash with the Fisher English recipe.
    #
    # The CLI only forwards its arguments; all the work happens in the recipe.
    prepare_fisher_spanish(
        audio_dir_path=audio_dir,
        transcript_dir_path=transcript_dir,
        output_dir=output_dir,
        absolute_paths=absolute_paths
    )
1 change: 1 addition & 0 deletions lhotse/recipes/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from .cslu_kids import prepare_cslu_kids
from .dihard3 import prepare_dihard3
from .fisher_english import prepare_fisher_english
from .fisher_spanish import prepare_fisher_spanish
from .gale_arabic import prepare_gale_arabic
from .gale_mandarin import prepare_gale_mandarin
from .gigaspeech import prepare_gigaspeech
Expand Down
11 changes: 4 additions & 7 deletions lhotse/recipes/fisher_english.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,12 +72,11 @@ def create_supervision(

def walk_dirs_parallel(dirs: List[Pathlike], pattern: str, pbar_desc: str) -> List[Path]:

get_path_inputs = [(Path(dir_path), pattern) for dir_path in dirs]
output_paths = [None] * len(dirs)
njobs = min(len(dirs), os.cpu_count() * 4)
with ThreadPoolExecutor(njobs) as executor:
with tqdm(total=len(output_paths)) as pbar:
pbar.set_description(pbar_desc)
get_path_inputs = [(Path(dir_path), pattern) for dir_path in dirs]
with tqdm(total=len(get_path_inputs), desc=pbar_desc) as pbar:
for k, tmp_output_paths in enumerate(executor.map(get_paths, get_path_inputs)):
output_paths[k] = tmp_output_paths
pbar.update()
Expand Down Expand Up @@ -143,8 +142,7 @@ def prepare_fisher_english(
create_recordings_input = [(p, None if absolute_paths else 5) for p in audio_paths]
recordings = [None] * len(audio_paths)
with ThreadPoolExecutor(os.cpu_count() * 4) as executor:
with tqdm(total=len(audio_paths)) as pbar:
pbar.set_description('Collect recordings')
with tqdm(total=len(create_recordings_input), desc='Collect recordings') as pbar:
for i, reco in enumerate(executor.map(create_recording, create_recordings_input)):
recordings[i] = reco
pbar.update()
Expand All @@ -154,8 +152,7 @@ def prepare_fisher_english(
create_supervisions_input = [(sessions, p) for p in transcript_paths]
supervisions = [None] * len(create_supervisions_input)
with ThreadPoolExecutor(os.cpu_count() * 4) as executor:
with tqdm(total=len(create_supervisions_input)) as pbar:
pbar.set_description('Create supervisions')
with tqdm(total=len(create_supervisions_input), desc='Create supervisions') as pbar:
for i, tmp_supervisions in enumerate(executor.map(create_supervision, create_supervisions_input)):
supervisions[i] = tmp_supervisions
pbar.update()
Expand Down
119 changes: 119 additions & 0 deletions lhotse/recipes/fisher_spanish.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
"""
About the Fisher Spanish corpus

This is conversational telephone speech collected as 2-channel μ-law, 8kHz-sampled data.
The LDC catalog numbers are LDC2010S01 for the audio corpus and LDC2010T04 for the transcripts.

This data is not available for free - your institution needs to have an LDC subscription.
"""

import codecs
import os
import itertools as it

from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from tqdm.auto import tqdm
from typing import Dict, List, Tuple, Optional, Union

from lhotse.audio import RecordingSet
from lhotse.qa import fix_manifests, validate_recordings_and_supervisions
from lhotse.recipes.fisher_english import create_recording
from lhotse.supervision import SupervisionSegment, SupervisionSet
from lhotse.utils import Pathlike, check_and_rglob


def create_supervision(
        sessions_and_transcript_path: Tuple[Dict[str, Dict[str, str]], Pathlike]
) -> List[SupervisionSegment]:
    """
    Build the supervision segments described by a single ``.tdf`` transcript file.

    :param sessions_and_transcript_path: A ``(sessions, transcript_path)`` pair, packed
        into one tuple so the function can be used with ``Executor.map``.
        ``sessions`` is indexed first by the third ``_``-separated token of the
        transcript file stem and then by the integer channel to obtain the speaker.
    :return: One ``SupervisionSegment`` per non-blank transcript row, in file order.
    """
    sessions, transcript_path = sessions_and_transcript_path
    transcript_path = Path(transcript_path)

    with codecs.open(transcript_path, 'r', 'utf8') as trans_f:
        # The first three lines of the file are skipped (not transcript rows).
        raw_rows = [raw.rstrip('\n') for raw in trans_f.readlines()[3:]]

    # Phase 1: parse every non-blank, tab-separated row into
    # (start, end, channel, text). Columns used: 1 = channel, 2 = start,
    # 3 = end, 7 = transcript text (whitespace runs collapsed to one space).
    parsed = []
    for raw in raw_rows:
        if raw.strip() == '':
            continue
        fields = raw.split('\t')
        begin = float(fields[2])
        end = float(fields[3])
        channel = int(fields[1])
        text = ' '.join(fields[7].split())
        parsed.append((begin, end, channel, text))

    # Phase 2: turn the parsed rows into segments. Segment ids are the file
    # stem plus a zero-padded running index wide enough for the row count.
    index_width = len(str(len(parsed)))
    segments = []
    for idx, (begin, end, channel, text) in enumerate(parsed):
        stem = transcript_path.stem
        segments.append(
            SupervisionSegment(
                id=stem + '-' + str(idx).zfill(index_width),
                recording_id=stem,
                start=round(begin, 10),
                duration=round(end - begin, 10),
                channel=channel,
                text=text,
                language='Spanish',
                speaker=sessions[stem.split('_')[2]][channel]
            )
        )

    return segments


def _map_with_progress(fn, inputs: List, desc: str) -> List:
    """Apply ``fn`` to each item of ``inputs`` in a thread pool, with a progress bar.

    Results are returned in the same order as ``inputs``.
    """
    # os.cpu_count() returns None when the CPU count cannot be determined;
    # fall back to 1 so the worker count is always a positive integer.
    num_workers = (os.cpu_count() or 1) * 4
    results = []
    with ThreadPoolExecutor(num_workers) as executor:
        with tqdm(total=len(inputs), desc=desc) as pbar:
            for result in executor.map(fn, inputs):
                results.append(result)
                pbar.update()
    return results


def prepare_fisher_spanish(
        audio_dir_path: Pathlike,
        transcript_dir_path: Pathlike,
        output_dir: Optional[Pathlike] = None,
        absolute_paths: bool = False
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepares manifests for Fisher Spanish.
    We create two manifests: one with recordings, and the other one with text supervisions.

    :param audio_dir_path: Path to audio directory (usually LDC2010S01).
    :param transcript_dir_path: Path to transcript directory (usually LDC2010T04).
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param absolute_paths: Whether to return absolute or relative (to the corpus dir) paths for recordings.
    :return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``.
    :raises AssertionError: If the numbers of audio files, transcript files and
        session table rows differ.
    """
    audio_dir_path, transcript_dir_path = Path(audio_dir_path), Path(transcript_dir_path)

    audio_paths = check_and_rglob(audio_dir_path, '*.sph')
    transcript_paths = check_and_rglob(transcript_dir_path, '*.tdf')

    # The call table is a comma-separated file with one header line. For each
    # call (column 0) we keep columns 2 and 8, which are later looked up by
    # channel (0 / 1) to obtain the speaker of a supervision segment.
    sessions_data_path = check_and_rglob(transcript_dir_path, '*_call.tbl')[0]
    with codecs.open(sessions_data_path, 'r', 'utf8') as sessions_data_f:
        session_rows = [
            line.rstrip('\n').split(',')
            for line in sessions_data_f.readlines()[1:]
            # Skip blank lines (e.g. a trailing newline at end of file), which
            # would otherwise fail with IndexError on the column lookups below.
            if line.strip()
        ]
    sessions = {row[0]: {0: row[2], 1: row[8]} for row in session_rows}

    assert len(transcript_paths) == len(sessions) == len(audio_paths), (
        f'Mismatched corpus contents: {len(audio_paths)} audio files, '
        f'{len(transcript_paths)} transcript files, {len(sessions)} sessions.'
    )

    # The second tuple element is forwarded to ``create_recording``:
    # None when absolute paths are requested, otherwise 4.
    # NOTE(review): presumably the number of trailing path components kept for
    # the relative path - confirm against ``create_recording``.
    create_recordings_input = [(p, None if absolute_paths else 4) for p in audio_paths]
    recordings = RecordingSet.from_recordings(
        _map_with_progress(create_recording, create_recordings_input, 'Collect recordings')
    )

    create_supervisions_input = [(sessions, p) for p in transcript_paths]
    supervisions_per_file = _map_with_progress(
        create_supervision, create_supervisions_input, 'Create supervisions'
    )
    # Flatten the per-transcript lists and drop segments with non-positive duration.
    supervisions = SupervisionSet.from_segments(
        it.chain.from_iterable(supervisions_per_file)
    ).filter(lambda s: s.duration > 0.)

    recordings, supervisions = fix_manifests(recordings, supervisions)
    validate_recordings_and_supervisions(recordings, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        recordings.to_json(output_dir / 'recordings.json')
        supervisions.to_json(output_dir / 'supervisions.json')

    return {
        'recordings': recordings,
        'supervisions': supervisions
    }