binarize optimizations
config as dict

feat(metric): add detection, precision, and recall diarization metrics (pyannote#1808)

fix: ensure round number of frames

fix: update pipeline parameters in separation tutorial (pyannote#1829)

setup: update torch_audiomentations, pyannote.database, pyannote.metrics and typer dependencies

BREAKING: fix `cache_dir` support, rename `use_auth_token` to `token`, and other changes

feat: add segmentation error rate (pyannote#1836)

doc: improve HF-related docstring

feat: add CLI to apply (and benchmark) pretrained pipelines

binarize annotation optimizations

pass checkpoint as a dict

Squashed commit of the following:

commit 2ccac02
Author: benniekiss <63211101+benniekiss@users.noreply.github.com>
Date:   Wed Jan 29 17:48:15 2025 -0500

    pass checkpoint as a dict

Squashed commit of the following:

commit 0f3b2fe
Author: benniekiss <63211101+benniekiss@users.noreply.github.com>
Date:   Wed Jan 29 17:58:10 2025 -0500

    don't rename tracks when generating annotation

    * let the user decide how to rename tracks, if necessary
    * reduces a costly step for long audio files

commit 649c060
Author: benniekiss <63211101+benniekiss@users.noreply.github.com>
Date:   Wed Sep 11 10:31:20 2024 -0400

    conditionally use optimized method

pass checkpoint as a dict

cleanup
hbredin authored and benniekiss committed Feb 3, 2025
1 parent 9b079e8 commit d98842b
Showing 30 changed files with 768 additions and 584 deletions.
7 changes: 6 additions & 1 deletion CHANGELOG.md
@@ -10,14 +10,17 @@ Models can now be stored alongside their pipelines in the same repository, strea…
 - accept `pyannote/speaker-diarization-x.x` pipeline user agreement
 - ~~accept `pyannote/segmentation-3.0` model user agreement~~
 - ~~accept `pyannote/wespeaker-voxceleb-resnet34-LM` model user agreement~~
-- load pipeline with `Pipeline.from_pretrained("pyannote/speaker-diarization-3.1", use_auth_token=True)`
+- load pipeline with `Pipeline.from_pretrained("pyannote/speaker-diarization-3.1", token=True)`
 
 #### Improve speech separation quality
 
 Clipping and speaker/source alignment issues in speech separation pipeline have been fixed.
 
 ### Breaking changes
 
+- BREAKING(hub): rename `use_auth_token` to `token`
+- BREAKING(cache): rely on `huggingface_hub` caching directory (`PYANNOTE_CACHE` is no longer used)
+- BREAKING(inference): `Inference` now only supports already instantiated models
 - BREAKING(task): drop support for `multilabel` training in `SpeakerDiarization` task
 - BREAKING(task): drop support for `warm_up` option in `SpeakerDiarization` task
 - BREAKING(task): drop support for `weigh_by_cardinality` option in `SpeakerDiarization` task
@@ -32,6 +35,8 @@ Clipping and speaker/source alignment issues in speech separation pipeline have…
 - feat(utils): add `hidden` option to `ProgressHook`
 - feat(utils): add `FilterByNumberOfSpeakers` protocol files filter
 - feat(core): add `Calibration` class to calibrate logits/distances into probabilities
+- feat(metric): add `DetectionErrorRate`, `SegmentationErrorRate`, `DiarizationPrecision`, and `DiarizationRecall` metrics
+- feat(cli): add CLI to apply (and benchmark) pretrained pipelines

### Improvements

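To make the `use_auth_token` → `token` rename recorded above concrete, here is a minimal before/after sketch; the pipeline name comes from the changelog, and the token string is a placeholder rather than a real credential:

```python
from pyannote.audio import Pipeline

# before this release (pyannote.audio 3.x):
# pipeline = Pipeline.from_pretrained(
#     "pyannote/speaker-diarization-3.1",
#     use_auth_token="HF_TOKEN_PLACEHOLDER")

# after this release, the keyword is `token`:
pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    token="HF_TOKEN_PLACEHOLDER")
```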
2 changes: 1 addition & 1 deletion README.md
@@ -20,7 +20,7 @@ Consider switching to [pyannoteAI](https://www.pyannote.ai) for better and faste…
 from pyannote.audio import Pipeline
 pipeline = Pipeline.from_pretrained(
     "pyannote/speaker-diarization-3.1",
-    use_auth_token="HUGGINGFACE_ACCESS_TOKEN_GOES_HERE")
+    token="HUGGINGFACE_ACCESS_TOKEN_GOES_HERE")
 
 # send pipeline to GPU (when available)
 import torch
221 changes: 221 additions & 0 deletions pyannote/audio/__main__.py
@@ -0,0 +1,221 @@
#!/usr/bin/env python
# encoding: utf-8

# MIT License
#
# Copyright (c) 2024- CNRS
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.


import sys
from contextlib import nullcontext
from enum import Enum
from pathlib import Path
from typing import Optional

import pyannote.database
import torch
import typer
from pyannote.core import Annotation
from typing_extensions import Annotated

from pyannote.audio import Pipeline


class Subset(str, Enum):
    train = "train"
    development = "development"
    test = "test"


class Device(str, Enum):
    CPU = "cpu"
    CUDA = "cuda"
    MPS = "mps"
    AUTO = "auto"


def parse_device(device: Device) -> torch.device:
    if device == Device.AUTO:
        if torch.cuda.is_available():
            device = Device.CUDA

        elif torch.backends.mps.is_available():
            device = Device.MPS

        else:
            device = Device.CPU

    return torch.device(device.value)


app = typer.Typer()


# TODO: add option to download pretrained pipeline for later use without internet


@app.command("apply")
def apply(
    pipeline: Annotated[
        str,
        typer.Argument(
            help="Pretrained pipeline (e.g. pyannote/speaker-diarization-3.1)"
        ),
    ],
    audio: Annotated[
        Path,
        typer.Argument(
            help="Path to audio file",
            exists=True,
            file_okay=True,
            readable=True,
        ),
    ],
    into: Annotated[
        Path,
        typer.Option(
            help="Path to file where results are saved.",
            exists=False,
            dir_okay=False,
            file_okay=True,
            writable=True,
            resolve_path=True,
        ),
    ] = None,
    device: Annotated[
        Device, typer.Option(help="Accelerator to use (CPU, CUDA, MPS)")
    ] = Device.AUTO,
):
    """
    Apply a pretrained PIPELINE to an AUDIO file
    """

    # load pretrained pipeline
    pretrained_pipeline = Pipeline.from_pretrained(pipeline)

    # send pipeline to device
    torch_device = parse_device(device)
    pretrained_pipeline.to(torch_device)

    # apply pipeline to audio file
    prediction: Annotation = pretrained_pipeline(audio)

    # save (or print) results
    with open(into, "w") if into else nullcontext(sys.stdout) as rttm:
        prediction.write_rttm(rttm)


@app.command("benchmark")
def benchmark(
    pipeline: Annotated[
        str,
        typer.Argument(
            help="Pretrained pipeline (e.g. pyannote/speaker-diarization-3.1)"
        ),
    ],
    protocol: Annotated[
        str,
        typer.Argument(help="Benchmarked protocol"),
    ],
    into: Annotated[
        Path,
        typer.Argument(
            help="Directory into which benchmark results are saved",
            exists=True,
            dir_okay=True,
            file_okay=False,
            writable=True,
            resolve_path=True,
        ),
    ],
    subset: Annotated[
        Subset,
        typer.Option(
            help="Benchmarked subset",
            case_sensitive=False,
        ),
    ] = Subset.test,
    device: Annotated[
        Device, typer.Option(help="Accelerator to use (CPU, CUDA, MPS)")
    ] = Device.AUTO,
    registry: Annotated[
        Optional[Path],
        typer.Option(
            help="Loaded registry",
            exists=True,
            dir_okay=False,
            file_okay=True,
            readable=True,
        ),
    ] = None,
):
    """
    Benchmark a pretrained PIPELINE
    """

    # load pretrained pipeline
    pretrained_pipeline = Pipeline.from_pretrained(pipeline)

    # send pipeline to device
    torch_device = parse_device(device)
    pretrained_pipeline.to(torch_device)

    # load pipeline metric (when available)
    try:
        metric = pretrained_pipeline.get_metric()
    except NotImplementedError:
        metric = None

    # load protocol from (optional) registry
    if registry:
        pyannote.database.registry.load_database(registry)

    loaded_protocol = pyannote.database.registry.get_protocol(
        protocol, {"audio": pyannote.database.FileFinder()}
    )

    with open(into / f"{protocol}.{subset.value}.rttm", "w") as rttm:
        for file in getattr(loaded_protocol, subset.value)():
            prediction: Annotation = pretrained_pipeline(file)
            prediction.write_rttm(rttm)
            rttm.flush()

            if metric is None:
                continue

            groundtruth = file.get("annotation", None)
            if groundtruth is None:
                continue

            annotated = file.get("annotated", None)
            _ = metric(groundtruth, prediction, uem=annotated)

    if metric is None:
        return

    with open(into / f"{protocol}.{subset.value}.txt", "w") as txt:
        txt.write(str(metric))

    print(str(metric))


if __name__ == "__main__":
    app()
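Since this new file is `pyannote/audio/__main__.py`, the Typer app should be reachable with `python -m pyannote.audio`. A hedged usage sketch based on the commands defined above; the audio path, output path, protocol name, and results directory are placeholders:

```bash
# apply a pretrained pipeline to one audio file and save RTTM output
python -m pyannote.audio apply pyannote/speaker-diarization-3.1 audio.wav --into output.rttm

# benchmark the same pipeline on the test subset of a pyannote.database protocol
python -m pyannote.audio benchmark pyannote/speaker-diarization-3.1 \
    MyDatabase.SpeakerDiarization.MyProtocol ./results --subset test
```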
2 changes: 1 addition & 1 deletion pyannote/audio/core/calibration.py
@@ -137,7 +137,7 @@ def from_pretrained(
         if os.path.isfile(checkpoint):
             return cls.from_file(checkpoint)
 
-        path = download_from_hf_hub(
+        _, _, path = download_from_hf_hub(
             checkpoint,
             AssetFileName.Calibration,
             subfolder=subfolder,
21 changes: 3 additions & 18 deletions pyannote/audio/core/inference.py
@@ -21,7 +20,6 @@
 # SOFTWARE.
 
 import warnings
-from pathlib import Path
 from typing import Callable, List, Optional, Text, Tuple, Union
 
 import numpy as np
@@ -74,15 +73,11 @@ class Inference(BaseInference):
     device : torch.device, optional
         Device used for inference. Defaults to `model.device`.
         In case `device` and `model.device` are different, model is sent to device.
-    use_auth_token : str, optional
-        When loading a private huggingface.co model, set `use_auth_token`
-        to True or to a string containing your hugginface.co authentication
-        token that can be obtained by running `huggingface-cli login`
     """
 
     def __init__(
         self,
-        model: Union[Model, Text, Path],
+        model: Model,
         window: Text = "sliding",
         duration: Optional[float] = None,
         step: Optional[float] = None,
@@ -91,20 +86,10 @@ def __init__(
         skip_conversion: bool = False,
         device: Optional[torch.device] = None,
         batch_size: int = 32,
-        use_auth_token: Union[Text, None] = None,
     ):
         # ~~~~ model ~~~~~
 
-        self.model = (
-            model
-            if isinstance(model, Model)
-            else Model.from_pretrained(
-                model,
-                map_location=device,
-                strict=False,
-                use_auth_token=use_auth_token,
-            )
-        )
+        self.model = model
 
         if device is None:
             device = self.model.device
@@ -616,7 +601,7 @@ def aggregate(
         # mask ~ (num_frames_per_chunk, num_classes)-shaped np.ndarray
         mask = 1 - np.isnan(score)
         np.nan_to_num(score, copy=False, nan=0.0)
 
+        start_frame = frames.closest_frame(chunk.start + 0.5 * frames.duration)
 
         aggregated_output[start_frame : start_frame + num_frames_per_chunk] += (
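The `Inference` change above removes implicit model loading, so callers now instantiate the model themselves. A minimal sketch of the new calling convention, assuming a segmentation checkpoint you have access to; the model name, token string, and audio path are placeholders:

```python
from pyannote.audio import Inference, Model

# load the model explicitly; Inference no longer accepts checkpoint names/paths
model = Model.from_pretrained(
    "pyannote/segmentation-3.0", token="HF_TOKEN_PLACEHOLDER")

# pass the already instantiated model to Inference
inference = Inference(model, window="sliding", batch_size=32)
output = inference("audio.wav")
```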
