binarize optimizations
config as dict

feat(metric): add detection, precision, and recall diarization metrics (pyannote#1808)

fix: ensure round number of frames

fix: update pipeline parameters in separation tutorial (pyannote#1829)

setup: update torch_audiomentations, pyannote.database, pyannote.metrics and typer dependencies

BREAKING: fix `cache_dir` support, rename `use_auth_token` to `token`, and other changes

feat: add segmentation error rate (pyannote#1836)

doc: improve HF-related docstring

feat: add CLI to apply (and benchmark) pretrained pipelines

binarize annotation optimizations

pass checkpoint as a dict

Squashed commit of the following:

commit 2ccac02
Author: benniekiss <63211101+benniekiss@users.noreply.github.com>
Date:   Wed Jan 29 17:48:15 2025 -0500

    pass checkpoint as a dict

Squashed commit of the following:

commit 0f3b2fe
Author: benniekiss <63211101+benniekiss@users.noreply.github.com>
Date:   Wed Jan 29 17:58:10 2025 -0500

    don't rename tracks when generating annotation

    * let the user decide how to rename tracks, if necessary
    * reduces a costly step for long audio files

commit 649c060
Author: benniekiss <63211101+benniekiss@users.noreply.github.com>
Date:   Wed Sep 11 10:31:20 2024 -0400

    conditionally use optimized method

pass checkpoint as a dict

cleanup
hbredin authored and benniekiss committed Feb 3, 2025
1 parent 9b079e8 commit d98842b
Showing 30 changed files with 768 additions and 584 deletions.
7 changes: 6 additions & 1 deletion CHANGELOG.md
@@ -10,14 +10,17 @@ Models can now be stored alongside their pipelines in the same repository, strea…
 - accept `pyannote/speaker-diarization-x.x` pipeline user agreement
 - ~~accept `pyannote/segmentation-3.0` model user agreement~~
 - ~~accept `pyannote/wespeaker-voxceleb-resnet34-LM` model user agreement~~
-- load pipeline with `Pipeline.from_pretrained("pyannote/speaker-diarization-3.1", use_auth_token=True)`
+- load pipeline with `Pipeline.from_pretrained("pyannote/speaker-diarization-3.1", token=True)`
 
 #### Improve speech separation quality
 
 Clipping and speaker/source alignment issues in speech separation pipeline have been fixed.
 
 ### Breaking changes
 
+- BREAKING(hub): rename `use_auth_token` to `token`
+- BREAKING(cache): rely on `huggingface_hub` caching directory (`PYANNOTE_CACHE` is no longer used)
+- BREAKING(inference): `Inference` now only supports already instantiated models
 - BREAKING(task): drop support for `multilabel` training in `SpeakerDiarization` task
 - BREAKING(task): drop support for `warm_up` option in `SpeakerDiarization` task
 - BREAKING(task): drop support for `weigh_by_cardinality` option in `SpeakerDiarization` task
@@ -32,6 +35,8 @@ Clipping and speaker/source alignment issues in speech separation pipeline have…
 - feat(utils): add `hidden` option to `ProgressHook`
 - feat(utils): add `FilterByNumberOfSpeakers` protocol files filter
 - feat(core): add `Calibration` class to calibrate logits/distances into probabilities
+- feat(metric): add `DetectionErrorRate`, `SegmentationErrorRate`, `DiarizationPrecision`, and `DiarizationRecall` metrics
+- feat(cli): add CLI to apply (and benchmark) pretrained pipelines

### Improvements

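To make the `use_auth_token` → `token` rename recorded above concrete, here is a minimal before/after sketch; the pipeline name comes from the changelog, and the token string is a placeholder rather than a real credential:

```python
from pyannote.audio import Pipeline

# before this release (pyannote.audio 3.x):
# pipeline = Pipeline.from_pretrained(
#     "pyannote/speaker-diarization-3.1",
#     use_auth_token="HF_TOKEN_PLACEHOLDER")

# after this release, the keyword is `token`:
pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    token="HF_TOKEN_PLACEHOLDER")
```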
2 changes: 1 addition & 1 deletion README.md
@@ -20,7 +20,7 @@ Consider switching to [pyannoteAI](https://www.pyannote.ai) for better and faste…
 from pyannote.audio import Pipeline
 pipeline = Pipeline.from_pretrained(
     "pyannote/speaker-diarization-3.1",
-    use_auth_token="HUGGINGFACE_ACCESS_TOKEN_GOES_HERE")
+    token="HUGGINGFACE_ACCESS_TOKEN_GOES_HERE")
 
 # send pipeline to GPU (when available)
 import torch
221 changes: 221 additions & 0 deletions pyannote/audio/__main__.py
@@ -0,0 +1,221 @@
#!/usr/bin/env python
# encoding: utf-8

# MIT License
#
# Copyright (c) 2024- CNRS
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.


import sys
from contextlib import nullcontext
from enum import Enum
from pathlib import Path
from typing import Optional

import pyannote.database
import torch
import typer
from pyannote.core import Annotation
from typing_extensions import Annotated

from pyannote.audio import Pipeline


class Subset(str, Enum):
    train = "train"
    development = "development"
    test = "test"


class Device(str, Enum):
    CPU = "cpu"
    CUDA = "cuda"
    MPS = "mps"
    AUTO = "auto"


def parse_device(device: Device) -> torch.device:
    if device == Device.AUTO:
        if torch.cuda.is_available():
            device = Device.CUDA

        elif torch.backends.mps.is_available():
            device = Device.MPS

        else:
            device = Device.CPU

    return torch.device(device.value)


app = typer.Typer()


# TODO: add option to download pretrained pipeline for later use without internet


@app.command("apply")
def apply(
    pipeline: Annotated[
        str,
        typer.Argument(
            help="Pretrained pipeline (e.g. pyannote/speaker-diarization-3.1)"
        ),
    ],
    audio: Annotated[
        Path,
        typer.Argument(
            help="Path to audio file",
            exists=True,
            file_okay=True,
            readable=True,
        ),
    ],
    into: Annotated[
        Path,
        typer.Option(
            help="Path to file where results are saved.",
            exists=False,
            dir_okay=False,
            file_okay=True,
            writable=True,
            resolve_path=True,
        ),
    ] = None,
    device: Annotated[
        Device, typer.Option(help="Accelerator to use (CPU, CUDA, MPS)")
    ] = Device.AUTO,
):
    """
    Apply a pretrained PIPELINE to an AUDIO file
    """

    # load pretrained pipeline
    pretrained_pipeline = Pipeline.from_pretrained(pipeline)

    # send pipeline to device
    torch_device = parse_device(device)
    pretrained_pipeline.to(torch_device)

    # apply pipeline to audio file
    prediction: Annotation = pretrained_pipeline(audio)

    # save (or print) results
    with open(into, "w") if into else nullcontext(sys.stdout) as rttm:
        prediction.write_rttm(rttm)


@app.command("benchmark")
def benchmark(
    pipeline: Annotated[
        str,
        typer.Argument(
            help="Pretrained pipeline (e.g. pyannote/speaker-diarization-3.1)"
        ),
    ],
    protocol: Annotated[
        str,
        typer.Argument(help="Benchmarked protocol"),
    ],
    into: Annotated[
        Path,
        typer.Argument(
            help="Directory into which benchmark results are saved",
            exists=True,
            dir_okay=True,
            file_okay=False,
            writable=True,
            resolve_path=True,
        ),
    ],
    subset: Annotated[
        Subset,
        typer.Option(
            help="Benchmarked subset",
            case_sensitive=False,
        ),
    ] = Subset.test,
    device: Annotated[
        Device, typer.Option(help="Accelerator to use (CPU, CUDA, MPS)")
    ] = Device.AUTO,
    registry: Annotated[
        Optional[Path],
        typer.Option(
            help="Loaded registry",
            exists=True,
            dir_okay=False,
            file_okay=True,
            readable=True,
        ),
    ] = None,
):
    """
    Benchmark a pretrained PIPELINE
    """

    # load pretrained pipeline
    pretrained_pipeline = Pipeline.from_pretrained(pipeline)

    # send pipeline to device
    torch_device = parse_device(device)
    pretrained_pipeline.to(torch_device)

    # load pipeline metric (when available)
    try:
        metric = pretrained_pipeline.get_metric()
    except NotImplementedError:
        metric = None

    # load protocol from (optional) registry
    if registry:
        pyannote.database.registry.load_database(registry)

    loaded_protocol = pyannote.database.registry.get_protocol(
        protocol, {"audio": pyannote.database.FileFinder()}
    )

    with open(into / f"{protocol}.{subset.value}.rttm", "w") as rttm:
        for file in getattr(loaded_protocol, subset.value)():
            prediction: Annotation = pretrained_pipeline(file)
            prediction.write_rttm(rttm)
            rttm.flush()

            if metric is None:
                continue

            groundtruth = file.get("annotation", None)
            if groundtruth is None:
                continue

            annotated = file.get("annotated", None)
            _ = metric(groundtruth, prediction, uem=annotated)

    if metric is None:
        return

    with open(into / f"{protocol}.{subset.value}.txt", "w") as txt:
        txt.write(str(metric))

    print(str(metric))


if __name__ == "__main__":
    app()
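Since this new file is `pyannote/audio/__main__.py`, the Typer app should be reachable with `python -m pyannote.audio`. A hedged usage sketch based on the commands defined above; the audio path, output path, protocol name, and results directory are placeholders:

```bash
# apply a pretrained pipeline to one audio file and save RTTM output
python -m pyannote.audio apply pyannote/speaker-diarization-3.1 audio.wav --into output.rttm

# benchmark the same pipeline on the test subset of a pyannote.database protocol
python -m pyannote.audio benchmark pyannote/speaker-diarization-3.1 \
    MyDatabase.SpeakerDiarization.MyProtocol ./results --subset test
```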
2 changes: 1 addition & 1 deletion pyannote/audio/core/calibration.py
@@ -137,7 +137,7 @@ def from_pretrained(
         if os.path.isfile(checkpoint):
             return cls.from_file(checkpoint)
 
-        path = download_from_hf_hub(
+        _, _, path = download_from_hf_hub(
             checkpoint,
             AssetFileName.Calibration,
             subfolder=subfolder,
21 changes: 3 additions & 18 deletions pyannote/audio/core/inference.py
@@ -21,7 +20,6 @@
 # SOFTWARE.
 
 import warnings
-from pathlib import Path
 from typing import Callable, List, Optional, Text, Tuple, Union
 
 import numpy as np
@@ -74,15 +73,11 @@ class Inference(BaseInference):
     device : torch.device, optional
         Device used for inference. Defaults to `model.device`.
         In case `device` and `model.device` are different, model is sent to device.
-    use_auth_token : str, optional
-        When loading a private huggingface.co model, set `use_auth_token`
-        to True or to a string containing your hugginface.co authentication
-        token that can be obtained by running `huggingface-cli login`
     """
 
     def __init__(
         self,
-        model: Union[Model, Text, Path],
+        model: Model,
         window: Text = "sliding",
         duration: Optional[float] = None,
         step: Optional[float] = None,
@@ -91,20 +86,10 @@ def __init__(
         skip_conversion: bool = False,
         device: Optional[torch.device] = None,
         batch_size: int = 32,
-        use_auth_token: Union[Text, None] = None,
     ):
         # ~~~~ model ~~~~~
 
-        self.model = (
-            model
-            if isinstance(model, Model)
-            else Model.from_pretrained(
-                model,
-                map_location=device,
-                strict=False,
-                use_auth_token=use_auth_token,
-            )
-        )
+        self.model = model
 
         if device is None:
             device = self.model.device
@@ -616,7 +601,7 @@ def aggregate(
         # mask ~ (num_frames_per_chunk, num_classes)-shaped np.ndarray
         mask = 1 - np.isnan(score)
         np.nan_to_num(score, copy=False, nan=0.0)
 
+        start_frame = frames.closest_frame(chunk.start + 0.5 * frames.duration)
 
         aggregated_output[start_frame : start_frame + num_frames_per_chunk] += (
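The `Inference` change above removes implicit model loading, so callers now instantiate the model themselves. A minimal sketch of the new calling convention, assuming a segmentation checkpoint you have access to; the model name, token string, and audio path are placeholders:

```python
from pyannote.audio import Inference, Model

# load the model explicitly; Inference no longer accepts checkpoint names/paths
model = Model.from_pretrained(
    "pyannote/segmentation-3.0", token="HF_TOKEN_PLACEHOLDER")

# pass the already instantiated model to Inference
inference = Inference(model, window="sliding", batch_size=32)
output = inference("audio.wav")
```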
