Add speaker-aware transcription #147

Draft · juanmc2005 wants to merge 26 commits into `develop`

Commits
- `bca2873` New feature: streaming voice activity detection. Pipeline name changes (juanmc2005, Apr 19, 2023)
- `5e44ad4` Merge branch 'develop' of github.com:juanmc2005/OnlineDiarization int… (juanmc2005, Apr 19, 2023)
- `7447061` Update link in setup.cfg (juanmc2005, Apr 19, 2023)
- `4985394` Update code snippets in README (juanmc2005, Apr 19, 2023)
- `540ad0a` Add minor README modifications (juanmc2005, Apr 19, 2023)
- `8cc9925` Initial ASR implementation. Broken stuff (juanmc2005, Apr 21, 2023)
- `1ae4934` First working transcription pipeline. Using diarization is possible b… (juanmc2005, Apr 21, 2023)
- `d8d7342` Reduce Whisper VRAM footprint (around 400Mb). Add fp16 option (juanmc2005, Apr 21, 2023)
- `2cfc35d` Change whisper input type based on fp16 parameter (juanmc2005, Apr 21, 2023)
- `a40112c` Implement batched inference for whisper. Re-implement decoding. (juanmc2005, Apr 22, 2023)
- `e8196a7` Minor changes in transcription arguments (juanmc2005, Apr 22, 2023)
- `07dd9ae` Greatly improve transcription pipeline by adding optional VAD (juanmc2005, Apr 23, 2023)
- `0bf2522` Move pipelines to diart.pipelines. Add torchmetrics as a dependency (juanmc2005, Apr 23, 2023)
- `42fe5f7` Add websocket compatibility to transcription pipeline (juanmc2005, Apr 23, 2023)
- `49616e5` Transcription pipeline is now fully compatible with diart.stream (juanmc2005, Apr 23, 2023)
- `babf49d` Make transcription pipeline compatible with diart.benchmark and diart… (juanmc2005, Apr 24, 2023)
- `6609e3c` Rename base pipeline and config objects (juanmc2005, Apr 24, 2023)
- `4c1aeba` Merge changes from branch feat/vad (juanmc2005, Apr 24, 2023)
- `d19b044` New feature: streaming voice activity detection. Pipeline name changes (juanmc2005, Apr 19, 2023)
- `6caa4a4` Update link in setup.cfg (juanmc2005, Apr 19, 2023)
- `0993fe8` Update code snippets in README (juanmc2005, Apr 19, 2023)
- `95d4fae` Add minor README modifications (juanmc2005, Apr 19, 2023)
- `569c68f` Rename base pipeline and config objects (juanmc2005, Apr 24, 2023)
- `eed864f` Update branch with develop (juanmc2005, Apr 24, 2023)
- `a16bb5c` Add initial implementation of SpeakerAwareTranscription (juanmc2005, Apr 24, 2023)
- `c7bbcc4` Refactor SpeakerAwareTranscription (juanmc2005, Apr 26, 2023)
46 changes: 23 additions & 23 deletions README.md
@@ -24,7 +24,7 @@
</a>
<span> | </span>
<a href="#-custom-models">
-🤖 Custom models
+🤖 Add your model
</a>
<span> | </span>
<a href="#-tune-hyper-parameters">
@@ -110,32 +110,32 @@ See `diart.stream -h` for more options.

### From python

-Use `RealTimeInference` to easily run a pipeline on an audio source and write the results to disk:
+Use `StreamingInference` to run a pipeline on an audio source and write the results to disk:

```python
-from diart import OnlineSpeakerDiarization
+from diart import SpeakerDiarization
from diart.sources import MicrophoneAudioSource
-from diart.inference import RealTimeInference
+from diart.inference import StreamingInference
from diart.sinks import RTTMWriter

-pipeline = OnlineSpeakerDiarization()
+pipeline = SpeakerDiarization()
mic = MicrophoneAudioSource(pipeline.config.sample_rate)
-inference = RealTimeInference(pipeline, mic, do_plot=True)
+inference = StreamingInference(pipeline, mic, do_plot=True)
inference.attach_observers(RTTMWriter(mic.uri, "/output/file.rttm"))
prediction = inference()
```

For inference and evaluation on a dataset, we recommend using `Benchmark` (see notes on [reproducibility](#reproducibility)).

-## 🤖 Custom models
+## 🤖 Add your model

-Third-party models can be integrated seamlessly by subclassing `SegmentationModel` and `EmbeddingModel` (which are PyTorch `Module` subclasses):
+Third-party models can be integrated by subclassing `SegmentationModel` and `EmbeddingModel` (both PyTorch `nn.Module`):

```python
-from diart import OnlineSpeakerDiarization, PipelineConfig
+from diart import SpeakerDiarization, SpeakerDiarizationConfig
from diart.models import EmbeddingModel, SegmentationModel
from diart.sources import MicrophoneAudioSource
-from diart.inference import RealTimeInference
+from diart.inference import StreamingInference


def model_loader():
@@ -168,19 +168,19 @@ class MyEmbeddingModel(EmbeddingModel):
        return self.model(waveform, weights)


-config = PipelineConfig(
+config = SpeakerDiarizationConfig(
    segmentation=MySegmentationModel(),
    embedding=MyEmbeddingModel()
)
-pipeline = OnlineSpeakerDiarization(config)
+pipeline = SpeakerDiarization(config)
mic = MicrophoneAudioSource(config.sample_rate)
-inference = RealTimeInference(pipeline, mic)
+inference = StreamingInference(pipeline, mic)
prediction = inference()
```

## 📈 Tune hyper-parameters

-Diart implements a hyper-parameter optimizer based on [optuna](https://optuna.readthedocs.io/en/stable/index.html) that allows you to tune any pipeline to any dataset.
+Diart implements an optimizer based on [optuna](https://optuna.readthedocs.io/en/stable/index.html) that allows you to tune pipeline hyper-parameters to your needs.

### From the command line

@@ -281,7 +281,7 @@ diart.serve --host 0.0.0.0 --port 7007
diart.client microphone --host <server-address> --port 7007
```

-**Note:** please make sure that the client uses the same `step` and `sample_rate` than the server with `--step` and `-sr`.
+**Note:** make sure that the client uses the same `step` and `sample_rate` as the server with `--step` and `-sr`.

See `-h` for more options.

@@ -290,13 +290,13 @@ See `-h` for more options.
For customized solutions, a server can also be created in python using the `WebSocketAudioSource`:

```python
-from diart import OnlineSpeakerDiarization
+from diart import SpeakerDiarization
from diart.sources import WebSocketAudioSource
-from diart.inference import RealTimeInference
+from diart.inference import StreamingInference

-pipeline = OnlineSpeakerDiarization()
+pipeline = SpeakerDiarization()
source = WebSocketAudioSource(pipeline.config.sample_rate, "localhost", 7007)
-inference = RealTimeInference(pipeline, source)
+inference = StreamingInference(pipeline, source)
inference.attach_hooks(lambda ann_wav: source.send(ann_wav[0].to_rttm()))
prediction = inference()
```
@@ -354,14 +354,14 @@ or using the inference API:

```python
from diart.inference import Benchmark, Parallelize
-from diart import OnlineSpeakerDiarization, PipelineConfig
+from diart import SpeakerDiarization, SpeakerDiarizationConfig
from diart.models import SegmentationModel

benchmark = Benchmark("/wav/dir", "/rttm/dir")

name = "pyannote/segmentation@Interspeech2021"
segmentation = SegmentationModel.from_pyannote(name)
-config = PipelineConfig(
+config = SpeakerDiarizationConfig(
    # Set the model used in the paper
    segmentation=segmentation,
    step=0.5,
@@ -370,12 +370,12 @@ config = PipelineConfig(
    rho_update=0.422,
    delta_new=1.517
)
-benchmark(OnlineSpeakerDiarization, config)
+benchmark(SpeakerDiarization, config)

# Run the same benchmark in parallel
p_benchmark = Parallelize(benchmark, num_workers=4)
if __name__ == "__main__": # Needed for multiprocessing
-    p_benchmark(OnlineSpeakerDiarization, config)
+    p_benchmark(SpeakerDiarization, config)
```

This pre-calculates model outputs in batches, so it runs a lot faster.
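For readers tracking the API renames in this README diff, here is a minimal before/after import sketch; it only restates the name changes shown above and is not itself part of the PR:

```python
# Old names, as used in the README before this PR:
# from diart import OnlineSpeakerDiarization, PipelineConfig
# from diart.inference import RealTimeInference

# Renamed equivalents after this PR:
from diart import SpeakerDiarization, SpeakerDiarizationConfig
from diart.inference import StreamingInference  # formerly RealTimeInference
```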
1 change: 1 addition & 0 deletions requirements.txt
@@ -9,6 +9,7 @@ pandas>=1.4.2
torch>=1.12.1
torchvision>=0.14.0
torchaudio>=0.12.1,<1.0
+torchmetrics>=0.11.1
pyannote.audio>=2.1.1
pyannote.core>=4.5
pyannote.database>=4.1.1
5 changes: 3 additions & 2 deletions setup.cfg
@@ -2,11 +2,11 @@
name=diart
version=0.7.0
author=Juan Manuel Coria
-description=Speaker diarization in real time
+description=Streaming speaker diarization in real-time
long_description=file: README.md
long_description_content_type=text/markdown
keywords=speaker diarization, streaming, online, real time, rxpy
-url=https://github.com/juanmc2005/StreamingSpeakerDiarization
+url=https://github.com/juanmc2005/diart
license=MIT
classifiers=
Development Status :: 4 - Beta
@@ -31,6 +31,7 @@ install_requires=
torch>=1.12.1
torchvision>=0.14.0
torchaudio>=0.12.1,<1.0
+torchmetrics>=0.11.1
pyannote.audio>=2.1.1
pyannote.core>=4.5
pyannote.database>=4.1.1
12 changes: 8 additions & 4 deletions src/diart/__init__.py
@@ -1,6 +1,10 @@
-from .blocks import (
-    OnlineSpeakerDiarization,
-    BasePipeline,
+from .pipelines import (
+    Pipeline,
    PipelineConfig,
-    BasePipelineConfig,
+    SpeakerDiarization,
+    SpeakerDiarizationConfig,
+    VoiceActivityDetection,
+    VoiceActivityDetectionConfig,
+    Transcription,
+    TranscriptionConfig,
)
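Judging from these new exports, the added `Transcription` pipeline appears to follow the same pattern as `SpeakerDiarization`. A hypothetical sketch, assuming `Transcription` can be constructed with a default config like the other pipelines (its constructor signature is not shown in this diff):

```python
# Hypothetical sketch based only on the exports above; Transcription's
# default constructor and config attributes are assumptions.
from diart import Transcription
from diart.sources import MicrophoneAudioSource
from diart.inference import StreamingInference

pipeline = Transcription()
mic = MicrophoneAudioSource(pipeline.config.sample_rate)
inference = StreamingInference(pipeline, mic)
prediction = inference()  # yields transcriptions instead of diarization annotations
```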
5 changes: 2 additions & 3 deletions src/diart/blocks/__init__.py
@@ -5,14 +5,13 @@
    FirstOnlyStrategy,
    DelayedAggregation,
)
-from .clustering import OnlineSpeakerClustering
+from .clustering import IncrementalSpeakerClustering
from .embedding import (
    SpeakerEmbedding,
    OverlappedSpeechPenalty,
    EmbeddingNormalization,
    OverlapAwareSpeakerEmbedding,
)
from .segmentation import SpeakerSegmentation
-from .diarization import OnlineSpeakerDiarization, BasePipeline
-from .config import BasePipelineConfig, PipelineConfig
from .utils import Binarize, Resample, AdjustVolume
+from .asr import SpeechRecognition
66 changes: 66 additions & 0 deletions src/diart/blocks/asr.py
@@ -0,0 +1,66 @@
from pathlib import Path
from typing import Optional, Union, List, Text

import torch
from einops import rearrange

from .. import models as m
from ..features import TemporalFeatureFormatter, TemporalFeatures


class SpeechRecognition:
    def __init__(self, model: m.SpeechRecognitionModel, device: Optional[torch.device] = None):
        self.model = model
        self.model.eval()
        self.device = device
        if self.device is None:
            self.device = torch.device("cpu")
        self.model.to(self.device)
        self.formatter = TemporalFeatureFormatter()

    @staticmethod
    def from_whisper(
        name: Text,
        download_path: Optional[Union[Text, Path]] = None,
        in_memory: bool = False,
        fp16: bool = False,
        no_speech_threshold: float = 0.6,
        compression_ratio_threshold: Optional[float] = 2.4,
        logprob_threshold: Optional[float] = -1,
        decode_with_fallback: bool = False,
        device: Optional[Union[Text, torch.device]] = None,
    ) -> 'SpeechRecognition':
        asr_model = m.SpeechRecognitionModel.from_whisper(
            name,
            download_path,
            in_memory,
            fp16,
            no_speech_threshold,
            compression_ratio_threshold,
            logprob_threshold,
            decode_with_fallback,
        )
        return SpeechRecognition(asr_model, device)

    def __call__(self, waveform: TemporalFeatures) -> List[m.TranscriptionResult]:
        """
        Compute the transcription of input audio.

        Parameters
        ----------
        waveform: TemporalFeatures, shape (samples, channels) or (batch, samples, channels)
            Audio to transcribe

        Returns
        -------
        transcriptions: List[m.TranscriptionResult]
            A list of timestamped transcriptions
        """
        with torch.no_grad():
            wave = rearrange(
                self.formatter.cast(waveform),
                "batch sample channel -> batch channel sample"
            )
            # output = self.model(wave.to(self.device)).cpu()
            output = self.model(wave.to(self.device))
        return output
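For context, a minimal usage sketch of the block above, assuming a local WAV file at Whisper's 16 kHz input rate and the `"small"` checkpoint (both illustrative assumptions, not part of the diff):

```python
# Sketch: transcribe a file with the new SpeechRecognition block.
# "audio.wav" and the "small" Whisper checkpoint are assumptions.
import torchaudio
from diart.blocks import SpeechRecognition

asr = SpeechRecognition.from_whisper("small", fp16=False)
waveform, sample_rate = torchaudio.load("audio.wav")  # shape: (channels, samples)
results = asr(waveform.transpose(0, 1))  # __call__ expects (samples, channels)
print(results)
```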
2 changes: 1 addition & 1 deletion src/diart/blocks/clustering.py
@@ -7,7 +7,7 @@
from ..mapping import SpeakerMap, SpeakerMapBuilder


-class OnlineSpeakerClustering:
+class IncrementalSpeakerClustering:
"""Implements constrained incremental online clustering of speakers and manages cluster centers.

Parameters