diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index b0c41e325..8daf0eef8 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -34,7 +34,7 @@ jobs:
           java-version: "8"
 
       - name: Install dependencies - Linux
-        run: sudo apt-get update && sudo apt-get install libsndfile1 portaudio19-dev
+        run: sudo apt-get update && sudo apt-get install libsndfile1 libportaudio2
         if: matrix.os == 'ubuntu-latest'
 
       - name: Install dependencies - macOS
diff --git a/docker/base/Dockerfile b/docker/base/Dockerfile
index ca94fceeb..f898deb47 100644
--- a/docker/base/Dockerfile
+++ b/docker/base/Dockerfile
@@ -21,7 +21,7 @@ ENV LANG=C.UTF-8
 RUN \
     # Install required packages
    apt-get update && \
-    apt-get -y --no-install-recommends install libgomp1 libsndfile1 portaudio19-dev gcc g++ python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python3-pip && \
+    apt-get -y --no-install-recommends install libgomp1 libsndfile1 libportaudio2 gcc g++ python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python3-pip && \
     rm -rf /var/lib/apt/lists && \
     \
     # Install txtai project and dependencies
diff --git a/docs/install.md b/docs/install.md
index fc2f9fd2d..ee4c6fed2 100644
--- a/docs/install.md
+++ b/docs/install.md
@@ -130,13 +130,13 @@ Additional environment specific prerequisites are below.
 
 ### Linux
 
-The AudioStream and Microphone pipelines require the [PortAudio](https://people.csail.mit.edu/hubert/pyaudio) system library. The Transcription pipeline requires the [SoundFile](https://github.com/bastibe/python-soundfile#installation) system library.
+The AudioStream and Microphone pipelines require the [PortAudio](https://python-sounddevice.readthedocs.io/en/0.5.0/installation.html) system library. The Transcription pipeline requires the [SoundFile](https://github.com/bastibe/python-soundfile#installation) system library.
 
 ### macOS
 
 Older versions of Faiss have a runtime dependency on `libomp` for macOS. Run `brew install libomp` in this case.
 
-The AudioStream and Microphone pipelines require the [PortAudio](https://people.csail.mit.edu/hubert/pyaudio) system library.
+The AudioStream and Microphone pipelines require the [PortAudio](https://python-sounddevice.readthedocs.io/en/0.5.0/installation.html) system library. Run `brew install portaudio`.
 
 ### Windows
 
diff --git a/docs/pipeline/audio/texttospeech.md b/docs/pipeline/audio/texttospeech.md
index 0278eea1a..31f6a3d42 100644
--- a/docs/pipeline/audio/texttospeech.md
+++ b/docs/pipeline/audio/texttospeech.md
@@ -15,6 +15,16 @@ from txtai.pipeline import TextToSpeech
 # Create and run pipeline
 tts = TextToSpeech()
 tts("Say something here")
+
+# Stream audio - incrementally generates snippets of audio
+yield from tts(
+    "Say something here. And say something else",
+    streaming=True
+)
+
+# Generate audio using a speaker id
+tts = TextToSpeech("neuml/vctk-vits-onnx")
+tts("Say something here", speaker=42)
 ```
 
 See the link below for a more detailed example.
@@ -27,6 +37,7 @@ This pipeline is backed by ONNX models from the Hugging Face Hub. The following
 
 - [ljspeech-jets-onnx](https://huggingface.co/NeuML/ljspeech-jets-onnx)
 - [ljspeech-vits-onnx](https://huggingface.co/NeuML/ljspeech-vits-onnx)
+- [vctk-vits-onnx](https://huggingface.co/NeuML/vctk-vits-onnx)
 
 ## Configuration-driven example
 
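Note on the streaming example added to `texttospeech.md` above: `yield from` is only valid inside a generator function, so a caller would normally wrap the pipeline call. A minimal sketch, assuming only what the example shows, namely that a streaming call yields audio snippets incrementally:

```python
from txtai.pipeline import TextToSpeech

def speech(text):
    # yield from must live inside a generator function like this one
    tts = TextToSpeech()
    yield from tts(text, streaming=True)

# Consume the stream - each item is one incrementally generated audio snippet
for snippet in speech("Say something here. And say something else"):
    print(type(snippet))
```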
diff --git a/setup.py b/setup.py
index 70ea4bff3..3bff64961 100644
--- a/setup.py
+++ b/setup.py
@@ -56,9 +56,7 @@ extras["pipeline-audio"] = [
     "onnx>=1.11.0",
     "onnxruntime>=1.11.0",
-    "pyaudio>=0.2.14",
     "scipy>=1.4.1",
-    "speechrecognition>=3.10.4",
     "sounddevice>=0.5.0",
     "soundfile>=0.10.3.post1",
     "ttstokenizer>=1.0.0",
diff --git a/src/python/txtai/pipeline/audio/audiostream.py b/src/python/txtai/pipeline/audio/audiostream.py
index 274a4147c..1c5641494 100644
--- a/src/python/txtai/pipeline/audio/audiostream.py
+++ b/src/python/txtai/pipeline/audio/audiostream.py
@@ -12,7 +12,7 @@ import sounddevice as sd
 
     SOUNDDEVICE = True
-except ImportError:
+except (ImportError, OSError):
     SOUNDDEVICE = False
 
 from ..base import Pipeline
 
@@ -35,7 +35,7 @@ def __init__(self, rate=22050):
         """
 
         if not SOUNDDEVICE:
-            raise ImportError('AudioStream pipeline is not available - install "pipeline" extra to enable')
+            raise ImportError("SoundDevice library not installed or portaudio library not found")
 
         # Sampler rate
         self.rate = rate
diff --git a/src/python/txtai/pipeline/audio/microphone.py b/src/python/txtai/pipeline/audio/microphone.py
index bae016b28..bb833c4e8 100644
--- a/src/python/txtai/pipeline/audio/microphone.py
+++ b/src/python/txtai/pipeline/audio/microphone.py
@@ -2,106 +2,269 @@
 Microphone module
 """
 
+import logging
+
 import numpy as np
 
 # Conditional import
 try:
-    import speech_recognition as sr
+    import sounddevice as sd
     import webrtcvad
 
-    PYAUDIO = True
-except ImportError:
-    PYAUDIO = False
+    from scipy.fft import rfft, rfftfreq
+    from scipy.signal import butter, sosfilt
+
+    SOUNDDEVICE = True
+except (ImportError, OSError):
+    SOUNDDEVICE = False
 
 from ..base import Pipeline
 
+# Logging configuration
+logger = logging.getLogger(__name__)
+
 
 class Microphone(Pipeline):
     """
-    Reads input audio from a microphone device. This pipeline is designed to run on local machines given
+    Reads input speech from a microphone device. This pipeline is designed to run on local machines given
     that it requires access to read from an input device.
     """
 
-    def __init__(self, rate=16000, vadmode=1, vadframe=30, vadthreshold=0.6):
+    def __init__(self, rate=16000, vadmode=3, vadframe=20, vadthreshold=0.6, voicestart=300, voiceend=3400, active=5, pause=8):
         """
         Creates a new Microphone pipeline.
 
         Args:
-            rate: sample rate to record audio in, defaults to 16 kHz
-            vadmode: aggressiveness of the voice activity detector, defaults to 1
-            vadframe: voice activity detector frame size in ms, defaults to 30
+            rate: sample rate to record audio in, defaults to 16000 (16 kHz)
+            vadmode: aggressiveness of the voice activity detector (1 - 3), defaults to 3, which is the most aggressive filter
+            vadframe: voice activity detector frame size in ms, defaults to 20
             vadthreshold: percentage of frames (0.0 - 1.0) that must be voice to be considered speech, defaults to 0.6
+            voicestart: starting frequency to use for voice filtering, defaults to 300
+            voiceend: ending frequency to use for voice filtering, defaults to 3400
+            active: minimum number of active speech chunks to require before considering this speech, defaults to 5
+            pause: number of non-speech chunks to keep before considering speech complete, defaults to 8
         """
 
-        if not PYAUDIO:
-            raise ImportError('Microphone pipeline is not available - install "pipeline" extra to enable')
+        if not SOUNDDEVICE:
+            raise ImportError("SoundDevice library not installed or portaudio library not found")
+
+        # Sample rate
+        self.rate = rate
 
         # Voice activity detector
         self.vad = webrtcvad.Vad(vadmode)
         self.vadframe = vadframe
         self.vadthreshold = vadthreshold
 
-        # Sample rate
-        self.rate = rate
+        # Voice spectrum
+        self.voicestart = voicestart
+        self.voiceend = voiceend
 
-        # Speech recognition config
-        self.recognizer = sr.Recognizer()
+        # Audio chunks counts
+        self.active = active
+        self.pause = pause
 
     def __call__(self, device=None):
-        # Read from microphone
-        with sr.Microphone(sample_rate=self.rate) as source:
-            # Calibrate microphone
-            self.recognizer.adjust_for_ambient_noise(source)
-
-            # Wait for speech
-            audio = None
-            while audio is None:
-                audio = self.listen(source)
+        # Listen for audio
+        audio = self.listen(device[0] if isinstance(device, list) else device)
 
-            # Return single element if single element passed in
-            return (audio, self.rate) if device is None or not isinstance(device, list) else [(audio, self.rate)]
+        # Return single element if single element passed in
+        return (audio, self.rate) if device is None or not isinstance(device, list) else [(audio, self.rate)]
 
-    def listen(self, source):
+    def listen(self, device):
         """
-        Listens for audio from source. Returns audio if it passes the voice
-        activity detector.
+        Listens for speech. Detected speech is converted to 32-bit floats for compatibility with
+        automatic speech recognition (ASR) pipelines.
+
+        This method blocks until speech is detected.
 
         Args:
-            source: microphone source
+            device: input device
 
         Returns:
-            audio if present, else None
+            audio
         """
 
-        audio = self.recognizer.listen(source)
-        if self.detect(audio.frame_data, audio.sample_rate):
-            # Convert to WAV
-            data = audio.get_wav_data()
+        # Record in 100ms chunks
+        chunksize = self.rate // 10
+
+        # Open input stream
+        stream = sd.RawInputStream(device=device, samplerate=self.rate, channels=1, blocksize=chunksize, dtype=np.int16)
+
+        # Start the input stream
+        stream.start()
+
+        record, speech, nospeech, chunks = True, 0, 0, []
+        while record:
+            # Read chunk
+            chunk, _ = stream.read(chunksize)
 
-            # Convert to float32
-            s16 = np.frombuffer(data, dtype=np.int16, count=len(data) // 2, offset=0)
-            return s16.astype(np.float32, order="C") / 32768
+            # Detect speech using WebRTC VAD for audio chunk
+            detect = self.detect(chunk)
+            speech = speech + 1 if detect else speech
+            nospeech = 0 if detect else nospeech + 1
 
-        return None
+            # Save chunk, if this is an active stream
+            if speech:
+                chunks.append(chunk)
 
-    def detect(self, audio, rate):
+            # Pause limit has been reached, check if this audio should be accepted
+            if nospeech >= self.pause:
+                logger.debug("Audio detected and being analyzed")
+                if speech >= self.active and self.isspeech(chunks[:-nospeech]):
+                    # Disable recording
+                    record = False
+                else:
+                    # Reset parameters and keep recording
+                    logger.debug("Speech not detected")
+                    speech, nospeech, chunks = 0, 0, []
+
+        # Stop the input stream
+        stream.stop()
+
+        # Convert to float32 and return
+        audio = np.frombuffer(b"".join(chunks), np.int16)
+        return self.float32(audio)
+
+    def isspeech(self, chunks):
         """
-        Voice activity detector.
+        Runs an ensemble of Voice Activity Detection (VAD) methods. Returns true if speech is
+        detected in the input audio chunks.
 
         Args:
-            audio: input waveform data
-            rate: sample rate
+            chunks: input audio chunks as byte buffers
+
+        Returns:
+            True if speech is detected, False otherwise
+        """
+
+        # Convert to NumPy array for processing
+        audio = np.frombuffer(b"".join(chunks), dtype=np.int16)
+
+        # Ensemble of:
+        #  - WebRTC VAD with a human voice range butterworth bandpass filter applied to the signal
+        #  - FFT applied to detect the energy ratio for human voice range vs total range
+        return self.detectband(audio) and self.detectenergy(audio)
+
+    def detect(self, buffer):
+        """
+        Detect speech using the WebRTC Voice Activity Detector (VAD).
+
+        Args:
+            buffer: input audio buffer frame as bytes
 
         Returns:
             True if the number of audio frames with audio pass vadthreshold, False otherwise
         """
 
-        n = int(rate * (self.vadframe / 1000.0) * 2)
+        n = int(self.rate * (self.vadframe / 1000.0) * 2)
         offset = 0
         detects = []
-        while offset + n < len(audio):
-            detects.append(1 if self.vad.is_speech(audio[offset : offset + n], rate) else 0)
+        while offset + n <= len(buffer):
+            detects.append(1 if self.vad.is_speech(buffer[offset : offset + n], self.rate) else 0)
             offset += n
 
-        return sum(detects) / len(detects) >= self.vadthreshold if detects else 0
+        # Calculate detection ratio and return
+        ratio = sum(detects) / len(detects) if detects else 0
+        logger.debug("DETECT %.4f", ratio)
+        return ratio >= self.vadthreshold
+
+    def detectband(self, audio):
+        """
+        Detects speech using audio data filtered through a butterworth band filter
+        with the human voice range.
+
+        Args:
+            audio: input audio data as a NumPy array
+
+        Returns:
+            True if speech is detected, False otherwise
+        """
+
+        # Upsample to float32
+        audio = self.float32(audio)
+
+        # Human voice frequency range
+        low = self.voicestart / (0.5 * self.rate)
+        high = self.voiceend / (0.5 * self.rate)
+
+        # Low and high pass filter using human voice range
+        sos = butter(5, Wn=[low, high], btype="band", output="sos")
+        audio = sosfilt(sos, audio)
+
+        # Scale back to int16
+        audio = self.int16(audio)
+
+        # Pass filtered signal to WebRTC VAD
+        return self.detect(audio.tobytes())
+
+    def detectenergy(self, audio):
+        """
+        Detects speech by comparing the signal energy of the human voice range
+        to the overall signal energy.
+
+        Args:
+            audio: input audio data as a NumPy array
+
+        Returns:
+            True if speech is detected, False otherwise
+        """
+
+        # Calculate signal frequency
+        frequency = rfftfreq(len(audio), 1.0 / self.rate)
+        frequency = frequency[1:]
+
+        # Calculate signal energy using amplitude
+        energy = np.abs(rfft(audio))
+        energy = energy[1:]
+        energy = energy**2
+
+        # Get energy for each frequency
+        energyfreq = {}
+        for x, freq in enumerate(frequency):
+            if abs(freq) not in energyfreq:
+                energyfreq[abs(freq)] = energy[x] * 2
+
+        # Sum speech energy
+        speechenergy = 0
+        for f, e in energyfreq.items():
+            if self.voicestart <= f <= self.voiceend:
+                speechenergy += e
+
+        # Calculate ratio of speech energy to total energy and return
+        ratio = speechenergy / sum(energyfreq.values())
+        logger.debug("SPEECH %.4f", ratio)
+        return ratio >= self.vadthreshold
+
+    def float32(self, audio):
+        """
+        Converts an input NumPy array with 16-bit ints to 32-bit floats.
+
+        Args:
+            audio: input audio array as 16-bit ints
+
+        Returns:
+            audio array as 32-bit floats
+        """
+
+        i = np.iinfo(audio.dtype)
+        abs_max = 2 ** (i.bits - 1)
+        offset = i.min + abs_max
+        return (audio.astype(np.float32) - offset) / abs_max
+
+    def int16(self, audio):
+        """
+        Converts an input NumPy array with 32-bit floats to 16-bit ints.
+
+        Args:
+            audio: input audio array as 32-bit floats
+
+        Returns:
+            audio array as 16-bit ints
+        """
+
+        i = np.iinfo(np.int16)
+        absmax = 2 ** (i.bits - 1)
+        offset = i.min + absmax
+        return (audio * absmax + offset).clip(i.min, i.max).astype(np.int16)
diff --git a/test/python/testoptional.py b/test/python/testoptional.py
index 6d1ba14e8..9394e28e2 100644
--- a/test/python/testoptional.py
+++ b/test/python/testoptional.py
@@ -47,7 +47,6 @@ def setUpClass(cls):
             "sentence_transformers",
             "sounddevice",
             "soundfile",
-            "speech_recognition",
             "sqlalchemy",
             "sqlite_vec",
             "tika",
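Note on `detectenergy` above: the check reduces to the share of spectral energy that falls inside the 300-3400 Hz voice band, compared against `vadthreshold`. A standalone sketch of that idea, not part of the change and simplified to skip the per-frequency bookkeeping in the diff:

```python
import numpy as np

from scipy.fft import rfft, rfftfreq

# One second of a 1 kHz tone - squarely inside the 300-3400 Hz voice band
rate, voicestart, voiceend = 16000, 300, 3400
samples = np.arange(rate) / rate
audio = np.sin(2 * np.pi * 1000 * samples)

# Drop the DC bin and square the amplitudes to get energy per frequency
frequency = rfftfreq(len(audio), 1.0 / rate)[1:]
energy = np.abs(rfft(audio))[1:] ** 2

# Share of energy inside the voice band - close to 1.0 here, well above the 0.6 default threshold
ratio = energy[(frequency >= voicestart) & (frequency <= voiceend)].sum() / energy.sum()
print(f"{ratio:.4f}")
```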
""" - @patch("speech_recognition.Recognizer.listen") - @patch("speech_recognition.Recognizer.adjust_for_ambient_noise") - @patch("speech_recognition.Microphone") - # pylint: disable=C0115,C0116,W0613 - def testMicrophone(self, microphone, ambient, listen): + # pylint: disable=C0115,C0116 + @patch("sounddevice.RawInputStream") + def testMicrophone(self, inputstream): """ Test listening to microphone """ - class Audio: - def __init__(self): - self.frame_data, self.sample_rate = None, None + class RawInputStream: + def __init__(self, **kwargs): + self.args = kwargs - def get_wav_data(self): - return self.frame_data + # Read audio data + self.index, self.passes = 0, 0 + audio, self.samplerate = sf.read(Utils.PATH + "/Make_huge_profits.wav") - def speech(device): - # Read audio data - raw, samplerate = sf.read(Utils.PATH + "/Make_huge_profits.wav") + # Convert data to PCM + self.audio = self.int16(audio) - audio = Audio() - audio.frame_data, audio.sample_rate = raw, samplerate - return audio + # Start with random data to test that speech is not detected + self.data = np.concatenate((self.audio * 50, np.zeros(shape=self.audio.shape, dtype=np.int16))) - def nospeech(device): - audio = Audio() - audio.sample_rate = 16000 - audio.frame_data = b"\x00\x00" * int(audio.sample_rate * 30 / 1000) - return audio + def start(self): + pass - microphone.return_value.__enter__.return_value = (0, 1) - ambient.return_value = True + def stop(self): + pass - pipeline = Microphone() + def read(self, size): + # Get chunk + chunk = self.data[self.index : self.index + size] + self.index += size + + # Initial pass is random data, 2nd pass is speech data + if self.index > len(self.data): + if not self.passes: + self.index, self.passes = 0, self.passes + 1 + self.data = self.audio + elif self.index >= len(self.audio) * 10: + # Break out of loop if speech continues to not be detected + raise IOError("Data exhausted") + + return chunk, False - listen.side_effect = speech - self.assertIsNotNone(pipeline([1])) + def int16(self, data): + i = np.iinfo(np.int16) + absmax = 2 ** (i.bits - 1) + offset = i.min + absmax + return (data * absmax + offset).clip(i.min, i.max).astype(np.int16) + + # Mock input stream + inputstream.return_value = RawInputStream() + + # Create microphone pipeline and read data + pipeline = Microphone() + data, rate = pipeline() - listen.side_effect = nospeech - self.assertIsNone(pipeline.listen(microphone)) + # Validate sample rate and length of data + self.assertEqual(len(data), 91220) + self.assertEqual(rate, 16000)