pipecat-ai · ankykong · Jul 24, 2024 · Jul 25, 2024 · aconchillo · Oct 12, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -128,6 +128,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   processing metrics indicate the time a processor needs to generate all its
   output. Note that not all processors generate these kind of metrics.
 
+- Added `NoiseReduce` (`src/pipecat/services/noisereduce.py`), a processor that
+  runs `noisereduce` to reduce background noise on calls; this is especially
+  useful for calls routed through Twilio. Example with `DeepgramSTTService`:
+  `examples/foundational/07c-i-interruptible-deepgram-noisereduce.py`.
+
 ### Changed
 
 - `WhisperSTTService` model can now also be a string.

diff --git a/examples/foundational/07c-i-interruptible-deepgram-noisereduce.py b/examples/foundational/07c-i-interruptible-deepgram-noisereduce.py
@@ -0,0 +1,101 @@
+#
+# Copyright (c) 2024, Daily
+#
+# SPDX-License-Identifier: BSD 2-Clause License
+#
+
+import asyncio
+import aiohttp
+import os
+import sys
+
+from pipecat.frames.frames import LLMMessagesFrame
+from pipecat.pipeline.pipeline import Pipeline
+from pipecat.pipeline.runner import PipelineRunner
+from pipecat.pipeline.task import PipelineParams, PipelineTask
+from pipecat.processors.aggregators.llm_response import (
+    LLMAssistantResponseAggregator, LLMUserResponseAggregator)
+from pipecat.services.deepgram import DeepgramSTTService, DeepgramTTSService
+from pipecat.services.openai import OpenAILLMService
+from pipecat.transports.services.daily import DailyParams, DailyTransport
+from pipecat.vad.silero import SileroVADAnalyzer
+from pipecat.services.noisereduce import NoiseReduce
+
+from runner import configure
+
+from loguru import logger
+
+from dotenv import load_dotenv
+load_dotenv(override=True)  # values from .env override variables already set in the environment
+# Replace loguru's default handler (id 0) with a DEBUG-level stderr sink.
+logger.remove(0)
+logger.add(sys.stderr, level="DEBUG")
+
+
+async def main(room_url: str, token):
+    """Run a Daily voice bot whose input audio is denoised before Deepgram STT."""
+    async with aiohttp.ClientSession() as session:
+        transport = DailyTransport(
+            room_url,
+            token,
+            "Respond bot",
+            DailyParams(
+                audio_out_enabled=True,
+                vad_enabled=True,
+                vad_analyzer=SileroVADAnalyzer(),
+                vad_audio_passthrough=True  # presumably lets raw audio reach nr/stt downstream -- confirm
+            )
+        )
+
+        stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
+
+        nr = NoiseReduce()
+
+        tts = DeepgramTTSService(
+            aiohttp_session=session,
+            api_key=os.getenv("DEEPGRAM_API_KEY"),
+            voice="aura-helios-en"
+        )
+
+        llm = OpenAILLMService(
+            api_key=os.getenv("OPENAI_API_KEY"),
+            model="gpt-4o")
+
+        messages = [
+            {
+                "role": "system",
+                "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
+            },
+        ]
+
+        tma_in = LLMUserResponseAggregator(messages)
+        tma_out = LLMAssistantResponseAggregator(messages)
+
+        pipeline = Pipeline([
+            transport.input(),   # Transport user input
+            nr,                  # Noise reducer
+            stt,                 # STT
+            tma_in,              # User responses
+            llm,                 # LLM
+            tts,                 # TTS
+            transport.output(),  # Transport bot output
+            tma_out              # Assistant spoken responses
+        ])
+
+        task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True))
+
+        @transport.event_handler("on_first_participant_joined")
+        async def on_first_participant_joined(transport, participant):
+            transport.capture_participant_transcription(participant["id"])
+            # Kick off the conversation.
+            messages.append(
+                {"role": "system", "content": "Please introduce yourself to the user."})
+            await task.queue_frames([LLMMessagesFrame(messages)])
+
+        runner = PipelineRunner()
+        await runner.run(task)
+
+
+if __name__ == "__main__":
+    (url, token) = configure()  # room URL and token are supplied by the local `runner` helper
+    asyncio.run(main(url, token))  # drive the bot until the pipeline runner returns
diff --git a/linux-py3.10-requirements.txt b/linux-py3.10-requirements.txt
@@ -490,6 +490,8 @@ werkzeug==3.0.3
     # via flask
 yarl==1.9.4
     # via aiohttp
+noisereduce==3.0.2
+    # via pipecat-ai (pyproject.toml)
 
 # The following packages are considered to be unsafe in a requirements file:
 # setuptools
diff --git a/macos-py3.10-requirements.txt b/macos-py3.10-requirements.txt
@@ -454,6 +454,8 @@ werkzeug==3.0.3
     # via flask
 yarl==1.9.4
     # via aiohttp
-
+noisereduce==3.0.2
+    # via pipecat-ai (pyproject.toml)
+
 # The following packages are considered to be unsafe in a requirements file:
 # setuptools
diff --git a/pyproject.toml b/pyproject.toml
@@ -54,6 +54,7 @@ silero = [ "torch~=2.3.1", "torchaudio~=2.3.1" ]
 websocket = [ "websockets~=12.0", "fastapi~=0.111.0" ]
 whisper = [ "faster-whisper~=1.0.3" ]
 xtts = [ "resampy~=0.4.3" ]
+noisereduce = [ "noisereduce~=3.0.2" ]
 
 [tool.setuptools.packages.find]
 # All the following settings are optional:

diff --git a/src/pipecat/services/noisereduce.py b/src/pipecat/services/noisereduce.py
@@ -0,0 +1,61 @@
+"""Noise-reduction frame processor built on the `noisereduce` library."""
+
+import noisereduce as nr
+import numpy as np
+from loguru import logger
+
+from pipecat.frames.frames import (
+    AudioRawFrame,
+    Frame,
+)
+from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
+
+
+class NoiseReduce(FrameProcessor):
+    """Denoise mono 16-bit PCM audio frames as they flow through a pipeline.
+
+    Every `AudioRawFrame` has its `audio` payload rewritten in place with the
+    output of `noisereduce.reduce_noise`; all frames, audio or not, are then
+    pushed on in the direction they arrived.
+    """
+
+    def __init__(self):
+        super().__init__()
+
+    async def process_frame(self, frame: Frame, direction: FrameDirection):
+        """Denoise `frame` if it carries raw audio, then forward it."""
+        await super().process_frame(frame, direction)
+        if isinstance(frame, AudioRawFrame):
+            # NOTE(review): this runs synchronously inside the coroutine;
+            # confirm it is fast enough not to stall the pipeline.
+            self.reduce_noise(frame)
+        await self.push_frame(frame, direction)
+
+    def reduce_noise(self, frame: AudioRawFrame) -> None:
+        """Replace `frame.audio` with a noise-reduced copy of itself.
+
+        The frame is left untouched when it is not mono (an error is logged)
+        or when it carries no audio bytes.
+        """
+        if frame.num_channels != 1:
+            logger.error(f"Expected 1 channel, got {frame.num_channels}")
+            return
+
+        # Nothing to denoise, and no reason to hand the library an empty array.
+        if not frame.audio:
+            return
+
+        samples = np.frombuffer(frame.audio, dtype=np.int16)
+
+        # Add a small epsilon to avoid division by zero (presumably inside
+        # noisereduce when the input is all zeros).
+        epsilon = 1e-10
+        data = samples.astype(np.float32) + epsilon
+
+        reduced = nr.reduce_noise(y=data, sr=frame.sample_rate)
+
+        # Casting NaN/inf to int16 is undefined, so sanitize before clipping
+        # back into the 16-bit PCM range.
+        pcm = np.iinfo(np.int16)
+        reduced = np.nan_to_num(reduced, nan=0.0, posinf=pcm.max, neginf=pcm.min)
+        frame.audio = np.clip(reduced, pcm.min, pcm.max).astype(np.int16).tobytes()