Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Consolidate update frames classes into a single UpdateSettingsFrame class #517

Merged
merged 4 commits into from
Sep 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,9 @@ async def on_connected(processor):

### Changed

- Consolidated the individual update settings frame classes into a single
`UpdateSettingsFrame` class per service type: `LLMUpdateSettingsFrame`,
`TTSUpdateSettingsFrame` and `STTUpdateSettingsFrame`.

- We now distinguish between input and output audio and image frames. We
introduce `InputAudioRawFrame`, `OutputAudioRawFrame`, `InputImageRawFrame`
and `OutputImageRawFrame` (and other subclasses of those). The input frames
Expand Down
131 changes: 31 additions & 100 deletions src/pipecat/frames/frames.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,8 @@
# SPDX-License-Identifier: BSD 2-Clause License
#

from typing import Any, List, Optional, Tuple

from dataclasses import dataclass, field
from typing import Any, List, Optional, Tuple, Union

from pipecat.clocks.base_clock import BaseClock
from pipecat.metrics.metrics import MetricsData
Expand Down Expand Up @@ -528,113 +527,45 @@ def __str__(self):


@dataclass
class LLMModelUpdateFrame(ControlFrame):
"""A control frame containing a request to update to a new LLM model."""

model: str


@dataclass
class LLMTemperatureUpdateFrame(ControlFrame):
"""A control frame containing a request to update to a new LLM temperature."""

temperature: float


@dataclass
class LLMTopKUpdateFrame(ControlFrame):
"""A control frame containing a request to update to a new LLM top_k."""
class LLMUpdateSettingsFrame(ControlFrame):
"""A control frame containing a request to update LLM settings."""

top_k: int
model: Optional[str] = None
temperature: Optional[float] = None
top_k: Optional[int] = None
top_p: Optional[float] = None
frequency_penalty: Optional[float] = None
presence_penalty: Optional[float] = None
max_tokens: Optional[int] = None
seed: Optional[int] = None
extra: dict = field(default_factory=dict)


@dataclass
class LLMTopPUpdateFrame(ControlFrame):
"""A control frame containing a request to update to a new LLM top_p."""

top_p: float


@dataclass
class LLMFrequencyPenaltyUpdateFrame(ControlFrame):
"""A control frame containing a request to update to a new LLM frequency
penalty.

"""

frequency_penalty: float


@dataclass
class LLMPresencePenaltyUpdateFrame(ControlFrame):
"""A control frame containing a request to update to a new LLM presence
penalty.

"""

presence_penalty: float


@dataclass
class LLMMaxTokensUpdateFrame(ControlFrame):
"""A control frame containing a request to update to a new LLM max tokens."""

max_tokens: int


@dataclass
class LLMSeedUpdateFrame(ControlFrame):
"""A control frame containing a request to update to a new LLM seed."""

seed: int


@dataclass
class LLMExtraUpdateFrame(ControlFrame):
"""A control frame containing a request to update to a new LLM extra params."""

extra: dict


@dataclass
class TTSModelUpdateFrame(ControlFrame):
"""A control frame containing a request to update the TTS model."""

model: str


@dataclass
class TTSVoiceUpdateFrame(ControlFrame):
"""A control frame containing a request to update to a new TTS voice."""

voice: str


@dataclass
class TTSLanguageUpdateFrame(ControlFrame):
"""A control frame containing a request to update to a new TTS language and
optional voice.

"""

language: Language


@dataclass
class STTModelUpdateFrame(ControlFrame):
"""A control frame containing a request to update the STT model and optional
language.

"""
class TTSUpdateSettingsFrame(ControlFrame):
"""A control frame containing a request to update TTS settings."""

model: str
model: Optional[str] = None
voice: Optional[str] = None
language: Optional[Language] = None
speed: Optional[Union[str, float]] = None
emotion: Optional[List[str]] = None
engine: Optional[str] = None
pitch: Optional[str] = None
rate: Optional[str] = None
volume: Optional[str] = None
emphasis: Optional[str] = None
style: Optional[str] = None
style_degree: Optional[str] = None
role: Optional[str] = None


@dataclass
class STTLanguageUpdateFrame(ControlFrame):
"""A control frame containing a request to update to STT language."""
class STTUpdateSettingsFrame(ControlFrame):
"""A control frame containing a request to update STT settings."""

language: Language
model: Optional[str] = None
language: Optional[Language] = None


@dataclass
Expand Down
106 changes: 85 additions & 21 deletions src/pipecat/services/ai_services.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,10 @@
import asyncio
import io
import wave

from abc import abstractmethod
from typing import AsyncGenerator, List, Optional, Tuple
from typing import AsyncGenerator, List, Optional, Tuple, Union

from loguru import logger

from pipecat.frames.frames import (
AudioRawFrame,
Expand All @@ -18,31 +19,26 @@
ErrorFrame,
Frame,
LLMFullResponseEndFrame,
STTLanguageUpdateFrame,
STTModelUpdateFrame,
StartFrame,
StartInterruptionFrame,
STTUpdateSettingsFrame,
TextFrame,
TTSAudioRawFrame,
TTSLanguageUpdateFrame,
TTSModelUpdateFrame,
TTSSpeakFrame,
TTSStartedFrame,
TTSStoppedFrame,
TTSVoiceUpdateFrame,
TextFrame,
TTSUpdateSettingsFrame,
UserImageRequestFrame,
VisionImageRawFrame,
)
from pipecat.metrics.metrics import MetricsData
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.transcriptions.language import Language
from pipecat.utils.audio import calculate_audio_volume
from pipecat.utils.string import match_endofsentence
from pipecat.utils.time import seconds_to_nanoseconds
from pipecat.utils.utils import exp_smoothing
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext

from loguru import logger


class AIService(FrameProcessor):
Expand Down Expand Up @@ -174,6 +170,46 @@ async def set_voice(self, voice: str):
async def set_language(self, language: Language):
pass

@abstractmethod
async def set_speed(self, speed: Union[str, float]):
    """Apply a new speed setting.

    Abstract hook with no behavior of its own; concrete services supply the
    implementation.

    Args:
        speed: The requested speed, given either as a string or as a float.
    """

@abstractmethod
async def set_emotion(self, emotion: List[str]):
    """Apply a new emotion setting.

    Abstract hook with no behavior of its own; concrete services supply the
    implementation.

    Args:
        emotion: The requested emotion values, as a list of strings.
    """

@abstractmethod
async def set_engine(self, engine: str):
    """Apply a new engine setting.

    Abstract hook with no behavior of its own; concrete services supply the
    implementation.

    Args:
        engine: The requested engine, as a string.
    """

@abstractmethod
async def set_pitch(self, pitch: str):
    """Apply a new pitch setting.

    Abstract hook with no behavior of its own; concrete services supply the
    implementation.

    Args:
        pitch: The requested pitch, as a string.
    """

@abstractmethod
async def set_rate(self, rate: str):
    """Apply a new rate setting.

    Abstract hook with no behavior of its own; concrete services supply the
    implementation.

    Args:
        rate: The requested rate, as a string.
    """

@abstractmethod
async def set_volume(self, volume: str):
    """Apply a new volume setting.

    Abstract hook with no behavior of its own; concrete services supply the
    implementation.

    Args:
        volume: The requested volume, as a string.
    """

@abstractmethod
async def set_emphasis(self, emphasis: str):
    """Apply a new emphasis setting.

    Abstract hook with no behavior of its own; concrete services supply the
    implementation.

    Args:
        emphasis: The requested emphasis, as a string.
    """

@abstractmethod
async def set_style(self, style: str):
    """Apply a new style setting.

    Abstract hook with no behavior of its own; concrete services supply the
    implementation.

    Args:
        style: The requested style, as a string.
    """

@abstractmethod
async def set_style_degree(self, style_degree: str):
    """Apply a new style degree setting.

    Abstract hook with no behavior of its own; concrete services supply the
    implementation.

    Args:
        style_degree: The requested style degree, as a string.
    """

@abstractmethod
async def set_role(self, role: str):
    """Apply a new role setting.

    Abstract hook with no behavior of its own; concrete services supply the
    implementation.

    Args:
        role: The requested role, as a string.
    """

# Converts the text to audio.
@abstractmethod
async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
Expand Down Expand Up @@ -212,6 +248,34 @@ async def _push_tts_frames(self, text: str):
# interrupted, the text is not added to the assistant context.
await self.push_frame(TextFrame(text))

async def _update_tts_settings(self, frame: TTSUpdateSettingsFrame):
    """Apply every setting that is present on a TTS settings update frame.

    Each listed frame attribute whose value is not ``None`` is passed to the
    setter of the same name (``set_<attribute>``). Attributes left at ``None``
    are skipped, so their setters are not called.

    Args:
        frame: The settings update request to apply.
    """
    # Setters are awaited one at a time, in exactly this order.
    setting_names = (
        "model",
        "voice",
        "language",
        "speed",
        "emotion",
        "engine",
        "pitch",
        "rate",
        "volume",
        "emphasis",
        "style",
        "style_degree",
        "role",
    )
    for setting_name in setting_names:
        requested = getattr(frame, setting_name)
        if requested is None:
            continue
        apply_setting = getattr(self, f"set_{setting_name}")
        await apply_setting(requested)

async def process_frame(self, frame: Frame, direction: FrameDirection):
await super().process_frame(frame, direction)

Expand All @@ -230,12 +294,8 @@ async def process_frame(self, frame: Frame, direction: FrameDirection):
await self.push_frame(frame, direction)
elif isinstance(frame, TTSSpeakFrame):
await self._push_tts_frames(frame.text)
elif isinstance(frame, TTSModelUpdateFrame):
await self.set_model(frame.model)
elif isinstance(frame, TTSVoiceUpdateFrame):
await self.set_voice(frame.voice)
elif isinstance(frame, TTSLanguageUpdateFrame):
await self.set_language(frame.language)
elif isinstance(frame, TTSUpdateSettingsFrame):
await self._update_tts_settings(frame)
else:
await self.push_frame(frame, direction)

Expand Down Expand Up @@ -397,6 +457,12 @@ async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
"""Returns transcript as a string"""
pass

async def _update_stt_settings(self, frame: STTUpdateSettingsFrame):
    """Apply the settings that are present on an STT settings update frame.

    The model is handled first, then the language. A value of ``None`` means
    the setting was not requested and its setter is not called.

    Args:
        frame: The settings update request to apply.
    """
    requested_model = frame.model
    if requested_model is not None:
        await self.set_model(requested_model)

    requested_language = frame.language
    if requested_language is not None:
        await self.set_language(requested_language)

async def process_audio_frame(self, frame: AudioRawFrame):
await self.process_generator(self.run_stt(frame.audio))

Expand All @@ -408,10 +474,8 @@ async def process_frame(self, frame: Frame, direction: FrameDirection):
# In this service we accumulate audio internally and at the end we
# push a TextFrame. We don't really want to push audio frames down.
await self.process_audio_frame(frame)
elif isinstance(frame, STTModelUpdateFrame):
await self.set_model(frame.model)
elif isinstance(frame, STTLanguageUpdateFrame):
await self.set_language(frame.language)
elif isinstance(frame, STTUpdateSettingsFrame):
await self._update_stt_settings(frame)
else:
await self.push_frame(frame, direction)

Expand Down
Loading
Loading