diff --git a/CHANGELOG.md b/CHANGELOG.md index 1279227..e7703f3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -28,6 +28,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). - Document ingestion pipeline: load, split, embed, store via Qdrant (#472) - File size guard (50 MiB default) and path canonicalization for document loaders - Audio input support: `Attachment`/`AttachmentKind` types, `SpeechToText` trait, OpenAI Whisper backend behind `stt` feature flag (#520, #521, #522) +- Telegram voice and audio message handling with automatic file download (#524) +- STT bootstrap wiring: `WhisperProvider` created from `[llm.stt]` config behind `stt` feature (#529) ## [0.10.0] - 2026-02-18 diff --git a/Cargo.lock b/Cargo.lock index 8af10d7..2dd66ca 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8613,6 +8613,7 @@ dependencies = [ "opentelemetry", "opentelemetry-otlp", "opentelemetry_sdk", + "reqwest 0.13.2", "tempfile", "tokio", "tokio-stream", diff --git a/Cargo.toml b/Cargo.toml index 056e31b..1b58928 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -124,7 +124,7 @@ scheduler = ["dep:zeph-scheduler"] otel = ["dep:opentelemetry", "dep:opentelemetry_sdk", "dep:opentelemetry-otlp", "dep:tracing-opentelemetry"] pdf = ["zeph-memory/pdf"] mock = ["zeph-llm/mock", "zeph-memory/mock"] -stt = ["zeph-llm/stt"] +stt = ["zeph-llm/stt", "dep:reqwest"] [dependencies] anyhow.workspace = true @@ -151,6 +151,7 @@ zeph-tools.workspace = true zeph-gateway = { workspace = true, optional = true } zeph-scheduler = { workspace = true, optional = true } zeph-tui = { workspace = true, optional = true } +reqwest = { workspace = true, optional = true, features = ["rustls"] } [dev-dependencies] tempfile.workspace = true diff --git a/README.md b/README.md index 10b88e5..8bca9de 100644 --- a/README.md +++ b/README.md @@ -236,8 +236,8 @@ Skills **evolve**: failure detection triggers self-reflection, and the agent gen |----------|-------------| | **MCP** | Connect external tool 
servers (stdio + HTTP) with SSRF protection | | **A2A** | Agent-to-agent communication via JSON-RPC 2.0 with SSE streaming | -| **Audio input** | Speech-to-text transcription via OpenAI Whisper (25 MB limit) | -| **Channels** | CLI, Telegram, Discord, Slack, TUI — all with streaming support | +| **Audio input** | Speech-to-text via OpenAI Whisper (25 MB limit); Telegram voice messages transcribed automatically | +| **Channels** | CLI, Telegram (text + voice), Discord, Slack, TUI — all with streaming support | | **Gateway** | HTTP webhook ingestion with bearer auth and rate limiting | | **Native tool_use** | Structured tool calling via Claude/OpenAI APIs; text fallback for local models | diff --git a/crates/zeph-channels/README.md b/crates/zeph-channels/README.md index 701c75d..9853c3a 100644 --- a/crates/zeph-channels/README.md +++ b/crates/zeph-channels/README.md @@ -11,7 +11,7 @@ Implements I/O channel adapters that connect the agent to different frontends. S | Module | Description | |--------|-------------| | `cli` | `CliChannel` — interactive terminal I/O | -| `telegram` | Telegram adapter via teloxide with streaming | +| `telegram` | Telegram adapter via teloxide with streaming; voice/audio message detection and file download | | `discord` | Discord adapter (optional feature) | | `slack` | Slack adapter (optional feature) | | `any` | `AnyChannel` — enum dispatch over all channels | diff --git a/crates/zeph-channels/src/telegram.rs b/crates/zeph-channels/src/telegram.rs index 2145e3d..53d3f09 100644 --- a/crates/zeph-channels/src/telegram.rs +++ b/crates/zeph-channels/src/telegram.rs @@ -4,7 +4,7 @@ use crate::markdown::markdown_to_telegram; use teloxide::prelude::*; use teloxide::types::{ChatAction, MessageId, ParseMode}; use tokio::sync::mpsc; -use zeph_core::channel::{Channel, ChannelError, ChannelMessage}; +use zeph_core::channel::{Attachment, AttachmentKind, Channel, ChannelError, ChannelMessage}; const MAX_MESSAGE_LEN: usize = 4096; @@ -24,6 +24,7 @@ pub 
struct TelegramChannel { struct IncomingMessage { chat_id: ChatId, text: String, + attachments: Vec, } impl TelegramChannel { @@ -62,7 +63,7 @@ impl TelegramChannel { let allowed = self.allowed_users.clone(); tokio::spawn(async move { - let handler = Update::filter_message().endpoint(move |msg: Message, _bot: Bot| { + let handler = Update::filter_message().endpoint(move |msg: Message, bot: Bot| { let tx = tx.clone(); let allowed = allowed.clone(); async move { @@ -81,14 +82,38 @@ impl TelegramChannel { } } - let Some(text) = msg.text() else { + let text = msg.text().unwrap_or_default().to_string(); + let mut attachments = Vec::new(); + + let audio_file_id = msg + .voice() + .map(|v| v.file.id.0.clone()) + .or_else(|| msg.audio().map(|a| a.file.id.0.clone())); + + if let Some(file_id) = audio_file_id { + match download_file(&bot, file_id).await { + Ok(data) => { + attachments.push(Attachment { + kind: AttachmentKind::Audio, + data, + filename: msg.audio().and_then(|a| a.file_name.clone()), + }); + } + Err(e) => { + tracing::warn!("failed to download audio attachment: {e}"); + } + } + } + + if text.is_empty() && attachments.is_empty() { return respond(()); - }; + } let _ = tx .send(IncomingMessage { chat_id: msg.chat.id, - text: text.to_string(), + text, + attachments, }) .await; @@ -203,13 +228,27 @@ impl TelegramChannel { } } +async fn download_file(bot: &Bot, file_id: String) -> Result, String> { + use teloxide::net::Download; + + let file = bot + .get_file(file_id.into()) + .await + .map_err(|e| format!("get_file: {e}"))?; + let mut buf: Vec = Vec::new(); + bot.download_file(&file.path, &mut buf) + .await + .map_err(|e| format!("download_file: {e}"))?; + Ok(buf) +} + impl Channel for TelegramChannel { fn try_recv(&mut self) -> Option { self.rx.try_recv().ok().map(|incoming| { self.chat_id = Some(incoming.chat_id); ChannelMessage { text: incoming.text, - attachments: vec![], + attachments: incoming.attachments, } }) } @@ -252,7 +291,7 @@ impl Channel for 
TelegramChannel { return Ok(Some(ChannelMessage { text: incoming.text, - attachments: vec![], + attachments: incoming.attachments, })); } } diff --git a/docs/src/getting-started/configuration.md b/docs/src/getting-started/configuration.md index 04f5931..06ce0f3 100644 --- a/docs/src/getting-started/configuration.md +++ b/docs/src/getting-started/configuration.md @@ -95,6 +95,11 @@ max_tokens = 4096 # embedding_model = "text-embedding-3-small" # reasoning_effort = "medium" # low, medium, high (for reasoning models) +[llm.stt] +provider = "whisper" +model = "whisper-1" +# Requires `stt` feature. Uses the OpenAI API key from [llm.openai] or ZEPH_OPENAI_API_KEY. + [skills] paths = ["./skills"] max_active_skills = 5 # Top-K skills per query via embedding similarity diff --git a/docs/src/guide/audio-input.md b/docs/src/guide/audio-input.md index 2cd5963..8868a92 100644 --- a/docs/src/guide/audio-input.md +++ b/docs/src/guide/audio-input.md @@ -35,6 +35,14 @@ The Whisper provider inherits the OpenAI API key from the `[llm.openai]` section | OpenAI Whisper API | `whisper` | `stt` | Available | | Local Whisper (candle) | — | — | Planned | +## Telegram Voice Messages + +The Telegram channel automatically detects voice and audio messages. When a user sends a voice note or audio file, the adapter downloads the file bytes via the Telegram Bot API and wraps them as an `Attachment` with `AttachmentKind::Audio`. The attachment then follows the standard transcription pipeline described above. + +Download failures (network errors, expired file links) are logged at `warn` level and gracefully skipped rather than causing an error — a message that also carries text is delivered without the attachment, while a message left with neither text nor attachments (such as a plain voice note) is dropped. + +Bootstrap wiring is automatic: when `[llm.stt]` is present in the config and the `stt` feature is enabled, `main.rs` creates a `WhisperProvider` and injects it into the agent via `with_stt()`. No additional setup is needed beyond the configuration shown above. 
+ ## Limitations - **25 MB file size limit** — audio files exceeding this are rejected before upload. diff --git a/docs/src/guide/channels.md b/docs/src/guide/channels.md index 9fc1f32..90db8e4 100644 --- a/docs/src/guide/channels.md +++ b/docs/src/guide/channels.md @@ -256,7 +256,7 @@ When the queue is full (10 messages), new input is silently dropped until space ## Attachments -`ChannelMessage` supports an optional `attachments` field carrying `Attachment` values with typed `AttachmentKind` variants (Audio, Image, Video, File). When the `stt` feature is enabled, audio attachments are automatically transcribed before entering the agent loop. See [Audio Input](audio-input.md) for details. +`ChannelMessage` supports an optional `attachments` field carrying `Attachment` values with typed `AttachmentKind` variants (Audio, Image, Video, File). When the `stt` feature is enabled, audio attachments are automatically transcribed before entering the agent loop. The Telegram channel automatically downloads voice and audio messages and delivers them as attachments. See [Audio Input](audio-input.md) for details. 
## Channel Selection Logic diff --git a/src/main.rs b/src/main.rs index 0b82ca3..e36ad3e 100644 --- a/src/main.rs +++ b/src/main.rs @@ -423,6 +423,35 @@ async fn main() -> anyhow::Result<()> { let agent = agent.with_mcp(mcp_tools, mcp_registry, Some(mcp_manager), &config.mcp); let agent = agent.with_learning(config.skills.learning.clone()); + #[cfg(feature = "stt")] + let agent = if config.llm.stt.is_some() { + if let Some(ref api_key) = config.secrets.openai_api_key { + let base_url = config + .llm + .openai + .as_ref() + .map_or("https://api.openai.com/v1", |o| o.base_url.as_str()); + let model = config + .llm + .stt + .as_ref() + .map_or("whisper-1", |s| s.model.as_str()); + let whisper = zeph_llm::whisper::WhisperProvider::new( + reqwest::Client::new(), + api_key.expose(), + base_url, + model, + ); + tracing::info!("STT enabled via Whisper (model: {model})"); + agent.with_stt(Box::new(whisper)) + } else { + tracing::warn!("STT configured but ZEPH_OPENAI_API_KEY not found"); + agent + } + } else { + agent + }; + #[cfg(feature = "tui")] let tui_metrics_rx; #[cfg(feature = "tui")]