Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
- Document ingestion pipeline: load, split, embed, store via Qdrant (#472)
- File size guard (50 MiB default) and path canonicalization for document loaders
- Audio input support: `Attachment`/`AttachmentKind` types, `SpeechToText` trait, OpenAI Whisper backend behind `stt` feature flag (#520, #521, #522)
- Telegram voice and audio message handling with automatic file download (#524)
- STT bootstrap wiring: `WhisperProvider` created from `[llm.stt]` config behind `stt` feature (#529)

## [0.10.0] - 2026-02-18

Expand Down
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ scheduler = ["dep:zeph-scheduler"]
otel = ["dep:opentelemetry", "dep:opentelemetry_sdk", "dep:opentelemetry-otlp", "dep:tracing-opentelemetry"]
pdf = ["zeph-memory/pdf"]
mock = ["zeph-llm/mock", "zeph-memory/mock"]
stt = ["zeph-llm/stt"]
stt = ["zeph-llm/stt", "dep:reqwest"]

[dependencies]
anyhow.workspace = true
Expand All @@ -151,6 +151,7 @@ zeph-tools.workspace = true
zeph-gateway = { workspace = true, optional = true }
zeph-scheduler = { workspace = true, optional = true }
zeph-tui = { workspace = true, optional = true }
reqwest = { workspace = true, optional = true, features = ["rustls"] }

[dev-dependencies]
tempfile.workspace = true
Expand Down
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -236,8 +236,8 @@ Skills **evolve**: failure detection triggers self-reflection, and the agent gen
|----------|-------------|
| **MCP** | Connect external tool servers (stdio + HTTP) with SSRF protection |
| **A2A** | Agent-to-agent communication via JSON-RPC 2.0 with SSE streaming |
| **Audio input** | Speech-to-text transcription via OpenAI Whisper (25 MB limit) |
| **Channels** | CLI, Telegram, Discord, Slack, TUI — all with streaming support |
| **Audio input** | Speech-to-text via OpenAI Whisper (25 MB limit); Telegram voice messages transcribed automatically |
| **Channels** | CLI, Telegram (text + voice), Discord, Slack, TUI — all with streaming support |
| **Gateway** | HTTP webhook ingestion with bearer auth and rate limiting |
| **Native tool_use** | Structured tool calling via Claude/OpenAI APIs; text fallback for local models |

Expand Down
2 changes: 1 addition & 1 deletion crates/zeph-channels/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ Implements I/O channel adapters that connect the agent to different frontends. S
| Module | Description |
|--------|-------------|
| `cli` | `CliChannel` — interactive terminal I/O |
| `telegram` | Telegram adapter via teloxide with streaming |
| `telegram` | Telegram adapter via teloxide with streaming; voice/audio message detection and file download |
| `discord` | Discord adapter (optional feature) |
| `slack` | Slack adapter (optional feature) |
| `any` | `AnyChannel` — enum dispatch over all channels |
Expand Down
53 changes: 46 additions & 7 deletions crates/zeph-channels/src/telegram.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ use crate::markdown::markdown_to_telegram;
use teloxide::prelude::*;
use teloxide::types::{ChatAction, MessageId, ParseMode};
use tokio::sync::mpsc;
use zeph_core::channel::{Channel, ChannelError, ChannelMessage};
use zeph_core::channel::{Attachment, AttachmentKind, Channel, ChannelError, ChannelMessage};

const MAX_MESSAGE_LEN: usize = 4096;

Expand All @@ -24,6 +24,7 @@ pub struct TelegramChannel {
/// A message captured by the Telegram update handler and passed over the
/// internal mpsc queue to the channel's receive methods.
struct IncomingMessage {
/// Chat the message arrived in (taken from `msg.chat.id`).
chat_id: ChatId,
/// Message text; an empty string when the update carried no text.
text: String,
/// Attachments collected by the handler; currently populated with the
/// downloaded bytes of voice/audio messages.
attachments: Vec<Attachment>,
}

impl TelegramChannel {
Expand Down Expand Up @@ -62,7 +63,7 @@ impl TelegramChannel {
let allowed = self.allowed_users.clone();

tokio::spawn(async move {
let handler = Update::filter_message().endpoint(move |msg: Message, _bot: Bot| {
let handler = Update::filter_message().endpoint(move |msg: Message, bot: Bot| {
let tx = tx.clone();
let allowed = allowed.clone();
async move {
Expand All @@ -81,14 +82,38 @@ impl TelegramChannel {
}
}

let Some(text) = msg.text() else {
let text = msg.text().unwrap_or_default().to_string();
let mut attachments = Vec::new();

let audio_file_id = msg
.voice()
.map(|v| v.file.id.0.clone())
.or_else(|| msg.audio().map(|a| a.file.id.0.clone()));

if let Some(file_id) = audio_file_id {
match download_file(&bot, file_id).await {
Ok(data) => {
attachments.push(Attachment {
kind: AttachmentKind::Audio,
data,
filename: msg.audio().and_then(|a| a.file_name.clone()),
});
}
Err(e) => {
tracing::warn!("failed to download audio attachment: {e}");
}
}
}

if text.is_empty() && attachments.is_empty() {
return respond(());
};
}

let _ = tx
.send(IncomingMessage {
chat_id: msg.chat.id,
text: text.to_string(),
text,
attachments,
})
.await;

Expand Down Expand Up @@ -203,13 +228,27 @@ impl TelegramChannel {
}
}

/// Downloads the full contents of a Telegram file into memory.
///
/// The file is first resolved through `get_file` to obtain its server-side
/// path, then its contents are written into a growable byte buffer.
///
/// # Errors
///
/// Returns a human-readable message prefixed with `get_file: ` when the
/// lookup fails, or with `download_file: ` when the transfer fails.
async fn download_file(bot: &Bot, file_id: String) -> Result<Vec<u8>, String> {
    use teloxide::net::Download;

    // Step 1: resolve the file id to a downloadable path.
    let remote = match bot.get_file(file_id.into()).await {
        Ok(remote) => remote,
        Err(err) => return Err(format!("get_file: {err}")),
    };

    // Step 2: pull the bytes into an in-memory buffer.
    let mut bytes = Vec::<u8>::new();
    let outcome = bot.download_file(&remote.path, &mut bytes).await;
    match outcome {
        Ok(_) => Ok(bytes),
        Err(err) => Err(format!("download_file: {err}")),
    }
}

impl Channel for TelegramChannel {
fn try_recv(&mut self) -> Option<ChannelMessage> {
self.rx.try_recv().ok().map(|incoming| {
self.chat_id = Some(incoming.chat_id);
ChannelMessage {
text: incoming.text,
attachments: vec![],
attachments: incoming.attachments,
}
})
}
Expand Down Expand Up @@ -252,7 +291,7 @@ impl Channel for TelegramChannel {

return Ok(Some(ChannelMessage {
text: incoming.text,
attachments: vec![],
attachments: incoming.attachments,
}));
}
}
Expand Down
5 changes: 5 additions & 0 deletions docs/src/getting-started/configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,11 @@ max_tokens = 4096
# embedding_model = "text-embedding-3-small"
# reasoning_effort = "medium" # low, medium, high (for reasoning models)

[llm.stt]
provider = "whisper"
model = "whisper-1"
# Requires `stt` feature. Uses the OpenAI API key from [llm.openai] or ZEPH_OPENAI_API_KEY.

[skills]
paths = ["./skills"]
max_active_skills = 5 # Top-K skills per query via embedding similarity
Expand Down
8 changes: 8 additions & 0 deletions docs/src/guide/audio-input.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,14 @@ The Whisper provider inherits the OpenAI API key from the `[llm.openai]` section
| OpenAI Whisper API | `whisper` | `stt` | Available |
| Local Whisper (candle) | — | — | Planned |

## Telegram Voice Messages

The Telegram channel automatically detects voice and audio messages. When a user sends a voice note or audio file, the adapter downloads the file bytes via the Telegram Bot API and wraps them as an `Attachment` with `AttachmentKind::Audio`. The attachment then follows the standard transcription pipeline described above.

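Download failures (network errors, expired file links) are logged at `warn` level and the attachment is skipped rather than causing an error. If the message also carries text, it is delivered without the attachment; a message left with neither text nor attachments (for example, a voice note whose download failed) is dropped.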

Bootstrap wiring is automatic: when `[llm.stt]` is present in the config, the `stt` feature is enabled, and an OpenAI API key is available, `main.rs` creates a `WhisperProvider` and injects it into the agent via `with_stt()`. If the key is missing, a warning is logged and the agent starts without STT. No additional setup is needed beyond the configuration shown above.

## Limitations

- **25 MB file size limit** — audio files exceeding this are rejected before upload.
Expand Down
2 changes: 1 addition & 1 deletion docs/src/guide/channels.md
Original file line number Diff line number Diff line change
Expand Up @@ -256,7 +256,7 @@ When the queue is full (10 messages), new input is silently dropped until space

## Attachments

`ChannelMessage` supports an optional `attachments` field carrying `Attachment` values with typed `AttachmentKind` variants (Audio, Image, Video, File). When the `stt` feature is enabled, audio attachments are automatically transcribed before entering the agent loop. See [Audio Input](audio-input.md) for details.
`ChannelMessage` supports an optional `attachments` field carrying `Attachment` values with typed `AttachmentKind` variants (Audio, Image, Video, File). When the `stt` feature is enabled, audio attachments are automatically transcribed before entering the agent loop. The Telegram channel automatically downloads voice and audio messages and delivers them as attachments. See [Audio Input](audio-input.md) for details.

## Channel Selection Logic

Expand Down
29 changes: 29 additions & 0 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -423,6 +423,35 @@ async fn main() -> anyhow::Result<()> {
let agent = agent.with_mcp(mcp_tools, mcp_registry, Some(mcp_manager), &config.mcp);
let agent = agent.with_learning(config.skills.learning.clone());

#[cfg(feature = "stt")]
let agent = if config.llm.stt.is_some() {
if let Some(ref api_key) = config.secrets.openai_api_key {
let base_url = config
.llm
.openai
.as_ref()
.map_or("https://api.openai.com/v1", |o| o.base_url.as_str());
let model = config
.llm
.stt
.as_ref()
.map_or("whisper-1", |s| s.model.as_str());
let whisper = zeph_llm::whisper::WhisperProvider::new(
reqwest::Client::new(),
api_key.expose(),
base_url,
model,
);
tracing::info!("STT enabled via Whisper (model: {model})");
agent.with_stt(Box::new(whisper))
} else {
tracing::warn!("STT configured but ZEPH_OPENAI_API_KEY not found");
agent
}
} else {
agent
};

#[cfg(feature = "tui")]
let tui_metrics_rx;
#[cfg(feature = "tui")]
Expand Down
Loading