From dbd73b2cae9f76ddf1c14174361e1113f27694c4 Mon Sep 17 00:00:00 2001 From: "Andrei G." Date: Thu, 19 Feb 2026 01:00:06 +0100 Subject: [PATCH 1/4] feat(llm): add vision (image input) support across all providers Introduce MessagePart::Image variant with base64 serde, LlmProvider::supports_vision() trait method, and per-provider implementations: - Claude: structured content path with base64 ImageSource blocks - OpenAI: array content format with data-URI image_url entries - Ollama: with_images() call on user messages, optional separate vision_model Agent layer extracts AttachmentKind::Image from ChannelMessage into MessagePart::Image (capped at 20 MB), constructs Message::from_parts when provider supports vision. Channels: - CLI: /image command reads local file into image attachment - Telegram: downloads largest PhotoSize and attaches as Image Config: vision_model field on LlmConfig; --init wizard prompts for it (Ollama only). Bootstrap wires vision_model into OllamaProvider. Closes #490 --- Cargo.lock | 1 + Cargo.toml | 1 + crates/zeph-channels/src/cli.rs | 29 ++- crates/zeph-channels/src/telegram.rs | 18 ++ crates/zeph-core/src/agent/mod.rs | 321 +++++++++++++++++++++------ crates/zeph-core/src/bootstrap.rs | 5 +- crates/zeph-core/src/config/types.rs | 3 + crates/zeph-llm/Cargo.toml | 1 + crates/zeph-llm/src/any.rs | 33 +++ crates/zeph-llm/src/claude.rs | 90 +++++++- crates/zeph-llm/src/ollama.rs | 65 +++++- crates/zeph-llm/src/openai.rs | 155 +++++++++++-- crates/zeph-llm/src/provider.rs | 77 +++++++ src/init.rs | 16 ++ 14 files changed, 718 insertions(+), 97 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index aa3118b2..42322e3b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8941,6 +8941,7 @@ name = "zeph-llm" version = "0.10.0" dependencies = [ "anyhow", + "base64 0.22.1", "candle-core", "candle-nn", "candle-transformers", diff --git a/Cargo.toml b/Cargo.toml index 09766213..6b530307 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,6 +15,7 @@ age = { version = 
"0.11.2", default-features = false } clap = { version = "4.5", features = ["derive"] } dialoguer = "0.11" anyhow = "1.0" +base64 = "0.22" candle-core = { version = "0.9", default-features = false } candle-nn = { version = "0.9", default-features = false } candle-transformers = { version = "0.9", default-features = false } diff --git a/crates/zeph-channels/src/cli.rs b/crates/zeph-channels/src/cli.rs index de2ab5be..edd6345f 100644 --- a/crates/zeph-channels/src/cli.rs +++ b/crates/zeph-channels/src/cli.rs @@ -1,4 +1,4 @@ -use zeph_core::channel::{Channel, ChannelError, ChannelMessage}; +use zeph_core::channel::{Attachment, AttachmentKind, Channel, ChannelError, ChannelMessage}; /// CLI channel that reads from stdin and writes to stdout. #[derive(Debug)] @@ -54,6 +54,33 @@ impl Channel for CliChannel { // Reset accumulated for new response self.accumulated.clear(); + // Handle /image command by reading the file into an attachment + if let Some(path) = trimmed.strip_prefix("/image").map(str::trim) { + if path.is_empty() { + println!("Usage: /image "); + return Ok(Some(ChannelMessage { + text: String::new(), + attachments: vec![], + })); + } + let path_owned = path.to_owned(); + let data = tokio::fs::read(&path_owned) + .await + .map_err(ChannelError::Io)?; + let filename = std::path::Path::new(&path_owned) + .file_name() + .and_then(|n| n.to_str()) + .map(str::to_owned); + return Ok(Some(ChannelMessage { + text: String::new(), + attachments: vec![Attachment { + kind: AttachmentKind::Image, + data, + filename, + }], + })); + } + Ok(Some(ChannelMessage { text: trimmed.to_string(), attachments: vec![], diff --git a/crates/zeph-channels/src/telegram.rs b/crates/zeph-channels/src/telegram.rs index 53d3f09b..c2d8b4a8 100644 --- a/crates/zeph-channels/src/telegram.rs +++ b/crates/zeph-channels/src/telegram.rs @@ -105,6 +105,24 @@ impl TelegramChannel { } } + // Handle photo attachments (pick the largest available size) + if let Some(photos) = msg.photo() + && let 
Some(photo) = photos.iter().max_by_key(|p| p.file.size) + { + match download_file(&bot, photo.file.id.0.clone()).await { + Ok(data) => { + attachments.push(Attachment { + kind: AttachmentKind::Image, + data, + filename: None, + }); + } + Err(e) => { + tracing::warn!("failed to download photo attachment: {e}"); + } + } + } + if text.is_empty() && attachments.is_empty() { return respond(()); } diff --git a/crates/zeph-core/src/agent/mod.rs b/crates/zeph-core/src/agent/mod.rs index e4e052d8..3ad21dac 100644 --- a/crates/zeph-core/src/agent/mod.rs +++ b/crates/zeph-core/src/agent/mod.rs @@ -48,14 +48,31 @@ const SUMMARY_PREFIX: &str = "[conversation summaries]\n"; const CROSS_SESSION_PREFIX: &str = "[cross-session context]\n"; const TOOL_OUTPUT_SUFFIX: &str = "\n```"; const MAX_AUDIO_BYTES: usize = 25 * 1024 * 1024; +const MAX_IMAGE_BYTES: usize = 20 * 1024 * 1024; fn format_tool_output(tool_name: &str, body: &str) -> String { format!("[tool output: {tool_name}]\n```\n{body}{TOOL_OUTPUT_SUFFIX}") } +fn detect_image_mime(filename: Option<&str>) -> String { + let ext = filename + .and_then(|f| std::path::Path::new(f).extension()) + .and_then(|e| e.to_str()) + .unwrap_or("") + .to_lowercase(); + match ext.as_str() { + "jpg" | "jpeg" => "image/jpeg", + "gif" => "image/gif", + "webp" => "image/webp", + _ => "image/png", + } + .to_owned() +} + struct QueuedMessage { text: String, received_at: Instant, + image_parts: Vec, } pub(super) struct MemoryState { @@ -535,14 +552,20 @@ impl Agent { self.message_queue.pop_back(); continue; } - self.enqueue_or_merge(msg.text); + self.enqueue_or_merge(msg.text, vec![]); } } - fn enqueue_or_merge(&mut self, text: String) { + fn enqueue_or_merge( + &mut self, + text: String, + image_parts: Vec, + ) { let now = Instant::now(); if let Some(last) = self.message_queue.back_mut() && now.duration_since(last.received_at) < MESSAGE_MERGE_WINDOW + && last.image_parts.is_empty() + && image_parts.is_empty() { last.text.push('\n'); 
last.text.push_str(&text); @@ -552,6 +575,7 @@ impl Agent { self.message_queue.push_back(QueuedMessage { text, received_at: now, + image_parts, }); } else { tracing::warn!("message queue full, dropping message"); @@ -615,9 +639,9 @@ impl Agent { loop { self.drain_channel(); - let text = if let Some(queued) = self.message_queue.pop_front() { + let (text, image_parts) = if let Some(queued) = self.message_queue.pop_front() { self.notify_queue_count().await; - queued.text + (queued.text, queued.image_parts) } else { let incoming = tokio::select! { result = self.channel.recv() => result?, @@ -636,7 +660,7 @@ impl Agent { }; let Some(msg) = incoming else { break }; self.drain_channel(); - self.resolve_message_text(msg).await + self.resolve_message(msg).await }; let trimmed = text.trim(); @@ -650,14 +674,18 @@ impl Agent { continue; } - self.process_user_message(text).await?; + self.process_user_message(text, image_parts).await?; } Ok(()) } - async fn resolve_message_text(&self, msg: crate::channel::ChannelMessage) -> String { + async fn resolve_message( + &self, + msg: crate::channel::ChannelMessage, + ) -> (String, Vec) { use crate::channel::AttachmentKind; + use zeph_llm::provider::MessagePart; let audio_attachments: Vec<_> = msg .attachments @@ -665,59 +693,87 @@ impl Agent { .filter(|a| a.kind == AttachmentKind::Audio) .collect(); - if audio_attachments.is_empty() { - return msg.text; - } + let image_attachments: Vec<_> = msg + .attachments + .iter() + .filter(|a| a.kind == AttachmentKind::Image) + .collect(); - let Some(stt) = &self.stt else { - tracing::warn!( - count = audio_attachments.len(), - "audio attachments received but no STT provider configured, dropping" - ); - return msg.text; + let text = if !audio_attachments.is_empty() + && let Some(stt) = self.stt.as_ref() + { + let mut transcribed_parts = Vec::new(); + for attachment in &audio_attachments { + if attachment.data.len() > MAX_AUDIO_BYTES { + tracing::warn!( + size = attachment.data.len(), + max = 
MAX_AUDIO_BYTES, + "audio attachment exceeds size limit, skipping" + ); + continue; + } + match stt + .transcribe(&attachment.data, attachment.filename.as_deref()) + .await + { + Ok(result) => { + tracing::info!( + len = result.text.len(), + language = ?result.language, + "audio transcribed" + ); + transcribed_parts.push(result.text); + } + Err(e) => { + tracing::error!(error = %e, "audio transcription failed"); + } + } + } + if transcribed_parts.is_empty() { + msg.text.clone() + } else { + let transcribed = transcribed_parts.join("\n"); + if msg.text.is_empty() { + transcribed + } else { + format!("[transcribed audio]\n{transcribed}\n\n{}", msg.text) + } + } + } else { + if !audio_attachments.is_empty() { + tracing::warn!( + count = audio_attachments.len(), + "audio attachments received but no STT provider configured, dropping" + ); + } + msg.text.clone() }; - let mut transcribed_parts = Vec::new(); - for attachment in &audio_attachments { - if attachment.data.len() > MAX_AUDIO_BYTES { + let mut image_parts = Vec::new(); + for attachment in image_attachments { + if attachment.data.len() > MAX_IMAGE_BYTES { tracing::warn!( size = attachment.data.len(), - max = MAX_AUDIO_BYTES, - "audio attachment exceeds size limit, skipping" + max = MAX_IMAGE_BYTES, + "image attachment exceeds size limit, skipping" ); continue; } - match stt - .transcribe(&attachment.data, attachment.filename.as_deref()) - .await - { - Ok(result) => { - tracing::info!( - len = result.text.len(), - language = ?result.language, - "audio transcribed" - ); - transcribed_parts.push(result.text); - } - Err(e) => { - tracing::error!(error = %e, "audio transcription failed"); - } - } - } - - if transcribed_parts.is_empty() { - return msg.text; + let mime_type = detect_image_mime(attachment.filename.as_deref()); + image_parts.push(MessagePart::Image { + data: attachment.data.clone(), + mime_type, + }); } - let transcribed = transcribed_parts.join("\n"); - if msg.text.is_empty() { - transcribed - } else { - 
format!("[transcribed audio]\n{transcribed}\n\n{}", msg.text) - } + (text, image_parts) } - async fn process_user_message(&mut self, text: String) -> Result<(), error::AgentError> { + async fn process_user_message( + &mut self, + text: String, + image_parts: Vec, + ) -> Result<(), error::AgentError> { self.cancel_token = CancellationToken::new(); let signal = Arc::clone(&self.cancel_signal); let token = self.cancel_token.clone(); @@ -748,6 +804,12 @@ impl Agent { return Ok(()); } + if let Some(path) = trimmed.strip_prefix("/image ") { + return self + .handle_image_command(path.trim(), &mut image_parts.into_iter().collect()) + .await; + } + self.rebuild_system_prompt(&text).await; if let Err(e) = self.maybe_compact().await { @@ -760,11 +822,24 @@ impl Agent { self.reflection_used = false; - self.push_message(Message { - role: Role::User, - content: text.clone(), - parts: vec![], - }); + let user_msg = if !image_parts.is_empty() && self.provider.supports_vision() { + let mut parts = vec![zeph_llm::provider::MessagePart::Text { text: text.clone() }]; + parts.extend(image_parts); + Message::from_parts(Role::User, parts) + } else { + if !image_parts.is_empty() { + tracing::warn!( + count = image_parts.len(), + "image attachments dropped: provider does not support vision" + ); + } + Message { + role: Role::User, + content: text.clone(), + parts: vec![], + } + }; + self.push_message(user_msg); self.persist_message(Role::User, &text).await; if let Err(e) = self.process_response().await { @@ -778,6 +853,39 @@ impl Agent { Ok(()) } + async fn handle_image_command( + &mut self, + path: &str, + extra_parts: &mut Vec, + ) -> Result<(), error::AgentError> { + use zeph_llm::provider::MessagePart; + + let data = match std::fs::read(path) { + Ok(d) => d, + Err(e) => { + self.channel + .send(&format!("Cannot read image {path}: {e}")) + .await?; + return Ok(()); + } + }; + if data.len() > MAX_IMAGE_BYTES { + self.channel + .send(&format!( + "Image {path} exceeds size limit ({} MB), 
skipping", + MAX_IMAGE_BYTES / 1024 / 1024 + )) + .await?; + return Ok(()); + } + let mime_type = detect_image_mime(Some(path)); + extra_parts.push(MessagePart::Image { data, mime_type }); + self.channel + .send(&format!("Image loaded: {path}. Send your message.")) + .await?; + Ok(()) + } + async fn handle_skills_command(&mut self) -> Result<(), error::AgentError> { use std::fmt::Write; @@ -981,7 +1089,7 @@ async fn recv_optional(rx: &mut Option>) -> Option { #[cfg(test)] pub(super) mod agent_tests { pub(crate) use super::*; - use crate::channel::ChannelMessage; + use crate::channel::{Attachment, AttachmentKind, ChannelMessage}; use std::sync::{Arc, Mutex}; use zeph_llm::mock::MockProvider; use zeph_tools::executor::{ToolError, ToolOutput}; @@ -1889,7 +1997,7 @@ pub(super) mod agent_tests { let executor = MockToolExecutor::no_tools(); let mut agent = Agent::new(provider, channel, registry, None, 5, executor); - agent.enqueue_or_merge("hello".into()); + agent.enqueue_or_merge("hello".into(), vec![]); assert_eq!(agent.message_queue.len(), 1); assert_eq!(agent.message_queue[0].text, "hello"); } @@ -1902,8 +2010,8 @@ pub(super) mod agent_tests { let executor = MockToolExecutor::no_tools(); let mut agent = Agent::new(provider, channel, registry, None, 5, executor); - agent.enqueue_or_merge("first".into()); - agent.enqueue_or_merge("second".into()); + agent.enqueue_or_merge("first".into(), vec![]); + agent.enqueue_or_merge("second".into(), vec![]); assert_eq!(agent.message_queue.len(), 1); assert_eq!(agent.message_queue[0].text, "first\nsecond"); } @@ -1919,8 +2027,9 @@ pub(super) mod agent_tests { agent.message_queue.push_back(QueuedMessage { text: "old".into(), received_at: Instant::now() - Duration::from_secs(2), + image_parts: vec![], }); - agent.enqueue_or_merge("new".into()); + agent.enqueue_or_merge("new".into(), vec![]); assert_eq!(agent.message_queue.len(), 2); assert_eq!(agent.message_queue[0].text, "old"); assert_eq!(agent.message_queue[1].text, "new"); @@ 
-1938,9 +2047,10 @@ pub(super) mod agent_tests { agent.message_queue.push_back(QueuedMessage { text: format!("msg{i}"), received_at: Instant::now() - Duration::from_secs(2), + image_parts: vec![], }); } - agent.enqueue_or_merge("overflow".into()); + agent.enqueue_or_merge("overflow".into(), vec![]); assert_eq!(agent.message_queue.len(), MAX_QUEUE_SIZE); } @@ -1952,11 +2062,11 @@ pub(super) mod agent_tests { let executor = MockToolExecutor::no_tools(); let mut agent = Agent::new(provider, channel, registry, None, 5, executor); - agent.enqueue_or_merge("a".into()); + agent.enqueue_or_merge("a".into(), vec![]); // Wait past merge window agent.message_queue.back_mut().unwrap().received_at = Instant::now() - Duration::from_secs(1); - agent.enqueue_or_merge("b".into()); + agent.enqueue_or_merge("b".into(), vec![]); assert_eq!(agent.message_queue.len(), 2); let count = agent.clear_queue(); @@ -1994,6 +2104,7 @@ pub(super) mod agent_tests { agent.message_queue.push_back(QueuedMessage { text: format!("pre{i}"), received_at: Instant::now() - Duration::from_secs(2), + image_parts: vec![], }); } agent.drain_channel(); @@ -2013,6 +2124,7 @@ pub(super) mod agent_tests { agent.message_queue.push_back(QueuedMessage { text: format!("msg{i}"), received_at: Instant::now() - Duration::from_secs(2), + image_parts: vec![], }); } @@ -2096,7 +2208,7 @@ pub(super) mod agent_tests { assert!(token2.is_cancelled()); } - mod resolve_message_text_tests { + mod resolve_message_tests { use super::*; use crate::channel::{Attachment, AttachmentKind, ChannelMessage}; use std::future::Future; @@ -2165,7 +2277,7 @@ pub(super) mod agent_tests { text: "hello".into(), attachments: vec![], }; - assert_eq!(agent.resolve_message_text(msg).await, "hello"); + assert_eq!(agent.resolve_message(msg).await.0, "hello"); } #[tokio::test] @@ -2175,7 +2287,7 @@ pub(super) mod agent_tests { text: "hello".into(), attachments: vec![audio_attachment(b"audio-data")], }; - assert_eq!(agent.resolve_message_text(msg).await, 
"hello"); + assert_eq!(agent.resolve_message(msg).await.0, "hello"); } #[tokio::test] @@ -2185,7 +2297,7 @@ pub(super) mod agent_tests { text: "original".into(), attachments: vec![audio_attachment(b"audio-data")], }; - let result = agent.resolve_message_text(msg).await; + let (result, _) = agent.resolve_message(msg).await; assert!(result.contains("[transcribed audio]")); assert!(result.contains("transcribed text")); assert!(result.contains("original")); @@ -2198,7 +2310,7 @@ pub(super) mod agent_tests { text: String::new(), attachments: vec![audio_attachment(b"audio-data")], }; - let result = agent.resolve_message_text(msg).await; + let (result, _) = agent.resolve_message(msg).await; assert_eq!(result, "transcribed text"); } @@ -2209,7 +2321,7 @@ pub(super) mod agent_tests { text: "original".into(), attachments: vec![audio_attachment(b"audio-data")], }; - assert_eq!(agent.resolve_message_text(msg).await, "original"); + assert_eq!(agent.resolve_message(msg).await.0, "original"); } #[tokio::test] @@ -2223,7 +2335,7 @@ pub(super) mod agent_tests { audio_attachment(b"a3"), ], }; - let result = agent.resolve_message_text(msg).await; + let (result, _) = agent.resolve_message(msg).await; assert_eq!(result, "chunk\nchunk\nchunk"); } @@ -2239,7 +2351,78 @@ pub(super) mod agent_tests { filename: None, }], }; - assert_eq!(agent.resolve_message_text(msg).await, "original"); + assert_eq!(agent.resolve_message(msg).await.0, "original"); + } + } + + #[test] + fn detect_image_mime_jpeg() { + assert_eq!(detect_image_mime(Some("photo.jpg")), "image/jpeg"); + assert_eq!(detect_image_mime(Some("photo.jpeg")), "image/jpeg"); + } + + #[test] + fn detect_image_mime_gif() { + assert_eq!(detect_image_mime(Some("anim.gif")), "image/gif"); + } + + #[test] + fn detect_image_mime_webp() { + assert_eq!(detect_image_mime(Some("img.webp")), "image/webp"); + } + + #[test] + fn detect_image_mime_unknown_defaults_png() { + assert_eq!(detect_image_mime(Some("file.bmp")), "image/png"); + 
assert_eq!(detect_image_mime(None), "image/png"); + } + + #[tokio::test] + async fn resolve_message_extracts_image_attachment() { + let provider = mock_provider(vec![]); + let channel = MockChannel::new(vec![]); + let registry = create_test_registry(); + let executor = MockToolExecutor::no_tools(); + let agent = Agent::new(provider, channel, registry, None, 5, executor); + + let msg = ChannelMessage { + text: "look at this".into(), + attachments: vec![Attachment { + kind: AttachmentKind::Image, + data: vec![0u8; 16], + filename: Some("test.jpg".into()), + }], + }; + let (text, parts) = agent.resolve_message(msg).await; + assert_eq!(text, "look at this"); + assert_eq!(parts.len(), 1); + match &parts[0] { + zeph_llm::provider::MessagePart::Image { mime_type, data } => { + assert_eq!(mime_type, "image/jpeg"); + assert_eq!(data.len(), 16); + } + _ => panic!("expected Image part"), } } + + #[tokio::test] + async fn resolve_message_drops_oversized_image() { + let provider = mock_provider(vec![]); + let channel = MockChannel::new(vec![]); + let registry = create_test_registry(); + let executor = MockToolExecutor::no_tools(); + let agent = Agent::new(provider, channel, registry, None, 5, executor); + + let msg = ChannelMessage { + text: "big image".into(), + attachments: vec![Attachment { + kind: AttachmentKind::Image, + data: vec![0u8; MAX_IMAGE_BYTES + 1], + filename: Some("huge.png".into()), + }], + }; + let (text, parts) = agent.resolve_message(msg).await; + assert_eq!(text, "big image"); + assert!(parts.is_empty()); + } } diff --git a/crates/zeph-core/src/bootstrap.rs b/crates/zeph-core/src/bootstrap.rs index 52fd2482..1f1f4b8b 100644 --- a/crates/zeph-core/src/bootstrap.rs +++ b/crates/zeph-core/src/bootstrap.rs @@ -517,11 +517,14 @@ pub fn create_provider(config: &Config) -> anyhow::Result { pub fn create_named_provider(name: &str, config: &Config) -> anyhow::Result { match name { "ollama" => { - let provider = OllamaProvider::new( + let mut provider = 
OllamaProvider::new( &config.llm.base_url, config.llm.model.clone(), config.llm.embedding_model.clone(), ); + if let Some(ref vm) = config.llm.vision_model { + provider = provider.with_vision_model(vm.clone()); + } Ok(AnyProvider::Ollama(provider)) } "claude" => { diff --git a/crates/zeph-core/src/config/types.rs b/crates/zeph-core/src/config/types.rs index 3d7fd3d6..d9415da5 100644 --- a/crates/zeph-core/src/config/types.rs +++ b/crates/zeph-core/src/config/types.rs @@ -115,6 +115,8 @@ pub struct LlmConfig { #[serde(skip_serializing_if = "Option::is_none")] pub router: Option, pub stt: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub vision_model: Option, } fn default_embedding_model() -> String { @@ -995,6 +997,7 @@ impl Default for Config { compatible: None, router: None, stt: None, + vision_model: None, }, skills: SkillsConfig { paths: vec!["./skills".into()], diff --git a/crates/zeph-llm/Cargo.toml b/crates/zeph-llm/Cargo.toml index e469fafd..d67f05e5 100644 --- a/crates/zeph-llm/Cargo.toml +++ b/crates/zeph-llm/Cargo.toml @@ -15,6 +15,7 @@ cuda = ["candle", "candle-core/cuda", "candle-nn/cuda", "candle-transformers/cud metal = ["candle", "candle-core/metal", "candle-nn/metal", "candle-transformers/metal"] [dependencies] +base64.workspace = true thiserror.workspace = true candle-core = { workspace = true, optional = true } candle-nn = { workspace = true, optional = true } diff --git a/crates/zeph-llm/src/any.rs b/crates/zeph-llm/src/any.rs index 8a1da5cd..337b4e0f 100644 --- a/crates/zeph-llm/src/any.rs +++ b/crates/zeph-llm/src/any.rs @@ -127,6 +127,10 @@ impl LlmProvider for AnyProvider { delegate_provider!(self, |p| p.supports_structured_output()) } + fn supports_vision(&self) -> bool { + delegate_provider!(self, |p| p.supports_vision()) + } + fn supports_tool_use(&self) -> bool { delegate_provider!(self, |p| p.supports_tool_use()) } @@ -477,4 +481,33 @@ mod tests { )); assert!(!provider.supports_structured_output()); } + + #[test] 
+ fn any_claude_supports_vision() { + let provider = AnyProvider::Claude(ClaudeProvider::new("key".into(), "model".into(), 1024)); + assert!(provider.supports_vision()); + } + + #[test] + fn any_openai_supports_vision() { + let provider = AnyProvider::OpenAi(crate::openai::OpenAiProvider::new( + "key".into(), + "https://api.openai.com/v1".into(), + "gpt-4o".into(), + 1024, + None, + None, + )); + assert!(provider.supports_vision()); + } + + #[test] + fn any_ollama_supports_vision() { + let provider = AnyProvider::Ollama(OllamaProvider::new( + "http://localhost:11434", + "test".into(), + "embed".into(), + )); + assert!(provider.supports_vision()); + } } diff --git a/crates/zeph-llm/src/claude.rs b/crates/zeph-llm/src/claude.rs index 2592b8c6..304bb4ff 100644 --- a/crates/zeph-llm/src/claude.rs +++ b/crates/zeph-llm/src/claude.rs @@ -2,6 +2,7 @@ use std::fmt; use std::time::Duration; use crate::error::LlmError; +use base64::{Engine, engine::general_purpose::STANDARD}; use eventsource_stream::Eventsource; use serde::{Deserialize, Serialize}; use tokio_stream::StreamExt; @@ -90,7 +91,35 @@ impl ClaudeProvider { } } + fn has_image_parts(messages: &[Message]) -> bool { + messages.iter().any(|m| { + m.parts + .iter() + .any(|p| matches!(p, MessagePart::Image { .. 
})) + }) + } + fn build_request(&self, messages: &[Message], stream: bool) -> reqwest::RequestBuilder { + if Self::has_image_parts(messages) { + let (system, chat_messages) = split_messages_structured(messages); + let system_blocks = system.map(|s| split_system_into_blocks(&s)); + let body = VisionRequestBody { + model: &self.model, + max_tokens: self.max_tokens, + system: system_blocks, + messages: &chat_messages, + stream, + }; + return self + .client + .post(API_URL) + .header("x-api-key", &self.api_key) + .header("anthropic-version", ANTHROPIC_VERSION) + .header("anthropic-beta", ANTHROPIC_BETA) + .header("content-type", "application/json") + .json(&body); + } + let (system, chat_messages) = split_messages(messages); let system_blocks = system.map(|s| split_system_into_blocks(&s)); @@ -147,6 +176,22 @@ impl ClaudeProvider { ))); } + if Self::has_image_parts(messages) { + let resp: ToolApiResponse = serde_json::from_str(&text)?; + if let Some(ref usage) = resp.usage { + log_cache_usage(usage); + self.store_cache_usage(usage); + } + let extracted = resp.content.into_iter().find_map(|b| { + if let AnthropicContentBlock::Text { text } = b { + Some(text) + } else { + None + } + }); + return extracted.ok_or(LlmError::EmptyResponse { provider: "claude" }); + } + let resp: ApiResponse = serde_json::from_str(&text)?; if let Some(ref usage) = resp.usage { @@ -332,6 +377,10 @@ impl LlmProvider for ClaudeProvider { )) } + fn supports_vision(&self) -> bool { + true + } + fn supports_tool_use(&self) -> bool { true } @@ -648,6 +697,17 @@ enum AnthropicContentBlock { #[serde(default, skip_serializing_if = "std::ops::Not::not")] is_error: bool, }, + Image { + source: ImageSource, + }, +} + +#[derive(Serialize, Deserialize, Debug, Clone)] +struct ImageSource { + #[serde(rename = "type")] + source_type: String, + media_type: String, + data: String, } #[derive(Deserialize)] @@ -667,7 +727,7 @@ fn parse_tool_response(resp: ToolApiResponse) -> ChatResponse { 
AnthropicContentBlock::ToolUse { id, name, input } => { tool_calls.push(ToolUseRequest { id, name, input }); } - AnthropicContentBlock::ToolResult { .. } => {} + AnthropicContentBlock::ToolResult { .. } | AnthropicContentBlock::Image { .. } => {} } } @@ -697,14 +757,16 @@ fn split_messages_structured(messages: &[Message]) -> (Option, Vec (Option, Vec { + blocks.push(AnthropicContentBlock::Image { + source: ImageSource { + source_type: "base64".to_owned(), + media_type: mime_type.clone(), + data: STANDARD.encode(data), + }, + }); + } } } chat.push(StructuredApiMessage { @@ -789,6 +860,17 @@ struct RequestBody<'a> { stream: bool, } +#[derive(Serialize)] +struct VisionRequestBody<'a> { + model: &'a str, + max_tokens: u32, + #[serde(skip_serializing_if = "Option::is_none")] + system: Option>, + messages: &'a [StructuredApiMessage], + #[serde(skip_serializing_if = "std::ops::Not::not")] + stream: bool, +} + #[derive(Serialize)] struct ApiMessage<'a> { role: &'a str, diff --git a/crates/zeph-llm/src/ollama.rs b/crates/zeph-llm/src/ollama.rs index dca53d19..35960f38 100644 --- a/crates/zeph-llm/src/ollama.rs +++ b/crates/zeph-llm/src/ollama.rs @@ -1,12 +1,14 @@ use ollama_rs::Ollama; use crate::error::LlmError; +use base64::{Engine, engine::general_purpose::STANDARD}; use ollama_rs::generation::chat::ChatMessage; use ollama_rs::generation::chat::request::ChatMessageRequest; use ollama_rs::generation::embeddings::request::{EmbeddingsInput, GenerateEmbeddingsRequest}; +use ollama_rs::generation::images::Image as OllamaImage; use tokio_stream::StreamExt; -use crate::provider::{ChatStream, LlmProvider, Message, Role}; +use crate::provider::{ChatStream, LlmProvider, Message, MessagePart, Role}; #[derive(Debug)] pub struct ModelInfo { @@ -19,6 +21,7 @@ pub struct OllamaProvider { model: String, embedding_model: String, context_window_size: Option, + vision_model: Option, } impl OllamaProvider { @@ -30,9 +33,16 @@ impl OllamaProvider { model, embedding_model, 
context_window_size: None, + vision_model: None, } } + #[must_use] + pub fn with_vision_model(mut self, model: String) -> Self { + self.vision_model = Some(model); + self + } + /// Set context window size (typically from /api/show response). pub fn set_context_window(&mut self, size: usize) { self.context_window_size = Some(size); @@ -101,10 +111,24 @@ impl LlmProvider for OllamaProvider { self.context_window_size } + fn supports_vision(&self) -> bool { + true + } + async fn chat(&self, messages: &[Message]) -> Result { + let has_images = messages.iter().any(|m| { + m.parts + .iter() + .any(|p| matches!(p, MessagePart::Image { .. })) + }); + let model = if has_images { + self.vision_model.as_deref().unwrap_or(&self.model) + } else { + &self.model + }; let ollama_messages: Vec = messages.iter().map(convert_message).collect(); - let request = ChatMessageRequest::new(self.model.clone(), ollama_messages); + let request = ChatMessageRequest::new(model.to_owned(), ollama_messages); let response = self .client @@ -116,8 +140,18 @@ impl LlmProvider for OllamaProvider { } async fn chat_stream(&self, messages: &[Message]) -> Result { + let has_images = messages.iter().any(|m| { + m.parts + .iter() + .any(|p| matches!(p, MessagePart::Image { .. })) + }); + let model = if has_images { + self.vision_model.as_deref().unwrap_or(&self.model) + } else { + &self.model + }; let ollama_messages: Vec = messages.iter().map(convert_message).collect(); - let request = ChatMessageRequest::new(self.model.clone(), ollama_messages); + let request = ChatMessageRequest::new(model.to_owned(), ollama_messages); let stream = self .client @@ -166,10 +200,29 @@ impl LlmProvider for OllamaProvider { } fn convert_message(msg: &Message) -> ChatMessage { + let images: Vec = msg + .parts + .iter() + .filter_map(|p| match p { + MessagePart::Image { data, .. 
} => { + Some(OllamaImage::from_base64(STANDARD.encode(data))) + } + _ => None, + }) + .collect(); + + let text = msg.to_llm_content().to_string(); + match msg.role { - Role::System => ChatMessage::system(msg.to_llm_content().to_string()), - Role::User => ChatMessage::user(msg.to_llm_content().to_string()), - Role::Assistant => ChatMessage::assistant(msg.to_llm_content().to_string()), + Role::System => ChatMessage::system(text), + Role::Assistant => ChatMessage::assistant(text), + Role::User => { + if images.is_empty() { + ChatMessage::user(text) + } else { + ChatMessage::user(text).with_images(images) + } + } } } diff --git a/crates/zeph-llm/src/openai.rs b/crates/zeph-llm/src/openai.rs index 263b777b..96957cf3 100644 --- a/crates/zeph-llm/src/openai.rs +++ b/crates/zeph-llm/src/openai.rs @@ -2,6 +2,7 @@ use std::fmt; use std::time::Duration; use crate::error::LlmError; +use base64::{Engine, engine::general_purpose::STANDARD}; use eventsource_stream::Eventsource; use serde::{Deserialize, Serialize}; use tokio_stream::StreamExt; @@ -112,29 +113,45 @@ impl OpenAiProvider { } async fn send_request(&self, messages: &[Message]) -> Result { - let api_messages = convert_messages(messages); let reasoning = self .reasoning_effort .as_deref() .map(|effort| Reasoning { effort }); - let body = ChatRequest { - model: &self.model, - messages: &api_messages, - max_tokens: self.max_tokens, - stream: false, - reasoning, + let response = if has_image_parts(messages) { + let vision_messages = convert_messages_vision(messages); + let body = VisionChatRequest { + model: &self.model, + messages: vision_messages, + max_tokens: self.max_tokens, + stream: false, + reasoning, + }; + self.client + .post(format!("{}/chat/completions", self.base_url)) + .header("Authorization", format!("Bearer {}", self.api_key)) + .header("Content-Type", "application/json") + .json(&body) + .send() + .await? 
+ } else { + let api_messages = convert_messages(messages); + let body = ChatRequest { + model: &self.model, + messages: &api_messages, + max_tokens: self.max_tokens, + stream: false, + reasoning, + }; + self.client + .post(format!("{}/chat/completions", self.base_url)) + .header("Authorization", format!("Bearer {}", self.api_key)) + .header("Content-Type", "application/json") + .json(&body) + .send() + .await? }; - let response = self - .client - .post(format!("{}/chat/completions", self.base_url)) - .header("Authorization", format!("Bearer {}", self.api_key)) - .header("Content-Type", "application/json") - .json(&body) - .send() - .await?; - let status = response.status(); let text = response.text().await.map_err(LlmError::Http)?; @@ -365,6 +382,10 @@ impl LlmProvider for OpenAiProvider { serde_json::from_str::(content).map_err(|e| LlmError::StructuredParse(e.to_string())) } + fn supports_vision(&self) -> bool { + true + } + fn supports_tool_use(&self) -> bool { true } @@ -465,6 +486,108 @@ impl LlmProvider for OpenAiProvider { } } +#[derive(Serialize)] +#[serde(tag = "type", rename_all = "snake_case")] +enum OpenAiContentPart { + Text { text: String }, + ImageUrl { image_url: ImageUrlDetail }, +} + +#[derive(Serialize)] +struct ImageUrlDetail { + url: String, +} + +#[derive(Serialize)] +struct VisionApiMessage { + role: String, + content: Vec, +} + +#[derive(Serialize)] +struct VisionChatRequest<'a> { + model: &'a str, + messages: Vec, + max_tokens: u32, + #[serde(skip_serializing_if = "std::ops::Not::not")] + stream: bool, + #[serde(skip_serializing_if = "Option::is_none")] + reasoning: Option>, +} + +fn has_image_parts(messages: &[Message]) -> bool { + messages.iter().any(|m| { + m.parts + .iter() + .any(|p| matches!(p, MessagePart::Image { .. 
})) + }) +} + +fn convert_messages_vision(messages: &[Message]) -> Vec { + messages + .iter() + .map(|msg| { + let role = match msg.role { + Role::System => "system", + Role::User => "user", + Role::Assistant => "assistant", + }; + let has_images = msg + .parts + .iter() + .any(|p| matches!(p, MessagePart::Image { .. })); + if has_images { + let mut parts = Vec::new(); + let text = msg.to_llm_content(); + if !text.is_empty() { + let text_str: String = msg + .parts + .iter() + .filter_map(|p| match p { + MessagePart::Text { text } + | MessagePart::Recall { text } + | MessagePart::CodeContext { text } + | MessagePart::Summary { text } + | MessagePart::CrossSession { text } => Some(text.as_str()), + _ => None, + }) + .collect::>() + .join(""); + if !text_str.is_empty() { + parts.push(OpenAiContentPart::Text { text: text_str }); + } + } + for part in &msg.parts { + if let MessagePart::Image { data, mime_type } = part { + let b64 = STANDARD.encode(data); + parts.push(OpenAiContentPart::ImageUrl { + image_url: ImageUrlDetail { + url: format!("data:{mime_type};base64,{b64}"), + }, + }); + } + } + if parts.is_empty() { + parts.push(OpenAiContentPart::Text { + text: msg.to_llm_content().to_owned(), + }); + } + VisionApiMessage { + role: role.to_owned(), + content: parts, + } + } else { + VisionApiMessage { + role: role.to_owned(), + content: vec![OpenAiContentPart::Text { + text: msg.to_llm_content().to_owned(), + }], + } + } + }) + .collect() +} + fn parse_sse_event(data: &str) -> Option> { if data == "[DONE]" { return None; diff --git a/crates/zeph-llm/src/provider.rs b/crates/zeph-llm/src/provider.rs index c76b7d80..d9ba1347 100644 --- a/crates/zeph-llm/src/provider.rs +++ b/crates/zeph-llm/src/provider.rs @@ -93,6 +93,31 @@ pub enum MessagePart { #[serde(default)] is_error: bool, }, + Image { + #[serde(with = "serde_bytes_base64")] + data: Vec, + mime_type: String, + }, +} + +mod serde_bytes_base64 { + use base64::{Engine, engine::general_purpose::STANDARD}; + use 
serde::{Deserialize, Deserializer, Serializer}; + + pub fn serialize(bytes: &[u8], s: S) -> Result + where + S: Serializer, + { + s.serialize_str(&STANDARD.encode(bytes)) + } + + pub fn deserialize<'de, D>(d: D) -> Result, D::Error> + where + D: Deserializer<'de>, + { + let s = String::deserialize(d)?; + STANDARD.decode(&s).map_err(serde::de::Error::custom) + } } #[derive(Clone, Debug, Serialize, Deserialize)] @@ -166,6 +191,9 @@ impl Message { } => { let _ = write!(out, "[tool_result: {tool_use_id}]\n{content}"); } + MessagePart::Image { data, mime_type } => { + let _ = write!(out, "[image: {mime_type}, {} bytes]", data.len()); + } } } out @@ -213,6 +241,11 @@ pub trait LlmProvider: Send + Sync { /// Provider name for logging and identification. fn name(&self) -> &'static str; + /// Whether this provider supports image input (vision). + fn supports_vision(&self) -> bool { + false + } + /// Whether this provider supports native `tool_use` / function calling. fn supports_tool_use(&self) -> bool { false @@ -951,4 +984,48 @@ mod tests { "structured output parse failed: test error" ); } + + #[test] + fn message_part_image_roundtrip_json() { + let part = MessagePart::Image { + data: vec![1, 2, 3, 4], + mime_type: "image/jpeg".into(), + }; + let json = serde_json::to_string(&part).unwrap(); + let decoded: MessagePart = serde_json::from_str(&json).unwrap(); + match decoded { + MessagePart::Image { data, mime_type } => { + assert_eq!(data, vec![1, 2, 3, 4]); + assert_eq!(mime_type, "image/jpeg"); + } + _ => panic!("expected Image variant"), + } + } + + #[test] + fn flatten_parts_includes_image_placeholder() { + let msg = Message::from_parts( + Role::User, + vec![ + MessagePart::Text { + text: "see this".into(), + }, + MessagePart::Image { + data: vec![0u8; 100], + mime_type: "image/png".into(), + }, + ], + ); + let content = msg.to_llm_content(); + assert!(content.contains("see this")); + assert!(content.contains("[image: image/png")); + } + + #[test] + fn 
supports_vision_default_false() { + let provider = StubProvider { + response: String::new(), + }; + assert!(!provider.supports_vision()); + } } diff --git a/src/init.rs b/src/init.rs index 4e43d092..c810addd 100644 --- a/src/init.rs +++ b/src/init.rs @@ -13,6 +13,7 @@ pub(crate) struct WizardState { pub(crate) base_url: Option, pub(crate) model: Option, pub(crate) embedding_model: Option, + pub(crate) vision_model: Option, pub(crate) api_key: Option, pub(crate) compatible_name: Option, pub(crate) sqlite_path: Option, @@ -135,6 +136,20 @@ fn step_llm(state: &mut WizardState) -> anyhow::Result<()> { .interact_text()?, ); + if state.provider == Some(ProviderKind::Ollama) { + let use_vision = Confirm::new() + .with_prompt("Use a separate model for vision (image input)?") + .default(false) + .interact()?; + if use_vision { + state.vision_model = Some( + Input::new() + .with_prompt("Vision model name (e.g. llava:13b)") + .interact_text()?, + ); + } + } + println!(); Ok(()) } @@ -281,6 +296,7 @@ pub(crate) fn build_config(state: &WizardState) -> Config { }, router: None, stt: None, + vision_model: state.vision_model.clone().filter(|s| !s.is_empty()), }; config.memory = MemoryConfig { From a9bc9ee63e35227159afc3b9dcd1da26f1670cc4 Mon Sep 17 00:00:00 2001 From: "Andrei G." Date: Thu, 19 Feb 2026 01:25:20 +0100 Subject: [PATCH 2/4] =?UTF-8?q?fix(vision):=20address=20review=20issues=20?= =?UTF-8?q?=E2=80=94=20perf,=20security,=20and=20test=20coverage?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PERF: replace iter().clone() with consuming partition in resolve_message so image data is moved rather than cloned (up to 20 MB saved per message). SEC: reject path traversal (Component::ParentDir) in /image command before calling std::fs::read. SEC: add pre-download size guard in Telegram photo handler; skip download if photo.file.size exceeds 20 MB and pass capacity hint to Vec::with_capacity. 
BUG: remove redundant to_llm_content() wrapping in convert_messages_vision; collect text parts directly to avoid false-empty check on image messages. TEST: add unit tests for OpenAI has_image_parts, convert_messages_vision (data-URI format, text-only, image-only cases). TEST: add unit tests for Claude AnthropicContentBlock::Image serialization, split_messages_structured with image parts, has_image_parts detection. TEST: add unit tests for Ollama with_vision_model builder, convert_message with Image parts (base64 encoding), model selection switch. TEST: add unit tests for handle_image_command path traversal rejection, missing file error path, and successful file load with mime detection. --- crates/zeph-channels/src/telegram.rs | 43 ++++++---- crates/zeph-core/src/agent/mod.rs | 111 +++++++++++++++++++++---- crates/zeph-llm/src/claude.rs | 71 ++++++++++++++++ crates/zeph-llm/src/ollama.rs | 81 ++++++++++++++++++ crates/zeph-llm/src/openai.rs | 118 +++++++++++++++++++++++---- 5 files changed, 375 insertions(+), 49 deletions(-) diff --git a/crates/zeph-channels/src/telegram.rs b/crates/zeph-channels/src/telegram.rs index c2d8b4a8..18f77f64 100644 --- a/crates/zeph-channels/src/telegram.rs +++ b/crates/zeph-channels/src/telegram.rs @@ -7,6 +7,7 @@ use tokio::sync::mpsc; use zeph_core::channel::{Attachment, AttachmentKind, Channel, ChannelError, ChannelMessage}; const MAX_MESSAGE_LEN: usize = 4096; +const MAX_IMAGE_BYTES: u32 = 20 * 1024 * 1024; /// Telegram channel adapter using teloxide. 
#[derive(Debug)] @@ -87,11 +88,11 @@ impl TelegramChannel { let audio_file_id = msg .voice() - .map(|v| v.file.id.0.clone()) - .or_else(|| msg.audio().map(|a| a.file.id.0.clone())); + .map(|v| (v.file.id.0.clone(), v.file.size)) + .or_else(|| msg.audio().map(|a| (a.file.id.0.clone(), a.file.size))); - if let Some(file_id) = audio_file_id { - match download_file(&bot, file_id).await { + if let Some((file_id, file_size)) = audio_file_id { + match download_file(&bot, file_id, file_size).await { Ok(data) => { attachments.push(Attachment { kind: AttachmentKind::Audio, @@ -109,16 +110,26 @@ impl TelegramChannel { if let Some(photos) = msg.photo() && let Some(photo) = photos.iter().max_by_key(|p| p.file.size) { - match download_file(&bot, photo.file.id.0.clone()).await { - Ok(data) => { - attachments.push(Attachment { - kind: AttachmentKind::Image, - data, - filename: None, - }); - } - Err(e) => { - tracing::warn!("failed to download photo attachment: {e}"); + if photo.file.size > MAX_IMAGE_BYTES { + tracing::warn!( + size = photo.file.size, + max = MAX_IMAGE_BYTES, + "photo exceeds size limit, skipping" + ); + } else { + match download_file(&bot, photo.file.id.0.clone(), photo.file.size) + .await + { + Ok(data) => { + attachments.push(Attachment { + kind: AttachmentKind::Image, + data, + filename: None, + }); + } + Err(e) => { + tracing::warn!("failed to download photo attachment: {e}"); + } } } } @@ -246,14 +257,14 @@ impl TelegramChannel { } } -async fn download_file(bot: &Bot, file_id: String) -> Result, String> { +async fn download_file(bot: &Bot, file_id: String, capacity: u32) -> Result, String> { use teloxide::net::Download; let file = bot .get_file(file_id.into()) .await .map_err(|e| format!("get_file: {e}"))?; - let mut buf: Vec = Vec::new(); + let mut buf: Vec = Vec::with_capacity(capacity as usize); bot.download_file(&file.path, &mut buf) .await .map_err(|e| format!("download_file: {e}"))?; diff --git a/crates/zeph-core/src/agent/mod.rs 
b/crates/zeph-core/src/agent/mod.rs index 3ad21dac..db6e83a7 100644 --- a/crates/zeph-core/src/agent/mod.rs +++ b/crates/zeph-core/src/agent/mod.rs @@ -684,20 +684,15 @@ impl Agent { &self, msg: crate::channel::ChannelMessage, ) -> (String, Vec) { - use crate::channel::AttachmentKind; + use crate::channel::{Attachment, AttachmentKind}; use zeph_llm::provider::MessagePart; - let audio_attachments: Vec<_> = msg - .attachments - .iter() - .filter(|a| a.kind == AttachmentKind::Audio) - .collect(); + let text_base = msg.text.clone(); - let image_attachments: Vec<_> = msg + let (audio_attachments, image_attachments): (Vec, Vec) = msg .attachments - .iter() - .filter(|a| a.kind == AttachmentKind::Image) - .collect(); + .into_iter() + .partition(|a| a.kind == AttachmentKind::Audio); let text = if !audio_attachments.is_empty() && let Some(stt) = self.stt.as_ref() @@ -730,13 +725,13 @@ impl Agent { } } if transcribed_parts.is_empty() { - msg.text.clone() + text_base } else { let transcribed = transcribed_parts.join("\n"); - if msg.text.is_empty() { + if text_base.is_empty() { transcribed } else { - format!("[transcribed audio]\n{transcribed}\n\n{}", msg.text) + format!("[transcribed audio]\n{transcribed}\n\n{text_base}") } } } else { @@ -746,7 +741,7 @@ impl Agent { "audio attachments received but no STT provider configured, dropping" ); } - msg.text.clone() + text_base }; let mut image_parts = Vec::new(); @@ -761,7 +756,7 @@ impl Agent { } let mime_type = detect_image_mime(attachment.filename.as_deref()); image_parts.push(MessagePart::Image { - data: attachment.data.clone(), + data: attachment.data, mime_type, }); } @@ -858,8 +853,20 @@ impl Agent { path: &str, extra_parts: &mut Vec, ) -> Result<(), error::AgentError> { + use std::path::Component; use zeph_llm::provider::MessagePart; + // Reject paths that traverse outside the current directory. 
+ let has_parent_dir = std::path::Path::new(path) + .components() + .any(|c| c == Component::ParentDir); + if has_parent_dir { + self.channel + .send("Invalid image path: path traversal not allowed") + .await?; + return Ok(()); + } + let data = match std::fs::read(path) { Ok(d) => d, Err(e) => { @@ -1127,6 +1134,10 @@ pub(super) mod agent_tests { self.confirmations = Arc::new(Mutex::new(confirmations)); self } + + pub(super) fn sent_messages(&self) -> Vec { + self.sent.lock().unwrap().clone() + } } impl Channel for MockChannel { @@ -2425,4 +2436,74 @@ pub(super) mod agent_tests { assert_eq!(text, "big image"); assert!(parts.is_empty()); } + + #[tokio::test] + async fn handle_image_command_rejects_path_traversal() { + let provider = mock_provider(vec![]); + let channel = MockChannel::new(vec![]); + let registry = create_test_registry(); + let executor = MockToolExecutor::no_tools(); + let mut agent = Agent::new(provider, channel, registry, None, 5, executor); + + let mut parts = Vec::new(); + let result = agent + .handle_image_command("../../etc/passwd", &mut parts) + .await; + assert!(result.is_ok()); + assert!(parts.is_empty()); + // Channel should have received an error message + let sent = agent.channel.sent_messages(); + assert!(sent.iter().any(|m| m.contains("traversal"))); + } + + #[tokio::test] + async fn handle_image_command_missing_file_sends_error() { + let provider = mock_provider(vec![]); + let channel = MockChannel::new(vec![]); + let registry = create_test_registry(); + let executor = MockToolExecutor::no_tools(); + let mut agent = Agent::new(provider, channel, registry, None, 5, executor); + + let mut parts = Vec::new(); + let result = agent + .handle_image_command("/nonexistent/image.png", &mut parts) + .await; + assert!(result.is_ok()); + assert!(parts.is_empty()); + let sent = agent.channel.sent_messages(); + assert!(sent.iter().any(|m| m.contains("Cannot read image"))); + } + + #[tokio::test] + async fn handle_image_command_loads_valid_file() { + 
use std::io::Write; + let provider = mock_provider(vec![]); + let channel = MockChannel::new(vec![]); + let registry = create_test_registry(); + let executor = MockToolExecutor::no_tools(); + let mut agent = Agent::new(provider, channel, registry, None, 5, executor); + + // Write a small temp image + let mut tmp = tempfile::NamedTempFile::with_suffix(".jpg").unwrap(); + let data = vec![0xFFu8, 0xD8, 0xFF, 0xE0]; + tmp.write_all(&data).unwrap(); + let path = tmp.path().to_str().unwrap().to_owned(); + + let mut parts = Vec::new(); + let result = agent.handle_image_command(&path, &mut parts).await; + assert!(result.is_ok()); + assert_eq!(parts.len(), 1); + match &parts[0] { + zeph_llm::provider::MessagePart::Image { + data: img_data, + mime_type, + } => { + assert_eq!(img_data, &data); + assert_eq!(mime_type, "image/jpeg"); + } + _ => panic!("expected Image part"), + } + let sent = agent.channel.sent_messages(); + assert!(sent.iter().any(|m| m.contains("Image loaded"))); + } } diff --git a/crates/zeph-llm/src/claude.rs b/crates/zeph-llm/src/claude.rs index 304bb4ff..ed0bbf54 100644 --- a/crates/zeph-llm/src/claude.rs +++ b/crates/zeph-llm/src/claude.rs @@ -1718,4 +1718,75 @@ mod tests { Duration::from_secs(4) ); } + + #[test] + fn anthropic_content_block_image_serializes_correctly() { + let block = AnthropicContentBlock::Image { + source: ImageSource { + source_type: "base64".to_owned(), + media_type: "image/jpeg".to_owned(), + data: "abc123".to_owned(), + }, + }; + let json = serde_json::to_value(&block).unwrap(); + assert_eq!(json["type"], "image"); + assert_eq!(json["source"]["type"], "base64"); + assert_eq!(json["source"]["media_type"], "image/jpeg"); + assert_eq!(json["source"]["data"], "abc123"); + } + + #[test] + fn split_messages_structured_produces_image_block() { + use base64::{Engine, engine::general_purpose::STANDARD}; + + let data = vec![0xFFu8, 0xD8, 0xFF]; + let msg = Message::from_parts( + Role::User, + vec![ + MessagePart::Text { + text: "look at 
this".into(), + }, + MessagePart::Image { + data: data.clone(), + mime_type: "image/jpeg".into(), + }, + ], + ); + let (system, chat) = split_messages_structured(&[msg]); + assert!(system.is_none()); + assert_eq!(chat.len(), 1); + assert_eq!(chat[0].role, "user"); + match &chat[0].content { + StructuredContent::Blocks(blocks) => { + assert_eq!(blocks.len(), 2); + match &blocks[0] { + AnthropicContentBlock::Text { text } => assert_eq!(text, "look at this"), + _ => panic!("expected Text block first"), + } + match &blocks[1] { + AnthropicContentBlock::Image { source } => { + assert_eq!(source.source_type, "base64"); + assert_eq!(source.media_type, "image/jpeg"); + assert_eq!(source.data, STANDARD.encode(&data)); + } + _ => panic!("expected Image block second"), + } + } + _ => panic!("expected Blocks content"), + } + } + + #[test] + fn has_image_parts_detects_image_in_messages() { + let with_image = Message::from_parts( + Role::User, + vec![MessagePart::Image { + data: vec![1], + mime_type: "image/png".into(), + }], + ); + let without_image = Message::from_legacy(Role::User, "plain text"); + assert!(ClaudeProvider::has_image_parts(&[with_image])); + assert!(!ClaudeProvider::has_image_parts(&[without_image])); + } } diff --git a/crates/zeph-llm/src/ollama.rs b/crates/zeph-llm/src/ollama.rs index 35960f38..711b757e 100644 --- a/crates/zeph-llm/src/ollama.rs +++ b/crates/zeph-llm/src/ollama.rs @@ -614,4 +614,85 @@ mod tests { assert!(embedding.len() > 100); assert!(embedding.iter().all(|v| v.is_finite())); } + + #[test] + fn with_vision_model_sets_field() { + let provider = OllamaProvider::new("http://localhost:11434", "main".into(), "embed".into()) + .with_vision_model("llava:13b".into()); + assert_eq!(provider.vision_model.as_deref(), Some("llava:13b")); + } + + #[test] + fn with_vision_model_builder_returns_self() { + let provider = OllamaProvider::new("http://localhost:11434", "main".into(), "embed".into()) + .with_vision_model("llava:7b".into()); + 
assert_eq!(provider.model, "main"); + assert_eq!(provider.vision_model.as_deref(), Some("llava:7b")); + } + + #[test] + fn convert_message_text_only_has_no_images() { + let msg = Message::from_legacy(Role::User, "hello"); + let chat_msg = convert_message(&msg); + // No images attached — role should be User, content non-empty + assert_eq!( + chat_msg.role, + ollama_rs::generation::chat::MessageRole::User + ); + assert!(!chat_msg.content.is_empty()); + } + + #[test] + fn convert_message_with_image_encodes_base64() { + use base64::{Engine, engine::general_purpose::STANDARD}; + + let data = vec![0xFFu8, 0xD8, 0xFF]; + let msg = Message::from_parts( + Role::User, + vec![ + MessagePart::Text { + text: "describe".into(), + }, + MessagePart::Image { + data: data.clone(), + mime_type: "image/jpeg".into(), + }, + ], + ); + let chat_msg = convert_message(&msg); + let images = chat_msg.images.unwrap_or_default(); + assert_eq!(images.len(), 1); + // OllamaImage stores the base64 string internally — verify via Debug/format + let img_debug = format!("{:?}", images[0]); + assert!(img_debug.contains(&STANDARD.encode(&data))); + } + + #[test] + fn model_selection_uses_vision_model_when_images_present() { + let provider = OllamaProvider::new("http://localhost:11434", "main".into(), "embed".into()) + .with_vision_model("llava:13b".into()); + + let has_images = true; + let selected = if has_images { + provider.vision_model.as_deref().unwrap_or(&provider.model) + } else { + &provider.model + }; + assert_eq!(selected, "llava:13b"); + + let has_images = false; + let selected = if has_images { + provider.vision_model.as_deref().unwrap_or(&provider.model) + } else { + &provider.model + }; + assert_eq!(selected, "main"); + } + + #[test] + fn model_selection_falls_back_to_main_without_vision_model() { + let provider = OllamaProvider::new("http://localhost:11434", "main".into(), "embed".into()); + let selected = provider.vision_model.as_deref().unwrap_or(&provider.model); + 
assert_eq!(selected, "main"); + } } diff --git a/crates/zeph-llm/src/openai.rs b/crates/zeph-llm/src/openai.rs index 96957cf3..fdfb4d95 100644 --- a/crates/zeph-llm/src/openai.rs +++ b/crates/zeph-llm/src/openai.rs @@ -538,24 +538,21 @@ fn convert_messages_vision(messages: &[Message]) -> Vec { .any(|p| matches!(p, MessagePart::Image { .. })); if has_images { let mut parts = Vec::new(); - let text = msg.to_llm_content(); - if !text.is_empty() { - let text_str: String = msg - .parts - .iter() - .filter_map(|p| match p { - MessagePart::Text { text } - | MessagePart::Recall { text } - | MessagePart::CodeContext { text } - | MessagePart::Summary { text } - | MessagePart::CrossSession { text } => Some(text.as_str()), - _ => None, - }) - .collect::>() - .join(""); - if !text_str.is_empty() { - parts.push(OpenAiContentPart::Text { text: text_str }); - } + let text_str: String = msg + .parts + .iter() + .filter_map(|p| match p { + MessagePart::Text { text } + | MessagePart::Recall { text } + | MessagePart::CodeContext { text } + | MessagePart::Summary { text } + | MessagePart::CrossSession { text } => Some(text.as_str()), + _ => None, + }) + .collect::>() + .join(""); + if !text_str.is_empty() { + parts.push(OpenAiContentPart::Text { text: text_str }); } for part in &msg.parts { if let MessagePart::Image { data, mime_type } = part { @@ -1574,4 +1571,89 @@ mod tests { let cloned = p.clone(); assert!(cloned.last_cache_usage().is_none()); } + + #[test] + fn has_image_parts_detects_image() { + let msg_with_image = Message::from_parts( + Role::User, + vec![ + MessagePart::Text { + text: "look".into(), + }, + MessagePart::Image { + data: vec![1, 2, 3], + mime_type: "image/png".into(), + }, + ], + ); + let msg_text_only = Message::from_legacy(Role::User, "plain"); + assert!(has_image_parts(&[msg_with_image])); + assert!(!has_image_parts(&[msg_text_only])); + assert!(!has_image_parts(&[])); + } + + #[test] + fn convert_messages_vision_produces_data_uri() { + let data = vec![0xFFu8, 
0xD8, 0xFF]; // JPEG magic bytes + let msg = Message::from_parts( + Role::User, + vec![ + MessagePart::Text { + text: "describe this".into(), + }, + MessagePart::Image { + data: data.clone(), + mime_type: "image/jpeg".into(), + }, + ], + ); + let converted = convert_messages_vision(&[msg]); + assert_eq!(converted.len(), 1); + assert_eq!(converted[0].role, "user"); + // Should have text part + image_url part + assert_eq!(converted[0].content.len(), 2); + match &converted[0].content[0] { + OpenAiContentPart::Text { text } => assert_eq!(text, "describe this"), + _ => panic!("expected Text part first"), + } + match &converted[0].content[1] { + OpenAiContentPart::ImageUrl { image_url } => { + use base64::{Engine, engine::general_purpose::STANDARD}; + let expected = format!("data:image/jpeg;base64,{}", STANDARD.encode(&data)); + assert_eq!(image_url.url, expected); + } + _ => panic!("expected ImageUrl part second"), + } + } + + #[test] + fn convert_messages_vision_text_only_message() { + let msg = Message::from_legacy(Role::System, "system prompt"); + let converted = convert_messages_vision(&[msg]); + assert_eq!(converted.len(), 1); + assert_eq!(converted[0].role, "system"); + assert_eq!(converted[0].content.len(), 1); + match &converted[0].content[0] { + OpenAiContentPart::Text { text } => assert_eq!(text, "system prompt"), + _ => panic!("expected Text part"), + } + } + + #[test] + fn convert_messages_vision_image_only_no_text_part() { + let msg = Message::from_parts( + Role::User, + vec![MessagePart::Image { + data: vec![1], + mime_type: "image/png".into(), + }], + ); + let converted = convert_messages_vision(&[msg]); + // No text parts collected → only image_url + assert_eq!(converted[0].content.len(), 1); + assert!(matches!( + &converted[0].content[0], + OpenAiContentPart::ImageUrl { .. } + )); + } } From f9fb4fd3e59f666446e227f1f62d9a71604a6705 Mon Sep 17 00:00:00 2001 From: "Andrei G." 
Date: Thu, 19 Feb 2026 01:31:52 +0100 Subject: [PATCH 3/4] docs: add vision support documentation, update changelog and READMEs --- CHANGELOG.md | 10 ++++ README.md | 1 + crates/zeph-channels/README.md | 4 +- crates/zeph-core/README.md | 2 +- crates/zeph-llm/README.md | 1 + docs/src/SUMMARY.md | 1 + docs/src/architecture/crates.md | 3 +- docs/src/getting-started/configuration.md | 2 + docs/src/guide/channels.md | 2 + docs/src/guide/vision.md | 73 +++++++++++++++++++++++ 10 files changed, 95 insertions(+), 4 deletions(-) create mode 100644 docs/src/guide/vision.md diff --git a/CHANGELOG.md b/CHANGELOG.md index b6261ca1..50196fa5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,16 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). - Redundant syntax highlighting and markdown parsing on every TUI frame: per-message render cache with content-hash keying (#501) ### Added +- Vision (image input) support across Claude, OpenAI, and Ollama providers (#490) +- `MessagePart::Image` content type with base64 serialization +- `LlmProvider::supports_vision()` trait method for runtime capability detection +- Claude structured content with `AnthropicContentBlock::Image` variant +- OpenAI array content format with `image_url` data-URI encoding +- Ollama `with_images()` support with optional `vision_model` config for dedicated model routing +- `/image ` command in CLI and TUI channels +- Telegram photo message handling with pre-download size guard +- `vision_model` field in `[llm]` config section and `--init` wizard update +- 20 MB max image size limit and path traversal protection - Interactive configuration wizard via `zeph init` subcommand with 5-step setup (LLM provider, memory, channels, secrets backend, config generation) - clap-based CLI argument parsing with `--help`, `--version` support - `Serialize` derive on `Config` and all nested types for TOML generation diff --git a/README.md b/README.md index 04512755..ed24ce31 100644 
--- a/README.md +++ b/README.md @@ -240,6 +240,7 @@ Skills **evolve**: failure detection triggers self-reflection, and the agent gen | **MCP** | Connect external tool servers (stdio + HTTP) with SSRF protection | | **A2A** | Agent-to-agent communication via JSON-RPC 2.0 with SSE streaming | | **Audio input** | Speech-to-text via OpenAI Whisper API or local Candle Whisper (offline, feature-gated); Telegram and Slack audio files transcribed automatically | +| **Vision** | Image input via CLI (`/image`), TUI (`/image`), and Telegram photo messages; supported by Claude, OpenAI, and Ollama providers (20 MB max, automatic MIME detection) | | **Channels** | CLI, Telegram (text + voice), Discord, Slack, TUI — all with streaming support | | **Gateway** | HTTP webhook ingestion with bearer auth and rate limiting | | **Native tool_use** | Structured tool calling via Claude/OpenAI APIs; text fallback for local models | diff --git a/crates/zeph-channels/README.md b/crates/zeph-channels/README.md index 77864c3c..4d8f3315 100644 --- a/crates/zeph-channels/README.md +++ b/crates/zeph-channels/README.md @@ -10,8 +10,8 @@ Implements I/O channel adapters that connect the agent to different frontends. 
S | Module | Description | |--------|-------------| -| `cli` | `CliChannel` — interactive terminal I/O | -| `telegram` | Telegram adapter via teloxide with streaming; voice/audio message detection and file download | +| `cli` | `CliChannel` — interactive terminal I/O with `/image` command for vision input | +| `telegram` | Telegram adapter via teloxide with streaming; voice/audio message detection and file download; photo message support for vision input | | `discord` | Discord adapter (optional feature) | | `slack` | Slack adapter (optional feature); audio file detection and download with Bearer auth | | `any` | `AnyChannel` — enum dispatch over all channels | diff --git a/crates/zeph-core/README.md b/crates/zeph-core/README.md index 617a7b89..a7cc9d1e 100644 --- a/crates/zeph-core/README.md +++ b/crates/zeph-core/README.md @@ -12,7 +12,7 @@ Core orchestration crate for the Zeph agent. Manages the main agent loop, bootst |--------|-------------| | `agent` | `Agent` — main loop driving inference and tool execution | | `bootstrap` | `AppBuilder` — fluent builder for application startup | -| `channel` | `Channel` trait defining I/O adapters; `Attachment` / `AttachmentKind` for multimodal inputs | +| `channel` | `Channel` trait defining I/O adapters; `Attachment` / `AttachmentKind` for multimodal inputs (images, audio) | | `config` | TOML config with `ZEPH_*` env overrides | | `context` | LLM context assembly from history, skills, memory | | `cost` | Token cost tracking and budgeting | diff --git a/crates/zeph-llm/README.md b/crates/zeph-llm/README.md index 0cf2404e..c245ff9c 100644 --- a/crates/zeph-llm/README.md +++ b/crates/zeph-llm/README.md @@ -18,6 +18,7 @@ Defines the `LlmProvider` trait and ships concrete backends for Ollama, Claude, | `candle_provider` | Local inference via Candle (optional feature) | | `orchestrator` | Multi-model coordination and fallback | | `router` | Model selection and routing logic | +| `vision` | Image input support — base64-encoded 
images in LLM requests; optional dedicated `vision_model` per provider | | `stt` | `SpeechToText` trait and `WhisperProvider` (OpenAI Whisper, feature-gated behind `stt`) | | `candle_whisper` | Local offline STT via Candle (whisper-tiny/base/small, feature-gated behind `candle`) | | `error` | `LlmError` — unified error type | diff --git a/docs/src/SUMMARY.md b/docs/src/SUMMARY.md index 5b506469..62320593 100644 --- a/docs/src/SUMMARY.md +++ b/docs/src/SUMMARY.md @@ -28,6 +28,7 @@ - [Tool System](guide/tools.md) - [Pipeline API](guide/pipeline.md) - [Audio Input](guide/audio-input.md) +- [Vision (Image Input)](guide/vision.md) - [TUI Dashboard](guide/tui.md) - [Observability & Cost](guide/observability.md) - [Code Indexing](guide/code-indexing.md) diff --git a/docs/src/architecture/crates.md b/docs/src/architecture/crates.md index 8a4cc2f2..35a005aa 100644 --- a/docs/src/architecture/crates.md +++ b/docs/src/architecture/crates.md @@ -23,7 +23,8 @@ Agent loop, bootstrap orchestration, configuration loading, and context builder. LLM provider abstraction and backend implementations. 
-- `LlmProvider` trait — `chat()`, `chat_stream()`, `embed()`, `supports_streaming()`, `supports_embeddings()` +- `LlmProvider` trait — `chat()`, `chat_stream()`, `embed()`, `supports_streaming()`, `supports_embeddings()`, `supports_vision()` +- `MessagePart::Image` — image content part (raw bytes + MIME type) for multimodal input - `EmbedFuture` / `EmbedFn` — canonical type aliases for embedding closures, re-exported by downstream crates (`zeph-skills`, `zeph-mcp`) - `OllamaProvider` — local inference via ollama-rs - `ClaudeProvider` — Anthropic Messages API with SSE streaming diff --git a/docs/src/getting-started/configuration.md b/docs/src/getting-started/configuration.md index 91e4e246..45606d91 100644 --- a/docs/src/getting-started/configuration.md +++ b/docs/src/getting-started/configuration.md @@ -83,6 +83,7 @@ provider = "ollama" # ollama, claude, openai, candle, compatible, orchestrator, base_url = "http://localhost:11434" model = "mistral:7b" embedding_model = "qwen3-embedding" # Model for text embeddings +# vision_model = "llava:13b" # Ollama only: dedicated model for image requests [llm.cloud] model = "claude-sonnet-4-5-20250929" @@ -192,6 +193,7 @@ rate_limit = 60 | `ZEPH_LLM_BASE_URL` | Ollama API endpoint | | `ZEPH_LLM_MODEL` | Model name for Ollama | | `ZEPH_LLM_EMBEDDING_MODEL` | Embedding model for Ollama (default: `qwen3-embedding`) | +| `ZEPH_LLM_VISION_MODEL` | Vision model for Ollama image requests (optional) | | `ZEPH_CLAUDE_API_KEY` | Anthropic API key (required for Claude) | | `ZEPH_OPENAI_API_KEY` | OpenAI API key (required for OpenAI provider) | | `ZEPH_TELEGRAM_TOKEN` | Telegram bot token (enables Telegram mode) | diff --git a/docs/src/guide/channels.md b/docs/src/guide/channels.md index 3ddcd8c1..4d47a26b 100644 --- a/docs/src/guide/channels.md +++ b/docs/src/guide/channels.md @@ -258,6 +258,8 @@ When the queue is full (10 messages), new input is silently dropped until space `ChannelMessage` supports an optional `attachments` field 
carrying `Attachment` values with typed `AttachmentKind` variants (Audio, Image, Video, File). When the `stt` feature is enabled, audio attachments are automatically transcribed before entering the agent loop. The Telegram channel automatically downloads voice and audio messages and delivers them as attachments. The Slack channel detects audio file uploads and voice messages (`audio/*`, `video/webm`), downloads them via `url_private_download` with host validation (`.slack.com` only) and a 25 MB size limit, and delivers them as audio attachments. See [Audio Input](audio-input.md) for details. +Image attachments are forwarded directly to the LLM as `MessagePart::Image` content. In CLI and TUI, use the `/image ` command to attach an image. In Telegram, send a photo directly. Images are subject to a 20 MB size limit. See [Vision](vision.md) for details. + ## Channel Selection Logic Zeph selects the channel at startup based on the following priority: diff --git a/docs/src/guide/vision.md b/docs/src/guide/vision.md new file mode 100644 index 00000000..f0f281da --- /dev/null +++ b/docs/src/guide/vision.md @@ -0,0 +1,73 @@ +# Vision (Image Input) + +Zeph supports image input across all channels. Images are sent to the LLM as inline content parts alongside the text prompt, enabling visual reasoning tasks such as screenshot analysis, diagram interpretation, and document extraction. + +## Pipeline + +``` +Image attachment → MessagePart::Image → LLM provider (base64) → Response +``` + +When a `ChannelMessage` contains an image `Attachment`, the agent converts it to a `MessagePart::Image` (raw bytes + MIME type). The active LLM provider encodes the image into its native API format and sends it as part of the chat request. + +## Provider Support + +Not all providers support vision. The `LlmProvider::supports_vision()` method indicates capability at runtime. 
+ +| Provider | Vision | Format | +|----------|--------|--------| +| Claude | Yes | `AnthropicContentBlock::Image` (base64 source) | +| OpenAI | Yes | Array content with `image_url` data-URI | +| Ollama | Yes | `with_images()` API; optional `vision_model` routing | +| Candle | No | Text-only | + +### Ollama Vision Model Routing + +Ollama can route image requests to a dedicated vision model (e.g., `llava`, `bakllava`) while keeping a smaller text model for regular queries. Set the `vision_model` field: + +```toml +[llm] +provider = "ollama" +model = "mistral:7b" +vision_model = "llava:13b" +``` + +When `vision_model` is set and the message contains an image, Ollama uses the vision model for that request. When unset, images are sent to the default model (which must support vision). + +## Sending Images + +### CLI and TUI + +Use the `/image` slash command followed by a file path: + +``` +/image /path/to/screenshot.png What is shown in this image? +``` + +The path can be absolute or relative to the working directory. Supported formats: JPEG, PNG, GIF, WebP. + +### Telegram + +Send a photo directly in the chat. The Telegram channel downloads the image via the Bot API (using the largest available photo size) and delivers it as an `Attachment` with `AttachmentKind::Image`. The text caption, if present, is used as the accompanying prompt. + +A pre-download size guard rejects images exceeding 20 MB before the download begins. + +## Configuration + +```toml +[llm] +vision_model = "llava:13b" # Ollama only: dedicated model for image requests +``` + +| Variable | Description | Default | +|----------|-------------|---------| +| `ZEPH_LLM_VISION_MODEL` | Vision model name for Ollama | (none) | + +The `zeph init` wizard includes a prompt for the vision model when configuring the Ollama provider. + +## Limits + +- **20 MB maximum image size** -- images exceeding this limit are rejected. 
+- **Path traversal protection** -- the `/image` command validates file paths to prevent directory traversal attacks. +- **One image per message** -- additional image attachments in the same message are ignored. +- **No image generation** -- only image input (vision) is supported; image output is not. From 4fd9924fbc2d103f3dd9e39f0e8808cee6383d8e Mon Sep 17 00:00:00 2001 From: "Andrei G." Date: Thu, 19 Feb 2026 01:43:44 +0100 Subject: [PATCH 4/4] test(channels): add CLI and Telegram vision tests for image command and photo handling --- crates/zeph-channels/Cargo.toml | 1 + crates/zeph-channels/src/cli.rs | 60 ++++++++++++++++++++++++++++ crates/zeph-channels/src/telegram.rs | 37 +++++++++++++++++ 3 files changed, 98 insertions(+) diff --git a/crates/zeph-channels/Cargo.toml b/crates/zeph-channels/Cargo.toml index 0d8ce1be..cdeae173 100644 --- a/crates/zeph-channels/Cargo.toml +++ b/crates/zeph-channels/Cargo.toml @@ -33,6 +33,7 @@ harness = false [dev-dependencies] criterion.workspace = true +tempfile.workspace = true [lints] workspace = true diff --git a/crates/zeph-channels/src/cli.rs b/crates/zeph-channels/src/cli.rs index edd6345f..3d64945b 100644 --- a/crates/zeph-channels/src/cli.rs +++ b/crates/zeph-channels/src/cli.rs @@ -170,4 +170,64 @@ mod tests { let mut ch = CliChannel::new(); ch.flush_chunks().await.unwrap(); } + + #[tokio::test] + async fn image_command_valid_file_creates_attachment() { + use std::io::Write; + + let mut tmp = tempfile::NamedTempFile::new().unwrap(); + let image_bytes = b"\x89PNG\r\n\x1a\nfake-image-data"; + tmp.write_all(image_bytes).unwrap(); + tmp.flush().unwrap(); + + let path = tmp.path().to_str().unwrap().to_owned(); + let filename = tmp.path().file_name().unwrap().to_str().unwrap().to_owned(); + + // Simulate /image parsing: strip prefix and read file + let trimmed = format!("/image {path}"); + let arg = trimmed.strip_prefix("/image").map(str::trim).unwrap(); + assert!(!arg.is_empty()); + + let data = 
tokio::fs::read(arg).await.unwrap(); + let parsed_filename = std::path::Path::new(arg) + .file_name() + .and_then(|n| n.to_str()) + .map(str::to_owned); + + assert_eq!(data, image_bytes); + assert_eq!(parsed_filename, Some(filename)); + + let attachment = Attachment { + kind: AttachmentKind::Image, + data, + filename: parsed_filename, + }; + assert_eq!(attachment.kind, AttachmentKind::Image); + assert_eq!(attachment.data, image_bytes); + } + + #[tokio::test] + async fn image_command_missing_file_returns_io_error() { + let result = tokio::fs::read("/nonexistent/path/image.png").await; + assert!(result.is_err()); + // Verify it maps to ChannelError::Io correctly + let err = ChannelError::Io(result.unwrap_err()); + assert!(matches!(err, ChannelError::Io(_))); + } + + #[test] + fn image_command_empty_args_detected() { + // "/image " with only whitespace after stripping prefix yields empty arg + let trimmed = "/image"; + let arg = trimmed.strip_prefix("/image").map(str::trim).unwrap_or(""); + assert!(arg.is_empty()); + + // "/image " (with trailing space) + let trimmed_space = "/image "; + let arg_space = trimmed_space + .strip_prefix("/image") + .map(str::trim) + .unwrap_or(""); + assert!(arg_space.is_empty()); + } } diff --git a/crates/zeph-channels/src/telegram.rs b/crates/zeph-channels/src/telegram.rs index 18f77f64..93329b9c 100644 --- a/crates/zeph-channels/src/telegram.rs +++ b/crates/zeph-channels/src/telegram.rs @@ -476,4 +476,41 @@ mod tests { assert!(channel.last_edit.is_none()); assert!(channel.message_id.is_none()); } + + #[test] + fn max_image_bytes_is_20_mib() { + assert_eq!(MAX_IMAGE_BYTES, 20 * 1024 * 1024); + } + + #[test] + fn photo_size_limit_enforcement() { + // Mirrors the guard in the photo extraction handler: + // photos.iter().max_by_key(|p| p.file.size) followed by + // if photo.file.size > MAX_IMAGE_BYTES { skip } else { download } + let size_within_limit: u32 = MAX_IMAGE_BYTES - 1; + let size_at_limit: u32 = MAX_IMAGE_BYTES; + let 
size_over_limit: u32 = MAX_IMAGE_BYTES + 1; + + assert!(size_within_limit <= MAX_IMAGE_BYTES); + assert!(size_at_limit <= MAX_IMAGE_BYTES); + assert!(size_over_limit > MAX_IMAGE_BYTES); + } + + #[test] + fn should_not_send_update_within_threshold() { + let token = "test_token".to_string(); + let allowed_users = Vec::new(); + let mut channel = TelegramChannel::new(token, allowed_users); + // Set last_edit to 1 second ago (well within the 10-second threshold) + channel.last_edit = Some(Instant::now() - Duration::from_secs(1)); + assert!(!channel.should_send_update()); + } + + #[test] + fn start_rejects_empty_allowed_users() { + let channel = TelegramChannel::new("test_token".to_string(), Vec::new()); + let result = channel.start(); + assert!(result.is_err()); + assert!(matches!(result.unwrap_err(), ChannelError::Other(_))); + } }