diff --git a/CHANGELOG.md b/CHANGELOG.md index 7c46fee5..0d68429f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,14 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). ## [Unreleased] ### Added +- Temporal decay scoring in `SemanticMemory::recall()` — time-based score attenuation with configurable half-life (#745) +- MMR (Maximal Marginal Relevance) re-ranking in `SemanticMemory::recall()` — post-processing for result diversity (#744) +- Compact XML skills prompt format (`format_skills_prompt_compact`) for low-budget contexts (#747) +- `SkillPromptMode` enum (`full`/`compact`/`auto`) with auto-selection based on context budget (#747) +- Adaptive chunked context compaction — parallel chunk summarization with bounded concurrency (up to 4 chunks in flight) (#746) +- `with_ranking_options()` builder for `SemanticMemory` to configure temporal decay and MMR +- `message_timestamps()` method on `SqliteStore` for Unix epoch retrieval via `strftime` +- `get_vectors()` method on `EmbeddingStore` for raw vector fetch from SQLite `vector_points` - SQLite-backed `SqliteVectorStore` as embedded alternative to Qdrant for zero-dependency vector search (#741) - `vector_backend` config option to select between `qdrant` and `sqlite` vector backends - Credential scrubbing in LLM context pipeline via `scrub_content()` — redacts secrets and paths before LLM calls (#743) diff --git a/README.md b/README.md index 5bce8a06..b73d1185 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,7 @@ Most AI agent frameworks dump every tool description, skill, and raw output into - **Semantic skill selection** — embeds skills as vectors, retrieves only top-K relevant per query instead of injecting all - **Smart output filtering** — command-aware filters strip 70-99% of noise before context injection -- **Two-tier context pruning** — selective eviction + LLM compaction keeps the window clean +- **Two-tier context pruning** — selective eviction + adaptive chunked compaction with parallel summarization 
keeps the window clean - **Proportional budget allocation** — context space distributed by purpose, not arrival order ## Installation @@ -61,8 +61,8 @@ zeph --tui # run with TUI dashboard | | | |---|---| | **Hybrid inference** | Ollama, Claude, OpenAI, Candle (GGUF), any OpenAI-compatible API. Multi-model orchestrator with fallback chains | -| **Skills-first architecture** | YAML+Markdown skill files with semantic matching, self-learning evolution, and 4-tier trust model | -| **Semantic memory** | SQLite + Qdrant (or embedded SQLite vector search) with summarization, credential scrubbing, cross-session recall, and vector retrieval | +| **Skills-first architecture** | YAML+Markdown skill files with semantic matching, self-learning evolution, 4-tier trust model, and compact prompt mode for small-context models | +| **Semantic memory** | SQLite + Qdrant (or embedded SQLite vector search) with MMR re-ranking, temporal decay scoring, adaptive chunked compaction, credential scrubbing, cross-session recall, and vector retrieval | | **Multi-channel I/O** | CLI, Telegram, Discord, Slack, TUI — all with streaming. Vision and speech-to-text input | | **Protocols** | MCP client (stdio + HTTP), A2A agent-to-agent communication, sub-agent orchestration | | **Defense-in-depth** | Shell sandbox, tool permissions, secret redaction, SSRF protection, skill trust quarantine, audit logging | diff --git a/crates/zeph-core/README.md b/crates/zeph-core/README.md index b65d9d86..32b3e500 100644 --- a/crates/zeph-core/README.md +++ b/crates/zeph-core/README.md @@ -24,7 +24,7 @@ Core orchestration crate for the Zeph agent. 
Manages the main agent loop, bootst | `bootstrap` | `AppBuilder` — fluent builder for application startup | | `channel` | `Channel` trait defining I/O adapters; `LoopbackChannel` / `LoopbackHandle` for headless daemon I/O; `Attachment` / `AttachmentKind` for multimodal inputs | | `config` | TOML config with `ZEPH_*` env overrides; typed `ConfigError` (Io, Parse, Validation, Vault) | -| `context` | LLM context assembly from history, skills, memory | +| `context` | LLM context assembly from history, skills, memory; adaptive chunked compaction with parallel summarization | | `cost` | Token cost tracking and budgeting | | `daemon` | Background daemon mode with PID file lifecycle (optional feature) | | `metrics` | Runtime metrics collection | @@ -48,6 +48,14 @@ Key `AgentConfig` fields (TOML section `[agent]`): | `summary_model` | string? | `null` | — | Model used for context summarization | | `auto_update_check` | bool | `true` | `ZEPH_AUTO_UPDATE_CHECK` | Check GitHub releases for a newer version on startup / via scheduler | +Key `MemoryConfig` fields (TOML section `[memory]`): + +| Field | Type | Default | Description | +|-------|------|---------|-------------| +| `vector_backend` | `"qdrant"` / `"sqlite"` | `"qdrant"` | Vector search backend | +| `token_safety_margin` | f32 | `1.0` | Safety multiplier for token budget estimation (validated: must be > 0.0) | +| `redact_credentials` | bool | `true` | Scrub secrets and paths before LLM context injection | + ```toml [agent] auto_update_check = true # set to false to disable update notifications diff --git a/crates/zeph-core/src/agent/builder.rs b/crates/zeph-core/src/agent/builder.rs index cf79c5bc..0abeb9b4 100644 --- a/crates/zeph-core/src/agent/builder.rs +++ b/crates/zeph-core/src/agent/builder.rs @@ -67,6 +67,12 @@ impl Agent { self } + #[must_use] + pub fn with_skill_prompt_mode(mut self, mode: crate::config::SkillPromptMode) -> Self { + self.skill_state.prompt_mode = mode; + self + } + #[must_use] pub fn 
with_shutdown(mut self, rx: watch::Receiver) -> Self { self.shutdown = rx; diff --git a/crates/zeph-core/src/agent/context.rs b/crates/zeph-core/src/agent/context.rs index 1224c948..fcdadc5c 100644 --- a/crates/zeph-core/src/agent/context.rs +++ b/crates/zeph-core/src/agent/context.rs @@ -1,11 +1,13 @@ use std::borrow::Cow; use std::fmt::Write; +use futures::StreamExt as _; + use zeph_llm::provider::MessagePart; use zeph_memory::semantic::estimate_tokens; use zeph_skills::ScoredMatch; use zeph_skills::loader::SkillMeta; -use zeph_skills::prompt::format_skills_catalog; +use zeph_skills::prompt::{format_skills_catalog, format_skills_prompt_compact}; use crate::redact::scrub_content; @@ -15,6 +17,43 @@ use super::{ format_skills_prompt, }; +fn chunk_messages(messages: &[Message], budget: usize, oversized: usize) -> Vec> { + let mut chunks: Vec> = Vec::new(); + let mut current: Vec = Vec::new(); + let mut current_tokens = 0usize; + + for msg in messages { + let msg_tokens = estimate_tokens(&msg.content); + + if msg_tokens >= oversized { + // Oversized message gets its own chunk + if !current.is_empty() { + chunks.push(std::mem::take(&mut current)); + current_tokens = 0; + } + chunks.push(vec![msg.clone()]); + } else if current_tokens + msg_tokens > budget && !current.is_empty() { + chunks.push(std::mem::take(&mut current)); + current_tokens = 0; + current.push(msg.clone()); + current_tokens += msg_tokens; + } else { + current.push(msg.clone()); + current_tokens += msg_tokens; + } + } + + if !current.is_empty() { + chunks.push(current); + } + + if chunks.is_empty() { + chunks.push(Vec::new()); + } + + chunks +} + impl Agent { #[allow( clippy::cast_precision_loss, @@ -44,25 +83,13 @@ impl Agent { should } - pub(super) async fn compact_context(&mut self) -> Result<(), super::error::AgentError> { - let preserve_tail = self.context_state.compaction_preserve_tail; - - if self.messages.len() <= preserve_tail + 1 { - return Ok(()); - } - - let compact_end = self.messages.len() 
- preserve_tail; - let to_compact = &self.messages[1..compact_end]; - if to_compact.is_empty() { - return Ok(()); - } - - let estimated_len: usize = to_compact + fn build_chunk_prompt(messages: &[Message]) -> String { + let estimated_len: usize = messages .iter() .map(|m| "[assistant]: ".len() + m.content.len() + 2) .sum(); let mut history_text = String::with_capacity(estimated_len); - for (i, m) in to_compact.iter().enumerate() { + for (i, m) in messages.iter().enumerate() { if i > 0 { history_text.push_str("\n\n"); } @@ -71,11 +98,10 @@ impl Agent { Role::Assistant => "assistant", Role::System => "system", }; - // write! to String never fails, safe to ignore let _ = write!(history_text, "[{role}]: {}", m.content); } - let compaction_prompt = format!( + format!( "Summarize this conversation excerpt into a structured continuation note. \ Include:\n\ 1. Task overview\n\ @@ -87,16 +113,110 @@ impl Agent { Keep it concise but preserve all actionable details.\n\ \n\ Conversation:\n{history_text}" + ) + } + + async fn summarize_messages( + &self, + messages: &[Message], + ) -> Result { + const CHUNK_TOKEN_BUDGET: usize = 4096; + const OVERSIZED_THRESHOLD: usize = CHUNK_TOKEN_BUDGET / 2; + + let chunks = chunk_messages(messages, CHUNK_TOKEN_BUDGET, OVERSIZED_THRESHOLD); + + if chunks.len() <= 1 { + let prompt = Self::build_chunk_prompt(messages); + return self + .summary_or_primary_provider() + .chat(&[Message { + role: Role::User, + content: prompt, + parts: vec![], + }]) + .await + .map_err(Into::into); + } + + // Summarize chunks with bounded concurrency to prevent runaway API calls + let provider = self.summary_or_primary_provider(); + let results: Vec<_> = futures::stream::iter(chunks.iter().map(|chunk| { + let prompt = Self::build_chunk_prompt(chunk); + let p = provider.clone(); + async move { + p.chat(&[Message { + role: Role::User, + content: prompt, + parts: vec![], + }]) + .await + } + })) + .buffer_unordered(4) + .collect() + .await; + + let partial_summaries: 
Vec = results + .into_iter() + .collect::, _>>() + .unwrap_or_else(|e| { + tracing::warn!("chunked compaction: one or more chunks failed: {e:#}, falling back to single-pass"); + Vec::new() + }); + + if partial_summaries.is_empty() { + // Fallback: single-pass on full messages + let prompt = Self::build_chunk_prompt(messages); + return self + .summary_or_primary_provider() + .chat(&[Message { + role: Role::User, + content: prompt, + parts: vec![], + }]) + .await + .map_err(Into::into); + } + + // Consolidate partial summaries + let numbered: String = partial_summaries + .iter() + .enumerate() + .map(|(i, s)| format!("{}. {s}", i + 1)) + .collect::>() + .join("\n\n"); + + let consolidation_prompt = format!( + "Merge these partial conversation summaries into a single coherent continuation note.\n\ + Include: task overview, current state, key discoveries, next steps, critical context.\n\ + \n\ + Partial summaries:\n{numbered}" ); - let summary = self - .summary_or_primary_provider() + self.summary_or_primary_provider() .chat(&[Message { role: Role::User, - content: compaction_prompt, + content: consolidation_prompt, parts: vec![], }]) - .await?; + .await + .map_err(Into::into) + } + + pub(super) async fn compact_context(&mut self) -> Result<(), super::error::AgentError> { + let preserve_tail = self.context_state.compaction_preserve_tail; + + if self.messages.len() <= preserve_tail + 1 { + return Ok(()); + } + + let compact_end = self.messages.len() - preserve_tail; + let to_compact = &self.messages[1..compact_end]; + if to_compact.is_empty() { + return Ok(()); + } + + let summary = self.summarize_messages(to_compact).await?; let compacted_count = to_compact.len(); self.messages.drain(1..compact_end); @@ -831,7 +951,25 @@ impl Agent { .collect(); let trust_map = self.build_skill_trust_map().await; - let skills_prompt = format_skills_prompt(&active_skills, &trust_map); + + let effective_mode = match self.skill_state.prompt_mode { + crate::config::SkillPromptMode::Auto => 
/// Three equally sized messages against a budget that holds exactly two of them
/// must produce a full first chunk and a second chunk with the remainder.
#[test]
fn chunk_messages_splits_at_budget_boundary() {
    // 4000 chars -> 1000 estimated tokens per message (`estimate_tokens` is chars / 4).
    let piece = "w".repeat(1000 * 4);
    let messages: Vec<Message> = (0..3)
        .map(|_| Message {
            role: Role::User,
            content: piece.clone(),
            parts: vec![],
        })
        .collect();

    // budget = 2000: the first two messages fill the chunk exactly (2000 is not
    // greater than the budget), the third one overflows and opens a new chunk.
    // oversized = 4096 keeps every message below the isolation threshold.
    let chunks = chunk_messages(&messages, 2000, 4096);

    assert_eq!(chunks.len(), 2, "expected exactly two chunks");
    assert_eq!(chunks[0].len(), 2, "first chunk should be filled to the budget");
    assert_eq!(chunks[1].len(), 1, "overflowing message should start a new chunk");
}
crate::config::SkillPromptMode::Auto => { + crate::config::SkillPromptMode::Full // would normally pick Full + } + other => other, + }; + assert_eq!(effective_mode, crate::config::SkillPromptMode::Compact); + } + #[test] fn should_compact_disabled_without_budget() { let provider = mock_provider(vec![]); diff --git a/crates/zeph-core/src/agent/mod.rs b/crates/zeph-core/src/agent/mod.rs index 723e94b8..e3c40547 100644 --- a/crates/zeph-core/src/agent/mod.rs +++ b/crates/zeph-core/src/agent/mod.rs @@ -37,7 +37,7 @@ use zeph_tools::executor::{ErasedToolExecutor, ToolExecutor}; use crate::channel::Channel; use crate::config::Config; use crate::config::LearningConfig; -use crate::config::{SecurityConfig, TimeoutConfig}; +use crate::config::{SecurityConfig, SkillPromptMode, TimeoutConfig}; use crate::config_watcher::ConfigEvent; use crate::context::{ContextBudget, EnvironmentContext, build_system_prompt}; use crate::cost::CostTracker; @@ -88,6 +88,7 @@ pub(super) struct SkillState { pub(super) skill_reload_rx: Option>, pub(super) active_skill_names: Vec, pub(super) last_skills_prompt: String, + pub(super) prompt_mode: SkillPromptMode, /// Custom secrets available at runtime: key=hyphenated name, value=secret. 
pub(super) available_custom_secrets: HashMap, } @@ -210,6 +211,7 @@ impl Agent { skill_reload_rx: None, active_skill_names: Vec::new(), last_skills_prompt: skills_prompt, + prompt_mode: SkillPromptMode::Auto, available_custom_secrets: HashMap::new(), }, context_state: ContextState { diff --git a/crates/zeph-core/src/config/env.rs b/crates/zeph-core/src/config/env.rs index 8b06f182..2565acd9 100644 --- a/crates/zeph-core/src/config/env.rs +++ b/crates/zeph-core/src/config/env.rs @@ -96,6 +96,39 @@ impl Config { { self.skills.learning.auto_activate = auto_activate; } + if let Ok(v) = std::env::var("ZEPH_SKILLS_PROMPT_MODE") { + match v.to_lowercase().as_str() { + "full" => self.skills.prompt_mode = super::SkillPromptMode::Full, + "compact" => self.skills.prompt_mode = super::SkillPromptMode::Compact, + "auto" => self.skills.prompt_mode = super::SkillPromptMode::Auto, + _ => {} + } + } + if let Ok(v) = std::env::var("ZEPH_MEMORY_SEMANTIC_TEMPORAL_DECAY_ENABLED") + && let Ok(enabled) = v.parse::() + { + self.memory.semantic.temporal_decay_enabled = enabled; + } + if let Ok(v) = std::env::var("ZEPH_MEMORY_SEMANTIC_TEMPORAL_DECAY_HALF_LIFE_DAYS") + && let Ok(days) = v.parse::() + { + self.memory.semantic.temporal_decay_half_life_days = days; + } + if let Ok(v) = std::env::var("ZEPH_MEMORY_SEMANTIC_MMR_ENABLED") + && let Ok(enabled) = v.parse::() + { + self.memory.semantic.mmr_enabled = enabled; + } + if let Ok(v) = std::env::var("ZEPH_MEMORY_SEMANTIC_MMR_LAMBDA") + && let Ok(lambda) = v.parse::() + { + self.memory.semantic.mmr_lambda = lambda; + } + if let Ok(v) = std::env::var("ZEPH_MEMORY_TOKEN_SAFETY_MARGIN") + && let Ok(margin) = v.parse::() + { + self.memory.token_safety_margin = margin.clamp(0.1, 10.0); + } if let Ok(v) = std::env::var("ZEPH_TOOLS_SUMMARIZE_OUTPUT") && let Ok(enabled) = v.parse::() { diff --git a/crates/zeph-core/src/config/mod.rs b/crates/zeph-core/src/config/mod.rs index e055df14..2f8f212f 100644 --- a/crates/zeph-core/src/config/mod.rs +++ 
b/crates/zeph-core/src/config/mod.rs @@ -75,6 +75,12 @@ impl Config { "gateway.rate_limit must be > 0".into(), )); } + if self.memory.token_safety_margin <= 0.0 { + return Err(ConfigError::Validation(format!( + "token_safety_margin must be > 0.0, got {}", + self.memory.token_safety_margin + ))); + } Ok(()) } diff --git a/crates/zeph-core/src/config/snapshots/zeph_core__config__types__tests__config_default_snapshot.snap b/crates/zeph-core/src/config/snapshots/zeph_core__config__types__tests__config_default_snapshot.snap index 51f9554c..84a15c39 100644 --- a/crates/zeph-core/src/config/snapshots/zeph_core__config__types__tests__config_default_snapshot.snap +++ b/crates/zeph-core/src/config/snapshots/zeph_core__config__types__tests__config_default_snapshot.snap @@ -1,6 +1,6 @@ --- source: crates/zeph-core/src/config/types.rs -assertion_line: 1113 +assertion_line: 1156 expression: toml_str --- [agent] @@ -18,6 +18,7 @@ embedding_model = "qwen3-embedding" paths = ["./skills"] max_active_skills = 5 disambiguation_threshold = 0.05 +prompt_mode = "auto" [skills.learning] enabled = false @@ -54,6 +55,10 @@ enabled = true recall_limit = 5 vector_weight = 0.7 keyword_weight = 0.3 +temporal_decay_enabled = false +temporal_decay_half_life_days = 30 +mmr_enabled = false +mmr_lambda = 0.7 [tools] enabled = true diff --git a/crates/zeph-core/src/config/types.rs b/crates/zeph-core/src/config/types.rs index 151dae60..44ae0f48 100644 --- a/crates/zeph-core/src/config/types.rs +++ b/crates/zeph-core/src/config/types.rs @@ -302,6 +302,16 @@ pub struct OrchestratorProviderConfig { pub device: Option, } +/// Controls how skills are formatted in the system prompt. 
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Deserialize, Serialize)]
#[serde(rename_all = "lowercase")]
pub enum SkillPromptMode {
    /// Always render active skills with the full prompt format (`"full"` in config).
    Full,
    /// Always render active skills with the compact prompt format (`"compact"` in config).
    Compact,
    /// Choose per prompt build (`"auto"` in config, the default): the agent uses
    /// `Compact` when a context budget is configured and its `max_tokens()` is below
    /// 8192, and `Full` otherwise (including when no budget is set).
    #[default]
    Auto,
}
## Overview -Provides durable conversation storage via SQLite and semantic retrieval through Qdrant vector search. The `SemanticMemory` orchestrator combines both backends, enabling the agent to recall relevant context from past conversations using embedding similarity. +Provides durable conversation storage via SQLite and semantic retrieval through Qdrant vector search (or embedded SQLite vector backend). The `SemanticMemory` orchestrator combines both backends, enabling the agent to recall relevant context from past conversations using embedding similarity. + +Recall quality is enhanced by MMR (Maximal Marginal Relevance) re-ranking for result diversity and temporal decay scoring for recency bias. Both are configurable via `SemanticConfig`. Includes a document ingestion subsystem for loading, chunking, and storing user documents (text, Markdown, PDF) into Qdrant for RAG workflows. @@ -27,6 +29,7 @@ Includes a document ingestion subsystem for loading, chunking, and storing user | `document::splitter` | `TextSplitter` with configurable chunking | | `document::pipeline` | `IngestionPipeline` — load, split, embed, store via Qdrant | | `vector_store` | `VectorStore` trait and `VectorPoint` types | +| `sqlite_vector` | `SqliteVectorStore` — embedded SQLite-backed vector search as zero-dependency Qdrant alternative | | `embedding_store` | `EmbeddingStore` — high-level embedding CRUD | | `embeddable` | `Embeddable` trait and `EmbeddingRegistry` — generic Qdrant sync/search for any embeddable type | | `types` | `ConversationId`, `MessageId`, shared types | @@ -34,6 +37,15 @@ Includes a document ingestion subsystem for loading, chunking, and storing user **Re-exports:** `MemoryError`, `QdrantOps`, `ConversationId`, `MessageId`, `Document`, `DocumentLoader`, `TextLoader`, `TextSplitter`, `IngestionPipeline`, `Chunk`, `SplitterConfig`, `DocumentError`, `DocumentMetadata`, `PdfLoader` (behind `pdf` feature), `Embeddable`, `EmbeddingRegistry` +## Ranking options + +| Option | 
Config field | Default | Description | +|--------|-------------|---------|-------------| +| MMR re-ranking | `semantic.mmr_enabled` | `false` | Post-retrieval diversity via Maximal Marginal Relevance | +| MMR lambda | `semantic.mmr_lambda` | `0.7` | Balance between relevance (1.0) and diversity (0.0) | +| Temporal decay | `semantic.temporal_decay_enabled` | `false` | Time-based score attenuation favoring recent memories | +| Decay half-life | `semantic.temporal_decay_half_life_days` | `30` | Days until a memory's score drops to 50% | + ## Features | Feature | Description | diff --git a/crates/zeph-memory/src/embedding_store.rs b/crates/zeph-memory/src/embedding_store.rs index 198995b2..256c5ed1 100644 --- a/crates/zeph-memory/src/embedding_store.rs +++ b/crates/zeph-memory/src/embedding_store.rs @@ -303,6 +303,52 @@ impl EmbeddingStore { Ok(results) } + /// Fetch raw vectors for the given message IDs from the `SQLite` vector store. + /// + /// Returns an empty map when using Qdrant backend (vectors not locally stored). + /// + /// # Errors + /// + /// Returns an error if the `SQLite` query fails. 
+ pub async fn get_vectors( + &self, + ids: &[MessageId], + ) -> Result>, MemoryError> { + if ids.is_empty() { + return Ok(std::collections::HashMap::new()); + } + + let placeholders: String = ids.iter().map(|_| "?").collect::>().join(","); + let query = format!( + "SELECT em.message_id, vp.vector \ + FROM embeddings_metadata em \ + JOIN vector_points vp ON vp.id = em.qdrant_point_id \ + WHERE em.message_id IN ({placeholders})" + ); + let mut q = sqlx::query_as::<_, (MessageId, Vec)>(&query); + for &id in ids { + q = q.bind(id); + } + + let rows = q.fetch_all(&self.pool).await.unwrap_or_default(); + + let map = rows + .into_iter() + .filter_map(|(msg_id, blob)| { + if blob.len() % 4 != 0 { + return None; + } + let vec: Vec = blob + .chunks_exact(4) + .map(|b| f32::from_le_bytes([b[0], b[1], b[2], b[3]])) + .collect(); + Some((msg_id, vec)) + }) + .collect(); + + Ok(map) + } + /// Check whether an embedding already exists for the given message ID. /// /// # Errors diff --git a/crates/zeph-memory/src/semantic.rs b/crates/zeph-memory/src/semantic.rs index 7ccc98c2..a310297e 100644 --- a/crates/zeph-memory/src/semantic.rs +++ b/crates/zeph-memory/src/semantic.rs @@ -49,6 +49,95 @@ pub fn estimate_tokens(text: &str) -> usize { text.chars().count() / 4 } +fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 { + if a.len() != b.len() || a.is_empty() { + return 0.0; + } + let dot: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum(); + let norm_a: f32 = a.iter().map(|x| x * x).sum::().sqrt(); + let norm_b: f32 = b.iter().map(|x| x * x).sum::().sqrt(); + if norm_a == 0.0 || norm_b == 0.0 { + return 0.0; + } + dot / (norm_a * norm_b) +} + +fn apply_temporal_decay( + ranked: &mut [(MessageId, f64)], + timestamps: &std::collections::HashMap, + half_life_days: u32, +) { + if half_life_days == 0 { + return; + } + let now = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap_or_default() + .as_secs() + .cast_signed(); + let lambda = 
std::f64::consts::LN_2 / f64::from(half_life_days); + + for (msg_id, score) in ranked.iter_mut() { + if let Some(&ts) = timestamps.get(msg_id) { + #[allow(clippy::cast_precision_loss)] + let age_days = (now - ts).max(0) as f64 / 86400.0; + *score *= (-lambda * age_days).exp(); + } + } +} + +fn apply_mmr( + ranked: &[(MessageId, f64)], + vectors: &std::collections::HashMap>, + lambda: f32, + limit: usize, +) -> Vec<(MessageId, f64)> { + if ranked.is_empty() || limit == 0 { + return Vec::new(); + } + + let lambda = f64::from(lambda); + let mut selected: Vec<(MessageId, f64)> = Vec::with_capacity(limit); + let mut remaining: Vec<(MessageId, f64)> = ranked.to_vec(); + + while selected.len() < limit && !remaining.is_empty() { + let best_idx = if selected.is_empty() { + // Pick highest relevance first + 0 + } else { + let mut best = 0usize; + let mut best_score = f64::NEG_INFINITY; + + for (i, &(cand_id, relevance)) in remaining.iter().enumerate() { + let max_sim = if let Some(cand_vec) = vectors.get(&cand_id) { + selected + .iter() + .filter_map(|(sel_id, _)| vectors.get(sel_id)) + .map(|sel_vec| f64::from(cosine_similarity(cand_vec, sel_vec))) + .fold(f64::NEG_INFINITY, f64::max) + } else { + 0.0 + }; + let max_sim = if max_sim == f64::NEG_INFINITY { + 0.0 + } else { + max_sim + }; + let mmr_score = lambda * relevance - (1.0 - lambda) * max_sim; + if mmr_score > best_score { + best_score = mmr_score; + best = i; + } + } + best + }; + + selected.push(remaining.remove(best_idx)); + } + + selected +} + fn build_summarization_prompt(messages: &[(MessageId, String, String)]) -> String { let mut prompt = String::from( "Summarize the following conversation. 
Extract key facts, decisions, entities, \ @@ -74,6 +163,10 @@ pub struct SemanticMemory { embedding_model: String, vector_weight: f64, keyword_weight: f64, + temporal_decay_enabled: bool, + temporal_decay_half_life_days: u32, + mmr_enabled: bool, + mmr_lambda: f32, } impl SemanticMemory { @@ -124,9 +217,29 @@ impl SemanticMemory { embedding_model: embedding_model.into(), vector_weight, keyword_weight, + temporal_decay_enabled: false, + temporal_decay_half_life_days: 30, + mmr_enabled: false, + mmr_lambda: 0.7, }) } + /// Configure temporal decay and MMR re-ranking options. + #[must_use] + pub fn with_ranking_options( + mut self, + temporal_decay_enabled: bool, + temporal_decay_half_life_days: u32, + mmr_enabled: bool, + mmr_lambda: f32, + ) -> Self { + self.temporal_decay_enabled = temporal_decay_enabled; + self.temporal_decay_half_life_days = temporal_decay_half_life_days; + self.mmr_enabled = mmr_enabled; + self.mmr_lambda = mmr_lambda; + self + } + /// Create a `SemanticMemory` using the `SQLite`-embedded vector backend. /// /// # Errors @@ -150,6 +263,10 @@ impl SemanticMemory { embedding_model: embedding_model.into(), vector_weight, keyword_weight, + temporal_decay_enabled: false, + temporal_decay_half_life_days: 30, + mmr_enabled: false, + mmr_lambda: 0.7, }) } @@ -268,6 +385,7 @@ impl SemanticMemory { /// # Errors /// /// Returns an error if embedding generation, Qdrant search, or FTS5 query fails. 
+ #[allow(clippy::too_many_lines)] pub async fn recall( &self, query: &str, @@ -333,10 +451,51 @@ impl SemanticMemory { return Ok(Vec::new()); } - // Sort by combined score descending, take top `limit` + // Sort by combined score descending let mut ranked: Vec<(MessageId, f64)> = scores.into_iter().collect(); ranked.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal)); - ranked.truncate(limit); + + // Apply temporal decay (before MMR) + if self.temporal_decay_enabled && self.temporal_decay_half_life_days > 0 { + let ids: Vec = ranked.iter().map(|r| r.0).collect(); + match self.sqlite.message_timestamps(&ids).await { + Ok(timestamps) => { + apply_temporal_decay( + &mut ranked, + ×tamps, + self.temporal_decay_half_life_days, + ); + ranked + .sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal)); + } + Err(e) => { + tracing::warn!("temporal decay: failed to fetch timestamps: {e:#}"); + } + } + } + + // Apply MMR re-ranking (after decay, before truncation) + if self.mmr_enabled && !vector_results.is_empty() { + if let Some(qdrant) = &self.qdrant { + let ids: Vec = ranked.iter().map(|r| r.0).collect(); + match qdrant.get_vectors(&ids).await { + Ok(vec_map) if !vec_map.is_empty() => { + ranked = apply_mmr(&ranked, &vec_map, self.mmr_lambda, limit); + } + Ok(_) => { + ranked.truncate(limit); + } + Err(e) => { + tracing::warn!("MMR: failed to fetch vectors: {e:#}"); + ranked.truncate(limit); + } + } + } else { + ranked.truncate(limit); + } + } else { + ranked.truncate(limit); + } let ids: Vec = ranked.iter().map(|r| r.0).collect(); let messages = self.sqlite.messages_by_ids(&ids).await?; @@ -839,6 +998,10 @@ mod tests { embedding_model: "test-model".into(), vector_weight: 0.7, keyword_weight: 0.3, + temporal_decay_enabled: false, + temporal_decay_half_life_days: 30, + mmr_enabled: false, + mmr_lambda: 0.7, } } @@ -1323,6 +1486,10 @@ mod tests { embedding_model: "test-model".into(), vector_weight: 0.7, keyword_weight: 0.3, + 
temporal_decay_enabled: false, + temporal_decay_half_life_days: 30, + mmr_enabled: false, + mmr_lambda: 0.7, }; let cid = memory.sqlite().create_conversation().await.unwrap(); @@ -1452,6 +1619,10 @@ mod tests { embedding_model: "test".into(), vector_weight: 0.7, keyword_weight: 0.3, + temporal_decay_enabled: false, + temporal_decay_half_life_days: 30, + mmr_enabled: false, + mmr_lambda: 0.7, }; let cid = memory.sqlite().create_conversation().await.unwrap(); @@ -1717,6 +1888,10 @@ mod tests { embedding_model: "test".into(), vector_weight: 0.7, keyword_weight: 0.3, + temporal_decay_enabled: false, + temporal_decay_half_life_days: 30, + mmr_enabled: false, + mmr_lambda: 0.7, }; let cid = memory.sqlite().create_conversation().await.unwrap(); @@ -1739,6 +1914,229 @@ mod tests { assert!(!summaries[0].content.is_empty()); } + // Temporal decay tests + + #[test] + fn temporal_decay_disabled_leaves_scores_unchanged() { + let mut ranked = vec![(MessageId(1), 1.0f64), (MessageId(2), 0.5f64)]; + let timestamps = std::collections::HashMap::new(); + apply_temporal_decay(&mut ranked, ×tamps, 30); + assert!((ranked[0].1 - 1.0).abs() < f64::EPSILON); + assert!((ranked[1].1 - 0.5).abs() < f64::EPSILON); + } + + #[test] + fn temporal_decay_zero_age_preserves_score() { + let now = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap_or_default() + .as_secs() + .cast_signed(); + let mut ranked = vec![(MessageId(1), 1.0f64)]; + let mut timestamps = std::collections::HashMap::new(); + timestamps.insert(MessageId(1), now); + apply_temporal_decay(&mut ranked, ×tamps, 30); + // age = 0 days, exp(0) = 1.0 → no change + assert!((ranked[0].1 - 1.0).abs() < 0.01); + } + + #[test] + fn temporal_decay_half_life_halves_score() { + // Age exactly half_life_days → score should be halved + let half_life = 30u32; + let age_secs = i64::from(half_life) * 86400; + let now = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap_or_default() + .as_secs() 
+ .cast_signed(); + let ts = now - age_secs; + let mut ranked = vec![(MessageId(1), 1.0f64)]; + let mut timestamps = std::collections::HashMap::new(); + timestamps.insert(MessageId(1), ts); + apply_temporal_decay(&mut ranked, ×tamps, half_life); + // exp(-ln2) = 0.5 + assert!( + (ranked[0].1 - 0.5).abs() < 0.01, + "score was {}", + ranked[0].1 + ); + } + + // MMR tests + + #[test] + fn mmr_empty_input_returns_empty() { + let ranked = vec![]; + let vectors = std::collections::HashMap::new(); + let result = apply_mmr(&ranked, &vectors, 0.7, 5); + assert!(result.is_empty()); + } + + #[test] + fn mmr_returns_up_to_limit() { + let ranked = vec![ + (MessageId(1), 1.0f64), + (MessageId(2), 0.9f64), + (MessageId(3), 0.8f64), + ]; + let mut vectors = std::collections::HashMap::new(); + vectors.insert(MessageId(1), vec![1.0f32, 0.0]); + vectors.insert(MessageId(2), vec![0.0f32, 1.0]); + vectors.insert(MessageId(3), vec![1.0f32, 0.0]); + let result = apply_mmr(&ranked, &vectors, 0.7, 2); + assert_eq!(result.len(), 2); + } + + #[test] + fn mmr_without_vectors_picks_by_relevance() { + let ranked = vec![(MessageId(1), 1.0f64), (MessageId(2), 0.5f64)]; + let vectors = std::collections::HashMap::new(); + let result = apply_mmr(&ranked, &vectors, 0.7, 2); + assert_eq!(result.len(), 2); + assert_eq!(result[0].0, MessageId(1)); + } + + #[test] + fn mmr_prefers_diverse_over_redundant() { + // Two candidates with same relevance but msg 2 is orthogonal (more diverse) + let ranked = vec![ + (MessageId(1), 1.0f64), // selected first + (MessageId(2), 0.9f64), // orthogonal to 1 + (MessageId(3), 0.9f64), // parallel to 1 (redundant) + ]; + let mut vectors = std::collections::HashMap::new(); + vectors.insert(MessageId(1), vec![1.0f32, 0.0]); + vectors.insert(MessageId(2), vec![0.0f32, 1.0]); // orthogonal + vectors.insert(MessageId(3), vec![1.0f32, 0.0]); // same as 1 + let result = apply_mmr(&ranked, &vectors, 0.5, 2); + assert_eq!(result.len(), 2); + assert_eq!(result[0].0, MessageId(1)); 
+ // msg 2 should be preferred over msg 3 (diverse) + assert_eq!(result[1].0, MessageId(2)); + } + + #[test] + fn temporal_decay_half_life_zero_is_noop() { + let now = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap_or_default() + .as_secs() + .cast_signed(); + let age_secs = 30i64 * 86400; + let ts = now - age_secs; + let mut ranked = vec![(MessageId(1), 1.0f64)]; + let mut timestamps = std::collections::HashMap::new(); + timestamps.insert(MessageId(1), ts); + // half_life=0 → guard returns early, score must remain 1.0 + apply_temporal_decay(&mut ranked, ×tamps, 0); + assert!( + (ranked[0].1 - 1.0).abs() < f64::EPSILON, + "score was {}", + ranked[0].1 + ); + } + + #[test] + fn temporal_decay_huge_age_near_zero() { + let now = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap_or_default() + .as_secs() + .cast_signed(); + // 10 years = ~3650 days + let age_secs = 3650i64 * 86400; + let ts = now - age_secs; + let mut ranked = vec![(MessageId(1), 1.0f64)]; + let mut timestamps = std::collections::HashMap::new(); + timestamps.insert(MessageId(1), ts); + apply_temporal_decay(&mut ranked, ×tamps, 30); + // After 3650 days with half_life=30, score should be essentially 0 + assert!(ranked[0].1 < 0.001, "score was {}", ranked[0].1); + } + + #[test] + fn temporal_decay_small_half_life() { + // Very small half_life (1 day), age = 7 days → 2^(-7) ≈ 0.0078 + let now = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap_or_default() + .as_secs() + .cast_signed(); + let ts = now - 7 * 86400i64; + let mut ranked = vec![(MessageId(1), 1.0f64)]; + let mut timestamps = std::collections::HashMap::new(); + timestamps.insert(MessageId(1), ts); + apply_temporal_decay(&mut ranked, ×tamps, 1); + assert!(ranked[0].1 < 0.01, "score was {}", ranked[0].1); + } + + #[test] + fn mmr_lambda_zero_max_diversity() { + // lambda=0 → pure diversity: second item should be most dissimilar + let ranked = vec![ + 
(MessageId(1), 1.0f64), // selected first (always highest relevance) + (MessageId(2), 0.9f64), // orthogonal to 1 + (MessageId(3), 0.85f64), // parallel to 1 (max_sim=1.0) + ]; + let mut vectors = std::collections::HashMap::new(); + vectors.insert(MessageId(1), vec![1.0f32, 0.0]); + vectors.insert(MessageId(2), vec![0.0f32, 1.0]); // orthogonal + vectors.insert(MessageId(3), vec![1.0f32, 0.0]); // same direction + let result = apply_mmr(&ranked, &vectors, 0.0, 3); + assert_eq!(result.len(), 3); + // After 1 is selected: mmr(2) = 0 - (1-0)*0 = 0, mmr(3) = 0 - 1*1 = -1 → 2 wins + assert_eq!(result[1].0, MessageId(2)); + } + + #[test] + fn mmr_lambda_one_pure_relevance() { + // lambda=1 → pure relevance, should pick in relevance order + let ranked = vec![ + (MessageId(1), 1.0f64), + (MessageId(2), 0.8f64), + (MessageId(3), 0.6f64), + ]; + let mut vectors = std::collections::HashMap::new(); + vectors.insert(MessageId(1), vec![1.0f32, 0.0]); + vectors.insert(MessageId(2), vec![0.0f32, 1.0]); + vectors.insert(MessageId(3), vec![0.5f32, 0.5]); + let result = apply_mmr(&ranked, &vectors, 1.0, 3); + assert_eq!(result.len(), 3); + assert_eq!(result[0].0, MessageId(1)); + assert_eq!(result[1].0, MessageId(2)); + assert_eq!(result[2].0, MessageId(3)); + } + + #[test] + fn mmr_limit_zero_returns_empty() { + let ranked = vec![(MessageId(1), 1.0f64), (MessageId(2), 0.8f64)]; + let mut vectors = std::collections::HashMap::new(); + vectors.insert(MessageId(1), vec![1.0f32, 0.0]); + vectors.insert(MessageId(2), vec![0.0f32, 1.0]); + let result = apply_mmr(&ranked, &vectors, 0.7, 0); + assert!(result.is_empty()); + } + + #[test] + fn mmr_duplicate_vectors_penalizes_second() { + // Two items with identical embeddings: second should be heavily penalized + let ranked = vec![ + (MessageId(1), 1.0f64), + (MessageId(2), 1.0f64), // same relevance, same direction + (MessageId(3), 0.9f64), // orthogonal, lower relevance + ]; + let mut vectors = std::collections::HashMap::new(); + 
vectors.insert(MessageId(1), vec![1.0f32, 0.0]); + vectors.insert(MessageId(2), vec![1.0f32, 0.0]); // duplicate + vectors.insert(MessageId(3), vec![0.0f32, 1.0]); // orthogonal + let result = apply_mmr(&ranked, &vectors, 0.5, 3); + assert_eq!(result.len(), 3); + assert_eq!(result[0].0, MessageId(1)); + // msg3 (orthogonal) should be preferred over msg2 (duplicate) with lambda=0.5 + assert_eq!(result[1].0, MessageId(3)); + } + // Priority 3: proptest use proptest::prelude::*; diff --git a/crates/zeph-memory/src/sqlite/messages.rs b/crates/zeph-memory/src/sqlite/messages.rs index f9ef875a..4791f5f1 100644 --- a/crates/zeph-memory/src/sqlite/messages.rs +++ b/crates/zeph-memory/src/sqlite/messages.rs @@ -298,6 +298,35 @@ impl SqliteStore { Ok(rows) } + /// Fetch creation timestamps (Unix epoch seconds) for the given message IDs. + /// + /// Messages without a `created_at` column fall back to 0. + /// + /// # Errors + /// + /// Returns an error if the query fails. + pub async fn message_timestamps( + &self, + ids: &[MessageId], + ) -> Result, MemoryError> { + if ids.is_empty() { + return Ok(std::collections::HashMap::new()); + } + + let placeholders: String = ids.iter().map(|_| "?").collect::>().join(","); + let query = format!( + "SELECT id, COALESCE(CAST(strftime('%s', created_at) AS INTEGER), 0) \ + FROM messages WHERE id IN ({placeholders})" + ); + let mut q = sqlx::query_as::<_, (MessageId, i64)>(&query); + for &id in ids { + q = q.bind(id); + } + + let rows = q.fetch_all(&self.pool).await?; + Ok(rows.into_iter().collect()) + } + /// Load a range of messages after a given message ID. 
/// # Errors diff --git a/crates/zeph-skills/README.md b/crates/zeph-skills/README.md index baad07d1..855441ba 100644 --- a/crates/zeph-skills/README.md +++ b/crates/zeph-skills/README.md @@ -22,11 +22,21 @@ Parses SKILL.md files (YAML frontmatter + markdown body) from the `skills/` dire | `evolution` | Self-learning skill generation and refinement | | `trust` | `SkillTrust`, `TrustLevel` — skill trust scoring | | `watcher` | Filesystem watcher for skill hot-reload | -| `prompt` | Skill-to-prompt formatting | +| `prompt` | Skill-to-prompt formatting (`full`, `compact`, `auto` modes via `SkillPromptMode`) | | `manager` | `SkillManager` — install, remove, verify, and list external skills (`~/.config/zeph/skills/`) | **Re-exports:** `SkillError`, `SkillTrust`, `TrustLevel`, `compute_skill_hash` +## Prompt modes + +The `prompt_mode` config option (`[skills]` section) controls how skills are serialized into the LLM system prompt: + +| Mode | Description | +|------|-------------| +| `full` | Full XML format with complete skill body | +| `compact` | Condensed XML with name and description only | +| `auto` | Selects `compact` when context budget is below threshold, `full` otherwise (default) | + ## Installation ```bash diff --git a/crates/zeph-skills/src/prompt.rs b/crates/zeph-skills/src/prompt.rs index 19a99270..0d8f45e6 100644 --- a/crates/zeph-skills/src/prompt.rs +++ b/crates/zeph-skills/src/prompt.rs @@ -127,6 +127,28 @@ pub fn wrap_quarantined(skill_name: &str, body: &str) -> String { ) } +/// Format skills as a compact single-line XML list (name + description only). +/// +/// Used when the model context window is small (< 8192 tokens) to save space. 
+#[must_use] +pub fn format_skills_prompt_compact(skills: &[Skill]) -> String { + if skills.is_empty() { + return String::new(); + } + + let mut out = String::from("\n"); + for skill in skills { + let _ = writeln!( + out, + " ", + skill.name(), + skill.description(), + ); + } + out.push_str(""); + out +} + #[must_use] pub fn format_skills_catalog(skills: &[Skill]) -> String { if skills.is_empty() { @@ -381,6 +403,42 @@ mod tests { assert!(!output.contains("Inject ")); } + #[test] + fn compact_empty_returns_empty_string() { + let empty: &[Skill] = &[]; + assert_eq!(format_skills_prompt_compact(empty), ""); + } + + #[test] + fn compact_single_skill_no_path() { + let skills = vec![make_skill("my-skill", "Does things.", "body")]; + let output = format_skills_prompt_compact(&skills); + assert!(output.starts_with("")); + assert!(output.ends_with("")); + assert!(output.contains("name=\"my-skill\"")); + assert!(output.contains("description=\"Does things.\"")); + assert!(!output.contains("path="), "path must not be present"); + } + + #[test] + fn compact_multiple_skills() { + let skills = vec![ + make_skill("a", "desc a", "body a"), + make_skill("b", "desc b", "body b"), + ]; + let output = format_skills_prompt_compact(&skills); + assert!(output.contains("name=\"a\"")); + assert!(output.contains("name=\"b\"")); + assert!(!output.contains("path=")); + } + + #[test] + fn compact_mode_attribute_present() { + let skills = vec![make_skill("x", "y", "z")]; + let output = format_skills_prompt_compact(&skills); + assert!(output.contains("mode=\"compact\"")); + } + #[test] fn format_skills_catalog_empty() { let empty: &[Skill] = &[]; diff --git a/docs/src/advanced/context.md b/docs/src/advanced/context.md index 8cfee952..c56d5db0 100644 --- a/docs/src/advanced/context.md +++ b/docs/src/advanced/context.md @@ -68,7 +68,12 @@ Available tokens (after reserving 20% for response) are split proportionally. 
Wh ## Semantic Recall Injection -When semantic memory is enabled, the agent queries Qdrant for messages relevant to the current user query. Results are injected as transient system messages (prefixed with `[semantic recall]`) that are: +When semantic memory is enabled, the agent queries the vector backend for messages relevant to the current user query. Two optional post-processing stages improve result quality: + +- **Temporal decay** — exponential score attenuation based on message age. Configure via `memory.semantic.temporal_decay_enabled` and `temporal_decay_half_life_days` (default: 30). +- **MMR re-ranking** — Maximal Marginal Relevance diversifies results by penalizing similarity to already-selected items. Configure via `memory.semantic.mmr_enabled` and `mmr_lambda` (default: 0.7, range 0.0-1.0). + +Results are injected as transient system messages (prefixed with `[semantic recall]`) that are: - Removed and re-injected on every turn (never stale) - Not persisted to SQLite @@ -103,14 +108,17 @@ Before invoking the LLM for compaction, Zeph scans messages outside the protecte - Pruned parts are persisted to SQLite before clearing, so pruning state survives session restarts - The `tool_output_prunes` metric tracks how many parts were pruned -### Tier 2: LLM Compaction (Fallback) +### Tier 2: Chunked LLM Compaction (Fallback) + +If Tier 1 does not free enough tokens, adaptive chunked compaction runs: -If Tier 1 does not free enough tokens, the standard LLM compaction runs: +1. Middle messages (between system prompt and last N recent) are split into ~4096-token chunks +2. Chunks are summarized in parallel via `futures::stream::buffer_unordered(4)` — up to 4 concurrent LLM calls +3. Partial summaries are merged into a final summary via a second LLM pass +4. All middle messages are replaced with a single summary message +5. Last `compaction_preserve_tail` messages (default: 4) are always preserved -1. 
Middle messages (between system prompt and last N recent) are extracted -2. Sent to the LLM with a structured summarization prompt -3. Replaced with a single summary message -4. Last `compaction_preserve_tail` messages (default: 4) are always preserved +If all messages fit in a single chunk, or if chunked summarization fails, the system falls back to a single-pass summarization over the full message range. Both tiers are idempotent and run automatically during the agent loop. @@ -128,9 +136,26 @@ When `tools.summarize_output = true`, long tool outputs are sent through the LLM export ZEPH_TOOLS_SUMMARIZE_OUTPUT=true ``` +## Skill Prompt Modes + +The `skills.prompt_mode` setting controls how matched skills are rendered in the system prompt: + +| Mode | Behavior | +|------|----------| +| `full` | Full XML skill bodies with instructions, examples, and references | +| `compact` | Condensed XML with name and description only (~80% smaller) | +| `auto` (default) | Selects `compact` when the remaining context budget is below 8192 tokens, `full` otherwise | + +```toml +[skills] +prompt_mode = "auto" # "full", "compact", or "auto" +``` + +`compact` mode is useful for small context windows or when many skills are active. It preserves enough information for the model to select the right skill while minimizing token consumption. + ## Progressive Skill Loading -Skills matched by embedding similarity (top-K) are injected with their full body. Remaining skills are listed in a description-only `` catalog — giving the model awareness of all capabilities while consuming minimal tokens. +Skills matched by embedding similarity (top-K) are injected with their full body (or compact summary, depending on `prompt_mode`). Remaining skills are listed in a description-only `` catalog — giving the model awareness of all capabilities while consuming minimal tokens. 
## ZEPH.md Project Config @@ -151,4 +176,8 @@ Found configs are concatenated (global first, then ancestors from root to cwd) a | `ZEPH_MEMORY_COMPACTION_PRESERVE_TAIL` | Messages preserved during compaction | `4` | | `ZEPH_MEMORY_PRUNE_PROTECT_TOKENS` | Tokens protected from Tier 1 tool output pruning | `40000` | | `ZEPH_MEMORY_CROSS_SESSION_SCORE_THRESHOLD` | Minimum relevance score for cross-session memory results | `0.35` | +| `ZEPH_MEMORY_SEMANTIC_TEMPORAL_DECAY_ENABLED` | Enable temporal decay scoring | `false` | +| `ZEPH_MEMORY_SEMANTIC_TEMPORAL_DECAY_HALF_LIFE_DAYS` | Half-life for temporal decay | `30` | +| `ZEPH_MEMORY_SEMANTIC_MMR_ENABLED` | Enable MMR re-ranking | `false` | +| `ZEPH_MEMORY_SEMANTIC_MMR_LAMBDA` | MMR relevance-diversity trade-off | `0.7` | | `ZEPH_TOOLS_SUMMARIZE_OUTPUT` | Enable LLM-based tool output summarization | `false` | diff --git a/docs/src/architecture/token-efficiency.md b/docs/src/architecture/token-efficiency.md index 009fb110..9b3d42d2 100644 --- a/docs/src/architecture/token-efficiency.md +++ b/docs/src/architecture/token-efficiency.md @@ -85,7 +85,7 @@ Zeph estimates token counts using a `chars / 4` heuristic instead of the naive ` ### Two-Tier Context Pruning -Long conversations accumulate tool outputs that consume significant context space. Zeph uses a two-tier strategy: Tier 1 selectively prunes old tool outputs (cheap, no LLM call), and Tier 2 falls back to full LLM compaction only when Tier 1 is insufficient. See [Context Engineering](../advanced/context.md) for details. +Long conversations accumulate tool outputs that consume significant context space. Zeph uses a two-tier strategy: Tier 1 selectively prunes old tool outputs (cheap, no LLM call), and Tier 2 falls back to adaptive chunked LLM compaction — splitting messages into ~4096-token chunks, summarizing up to 4 in parallel, and merging results. See [Context Engineering](../advanced/context.md) for details. 
## Configuration diff --git a/docs/src/concepts/memory.md b/docs/src/concepts/memory.md index 1f3885aa..666c8045 100644 --- a/docs/src/concepts/memory.md +++ b/docs/src/concepts/memory.md @@ -21,7 +21,28 @@ Two vector backends are available: Semantic memory uses hybrid search — vector similarity combined with SQLite FTS5 keyword search — to improve recall quality. When the vector backend is unavailable, Zeph falls back to keyword-only search. -Setup with embedded SQLite vectors (no external dependencies): +### Result Quality: MMR and Temporal Decay + +Two post-processing stages improve recall quality beyond raw similarity: + +- **Temporal decay** attenuates scores based on message age. A configurable half-life (default: 30 days) ensures recent context is preferred over stale information. Scores decay exponentially: a message at 1 half-life gets 50% weight, at 2 half-lives 25%, etc. +- **MMR re-ranking** (Maximal Marginal Relevance) reduces redundancy in results by penalizing candidates too similar to already-selected items. The `mmr_lambda` parameter (default: 0.7) controls the relevance-diversity trade-off: higher values favor relevance, lower values favor diversity. + +Both are disabled by default. Enable them in `[memory.semantic]`: + +```toml +[memory.semantic] +enabled = true +recall_limit = 5 +temporal_decay_enabled = true +temporal_decay_half_life_days = 30 +mmr_enabled = true +mmr_lambda = 0.7 +``` + +### Quick Setup + +Embedded SQLite vectors (no external dependencies): ```toml [memory] @@ -32,7 +53,7 @@ enabled = true recall_limit = 5 ``` -For Qdrant (production): +Qdrant (production): ```toml [memory] @@ -58,7 +79,7 @@ When `context_budget_tokens` is set (default: 0 = unlimited), Zeph allocates the A two-tier pruning system manages overflow: 1. **Tool output pruning** (cheap) — replaces old tool outputs with short placeholders -2. **LLM compaction** (fallback) — summarizes middle messages when pruning is not enough +2. 
**Chunked LLM compaction** (fallback) — splits middle messages into ~4096-token chunks, summarizes them in parallel (up to 4 concurrent LLM calls), then merges partial summaries. Falls back to single-pass if any chunk fails. Both tiers run automatically. See [Context Engineering](../advanced/context.md) for tuning options. diff --git a/docs/src/concepts/skills.md b/docs/src/concepts/skills.md index bf809de7..87d3746c 100644 --- a/docs/src/concepts/skills.md +++ b/docs/src/concepts/skills.md @@ -38,6 +38,7 @@ Use `/skills` in chat to see active skills and their usage statistics. - **Hot-reload**: edit a `SKILL.md` file, changes apply without restart - **Two matching backends**: in-memory (default) or Qdrant (faster startup with many skills, delta sync via BLAKE3 hash) - **Secret gating**: skills that declare `x-requires-secrets` in their frontmatter are excluded from the prompt if the required secrets are not present in the vault. This prevents the agent from attempting to use a skill that would fail due to missing credentials +- **Compact prompt mode**: when context budget is tight, `skills.prompt_mode = "auto"` (default) switches to a condensed XML format that includes only name and description — ~80% smaller than full bodies. Force with `"compact"` or disable with `"full"`. See [Context Engineering — Skill Prompt Modes](../advanced/context.md#skill-prompt-modes) ## External Skill Management diff --git a/docs/src/guides/semantic-memory.md b/docs/src/guides/semantic-memory.md index a641d536..8050dd7d 100644 --- a/docs/src/guides/semantic-memory.md +++ b/docs/src/guides/semantic-memory.md @@ -73,6 +73,30 @@ keyword_weight = 0.3 # Weight for FTS5 keyword relevance When Qdrant is unavailable, only keyword search runs (effectively `keyword_weight = 1.0`). 
+## Temporal Decay + +Enable time-based score attenuation to prefer recent context over stale information: + +```toml +[memory.semantic] +temporal_decay_enabled = true +temporal_decay_half_life_days = 30 # Score halves every 30 days +``` + +Scores decay exponentially: at 1 half-life a message retains 50% of its original score, at 2 half-lives 25%, and so on. Adjust `temporal_decay_half_life_days` based on how quickly your project context changes. + +## MMR Re-ranking + +Enable Maximal Marginal Relevance to diversify recall results and reduce redundancy: + +```toml +[memory.semantic] +mmr_enabled = true +mmr_lambda = 0.7 # 0.0 = max diversity, 1.0 = pure relevance +``` + +MMR iteratively selects results that are both relevant to the query and dissimilar to already-selected items. The default `mmr_lambda = 0.7` works well for most use cases. Lower it if you see too many semantically similar results in recall. + ## Storage Architecture | Store | Purpose | diff --git a/docs/src/reference/configuration.md b/docs/src/reference/configuration.md index 894bc534..e48dcd9e 100644 --- a/docs/src/reference/configuration.md +++ b/docs/src/reference/configuration.md @@ -27,6 +27,7 @@ Priority: `--config` > `ZEPH_CONFIG` > `config/default.toml`. 
|-------|-----------| | `memory.history_limit` | <= 10,000 | | `memory.context_budget_tokens` | <= 1,000,000 (when > 0) | +| `memory.token_safety_margin` | > 0.0 | | `agent.max_tool_iterations` | <= 100 | | `a2a.rate_limit` | > 0 | | `gateway.rate_limit` | > 0 | @@ -87,6 +88,7 @@ model = "whisper-1" paths = ["./skills"] max_active_skills = 5 # Top-K skills per query via embedding similarity disambiguation_threshold = 0.05 # LLM disambiguation when top-2 score delta < threshold (0.0 = disabled) +prompt_mode = "auto" # Skill prompt format: "full", "compact", or "auto" (default: "auto") [memory] sqlite_path = "./data/zeph.db" @@ -104,6 +106,10 @@ redact_credentials = true # Scrub credential patterns from LLM context (defa [memory.semantic] enabled = false # Enable semantic search via Qdrant recall_limit = 5 # Number of semantically relevant messages to inject +temporal_decay_enabled = false # Attenuate scores by message age (default: false) +temporal_decay_half_life_days = 30 # Half-life for temporal decay in days (default: 30) +mmr_enabled = false # MMR re-ranking for result diversity (default: false) +mmr_lambda = 0.7 # MMR relevance-diversity trade-off, 0.0-1.0 (default: 0.7) [tools] enabled = true @@ -249,6 +255,10 @@ Field resolution: per-provider value → parent section (`[llm]`, `[llm.cloud]`) | `ZEPH_MEMORY_REDACT_CREDENTIALS` | Scrub credentials from LLM context (default: true) | | `ZEPH_MEMORY_SEMANTIC_ENABLED` | Enable semantic memory (default: false) | | `ZEPH_MEMORY_RECALL_LIMIT` | Max semantically relevant messages to recall (default: 5) | +| `ZEPH_MEMORY_SEMANTIC_TEMPORAL_DECAY_ENABLED` | Enable temporal decay scoring (default: false) | +| `ZEPH_MEMORY_SEMANTIC_TEMPORAL_DECAY_HALF_LIFE_DAYS` | Half-life for temporal decay in days (default: 30) | +| `ZEPH_MEMORY_SEMANTIC_MMR_ENABLED` | Enable MMR re-ranking (default: false) | +| `ZEPH_MEMORY_SEMANTIC_MMR_LAMBDA` | MMR relevance-diversity trade-off (default: 0.7) | | `ZEPH_SKILLS_MAX_ACTIVE` | Max skills 
per query via embedding match (default: 5) | | `ZEPH_AGENT_MAX_TOOL_ITERATIONS` | Max tool loop iterations per response (default: 10) | | `ZEPH_TOOLS_SUMMARIZE_OUTPUT` | Enable LLM-based tool output summarization (default: false) |