Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,16 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
## [Unreleased]

### Added
- `autosave_assistant` and `autosave_min_length` config fields in `MemoryConfig` — assistant responses skip embedding when disabled (#748)
- `SemanticMemory::save_only()` — persist message to SQLite without generating a vector embedding (#748)
- `ResponseCache` in `zeph-memory` — SQLite-backed LLM response cache with blake3 key hashing and TTL expiry (#750)
- `response_cache_enabled` and `response_cache_ttl_secs` config fields in `LlmConfig` (#750)
- Background `cleanup_expired()` task for response cache (runs every 10 minutes) (#750)
- `ZEPH_MEMORY_AUTOSAVE_ASSISTANT`, `ZEPH_MEMORY_AUTOSAVE_MIN_LENGTH` env overrides (#748)
- `ZEPH_LLM_RESPONSE_CACHE_ENABLED`, `ZEPH_LLM_RESPONSE_CACHE_TTL_SECS` env overrides (#750)
- `MemorySnapshot`, `export_snapshot()`, `import_snapshot()` in `zeph-memory/src/snapshot.rs` (#749)
- `zeph memory export <path>` and `zeph memory import <path>` CLI subcommands (#749)
- SQLite migration `012_response_cache.sql` for the response cache table (#750)
- Temporal decay scoring in `SemanticMemory::recall()` — time-based score attenuation with configurable half-life (#745)
- MMR (Maximal Marginal Relevance) re-ranking in `SemanticMemory::recall()` — post-processing for result diversity (#744)
- Compact XML skills prompt format (`format_skills_prompt_compact`) for low-budget contexts (#747)
Expand Down
4 changes: 2 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ slack = ["zeph-channels/slack"]
index = ["dep:zeph-index", "zeph-core/index"]
gateway = ["dep:zeph-gateway"]
daemon = ["zeph-core/daemon"]
scheduler = ["dep:zeph-scheduler", "dep:serde_json"]
scheduler = ["dep:zeph-scheduler"]
otel = ["dep:opentelemetry", "dep:opentelemetry_sdk", "dep:opentelemetry-otlp", "dep:tracing-opentelemetry"]
pdf = ["zeph-memory/pdf"]
mock = ["zeph-llm/mock", "zeph-memory/mock"]
Expand Down Expand Up @@ -170,7 +170,7 @@ zeph-gateway = { workspace = true, optional = true }
zeph-scheduler = { workspace = true, optional = true }
zeph-tui = { workspace = true, optional = true }
reqwest = { workspace = true, optional = true, features = ["rustls"] }
serde_json = { workspace = true, optional = true }
serde_json.workspace = true

[dev-dependencies]
serial_test.workspace = true
Expand Down
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,9 @@ zeph --tui # run with TUI dashboard

| | |
|---|---|
| **Hybrid inference** | Ollama, Claude, OpenAI, Candle (GGUF), any OpenAI-compatible API. Multi-model orchestrator with fallback chains |
| **Hybrid inference** | Ollama, Claude, OpenAI, Candle (GGUF), any OpenAI-compatible API. Multi-model orchestrator with fallback chains. Response cache with blake3 hashing and TTL |
| **Skills-first architecture** | YAML+Markdown skill files with semantic matching, self-learning evolution, 4-tier trust model, and compact prompt mode for small-context models |
| **Semantic memory** | SQLite + Qdrant (or embedded SQLite vector search) with MMR re-ranking, temporal decay scoring, adaptive chunked compaction, credential scrubbing, cross-session recall, and vector retrieval |
| **Semantic memory** | SQLite + Qdrant (or embedded SQLite vector search) with MMR re-ranking, temporal decay scoring, adaptive chunked compaction, credential scrubbing, cross-session recall, vector retrieval, autosave assistant responses, and snapshot export/import |
| **Multi-channel I/O** | CLI, Telegram, Discord, Slack, TUI — all with streaming. Vision and speech-to-text input |
| **Protocols** | MCP client (stdio + HTTP), A2A agent-to-agent communication, sub-agent orchestration |
| **Defense-in-depth** | Shell sandbox, tool permissions, secret redaction, SSRF protection, skill trust quarantine, audit logging |
Expand Down
22 changes: 22 additions & 0 deletions config/default.toml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,10 @@ max_tokens = 4096
# embedding_model = "text-embedding-3-small"
# reasoning_effort = "medium" # low, medium, high (for reasoning models)

# LLM response cache (SQLite-backed, blake3 key hashing)
# response_cache_enabled = false
# response_cache_ttl_secs = 3600

# Speech-to-text provider (Whisper API)
# [llm.stt]
# provider = "whisper"
Expand Down Expand Up @@ -140,6 +144,8 @@ max_tokens = 4096
paths = ["./skills"]
# Maximum number of skills to inject into context per query (embedding-based selection)
max_active_skills = 5
# Prompt mode: "full" (inject full SKILL.md), "compact" (name+description only), "auto" (compact if budget < 8192)
# prompt_mode = "auto"
# Minimum score delta for skill disambiguation (0.0-1.0)
# disambiguation_threshold = 0.05

Expand Down Expand Up @@ -190,6 +196,16 @@ compaction_preserve_tail = 6
prune_protect_tokens = 40000
# Minimum relevance score for cross-session memory results (0.0-1.0)
cross_session_score_threshold = 0.35
# Vector backend: "qdrant" (external) or "sqlite" (embedded, zero-dependency)
# vector_backend = "qdrant"
# Token safety margin multiplier for compaction budget (validated: must be >= 1.0)
# token_safety_margin = 1.0
# Redact credentials from LLM context before sending
# redact_credentials = true
# Auto-save assistant responses to semantic memory
# autosave_assistant = false
# Minimum character length for autosave (shorter responses skip embedding)
# autosave_min_length = 20

[memory.semantic]
# Enable semantic memory with vector search
Expand All @@ -199,6 +215,12 @@ recall_limit = 5
# Hybrid search weights (vector + FTS5 keyword). Must sum to 1.0.
vector_weight = 0.7
keyword_weight = 0.3
# Temporal decay: penalize older memories by age
# temporal_decay_enabled = false
# temporal_decay_half_life_days = 30
# MMR re-ranking: diversify recall results
# mmr_enabled = false
# mmr_lambda = 0.7

# Code RAG: AST-based code indexing and hybrid retrieval
# Requires Qdrant and tree-sitter grammars (feature "index", not enabled by default)
Expand Down
2 changes: 2 additions & 0 deletions crates/zeph-core/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@ Key `MemoryConfig` fields (TOML section `[memory]`):
| `vector_backend` | `"qdrant"` / `"sqlite"` | `"qdrant"` | Vector search backend |
| `token_safety_margin` | f32 | `1.0` | Safety multiplier for token budget estimation (validated: must be >= 1.0) |
| `redact_credentials` | bool | `true` | Scrub secrets and paths before LLM context injection |
| `autosave_assistant` | bool | `false` | Persist assistant responses to semantic memory automatically |
| `autosave_min_length` | usize | `20` | Minimum response length (chars) to trigger autosave |

```toml
[agent]
Expand Down
16 changes: 16 additions & 0 deletions crates/zeph-core/src/agent/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,22 @@ use zeph_memory::semantic::SemanticMemory;
use zeph_skills::watcher::SkillEvent;

impl<C: Channel> Agent<C> {
/// Configures autosave of assistant responses: whether they are persisted
/// to semantic memory at all, and the minimum length (in chars) below which
/// a response is skipped.
#[must_use]
pub fn with_autosave_config(mut self, autosave_assistant: bool, min_length: usize) -> Self {
    let mem = &mut self.memory_state;
    mem.autosave_min_length = min_length;
    mem.autosave_assistant = autosave_assistant;
    self
}

/// Attaches a shared LLM response cache to this agent.
#[must_use]
pub fn with_response_cache(mut self, cache: std::sync::Arc<zeph_memory::ResponseCache>) -> Self {
    // Equivalent to assigning `Some(cache)`; any previous value is dropped.
    self.response_cache.replace(cache);
    self
}

#[must_use]
pub fn with_stt(mut self, stt: Box<dyn zeph_llm::stt::SpeechToText>) -> Self {
self.stt = Some(stt);
Expand Down
10 changes: 10 additions & 0 deletions crates/zeph-core/src/agent/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,8 @@ pub(super) struct MemoryState {
pub(super) recall_limit: usize,
pub(super) summarization_threshold: usize,
pub(super) cross_session_score_threshold: f32,
pub(super) autosave_assistant: bool,
pub(super) autosave_min_length: usize,
}

pub(super) struct SkillState {
Expand Down Expand Up @@ -158,6 +160,7 @@ pub struct Agent<C: Channel> {
update_notify_rx: Option<mpsc::Receiver<String>>,
#[allow(dead_code)]
pub(crate) subagent_manager: Option<crate::subagent::SubAgentManager>,
pub(super) response_cache: Option<std::sync::Arc<zeph_memory::ResponseCache>>,
}

impl<C: Channel> Agent<C> {
Expand Down Expand Up @@ -199,6 +202,8 @@ impl<C: Channel> Agent<C> {
recall_limit: 5,
summarization_threshold: 50,
cross_session_score_threshold: 0.35,
autosave_assistant: false,
autosave_min_length: 20,
},
skill_state: SkillState {
registry,
Expand Down Expand Up @@ -262,6 +267,7 @@ impl<C: Channel> Agent<C> {
stt: None,
update_notify_rx: None,
subagent_manager: None,
response_cache: None,
}
}

Expand Down Expand Up @@ -1050,6 +1056,10 @@ pub(super) mod agent_tests {
/// Returns a snapshot copy of every message recorded by this mock channel.
pub(crate) fn sent_messages(&self) -> Vec<String> {
    let guard = self.sent.lock().unwrap();
    guard.to_vec()
}

/// Returns a snapshot copy of every streamed chunk recorded by this mock channel.
pub(crate) fn sent_chunks(&self) -> Vec<String> {
    let guard = self.chunks.lock().unwrap();
    guard.to_vec()
}
}

impl Channel for MockChannel {
Expand Down
Loading
Loading