Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,16 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
## [Unreleased]

### Added
- `autosave_assistant` and `autosave_min_length` config fields in `MemoryConfig` — assistant responses skip embedding when disabled (#748)
- `SemanticMemory::save_only()` — persist message to SQLite without generating a vector embedding (#748)
- `ResponseCache` in `zeph-memory` — SQLite-backed LLM response cache with blake3 key hashing and TTL expiry (#750)
- `response_cache_enabled` and `response_cache_ttl_secs` config fields in `LlmConfig` (#750)
- Background `cleanup_expired()` task for response cache (runs every 10 minutes) (#750)
- `ZEPH_MEMORY_AUTOSAVE_ASSISTANT`, `ZEPH_MEMORY_AUTOSAVE_MIN_LENGTH` env overrides (#748)
- `ZEPH_LLM_RESPONSE_CACHE_ENABLED`, `ZEPH_LLM_RESPONSE_CACHE_TTL_SECS` env overrides (#750)
- `MemorySnapshot`, `export_snapshot()`, `import_snapshot()` in `zeph-memory/src/snapshot.rs` (#749)
- `zeph memory export <path>` and `zeph memory import <path>` CLI subcommands (#749)
- SQLite migration `012_response_cache.sql` for the response cache table (#750)
- Temporal decay scoring in `SemanticMemory::recall()` — time-based score attenuation with configurable half-life (#745)
- MMR (Maximal Marginal Relevance) re-ranking in `SemanticMemory::recall()` — post-processing for result diversity (#744)
- Compact XML skills prompt format (`format_skills_prompt_compact`) for low-budget contexts (#747)
Expand Down
4 changes: 2 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ slack = ["zeph-channels/slack"]
index = ["dep:zeph-index", "zeph-core/index"]
gateway = ["dep:zeph-gateway"]
daemon = ["zeph-core/daemon"]
scheduler = ["dep:zeph-scheduler", "dep:serde_json"]
scheduler = ["dep:zeph-scheduler"]
otel = ["dep:opentelemetry", "dep:opentelemetry_sdk", "dep:opentelemetry-otlp", "dep:tracing-opentelemetry"]
pdf = ["zeph-memory/pdf"]
mock = ["zeph-llm/mock", "zeph-memory/mock"]
Expand Down Expand Up @@ -170,7 +170,7 @@ zeph-gateway = { workspace = true, optional = true }
zeph-scheduler = { workspace = true, optional = true }
zeph-tui = { workspace = true, optional = true }
reqwest = { workspace = true, optional = true, features = ["rustls"] }
serde_json = { workspace = true, optional = true }
serde_json.workspace = true

[dev-dependencies]
serial_test.workspace = true
Expand Down
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,9 @@ zeph --tui # run with TUI dashboard

| | |
|---|---|
| **Hybrid inference** | Ollama, Claude, OpenAI, Candle (GGUF), any OpenAI-compatible API. Multi-model orchestrator with fallback chains |
| **Hybrid inference** | Ollama, Claude, OpenAI, Candle (GGUF), any OpenAI-compatible API. Multi-model orchestrator with fallback chains. Response cache with blake3 hashing and TTL |
| **Skills-first architecture** | YAML+Markdown skill files with semantic matching, self-learning evolution, 4-tier trust model, and compact prompt mode for small-context models |
| **Semantic memory** | SQLite + Qdrant (or embedded SQLite vector search) with MMR re-ranking, temporal decay scoring, adaptive chunked compaction, credential scrubbing, cross-session recall, and vector retrieval |
| **Semantic memory** | SQLite + Qdrant (or embedded SQLite vector search) with MMR re-ranking, temporal decay scoring, adaptive chunked compaction, credential scrubbing, cross-session recall, vector retrieval, autosave assistant responses, and snapshot export/import |
| **Multi-channel I/O** | CLI, Telegram, Discord, Slack, TUI — all with streaming. Vision and speech-to-text input |
| **Protocols** | MCP client (stdio + HTTP), A2A agent-to-agent communication, sub-agent orchestration |
| **Defense-in-depth** | Shell sandbox, tool permissions, secret redaction, SSRF protection, skill trust quarantine, audit logging |
Expand Down
22 changes: 22 additions & 0 deletions config/default.toml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,10 @@ max_tokens = 4096
# embedding_model = "text-embedding-3-small"
# reasoning_effort = "medium" # low, medium, high (for reasoning models)

# LLM response cache (SQLite-backed, blake3 key hashing)
# response_cache_enabled = false
# response_cache_ttl_secs = 3600

# Speech-to-text provider (Whisper API)
# [llm.stt]
# provider = "whisper"
Expand Down Expand Up @@ -140,6 +144,8 @@ max_tokens = 4096
paths = ["./skills"]
# Maximum number of skills to inject into context per query (embedding-based selection)
max_active_skills = 5
# Prompt mode: "full" (inject full SKILL.md), "compact" (name+description only), "auto" (compact if budget < 8192)
# prompt_mode = "auto"
# Minimum score delta for skill disambiguation (0.0-1.0)
# disambiguation_threshold = 0.05

Expand Down Expand Up @@ -190,6 +196,16 @@ compaction_preserve_tail = 6
prune_protect_tokens = 40000
# Minimum relevance score for cross-session memory results (0.0-1.0)
cross_session_score_threshold = 0.35
# Vector backend: "qdrant" (external) or "sqlite" (embedded, zero-dependency)
# vector_backend = "qdrant"
# Token safety margin multiplier for compaction budget (validated: must be >= 1.0)
# token_safety_margin = 1.0
# Redact credentials from LLM context before sending
# redact_credentials = true
# Auto-save assistant responses to semantic memory
# autosave_assistant = false
# Minimum character length for autosave (shorter responses skip embedding)
# autosave_min_length = 20

[memory.semantic]
# Enable semantic memory with vector search
Expand All @@ -199,6 +215,12 @@ recall_limit = 5
# Hybrid search weights (vector + FTS5 keyword). Must sum to 1.0.
vector_weight = 0.7
keyword_weight = 0.3
# Temporal decay: penalize older memories by age
# temporal_decay_enabled = false
# temporal_decay_half_life_days = 30
# MMR re-ranking: diversify recall results
# mmr_enabled = false
# mmr_lambda = 0.7

# Code RAG: AST-based code indexing and hybrid retrieval
# Requires Qdrant and tree-sitter grammars (feature "index", not enabled by default)
Expand Down
2 changes: 2 additions & 0 deletions crates/zeph-core/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@ Key `MemoryConfig` fields (TOML section `[memory]`):
| `vector_backend` | `"qdrant"` / `"sqlite"` | `"qdrant"` | Vector search backend |
| `token_safety_margin` | f32 | `1.0` | Safety multiplier for token budget estimation (validated: must be >= 1.0) |
| `redact_credentials` | bool | `true` | Scrub secrets and paths before LLM context injection |
| `autosave_assistant` | bool | `false` | Persist assistant responses to semantic memory automatically |
| `autosave_min_length` | usize | `20` | Minimum response length (chars) to trigger autosave |

```toml
[agent]
Expand Down
16 changes: 16 additions & 0 deletions crates/zeph-core/src/agent/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,22 @@ use zeph_memory::semantic::SemanticMemory;
use zeph_skills::watcher::SkillEvent;

impl<C: Channel> Agent<C> {
/// Configures autosave of assistant responses: whether they are persisted
/// to semantic memory at all, and the minimum length (in chars) below which
/// a response is skipped.
#[must_use]
pub fn with_autosave_config(mut self, autosave_assistant: bool, min_length: usize) -> Self {
    let mem = &mut self.memory_state;
    mem.autosave_min_length = min_length;
    mem.autosave_assistant = autosave_assistant;
    self
}

/// Attaches a shared LLM response cache to this agent.
#[must_use]
pub fn with_response_cache(mut self, cache: std::sync::Arc<zeph_memory::ResponseCache>) -> Self {
    // Equivalent to assigning `Some(cache)`; any previous value is dropped.
    self.response_cache.replace(cache);
    self
}

#[must_use]
pub fn with_stt(mut self, stt: Box<dyn zeph_llm::stt::SpeechToText>) -> Self {
self.stt = Some(stt);
Expand Down
10 changes: 10 additions & 0 deletions crates/zeph-core/src/agent/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,8 @@ pub(super) struct MemoryState {
pub(super) recall_limit: usize,
pub(super) summarization_threshold: usize,
pub(super) cross_session_score_threshold: f32,
pub(super) autosave_assistant: bool,
pub(super) autosave_min_length: usize,
}

pub(super) struct SkillState {
Expand Down Expand Up @@ -158,6 +160,7 @@ pub struct Agent<C: Channel> {
update_notify_rx: Option<mpsc::Receiver<String>>,
#[allow(dead_code)]
pub(crate) subagent_manager: Option<crate::subagent::SubAgentManager>,
pub(super) response_cache: Option<std::sync::Arc<zeph_memory::ResponseCache>>,
}

impl<C: Channel> Agent<C> {
Expand Down Expand Up @@ -199,6 +202,8 @@ impl<C: Channel> Agent<C> {
recall_limit: 5,
summarization_threshold: 50,
cross_session_score_threshold: 0.35,
autosave_assistant: false,
autosave_min_length: 20,
},
skill_state: SkillState {
registry,
Expand Down Expand Up @@ -262,6 +267,7 @@ impl<C: Channel> Agent<C> {
stt: None,
update_notify_rx: None,
subagent_manager: None,
response_cache: None,
}
}

Expand Down Expand Up @@ -1050,6 +1056,10 @@ pub(super) mod agent_tests {
/// Returns a snapshot copy of every message recorded by this mock channel.
pub(crate) fn sent_messages(&self) -> Vec<String> {
    let guard = self.sent.lock().unwrap();
    guard.to_vec()
}

/// Returns a snapshot copy of every streamed chunk recorded by this mock channel.
pub(crate) fn sent_chunks(&self) -> Vec<String> {
    let guard = self.chunks.lock().unwrap();
    guard.to_vec()
}
}

impl Channel for MockChannel {
Expand Down
Loading
Loading