Commit ca4b9f1
fix: address final PR review recommendations (#579)
Addresses remaining issues from PR review comments:

1. Fixed async context manager (CRITICAL - Issue #1)
   - Added a warning that the default calls sync connect/disconnect
   - Documented that implementations with async clients should override
   - Included an example of a proper async implementation
   - Prevents event-loop blocking for async implementations

2. Updated connection logging (Issue #2)
   - Changed from INFO to DEBUG level for connect/disconnect
   - Added the class name to log messages for clarity
   - Emphasizes these are flag-only operations (not real connections)
   - Reduces log noise for the base implementation

3. Added batch size upper-bound warning (Issue #4)
   - Warns when batch_size > 10,000 (memory concerns)
   - Recommends 100-1000 for optimal performance
   - Helps prevent out-of-memory errors
   - Non-blocking (warning only, not an error)

4. Documented timeout limitation (Issue #3)
   - Clarified that the default implementation doesn't enforce the timeout
   - Added an example with signal-based timeout enforcement (Unix)
   - Added a simple example without a timeout
   - Subclasses can choose an appropriate timeout strategy

All changes are backward compatible and non-breaking.

Test results:
- 24/24 tests passing
- All linting checks pass (Ruff, format)

Signed-off-by: manavgup <manavg@gmail.com>
1 parent 877f155 commit ca4b9f1

File tree

1 file changed: +57 −20 lines


backend/vectordbs/vector_store.py

Lines changed: 57 additions & 20 deletions
@@ -87,7 +87,9 @@ def connect(self) -> None:
         """
         self._connected = True
         self._connection_metadata["connected_at"] = time.time()
-        logger.info("Connected to vector store (base implementation)")
+        logger.debug(
+            "Connection flag set for %s (base implementation - override for real connections)", self.__class__.__name__
+        )

     def disconnect(self) -> None:
         """Close connection to the vector database.
@@ -115,7 +117,10 @@ def disconnect(self) -> None:
         """
         self._connected = False
         self._connection_metadata["disconnected_at"] = time.time()
-        logger.info("Disconnected from vector store (base implementation)")
+        logger.debug(
+            "Connection flag cleared for %s (base implementation - override for real disconnections)",
+            self.__class__.__name__,
+        )

     @property
     def is_connected(self) -> bool:
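The two hunks above reduce log noise by moving to DEBUG-level, lazily %-formatted messages that name the concrete class. A minimal self-contained sketch of the same pattern (the `InMemoryVectorStore` class here is hypothetical, not from the repo):

```python
import logging
import time

logger = logging.getLogger(__name__)


class InMemoryVectorStore:
    """Toy stand-in for the base VectorStore (illustrative only)."""

    def __init__(self) -> None:
        self._connected = False
        self._connection_metadata: dict[str, float] = {}

    def connect(self) -> None:
        self._connected = True
        self._connection_metadata["connected_at"] = time.time()
        # DEBUG level + lazy %-formatting: the message is only rendered
        # when DEBUG is enabled, keeping INFO-level logs free of noise.
        logger.debug(
            "Connection flag set for %s (base implementation - override for real connections)",
            self.__class__.__name__,
        )

    def disconnect(self) -> None:
        self._connected = False
        self._connection_metadata["disconnected_at"] = time.time()
        logger.debug(
            "Connection flag cleared for %s (base implementation - override for real disconnections)",
            self.__class__.__name__,
        )
```

Passing the class name as a lazy `%s` argument (rather than f-string interpolation) means the string is only built when a DEBUG handler is actually attached.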
@@ -174,31 +179,33 @@ async def async_connection_context(self) -> AsyncIterator[None]:
         disconnecting connections that IT created. If a connection already exists,
         it leaves it intact on exit to avoid breaking calling code.

+        Warning:
+            The default implementation calls synchronous connect()/disconnect() methods,
+            which may block the event loop. Subclasses with async database clients should
+            override this method to use async connection methods instead.
+
         Usage:
             async with vector_store.async_connection_context():
                 await vector_store.async_add_documents(...)

-        Example with existing connection:
-            >>> store = VectorStore(settings)
-            >>> store.connect()  # Manual connection
-            >>> async with store.async_connection_context():
-            ...     await store.async_query(...)  # Uses existing connection
-            >>> # Connection still active after context exit
-            >>> store.is_connected
-            True
-
-        Example without existing connection:
-            >>> store = VectorStore(settings)
-            >>> async with store.async_connection_context():
-            ...     await store.async_query(...)  # Creates connection
-            >>> # Connection cleaned up after context exit
-            >>> store.is_connected
-            False
+        Example for implementations with async clients (override recommended):
+            >>> @asynccontextmanager
+            ... async def async_connection_context(self):
+            ...     needs_disconnect = False
+            ...     try:
+            ...         if not self._connected:
+            ...             await self.async_connect()  # Async method
+            ...             needs_disconnect = True
+            ...         yield
+            ...     finally:
+            ...         if needs_disconnect:
+            ...             await self.async_disconnect()  # Async method
         """
         # Track if WE created the connection
         needs_disconnect = False
         try:
             if not self._connected:
+                # Default uses sync methods - override if using async client
                 self.connect()
                 needs_disconnect = True  # Only disconnect what we connected
             yield
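The hunk above documents the recommended override for stores with async clients. A runnable sketch of such an override (the `AsyncClientStore`, `async_connect`, and `async_disconnect` names are illustrative, and `asyncio.sleep(0)` stands in for a real async client handshake):

```python
import asyncio
from collections.abc import AsyncIterator
from contextlib import asynccontextmanager


class AsyncClientStore:
    """Toy store with a truly async client (names are illustrative)."""

    def __init__(self) -> None:
        self._connected = False

    async def async_connect(self) -> None:
        await asyncio.sleep(0)  # stands in for an async client handshake
        self._connected = True

    async def async_disconnect(self) -> None:
        await asyncio.sleep(0)
        self._connected = False

    @asynccontextmanager
    async def async_connection_context(self) -> AsyncIterator[None]:
        # Only disconnect connections that THIS context created,
        # mirroring the base-class contract in the docstring above.
        needs_disconnect = False
        try:
            if not self._connected:
                await self.async_connect()  # never blocks the event loop
                needs_disconnect = True
            yield
        finally:
            if needs_disconnect:
                await self.async_disconnect()


async def main() -> tuple[bool, bool]:
    store = AsyncClientStore()
    async with store.async_connection_context():
        inside = store._connected  # connection created by the context
    return inside, store._connected  # cleaned up again on exit


print(asyncio.run(main()))  # → (True, False)
```

Because `async_connect` is awaited rather than called synchronously, a slow database handshake yields control back to the event loop instead of stalling every other coroutine.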
@@ -274,7 +281,9 @@ def _health_check_impl(self, timeout: float) -> dict[str, Any]:  # noqa: ARG002
             memory usage, query latency).

         Args:
-            timeout: Maximum time to wait for health check in seconds
+            timeout: Maximum time to wait for health check in seconds.
+                Note: The default implementation does not enforce this timeout.
+                Subclasses should implement timeout handling for actual health checks.

         Returns:
             Dictionary with health status information. Default keys:
@@ -286,7 +295,25 @@ def _health_check_impl(self, timeout: float) -> dict[str, Any]:  # noqa: ARG002
             VectorStoreError: If health check fails due to connection issues
             TimeoutError: If health check exceeds timeout duration

-        Example:
+        Example with timeout enforcement:
+            >>> import signal
+            >>> def _health_check_impl(self, timeout: float) -> dict[str, Any]:
+            ...     def timeout_handler(signum, frame):
+            ...         raise TimeoutError(f"Health check exceeded {timeout}s")
+            ...
+            ...     # Set timeout (Unix-like systems only)
+            ...     signal.signal(signal.SIGALRM, timeout_handler)
+            ...     signal.alarm(int(timeout))
+            ...     try:
+            ...         # Perform actual health check
+            ...         result = self.client.health_check()
+            ...         signal.alarm(0)  # Cancel alarm
+            ...         return {"status": "healthy", "nodes": result.nodes}
+            ...     except Exception:
+            ...         signal.alarm(0)  # Cancel alarm
+            ...         raise
+
+        Example without timeout (simple):
             >>> def _health_check_impl(self, timeout: float) -> dict[str, Any]:
             ...     return {
             ...         "status": "healthy",
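The signal-based example in the hunk above only works on Unix-like systems (SIGALRM is unavailable on Windows). A portable alternative, sketched here under the assumption that the probe is idempotent and read-only, runs the check in a worker thread and abandons it on timeout (`health_check_with_timeout` and `probe` are hypothetical helpers, not repo code):

```python
from concurrent.futures import ThreadPoolExecutor, TimeoutError as FuturesTimeout
from typing import Any, Callable


def health_check_with_timeout(check: Callable[[], dict[str, Any]], timeout: float) -> dict[str, Any]:
    """Portable timeout wrapper for a health probe (illustrative sketch).

    Runs `check` in a worker thread and waits at most `timeout` seconds.
    On timeout the worker is abandoned rather than killed, which is
    acceptable for an idempotent, read-only health check.
    """
    pool = ThreadPoolExecutor(max_workers=1)
    future = pool.submit(check)
    try:
        return future.result(timeout=timeout)
    except FuturesTimeout:
        raise TimeoutError(f"Health check exceeded {timeout}s") from None
    finally:
        # wait=False returns immediately even if the probe is still running
        pool.shutdown(wait=False)


def probe() -> dict[str, Any]:
    # Stand-in for the real client call (hypothetical)
    return {"status": "healthy"}


print(health_check_with_timeout(probe, timeout=1.0))  # → {'status': 'healthy'}
```

Note that `Future.result(timeout=...)` enforces the deadline on the waiter only; a genuinely hung probe thread keeps running in the background, so this pattern suits probes without side effects.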
@@ -484,6 +511,16 @@ def _batch_chunks(self, chunks: list[EmbeddedChunk], batch_size: int) -> list[list[EmbeddedChunk]]:
                 "Common batch sizes: 100 (conservative), 500 (balanced), 1000 (aggressive)"
             )

+        # Warn about very large batch sizes that may cause memory issues
+        if batch_size > 10000:
+            logger.warning(
+                "Batch size %d is very large and may cause memory issues. "
+                "Consider using smaller batches (100-1000 recommended). "
+                "Collection: %s",
+                batch_size,
+                getattr(chunks[0], "collection_name", "unknown") if chunks else "unknown",
+            )
+
         batches: list[list[EmbeddedChunk]] = []
         for i in range(0, len(chunks), batch_size):
             batches.append(chunks[i : i + batch_size])
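The new guard above warns (rather than raises) on oversized batches and then slices as before. A simplified, runnable sketch of that logic with plain values in place of `EmbeddedChunk` objects (`batch_chunks` is a hypothetical stand-in for `_batch_chunks`):

```python
import logging

logger = logging.getLogger(__name__)

BATCH_WARN_THRESHOLD = 10_000  # mirrors the diff's upper-bound check


def batch_chunks(chunks: list, batch_size: int) -> list[list]:
    """Split `chunks` into batches, warning on very large batch sizes."""
    if batch_size > BATCH_WARN_THRESHOLD:
        # Non-blocking: warn only, never raise, so existing callers keep working
        logger.warning(
            "Batch size %d is very large and may cause memory issues. "
            "Consider using smaller batches (100-1000 recommended).",
            batch_size,
        )
    return [chunks[i : i + batch_size] for i in range(0, len(chunks), batch_size)]


print(batch_chunks(list(range(7)), 3))  # → [[0, 1, 2], [3, 4, 5], [6]]
```

Keeping the check as a warning preserves backward compatibility: callers who deliberately use huge batches still succeed, but the memory risk is surfaced in the logs.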
