Skip to content

Commit

Permalink
Address review feedback:
Browse files Browse the repository at this point in the history
    - Optimize runtime performance and reduce unnecessary allocations
    - Enhance documentation
    - Clean up test utilities

Signed-off-by: shamb0 <r.raajey@gmail.com>
  • Loading branch information
shamb0 committed Nov 19, 2024
1 parent c4fdcbe commit 77d29c2
Show file tree
Hide file tree
Showing 9 changed files with 296 additions and 87 deletions.
12 changes: 0 additions & 12 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 2 additions & 3 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
[workspace]
members = ["swiftide", "swiftide-*", "examples", "benchmarks"]
default-members = ["swiftide", "swiftide-*"]

resolver = "2"

Expand Down Expand Up @@ -39,7 +40,6 @@ indoc = { version = "2.0" }
regex = { version = "1.11.1" }
uuid = { version = "1.10", features = ["v3", "v4", "serde"] }
dyn-clone = { version = "1.0" }
once_cell = { version = "1.20.2" }

# Integrations
spider = { version = "2.13" }
Expand Down Expand Up @@ -92,8 +92,7 @@ temp-dir = "0.1.13"
wiremock = "0.6.0"
test-case = "3.3.1"
insta = { version = "1.41.1", features = ["yaml"] }
tempfile = "3.10.1"
portpicker = "0.1.1"


[workspace.lints.rust]
unsafe_code = "forbid"
Expand Down
2 changes: 1 addition & 1 deletion swiftide-integrations/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ strum = { workspace = true }
strum_macros = { workspace = true }
regex = { workspace = true }
futures-util = { workspace = true }
once_cell = { workspace = true }

# Integrations
async-openai = { workspace = true, optional = true }
Expand Down Expand Up @@ -89,6 +88,7 @@ test-case = { workspace = true }
indoc = { workspace = true }
insta = { workspace = true }


[features]
default = ["rustls"]
# Ensures rustls is used
Expand Down
90 changes: 89 additions & 1 deletion swiftide-integrations/src/pgvector/fixtures.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,43 @@
//! This module implements common types and helper utilities for unit tests related to the pgvector
//! Test fixtures and utilities for pgvector integration testing.
//!
//! Provides test infrastructure and helper types to verify vector storage and retrieval:
//! - Mock data generation for different embedding modes
//! - Test containers for `PostgreSQL` with pgvector extension
//! - Common test scenarios and assertions
//!
//! # Examples
//!
//! ```rust,ignore
//! use swiftide_integrations::pgvector::fixtures::{TestContext, PgVectorTestData};
//! use swiftide_core::indexing::{EmbedMode, EmbeddedField};
//!
//! # async fn example() -> Result<(), Box<dyn std::error::Error>> {
//! // Initialize test context with PostgreSQL container
//! let context = TestContext::setup_with_cfg(
//!     Some(vec!["category", "priority"]),
//!     vec![EmbeddedField::Combined].into_iter().collect()
//! ).await?;
//!
//! // Create test data for different embedding modes
//! let test_data = PgVectorTestData {
//!     embed_mode: EmbedMode::SingleWithMetadata,
//!     chunk: "test content",
//!     metadata: None,
//!     vectors: vec![PgVectorTestData::create_test_vector(
//!         EmbeddedField::Combined,
//!         1.0
//!     )],
//! };
//! # Ok(())
//! # }
//! ```
//!
//! The module supports testing for:
//! - Single embedding with/without metadata
//! - Per-field embeddings
//! - Combined embedding modes
//! - Different vector configurations
//! - Various metadata scenarios
use crate::pgvector::PgVector;
use std::collections::HashSet;
use swiftide_core::{
Expand All @@ -7,11 +46,36 @@ use swiftide_core::{
};
use testcontainers::{ContainerAsync, GenericImage};

/// Test data structure for pgvector integration testing.
///
/// Describes a single test case: the embedding mode, the text chunk, optional metadata
/// and the vectors associated with the chunk. The `'a` lifetime is that of the borrowed
/// `chunk` text; every other field is owned.
///
/// # Examples
///
/// The snippet is marked `ignore`: this type is `pub(crate)`, so it cannot be imported
/// from a doctest, which is compiled as a separate crate.
///
/// ```rust,ignore
/// use swiftide_integrations::pgvector::fixtures::PgVectorTestData;
/// use swiftide_core::indexing::{EmbedMode, EmbeddedField};
///
/// let test_data = PgVectorTestData {
///     embed_mode: EmbedMode::SingleWithMetadata,
///     chunk: "test content",
///     metadata: None,
///     vectors: vec![PgVectorTestData::create_test_vector(
///         EmbeddedField::Combined,
///         1.0
///     )],
/// };
/// ```
#[derive(Clone)]
pub(crate) struct PgVectorTestData<'a> {
/// Embedding mode exercised by this test case.
pub embed_mode: indexing::EmbedMode,
/// Borrowed text content of the chunk under test.
pub chunk: &'a str,
/// Optional metadata for the test case; `None` when the case carries no metadata.
pub metadata: Option<indexing::Metadata>,
/// Vector embeddings, each paired with the embedded field it belongs to.
pub vectors: Vec<(indexing::EmbeddedField, Vec<f32>)>,
}

Expand Down Expand Up @@ -42,8 +106,32 @@ impl<'a> PgVectorTestData<'a> {
}
}

/// Test context managing a `PostgreSQL` container and the pgvector storage built on it.
///
/// Handles the lifecycle of test containers and provides configured storage
/// instances for testing.
///
/// # Examples
///
/// The snippet is marked `ignore`: this type is `pub(crate)`, so it cannot be imported
/// from a doctest, which is compiled as a separate crate.
///
/// ```rust,ignore
/// # use swiftide_integrations::pgvector::fixtures::TestContext;
/// # use swiftide_core::indexing::EmbeddedField;
/// # async fn example() -> Result<(), Box<dyn std::error::Error>> {
/// // Setup test context with specific configuration
/// let context = TestContext::setup_with_cfg(
///     Some(vec!["category"]),
///     vec![EmbeddedField::Combined].into_iter().collect()
/// ).await?;
///
/// // Use context for testing
/// context.pgv_storage.setup().await?;
/// # Ok(())
/// # }
/// ```
pub(crate) struct TestContext {
/// Configured pgvector storage instance.
pub(crate) pgv_storage: PgVector,
/// Container instance running `PostgreSQL` with pgvector.
///
/// NOTE(review): the leading underscore suggests this field is never read and is held
/// only so the container is not dropped while the context is alive — confirm against
/// `setup_with_cfg`.
_pgv_db_container: ContainerAsync<GenericImage>,
}

Expand Down
38 changes: 31 additions & 7 deletions swiftide-integrations/src/pgvector/mod.rs
Original file line number Diff line number Diff line change
@@ -1,19 +1,38 @@
//! This module integrates with the pgvector database, providing functionalities to create and manage vector collections,
//! store data, and optimize indexing for efficient searches.
//! Integration module for `PostgreSQL` vector database (pgvector) operations.
//!
//! pgvector is utilized in both the `indexing::Pipeline` and `query::Pipeline` modules.
//! This module provides a client interface for vector similarity search operations using pgvector,
//! supporting:
//! - Vector collection management with configurable schemas
//! - Efficient vector storage and indexing
//! - Connection pooling with automatic retries
//! - Batch operations for optimized performance
//!
//! The functionality is primarily used through the [`PgVector`] client, which implements
//! the [`Persist`] trait for seamless integration with indexing and query pipelines.
//!
//! # Example
//! ```rust
//! # use swiftide_integrations::pgvector::PgVector;
//! # async fn example() -> anyhow::Result<()> {
//! let client = PgVector::builder()
//!     .db_url("postgresql://localhost:5432/vectors")
//!     .vector_size(384)
//!     .build()?;
//!
//! # Ok(())
//! # }
//! ```
#[cfg(test)]
mod fixtures;

mod persist;
mod pgv_table_types;
use anyhow::Result;
use derive_builder::Builder;
use once_cell::sync::OnceCell;
use sqlx::PgPool;
use std::fmt;
use std::sync::Arc;
use std::sync::OnceLock;
use tokio::time::Duration;

use pgv_table_types::{FieldConfig, MetadataConfig, VectorConfig};
Expand Down Expand Up @@ -70,8 +89,12 @@ pub struct PgVector {
db_conn_retry_delay: Duration,

/// Lazy-initialized database connection pool.
#[builder(default = "Arc::new(OnceCell::new())")]
connection_pool: Arc<OnceCell<PgPool>>,
#[builder(default = "Arc::new(OnceLock::new())")]
connection_pool: Arc<OnceLock<PgPool>>,

/// SQL statement used for executing bulk insert.
#[builder(default = "Arc::new(OnceLock::new())")]
sql_stmt_bulk_insert: Arc<OnceLock<String>>,
}

impl fmt::Debug for PgVector {
Expand Down Expand Up @@ -346,6 +369,7 @@ mod tests {
])
; "Both mode with metadata")]
#[test_log::test(tokio::test)]
#[ignore]
async fn test_persist_nodes(
test_cases: Vec<PgVectorTestData<'_>>,
vector_fields: HashSet<EmbeddedField>,
Expand Down
22 changes: 18 additions & 4 deletions swiftide-integrations/src/pgvector/persist.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,14 @@
//! This module implements the `Persist` trait for the `PgVector` struct.
//! It provides methods for setting up storage, saving individual nodes, and batch-storing multiple nodes.
//! This integration enables the Swiftide project to use `PgVector` as a storage backend.
//! Storage persistence implementation for vector embeddings.
//!
//! Implements the [`Persist`] trait for [`PgVector`], providing vector storage capabilities:
//! - Database schema initialization and setup
//! - Single-node storage operations
//! - Optimized batch storage with configurable batch sizes
//!
//! The implementation ensures thread-safe concurrent access and handles
//! connection management automatically.
use crate::pgvector::PgVector;
use anyhow::Result;
use anyhow::{anyhow, Result};
use async_trait::async_trait;
use swiftide_core::{
indexing::{IndexingStream, Node},
Expand All @@ -16,6 +22,14 @@ impl Persist for PgVector {
// Get or initialize the connection pool
let pool = self.pool_get_or_initialize().await?;

if self.sql_stmt_bulk_insert.get().is_none() {
let sql = self.generate_unnest_upsert_sql()?;

self.sql_stmt_bulk_insert
.set(sql)
.map_err(|_| anyhow!("SQL bulk store statement is already set"))?;
}

let mut tx = pool.begin().await?;

// Create extension
Expand Down
Loading

0 comments on commit 77d29c2

Please sign in to comment.