From 25c2ad15e596714df1f5ac6756aadfdc6adf1051 Mon Sep 17 00:00:00 2001 From: Sanket Kedia <sanket@trychroma.com> Date: Tue, 22 Apr 2025 13:05:53 -0700 Subject: [PATCH 1/5] [ENH] Turn on spann by default --- rust/frontend/sample_configs/distributed.yaml | 2 +- rust/frontend/sample_configs/tilt_config.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/rust/frontend/sample_configs/distributed.yaml b/rust/frontend/sample_configs/distributed.yaml index ee42bffe86f..c1a3826ed39 100644 --- a/rust/frontend/sample_configs/distributed.yaml +++ b/rust/frontend/sample_configs/distributed.yaml @@ -46,4 +46,4 @@ scorecard: - "collection_id:*" score: 100 enable_span_indexing: true -default_knn_index: "hnsw" +default_knn_index: "spann" diff --git a/rust/frontend/sample_configs/tilt_config.yaml b/rust/frontend/sample_configs/tilt_config.yaml index 520bfd87bf9..9530ecfd521 100644 --- a/rust/frontend/sample_configs/tilt_config.yaml +++ b/rust/frontend/sample_configs/tilt_config.yaml @@ -54,4 +54,4 @@ scorecard: circuit_breaker: requests: 1000 enable_span_indexing: true -default_knn_index: "hnsw" +default_knn_index: "spann" From 914d366278929717f0c1f9223a9791acbba7f9ae Mon Sep 17 00:00:00 2001 From: Sanket Kedia <sanket@trychroma.com> Date: Thu, 24 Apr 2025 20:31:36 -0700 Subject: [PATCH 2/5] Take lock before doing hnsw.open() --- rust/index/src/spann/types.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/rust/index/src/spann/types.rs b/rust/index/src/spann/types.rs index 3b9f753fd91..9349e1f56f2 100644 --- a/rust/index/src/spann/types.rs +++ b/rust/index/src/spann/types.rs @@ -2022,6 +2022,11 @@ impl<'me> SpannIndexReader<'me> { dimensionality: usize, ef_search: usize, ) -> Result<HnswIndexRef, SpannIndexReaderError> { + // We take a lock here to synchronize concurrent open of the same index. + // Otherwise, we could end up with a corrupted index since the filesystem + // operations are not guaranteed to be atomic. 
+ // The lock is a partitioned mutex to allow for higher concurrency across collections. + let _guard = hnsw_provider.write_mutex.lock(id).await; match hnsw_provider.get(id, cache_key).await { Some(index) => Ok(index), None => { From 49ad1f5b90e66e59665b742a0385609b5f78fd84 Mon Sep 17 00:00:00 2001 From: Sanket Kedia <sanket@trychroma.com> Date: Thu, 24 Apr 2025 20:34:49 -0700 Subject: [PATCH 3/5] fix rust test --- rust/frontend/src/impls/service_based_frontend.rs | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/rust/frontend/src/impls/service_based_frontend.rs b/rust/frontend/src/impls/service_based_frontend.rs index 4d79e258b20..5c269164fa9 100644 --- a/rust/frontend/src/impls/service_based_frontend.rs +++ b/rust/frontend/src/impls/service_based_frontend.rs @@ -1438,9 +1438,13 @@ mod tests { assert!(segments.iter().any( |s| s.r#type == SegmentType::BlockfileMetadata && s.scope == SegmentScope::METADATA )); - assert!(segments - .iter() - .any(|s| s.r#type == SegmentType::HnswDistributed && s.scope == SegmentScope::VECTOR)); + assert!( + segments.iter().any( + |s| s.r#type == SegmentType::HnswDistributed && s.scope == SegmentScope::VECTOR + ) || segments + .iter() + .any(|s| s.r#type == SegmentType::Spann && s.scope == SegmentScope::VECTOR) + ); assert!(segments .iter() .any(|s| s.r#type == SegmentType::BlockfileRecord && s.scope == SegmentScope::RECORD)); From d05268fa2113d7dfb389290804e031b64d906a1d Mon Sep 17 00:00:00 2001 From: Sanket Kedia <sanket@trychroma.com> Date: Thu, 24 Apr 2025 22:42:15 -0700 Subject: [PATCH 4/5] fix test --- chromadb/test/property/invariants.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/chromadb/test/property/invariants.py b/chromadb/test/property/invariants.py index c48c5abd832..761ad3707ea 100644 --- a/chromadb/test/property/invariants.py +++ b/chromadb/test/property/invariants.py @@ -246,16 +246,19 @@ def fd_not_exceeding_threadpool_size(threadpool_size: int) -> 
None: ) def get_space(collection: Collection): + # TODO: this is a hack to get the space. + # We should update the tests to stop passing space via metadata and instead read it from the collection's + # configuration_json. + space = None if "hnsw:space" in collection.metadata: - return collection.metadata["hnsw:space"] + space = collection.metadata["hnsw:space"] if collection._model.configuration_json is None: - return None + return space if 'spann' in collection._model.configuration_json and collection._model.configuration_json.get('spann') is not None and 'space' in collection._model.configuration_json.get('spann'): - return collection._model.configuration_json.get('spann').get('space') + space = collection._model.configuration_json.get('spann').get('space') elif 'hnsw' in collection._model.configuration_json and collection._model.configuration_json.get('hnsw') is not None and 'space' in collection._model.configuration_json.get('hnsw'): - return collection._model.configuration_json.get('hnsw').get('space') - else: - return None + space = collection._model.configuration_json.get('hnsw').get('space') + return space def ann_accuracy( collection: Collection, From 7d79e57409e732c72e49dee68220543af0985d13 Mon Sep 17 00:00:00 2001 From: Sanket Kedia <sanket@trychroma.com> Date: Thu, 24 Apr 2025 23:11:23 -0700 Subject: [PATCH 5/5] Modify get_space --- chromadb/test/property/invariants.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/chromadb/test/property/invariants.py b/chromadb/test/property/invariants.py index 761ad3707ea..58b780c80d1 100644 --- a/chromadb/test/property/invariants.py +++ b/chromadb/test/property/invariants.py @@ -257,7 +257,8 @@ def get_space(collection: Collection): if 'spann' in collection._model.configuration_json and collection._model.configuration_json.get('spann') is not None and 'space' in collection._model.configuration_json.get('spann'): space = collection._model.configuration_json.get('spann').get('space') elif 'hnsw' in 
collection._model.configuration_json and collection._model.configuration_json.get('hnsw') is not None and 'space' in collection._model.configuration_json.get('hnsw'): - space = collection._model.configuration_json.get('hnsw').get('space') + if space is None: + space = collection._model.configuration_json.get('hnsw').get('space') return space def ann_accuracy(