From dcaeb208f15ac888d93a6791ec8486a09f0e7bc6 Mon Sep 17 00:00:00 2001 From: Pedro Rico Pinazo Date: Tue, 27 Feb 2024 13:40:01 +0000 Subject: [PATCH 1/9] initial faiss integration --- Cargo.lock | 16 +++ raphtory-graphql/src/server.rs | 2 + raphtory/Cargo.toml | 86 +++++++---- raphtory/src/vectors/document_ref.rs | 1 + raphtory/src/vectors/embeddings.rs | 30 ---- raphtory/src/vectors/faiss_store.rs | 84 +++++++++++ raphtory/src/vectors/mod.rs | 85 ++++++++++- .../src/vectors/similarity_search_utils.rs | 10 +- raphtory/src/vectors/vectorisable.rs | 30 +++- raphtory/src/vectors/vectorised_graph.rs | 133 ++++++++++++++---- .../src/vectors/vectorised_graph_storage.rs | 1 + 11 files changed, 390 insertions(+), 88 deletions(-) create mode 100644 raphtory/src/vectors/faiss_store.rs diff --git a/Cargo.lock b/Cargo.lock index d7b03494e1..c61c56507c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1128,6 +1128,21 @@ dependencies = [ "serde", ] +[[package]] +name = "faiss" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b3ffe048432786028b0a30aa1d13e10e08ced380439ba4a83fe5c227d2dd9733" +dependencies = [ + "faiss-sys", +] + +[[package]] +name = "faiss-sys" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b9c008fc56422bf34357f17226d9c5a5c2ef6245b4774759c5f67112e46915e" + [[package]] name = "fast_chemail" version = "0.9.6" @@ -2845,6 +2860,7 @@ dependencies = [ "display-error-chain", "dotenv", "enum_dispatch", + "faiss", "flate2", "futures-util", "genawaiter", diff --git a/raphtory-graphql/src/server.rs b/raphtory-graphql/src/server.rs index 66e266a4f8..e8ddda0b16 100644 --- a/raphtory-graphql/src/server.rs +++ b/raphtory-graphql/src/server.rs @@ -86,6 +86,7 @@ impl RaphtoryServer { embedding: F, cache: &Path, template: Option, + faiss: boolean, ) -> Self where F: EmbeddingFunction + Clone + 'static, @@ -115,6 +116,7 @@ impl RaphtoryServer { Some(graph_cache), true, template.clone(), + faiss, true, ) .await; diff --git a/raphtory/Cargo.toml b/raphtory/Cargo.toml index 4479d4413f..bc615fddd4 100644 --- a/raphtory/Cargo.toml +++ b/raphtory/Cargo.toml @@ -18,21 +18,25 @@ homepage.workspace = true bincode = "1" chrono = { version = "0.4.31", features = ["serde"] } genawaiter = "0.99" -itertools= "0.12.0" +itertools = "0.12.0" num-traits = "0.2" -parking_lot = { version = "0.12" , features = ["serde", "arc_lock", "send_guard"] } +parking_lot = { version = "0.12", features = [ + "serde", + "arc_lock", + "send_guard", +] } once_cell = "1" rand = "0.8.5" rand_distr = "0.4.3" rayon = "1" regex = "1" rustc-hash = "1.1.0" -serde = { version = "1", features = ["derive","rc"] } +serde = { version = "1", features = ["derive", "rc"] } sorted_vector_map = "0.1" thiserror = "1" twox-hash = "1.6.3" lock_api = { version = "0.4", features = ["arc_lock", "serde"] } -dashmap = {version ="5", features = ["serde"] } +dashmap = { version = "5", features = ["serde"] } enum_dispatch = "0.3" ordered-float = "4.1.1" glam = "0.25.0" @@ -40,29 +44,38 @@ quad-rand = "0.2.1" serde_json = "1" # io optional dependencies -csv = {version="1.1.6", optional=true} -zip = {version ="0.6.6", optional=true} -neo4rs = {version="0.6.1", optional=true} -bzip2 = {version="0.4", optional=true} -flate2 = {version="1.0", optional=true} -reqwest = { version = "0.11.14", features = ["blocking", "rustls-tls"], default-features = false, optional=true} -tokio = { version = "1.27.0", features = ["full"], optional=true} +csv = { version = "1.1.6", optional = true } +zip = { version = "0.6.6", optional = true } +neo4rs = { version = "0.6.1", optional = true } +bzip2 = { version = "0.4", optional = true } +flate2 = { version = "1.0", optional = true } +reqwest = { version = "0.11.14", features = [ + "blocking", + "rustls-tls", +], default-features = false, optional = true } +tokio = { version = "1.27.0", features = ["full"], optional = true } # search optional dependencies -tantivy = {version= "0.21.1", optional=true} +tantivy = { version = "0.21.1", optional = true } # vectors optional dependencies -futures-util = {version="0.3.0", optional=true} -async-trait = {version="0.1.73", optional=true} -async-openai = {version= "0.17.1", optional=true} +futures-util = { version = "0.3.0", optional = true } +async-trait = { version = "0.1.73", optional = true } +async-openai = { version = "0.17.1", optional = true } +faiss = { version = "0.12.1", optional = true } # python binding optional dependencies -pyo3 = {version= "0.20.0", features=["multiple-pymethods", "chrono"], optional=true} -pyo3-asyncio = { version = "0.20.0", features = ["tokio-runtime"], optional=true } -num = {version="0.4.0", optional=true} -display-error-chain = {version= "0.2.0", optional=true} -arrow2 = {version= "0.18.0", optional=true} -kdam = { version="0.5.1", features = ["notebook"], optional = true} +pyo3 = { version = "0.20.0", features = [ + "multiple-pymethods", + "chrono", +], optional = true } +pyo3-asyncio = { version = "0.20.0", features = [ + "tokio-runtime", +], optional = true } +num = { version = "0.4.0", optional = true } +display-error-chain = { version = "0.2.0", optional = true } +arrow2 = { version = "0.18.0", optional = true } +kdam = { version = "0.5.1", features = ["notebook"], optional = true } [dev-dependencies] @@ -72,18 +85,39 @@ quickcheck = "1" quickcheck_macros = "1" tempfile = "3.2" tempdir = "0.3" -tokio = { version = "1.27.0", features = ["full"]} # for vector testing -dotenv = "0.15.0" # for vector testing +tokio = { version = "1.27.0", features = ["full"] } # for vector testing +dotenv = "0.15.0" # for vector testing streaming-stats = "0.2" proptest = "1.4.0" [features] default = [] # Enables the graph loader io module -io = ["dep:zip", "dep:neo4rs", "dep:bzip2", "dep:flate2", "dep:csv", "dep:reqwest", "dep:tokio"] +io = [ + "dep:zip", + "dep:neo4rs", + "dep:bzip2", + "dep:flate2", + "dep:csv", + "dep:reqwest", + "dep:tokio", +] # Enables generating the pyo3 python bindings -python = ["io", "dep:pyo3", "dep:pyo3-asyncio", "dep:num", "dep:display-error-chain", "dep:arrow2", "dep:kdam"] +python = [ + "io", + "dep:pyo3", + "dep:pyo3-asyncio", + "dep:num", + "dep:display-error-chain", + "dep:arrow2", + "dep:kdam", +] # search search = ["dep:tantivy"] # vectors -vectors = ["dep:futures-util", "dep:async-trait", "dep:async-openai"] +vectors = [ + "dep:futures-util", + "dep:async-trait", + "dep:async-openai", + "dep:faiss", +] diff --git a/raphtory/src/vectors/document_ref.rs b/raphtory/src/vectors/document_ref.rs index 55dd920491..67cf5e6ce2 100644 --- a/raphtory/src/vectors/document_ref.rs +++ b/raphtory/src/vectors/document_ref.rs @@ -8,6 +8,7 @@ use crate::{ use serde::{Deserialize, Serialize}; use std::hash::{Hash, Hasher}; +// TODO: this is not a refence, find another name, like CompressedDocument /// this struct contains the minimum amount of information need to regenerate a document using a /// template and to quickly apply windows over them #[derive(Clone, Debug, Serialize, Deserialize)] diff --git a/raphtory/src/vectors/embeddings.rs b/raphtory/src/vectors/embeddings.rs index 18e949c339..111aa3ae43 100644 --- a/raphtory/src/vectors/embeddings.rs +++ b/raphtory/src/vectors/embeddings.rs @@ -18,33 +18,3 @@ pub async fn openai_embedding(texts: Vec) -> Vec { println!("Generated embeddings successfully"); response.data.into_iter().map(|e| e.embedding).collect_vec() } - -// this is currently commented out so we don't need to add any new dependencies -// but might be potentially useful in the future -// async fn sentence_transformers_embeddings(texts: Vec) -> Vec { -// println!("computing embeddings for {} texts", texts.len()); -// Python::with_gil(|py| { -// let sentence_transformers = py.import("sentence_transformers")?; -// let locals = [("sentence_transformers", sentence_transformers)].into_py_dict(py); -// locals.set_item("texts", texts); -// -// let pyarray: &PyArray2 = py -// .eval( -// &format!( -// "sentence_transformers.SentenceTransformer('thenlper/gte-small').encode(texts)" -// ), -// Some(locals), -// None, -// )? -// .extract()?; -// -// let readonly = pyarray.readonly(); -// let chunks = readonly.as_slice().unwrap().chunks(384).into_iter(); -// let embeddings = chunks -// .map(|chunk| chunk.iter().copied().collect_vec()) -// .collect_vec(); -// -// Ok::>, Box>(embeddings) -// }) -// .unwrap() -// } diff --git a/raphtory/src/vectors/faiss_store.rs b/raphtory/src/vectors/faiss_store.rs new file mode 100644 index 0000000000..d1ad2b31eb --- /dev/null +++ b/raphtory/src/vectors/faiss_store.rs @@ -0,0 +1,84 @@ +use super::{document_ref::DocumentRef, entity_id::EntityId, Embedding}; +use faiss::{index::IndexImpl, index_factory, Idx, Index, MetricType}; +use itertools::Itertools; +use std::collections::HashMap; + +#[derive(Clone)] +pub(crate) struct DocumentPointer { + pub(crate) entity: EntityId, + pub(crate) subindex: usize, // TODO: reduce this inside and provide nice error when there are too much documents for some entity +} + +pub(crate) struct FaissIndex { + mapping: Vec, + index: IndexImpl, +} + +impl FaissIndex { + fn get(&self, idx: &Idx) -> DocumentPointer { + self.mapping + .get(idx.get().unwrap() as usize) + .unwrap() + .clone() + } + + /// This function returns a vector just to take ownership of Faiss results + pub(crate) fn search(&mut self, query: &Embedding, limit: usize) -> Vec { + let result = self.index.search(query.as_slice(), limit); + match result { + Ok(result) => result.labels.iter().map(|idx| self.get(idx)).collect_vec(), + Err(_) => vec![], + } + } +} + +pub(crate) struct FaissStore { + pub(crate) nodes: FaissIndex, + pub(crate) edges: FaissIndex, +} + +impl FaissStore { + pub(crate) fn from_refs( + nodes: &HashMap>, + edges: &HashMap>, + ) -> Self { + Self { + nodes: build_entity_index(nodes), + edges: build_entity_index(edges), + } + } + + // pub(crate) fn search_nodes( + // &self, + // query: Embedding, + // limit: usize, + // ) -> Vec<(DocumentPointer, f32)> { + // search(&self.nodes, query, limit) + // } + + // pub(crate) fn search_edges( + // &self, + // query: Embedding, + // limit: usize, + // ) -> Vec<(DocumentPointer, f32)> { + // search(&self.edges, query, limit) + // } +} + +fn build_entity_index(entities: &HashMap>) -> FaissIndex { + let mut index = index_factory(3, "IDMap,Flat", MetricType::InnerProduct).unwrap(); // FIXME: this can't be 3, needs to be variable!!!!!!! + let mut mapping = vec![]; + for (entity, doc_refs) in entities { + for (subindex, doc_ref) in doc_refs.iter().enumerate() { + let entity = entity.clone(); + mapping.push(DocumentPointer { entity, subindex }); + let embedding = doc_ref.embedding.as_slice(); + let index_for_faiss = mapping.len() as i64; + // this can be improved by putting embeddings in a contiguous memory slice + index + .add_with_ids(embedding, &[index_for_faiss.into()]) + .unwrap(); + } + } + FaissIndex { index, mapping } +} diff --git a/raphtory/src/vectors/mod.rs b/raphtory/src/vectors/mod.rs index 81abb6e722..b90b746b01 100644 --- a/raphtory/src/vectors/mod.rs +++ b/raphtory/src/vectors/mod.rs @@ -7,6 +7,7 @@ pub mod document_template; mod embedding_cache; pub mod embeddings; mod entity_id; +mod faiss_store; pub mod graph_entity; mod similarity_search_utils; pub mod splitting; @@ -166,7 +167,7 @@ mod vector_tests { g.add_node(0, "test", NO_PROPS, None).unwrap(); // the following succeeds with no cache set up - g.vectorise(Box::new(fake_embedding), None, true, false) + g.vectorise(Box::new(fake_embedding), None, true, false, false) .await; let path = "/tmp/raphtory/very/deep/path/embedding-cache-test"; @@ -178,6 +179,7 @@ mod vector_tests { Some(PathBuf::from(path)), true, false, + false, ) .await; @@ -188,6 +190,7 @@ mod vector_tests { Some(PathBuf::from(path)), true, false, + false, ) .await; } @@ -199,7 +202,7 @@ mod vector_tests { let g = Graph::new(); let cache = PathBuf::from("/tmp/raphtory/vector-cache-lotr-test"); let vectors = g - .vectorise(Box::new(fake_embedding), Some(cache), true, false) + .vectorise(Box::new(fake_embedding), Some(cache), true, false, false) .await; let embedding: Embedding = fake_embedding(vec!["whatever".to_owned()]).await.remove(0); let docs = vectors @@ -288,6 +291,7 @@ age: 30"###; true, FakeMultiDocumentTemplate, false, + false, ) .await; @@ -347,6 +351,7 @@ age: 30"###; true, FakeTemplateWithIntervals, false, + false, ) .await; @@ -429,6 +434,7 @@ age: 30"###; true, CustomTemplate, false, + false, ) .await; @@ -466,4 +472,79 @@ age: 30"###; .get_documents(); assert!(docs[0].content().contains("Frodo appeared with Gandalf")); } + + use faiss::{index_factory, Index, MetricType}; + + async fn predictable_embedding(texts: Vec) -> Vec { + texts + .into_iter() + .map(|text| vec![text.parse::().unwrap(), 0.0, 0.0, 0.0, 0.0]) + .collect_vec() + } + + struct PredictableTemplate; + + impl DocumentTemplate for PredictableTemplate { + fn graph(&self, graph: &G) -> Box> { + DefaultTemplate.graph(graph) + } + + fn node(&self, node: &NodeView) -> Box> { + Box::new(std::iter::once(node.name().to_string().into())) + } + fn edge(&self, edge: &EdgeView) -> Box> { + Box::new(std::iter::once(edge.src().name().to_string().into())) + } + } + + #[tokio::test] + async fn test_faiss_2() { + let g = Graph::new(); + for n in 1..=100 { + g.add_node(0, n, NO_PROPS, None); + g.add_edge(0, 1, n, NO_PROPS, None); + } + + let v = g + .vectorise_with_template( + Box::new(predictable_embedding), + None, + false, + PredictableTemplate, + true, + true, + ) + .await; + + let selection = v.append_nodes_by_similarity(&vec![5.0, 0.0, 0.0, 0.0, 0.0], 1, None); + + let (doc, score) = selection.get_documents_with_scores().remove(0); + assert_eq!(doc.into_content(), "5"); + assert_eq!(score, 1.0); + } + + #[test] + fn test_faiss() { + let v1 = [0.0, 1.0, 1.0]; + let v2 = [0.0, 1.0, 1.0]; + let my_query = [0.0, 1.0, 1.0]; + + let mut index = index_factory(3, "IDMap,Flat", MetricType::L2).unwrap(); + index.add_with_ids(&v1, &[0.into()]).unwrap(); + index.add_with_ids(&v2, &[1.into()]).unwrap(); + + let result = index.search(&my_query, 5).unwrap(); + println!("---------------------"); + println!("got result"); + println!("{result:?}"); + for (i, (l, d)) in result + .labels + .iter() + .zip(result.distances.iter()) + .enumerate() + { + println!("#{}: {} (D={})", i + 1, *l, *d); + } + println!("---------------------"); + } } diff --git a/raphtory/src/vectors/similarity_search_utils.rs b/raphtory/src/vectors/similarity_search_utils.rs index 40e53d813d..3f3f1974e8 100644 --- a/raphtory/src/vectors/similarity_search_utils.rs +++ b/raphtory/src/vectors/similarity_search_utils.rs @@ -41,7 +41,13 @@ fn cosine(vector1: &Embedding, vector2: &Embedding) -> f32 { let normalized = dot_product / (x_length.sqrt() * y_length.sqrt()); // println!("cosine for {vector1:?} and {vector2:?} is {normalized}"); - assert!(normalized <= 1.001); - assert!(normalized >= -1.001); + assert!( + normalized <= 1.001, + "not valid result: {normalized} for vectors:\n{vector1:?}\n{vector2:?}" + ); + assert!( + normalized >= -1.001, + "not valid result: {normalized} for vectors:\n{vector1:?}\n{vector2:?}" + ); normalized } diff --git a/raphtory/src/vectors/vectorisable.rs b/raphtory/src/vectors/vectorisable.rs index 564b0c8909..f337bd300b 100644 --- a/raphtory/src/vectors/vectorisable.rs +++ b/raphtory/src/vectors/vectorisable.rs @@ -11,8 +11,11 @@ use crate::{ }; use async_trait::async_trait; use itertools::Itertools; +use parking_lot::RwLock; use std::{collections::HashMap, path::PathBuf}; +use super::faiss_store::{self, FaissStore}; + const CHUNK_SIZE: usize = 1000; #[derive(Clone, Debug)] @@ -32,7 +35,7 @@ pub trait Vectorisable { /// * cache - the file to be used as a cache to avoid calling the embedding function /// * overwrite_cache - whether or not to overwrite the cache if there are new embeddings /// * verbose - whether or not to print logs reporting the progress - /// + /// /// # Returns: /// A VectorisedGraph with all the documents/embeddings computed and with an initial empty selection async fn vectorise( @@ -40,6 +43,7 @@ pub trait Vectorisable { embedding: Box, cache_file: Option, override_cache: bool, + faiss: bool, verbose: bool, ) -> VectorisedGraph; @@ -51,7 +55,7 @@ pub trait Vectorisable { /// * overwrite_cache - whether or not to overwrite the cache if there are new embeddings /// * template - the template to use to translate entities into documents /// * verbose - whether or not to print logs reporting the progress - /// + /// /// # Returns: /// A VectorisedGraph with all the documents/embeddings computed and with an initial empty selection async fn vectorise_with_template>( @@ -60,6 +64,7 @@ pub trait Vectorisable { cache: Option, override_cache: bool, template: T, + faiss: bool, verbose: bool, ) -> VectorisedGraph; } @@ -71,10 +76,18 @@ impl Vectorisable for G { embedding: Box, cache: Option, overwrite_cache: bool, + faiss: bool, verbose: bool, ) -> VectorisedGraph { - self.vectorise_with_template(embedding, cache, overwrite_cache, DefaultTemplate, verbose) - .await + self.vectorise_with_template( + embedding, + cache, + overwrite_cache, + DefaultTemplate, + faiss, + verbose, + ) + .await } async fn vectorise_with_template>( @@ -83,6 +96,7 @@ impl Vectorisable for G { cache: Option, overwrite_cache: bool, template: T, + faiss: bool, verbose: bool, ) -> VectorisedGraph { let graph_docs = @@ -145,6 +159,13 @@ impl Vectorisable for G { cache_storage.iter().for_each(|cache| cache.dump_to_disk()); } + let faiss_store = if faiss { + let store = FaissStore::from_refs(&node_refs, &edge_refs); + Some(RwLock::new(store).into()) + } else { + None + }; + VectorisedGraph::new( self.clone(), template.into(), @@ -152,6 +173,7 @@ impl Vectorisable for G { graph_refs.into(), node_refs.into(), edge_refs.into(), + faiss_store, vec![], ) } diff --git a/raphtory/src/vectors/vectorised_graph.rs b/raphtory/src/vectors/vectorised_graph.rs index f2b2b2b8ee..4f4d4cdb32 100644 --- a/raphtory/src/vectors/vectorised_graph.rs +++ b/raphtory/src/vectors/vectorised_graph.rs @@ -15,12 +15,34 @@ use crate::{ }, }; use itertools::{chain, Itertools}; +use parking_lot::RwLock; use std::{ collections::{HashMap, HashSet}, path::PathBuf, sync::Arc, }; +use super::faiss_store::{DocumentPointer, FaissIndex, FaissStore}; + +// enum IndexInput<'a> { +// Native(Box)>>), +// Faiss(Vec<&'a mut FaissIndex>), +// } + +// impl<'a, I: IntoIterator)> + 'a> From +// for IndexInput<'a> +// { +// fn from(value: I) -> Self { +// IndexInput::Native(Box::new(value.into_iter())) +// } +// } + +enum AppendMode { + Nodes, + Edges, + Both, +} + #[derive(Clone, Copy)] enum ExpansionPath { Nodes, @@ -36,6 +58,7 @@ pub struct VectorisedGraph> { pub(crate) graph_documents: Arc>, pub(crate) node_documents: Arc>>, // TODO: replace with FxHashMap pub(crate) edge_documents: Arc>>, + faiss_store: Option>>, selected_docs: Vec<(DocumentRef, f32)>, empty_vec: Vec, } @@ -53,6 +76,7 @@ impl> Clone for VectorisedGraph> VectorisedGraph { graph_documents: Arc>, node_documents: Arc>>, edge_documents: Arc>>, + faiss_store: Option>>, selected_docs: Vec<(DocumentRef, f32)>, ) -> Self { Self { @@ -75,6 +100,7 @@ impl> VectorisedGraph { graph_documents, node_documents, edge_documents, + faiss_store, selected_docs, empty_vec: vec![], } @@ -181,6 +207,7 @@ impl> VectorisedGraph { } } + // FIXME: this is not included graph documents as of now /// Add the top `limit` documents to the current selection using `query` /// /// # Arguments @@ -196,8 +223,7 @@ impl> VectorisedGraph { limit: usize, window: Option<(i64, i64)>, ) -> Self { - let joined = chain!(self.node_documents.iter(), self.edge_documents.iter()); - self.add_top_documents(joined, query, limit, window) + self.add_top_documents(AppendMode::Both, query, limit, window) } /// Add the top `limit` node documents to the current selection using `query` @@ -215,7 +241,7 @@ impl> VectorisedGraph { limit: usize, window: Option<(i64, i64)>, ) -> Self { - self.add_top_documents(self.node_documents.as_ref(), query, limit, window) + self.add_top_documents(AppendMode::Nodes, query, limit, window) } /// Add the top `limit` edge documents to the current selection using `query` @@ -233,7 +259,7 @@ impl> VectorisedGraph { limit: usize, window: Option<(i64, i64)>, ) -> Self { - self.add_top_documents(self.edge_documents.as_ref(), query, limit, window) + self.add_top_documents(AppendMode::Edges, query, limit, window) } /// Add all the documents `hops` hops away to the selection @@ -312,7 +338,7 @@ impl> VectorisedGraph { /// /// # Arguments /// * query - the text or the embedding to score against - /// * limit - the maximum number of new documents to add + /// * limit - the maximum number of new documents to add /// * window - the window where documents need to belong to in order to be considered /// /// # Returns @@ -421,34 +447,64 @@ impl> VectorisedGraph { } } - fn add_top_documents<'a, I>( + fn add_top_documents<'a>( &self, - document_groups: I, + mode: AppendMode, query: &Embedding, limit: usize, window: Option<(i64, i64)>, - ) -> Self - where - I: IntoIterator)> + 'a, - { - let documents = document_groups - .into_iter() - .flat_map(|(_, embeddings)| embeddings); - - let window_docs: Box> = match window { - None => Box::new(documents), - Some((start, end)) => { - let windowed_graph = self.source_graph.window(start, end); - let filtered = documents.filter(move |document| { - document.exists_on_window(Some(&windowed_graph), window) + ) -> Self { + // we don't want to use faiss if there is a window set + let valid_faiss_store = window.and_then(|_| self.faiss_store.clone()); + let filtered: Box> = match valid_faiss_store { + None => { + let document_groups: Box)>> = + match mode { + AppendMode::Nodes => Box::new(self.node_documents.iter()), + AppendMode::Edges => Box::new(self.edge_documents.iter()), + AppendMode::Both => Box::new(chain!( + self.node_documents.iter(), + self.edge_documents.iter() + )), + }; + let documents = document_groups.flat_map(|(_, embeddings)| embeddings); + match window { + None => Box::new(documents), + Some((start, end)) => { + let windowed_graph = self.source_graph.window(start, end); + let filtered = documents.filter(move |document| { + document.exists_on_window(Some(&windowed_graph), window) + }); + Box::new(filtered) + } + } + } + Some(store) => { + let mut store = store.write(); + let pointers = match mode { + AppendMode::Nodes => store.nodes.search(query, limit).into_iter(), + AppendMode::Edges => store.edges.search(query, limit).into_iter(), + AppendMode::Both => { + let mut pointers = store.nodes.search(query, limit); + pointers.extend(store.edges.search(query, limit)); + pointers.into_iter() + } + }; + let doc_refs = pointers.map(|DocumentPointer { entity, subindex }| { + let doc_group = match entity { + EntityId::Node { .. } => self.node_documents.get(&entity), + EntityId::Edge { .. } => self.edge_documents.get(&entity), + EntityId::Graph { .. } => panic!("this is illegal"), + }; + doc_group.unwrap().get(subindex).unwrap() }); - Box::new(filtered) + Box::new(doc_refs.collect_vec().into_iter()) } }; let new_len = self.selected_docs.len() + limit; - let scored_nodes = score_documents(query, window_docs.cloned()); // TODO: try to remove this clone - let candidates = find_top_k(scored_nodes, usize::MAX); + let scored_docs = score_documents(query, filtered.cloned()); // TODO: try to remove this clone + let candidates = find_top_k(scored_docs, limit); // TODO: review, this used to be usize::MAX instead of limit let new_selected = extend_selection(self.selected_docs.clone(), candidates, new_len); Self { @@ -457,6 +513,35 @@ impl> VectorisedGraph { } } + // fn native_search<'a, I>( + // &self, + // document_groups: I, + // query: &Embedding, + // limit: usize, + // window: Option<(i64, i64)>, + // ) -> impl Iterator + // where + // I: IntoIterator)> + 'a, + // { + // let documents = document_groups + // .into_iter() + // .flat_map(|(_, embeddings)| embeddings); + + // let window_docs: Box> = match window { + // None => Box::new(documents), + // Some((start, end)) => { + // let windowed_graph = self.source_graph.window(start, end); + // let filtered = documents.filter(move |document| { + // document.exists_on_window(Some(&windowed_graph), window) + // }); + // Box::new(filtered) + // } + // }; + + // let scored_docs = score_documents(query, window_docs.cloned()); // TODO: try to remove this clone + // find_top_k(scored_docs, limit) // TODO: review, this used to be usize::MAX instead of limit + // } + // this might return the document used as input, uniqueness need to be check outside of this fn get_context<'a, W: StaticGraphViewOps>( &'a self, diff --git a/raphtory/src/vectors/vectorised_graph_storage.rs b/raphtory/src/vectors/vectorised_graph_storage.rs index 4adda4ab20..5dc450be40 100644 --- a/raphtory/src/vectors/vectorised_graph_storage.rs +++ b/raphtory/src/vectors/vectorised_graph_storage.rs @@ -151,6 +151,7 @@ impl VectorisedGraphStorage { Arc::new(graph_documents), Arc::new(node_documents), Arc::new(edge_documents), + None, // FIXME: recompute the faiss store optionally vec![], )) } From 12ef93b68ca230aee8f4ec32c8596b78d1dfed14 Mon Sep 17 00:00:00 2001 From: Pedro Rico Pinazo Date: Tue, 27 Feb 2024 22:39:19 +0000 Subject: [PATCH 2/9] fix multiple bugs --- raphtory/src/vectors/faiss_store.rs | 34 +++++--- raphtory/src/vectors/mod.rs | 103 ++++++++++++++++------- raphtory/src/vectors/vectorised_graph.rs | 13 ++- 3 files changed, 107 insertions(+), 43 deletions(-) diff --git a/raphtory/src/vectors/faiss_store.rs b/raphtory/src/vectors/faiss_store.rs index d1ad2b31eb..6a9978f0b5 100644 --- a/raphtory/src/vectors/faiss_store.rs +++ b/raphtory/src/vectors/faiss_store.rs @@ -3,7 +3,7 @@ use faiss::{index::IndexImpl, index_factory, Idx, Index, MetricType}; use itertools::Itertools; use std::collections::HashMap; -#[derive(Clone)] +#[derive(Clone, Debug)] pub(crate) struct DocumentPointer { pub(crate) entity: EntityId, pub(crate) subindex: usize, // TODO: reduce this inside and provide nice error when there are too much documents for some entity @@ -15,18 +15,20 @@ pub(crate) struct FaissIndex { } impl FaissIndex { - fn get(&self, idx: &Idx) -> DocumentPointer { - self.mapping - .get(idx.get().unwrap() as usize) - .unwrap() - .clone() + fn get(&self, idx: u64) -> DocumentPointer { + self.mapping.get(idx as usize).unwrap().clone() } /// This function returns a vector just to take ownership of Faiss results pub(crate) fn search(&mut self, query: &Embedding, limit: usize) -> Vec { + // TODO: assert that the length of the query is correct let result = self.index.search(query.as_slice(), limit); match result { - Ok(result) => result.labels.iter().map(|idx| self.get(idx)).collect_vec(), + Ok(result) => { + dbg!(&result); + let valid_labels = result.labels.iter().filter_map(|idx| idx.get()); + valid_labels.map(|idx| self.get(idx)).collect_vec() + } Err(_) => vec![], } } @@ -42,9 +44,16 @@ impl FaissStore { nodes: &HashMap>, edges: &HashMap>, ) -> Self { + // TODO: review, this doesnt froup if there are empty groups! + let maybe_node_group = nodes.iter().next(); + let maybe_edge_group = edges.iter().next(); + let maybe_group = maybe_node_group.or(maybe_edge_group); + let maybe_vector = maybe_group.and_then(|(_, docs)| docs.get(0)); + let dim = maybe_vector.map(|vec| vec.embedding.len()).unwrap_or(1) as u32; + dbg!(&dim); Self { - nodes: build_entity_index(nodes), - edges: build_entity_index(edges), + nodes: build_entity_index(nodes, dim), + edges: build_entity_index(edges, dim), } } @@ -65,15 +74,16 @@ impl FaissStore { // } } -fn build_entity_index(entities: &HashMap>) -> FaissIndex { - let mut index = index_factory(3, "IDMap,Flat", MetricType::InnerProduct).unwrap(); // FIXME: this can't be 3, needs to be variable!!!!!!! +fn build_entity_index(entities: &HashMap>, dim: u32) -> FaissIndex { + dbg!(&dim); + let mut index = index_factory(dim, "IDMap,Flat", MetricType::InnerProduct).unwrap(); let mut mapping = vec![]; for (entity, doc_refs) in entities { for (subindex, doc_ref) in doc_refs.iter().enumerate() { let entity = entity.clone(); + let index_for_faiss = mapping.len() as i64; mapping.push(DocumentPointer { entity, subindex }); let embedding = doc_ref.embedding.as_slice(); - let index_for_faiss = mapping.len() as i64; // this can be improved by putting embeddings in a contiguous memory slice index .add_with_ids(embedding, &[index_for_faiss.into()]) diff --git a/raphtory/src/vectors/mod.rs b/raphtory/src/vectors/mod.rs index b90b746b01..caa5703fbc 100644 --- a/raphtory/src/vectors/mod.rs +++ b/raphtory/src/vectors/mod.rs @@ -478,7 +478,14 @@ age: 30"###; async fn predictable_embedding(texts: Vec) -> Vec { texts .into_iter() - .map(|text| vec![text.parse::().unwrap(), 0.0, 0.0, 0.0, 0.0]) + .map(|text| { + let index = text.parse::().unwrap(); + let mut vector = vec![0.0; 10]; + if let Some(element) = vector.get_mut(index) { + *element = 1.0; + }; + vector + }) .collect_vec() } @@ -498,11 +505,32 @@ age: 30"###; } #[tokio::test] - async fn test_faiss_2() { + async fn test_faiss_empty() { let g = Graph::new(); - for n in 1..=100 { + + let v = g + .vectorise_with_template( + Box::new(predictable_embedding), + None, + false, + PredictableTemplate, + true, + true, + ) + .await; + + let selection = v.append_nodes_by_similarity(&vec![5.0, 0.0, 0.0, 0.0, 0.0], 10, None); + + let len = selection.get_documents().len(); + assert_eq!(len, 0); + } + + #[tokio::test] + async fn test_faiss_full() { + let g = Graph::new(); + for n in 0..10 { g.add_node(0, n, NO_PROPS, None); - g.add_edge(0, 1, n, NO_PROPS, None); + g.add_edge(0, n, 1, NO_PROPS, None); } let v = g @@ -516,35 +544,50 @@ age: 30"###; ) .await; - let selection = v.append_nodes_by_similarity(&vec![5.0, 0.0, 0.0, 0.0, 0.0], 1, None); + let query = vec![1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]; + let selection = v.append_nodes_by_similarity(&query, 20, None); - let (doc, score) = selection.get_documents_with_scores().remove(0); - assert_eq!(doc.into_content(), "5"); - assert_eq!(score, 1.0); + let len = selection.get_documents_with_scores().len(); + assert_eq!(len, 10); } - #[test] - fn test_faiss() { - let v1 = [0.0, 1.0, 1.0]; - let v2 = [0.0, 1.0, 1.0]; - let my_query = [0.0, 1.0, 1.0]; - - let mut index = index_factory(3, "IDMap,Flat", MetricType::L2).unwrap(); - index.add_with_ids(&v1, &[0.into()]).unwrap(); - index.add_with_ids(&v2, &[1.into()]).unwrap(); - - let result = index.search(&my_query, 5).unwrap(); - println!("---------------------"); - println!("got result"); - println!("{result:?}"); - for (i, (l, d)) in result - .labels - .iter() - .zip(result.distances.iter()) - .enumerate() - { - println!("#{}: {} (D={})", i + 1, *l, *d); + #[tokio::test] + async fn test_faiss_normal() { + let g = Graph::new(); + for n in 0..10 { + g.add_node(0, n, NO_PROPS, None); + g.add_edge(0, n, 0, NO_PROPS, None); } - println!("---------------------"); + + let v = g + .vectorise_with_template( + Box::new(predictable_embedding), + None, + false, + PredictableTemplate, + true, + true, + ) + .await; + + let query = vec![1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]; + + let selection = v.append_nodes_by_similarity(&query, 5, None); + let (doc, score) = selection.get_documents_with_scores().remove(0); + assert_eq!(doc.into_content(), "0"); + assert_eq!(score, 1.0); + + let selection = v.append_edges_by_similarity(&query, 5, None); + let (doc, score) = selection.get_documents_with_scores().remove(0); + assert_eq!(doc.into_content(), "0"); + assert_eq!(score, 1.0); + + let selection = v.append_by_similarity(&query, 5, None); + let (doc, score) = selection.get_documents_with_scores().remove(0); + assert_eq!(doc.into_content(), "0"); + assert_eq!(score, 1.0); + let (doc, score) = selection.get_documents_with_scores().remove(0); + assert_eq!(doc.into_content(), "0"); + assert_eq!(score, 1.0); } } diff --git a/raphtory/src/vectors/vectorised_graph.rs b/raphtory/src/vectors/vectorised_graph.rs index 4f4d4cdb32..8a7c11060b 100644 --- a/raphtory/src/vectors/vectorised_graph.rs +++ b/raphtory/src/vectors/vectorised_graph.rs @@ -37,6 +37,7 @@ use super::faiss_store::{DocumentPointer, FaissIndex, FaissStore}; // } // } +#[derive(Debug)] enum AppendMode { Nodes, Edges, @@ -454,8 +455,16 @@ impl> VectorisedGraph { limit: usize, window: Option<(i64, i64)>, ) -> Self { + println!("---------------here-------------------"); + dbg!(&self.faiss_store.is_some()); // we don't want to use faiss if there is a window set - let valid_faiss_store = window.and_then(|_| self.faiss_store.clone()); + let valid_faiss_store = match window { + Some(_) => None, + None => self.faiss_store.clone(), + }; + + dbg!(&valid_faiss_store.is_some()); + dbg!(&mode); let filtered: Box> = match valid_faiss_store { None => { let document_groups: Box)>> = @@ -490,6 +499,8 @@ impl> VectorisedGraph { pointers.into_iter() } }; + println!("--------------------------------"); + dbg!(&pointers); let doc_refs = pointers.map(|DocumentPointer { entity, subindex }| { let doc_group = match entity { EntityId::Node { .. } => self.node_documents.get(&entity), From 1890e2bac894b027bcc3441c4b625947748d607b Mon Sep 17 00:00:00 2001 From: Pedro Rico Pinazo Date: Wed, 28 Feb 2024 11:36:15 +0000 Subject: [PATCH 3/9] fix python build --- raphtory-graphql/src/server.rs | 3 +-- raphtory/src/python/packages/vectors.rs | 21 ++++++++++++++++----- raphtory/src/vectors/faiss_store.rs | 2 +- raphtory/src/vectors/vectorisable.rs | 2 +- raphtory/src/vectors/vectorised_graph.rs | 2 +- 5 files changed, 20 insertions(+), 10 deletions(-) diff --git a/raphtory-graphql/src/server.rs b/raphtory-graphql/src/server.rs index e8ddda0b16..a4511f727b 100644 --- a/raphtory-graphql/src/server.rs +++ b/raphtory-graphql/src/server.rs @@ -86,7 +86,6 @@ impl RaphtoryServer { embedding: F, cache: &Path, template: Option, - faiss: boolean, ) -> Self where F: EmbeddingFunction + Clone + 'static, @@ -116,7 +115,7 @@ impl RaphtoryServer { Some(graph_cache), true, template.clone(), - faiss, + true, true, ) .await; diff --git a/raphtory/src/python/packages/vectors.rs b/raphtory/src/python/packages/vectors.rs index eef3e2e51b..9db528764a 100644 --- a/raphtory/src/python/packages/vectors.rs +++ b/raphtory/src/python/packages/vectors.rs @@ -31,7 +31,7 @@ use pyo3::{ prelude::*, types::{PyFunction, PyList}, }; -use std::{path::PathBuf, sync::Arc}; +use std::{collections::HashMap, path::PathBuf, sync::Arc}; pub type PyWindow = Option<(PyTime, PyTime)>; @@ -300,7 +300,7 @@ impl PyGraphView { /// /// Returns: /// A VectorisedGraph with all the documents/embeddings computed and with an initial empty selection - #[pyo3(signature = (embedding, cache = None, overwrite_cache = false, graph_document = None, node_document = None, edge_document = None, verbose = false))] + #[pyo3(signature = (embedding, cache = None, overwrite_cache = false, graph_document = None, node_document = None, edge_document = None, index = "native", verbose = false))] fn vectorise( &self, embedding: &PyFunction, @@ -309,23 +309,34 @@ impl PyGraphView { graph_document: Option, node_document: Option, edge_document: Option, + index: &str, verbose: bool, - ) -> DynamicVectorisedGraph { + ) -> PyResult { let embedding: Py = embedding.into(); let graph = self.graph.clone(); let cache = cache.map(PathBuf::from); let template = PyDocumentTemplate::new(graph_document, node_document, edge_document); - execute_async_task(move || async move { + + let index_enum = HashMap::from([("flat", true), ("native", false)]); + let use_faiss = index_enum.get(index).cloned().ok_or_else(|| { + let valid_values = index_enum.keys().join(", "); + let message = format!("invalid value for `index`. Valid values are: {valid_values}"); + PyAttributeError::new_err(message) + })?; + + let vectorised_graph = execute_async_task(move || async move { graph .vectorise_with_template( Box::new(embedding.clone()), cache, overwrite_cache, Arc::new(template) as Arc>, + use_faiss, verbose, ) .await - }) + }); + Ok(vectorised_graph) } } diff --git a/raphtory/src/vectors/faiss_store.rs b/raphtory/src/vectors/faiss_store.rs index 6a9978f0b5..1724cbf66d 100644 --- a/raphtory/src/vectors/faiss_store.rs +++ b/raphtory/src/vectors/faiss_store.rs @@ -1,5 +1,5 @@ use super::{document_ref::DocumentRef, entity_id::EntityId, Embedding}; -use faiss::{index::IndexImpl, index_factory, Idx, Index, MetricType}; +use faiss::{index::IndexImpl, index_factory, Index, MetricType}; use itertools::Itertools; use std::collections::HashMap; diff --git a/raphtory/src/vectors/vectorisable.rs b/raphtory/src/vectors/vectorisable.rs index f337bd300b..a72e4524c2 100644 --- a/raphtory/src/vectors/vectorisable.rs +++ b/raphtory/src/vectors/vectorisable.rs @@ -14,7 +14,7 @@ use itertools::Itertools; use parking_lot::RwLock; use std::{collections::HashMap, path::PathBuf}; -use super::faiss_store::{self, FaissStore}; +use super::faiss_store::FaissStore; const CHUNK_SIZE: usize = 1000; diff --git a/raphtory/src/vectors/vectorised_graph.rs b/raphtory/src/vectors/vectorised_graph.rs index 8a7c11060b..163d8f3de9 100644 --- a/raphtory/src/vectors/vectorised_graph.rs +++ b/raphtory/src/vectors/vectorised_graph.rs @@ -22,7 +22,7 @@ use std::{ sync::Arc, }; -use super::faiss_store::{DocumentPointer, FaissIndex, FaissStore}; +use super::faiss_store::{DocumentPointer, FaissStore}; // enum IndexInput<'a> { // Native(Box)>>), From 3ec40f2239d7f15e538a7e564ddd66f169ebe988 Mon Sep 17 00:00:00 2001 From: Pedro Rico Pinazo Date: Thu, 29 Feb 2024 12:03:05 +0000 Subject: [PATCH 4/9] enable index training and add bench --- Cargo.lock | 3 + raphtory-benchmark/Cargo.toml | 9 ++- raphtory-benchmark/benches/vectors.rs | 83 ++++++++++++++++++++++++ raphtory/src/vectors/faiss_store.rs | 43 +++++++----- raphtory/src/vectors/vectorised_graph.rs | 6 -- 5 files changed, 118 insertions(+), 26 deletions(-) create mode 100644 raphtory-benchmark/benches/vectors.rs diff --git a/Cargo.lock b/Cargo.lock index c61c56507c..05dc929941 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -716,6 +716,7 @@ dependencies = [ "ciborium", "clap", "criterion-plot", + "futures", "is-terminal", "itertools 0.10.5", "num-traits", @@ -728,6 +729,7 @@ dependencies = [ "serde_derive", "serde_json", "tinytemplate", + "tokio", "walkdir", ] @@ -2909,6 +2911,7 @@ dependencies = [ "raphtory", "rayon", "sorted_vector_map", + "tokio", ] [[package]] diff --git a/raphtory-benchmark/Cargo.toml b/raphtory-benchmark/Cargo.toml index 2dc29e3e0a..af298291fa 100644 --- a/raphtory-benchmark/Cargo.toml +++ b/raphtory-benchmark/Cargo.toml @@ -6,8 +6,9 @@ edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -criterion = "0.5.1" -raphtory = { path = "../raphtory" , features=["io"]} +criterion = { version = "0.5.1", features = ["async_tokio"] } +tokio = { version = "1.27.0", features = ["full"] } +raphtory = { path = "../raphtory", features = ["io", "vectors"] } sorted_vector_map = "0.1" rand = "0.8.5" rayon = "1" @@ -35,3 +36,7 @@ harness = false [[bench]] name = "edge_add" harness = false + +[[bench]] +name = "vectors" +harness = false diff --git a/raphtory-benchmark/benches/vectors.rs b/raphtory-benchmark/benches/vectors.rs new file mode 100644 index 0000000000..73ada1da54 --- /dev/null +++ b/raphtory-benchmark/benches/vectors.rs @@ -0,0 +1,83 @@ +use criterion::{criterion_group, criterion_main, Criterion}; +use rand::{ + distributions::{Alphanumeric, DistString}, + thread_rng, Rng, +}; +use raphtory::core::DocumentInput; +use raphtory::db::graph::views::deletion_graph::GraphWithDeletions; +use raphtory::vectors::{document_template::DocumentTemplate, vectorisable::Vectorisable}; +use raphtory::{core::entities::nodes::input_node::InputNode, prelude::*, vectors::Embedding}; +use std::path::PathBuf; +use tokio::runtime::Runtime; + +mod common; + +async fn random_embedding(texts: Vec) -> Vec { + let mut rng = thread_rng(); + texts + .iter() + .map(|_| (0..1536).map(|_| rng.gen()).collect()) + .collect() +} + +struct EmptyTemplate; + +impl DocumentTemplate for EmptyTemplate { + fn graph(&self, _graph: &Graph) -> Box> { + Box::new(std::iter::empty()) + } + + fn node( + &self, + _node: &raphtory::db::graph::node::NodeView, + ) -> Box> { + Box::new(std::iter::once("".into())) + } + + fn edge( + &self, + _edge: &raphtory::db::graph::edge::EdgeView, + ) -> Box> { + Box::new(std::iter::once("".into())) + } +} + +pub fn vectors(c: &mut Criterion) { + let rt = Runtime::new().unwrap(); + let g = Graph::new(); + for id in 0..500_000 { + g.add_node(0, id, NO_PROPS, None).unwrap(); + } + for id in 0..500_000 { + g.add_edge(0, 0, id, NO_PROPS, None).unwrap(); + } + let query = rt.block_on(random_embedding(vec!["".to_owned()])).remove(0); + let cache_path = || Some(PathBuf::from("/tmp/raphtory/vector-bench")); + + let native_vectorised_graph = rt.block_on(g.vectorise_with_template( + Box::new(random_embedding), + cache_path(), + true, + EmptyTemplate, + false, // use faiss + false, + )); + c.bench_function("native-index", |b| { + b.iter(|| native_vectorised_graph.append_by_similarity(&query, 1, None)); + }); + + let faiss_vectorised_graph = rt.block_on(g.vectorise_with_template( + Box::new(random_embedding), + cache_path(), + true, + EmptyTemplate, + true, // use faiss + false, + )); + c.bench_function("faiss-index", |b| { + b.iter(|| faiss_vectorised_graph.append_by_similarity(&query, 1, None)); + }); +} + +criterion_group!(benches, vectors); +criterion_main!(benches); diff --git a/raphtory/src/vectors/faiss_store.rs b/raphtory/src/vectors/faiss_store.rs index 1724cbf66d..81d9eedd54 100644 --- a/raphtory/src/vectors/faiss_store.rs +++ b/raphtory/src/vectors/faiss_store.rs @@ -1,5 +1,5 @@ use super::{document_ref::DocumentRef, entity_id::EntityId, Embedding}; -use faiss::{index::IndexImpl, index_factory, Index, MetricType}; +use faiss::{index::IndexImpl, index_factory, Idx, Index, MetricType}; use itertools::Itertools; use std::collections::HashMap; @@ -25,7 +25,6 @@ impl FaissIndex { let result = self.index.search(query.as_slice(), limit); match result { Ok(result) => { - dbg!(&result); let valid_labels = result.labels.iter().filter_map(|idx| idx.get()); valid_labels.map(|idx| self.get(idx)).collect_vec() } @@ -50,7 +49,6 @@ impl FaissStore { let maybe_group = maybe_node_group.or(maybe_edge_group); let maybe_vector = maybe_group.and_then(|(_, docs)| docs.get(0)); let dim = maybe_vector.map(|vec| vec.embedding.len()).unwrap_or(1) as u32; - dbg!(&dim); Self { nodes: build_entity_index(nodes, dim), edges: build_entity_index(edges, dim), @@ -75,20 +73,29 @@ impl FaissStore { } fn build_entity_index(entities: &HashMap>, dim: u32) -> FaissIndex { - dbg!(&dim); - let mut index = index_factory(dim, "IDMap,Flat", MetricType::InnerProduct).unwrap(); - let mut mapping = vec![]; - for (entity, doc_refs) in entities { - for (subindex, doc_ref) in doc_refs.iter().enumerate() { - let entity = entity.clone(); - let index_for_faiss = mapping.len() as i64; - mapping.push(DocumentPointer { entity, subindex }); - let embedding = doc_ref.embedding.as_slice(); - // this can be improved by putting embeddings in a contiguous memory slice - index - .add_with_ids(embedding, &[index_for_faiss.into()]) - .unwrap(); - } - } + let mut index = index_factory(dim, "IVF4096_HNSW32,Flat", MetricType::InnerProduct).unwrap(); + + let flattened = entities.iter().flat_map(|(entity, docs)| { + docs.iter() + .enumerate() + .map(|(subindex, doc)| (entity.clone(), subindex, doc)) + }); + let mapping = flattened + .clone() + .map(|(entity, subindex, _)| DocumentPointer { entity, subindex }) + .collect_vec(); + let data_vec = flattened + .clone() + .flat_map(|(_, _, doc)| doc.embedding.clone()) + .collect_vec(); + let data = data_vec.as_slice(); + let ids: Vec = flattened + .enumerate() + .map(|(id, _)| (id as i64).into()) + .collect_vec(); + + index.train(data).unwrap(); + index.add_with_ids(data, ids.as_slice()).unwrap(); + FaissIndex { index, mapping } } diff --git a/raphtory/src/vectors/vectorised_graph.rs b/raphtory/src/vectors/vectorised_graph.rs index 163d8f3de9..395a9be3b6 100644 --- a/raphtory/src/vectors/vectorised_graph.rs +++ b/raphtory/src/vectors/vectorised_graph.rs @@ -455,16 +455,12 @@ impl> VectorisedGraph { limit: usize, window: Option<(i64, i64)>, ) -> Self { - println!("---------------here-------------------"); - dbg!(&self.faiss_store.is_some()); // we don't want to use faiss if there is a window set let valid_faiss_store = match window { Some(_) => None, None => self.faiss_store.clone(), }; - dbg!(&valid_faiss_store.is_some()); - dbg!(&mode); let filtered: Box> = match valid_faiss_store { None => { let document_groups: Box)>> = @@ -499,8 +495,6 @@ impl> VectorisedGraph { pointers.into_iter() } }; - println!("--------------------------------"); - dbg!(&pointers); let doc_refs = pointers.map(|DocumentPointer { entity, subindex }| { let doc_group = match entity { EntityId::Node { .. } => self.node_documents.get(&entity), From 0551e01d5095576b6e53618a05be0f3507e51ba6 Mon Sep 17 00:00:00 2001 From: Pedro Rico Pinazo Date: Mon, 4 Mar 2024 16:38:59 +0000 Subject: [PATCH 5/9] merge ondisk example running --- .gitmodules | 3 ++ Cargo.toml | 23 +++++++----- disk-faiss/Cargo.toml | 21 +++++++++++ disk-faiss/build.rs | 42 +++++++++++++++++++++ disk-faiss/src/lib.rs | 79 ++++++++++++++++++++++++++++++++++++++++ disk-test/Cargo.toml | 16 ++++++++ disk-test/src/main.rs | 85 +++++++++++++++++++++++++++++++++++++++++++ faiss-rs | 1 + raphtory/Cargo.toml | 4 +- 9 files changed, 264 insertions(+), 10 deletions(-) create mode 100644 .gitmodules create mode 100644 disk-faiss/Cargo.toml create mode 100644 disk-faiss/build.rs create mode 100644 disk-faiss/src/lib.rs create mode 100644 disk-test/Cargo.toml create mode 100644 disk-test/src/main.rs create mode 160000 faiss-rs diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000000..634eb0b5ff --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "faiss-rs"] + path = faiss-rs + url = git@github.com:Enet4/faiss-rs.git diff --git a/Cargo.toml b/Cargo.toml index 3e6df1be68..88640eb6e3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,13 +1,18 @@ [workspace] members = [ - "raphtory", - "raphtory-benchmark", - "examples/rust", - "examples/netflow", - "python", - "js-raphtory", - "raphtory-graphql", - "comparison-benchmark/rust/raphtory-rust-benchmark" + "raphtory", + "raphtory-benchmark", + "examples/rust", + "examples/netflow", + "python", + "js-raphtory", + "raphtory-graphql", + "comparison-benchmark/rust/raphtory-rust-benchmark", + "faiss-rs", + "faiss-rs/faiss-sys", + "pometry-faiss", + "disk-faiss", + "disk-test", ] default-members = ["raphtory"] @@ -25,4 +30,4 @@ edition = "2021" [profile.release-with-debug] inherits = "release" -debug = true \ No newline at end of file +debug = true diff --git a/disk-faiss/Cargo.toml b/disk-faiss/Cargo.toml new file mode 100644 index 0000000000..feb5b0a777 --- /dev/null +++ b/disk-faiss/Cargo.toml @@ -0,0 +1,21 @@ +[package] +name = "disk-faiss" +# description = "Raphtory GraphQL server" +edition.workspace = true +rust-version.workspace = true +version.workspace = true +keywords.workspace = true +authors.workspace = true +documentation.workspace = true +repository.workspace = true +license.workspace = true +readme.workspace = true +homepage.workspace = true +# links = "faiss_c" +build = "build.rs" + +[dependencies] +cpp = "0.5.4" + +[build-dependencies] +cpp_build = "0.5.4" diff --git a/disk-faiss/build.rs b/disk-faiss/build.rs new file mode 100644 index 0000000000..205bc9a34b --- /dev/null +++ b/disk-faiss/build.rs @@ -0,0 +1,42 @@ +use std::path::PathBuf; + +extern crate cpp_build; + +fn main() { + if let Ok(paths) = std::env::var("LD_LIBRARY_PATH") { + for path in paths.split(":") { + if path != "" { + println!("cargo:rustc-link-search={}", path); + } + } + }; + + println!("cargo:rustc-link-search=/usr/local/lib"); + println!("cargo:rustc-link-search=/usr/lib"); + + if get_os_type() == "macos" { + println!("cargo:rustc-link-lib=omp"); + println!("cargo:rustc-link-lib=faiss"); + } else { + println!("cargo:rustc-link-lib=static=faiss"); + println!("cargo:rustc-link-lib=gomp"); + println!("cargo:rustc-link-lib=blas"); + println!("cargo:rustc-link-lib=lapack"); + } + + cpp_build::Config::new() + .include(PathBuf::from( + "/Users/pedrorico/pometry/raphtory/faiss-rs/faiss-sys/faiss", + )) + .build("src/lib.rs"); +} + +fn get_os_type() -> &'static str { + if cfg!(target_os = "linux") { + return "linux"; + } else if cfg!(target_os = "macos") { + return "macos"; + } else { + panic!("unknow os type"); + } +} diff --git a/disk-faiss/src/lib.rs b/disk-faiss/src/lib.rs new file mode 100644 index 0000000000..26b7a025f9 --- /dev/null +++ b/disk-faiss/src/lib.rs @@ -0,0 +1,79 @@ +#![recursion_limit = "512"] +// #![cfg_attr(not(test), allow(dead_code, unused_imports))] +// #![allow(unused)] + +// #[macro_use] +// extern crate cpp; + +use cpp::cpp; + +cpp! {{ + #include + #include + #include + #include + #include + #include +}} + +pub fn merge_ondisk(index: &str, shards: Vec<&str>, ivfdata: &str, output: &str) { + let index_path = std::ffi::CString::new(index).unwrap(); + let index_path = index.as_ptr(); + + let shards: Vec<_> = shards + .iter() + .map(|shard| std::ffi::CString::new(*shard).unwrap()) + .collect(); + let shards: Vec<_> = shards.iter().map(|shard| shard.as_ptr()).collect(); + let shards = &shards; + + let ivfdata = std::ffi::CString::new(ivfdata).unwrap(); + let ivfdata = ivfdata.as_ptr(); + + let output = std::ffi::CString::new(output).unwrap(); + let output = output.as_ptr(); + + unsafe { + cpp!([index_path as "const char *", shards as "std::vector *", ivfdata as "const char *", output as "const char *"] { + + // try { + faiss::IndexIVFFlat* index = (faiss::IndexIVFFlat*) faiss::read_index(index_path, 0); // TODO: review: 0???????????????????? + // auto index_ref = dynamic_cast(index); + // return *index_ref ; + + std::vector ivfs; + for (const auto& shard : *shards) { + faiss::IndexIVFFlat* ivf = (faiss::IndexIVFFlat*) faiss::read_index(shard, faiss::IO_FLAG_MMAP); + ivfs.push_back(ivf); + } + + if (index->ntotal != 0) { + std::exit(1); + } + + auto invlists = faiss::OnDiskInvertedLists( + index->nlist, index->code_size, ivfdata + ); + + // auto ivf_vector = faiss::InvertedListsPtrVector(); + // for (const auto& ivf : ivfs) { + // ivf_vector.push_back(ivf); + // } + + const faiss::InvertedLists **ivfs_data = (const faiss::InvertedLists**) ivfs.data(); + auto ntotal = invlists.merge_from(ivfs_data, ivfs.size()); // TODO: this has a verbose parameter I can use + + index->ntotal = ntotal; + index->replace_invlists(&invlists, true); + // invlists.this.disown(); ???????????????????????? + + faiss::write_index(index, output); + // } catch (const std::exception &e) { + // std::cerr << "cpp exception"; + // std::cerr << e.what(); + // } catch { + // std::cerr << "unknown exception"; + // } + }) + }; +} diff --git a/disk-test/Cargo.toml b/disk-test/Cargo.toml new file mode 100644 index 0000000000..1507b67213 --- /dev/null +++ b/disk-test/Cargo.toml @@ -0,0 +1,16 @@ +[package] +name = "disk-test" +edition.workspace = true +rust-version.workspace = true +version.workspace = true +keywords.workspace = true +authors.workspace = true +documentation.workspace = true +repository.workspace = true +license.workspace = true +readme.workspace = true +homepage.workspace = true + +[dependencies] +disk-faiss = { path = "../disk-faiss" } +faiss = { path = "../faiss-rs" } diff --git a/disk-test/src/main.rs b/disk-test/src/main.rs new file mode 100644 index 0000000000..698fce7bef --- /dev/null +++ b/disk-test/src/main.rs @@ -0,0 +1,85 @@ +use disk_faiss::merge_ondisk; +use faiss::{index_factory, read_index, write_index, Idx, Index, MetricType}; +use std::fs; +use std::io::Error as IoError; +use std::io::ErrorKind::InvalidData; + +struct FVecsContent { + dimensions: u32, + vectors: Vec, +} + +fn read_fvecs(file_name: &str) -> Result { + let data = fs::read(file_name)?; + let (dim_data, vector_data) = data.split_at(4); + let dim = dim_data + .try_into() + .map_err(|e| IoError::new(InvalidData, e))?; + let dimensions = u32::from_le_bytes(dim); + let vectors: Vec<_> = vector_data + .chunks_exact(4) + .map(|chunk| f32::from_le_bytes(chunk.try_into().unwrap())) + .collect(); + + Ok(FVecsContent { + dimensions, + vectors, + }) +} + +fn main() { + let tmpfile = |filename: &str| "/tmp/faiss-disk-test/".to_owned() + filename; + + println!("Training index"); + let FVecsContent { + dimensions, + vectors, + } = read_fvecs("resources/sift/sift_learn.fvecs").unwrap(); + // println!("dimensions -> {dimensions}"); + // let sample: Vec<_> = flattened_vectors.iter().take(8).collect(); + // println!("sample -> {sample:?}"); + let mut index = index_factory(dimensions, "IVF4096,Flat", MetricType::InnerProduct).unwrap(); + index.train(vectors.as_slice()).unwrap(); + write_index(&index, tmpfile("trained.index")).unwrap(); + + println!("Splitting vectors into files"); + let vectors = read_fvecs("resources/sift/sift_base.fvecs") + .unwrap() + .vectors; + + let num_chunks = 4; + let num_vectors = vectors.len() / dimensions as usize; + let vectors_per_chunk = num_vectors / num_chunks + 1; + let chunk_files: Vec<_> = (0..num_chunks) + .map(|chunk_number| format!("block_{chunk_number}.index")) + .collect(); + let chunk_files: Vec<_> = chunk_files.iter().map(|f| f.as_str()).collect(); + + for ((chunk_number, chunk), filename) in vectors + .chunks(vectors_per_chunk * dimensions as usize) + .enumerate() + .zip(chunk_files.iter()) + { + let first_id = vectors_per_chunk * chunk_number; + let ids_range = first_id..(first_id + chunk.len()); + let ids: Vec<_> = ids_range.map(|id| Idx::from(id as i64)).collect(); + let mut index = read_index(tmpfile("trained.index")).unwrap(); + index.add_with_ids(chunk, ids.as_slice()).unwrap(); + write_index(&index, tmpfile(filename)).unwrap(); + } + + println!("merging indexes on disk"); + merge_ondisk( + "trained.index", + chunk_files, + &tmpfile("merged_index.ivfdata"), + &tmpfile("populated.index"), + ); + + println!("using the ondisk index"); + let mut index = read_index(&tmpfile("populated.index")).unwrap(); + let queries = read_fvecs("resources/sift/sift_query.fvecs").unwrap(); + + let result = index.search(&queries.vectors, 5).unwrap(); + println!("result: {result:?}"); +} diff --git a/faiss-rs b/faiss-rs new file mode 160000 index 0000000000..358f4cd98b --- /dev/null +++ b/faiss-rs @@ -0,0 +1 @@ +Subproject commit 358f4cd98b4bdf66013e9b792140944e6b275d69 diff --git a/raphtory/Cargo.toml b/raphtory/Cargo.toml index bc615fddd4..8e36e71809 100644 --- a/raphtory/Cargo.toml +++ b/raphtory/Cargo.toml @@ -62,7 +62,9 @@ tantivy = { version = "0.21.1", optional = true } futures-util = { version = "0.3.0", optional = true } async-trait = { version = "0.1.73", optional = true } async-openai = { version = "0.17.1", optional = true } -faiss = { version = "0.12.1", optional = true } +# faiss = { version = "0.12.1", optional = true, features = ["static"] } +# faiss = { path = "../faiss-rs", optional = true, features = ["static"] } +faiss = { path = "../faiss-rs", optional = true } # python binding optional dependencies pyo3 = { version = "0.20.0", features = [ From cce5414e5f35911bf37ead3703c1bf534f5023ff Mon Sep 17 00:00:00 2001 From: Pedro Rico Pinazo Date: Tue, 5 Mar 2024 15:34:14 +0000 Subject: [PATCH 6/9] doing progress --- Cargo.lock | 777 +++++++++++++++++++++++--- disk-faiss/src/lib.rs | 92 +-- disk-test/src/main.rs | 16 +- raphtory-benchmark/Cargo.toml | 1 + raphtory-benchmark/benches/vectors.rs | 128 ++++- 5 files changed, 876 insertions(+), 138 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 05dc929941..fb7a4023b2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -169,7 +169,7 @@ dependencies = [ "hash_hasher", "hashbrown 0.14.3", "num-traits", - "rustc_version", + "rustc_version 0.4.0", "simdutf8", ] @@ -231,8 +231,8 @@ dependencies = [ "async-graphql-parser", "darling 0.20.6", "proc-macro-crate 1.3.1", - "proc-macro2", - "quote", + "proc-macro2 1.0.78", + "quote 1.0.35", "strum", "syn 2.0.50", "thiserror", @@ -320,8 +320,8 @@ version = "0.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193" dependencies = [ - "proc-macro2", - "quote", + "proc-macro2 1.0.78", + "quote 1.0.35", "syn 2.0.50", ] @@ -331,11 +331,20 @@ version = "0.1.77" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c980ee35e870bd1a4d2c8294d4c04d0499e67bca1e4b5cefcc693c2fa00caea9" dependencies = [ - "proc-macro2", - "quote", + "proc-macro2 1.0.78", + "quote 1.0.35", "syn 2.0.50", ] +[[package]] +name = "autocfg" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0dde43e75fd43e8a1bf86103336bc699aa8d17ad1be60c76c0bdfd4828e19b78" +dependencies = [ + "autocfg 1.1.0", +] + [[package]] name = "autocfg" version = "1.1.0" @@ -464,8 +473,8 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "965ab7eb5f8f97d2a083c799f3a1b994fc397b2fe2da5d1da1626ce15a39f2b1" dependencies = [ - "proc-macro2", - "quote", + "proc-macro2 1.0.78", + "quote 1.0.35", "syn 2.0.50", ] @@ -590,6 +599,27 @@ dependencies = [ "inout", ] +[[package]] +name = "clang" +version = "0.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26c4a9fd36cd08afe44217c95f31698862e48bee3380cbc60184d95c91abf996" +dependencies = [ + "clang-sys", + "lazy_static", + "libc", +] + +[[package]] +name = "clang-sys" +version = "0.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7f7c04e52c35222fffcc3a115b5daf5f7e2bfb71c13c4e2321afe1fc71859c2" +dependencies = [ + "glob 0.2.11", + "libc", +] + [[package]] name = "clap" version = "4.5.1" @@ -619,8 +649,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "307bc0538d5f0f83b8248db3087aa92fe504e4691294d0c96c0eabc33f47ba47" dependencies = [ "heck", - "proc-macro2", - "quote", + "proc-macro2 1.0.78", + "quote 1.0.35", "syn 2.0.50", ] @@ -630,6 +660,24 @@ version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "98cc8fbded0c607b7ba9dd60cd98df59af97e84d24e49c8557331cfc26d301ce" +[[package]] +name = "cloudabi" +version = "0.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddfc5b9aa5d4507acaf872de71051dfd0e309860e88966e1051e462a077aac4f" +dependencies = [ + "bitflags 1.3.2", +] + +[[package]] +name = "cmake" +version = "0.1.50" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a31c789563b815f77f4250caee12365734369f942439b7defd71e18a48197130" +dependencies = [ + "cc", +] + [[package]] name = "colorchoice" version = "1.0.0" @@ -646,7 +694,7 @@ dependencies = [ "json5", "lazy_static", "nom", - "pathdiff", + "pathdiff 0.2.1", "ron", "rust-ini", "serde", @@ -687,6 +735,56 @@ version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "06ea2b9bc92be3c2baa9334a323ebca2d6f074ff852cd1d7b11064035cd3868f" +[[package]] +name = "cpp" +version = "0.5.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa65869ef853e45c60e9828aa08cdd1398cb6e13f3911d9cb2a079b144fcd64" +dependencies = [ + "cpp_macros", +] + +[[package]] +name = "cpp_build" +version = "0.5.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0e361fae2caf9758164b24da3eedd7f7d7451be30d90d8e7b5d2be29a2f0cf5b" +dependencies = [ + "cc", + "cpp_common", + "lazy_static", + "proc-macro2 1.0.78", + "regex", + "syn 2.0.50", + "unicode-xid 0.2.4", +] + +[[package]] +name = "cpp_common" +version = "0.5.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3e1a2532e4ed4ea13031c13bc7bc0dbca4aae32df48e9d77f0d1e743179f2ea1" +dependencies = [ + "lazy_static", + "proc-macro2 1.0.78", + "syn 2.0.50", +] + +[[package]] +name = "cpp_macros" +version = "0.5.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47ec9cc90633446f779ef481a9ce5a0077107dd5b87016440448d908625a83fd" +dependencies = [ + "aho-corasick", + "byteorder", + "cpp_common", + "lazy_static", + "proc-macro2 1.0.78", + "quote 1.0.35", + "syn 2.0.50", +] + [[package]] name = "cpufeatures" version = "0.2.12" @@ -842,8 +940,8 @@ checksum = "109c1ca6e6b7f82cc233a97004ea8ed7ca123a9af07a8230878fcfda9b158bf0" dependencies = [ "fnv", "ident_case", - "proc-macro2", - "quote", + "proc-macro2 1.0.78", + "quote 1.0.35", "strsim 0.10.0", "syn 1.0.109", ] @@ -856,8 +954,8 @@ checksum = "33043dcd19068b8192064c704b3f83eb464f91f1ff527b44a4e2b08d9cdb8855" dependencies = [ "fnv", "ident_case", - "proc-macro2", - "quote", + "proc-macro2 1.0.78", + "quote 1.0.35", "strsim 0.10.0", "syn 2.0.50", ] @@ -869,7 +967,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a4aab4dbc9f7611d8b55048a3a16d2d010c2c8334e46304b40ac1cc14bf3b48e" dependencies = [ "darling_core 0.14.4", - "quote", + "quote 1.0.35", "syn 1.0.109", ] @@ -880,7 +978,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c5a91391accf613803c2a9bf9abccdbaa07c54b4244a5b64883f9c3c137c86be" dependencies = [ "darling_core 0.20.6", - "quote", + "quote 1.0.35", "syn 2.0.50", ] @@ -949,8 +1047,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c11bdc11a0c47bc7d37d582b5285da6849c96681023680b906673c5707af7b0f" dependencies = [ "darling 0.14.4", - "proc-macro2", - "quote", + "proc-macro2 1.0.78", + "quote 1.0.35", "syn 1.0.109", ] @@ -964,6 +1062,18 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "derive_more" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f57d78cf3bd45270dad4e70c21ec77a960b36c7a841ff9db76aaa775a8fb871" +dependencies = [ + "proc-macro2 0.4.30", + "quote 0.6.13", + "rustc_version 0.2.3", + "syn 0.15.44", +] + [[package]] name = "diff" version = "0.1.13" @@ -981,6 +1091,22 @@ dependencies = [ "subtle", ] +[[package]] +name = "disk-faiss" +version = "0.7.0" +dependencies = [ + "cpp", + "cpp_build", +] + +[[package]] +name = "disk-test" +version = "0.7.0" +dependencies = [ + "disk-faiss", + "faiss", +] + [[package]] name = "display-error-chain" version = "0.2.0" @@ -1031,8 +1157,8 @@ dependencies = [ "Inflector", "darling 0.20.6", "proc-macro-crate 1.3.1", - "proc-macro2", - "quote", + "proc-macro2 1.0.78", + "quote 1.0.35", "syn 2.0.50", "thiserror", ] @@ -1059,8 +1185,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8f33313078bb8d4d05a2733a94ac4c2d8a0df9a2b84424ebf4f33bfc224a890e" dependencies = [ "once_cell", - "proc-macro2", - "quote", + "proc-macro2 1.0.78", + "quote 1.0.35", "syn 2.0.50", ] @@ -1131,19 +1257,40 @@ dependencies = [ ] [[package]] -name = "faiss" -version = "0.12.1" +name = "failure" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d32e9bd16cc02eae7db7ef620b392808b89f6a5e16bb3497d159c6b92a0f4f86" +dependencies = [ + "backtrace", + "failure_derive", +] + +[[package]] +name = "failure_derive" +version = "0.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b3ffe048432786028b0a30aa1d13e10e08ced380439ba4a83fe5c227d2dd9733" +checksum = "aa4da3c766cd7a0db8242e326e9e4e081edd567072893ed320008189715366a4" +dependencies = [ + "proc-macro2 1.0.78", + "quote 1.0.35", + "syn 1.0.109", + "synstructure", +] + +[[package]] +name = "faiss" +version = "0.12.2-alpha.0" dependencies = [ "faiss-sys", ] [[package]] name = "faiss-sys" -version = "0.6.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4b9c008fc56422bf34357f17226d9c5a5c2ef6245b4774759c5f67112e46915e" +version = "0.6.3-alpha.0" +dependencies = [ + "cmake", +] [[package]] name = "fast_chemail" @@ -1225,6 +1372,16 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a06f77d526c1a601b7c4cdd98f54b5eaabffc14d5f2f0296febdc7f357c6d3ba" +[[package]] +name = "futf" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843" +dependencies = [ + "mac", + "new_debug_unreachable", +] + [[package]] name = "futures" version = "0.3.30" @@ -1279,8 +1436,8 @@ version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" dependencies = [ - "proc-macro2", - "quote", + "proc-macro2 1.0.78", + "quote 1.0.35", "syn 2.0.50", ] @@ -1345,8 +1502,8 @@ checksum = "784f84eebc366e15251c4a8c3acee82a6a6f427949776ecb88377362a9621738" dependencies = [ "proc-macro-error", "proc-macro-hack", - "proc-macro2", - "quote", + "proc-macro2 1.0.78", + "quote 1.0.35", "syn 1.0.109", ] @@ -1409,6 +1566,12 @@ version = "0.25.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "151665d9be52f9bb40fc7966565d39666f2d1e69233571b71b87791c7e0528b3" +[[package]] +name = "glob" +version = "0.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8be18de09a56b60ed0edf84bc9df007e30040691af7acd1c41874faac5895bfb" + [[package]] name = "glob" version = "0.3.1" @@ -1528,6 +1691,20 @@ dependencies = [ "digest", ] +[[package]] +name = "html5ever" +version = "0.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ce65ac8028cf5a287a7dbf6c4e0a6cf2dcf022ed5b167a81bae66ebf599a8b7" +dependencies = [ + "log", + "mac", + "markup5ever", + "proc-macro2 0.4.30", + "quote 0.6.13", + "syn 0.15.44", +] + [[package]] name = "htmlescape" version = "0.3.1" @@ -1712,6 +1889,15 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "itertools" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f56a2d0bc861f9165be4eb3442afd3c236d8a98afd426f65d92324ae1091a484" +dependencies = [ + "either", +] + [[package]] name = "itertools" version = "0.10.5" @@ -1791,6 +1977,16 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "kernel32-sys" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7507624b29483431c0ba2d82aece8ca6cdba9382bff4ddd0f7490560c056098d" +dependencies = [ + "winapi 0.2.8", + "winapi-build", +] + [[package]] name = "lazy_static" version = "1.4.0" @@ -1833,7 +2029,7 @@ version = "0.4.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3c168f8615b12bc01f9c17e2eb0cc07dcae1940121185446edc3744920e8ef45" dependencies = [ - "autocfg", + "autocfg 1.1.0", "scopeguard", "serde", ] @@ -1873,6 +2069,29 @@ version = "0.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "912b45c753ff5f7f5208307e8ace7d2a2e30d024e26d3509f3dce546c044ce15" +[[package]] +name = "mac" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" + +[[package]] +name = "markup5ever" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1af46a727284117e09780d05038b1ce6fc9c76cc6df183c3dae5a8955a25e21" +dependencies = [ + "log", + "phf", + "phf_codegen", + "serde", + "serde_derive", + "serde_json", + "string_cache", + "string_cache_codegen", + "tendril", +] + [[package]] name = "matchers" version = "0.1.0" @@ -1913,7 +2132,7 @@ version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5a634b1c61a95585bd15607c6ab0c4e5b226e695ff2800ba0cdccddf208c406c" dependencies = [ - "autocfg", + "autocfg 1.1.0", ] [[package]] @@ -2015,7 +2234,7 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2cf52bfa6042343585f458f38f18094a53e8d8c417221867918e9f0a6885f42a" dependencies = [ - "quote", + "quote 1.0.35", "syn 1.0.109", ] @@ -2028,6 +2247,12 @@ dependencies = [ "raphtory", ] +[[package]] +name = "new_debug_unreachable" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e4a24736216ec316047a1fc4252e27dabb04218aa4a3f37c6e7ddbf1f9782b54" + [[package]] name = "nix" version = "0.27.1" @@ -2056,7 +2281,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84" dependencies = [ "overload", - "winapi", + "winapi 0.3.9", ] [[package]] @@ -2079,7 +2304,7 @@ version = "0.4.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "608e7659b5c3d7cba262d894801b9ec9d00de989e8a82bd4bef91d08da45cdc0" dependencies = [ - "autocfg", + "autocfg 1.1.0", "num-integer", "num-traits", ] @@ -2114,7 +2339,7 @@ version = "0.1.44" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d869c01cc0c455284163fd0092f1f93835385ccab5a98a0dcc497b2f8bf055a9" dependencies = [ - "autocfg", + "autocfg 1.1.0", "num-integer", "num-traits", ] @@ -2125,7 +2350,7 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0638a1c9d0a3c0914158145bc76cff373a75a627e6ecbfb71cbe6f453a5a19b0" dependencies = [ - "autocfg", + "autocfg 1.1.0", "num-bigint", "num-integer", "num-traits", @@ -2137,7 +2362,7 @@ version = "0.2.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "da0df0e5185db44f69b44f26786fe401b6c293d1907744beaa7fa62b2e5a517a" dependencies = [ - "autocfg", + "autocfg 1.1.0", "libm", ] @@ -2239,7 +2464,7 @@ dependencies = [ "futures-channel", "futures-executor", "futures-util", - "glob", + "glob 0.3.1", "once_cell", "opentelemetry", "ordered-float 4.2.0", @@ -2336,6 +2561,12 @@ dependencies = [ "subtle", ] +[[package]] +name = "pathdiff" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a3bf70094d203e07844da868b634207e71bfab254fe713171fae9a6e751ccf31" + [[package]] name = "pathdiff" version = "0.2.1" @@ -2389,8 +2620,8 @@ checksum = "1381c29a877c6d34b8c176e734f35d7f7f5b3adaefe940cb4d1bb7af94678e2e" dependencies = [ "pest", "pest_meta", - "proc-macro2", - "quote", + "proc-macro2 1.0.78", + "quote 1.0.35", "syn 2.0.50", ] @@ -2405,6 +2636,44 @@ dependencies = [ "sha2", ] +[[package]] +name = "phf" +version = "0.7.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b3da44b85f8e8dfaec21adae67f95d93244b2ecf6ad2a692320598dcc8e6dd18" +dependencies = [ + "phf_shared", +] + +[[package]] +name = "phf_codegen" +version = "0.7.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b03e85129e324ad4166b06b2c7491ae27fe3ec353af72e72cd1654c7225d517e" +dependencies = [ + "phf_generator", + "phf_shared", +] + +[[package]] +name = "phf_generator" +version = "0.7.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09364cc93c159b8b06b1f4dd8a4398984503483891b0c26b867cf431fb132662" +dependencies = [ + "phf_shared", + "rand 0.6.5", +] + +[[package]] +name = "phf_shared" +version = "0.7.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "234f71a15de2288bcb7e3b6515828d22af7ec8598ee6d24c3b526fa0a80b67a0" +dependencies = [ + "siphasher", +] + [[package]] name = "pin-project-lite" version = "0.2.13" @@ -2491,11 +2760,18 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "42ddcf4680d8d867e1e375116203846acb088483fa2070244f90589f458bbb31" dependencies = [ "proc-macro-crate 2.0.0", - "proc-macro2", - "quote", + "proc-macro2 1.0.78", + "quote 1.0.35", "syn 2.0.50", ] +[[package]] +name = "pometry-faiss" +version = "0.7.0" +dependencies = [ + "ritual", +] + [[package]] name = "powerfmt" version = "0.2.0" @@ -2508,6 +2784,12 @@ version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" +[[package]] +name = "precomputed-hash" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" + [[package]] name = "pretty_assertions" version = "1.4.0" @@ -2544,8 +2826,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "18f33027081eba0a6d8aba6d1b1c3a3be58cbb12106341c2d5759fcd9b5277e7" dependencies = [ "proc-macro-error-attr", - "proc-macro2", - "quote", + "proc-macro2 1.0.78", + "quote 1.0.35", "syn 1.0.109", "version_check", ] @@ -2556,8 +2838,8 @@ version = "0.4.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8a5b4b77fdb63c1eca72173d68d24501c54ab1269409f6b672c85deb18af69de" dependencies = [ - "proc-macro2", - "quote", + "proc-macro2 1.0.78", + "quote 1.0.35", "syn 1.0.109", "syn-mid", "version_check", @@ -2569,6 +2851,15 @@ version = "0.5.20+deprecated" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068" +[[package]] +name = "proc-macro2" +version = "0.4.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf3d2011ab5c909338f7887f4fc896d35932e29146c12c8d01da6b22a80ba759" +dependencies = [ + "unicode-xid 0.1.0", +] + [[package]] name = "proc-macro2" version = "1.0.78" @@ -2591,7 +2882,7 @@ dependencies = [ "num-traits", "rand 0.8.5", "rand_chacha 0.3.1", - "rand_xorshift", + "rand_xorshift 0.3.0", "regex-syntax 0.8.2", "rusty-fork", "tempfile", @@ -2656,9 +2947,9 @@ version = "0.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "05f738b4e40d50b5711957f142878cfa0f28e054aa0ebdfc3fd137a843f74ed3" dependencies = [ - "proc-macro2", + "proc-macro2 1.0.78", "pyo3-macros-backend", - "quote", + "quote 1.0.35", "syn 2.0.50", ] @@ -2669,8 +2960,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0fc910d4851847827daf9d6cdd4a823fbdaab5b8818325c5e97a86da79e8881f" dependencies = [ "heck", - "proc-macro2", - "quote", + "proc-macro2 1.0.78", + "quote 1.0.35", "syn 2.0.50", ] @@ -2715,18 +3006,27 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b22a693222d716a9587786f37ac3f6b4faedb5b80c23914e7303ff5a1d8016e9" dependencies = [ - "proc-macro2", - "quote", + "proc-macro2 1.0.78", + "quote 1.0.35", "syn 1.0.109", ] +[[package]] +name = "quote" +version = "0.6.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ce23b6b870e8f94f81fb0a363d65d86675884b34a09043c81e5562f11c1f8e1" +dependencies = [ + "proc-macro2 0.4.30", +] + [[package]] name = "quote" version = "1.0.35" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef" dependencies = [ - "proc-macro2", + "proc-macro2 1.0.78", ] [[package]] @@ -2739,7 +3039,26 @@ dependencies = [ "libc", "rand_core 0.3.1", "rdrand", - "winapi", + "winapi 0.3.9", +] + +[[package]] +name = "rand" +version = "0.6.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d71dacdc3c88c1fde3885a3be3fbab9f35724e6ce99467f7d9c5026132184ca" +dependencies = [ + "autocfg 0.1.8", + "libc", + "rand_chacha 0.1.1", + "rand_core 0.4.2", + "rand_hc 0.1.0", + "rand_isaac", + "rand_jitter", + "rand_os", + "rand_pcg", + "rand_xorshift 0.1.1", + "winapi 0.3.9", ] [[package]] @@ -2752,7 +3071,7 @@ dependencies = [ "libc", "rand_chacha 0.2.2", "rand_core 0.5.1", - "rand_hc", + "rand_hc 0.2.0", ] [[package]] @@ -2766,6 +3085,16 @@ dependencies = [ "rand_core 0.6.4", ] +[[package]] +name = "rand_chacha" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "556d3a1ca6600bfcbab7c7c91ccb085ac7fbbcd70e008a98742e7847f4f7bcef" +dependencies = [ + "autocfg 0.1.8", + "rand_core 0.3.1", +] + [[package]] name = "rand_chacha" version = "0.2.2" @@ -2829,6 +3158,15 @@ dependencies = [ "rand 0.8.5", ] +[[package]] +name = "rand_hc" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b40677c7be09ae76218dc623efbf7b18e34bced3f38883af07bb75630a21bc4" +dependencies = [ + "rand_core 0.3.1", +] + [[package]] name = "rand_hc" version = "0.2.0" @@ -2838,6 +3176,59 @@ dependencies = [ "rand_core 0.5.1", ] +[[package]] +name = "rand_isaac" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ded997c9d5f13925be2a6fd7e66bf1872597f759fd9dd93513dd7e92e5a5ee08" +dependencies = [ + "rand_core 0.3.1", +] + +[[package]] +name = "rand_jitter" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1166d5c91dc97b88d1decc3285bb0a99ed84b05cfd0bc2341bdf2d43fc41e39b" +dependencies = [ + "libc", + "rand_core 0.4.2", + "winapi 0.3.9", +] + +[[package]] +name = "rand_os" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b75f676a1e053fc562eafbb47838d67c84801e38fc1ba459e8f180deabd5071" +dependencies = [ + "cloudabi", + "fuchsia-cprng", + "libc", + "rand_core 0.4.2", + "rdrand", + "winapi 0.3.9", +] + +[[package]] +name = "rand_pcg" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "abf9b09b01790cfe0364f52bf32995ea3c39f4d2dd011eac241d2914146d0b44" +dependencies = [ + "autocfg 0.1.8", + "rand_core 0.4.2", +] + +[[package]] +name = "rand_xorshift" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cbf7e9e623549b0e21f6e97cf8ecf247c1a8fd2e8a992ae265314300b2455d5c" +dependencies = [ + "rand_core 0.3.1", +] + [[package]] name = "rand_xorshift" version = "0.3.0" @@ -2907,6 +3298,7 @@ name = "raphtory-benchmark" version = "0.7.0" dependencies = [ "criterion", + "faiss", "rand 0.8.5", "raphtory", "rayon", @@ -3070,7 +3462,7 @@ version = "0.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3acd125665422973a33ac9d3dd2df85edad0f4ae9b00dafb1a05e43a9f5ef8e7" dependencies = [ - "winapi", + "winapi 0.3.9", ] [[package]] @@ -3161,7 +3553,7 @@ dependencies = [ "spin 0.5.2", "untrusted 0.7.1", "web-sys", - "winapi", + "winapi 0.3.9", ] [[package]] @@ -3179,6 +3571,45 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "ritual" +version = "0.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3e21346a0a16287a0ff14d9956a8d1261728af3f2f6f83a8cafa5195d29669a1" +dependencies = [ + "clang", + "derive_more", + "itertools 0.8.2", + "log", + "pathdiff 0.1.0", + "regex", + "ritual_common", + "select", + "serde", + "serde_derive", + "tempdir", +] + +[[package]] +name = "ritual_common" +version = "0.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41a82c336d1b2b71264758d23d8dde602c1558310eb893c3230d7f5f49933d7f" +dependencies = [ + "bincode", + "failure", + "itertools 0.8.2", + "lazy_static", + "log", + "num_cpus", + "regex", + "serde", + "serde_derive", + "serde_json", + "term-painter", + "toml 0.4.10", +] + [[package]] name = "ron" version = "0.7.1" @@ -3222,13 +3653,22 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" +[[package]] +name = "rustc_version" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "138e3e0acb6c9fb258b19b67cb8abd63c00679d2851805ea151465464fe9030a" +dependencies = [ + "semver 0.9.0", +] + [[package]] name = "rustc_version" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366" dependencies = [ - "semver", + "semver 1.0.22", ] [[package]] @@ -3394,12 +3834,37 @@ dependencies = [ "libc", ] +[[package]] +name = "select" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac645958c62108d11f90f8d34e4dc2799c838fc995ed4c2075867a2a8d5be76b" +dependencies = [ + "bit-set", + "html5ever", +] + +[[package]] +name = "semver" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d7eb9ef2c18661902cc47e535f9bc51b78acd254da71d375c2f6720d9a40403" +dependencies = [ + "semver-parser", +] + [[package]] name = "semver" version = "1.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "92d43fe69e652f3df9bdc2b85b2854a0825b86e4fb76bc44d945137d053639ca" +[[package]] +name = "semver-parser" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3" + [[package]] name = "serde" version = "1.0.197" @@ -3426,8 +3891,8 @@ version = "1.0.197" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7eb0b34b42edc17f6b7cac84a52a1c5f0e1bb2227e997ca9011ea3dd34e8610b" dependencies = [ - "proc-macro2", - "quote", + "proc-macro2 1.0.78", + "quote 1.0.35", "syn 2.0.50", ] @@ -3509,6 +3974,12 @@ version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f27f6278552951f1f2b8cf9da965d10969b2efdea95a6ec47987ab46edfe263a" +[[package]] +name = "siphasher" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b8de496cf83d4ed58b6be86c3a275b8602f6ffe98d3024a869e124147a9a3ac" + [[package]] name = "sketches-ddsketch" version = "0.2.2" @@ -3524,7 +3995,7 @@ version = "0.4.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8f92a496fb766b417c996b9c5e57daf2f7ad3b0bebe1ccfca4856390e3d3bb67" dependencies = [ - "autocfg", + "autocfg 1.1.0", ] [[package]] @@ -3585,6 +4056,40 @@ dependencies = [ "num-traits", ] +[[package]] +name = "string_cache" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "89c058a82f9fd69b1becf8c274f412281038877c553182f1d02eb027045a2d67" +dependencies = [ + "lazy_static", + "new_debug_unreachable", + "phf_shared", + "precomputed-hash", + "serde", + "string_cache_codegen", + "string_cache_shared", +] + +[[package]] +name = "string_cache_codegen" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0f45ed1b65bf9a4bf2f7b7dc59212d1926e9eaf00fa998988e420fd124467c6" +dependencies = [ + "phf_generator", + "phf_shared", + "proc-macro2 1.0.78", + "quote 1.0.35", + "string_cache_shared", +] + +[[package]] +name = "string_cache_shared" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1884d1bc09741d466d9b14e6d37ac89d6909cbcac41dd9ae982d4d063bbedfc" + [[package]] name = "strsim" version = "0.10.0" @@ -3613,8 +4118,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "23dc1fa9ac9c169a78ba62f0b841814b7abae11bdd047b9c58f893439e309ea0" dependencies = [ "heck", - "proc-macro2", - "quote", + "proc-macro2 1.0.78", + "quote 1.0.35", "rustversion", "syn 2.0.50", ] @@ -3625,14 +4130,25 @@ version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc" +[[package]] +name = "syn" +version = "0.15.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ca4b3b69a77cbe1ffc9e198781b7acb0c7365a883670e8f1c1bc66fba79a5c5" +dependencies = [ + "proc-macro2 0.4.30", + "quote 0.6.13", + "unicode-xid 0.1.0", +] + [[package]] name = "syn" version = "1.0.109" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" dependencies = [ - "proc-macro2", - "quote", + "proc-macro2 1.0.78", + "quote 1.0.35", "unicode-ident", ] @@ -3642,8 +4158,8 @@ version = "2.0.50" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "74f1bdc9872430ce9b75da68329d1c1746faf50ffac5f19e02b71e37ff881ffb" dependencies = [ - "proc-macro2", - "quote", + "proc-macro2 1.0.78", + "quote 1.0.35", "unicode-ident", ] @@ -3653,8 +4169,8 @@ version = "0.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fea305d57546cc8cd04feb14b62ec84bf17f50e3f7b12560d7bfa9265f39d9ed" dependencies = [ - "proc-macro2", - "quote", + "proc-macro2 1.0.78", + "quote 1.0.35", "syn 1.0.109", ] @@ -3664,6 +4180,18 @@ version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2047c6ded9c721764247e62cd3b03c09ffc529b2ba5b10ec482ae507a4a70160" +[[package]] +name = "synstructure" +version = "0.12.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f36bdaa60a83aca3921b5259d5400cbf5e90fc51931376a9bd4a0eb79aa7210f" +dependencies = [ + "proc-macro2 1.0.78", + "quote 1.0.35", + "syn 1.0.109", + "unicode-xid 0.2.4", +] + [[package]] name = "system-configuration" version = "0.5.1" @@ -3734,7 +4262,7 @@ dependencies = [ "thiserror", "time", "uuid", - "winapi", + "winapi 0.3.9", ] [[package]] @@ -3864,6 +4392,36 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "tendril" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d24a120c5fc464a3458240ee02c299ebcb9d67b5249c8848b09d639dca8d7bb0" +dependencies = [ + "futf", + "mac", + "utf-8", +] + +[[package]] +name = "term" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa63644f74ce96fbeb9b794f66aff2a52d601cbd5e80f4b97123e3899f4570f1" +dependencies = [ + "kernel32-sys", + "winapi 0.2.8", +] + +[[package]] +name = "term-painter" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dcaa948f0e3e38470cd8dc8dcfe561a75c9e43f28075bb183845be2b9b3c08cf" +dependencies = [ + "term", +] + [[package]] name = "terminal_size" version = "0.3.0" @@ -3889,8 +4447,8 @@ version = "1.0.57" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a953cb265bef375dae3de6663da4d3804eee9682ea80d8e2542529b73c531c81" dependencies = [ - "proc-macro2", - "quote", + "proc-macro2 1.0.78", + "quote 1.0.35", "syn 2.0.50", ] @@ -4007,8 +4565,8 @@ version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b" dependencies = [ - "proc-macro2", - "quote", + "proc-macro2 1.0.78", + "quote 1.0.35", "syn 2.0.50", ] @@ -4060,6 +4618,15 @@ dependencies = [ "tracing", ] +[[package]] +name = "toml" +version = "0.4.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "758664fc71a3a69038656bee8b6be6477d2a6c315a6b81f7081f591bffa4111f" +dependencies = [ + "serde", +] + [[package]] name = "toml" version = "0.5.11" @@ -4137,8 +4704,8 @@ version = "0.1.27" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" dependencies = [ - "proc-macro2", - "quote", + "proc-macro2 1.0.78", + "quote 1.0.35", "syn 2.0.50", ] @@ -4292,6 +4859,18 @@ dependencies = [ "tinyvec", ] +[[package]] +name = "unicode-xid" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc72304796d0818e357ead4e000d19c9c174ab23dc11093ac919054d20a6a7fc" + +[[package]] +name = "unicode-xid" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f962df74c8c05a667b5ee8bcf162993134c104e96440b663c8daa176dc772d8c" + [[package]] name = "unindent" version = "0.2.3" @@ -4426,8 +5005,8 @@ dependencies = [ "bumpalo", "log", "once_cell", - "proc-macro2", - "quote", + "proc-macro2 1.0.78", + "quote 1.0.35", "syn 2.0.50", "wasm-bindgen-shared", ] @@ -4450,7 +5029,7 @@ version = "0.2.91" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b30af9e2d358182b5c7449424f017eba305ed32a7010509ede96cdc4696c46ed" dependencies = [ - "quote", + "quote 1.0.35", "wasm-bindgen-macro-support", ] @@ -4460,8 +5039,8 @@ version = "0.2.91" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "642f325be6301eb8107a83d12a8ac6c1e1c54345a7ef1a9261962dfefda09e66" dependencies = [ - "proc-macro2", - "quote", + "proc-macro2 1.0.78", + "quote 1.0.35", "syn 2.0.50", "wasm-bindgen-backend", "wasm-bindgen-shared", @@ -4493,8 +5072,8 @@ version = "0.3.41" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a5211b7550606857312bba1d978a8ec75692eae187becc5e680444fffc5e6f89" dependencies = [ - "proc-macro2", - "quote", + "proc-macro2 1.0.78", + "quote 1.0.35", "syn 2.0.50", ] @@ -4555,7 +5134,7 @@ dependencies = [ "cfg-if 0.1.10", "libc", "memory_units", - "winapi", + "winapi 0.3.9", ] [[package]] @@ -4564,6 +5143,12 @@ version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "495ec47bf3c1345005f40724f0269362c8556cbc43aed0526ed44cae1d35fceb" +[[package]] +name = "winapi" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "167dc9d6949a9b857f3451275e911c3f44255842c1f7a76f33c55103a909087a" + [[package]] name = "winapi" version = "0.3.9" @@ -4574,6 +5159,12 @@ dependencies = [ "winapi-x86_64-pc-windows-gnu", ] +[[package]] +name = "winapi-build" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d315eee3b34aca4797b2da6b13ed88266e6d612562a0c46390af8299fc699bc" + [[package]] name = "winapi-i686-pc-windows-gnu" version = "0.4.0" @@ -4586,7 +5177,7 @@ version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f29e6f9198ba0d26b4c9f07dbe6f9ed633e1f3d5b8b414090084349e46a52596" dependencies = [ - "winapi", + "winapi 0.3.9", ] [[package]] @@ -4805,8 +5396,8 @@ version = "0.7.32" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9ce1b18ccd8e73a9321186f97e46f9f04b778851177567b1975109d26a08d2a6" dependencies = [ - "proc-macro2", - "quote", + "proc-macro2 1.0.78", + "quote 1.0.35", "syn 2.0.50", ] diff --git a/disk-faiss/src/lib.rs b/disk-faiss/src/lib.rs index 26b7a025f9..2848c0ac30 100644 --- a/disk-faiss/src/lib.rs +++ b/disk-faiss/src/lib.rs @@ -26,6 +26,7 @@ pub fn merge_ondisk(index: &str, shards: Vec<&str>, ivfdata: &str, output: &str) .collect(); let shards: Vec<_> = shards.iter().map(|shard| shard.as_ptr()).collect(); let shards = &shards; + let num_shards: u32 = shards.len() as u32; let ivfdata = std::ffi::CString::new(ivfdata).unwrap(); let ivfdata = ivfdata.as_ptr(); @@ -34,46 +35,59 @@ pub fn merge_ondisk(index: &str, shards: Vec<&str>, ivfdata: &str, output: &str) let output = output.as_ptr(); unsafe { - cpp!([index_path as "const char *", shards as "std::vector *", ivfdata as "const char *", output as "const char *"] { - - // try { - faiss::IndexIVFFlat* index = (faiss::IndexIVFFlat*) faiss::read_index(index_path, 0); // TODO: review: 0???????????????????? - // auto index_ref = dynamic_cast(index); - // return *index_ref ; - - std::vector ivfs; - for (const auto& shard : *shards) { - faiss::IndexIVFFlat* ivf = (faiss::IndexIVFFlat*) faiss::read_index(shard, faiss::IO_FLAG_MMAP); - ivfs.push_back(ivf); - } - - if (index->ntotal != 0) { - std::exit(1); - } - - auto invlists = faiss::OnDiskInvertedLists( - index->nlist, index->code_size, ivfdata - ); - - // auto ivf_vector = faiss::InvertedListsPtrVector(); - // for (const auto& ivf : ivfs) { - // ivf_vector.push_back(ivf); - // } - - const faiss::InvertedLists **ivfs_data = (const faiss::InvertedLists**) ivfs.data(); - auto ntotal = invlists.merge_from(ivfs_data, ivfs.size()); // TODO: this has a verbose parameter I can use - - index->ntotal = ntotal; - index->replace_invlists(&invlists, true); - // invlists.this.disown(); ???????????????????????? - - faiss::write_index(index, output); - // } catch (const std::exception &e) { - // std::cerr << "cpp exception"; - // std::cerr << e.what(); - // } catch { - // std::cerr << "unknown exception"; + cpp!([index_path as "const char *", shards as "std::vector *", num_shards as "uint32_t", ivfdata as "const char *", output as "const char *"] { + + try { + std::cout << "----here----" << std::endl; + std::vector ivfs; + std::cout << "reading shards -> " << shards->size() << std::endl; + for (unsigned int i = 0; i < num_shards; ++i) { + // std::cout << "reading " << shard << std::endl; + const char * shard = shards->at(i); + faiss::IndexIVFFlat* ivf = (faiss::IndexIVFFlat*) faiss::read_index(shard, faiss::IO_FLAG_MMAP); + std::cout << "success reading" << std::endl; + ivfs.push_back(ivf); + std::cout << "success adding it" << std::endl; + + ivf->own_invlists = false; + delete ivf; + } + + std::cout << "---- after reading shards ----" << std::endl; + faiss::IndexIVFFlat* index = (faiss::IndexIVFFlat*) faiss::read_index(index_path); // TODO: review: 0 as second parameter???????????????????? + + if (index->ntotal != 0) { + std::exit(1); + } + std::cout << "nlist: " << index->nlist << std::endl; + std::cout << "code_size: " << index->code_size << std::endl; + std::cout << "---- about to call on disk inverted lists ----" << std::endl; + auto invlists = new faiss::OnDiskInvertedLists( + index->nlist, index->code_size, ivfdata + ); + std::cout << "----here----" << std::endl; + + // auto ivf_vector = faiss::InvertedListsPtrVector(); + // for (const auto& ivf : ivfs) { + // ivf_vector.push_back(ivf); // } + + const faiss::InvertedLists **ivfs_data = (const faiss::InvertedLists**) ivfs.data(); + auto ntotal = invlists->merge_from(ivfs_data, ivfs.size()); // TODO: this has a verbose parameter I can use + std::cout << "--here--"; + + index->ntotal = ntotal; + index->replace_invlists(invlists, true); + std::cout << "--here--"; + // invlists.this.disown(); ???????????????????????? + + faiss::write_index(index, output); + + } catch (const std::exception &e) { + std::cerr << "cpp exception" << std::endl; + std::cerr << e.what() << std::endl; + throw e; + } }) }; } diff --git a/disk-test/src/main.rs b/disk-test/src/main.rs index 698fce7bef..b77a223348 100644 --- a/disk-test/src/main.rs +++ b/disk-test/src/main.rs @@ -27,9 +27,11 @@ fn read_fvecs(file_name: &str) -> Result { }) } -fn main() { - let tmpfile = |filename: &str| "/tmp/faiss-disk-test/".to_owned() + filename; +fn tmpfile>(filename: S) -> String { + return "/tmp/faiss-disk-test/".to_owned() + filename.as_ref(); +} +fn main() { println!("Training index"); let FVecsContent { dimensions, @@ -40,6 +42,10 @@ fn main() { // println!("sample -> {sample:?}"); let mut index = index_factory(dimensions, "IVF4096,Flat", MetricType::InnerProduct).unwrap(); index.train(vectors.as_slice()).unwrap(); + + let index = index.into_ivf_flat().unwrap(); + dbg!(&index.nlist()); + dbg!(&index.ntotal()); write_index(&index, tmpfile("trained.index")).unwrap(); println!("Splitting vectors into files"); @@ -51,7 +57,7 @@ fn main() { let num_vectors = vectors.len() / dimensions as usize; let vectors_per_chunk = num_vectors / num_chunks + 1; let chunk_files: Vec<_> = (0..num_chunks) - .map(|chunk_number| format!("block_{chunk_number}.index")) + .map(|chunk_number| tmpfile(format!("block_{chunk_number}.index"))) .collect(); let chunk_files: Vec<_> = chunk_files.iter().map(|f| f.as_str()).collect(); @@ -65,12 +71,12 @@ fn main() { let ids: Vec<_> = ids_range.map(|id| Idx::from(id as i64)).collect(); let mut index = read_index(tmpfile("trained.index")).unwrap(); index.add_with_ids(chunk, ids.as_slice()).unwrap(); - write_index(&index, tmpfile(filename)).unwrap(); + write_index(&index, filename).unwrap(); } println!("merging indexes on disk"); merge_ondisk( - "trained.index", + &tmpfile("trained.index"), chunk_files, &tmpfile("merged_index.ivfdata"), &tmpfile("populated.index"), diff --git a/raphtory-benchmark/Cargo.toml b/raphtory-benchmark/Cargo.toml index af298291fa..7e5b706862 100644 --- a/raphtory-benchmark/Cargo.toml +++ b/raphtory-benchmark/Cargo.toml @@ -12,6 +12,7 @@ raphtory = { path = "../raphtory", features = ["io", "vectors"] } sorted_vector_map = "0.1" rand = "0.8.5" rayon = "1" +faiss = { path = "../faiss-rs" } [[bench]] name = "tgraph_benchmarks" diff --git a/raphtory-benchmark/benches/vectors.rs b/raphtory-benchmark/benches/vectors.rs index 73ada1da54..a9efac840e 100644 --- a/raphtory-benchmark/benches/vectors.rs +++ b/raphtory-benchmark/benches/vectors.rs @@ -79,5 +79,131 @@ pub fn vectors(c: &mut Criterion) { }); } -criterion_group!(benches, vectors); +struct FVecsContent { + dimensions: u32, + vectors: Vec, +} + +use std::fs; +use std::io::Error as IoError; +use std::io::ErrorKind::InvalidData; + +fn read_fvecs(file_name: &str) -> Result { + let data = fs::read(file_name)?; + let (dim_data, vector_data) = data.split_at(4); + let dim = dim_data + .try_into() + .map_err(|e| IoError::new(InvalidData, e))?; + let dimensions = u32::from_le_bytes(dim); + let vectors: Vec<_> = vector_data + .chunks_exact(4) + .map(|chunk| f32::from_le_bytes(chunk.try_into().unwrap())) + .collect(); + + Ok(FVecsContent { + dimensions, + vectors, + }) +} + +// fn read_fvecs(file_name: &str) -> Result { +// let mut file = File::open(file_name)?; +// let mut buffer = [0u8; 4]; +// file.read_exact(&mut buffer); +// let mut cursor = Cursor::new(buffer); +// let value = cursor.read_i32::()?; +// ... +// } + +use faiss::index::io_flags::IoFlags; +use faiss::index::IndexImpl; +use faiss::index::{io::read_index_with_flags, NativeIndex}; +use faiss::{index_factory, read_index, write_index, Idx, Index, MetricType}; + +// this is based on https://github.com/facebookresearch/faiss/blob/12b92e9fa5d8e8fb3da53c57af9ff007c826b1ee/contrib/ondisk.py +fn merge_on_disk(trained_index: &IndexImpl, shard_fnames: Vec<&str>, ivfdata_fname: &str) { + // assert not isinstance( + // trained_index, faiss.IndexIVFPQR + // ), "IndexIVFPQR is not supported as an on disk index." + // + + let ivfs = shard_fnames.iter().map(|filename| { + let index = read_index_with_flags(filename, IoFlags::MEM_MAP).unwrap(); + index.into_ivf_flat().unwrap(); + }); + + let index_ivf = trained_index.into_ivf_flat().unwrap(); + + assert_eq!(trained_index.ntotal(), 0, "works only on empty index") + + // FIXME: can't figure out next line !! + // invlists = faiss.OnDiskInvertedLists( + // index_ivf.nlist, index_ivf.code_size, ivfdata_fname + // ) + + // # merge all the inverted lists + // ivf_vector = faiss.InvertedListsPtrVector() + // for ivf in ivfs: + // ivf_vector.push_back(ivf) + + // LOG.info("merge %d inverted lists " % ivf_vector.size()) + // ntotal = invlists.merge_from(ivf_vector.data(), ivf_vector.size()) + + // # now replace the inverted lists in the output index + // index.ntotal = index_ivf.ntotal = ntotal + // index_ivf.replace_invlists(invlists, True) + // invlists.this.disown() +} + +pub fn faiss(c: &mut Criterion) { + let rt = Runtime::new().unwrap(); + + let tmpfile = |filename: &str| "/tmp/faiss-disk-test/".to_owned() + filename; + + println!("Training index"); + let FVecsContent { + dimensions, + vectors, + } = read_fvecs("resources/sift/sift_learn.fvecs").unwrap(); + // println!("dimensions -> {dimensions}"); + // let sample: Vec<_> = flattened_vectors.iter().take(8).collect(); + // println!("sample -> {sample:?}"); + let mut index = index_factory(dimensions, "IVF4096,Flat", MetricType::InnerProduct).unwrap(); + index.train(vectors.as_slice()).unwrap(); + write_index(&index, tmpfile("trained.index")).unwrap(); + + println!("Splitting vectors into files"); + let vectors = read_fvecs("resources/sift/sift_base.fvecs") + .unwrap() + .vectors; + + let num_vectors = vectors.len() / dimensions as usize; + let vectors_per_chunk = num_vectors / 4 + 1; + + for (chunk_number, chunk) in vectors + .chunks(vectors_per_chunk * dimensions as usize) + .enumerate() + { + let first_id = vectors_per_chunk * chunk_number; + let ids_range = first_id..(first_id + chunk.len()); + let ids: Vec<_> = ids_range.map(|id| Idx::from(id as i64)).collect(); + let mut index = read_index(tmpfile("trained.index")).unwrap(); + index.add_with_ids(chunk, ids.as_slice()).unwrap(); + let block_name = format!("block_{chunk_number}.index"); + write_index(&index, tmpfile(block_name.as_str())).unwrap(); + } + + println!("loading trained index"); + let index = read_index(tmpfile("trained.index")).unwrap(); + + // let block_fnames: Vec<_> = (0..4).map(|n_chunk| tmpfile(format!("block_{n_chunk}.index").as_str())).collect(); + // merge_ondisk(index, block_fnames, tmpdir + "merged_index.ivfdata").unwrap(); + // write_index(&index, tmpdir + "populated.index").unwrap(); + + // c.bench_function("faiss-index", |b| { + // b.iter(|| ()); + // }); +} + +criterion_group!(benches, faiss); criterion_main!(benches); From b348be30a2558d34d242b35bac138b237b237dd0 Mon Sep 17 00:00:00 2001 From: Pedro Rico Pinazo Date: Thu, 7 Mar 2024 12:17:41 +0000 Subject: [PATCH 7/9] add file chunking to bench against large input files --- Cargo.lock | 696 +++++-------------------------------- Cargo.toml | 1 - disk-faiss/src/lib.rs | 91 ++--- disk-test/Cargo.toml | 8 + disk-test/benches/bench.rs | 239 +++++++++++++ disk-test/src/lib.rs | 83 +++++ disk-test/src/main.rs | 49 +-- 7 files changed, 482 insertions(+), 685 deletions(-) create mode 100644 disk-test/benches/bench.rs create mode 100644 disk-test/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index fb7a4023b2..b43a09a63f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -169,7 +169,7 @@ dependencies = [ "hash_hasher", "hashbrown 0.14.3", "num-traits", - "rustc_version 0.4.0", + "rustc_version", "simdutf8", ] @@ -231,8 +231,8 @@ dependencies = [ "async-graphql-parser", "darling 0.20.6", "proc-macro-crate 1.3.1", - "proc-macro2 1.0.78", - "quote 1.0.35", + "proc-macro2", + "quote", "strum", "syn 2.0.50", "thiserror", @@ -320,8 +320,8 @@ version = "0.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193" dependencies = [ - "proc-macro2 1.0.78", - "quote 1.0.35", + "proc-macro2", + "quote", "syn 2.0.50", ] @@ -331,20 +331,11 @@ version = "0.1.77" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c980ee35e870bd1a4d2c8294d4c04d0499e67bca1e4b5cefcc693c2fa00caea9" dependencies = [ - "proc-macro2 1.0.78", - "quote 1.0.35", + "proc-macro2", + "quote", "syn 2.0.50", ] -[[package]] -name = "autocfg" -version = "0.1.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0dde43e75fd43e8a1bf86103336bc699aa8d17ad1be60c76c0bdfd4828e19b78" -dependencies = [ - "autocfg 1.1.0", -] - [[package]] name = "autocfg" version = "1.1.0" @@ -473,8 +464,8 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "965ab7eb5f8f97d2a083c799f3a1b994fc397b2fe2da5d1da1626ce15a39f2b1" dependencies = [ - "proc-macro2 1.0.78", - "quote 1.0.35", + "proc-macro2", + "quote", "syn 2.0.50", ] @@ -599,27 +590,6 @@ dependencies = [ "inout", ] -[[package]] -name = "clang" -version = "0.20.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26c4a9fd36cd08afe44217c95f31698862e48bee3380cbc60184d95c91abf996" -dependencies = [ - "clang-sys", - "lazy_static", - "libc", -] - -[[package]] -name = "clang-sys" -version = "0.23.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d7f7c04e52c35222fffcc3a115b5daf5f7e2bfb71c13c4e2321afe1fc71859c2" -dependencies = [ - "glob 0.2.11", - "libc", -] - [[package]] name = "clap" version = "4.5.1" @@ -649,8 +619,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "307bc0538d5f0f83b8248db3087aa92fe504e4691294d0c96c0eabc33f47ba47" dependencies = [ "heck", - "proc-macro2 1.0.78", - "quote 1.0.35", + "proc-macro2", + "quote", "syn 2.0.50", ] @@ -660,15 +630,6 @@ version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "98cc8fbded0c607b7ba9dd60cd98df59af97e84d24e49c8557331cfc26d301ce" -[[package]] -name = "cloudabi" -version = "0.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ddfc5b9aa5d4507acaf872de71051dfd0e309860e88966e1051e462a077aac4f" -dependencies = [ - "bitflags 1.3.2", -] - [[package]] name = "cmake" version = "0.1.50" @@ -694,7 +655,7 @@ dependencies = [ "json5", "lazy_static", "nom", - "pathdiff 0.2.1", + "pathdiff", "ron", "rust-ini", "serde", @@ -753,10 +714,10 @@ dependencies = [ "cc", "cpp_common", "lazy_static", - "proc-macro2 1.0.78", + "proc-macro2", "regex", "syn 2.0.50", - "unicode-xid 0.2.4", + "unicode-xid", ] [[package]] @@ -766,7 +727,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3e1a2532e4ed4ea13031c13bc7bc0dbca4aae32df48e9d77f0d1e743179f2ea1" dependencies = [ "lazy_static", - "proc-macro2 1.0.78", + "proc-macro2", "syn 2.0.50", ] @@ -780,8 +741,8 @@ dependencies = [ "byteorder", "cpp_common", "lazy_static", - "proc-macro2 1.0.78", - "quote 1.0.35", + "proc-macro2", + "quote", "syn 2.0.50", ] @@ -940,8 +901,8 @@ checksum = "109c1ca6e6b7f82cc233a97004ea8ed7ca123a9af07a8230878fcfda9b158bf0" dependencies = [ "fnv", "ident_case", - "proc-macro2 1.0.78", - "quote 1.0.35", + "proc-macro2", + "quote", "strsim 0.10.0", "syn 1.0.109", ] @@ -954,8 +915,8 @@ checksum = "33043dcd19068b8192064c704b3f83eb464f91f1ff527b44a4e2b08d9cdb8855" dependencies = [ "fnv", "ident_case", - "proc-macro2 1.0.78", - "quote 1.0.35", + "proc-macro2", + "quote", "strsim 0.10.0", "syn 2.0.50", ] @@ -967,7 +928,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a4aab4dbc9f7611d8b55048a3a16d2d010c2c8334e46304b40ac1cc14bf3b48e" dependencies = [ "darling_core 0.14.4", - "quote 1.0.35", + "quote", "syn 1.0.109", ] @@ -978,7 +939,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c5a91391accf613803c2a9bf9abccdbaa07c54b4244a5b64883f9c3c137c86be" dependencies = [ "darling_core 0.20.6", - "quote 1.0.35", + "quote", "syn 2.0.50", ] @@ -1047,8 +1008,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c11bdc11a0c47bc7d37d582b5285da6849c96681023680b906673c5707af7b0f" dependencies = [ "darling 0.14.4", - "proc-macro2 1.0.78", - "quote 1.0.35", + "proc-macro2", + "quote", "syn 1.0.109", ] @@ -1062,18 +1023,6 @@ dependencies = [ "syn 1.0.109", ] -[[package]] -name = "derive_more" -version = "0.13.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f57d78cf3bd45270dad4e70c21ec77a960b36c7a841ff9db76aaa775a8fb871" -dependencies = [ - "proc-macro2 0.4.30", - "quote 0.6.13", - "rustc_version 0.2.3", - "syn 0.15.44", -] - [[package]] name = "diff" version = "0.1.13" @@ -1103,8 +1052,10 @@ dependencies = [ name = "disk-test" version = "0.7.0" dependencies = [ + "criterion", "disk-faiss", "faiss", + "rand 0.8.5", ] [[package]] @@ -1157,8 +1108,8 @@ dependencies = [ "Inflector", "darling 0.20.6", "proc-macro-crate 1.3.1", - "proc-macro2 1.0.78", - "quote 1.0.35", + "proc-macro2", + "quote", "syn 2.0.50", "thiserror", ] @@ -1185,8 +1136,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8f33313078bb8d4d05a2733a94ac4c2d8a0df9a2b84424ebf4f33bfc224a890e" dependencies = [ "once_cell", - "proc-macro2 1.0.78", - "quote 1.0.35", + "proc-macro2", + "quote", "syn 2.0.50", ] @@ -1256,28 +1207,6 @@ dependencies = [ "serde", ] -[[package]] -name = "failure" -version = "0.1.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d32e9bd16cc02eae7db7ef620b392808b89f6a5e16bb3497d159c6b92a0f4f86" -dependencies = [ - "backtrace", - "failure_derive", -] - -[[package]] -name = "failure_derive" -version = "0.1.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa4da3c766cd7a0db8242e326e9e4e081edd567072893ed320008189715366a4" -dependencies = [ - "proc-macro2 1.0.78", - "quote 1.0.35", - "syn 1.0.109", - "synstructure", -] - [[package]] name = "faiss" version = "0.12.2-alpha.0" @@ -1372,16 +1301,6 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a06f77d526c1a601b7c4cdd98f54b5eaabffc14d5f2f0296febdc7f357c6d3ba" -[[package]] -name = "futf" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843" -dependencies = [ - "mac", - "new_debug_unreachable", -] - [[package]] name = "futures" version = "0.3.30" @@ -1436,8 +1355,8 @@ version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" dependencies = [ - "proc-macro2 1.0.78", - "quote 1.0.35", + "proc-macro2", + "quote", "syn 2.0.50", ] @@ -1502,8 +1421,8 @@ checksum = "784f84eebc366e15251c4a8c3acee82a6a6f427949776ecb88377362a9621738" dependencies = [ "proc-macro-error", "proc-macro-hack", - "proc-macro2 1.0.78", - "quote 1.0.35", + "proc-macro2", + "quote", "syn 1.0.109", ] @@ -1566,12 +1485,6 @@ version = "0.25.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "151665d9be52f9bb40fc7966565d39666f2d1e69233571b71b87791c7e0528b3" -[[package]] -name = "glob" -version = "0.2.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8be18de09a56b60ed0edf84bc9df007e30040691af7acd1c41874faac5895bfb" - [[package]] name = "glob" version = "0.3.1" @@ -1691,20 +1604,6 @@ dependencies = [ "digest", ] -[[package]] -name = "html5ever" -version = "0.23.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ce65ac8028cf5a287a7dbf6c4e0a6cf2dcf022ed5b167a81bae66ebf599a8b7" -dependencies = [ - "log", - "mac", - "markup5ever", - "proc-macro2 0.4.30", - "quote 0.6.13", - "syn 0.15.44", -] - [[package]] name = "htmlescape" version = "0.3.1" @@ -1889,15 +1788,6 @@ dependencies = [ "windows-sys 0.52.0", ] -[[package]] -name = "itertools" -version = "0.8.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f56a2d0bc861f9165be4eb3442afd3c236d8a98afd426f65d92324ae1091a484" -dependencies = [ - "either", -] - [[package]] name = "itertools" version = "0.10.5" @@ -1977,16 +1867,6 @@ dependencies = [ "windows-sys 0.52.0", ] -[[package]] -name = "kernel32-sys" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7507624b29483431c0ba2d82aece8ca6cdba9382bff4ddd0f7490560c056098d" -dependencies = [ - "winapi 0.2.8", - "winapi-build", -] - [[package]] name = "lazy_static" version = "1.4.0" @@ -2029,7 +1909,7 @@ version = "0.4.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3c168f8615b12bc01f9c17e2eb0cc07dcae1940121185446edc3744920e8ef45" dependencies = [ - "autocfg 1.1.0", + "autocfg", "scopeguard", "serde", ] @@ -2069,29 +1949,6 @@ version = "0.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "912b45c753ff5f7f5208307e8ace7d2a2e30d024e26d3509f3dce546c044ce15" -[[package]] -name = "mac" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" - -[[package]] -name = "markup5ever" -version = "0.8.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1af46a727284117e09780d05038b1ce6fc9c76cc6df183c3dae5a8955a25e21" -dependencies = [ - "log", - "phf", - "phf_codegen", - "serde", - "serde_derive", - "serde_json", - "string_cache", - "string_cache_codegen", - "tendril", -] - [[package]] name = "matchers" version = "0.1.0" @@ -2132,7 +1989,7 @@ version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5a634b1c61a95585bd15607c6ab0c4e5b226e695ff2800ba0cdccddf208c406c" dependencies = [ - "autocfg 1.1.0", + "autocfg", ] [[package]] @@ -2234,7 +2091,7 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2cf52bfa6042343585f458f38f18094a53e8d8c417221867918e9f0a6885f42a" dependencies = [ - "quote 1.0.35", + "quote", "syn 1.0.109", ] @@ -2247,12 +2104,6 @@ dependencies = [ "raphtory", ] -[[package]] -name = "new_debug_unreachable" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e4a24736216ec316047a1fc4252e27dabb04218aa4a3f37c6e7ddbf1f9782b54" - [[package]] name = "nix" version = "0.27.1" @@ -2281,7 +2132,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84" dependencies = [ "overload", - "winapi 0.3.9", + "winapi", ] [[package]] @@ -2304,7 +2155,7 @@ version = "0.4.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "608e7659b5c3d7cba262d894801b9ec9d00de989e8a82bd4bef91d08da45cdc0" dependencies = [ - "autocfg 1.1.0", + "autocfg", "num-integer", "num-traits", ] @@ -2339,7 +2190,7 @@ version = "0.1.44" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d869c01cc0c455284163fd0092f1f93835385ccab5a98a0dcc497b2f8bf055a9" dependencies = [ - "autocfg 1.1.0", + "autocfg", "num-integer", "num-traits", ] @@ -2350,7 +2201,7 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0638a1c9d0a3c0914158145bc76cff373a75a627e6ecbfb71cbe6f453a5a19b0" dependencies = [ - "autocfg 1.1.0", + "autocfg", "num-bigint", "num-integer", "num-traits", @@ -2362,7 +2213,7 @@ version = "0.2.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "da0df0e5185db44f69b44f26786fe401b6c293d1907744beaa7fa62b2e5a517a" dependencies = [ - "autocfg 1.1.0", + "autocfg", "libm", ] @@ -2464,7 +2315,7 @@ dependencies = [ "futures-channel", "futures-executor", "futures-util", - "glob 0.3.1", + "glob", "once_cell", "opentelemetry", "ordered-float 4.2.0", @@ -2561,12 +2412,6 @@ dependencies = [ "subtle", ] -[[package]] -name = "pathdiff" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a3bf70094d203e07844da868b634207e71bfab254fe713171fae9a6e751ccf31" - [[package]] name = "pathdiff" version = "0.2.1" @@ -2620,8 +2465,8 @@ checksum = "1381c29a877c6d34b8c176e734f35d7f7f5b3adaefe940cb4d1bb7af94678e2e" dependencies = [ "pest", "pest_meta", - "proc-macro2 1.0.78", - "quote 1.0.35", + "proc-macro2", + "quote", "syn 2.0.50", ] @@ -2636,44 +2481,6 @@ dependencies = [ "sha2", ] -[[package]] -name = "phf" -version = "0.7.24" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b3da44b85f8e8dfaec21adae67f95d93244b2ecf6ad2a692320598dcc8e6dd18" -dependencies = [ - "phf_shared", -] - -[[package]] -name = "phf_codegen" -version = "0.7.24" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b03e85129e324ad4166b06b2c7491ae27fe3ec353af72e72cd1654c7225d517e" -dependencies = [ - "phf_generator", - "phf_shared", -] - -[[package]] -name = "phf_generator" -version = "0.7.24" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09364cc93c159b8b06b1f4dd8a4398984503483891b0c26b867cf431fb132662" -dependencies = [ - "phf_shared", - "rand 0.6.5", -] - -[[package]] -name = "phf_shared" -version = "0.7.24" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "234f71a15de2288bcb7e3b6515828d22af7ec8598ee6d24c3b526fa0a80b67a0" -dependencies = [ - "siphasher", -] - [[package]] name = "pin-project-lite" version = "0.2.13" @@ -2760,18 +2567,11 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "42ddcf4680d8d867e1e375116203846acb088483fa2070244f90589f458bbb31" dependencies = [ "proc-macro-crate 2.0.0", - "proc-macro2 1.0.78", - "quote 1.0.35", + "proc-macro2", + "quote", "syn 2.0.50", ] -[[package]] -name = "pometry-faiss" -version = "0.7.0" -dependencies = [ - "ritual", -] - [[package]] name = "powerfmt" version = "0.2.0" @@ -2784,12 +2584,6 @@ version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" -[[package]] -name = "precomputed-hash" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" - [[package]] name = "pretty_assertions" version = "1.4.0" @@ -2826,8 +2620,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "18f33027081eba0a6d8aba6d1b1c3a3be58cbb12106341c2d5759fcd9b5277e7" dependencies = [ "proc-macro-error-attr", - "proc-macro2 1.0.78", - "quote 1.0.35", + "proc-macro2", + "quote", "syn 1.0.109", "version_check", ] @@ -2838,8 +2632,8 @@ version = "0.4.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8a5b4b77fdb63c1eca72173d68d24501c54ab1269409f6b672c85deb18af69de" dependencies = [ - "proc-macro2 1.0.78", - "quote 1.0.35", + "proc-macro2", + "quote", "syn 1.0.109", "syn-mid", "version_check", @@ -2851,15 +2645,6 @@ version = "0.5.20+deprecated" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068" -[[package]] -name = "proc-macro2" -version = "0.4.30" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cf3d2011ab5c909338f7887f4fc896d35932e29146c12c8d01da6b22a80ba759" -dependencies = [ - "unicode-xid 0.1.0", -] - [[package]] name = "proc-macro2" version = "1.0.78" @@ -2882,7 +2667,7 @@ dependencies = [ "num-traits", "rand 0.8.5", "rand_chacha 0.3.1", - "rand_xorshift 0.3.0", + "rand_xorshift", "regex-syntax 0.8.2", "rusty-fork", "tempfile", @@ -2947,9 +2732,9 @@ version = "0.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "05f738b4e40d50b5711957f142878cfa0f28e054aa0ebdfc3fd137a843f74ed3" dependencies = [ - "proc-macro2 1.0.78", + "proc-macro2", "pyo3-macros-backend", - "quote 1.0.35", + "quote", "syn 2.0.50", ] @@ -2960,8 +2745,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0fc910d4851847827daf9d6cdd4a823fbdaab5b8818325c5e97a86da79e8881f" dependencies = [ "heck", - "proc-macro2 1.0.78", - "quote 1.0.35", + "proc-macro2", + "quote", "syn 2.0.50", ] @@ -3006,27 +2791,18 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b22a693222d716a9587786f37ac3f6b4faedb5b80c23914e7303ff5a1d8016e9" dependencies = [ - "proc-macro2 1.0.78", - "quote 1.0.35", + "proc-macro2", + "quote", "syn 1.0.109", ] -[[package]] -name = "quote" -version = "0.6.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ce23b6b870e8f94f81fb0a363d65d86675884b34a09043c81e5562f11c1f8e1" -dependencies = [ - "proc-macro2 0.4.30", -] - [[package]] name = "quote" version = "1.0.35" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef" dependencies = [ - "proc-macro2 1.0.78", + "proc-macro2", ] [[package]] @@ -3039,26 +2815,7 @@ dependencies = [ "libc", "rand_core 0.3.1", "rdrand", - "winapi 0.3.9", -] - -[[package]] -name = "rand" -version = "0.6.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6d71dacdc3c88c1fde3885a3be3fbab9f35724e6ce99467f7d9c5026132184ca" -dependencies = [ - "autocfg 0.1.8", - "libc", - "rand_chacha 0.1.1", - "rand_core 0.4.2", - "rand_hc 0.1.0", - "rand_isaac", - "rand_jitter", - "rand_os", - "rand_pcg", - "rand_xorshift 0.1.1", - "winapi 0.3.9", + "winapi", ] [[package]] @@ -3071,7 +2828,7 @@ dependencies = [ "libc", "rand_chacha 0.2.2", "rand_core 0.5.1", - "rand_hc 0.2.0", + "rand_hc", ] [[package]] @@ -3085,16 +2842,6 @@ dependencies = [ "rand_core 0.6.4", ] -[[package]] -name = "rand_chacha" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "556d3a1ca6600bfcbab7c7c91ccb085ac7fbbcd70e008a98742e7847f4f7bcef" -dependencies = [ - "autocfg 0.1.8", - "rand_core 0.3.1", -] - [[package]] name = "rand_chacha" version = "0.2.2" @@ -3158,15 +2905,6 @@ dependencies = [ "rand 0.8.5", ] -[[package]] -name = "rand_hc" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b40677c7be09ae76218dc623efbf7b18e34bced3f38883af07bb75630a21bc4" -dependencies = [ - "rand_core 0.3.1", -] - [[package]] name = "rand_hc" version = "0.2.0" @@ -3176,59 +2914,6 @@ dependencies = [ "rand_core 0.5.1", ] -[[package]] -name = "rand_isaac" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ded997c9d5f13925be2a6fd7e66bf1872597f759fd9dd93513dd7e92e5a5ee08" -dependencies = [ - "rand_core 0.3.1", -] - -[[package]] -name = "rand_jitter" -version = "0.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1166d5c91dc97b88d1decc3285bb0a99ed84b05cfd0bc2341bdf2d43fc41e39b" -dependencies = [ - "libc", - "rand_core 0.4.2", - "winapi 0.3.9", -] - -[[package]] -name = "rand_os" -version = "0.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b75f676a1e053fc562eafbb47838d67c84801e38fc1ba459e8f180deabd5071" -dependencies = [ - "cloudabi", - "fuchsia-cprng", - "libc", - "rand_core 0.4.2", - "rdrand", - "winapi 0.3.9", -] - -[[package]] -name = "rand_pcg" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "abf9b09b01790cfe0364f52bf32995ea3c39f4d2dd011eac241d2914146d0b44" -dependencies = [ - "autocfg 0.1.8", - "rand_core 0.4.2", -] - -[[package]] -name = "rand_xorshift" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cbf7e9e623549b0e21f6e97cf8ecf247c1a8fd2e8a992ae265314300b2455d5c" -dependencies = [ - "rand_core 0.3.1", -] - [[package]] name = "rand_xorshift" version = "0.3.0" @@ -3462,7 +3147,7 @@ version = "0.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3acd125665422973a33ac9d3dd2df85edad0f4ae9b00dafb1a05e43a9f5ef8e7" dependencies = [ - "winapi 0.3.9", + "winapi", ] [[package]] @@ -3553,7 +3238,7 @@ dependencies = [ "spin 0.5.2", "untrusted 0.7.1", "web-sys", - "winapi 0.3.9", + "winapi", ] [[package]] @@ -3571,45 +3256,6 @@ dependencies = [ "windows-sys 0.52.0", ] -[[package]] -name = "ritual" -version = "0.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3e21346a0a16287a0ff14d9956a8d1261728af3f2f6f83a8cafa5195d29669a1" -dependencies = [ - "clang", - "derive_more", - "itertools 0.8.2", - "log", - "pathdiff 0.1.0", - "regex", - "ritual_common", - "select", - "serde", - "serde_derive", - "tempdir", -] - -[[package]] -name = "ritual_common" -version = "0.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41a82c336d1b2b71264758d23d8dde602c1558310eb893c3230d7f5f49933d7f" -dependencies = [ - "bincode", - "failure", - "itertools 0.8.2", - "lazy_static", - "log", - "num_cpus", - "regex", - "serde", - "serde_derive", - "serde_json", - "term-painter", - "toml 0.4.10", -] - [[package]] name = "ron" version = "0.7.1" @@ -3653,22 +3299,13 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" -[[package]] -name = "rustc_version" -version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "138e3e0acb6c9fb258b19b67cb8abd63c00679d2851805ea151465464fe9030a" -dependencies = [ - "semver 0.9.0", -] - [[package]] name = "rustc_version" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366" dependencies = [ - "semver 1.0.22", + "semver", ] [[package]] @@ -3834,37 +3471,12 @@ dependencies = [ "libc", ] -[[package]] -name = "select" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac645958c62108d11f90f8d34e4dc2799c838fc995ed4c2075867a2a8d5be76b" -dependencies = [ - "bit-set", - "html5ever", -] - -[[package]] -name = "semver" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d7eb9ef2c18661902cc47e535f9bc51b78acd254da71d375c2f6720d9a40403" -dependencies = [ - "semver-parser", -] - [[package]] name = "semver" version = "1.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "92d43fe69e652f3df9bdc2b85b2854a0825b86e4fb76bc44d945137d053639ca" -[[package]] -name = "semver-parser" -version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3" - [[package]] name = "serde" version = "1.0.197" @@ -3891,8 +3503,8 @@ version = "1.0.197" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7eb0b34b42edc17f6b7cac84a52a1c5f0e1bb2227e997ca9011ea3dd34e8610b" dependencies = [ - "proc-macro2 1.0.78", - "quote 1.0.35", + "proc-macro2", + "quote", "syn 2.0.50", ] @@ -3974,12 +3586,6 @@ version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f27f6278552951f1f2b8cf9da965d10969b2efdea95a6ec47987ab46edfe263a" -[[package]] -name = "siphasher" -version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b8de496cf83d4ed58b6be86c3a275b8602f6ffe98d3024a869e124147a9a3ac" - [[package]] name = "sketches-ddsketch" version = "0.2.2" @@ -3995,7 +3601,7 @@ version = "0.4.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8f92a496fb766b417c996b9c5e57daf2f7ad3b0bebe1ccfca4856390e3d3bb67" dependencies = [ - "autocfg 1.1.0", + "autocfg", ] [[package]] @@ -4056,40 +3662,6 @@ dependencies = [ "num-traits", ] -[[package]] -name = "string_cache" -version = "0.7.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89c058a82f9fd69b1becf8c274f412281038877c553182f1d02eb027045a2d67" -dependencies = [ - "lazy_static", - "new_debug_unreachable", - "phf_shared", - "precomputed-hash", - "serde", - "string_cache_codegen", - "string_cache_shared", -] - -[[package]] -name = "string_cache_codegen" -version = "0.4.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0f45ed1b65bf9a4bf2f7b7dc59212d1926e9eaf00fa998988e420fd124467c6" -dependencies = [ - "phf_generator", - "phf_shared", - "proc-macro2 1.0.78", - "quote 1.0.35", - "string_cache_shared", -] - -[[package]] -name = "string_cache_shared" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1884d1bc09741d466d9b14e6d37ac89d6909cbcac41dd9ae982d4d063bbedfc" - [[package]] name = "strsim" version = "0.10.0" @@ -4118,8 +3690,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "23dc1fa9ac9c169a78ba62f0b841814b7abae11bdd047b9c58f893439e309ea0" dependencies = [ "heck", - "proc-macro2 1.0.78", - "quote 1.0.35", + "proc-macro2", + "quote", "rustversion", "syn 2.0.50", ] @@ -4130,25 +3702,14 @@ version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc" -[[package]] -name = "syn" -version = "0.15.44" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ca4b3b69a77cbe1ffc9e198781b7acb0c7365a883670e8f1c1bc66fba79a5c5" -dependencies = [ - "proc-macro2 0.4.30", - "quote 0.6.13", - "unicode-xid 0.1.0", -] - [[package]] name = "syn" version = "1.0.109" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" dependencies = [ - "proc-macro2 1.0.78", - "quote 1.0.35", + "proc-macro2", + "quote", "unicode-ident", ] @@ -4158,8 +3719,8 @@ version = "2.0.50" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "74f1bdc9872430ce9b75da68329d1c1746faf50ffac5f19e02b71e37ff881ffb" dependencies = [ - "proc-macro2 1.0.78", - "quote 1.0.35", + "proc-macro2", + "quote", "unicode-ident", ] @@ -4169,8 +3730,8 @@ version = "0.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fea305d57546cc8cd04feb14b62ec84bf17f50e3f7b12560d7bfa9265f39d9ed" dependencies = [ - "proc-macro2 1.0.78", - "quote 1.0.35", + "proc-macro2", + "quote", "syn 1.0.109", ] @@ -4180,18 +3741,6 @@ version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2047c6ded9c721764247e62cd3b03c09ffc529b2ba5b10ec482ae507a4a70160" -[[package]] -name = "synstructure" -version = "0.12.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f36bdaa60a83aca3921b5259d5400cbf5e90fc51931376a9bd4a0eb79aa7210f" -dependencies = [ - "proc-macro2 1.0.78", - "quote 1.0.35", - "syn 1.0.109", - "unicode-xid 0.2.4", -] - [[package]] name = "system-configuration" version = "0.5.1" @@ -4262,7 +3811,7 @@ dependencies = [ "thiserror", "time", "uuid", - "winapi 0.3.9", + "winapi", ] [[package]] @@ -4392,36 +3941,6 @@ dependencies = [ "windows-sys 0.52.0", ] -[[package]] -name = "tendril" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d24a120c5fc464a3458240ee02c299ebcb9d67b5249c8848b09d639dca8d7bb0" -dependencies = [ - "futf", - "mac", - "utf-8", -] - -[[package]] -name = "term" -version = "0.4.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa63644f74ce96fbeb9b794f66aff2a52d601cbd5e80f4b97123e3899f4570f1" -dependencies = [ - "kernel32-sys", - "winapi 0.2.8", -] - -[[package]] -name = "term-painter" -version = "0.2.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dcaa948f0e3e38470cd8dc8dcfe561a75c9e43f28075bb183845be2b9b3c08cf" -dependencies = [ - "term", -] - [[package]] name = "terminal_size" version = "0.3.0" @@ -4447,8 +3966,8 @@ version = "1.0.57" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a953cb265bef375dae3de6663da4d3804eee9682ea80d8e2542529b73c531c81" dependencies = [ - "proc-macro2 1.0.78", - "quote 1.0.35", + "proc-macro2", + "quote", "syn 2.0.50", ] @@ -4565,8 +4084,8 @@ version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b" dependencies = [ - "proc-macro2 1.0.78", - "quote 1.0.35", + "proc-macro2", + "quote", "syn 2.0.50", ] @@ -4618,15 +4137,6 @@ dependencies = [ "tracing", ] -[[package]] -name = "toml" -version = "0.4.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "758664fc71a3a69038656bee8b6be6477d2a6c315a6b81f7081f591bffa4111f" -dependencies = [ - "serde", -] - [[package]] name = "toml" version = "0.5.11" @@ -4704,8 +4214,8 @@ version = "0.1.27" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" dependencies = [ - "proc-macro2 1.0.78", - "quote 1.0.35", + "proc-macro2", + "quote", "syn 2.0.50", ] @@ -4859,12 +4369,6 @@ dependencies = [ "tinyvec", ] -[[package]] -name = "unicode-xid" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc72304796d0818e357ead4e000d19c9c174ab23dc11093ac919054d20a6a7fc" - [[package]] name = "unicode-xid" version = "0.2.4" @@ -5005,8 +4509,8 @@ dependencies = [ "bumpalo", "log", "once_cell", - "proc-macro2 1.0.78", - "quote 1.0.35", + "proc-macro2", + "quote", "syn 2.0.50", "wasm-bindgen-shared", ] @@ -5029,7 +4533,7 @@ version = "0.2.91" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b30af9e2d358182b5c7449424f017eba305ed32a7010509ede96cdc4696c46ed" dependencies = [ - "quote 1.0.35", + "quote", "wasm-bindgen-macro-support", ] @@ -5039,8 +4543,8 @@ version = "0.2.91" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "642f325be6301eb8107a83d12a8ac6c1e1c54345a7ef1a9261962dfefda09e66" dependencies = [ - "proc-macro2 1.0.78", - "quote 1.0.35", + "proc-macro2", + "quote", "syn 2.0.50", "wasm-bindgen-backend", "wasm-bindgen-shared", @@ -5072,8 +4576,8 @@ version = "0.3.41" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a5211b7550606857312bba1d978a8ec75692eae187becc5e680444fffc5e6f89" dependencies = [ - "proc-macro2 1.0.78", - "quote 1.0.35", + "proc-macro2", + "quote", "syn 2.0.50", ] @@ -5134,7 +4638,7 @@ dependencies = [ "cfg-if 0.1.10", "libc", "memory_units", - "winapi 0.3.9", + "winapi", ] [[package]] @@ -5143,12 +4647,6 @@ version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "495ec47bf3c1345005f40724f0269362c8556cbc43aed0526ed44cae1d35fceb" -[[package]] -name = "winapi" -version = "0.2.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "167dc9d6949a9b857f3451275e911c3f44255842c1f7a76f33c55103a909087a" - [[package]] name = "winapi" version = "0.3.9" @@ -5159,12 +4657,6 @@ dependencies = [ "winapi-x86_64-pc-windows-gnu", ] -[[package]] -name = "winapi-build" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2d315eee3b34aca4797b2da6b13ed88266e6d612562a0c46390af8299fc699bc" - [[package]] name = "winapi-i686-pc-windows-gnu" version = "0.4.0" @@ -5177,7 +4669,7 @@ version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f29e6f9198ba0d26b4c9f07dbe6f9ed633e1f3d5b8b414090084349e46a52596" dependencies = [ - "winapi 0.3.9", + "winapi", ] [[package]] @@ -5396,8 +4888,8 @@ version = "0.7.32" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9ce1b18ccd8e73a9321186f97e46f9f04b778851177567b1975109d26a08d2a6" dependencies = [ - "proc-macro2 1.0.78", - "quote 1.0.35", + "proc-macro2", + "quote", "syn 2.0.50", ] diff --git a/Cargo.toml b/Cargo.toml index 88640eb6e3..7788a9877e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,7 +10,6 @@ members = [ "comparison-benchmark/rust/raphtory-rust-benchmark", "faiss-rs", "faiss-rs/faiss-sys", - "pometry-faiss", "disk-faiss", "disk-test", ] diff --git a/disk-faiss/src/lib.rs b/disk-faiss/src/lib.rs index 2848c0ac30..11f8f5182b 100644 --- a/disk-faiss/src/lib.rs +++ b/disk-faiss/src/lib.rs @@ -36,57 +36,60 @@ pub fn merge_ondisk(index: &str, shards: Vec<&str>, ivfdata: &str, output: &str) unsafe { cpp!([index_path as "const char *", shards as "std::vector *", num_shards as "uint32_t", ivfdata as "const char *", output as "const char *"] { - try { - std::cout << "----here----" << std::endl; - std::vector ivfs; - std::cout << "reading shards -> " << shards->size() << std::endl; - for (unsigned int i = 0; i < num_shards; ++i) { - // std::cout << "reading " << shard << std::endl; - const char * shard = shards->at(i); - faiss::IndexIVFFlat* ivf = (faiss::IndexIVFFlat*) faiss::read_index(shard, faiss::IO_FLAG_MMAP); - std::cout << "success reading" << std::endl; - ivfs.push_back(ivf); - std::cout << "success adding it" << std::endl; - - ivf->own_invlists = false; - delete ivf; - } + std::vector ivfs; + std::cout << "reading shards -> " << shards->size() << std::endl; + size_t ntotal = 0; + for (unsigned int i = 0; i < num_shards; ++i) { + const char * shard = shards->at(i); + auto index = faiss::read_index(shard, faiss::IO_FLAG_MMAP); + auto ivf = dynamic_cast(index); + assert(ivf); - std::cout << "---- after reading shards ----" << std::endl; - faiss::IndexIVFFlat* index = (faiss::IndexIVFFlat*) faiss::read_index(index_path); // TODO: review: 0 as second parameter???????????????????? + ivfs.push_back(ivf->invlists); + ntotal += ivf->ntotal; - if (index->ntotal != 0) { - std::exit(1); - } - std::cout << "nlist: " << index->nlist << std::endl; - std::cout << "code_size: " << index->code_size << std::endl; - std::cout << "---- about to call on disk inverted lists ----" << std::endl; - auto invlists = new faiss::OnDiskInvertedLists( - index->nlist, index->code_size, ivfdata - ); - std::cout << "----here----" << std::endl; - - // auto ivf_vector = faiss::InvertedListsPtrVector(); - // for (const auto& ivf : ivfs) { - // ivf_vector.push_back(ivf); - // } - - const faiss::InvertedLists **ivfs_data = (const faiss::InvertedLists**) ivfs.data(); - auto ntotal = invlists->merge_from(ivfs_data, ivfs.size()); // TODO: this has a verbose parameter I can use - std::cout << "--here--"; - - index->ntotal = ntotal; - index->replace_invlists(invlists, true); - std::cout << "--here--"; - // invlists.this.disown(); ???????????????????????? - - faiss::write_index(index, output); + // ivf->own_invlists = false; + // delete ivf; + } + + auto index_raw = faiss::read_index(index_path); + auto index = dynamic_cast(index_raw); + assert(index); + + if (index->ntotal != 0) { + std::exit(1); + } + + auto il = new faiss::OnDiskInvertedLists(index->nlist, index->code_size, ivfdata); + il->merge_from(ivfs.data(), ivfs.size()); + + index->replace_invlists(il, true); + index->ntotal = ntotal; + + // auto invlists = new faiss::OnDiskInvertedLists( + // index->nlist, index->code_size, ivfdata + // ); + // std::cout << "----here----" << std::endl; + + // const faiss::InvertedLists **ivfs_data = (const faiss::InvertedLists**) ivfs.data(); + // std::cout << "---- about to merge lists with size ---- " << ivfs.size() << std::endl; + // auto ntotal = invlists->merge_from(ivfs_data, ivfs.size()); // TODO: this has a verbose parameter I can use + // std::cout << "----here----" << std::endl; + + // index->ntotal = ntotal; + // index->replace_invlists(invlists, true); + // invlists.this.disown(); ???????????????????????? + + faiss::write_index(index, output); } catch (const std::exception &e) { - std::cerr << "cpp exception" << std::endl; + std::cerr << "standard exception!" << std::endl; std::cerr << e.what() << std::endl; throw e; + } catch (...) { + std::cerr << "unknown exception!" << std::endl; + throw; } }) }; diff --git a/disk-test/Cargo.toml b/disk-test/Cargo.toml index 1507b67213..10ad0d796e 100644 --- a/disk-test/Cargo.toml +++ b/disk-test/Cargo.toml @@ -14,3 +14,11 @@ homepage.workspace = true [dependencies] disk-faiss = { path = "../disk-faiss" } faiss = { path = "../faiss-rs" } + +[dev-dependencies] +criterion = "0.5.1" +rand = "0.8.5" + +[[bench]] +name = "bench" +harness = false diff --git a/disk-test/benches/bench.rs b/disk-test/benches/bench.rs new file mode 100644 index 0000000000..2426fe9698 --- /dev/null +++ b/disk-test/benches/bench.rs @@ -0,0 +1,239 @@ +use criterion::{criterion_group, criterion_main, BatchSize, Criterion}; +use disk_faiss::merge_ondisk; +use disk_test::{FVecs, FVecsReader}; +use faiss::{index_factory, read_index, write_index, Idx, Index, MetricType}; +use rand::{self, RngCore}; +use std::fs; + +struct SerialGenerator { + current: usize, +} + +impl SerialGenerator { + fn new() -> Self { + Self { current: 0 } + } +} + +impl Iterator for SerialGenerator { + type Item = usize; + fn next(&mut self) -> Option { + self.current += 1; + Some(self.current) + } +} + +// fn prepare_index_simple(index_type: &str, learn: &str, base: &str, dataset_id: &str) -> String { +// let base_path = format!("/tmp/faiss-disk-test/{dataset_id}/"); +// fs::create_dir_all(base_path.clone()).unwrap(); +// let tmpfile = |filename: &str| base_path.clone() + filename.as_ref(); + +// println!("Training index"); +// let FVecs { +// dimensions, +// vectors, +// } = FVecs::from_file(learn).unwrap(); +// let mut index = index_factory(dimensions as u32, index_type, MetricType::InnerProduct).unwrap(); +// index.train(vectors.as_slice()).unwrap(); + +// let index = index.into_ivf_flat().unwrap(); +// write_index(&index, tmpfile("trained.index")).unwrap(); + +// let full_dataset = FVecs::from_file(base).unwrap(); +// let vectors = &full_dataset.vectors; +// let num_vectors = vectors.len() / dimensions as usize; +// println!("Splitting {num_vectors} vectors into 4 files"); + +// let num_chunks = 4; +// let vectors_per_chunk = num_vectors / num_chunks + 1; +// let chunk_files: Vec<_> = (0..num_chunks) +// .map(|chunk_number| tmpfile(format!("block_{chunk_number}.index").as_str())) +// .collect(); +// let chunk_files: Vec<_> = chunk_files.iter().map(|f| f.as_str()).collect(); + +// for ((chunk_number, chunk), filename) in vectors +// .chunks(vectors_per_chunk * dimensions as usize) +// .enumerate() +// .zip(chunk_files.iter()) +// { +// let first_id = vectors_per_chunk * chunk_number; +// let ids_range = first_id..(first_id + chunk.len()); +// let ids: Vec<_> = ids_range.map(|id| Idx::from(id as i64)).collect(); +// let mut index = read_index(tmpfile("trained.index")).unwrap(); +// index.add_with_ids(chunk, ids.as_slice()).unwrap(); +// write_index(&index, filename).unwrap(); +// } + +// println!("merging indexes on disk"); +// merge_ondisk( +// &tmpfile("trained.index"), +// chunk_files, +// &tmpfile("merged_index.ivfdata"), +// &tmpfile("populated.index"), +// ); +// tmpfile("populated.index") +// } + +fn prepare_index(index_type: &str, learn: &str, base: &[&str], dataset_id: &str) -> String { + let base_path = format!("/tmp/faiss-disk-test/{dataset_id}/"); + fs::create_dir_all(base_path.clone()).unwrap(); + let tmpfile = |filename: &str| base_path.clone() + filename.as_ref(); + + println!("Training index"); + let FVecs { + dimensions, + vectors, + } = FVecs::from_file(learn).unwrap(); + let mut index = index_factory(dimensions as u32, index_type, MetricType::InnerProduct).unwrap(); + println!("actually training index"); + index.train(vectors.as_slice()).unwrap(); + + let index = index.into_ivf_flat().unwrap(); + println!("writting index"); + write_index(&index, tmpfile("trained.index")).unwrap(); + + let shards = base + .iter() + .map(|filename| FVecsReader::from_file(filename, 10_000_000).unwrap()) + .flat_map(|reader| reader.into_iter()); + + // let (shard_iter, shard_files): (Box>>, Vec) = match base { + // [single_file] => { + // println!("Reading {single_file}"); + // let full_dataset = FVecs::from_file(single_file).unwrap(); + // let vectors = full_dataset.vectors; + // let num_vectors = vectors.len() / dimensions as usize; + // println!("Splitting {num_vectors} vectors into 4 files"); + // let num_chunks = 4; + // let vectors_per_chunk = num_vectors / num_chunks + 1; + // let shard_iter = vectors + // .chunks(vectors_per_chunk * dimensions as usize) + // .map(|chunk| chunk.iter().copied().collect()) + // .collect::>() + // .into_iter(); + // let shard_files: Vec<_> = (0..num_chunks) + // .map(|chunk_number| tmpfile(format!("shard_{chunk_number}.index").as_str())) + // .collect(); + // (Box::new(shard_iter), shard_files) + // } + // files => { + // let shard_iter = files + // .iter() + // .map(|file| FVecs::from_file(file).unwrap().vectors); + // let shard_files: Vec<_> = (0..files.len()) + // .map(|chunk_number| tmpfile(format!("shard_{chunk_number}.index").as_str())) + // .collect(); + // (Box::new(shard_iter), shard_files) + // } + // }; + // let shard_files: Vec<_> = shard_files.iter().map(|f| f.as_str()).collect(); + + // let mut id_gen = SerialGenerator::new(); + // for (chunk, filename) in shard_iter.zip(shard_files.iter()) { + // let ids: Vec<_> = id_gen + // .by_ref() + // .take(chunk.len()) + // .map(|id| Idx::from(id as i64)) + // .collect(); + // let mut index = read_index(tmpfile("trained.index")).unwrap(); + // index.add_with_ids(&chunk, &ids).unwrap(); + // write_index(&index, filename).unwrap(); + // } + + let shard_files_iter = + || (0..usize::MAX).map(|file_num| tmpfile(format!("shard_{file_num}.index").as_str())); + + // let mut id_gen = SerialGenerator::new(); + let mut id_gen = 0..usize::MAX; + for (chunk, filename) in shards.zip(shard_files_iter()) { + let num_vecs = chunk.len() / dimensions; + println!("Processing {} vectors", num_vecs); + let ids: Vec<_> = id_gen + .by_ref() + .take(num_vecs) + .map(|id| Idx::from(id as i64)) + .collect(); + let mut index = read_index(tmpfile("trained.index")).unwrap(); + index.add_with_ids(&chunk, &ids).unwrap(); + println!("Writting them to {filename}"); + write_index(&index, filename).unwrap(); + } + + let all_shard_files: Vec<_> = shard_files_iter() + .take_while(|filename| match fs::metadata(filename) { + Err(_) => false, + Ok(metadata) => metadata.is_file(), + }) + .collect(); + let all_shard_files: Vec<_> = all_shard_files + .iter() + .map(|filename| filename.as_str()) + .collect(); + + println!("merging indexes: {}", all_shard_files.join(", ")); + merge_ondisk( + &tmpfile("trained.index"), + all_shard_files, + &tmpfile("merged_index.ivfdata"), + &tmpfile("populated.index"), + ); + tmpfile("populated.index") +} + +fn bench(c: &mut Criterion) { + let mut rng = rand::thread_rng(); + + // let index_path = prepare_index( + // "IVF4096,Flat", + // "resources/sift/sift_learn.fvecs", + // &["resources/sift/sift_base.fvecs"], + // "sift", + // ); + // let mut index = read_index(index_path).unwrap(); + // let query_batch = FVecs::from_file("resources/sift/sift_query.fvecs").unwrap(); + // let queries: Vec<_> = query_batch.split().collect(); + // c.bench_function("sift 1M", |b| { + // b.iter_batched( + // || queries[rng.next_u64() as usize % queries.len()], + // |query| index.search(query, 1).unwrap(), + // BatchSize::SmallInput, + // ); + // }); + + let index_path = prepare_index( + "IVF4096,Flat", + "resources/deep/deep10M.fvecs", + &["resources/deep/deep10M.fvecs"], + "deep10", + ); + let mut index = read_index(index_path).unwrap(); + let query_batch = FVecs::from_file("resources/deep/deep1B_queries.fvecs").unwrap(); + let queries: Vec<_> = query_batch.split().collect(); + c.bench_function("deep 10M", |b| { + b.iter_batched( + || queries[rng.next_u64() as usize % queries.len()], + |query| index.search(query, 1).unwrap(), + BatchSize::SmallInput, + ); + }); + + // let index_path = prepare_index( + // "IVF4096,Flat", + // "resources/deep/deep10M.fvecs", + // &["resources/deep/base_00"], + // "deep30", + // ); + // let mut index = read_index(index_path).unwrap(); + // let query_batch = FVecs::from_file("resources/deep/deep1B_queries.fvecs").unwrap(); + // let queries: Vec<_> = query_batch.split().collect(); + // c.bench_function("deep 30M", |b| { + // b.iter_batched( + // || queries[rng.next_u64() as usize % queries.len()], + // |query| index.search(query, 1).unwrap(), + // BatchSize::SmallInput, + // ); + // }); +} + +criterion_group!(benches, bench); +criterion_main!(benches); diff --git a/disk-test/src/lib.rs b/disk-test/src/lib.rs new file mode 100644 index 0000000000..56a8cf09f1 --- /dev/null +++ b/disk-test/src/lib.rs @@ -0,0 +1,83 @@ +use std::fs; +use std::fs::File; +use std::io::ErrorKind::InvalidData; +use std::io::Read; +use std::io::{BufReader, Error as IoError}; + +pub struct FVecs { + pub dimensions: usize, + pub vectors: Vec, +} + +impl FVecs { + pub fn from_file(file_name: &str) -> Result { + let data = fs::read(file_name)?; + let (dim_data, vector_data) = data.split_at(4); + let dim = dim_data + .try_into() + .map_err(|e| IoError::new(InvalidData, e))?; + let dimensions = u32::from_le_bytes(dim) as usize; + let vectors: Vec<_> = vector_data + .chunks_exact(4) + .map(|chunk| f32::from_le_bytes(chunk.try_into().unwrap())) + .collect(); + + Ok(Self { + dimensions, + vectors, + }) + } + + pub fn split(&self) -> impl Iterator { + self.vectors.chunks(self.dimensions) + } + + pub fn get(&self, index: usize) -> Option<&[f32]> { + self.split().nth(index) + } +} + +pub struct FVecsReader { + pub dimensions: usize, + reader: BufReader, + chunk_size: usize, +} + +impl FVecsReader { + pub fn from_file(filename: &str, chunk_size: usize) -> Result { + let file = File::open(filename)?; + let mut reader = BufReader::new(file); + + let mut buffer = vec![0; 4]; + let dimensions = match reader.read(&mut buffer) { + Ok(4) => Ok(u32::from_le_bytes(buffer.try_into().unwrap()) as usize), + Ok(_) => Err(IoError::from(InvalidData)), + Err(e) => Err(e), + }?; + + Ok(Self { + dimensions, + reader, + chunk_size, + }) + } +} + +impl Iterator for FVecsReader { + type Item = Vec; + fn next(&mut self) -> Option { + let mut buffer = vec![0; self.chunk_size * self.dimensions * 4]; + match self.reader.read(&mut buffer) { + Err(_) => None, + Ok(0) => None, + Ok(n) => { + buffer.truncate(n); + let floats = buffer + .chunks_exact(4) + .map(|chunk| f32::from_le_bytes(chunk.try_into().unwrap())) + .collect(); + Some(floats) + } + } + } +} diff --git a/disk-test/src/main.rs b/disk-test/src/main.rs index b77a223348..2cb91d53d9 100644 --- a/disk-test/src/main.rs +++ b/disk-test/src/main.rs @@ -1,55 +1,26 @@ use disk_faiss::merge_ondisk; +use disk_test::FVecs; use faiss::{index_factory, read_index, write_index, Idx, Index, MetricType}; -use std::fs; -use std::io::Error as IoError; -use std::io::ErrorKind::InvalidData; - -struct FVecsContent { - dimensions: u32, - vectors: Vec, -} - -fn read_fvecs(file_name: &str) -> Result { - let data = fs::read(file_name)?; - let (dim_data, vector_data) = data.split_at(4); - let dim = dim_data - .try_into() - .map_err(|e| IoError::new(InvalidData, e))?; - let dimensions = u32::from_le_bytes(dim); - let vectors: Vec<_> = vector_data - .chunks_exact(4) - .map(|chunk| f32::from_le_bytes(chunk.try_into().unwrap())) - .collect(); - - Ok(FVecsContent { - dimensions, - vectors, - }) -} fn tmpfile>(filename: S) -> String { - return "/tmp/faiss-disk-test/".to_owned() + filename.as_ref(); + "/tmp/faiss-disk-test/".to_owned() + filename.as_ref() } fn main() { println!("Training index"); - let FVecsContent { + let FVecs { dimensions, vectors, - } = read_fvecs("resources/sift/sift_learn.fvecs").unwrap(); - // println!("dimensions -> {dimensions}"); - // let sample: Vec<_> = flattened_vectors.iter().take(8).collect(); - // println!("sample -> {sample:?}"); - let mut index = index_factory(dimensions, "IVF4096,Flat", MetricType::InnerProduct).unwrap(); + } = FVecs::from_file("resources/sift/sift_learn.fvecs").unwrap(); + let mut index = + index_factory(dimensions as u32, "IVF4096,Flat", MetricType::InnerProduct).unwrap(); index.train(vectors.as_slice()).unwrap(); let index = index.into_ivf_flat().unwrap(); - dbg!(&index.nlist()); - dbg!(&index.ntotal()); write_index(&index, tmpfile("trained.index")).unwrap(); println!("Splitting vectors into files"); - let vectors = read_fvecs("resources/sift/sift_base.fvecs") + let vectors = FVecs::from_file("resources/sift/sift_base.fvecs") .unwrap() .vectors; @@ -84,8 +55,10 @@ fn main() { println!("using the ondisk index"); let mut index = read_index(&tmpfile("populated.index")).unwrap(); - let queries = read_fvecs("resources/sift/sift_query.fvecs").unwrap(); + let queries = FVecs::from_file("resources/sift/sift_query.fvecs").unwrap(); + let first_query = queries.get(0).unwrap(); - let result = index.search(&queries.vectors, 5).unwrap(); + let result = index.search(first_query, 5).unwrap(); println!("result: {result:?}"); + // println!("success!!!!!!!"); } From 678455ef87891b7068723a90d36571d8d18c2b95 Mon Sep 17 00:00:00 2001 From: Pedro Rico Pinazo Date: Thu, 7 Mar 2024 16:32:20 +0000 Subject: [PATCH 8/9] test 325M --- disk-test/benches/bench.rs | 111 ++++++++++++++----------------------- disk-test/src/lib.rs | 28 +++++++--- 2 files changed, 64 insertions(+), 75 deletions(-) diff --git a/disk-test/benches/bench.rs b/disk-test/benches/bench.rs index 2426fe9698..f1520b894d 100644 --- a/disk-test/benches/bench.rs +++ b/disk-test/benches/bench.rs @@ -85,65 +85,26 @@ fn prepare_index(index_type: &str, learn: &str, base: &[&str], dataset_id: &str) vectors, } = FVecs::from_file(learn).unwrap(); let mut index = index_factory(dimensions as u32, index_type, MetricType::InnerProduct).unwrap(); - println!("actually training index"); index.train(vectors.as_slice()).unwrap(); let index = index.into_ivf_flat().unwrap(); - println!("writting index"); write_index(&index, tmpfile("trained.index")).unwrap(); let shards = base .iter() - .map(|filename| FVecsReader::from_file(filename, 10_000_000).unwrap()) + .enumerate() + .map(|(index, filename)| { + println!("opening new file {filename}"); + let dimensions = if index != 0 { + Some(dimensions) + } else { + None + }; + FVecsReader::from_file(filename, 10_000_000, dimensions).unwrap()}) .flat_map(|reader| reader.into_iter()); - - // let (shard_iter, shard_files): (Box>>, Vec) = match base { - // [single_file] => { - // println!("Reading {single_file}"); - // let full_dataset = FVecs::from_file(single_file).unwrap(); - // let vectors = full_dataset.vectors; - // let num_vectors = vectors.len() / dimensions as usize; - // println!("Splitting {num_vectors} vectors into 4 files"); - // let num_chunks = 4; - // let vectors_per_chunk = num_vectors / num_chunks + 1; - // let shard_iter = vectors - // .chunks(vectors_per_chunk * dimensions as usize) - // .map(|chunk| chunk.iter().copied().collect()) - // .collect::>() - // .into_iter(); - // let shard_files: Vec<_> = (0..num_chunks) - // .map(|chunk_number| tmpfile(format!("shard_{chunk_number}.index").as_str())) - // .collect(); - // (Box::new(shard_iter), shard_files) - // } - // files => { - // let shard_iter = files - // .iter() - // .map(|file| FVecs::from_file(file).unwrap().vectors); - // let shard_files: Vec<_> = (0..files.len()) - // .map(|chunk_number| tmpfile(format!("shard_{chunk_number}.index").as_str())) - // .collect(); - // (Box::new(shard_iter), shard_files) - // } - // }; - // let shard_files: Vec<_> = shard_files.iter().map(|f| f.as_str()).collect(); - - // let mut id_gen = SerialGenerator::new(); - // for (chunk, filename) in shard_iter.zip(shard_files.iter()) { - // let ids: Vec<_> = id_gen - // .by_ref() - // .take(chunk.len()) - // .map(|id| Idx::from(id as i64)) - // .collect(); - // let mut index = read_index(tmpfile("trained.index")).unwrap(); - // index.add_with_ids(&chunk, &ids).unwrap(); - // write_index(&index, filename).unwrap(); - // } - let shard_files_iter = || (0..usize::MAX).map(|file_num| tmpfile(format!("shard_{file_num}.index").as_str())); - // let mut id_gen = SerialGenerator::new(); let mut id_gen = 0..usize::MAX; for (chunk, filename) in shards.zip(shard_files_iter()) { let num_vecs = chunk.len() / dimensions; @@ -200,39 +161,53 @@ fn bench(c: &mut Criterion) { // ); // }); - let index_path = prepare_index( - "IVF4096,Flat", - "resources/deep/deep10M.fvecs", - &["resources/deep/deep10M.fvecs"], - "deep10", - ); - let mut index = read_index(index_path).unwrap(); - let query_batch = FVecs::from_file("resources/deep/deep1B_queries.fvecs").unwrap(); - let queries: Vec<_> = query_batch.split().collect(); - c.bench_function("deep 10M", |b| { - b.iter_batched( - || queries[rng.next_u64() as usize % queries.len()], - |query| index.search(query, 1).unwrap(), - BatchSize::SmallInput, - ); - }); - // let index_path = prepare_index( // "IVF4096,Flat", // "resources/deep/deep10M.fvecs", - // &["resources/deep/base_00"], - // "deep30", + // &["resources/deep/deep10M.fvecs"], + // "deep10", // ); // let mut index = read_index(index_path).unwrap(); // let query_batch = FVecs::from_file("resources/deep/deep1B_queries.fvecs").unwrap(); // let queries: Vec<_> = query_batch.split().collect(); - // c.bench_function("deep 30M", |b| { + // c.bench_function("deep 10M", |b| { // b.iter_batched( // || queries[rng.next_u64() as usize % queries.len()], // |query| index.search(query, 1).unwrap(), // BatchSize::SmallInput, // ); // }); + + let index_path = prepare_index( + "IVF262144_HNSW32,Flat", + "resources/deep/deep10M.fvecs", + &[ + "resources/deep/base_00", + "resources/deep/base_01", + "resources/deep/base_02", + "resources/deep/base_03", + "resources/deep/base_00", + "resources/deep/base_01", + "resources/deep/base_02", + "resources/deep/base_03", + "resources/deep/base_00", + "resources/deep/base_01", + "resources/deep/base_02", + "resources/deep/base_03", + ], + // &["resources/deep/deep10M.fvecs"], + "deep", + ); + let mut index = read_index(index_path).unwrap(); + let query_batch = FVecs::from_file("resources/deep/deep1B_queries.fvecs").unwrap(); + let queries: Vec<_> = query_batch.split().collect(); + c.bench_function("deep 325M", |b| { + b.iter_batched( + || queries[rng.next_u64() as usize % queries.len()], + |query| index.search(query, 1).unwrap(), + BatchSize::SmallInput, + ); + }); } criterion_group!(benches, bench); diff --git a/disk-test/src/lib.rs b/disk-test/src/lib.rs index 56a8cf09f1..5d2b8c4e94 100644 --- a/disk-test/src/lib.rs +++ b/disk-test/src/lib.rs @@ -44,16 +44,24 @@ pub struct FVecsReader { } impl FVecsReader { - pub fn from_file(filename: &str, chunk_size: usize) -> Result { + pub fn from_file( + filename: &str, + chunk_size: usize, + dimensions: Option, + ) -> Result { let file = File::open(filename)?; let mut reader = BufReader::new(file); - let mut buffer = vec![0; 4]; - let dimensions = match reader.read(&mut buffer) { - Ok(4) => Ok(u32::from_le_bytes(buffer.try_into().unwrap()) as usize), - Ok(_) => Err(IoError::from(InvalidData)), - Err(e) => Err(e), - }?; + let dimensions = + dimensions + .map(|d| Ok(d)) + .unwrap_or_else(|| match reader.read(&mut buffer) { + Ok(4) => Ok(u32::from_le_bytes(buffer.try_into().unwrap()) as usize), + Ok(_) => Err(IoError::from(InvalidData)), + Err(e) => Err(e), + })?; + + println!("dimensions for vectors in file {filename}: {dimensions}"); Ok(Self { dimensions, @@ -63,6 +71,12 @@ impl FVecsReader { } } +impl Drop for FVecsReader { + fn drop(&mut self) { + println!("dropping file reader"); + } +} + impl Iterator for FVecsReader { type Item = Vec; fn next(&mut self) -> Option { From 09211ef140c91c558e2c53fa5b308cd1c6f7e693 Mon Sep 17 00:00:00 2001 From: Pedro Rico Pinazo Date: Tue, 2 Apr 2024 11:44:26 +0100 Subject: [PATCH 9/9] fix typo --- raphtory/src/vectors/faiss_store.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/raphtory/src/vectors/faiss_store.rs b/raphtory/src/vectors/faiss_store.rs index 81d9eedd54..d2cf84759d 100644 --- a/raphtory/src/vectors/faiss_store.rs +++ b/raphtory/src/vectors/faiss_store.rs @@ -3,6 +3,11 @@ use faiss::{index::IndexImpl, index_factory, Idx, Index, MetricType}; use itertools::Itertools; use std::collections::HashMap; + +trait ExternalVectorIndex { + +} + #[derive(Clone, Debug)] pub(crate) struct DocumentPointer { pub(crate) entity: EntityId, @@ -43,7 +48,7 @@ impl FaissStore { nodes: &HashMap>, edges: &HashMap>, ) -> Self { - // TODO: review, this doesnt froup if there are empty groups! + // TODO: review, this doesnt group if there are empty groups! let maybe_node_group = nodes.iter().next(); let maybe_edge_group = edges.iter().next(); let maybe_group = maybe_node_group.or(maybe_edge_group);