From e6eee86b9aa637bdc6207f622f6bf64a5fbb5241 Mon Sep 17 00:00:00 2001 From: AliFlux Date: Mon, 11 Jul 2022 22:26:13 +0500 Subject: [PATCH 1/5] Adding conjunction by default parameter --- .gitignore | 1 + src/index.rs | 9 ++++++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 72ff37d7..972d76c1 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,4 @@ __pycache__/ tantivy.so tantivy/tantivy.cpython*.so tantivy.egg-info/ +4.0 diff --git a/src/index.rs b/src/index.rs index a24647b2..90d47135 100644 --- a/src/index.rs +++ b/src/index.rs @@ -314,10 +314,12 @@ impl Index { /// field is specified in the query. /// #[args(reload_policy = "RELOAD_POLICY")] + #[args(conjunction_by_default = false)] pub fn parse_query( &self, query: &str, default_field_names: Option>, + conjunction_by_default: bool, ) -> PyResult { let mut default_fields = vec![]; let schema = self.index.schema(); @@ -348,8 +350,13 @@ impl Index { } } } - let parser = + let mut parser = tv::query::QueryParser::for_index(&self.index, default_fields); + + if conjunction_by_default { + parser.set_conjunction_by_default(); + } + let query = parser.parse_query(query).map_err(to_pyerr)?; Ok(Query { inner: query }) From a5ee926b22fed8ad7c4fab06411676f9dedac07d Mon Sep 17 00:00:00 2001 From: AliFlux Date: Tue, 12 Jul 2022 23:59:46 +0500 Subject: [PATCH 2/5] Adding explicity versioning --- pyproject.toml | 3 ++- requirements-dev.txt | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index b97966dc..e08dc016 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,8 @@ [build-system] -requires = ["maturin"] +requires = ["maturin==0.13.0"] build-backend = "maturin" [project] name = "tantivy" requires-python = ">=3.7" + diff --git a/requirements-dev.txt b/requirements-dev.txt index 20faf861..7e271fc3 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,2 +1,2 @@ -maturin +maturin==0.13.0 pytest>=4.0 From efb5c71f45987769785a505947fce0beebd6c194 Mon Sep 17 00:00:00 2001 From: AliFlux Date: Wed, 13 Jul 2022 18:55:07 +0500 Subject: [PATCH 3/5] Adding facet collector + document boosting --- src/facet.rs | 2 +- src/index.rs | 11 +++++- src/schemabuilder.rs | 84 ++++++++++++++++++++++++++++++++++++++++++-- src/searcher.rs | 79 +++++++++++++++++++++++++++++++++++++++-- 4 files changed, 169 insertions(+), 7 deletions(-) diff --git a/src/facet.rs b/src/facet.rs index b02cfb54..1927f1e1 100644 --- a/src/facet.rs +++ b/src/facet.rs @@ -48,7 +48,7 @@ impl Facet { #[classmethod] fn from_string(_cls: &PyType, facet_string: &str) -> Facet { Facet { - inner: schema::Facet::from(facet_string), + inner: schema::Facet::from_text(facet_string).unwrap(), } } diff --git a/src/index.rs b/src/index.rs index 90d47135..22e0415e 100644 --- a/src/index.rs +++ b/src/index.rs @@ -346,7 +346,16 @@ impl Index { } else { for (field, field_entry) in self.index.schema().fields() { if field_entry.is_indexed() { - default_fields.push(field); + + match field_entry.field_type() { + tv::schema::FieldType::Facet(_) => { + // facets aren't suited for default fields + }, + _ => { + default_fields.push(field); + }, + } + } } } diff --git a/src/schemabuilder.rs b/src/schemabuilder.rs index 30cbd296..08190683 100644 --- a/src/schemabuilder.rs +++ b/src/schemabuilder.rs @@ -2,7 +2,7 @@ use pyo3::{exceptions, prelude::*}; -use tantivy::schema; +use tantivy::schema::{self, FacetOptions}; use crate::schema::Schema; use std::sync::{Arc, RwLock}; @@ -131,6 +131,50 @@ impl SchemaBuilder { Ok(self.clone()) } + /// Add a new float64 field to the schema. + /// Note: When adding value to the index, make sure that it is type-casted to float + /// Adding integers or other values may produce false result + /// + /// Args: + /// name (str): The name of the field. + /// stored (bool, optional): If true sets the field as stored, the + /// content of the field can be later restored from a Searcher. + /// Defaults to False. + /// indexed (bool, optional): If true sets the field to be indexed. + /// fast (str, optional): Set the f64 options as a single-valued fast + /// field. Fast fields are designed for random access. Access time + /// are similar to a random lookup in an array. If more than one + /// value is associated to a fast field, only the last one is kept. + /// Can be one of 'single' or 'multi'. If this is set to 'single, + /// the document must have exactly one value associated to the + /// document. If this is set to 'multi', the document can have any + /// number of values associated to the document. Defaults to None, + /// which disables this option. + /// + /// Returns the associated field handle. + /// Raises a ValueError if there was an error with the field creation. + #[args(stored = false, indexed = false)] + fn add_float_field( + &mut self, + name: &str, + stored: bool, + indexed: bool, + fast: Option<&str>, + ) -> PyResult { + let builder = &mut self.builder; + + let opts = SchemaBuilder::build_float_option(stored, indexed, fast)?; + + if let Some(builder) = builder.write().unwrap().as_mut() { + builder.add_f64_field(name, opts); + } else { + return Err(exceptions::PyValueError::new_err( + "Schema builder object isn't valid anymore.", + )); + } + Ok(self.clone()) + } + /// Add a new unsigned integer field to the schema. /// /// Args: @@ -267,13 +311,14 @@ impl SchemaBuilder { /// Add a Facet field to the schema. /// Args: /// name (str): The name of the field. + #[args(stored = false, indexed = false)] fn add_facet_field(&mut self, name: &str) -> PyResult { let builder = &mut self.builder; if let Some(builder) = builder.write().unwrap().as_mut() { - builder.add_facet_field(name, INDEXED); + builder.add_facet_field(name, FacetOptions::default()); } else { - return Err(exceptions::PyValueError::new_err( + return Err(exceptions::PyValueError::new_err( "Schema builder object isn't valid anymore.", )); } @@ -352,6 +397,39 @@ impl SchemaBuilder { Ok(opts) } + fn build_float_option( + stored: bool, + indexed: bool, + fast: Option<&str>, + ) -> PyResult { + let opts = schema::NumericOptions::default(); + + let opts = if stored { opts.set_stored() } else { opts }; + let opts = if indexed { opts.set_indexed() } else { opts }; + + let fast = match fast { + Some(f) => { + let f = f.to_lowercase(); + match f.as_ref() { + "single" => Some(schema::Cardinality::SingleValue), + "multi" => Some(schema::Cardinality::MultiValues), + _ => return Err(exceptions::PyValueError::new_err( + "Invalid index option, valid choices are: 'multivalue' and 'singlevalue'" + )), + } + } + None => None, + }; + + let opts = if let Some(f) = fast { + opts.set_fast(f) + } else { + opts + }; + + Ok(opts) + } + fn build_text_option( stored: bool, tokenizer_name: &str, diff --git a/src/searcher.rs b/src/searcher.rs index c2b67969..2c2c02ab 100644 --- a/src/searcher.rs +++ b/src/searcher.rs @@ -1,9 +1,13 @@ #![allow(clippy::new_ret_no_self)] +use std::collections::HashMap; use crate::{document::Document, get_field, query::Query, to_pyerr}; use pyo3::{exceptions::PyValueError, prelude::*}; use tantivy as tv; use tantivy::collector::{Count, MultiCollector, TopDocs}; +use tv::collector::{FacetCollector}; +use tv::fastfield::FastFieldReader; +use tv::{SegmentReader, Score, DocId}; /// Tantivy's Searcher class /// @@ -41,10 +45,15 @@ impl ToPyObject for Fruit { /// Object holding a results successful search. pub(crate) struct SearchResult { hits: Vec<(Fruit, DocAddress)>, + #[pyo3(get)] /// How many documents matched the query. Only available if `count` was set /// to true during the search. count: Option, + + #[pyo3(get)] + /// Results of facets using using `count_facets_by_field` parameter + facet_counts: Option>, } #[pymethods] @@ -87,6 +96,9 @@ impl Searcher { /// should be ordered by. The field must be declared as a fast field /// when building the schema. Note, this only works for unsigned /// fields. + /// weight_by_field (Field, optional): A schema field increases the + /// score of the document by the given value. It should be a fast + /// field of float data type /// offset (Field, optional): The offset from which the results have /// to be returned. /// @@ -100,7 +112,9 @@ impl Searcher { query: &Query, limit: usize, count: bool, + count_facets_by_field: Option<&str>, order_by_field: Option<&str>, + weight_by_field: Option<&str>, offset: usize, ) -> PyResult { let mut multicollector = MultiCollector::new(); @@ -111,8 +125,50 @@ impl Searcher { None }; + let facet_handle = if let Some(facet_name) = count_facets_by_field { + let field = get_field(&self.inner.index().schema(), facet_name)?; + let mut facet_collector = FacetCollector::for_field(field); + facet_collector.add_facet("/"); + Some(multicollector.add_collector(facet_collector)) + } else { + None + }; + let (mut multifruit, hits) = { - if let Some(order_by) = order_by_field { + + if let Some(weight_by) = weight_by_field { + + let field = get_field(&self.inner.index().schema(), weight_by)?; + let collector = TopDocs::with_limit(limit) + .and_offset(offset) + .tweak_score(move |segment_reader: &SegmentReader| { + let weight_reader = segment_reader.fast_fields().f64(field).unwrap(); + return move |doc: DocId, original_score: Score| { + let weight: f64 = weight_reader.get(doc); + let new_score = original_score + weight as f32; + return new_score + } + }); + + let top_docs_handle = multicollector.add_collector(collector); + let ret = self.inner.search(query.get(), &multicollector); + + match ret { + Ok(mut r) => { + let top_docs = top_docs_handle.extract(&mut r); + let result: Vec<(Fruit, DocAddress)> = top_docs + .iter() + .map(|(f, d)| { + (Fruit::Score(*f), DocAddress::from(d)) + }) + .collect(); + (r, result) + } + Err(e) => return Err(PyValueError::new_err(e.to_string())), + } + + } else if let Some(order_by) = order_by_field { + let field = get_field(&self.inner.index().schema(), order_by)?; let collector = TopDocs::with_limit(limit) .and_offset(offset) @@ -159,7 +215,26 @@ impl Searcher { None => None, }; - Ok(SearchResult { hits, count }) + let facet_counts:Option> = match facet_handle { + Some(h) => { + let facet_counts_obj = h.extract(&mut multifruit); + + let collection: Vec<(&tv::schema::Facet, u64)> = facet_counts_obj + .get("/") + .collect(); + + let mut facet_counts:HashMap = HashMap::new(); + + for (facet, count) in collection.iter() { + facet_counts.insert(facet.to_path_string(), *count); + } + + Some(facet_counts) + }, + None => None, + }; + + Ok(SearchResult { hits, count, facet_counts}) } /// Returns the overall number of documents in the index. From 90ed95ab669d537077802d5e2771f2abcbe49cfa Mon Sep 17 00:00:00 2001 From: AliFlux Date: Wed, 13 Jul 2022 19:14:00 +0500 Subject: [PATCH 4/5] Adding documentation for count_facets_by_field --- src/searcher.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/searcher.rs b/src/searcher.rs index 2c2c02ab..094339a1 100644 --- a/src/searcher.rs +++ b/src/searcher.rs @@ -92,6 +92,8 @@ impl Searcher { /// return. Defaults to 10. /// count (bool, optional): Should the number of documents that match /// the query be returned as well. Defaults to true. + /// count_facets_by_field (Field, optional): Return grouped number of + /// documents by the given facet field. Defaults to false /// order_by_field (Field, optional): A schema field that the results /// should be ordered by. The field must be declared as a fast field /// when building the schema. Note, this only works for unsigned From a5cef2bf69e366699888a781cbfbedd0bc0b3287 Mon Sep 17 00:00:00 2001 From: AliFlux Date: Wed, 26 Oct 2022 02:15:16 +0500 Subject: [PATCH 5/5] Minor lint fixes --- .gitignore | 1 - 4.0 | 26 ++++++++++++++++++++++++++ pyproject.toml | 2 +- src/schemabuilder.rs | 2 +- 4 files changed, 28 insertions(+), 3 deletions(-) create mode 100644 4.0 diff --git a/.gitignore b/.gitignore index 972d76c1..72ff37d7 100644 --- a/.gitignore +++ b/.gitignore @@ -9,4 +9,3 @@ __pycache__/ tantivy.so tantivy/tantivy.cpython*.so tantivy.egg-info/ -4.0 diff --git a/4.0 b/4.0 new file mode 100644 index 00000000..32616d23 --- /dev/null +++ b/4.0 @@ -0,0 +1,26 @@ +Collecting pytest + Downloading pytest-7.1.2-py3-none-any.whl (297 kB) +Requirement already satisfied: colorama in c:\users\alphaceph\anaconda3\envs\py310\lib\site-packages (from pytest) (0.4.4) +Collecting attrs>=19.2.0 + Downloading attrs-21.4.0-py2.py3-none-any.whl (60 kB) +Collecting py>=1.8.2 + Downloading py-1.11.0-py2.py3-none-any.whl (98 kB) +Collecting atomicwrites>=1.0 + Downloading atomicwrites-1.4.1.tar.gz (14 kB) +Collecting pluggy<2.0,>=0.12 + Downloading pluggy-1.0.0-py2.py3-none-any.whl (13 kB) +Collecting packaging + Downloading packaging-21.3-py3-none-any.whl (40 kB) +Collecting iniconfig + Downloading iniconfig-1.1.1-py2.py3-none-any.whl (5.0 kB) +Requirement already satisfied: tomli>=1.0.0 in c:\users\alphaceph\anaconda3\envs\py310\lib\site-packages (from pytest) (2.0.1) +Collecting pyparsing!=3.0.5,>=2.0.2 + Downloading pyparsing-3.0.9-py3-none-any.whl (98 kB) +Building wheels for collected packages: atomicwrites + Building wheel for atomicwrites (setup.py): started + Building wheel for atomicwrites (setup.py): finished with status 'done' + Created wheel for atomicwrites: filename=atomicwrites-1.4.1-py2.py3-none-any.whl size=6957 sha256=a1a268c4dc96c217af8ea7655cc187e388dcca401b511a0a00e532af25c25aee + Stored in directory: c:\users\alphaceph\appdata\local\pip\cache\wheels\34\07\0b\33b15f68736109f72ea0bb2499521d87312b932620737447a2 +Successfully built atomicwrites +Installing collected packages: pyparsing, py, pluggy, packaging, iniconfig, attrs, atomicwrites, pytest +Successfully installed atomicwrites-1.4.1 attrs-21.4.0 iniconfig-1.1.1 packaging-21.3 pluggy-1.0.0 py-1.11.0 pyparsing-3.0.9 pytest-7.1.2 diff --git a/pyproject.toml b/pyproject.toml index e08dc016..70990144 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,5 @@ [build-system] -requires = ["maturin==0.13.0"] +requires = ["maturin>=0.13,<0.14"] build-backend = "maturin" [project] diff --git a/src/schemabuilder.rs b/src/schemabuilder.rs index 08190683..f224c8ab 100644 --- a/src/schemabuilder.rs +++ b/src/schemabuilder.rs @@ -318,7 +318,7 @@ impl SchemaBuilder { if let Some(builder) = builder.write().unwrap().as_mut() { builder.add_facet_field(name, FacetOptions::default()); } else { - return Err(exceptions::PyValueError::new_err( + return Err(exceptions::PyValueError::new_err( "Schema builder object isn't valid anymore.", )); }