Skip to content

Commit

Permalink
Support lenient parser (quickwit-oss#114)
Browse files Browse the repository at this point in the history
  • Loading branch information
GodTamIt authored and st1020 committed Oct 7, 2023
1 parent 0660b78 commit d7a4075
Show file tree
Hide file tree
Showing 6 changed files with 1,116 additions and 1 deletion.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ crate-type = ["cdylib"]
pyo3-build-config = "0.19.1"

[dependencies]
base64 = "0.21"
chrono = "0.4.23"
tantivy = "0.21.0"
itertools = "0.10.5"
Expand Down
66 changes: 66 additions & 0 deletions src/index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ use pyo3::{exceptions, prelude::*, types::PyAny};
use crate::{
document::{extract_value, Document},
get_field,
parser_error::QueryParserErrorIntoPy,
query::Query,
schema::Schema,
searcher::Searcher,
Expand Down Expand Up @@ -399,6 +400,71 @@ impl Index {

Ok(Query { inner: query })
}

/// Parse a query leniently.
///
/// This variant parses an invalid query on a best-effort basis. If some part of the query
/// can't reasonably be executed (range query without a field, searching on a non-existing
/// field, searching without specifying a field when no default field is provided...), that
/// part may get turned into a "match-nothing" subquery.
///
/// Args:
///     query: the query, following the tantivy query language.
///     default_field_names (List[str], optional): names of the fields to search when no
///         field is specified in the query. When omitted, every field of the schema
///         that is marked as indexed is used.
///
/// Returns a tuple containing the parsed query and a list of errors.
///
/// Raises ValueError if a field in `default_field_names` is not defined or marked as indexed.
#[pyo3(signature = (query, default_field_names = None))]
pub fn parse_query_lenient(
    &self,
    query: &str,
    default_field_names: Option<Vec<String>>,
) -> PyResult<(Query, Vec<PyObject>)> {
    let schema = self.index.schema();

    let default_fields = match default_field_names {
        // Explicit names: each one must exist in the schema and be indexed.
        // The first offending name aborts the whole call with a ValueError.
        Some(field_names) => field_names
            .iter()
            .map(|field_name| -> PyResult<_> {
                let field = schema.get_field(field_name).map_err(|_err| {
                    exceptions::PyValueError::new_err(format!(
                        "Field `{field_name}` is not defined in the schema."
                    ))
                })?;

                // Build the error only on the failure path; the previous
                // `ok_or(..)` formatted the message for every valid field.
                if schema.get_field_entry(field).is_indexed() {
                    Ok(field)
                } else {
                    Err(exceptions::PyValueError::new_err(format!(
                        "Field `{field_name}` is not set as indexed in the schema."
                    )))
                }
            })
            .collect::<PyResult<Vec<_>>>()?,
        // No names given: fall back to every indexed field of the schema.
        None => schema
            .fields()
            .filter_map(|(field, entry)| entry.is_indexed().then_some(field))
            .collect::<Vec<_>>(),
    };

    let parser =
        tv::query::QueryParser::for_index(&self.index, default_fields);
    let (query, errors) = parser.parse_query_lenient(query);

    // Converting the parser errors into Python objects requires the GIL.
    Python::with_gil(|py| {
        let errors =
            errors.into_iter().map(|err| err.into_py(py)).collect();

        Ok((Query { inner: query }, errors))
    })
}
}

impl Index {
Expand Down
53 changes: 52 additions & 1 deletion src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
use ::tantivy as tv;
use pyo3::{exceptions, prelude::*};
use pyo3::{exceptions, prelude::*, wrap_pymodule};

mod document;
mod facet;
mod index;
mod parser_error;
mod query;
mod schema;
mod schemabuilder;
Expand Down Expand Up @@ -84,6 +85,56 @@ fn tantivy(_py: Python, m: &PyModule) -> PyResult<()> {
m.add_class::<Query>()?;
m.add_class::<Snippet>()?;
m.add_class::<SnippetGenerator>()?;

m.add_wrapped(wrap_pymodule!(query_parser_error))?;

Ok(())
}

/// Submodule containing all the possible errors that can be raised during
/// query parsing.
///
/// The example below shows these errors being returned in the error list of
/// `Index.parse_query_lenient`.
///
/// Example:
///     >>> import tantivy
///     >>> from tantivy import query_parser_error
///
///     >>> builder = tantivy.SchemaBuilder()
///
///     >>> title = builder.add_text_field("title", stored=True)
///     >>> body = builder.add_text_field("body")
///     >>> id = builder.add_unsigned_field("id")
///     >>> rating = builder.add_float_field("rating")
///
///     >>> schema = builder.build()
///     >>> index = tantivy.Index(schema)
///
///     >>> query, errors = index.parse_query_lenient(
///     ...     "bod:'world' AND id:<3.5 AND rating:5.0"
///     ... )
///
///     >>> assert len(errors) == 2
///     >>> assert isinstance(errors[0], query_parser_error.FieldDoesNotExistError)
///     >>> assert isinstance(errors[1], query_parser_error.ExpectedIntError)
#[pymodule]
fn query_parser_error(_py: Python, m: &PyModule) -> PyResult<()> {
// Register the error classes defined in `parser_error` on this submodule.
// Any failed registration is propagated to the caller through `?`.
m.add_class::<parser_error::SyntaxError>()?;
m.add_class::<parser_error::UnsupportedQueryError>()?;
m.add_class::<parser_error::FieldDoesNotExistError>()?;
m.add_class::<parser_error::ExpectedIntError>()?;
m.add_class::<parser_error::ExpectedBase64Error>()?;
m.add_class::<parser_error::ExpectedFloatError>()?;
m.add_class::<parser_error::ExpectedBoolError>()?;
m.add_class::<parser_error::AllButQueryForbiddenError>()?;
m.add_class::<parser_error::NoDefaultFieldDeclaredError>()?;
m.add_class::<parser_error::FieldNotIndexedError>()?;
m.add_class::<parser_error::FieldDoesNotHavePositionsIndexedError>()?;
m.add_class::<parser_error::PhrasePrefixRequiresAtLeastTwoTermsError>()?;
m.add_class::<parser_error::UnknownTokenizerError>()?;
m.add_class::<parser_error::RangeMustNotHavePhraseError>()?;
m.add_class::<parser_error::DateFormatError>()?;
m.add_class::<parser_error::FacetFormatError>()?;
m.add_class::<parser_error::IpFormatError>()?;

Ok(())
}

Expand Down
Loading

0 comments on commit d7a4075

Please sign in to comment.