Skip to content

Commit

Permalink
Snippet generator (fixes #36 and #63) (#122)
Browse files Browse the repository at this point in the history
Co-authored-by: Justin Greene <justin.greene@intouchsol.com>
Co-authored-by: Paul Masurel <paul@quickwit.io>
Co-authored-by: Pratyush Mittal <pratyushmittal@gmail.com>
Co-authored-by: mukeshsahnis <er.mks89@gmail.com>
  • Loading branch information
5 people authored Sep 13, 2023
1 parent 72e2210 commit f164b0e
Show file tree
Hide file tree
Showing 6 changed files with 132 additions and 6 deletions.
6 changes: 3 additions & 3 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,6 @@ tantivy.so
tantivy.dylib
tantivy/tantivy.cpython*.so
tantivy.egg-info/

# Exclude the mkdocs site directory
site/
.venv
.envrc
site/
15 changes: 15 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,21 @@ only supports python3.

# Development

To compile the Python module:

```bash
# create virtual env
python -m venv .venv
source .venv/bin/activate

# install maturin, the build tool for PyO3
pip install maturin

# compile and install python module in venv
maturin develop
```


Setting up a development environment can be done in a virtual environment using
[`nox`](https://nox.thea.codes) or using local packages using the provided `Makefile`.

Expand Down
2 changes: 1 addition & 1 deletion src/document.rs
Original file line number Diff line number Diff line change
Expand Up @@ -797,7 +797,7 @@ impl Document {
Ok(())
}

fn iter_values_for_field<'a>(
pub fn iter_values_for_field<'a>(
&'a self,
field: &str,
) -> impl Iterator<Item = &'a Value> + 'a {
Expand Down
6 changes: 6 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,16 @@ mod query;
mod schema;
mod schemabuilder;
mod searcher;
mod snippet;

use document::Document;
use facet::Facet;
use index::Index;
use query::Query;
use schema::Schema;
use schemabuilder::SchemaBuilder;
use searcher::{DocAddress, SearchResult, Searcher};
use snippet::{Snippet, SnippetGenerator};

/// Python bindings for the search engine library Tantivy.
///
Expand Down Expand Up @@ -76,6 +79,9 @@ fn tantivy(_py: Python, m: &PyModule) -> PyResult<()> {
m.add_class::<Index>()?;
m.add_class::<DocAddress>()?;
m.add_class::<Facet>()?;
m.add_class::<Query>()?;
m.add_class::<Snippet>()?;
m.add_class::<SnippetGenerator>()?;
Ok(())
}

Expand Down
81 changes: 81 additions & 0 deletions src/snippet.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
use crate::to_pyerr;
use pyo3::prelude::*;
use tantivy as tv;

/// A highlighted text snippet.
///
/// Thin Python-visible wrapper around a Tantivy `Snippet`. Instances are
/// produced by `SnippetGenerator.snippet_from_doc`; use `to_html` to render
/// the snippet and `highlighted` to obtain the highlighted ranges.
#[pyclass]
pub(crate) struct Snippet {
// The wrapped Tantivy snippet that all methods delegate to.
pub(crate) inner: tv::Snippet,
}

/// A highlighted span inside a snippet, exposed to Python with read-only
/// `start` and `end` attributes.
///
/// Values are copied verbatim from the ranges reported by the underlying
/// Tantivy snippet.
// NOTE(review): offsets are presumably byte offsets into the snippet text,
// with `end` exclusive — confirm against the Tantivy `Snippet::highlighted`
// documentation.
#[pyclass]
pub(crate) struct Range {
/// Start offset of the highlighted span.
#[pyo3(get)]
start: usize,
/// End offset of the highlighted span.
#[pyo3(get)]
end: usize,
}

#[pymethods]
impl Snippet {
    /// Renders the snippet as an HTML string.
    ///
    /// Delegates to the wrapped Tantivy snippet; this call itself never
    /// produces an error, the `PyResult` is kept for the Python-facing
    /// signature.
    pub fn to_html(&self) -> PyResult<String> {
        let html = self.inner.to_html();
        Ok(html)
    }

    /// Returns the highlighted spans of the snippet as a list of `Range`
    /// objects, in the order reported by the wrapped Tantivy snippet.
    pub fn highlighted(&self) -> Vec<Range> {
        self.inner
            .highlighted()
            .iter()
            .map(|span| {
                let (start, end) = (span.start, span.end);
                Range { start, end }
            })
            .collect()
    }
}

/// Builds `Snippet` objects for one field of documents matching a query.
///
/// Create instances with `SnippetGenerator.create`, then call
/// `snippet_from_doc` for each document of interest.
#[pyclass]
pub(crate) struct SnippetGenerator {
// Name of the field whose values are read from each document when a
// snippet is generated.
pub(crate) field_name: String,
// The wrapped Tantivy snippet generator.
pub(crate) inner: tv::SnippetGenerator,
}

#[pymethods]
impl SnippetGenerator {
#[staticmethod]
pub fn create(
searcher: &crate::Searcher,
query: &crate::Query,
schema: &crate::Schema,
field_name: &str,
) -> PyResult<SnippetGenerator> {
let field = schema
.inner
.get_field(field_name)
.or(Err("field not found"))
.map_err(to_pyerr)?;
let generator =
tv::SnippetGenerator::create(&searcher.inner, query.get(), field)
.map_err(to_pyerr)?;

return Ok(SnippetGenerator {
field_name: field_name.to_string(),
inner: generator,
});
}

pub fn snippet_from_doc(&self, doc: &crate::Document) -> crate::Snippet {
let text: String = doc
.iter_values_for_field(&self.field_name)
.flat_map(tv::schema::Value::as_text)
.collect::<Vec<&str>>()
.join(" ");

let result = self.inner.snippet(&text);
Snippet { inner: result }
}
}
28 changes: 26 additions & 2 deletions tests/tantivy_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
import tantivy
import pickle
import pytest

from tantivy import Document, Index, SchemaBuilder
import tantivy
from tantivy import Document, Index, SchemaBuilder, SnippetGenerator


def schema():
Expand Down Expand Up @@ -784,3 +784,27 @@ def test_doc_address_pickle():
pickled = pickle.loads(pickle.dumps(orig))

assert orig == pickled


class TestSnippets(object):
    def test_document_snippet(self, dir_index):
        """A snippet built for the "title" field highlights the matched term."""
        index_dir, _ = dir_index
        doc_schema = schema()
        index = Index(doc_schema, str(index_dir))
        query = index.parse_query("sea whale", ["title", "body"])
        searcher = index.searcher()
        result = searcher.search(query)
        assert len(result.hits) == 1

        generator = SnippetGenerator.create(
            searcher, query, doc_schema, "title"
        )

        # The score is not relevant here; only the document address is used.
        for _score, address in result.hits:
            document = searcher.doc(address)
            snippet = generator.snippet_from_doc(document)

            spans = snippet.highlighted()
            assert len(spans) == 1
            span = spans[0]
            assert span.start == 20
            assert span.end == 23

            assert snippet.to_html() == 'The Old Man and the <b>Sea</b>'

0 comments on commit f164b0e

Please sign in to comment.