Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add SnippetGenerator for highlighting [fixed] #63

Closed
wants to merge 10 commits into from
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,6 @@ tantivy.so
tantivy.dylib
tantivy/tantivy.cpython*.so
tantivy.egg-info/

.venv
.envrc
15 changes: 15 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,21 @@ only supports python3.

# Development

To compile the Python module:

```bash
# create virtual env
python -m venv .venv
source .venv/bin/activate

# install maturin, the build tool for PyO3
pip install maturin

# compile and install python module in venv
maturin develop
```


Setting up a development environment can be done in a virtual environment using
[`nox`](https://nox.thea.codes) or using local packages using the provided `Makefile`.

Expand Down
2 changes: 1 addition & 1 deletion src/document.rs
Original file line number Diff line number Diff line change
Expand Up @@ -409,7 +409,7 @@ impl Document {
}

impl Document {
fn iter_values_for_field<'a>(
pub fn iter_values_for_field<'a>(
&'a self,
field: &str,
) -> impl Iterator<Item = &'a Value> + 'a {
Expand Down
6 changes: 6 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,16 @@ mod query;
mod schema;
mod schemabuilder;
mod searcher;
mod snippet;

use document::Document;
use facet::Facet;
use index::Index;
use query::Query;
use schema::Schema;
use schemabuilder::SchemaBuilder;
use searcher::{DocAddress, Searcher};
use snippet::{Snippet, SnippetGenerator};

/// Python bindings for the search engine library Tantivy.
///
Expand Down Expand Up @@ -75,6 +78,9 @@ fn tantivy(_py: Python, m: &PyModule) -> PyResult<()> {
m.add_class::<Index>()?;
m.add_class::<DocAddress>()?;
m.add_class::<Facet>()?;
m.add_class::<Query>()?;
m.add_class::<Snippet>()?;
m.add_class::<SnippetGenerator>()?;
Ok(())
}

Expand Down
81 changes: 81 additions & 0 deletions src/snippet.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
use crate::to_pyerr;
use pyo3::prelude::*;
use tantivy as tv;

/// A text snippet produced by `SnippetGenerator::snippet_from_doc`.
///
/// Thin Python-visible wrapper around `tantivy::Snippet`; use `to_html` to
/// render it and `highlighted` to obtain the highlighted ranges.
#[pyclass]
pub(crate) struct Snippet {
// The wrapped tantivy snippet that all methods delegate to.
pub(crate) inner: tv::Snippet,
}

/// A highlighted section of a snippet, as returned by `Snippet::highlighted`.
///
/// Both fields are copied verbatim from the sections reported by the wrapped
/// tantivy snippet and are exposed to Python through generated getters.
/// NOTE(review): presumably `start` is inclusive and `end` exclusive, as
/// offsets into the snippet text — confirm against the tantivy documentation.
#[pyclass]
pub(crate) struct Range {
// Start position of the highlighted section.
#[pyo3(get)]
start: usize,
// End position of the highlighted section.
#[pyo3(get)]
end: usize,
}

#[pymethods]
impl Snippet {
    /// Returns the HTML rendering produced by the wrapped tantivy snippet.
    pub fn to_html(&self) -> PyResult<String> {
        let html = self.inner.to_html();
        Ok(html)
    }

    /// Returns one `Range` per highlighted section of the wrapped snippet,
    /// in the order tantivy reports them.
    pub fn highlighted(&self) -> Vec<Range> {
        let sections = self.inner.highlighted();
        let mut ranges = Vec::with_capacity(sections.len());
        for section in sections.iter() {
            ranges.push(Range {
                start: section.start,
                end: section.end,
            });
        }
        ranges
    }
}

/// Python-visible wrapper around `tantivy::SnippetGenerator`.
///
/// Built with `SnippetGenerator::create`; `snippet_from_doc` then produces a
/// `Snippet` from the text values a document stores under `field_name`.
#[pyclass]
pub(crate) struct SnippetGenerator {
// Name of the field this generator was created for; used to select the
// document values that are fed to the generator.
pub(crate) field_name: String,
// The wrapped tantivy generator that performs the actual snippet creation.
pub(crate) inner: tv::SnippetGenerator,
}

#[pymethods]
impl SnippetGenerator {
    /// Creates a snippet generator for `query` over the field called
    /// `field_name`.
    ///
    /// The field is looked up in `schema`; when it does not exist the error
    /// "field not found" is converted with `to_pyerr` and returned. Errors
    /// from tantivy's own generator construction are converted the same way.
    #[staticmethod]
    pub fn create(
        searcher: &crate::Searcher,
        query: &crate::Query,
        schema: &crate::Schema,
        field_name: &str,
    ) -> PyResult<SnippetGenerator> {
        let lookup = schema.inner.get_field(field_name);
        let field = lookup.ok_or("field not found").map_err(to_pyerr)?;

        let inner = tv::SnippetGenerator::create(&searcher.inner, query.get(), field)
            .map_err(to_pyerr)?;

        Ok(SnippetGenerator {
            field_name: field_name.to_owned(),
            inner,
        })
    }

    /// Builds a snippet from the text values `doc` holds for this
    /// generator's field.
    ///
    /// Non-text values are skipped and the remaining values are joined with
    /// a single space before being handed to the wrapped generator.
    pub fn snippet_from_doc(&self, doc: &crate::Document) -> crate::Snippet {
        let fragments: Vec<&str> = doc
            .iter_values_for_field(&self.field_name)
            .filter_map(tv::schema::Value::as_text)
            .collect();
        let joined: String = fragments.join(" ");

        Snippet {
            inner: self.inner.snippet(&joined),
        }
    }
}
29 changes: 26 additions & 3 deletions tests/tantivy_test.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import tantivy
import pytest

from tantivy import Document, Index, SchemaBuilder
import tantivy
from tantivy import Document, Index, SchemaBuilder, SnippetGenerator


def schema():
Expand Down Expand Up @@ -458,3 +457,27 @@ def test_query_from_json_field(self):
# )
# result = index.searcher().search(query, 2)
# assert len(result.hits) == 1


class TestSnippets(object):
    """Snippet generation tests that run against the on-disk index fixture."""

    def test_document_snippet(self, dir_index):
        """A "sea whale" query yields one hit whose title highlights "Sea"."""
        index_dir, _unused = dir_index
        doc_schema = schema()
        index = Index(doc_schema, str(index_dir))
        query = index.parse_query("sea whale", ["title", "body"])
        searcher = index.searcher()
        hits = searcher.search(query).hits
        assert len(hits) == 1

        generator = SnippetGenerator.create(searcher, query, doc_schema, "title")

        for _score, address in hits:
            snippet = generator.snippet_from_doc(searcher.doc(address))

            # Exactly one highlighted range is expected, covering "Sea".
            ranges = snippet.highlighted()
            assert len(ranges) == 1
            only = ranges[0]
            assert only.start == 20
            assert only.end == 23

            assert snippet.to_html() == 'The Old Man and the <b>Sea</b>'