Skip to content

Commit

Permalink
doc: Add mktestdocs, and tutorial for snippets. Fixes quickwit-oss#219
Browse files Browse the repository at this point in the history
  • Loading branch information
cjrh committed Mar 19, 2024
1 parent 61f37e7 commit 07b530d
Show file tree
Hide file tree
Showing 5 changed files with 108 additions and 6 deletions.
48 changes: 45 additions & 3 deletions docs/reference.md
Original file line number Diff line number Diff line change
@@ -1,12 +1,52 @@
# Reference

## Setup

We'll use a test index for the examples that follow.

```python
import os
from tantivy import SchemaBuilder, Index, Document

# Schema for the test index: an indexed and stored integer id, a stored
# title, and a body text field.
schema = (
    SchemaBuilder()
    .add_integer_field("doc_id", indexed=True, stored=True)
    .add_text_field("title", stored=True)
    .add_text_field("body")
    .build()
)
# path=None: no on-disk location is given for this example index.
index = Index(schema=schema, path=None)
writer = index.writer(heap_size=15_000_000, num_threads=1)

# First document.
doc = Document()
doc.add_integer("doc_id", 1)
doc.add_text("title", "The Old Man and the Sea")
doc.add_text(
    "body",
    (
        # Adjacent string literals are joined verbatim, so every piece
        # must carry its own trailing space.
        "He was an old man who fished alone in a skiff in "
        "the Gulf Stream and he had gone eighty-four days "
        "now without taking a fish."
    ),
)
writer.add_document(doc)

# Second document.
doc = Document()
doc.add_integer("doc_id", 2)
doc.add_text("title", "The Old Man and the Sea II")
doc.add_text("body", "He was an old man who sailed alone.")

writer.add_document(doc)
# Commit the pending documents, then reload the index before searching.
writer.commit()
index.reload()
```

## Valid Query Formats

tantivy-py supports the [query language](https://docs.rs/tantivy/latest/tantivy/query/struct.QueryParser.html#method.parse_query) used in tantivy.
Below a few basic query formats are shown:

- AND and OR conjunctions.
```python
searcher = index.searcher()
query = index.parse_query('(Old AND Man) OR Stream', ["title", "body"])
(best_score, best_doc_address) = searcher.search(query, 3).hits[0]
best_doc = searcher.doc(best_doc_address)
Expand All @@ -29,7 +69,7 @@ best_doc = searcher.doc(best_doc_address)

- integer search
```python
query = index.parse_query('"eighty-four days"', ["doc_id"])
query = index.parse_query('1', ["doc_id"])
(best_score, best_doc_address) = searcher.search(query, 3).hits[0]
best_doc = searcher.doc(best_doc_address)
```
Expand All @@ -46,8 +86,10 @@ the search query in additional quotes, as if a phrase query were being used.
The following will NOT work:

```python
# Raises ValueError
index.parse_query(r'sea\"', ["title", "body"])
try:
index.parse_query(r'sea\"', ["title", "body"])
except ValueError as e:
assert str(e) == r'Syntax Error: sea\"'
```

However, the following will succeed:
Expand Down
1 change: 1 addition & 0 deletions docs/requirements.txt
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
mkdocs==1.4.3
mktestdocs==0.2.1
52 changes: 49 additions & 3 deletions docs/tutorials.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
## Building an index and populating it

```python
import tempfile
import pathlib
import tantivy

# Declaring our schema.
Expand All @@ -20,7 +22,10 @@ To have a persistent index, use the path
parameter to store the index on the disk, e.g:

```python
index = tantivy.Index(schema, path=os.getcwd() + '/index')
tmpdir = tempfile.TemporaryDirectory()
index_path = pathlib.Path(tmpdir.name) / "index"
index_path.mkdir()
persistent_index = tantivy.Index(schema, path=str(index_path))
```

By default, tantivy offers the following tokenizers
Expand All @@ -44,7 +49,8 @@ which can be used in tantivy-py:

To use the above tokenizers, simply provide them as a parameter to `add_text_field`, e.g.:
```python
schema_builder.add_text_field("body", stored=True, tokenizer_name='en_stem')
schema_builder_tok = tantivy.SchemaBuilder()
schema_builder_tok.add_text_field("body", stored=True, tokenizer_name='en_stem')
```

## Adding one document.
Expand Down Expand Up @@ -77,6 +83,46 @@ query = index.parse_query("fish days", ["title", "body"])
(best_score, best_doc_address) = searcher.search(query, 3).hits[0]
best_doc = searcher.doc(best_doc_address)
assert best_doc["title"] == ["The Old Man and the Sea"]
print(best_doc)
```

## Using the snippet generator

```python
# First value of the "body" field of the best hit from the previous search.
hit_text = best_doc["body"][0]
print(f"{hit_text=}")
assert hit_text == (
"He was an old man who fished alone in a skiff in the "
"Gulf Stream and he had gone eighty-four days now "
"without taking a fish."
)

from tantivy import SnippetGenerator
# Create a generator for the "body" field from the searcher, query and
# schema, then produce a snippet for the best hit.
snippet_generator = SnippetGenerator.create(
searcher, query, schema, "body"
)
snippet = snippet_generator.snippet_from_doc(best_doc)
```

The snippet object provides the hit ranges. These are the marker
offsets in the text that match the query.

```python
# highlighted() yields the matched ranges; each exposes start/end offsets.
highlights = snippet.highlighted()
first_highlight = highlights[0]
assert first_highlight.start == 93
assert first_highlight.end == 97
# Slicing the field text with those offsets recovers the matched term.
assert hit_text[first_highlight.start:first_highlight.end] == "days"
```

The snippet object can also generate a marked-up HTML snippet:

```python
# to_html() returns the fragment with each matched term wrapped in <b>...</b>.
html_snippet = snippet.to_html()
# NOTE: the expected fragment ends at the last match, without the final
# period of the source text.
assert html_snippet == (
"He was an old man who fished alone in a skiff in the "
"Gulf Stream and he had gone eighty-four <b>days</b> now "
"without taking a <b>fish</b>"
)
```


1 change: 1 addition & 0 deletions requirements-dev.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
maturin
pytest>=4.0
mktestdocs==0.2.1
12 changes: 12 additions & 0 deletions tests/test_docs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from pathlib import Path
import pytest

from mktestdocs import check_md_file

def test_hello():
    """Placeholder test that always passes."""
    # NOTE(review): presumably kept as a smoke test that collection works
    # even when no doc files are found -- confirm before removing.
    assert True


# Resolve the docs directory from this file's location (tests/ sits next to
# docs/) rather than from the current working directory; otherwise running
# pytest from another directory would parametrize over nothing and silently
# check no documentation at all.
_REPO_ROOT = Path(__file__).resolve().parent.parent
# Sorted so that test order and IDs are stable between runs.
_DOC_FILES = sorted((_REPO_ROOT / "docs").glob("**/*.md"))


@pytest.mark.parametrize(
    "filepath",
    _DOC_FILES,
    # Repo-relative IDs, e.g. "docs/reference.md".
    ids=lambda path: str(path.relative_to(_REPO_ROOT)),
)
def test_docs(filepath):
    """Run the Python code blocks of one Markdown file under ``docs/``.

    ``memory=True`` is passed to ``check_md_file`` so that names defined in
    an earlier snippet of the same file remain available to later snippets.
    """
    check_md_file(filepath, memory=True)

0 comments on commit 07b530d

Please sign in to comment.