Merge pull request #528 from stevennic/StrictPhraseHighlighting
Strict Phrase Highlighting
fortable1999 authored Jan 18, 2019
2 parents f081266 + 413fd95 commit c823d16
Showing 5 changed files with 188 additions and 15 deletions.
114 changes: 107 additions & 7 deletions src/whoosh/highlight.py
@@ -49,13 +49,13 @@
"""

from __future__ import division

from collections import deque
from heapq import nlargest
from itertools import groupby

from whoosh.compat import htmlescape
from whoosh.analysis import Token

from whoosh.compat import htmlescape

# The default value for the maximum chars to examine when fragmenting
DEFAULT_CHARLIMIT = 2 ** 15
@@ -106,7 +106,7 @@ class Fragment(object):
available).
"""

def __init__(self, text, matches, startchar=0, endchar= -1):
def __init__(self, text, matches, startchar=0, endchar=-1):
"""
:param text: the source text of the fragment.
:param matches: a list of objects which have ``startchar`` and
@@ -158,11 +158,107 @@ def __lt__(self, other):
# Tokenizing

def set_matched_filter(tokens, termset):
"""
Mark tokens to be highlighted as matched.
Phrase agnostic: highlights all matching tokens individually,
even if the terms are part of a phrase
:param tokens: Result tokens to scan for matched terms to highlight
:param termset: Query terms
:return: yield each token with t.matched = True / False, indicating if the
token should be highlighted
"""
for t in tokens:
t.matched = t.text in termset
yield t
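
For illustration (not part of this commit), a minimal sketch of driving the phrase-agnostic filter directly; the analyzer call mirrors the one in highlight_hit further down, and the sample text and termset are made up:

# Sketch only: sample text and termset are illustrative.
from whoosh.analysis import StandardAnalyzer
from whoosh.highlight import set_matched_filter

analyzer = StandardAnalyzer()
tokens = analyzer(u"strict phrase highlights phrase terms",
                  positions=True, chars=True, mode="index", removestops=False)

# Every token whose text appears in the termset is marked matched=True,
# regardless of whether it occurs inside a queried phrase.
for t in set_matched_filter(tokens, {u"phrase", u"terms"}):
    print(t.text, t.matched)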


def set_matched_filter_phrases(tokens, text, terms, phrases):
"""
Mark tokens to be highlighted as matched. Used for Strict Phrase highlighting.
Phrase-aware: highlights only individual matches for individual query terms
and phrase matches for phrase terms.
:param tokens: Result tokens
:param text: Result text to scan for matched terms to highlight
:param terms: Individual query terms
:param phrases: Query Phrases
:return: yield each token with t.matched = True / False, indicating if the
token should be highlighted
"""

"""
Implementation note: Because the Token object follows a Singleton pattern,
we can only read each one once. Because phrase matching requires rescanning,
we require a rendered token list (the text parameter) instead. The function must
still yield Token objects at the end, so the text list is used as a way to build a list
of Token indices (the matches set). The yield loop at the end uses this
to properly set .matched on the yielded Token objects.
"""
text = text.split()
matches = set()

# Match phrases
for phrase in phrases:
i = 0
n_phrase_words = len(phrase.words)
slop = phrase.slop
while i < len(text):
if phrase.words[0] == text[i]: # If first word matched
if slop == 1:
# Simple substring match
if text[i + 1:i + n_phrase_words] == phrase.words[1:]: # If rest of phrase matches
any(map(matches.add, range(i, i + n_phrase_words))) # Collect matching indices
# Advance past match area.
# Choosing to ignore possible overlapping matches for efficiency due to low probability.
i += n_phrase_words
else:
i += 1
else:
# Slop match
current_word_index = first_slop_match = last_slop_match = i
slop_matches = [first_slop_match]
for word in phrase.words[1:]:
try:
"""
Find the *last* occurrence of word in the slop substring by reversing it and mapping the index back.
If multiple tokens match in the substring, picking the first one can overlook valid matches.
For example, phrase is: 'one two three'~2
Target substring is: 'one two two six three', which is a valid match.
[0] [1] [2] [3] [4]
Looking for the first match will find [0], then [1] then fail since [3] is more than ~2 words away
Looking for the last match will find [0], then, given a choice between [1] or [2], will pick [2],
making [4] visible from there
"""
text_sub = text[current_word_index + 1:current_word_index + 1 + slop][::-1] # Substring to scan (reversed)
len_sub = len(text_sub)
next_word_index = len_sub - text_sub.index(word) - 1 # Map index back to unreversed list
last_slop_match = current_word_index + next_word_index + 1
slop_matches.append(last_slop_match)
current_word_index = last_slop_match
except ValueError:
# word not found in substring
i += 1
break
else:
i = last_slop_match
any(map(matches.add, slop_matches)) # Collect matching indices
else:
i += 1

# Match individual terms
for i, word in enumerate(text):
for term in terms:
if term.text == word:
matches.add(i)
break

for i, t in enumerate(tokens):
t.matched = i in matches
yield t
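
To make the slop bookkeeping above concrete, here is a standalone sketch (not part of this commit, plain lists only) that replays the comment's example, phrase 'one two three'~2 against 'one two two six three', using the same reversed-window, last-occurrence lookup:

words = u"one two two six three".split()
slop = 2
current = 0                                               # 'one' matched at index 0
for word in (u"two", u"three"):
    window = words[current + 1:current + 1 + slop][::-1]  # reversed slop window
    # index of the *last* occurrence of word in the window,
    # mapped back to the unreversed list
    current = current + (len(window) - window.index(word) - 1) + 1
print(current)   # 4 -> 'three' is reachable, so the phrase matches within slop 2

Picking the first 'two' (index 1) instead would leave 'three' at index 4 outside the slop window, which is exactly the failure the reversed lookup avoids.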


# Fragmenters

class Fragmenter(object):
@@ -791,7 +887,6 @@ def top_fragments(fragments, count, scorer, order, minscore=1):

def highlight(text, terms, analyzer, fragmenter, formatter, top=3,
scorer=None, minscore=1, order=FIRST, mode="query"):

if scorer is None:
scorer = BasicFragmentScorer()

@@ -881,7 +976,7 @@ def _merge_matched_tokens(tokens):
token = t.copy()
elif t.startchar <= token.endchar:
if t.endchar > token.endchar:
token.text += t.text[token.endchar-t.endchar:]
token.text += t.text[token.endchar - t.endchar:]
token.endchar = t.endchar
else:
yield token
@@ -892,7 +987,7 @@ def _merge_matched_tokens(tokens):
if token is not None:
yield token

def highlight_hit(self, hitobj, fieldname, text=None, top=3, minscore=1):
def highlight_hit(self, hitobj, fieldname, text=None, top=3, minscore=1, strict_phrase=False):
results = hitobj.results
schema = results.searcher.schema
field = schema[fieldname]
@@ -943,8 +1038,13 @@ def highlight_hit(self, hitobj, fieldname, text=None, top=3, minscore=1):
analyzer = results.searcher.schema[fieldname].analyzer
tokens = analyzer(text, positions=True, chars=True, mode="index",
removestops=False)

# Set Token.matched attribute for tokens that match a query term
tokens = set_matched_filter(tokens, words)
if strict_phrase:
terms, phrases = results.q.phrases()
tokens = set_matched_filter_phrases(tokens, text, terms, phrases)
else:
tokens = set_matched_filter(tokens, words)
tokens = self._merge_matched_tokens(tokens)
fragments = self.fragmenter.fragment_tokens(text, tokens)

10 changes: 5 additions & 5 deletions src/whoosh/query/compound.py
@@ -191,7 +191,7 @@ def simplify(self, ixreader):
subs = self.subqueries
if subs:
q = self.__class__([subq.simplify(ixreader) for subq in subs],
boost=self.boost).normalize()
boost=self.boost).normalize()
else:
q = qcore.NullQuery
return q
@@ -339,10 +339,10 @@ def _matcher(self, subs, searcher, context):
if matcher_type == self.AUTO_MATCHER:
dc = searcher.doc_count_all()
if (len(subs) < self.TOO_MANY_CLAUSES
and (needs_current
or self.scale
or len(subs) == 2
or dc > 5000)):
and (needs_current
or self.scale
or len(subs) == 2
or dc > 5000)):
# If the parent matcher needs the current match, or there's just
# two sub-matchers, use the standard binary tree of Unions
matcher_type = self.DEFAULT_MATCHER
23 changes: 23 additions & 0 deletions src/whoosh/query/qcore.py
@@ -374,6 +374,29 @@ def existing_terms(self, ixreader, phrases=True, expand=False, fieldname=None):
termset.add((fieldname, btext))
return termset

def phrases(self):
"""
Recursively get all individual terms and phrases that are part of this Query
"""

from whoosh.query.positional import Phrase
from whoosh.query.terms import Term

terms = []
phrases = []

if isinstance(self, Phrase):
phrases.append(self)
else:
for query in self.children():
if isinstance(query, Term):
terms.append(query)
else:
t, p = query.phrases()
phrases.extend(p)
terms.extend(t)
return terms, phrases
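
For illustration (not part of this commit), a sketch of what the new Query.phrases() returns for a mixed query; the schema and query string mirror the test added below:

from whoosh import fields, qparser

schema = fields.Schema(title=fields.TEXT(stored=True))
parser = qparser.QueryParser("title", schema=schema)
q = parser.parse(u'individual AND "phrase terms"')

terms, phrases = q.phrases()
print([t.text for t in terms])       # expected: ['individual']
print([p.words for p in phrases])    # expected: [['phrase', 'terms']]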

def leaves(self):
"""Returns an iterator of all the leaf queries in this query tree as a
flat series.
4 changes: 2 additions & 2 deletions src/whoosh/searching.py
@@ -1414,7 +1414,7 @@ def matched_terms(self):
raise NoTermsException
return self.results.docterms.get(self.docnum, [])

def highlights(self, fieldname, text=None, top=3, minscore=1):
def highlights(self, fieldname, text=None, top=3, minscore=1, strict_phrase=False):
"""Returns highlighted snippets from the given field::
r = searcher.search(myquery)
@@ -1450,7 +1450,7 @@ def highlights(self, fieldname, text=None, top=3, minscore=1):

hliter = self.results.highlighter
return hliter.highlight_hit(self, fieldname, text=text, top=top,
minscore=minscore)
minscore=minscore, strict_phrase=strict_phrase)

def more_like_this(self, fieldname, text=None, top=10, numterms=5,
model=classify.Bo1Model, normalize=True, filter=None):
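For illustration (not part of this commit), how the new strict_phrase flag is meant to be used from a Hit, assuming an index ix with a "title" TEXT field like the one built in the test below:

from whoosh import qparser

with ix.searcher() as s:
    q = qparser.QueryParser("title", schema=ix.schema).parse(u'"phrase terms"')
    for hit in s.search(q, terms=True):
        print(hit.highlights("title"))                       # default: phrase-agnostic
        print(hit.highlights("title", strict_phrase=True))   # only whole-phrase matches highlighted
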
52 changes: 51 additions & 1 deletion tests/test_highlighting.py
@@ -9,7 +9,6 @@
from whoosh.filedb.filestore import RamStorage
from whoosh.util.testing import TempIndex


_doc = u("alfa bravo charlie delta echo foxtrot golf hotel india juliet " +
"kilo lima")

@@ -23,6 +22,57 @@ def test_null_fragment():
assert htext == "alfa BRAVO charlie delta echo foxtrot golf hotel INDIA juliet kilo lima"


def test_phrase_strict():
def search(searcher, query_string):
parser = qparser.QueryParser("title", schema=ix.schema)
q = parser.parse(u(query_string))
result = searcher.search(q, terms=True)
result.fragmenter = highlight.ContextFragmenter()
result.formatter = highlight.UppercaseFormatter()
return result

schema = fields.Schema(id=fields.ID(stored=True),
title=fields.TEXT(stored=True))
ix = RamStorage().create_index(schema)
w = ix.writer()
w.add_document(id=u("1"), title=u("strict phrase highlights phrase terms but not individual terms"))
w.commit()

with ix.searcher() as s:
# Phrase
r = search(s, "\"phrase terms\"")

# Non-strict
outputs = [hit.highlights("title", strict_phrase=False) for hit in r]
assert outputs == ["strict PHRASE highlights PHRASE TERMS but not individual...TERMS"]

# Strict
outputs = [hit.highlights("title", strict_phrase=True) for hit in r]
assert outputs == ["phrase highlights PHRASE TERMS but not individual"]

# Phrase with slop
r = search(s, "\"strict highlights terms\"~2")

# Non-strict
outputs = [hit.highlights("title", strict_phrase=False) for hit in r]
assert outputs == ["STRICT phrase HIGHLIGHTS phrase TERMS but not individual...TERMS"]

# Strict
outputs = [hit.highlights("title", strict_phrase=True) for hit in r]
assert outputs == ["STRICT phrase HIGHLIGHTS phrase TERMS but not individual"]

# Phrase with individual terms
r = search(s, "individual AND \"phrase terms\"")

# Non-strict
outputs = [hit.highlights("title", strict_phrase=False) for hit in r]
assert outputs == ["strict PHRASE highlights PHRASE TERMS but not INDIVIDUAL TERMS"]

# Strict
outputs = [hit.highlights("title", strict_phrase=True) for hit in r]
assert outputs == ["phrase highlights PHRASE TERMS but not INDIVIDUAL terms"]


def test_sentence_fragment():
text = u("This is the first sentence. This one doesn't have the word. " +
"This sentence is the second. Third sentence here.")
