Merge pull request #528 from stevennic/StrictPhraseHighlighting
Strict Phrase Highlighting
fortable1999 authored Jan 18, 2019
2 parents f081266 + 413fd95 commit c823d16
Showing 5 changed files with 188 additions and 15 deletions.
114 changes: 107 additions & 7 deletions src/whoosh/highlight.py
@@ -49,13 +49,13 @@
"""

from __future__ import division

from collections import deque
from heapq import nlargest
from itertools import groupby

from whoosh.compat import htmlescape
from whoosh.analysis import Token

from whoosh.compat import htmlescape

# The default value for the maximum chars to examine when fragmenting
DEFAULT_CHARLIMIT = 2 ** 15
@@ -106,7 +106,7 @@ class Fragment(object):
available).
"""

def __init__(self, text, matches, startchar=0, endchar= -1):
def __init__(self, text, matches, startchar=0, endchar=-1):
"""
:param text: the source text of the fragment.
:param matches: a list of objects which have ``startchar`` and
@@ -158,11 +158,107 @@ def __lt__(self, other):
# Tokenizing

def set_matched_filter(tokens, termset):
"""
Mark tokens to be highlighted as matched.
Phrase agnostic: highlights all matching tokens individually,
even if the terms are part of a phrase
:param tokens: Result tokens to scan for matched terms to highlight
:param termset: Query terms
:return: yield each token with t.matched = True / False, indicating if the
token should be highlighted
"""
for t in tokens:
t.matched = t.text in termset
yield t
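
For illustration (not part of this commit), a minimal sketch of driving the phrase-agnostic filter directly; the analyzer call mirrors the one in highlight_hit further down, and the sample text and termset are made up:

# Sketch only: sample text and termset are illustrative.
from whoosh.analysis import StandardAnalyzer
from whoosh.highlight import set_matched_filter

analyzer = StandardAnalyzer()
tokens = analyzer(u"strict phrase highlights phrase terms",
                  positions=True, chars=True, mode="index", removestops=False)

# Every token whose text appears in the termset is marked matched=True,
# regardless of whether it occurs inside a queried phrase.
for t in set_matched_filter(tokens, {u"phrase", u"terms"}):
    print(t.text, t.matched)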


def set_matched_filter_phrases(tokens, text, terms, phrases):
"""
Mark tokens to be highlighted as matched. Used for Strict Phrase highlighting.
Phrase-aware: highlights only individual matches for individual query terms
and phrase matches for phrase terms.
:param tokens: Result tokens
:param text: Result text to scan for matched terms to highlight
:param terms: Individual query terms
:param phrases: Query Phrases
:return: yield each token with t.matched = True / False, indicating if the
token should be highlighted
"""

"""
Implementation note: Because the Token object follows a Singleton pattern,
we can only read each one once. Because phrase matching requires rescanning,
we require a rendered token list (the text parameter) instead. The function must
still yield Token objects at the end, so the text list is used as a way to build a list
of Token indices (the matches set). The yield loop at the end uses this
to properly set .matched on the yielded Token objects.
"""
text = text.split()
matches = set()

# Match phrases
for phrase in phrases:
i = 0
n_phrase_words = len(phrase.words)
slop = phrase.slop
while i < len(text):
if phrase.words[0] == text[i]: # If first word matched
if slop == 1:
# Simple substring match
if text[i + 1:i + n_phrase_words] == phrase.words[1:]: # If rest of phrase matches
any(map(matches.add, range(i, i + n_phrase_words))) # Collect matching indices
# Advance past match area.
# Choosing to ignore possible overlapping matches for efficiency due to low probability.
i += n_phrase_words
else:
i += 1
else:
# Slop match
current_word_index = first_slop_match = last_slop_match = i
slop_matches = [first_slop_match]
for word in phrase.words[1:]:
try:
"""
Find the *last* occurrence of word in the slop substring by reversing it and mapping the index back.
If multiple tokens match in the substring, picking the first one can overlook valid matches.
For example, phrase is: 'one two three'~2
Target substring is: 'one two two six three', which is a valid match.
[0] [1] [2] [3] [4]
Looking for the first match will find [0], then [1] then fail since [3] is more than ~2 words away
Looking for the last match will find [0], then, given a choice between [1] or [2], will pick [2],
making [4] visible from there
"""
text_sub = text[current_word_index + 1:current_word_index + 1 + slop][::-1] # Substring to scan (reversed)
len_sub = len(text_sub)
next_word_index = len_sub - text_sub.index(word) - 1 # Map index back to unreversed list
last_slop_match = current_word_index + next_word_index + 1
slop_matches.append(last_slop_match)
current_word_index = last_slop_match
except ValueError:
# word not found in substring
i += 1
break
else:
i = last_slop_match
any(map(matches.add, slop_matches)) # Collect matching indices
else:
i += 1

# Match individual terms
for i, word in enumerate(text):
for term in terms:
if term.text == word:
matches.add(i)
break

for i, t in enumerate(tokens):
t.matched = i in matches
yield t
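
To make the slop bookkeeping above concrete, here is a standalone sketch (not part of this commit, plain lists only) that replays the comment's example, phrase 'one two three'~2 against 'one two two six three', using the same reversed-window, last-occurrence lookup:

words = u"one two two six three".split()
slop = 2
current = 0                                               # 'one' matched at index 0
for word in (u"two", u"three"):
    window = words[current + 1:current + 1 + slop][::-1]  # reversed slop window
    # index of the *last* occurrence of word in the window,
    # mapped back to the unreversed list
    current = current + (len(window) - window.index(word) - 1) + 1
print(current)   # 4 -> 'three' is reachable, so the phrase matches within slop 2

Picking the first 'two' (index 1) instead would leave 'three' at index 4 outside the slop window, which is exactly the failure the reversed lookup avoids.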


# Fragmenters

class Fragmenter(object):
@@ -791,7 +887,6 @@ def top_fragments(fragments, count, scorer, order, minscore=1):

def highlight(text, terms, analyzer, fragmenter, formatter, top=3,
scorer=None, minscore=1, order=FIRST, mode="query"):

if scorer is None:
scorer = BasicFragmentScorer()

@@ -881,7 +976,7 @@ def _merge_matched_tokens(tokens):
token = t.copy()
elif t.startchar <= token.endchar:
if t.endchar > token.endchar:
token.text += t.text[token.endchar-t.endchar:]
token.text += t.text[token.endchar - t.endchar:]
token.endchar = t.endchar
else:
yield token
@@ -892,7 +987,7 @@ def _merge_matched_tokens(tokens):
if token is not None:
yield token

def highlight_hit(self, hitobj, fieldname, text=None, top=3, minscore=1):
def highlight_hit(self, hitobj, fieldname, text=None, top=3, minscore=1, strict_phrase=False):
results = hitobj.results
schema = results.searcher.schema
field = schema[fieldname]
@@ -943,8 +1038,13 @@ def highlight_hit(self, hitobj, fieldname, text=None, top=3, minscore=1):
analyzer = results.searcher.schema[fieldname].analyzer
tokens = analyzer(text, positions=True, chars=True, mode="index",
removestops=False)

# Set Token.matched attribute for tokens that match a query term
tokens = set_matched_filter(tokens, words)
if strict_phrase:
terms, phrases = results.q.phrases()
tokens = set_matched_filter_phrases(tokens, text, terms, phrases)
else:
tokens = set_matched_filter(tokens, words)
tokens = self._merge_matched_tokens(tokens)
fragments = self.fragmenter.fragment_tokens(text, tokens)

10 changes: 5 additions & 5 deletions src/whoosh/query/compound.py
@@ -191,7 +191,7 @@ def simplify(self, ixreader):
subs = self.subqueries
if subs:
q = self.__class__([subq.simplify(ixreader) for subq in subs],
boost=self.boost).normalize()
boost=self.boost).normalize()
else:
q = qcore.NullQuery
return q
@@ -339,10 +339,10 @@ def _matcher(self, subs, searcher, context):
if matcher_type == self.AUTO_MATCHER:
dc = searcher.doc_count_all()
if (len(subs) < self.TOO_MANY_CLAUSES
and (needs_current
or self.scale
or len(subs) == 2
or dc > 5000)):
and (needs_current
or self.scale
or len(subs) == 2
or dc > 5000)):
# If the parent matcher needs the current match, or there's just
# two sub-matchers, use the standard binary tree of Unions
matcher_type = self.DEFAULT_MATCHER
23 changes: 23 additions & 0 deletions src/whoosh/query/qcore.py
@@ -374,6 +374,29 @@ def existing_terms(self, ixreader, phrases=True, expand=False, fieldname=None):
termset.add((fieldname, btext))
return termset

def phrases(self):
"""
Recursively get all individual terms and phrases that are part of this Query
"""

from whoosh.query.positional import Phrase
from whoosh.query.terms import Term

terms = []
phrases = []

if isinstance(self, Phrase):
phrases.append(self)
else:
for query in self.children():
if isinstance(query, Term):
terms.append(query)
else:
t, p = query.phrases()
phrases.extend(p)
terms.extend(t)
return terms, phrases
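
For illustration (not part of this commit), a sketch of what the new Query.phrases() returns for a mixed query; the schema and query string mirror the test added below:

from whoosh import fields, qparser

schema = fields.Schema(title=fields.TEXT(stored=True))
parser = qparser.QueryParser("title", schema=schema)
q = parser.parse(u'individual AND "phrase terms"')

terms, phrases = q.phrases()
print([t.text for t in terms])       # expected: ['individual']
print([p.words for p in phrases])    # expected: [['phrase', 'terms']]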

def leaves(self):
"""Returns an iterator of all the leaf queries in this query tree as a
flat series.
4 changes: 2 additions & 2 deletions src/whoosh/searching.py
@@ -1414,7 +1414,7 @@ def matched_terms(self):
raise NoTermsException
return self.results.docterms.get(self.docnum, [])

def highlights(self, fieldname, text=None, top=3, minscore=1):
def highlights(self, fieldname, text=None, top=3, minscore=1, strict_phrase=False):
"""Returns highlighted snippets from the given field::
r = searcher.search(myquery)
@@ -1450,7 +1450,7 @@ def highlights(self, fieldname, text=None, top=3, minscore=1):

hliter = self.results.highlighter
return hliter.highlight_hit(self, fieldname, text=text, top=top,
minscore=minscore)
minscore=minscore, strict_phrase=strict_phrase)

def more_like_this(self, fieldname, text=None, top=10, numterms=5,
model=classify.Bo1Model, normalize=True, filter=None):
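For illustration (not part of this commit), how the new strict_phrase flag is meant to be used from a Hit, assuming an index ix with a "title" TEXT field like the one built in the test below:

from whoosh import qparser

with ix.searcher() as s:
    q = qparser.QueryParser("title", schema=ix.schema).parse(u'"phrase terms"')
    for hit in s.search(q, terms=True):
        print(hit.highlights("title"))                       # default: phrase-agnostic
        print(hit.highlights("title", strict_phrase=True))   # only whole-phrase matches highlighted
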
52 changes: 51 additions & 1 deletion tests/test_highlighting.py
@@ -9,7 +9,6 @@
from whoosh.filedb.filestore import RamStorage
from whoosh.util.testing import TempIndex


_doc = u("alfa bravo charlie delta echo foxtrot golf hotel india juliet " +
"kilo lima")

@@ -23,6 +22,57 @@ def test_null_fragment():
assert htext == "alfa BRAVO charlie delta echo foxtrot golf hotel INDIA juliet kilo lima"


def test_phrase_strict():
def search(searcher, query_string):
parser = qparser.QueryParser("title", schema=ix.schema)
q = parser.parse(u(query_string))
result = searcher.search(q, terms=True)
result.fragmenter = highlight.ContextFragmenter()
result.formatter = highlight.UppercaseFormatter()
return result

schema = fields.Schema(id=fields.ID(stored=True),
title=fields.TEXT(stored=True))
ix = RamStorage().create_index(schema)
w = ix.writer()
w.add_document(id=u("1"), title=u("strict phrase highlights phrase terms but not individual terms"))
w.commit()

with ix.searcher() as s:
# Phrase
r = search(s, "\"phrase terms\"")

# Non-strict
outputs = [hit.highlights("title", strict_phrase=False) for hit in r]
assert outputs == ["strict PHRASE highlights PHRASE TERMS but not individual...TERMS"]

# Strict
outputs = [hit.highlights("title", strict_phrase=True) for hit in r]
assert outputs == ["phrase highlights PHRASE TERMS but not individual"]

# Phrase with slop
r = search(s, "\"strict highlights terms\"~2")

# Non-strict
outputs = [hit.highlights("title", strict_phrase=False) for hit in r]
assert outputs == ["STRICT phrase HIGHLIGHTS phrase TERMS but not individual...TERMS"]

# Strict
outputs = [hit.highlights("title", strict_phrase=True) for hit in r]
assert outputs == ["STRICT phrase HIGHLIGHTS phrase TERMS but not individual"]

# Phrase with individual terms
r = search(s, "individual AND \"phrase terms\"")

# Non-strict
outputs = [hit.highlights("title", strict_phrase=False) for hit in r]
assert outputs == ["strict PHRASE highlights PHRASE TERMS but not INDIVIDUAL TERMS"]

# Strict
outputs = [hit.highlights("title", strict_phrase=True) for hit in r]
assert outputs == ["phrase highlights PHRASE TERMS but not INDIVIDUAL terms"]


def test_sentence_fragment():
text = u("This is the first sentence. This one doesn't have the word. " +
"This sentence is the second. Third sentence here.")
