From 07db88d659b5de964ab7c9ffa9ab98bdb7f90c20 Mon Sep 17 00:00:00 2001
From: Steven Nicolaou <steven.nicolaou@gmail.com>
Date: Mon, 15 Jul 2019 04:24:12 -0400
Subject: [PATCH 1/8] #532: Highlight longest overlapping token

---
 setup.py                   | 26 ++++++++++++++------------
 src/whoosh/highlight.py    | 11 ++++++++---
 tests/test_highlighting.py | 24 ++++++++++++++++++++++++
 3 files changed, 46 insertions(+), 15 deletions(-)
diff --git a/setup.py b/setup.py
index c4997749..d5accade 100644
--- a/setup.py
+++ b/setup.py
@@ -1,6 +1,8 @@
 #!python
 
-import os.path, sys
+import os.path
+import sys
+
 from setuptools import setup, find_packages
 from setuptools.command.test import test as TestCommand
 
@@ -20,7 +22,7 @@ def finalize_options(self):
         self.test_suite = True
 
     def run_tests(self):
-        #import here, cause outside the eggs aren't loaded
+        # Import here, because outside this method the eggs aren't loaded
         import pytest
         pytest.main(self.test_args)
 
@@ -44,18 +46,18 @@ def run_tests(self):
 
         zip_safe=True,
         install_requires=['cached-property'],
-        tests_require=['pytest'],
+        tests_require=['pytest', 'jieba'],
         cmdclass={'test': PyTest},
 
         classifiers=[
-        "Development Status :: 5 - Production/Stable",
-        "Intended Audience :: Developers",
-        "License :: OSI Approved :: BSD License",
-        "Natural Language :: English",
-        "Operating System :: OS Independent",
-        "Programming Language :: Python :: 2.5",
-        "Programming Language :: Python :: 3",
-        "Topic :: Software Development :: Libraries :: Python Modules",
-        "Topic :: Text Processing :: Indexing",
+            "Development Status :: 5 - Production/Stable",
+            "Intended Audience :: Developers",
+            "License :: OSI Approved :: BSD License",
+            "Natural Language :: English",
+            "Operating System :: OS Independent",
+            "Programming Language :: Python :: 2.5",
+            "Programming Language :: Python :: 3",
+            "Topic :: Software Development :: Libraries :: Python Modules",
+            "Topic :: Text Processing :: Indexing",
         ],
     )
diff --git a/src/whoosh/highlight.py b/src/whoosh/highlight.py
index 6fbbe6d0..212a7157 100644
--- a/src/whoosh/highlight.py
+++ b/src/whoosh/highlight.py
@@ -131,8 +131,8 @@ def __init__(self, text, matches, startchar=0, endchar=-1):
                 self.matched_terms.add(t.text)
 
     def __repr__(self):
-        return "<Fragment %d:%d %d>" % (self.startchar, self.endchar,
-                                        len(self.matches))
+        return "<Fragment %d:%d has %d matches>" % (self.startchar, self.endchar,
+                                                    len(self.matches))
 
     def __len__(self):
         return self.endchar - self.startchar
@@ -695,7 +695,12 @@ def format_fragment(self, fragment, replace=False):
         index = fragment.startchar
         text = fragment.text
 
-        for t in fragment.matches:
+        # For overlapping tokens (such as in Chinese), sort by start position,
+        # then by descending length (longest token first).
+        # Because the formatter is sequential, it will only pick the first
+        # token for a given position to highlight. Sorting this way makes
+        # sure it picks the longest overlapping token.
+        for t in sorted(fragment.matches, key=lambda token: (token.startchar, -(token.endchar - token.startchar))):
             if t.startchar is None:
                 continue
             if t.startchar < index:
diff --git a/tests/test_highlighting.py b/tests/test_highlighting.py
index 523dff6b..c2cb94ff 100644
--- a/tests/test_highlighting.py
+++ b/tests/test_highlighting.py
@@ -2,6 +2,7 @@
 
 from __future__ import with_statement
 
+from jieba.analyse import ChineseAnalyzer
 import pytest
 
 from whoosh import analysis, highlight, fields, qparser, query
@@ -330,3 +331,26 @@ def test_whole_noterms():
 
         hi = r[0].highlights("text", minscore=0)
         assert hi == u("alfa bravo charlie delta echo foxtrot golf")
+
+
+def test_overlapping_tokens():
+    query_string = "马克思"
+    text = "两次历史性飞跃与马克思主义中国化"
+    analyzer = ChineseAnalyzer()
+    formatter = highlight.HtmlFormatter()
+
+    terms = [token.text for token in analyzer(query_string)]
+
+    assert terms == ['马克', '马克思']
+
+    output = highlight.highlight(
+        text,
+        terms,
+        analyzer,
+        highlight.WholeFragmenter(),
+        formatter
+    )
+
+    assert output == '两次历史性飞跃与<strong class="match term0">马克思</strong>主义中国化', \
+        'The longest overlapping token 马克思 was not selected by the highlighter'
+    # as opposed to '两次历史性飞跃与<strong class="match term0">马克</strong>思主义中国化'

From 4b02612b4fbfee2c67bc2e5c43d055e2c76c92c7 Mon Sep 17 00:00:00 2001
From: Steven Nicolaou <steven.nicolaou@gmail.com>
Date: Wed, 17 Jul 2019 02:02:58 -0400
Subject: [PATCH 2/8] #532: Add jieba to Travis

---
 .travis.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.travis.yml b/.travis.yml
index a75414cf..6f458cbf 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -9,7 +9,7 @@ python:
   - "3.7"
 
 install:
-  - pip install pytest nose codecov coverage cached-property
+  - pip install pytest nose codecov coverage cached-property jieba
 
 script:
   - nosetests  --with-coverage

From 4737bb31f303fb91ce73d2e1ef3468efea499748 Mon Sep 17 00:00:00 2001
From: Steven Nicolaou <steven.nicolaou@gmail.com>
Date: Wed, 17 Jul 2019 02:16:57 -0400
Subject: [PATCH 3/8] Wrap test input strings in u()

---
 tests/test_highlighting.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_highlighting.py b/tests/test_highlighting.py
index c2cb94ff..714a395f 100644
--- a/tests/test_highlighting.py
+++ b/tests/test_highlighting.py
@@ -334,8 +334,8 @@ def test_whole_noterms():
 
 
 def test_overlapping_tokens():
-    query_string = "马克思"
-    text = "两次历史性飞跃与马克思主义中国化"
+    query_string = u("马克思")
+    text = u("两次历史性飞跃与马克思主义中国化")
     analyzer = ChineseAnalyzer()
     formatter = highlight.HtmlFormatter()
 

From e96c6cc38e1a47adbc6fb0446470e79650836194 Mon Sep 17 00:00:00 2001
From: Steven Nicolaou <steven.nicolaou@gmail.com>
Date: Wed, 17 Jul 2019 02:19:55 -0400
Subject: [PATCH 4/8] Wrap expected terms in u()

---
 tests/test_highlighting.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_highlighting.py b/tests/test_highlighting.py
index 714a395f..7416a516 100644
--- a/tests/test_highlighting.py
+++ b/tests/test_highlighting.py
@@ -341,7 +341,7 @@ def test_overlapping_tokens():
 
     terms = [token.text for token in analyzer(query_string)]
 
-    assert terms == ['马克', '马克思']
+    assert terms == [u('马克'), u('马克思')]
 
     output = highlight.highlight(
         text,

From 407b6c012a9c8a1cfe60e303eb3ca484f80f59e2 Mon Sep 17 00:00:00 2001
From: Steven Nicolaou <steven.nicolaou@gmail.com>
Date: Wed, 17 Jul 2019 02:58:44 -0400
Subject: [PATCH 5/8] Remove assert on analyzer terms from test

---
 tests/test_highlighting.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tests/test_highlighting.py b/tests/test_highlighting.py
index 7416a516..17069078 100644
--- a/tests/test_highlighting.py
+++ b/tests/test_highlighting.py
@@ -341,8 +341,6 @@ def test_overlapping_tokens():
 
     terms = [token.text for token in analyzer(query_string)]
 
-    assert terms == [u('马克'), u('马克思')]
-
     output = highlight.highlight(
         text,
         terms,

From 3c9379f06022cded307047b90a2bbf7edd455500 Mon Sep 17 00:00:00 2001
From: Steven Nicolaou <steven.nicolaou@gmail.com>
Date: Wed, 17 Jul 2019 03:04:02 -0400
Subject: [PATCH 6/8] Wrap expected highlight output in u()

---
 tests/test_highlighting.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_highlighting.py b/tests/test_highlighting.py
index 17069078..c29f0f67 100644
--- a/tests/test_highlighting.py
+++ b/tests/test_highlighting.py
@@ -349,6 +349,6 @@ def test_overlapping_tokens():
         formatter
     )
 
-    assert output == '两次历史性飞跃与<strong class="match term0">马克思</strong>主义中国化', \
+    assert output == u('两次历史性飞跃与<strong class="match term0">马克思</strong>主义中国化'), \
         'The longest overlapping token 马克思 was not selected by the highlighter'
     # as opposed to '两次历史性飞跃与<strong class="match term0">马克</strong>思主义中国化'

From a7f8243acb1ea659d2d628b9cbbb2bf40a380b17 Mon Sep 17 00:00:00 2001
From: Steven Nicolaou <steven.nicolaou@gmail.com>
Date: Wed, 17 Jul 2019 03:16:11 -0400
Subject: [PATCH 7/8] Reorder test imports; add output to assert message

---
 tests/test_highlighting.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_highlighting.py b/tests/test_highlighting.py
index c29f0f67..b22db8da 100644
--- a/tests/test_highlighting.py
+++ b/tests/test_highlighting.py
@@ -2,8 +2,8 @@
 
 from __future__ import with_statement
 
-from jieba.analyse import ChineseAnalyzer
 import pytest
+from jieba.analyse import ChineseAnalyzer
 
 from whoosh import analysis, highlight, fields, qparser, query
 from whoosh.compat import u
@@ -350,5 +350,5 @@ def test_overlapping_tokens():
     )
 
     assert output == u('两次历史性飞跃与<strong class="match term0">马克思</strong>主义中国化'), \
-        'The longest overlapping token 马克思 was not selected by the highlighter'
+        'The longest overlapping token 马克思 was not selected by the highlighter' + ' : ' + output
     # as opposed to '两次历史性飞跃与<strong class="match term0">马克</strong>思主义中国化'

From cbb9f77cad9ae29e0c59f33b1c09ee08b782789d Mon Sep 17 00:00:00 2001
From: Steven Nicolaou <steven.nicolaou@gmail.com>
Date: Thu, 18 Jul 2019 02:20:32 -0400
Subject: [PATCH 8/8] Fix Unicode in test

---
 tests/test_highlighting.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/test_highlighting.py b/tests/test_highlighting.py
index b22db8da..1647d1bf 100644
--- a/tests/test_highlighting.py
+++ b/tests/test_highlighting.py
@@ -334,8 +334,8 @@ def test_whole_noterms():
 
 
 def test_overlapping_tokens():
-    query_string = u("马克思")
-    text = u("两次历史性飞跃与马克思主义中国化")
+    query_string = u'马克思'
+    text = u'两次历史性飞跃与马克思主义中国化'
     analyzer = ChineseAnalyzer()
     formatter = highlight.HtmlFormatter()
 
@@ -349,6 +349,6 @@ def test_overlapping_tokens():
         formatter
     )
 
-    assert output == u('两次历史性飞跃与<strong class="match term0">马克思</strong>主义中国化'), \
-        'The longest overlapping token 马克思 was not selected by the highlighter' + ' : ' + output
+    assert output == u'两次历史性飞跃与<strong class="match term0">马克思</strong>主义中国化', \
+        u'The longest overlapping token 马克思 was not selected by the highlighter'
     # as opposed to '两次历史性飞跃与<strong class="match term0">马克</strong>思主义中国化'