From 07db88d659b5de964ab7c9ffa9ab98bdb7f90c20 Mon Sep 17 00:00:00 2001 From: Steven Nicolaou Date: Mon, 15 Jul 2019 04:24:12 -0400 Subject: [PATCH 1/8] #532: Highlight longest overlapping token --- setup.py | 26 ++++++++++++++------------ src/whoosh/highlight.py | 11 ++++++++--- tests/test_highlighting.py | 24 ++++++++++++++++++++++++ 3 files changed, 46 insertions(+), 15 deletions(-) diff --git a/setup.py b/setup.py index c4997749..d5accade 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,8 @@ #!python -import os.path, sys +import os.path +import sys + from setuptools import setup, find_packages from setuptools.command.test import test as TestCommand @@ -20,7 +22,7 @@ def finalize_options(self): self.test_suite = True def run_tests(self): - #import here, cause outside the eggs aren't loaded + # import here, cause outside the eggs aren't loaded import pytest pytest.main(self.test_args) @@ -44,18 +46,18 @@ def run_tests(self): zip_safe=True, install_requires=['cached-property'], - tests_require=['pytest'], + tests_require=['pytest', 'jieba'], cmdclass={'test': PyTest}, classifiers=[ - "Development Status :: 5 - Production/Stable", - "Intended Audience :: Developers", - "License :: OSI Approved :: BSD License", - "Natural Language :: English", - "Operating System :: OS Independent", - "Programming Language :: Python :: 2.5", - "Programming Language :: Python :: 3", - "Topic :: Software Development :: Libraries :: Python Modules", - "Topic :: Text Processing :: Indexing", + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Developers", + "License :: OSI Approved :: BSD License", + "Natural Language :: English", + "Operating System :: OS Independent", + "Programming Language :: Python :: 2.5", + "Programming Language :: Python :: 3", + "Topic :: Software Development :: Libraries :: Python Modules", + "Topic :: Text Processing :: Indexing", ], ) diff --git a/src/whoosh/highlight.py b/src/whoosh/highlight.py index 6fbbe6d0..212a7157 100644 --- 
a/src/whoosh/highlight.py +++ b/src/whoosh/highlight.py @@ -131,8 +131,8 @@ def __init__(self, text, matches, startchar=0, endchar=-1): self.matched_terms.add(t.text) def __repr__(self): - return "<Fragment %d:%d %d>" % (self.startchar, self.endchar, - len(self.matches)) + return "<Fragment %d:%d %d>" % (self.startchar, self.endchar, + len(self.matches)) def __len__(self): return self.endchar - self.startchar @@ -695,7 +695,12 @@ def format_fragment(self, fragment, replace=False): index = fragment.startchar text = fragment.text - for t in fragment.matches: + # For overlapping tokens (such as in Chinese), sort by position, + # then by inverse of length. + # Because the formatter is sequential, it will only pick the first + # token for a given position to highlight. This makes sure it picks + # the longest overlapping token. + for t in sorted(fragment.matches, key=lambda token: (token.startchar, -(token.endchar - token.startchar))): if t.startchar is None: continue if t.startchar < index: diff --git a/tests/test_highlighting.py b/tests/test_highlighting.py index 523dff6b..c2cb94ff 100644 --- a/tests/test_highlighting.py +++ b/tests/test_highlighting.py @@ -2,6 +2,7 @@ from __future__ import with_statement +from jieba.analyse import ChineseAnalyzer import pytest from whoosh import analysis, highlight, fields, qparser, query @@ -330,3 +331,26 @@ def test_whole_noterms(): hi = r[0].highlights("text", minscore=0) assert hi == u("alfa bravo charlie delta echo foxtrot golf") + + +def test_overlapping_tokens(): + query_string = "马克思" + text = "两次历史性飞跃与马克思主义中国化" + analyzer = ChineseAnalyzer() + formatter = highlight.HtmlFormatter() + + terms = [token.text for token in analyzer(query_string)] + + assert terms == ['马克', '马克思'] + + output = highlight.highlight( + text, + terms, + analyzer, + highlight.WholeFragmenter(), + formatter + ) + + assert output == '两次历史性飞跃与<strong class="match term0">马克思</strong>主义中国化', \ + 'The longest overlapping token 马克思 was not selected by the highlighter' + # as opposed to '两次历史性飞跃与<strong class="match term0">马克</strong>思主义中国化' From 
4b02612b4fbfee2c67bc2e5c43d055e2c76c92c7 Mon Sep 17 00:00:00 2001 From: Steven Nicolaou Date: Wed, 17 Jul 2019 02:02:58 -0400 Subject: [PATCH 2/8] #532: Add jieba to Travis --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index a75414cf..6f458cbf 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,7 +9,7 @@ python: - "3.7" install: - - pip install pytest nose codecov coverage cached-property + - pip install pytest nose codecov coverage cached-property jieba script: - nosetests --with-coverage From 4737bb31f303fb91ce73d2e1ef3468efea499748 Mon Sep 17 00:00:00 2001 From: Steven Nicolaou Date: Wed, 17 Jul 2019 02:16:57 -0400 Subject: [PATCH 3/8] Unicode input --- tests/test_highlighting.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_highlighting.py b/tests/test_highlighting.py index c2cb94ff..714a395f 100644 --- a/tests/test_highlighting.py +++ b/tests/test_highlighting.py @@ -334,8 +334,8 @@ def test_whole_noterms(): def test_overlapping_tokens(): - query_string = "马克思" - text = "两次历史性飞跃与马克思主义中国化" + query_string = u("马克思") + text = u("两次历史性飞跃与马克思主义中国化") analyzer = ChineseAnalyzer() formatter = highlight.HtmlFormatter() From e96c6cc38e1a47adbc6fb0446470e79650836194 Mon Sep 17 00:00:00 2001 From: Steven Nicolaou Date: Wed, 17 Jul 2019 02:19:55 -0400 Subject: [PATCH 4/8] Unicode input --- tests/test_highlighting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_highlighting.py b/tests/test_highlighting.py index 714a395f..7416a516 100644 --- a/tests/test_highlighting.py +++ b/tests/test_highlighting.py @@ -341,7 +341,7 @@ def test_overlapping_tokens(): terms = [token.text for token in analyzer(query_string)] - assert terms == ['马克', '马克思'] + assert terms == [u('马克'), u('马克思')] output = highlight.highlight( text, From 407b6c012a9c8a1cfe60e303eb3ca484f80f59e2 Mon Sep 17 00:00:00 2001 From: Steven Nicolaou Date: Wed, 17 Jul 2019 02:58:44 -0400 Subject: 
[PATCH 5/8] Remove assert --- tests/test_highlighting.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/test_highlighting.py b/tests/test_highlighting.py index 7416a516..17069078 100644 --- a/tests/test_highlighting.py +++ b/tests/test_highlighting.py @@ -341,8 +341,6 @@ def test_overlapping_tokens(): terms = [token.text for token in analyzer(query_string)] - assert terms == [u('马克'), u('马克思')] - output = highlight.highlight( text, terms, From 3c9379f06022cded307047b90a2bbf7edd455500 Mon Sep 17 00:00:00 2001 From: Steven Nicolaou Date: Wed, 17 Jul 2019 03:04:02 -0400 Subject: [PATCH 6/8] Unicode --- tests/test_highlighting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_highlighting.py b/tests/test_highlighting.py index 17069078..c29f0f67 100644 --- a/tests/test_highlighting.py +++ b/tests/test_highlighting.py @@ -349,6 +349,6 @@ def test_overlapping_tokens(): formatter ) - assert output == '两次历史性飞跃与<strong class="match term0">马克思</strong>主义中国化', \ + assert output == u('两次历史性飞跃与<strong class="match term0">马克思</strong>主义中国化'), \ 'The longest overlapping token 马克思 was not selected by the highlighter' # as opposed to '两次历史性飞跃与<strong class="match term0">马克</strong>思主义中国化' From a7f8243acb1ea659d2d628b9cbbb2bf40a380b17 Mon Sep 17 00:00:00 2001 From: Steven Nicolaou Date: Wed, 17 Jul 2019 03:16:11 -0400 Subject: [PATCH 7/8] Unicode --- tests/test_highlighting.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_highlighting.py b/tests/test_highlighting.py index c29f0f67..b22db8da 100644 --- a/tests/test_highlighting.py +++ b/tests/test_highlighting.py @@ -2,8 +2,8 @@ from __future__ import with_statement -from jieba.analyse import ChineseAnalyzer import pytest +from jieba.analyse import ChineseAnalyzer from whoosh import analysis, highlight, fields, qparser, query from whoosh.compat import u @@ -350,5 +350,5 @@ def test_overlapping_tokens(): ) assert output == u('两次历史性飞跃与<strong class="match term0">马克思</strong>主义中国化'), \ - 'The longest overlapping token 马克思 was not selected by the highlighter' + 'The longest overlapping token 马克思 was not 
selected by the highlighter' + ' : ' + output # as opposed to '两次历史性飞跃与<strong class="match term0">马克</strong>思主义中国化' From cbb9f77cad9ae29e0c59f33b1c09ee08b782789d Mon Sep 17 00:00:00 2001 From: Steven Nicolaou Date: Thu, 18 Jul 2019 02:20:32 -0400 Subject: [PATCH 8/8] Fix Unicode in test --- tests/test_highlighting.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_highlighting.py b/tests/test_highlighting.py index b22db8da..1647d1bf 100644 --- a/tests/test_highlighting.py +++ b/tests/test_highlighting.py @@ -334,8 +334,8 @@ def test_whole_noterms(): def test_overlapping_tokens(): - query_string = u("马克思") - text = u("两次历史性飞跃与马克思主义中国化") + query_string = u'马克思' + text = u'两次历史性飞跃与马克思主义中国化' analyzer = ChineseAnalyzer() formatter = highlight.HtmlFormatter() @@ -349,6 +349,6 @@ def test_overlapping_tokens(): formatter ) - assert output == u('两次历史性飞跃与<strong class="match term0">马克思</strong>主义中国化'), \ - 'The longest overlapping token 马克思 was not selected by the highlighter' + ' : ' + output + assert output == u'两次历史性飞跃与<strong class="match term0">马克思</strong>主义中国化', \ + u'The longest overlapping token 马克思 was not selected by the highlighter' # as opposed to '两次历史性飞跃与<strong class="match term0">马克</strong>思主义中国化'