Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Highlight longest overlapping token #546

Open
wants to merge 8 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ python:
- "3.7"

install:
- pip install pytest nose codecov coverage cached-property
- pip install pytest nose codecov coverage cached-property jieba

script:
- nosetests --with-coverage
Expand Down
26 changes: 14 additions & 12 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
#!python

import os.path, sys
import os.path
import sys

from setuptools import setup, find_packages
from setuptools.command.test import test as TestCommand

Expand All @@ -20,7 +22,7 @@ def finalize_options(self):
self.test_suite = True

def run_tests(self):
#import here, cause outside the eggs aren't loaded
# import here, cause outside the eggs aren't loaded
import pytest
pytest.main(self.test_args)

Expand All @@ -44,18 +46,18 @@ def run_tests(self):

zip_safe=True,
install_requires=['cached-property'],
tests_require=['pytest'],
tests_require=['pytest', 'jieba'],
cmdclass={'test': PyTest},

classifiers=[
"Development Status :: 5 - Production/Stable",
"Intended Audience :: Developers",
"License :: OSI Approved :: BSD License",
"Natural Language :: English",
"Operating System :: OS Independent",
"Programming Language :: Python :: 2.5",
"Programming Language :: Python :: 3",
"Topic :: Software Development :: Libraries :: Python Modules",
"Topic :: Text Processing :: Indexing",
"Development Status :: 5 - Production/Stable",
"Intended Audience :: Developers",
"License :: OSI Approved :: BSD License",
"Natural Language :: English",
"Operating System :: OS Independent",
"Programming Language :: Python :: 2.5",
"Programming Language :: Python :: 3",
"Topic :: Software Development :: Libraries :: Python Modules",
"Topic :: Text Processing :: Indexing",
],
)
11 changes: 8 additions & 3 deletions src/whoosh/highlight.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,8 +131,8 @@ def __init__(self, text, matches, startchar=0, endchar=-1):
self.matched_terms.add(t.text)

def __repr__(self):
return "<Fragment %d:%d %d>" % (self.startchar, self.endchar,
len(self.matches))
return "<Fragment %d:%d has %d matches>" % (self.startchar, self.endchar,
len(self.matches))

def __len__(self):
return self.endchar - self.startchar
Expand Down Expand Up @@ -695,7 +695,12 @@ def format_fragment(self, fragment, replace=False):
index = fragment.startchar
text = fragment.text

for t in fragment.matches:
# For overlapping tokens (such as in Chinese), sort by start position,
# then by descending length (longest token first).
# Because the formatter is sequential, it only highlights the first
# token it encounters at a given position, so this ordering makes sure
# it picks the longest overlapping token.
for t in sorted(fragment.matches, key=lambda token: (token.startchar, -(token.endchar - token.startchar))):
if t.startchar is None:
continue
if t.startchar < index:
Expand Down
22 changes: 22 additions & 0 deletions tests/test_highlighting.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from __future__ import with_statement

import pytest
from jieba.analyse import ChineseAnalyzer

from whoosh import analysis, highlight, fields, qparser, query
from whoosh.compat import u
Expand Down Expand Up @@ -330,3 +331,24 @@ def test_whole_noterms():

hi = r[0].highlights("text", minscore=0)
assert hi == u("alfa bravo charlie delta echo foxtrot golf")


def test_overlapping_tokens():
    """Check that the longest of several overlapping tokens is highlighted.

    The query string is tokenized with ``ChineseAnalyzer`` and the resulting
    term texts are highlighted in a longer sentence, using the same analyzer,
    a ``WholeFragmenter`` and an ``HtmlFormatter``. The whole three-character
    query term must end up inside a single ``<strong>`` element.
    """
    needle = u'马克思'
    haystack = u'两次历史性飞跃与马克思主义中国化'
    chinese = ChineseAnalyzer()
    html = highlight.HtmlFormatter()

    # Term texts to highlight, produced by analyzing the query string.
    wanted = [tok.text for tok in chinese(needle)]

    fragmenter = highlight.WholeFragmenter()
    rendered = highlight.highlight(haystack, wanted, chinese, fragmenter, html)

    # The full term must be wrapped, as opposed to only its two-character
    # prefix: '两次历史性飞跃与<strong class="match term0">马克</strong>思主义中国化'
    expected = u'两次历史性飞跃与<strong class="match term0">马克思</strong>主义中国化'
    assert rendered == expected, \
        u'The longest overlapping token 马克思 was not selected by the highlighter'