-
Notifications
You must be signed in to change notification settings - Fork 24.9k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'master' into translog-generation
* master: Eclipse: move print margin to 100 columns; Add support for fragment_length in the unified highlighter (#23431)
- Loading branch information
Showing
11 changed files
with
762 additions
and
338 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
171 changes: 171 additions & 0 deletions
171
core/src/main/java/org/apache/lucene/search/uhighlight/BoundedBreakIteratorScanner.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,171 @@ | ||
/* | ||
* Licensed to Elasticsearch under one or more contributor | ||
* license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright | ||
* ownership. Elasticsearch licenses this file to you under | ||
* the Apache License, Version 2.0 (the "License"); you may | ||
* not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, | ||
* software distributed under the License is distributed on an | ||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
* KIND, either express or implied. See the License for the | ||
* specific language governing permissions and limitations | ||
* under the License. | ||
*/ | ||
package org.apache.lucene.search.uhighlight; | ||
|
||
import java.text.BreakIterator; | ||
import java.text.CharacterIterator; | ||
import java.util.Locale; | ||
|
||
/**
 * A custom break iterator that scans text to find break-delimited passages bounded by
 * a provided maximum length. This class delegates the boundary search to a first level
 * break iterator. When this break iterator finds a passage greater than the maximum length
 * a secondary break iterator is used to re-split the passage at the first boundary after
 * maximum length.
 * This is useful to split passages created by {@link BreakIterator}s like {@code sentence} that
 * can create big outliers on semi-structured text.
 *
 * <p>Only {@link #preceding(int)}, {@link #following(int)}, {@link #current()},
 * {@link #getText()} and the {@code setText} variants are supported; the remaining navigation
 * methods throw {@link IllegalStateException}.
 *
 * WARNING: This break iterator is designed to work with the {@link UnifiedHighlighter}.
 **/
public class BoundedBreakIteratorScanner extends BreakIterator {
    /** First level iterator: produces the outer passage ("window"). */
    private final BreakIterator mainBreak;
    /** Second level iterator: used to re-split a window that is longer than {@link #maxLen}. */
    private final BreakIterator innerBreak;
    /** Length threshold above which a window is re-split with {@link #innerBreak}. */
    private final int maxLen;

    /** {@code offset - 1} of the last {@link #preceding(int)} call, or -1 after a reset. */
    private int lastPrecedingOffset = -1;
    /** Boundaries of the last passage returned by {@link #mainBreak}. */
    private int windowStart = -1;
    private int windowEnd = -1;
    /** Boundaries of the current (possibly re-split) passage. */
    private int innerStart = -1;
    private int innerEnd = 0;

    /**
     * @param mainBreak  iterator used to find the outer passages
     * @param innerBreak iterator used to re-split passages longer than {@code maxLen}
     * @param maxLen     length above which a passage is re-split; must not be negative
     * @throws IllegalArgumentException if {@code maxLen} is negative
     */
    private BoundedBreakIteratorScanner(BreakIterator mainBreak,
                                        BreakIterator innerBreak,
                                        int maxLen) {
        // A negative bound would turn "offset - maxLen" in preceding() into an offset located
        // after the requested one, so reject it up front with an explicit message.
        if (maxLen < 0) {
            throw new IllegalArgumentException("maxLen must be >= 0, got: " + maxLen);
        }
        this.mainBreak = mainBreak;
        this.innerBreak = innerBreak;
        this.maxLen = maxLen;
    }

    /**
     * Returns a {@link BreakIterator#getSentenceInstance(Locale)} bounded to maxLen.
     * Secondary boundaries are found using a {@link BreakIterator#getWordInstance(Locale)}.
     *
     * @param locale locale used to create both the sentence and the word iterators
     * @param maxLen length above which a sentence is re-split on word boundaries
     * @return a new bounded break iterator
     * @throws IllegalArgumentException if {@code maxLen} is negative
     */
    public static BreakIterator getSentence(Locale locale, int maxLen) {
        final BreakIterator sBreak = BreakIterator.getSentenceInstance(locale);
        final BreakIterator wBreak = BreakIterator.getWordInstance(locale);
        return new BoundedBreakIteratorScanner(sBreak, wBreak, maxLen);
    }

    /** Returns the text of the first level break iterator. */
    @Override
    public CharacterIterator getText() {
        return mainBreak.getText();
    }

    /** Clears the scan state and sets the new text on both underlying iterators. */
    @Override
    public void setText(CharacterIterator newText) {
        reset();
        mainBreak.setText(newText);
        innerBreak.setText(newText);
    }

    /** Clears the scan state and sets the new text on both underlying iterators. */
    @Override
    public void setText(String newText) {
        reset();
        mainBreak.setText(newText);
        innerBreak.setText(newText);
    }

    /** Restores every piece of scan state to its initial value. */
    private void reset() {
        lastPrecedingOffset = -1;
        windowStart = -1;
        windowEnd = -1;
        innerStart = -1;
        innerEnd = 0;
    }

    /**
     * Must be called with increasing offset. See {@link FieldHighlighter} for usage.
     *
     * <p>Computes the passage around {@code offset} and returns its start; the matching end
     * is retrieved afterwards with {@code following(offset - 1)}.
     *
     * @param offset the offset to search from
     * @return the start of the current passage
     * @throws IllegalArgumentException if {@code offset} is smaller than the value recorded by
     *                                  the previous call
     */
    @Override
    public int preceding(int offset) {
        if (offset < lastPrecedingOffset) {
            throw new IllegalArgumentException("offset < lastPrecedingOffset: " +
                "usage doesn't look like UnifiedHighlighter");
        }
        if (offset > windowStart && offset < windowEnd) {
            // still inside the window found by the previous call:
            // continue from the end of the previous split
            innerStart = innerEnd;
            innerEnd = windowEnd;
        } else {
            // new window: ask the first level iterator for its boundaries
            windowStart = innerStart = mainBreak.preceding(offset);
            windowEnd = innerEnd = mainBreak.following(offset - 1);
        }

        if (innerEnd - innerStart > maxLen) {
            // the current split is too big,
            // so starting from the current term we try to find boundaries on the left first
            if (offset - maxLen > innerStart) {
                innerStart = Math.max(innerStart,
                    innerBreak.preceding(offset - maxLen));
            }
            // and then we try to expand the passage to the right with the remaining size
            int remaining = Math.max(0, maxLen - (offset - innerStart));
            if (offset + remaining < windowEnd) {
                innerEnd = Math.min(windowEnd,
                    innerBreak.following(offset + remaining));
            }
        }
        // following() is expected to be invoked with offset - 1
        lastPrecedingOffset = offset - 1;
        return innerStart;
    }

    /**
     * Can be invoked only after a call to preceding(offset+1).
     * See {@link FieldHighlighter} for usage.
     *
     * @param offset must be the offset recorded by the last {@link #preceding(int)} call
     * @return the end of the current passage
     * @throws IllegalArgumentException if {@code offset} does not match the recorded offset
     */
    @Override
    public int following(int offset) {
        if (offset != lastPrecedingOffset || innerEnd == -1) {
            throw new IllegalArgumentException("offset != lastPrecedingOffset: " +
                "usage doesn't look like UnifiedHighlighter");
        }
        return innerEnd;
    }

    @Override
    public int current() {
        // Returns the last offset of the current split
        return this.innerEnd;
    }

    /** Not supported: always throws {@link IllegalStateException}. */
    @Override
    public int first() {
        throw new IllegalStateException("first() should not be called in this context");
    }

    /** Not supported: always throws {@link IllegalStateException}. */
    @Override
    public int next() {
        throw new IllegalStateException("next() should not be called in this context");
    }

    /** Not supported: always throws {@link IllegalStateException}. */
    @Override
    public int last() {
        throw new IllegalStateException("last() should not be called in this context");
    }

    /** Not supported: always throws {@link IllegalStateException}. */
    @Override
    public int next(int n) {
        throw new IllegalStateException("next(n) should not be called in this context");
    }

    /** Not supported: always throws {@link IllegalStateException}. */
    @Override
    public int previous() {
        throw new IllegalStateException("previous() should not be called in this context");
    }
}
79 changes: 79 additions & 0 deletions
79
core/src/main/java/org/apache/lucene/search/uhighlight/CustomFieldHighlighter.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,79 @@ | ||
/* | ||
* Licensed to Elasticsearch under one or more contributor | ||
* license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright | ||
* ownership. Elasticsearch licenses this file to you under | ||
* the Apache License, Version 2.0 (the "License"); you may | ||
* not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, | ||
* software distributed under the License is distributed on an | ||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
* KIND, either express or implied. See the License for the | ||
* specific language governing permissions and limitations | ||
* under the License. | ||
*/ | ||
|
||
package org.apache.lucene.search.uhighlight; | ||
|
||
import java.text.BreakIterator; | ||
import java.util.Locale; | ||
|
||
import static org.apache.lucene.search.uhighlight.CustomUnifiedHighlighter.MULTIVAL_SEP_CHAR; | ||
|
||
/** | ||
* Custom {@link FieldHighlighter} that creates a single passage bounded to {@code noMatchSize} when | ||
* no highlights were found. | ||
*/ | ||
class CustomFieldHighlighter extends FieldHighlighter { | ||
private static final Passage[] EMPTY_PASSAGE = new Passage[0]; | ||
|
||
private final Locale breakIteratorLocale; | ||
private final int noMatchSize; | ||
private final String fieldValue; | ||
|
||
CustomFieldHighlighter(String field, FieldOffsetStrategy fieldOffsetStrategy, | ||
Locale breakIteratorLocale, BreakIterator breakIterator, | ||
PassageScorer passageScorer, int maxPassages, int maxNoHighlightPassages, | ||
PassageFormatter passageFormatter, int noMatchSize, String fieldValue) { | ||
super(field, fieldOffsetStrategy, breakIterator, passageScorer, maxPassages, | ||
maxNoHighlightPassages, passageFormatter); | ||
this.breakIteratorLocale = breakIteratorLocale; | ||
this.noMatchSize = noMatchSize; | ||
this.fieldValue = fieldValue; | ||
} | ||
|
||
@Override | ||
protected Passage[] getSummaryPassagesNoHighlight(int maxPassages) { | ||
if (noMatchSize > 0) { | ||
int pos = 0; | ||
while (pos < fieldValue.length() && fieldValue.charAt(pos) == MULTIVAL_SEP_CHAR) { | ||
pos ++; | ||
} | ||
if (pos < fieldValue.length()) { | ||
int end = fieldValue.indexOf(MULTIVAL_SEP_CHAR, pos); | ||
if (end == -1) { | ||
end = fieldValue.length(); | ||
} | ||
if (noMatchSize+pos < end) { | ||
BreakIterator bi = BreakIterator.getWordInstance(breakIteratorLocale); | ||
bi.setText(fieldValue); | ||
// Finds the next word boundary **after** noMatchSize. | ||
end = bi.following(noMatchSize + pos); | ||
if (end == BreakIterator.DONE) { | ||
end = fieldValue.length(); | ||
} | ||
} | ||
Passage passage = new Passage(); | ||
passage.setScore(Float.NaN); | ||
passage.setStartOffset(pos); | ||
passage.setEndOffset(end); | ||
return new Passage[]{passage}; | ||
} | ||
} | ||
return EMPTY_PASSAGE; | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.