Skip to content

Commit

Permalink
Merge branch 'master' into translog-generation
Browse files Browse the repository at this point in the history
* master:
  Eclipse: move print margin to 100 columns
  Add support for fragment_length in the unified highlighter (#23431)
  • Loading branch information
jasontedor committed Mar 17, 2017
2 parents 8f6b609 + f30f182 commit 70ade35
Show file tree
Hide file tree
Showing 11 changed files with 762 additions and 338 deletions.
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
eclipse.preferences.version=1

# previous configuration from maven build
# this is merged with gradle's generated properties during 'gradle eclipse'

# NOTE: null pointer analysis etc is not enabled currently, it seems very unstable
Expand All @@ -17,6 +16,6 @@ eclipse.preferences.version=1
# org.eclipse.jdt.core.compiler.problem.potentialNullReference=warning

org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning
org.eclipse.jdt.core.formatter.lineSplit=140
org.eclipse.jdt.core.formatter.lineSplit=100
org.eclipse.jdt.core.formatter.tabulation.char=space
org.eclipse.jdt.core.formatter.tabulation.size=4
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.lucene.search.uhighlight;

import java.text.BreakIterator;
import java.text.CharacterIterator;
import java.util.Locale;

/**
 * A {@link BreakIterator} that yields break-delimited passages whose length is bounded by a
 * configured maximum.
 * <p>
 * Boundaries are first looked up with a primary (first level) break iterator. Whenever the
 * passage found that way is longer than the maximum length, a secondary break iterator is used
 * to re-split it at the first boundary after the maximum length. This helps with iterators such
 * as the sentence one, which can produce very large outliers on semi-structured text.
 * <p>
 * WARNING: this break iterator is designed to work with the {@link UnifiedHighlighter}. Only
 * {@link #preceding(int)} followed by {@link #following(int)} is supported; the other
 * navigation methods throw {@link IllegalStateException}.
 */
public class BoundedBreakIteratorScanner extends BreakIterator {
    private final BreakIterator primaryBreak;
    private final BreakIterator secondaryBreak;
    private final int maxLength;

    // Scanner state; every field is (re)initialised by reset().
    private int lastPrecedingOffset;
    private int windowStart;
    private int windowEnd;
    private int passageStart;
    private int passageEnd;

    private BoundedBreakIteratorScanner(BreakIterator mainBreak,
                                        BreakIterator innerBreak,
                                        int maxLen) {
        this.primaryBreak = mainBreak;
        this.secondaryBreak = innerBreak;
        this.maxLength = maxLen;
        reset();
    }

    /**
     * Returns a {@link BreakIterator#getSentenceInstance(Locale)} bounded to maxLen.
     * Secondary boundaries are found using a {@link BreakIterator#getWordInstance(Locale)}.
     */
    public static BreakIterator getSentence(Locale locale, int maxLen) {
        final BreakIterator sentences = BreakIterator.getSentenceInstance(locale);
        final BreakIterator words = BreakIterator.getWordInstance(locale);
        return new BoundedBreakIteratorScanner(sentences, words, maxLen);
    }

    /** Puts the scanner back in its initial state: no window and no passage computed yet. */
    private void reset() {
        lastPrecedingOffset = -1;
        windowStart = -1;
        windowEnd = -1;
        passageStart = -1;
        passageEnd = 0;
    }

    /** Returns the text held by the primary break iterator. */
    @Override
    public CharacterIterator getText() {
        return primaryBreak.getText();
    }

    /** Clears the scanner state and hands the new text to both underlying iterators. */
    @Override
    public void setText(CharacterIterator newText) {
        reset();
        primaryBreak.setText(newText);
        secondaryBreak.setText(newText);
    }

    /** Clears the scanner state and hands the new text to both underlying iterators. */
    @Override
    public void setText(String newText) {
        reset();
        primaryBreak.setText(newText);
        secondaryBreak.setText(newText);
    }

    /**
     * Computes the passage around {@code offset} and returns its start.
     * Must be called with increasing offset. See {@link FieldHighlighter} for usage.
     *
     * @throws IllegalArgumentException if {@code offset} is smaller than the offset recorded by
     *                                  the previous call
     */
    @Override
    public int preceding(int offset) {
        if (offset < lastPrecedingOffset) {
            throw new IllegalArgumentException(
                "offset < lastPrecedingOffset: usage doesn't look like UnifiedHighlighter");
        }

        final boolean withinWindow = windowStart < offset && offset < windowEnd;
        if (withinWindow) {
            // Same primary window as before: resume right after the previous passage.
            passageStart = passageEnd;
            passageEnd = windowEnd;
        } else {
            // New primary window: the passage initially spans the whole window.
            windowStart = primaryBreak.preceding(offset);
            passageStart = windowStart;
            windowEnd = primaryBreak.following(offset - 1);
            passageEnd = windowEnd;
        }

        if (passageEnd - passageStart > maxLength) {
            boundPassage(offset);
        }

        // following() is expected to be invoked with offset - 1 right after this call.
        lastPrecedingOffset = offset - 1;
        return passageStart;
    }

    /**
     * Narrows the current passage, which is larger than the maximum length, around
     * {@code offset} using the secondary break iterator.
     */
    private void boundPassage(int offset) {
        // Starting from the current term, look for a boundary on the left first ...
        final int leftProbe = offset - maxLength;
        if (leftProbe > passageStart) {
            passageStart = Math.max(passageStart, secondaryBreak.preceding(leftProbe));
        }
        // ... then expand to the right with whatever size remains.
        final int remaining = Math.max(0, maxLength - (offset - passageStart));
        final int rightProbe = offset + remaining;
        if (rightProbe < windowEnd) {
            passageEnd = Math.min(windowEnd, secondaryBreak.following(rightProbe));
        }
    }

    /**
     * Returns the end of the passage computed by the last call to {@link #preceding(int)}.
     * Can be invoked only after a call to preceding(offset+1).
     * See {@link FieldHighlighter} for usage.
     *
     * @throws IllegalArgumentException if {@code offset} does not match the offset recorded by
     *                                  the last {@code preceding} call, or the passage end is -1
     */
    @Override
    public int following(int offset) {
        if (passageEnd != -1 && offset == lastPrecedingOffset) {
            return passageEnd;
        }
        throw new IllegalArgumentException(
            "offset != lastPrecedingOffset: usage doesn't look like UnifiedHighlighter");
    }

    /** Returns the last offset of the current split. */
    @Override
    public int current() {
        return passageEnd;
    }

    // The remaining navigation methods are not supported by this scanner.

    @Override
    public int first() {
        throw new IllegalStateException("first() should not be called in this context");
    }

    @Override
    public int last() {
        throw new IllegalStateException("last() should not be called in this context");
    }

    @Override
    public int next() {
        throw new IllegalStateException("next() should not be called in this context");
    }

    @Override
    public int next(int n) {
        throw new IllegalStateException("next(n) should not be called in this context");
    }

    @Override
    public int previous() {
        throw new IllegalStateException("previous() should not be called in this context");
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.lucene.search.uhighlight;

import java.text.BreakIterator;
import java.util.Locale;

import static org.apache.lucene.search.uhighlight.CustomUnifiedHighlighter.MULTIVAL_SEP_CHAR;

/**
 * A {@link FieldHighlighter} that, when no highlights were found, produces a single passage
 * taken from the beginning of the field value and bounded to {@code noMatchSize}.
 */
class CustomFieldHighlighter extends FieldHighlighter {
    private static final Passage[] EMPTY_PASSAGE = new Passage[0];

    private final Locale breakIteratorLocale;
    private final int noMatchSize;
    private final String fieldValue;

    CustomFieldHighlighter(String field, FieldOffsetStrategy fieldOffsetStrategy,
                           Locale breakIteratorLocale, BreakIterator breakIterator,
                           PassageScorer passageScorer, int maxPassages, int maxNoHighlightPassages,
                           PassageFormatter passageFormatter, int noMatchSize, String fieldValue) {
        super(field, fieldOffsetStrategy, breakIterator, passageScorer, maxPassages,
            maxNoHighlightPassages, passageFormatter);
        this.breakIteratorLocale = breakIteratorLocale;
        this.noMatchSize = noMatchSize;
        this.fieldValue = fieldValue;
    }

    /**
     * Builds the passage returned when nothing was highlighted.
     * <p>
     * The passage starts at the first character that is not {@code MULTIVAL_SEP_CHAR} and stops
     * at the next separator (or the end of the field value). When that span is longer than
     * {@code noMatchSize}, it stops at the first word boundary found after {@code noMatchSize}
     * characters instead.
     *
     * @param maxPassages not used by this implementation
     * @return a single passage with a {@code NaN} score, or an empty array when
     *         {@code noMatchSize} is not positive or the field value only contains separators
     */
    @Override
    protected Passage[] getSummaryPassagesNoHighlight(int maxPassages) {
        if (noMatchSize <= 0) {
            return EMPTY_PASSAGE;
        }
        final int start = skipLeadingSeparators();
        if (start >= fieldValue.length()) {
            // Nothing but separators (or an empty value): no passage to return.
            return EMPTY_PASSAGE;
        }
        final int end = findPassageEnd(start);

        final Passage summary = new Passage();
        summary.setScore(Float.NaN);
        summary.setStartOffset(start);
        summary.setEndOffset(end);
        return new Passage[]{summary};
    }

    /** Returns the index of the first character that is not a multi-value separator. */
    private int skipLeadingSeparators() {
        final int length = fieldValue.length();
        int index = 0;
        for (; index < length; index++) {
            if (fieldValue.charAt(index) != MULTIVAL_SEP_CHAR) {
                break;
            }
        }
        return index;
    }

    /** Returns the end offset of the no-match passage that begins at {@code start}. */
    private int findPassageEnd(int start) {
        final int separator = fieldValue.indexOf(MULTIVAL_SEP_CHAR, start);
        final int valueEnd = separator == -1 ? fieldValue.length() : separator;
        if (noMatchSize + start >= valueEnd) {
            // The value already fits in noMatchSize.
            return valueEnd;
        }
        final BreakIterator words = BreakIterator.getWordInstance(breakIteratorLocale);
        words.setText(fieldValue);
        // Finds the next word boundary **after** noMatchSize.
        final int boundary = words.following(noMatchSize + start);
        return boundary == BreakIterator.DONE ? fieldValue.length() : boundary;
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@
import org.apache.lucene.search.spans.SpanOrQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.lucene.all.AllTermQuery;
import org.elasticsearch.common.lucene.search.MultiPhrasePrefixQuery;
Expand All @@ -47,6 +49,7 @@
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

/**
* Subclass of the {@link UnifiedHighlighter} that works for a single field in a single document.
Expand All @@ -57,37 +60,41 @@
* Supports both returning empty snippets and non highlighted snippets when no highlighting can be performed.
*/
public class CustomUnifiedHighlighter extends UnifiedHighlighter {
public static final char MULTIVAL_SEP_CHAR = (char) 0;
private static final Snippet[] EMPTY_SNIPPET = new Snippet[0];

private final String fieldValue;
private final PassageFormatter passageFormatter;
private final BreakIterator breakIterator;
private final boolean returnNonHighlightedSnippets;
private final Locale breakIteratorLocale;
private final int noMatchSize;

/**
* Creates a new instance of {@link CustomUnifiedHighlighter}
*
* @param analyzer the analyzer used for the field at index time, used for multi term queries internally
* @param passageFormatter our own {@link CustomPassageFormatter}
* which generates snippets in forms of {@link Snippet} objects
* which generates snippets in forms of {@link Snippet} objects
* @param breakIteratorLocale the {@link Locale} to use for dividing text into passages.
* If null {@link Locale#ROOT} is used
* @param breakIterator the {@link BreakIterator} to use for dividing text into passages.
* If null {@link BreakIterator#getSentenceInstance(Locale)} is used.
* @param fieldValue the original field values as constructor argument, loaded from the _source field or
* the relevant stored field.
* @param returnNonHighlightedSnippets whether non highlighted snippets should be
* returned rather than empty snippets when no highlighting can be performed
* If null {@link BreakIterator#getSentenceInstance(Locale)} is used.
* @param fieldValue the original field values delimited by MULTIVAL_SEP_CHAR
* @param noMatchSize The size of the text that should be returned when no highlighting can be performed
*/
public CustomUnifiedHighlighter(IndexSearcher searcher,
Analyzer analyzer,
PassageFormatter passageFormatter,
@Nullable Locale breakIteratorLocale,
@Nullable BreakIterator breakIterator,
String fieldValue,
boolean returnNonHighlightedSnippets) {
int noMatchSize) {
super(searcher, analyzer);
this.breakIterator = breakIterator;
this.breakIteratorLocale = breakIteratorLocale == null ? Locale.ROOT : breakIteratorLocale;
this.passageFormatter = passageFormatter;
this.fieldValue = fieldValue;
this.returnNonHighlightedSnippets = returnNonHighlightedSnippets;
this.noMatchSize = noMatchSize;
}

/**
Expand All @@ -111,16 +118,13 @@ public Snippet[] highlightField(String field, Query query, int docId, int maxPas
@Override
protected List<CharSequence[]> loadFieldValues(String[] fields, DocIdSetIterator docIter,
int cacheCharsThreshold) throws IOException {
//we only highlight one field, one document at a time
// we only highlight one field, one document at a time
return Collections.singletonList(new String[]{fieldValue});
}

@Override
protected BreakIterator getBreakIterator(String field) {
if (breakIterator != null) {
return breakIterator;
}
return super.getBreakIterator(field);
return breakIterator;
}

@Override
Expand All @@ -129,11 +133,18 @@ protected PassageFormatter getFormatter(String field) {
}

@Override
protected int getMaxNoHighlightPassages(String field) {
if (returnNonHighlightedSnippets) {
return 1;
}
return 0;
protected FieldHighlighter getFieldHighlighter(String field, Query query, Set<Term> allTerms, int maxPassages) {
BytesRef[] terms = filterExtractedTerms(getFieldMatcher(field), allTerms);
Set<HighlightFlag> highlightFlags = getFlags(field);
PhraseHelper phraseHelper = getPhraseHelper(field, query, highlightFlags);
CharacterRunAutomaton[] automata = getAutomata(field, query, highlightFlags);
OffsetSource offsetSource = getOptimizedOffsetSource(field, terms, phraseHelper, automata);
BreakIterator breakIterator = new SplittingBreakIterator(getBreakIterator(field),
UnifiedHighlighter.MULTIVAL_SEP_CHAR);
FieldOffsetStrategy strategy =
getOffsetStrategy(offsetSource, field, terms, phraseHelper, automata, highlightFlags);
return new CustomFieldHighlighter(field, strategy, breakIteratorLocale, breakIterator,
getScorer(field), maxPassages, (noMatchSize > 0 ? 1 : 0), getFormatter(field), noMatchSize, fieldValue);
}

@Override
Expand All @@ -146,7 +157,6 @@ protected Collection<Query> preSpanQueryRewrite(Query query) {
return rewriteCustomQuery(query);
}


/**
* Translate custom queries in queries that are supported by the unified highlighter.
*/
Expand Down
Loading

0 comments on commit 70ade35

Please sign in to comment.