Skip to content

Commit

Permalink
Merge branch 'main' into intra_segment_concurrency
Browse files Browse the repository at this point in the history
  • Loading branch information
javanna committed Sep 5, 2024
2 parents ea86d50 + a414a96 commit 51283d2
Show file tree
Hide file tree
Showing 20 changed files with 413 additions and 197 deletions.
4 changes: 4 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,8 @@ API Changes

* GITHUB#13632: CandidateMatcher public matching functions (Bryan Jacobowitz)

* GITHUB#13708: Move Operations.sameLanguage/subsetOf to test-framework. (Robert Muir)


New Features
---------------------
Expand Down Expand Up @@ -174,6 +176,8 @@ Improvements

* GITHUB#12172: Update Romanian stopwords list to include the modern unicode forms. (Trey Jones)

* GITHUB#13707: Improve Operations.isTotal() to work with non-minimal automata. (Dawid Weiss, Robert Muir)

Optimizations
---------------------

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1490,7 +1490,7 @@ public void testRandomSyns() throws Exception {
}

assertTrue(approxEquals(actual, expected));
assertTrue(Operations.sameLanguage(actual, expected));
assertTrue(AutomatonTestUtil.sameLanguage(actual, expected));
}

a.close();
Expand Down
170 changes: 76 additions & 94 deletions lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@
import java.util.BitSet;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
Expand Down Expand Up @@ -182,30 +181,65 @@ public static Automaton repeat(Automaton a) {
// Repeating the empty automata will still only accept the empty automata.
return a;
}

if (a.isAccept(0) && a.getAcceptStates().cardinality() == 1) {
// If state 0 is the only accept state, then this automaton already repeats itself.
return a;
}

Automaton.Builder builder = new Automaton.Builder();
// Create the initial state, which is accepted
builder.createState();
builder.setAccept(0, true);
builder.copy(a);

Transition t = new Transition();

int[] stateMap = new int[a.getNumStates()];
for (int state = 0; state < a.getNumStates(); ++state) {
if (a.isAccept(state) == false) {
stateMap[state] = builder.createState();
} else if (a.getNumTransitions(state) == 0) {
// Accept states that have no transitions get merged into state 0.
stateMap[state] = 0;
} else {
int newState = builder.createState();
stateMap[state] = newState;
builder.setAccept(newState, true);
}
}

// Now copy the automaton while renumbering states.
for (int state = 0; state < a.getNumStates(); ++state) {
int src = stateMap[state];
int count = a.initTransition(state, t);
for (int i = 0; i < count; i++) {
a.getNextTransition(t);
int dest = stateMap[t.dest];
builder.addTransition(src, dest, t.min, t.max);
}
}

// Now copy transitions of the initial state to our new initial state.
int count = a.initTransition(0, t);
for (int i = 0; i < count; i++) {
a.getNextTransition(t);
builder.addTransition(0, t.dest + 1, t.min, t.max);
builder.addTransition(0, stateMap[t.dest], t.min, t.max);
}

int numStates = a.getNumStates();
for (int s = 0; s < numStates; s++) {
if (a.isAccept(s)) {
// Now copy transitions of the initial state to final states to make the automaton repeat
// itself.
for (int s = a.getAcceptStates().nextSetBit(0);
s != -1;
s = a.getAcceptStates().nextSetBit(s + 1)) {
if (stateMap[s] != 0) {
count = a.initTransition(0, t);
for (int i = 0; i < count; i++) {
a.getNextTransition(t);
builder.addTransition(s + 1, t.dest + 1, t.min, t.max);
builder.addTransition(stateMap[s], stateMap[t.dest], t.min, t.max);
}
}
}

return builder.finish();
return removeDeadStates(builder.finish());
}

/**
Expand Down Expand Up @@ -374,17 +408,6 @@ public static Automaton intersection(Automaton a1, Automaton a2) {
return removeDeadStates(c);
}

/**
 * Checks whether two automata accept exactly the same language, i.e. whether each automaton's
 * language is a subset of the other's. This is a costly computation! Both automata must be
 * determinized and have no dead states!
 */
public static boolean sameLanguage(Automaton a1, Automaton a2) {
  // The very same instance trivially accepts the same language as itself.
  boolean sameInstance = a1 == a2;
  // Otherwise the languages are equal exactly when containment holds in both directions.
  return sameInstance || (subsetOf(a2, a1) && subsetOf(a1, a2));
}

// TODO: move to test-framework?
/**
* Returns true if this automaton has any states that cannot be reached from the initial state or
Expand Down Expand Up @@ -417,73 +440,6 @@ public static boolean hasDeadStatesToAccept(Automaton a) {
return reachableFromAccept.isEmpty() == false;
}

/**
* Returns true if the language of <code>a1</code> is a subset of the language of <code>a2</code>.
* Both automata must be determinized and must have no dead states.
*
* <p>The check explores pairs of states (one from each automaton) starting at the pair of
* initial states, and fails as soon as <code>a1</code> accepts where <code>a2</code> does not,
* or <code>a1</code> has a transition label that no transition of <code>a2</code> covers.
*
* <p>Complexity: quadratic in number of states.
*
* @param a1 the automaton whose language is tested for containment
* @param a2 the automaton whose language must contain that of <code>a1</code>
* @return true if every string accepted by <code>a1</code> is also accepted by <code>a2</code>
* @throws IllegalArgumentException if either automaton is not deterministic
*/
public static boolean subsetOf(Automaton a1, Automaton a2) {
if (a1.isDeterministic() == false) {
throw new IllegalArgumentException("a1 must be deterministic");
}
if (a2.isDeterministic() == false) {
throw new IllegalArgumentException("a2 must be deterministic");
}
// The dead-state precondition is only verified when assertions are enabled.
assert hasDeadStatesFromInitial(a1) == false;
assert hasDeadStatesFromInitial(a2) == false;
if (a1.getNumStates() == 0) {
// Empty language is always a subset of any other language
return true;
} else if (a2.getNumStates() == 0) {
// a2 has no states at all, so a1 can only be a subset if a1 is empty as well.
return isEmpty(a1);
}

// TODO: cutover to iterators instead
// NOTE(review): the sweep below relies on each per-state transition array being ordered by
// label range, as the getSortedTransitions name suggests -- confirm against Automaton.
Transition[][] transitions1 = a1.getSortedTransitions();
Transition[][] transitions2 = a2.getSortedTransitions();
// FIFO worklist of state pairs still to examine, plus the set of pairs already enqueued.
ArrayDeque<StatePair> worklist = new ArrayDeque<>();
HashSet<StatePair> visited = new HashSet<>();
// Start from the pair of initial states (state 0 in both automata).
StatePair p = new StatePair(0, 0);
worklist.add(p);
visited.add(p);
while (worklist.size() > 0) {
p = worklist.removeFirst();
// a1 accepts here but a2 does not: some string is in a1's language but not in a2's.
if (a1.isAccept(p.s1) && a2.isAccept(p.s2) == false) {
return false;
}
Transition[] t1 = transitions1[p.s1];
Transition[] t2 = transitions2[p.s2];
// b2 is carried across iterations of n1 so a2's transitions are not rescanned from the start.
for (int n1 = 0, b2 = 0; n1 < t1.length; n1++) {
// Skip a2 transitions that lie entirely below the current a1 transition's range.
while (b2 < t2.length && t2[b2].max < t1[n1].min) {
b2++;
}
// [min1, max1] is the part of the a1 transition's range not yet covered by a2.
int min1 = t1[n1].min, max1 = t1[n1].max;

// Walk every a2 transition that overlaps the a1 transition's range.
for (int n2 = b2; n2 < t2.length && t1[n1].max >= t2[n2].min; n2++) {
// The next a2 transition starts above the first uncovered label: a2 has a gap.
if (t2[n2].min > min1) {
return false;
}
if (t2[n2].max < Character.MAX_CODE_POINT) {
// Everything up to t2[n2].max is now covered.
min1 = t2[n2].max + 1;
} else {
// a2 covers up to the maximum code point; mark the remainder as empty (min1 > max1)
// instead of computing max + 1.
min1 = Character.MAX_CODE_POINT;
max1 = Character.MIN_CODE_POINT;
}
// Both automata can advance on the overlapping labels; examine the destination pair once.
StatePair q = new StatePair(t1[n1].dest, t2[n2].dest);
if (!visited.contains(q)) {
worklist.add(q);
visited.add(q);
}
}
// Part of the a1 transition's range was left uncovered by a2.
if (min1 <= max1) {
return false;
}
}
}
// No reachable state pair produced a counterexample.
return true;
}

/**
* Returns an automaton that accepts the union of the languages of the given automata.
*
Expand Down Expand Up @@ -857,22 +813,48 @@ public static boolean isEmpty(Automaton a) {
return true;
}

/** Returns true if the given automaton accepts all strings. The automaton must be minimized. */
/**
* Returns true if the given automaton accepts all strings.
*
* <p>The automaton must be deterministic, or this method may return false.
*
* <p>Complexity: linear in number of states and transitions.
*/
public static boolean isTotal(Automaton a) {
  // Delegate to the range-based overload, using the full code point range as the alphabet.
  final int lowestLabel = Character.MIN_CODE_POINT;
  final int highestLabel = Character.MAX_CODE_POINT;
  return isTotal(a, lowestLabel, highestLabel);
}

/**
* Returns true if the given automaton accepts all strings for the specified min/max range of the
* alphabet. The automaton must be minimized.
* alphabet.
*
* <p>The automaton must be deterministic, or this method may return false.
*
* <p>Complexity: linear in number of states and transitions.
*/
public static boolean isTotal(Automaton a, int minAlphabet, int maxAlphabet) {
if (a.isAccept(0) && a.getNumTransitions(0) == 1) {
Transition t = new Transition();
a.getTransition(0, 0, t);
return t.dest == 0 && t.min == minAlphabet && t.max == maxAlphabet;
BitSet states = getLiveStates(a);
Transition spare = new Transition();
int seenStates = 0;
for (int state = states.nextSetBit(0); state >= 0; state = states.nextSetBit(state + 1)) {
// all reachable states must be accept states
if (a.isAccept(state) == false) return false;
// all reachable states must contain transitions covering minAlphabet-maxAlphabet
int previousLabel = minAlphabet - 1;
for (int transition = 0; transition < a.getNumTransitions(state); transition++) {
a.getTransition(state, transition, spare);
// no gaps are allowed
if (spare.min > previousLabel + 1) return false;
previousLabel = spare.max;
}
if (previousLabel < maxAlphabet) return false;
if (state == Integer.MAX_VALUE) {
break; // or (state+1) would overflow
}
seenStates++;
}
return false;
// we've checked all the states, automaton is either total or empty
return seenStates > 0;
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,14 @@
* @lucene.experimental
*/
public class StatePair {
// only mike knows what it does (do not expose)
int s;
int s1;
int s2;

/** first state */
public final int s1;

/** second state */
public final int s2;

StatePair(int s, int s1, int s2) {
this.s = s;
Expand Down Expand Up @@ -81,7 +86,7 @@ public boolean equals(Object obj) {
@Override
public int hashCode() {
// Don't use s1 ^ s2 since it's vulnerable to the case where s1 == s2 always --> hashCode = 0,
// e.g. if you call Operations.sameLanguage,
// e.g. if you call AutomatonTestUtil.sameLanguage,
// passing the same automaton against itself:
return s1 * 31 + s2;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -625,7 +625,7 @@ private void assertSameLanguage(Automaton expected, Automaton actual) {
Operations.removeDeadStates(expected), DEFAULT_DETERMINIZE_WORK_LIMIT);
Automaton actualDet =
Operations.determinize(Operations.removeDeadStates(actual), DEFAULT_DETERMINIZE_WORK_LIMIT);
if (Operations.sameLanguage(expectedDet, actualDet) == false) {
if (AutomatonTestUtil.sameLanguage(expectedDet, actualDet) == false) {
Set<String> expectedPaths = toPathStrings(expectedDet);
Set<String> actualPaths = toPathStrings(actualDet);
StringBuilder b = new StringBuilder();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ public void testIntersect() throws Exception {

Automaton actual =
Operations.determinize(Automata.makeStringUnion(found), DEFAULT_DETERMINIZE_WORK_LIMIT);
assertTrue(Operations.sameLanguage(expected, actual));
assertTrue(AutomatonTestUtil.sameLanguage(expected, actual));
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -403,12 +403,8 @@ public void testRandomQueries() throws Exception {
bigSearcher.count(q3.build()));

// test diff (randomized) scorers produce the same results on bigSearcher as well
hits1 =
bigSearcher.search(q1, new TopFieldCollectorManager(sort, 1000 * mulFactor, 1))
.scoreDocs;
hits2 =
bigSearcher.search(q1, new TopFieldCollectorManager(sort, 1000 * mulFactor, 1))
.scoreDocs;
hits1 = bigSearcher.search(q1, new TopFieldCollectorManager(sort, mulFactor, 1)).scoreDocs;
hits2 = bigSearcher.search(q1, new TopFieldCollectorManager(sort, mulFactor, 1)).scoreDocs;
CheckHits.checkEqual(q1, hits1, hits2);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ public void testSameLanguage() throws Exception {
Automaton a2 =
Operations.removeDeadStates(
Operations.concatenate(Automata.makeString("foo"), Automata.makeString("bar")));
assertTrue(Operations.sameLanguage(a1, a2));
assertTrue(AutomatonTestUtil.sameLanguage(a1, a2));
}

public void testCommonPrefixString() throws Exception {
Expand Down Expand Up @@ -257,7 +257,7 @@ public void testMinimizeSimple() throws Exception {
Automaton a = Automata.makeString("foobar");
Automaton aMin = MinimizationOperations.minimize(a, DEFAULT_DETERMINIZE_WORK_LIMIT);

assertTrue(Operations.sameLanguage(a, aMin));
assertTrue(AutomatonTestUtil.sameLanguage(a, aMin));
}

public void testMinimize2() throws Exception {
Expand All @@ -266,7 +266,7 @@ public void testMinimize2() throws Exception {
Arrays.asList(Automata.makeString("foobar"), Automata.makeString("boobar")));
Automaton aMin = MinimizationOperations.minimize(a, DEFAULT_DETERMINIZE_WORK_LIMIT);
assertTrue(
Operations.sameLanguage(
AutomatonTestUtil.sameLanguage(
Operations.determinize(Operations.removeDeadStates(a), DEFAULT_DETERMINIZE_WORK_LIMIT),
aMin));
}
Expand All @@ -276,7 +276,7 @@ public void testReverse() throws Exception {
Automaton ra = Operations.reverse(a);
Automaton a2 = Operations.determinize(Operations.reverse(ra), DEFAULT_DETERMINIZE_WORK_LIMIT);

assertTrue(Operations.sameLanguage(a, a2));
assertTrue(AutomatonTestUtil.sameLanguage(a, a2));
}

public void testOptional() throws Exception {
Expand Down Expand Up @@ -401,7 +401,7 @@ public void testReverseRandom1() throws Exception {
Automaton ra = Operations.reverse(a);
Automaton rra = Operations.reverse(ra);
assertTrue(
Operations.sameLanguage(
AutomatonTestUtil.sameLanguage(
Operations.determinize(Operations.removeDeadStates(a), Integer.MAX_VALUE),
Operations.determinize(Operations.removeDeadStates(rra), Integer.MAX_VALUE)));
}
Expand Down Expand Up @@ -502,7 +502,7 @@ public void testBuilderRandom() throws Exception {
}

assertTrue(
Operations.sameLanguage(
AutomatonTestUtil.sameLanguage(
Operations.determinize(Operations.removeDeadStates(a), Integer.MAX_VALUE),
Operations.determinize(
Operations.removeDeadStates(builder.finish()), Integer.MAX_VALUE)));
Expand Down Expand Up @@ -735,7 +735,8 @@ public void testSameLanguage1() throws Exception {
a2.addTransition(0, state, 'a');
a2.finishState();
assertTrue(
Operations.sameLanguage(Operations.removeDeadStates(a), Operations.removeDeadStates(a2)));
AutomatonTestUtil.sameLanguage(
Operations.removeDeadStates(a), Operations.removeDeadStates(a2)));
}

private Automaton randomNoOp(Automaton a) {
Expand Down Expand Up @@ -1288,7 +1289,7 @@ private void assertSame(Collection<BytesRef> terms, Automaton a) {
Automaton a2 =
Operations.removeDeadStates(Operations.determinize(unionTerms(terms), Integer.MAX_VALUE));
assertTrue(
Operations.sameLanguage(
AutomatonTestUtil.sameLanguage(
a2, Operations.removeDeadStates(Operations.determinize(a, Integer.MAX_VALUE))));

// Do same check, in UTF8 space
Expand Down Expand Up @@ -1613,7 +1614,7 @@ public void testMakeBinaryIntervalOpenBoth() throws Exception {

public void testAcceptAllEmptyStringMin() throws Exception {
Automaton a = Automata.makeBinaryInterval(newBytesRef(), true, null, true);
assertTrue(Operations.sameLanguage(Automata.makeAnyBinary(), a));
assertTrue(AutomatonTestUtil.sameLanguage(Automata.makeAnyBinary(), a));
}

private static IntsRef toIntsRef(String s) {
Expand Down
Loading

0 comments on commit 51283d2

Please sign in to comment.