From 87bc8270ffa2fc7d1438418e965b19bcb863a52a Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Thu, 5 Sep 2024 06:59:55 -0400 Subject: [PATCH] move Operations.sameLanguage/subsetOf to AutomatonTestUtil in test-framework (#13708) This code is suitable for tests only and may throw unexpected Exceptions or AssertionErrors for some input. --- lucene/CHANGES.txt | 2 + .../synonym/TestSynonymGraphFilter.java | 2 +- .../lucene/util/automaton/Operations.java | 79 ------------------ .../lucene/util/automaton/StatePair.java | 11 ++- .../lucene/analysis/TestGraphTokenizers.java | 2 +- .../apache/lucene/index/TestTermsEnum2.java | 2 +- .../lucene/util/automaton/TestAutomaton.java | 19 ++--- .../util/automaton/TestDeterminism.java | 10 +-- .../automaton/TestLevenshteinAutomata.java | 20 ++--- .../lucene/util/automaton/TestMinimize.java | 4 +- .../lucene/util/automaton/TestOperations.java | 8 +- .../util/automaton/TestRegExpParsing.java | 3 +- .../automaton/TestStringsToAutomaton.java | 3 +- .../util/automaton/AutomatonTestUtil.java | 80 +++++++++++++++++++ 14 files changed, 129 insertions(+), 116 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 80ca30c4fc74..b41144b2d9c8 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -112,6 +112,8 @@ API Changes * GITHUB#13632: CandidateMatcher public matching functions (Bryan Jacobowitz) +* GITHUB#13708: Move Operations.sameLanguage/subsetOf to test-framework. 
(Robert Muir) + New Features --------------------- diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymGraphFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymGraphFilter.java index 1eb80ea5081e..974edc8e9f6f 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymGraphFilter.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymGraphFilter.java @@ -1490,7 +1490,7 @@ public void testRandomSyns() throws Exception { } assertTrue(approxEquals(actual, expected)); - assertTrue(Operations.sameLanguage(actual, expected)); + assertTrue(AutomatonTestUtil.sameLanguage(actual, expected)); } a.close(); diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java b/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java index 2052b1c50bf5..8fd43dbe1ff1 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java @@ -35,7 +35,6 @@ import java.util.BitSet; import java.util.Collection; import java.util.HashMap; -import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; @@ -374,17 +373,6 @@ public static Automaton intersection(Automaton a1, Automaton a2) { return removeDeadStates(c); } - /** - * Returns true if these two automata accept exactly the same language. This is a costly - * computation! Both automata must be determinized and have no dead states! - */ - public static boolean sameLanguage(Automaton a1, Automaton a2) { - if (a1 == a2) { - return true; - } - return subsetOf(a2, a1) && subsetOf(a1, a2); - } - // TODO: move to test-framework? 
/** * Returns true if this automaton has any states that cannot be reached from the initial state or @@ -417,73 +405,6 @@ public static boolean hasDeadStatesToAccept(Automaton a) { return reachableFromAccept.isEmpty() == false; } - /** - * Returns true if the language of a1 is a subset of the language of a2. - * Both automata must be determinized and must have no dead states. - * - *

<p>Complexity: quadratic in number of states. - */ - public static boolean subsetOf(Automaton a1, Automaton a2) { - if (a1.isDeterministic() == false) { - throw new IllegalArgumentException("a1 must be deterministic"); - } - if (a2.isDeterministic() == false) { - throw new IllegalArgumentException("a2 must be deterministic"); - } - assert hasDeadStatesFromInitial(a1) == false; - assert hasDeadStatesFromInitial(a2) == false; - if (a1.getNumStates() == 0) { - // Empty language is alwyas a subset of any other language - return true; - } else if (a2.getNumStates() == 0) { - return isEmpty(a1); - } - - // TODO: cutover to iterators instead - Transition[][] transitions1 = a1.getSortedTransitions(); - Transition[][] transitions2 = a2.getSortedTransitions(); - ArrayDeque<StatePair> worklist = new ArrayDeque<>(); - HashSet<StatePair> visited = new HashSet<>(); - StatePair p = new StatePair(0, 0); - worklist.add(p); - visited.add(p); - while (worklist.size() > 0) { - p = worklist.removeFirst(); - if (a1.isAccept(p.s1) && a2.isAccept(p.s2) == false) { - return false; - } - Transition[] t1 = transitions1[p.s1]; - Transition[] t2 = transitions2[p.s2]; - for (int n1 = 0, b2 = 0; n1 < t1.length; n1++) { - while (b2 < t2.length && t2[b2].max < t1[n1].min) { - b2++; - } - int min1 = t1[n1].min, max1 = t1[n1].max; - - for (int n2 = b2; n2 < t2.length && t1[n1].max >= t2[n2].min; n2++) { - if (t2[n2].min > min1) { - return false; - } - if (t2[n2].max < Character.MAX_CODE_POINT) { - min1 = t2[n2].max + 1; - } else { - min1 = Character.MAX_CODE_POINT; - max1 = Character.MIN_CODE_POINT; - } - StatePair q = new StatePair(t1[n1].dest, t2[n2].dest); - if (!visited.contains(q)) { - worklist.add(q); - visited.add(q); - } - } - if (min1 <= max1) { - return false; - } - } - } - return true; - } - /** * Returns an automaton that accepts the union of the languages of the given automata. 
* diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/StatePair.java b/lucene/core/src/java/org/apache/lucene/util/automaton/StatePair.java index ad1be724f9cd..bd003507f2de 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/StatePair.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/StatePair.java @@ -35,9 +35,14 @@ * @lucene.experimental */ public class StatePair { + // only mike knows what it does (do not expose) int s; - int s1; - int s2; + + /** first state */ + public final int s1; + + /** second state */ + public final int s2; StatePair(int s, int s1, int s2) { this.s = s; @@ -81,7 +86,7 @@ public boolean equals(Object obj) { @Override public int hashCode() { // Don't use s1 ^ s2 since it's vulnerable to the case where s1 == s2 always --> hashCode = 0, - // e.g. if you call Operations.sameLanguage, + // e.g. if you call AutomatonTestUtil.sameLanguage, // passing the same automaton against itself: return s1 * 31 + s2; } diff --git a/lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java b/lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java index 6e51a6bbb0c3..4737694702f1 100644 --- a/lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java +++ b/lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java @@ -625,7 +625,7 @@ private void assertSameLanguage(Automaton expected, Automaton actual) { Operations.removeDeadStates(expected), DEFAULT_DETERMINIZE_WORK_LIMIT); Automaton actualDet = Operations.determinize(Operations.removeDeadStates(actual), DEFAULT_DETERMINIZE_WORK_LIMIT); - if (Operations.sameLanguage(expectedDet, actualDet) == false) { + if (AutomatonTestUtil.sameLanguage(expectedDet, actualDet) == false) { Set expectedPaths = toPathStrings(expectedDet); Set actualPaths = toPathStrings(actualDet); StringBuilder b = new StringBuilder(); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum2.java 
b/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum2.java index ddca01a2387e..105579b25d60 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum2.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum2.java @@ -183,7 +183,7 @@ public void testIntersect() throws Exception { Automaton actual = Operations.determinize(Automata.makeStringUnion(found), DEFAULT_DETERMINIZE_WORK_LIMIT); - assertTrue(Operations.sameLanguage(expected, actual)); + assertTrue(AutomatonTestUtil.sameLanguage(expected, actual)); } } } diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestAutomaton.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestAutomaton.java index e4dd739ef78d..3c7d6eea198a 100644 --- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestAutomaton.java +++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestAutomaton.java @@ -87,7 +87,7 @@ public void testSameLanguage() throws Exception { Automaton a2 = Operations.removeDeadStates( Operations.concatenate(Automata.makeString("foo"), Automata.makeString("bar"))); - assertTrue(Operations.sameLanguage(a1, a2)); + assertTrue(AutomatonTestUtil.sameLanguage(a1, a2)); } public void testCommonPrefixString() throws Exception { @@ -257,7 +257,7 @@ public void testMinimizeSimple() throws Exception { Automaton a = Automata.makeString("foobar"); Automaton aMin = MinimizationOperations.minimize(a, DEFAULT_DETERMINIZE_WORK_LIMIT); - assertTrue(Operations.sameLanguage(a, aMin)); + assertTrue(AutomatonTestUtil.sameLanguage(a, aMin)); } public void testMinimize2() throws Exception { @@ -266,7 +266,7 @@ public void testMinimize2() throws Exception { Arrays.asList(Automata.makeString("foobar"), Automata.makeString("boobar"))); Automaton aMin = MinimizationOperations.minimize(a, DEFAULT_DETERMINIZE_WORK_LIMIT); assertTrue( - Operations.sameLanguage( + AutomatonTestUtil.sameLanguage( Operations.determinize(Operations.removeDeadStates(a), 
DEFAULT_DETERMINIZE_WORK_LIMIT), aMin)); } @@ -276,7 +276,7 @@ public void testReverse() throws Exception { Automaton ra = Operations.reverse(a); Automaton a2 = Operations.determinize(Operations.reverse(ra), DEFAULT_DETERMINIZE_WORK_LIMIT); - assertTrue(Operations.sameLanguage(a, a2)); + assertTrue(AutomatonTestUtil.sameLanguage(a, a2)); } public void testOptional() throws Exception { @@ -401,7 +401,7 @@ public void testReverseRandom1() throws Exception { Automaton ra = Operations.reverse(a); Automaton rra = Operations.reverse(ra); assertTrue( - Operations.sameLanguage( + AutomatonTestUtil.sameLanguage( Operations.determinize(Operations.removeDeadStates(a), Integer.MAX_VALUE), Operations.determinize(Operations.removeDeadStates(rra), Integer.MAX_VALUE))); } @@ -502,7 +502,7 @@ public void testBuilderRandom() throws Exception { } assertTrue( - Operations.sameLanguage( + AutomatonTestUtil.sameLanguage( Operations.determinize(Operations.removeDeadStates(a), Integer.MAX_VALUE), Operations.determinize( Operations.removeDeadStates(builder.finish()), Integer.MAX_VALUE))); @@ -735,7 +735,8 @@ public void testSameLanguage1() throws Exception { a2.addTransition(0, state, 'a'); a2.finishState(); assertTrue( - Operations.sameLanguage(Operations.removeDeadStates(a), Operations.removeDeadStates(a2))); + AutomatonTestUtil.sameLanguage( + Operations.removeDeadStates(a), Operations.removeDeadStates(a2))); } private Automaton randomNoOp(Automaton a) { @@ -1288,7 +1289,7 @@ private void assertSame(Collection terms, Automaton a) { Automaton a2 = Operations.removeDeadStates(Operations.determinize(unionTerms(terms), Integer.MAX_VALUE)); assertTrue( - Operations.sameLanguage( + AutomatonTestUtil.sameLanguage( a2, Operations.removeDeadStates(Operations.determinize(a, Integer.MAX_VALUE)))); // Do same check, in UTF8 space @@ -1613,7 +1614,7 @@ public void testMakeBinaryIntervalOpenBoth() throws Exception { public void testAcceptAllEmptyStringMin() throws Exception { Automaton a = 
Automata.makeBinaryInterval(newBytesRef(), true, null, true); - assertTrue(Operations.sameLanguage(Automata.makeAnyBinary(), a)); + assertTrue(AutomatonTestUtil.sameLanguage(Automata.makeAnyBinary(), a)); } private static IntsRef toIntsRef(String s) { diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestDeterminism.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestDeterminism.java index 65616fa55b99..e69568d38739 100644 --- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestDeterminism.java +++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestDeterminism.java @@ -41,7 +41,7 @@ public void testAgainstSimple() throws Exception { a = AutomatonTestUtil.determinizeSimple(a); Automaton b = Operations.determinize(a, Integer.MAX_VALUE); // TODO: more verifications possible? - assertTrue(Operations.sameLanguage(a, b)); + assertTrue(AutomatonTestUtil.sameLanguage(a, b)); } } @@ -53,20 +53,20 @@ private static void assertAutomaton(Automaton a) { Operations.complement( Operations.complement(a, DEFAULT_DETERMINIZE_WORK_LIMIT), DEFAULT_DETERMINIZE_WORK_LIMIT); - assertTrue(Operations.sameLanguage(a, equivalent)); + assertTrue(AutomatonTestUtil.sameLanguage(a, equivalent)); // a union a = a equivalent = Operations.determinize( Operations.removeDeadStates(Operations.union(a, a)), DEFAULT_DETERMINIZE_WORK_LIMIT); - assertTrue(Operations.sameLanguage(a, equivalent)); + assertTrue(AutomatonTestUtil.sameLanguage(a, equivalent)); // a intersect a = a equivalent = Operations.determinize( Operations.removeDeadStates(Operations.intersection(a, a)), DEFAULT_DETERMINIZE_WORK_LIMIT); - assertTrue(Operations.sameLanguage(a, equivalent)); + assertTrue(AutomatonTestUtil.sameLanguage(a, equivalent)); // a minus a = empty Automaton empty = Operations.minus(a, a, DEFAULT_DETERMINIZE_WORK_LIMIT); @@ -81,7 +81,7 @@ private static void assertAutomaton(Automaton a) { equivalent = Operations.minus(optional, Automata.makeEmptyString(), 
DEFAULT_DETERMINIZE_WORK_LIMIT); // System.out.println("equiv " + equivalent); - assertTrue(Operations.sameLanguage(a, equivalent)); + assertTrue(AutomatonTestUtil.sameLanguage(a, equivalent)); } } } diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestLevenshteinAutomata.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestLevenshteinAutomata.java index c8adb8751b90..bc6d268c15e9 100644 --- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestLevenshteinAutomata.java +++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestLevenshteinAutomata.java @@ -81,44 +81,46 @@ private void assertLev(String s, int maxDistance) { // check that the dfa for n-1 accepts a subset of the dfa for n if (n > 0) { assertTrue( - Operations.subsetOf( + AutomatonTestUtil.subsetOf( Operations.removeDeadStates(automata[n - 1]), Operations.removeDeadStates(automata[n]))); assertTrue( - Operations.subsetOf( + AutomatonTestUtil.subsetOf( Operations.removeDeadStates(automata[n - 1]), Operations.removeDeadStates(tautomata[n]))); assertTrue( - Operations.subsetOf( + AutomatonTestUtil.subsetOf( Operations.removeDeadStates(tautomata[n - 1]), Operations.removeDeadStates(automata[n]))); assertTrue( - Operations.subsetOf( + AutomatonTestUtil.subsetOf( Operations.removeDeadStates(tautomata[n - 1]), Operations.removeDeadStates(tautomata[n]))); assertNotSame(automata[n - 1], automata[n]); } // check that Lev(N) is a subset of LevT(N) assertTrue( - Operations.subsetOf( + AutomatonTestUtil.subsetOf( Operations.removeDeadStates(automata[n]), Operations.removeDeadStates(tautomata[n]))); // special checks for specific n switch (n) { case 0: // easy, matches the string itself assertTrue( - Operations.sameLanguage( + AutomatonTestUtil.sameLanguage( Automata.makeString(s), Operations.removeDeadStates(automata[0]))); assertTrue( - Operations.sameLanguage( + AutomatonTestUtil.sameLanguage( Automata.makeString(s), Operations.removeDeadStates(tautomata[0]))); break; case 
1: // generate a lev1 naively, and check the accepted lang is the same. assertTrue( - Operations.sameLanguage(naiveLev1(s), Operations.removeDeadStates(automata[1]))); + AutomatonTestUtil.sameLanguage( + naiveLev1(s), Operations.removeDeadStates(automata[1]))); assertTrue( - Operations.sameLanguage(naiveLev1T(s), Operations.removeDeadStates(tautomata[1]))); + AutomatonTestUtil.sameLanguage( + naiveLev1T(s), Operations.removeDeadStates(tautomata[1]))); break; default: assertBruteForce(s, automata[n], n); diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestMinimize.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestMinimize.java index a43cc8ae8b13..92be1e4d5697 100644 --- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestMinimize.java +++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestMinimize.java @@ -28,7 +28,7 @@ public void testBasic() { Automaton a = AutomatonTestUtil.randomAutomaton(random()); Automaton la = Operations.determinize(Operations.removeDeadStates(a), Integer.MAX_VALUE); Automaton lb = MinimizationOperations.minimize(a, Integer.MAX_VALUE); - assertTrue(Operations.sameLanguage(la, lb)); + assertTrue(AutomatonTestUtil.sameLanguage(la, lb)); } } @@ -42,7 +42,7 @@ public void testAgainstBrzozowski() { Automaton a = AutomatonTestUtil.randomAutomaton(random()); a = AutomatonTestUtil.minimizeSimple(a); Automaton b = MinimizationOperations.minimize(a, Integer.MAX_VALUE); - assertTrue(Operations.sameLanguage(a, b)); + assertTrue(AutomatonTestUtil.sameLanguage(a, b)); assertEquals(a.getNumStates(), b.getNumStates()); int numStates = a.getNumStates(); diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestOperations.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestOperations.java index ec38eafe0ced..c6ccf403fc8a 100644 --- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestOperations.java +++ 
b/lucene/core/src/test/org/apache/lucene/util/automaton/TestOperations.java @@ -50,7 +50,7 @@ public void testStringUnion() { assertTrue(naiveUnion.isDeterministic()); assertFalse(Operations.hasDeadStatesFromInitial(naiveUnion)); - assertTrue(Operations.sameLanguage(union, naiveUnion)); + assertTrue(AutomatonTestUtil.sameLanguage(union, naiveUnion)); } private static Automaton naiveUnion(List strings) { @@ -116,13 +116,13 @@ public void testEmptySingletonNFAConcatenate() { Automaton concat2 = Operations.concatenate(singleton, nfa); assertFalse(concat2.isDeterministic()); assertTrue( - Operations.sameLanguage( + AutomatonTestUtil.sameLanguage( Operations.determinize(concat1, 100), Operations.determinize(concat2, 100))); assertTrue( - Operations.sameLanguage( + AutomatonTestUtil.sameLanguage( Operations.determinize(nfa, 100), Operations.determinize(concat1, 100))); assertTrue( - Operations.sameLanguage( + AutomatonTestUtil.sameLanguage( Operations.determinize(nfa, 100), Operations.determinize(concat2, 100))); } diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExpParsing.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExpParsing.java index 7d0f062f36bc..74fb08cb7188 100644 --- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExpParsing.java +++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExpParsing.java @@ -20,6 +20,7 @@ import java.util.Map; import java.util.Set; import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.tests.util.automaton.AutomatonTestUtil; /** * Simple unit tests for RegExp parsing. 
@@ -698,7 +699,7 @@ public void testIllegalMatchFlags() { private void assertSameLanguage(Automaton expected, Automaton actual) { expected = Operations.determinize(expected, Operations.DEFAULT_DETERMINIZE_WORK_LIMIT); actual = Operations.determinize(actual, Operations.DEFAULT_DETERMINIZE_WORK_LIMIT); - boolean result = Operations.sameLanguage(expected, actual); + boolean result = AutomatonTestUtil.sameLanguage(expected, actual); if (result == false) { System.out.println(expected.toDot()); System.out.println(actual.toDot()); diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java index 0e5a3f9fc30d..efaa451258bb 100644 --- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java +++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java @@ -28,6 +28,7 @@ import java.util.Set; import org.apache.lucene.tests.util.LuceneTestCase; import org.apache.lucene.tests.util.TestUtil; +import org.apache.lucene.tests.util.automaton.AutomatonTestUtil; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefBuilder; @@ -158,7 +159,7 @@ private void checkMinimized(Automaton a) { private static void assertSameAutomaton(Automaton a, Automaton b) { assertEquals(a.getNumStates(), b.getNumStates()); assertEquals(a.getNumTransitions(), b.getNumTransitions()); - assertTrue(Operations.sameLanguage(a, b)); + assertTrue(AutomatonTestUtil.sameLanguage(a, b)); } private List basicTerms() { diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/util/automaton/AutomatonTestUtil.java b/lucene/test-framework/src/java/org/apache/lucene/tests/util/automaton/AutomatonTestUtil.java index 38819479cfc6..df2731f3cb43 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/util/automaton/AutomatonTestUtil.java +++ 
b/lucene/test-framework/src/java/org/apache/lucene/tests/util/automaton/AutomatonTestUtil.java @@ -16,6 +16,7 @@ */ package org.apache.lucene.tests.util.automaton; +import java.util.ArrayDeque; import java.util.ArrayList; import java.util.BitSet; import java.util.HashMap; @@ -33,6 +34,7 @@ import org.apache.lucene.util.automaton.Automaton; import org.apache.lucene.util.automaton.Operations; import org.apache.lucene.util.automaton.RegExp; +import org.apache.lucene.util.automaton.StatePair; import org.apache.lucene.util.automaton.TooComplexToDeterminizeException; import org.apache.lucene.util.automaton.Transition; @@ -533,4 +535,82 @@ public static boolean isDeterministicSlow(Automaton a) { assert a.isDeterministic() == true; return true; } + + /** + * Returns true if these two automata accept exactly the same language. This is a costly + * computation! Both automata must be determinized and have no dead states! + */ + public static boolean sameLanguage(Automaton a1, Automaton a2) { + if (a1 == a2) { + return true; + } + return subsetOf(a2, a1) && subsetOf(a1, a2); + } + + /** + * Returns true if the language of a1 is a subset of the language of a2. + * Both automata must be determinized and must have no dead states. + * + *

<p>Complexity: quadratic in number of states. + */ + public static boolean subsetOf(Automaton a1, Automaton a2) { + if (a1.isDeterministic() == false) { + throw new IllegalArgumentException("a1 must be deterministic"); + } + if (a2.isDeterministic() == false) { + throw new IllegalArgumentException("a2 must be deterministic"); + } + assert Operations.hasDeadStatesFromInitial(a1) == false; + assert Operations.hasDeadStatesFromInitial(a2) == false; + if (a1.getNumStates() == 0) { + // Empty language is always a subset of any other language + return true; + } else if (a2.getNumStates() == 0) { + return Operations.isEmpty(a1); + } + + // TODO: cutover to iterators instead + Transition[][] transitions1 = a1.getSortedTransitions(); + Transition[][] transitions2 = a2.getSortedTransitions(); + ArrayDeque<StatePair> worklist = new ArrayDeque<>(); + HashSet<StatePair> visited = new HashSet<>(); + StatePair p = new StatePair(0, 0); + worklist.add(p); + visited.add(p); + while (worklist.size() > 0) { + p = worklist.removeFirst(); + if (a1.isAccept(p.s1) && a2.isAccept(p.s2) == false) { + return false; + } + Transition[] t1 = transitions1[p.s1]; + Transition[] t2 = transitions2[p.s2]; + for (int n1 = 0, b2 = 0; n1 < t1.length; n1++) { + while (b2 < t2.length && t2[b2].max < t1[n1].min) { + b2++; + } + int min1 = t1[n1].min, max1 = t1[n1].max; + + for (int n2 = b2; n2 < t2.length && t1[n1].max >= t2[n2].min; n2++) { + if (t2[n2].min > min1) { + return false; + } + if (t2[n2].max < Character.MAX_CODE_POINT) { + min1 = t2[n2].max + 1; + } else { + min1 = Character.MAX_CODE_POINT; + max1 = Character.MIN_CODE_POINT; + } + StatePair q = new StatePair(t1[n1].dest, t2[n2].dest); + if (!visited.contains(q)) { + worklist.add(q); + visited.add(q); + } + } + if (min1 <= max1) { + return false; + } + } + } + return true; + } }