From 9fc3f8aec2dc68bf96b97100e2b9841582dc9949 Mon Sep 17 00:00:00 2001 From: David Lurton Date: Thu, 10 Sep 2020 11:39:20 -0700 Subject: [PATCH 1/9] Optimize LIKE pattern compilation In two ways: - Change fold/union operations to accumulate to a single list. - Replace *ordered* sets and maps to hash sets and maps. This results in a > 10x improvement in compiling large like patterns (i.e. 1000 characters and up). --- .../partiql/lang/eval/LikeMatchingAutomata.kt | 77 ++++++++++--------- 1 file changed, 42 insertions(+), 35 deletions(-) diff --git a/lang/src/org/partiql/lang/eval/LikeMatchingAutomata.kt b/lang/src/org/partiql/lang/eval/LikeMatchingAutomata.kt index b24f7f858d..ed61a74c09 100644 --- a/lang/src/org/partiql/lang/eval/LikeMatchingAutomata.kt +++ b/lang/src/org/partiql/lang/eval/LikeMatchingAutomata.kt @@ -18,6 +18,20 @@ import org.partiql.lang.util.codePointSequence import java.util.ArrayList import java.util.HashSet +@Suppress("UNCHECKED_CAST") +private fun Iterable.fastToMutableHashSet(): MutableSet = + when(this) { + // `HashSet.clone()` is faster than constructing a new HashSet and calling .addAll() + is HashSet -> this.clone() as HashSet + else -> HashSet().also { it.addAll(this) } + } + +private infix fun Iterable.fastUnion(other: Iterable): MutableSet { + val set = this.fastToMutableHashSet() + set.addAll(other) + return set +} + /** * Enumeration of Alphabet letters which can be one of @@ -38,7 +52,7 @@ private sealed class Alphabet { * Represents the dead DFA State. 
* This state is terminal-- has no outgoing transitions and it is neither a start State nor a Final state */ -val DFADeadState : IDFAState = DFAState(mutableSetOf(), mutableMapOf()) +val DFADeadState : IDFAState = DFAState(HashSet(), HashMap()) // Represents the DFA of the empty pattern val DFAEmptyPattern = object :IDFAState { @@ -90,8 +104,8 @@ interface IDFAState { */ private open class DFAState(val nfaStates: MutableSet, val outgoing: MutableMap, - var accepting: Boolean = nfaStates.filter { it.isAccepting }.isNotEmpty(), - var start: Boolean = nfaStates.filter { it.isStartState }.isNotEmpty() + var accepting: Boolean = nfaStates.any { it.isAccepting }, + var start: Boolean = nfaStates.any { it.isStartState } ) : IDFAState { fun addTransition(transition: Alphabet, target: DFAState) { @@ -111,8 +125,7 @@ private open class DFAState(val nfaStates: MutableSet, override fun isAccepting(): Boolean = accepting - - /** + /** * Given a character, take a step in our DFA starting with `this` state and possible transitions that match [codePoint]. 
* * @param codePoint character to match against possible valid transitions @@ -157,6 +170,8 @@ private open class DFAState(val nfaStates: MutableSet, } } + + /** * Represents a state in an NFA where * @@ -168,15 +183,15 @@ private open class DFAState(val nfaStates: MutableSet, private class NFAState(val stateNumber: Int, val isAccepting: Boolean, val isStartState: Boolean, - val outgoing: MutableMap> = mutableMapOf>()) { + val outgoing: MutableMap> = HashMap()) { fun get(transition: Alphabet): Set = - outgoing[transition]?.let { it } ?: setOf() + outgoing[transition] ?: hashSetOf() fun addTransition(label: Alphabet, target: NFAState) { when (outgoing.containsKey(label)) { - true -> outgoing[label]?.add(target) ?: mutableSetOf(target) - false -> outgoing.put(label, mutableSetOf(target)) + true -> outgoing[label]?.add(target) ?: HashSet().apply { add(target) } + false -> outgoing[label] = HashSet().apply { add(target) } } } @@ -195,17 +210,11 @@ private class NFAState(val stateNumber: Int, is Alphabet.Epsilon -> epsilonClosure() is Alphabet.AnyOneChar, is Alphabet.AnyZeroOrMoreChars -> { - val startSet = epsilonClosure().union(get(alpha)) - startSet.fold(get(alpha)) { acc, nfaState -> acc.union(nfaState.epsilonClosure()) } + val startSet = epsilonClosure().fastUnion(get(alpha)) + startSet.fold(get(alpha)) { acc, nfaState -> acc.fastUnion(nfaState.epsilonClosure()) } } } - /** - * Convineance method for 1-character closure when given a [Char]. - */ - fun getOutgoingStates(letter: Char): Set = - closure(letter.toInt()) - /** * Given a character return the set of NFA States that are the character-closure for `this` node. * @@ -231,7 +240,7 @@ private class NFAState(val stateNumber: Int, * @return set of NFA States reachable from `this` state by 1 non-epsilon transition. 
*/ fun getNonEpsilonTransitionTargets(codePoint: Int): Set = - get(Alphabet.Letter(codePoint)).union(get(Alphabet.AnyOneChar)) + get(Alphabet.Letter(codePoint)).fastUnion(get(Alphabet.AnyOneChar)) /** @@ -249,9 +258,9 @@ private class NFAState(val stateNumber: Int, fun closure(codePoint: Int): Set { val reachableThroughEpsilon = epsilonClosure() val reachableThroughNonEpsilon = getNonEpsilonTransitionTargets(codePoint).let { - it.fold(it.toSet(), { acc, state -> acc.union(state.epsilonClosure()) }) + it.fold(it, { acc, state -> acc.fastUnion(state.epsilonClosure()) }) } - return reachableThroughEpsilon.union(reachableThroughNonEpsilon) + return reachableThroughEpsilon.fastUnion(reachableThroughNonEpsilon) } @@ -264,7 +273,7 @@ private class NFAState(val stateNumber: Int, fun epsilonClosure(): Set = get(Alphabet.Epsilon).let { it.fold(it, { acc, state -> - acc.union(state.epsilonClosure()) + acc.fastUnion(state.epsilonClosure()) }) } } @@ -286,13 +295,12 @@ private class NFAState(val stateNumber: Int, fun buildDfaFromPattern(pattern: String, escape: Int?, patternSize: Int): IDFAState { escape?.let { val patternAsNfaLetters = patternToSequenceOfNfaLetters(pattern, it) - val dfaAlpha = patternAsNfaLetters.map(nfaLettersToDfaAlphabet()).toSet() + val dfaAlpha = patternAsNfaLetters.map(nfaLettersToDfaAlphabet()).toHashSet() return nfaToDfa(dfaAlpha, buildNfa(patternAsNfaLetters, patternSize)) } val patternAsNfaLetters = patternToSequenceOfNfaLetters(pattern) - val dfaAlpha = patternAsNfaLetters.map(nfaLettersToDfaAlphabet()).toSet() + val dfaAlpha = patternAsNfaLetters.map(nfaLettersToDfaAlphabet()).toHashSet() return nfaToDfa(dfaAlpha, buildNfa(patternAsNfaLetters, patternSize)) - } /** @@ -302,7 +310,7 @@ fun buildDfaFromPattern(pattern: String, escape: Int?, patternSize: Int): IDFASt * @param pattern search pattern * @param escapeChar escape character * - * @return sequence of lketters in the NFA's alphabet that correspond to the characters in the pattern + * @return 
sequence of letters in the NFA's alphabet that correspond to the characters in the pattern */ private fun patternToSequenceOfNfaLetters(pattern: String, escapeChar: Int): Sequence { val codePointIter = pattern.codePointSequence().iterator() @@ -331,7 +339,6 @@ private fun patternToSequenceOfNfaLetters(pattern: String): Sequence = codePointToAlphabetLetter(it) } - /** * Given a character, return its corresponding Alphabet Letter * @@ -370,7 +377,7 @@ private fun nfaLettersToDfaAlphabet(): (Alphabet) -> Alphabet { * @return DFA that simulates the NFA with start state [nfa] */ private fun nfaToDfa(alphabet: Set, nfa: NFAState) = - buildDFA(alphabet, mutableMapOf(), setOf(nfa.epsilonClosure().union(setOf(nfa)))) + buildDFA(alphabet, HashMap(), hashSetOf(nfa.epsilonClosure().fastUnion(hashSetOf(nfa)))) /** @@ -389,7 +396,7 @@ private fun buildDFA(dfaAlphabet: Set, delta: MutableMap, Alphabet>, Set>, todo: Set>): DFAState { - var unprocessed = todo.toMutableSet() + var unprocessed = todo.fastToMutableHashSet() val processed = HashSet>() while (unprocessed.isNotEmpty()) { val nfaStates = unprocessed.first() @@ -401,8 +408,10 @@ private fun buildDFA(dfaAlphabet: Set, val deltaUpdates: List, Alphabet>, Set>> = dfaAlphabet.map { Pair(Pair(nfaStates, it), - nfaStates.fold(setOf()) { acc, state -> - acc.union(state.getOutgoingStates(it)) + HashSet().apply { + nfaStates.forEach { state -> + addAll(state.getOutgoingStates(it)) + } }) } processed.add(nfaStates) @@ -411,15 +420,13 @@ private fun buildDFA(dfaAlphabet: Set, it.second }.filter { s -> s.isNotEmpty() && !processed.contains(s) - }.toMutableSet() - unprocessed = unprocessed.union(newStates).toMutableSet() + }.fastToMutableHashSet() + unprocessed = unprocessed.fastUnion(newStates) } val nfaStateSetToDfaState = HashMap, DFAState>() - - - delta.forEach { nfaStateSetToDfaState.put(it.key.first, DFAState(it.key.first.toMutableSet(), HashMap())) } + delta.forEach { nfaStateSetToDfaState[it.key.first] = 
DFAState(it.key.first.fastToMutableHashSet(), HashMap()) } delta.forEach { (nfaSet, alpha), target -> val targetDfa: DFAState = nfaStateSetToDfaState[target].let { it } ?: DFADeadState as DFAState nfaStateSetToDfaState[nfaSet]?.addTransition(alpha, targetDfa) ?: errNoContext("DFA state for $nfaSet does not exist", internal = true) From d53ef3de8959f48cdc3113c45722e0b37da0dd5c Mon Sep 17 00:00:00 2001 From: David Lurton Date: Mon, 14 Sep 2020 16:31:31 -0700 Subject: [PATCH 2/9] Make LIKE pattern compiling interruptible. --- .../partiql/lang/eval/LikeMatchingAutomata.kt | 37 ++++++++++++------- .../partiql/lang/eval/LikePredicateTest.kt | 22 +++++++++++ 2 files changed, 45 insertions(+), 14 deletions(-) diff --git a/lang/src/org/partiql/lang/eval/LikeMatchingAutomata.kt b/lang/src/org/partiql/lang/eval/LikeMatchingAutomata.kt index ed61a74c09..0ef2738224 100644 --- a/lang/src/org/partiql/lang/eval/LikeMatchingAutomata.kt +++ b/lang/src/org/partiql/lang/eval/LikeMatchingAutomata.kt @@ -18,6 +18,7 @@ import org.partiql.lang.util.codePointSequence import java.util.ArrayList import java.util.HashSet + @Suppress("UNCHECKED_CAST") private fun Iterable.fastToMutableHashSet(): MutableSet = when(this) { @@ -102,10 +103,11 @@ interface IDFAState { * - [accepting] true if this is a Final state, false otherwise * - [start] true if this is a Start state, false otherwise */ -private open class DFAState(val nfaStates: MutableSet, - val outgoing: MutableMap, - var accepting: Boolean = nfaStates.any { it.isAccepting }, - var start: Boolean = nfaStates.any { it.isStartState } +private open class DFAState( + val nfaStates: MutableSet, + val outgoing: MutableMap, + var accepting: Boolean = nfaStates.any { it.isAccepting }, + var start: Boolean = nfaStates.any { it.isStartState } ) : IDFAState { fun addTransition(transition: Alphabet, target: DFAState) { @@ -180,10 +182,11 @@ private open class DFAState(val nfaStates: MutableSet, * - [isStartState] true when this State is a Start state, 
false othewise * - [outgoing] map of alphabet letter to NFA State */ -private class NFAState(val stateNumber: Int, - val isAccepting: Boolean, - val isStartState: Boolean, - val outgoing: MutableMap> = HashMap()) { +private class NFAState( + val stateNumber: Int, + val isAccepting: Boolean, + val isStartState: Boolean, + val outgoing: MutableMap> = HashMap()) { fun get(transition: Alphabet): Set = outgoing[transition] ?: hashSetOf() @@ -392,9 +395,10 @@ private fun nfaToDfa(alphabet: Set, nfa: NFAState) = * * @return DFA that simulates the NFA */ -private fun buildDFA(dfaAlphabet: Set, - delta: MutableMap, Alphabet>, Set>, - todo: Set>): DFAState { +private fun buildDFA( + dfaAlphabet: Set, + delta: MutableMap, Alphabet>, Set>, + todo: Set>): DFAState { var unprocessed = todo.fastToMutableHashSet() val processed = HashSet>() @@ -411,6 +415,10 @@ private fun buildDFA(dfaAlphabet: Set, HashSet().apply { nfaStates.forEach { state -> addAll(state.getOutgoingStates(it)) + + if (Thread.interrupted()) { + throw InterruptedException() + } } }) } @@ -445,8 +453,9 @@ private fun buildDFA(dfaAlphabet: Set, * * @return update [delta] that incorporates changes in [deltaUpdates] */ -private fun updateDelta(delta: MutableMap, Alphabet>, Set>, - deltaUpdates: List, Alphabet>, Set>>) { +private fun updateDelta( + delta: MutableMap, Alphabet>, Set>, + deltaUpdates: List, Alphabet>, Set>>) { deltaUpdates.forEach { if (delta.containsKey(it.first)) { if (delta[it.first] != it.second) { @@ -470,7 +479,7 @@ private fun updateDelta(delta: MutableMap, Alphabet>, Set, patternSize: Int): NFAState = - letters.foldIndexed(mutableListOf(NFAState(-1, 0 == patternSize , true)), { index, acc, transition -> + letters.foldIndexed(mutableListOf(NFAState(-1, 0 == patternSize, true)), { index, acc, transition -> alphabetToNFAStateAcc(transition, NFAState(index, index == (patternSize - 1), false), acc) }).first() diff --git a/lang/test/org/partiql/lang/eval/LikePredicateTest.kt 
b/lang/test/org/partiql/lang/eval/LikePredicateTest.kt index 15dd92beef..d6e55a96ad 100644 --- a/lang/test/org/partiql/lang/eval/LikePredicateTest.kt +++ b/lang/test/org/partiql/lang/eval/LikePredicateTest.kt @@ -19,6 +19,7 @@ import org.partiql.lang.errors.* import org.partiql.lang.util.* import org.assertj.core.api.* import org.junit.* +import kotlin.concurrent.thread import kotlin.test.* class LikePredicateTest : EvaluatorTestBase() { @@ -629,5 +630,26 @@ class LikePredicateTest : EvaluatorTestBase() { } + @Test + fun interruptedThreadThrowsInterruptedException() { + // '%!!!!....%' should take a very long time to compile until we make some major refactorings + // of the LIKE pattern matching implementations. + val heavyPayload = "foo like '%${"!".repeat(5000)}%'" + + var wasInterrupted = false + + val someThread = thread { + try { + CompilerPipeline.standard(ion).compile(heavyPayload) + } catch(_: InterruptedException) { + wasInterrupted = true + } + } + + someThread.interrupt() + someThread.join(5000) + + assertTrue("Thread should have been interrupted!", wasInterrupted) + } } From 50d4e16cfe3da10e062a12189e9d3d1c04b6cd11 Mon Sep 17 00:00:00 2001 From: David Lurton Date: Sun, 20 Sep 2020 14:37:43 -0700 Subject: [PATCH 3/9] Seems mostly functional, needs optimization --- lang/test/org/partiql/lang/ExperimentPad.kt | 219 ++++++++++++++++++ lang/test/org/partiql/lang/SubListIterator.kt | 35 +++ 2 files changed, 254 insertions(+) create mode 100644 lang/test/org/partiql/lang/ExperimentPad.kt create mode 100644 lang/test/org/partiql/lang/SubListIterator.kt diff --git a/lang/test/org/partiql/lang/ExperimentPad.kt b/lang/test/org/partiql/lang/ExperimentPad.kt new file mode 100644 index 0000000000..dcb3db3196 --- /dev/null +++ b/lang/test/org/partiql/lang/ExperimentPad.kt @@ -0,0 +1,219 @@ +package org.partiql.lang + +import junitparams.Parameters +import org.junit.Test +import org.partiql.lang.eval.EvaluatorTestBase + +//@Ignore +class ScratchPad : 
EvaluatorTestBase() { + + data class TestCase(val pattern: String, val input: String, val shouldMatch: Boolean) + + private fun createTestCase(pattern: String, vectors: List>) = + vectors.map { TestCase(pattern, it.first, it.second) } + + fun parametersForPatternTest() = listOf( + createTestCase("a", listOf( + "a" to true, + "aa" to false, + "b" to false, + "bb" to false + )), + createTestCase("aa", listOf( + "a" to false, + "aa" to true, + "b" to false, + "bb" to false + )), + createTestCase("_", listOf( + "a" to true, + "b" to true, + "aa" to false, + "bb" to false + )), + createTestCase("__", listOf( + "a" to false, + "b" to false, + "aa" to true, + "bb" to true + )), + createTestCase("%", listOf( + "a" to true, + "bb" to true + )), + createTestCase("%%", listOf( + "a" to true, + "bb" to true + )), + createTestCase("a%", listOf( + "a" to true, + "ab" to true, + "abcde" to true, + "b" to false, + "ba" to false, + "baa" to false + )), + createTestCase("%a", listOf( + "a" to true, + "ba" to true, + "edcba" to true, + "b" to false, + "ab" to false, + "aab" to false + )), + createTestCase("%a%", listOf( + "a" to true, + "ab" to true, + "ba" to true, + "bab" to true, + "bbabb" to true, + "b" to false, + "bb" to false + )), + createTestCase("%_asdf_%", listOf( + "1asdf1" to true, + "1asdf1x" to true, + "x1asdf1" to true, + "xyz1asdf1" to true, + "1asdf1xyz" to true, + "xyz1asdf1xyz" to true + )) + ).flatten() + + @Test + @Parameters + fun patternTest(tc: TestCase) { + val pat = compilePattern(tc.pattern) + val actualMatches = executePattern2(pat, tc.input) + + assertEquals(tc.shouldMatch, actualMatches) + } +} + +fun executePattern2(parts: List, str: String): Boolean { + return executePattern2( + SubListIterator(parts), SubListIterator(str.toList())) +} + +fun executePattern2(partsItr: SubListIterator, charsItr: SubListIterator): Boolean { + while (partsItr.hasNext()) { + if(!executeOnePart(partsItr, charsItr)) + return false + } + return !charsItr.hasNext() +} + +fun 
executeOnePart(partsItr: SubListIterator, charsItr: SubListIterator): Boolean { + when (val currentPart = partsItr.next()) { + is PatternPart.AnyOneChar -> { + if(!charsItr.hasNext()) + return false + + charsItr.next() + return true + } + is PatternPart.ExactChars -> { + currentPart.chars.forEach { + if (!charsItr.hasNext() || charsItr.next() != it) { + return false + } + } + return true + } + PatternPart.AnyZeroOrMoreChars -> { + // No need to check the rest of the string if this is the last pattern part + if (!partsItr.hasNext()) { + charsItr.skipToEnd() // consume rest of string otherwise we will consider this a non-match. + return true + } + + while (true) { + val nextPartsItr = partsItr.subListIterator(1) + val codepointsItrClone = charsItr.clone() + if (executePattern2(nextPartsItr, codepointsItrClone)) { + return true + } + + charsItr.next() + if (!charsItr.hasNext()) { + return false + } + } + } + }.let { } +} + +sealed class PatternPart { + object AnyOneChar : PatternPart() + object AnyZeroOrMoreChars : PatternPart() + data class ExactChars(val chars: List) : PatternPart() +} + +private val ANY_CHARS = '%' +private val ANY_ONE_CHAR = '_' + +// TODO: merge multiple consecutive % together? +// TODO: does the % in '%_' actually mean anything? +fun compilePattern(pattern: String): List { + val codepoints = pattern.toList().listIterator() + val parts = ArrayList() + while(codepoints.hasNext()) { + val c = codepoints.next() + parts.add(when(c) { + ANY_ONE_CHAR -> PatternPart.AnyOneChar + ANY_CHARS -> PatternPart.AnyZeroOrMoreChars + else -> { + // Build pattern for matching the exact string + val buffer = ArrayList() + buffer.add(c) + // stop building if we encounter end of input + while(codepoints.hasNext()) { + val cc = codepoints.next() + // stop building and back up one if we encounter the `%` or `_` characters. 
+ // TODO: handle escape sequence + if (cc == ANY_ONE_CHAR || cc == ANY_CHARS) { + codepoints.previous() + break + } + buffer.add(cc) + } + + PatternPart.ExactChars(buffer) + } + }) + } + + return parts +} + +fun List.isLast(idx: Int) = this.size - 1 == idx +fun IntArray.isLast(idx: Int) = this.size - 1 == idx + + +//val payload1 = """ 'foo' like '%!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!%' """.trimIndent() +//val payload2 = """ 'foo' like '%!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!%' """.trimIndent() +//val payload3 = """ 'foo' like '%!!!!!!!!!!%' """ +// +//@Test +//fun test1() { +// compile(payload1) +//} +//@Test +//fun test2() { +// compile(payload2) +//} +//@Test +//fun test3() { +// compile(payload3) +//} +// +//@Test +//fun foo() { +// +//} + +//fun compile(sql: String): Expression { +// val pipeline = CompilerPipeline.standard(ion) +// return pipeline.compile(sql) +//} + diff --git a/lang/test/org/partiql/lang/SubListIterator.kt b/lang/test/org/partiql/lang/SubListIterator.kt new file mode 100644 index 0000000000..b1cbf644a5 --- /dev/null +++ b/lang/test/org/partiql/lang/SubListIterator.kt @@ -0,0 +1,35 @@ +package org.partiql.lang + +class SubListIterator(private val backingList: List) : ListIterator { + private var idx = -1; + + override fun hasNext(): Boolean = (backingList.size - 1) > idx + + override fun hasPrevious(): Boolean = idx >= 1 + + override fun next(): T { + if(!hasNext()) throw NoSuchElementException() + return backingList[++idx] + } + + override fun nextIndex(): Int = idx + + override fun previous(): T { + if(!hasPrevious()) throw NoSuchElementException() + return backingList[--idx] + } + + override fun previousIndex(): Int = idx - 1 + + fun subListIterator(fromIdx: Int): SubListIterator { + val newIdx = idx + fromIdx + if(newIdx >= backingList.size) throw NoSuchElementException() + return SubListIterator(backingList.subList(idx + fromIdx, backingList.size)) + } + + fun skipToEnd() { + idx = backingList.size - 1 + } + + fun clone(): SubListIterator = SubListIterator(backingList).also { it.idx = this.idx } +} \ No newline at end of file From db18528e9545c403e5e45dfb5e98cd78651d0b2b Mon Sep 17 00:00:00 2001 From: David Lurton Date: Sun, 20 Sep 2020 17:39:08 -0700 Subject: [PATCH 4/9] Fully functional, needs cleanup --- .../partiql/lang/eval/EvaluatingCompiler.kt | 39 ++-- .../lang/eval/like/CheckpointIteratorImpl.kt | 61 +++++ .../org/partiql/lang/eval/like/PatternPart.kt | 117 
++++++++++ lang/test/org/partiql/lang/ExperimentPad.kt | 219 ------------------ lang/test/org/partiql/lang/SubListIterator.kt | 35 --- .../lang/eval/like/PatternPartTests.kt | 96 ++++++++ 6 files changed, 298 insertions(+), 269 deletions(-) create mode 100644 lang/src/org/partiql/lang/eval/like/CheckpointIteratorImpl.kt create mode 100644 lang/src/org/partiql/lang/eval/like/PatternPart.kt delete mode 100644 lang/test/org/partiql/lang/ExperimentPad.kt delete mode 100644 lang/test/org/partiql/lang/SubListIterator.kt create mode 100644 lang/test/org/partiql/lang/eval/like/PatternPartTests.kt diff --git a/lang/src/org/partiql/lang/eval/EvaluatingCompiler.kt b/lang/src/org/partiql/lang/eval/EvaluatingCompiler.kt index 07253743ba..17c0b4243c 100644 --- a/lang/src/org/partiql/lang/eval/EvaluatingCompiler.kt +++ b/lang/src/org/partiql/lang/eval/EvaluatingCompiler.kt @@ -21,6 +21,9 @@ import org.partiql.lang.ast.passes.* import org.partiql.lang.domains.PartiqlAst import org.partiql.lang.errors.* import org.partiql.lang.eval.binding.* +import org.partiql.lang.eval.like.PatternPart +import org.partiql.lang.eval.like.executePattern +import org.partiql.lang.eval.like.parsePattern import org.partiql.lang.syntax.SqlParser import org.partiql.lang.util.* import java.math.* @@ -1661,10 +1664,12 @@ internal class EvaluatingCompiler( val patternLocationMeta = patternExpr.metas.sourceLocationMeta val escapeLocationMeta = escapeExpr?.metas?.sourceLocationMeta + + // TODO: re-evaluate the below comment. // Note that the return value is a nullable and deferred. // This is so that null short-circuits can be supported. // The effective type is Either> - fun getDfa(pattern: ExprValue, escape: ExprValue?): (() -> IDFAState)? { + fun getPatternParts(pattern: ExprValue, escape: ExprValue?): (() -> List)? 
{ val dfaArgs = listOfNotNull(pattern, escape) when { dfaArgs.any { it.type.isUnknown } -> return null @@ -1681,27 +1686,29 @@ internal class EvaluatingCompiler( val (patternString: String, escapeChar: Int?, patternSize) = checkPattern(pattern.ionValue, patternLocationMeta, escape?.ionValue, escapeLocationMeta) - val dfa = - if (patternString.isEmpty()) DFAEmptyPattern - else buildDfaFromPattern(patternString, escapeChar, patternSize) + val patternParts = when { + patternString.isEmpty() -> emptyList() + // TODO: include escapeChar + else -> parsePattern(patternString, escapeChar) + } - return { dfa } + return { patternParts } } } } /** See getDfa for more info on the DFA's odd type. */ - fun runDfa(value: ExprValue, dfa: (() -> IDFAState)?): ExprValue { + fun runPatternParts(value: ExprValue, dfa: (() -> List)?): ExprValue { return when { dfa == null || value.type.isUnknown -> valueFactory.nullValue - !value.type.isText -> err( + !value.type.isText -> err( "LIKE expression must be given non-null strings as input", ErrorCode.EVALUATOR_LIKE_INVALID_INPUTS, errorContextFrom(operatorMetas).also { it[Property.LIKE_VALUE] = value.ionValue.toString() }, internal = false) - else -> dfa().run(value.stringValue()).exprValue() + else -> valueFactory.newBoolean(executePattern(dfa(), value.stringValue())) } } @@ -1712,19 +1719,19 @@ internal class EvaluatingCompiler( return when { patternExpr is Literal && (escapeExpr == null || escapeExpr is Literal) -> { - val dfa = getDfa( + val patternParts = getPatternParts( valueFactory.newFromIonValue(patternExpr.ionValue), (escapeExpr as? Literal)?.ionValue?.let { valueFactory.newFromIonValue(it) }) // If valueExpr is also a literal then we can evaluate this at compile time and return a constant. 
if (valueExpr is Literal) { - val resultValue = runDfa(valueFactory.newFromIonValue(valueExpr.ionValue), dfa) + val resultValue = runPatternParts(valueFactory.newFromIonValue(valueExpr.ionValue), patternParts) return thunkFactory.thunkEnv(operatorMetas) { resultValue } } else { thunkFactory.thunkEnv(operatorMetas) { env -> val value = valueThunk(env) - runDfa(value, dfa) + runPatternParts(value, patternParts) } } } @@ -1736,8 +1743,8 @@ internal class EvaluatingCompiler( thunkFactory.thunkEnv(operatorMetas) { env -> val value = valueThunk(env) val pattern = patternThunk(env) - val dfa = getDfa(pattern, null) - runDfa(value, dfa) + val dfa = getPatternParts(pattern, null) + runPatternParts(value, dfa) } } else -> { @@ -1747,8 +1754,8 @@ internal class EvaluatingCompiler( val value = valueThunk(env) val pattern = patternThunk(env) val escape = escapeThunk(env) - val dfa = getDfa(pattern, escape) - runDfa(value, dfa) + val dfa = getPatternParts(pattern, escape) + runPatternParts(value, dfa) } } } @@ -1786,6 +1793,8 @@ internal class EvaluatingCompiler( escape: IonValue?, escapeLocationMeta: SourceLocationMeta? ): Triple { + // TODO: don't bother calculating size anymore. 
+ val patternString = pattern.stringValue()?.let { it } ?: err("Must provide a non-null value for PATTERN in a LIKE predicate: $pattern", errorContextFrom(patternLocationMeta), diff --git a/lang/src/org/partiql/lang/eval/like/CheckpointIteratorImpl.kt b/lang/src/org/partiql/lang/eval/like/CheckpointIteratorImpl.kt new file mode 100644 index 0000000000..dfcd53f682 --- /dev/null +++ b/lang/src/org/partiql/lang/eval/like/CheckpointIteratorImpl.kt @@ -0,0 +1,61 @@ +package org.partiql.lang.eval.like + +import java.util.Stack + +interface CheckpointIterator : Iterator { + fun skipToEnd() + fun checkpoint() + fun restore() +} + +class CheckpointIteratorImpl(private val backingList: List) : CheckpointIterator { + private val checkpointStack = Stack() + private var idx = -1 + + override fun hasNext(): Boolean = (backingList.size - 1) > idx + + override fun next(): T { + if(!hasNext()) throw NoSuchElementException() + return backingList[++idx] + } + + override fun skipToEnd() { + idx = backingList.size - 1 + } + + override fun checkpoint() { + checkpointStack.push(idx) + } + + override fun restore() { + idx = checkpointStack.pop() + } +} + + +class CheckointCodepointIterator(private val str: String) : CheckpointIterator { + private val checkpointStack = Stack() + private val codepointCount = str.codePointCount(0, str.length) + private var idx = -1 + + override fun hasNext(): Boolean = (codepointCount - 1) > idx + + override fun next(): Int { + if(!hasNext()) throw NoSuchElementException() + return str.codePointAt(++idx) + } + + override fun skipToEnd() { + idx = codepointCount + } + + override fun checkpoint() { + checkpointStack.push(idx) + } + + override fun restore() { + idx = checkpointStack.pop() + } + + +} diff --git a/lang/src/org/partiql/lang/eval/like/PatternPart.kt b/lang/src/org/partiql/lang/eval/like/PatternPart.kt new file mode 100644 index 0000000000..b6f7e1be7e --- /dev/null +++ b/lang/src/org/partiql/lang/eval/like/PatternPart.kt @@ -0,0 +1,117 @@ +package 
org.partiql.lang.eval.like + +import kotlin.streams.toList + +internal sealed class PatternPart { + object AnyOneChar : PatternPart() + object AnyZeroOrMoreChars : PatternPart() + @Suppress("ArrayInDataClass") + data class ExactChars(val codepoints: IntArray) : PatternPart() +} + +private val ANY_CHARS = '%'.toInt() +private val ANY_ONE_CHAR = '_'.toInt() + +// TODO: merge multiple consecutive % together? +// TODO: does the % in '%_' actually mean anything? +internal fun parsePattern(pattern: String, escapeChar: Int?): List { + val codepoints = pattern.codePoints().toList().listIterator() + val parts = ArrayList() + while(codepoints.hasNext()) { + val c = codepoints.next() + parts.add(when(c) { + ANY_ONE_CHAR -> PatternPart.AnyOneChar + ANY_CHARS -> PatternPart.AnyZeroOrMoreChars + else -> { + + codepoints.previous() + // Build pattern for matching the exact string + val buffer = ArrayList() + // stop building if we encounter end of input + do { + val cc = codepoints.next() + // stop building and back up one if we encounter `%` or `_` characters not precdeed by + // the escape character + if(escapeChar != null && cc == escapeChar) { + buffer.add(codepoints.next()) + } else { + if (cc == ANY_ONE_CHAR || cc == ANY_CHARS) { + codepoints.previous() + break + } + buffer.add(cc) + } + + } while(codepoints.hasNext()) + + PatternPart.ExactChars(buffer.toIntArray()) + } + }) + } + + return parts +} + +private fun List.isLast(idx: Int) = this.size - 1 == idx +private fun IntArray.isLast(idx: Int) = this.size - 1 == idx + +internal fun executePattern(parts: List, str: String): Boolean { + return executePattern( + CheckpointIteratorImpl(parts), CheckointCodepointIterator(str)) +} + +private fun executePattern(partsItr: CheckpointIterator, charsItr: CheckointCodepointIterator): Boolean { + while (partsItr.hasNext()) { + if(!executeOnePart(partsItr, charsItr)) + return false + } + return !charsItr.hasNext() +} + +private fun executeOnePart(partsItr: CheckpointIterator, charsItr: 
CheckointCodepointIterator): Boolean { + when (val currentPart = partsItr.next()) { + is PatternPart.AnyOneChar -> { + if(!charsItr.hasNext()) + return false + + charsItr.next() + return true + } + is PatternPart.ExactChars -> { + currentPart.codepoints.forEach { + if (!charsItr.hasNext() || charsItr.next() != it) { + return false + } + } + return true + } + PatternPart.AnyZeroOrMoreChars -> { + // No need to check the rest of the string if this is the last pattern part + if (!partsItr.hasNext()) { + charsItr.skipToEnd() // consume rest of string otherwise we will consider this a non-match. + return true + } + + while (true) { + partsItr.checkpoint() + charsItr.checkpoint() + + val nextPatternMatches = executePattern(partsItr, charsItr) + partsItr.restore() + charsItr.restore() + + if (nextPatternMatches) { + // TODO: we can pop the index stack instead of restoring it here to avoid having to + // re-run the patternpart during the next call to executeOnePart + return true + } + + charsItr.next() + if (!charsItr.hasNext()) { + return false + } + } + } + } +} + diff --git a/lang/test/org/partiql/lang/ExperimentPad.kt b/lang/test/org/partiql/lang/ExperimentPad.kt deleted file mode 100644 index dcb3db3196..0000000000 --- a/lang/test/org/partiql/lang/ExperimentPad.kt +++ /dev/null @@ -1,219 +0,0 @@ -package org.partiql.lang - -import junitparams.Parameters -import org.junit.Test -import org.partiql.lang.eval.EvaluatorTestBase - -//@Ignore -class ScratchPad : EvaluatorTestBase() { - - data class TestCase(val pattern: String, val input: String, val shouldMatch: Boolean) - - private fun createTestCase(pattern: String, vectors: List>) = - vectors.map { TestCase(pattern, it.first, it.second) } - - fun parametersForPatternTest() = listOf( - createTestCase("a", listOf( - "a" to true, - "aa" to false, - "b" to false, - "bb" to false - )), - createTestCase("aa", listOf( - "a" to false, - "aa" to true, - "b" to false, - "bb" to false - )), - createTestCase("_", listOf( - "a" to 
true, - "b" to true, - "aa" to false, - "bb" to false - )), - createTestCase("__", listOf( - "a" to false, - "b" to false, - "aa" to true, - "bb" to true - )), - createTestCase("%", listOf( - "a" to true, - "bb" to true - )), - createTestCase("%%", listOf( - "a" to true, - "bb" to true - )), - createTestCase("a%", listOf( - "a" to true, - "ab" to true, - "abcde" to true, - "b" to false, - "ba" to false, - "baa" to false - )), - createTestCase("%a", listOf( - "a" to true, - "ba" to true, - "edcba" to true, - "b" to false, - "ab" to false, - "aab" to false - )), - createTestCase("%a%", listOf( - "a" to true, - "ab" to true, - "ba" to true, - "bab" to true, - "bbabb" to true, - "b" to false, - "bb" to false - )), - createTestCase("%_asdf_%", listOf( - "1asdf1" to true, - "1asdf1x" to true, - "x1asdf1" to true, - "xyz1asdf1" to true, - "1asdf1xyz" to true, - "xyz1asdf1xyz" to true - )) - ).flatten() - - @Test - @Parameters - fun patternTest(tc: TestCase) { - val pat = compilePattern(tc.pattern) - val actualMatches = executePattern2(pat, tc.input) - - assertEquals(tc.shouldMatch, actualMatches) - } -} - -fun executePattern2(parts: List, str: String): Boolean { - return executePattern2( - SubListIterator(parts), SubListIterator(str.toList())) -} - -fun executePattern2(partsItr: SubListIterator, charsItr: SubListIterator): Boolean { - while (partsItr.hasNext()) { - if(!executeOnePart(partsItr, charsItr)) - return false - } - return !charsItr.hasNext() -} - -fun executeOnePart(partsItr: SubListIterator, charsItr: SubListIterator): Boolean { - when (val currentPart = partsItr.next()) { - is PatternPart.AnyOneChar -> { - if(!charsItr.hasNext()) - return false - - charsItr.next() - return true - } - is PatternPart.ExactChars -> { - currentPart.chars.forEach { - if (!charsItr.hasNext() || charsItr.next() != it) { - return false - } - } - return true - } - PatternPart.AnyZeroOrMoreChars -> { - // No need to check the rest of the string if this is the last pattern part - if 
(!partsItr.hasNext()) { - charsItr.skipToEnd() // consume rest of string otherwise we will consider this a non-match. - return true - } - - while (true) { - val nextPartsItr = partsItr.subListIterator(1) - val codepointsItrClone = charsItr.clone() - if (executePattern2(nextPartsItr, codepointsItrClone)) { - return true - } - - charsItr.next() - if (!charsItr.hasNext()) { - return false - } - } - } - }.let { } -} - -sealed class PatternPart { - object AnyOneChar : PatternPart() - object AnyZeroOrMoreChars : PatternPart() - data class ExactChars(val chars: List) : PatternPart() -} - -private val ANY_CHARS = '%' -private val ANY_ONE_CHAR = '_' - -// TODO: merge multiple consecutive % together? -// TODO: does the % in '%_' actually mean anything? -fun compilePattern(pattern: String): List { - val codepoints = pattern.toList().listIterator() - val parts = ArrayList() - while(codepoints.hasNext()) { - val c = codepoints.next() - parts.add(when(c) { - ANY_ONE_CHAR -> PatternPart.AnyOneChar - ANY_CHARS -> PatternPart.AnyZeroOrMoreChars - else -> { - // Build pattern for matching the exact string - val buffer = ArrayList() - buffer.add(c) - // stop building if we encounter end of input - while(codepoints.hasNext()) { - val cc = codepoints.next() - // stop building and back up one if we encounter the `%` or `_` characters. 
- // TODO: handle escape sequence - if (cc == ANY_ONE_CHAR || cc == ANY_CHARS) { - codepoints.previous() - break - } - buffer.add(cc) - } - - PatternPart.ExactChars(buffer) - } - }) - } - - return parts -} - -fun List.isLast(idx: Int) = this.size - 1 == idx -fun IntArray.isLast(idx: Int) = this.size - 1 == idx - - -//val payload1 = """ 'foo' like '%!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!%' """.trimIndent() -//val payload2 = """ 'foo' like '%!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!%' """.trimIndent() -//val payload3 = """ 'foo' like '%!!!!!!!!!!%' """ -// -//@Test -//fun test1() { -// compile(payload1) -//} -//@Test -//fun test2() { -// compile(payload2) -//} -//@Test -//fun test3() { -// compile(payload3) -//} -// -//@Test -//fun foo() { -// -//} - -//fun compile(sql: String): Expression { -// val pipeline = CompilerPipeline.standard(ion) -// return pipeline.compile(sql) -//} - diff --git a/lang/test/org/partiql/lang/SubListIterator.kt b/lang/test/org/partiql/lang/SubListIterator.kt deleted file mode 100644 index b1cbf644a5..0000000000 --- a/lang/test/org/partiql/lang/SubListIterator.kt +++ /dev/null @@ -1,35 +0,0 @@ -package org.partiql.lang - -class SubListIterator(private val backingList: List) : ListIterator { - private var idx = -1; - - override fun hasNext(): Boolean = (backingList.size - 1) > idx - - override fun hasPrevious(): Boolean = idx >= 1 - - override fun next(): T { - if(!hasNext()) throw NoSuchElementException() - return backingList[++idx] - } - - override fun nextIndex(): Int = idx - - override fun previous(): T { - if(!hasPrevious()) throw NoSuchElementException() - return backingList[--idx] - } - - override fun previousIndex(): Int = idx - 1 - - fun subListIterator(fromIdx: Int): SubListIterator { - val newIdx = idx + fromIdx - if(newIdx >= backingList.size) throw NoSuchElementException() - return SubListIterator(backingList.subList(idx + fromIdx, backingList.size)) - } - - fun skipToEnd() { - idx = backingList.size - 1 - } - - fun clone(): SubListIterator = SubListIterator(backingList).also { it.idx = this.idx } -} \ No newline at end of file diff --git a/lang/test/org/partiql/lang/eval/like/PatternPartTests.kt b/lang/test/org/partiql/lang/eval/like/PatternPartTests.kt new file mode 100644 index 0000000000..18e9a6974a --- /dev/null +++ b/lang/test/org/partiql/lang/eval/like/PatternPartTests.kt @@ -0,0 +1,96 @@ +package org.partiql.lang.eval.like + +import junitparams.JUnitParamsRunner 
+import junitparams.Parameters +import org.junit.Assert +import org.junit.Test +import org.junit.runner.RunWith + +@RunWith(JUnitParamsRunner::class) +class PatternPartTests { + + data class TestCase(val pattern: String, val escapeChar: Int?, val input: String, val shouldMatch: Boolean) + + private fun createTestCase(pattern: String, escapeChar: Char?, vectors: List>) = + vectors.map { TestCase(pattern, escapeChar?.toInt(), it.first, it.second) } + + fun parametersForPatternTest() = listOf( + createTestCase("a", null, listOf( + "a" to true, + "aa" to false, + "b" to false, + "bb" to false + )), + createTestCase("aa", null, listOf( + "a" to false, + "aa" to true, + "b" to false, + "bb" to false + )), + createTestCase("_", null, listOf( + "a" to true, + "b" to true, + "aa" to false, + "bb" to false + )), + createTestCase("__", null, listOf( + "a" to false, + "b" to false, + "aa" to true, + "bb" to true + )), + createTestCase("%", null, listOf( + "a" to true, + "bb" to true + )), + createTestCase("%%", null, listOf( + "a" to true, + "bb" to true + )), + createTestCase("a%", null, listOf( + "a" to true, + "ab" to true, + "abcde" to true, + "b" to false, + "ba" to false, + "baa" to false + )), + createTestCase("%a", null, listOf( + "a" to true, + "ba" to true, + "edcba" to true, + "b" to false, + "ab" to false, + "aab" to false + )), + createTestCase("%a%", null, listOf( + "a" to true, + "ab" to true, + "ba" to true, + "bab" to true, + "bbabb" to true, + "b" to false, + "bb" to false + )), + createTestCase("%_asdf_%", null, listOf( + "1asdf1" to true, + "1asdf1x" to true, + "x1asdf1" to true, + "xyz1asdf1" to true, + "1asdf1xyz" to true, + "xyz1asdf1xyz" to true + )), + createTestCase("\\%\\_", '\\', listOf( + "%_" to true + )) + ).flatten() + + @Test + @Parameters + fun patternTest(tc: TestCase) { + val pat = parsePattern(tc.pattern, tc.escapeChar) + val actualMatches = executePattern(pat, tc.input) + + Assert.assertEquals(tc.shouldMatch, actualMatches) + } +} \ No 
newline at end of file From 046a167ede133675d107f05049e323eb8ac8ab85 Mon Sep 17 00:00:00 2001 From: David Lurton Date: Sun, 20 Sep 2020 19:26:58 -0700 Subject: [PATCH 5/9] Cleanup a little, remove LikeMatchingAutomata --- .../partiql/lang/eval/LikeMatchingAutomata.kt | 511 ------------------ .../lang/eval/like/CheckpointIterator.kt | 31 ++ .../lang/eval/like/CheckpointIteratorImpl.kt | 42 +- .../eval/like/CodepointCheckpointIterator.kt | 33 ++ .../org/partiql/lang/eval/like/PatternPart.kt | 38 +- .../partiql/lang/eval/LikePredicateTest.kt | 24 - .../lang/eval/like/PatternPartTests.kt | 22 + 7 files changed, 113 insertions(+), 588 deletions(-) delete mode 100644 lang/src/org/partiql/lang/eval/LikeMatchingAutomata.kt create mode 100644 lang/src/org/partiql/lang/eval/like/CheckpointIterator.kt create mode 100644 lang/src/org/partiql/lang/eval/like/CodepointCheckpointIterator.kt diff --git a/lang/src/org/partiql/lang/eval/LikeMatchingAutomata.kt b/lang/src/org/partiql/lang/eval/LikeMatchingAutomata.kt deleted file mode 100644 index 0ef2738224..0000000000 --- a/lang/src/org/partiql/lang/eval/LikeMatchingAutomata.kt +++ /dev/null @@ -1,511 +0,0 @@ -/* - * Copyright 2019 Amazon.com, Inc. or its affiliates. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"). - * You may not use this file except in compliance with the License. - * A copy of the License is located at: - * - * http://aws.amazon.com/apache2.0/ - * - * or in the "license" file accompanying this file. This file is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific - * language governing permissions and limitations under the License. 
- */ - -package org.partiql.lang.eval - -import org.partiql.lang.util.codePointSequence -import java.util.ArrayList -import java.util.HashSet - - -@Suppress("UNCHECKED_CAST") -private fun Iterable.fastToMutableHashSet(): MutableSet = - when(this) { - // `HashSet.clone()` is faster than constructing a new HashSet and calling .addAll() - is HashSet -> this.clone() as HashSet - else -> HashSet().also { it.addAll(this) } - } - -private infix fun Iterable.fastUnion(other: Iterable): MutableSet { - val set = this.fastToMutableHashSet() - set.addAll(other) - return set -} - - -/** - * Enumeration of Alphabet letters which can be one of - * - * - Any one character -- SQL `_` that maps to `.` in RegeExp - * - Zero or more characters -- SQL `%` that maps to `.*` in RegeExp - * - Epsilon -- denotes the epsilon (empty) transitions in an NFA - * - Character -- denotes any single character - */ -private sealed class Alphabet { - data class Letter(val codePoint: Int) : Alphabet() - object AnyOneChar : Alphabet() - object AnyZeroOrMoreChars : Alphabet() - object Epsilon : Alphabet() -} - -/** - * Represents the dead DFA State. - * This state is terminal-- has no outgoing transitions and it is neither a start State nor a Final state - */ -val DFADeadState : IDFAState = DFAState(HashSet(), HashMap()) - -// Represents the DFA of the empty pattern -val DFAEmptyPattern = object :IDFAState { - override fun isAccepting(): Boolean { - return false - } - - override fun run(word: String?): Boolean = - word?.let { - word.isEmpty() // SQL92 pp. 216 Case 5)b) - } ?: false - - - override fun step(codePoint: Int): IDFAState? { - return DFADeadState - } - -} - -interface IDFAState { - fun isAccepting(): Boolean - /** - * Given a possibly `null` string, starting from `this` DFA state run the automaton and return - * `true` if we exhaust [word] and we are in a an accepting state, false otherwise. 
- * - * @param word input to the DFA - * - * @return true if the DFA accepts the input, false otherwise - */ - fun run(word: String?): Boolean - - /** - * Given a character, take a step in our DFA starting with `this` state and possible transitions that match [codePoint]. - * - * @param codePoint character to match against possible valid transitions - * - * @return next DFA state - */ - fun step(codePoint: Int): IDFAState? -} - -/** - * Represents a DFA State where - * - * - [nfaStates] set of NFA states that correspond to this DFA state - * - [outgoing] map of transitions to DFA states - * - [accepting] true if this is a Final state, false otherwise - * - [start] true if this is a Start state, false otherwise - */ -private open class DFAState( - val nfaStates: MutableSet, - val outgoing: MutableMap, - var accepting: Boolean = nfaStates.any { it.isAccepting }, - var start: Boolean = nfaStates.any { it.isStartState } -) : IDFAState { - - fun addTransition(transition: Alphabet, target: DFAState) { - if (transition == Alphabet.Epsilon) errNoContext("DFA cannot have epsilon transitions: $transition, $target", internal = true) - when (outgoing.containsKey(transition)) { - true -> if (target != outgoing[transition]) - errNoContext("DFA cannot have a transition that maps to different targets : $transition -> $target AND $transition -> $outgoing.get(transition)", internal = true) - false -> outgoing.put(transition, target) - } - } - - fun addNFAStates(nfaState: NFAState) { - nfaStates.add(nfaState) - accepting = accepting || nfaState.isAccepting - start = start || nfaState.isStartState - } - - override fun isAccepting(): Boolean = accepting - - /** - * Given a character, take a step in our DFA starting with `this` state and possible transitions that match [codePoint]. - * - * @param codePoint character to match against possible valid transitions - * - * @return next DFA state - */ - override fun step(codePoint: Int): IDFAState? 
{ - val trans = Alphabet.Letter(codePoint) - when (outgoing.containsKey(trans)) { - true -> return outgoing[trans] - else -> { - if (outgoing.containsKey(Alphabet.AnyOneChar)) return outgoing[Alphabet.AnyOneChar] - else return DFADeadState - } - } - } - - - /** - * Given a possibly `null` string, starting from `this` DFA state run the automaton and return - * `true` if we exhaust [word] and we are in a an accepting state, false otherwise. - * - * @param word input to the DFA - * - * @return true if the DFA accepts the input, false otherwise - */ - override fun run(word: String?): Boolean { - var currentState: IDFAState = this - - word?.let { - it.codePointSequence().forEach { ele -> - val newState: IDFAState? = currentState.step(ele) - when (newState) { - null -> return false - DFADeadState -> return false - else -> currentState = newState - } - } - } - - return currentState.isAccepting() - } -} - - - -/** - * Represents a state in an NFA where - * - * - [stateNumber] is a number used for this state - * - [isAccepting] true when this State is a Final state, false otherwise - * - [isStartState] true when this State is a Start state, false othewise - * - [outgoing] map of alphabet letter to NFA State - */ -private class NFAState( - val stateNumber: Int, - val isAccepting: Boolean, - val isStartState: Boolean, - val outgoing: MutableMap> = HashMap()) { - - fun get(transition: Alphabet): Set = - outgoing[transition] ?: hashSetOf() - - fun addTransition(label: Alphabet, target: NFAState) { - when (outgoing.containsKey(label)) { - true -> outgoing[label]?.add(target) ?: HashSet().apply { add(target) } - false -> outgoing[label] = HashSet().apply { add(target) } - } - } - - /** - * Given a letter from the NFA's alphabet return the letter-closure from `this` state. 
- * - * @param alpha letter from the NFA's alphabet - * - * @return set of NFA states that make up the letter-closure--reachable states from `this` state - * through a combination of 1 transition of `alpha` and any sequence of 1 or more epsilon transitions. - * - */ - fun getOutgoingStates(alpha: Alphabet): Set = - when (alpha) { - is Alphabet.Letter -> getOutgoingStates(alpha.codePoint) - is Alphabet.Epsilon -> epsilonClosure() - is Alphabet.AnyOneChar, - is Alphabet.AnyZeroOrMoreChars -> { - val startSet = epsilonClosure().fastUnion(get(alpha)) - startSet.fold(get(alpha)) { acc, nfaState -> acc.fastUnion(nfaState.epsilonClosure()) } - } - } - - /** - * Given a character return the set of NFA States that are the character-closure for `this` node. - * - * The character closure is the set of all NFA State reachable through `this` state by - * following any combination of epsilon transitions and *one* non-epsilon transition that - * matches [codePoint]. - * - * - * @param codePoint character to check for transitions - * - * @return set of NFA states reachable though any combination of epsilon transitions and *one* - * non-epsilon transitions that matches the input character. - - */ - fun getOutgoingStates(codePoint: Int): Set = - closure(codePoint) - - /** - * Given a character return all states reachable from `this` state by 1 non-epsilon transition. - * - * @param codePoint character to check for transitions - * - * @return set of NFA States reachable from `this` state by 1 non-epsilon transition. - */ - fun getNonEpsilonTransitionTargets(codePoint: Int): Set = - get(Alphabet.Letter(codePoint)).fastUnion(get(Alphabet.AnyOneChar)) - - - /** - * Given a code point for a character return the character-closure for `this` node. - * The character closure is the set of all NFA State reachable through `this` state by - * following any combination of epsilon transitions and *one* non-epsilon transition that - * matches [codePoint]. 
- * - * - * @param codePoint character to check for transitions - * - * @return set of NFA states reachable though any combination of epsilon transitions and *one* - * non-epsilon transitions that matches the input character. - */ - fun closure(codePoint: Int): Set { - val reachableThroughEpsilon = epsilonClosure() - val reachableThroughNonEpsilon = getNonEpsilonTransitionTargets(codePoint).let { - it.fold(it, { acc, state -> acc.fastUnion(state.epsilonClosure()) }) - } - return reachableThroughEpsilon.fastUnion(reachableThroughNonEpsilon) - } - - - /** - * Returns the espilon-closure of this NFA State. All states reachable from `this` state by using one or more - * epsilon transitions in succession. - * - * @return the set of NFA states that make the epsilon closure of `this` NFA state - */ - fun epsilonClosure(): Set = - get(Alphabet.Epsilon).let { - it.fold(it, { acc, state -> - acc.fastUnion(state.epsilonClosure()) - }) - } -} - - -/** - * Given the search pattern, possible escape character used in the search pattern and the size of the search pattern, - * build a DFA recognizer. The recognizer builds an NFA that then translates to a DFA. 
- * - * PRE-CONDITION: [pattern] is a valid LIKE pattern, i.e, the result of `checkPattern` function - * - * - * @param pattern valid search pattern as a [String] - * @param escape possible escape character - * @param patternSize size of the pattern - * - * @return DFA that accepts inputs which match [pattern] - */ -fun buildDfaFromPattern(pattern: String, escape: Int?, patternSize: Int): IDFAState { - escape?.let { - val patternAsNfaLetters = patternToSequenceOfNfaLetters(pattern, it) - val dfaAlpha = patternAsNfaLetters.map(nfaLettersToDfaAlphabet()).toHashSet() - return nfaToDfa(dfaAlpha, buildNfa(patternAsNfaLetters, patternSize)) - } - val patternAsNfaLetters = patternToSequenceOfNfaLetters(pattern) - val dfaAlpha = patternAsNfaLetters.map(nfaLettersToDfaAlphabet()).toHashSet() - return nfaToDfa(dfaAlpha, buildNfa(patternAsNfaLetters, patternSize)) -} - -/** - * Given a search pattern and an escape character possibly used in the pattern, return the sequence - * of letters in the NFA's alphabet that correspond to the characters in the pattern. - * - * @param pattern search pattern - * @param escapeChar escape character - * - * @return sequence of letters in the NFA's alphabet that correspond to the characters in the pattern - */ -private fun patternToSequenceOfNfaLetters(pattern: String, escapeChar: Int): Sequence { - val codePointIter = pattern.codePointSequence().iterator() - val result = ArrayList() - - while (codePointIter.hasNext()) { - val current = codePointIter.next() - when (current) { - escapeChar -> result.add(Alphabet.Letter(codePointIter.next())) // skip current, use successor as raw character - else -> result.add(codePointToAlphabetLetter(current)) - } - } - return result.asSequence() -} - -/** - * Given the search pattern return a sequence of [Alphabet] that holds the corresponding [Alphabet] instance for - * each character in the input pattern. 
- * - * @param pattern search pattern - * - * @return sequence of [Alphabet] for each character in the input - */ -private fun patternToSequenceOfNfaLetters(pattern: String): Sequence = - pattern.codePointSequence().map { - codePointToAlphabetLetter(it) - } - -/** - * Given a character, return its corresponding Alphabet Letter - * - * @param codePoint input character as a code point - * - * @return corresponding [Alphabet] instance for the input - */ -private fun codePointToAlphabetLetter(codePoint: Int): Alphabet { - return when (codePoint) { - '_'.toInt() -> Alphabet.AnyOneChar - '%'.toInt() -> Alphabet.AnyZeroOrMoreChars - else -> Alphabet.Letter(codePoint) - } -} - -/** - * Function that given an instance of [Alphabet] for an NFA returns the appropriate [Alphabet] for the NFA's DFA. - * Change all zero or more letter to any one char letter. All other elements of the input remain unchanged. - * - */ -private fun nfaLettersToDfaAlphabet(): (Alphabet) -> Alphabet { - return { a -> - when (a) { - Alphabet.AnyZeroOrMoreChars -> Alphabet.AnyOneChar - else -> a - } - } -} - -/** - * Given the DFA alphabet and the start NFA state, return the DFA that simulates the NFA - * - * @param alphabet DFA alphabet - * @param nfa NFA start state - * - * @return DFA that simulates the NFA with start state [nfa] - */ -private fun nfaToDfa(alphabet: Set, nfa: NFAState) = - buildDFA(alphabet, HashMap(), hashSetOf(nfa.epsilonClosure().fastUnion(hashSetOf(nfa)))) - - -/** - * Given the DFA alphabet, the current DFA delta and a set of sets of NFA State, process - * the set of sets of NFA States and update the DFA. 
- * - * This function builds the table that simulates the NFA to create the DFA - * - * @param dfaAlphabet DFA Alphabet, the rows of the table - * @param delta DFA delta function thus far - * @param set of sets of NFA states to process - * - * @return DFA that simulates the NFA - */ -private fun buildDFA( - dfaAlphabet: Set, - delta: MutableMap, Alphabet>, Set>, - todo: Set>): DFAState { - - var unprocessed = todo.fastToMutableHashSet() - val processed = HashSet>() - while (unprocessed.isNotEmpty()) { - val nfaStates = unprocessed.first() - unprocessed.remove(nfaStates) - // delta = (Q x \Sigma) -> Q - // where Q is \Set(NFAState) - // maps to the type - // delta : Pair, Alphabet>, Set - val deltaUpdates: List, Alphabet>, Set>> = - dfaAlphabet.map { - Pair(Pair(nfaStates, it), - HashSet().apply { - nfaStates.forEach { state -> - addAll(state.getOutgoingStates(it)) - - if (Thread.interrupted()) { - throw InterruptedException() - } - } - }) - } - processed.add(nfaStates) - updateDelta(delta, deltaUpdates) - val newStates = deltaUpdates.map { - it.second - }.filter { s -> - s.isNotEmpty() && !processed.contains(s) - }.fastToMutableHashSet() - unprocessed = unprocessed.fastUnion(newStates) - } - - val nfaStateSetToDfaState = HashMap, DFAState>() - - delta.forEach { nfaStateSetToDfaState[it.key.first] = DFAState(it.key.first.fastToMutableHashSet(), HashMap()) } - delta.forEach { (nfaSet, alpha), target -> - val targetDfa: DFAState = nfaStateSetToDfaState[target].let { it } ?: DFADeadState as DFAState - nfaStateSetToDfaState[nfaSet]?.addTransition(alpha, targetDfa) ?: errNoContext("DFA state for $nfaSet does not exist", internal = true) - } - - val dfaStartState = nfaStateSetToDfaState.values.filter { it.start } - if (dfaStartState.size == 1) return dfaStartState.first() - else errNoContext("DFA has more that 1 start state : $dfaStartState", internal = true) -} - -/** - * Given our current delta for the DFA and a list of updates, return the updated delta. 
- * - * @param delta current delta for the DFA - * @param deltaUpdates list of updates to be processed - * - * @return update [delta] that incorporates changes in [deltaUpdates] - */ -private fun updateDelta( - delta: MutableMap, Alphabet>, Set>, - deltaUpdates: List, Alphabet>, Set>>) { - deltaUpdates.forEach { - if (delta.containsKey(it.first)) { - if (delta[it.first] != it.second) { - errNoContext("construction of DFA attempted to add the same transition with two distinct targets: $it.first, $it.second", internal = true) - } - } else { - delta.put(it.first, it.second) - } - } -} - - -/** - * Given the sequence of NFA letters that correspond to the search string and the search string's length - * build an NFA that accepts words that match [letters]. - * - * @param letters sequence of NFA letters that correspond to the search string - * @param patternSize size of the search string - * - * @return NFA that accepts words that match [letters] - * - */ -private fun buildNfa(letters: Sequence, patternSize: Int): NFAState = - letters.foldIndexed(mutableListOf(NFAState(-1, 0 == patternSize, true)), { index, acc, transition -> - alphabetToNFAStateAcc(transition, NFAState(index, index == (patternSize - 1), false), acc) - }).first() - -/** - * Given the current letter in the NFA's alphabet, the new NFA state created and the list of already created - * NFA states, add necessary transitions in the NFA states (new and old) to simulate a move of the NFA for the - * input letter. - * - * @param letter new letter for the NFA - * @param newState newly created NFA state - * @param acc accumulator that holds previously processed NFA states. 
- * - * @return updated list of NFA states - */ -private fun alphabetToNFAStateAcc(letter: Alphabet, newState: NFAState, acc: MutableList): MutableList = - when (letter) { - is Alphabet.Letter, is Alphabet.AnyOneChar -> { - acc.last().addTransition(letter, newState) - acc.add(newState) - acc - } - is Alphabet.AnyZeroOrMoreChars -> { - acc.last().addTransition(Alphabet.Epsilon, newState) - newState.addTransition(Alphabet.AnyOneChar, newState) - acc.add(newState) - acc - } - is Alphabet.Epsilon -> errNoContext("Found epsilon letter while processing pattern chars", internal = true) - } diff --git a/lang/src/org/partiql/lang/eval/like/CheckpointIterator.kt b/lang/src/org/partiql/lang/eval/like/CheckpointIterator.kt new file mode 100644 index 0000000000..96f8e5a164 --- /dev/null +++ b/lang/src/org/partiql/lang/eval/like/CheckpointIterator.kt @@ -0,0 +1,31 @@ +package org.partiql.lang.eval.like + +/** + * Extends [Iterator] with the ability to save the current position and restore it later, + * thereby allowing an a kind of infinite lookahead. + */ +interface CheckpointIterator : Iterator { + + /** + * Saves the current position on an internal stack. + * + * Every invocation of this function should be paired with either a [restoreCheckpoint] or [discardCheckpoint]. + */ + fun saveCheckpoint() + + /** + * Sets the current position to the last saved checkpoint and pops it off of the internal stack. + * + * Do not call this function without invoking [saveCheckpoint] first. + */ + fun restoreCheckpoint() + + /** + * Discards position currently on the top of the internal stack. + * + * Do not call this function without invoking [saveCheckpoint] first. 
+ */ + fun discardCheckpoint() +} + + diff --git a/lang/src/org/partiql/lang/eval/like/CheckpointIteratorImpl.kt b/lang/src/org/partiql/lang/eval/like/CheckpointIteratorImpl.kt index dfcd53f682..59c3837160 100644 --- a/lang/src/org/partiql/lang/eval/like/CheckpointIteratorImpl.kt +++ b/lang/src/org/partiql/lang/eval/like/CheckpointIteratorImpl.kt @@ -2,12 +2,8 @@ package org.partiql.lang.eval.like import java.util.Stack -interface CheckpointIterator : Iterator { - fun skipToEnd() - fun checkpoint() - fun restore() -} +/** An implementation of [CheckpointIterator] which is backed by a [List]. */ class CheckpointIteratorImpl(private val backingList: List) : CheckpointIterator { private val checkpointStack = Stack() private var idx = -1 @@ -19,43 +15,17 @@ class CheckpointIteratorImpl(private val backingList: List) : CheckpointIt return backingList[++idx] } - override fun skipToEnd() { - idx = backingList.size - 1 - } - - override fun checkpoint() { + override fun saveCheckpoint() { checkpointStack.push(idx) } - override fun restore() { + override fun restoreCheckpoint() { idx = checkpointStack.pop() } -} - - -class CheckointCodepointIterator(private val str: String) : CheckpointIterator { - private val checkpointStack = Stack() - private val codepointCount = str.codePointCount(0, str.length) - private var idx = -1 - - override fun hasNext(): Boolean = (codepointCount - 1) > idx - - override fun next(): Int { - if(!hasNext()) throw NoSuchElementException() - return str.codePointAt(++idx) - } - - override fun skipToEnd() { - idx = codepointCount - } - - override fun checkpoint() { - checkpointStack.push(idx) - } - override fun restore() { - idx = checkpointStack.pop() + override fun discardCheckpoint() { + checkpointStack.pop() } +} -} diff --git a/lang/src/org/partiql/lang/eval/like/CodepointCheckpointIterator.kt b/lang/src/org/partiql/lang/eval/like/CodepointCheckpointIterator.kt new file mode 100644 index 0000000000..bb37bbe17d --- /dev/null +++ 
b/lang/src/org/partiql/lang/eval/like/CodepointCheckpointIterator.kt @@ -0,0 +1,33 @@ +package org.partiql.lang.eval.like + +import java.util.Stack + +/** An implementation of [CheckpointIterator] that iterates over the unicode codepoints within a string. */ +class CodepointCheckpointIterator(private val str: String) : CheckpointIterator { + private val checkpointStack = Stack() + private val codepointCount = str.codePointCount(0, str.length) + private var idx = -1 + + override fun hasNext(): Boolean = (codepointCount - 1) > idx + + override fun next(): Int { + if(!hasNext()) throw NoSuchElementException() + return str.codePointAt(++idx) + } + + fun skipToEnd() { + idx = codepointCount + } + + override fun saveCheckpoint() { + checkpointStack.push(idx) + } + + override fun restoreCheckpoint() { + idx = checkpointStack.pop() + } + + override fun discardCheckpoint() { + checkpointStack.pop() + } +} \ No newline at end of file diff --git a/lang/src/org/partiql/lang/eval/like/PatternPart.kt b/lang/src/org/partiql/lang/eval/like/PatternPart.kt index b6f7e1be7e..d2042e5476 100644 --- a/lang/src/org/partiql/lang/eval/like/PatternPart.kt +++ b/lang/src/org/partiql/lang/eval/like/PatternPart.kt @@ -52,15 +52,12 @@ internal fun parsePattern(pattern: String, escapeChar: Int?): List return parts } -private fun List.isLast(idx: Int) = this.size - 1 == idx -private fun IntArray.isLast(idx: Int) = this.size - 1 == idx - internal fun executePattern(parts: List, str: String): Boolean { return executePattern( - CheckpointIteratorImpl(parts), CheckointCodepointIterator(str)) + CheckpointIteratorImpl(parts), CodepointCheckpointIterator(str)) } -private fun executePattern(partsItr: CheckpointIterator, charsItr: CheckointCodepointIterator): Boolean { +private fun executePattern(partsItr: CheckpointIterator, charsItr: CodepointCheckpointIterator): Boolean { while (partsItr.hasNext()) { if(!executeOnePart(partsItr, charsItr)) return false @@ -68,7 +65,7 @@ private fun 
executePattern(partsItr: CheckpointIterator, charsItr: return !charsItr.hasNext() } -private fun executeOnePart(partsItr: CheckpointIterator, charsItr: CheckointCodepointIterator): Boolean { +private fun executeOnePart(partsItr: CheckpointIterator, charsItr: CodepointCheckpointIterator): Boolean { when (val currentPart = partsItr.next()) { is PatternPart.AnyOneChar -> { if(!charsItr.hasNext()) @@ -78,6 +75,7 @@ private fun executeOnePart(partsItr: CheckpointIterator, charsItr: return true } is PatternPart.ExactChars -> { + // Consume characters as long currentPart.codepoints.forEach { if (!charsItr.hasNext() || charsItr.next() != it) { return false @@ -93,23 +91,29 @@ private fun executeOnePart(partsItr: CheckpointIterator, charsItr: } while (true) { - partsItr.checkpoint() - charsItr.checkpoint() - - val nextPatternMatches = executePattern(partsItr, charsItr) - partsItr.restore() - charsItr.restore() - - if (nextPatternMatches) { - // TODO: we can pop the index stack instead of restoring it here to avoid having to - // re-run the patternpart during the next call to executeOnePart + // Mark checkpoints on our iterators that so we can store the current position + // of them later if the next pattern part doesn't match. We will keep doing this + // until the next pattern part matches. + partsItr.saveCheckpoint() + charsItr.saveCheckpoint() + + if (executePattern(partsItr, charsItr)) { + // Discard the checkpoint saved above. We don't technically need to do this + // but it prevents the *next* pattern part from executing needlessly. 
+ partsItr.discardCheckpoint() + charsItr.discardCheckpoint() return true + } else { + // The next pattern did not match, restore the iterator positions for the next iteration + partsItr.restoreCheckpoint() + charsItr.restoreCheckpoint() } - charsItr.next() if (!charsItr.hasNext()) { return false } + + charsItr.next() } } } diff --git a/lang/test/org/partiql/lang/eval/LikePredicateTest.kt b/lang/test/org/partiql/lang/eval/LikePredicateTest.kt index d6e55a96ad..480cc46d73 100644 --- a/lang/test/org/partiql/lang/eval/LikePredicateTest.kt +++ b/lang/test/org/partiql/lang/eval/LikePredicateTest.kt @@ -628,28 +628,4 @@ class LikePredicateTest : EvaluatorTestBase() { NodeMetadata(1, 56)) { voidEval("SELECT * FROM `[{name:1, type:\"a\"}]` as a WHERE a.name LIKE a.type ") } - - - @Test - fun interruptedThreadThrowsInterruptedException() { - // '%!!!!....%' should take a very long time to compile until we make some major refactorings - // of the LIKE pattern matching implementations. - val heavyPayload = "foo like '%${"!".repeat(5000)}%'" - - var wasInterrupted = false - - val someThread = thread { - try { - CompilerPipeline.standard(ion).compile(heavyPayload) - } catch(_: InterruptedException) { - wasInterrupted = true - } - } - - someThread.interrupt() - someThread.join(5000) - - assertTrue("Thread should have been interrupted!", wasInterrupted) - } - } diff --git a/lang/test/org/partiql/lang/eval/like/PatternPartTests.kt b/lang/test/org/partiql/lang/eval/like/PatternPartTests.kt index 18e9a6974a..338f4383cb 100644 --- a/lang/test/org/partiql/lang/eval/like/PatternPartTests.kt +++ b/lang/test/org/partiql/lang/eval/like/PatternPartTests.kt @@ -16,18 +16,21 @@ class PatternPartTests { fun parametersForPatternTest() = listOf( createTestCase("a", null, listOf( + "" to false, "a" to true, "aa" to false, "b" to false, "bb" to false )), createTestCase("aa", null, listOf( + "" to false, "a" to false, "aa" to true, "b" to false, "bb" to false )), createTestCase("_", null, listOf( 
+ "" to false, "a" to true, "b" to true, "aa" to false, @@ -40,14 +43,17 @@ class PatternPartTests { "bb" to true )), createTestCase("%", null, listOf( + "" to true, "a" to true, "bb" to true )), createTestCase("%%", null, listOf( + "" to true, "a" to true, "bb" to true )), createTestCase("a%", null, listOf( + "" to false, "a" to true, "ab" to true, "abcde" to true, @@ -56,6 +62,7 @@ class PatternPartTests { "baa" to false )), createTestCase("%a", null, listOf( + "" to false, "a" to true, "ba" to true, "edcba" to true, @@ -63,7 +70,14 @@ class PatternPartTests { "ab" to false, "aab" to false )), + createTestCase("%foo%bar%bat%baz%bork%borz%", null, listOf( + "" to false, + "foobarbatbazborkborz" to true, + "000foo1bar22bat333baz444bork555borz666" to true, + "000foo1bar22bat333baz444bork555borD666" to false + )), createTestCase("%a%", null, listOf( + "" to false, "a" to true, "ab" to true, "ba" to true, @@ -73,6 +87,8 @@ class PatternPartTests { "bb" to false )), createTestCase("%_asdf_%", null, listOf( + "" to false, + "asdf" to false, "1asdf1" to true, "1asdf1x" to true, "x1asdf1" to true, @@ -81,7 +97,13 @@ class PatternPartTests { "xyz1asdf1xyz" to true )), createTestCase("\\%\\_", '\\', listOf( + "" to false, "%_" to true + )), + createTestCase("%\\%\\__", '\\', listOf( + "" to false, + "%_1" to true, + "asdf%_1" to true )) ).flatten() From 1f362713b7bb6e6f313ecaad651431db9627b4a3 Mon Sep 17 00:00:00 2001 From: David Lurton Date: Sun, 20 Sep 2020 19:40:48 -0700 Subject: [PATCH 6/9] Final cleanups --- .../org/partiql/lang/eval/EvaluatingCompiler.kt | 16 +++++----------- .../org/partiql/lang/eval/like/PatternPart.kt | 6 ++---- 2 files changed, 7 insertions(+), 15 deletions(-) diff --git a/lang/src/org/partiql/lang/eval/EvaluatingCompiler.kt b/lang/src/org/partiql/lang/eval/EvaluatingCompiler.kt index 17c0b4243c..f0ee447842 100644 --- a/lang/src/org/partiql/lang/eval/EvaluatingCompiler.kt +++ b/lang/src/org/partiql/lang/eval/EvaluatingCompiler.kt @@ -1665,10 
+1665,8 @@ internal class EvaluatingCompiler( val escapeLocationMeta = escapeExpr?.metas?.sourceLocationMeta - // TODO: re-evaluate the below comment. // Note that the return value is a nullable and deferred. // This is so that null short-circuits can be supported. - // The effective type is Either> fun getPatternParts(pattern: ExprValue, escape: ExprValue?): (() -> List)? { val dfaArgs = listOfNotNull(pattern, escape) when { @@ -1683,12 +1681,11 @@ internal class EvaluatingCompiler( internal = false) } else -> { - val (patternString: String, escapeChar: Int?, patternSize) = + val (patternString: String, escapeChar: Int?) = checkPattern(pattern.ionValue, patternLocationMeta, escape?.ionValue, escapeLocationMeta) val patternParts = when { patternString.isEmpty() -> emptyList() - // TODO: include escapeChar else -> parsePattern(patternString, escapeChar) } @@ -1792,10 +1789,9 @@ internal class EvaluatingCompiler( patternLocationMeta: SourceLocationMeta?, escape: IonValue?, escapeLocationMeta: SourceLocationMeta? - ): Triple { - // TODO: don't bother calculating size anymore. 
+ ): Pair { - val patternString = pattern.stringValue()?.let { it } + val patternString = pattern.stringValue() ?: err("Must provide a non-null value for PATTERN in a LIKE predicate: $pattern", errorContextFrom(patternLocationMeta), internal = false) @@ -1805,7 +1801,6 @@ internal class EvaluatingCompiler( val escapeCharCodePoint = escapeCharString.codePointAt(0) // escape is a string of length 1 val validEscapedChars = setOf('_'.toInt(), '%'.toInt(), escapeCharCodePoint) val iter = patternString.codePointSequence().iterator() - var count = 0 while (iter.hasNext()) { val current = iter.next() @@ -1818,11 +1813,10 @@ internal class EvaluatingCompiler( }, internal = false) } - count++ } - return Triple(patternString, escapeCharCodePoint, count) + return Pair(patternString, escapeCharCodePoint) } - return Triple(patternString, null, patternString.length) + return Pair(patternString, null) } /** diff --git a/lang/src/org/partiql/lang/eval/like/PatternPart.kt b/lang/src/org/partiql/lang/eval/like/PatternPart.kt index d2042e5476..8da74ac716 100644 --- a/lang/src/org/partiql/lang/eval/like/PatternPart.kt +++ b/lang/src/org/partiql/lang/eval/like/PatternPart.kt @@ -12,8 +12,6 @@ internal sealed class PatternPart { private val ANY_CHARS = '%'.toInt() private val ANY_ONE_CHAR = '_'.toInt() -// TODO: merge multiple consecutive % together? -// TODO: does the % in '%_' actually mean anything? internal fun parsePattern(pattern: String, escapeChar: Int?): List { val codepoints = pattern.codePoints().toList().listIterator() val parts = ArrayList() @@ -30,11 +28,11 @@ internal fun parsePattern(pattern: String, escapeChar: Int?): List // stop building if we encounter end of input do { val cc = codepoints.next() - // stop building and back up one if we encounter `%` or `_` characters not precdeed by - // the escape character + // If [escapeChar] is encountered, just add the next codepoint to the buffer.] 
if(escapeChar != null && cc == escapeChar) { buffer.add(codepoints.next()) } else { + // stop building and back up one if we encounter `%` or `_` characters if (cc == ANY_ONE_CHAR || cc == ANY_CHARS) { codepoints.previous() break From 04072eb08369b3985737ea0ca2d3c1dd8410ac45 Mon Sep 17 00:00:00 2001 From: David Lurton Date: Thu, 24 Sep 2020 12:18:36 -0700 Subject: [PATCH 7/9] Apply PR feedback --- .../partiql/lang/eval/EvaluatingCompiler.kt | 35 +++++------ .../lang/eval/like/CheckpointIterator.kt | 2 +- .../lang/eval/like/CheckpointIteratorImpl.kt | 2 +- .../eval/like/CodepointCheckpointIterator.kt | 2 +- .../org/partiql/lang/eval/like/PatternPart.kt | 62 ++++++++++++------- .../lang/eval/like/PatternPartTests.kt | 24 +++++++ 6 files changed, 85 insertions(+), 42 deletions(-) diff --git a/lang/src/org/partiql/lang/eval/EvaluatingCompiler.kt b/lang/src/org/partiql/lang/eval/EvaluatingCompiler.kt index f0ee447842..88c8614b4a 100644 --- a/lang/src/org/partiql/lang/eval/EvaluatingCompiler.kt +++ b/lang/src/org/partiql/lang/eval/EvaluatingCompiler.kt @@ -1643,9 +1643,9 @@ internal class EvaluatingCompiler( * * Three cases * - * 1. All arguments are literals, then compile and run the DFA - * 1. Search pattern and escape pattern are literals, compile the DFA. Running the DFA is deferred to evaluation time. - * 1. Pattern or escape (or both) are *not* literals, compile and running of DFA deferred to evaluation time. + * 1. All arguments are literals, then compile and run the pattern + * 1. Search pattern and escape pattern are literals, compile the pattern. Running the pattern deferred to evaluation time. + * 1. Pattern or escape (or both) are *not* literals, compile and running of pattern deferred to evaluation time. * * ``` * LIKE [ESCAPE ] @@ -1668,10 +1668,10 @@ internal class EvaluatingCompiler( // Note that the return value is a nullable and deferred. // This is so that null short-circuits can be supported. 
fun getPatternParts(pattern: ExprValue, escape: ExprValue?): (() -> List)? { - val dfaArgs = listOfNotNull(pattern, escape) + val patternArgs = listOfNotNull(pattern, escape) when { - dfaArgs.any { it.type.isUnknown } -> return null - dfaArgs.any { !it.type.isText } -> return { + patternArgs.any { it.type.isUnknown } -> return null + patternArgs.any { !it.type.isText } -> return { err("LIKE expression must be given non-null strings as input", ErrorCode.EVALUATOR_LIKE_INVALID_INPUTS, errorContextFrom(operatorMetas).also { @@ -1694,10 +1694,9 @@ internal class EvaluatingCompiler( } } - /** See getDfa for more info on the DFA's odd type. */ - fun runPatternParts(value: ExprValue, dfa: (() -> List)?): ExprValue { + fun runPatternParts(value: ExprValue, patternParts: (() -> List)?): ExprValue { return when { - dfa == null || value.type.isUnknown -> valueFactory.nullValue + patternParts == null || value.type.isUnknown -> valueFactory.nullValue !value.type.isText -> err( "LIKE expression must be given non-null strings as input", ErrorCode.EVALUATOR_LIKE_INVALID_INPUTS, @@ -1705,14 +1704,14 @@ internal class EvaluatingCompiler( it[Property.LIKE_VALUE] = value.ionValue.toString() }, internal = false) - else -> valueFactory.newBoolean(executePattern(dfa(), value.stringValue())) + else -> valueFactory.newBoolean(executePattern(patternParts(), value.stringValue())) } } val valueThunk = argThunks[0] - // If the pattern and escape expressions are literals then we can can compile the DFA now and - // re-use it with every execution. Otherwise we must re-compile the DFA every time. + // If the pattern and escape expressions are literals then we can can compile the pattern now and + // re-use it with every execution. Otherwise we must re-compile the pattern every time. 
return when { patternExpr is Literal && (escapeExpr == null || escapeExpr is Literal) -> { @@ -1736,23 +1735,23 @@ internal class EvaluatingCompiler( val patternThunk = argThunks[1] when { argThunks.size == 2 -> { - //thunk that re-compiles the DFA every evaluation without a custom escape sequence + //thunk that re-compiles the pattern every evaluation without a custom escape sequence thunkFactory.thunkEnv(operatorMetas) { env -> val value = valueThunk(env) val pattern = patternThunk(env) - val dfa = getPatternParts(pattern, null) - runPatternParts(value, dfa) + val pps = getPatternParts(pattern, null) + runPatternParts(value, pps) } } else -> { - //thunk that re-compiles the DFA every evaluation but *with* a custom escape sequence + //thunk that re-compiles the pattern every evaluation but *with* a custom escape sequence val escapeThunk = argThunks[2] thunkFactory.thunkEnv(operatorMetas) { env -> val value = valueThunk(env) val pattern = patternThunk(env) val escape = escapeThunk(env) - val dfa = getPatternParts(pattern, escape) - runPatternParts(value, dfa) + val pps = getPatternParts(pattern, escape) + runPatternParts(value, pps) } } } diff --git a/lang/src/org/partiql/lang/eval/like/CheckpointIterator.kt b/lang/src/org/partiql/lang/eval/like/CheckpointIterator.kt index 96f8e5a164..2038c8a40d 100644 --- a/lang/src/org/partiql/lang/eval/like/CheckpointIterator.kt +++ b/lang/src/org/partiql/lang/eval/like/CheckpointIterator.kt @@ -4,7 +4,7 @@ package org.partiql.lang.eval.like * Extends [Iterator] with the ability to save the current position and restore it later, * thereby allowing an a kind of infinite lookahead. */ -interface CheckpointIterator : Iterator { +internal interface CheckpointIterator : Iterator { /** * Saves the current position on an internal stack. 
diff --git a/lang/src/org/partiql/lang/eval/like/CheckpointIteratorImpl.kt b/lang/src/org/partiql/lang/eval/like/CheckpointIteratorImpl.kt index 59c3837160..a432b09fad 100644 --- a/lang/src/org/partiql/lang/eval/like/CheckpointIteratorImpl.kt +++ b/lang/src/org/partiql/lang/eval/like/CheckpointIteratorImpl.kt @@ -4,7 +4,7 @@ import java.util.Stack /** An implementation of [CheckpointIterator] which is backed by a [List]. */ -class CheckpointIteratorImpl(private val backingList: List) : CheckpointIterator { +internal class CheckpointIteratorImpl(private val backingList: List) : CheckpointIterator { private val checkpointStack = Stack() private var idx = -1 diff --git a/lang/src/org/partiql/lang/eval/like/CodepointCheckpointIterator.kt b/lang/src/org/partiql/lang/eval/like/CodepointCheckpointIterator.kt index bb37bbe17d..055aa74f55 100644 --- a/lang/src/org/partiql/lang/eval/like/CodepointCheckpointIterator.kt +++ b/lang/src/org/partiql/lang/eval/like/CodepointCheckpointIterator.kt @@ -3,7 +3,7 @@ package org.partiql.lang.eval.like import java.util.Stack /** An implementation of [CheckpointIterator] that iterates over the unicode codepoints within a string. 
*/ -class CodepointCheckpointIterator(private val str: String) : CheckpointIterator { +internal class CodepointCheckpointIterator(private val str: String) : CheckpointIterator { private val checkpointStack = Stack() private val codepointCount = str.codePointCount(0, str.length) private var idx = -1 diff --git a/lang/src/org/partiql/lang/eval/like/PatternPart.kt b/lang/src/org/partiql/lang/eval/like/PatternPart.kt index 8da74ac716..1cfc644b9a 100644 --- a/lang/src/org/partiql/lang/eval/like/PatternPart.kt +++ b/lang/src/org/partiql/lang/eval/like/PatternPart.kt @@ -4,43 +4,63 @@ import kotlin.streams.toList internal sealed class PatternPart { object AnyOneChar : PatternPart() - object AnyZeroOrMoreChars : PatternPart() - @Suppress("ArrayInDataClass") - data class ExactChars(val codepoints: IntArray) : PatternPart() + object ZeroOrMoreOfAnyChar : PatternPart() + data class ExactChars(val codepoints: IntArray) : PatternPart() { + override fun equals(other: Any?): Boolean { + if (this === other) return true + if (other !is ExactChars) return false + + if (!codepoints.contentEquals(other.codepoints)) return false + + return true + } + + override fun hashCode(): Int { + return codepoints.contentHashCode() + } + + } } -private val ANY_CHARS = '%'.toInt() -private val ANY_ONE_CHAR = '_'.toInt() +private const val ZERO_OR_MORE_OF_ANY_CHAR = '%'.toInt() +private const val ANY_ONE_CHAR = '_'.toInt() internal fun parsePattern(pattern: String, escapeChar: Int?): List { - val codepoints = pattern.codePoints().toList().listIterator() + val codepointList = pattern.codePoints().toList() + val codepointsItr = codepointList.listIterator() val parts = ArrayList() - while(codepoints.hasNext()) { - val c = codepoints.next() + while(codepointsItr.hasNext()) { + val c = codepointsItr.next() parts.add(when(c) { ANY_ONE_CHAR -> PatternPart.AnyOneChar - ANY_CHARS -> PatternPart.AnyZeroOrMoreChars - else -> { + ZERO_OR_MORE_OF_ANY_CHAR -> { + // consider consecutive `%` to be the same as one 
`%` + while(codepointsItr.hasNext() && codepointList[codepointsItr.nextIndex()] == ZERO_OR_MORE_OF_ANY_CHAR) { + codepointsItr.next() + } - codepoints.previous() + PatternPart.ZeroOrMoreOfAnyChar + } + else -> { + codepointsItr.previous() // Build pattern for matching the exact string val buffer = ArrayList() // stop building if we encounter end of input do { - val cc = codepoints.next() - // If [escapeChar] is encountered, just add the next codepoint to the buffer.] + val cc = codepointsItr.next() + // If [escapeChar] is encountered, just add the next codepoint to the buffer. if(escapeChar != null && cc == escapeChar) { - buffer.add(codepoints.next()) + buffer.add(codepointsItr.next()) } else { // stop building and back up one if we encounter `%` or `_` characters - if (cc == ANY_ONE_CHAR || cc == ANY_CHARS) { - codepoints.previous() + if (cc == ANY_ONE_CHAR || cc == ZERO_OR_MORE_OF_ANY_CHAR) { + codepointsItr.previous() break } buffer.add(cc) } - } while(codepoints.hasNext()) + } while(codepointsItr.hasNext()) PatternPart.ExactChars(buffer.toIntArray()) } @@ -66,14 +86,14 @@ private fun executePattern(partsItr: CheckpointIterator, charsItr: private fun executeOnePart(partsItr: CheckpointIterator, charsItr: CodepointCheckpointIterator): Boolean { when (val currentPart = partsItr.next()) { is PatternPart.AnyOneChar -> { - if(!charsItr.hasNext()) + if(!charsItr.hasNext()) { return false + } charsItr.next() return true } is PatternPart.ExactChars -> { - // Consume characters as long currentPart.codepoints.forEach { if (!charsItr.hasNext() || charsItr.next() != it) { return false @@ -81,7 +101,7 @@ private fun executeOnePart(partsItr: CheckpointIterator, charsItr: } return true } - PatternPart.AnyZeroOrMoreChars -> { + PatternPart.ZeroOrMoreOfAnyChar -> { // No need to check the rest of the string if this is the last pattern part if (!partsItr.hasNext()) { charsItr.skipToEnd() // consume rest of string otherwise we will consider this a non-match. 
@@ -97,7 +117,7 @@ private fun executeOnePart(partsItr: CheckpointIterator, charsItr: if (executePattern(partsItr, charsItr)) { // Discard the checkpoint saved above. We don't technically need to do this - // but it prevents the *next* pattern part from executing needlessly. + // but it prevents the *next* pattern part from executing again. partsItr.discardCheckpoint() charsItr.discardCheckpoint() return true diff --git a/lang/test/org/partiql/lang/eval/like/PatternPartTests.kt b/lang/test/org/partiql/lang/eval/like/PatternPartTests.kt index 338f4383cb..b6319f4620 100644 --- a/lang/test/org/partiql/lang/eval/like/PatternPartTests.kt +++ b/lang/test/org/partiql/lang/eval/like/PatternPartTests.kt @@ -5,6 +5,7 @@ import junitparams.Parameters import org.junit.Assert import org.junit.Test import org.junit.runner.RunWith +import kotlin.test.assertEquals @RunWith(JUnitParamsRunner::class) class PatternPartTests { @@ -115,4 +116,27 @@ class PatternPartTests { Assert.assertEquals(tc.shouldMatch, actualMatches) } + + @Test + fun patternParserTest() { + // the parser should consider multiple consecutive % to be the same as one + val patParts = parsePattern("%%a%%%_%%% %%", escapeChar = null) + assertEquals( + listOf( + PatternPart.ZeroOrMoreOfAnyChar, + PatternPart.ExactChars("a".codePoints().toArray()), + PatternPart.ZeroOrMoreOfAnyChar, + PatternPart.AnyOneChar, + PatternPart.ZeroOrMoreOfAnyChar, + PatternPart.ExactChars(" ".codePoints().toArray()), + PatternPart.ZeroOrMoreOfAnyChar + ), + patParts) + } + + @Test + fun stressTest() { + // makes absolutely certain we do not stack overflow on too many consecutive `%` characters + assertEquals(true, executePattern(parsePattern("%".repeat(10000) + "a", escapeChar = null), "aaaa")) + } } \ No newline at end of file From 158780de0e510693ace24c8f83e31dbfe138c55b Mon Sep 17 00:00:00 2001 From: David Lurton Date: Thu, 24 Sep 2020 12:23:53 -0700 Subject: [PATCH 8/9] Apply more PR feedback --- 
lang/src/org/partiql/lang/eval/like/CheckpointIterator.kt | 2 +- lang/src/org/partiql/lang/eval/like/PatternPart.kt | 7 ++++--- lang/test/org/partiql/lang/eval/LikePredicateTest.kt | 1 - 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/lang/src/org/partiql/lang/eval/like/CheckpointIterator.kt b/lang/src/org/partiql/lang/eval/like/CheckpointIterator.kt index 2038c8a40d..c570734dc7 100644 --- a/lang/src/org/partiql/lang/eval/like/CheckpointIterator.kt +++ b/lang/src/org/partiql/lang/eval/like/CheckpointIterator.kt @@ -2,7 +2,7 @@ package org.partiql.lang.eval.like /** * Extends [Iterator] with the ability to save the current position and restore it later, - * thereby allowing an a kind of infinite lookahead. + * thereby allowing a kind of infinite lookahead. */ internal interface CheckpointIterator : Iterator { diff --git a/lang/src/org/partiql/lang/eval/like/PatternPart.kt b/lang/src/org/partiql/lang/eval/like/PatternPart.kt index 1cfc644b9a..b6b21d3c2f 100644 --- a/lang/src/org/partiql/lang/eval/like/PatternPart.kt +++ b/lang/src/org/partiql/lang/eval/like/PatternPart.kt @@ -109,9 +109,10 @@ private fun executeOnePart(partsItr: CheckpointIterator, charsItr: } while (true) { - // Mark checkpoints on our iterators that so we can store the current position - // of them later if the next pattern part doesn't match. We will keep doing this - // until the next pattern part matches. + // Mark checkpoints on our iterators so that we can restore the current position + // of them later if the if the remaining pattern parts don't match. We will keep + // doing this and advancing the current character position until the + // remaining pattern parts match. If we reach the end of the string, then there is no match. 
partsItr.saveCheckpoint() charsItr.saveCheckpoint() diff --git a/lang/test/org/partiql/lang/eval/LikePredicateTest.kt b/lang/test/org/partiql/lang/eval/LikePredicateTest.kt index 480cc46d73..7b28224bec 100644 --- a/lang/test/org/partiql/lang/eval/LikePredicateTest.kt +++ b/lang/test/org/partiql/lang/eval/LikePredicateTest.kt @@ -19,7 +19,6 @@ import org.partiql.lang.errors.* import org.partiql.lang.util.* import org.assertj.core.api.* import org.junit.* -import kotlin.concurrent.thread import kotlin.test.* class LikePredicateTest : EvaluatorTestBase() { From 85d494b9bc2006076ae9f9a95b7761a6d0e81ab6 Mon Sep 17 00:00:00 2001 From: David Lurton Date: Thu, 24 Sep 2020 13:17:39 -0700 Subject: [PATCH 9/9] Update PatternPart.kt --- lang/src/org/partiql/lang/eval/like/PatternPart.kt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lang/src/org/partiql/lang/eval/like/PatternPart.kt b/lang/src/org/partiql/lang/eval/like/PatternPart.kt index b6b21d3c2f..d6851c4add 100644 --- a/lang/src/org/partiql/lang/eval/like/PatternPart.kt +++ b/lang/src/org/partiql/lang/eval/like/PatternPart.kt @@ -110,7 +110,7 @@ private fun executeOnePart(partsItr: CheckpointIterator, charsItr: while (true) { // Mark checkpoints on our iterators so that we can restore the current position - // of them later if the if the remaining pattern parts don't match. We will keep + // of them later if the remaining pattern parts don't match. We will keep // doing this and advancing the current character position until the // remaining pattern parts match. If we reach the end of the string, then there is no match. partsItr.saveCheckpoint() @@ -118,7 +118,7 @@ private fun executeOnePart(partsItr: CheckpointIterator, charsItr: if (executePattern(partsItr, charsItr)) { // Discard the checkpoint saved above. We don't technically need to do this - // but it prevents the *next* pattern part from executing again. + // but it prevents the *next* pattern part from executing again without need. 
partsItr.discardCheckpoint() charsItr.discardCheckpoint() return true