From c5d72c0a9e01da676b2e4ea2485e4fe5a53c6ac2 Mon Sep 17 00:00:00 2001 From: David Lurton Date: Thu, 24 Sep 2020 13:37:48 -0700 Subject: [PATCH] Fast `LIKE` pattern compilation (#286) Implements #284 Replaces previous LIKE implementation which is slow when compiling large patterns with wildcard characters `%` with another implementation that compiles the patterns in linear time and has similar performance characteristics at evaluation time. --- .../partiql/lang/eval/EvaluatingCompiler.kt | 72 +-- .../partiql/lang/eval/LikeMatchingAutomata.kt | 495 ------------------ .../lang/eval/like/CheckpointIterator.kt | 31 ++ .../lang/eval/like/CheckpointIteratorImpl.kt | 31 ++ .../eval/like/CodepointCheckpointIterator.kt | 33 ++ .../org/partiql/lang/eval/like/PatternPart.kt | 140 +++++ .../partiql/lang/eval/LikePredicateTest.kt | 3 - .../lang/eval/like/PatternPartTests.kt | 142 +++++ 8 files changed, 414 insertions(+), 533 deletions(-) delete mode 100644 lang/src/org/partiql/lang/eval/LikeMatchingAutomata.kt create mode 100644 lang/src/org/partiql/lang/eval/like/CheckpointIterator.kt create mode 100644 lang/src/org/partiql/lang/eval/like/CheckpointIteratorImpl.kt create mode 100644 lang/src/org/partiql/lang/eval/like/CodepointCheckpointIterator.kt create mode 100644 lang/src/org/partiql/lang/eval/like/PatternPart.kt create mode 100644 lang/test/org/partiql/lang/eval/like/PatternPartTests.kt diff --git a/lang/src/org/partiql/lang/eval/EvaluatingCompiler.kt b/lang/src/org/partiql/lang/eval/EvaluatingCompiler.kt index 07253743ba..88c8614b4a 100644 --- a/lang/src/org/partiql/lang/eval/EvaluatingCompiler.kt +++ b/lang/src/org/partiql/lang/eval/EvaluatingCompiler.kt @@ -21,6 +21,9 @@ import org.partiql.lang.ast.passes.* import org.partiql.lang.domains.PartiqlAst import org.partiql.lang.errors.* import org.partiql.lang.eval.binding.* +import org.partiql.lang.eval.like.PatternPart +import org.partiql.lang.eval.like.executePattern +import org.partiql.lang.eval.like.parsePattern import 
org.partiql.lang.syntax.SqlParser import org.partiql.lang.util.* import java.math.* @@ -1640,9 +1643,9 @@ internal class EvaluatingCompiler( * * Three cases * - * 1. All arguments are literals, then compile and run the DFA - * 1. Search pattern and escape pattern are literals, compile the DFA. Running the DFA is deferred to evaluation time. - * 1. Pattern or escape (or both) are *not* literals, compile and running of DFA deferred to evaluation time. + * 1. All arguments are literals, then compile and run the pattern + * 1. Search pattern and escape pattern are literals, compile the pattern. Running the pattern deferred to evaluation time. + * 1. Pattern or escape (or both) are *not* literals, compile and running of pattern deferred to evaluation time. * * ``` * LIKE [ESCAPE ] @@ -1661,14 +1664,14 @@ internal class EvaluatingCompiler( val patternLocationMeta = patternExpr.metas.sourceLocationMeta val escapeLocationMeta = escapeExpr?.metas?.sourceLocationMeta + // Note that the return value is a nullable and deferred. // This is so that null short-circuits can be supported. - // The effective type is Either> - fun getDfa(pattern: ExprValue, escape: ExprValue?): (() -> IDFAState)? { - val dfaArgs = listOfNotNull(pattern, escape) + fun getPatternParts(pattern: ExprValue, escape: ExprValue?): (() -> List)? { + val patternArgs = listOfNotNull(pattern, escape) when { - dfaArgs.any { it.type.isUnknown } -> return null - dfaArgs.any { !it.type.isText } -> return { + patternArgs.any { it.type.isUnknown } -> return null + patternArgs.any { !it.type.isText } -> return { err("LIKE expression must be given non-null strings as input", ErrorCode.EVALUATOR_LIKE_INVALID_INPUTS, errorContextFrom(operatorMetas).also { @@ -1678,53 +1681,53 @@ internal class EvaluatingCompiler( internal = false) } else -> { - val (patternString: String, escapeChar: Int?, patternSize) = + val (patternString: String, escapeChar: Int?) 
= checkPattern(pattern.ionValue, patternLocationMeta, escape?.ionValue, escapeLocationMeta) - val dfa = - if (patternString.isEmpty()) DFAEmptyPattern - else buildDfaFromPattern(patternString, escapeChar, patternSize) + val patternParts = when { + patternString.isEmpty() -> emptyList() + else -> parsePattern(patternString, escapeChar) + } - return { dfa } + return { patternParts } } } } - /** See getDfa for more info on the DFA's odd type. */ - fun runDfa(value: ExprValue, dfa: (() -> IDFAState)?): ExprValue { + fun runPatternParts(value: ExprValue, patternParts: (() -> List)?): ExprValue { return when { - dfa == null || value.type.isUnknown -> valueFactory.nullValue - !value.type.isText -> err( + patternParts == null || value.type.isUnknown -> valueFactory.nullValue + !value.type.isText -> err( "LIKE expression must be given non-null strings as input", ErrorCode.EVALUATOR_LIKE_INVALID_INPUTS, errorContextFrom(operatorMetas).also { it[Property.LIKE_VALUE] = value.ionValue.toString() }, internal = false) - else -> dfa().run(value.stringValue()).exprValue() + else -> valueFactory.newBoolean(executePattern(patternParts(), value.stringValue())) } } val valueThunk = argThunks[0] - // If the pattern and escape expressions are literals then we can can compile the DFA now and - // re-use it with every execution. Otherwise we must re-compile the DFA every time. + // If the pattern and escape expressions are literals then we can can compile the pattern now and + // re-use it with every execution. Otherwise we must re-compile the pattern every time. return when { patternExpr is Literal && (escapeExpr == null || escapeExpr is Literal) -> { - val dfa = getDfa( + val patternParts = getPatternParts( valueFactory.newFromIonValue(patternExpr.ionValue), (escapeExpr as? Literal)?.ionValue?.let { valueFactory.newFromIonValue(it) }) // If valueExpr is also a literal then we can evaluate this at compile time and return a constant. 
if (valueExpr is Literal) { - val resultValue = runDfa(valueFactory.newFromIonValue(valueExpr.ionValue), dfa) + val resultValue = runPatternParts(valueFactory.newFromIonValue(valueExpr.ionValue), patternParts) return thunkFactory.thunkEnv(operatorMetas) { resultValue } } else { thunkFactory.thunkEnv(operatorMetas) { env -> val value = valueThunk(env) - runDfa(value, dfa) + runPatternParts(value, patternParts) } } } @@ -1732,23 +1735,23 @@ internal class EvaluatingCompiler( val patternThunk = argThunks[1] when { argThunks.size == 2 -> { - //thunk that re-compiles the DFA every evaluation without a custom escape sequence + //thunk that re-compiles the pattern every evaluation without a custom escape sequence thunkFactory.thunkEnv(operatorMetas) { env -> val value = valueThunk(env) val pattern = patternThunk(env) - val dfa = getDfa(pattern, null) - runDfa(value, dfa) + val pps = getPatternParts(pattern, null) + runPatternParts(value, pps) } } else -> { - //thunk that re-compiles the DFA every evaluation but *with* a custom escape sequence + //thunk that re-compiles the pattern every evaluation but *with* a custom escape sequence val escapeThunk = argThunks[2] thunkFactory.thunkEnv(operatorMetas) { env -> val value = valueThunk(env) val pattern = patternThunk(env) val escape = escapeThunk(env) - val dfa = getDfa(pattern, escape) - runDfa(value, dfa) + val pps = getPatternParts(pattern, escape) + runPatternParts(value, pps) } } } @@ -1785,8 +1788,9 @@ internal class EvaluatingCompiler( patternLocationMeta: SourceLocationMeta?, escape: IonValue?, escapeLocationMeta: SourceLocationMeta? 
- ): Triple { - val patternString = pattern.stringValue()?.let { it } + ): Pair { + + val patternString = pattern.stringValue() ?: err("Must provide a non-null value for PATTERN in a LIKE predicate: $pattern", errorContextFrom(patternLocationMeta), internal = false) @@ -1796,7 +1800,6 @@ internal class EvaluatingCompiler( val escapeCharCodePoint = escapeCharString.codePointAt(0) // escape is a string of length 1 val validEscapedChars = setOf('_'.toInt(), '%'.toInt(), escapeCharCodePoint) val iter = patternString.codePointSequence().iterator() - var count = 0 while (iter.hasNext()) { val current = iter.next() @@ -1809,11 +1812,10 @@ internal class EvaluatingCompiler( }, internal = false) } - count++ } - return Triple(patternString, escapeCharCodePoint, count) + return Pair(patternString, escapeCharCodePoint) } - return Triple(patternString, null, patternString.length) + return Pair(patternString, null) } /** diff --git a/lang/src/org/partiql/lang/eval/LikeMatchingAutomata.kt b/lang/src/org/partiql/lang/eval/LikeMatchingAutomata.kt deleted file mode 100644 index b24f7f858d..0000000000 --- a/lang/src/org/partiql/lang/eval/LikeMatchingAutomata.kt +++ /dev/null @@ -1,495 +0,0 @@ -/* - * Copyright 2019 Amazon.com, Inc. or its affiliates. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"). - * You may not use this file except in compliance with the License. - * A copy of the License is located at: - * - * http://aws.amazon.com/apache2.0/ - * - * or in the "license" file accompanying this file. This file is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific - * language governing permissions and limitations under the License. 
- */ - -package org.partiql.lang.eval - -import org.partiql.lang.util.codePointSequence -import java.util.ArrayList -import java.util.HashSet - - -/** - * Enumeration of Alphabet letters which can be one of - * - * - Any one character -- SQL `_` that maps to `.` in RegeExp - * - Zero or more characters -- SQL `%` that maps to `.*` in RegeExp - * - Epsilon -- denotes the epsilon (empty) transitions in an NFA - * - Character -- denotes any single character - */ -private sealed class Alphabet { - data class Letter(val codePoint: Int) : Alphabet() - object AnyOneChar : Alphabet() - object AnyZeroOrMoreChars : Alphabet() - object Epsilon : Alphabet() -} - -/** - * Represents the dead DFA State. - * This state is terminal-- has no outgoing transitions and it is neither a start State nor a Final state - */ -val DFADeadState : IDFAState = DFAState(mutableSetOf(), mutableMapOf()) - -// Represents the DFA of the empty pattern -val DFAEmptyPattern = object :IDFAState { - override fun isAccepting(): Boolean { - return false - } - - override fun run(word: String?): Boolean = - word?.let { - word.isEmpty() // SQL92 pp. 216 Case 5)b) - } ?: false - - - override fun step(codePoint: Int): IDFAState? { - return DFADeadState - } - -} - -interface IDFAState { - fun isAccepting(): Boolean - /** - * Given a possibly `null` string, starting from `this` DFA state run the automaton and return - * `true` if we exhaust [word] and we are in a an accepting state, false otherwise. - * - * @param word input to the DFA - * - * @return true if the DFA accepts the input, false otherwise - */ - fun run(word: String?): Boolean - - /** - * Given a character, take a step in our DFA starting with `this` state and possible transitions that match [codePoint]. - * - * @param codePoint character to match against possible valid transitions - * - * @return next DFA state - */ - fun step(codePoint: Int): IDFAState? 
-} - -/** - * Represents a DFA State where - * - * - [nfaStates] set of NFA states that correspond to this DFA state - * - [outgoing] map of transitions to DFA states - * - [accepting] true if this is a Final state, false otherwise - * - [start] true if this is a Start state, false otherwise - */ -private open class DFAState(val nfaStates: MutableSet, - val outgoing: MutableMap, - var accepting: Boolean = nfaStates.filter { it.isAccepting }.isNotEmpty(), - var start: Boolean = nfaStates.filter { it.isStartState }.isNotEmpty() -) : IDFAState { - - fun addTransition(transition: Alphabet, target: DFAState) { - if (transition == Alphabet.Epsilon) errNoContext("DFA cannot have epsilon transitions: $transition, $target", internal = true) - when (outgoing.containsKey(transition)) { - true -> if (target != outgoing[transition]) - errNoContext("DFA cannot have a transition that maps to different targets : $transition -> $target AND $transition -> $outgoing.get(transition)", internal = true) - false -> outgoing.put(transition, target) - } - } - - fun addNFAStates(nfaState: NFAState) { - nfaStates.add(nfaState) - accepting = accepting || nfaState.isAccepting - start = start || nfaState.isStartState - } - - override fun isAccepting(): Boolean = accepting - - - /** - * Given a character, take a step in our DFA starting with `this` state and possible transitions that match [codePoint]. - * - * @param codePoint character to match against possible valid transitions - * - * @return next DFA state - */ - override fun step(codePoint: Int): IDFAState? 
{ - val trans = Alphabet.Letter(codePoint) - when (outgoing.containsKey(trans)) { - true -> return outgoing[trans] - else -> { - if (outgoing.containsKey(Alphabet.AnyOneChar)) return outgoing[Alphabet.AnyOneChar] - else return DFADeadState - } - } - } - - - /** - * Given a possibly `null` string, starting from `this` DFA state run the automaton and return - * `true` if we exhaust [word] and we are in a an accepting state, false otherwise. - * - * @param word input to the DFA - * - * @return true if the DFA accepts the input, false otherwise - */ - override fun run(word: String?): Boolean { - var currentState: IDFAState = this - - word?.let { - it.codePointSequence().forEach { ele -> - val newState: IDFAState? = currentState.step(ele) - when (newState) { - null -> return false - DFADeadState -> return false - else -> currentState = newState - } - } - } - - return currentState.isAccepting() - } -} - -/** - * Represents a state in an NFA where - * - * - [stateNumber] is a number used for this state - * - [isAccepting] true when this State is a Final state, false otherwise - * - [isStartState] true when this State is a Start state, false othewise - * - [outgoing] map of alphabet letter to NFA State - */ -private class NFAState(val stateNumber: Int, - val isAccepting: Boolean, - val isStartState: Boolean, - val outgoing: MutableMap> = mutableMapOf>()) { - - fun get(transition: Alphabet): Set = - outgoing[transition]?.let { it } ?: setOf() - - fun addTransition(label: Alphabet, target: NFAState) { - when (outgoing.containsKey(label)) { - true -> outgoing[label]?.add(target) ?: mutableSetOf(target) - false -> outgoing.put(label, mutableSetOf(target)) - } - } - - /** - * Given a letter from the NFA's alphabet return the letter-closure from `this` state. 
- * - * @param alpha letter from the NFA's alphabet - * - * @return set of NFA states that make up the letter-closure--reachable states from `this` state - * through a combination of 1 transition of `alpha` and any sequence of 1 or more epsilon transitions. - * - */ - fun getOutgoingStates(alpha: Alphabet): Set = - when (alpha) { - is Alphabet.Letter -> getOutgoingStates(alpha.codePoint) - is Alphabet.Epsilon -> epsilonClosure() - is Alphabet.AnyOneChar, - is Alphabet.AnyZeroOrMoreChars -> { - val startSet = epsilonClosure().union(get(alpha)) - startSet.fold(get(alpha)) { acc, nfaState -> acc.union(nfaState.epsilonClosure()) } - } - } - - /** - * Convineance method for 1-character closure when given a [Char]. - */ - fun getOutgoingStates(letter: Char): Set = - closure(letter.toInt()) - - /** - * Given a character return the set of NFA States that are the character-closure for `this` node. - * - * The character closure is the set of all NFA State reachable through `this` state by - * following any combination of epsilon transitions and *one* non-epsilon transition that - * matches [codePoint]. - * - * - * @param codePoint character to check for transitions - * - * @return set of NFA states reachable though any combination of epsilon transitions and *one* - * non-epsilon transitions that matches the input character. - - */ - fun getOutgoingStates(codePoint: Int): Set = - closure(codePoint) - - /** - * Given a character return all states reachable from `this` state by 1 non-epsilon transition. - * - * @param codePoint character to check for transitions - * - * @return set of NFA States reachable from `this` state by 1 non-epsilon transition. - */ - fun getNonEpsilonTransitionTargets(codePoint: Int): Set = - get(Alphabet.Letter(codePoint)).union(get(Alphabet.AnyOneChar)) - - - /** - * Given a code point for a character return the character-closure for `this` node. 
- * The character closure is the set of all NFA State reachable through `this` state by - * following any combination of epsilon transitions and *one* non-epsilon transition that - * matches [codePoint]. - * - * - * @param codePoint character to check for transitions - * - * @return set of NFA states reachable though any combination of epsilon transitions and *one* - * non-epsilon transitions that matches the input character. - */ - fun closure(codePoint: Int): Set { - val reachableThroughEpsilon = epsilonClosure() - val reachableThroughNonEpsilon = getNonEpsilonTransitionTargets(codePoint).let { - it.fold(it.toSet(), { acc, state -> acc.union(state.epsilonClosure()) }) - } - return reachableThroughEpsilon.union(reachableThroughNonEpsilon) - } - - - /** - * Returns the espilon-closure of this NFA State. All states reachable from `this` state by using one or more - * epsilon transitions in succession. - * - * @return the set of NFA states that make the epsilon closure of `this` NFA state - */ - fun epsilonClosure(): Set = - get(Alphabet.Epsilon).let { - it.fold(it, { acc, state -> - acc.union(state.epsilonClosure()) - }) - } -} - - -/** - * Given the search pattern, possible escape character used in the search pattern and the size of the search pattern, - * build a DFA recognizer. The recognizer builds an NFA that then translates to a DFA. 
- * - * PRE-CONDITION: [pattern] is a valid LIKE pattern, i.e, the result of `checkPattern` function - * - * - * @param pattern valid search pattern as a [String] - * @param escape possible escape character - * @param patternSize size of the pattern - * - * @return DFA that accepts inputs which match [pattern] - */ -fun buildDfaFromPattern(pattern: String, escape: Int?, patternSize: Int): IDFAState { - escape?.let { - val patternAsNfaLetters = patternToSequenceOfNfaLetters(pattern, it) - val dfaAlpha = patternAsNfaLetters.map(nfaLettersToDfaAlphabet()).toSet() - return nfaToDfa(dfaAlpha, buildNfa(patternAsNfaLetters, patternSize)) - } - val patternAsNfaLetters = patternToSequenceOfNfaLetters(pattern) - val dfaAlpha = patternAsNfaLetters.map(nfaLettersToDfaAlphabet()).toSet() - return nfaToDfa(dfaAlpha, buildNfa(patternAsNfaLetters, patternSize)) - -} - -/** - * Given a search pattern and an escape character possibly used in the pattern, return the sequence - * of letters in the NFA's alphabet that correspond to the characters in the pattern. - * - * @param pattern search pattern - * @param escapeChar escape character - * - * @return sequence of lketters in the NFA's alphabet that correspond to the characters in the pattern - */ -private fun patternToSequenceOfNfaLetters(pattern: String, escapeChar: Int): Sequence { - val codePointIter = pattern.codePointSequence().iterator() - val result = ArrayList() - - while (codePointIter.hasNext()) { - val current = codePointIter.next() - when (current) { - escapeChar -> result.add(Alphabet.Letter(codePointIter.next())) // skip current, use successor as raw character - else -> result.add(codePointToAlphabetLetter(current)) - } - } - return result.asSequence() -} - -/** - * Given the search pattern return a sequence of [Alphabet] that holds the corresponding [Alphabet] instance for - * each character in the input pattern. 
- * - * @param pattern search pattern - * - * @return sequence of [Alphabet] for each character in the input - */ -private fun patternToSequenceOfNfaLetters(pattern: String): Sequence = - pattern.codePointSequence().map { - codePointToAlphabetLetter(it) - } - - -/** - * Given a character, return its corresponding Alphabet Letter - * - * @param codePoint input character as a code point - * - * @return corresponding [Alphabet] instance for the input - */ -private fun codePointToAlphabetLetter(codePoint: Int): Alphabet { - return when (codePoint) { - '_'.toInt() -> Alphabet.AnyOneChar - '%'.toInt() -> Alphabet.AnyZeroOrMoreChars - else -> Alphabet.Letter(codePoint) - } -} - -/** - * Function that given an instance of [Alphabet] for an NFA returns the appropriate [Alphabet] for the NFA's DFA. - * Change all zero or more letter to any one char letter. All other elements of the input remain unchanged. - * - */ -private fun nfaLettersToDfaAlphabet(): (Alphabet) -> Alphabet { - return { a -> - when (a) { - Alphabet.AnyZeroOrMoreChars -> Alphabet.AnyOneChar - else -> a - } - } -} - -/** - * Given the DFA alphabet and the start NFA state, return the DFA that simulates the NFA - * - * @param alphabet DFA alphabet - * @param nfa NFA start state - * - * @return DFA that simulates the NFA with start state [nfa] - */ -private fun nfaToDfa(alphabet: Set, nfa: NFAState) = - buildDFA(alphabet, mutableMapOf(), setOf(nfa.epsilonClosure().union(setOf(nfa)))) - - -/** - * Given the DFA alphabet, the current DFA delta and a set of sets of NFA State, process - * the set of sets of NFA States and update the DFA. 
- * - * This function builds the table that simulates the NFA to create the DFA - * - * @param dfaAlphabet DFA Alphabet, the rows of the table - * @param delta DFA delta function thus far - * @param set of sets of NFA states to process - * - * @return DFA that simulates the NFA - */ -private fun buildDFA(dfaAlphabet: Set, - delta: MutableMap, Alphabet>, Set>, - todo: Set>): DFAState { - - var unprocessed = todo.toMutableSet() - val processed = HashSet>() - while (unprocessed.isNotEmpty()) { - val nfaStates = unprocessed.first() - unprocessed.remove(nfaStates) - // delta = (Q x \Sigma) -> Q - // where Q is \Set(NFAState) - // maps to the type - // delta : Pair, Alphabet>, Set - val deltaUpdates: List, Alphabet>, Set>> = - dfaAlphabet.map { - Pair(Pair(nfaStates, it), - nfaStates.fold(setOf()) { acc, state -> - acc.union(state.getOutgoingStates(it)) - }) - } - processed.add(nfaStates) - updateDelta(delta, deltaUpdates) - val newStates = deltaUpdates.map { - it.second - }.filter { s -> - s.isNotEmpty() && !processed.contains(s) - }.toMutableSet() - unprocessed = unprocessed.union(newStates).toMutableSet() - } - - val nfaStateSetToDfaState = HashMap, DFAState>() - - - - delta.forEach { nfaStateSetToDfaState.put(it.key.first, DFAState(it.key.first.toMutableSet(), HashMap())) } - delta.forEach { (nfaSet, alpha), target -> - val targetDfa: DFAState = nfaStateSetToDfaState[target].let { it } ?: DFADeadState as DFAState - nfaStateSetToDfaState[nfaSet]?.addTransition(alpha, targetDfa) ?: errNoContext("DFA state for $nfaSet does not exist", internal = true) - } - - val dfaStartState = nfaStateSetToDfaState.values.filter { it.start } - if (dfaStartState.size == 1) return dfaStartState.first() - else errNoContext("DFA has more that 1 start state : $dfaStartState", internal = true) -} - -/** - * Given our current delta for the DFA and a list of updates, return the updated delta. 
- * - * @param delta current delta for the DFA - * @param deltaUpdates list of updates to be processed - * - * @return update [delta] that incorporates changes in [deltaUpdates] - */ -private fun updateDelta(delta: MutableMap, Alphabet>, Set>, - deltaUpdates: List, Alphabet>, Set>>) { - deltaUpdates.forEach { - if (delta.containsKey(it.first)) { - if (delta[it.first] != it.second) { - errNoContext("construction of DFA attempted to add the same transition with two distinct targets: $it.first, $it.second", internal = true) - } - } else { - delta.put(it.first, it.second) - } - } -} - - -/** - * Given the sequence of NFA letters that correspond to the search string and the search string's length - * build an NFA that accepts words that match [letters]. - * - * @param letters sequence of NFA letters that correspond to the search string - * @param patternSize size of the search string - * - * @return NFA that accepts words that match [letters] - * - */ -private fun buildNfa(letters: Sequence, patternSize: Int): NFAState = - letters.foldIndexed(mutableListOf(NFAState(-1, 0 == patternSize , true)), { index, acc, transition -> - alphabetToNFAStateAcc(transition, NFAState(index, index == (patternSize - 1), false), acc) - }).first() - -/** - * Given the current letter in the NFA's alphabet, the new NFA state created and the list of already created - * NFA states, add necessary transitions in the NFA states (new and old) to simulate a move of the NFA for the - * input letter. - * - * @param letter new letter for the NFA - * @param newState newly created NFA state - * @param acc accumulator that holds previously processed NFA states. 
- * - * @return updated list of NFA states - */ -private fun alphabetToNFAStateAcc(letter: Alphabet, newState: NFAState, acc: MutableList): MutableList = - when (letter) { - is Alphabet.Letter, is Alphabet.AnyOneChar -> { - acc.last().addTransition(letter, newState) - acc.add(newState) - acc - } - is Alphabet.AnyZeroOrMoreChars -> { - acc.last().addTransition(Alphabet.Epsilon, newState) - newState.addTransition(Alphabet.AnyOneChar, newState) - acc.add(newState) - acc - } - is Alphabet.Epsilon -> errNoContext("Found epsilon letter while processing pattern chars", internal = true) - } diff --git a/lang/src/org/partiql/lang/eval/like/CheckpointIterator.kt b/lang/src/org/partiql/lang/eval/like/CheckpointIterator.kt new file mode 100644 index 0000000000..c570734dc7 --- /dev/null +++ b/lang/src/org/partiql/lang/eval/like/CheckpointIterator.kt @@ -0,0 +1,31 @@ +package org.partiql.lang.eval.like + +/** + * Extends [Iterator] with the ability to save the current position and restore it later, + * thereby allowing a kind of infinite lookahead. + */ +internal interface CheckpointIterator : Iterator { + + /** + * Saves the current position on an internal stack. + * + * Every invocation of this function should be paired with either a [restoreCheckpoint] or [discardCheckpoint]. + */ + fun saveCheckpoint() + + /** + * Sets the current position to the last saved checkpoint and pops it off of the internal stack. + * + * Do not call this function without invoking [saveCheckpoint] first. + */ + fun restoreCheckpoint() + + /** + * Discards position currently on the top of the internal stack. + * + * Do not call this function without invoking [saveCheckpoint] first. 
+ */ + fun discardCheckpoint() +} + + diff --git a/lang/src/org/partiql/lang/eval/like/CheckpointIteratorImpl.kt b/lang/src/org/partiql/lang/eval/like/CheckpointIteratorImpl.kt new file mode 100644 index 0000000000..a432b09fad --- /dev/null +++ b/lang/src/org/partiql/lang/eval/like/CheckpointIteratorImpl.kt @@ -0,0 +1,31 @@ +package org.partiql.lang.eval.like + +import java.util.Stack + + +/** An implementation of [CheckpointIterator] which is backed by a [List]. */ +internal class CheckpointIteratorImpl(private val backingList: List) : CheckpointIterator { + private val checkpointStack = Stack() + private var idx = -1 + + override fun hasNext(): Boolean = (backingList.size - 1) > idx + + override fun next(): T { + if(!hasNext()) throw NoSuchElementException() + return backingList[++idx] + } + + override fun saveCheckpoint() { + checkpointStack.push(idx) + } + + override fun restoreCheckpoint() { + idx = checkpointStack.pop() + } + + override fun discardCheckpoint() { + checkpointStack.pop() + } +} + + diff --git a/lang/src/org/partiql/lang/eval/like/CodepointCheckpointIterator.kt b/lang/src/org/partiql/lang/eval/like/CodepointCheckpointIterator.kt new file mode 100644 index 0000000000..055aa74f55 --- /dev/null +++ b/lang/src/org/partiql/lang/eval/like/CodepointCheckpointIterator.kt @@ -0,0 +1,33 @@ +package org.partiql.lang.eval.like + +import java.util.Stack + +/** An implementation of [CheckpointIterator] that iterates over the unicode codepoints within a string. 
*/ +internal class CodepointCheckpointIterator(private val str: String) : CheckpointIterator { + private val checkpointStack = Stack() + private val codepointCount = str.codePointCount(0, str.length) + private var idx = -1 + + override fun hasNext(): Boolean = (codepointCount - 1) > idx + + override fun next(): Int { + if(!hasNext()) throw NoSuchElementException() + return str.codePointAt(++idx) + } + + fun skipToEnd() { + idx = codepointCount + } + + override fun saveCheckpoint() { + checkpointStack.push(idx) + } + + override fun restoreCheckpoint() { + idx = checkpointStack.pop() + } + + override fun discardCheckpoint() { + checkpointStack.pop() + } +} \ No newline at end of file diff --git a/lang/src/org/partiql/lang/eval/like/PatternPart.kt b/lang/src/org/partiql/lang/eval/like/PatternPart.kt new file mode 100644 index 0000000000..d6851c4add --- /dev/null +++ b/lang/src/org/partiql/lang/eval/like/PatternPart.kt @@ -0,0 +1,140 @@ +package org.partiql.lang.eval.like + +import kotlin.streams.toList + +internal sealed class PatternPart { + object AnyOneChar : PatternPart() + object ZeroOrMoreOfAnyChar : PatternPart() + data class ExactChars(val codepoints: IntArray) : PatternPart() { + override fun equals(other: Any?): Boolean { + if (this === other) return true + if (other !is ExactChars) return false + + if (!codepoints.contentEquals(other.codepoints)) return false + + return true + } + + override fun hashCode(): Int { + return codepoints.contentHashCode() + } + + } +} + +private const val ZERO_OR_MORE_OF_ANY_CHAR = '%'.toInt() +private const val ANY_ONE_CHAR = '_'.toInt() + +internal fun parsePattern(pattern: String, escapeChar: Int?): List { + val codepointList = pattern.codePoints().toList() + val codepointsItr = codepointList.listIterator() + val parts = ArrayList() + while(codepointsItr.hasNext()) { + val c = codepointsItr.next() + parts.add(when(c) { + ANY_ONE_CHAR -> PatternPart.AnyOneChar + ZERO_OR_MORE_OF_ANY_CHAR -> { + // consider consecutive `%` to be 
the same as one `%` + while(codepointsItr.hasNext() && codepointList[codepointsItr.nextIndex()] == ZERO_OR_MORE_OF_ANY_CHAR) { + codepointsItr.next() + } + + PatternPart.ZeroOrMoreOfAnyChar + } + else -> { + codepointsItr.previous() + // Build pattern for matching the exact string + val buffer = ArrayList() + // stop building if we encounter end of input + do { + val cc = codepointsItr.next() + // If [escapeChar] is encountered, just add the next codepoint to the buffer. + if(escapeChar != null && cc == escapeChar) { + buffer.add(codepointsItr.next()) + } else { + // stop building and back up one if we encounter `%` or `_` characters + if (cc == ANY_ONE_CHAR || cc == ZERO_OR_MORE_OF_ANY_CHAR) { + codepointsItr.previous() + break + } + buffer.add(cc) + } + + } while(codepointsItr.hasNext()) + + PatternPart.ExactChars(buffer.toIntArray()) + } + }) + } + + return parts +} + +internal fun executePattern(parts: List, str: String): Boolean { + return executePattern( + CheckpointIteratorImpl(parts), CodepointCheckpointIterator(str)) +} + +private fun executePattern(partsItr: CheckpointIterator, charsItr: CodepointCheckpointIterator): Boolean { + while (partsItr.hasNext()) { + if(!executeOnePart(partsItr, charsItr)) + return false + } + return !charsItr.hasNext() +} + +private fun executeOnePart(partsItr: CheckpointIterator, charsItr: CodepointCheckpointIterator): Boolean { + when (val currentPart = partsItr.next()) { + is PatternPart.AnyOneChar -> { + if(!charsItr.hasNext()) { + return false + } + + charsItr.next() + return true + } + is PatternPart.ExactChars -> { + currentPart.codepoints.forEach { + if (!charsItr.hasNext() || charsItr.next() != it) { + return false + } + } + return true + } + PatternPart.ZeroOrMoreOfAnyChar -> { + // No need to check the rest of the string if this is the last pattern part + if (!partsItr.hasNext()) { + charsItr.skipToEnd() // consume rest of string otherwise we will consider this a non-match. 
+ return true + } + + while (true) { + // Mark checkpoints on our iterators so that we can store the current position + // of them later if the remaining pattern parts don't match. We will keep + // doing this and and advancing the current character position until the + // remaining pattern parts match. If we reach the end of the string, then there is no match. + partsItr.saveCheckpoint() + charsItr.saveCheckpoint() + + if (executePattern(partsItr, charsItr)) { + // Discard the checkpoint saved above. We don't technically need to do this + // but it prevents the *next* pattern part from executing again without need. + partsItr.discardCheckpoint() + charsItr.discardCheckpoint() + return true + } else { + // The next pattern did not match, restore the iterator positions for the next iteration + partsItr.restoreCheckpoint() + charsItr.restoreCheckpoint() + } + + if (!charsItr.hasNext()) { + return false + } + + charsItr.next() + } + } + } +} + diff --git a/lang/test/org/partiql/lang/eval/LikePredicateTest.kt b/lang/test/org/partiql/lang/eval/LikePredicateTest.kt index 15dd92beef..7b28224bec 100644 --- a/lang/test/org/partiql/lang/eval/LikePredicateTest.kt +++ b/lang/test/org/partiql/lang/eval/LikePredicateTest.kt @@ -627,7 +627,4 @@ class LikePredicateTest : EvaluatorTestBase() { NodeMetadata(1, 56)) { voidEval("SELECT * FROM `[{name:1, type:\"a\"}]` as a WHERE a.name LIKE a.type ") } - - - } diff --git a/lang/test/org/partiql/lang/eval/like/PatternPartTests.kt b/lang/test/org/partiql/lang/eval/like/PatternPartTests.kt new file mode 100644 index 0000000000..b6319f4620 --- /dev/null +++ b/lang/test/org/partiql/lang/eval/like/PatternPartTests.kt @@ -0,0 +1,142 @@ +package org.partiql.lang.eval.like + +import junitparams.JUnitParamsRunner +import junitparams.Parameters +import org.junit.Assert +import org.junit.Test +import org.junit.runner.RunWith +import kotlin.test.assertEquals + +@RunWith(JUnitParamsRunner::class) +class PatternPartTests { + + data class TestCase(val 
pattern: String, val escapeChar: Int?, val input: String, val shouldMatch: Boolean) + + private fun createTestCase(pattern: String, escapeChar: Char?, vectors: List<Pair<String, Boolean>>) = + vectors.map { TestCase(pattern, escapeChar?.toInt(), it.first, it.second) } + + fun parametersForPatternTest() = listOf( + createTestCase("a", null, listOf( + "" to false, + "a" to true, + "aa" to false, + "b" to false, + "bb" to false + )), + createTestCase("aa", null, listOf( + "" to false, + "a" to false, + "aa" to true, + "b" to false, + "bb" to false + )), + createTestCase("_", null, listOf( + "" to false, + "a" to true, + "b" to true, + "aa" to false, + "bb" to false + )), + createTestCase("__", null, listOf( + "a" to false, + "b" to false, + "aa" to true, + "bb" to true + )), + createTestCase("%", null, listOf( + "" to true, + "a" to true, + "bb" to true + )), + createTestCase("%%", null, listOf( + "" to true, + "a" to true, + "bb" to true + )), + createTestCase("a%", null, listOf( + "" to false, + "a" to true, + "ab" to true, + "abcde" to true, + "b" to false, + "ba" to false, + "baa" to false + )), + createTestCase("%a", null, listOf( + "" to false, + "a" to true, + "ba" to true, + "edcba" to true, + "b" to false, + "ab" to false, + "aab" to false + )), + createTestCase("%foo%bar%bat%baz%bork%borz%", null, listOf( + "" to false, + "foobarbatbazborkborz" to true, + "000foo1bar22bat333baz444bork555borz666" to true, + "000foo1bar22bat333baz444bork555borD666" to false + )), + createTestCase("%a%", null, listOf( + "" to false, + "a" to true, + "ab" to true, + "ba" to true, + "bab" to true, + "bbabb" to true, + "b" to false, + "bb" to false + )), + createTestCase("%_asdf_%", null, listOf( + "" to false, + "asdf" to false, + "1asdf1" to true, + "1asdf1x" to true, + "x1asdf1" to true, + "xyz1asdf1" to true, + "1asdf1xyz" to true, + "xyz1asdf1xyz" to true + )), + createTestCase("\\%\\_", '\\', listOf( + "" to false, + "%_" to true + )), + createTestCase("%\\%\\__", '\\', listOf( + "" to false, +
"%_1" to true, + "asdf%_1" to true + )) + ).flatten() + + @Test + @Parameters + fun patternTest(tc: TestCase) { + val pat = parsePattern(tc.pattern, tc.escapeChar) + val actualMatches = executePattern(pat, tc.input) + + Assert.assertEquals(tc.shouldMatch, actualMatches) + } + + @Test + fun patternParserTest() { + // the parser should consider multiple consecutive % to be the same as one + val patParts = parsePattern("%%a%%%_%%% %%", escapeChar = null) + assertEquals( + listOf( + PatternPart.ZeroOrMoreOfAnyChar, + PatternPart.ExactChars("a".codePoints().toArray()), + PatternPart.ZeroOrMoreOfAnyChar, + PatternPart.AnyOneChar, + PatternPart.ZeroOrMoreOfAnyChar, + PatternPart.ExactChars(" ".codePoints().toArray()), + PatternPart.ZeroOrMoreOfAnyChar + ), + patParts) + } + + @Test + fun stressTest() { + // makes absolutely certain we do not stack overflow on too many consecutive `%` characters + assertEquals(true, executePattern(parsePattern("%".repeat(10000) + "a", escapeChar = null), "aaaa")) + } +} \ No newline at end of file