Skip to content

Commit ba17866

Browse files
committed
Improve repeat and non-greedy handling
1 parent b9eb162 commit ba17866

14 files changed

+290
-68
lines changed

benchmarks/src/main/scala/ZeroStarStarABenchmarks.scala

+1-1
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ import org.openjdk.jmh.annotations.{Benchmark, Scope, State}
1212
class ZeroStarStarABenchmarks {
1313
val manyZeros: String = "0" * 20
1414
val java: Pattern = Pattern.compile("(0*)*A")
15-
val irrec: RegexC[Unit] = lit('0').chain(Greedy).chain(Greedy).void <* lit('A')
15+
val irrec: RegexC[Unit] = lit('0').star(Greedy).star(Greedy).void <* lit('A')
1616
val irrecMatcher: String => Boolean = irrec.stringMatcher
1717

1818
@Benchmark

docs/usage.md

+9-8
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@ duration.toPattern
8989

9090
### random matches for a regular expression
9191

92-
Irrec provides support for creating [Scalacheck](https://www.scalacheck.org/) generators that produce values that match a regular expression. This generation is done efficiently as opposed to generating a bunch of random values and then filtering the ones that don't match the regular expression (which would quickly lead to Scalacheck giving up on generating matching values).
92+
Irrec provides [Scalacheck](https://www.scalacheck.org/) generators that produce values that match a regular expression. These can be useful for tests, or even just for glancing at random matches to ensure that your regular expression does what you intended. Check out the [regex-explorer](regex-explorer.md) for interactive browser-based regular expression exploration powered by irrec.
9393

9494
```scala mdoc:silent
9595
import ceedubs.irrec.regex.gen._, CharRegexGen._
@@ -103,20 +103,24 @@ val genDurationString: Gen[String] = genRegexMatchingString(duration)
103103
Gen.listOfN(3, genDurationString).apply(Gen.Parameters.default, Seed(1046531L))
104104
```
105105

106+
Were all of those results input that you intended your regular expression to accept?
107+
108+
This generation is done efficiently as opposed to generating a bunch of random values and then filtering the ones that don't match the regular expression (which would quickly lead to Scalacheck giving up on generating matching values).
109+
106110
Sometimes you may want to generate both matches and non-matches for your regular expression to make sure that both cases are handled. There are various `Gen` instances that will generate input that matches the regular expresssion roughly half of the time.
107111

108112
```scala mdoc:silent
109113
val genDurationCandidateString: Gen[String] =
110114
Gen.resize(12, genRegexCandidateString(duration))
111115

112116
val genExamples: Gen[List[(String, Option[Duration])]] =
113-
Gen.listOfN(3, genDurationCandidateString).map(candidates =>
117+
Gen.listOfN(4, genDurationCandidateString).map(candidates =>
114118
candidates.map(candidate => (candidate, durationParser.parseOnlyS(candidate)))
115119
)
116120
```
117121

118122
```scala mdoc
119-
genExamples.apply(Gen.Parameters.default, Seed(1046531L))
123+
genExamples.apply(Gen.Parameters.default, Seed(1046533L))
120124
```
121125

122126
### random regular expressions
@@ -154,13 +158,10 @@ regexesAndMatches.map(x =>
154158
Sometimes you may want to generate both matches and non-matches for your random regular expression to make sure that both cases are handled. There are various `Gen` instances for `RegexAndCandidate` that will generate random regular expressions along with data that matches the regular expresssion roughly half of the time.
155159

156160
```scala mdoc:silent
157-
val regexAndCandidateGen: Gen[RegexAndCandidate[Char, Double]] =
158-
Gen.resize(12, genAlphaNumRegexAndCandidate)
159-
160161
val regexesAndCandidatesGen: Gen[List[RegexAndCandidate[Char, Double]]] =
161-
Gen.listOfN(4, regexAndCandidateGen)
162+
Gen.listOfN(4, genAlphaNumRegexAndCandidate)
162163

163-
val regexesAndCandidates: List[RegexAndCandidate[Char, Double]] = regexesAndCandidatesGen.apply(Gen.Parameters.default.withSize(30), Seed(105771L)).get
164+
val regexesAndCandidates: List[RegexAndCandidate[Char, Double]] = regexesAndCandidatesGen.apply(Gen.Parameters.default.withSize(15), Seed(105375L)).get
164165
```
165166

166167
```scala mdoc

parser/src/main/scala/Parser.scala

+17-10
Original file line numberDiff line numberDiff line change
@@ -19,18 +19,24 @@ object Parser {
1919
sealed abstract class RepeatCount extends Product with Serializable {
2020
def min: Int = this match {
2121
case RepeatCount.Exact(n) => n
22-
case RepeatCount.Range(lower, _) => lower
22+
case RepeatCount.Range(lower, _, _) => lower
2323
}
2424

2525
def max: Option[Int] = this match {
2626
case RepeatCount.Exact(n) => Some(n)
27-
case RepeatCount.Range(_, upper) => upper
27+
case RepeatCount.Range(_, upper, _) => upper
28+
}
29+
30+
def onRegex(regex: RegexC[Unit]): RegexC[Unit] = this match {
31+
case RepeatCount.Exact(n) => regex.count(n).void
32+
case RepeatCount.Range(min, max, g) => regex.repeat(min, max, g).void
2833
}
2934
}
3035

3136
object RepeatCount {
3237
final case class Exact(n: Int) extends RepeatCount
33-
final case class Range(lowerInclusive: Int, upperInclusive: Option[Int]) extends RepeatCount
38+
final case class Range(lowerInclusive: Int, upperInclusive: Option[Int], greediness: Greediness)
39+
extends RepeatCount
3440
}
3541

3642
private val escapableCharToLit: Map[Char, Char] = specialNonCharClassCharToLit + ('-' -> '-')
@@ -115,9 +121,11 @@ object Parser {
115121
def repeatCount[_: P]: P[RepeatCount] =
116122
P(
117123
"{" ~/ (
118-
(posInt ~ "," ~/ posInt.?).map { case (l, h) => RepeatCount.Range(l, h) } |
119-
posInt.map(RepeatCount.Exact(_))
120-
) ~ "}").opaque("repeat count such as '{3}', '{1,4}', or '{3,}'")
124+
(posInt ~ "," ~/ posInt.? ~/ "}" ~/ (P("?").map(_ => Greediness.NonGreedy) | Pass(
125+
Greediness.Greedy))).map { case (l, h, g) => RepeatCount.Range(l, h, g) } |
126+
(posInt.map(RepeatCount.Exact(_)) ~ "}")
127+
)
128+
).opaque("repeat count such as '{3}', '{1,4}', `{1, 4}?`, '{3,}', or `{3,}?")
121129

122130
def charOrRange[_: P]: P[Match.MatchSet[Char]] =
123131
matchCharRange.map(r => MatchSet.allow(Diet.fromRange(r))) |
@@ -187,12 +195,11 @@ object Parser {
187195

188196
def factor[_: P]: P[RegexC[Unit]] = P {
189197
base.flatMap { r =>
190-
// TODO greediness
191-
// TODO voids
192-
P("*").map(_ => r.chain(Greediness.Greedy).void) |
198+
P("*?").map(_ => r.star(Greediness.NonGreedy).void) |
199+
P("*").map(_ => r.star(Greediness.Greedy).void) |
193200
P("+").map(_ => r.oneOrMore(Greediness.Greedy).void) |
194201
P("?").map(_ => r.optional.void) |
195-
repeatCount.map(count => r.repeat(count.min, count.max, Greediness.Greedy).void) |
202+
repeatCount.map(_.onRegex(r)) |
196203
Pass(r)
197204
}
198205
}

regex-gen/src/main/scala/CharRegexGen.scala

+4-1
Original file line numberDiff line numberDiff line change
@@ -63,5 +63,8 @@ object CharRegexGen {
6363
regexMatchingStringGenFromDiet(supportedCharacters)
6464

6565
def genRegexCandidateString[Out]: RegexC[Out] => Gen[String] =
66-
r => Gen.oneOf(regexMatchingStringGenFromDiet(supportedCharacters)(r), arbitrary[String])
66+
r =>
67+
Gen.oneOf(
68+
regexMatchingStringGenFromDiet(supportedCharacters)(r),
69+
Gen.frequency(2 -> Gen.asciiStr, 1 -> arbitrary[String]))
6770
}

regex-gen/src/main/scala/RegexGen.scala

+21-5
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ object RegexGen {
4242

4343
def genMatch[A: Discrete: Order](genA: Gen[A], genDietA: Gen[Diet[A]]): Gen[Match[A]] =
4444
Gen.frequency(
45-
9 -> genA.map(Match.lit(_)),
45+
16 -> genA.map(Match.lit(_)),
4646
3 -> genDietA.map(MatchSet.allow(_)),
4747
2 -> genDietA.map(MatchSet.forbid(_)),
4848
1 -> Gen.const(Match.wildcard)
@@ -82,7 +82,7 @@ object RegexGen {
8282
else
8383
Gen.frequency(
8484
// AndThen
85-
5 -> (
85+
20 -> (
8686
for {
8787
rIDepth <- Gen.choose(1, depth - 1)
8888
rIGen <- genRegexWithEv[In](cfg).apply(rIDepth)
@@ -95,7 +95,7 @@ object RegexGen {
9595
} yield combinator.andThen(rf, rI)
9696
),
9797
// FMap
98-
2 -> (for {
98+
3 -> (for {
9999
regexGen <- genRegexWithEv[In](cfg).apply(depth - 1)
100100
r <- regexGen.evidence.regexGen
101101
f <- {
@@ -111,8 +111,24 @@ object RegexGen {
111111
nel <- depths.traverse(depth => genRegexWithDepth[In, Out](cfg, depth))
112112
} yield Regex.Or[In, Match[In], Out](nel)
113113
),
114+
// Repeat
115+
3 -> (
116+
for {
117+
maxCount <- Gen.choose(1, math.min(depth - 1, 5))
118+
rIGen <- genRegexWithEv[In](cfg).apply(depth - maxCount)
119+
rI <- rIGen.evidence.regexGen
120+
min <- Gen.chooseNum(0, maxCount)
121+
max <- Gen.frequency(1 -> None, 5 -> Some(maxCount))
122+
g <- arbitrary[Greediness]
123+
z <- arbitrary[Out]
124+
fold <- {
125+
implicit val iCogen = rIGen.evidence.cogenOut
126+
arbitrary[(Out, rIGen.T) => Out]
127+
}
128+
} yield combinator.repeatFold(rI, min, max, g, z)(fold)
129+
),
114130
// Star
115-
1 -> (
131+
2 -> (
116132
for {
117133
rIGen <- genRegexWithEv[In](cfg).apply(depth - 1)
118134
rI <- rIGen.evidence.regexGen
@@ -122,7 +138,7 @@ object RegexGen {
122138
implicit val iCogen = rIGen.evidence.cogenOut
123139
arbitrary[(Out, rIGen.T) => Out]
124140
}
125-
} yield combinator.star(rI, g, z)(fold)
141+
} yield combinator.starFold(rI, g, z)(fold)
126142
)
127143
)
128144
}

regex-gen/src/main/scala/RegexMatchGen.scala

+7
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,13 @@ object RegexMatchGen {
4242
star =
4343
λ[λ[i => (RegexM[In, i], Greediness, Out, (Out, i) => Out)] ~> λ[a => Gen[Stream[In]]]](
4444
t => Gen.containerOf[Stream, Stream[In]](outer.apply(t._1)).map(_.flatten)),
45+
repeat =
46+
λ[λ[i => (RegexM[In, i], Int, Option[Int], Greediness, Out, (Out, i) => Out)] ~> λ[
47+
a => Gen[Stream[In]]]](t =>
48+
for {
49+
count <- Gen.chooseNum(t._2, math.max(t._2, t._3.getOrElse(5)))
50+
nestedStream <- Gen.containerOfN[Stream, Stream[In]](count, outer.apply(t._1))
51+
} yield nestedStream.flatten),
4552
mapped =
4653
λ[λ[a => (RegexM[In, a], a => Out)] ~> λ[a => Gen[Stream[In]]]](t => outer.apply(t._1)),
4754
or = alternatives => Gen.oneOf(alternatives.toList).flatMap(apply),

regex/src/main/scala/Regex.scala

+80-4
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,8 @@ import cats.implicits._
2525
* and [[ceedubs.irrec.regex.RegexCOps]].
2626
*/
2727
sealed abstract class Regex[-In, +M, Out] extends Serializable {
28-
def chain(greediness: Greediness): Regex[In, M, Chain[Out]] =
29-
combinator.chain(this, greediness)
28+
def star(greediness: Greediness): Regex[In, M, Chain[Out]] =
29+
combinator.star(this, greediness)
3030

3131
def many: Regex[In, M, Chain[Out]] = combinator.many(this)
3232

@@ -85,6 +85,38 @@ object Regex {
8585
extends Regex[In, M, Out] {
8686
type Init = I
8787
}
88+
89+
final case class Repeat[-In, +M, I, Out](
90+
r: Regex[In, M, I],
91+
minInclusive: Int,
92+
maxInclusive: Option[Int],
93+
greediness: Greediness,
94+
z: Out,
95+
fold: (Out, I) => Out)
96+
extends Regex[In, M, Out] {
97+
type Init = I
98+
99+
def expand: Regex[In, M, Out] = {
100+
val tail = maxInclusive.fold(combinator.star(r, greediness).some) { max =>
101+
if (max <= minInclusive) None
102+
else {
103+
(0 to (max - minInclusive)).toList.toNel.map { counts =>
104+
val orderedCounts = greediness match {
105+
// TODO reversed?
106+
case Greediness.Greedy => counts.reverse
107+
case Greediness.NonGreedy => counts
108+
}
109+
Regex.Or(orderedCounts.map(i => expandedCount(i, r)))
110+
}
111+
}
112+
}
113+
val head = expandedCount(minInclusive, r)
114+
tail
115+
.fold(head)(tail => head.map2(tail)(_ concat _))
116+
.map(_.foldLeft(z)(fold))
117+
}
118+
}
119+
88120
// TODO efficiently handle with NFA
89121
final case class Void[-In, +M, I](r: Regex[In, M, I]) extends Regex[In, M, Unit] {
90122
type Init = I
@@ -97,6 +129,7 @@ object Regex {
97129
case Eps => F.pure(Eps)
98130
case x @ Fail() => F.pure(x)
99131
case Star(r, g, z, fold) => traverseM(r)(f).map(Star(_, g, z, fold))
132+
case Repeat(r, min, max, g, z, fold) => traverseM(r)(f).map(Repeat(_, min, max, g, z, fold))
100133
case FMap(r, g) => traverseM(r)(f).map(FMap(_, g))
101134
case Or(alternatives) => alternatives.traverse(traverseM(_)(f)).map(Or(_))
102135
case AndThen(l, r) => traverseM(l)(f).map2(traverseM(r)(f))(AndThen(_, _))
@@ -109,6 +142,8 @@ object Regex {
109142
elem: (M, In => Option[Out]) => R,
110143
andThen: λ[i => (Regex[In, M, i => Out], Regex[In, M, i])] ~> λ[a => R],
111144
star: λ[i => (Regex[In, M, i], Greediness, Out, (Out, i) => Out)] ~> λ[a => R],
145+
repeat: λ[i => (Regex[In, M, i], Int, Option[Int], Greediness, Out, (Out, i) => Out)] ~> λ[
146+
a => R],
112147
mapped: λ[a => (Regex[In, M, a], a => Out)] ~> λ[a => R],
113148
or: NonEmptyList[Regex[In, M, Out]] => R,
114149
void: Is[Unit, Out] => Regex[In, M, ?] ~> λ[a => R]
@@ -117,6 +152,7 @@ object Regex {
117152
case Or(alternatives) => or(alternatives)
118153
case e: Elem[In, M, Out] => elem(e.metadata, e.apply)
119154
case Star(r, g, z, f) => star((r, g, z, f))
155+
case Repeat(r, min, max, g, z, f) => repeat((r, min, max, g, z, f))
120156
case FMap(r, f) => mapped((r, f))
121157
case Eps => eps(Is.refl[Unit])
122158
case Fail() => fail()
@@ -152,7 +188,7 @@ object Regex {
152188
// TODO Stream is deprecated in 2.13, right?
153189
// TODO use Cont/ContT?
154190
// TODO return a custom type?
155-
def compileCont[In, M, A, R](
191+
private def compileCont[In, M, A, R](
156192
re: Regex[In, (ThreadId, M), A]): Cont[A => Stream[Thread[In, R]]] => Stream[Thread[In, R]] = {
157193
type ContOut = Cont[A => Stream[Thread[In, R]]] => Stream[Thread[In, R]]
158194
Regex.fold[In, (ThreadId, M), A, ContOut](
@@ -200,6 +236,18 @@ object Regex {
200236
threads(z, _)
201237
}
202238
},
239+
repeat = new (λ[i => (
240+
Regex[In, (ThreadId, M), i],
241+
Int,
242+
Option[Int],
243+
Greediness,
244+
A,
245+
(A, i) => A)] ~> λ[a => ContOut]) {
246+
def apply[i](
247+
t: (Regex[In, (ThreadId, M), i], Int, Option[Int], Greediness, A, (A, i) => A)): ContOut =
248+
sys.error(
249+
"compileCont called with a Repeat instance that hadn't been expanded. This should never happen.")
250+
},
203251
mapped = new (λ[a => (Regex[In, (ThreadId, M), a], a => A)] ~> λ[a => ContOut]) {
204252
def apply[i](t: (Regex[In, (ThreadId, M), i], i => A)): ContOut = {
205253
val rc = compileCont[In, M, i, R](t._1)
@@ -216,14 +264,42 @@ object Regex {
216264
)(re)
217265
}
218266

267+
private def expandRepeat[In, M]: Regex[In, M, ?] ~> Regex[In, M, ?] =
268+
new (Regex[In, M, ?] ~> Regex[In, M, ?]) {
269+
def apply[A](r: Regex[In, M, A]): Regex[In, M, A] = r match {
270+
case r: Repeat[_, _, _, _] => r.expand
271+
case r => r
272+
}
273+
}
274+
219275
def compile[In, M, Out](r: Regex[In, M, Out]): ParseState[In, Out] = {
220276
val threads =
221277
Regex
222-
.compileCont(assignThreadIds(r))
278+
.compileCont(assignThreadIds(transformRecursive(expandRepeat)(r)))
223279
.apply(Cont.Single((out: Out) => Stream(Thread.Accept[In, Out](out))))
224280
ParseState.fromThreads(threads)
225281
}
226282

283+
def transformRecursive[In, M](
284+
f: Regex[In, M, ?] ~> Regex[In, M, ?]): Regex[In, M, ?] ~> Regex[In, M, ?] =
285+
new (Regex[In, M, ?] ~> Regex[In, M, ?]) {
286+
def apply[A](fa: Regex[In, M, A]): Regex[In, M, A] = fa match {
287+
case Eps => f(Eps)
288+
case x @ Fail() => f(x)
289+
case x: Elem[_, _, _] => f(x)
290+
case AndThen(l, r) => f(AndThen(apply(l), apply(r)))
291+
case Or(alternatives) => f(Or(alternatives.map(apply(_))))
292+
case FMap(r, g) => f(FMap(apply(r), g))
293+
case Star(r, greediness, z, fold) => f(Star(apply(r), greediness, z, fold))
294+
case Repeat(r, minInclusive, maxInclusive, greediness, z, fold) =>
295+
f(Repeat(apply(r), minInclusive, maxInclusive, greediness, z, fold))
296+
case Void(r) => f(Void(apply(r)))
297+
}
298+
}
299+
300+
private def expandedCount[In, M, Out](n: Int, r: Regex[In, M, Out]): Regex[In, M, Chain[Out]] =
301+
Chain.fromSeq(1 to n).traverse(_ => r)
302+
227303
// TODO optimize
228304
// TODO naming/documentation
229305
def matcher[F[_]: Foldable, In, M, Out](r: Regex[In, M, Out]): F[In] => Boolean = {

regex/src/main/scala/RegexOps.scala

+2-1
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,8 @@ final class RegexOps[In, M, Out](private val r: Regex[In, M, Out]) extends AnyVa
1111
def either[Out2](o: Regex[In, M, Out2]): Regex[In, M, Either[Out, Out2]] =
1212
combinator.either(r, o)
1313

14-
def star[Out2](g: Greediness, z: Out2)(fold: (Out2, Out) => Out2) = combinator.star(r, g, z)(fold)
14+
def starFold[Out2](g: Greediness, z: Out2)(fold: (Out2, Out) => Out2) =
15+
combinator.starFold(r, g, z)(fold)
1516

1617
def matcher[F[_]: Foldable]: F[In] => Boolean = Regex.matcher(r)
1718

0 commit comments

Comments
 (0)