Skip to content

Commit

Permalink
feat: add pcre extended mode
Browse files Browse the repository at this point in the history
  • Loading branch information
bergmannjg committed Jul 21, 2024
1 parent abc349c commit 0f11bc6
Show file tree
Hide file tree
Showing 14 changed files with 333 additions and 191 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ghpages.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ jobs:
uses: actions/checkout@v4
- id: lean-action
name: Build and test
uses: leanprover/lean-action@v1-beta
uses: leanprover/lean-action@v1
with:
lint-module: Regex
use-github-cache: false
Expand Down
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,5 @@ benchmark/*.txt
x.txt
y.txt
testdata/pcre/JSON-PP-4.16
z.txt
*.log
40 changes: 24 additions & 16 deletions Main.lean
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,12 @@ structure RegexOptions where
wantsHelp : Bool := false
all : Bool := false
unescape : Bool := false
path : Option String := none
haystackPath : Option String := none
patternPath : Option String := none
unanchoredPrefix : Bool := true
unanchoredPrefixSimulation : Bool := false
flavor : Syntax.Flavor := Syntax.Flavor.Pcre
experimental : Bool := false
extended : Regex.Grammar.ExtendedKind := .None

abbrev CliMainM := ExceptT CliError MainM
abbrev CliStateM := StateT RegexOptions CliMainM
Expand All @@ -51,13 +52,15 @@ def takeOptArg (opt arg : String) : CliM String := do

def regexShortOption : (opt : Char) → CliM PUnit
| 'a' => modifyThe RegexOptions ({· with all := true})
| 'f' => do let path ← takeOptArg "-f" "path"; modifyThe RegexOptions ({· with path})
| 'f' => do let haystackPath ← takeOptArg "-f" "path"; modifyThe RegexOptions ({· with haystackPath})
| 'F' => do let patternPath ← takeOptArg "-F" "path"; modifyThe RegexOptions ({· with patternPath})
| 'h' => modifyThe RegexOptions ({· with wantsHelp := true})
| 'n' => modifyThe RegexOptions ({· with unanchoredPrefix := false})
| 's' => modifyThe RegexOptions ({· with unanchoredPrefixSimulation := true})
| 'u' => modifyThe RegexOptions ({· with unescape := true})
| 'v' => modifyThe RegexOptions ({· with verbosity := Lake.Verbosity.verbose})
| 'x' => modifyThe RegexOptions ({· with experimental := true})
| 'x' => modifyThe RegexOptions ({· with extended := .Extended})
| 'X' => modifyThe RegexOptions ({· with extended := .ExtendedMore})
| opt => throw <| CliError.unknownShortOption opt

def regexLongOption : (opt : String) → CliM PUnit
Expand Down Expand Up @@ -125,7 +128,8 @@ def grammar : CliM PUnit := do
let opts ← getThe RegexOptions
match ← takeArg? with
| some re =>
let g ← Regex.Grammar.parse re opts.flavor
let re := if opts.unescape then unescapeString re else re
let g ← Regex.Grammar.parse re opts.flavor opts.extended
IO.println s!"Grammar {opts.flavor}\n{g}"
| none => throw <| CliError.missingArg "re"

Expand All @@ -134,7 +138,7 @@ def grammarT : CliM PUnit := do
let opts ← getThe RegexOptions
match ← takeArg? with
| some re =>
let g ← Regex.Grammar.parse re opts.flavor
let g ← Regex.Grammar.parse re opts.flavor opts.extended
let g' ← Regex.Grammar.translate g
IO.println s!"Grammar translated\n{g'}"
| none => throw <| CliError.missingArg "re"
Expand All @@ -144,8 +148,8 @@ def ast : CliM PUnit := do
let opts ← getThe RegexOptions
match ← takeArg? with
| some re =>
let ast ← AstItems.parse re opts.flavor
IO.println s!"Ast{if opts.experimental then " experimental" else ""} {opts.flavor}\n{ast}"
let ast ← AstItems.parse re opts.flavor opts.extended
IO.println s!"Ast {opts.flavor}\n{ast}"
| none => throw <| CliError.missingArg "re"

def hir : CliM PUnit := do
Expand Down Expand Up @@ -173,26 +177,30 @@ def captures : CliM PUnit := do
let (re, haystack) :=
match ← takeArg?, ← takeArg? with
| some re, some haystack => pure (re, haystack)
| none, _ => throw <| CliError.missingArg "regex"
| some re, none =>
match opts.path with
| some x, none =>
match opts.haystackPath with
| some path => do
let haystack ← IO.FS.readFile path
pure (re, haystack)
| none => throw <| CliError.missingArg "haystack"

pure (x, haystack)
| none =>
match opts.patternPath with
| some path => do
let re ← IO.FS.readFile path
pure (re, x)
| none => throw <| CliError.missingArg "regex or haystack"
| none, _ => throw <| CliError.missingArg "regex and haystack"
let haystack := if opts.unescape then unescapeString haystack else haystack
let isVerbose := opts.verbosity == Lake.Verbosity.verbose
if isVerbose then
let chars := haystack.data |> List.map (fun (c : Char) => (UInt32.intAsString c.val))
IO.println s!"re '{re}' haystack chars '{chars}'"

let flags : Syntax.Flags := default
let config := {(default:Compiler.Config) with
let config := {(default : Compiler.Config) with
unanchored_prefix := opts.unanchoredPrefix
unanchored_prefix_simulation := opts.unanchoredPrefixSimulation }

let regex ← Regex.build re opts.flavor flags config
let regex ← Regex.build re opts.flavor flags config opts.extended
if isVerbose then IO.println s!"nfa {regex.nfa}"
if isVerbose then IO.println s!"nfa unanchored_prefix_in_backtrack {regex.nfa.unanchored_prefix_in_backtrack}"

Expand Down
1 change: 0 additions & 1 deletion Regex.lean
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@ The following features are not yet implemented in the Pcre flavor:
* [conditional patterns](https://pcre2project.github.io/pcre2/doc/html/pcre2syntax.html#TOC1),
* [backtracking control](https://pcre2project.github.io/pcre2/doc/html/pcre2syntax.html#SEC28),
* [variable length look behind](https://pcre2project.github.io/pcre2/doc/html/pcre2syntax.html#SEC22),
* [extended patterns](https://perldoc.perl.org/perlre#Extended-Patterns),
* capture groups with a backreference should have fixed width (a restriction of the backtrack algorithm).
The following features are not implemented in the Rust flavor:
Expand Down
121 changes: 121 additions & 0 deletions Regex/Data/Parsec/Basic.lean
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
import Init.Meta
import Lean.Data.Parsec

open Lean Lean.Parsec Lean.Syntax

/-! Parsec utils for ReaderT and StateT -/
namespace Parsec

/-- extends `Lean.Parsec.attempt` -/
def attemptM (p : (ReaderT ρ $ StateT σ Parsec) α)
: (ReaderT ρ $ StateT σ Parsec) α := λ f s it =>
match p f s it with
| .success rem res => .success rem res
| .error _ err => .error it err

private def orElseT (p : Parsec α) (q : Unit → (ReaderT ρ $ StateT σ Parsec) α)
: (ReaderT ρ $ StateT σ Parsec) α := do
let flavor ← read
let state ← get
Parsec.orElse p (fun () => (·.1) <$> (q ()) flavor state)

instance : HOrElse (Parsec α) ((ReaderT ρ $ StateT σ Parsec) α) ((ReaderT ρ $ StateT σ Parsec) α) where
hOrElse := orElseT

/-- extends `Lean.Parsec.tryCatch` -/
def tryCatchT (p : StateT σ Parsec α)
(csuccess : α → StateT σ Parsec β)
(cerror : Unit → StateT σ Parsec β)
: StateT σ Parsec β := fun s it =>
match p s it with
| .success rem a => csuccess a.1 a.2 rem
| .error rem err =>
-- We assume that it.s never changes as the `Parsec` monad only modifies `it.pos`.
if it.pos = rem.pos then cerror () s rem else .error rem err

private partial def manyCoreT (p : StateT σ Parsec α) (acc : Array α) : StateT σ Parsec $ Array α :=
tryCatchT p (manyCoreT p <| acc.push ·) (fun _ => pure acc)

/-- extends `Lean.Parsec.many` -/
partial def manyM (p : (ReaderT ρ $ StateT σ Parsec) α)
: (ReaderT ρ $ StateT σ Parsec) $ Array α := do
manyCoreT (p (← read)) #[]

def failM {α : Type} (msg : String) : (ReaderT ρ $ StateT σ Parsec) α :=
fail msg

def withPos (p : Parsec α) : Parsec (String.Pos × String.Pos × α) := fun it =>
let pos := it.pos
match p it with
| .success rem a => .success rem (pos, rem.pos, a)
| .error rem err => .error rem err

def withPosM (p : (ReaderT ρ $ StateT σ Parsec) α)
: (ReaderT ρ $ StateT σ Parsec) (String.Pos × String.Pos × α) := fun f s it =>
let pos := it.pos
match p f s it with
| .success rem a => .success rem ((pos, rem.pos, a.1), a.2)
| .error rem err => .error rem err

def withPosSkip : Parsec String.Pos := do
let (_, t, _) ← withPos skip
pure t

def optSkipChar (c : Char) : Parsec Unit := do
match ← peek? with
| some c' => if c = c' then skip *> pure () else pure ()
| none => pure ()

def optSkipString (s : String) : Parsec Unit := do
match ← peek? with
| some _ => skipString s
| none => pure ()

/-- exec `check` on current char -/
def testChar (check : Char -> Bool) : Parsec Bool := do
match ← peek? with
| some c => if check c then pure true else pure false
| none => pure false

/-- exec `check` on current char and consume char on success -/
def tryChar (check : Char -> Bool) : Parsec $ Option Char := do
match ← peek? with
| some c => if check c then pure $ some (← anyChar) else pure none
| none => pure none

/-- exec `check` on current char and skip char on success -/
def trySkipChar (check : Char -> Bool) : Parsec Bool := do
if let some _ ← tryChar check then pure true else pure false

/-- exec `check` on current char and then exec `f` on consumed char on success -/
def tryCharThen (check : Char -> Bool) (f : Char → α) (msg : String := "") : Parsec α := do
if let some c ← tryChar check then pure $ f c
else fail msg

/-- exec `check` on current char and map `f` on consumed char on success -/
def tryCharMap (check : Char -> Bool) (f : Char → α) : Parsec $ Option α := do
if let some c ← tryChar check then pure $ f c
else pure none

/-- exec `check` on current char and then exec `p` on success -/
def tryCharThenPWithPos (check : Char -> Bool) (p : Parsec α)
: Parsec $ Option (String.Pos × String.Pos × α) := do
match ← peek? with
| some c => if check c then pure $ some (← withPos p) else pure none
| none => pure none

/-- exec `check` on current char and then exec `p` on success -/
def tryCharThenPWithPosM (check : Char -> Bool) (p : (ReaderT ρ $ StateT σ Parsec) α)
: (ReaderT ρ $ StateT σ Parsec) $ Option (String.Pos × String.Pos × α) := do
match ← peek? with
| some c => if check c then pure $ some (← withPosM (p (← read))) else pure none
| none => pure none

def tryCharWithPos (check : Char -> Bool)
: Parsec $ Option (String.Pos × String.Pos × Char) := do
tryCharThenPWithPos check anyChar

def tryCharWithPosMap (check : Char -> Bool) (f : Char → String.Pos → String.Pos → α)
: Parsec $ Option α := do
if let some (p1, p2, c) ← tryCharWithPos check then pure $ f c p1 p2
else pure none
5 changes: 2 additions & 3 deletions Regex/Regex.lean
Original file line number Diff line number Diff line change
Expand Up @@ -57,10 +57,9 @@ instance : ToString Captures where
/-- Build a Regex from the given pattern. -/
def build (s : String) (flavor : Syntax.Flavor := default)
(flags : Syntax.Flags := default) (config : Compiler.Config := default)
(extended : Regex.Grammar.ExtendedKind := .None)
: Except String Regex := do
if flags.extended.getD false
then throw (Syntax.AstItems.toError s .FeatureNotImplementedFlagExtended) else
let nfa ← Syntax.AstItems.parse s flavor >>= Syntax.translate flags >>= Compiler.compile config
let nfa ← Syntax.AstItems.parse s flavor extended >>= Syntax.translate flags >>= Compiler.compile config
Except.ok ⟨nfa⟩

namespace Log
Expand Down
4 changes: 2 additions & 2 deletions Regex/Syntax/Ast/Ast.lean
Original file line number Diff line number Diff line change
Expand Up @@ -365,7 +365,7 @@ inductive Flag where
/-- `R` -/
| CRLF
/-- `x` -/
| IgnoreWhitespace
| Extended

instance : ToString Flag where
toString s := match s with
Expand All @@ -375,7 +375,7 @@ instance : ToString Flag where
| .SwapGreed => "SwapGreed"
| .Unicode => "Unicode"
| .CRLF => "CRLF"
| .IgnoreWhitespace => "IgnoreWhitespace"
| .Extended => "Extended"

/-- The kind of an item in a group of flags. -/
inductive FlagsItemKind where
Expand Down
57 changes: 25 additions & 32 deletions Regex/Syntax/Ast/Parser.lean
Original file line number Diff line number Diff line change
Expand Up @@ -59,39 +59,31 @@ private def parseHyphen (p : String) (x : Syntax) : ParserM Ast := do
pure $ .Literal $ Literal.toLiteral c p f t
| _ => Except.error s!"ill-formed literal syntax {x}"

private def nameToClassSetItems : List (String × ClassAsciiKind × Bool) :=
private def nameToClassSetItems : List (String × ClassAsciiKind) :=
[
("[:alnum:]", .Alnum, false),
("[:alpha:]", .Alpha, false),
("[:ascii:]", .Ascii, false),
("[:blank:]", .Blank, false),
("[:cntrl:]", .Cntrl, false),
("[:digit:]", .Digit, false),
("[:lower:]", .Lower, false),
("[:print:]", .Print, false),
("[:punct:]", .Punct, false),
("[:space:]", .Space, false),
("[:upper:]", .Upper, false),
("[:word:]", .Word, false),
("[:^alnum:]", .Alnum, true),
("[:^alpha:]", .Alpha, true),
("[:^ascii:]", .Ascii, true),
("[:^blank:]", .Blank, true),
("[:^cntrl:]", .Cntrl, true),
("[:^digit:]", .Digit, true),
("[:^lower:]", .Lower, true),
("[:^print:]", .Print, true),
("[:^punct:]", .Punct, true),
("[:^space:]", .Space, true),
("[:^upper:]", .Upper, true),
("[:^word:]", .Word, true)
("alnum", .Alnum),
("alpha", .Alpha),
("ascii", .Ascii),
("blank", .Blank),
("cntrl", .Cntrl),
("digit", .Digit),
("lower", .Lower),
("print", .Print),
("punct", .Punct),
("space", .Space),
("upper", .Upper),
("word", .Word),
]

private def parsePosixCharacterClass (p : String) (x : Syntax) : ParserM ClassSetItem := do
match x with
| Syntax.node _ `posixCharacterClass #[Lean.Syntax.atom (.synthetic f t) name] =>
let (negated, name) :=
match name.data with
| '^' :: tail => (true, String.mk tail)
| _ => (false, name)
match nameToClassSetItems |> List.find? (fun (n, _) => n = name) with
| some (_, cls, negated) => pure $ ClassSetItem.Ascii ⟨⟨p, f, t⟩ , cls, negated⟩
| some (_, cls) => pure $ ClassSetItem.Ascii ⟨⟨p, f, t⟩ , cls, negated⟩
| none => Except.error s!"unkown posixCharacterClass {name}"
| _ => Except.error s!"ill-formed literal syntax {x}"

Expand Down Expand Up @@ -211,14 +203,13 @@ private def toFlags (chars : List Char) : ParserM $ Array FlagsItem := do
let items ← chars |> List.mapM (fun c =>
match c with
| '-' => pure ⟨"", FlagsItemKind.Negation⟩
| '^' => pure ⟨"", FlagsItemKind.Negation⟩
| 'i' => pure ⟨"", FlagsItemKind.Flag .CaseInsensitive⟩
| 'm' => pure ⟨"", FlagsItemKind.Flag .MultiLine⟩
| 's' => pure ⟨"", FlagsItemKind.Flag .DotMatchesNewLine⟩
| 'u' => pure ⟨"", FlagsItemKind.Flag .Unicode⟩
| 'U' => pure ⟨"", FlagsItemKind.Flag .SwapGreed⟩
| 'R' => pure ⟨"", FlagsItemKind.Flag .CRLF⟩
| 'x' => throw (toError "" .FeatureNotImplementedFlagExtended)
| 'x' => pure ⟨"", FlagsItemKind.Flag .Extended⟩
| _ => throw (toError "" .FlagUnrecognized))
pure items.toArray

Expand Down Expand Up @@ -315,7 +306,7 @@ private def set_width (pattern : String) (g : Group) : ParserM Group := do
pure (Group.mk span (.Lookaround (.NegativeLookbehind width)) ast)
| _ => pure g

def unfoldUnion (union : ClassSetUnion) : Sum ClassSetItem ClassSetBinaryOp :=
private def unfoldUnion (union : ClassSetUnion) : Sum ClassSetItem ClassSetBinaryOp :=
let ⟨span, items⟩ := union
match items.size with
| 0 => Sum.inl $ ClassSetItem.Empty span
Expand All @@ -330,6 +321,7 @@ mutual

private def parseClassSetItem (p : String) (x : Syntax) : ParserM ClassSetItem := do
match x with
| Syntax.node _ `whitespace _ => pure $ ClassSetItem.Empty ""
| Syntax.node _ `literal _ => parseLiteral p x >>= toClassSetItem
| Syntax.node _ `hyphen _ => parseHyphen p x >>= toClassSetItem
| Syntax.node _ `range #[a, b] => rangeToClassSetItem p a b
Expand All @@ -346,6 +338,8 @@ termination_by sizeOf x
private def parseVal (p : String) (x : Syntax)
: ParserM Ast := do
match x.getKind with
| `whitespace => pure Ast.Empty
| `comment => pure Ast.Empty
| `literal => parseLiteral p x
| `dot => parseDot p x
| `group =>
Expand Down Expand Up @@ -488,10 +482,9 @@ private def parseSequence' (p : String) (x : Syntax) : ParserM Ast := do
| Syntax.node _ `sequence arr => parseSequence p arr
| _ => Except.error s!"ill-formed sequence syntax {x}"


/-- Parse the regular expression and return an abstract syntax tree. -/
def parse (p : String) (flavor : Syntax.Flavor) : Except String Ast := do
let stx ← Regex.Grammar.parse p flavor >>= Regex.Grammar.translate
def parse (p : String) (flavor : Syntax.Flavor) (extended : Regex.Grammar.ExtendedKind := .None) : Except String Ast := do
let stx ← Regex.Grammar.parse p flavor extended >>= Regex.Grammar.translate
let (ast, parser) ← parseSequence' p stx.raw flavor default

if parser.capture_index < parser.max_backreference
Expand Down
Loading

0 comments on commit 0f11bc6

Please sign in to comment.