From 0f11bc6704c53e51047c65219a4041d18dda1cd4 Mon Sep 17 00:00:00 2001 From: "J. Bergmann" Date: Sun, 21 Jul 2024 15:26:15 +0200 Subject: [PATCH] feat: add pcre extended mode --- .github/workflows/ghpages.yml | 2 +- .gitignore | 2 + Main.lean | 40 +++--- Regex.lean | 1 - Regex/Data/Parsec/Basic.lean | 121 ++++++++++++++++ Regex/Regex.lean | 5 +- Regex/Syntax/Ast/Ast.lean | 4 +- Regex/Syntax/Ast/Parser.lean | 57 ++++---- Regex/Syntax/Grammar/Grammar.lean | 231 ++++++++++++++++-------------- Regex/Syntax/Hir.lean | 3 +- Regex/Syntax/Translate.lean | 2 +- scripts/nolints.json | 12 +- test/PcreLoader.lean | 33 +++-- test/RegexTest.lean | 11 +- 14 files changed, 333 insertions(+), 191 deletions(-) create mode 100644 Regex/Data/Parsec/Basic.lean diff --git a/.github/workflows/ghpages.yml b/.github/workflows/ghpages.yml index 8f0ea47..3d3563e 100644 --- a/.github/workflows/ghpages.yml +++ b/.github/workflows/ghpages.yml @@ -28,7 +28,7 @@ jobs: uses: actions/checkout@v4 - id: lean-action name: Build and test - uses: leanprover/lean-action@v1-beta + uses: leanprover/lean-action@v1 with: lint-module: Regex use-github-cache: false diff --git a/.gitignore b/.gitignore index 3bee624..fbfafd0 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,5 @@ benchmark/*.txt x.txt y.txt testdata/pcre/JSON-PP-4.16 +z.txt +*.log diff --git a/Main.lean b/Main.lean index 09f5056..d44f498 100644 --- a/Main.lean +++ b/Main.lean @@ -23,11 +23,12 @@ structure RegexOptions where wantsHelp : Bool := false all : Bool := false unescape : Bool := false - path : Option String := none + haystackPath : Option String := none + patternPath : Option String := none unanchoredPrefix : Bool := true unanchoredPrefixSimulation : Bool := false flavor : Syntax.Flavor := Syntax.Flavor.Pcre - experimental : Bool := false + extended : Regex.Grammar.ExtendedKind := .None abbrev CliMainM := ExceptT CliError MainM abbrev CliStateM := StateT RegexOptions CliMainM @@ -51,13 +52,15 @@ def takeOptArg (opt arg : String) : CliM String := do def regexShortOption : (opt : Char) → CliM PUnit | 'a' => modifyThe RegexOptions ({· with all := true}) -| 'f' => do let path ← takeOptArg "-f" "path"; modifyThe RegexOptions ({· with path}) +| 'f' => do let haystackPath ← takeOptArg "-f" "path"; modifyThe RegexOptions ({· with haystackPath}) +| 'F' => do let patternPath ← takeOptArg "-F" "path"; modifyThe RegexOptions ({· with patternPath}) | 'h' => modifyThe RegexOptions ({· with wantsHelp := true}) | 'n' => modifyThe RegexOptions ({· with unanchoredPrefix := false}) | 's' => modifyThe RegexOptions ({· with unanchoredPrefixSimulation := true}) | 'u' => modifyThe RegexOptions ({· with unescape := true}) | 'v' => modifyThe RegexOptions ({· with verbosity := Lake.Verbosity.verbose}) -| 'x' => modifyThe RegexOptions ({· with experimental := true}) +| 'x' => modifyThe RegexOptions ({· with extended := .Extended}) +| 'X' => modifyThe RegexOptions ({· with extended := .ExtendedMore}) | opt => throw <| CliError.unknownShortOption opt def regexLongOption : (opt : String) → CliM PUnit @@ -125,7 +128,8 @@ def grammar : CliM PUnit := do let opts ← getThe RegexOptions match ← takeArg? with | some re => - let g ← Regex.Grammar.parse re opts.flavor + let re := if opts.unescape then unescapeString re else re + let g ← Regex.Grammar.parse re opts.flavor opts.extended IO.println s!"Grammar {opts.flavor}\n{g}" | none => throw <| CliError.missingArg "re" @@ -134,7 +138,7 @@ def grammarT : CliM PUnit := do let opts ← getThe RegexOptions match ← takeArg? with | some re => - let g ← Regex.Grammar.parse re opts.flavor + let g ← Regex.Grammar.parse re opts.flavor opts.extended let g' ← Regex.Grammar.translate g IO.println s!"Grammar translated\n{g'}" | none => throw <| CliError.missingArg "re" @@ -144,8 +148,8 @@ def ast : CliM PUnit := do let opts ← getThe RegexOptions match ← takeArg? with | some re => - let ast ← AstItems.parse re opts.flavor - IO.println s!"Ast{if opts.experimental then " experimental" else ""} {opts.flavor}\n{ast}" + let ast ← AstItems.parse re opts.flavor opts.extended + IO.println s!"Ast {opts.flavor}\n{ast}" | none => throw <| CliError.missingArg "re" def hir : CliM PUnit := do @@ -173,14 +177,18 @@ def captures : CliM PUnit := do let (re, haystack) := ← match ← takeArg?, ← takeArg? with | some re, some haystack => pure (re, haystack) - | none, _ => throw <| CliError.missingArg "regex" - | some re, none => - match opts.path with + | some x, none => + match opts.haystackPath with | some path => do let haystack ← IO.FS.readFile path - pure (re, haystack) - | none => throw <| CliError.missingArg "haystack" - + pure (x, haystack) + | none => + match opts.patternPath with + | some path => do + let re ← IO.FS.readFile path + pure (re, x) + | none => throw <| CliError.missingArg "regex or haystack" + | none, _ => throw <| CliError.missingArg "regex and haystack" let haystack := if opts.unescape then unescapeString haystack else haystack let isVerbose := opts.verbosity == Lake.Verbosity.verbose if isVerbose then @@ -188,11 +196,11 @@ def captures : CliM PUnit := do IO.println s!"re '{re}' haystack chars '{chars}'" let flags : Syntax.Flags := default - let config := {(default:Compiler.Config) with + let config := {(default : Compiler.Config) with unanchored_prefix := opts.unanchoredPrefix unanchored_prefix_simulation := opts.unanchoredPrefixSimulation } - let regex ← Regex.build re opts.flavor flags config + let regex ← Regex.build re opts.flavor flags config opts.extended if isVerbose then IO.println s!"nfa {regex.nfa}" if isVerbose then IO.println s!"nfa unanchored_prefix_in_backtrack {regex.nfa.unanchored_prefix_in_backtrack}" diff --git a/Regex.lean b/Regex.lean index 79e750f..37870ab 100644 --- a/Regex.lean +++ b/Regex.lean @@ -37,7 +37,6 @@ The following features are not yet implemented in the Pcre flavor: * [conditional patterns](https://pcre2project.github.io/pcre2/doc/html/pcre2syntax.html#TOC1), * [backtracking control](https://pcre2project.github.io/pcre2/doc/html/pcre2syntax.html#SEC28), * [variable length look behind](https://pcre2project.github.io/pcre2/doc/html/pcre2syntax.html#SEC22), -* [extended patterns](https://perldoc.perl.org/perlre#Extended-Patterns), * capture groups with a backreference should have fixed width (a restriction of the backtrack algorithm). The following features are not implemented in the Rust flavor: diff --git a/Regex/Data/Parsec/Basic.lean b/Regex/Data/Parsec/Basic.lean new file mode 100644 index 0000000..2ef8f49 --- /dev/null +++ b/Regex/Data/Parsec/Basic.lean @@ -0,0 +1,121 @@ +import Init.Meta +import Lean.Data.Parsec + +open Lean Lean.Parsec Lean.Syntax + +/-! Parsec utils for ReaderT and StateT -/ +namespace Parsec + +/-- extends `Lean.Parsec.attempt` -/ +def attemptM (p : (ReaderT ρ $ StateT σ Parsec) α) + : (ReaderT ρ $ StateT σ Parsec) α := λ f s it => + match p f s it with + | .success rem res => .success rem res + | .error _ err => .error it err + +private def orElseT (p : Parsec α) (q : Unit → (ReaderT ρ $ StateT σ Parsec) α) + : (ReaderT ρ $ StateT σ Parsec) α := do + let flavor ← read + let state ← get + Parsec.orElse p (fun () => (·.1) <$> (q ()) flavor state) + +instance : HOrElse (Parsec α) ((ReaderT ρ $ StateT σ Parsec) α) ((ReaderT ρ $ StateT σ Parsec) α) where + hOrElse := orElseT + +/-- extends `Lean.Parsec.tryCatch` -/ +def tryCatchT (p : StateT σ Parsec α) + (csuccess : α → StateT σ Parsec β) + (cerror : Unit → StateT σ Parsec β) + : StateT σ Parsec β := fun s it => + match p s it with + | .success rem a => csuccess a.1 a.2 rem + | .error rem err => + -- We assume that it.s never changes as the `Parsec` monad only modifies `it.pos`. + if it.pos = rem.pos then cerror () s rem else .error rem err + +private partial def manyCoreT (p : StateT σ Parsec α) (acc : Array α) : StateT σ Parsec $ Array α := + tryCatchT p (manyCoreT p <| acc.push ·) (fun _ => pure acc) + +/-- extends `Lean.Parsec.many` -/ +partial def manyM (p : (ReaderT ρ $ StateT σ Parsec) α) + : (ReaderT ρ $ StateT σ Parsec) $ Array α := do + manyCoreT (p (← read)) #[] + +def failM {α : Type} (msg : String) : (ReaderT ρ $ StateT σ Parsec) α := + fail msg + +def withPos (p : Parsec α) : Parsec (String.Pos × String.Pos × α) := fun it => + let pos := it.pos + match p it with + | .success rem a => .success rem (pos, rem.pos, a) + | .error rem err => .error rem err + +def withPosM (p : (ReaderT ρ $ StateT σ Parsec) α) + : (ReaderT ρ $ StateT σ Parsec) (String.Pos × String.Pos × α) := fun f s it => + let pos := it.pos + match p f s it with + | .success rem a => .success rem ((pos, rem.pos, a.1), a.2) + | .error rem err => .error rem err + +def withPosSkip : Parsec String.Pos := do + let (_, t, _) ← withPos skip + pure t + +def optSkipChar (c : Char) : Parsec Unit := do + match ← peek? with + | some c' => if c = c' then skip *> pure () else pure () + | none => pure () + +def optSkipString (s : String) : Parsec Unit := do + match ← peek? with + | some _ => skipString s + | none => pure () + +/-- exec `check` on current char -/ +def testChar (check : Char -> Bool) : Parsec Bool := do + match ← peek? with + | some c => if check c then pure true else pure false + | none => pure false + +/-- exec `check` on current char and consume char on success -/ +def tryChar (check : Char -> Bool) : Parsec $ Option Char := do + match ← peek? with + | some c => if check c then pure $ some (← anyChar) else pure none + | none => pure none + +/-- exec `check` on current char and skip char on success -/ +def trySkipChar (check : Char -> Bool) : Parsec Bool := do + if let some _ ← tryChar check then pure true else pure false + +/-- exec `check` on current char and then exec `f` on consumed char on success -/ +def tryCharThen (check : Char -> Bool) (f : Char → α) (msg : String := "") : Parsec α := do + if let some c ← tryChar check then pure $ f c + else fail msg + +/-- exec `check` on current char and map `f` on consumed char on success -/ +def tryCharMap (check : Char -> Bool) (f : Char → α) : Parsec $ Option α := do + if let some c ← tryChar check then pure $ f c + else pure none + +/-- exec `check` on current char and then exec `p` on success -/ +def tryCharThenPWithPos (check : Char -> Bool) (p : Parsec α) + : Parsec $ Option (String.Pos × String.Pos × α) := do + match ← peek? with + | some c => if check c then pure $ some (← withPos p) else pure none + | none => pure none + +/-- exec `check` on current char and then exec `p` on success -/ +def tryCharThenPWithPosM (check : Char -> Bool) (p : (ReaderT ρ $ StateT σ Parsec) α) + : (ReaderT ρ $ StateT σ Parsec) $ Option (String.Pos × String.Pos × α) := do + match ← peek? with + | some c => if check c then pure $ some (← withPosM (p (← read))) else pure none + | none => pure none + +def tryCharWithPos (check : Char -> Bool) + : Parsec $ Option (String.Pos × String.Pos × Char) := do + tryCharThenPWithPos check anyChar + +def tryCharWithPosMap (check : Char -> Bool) (f : Char → String.Pos → String.Pos → α) + : Parsec $ Option α := do + if let some (p1, p2, c) ← tryCharWithPos check then pure $ f c p1 p2 + else pure none diff --git a/Regex/Regex.lean b/Regex/Regex.lean index e1746d8..9815e66 100644 --- a/Regex/Regex.lean +++ b/Regex/Regex.lean @@ -57,10 +57,9 @@ instance : ToString Captures where /-- Build a Regex from the given pattern. -/ def build (s : String) (flavor : Syntax.Flavor := default) (flags : Syntax.Flags := default) (config : Compiler.Config := default) + (extended : Regex.Grammar.ExtendedKind := .None) : Except String Regex := do - if flags.extended.getD false - then throw (Syntax.AstItems.toError s .FeatureNotImplementedFlagExtended) else - let nfa ← Syntax.AstItems.parse s flavor >>= Syntax.translate flags >>= Compiler.compile config + let nfa ← Syntax.AstItems.parse s flavor extended >>= Syntax.translate flags >>= Compiler.compile config Except.ok ⟨nfa⟩ namespace Log diff --git a/Regex/Syntax/Ast/Ast.lean b/Regex/Syntax/Ast/Ast.lean index 5f205e1..5b8f0fe 100644 --- a/Regex/Syntax/Ast/Ast.lean +++ b/Regex/Syntax/Ast/Ast.lean @@ -365,7 +365,7 @@ inductive Flag where /-- `R` -/ | CRLF /-- `x` -/ - | IgnoreWhitespace + | Extended instance : ToString Flag where toString s := match s with @@ -375,7 +375,7 @@ instance : ToString Flag where | .SwapGreed => "SwapGreed" | .Unicode => "Unicode" | .CRLF => "CRLF" - | .IgnoreWhitespace => "IgnoreWhitespace" + | .Extended => "Extended" /-- The kind of an item in a group of flags. -/ inductive FlagsItemKind where diff --git a/Regex/Syntax/Ast/Parser.lean b/Regex/Syntax/Ast/Parser.lean index e90835a..0be0c92 100644 --- a/Regex/Syntax/Ast/Parser.lean +++ b/Regex/Syntax/Ast/Parser.lean @@ -59,39 +59,31 @@ private def parseHyphen (p : String) (x : Syntax) : ParserM Ast := do pure $ .Literal $ Literal.toLiteral c p f t | _ => Except.error s!"ill-formed literal syntax {x}" -private def nameToClassSetItems : List (String × ClassAsciiKind × Bool) := +private def nameToClassSetItems : List (String × ClassAsciiKind) := [ - ("[:alnum:]", .Alnum, false), - ("[:alpha:]", .Alpha, false), - ("[:ascii:]", .Ascii, false), - ("[:blank:]", .Blank, false), - ("[:cntrl:]", .Cntrl, false), - ("[:digit:]", .Digit, false), - ("[:lower:]", .Lower, false), - ("[:print:]", .Print, false), - ("[:punct:]", .Punct, false), - ("[:space:]", .Space, false), - ("[:upper:]", .Upper, false), - ("[:word:]", .Word, false), - ("[:^alnum:]", .Alnum, true), - ("[:^alpha:]", .Alpha, true), - ("[:^ascii:]", .Ascii, true), - ("[:^blank:]", .Blank, true), - ("[:^cntrl:]", .Cntrl, true), - ("[:^digit:]", .Digit, true), - ("[:^lower:]", .Lower, true), - ("[:^print:]", .Print, true), - ("[:^punct:]", .Punct, true), - ("[:^space:]", .Space, true), - ("[:^upper:]", .Upper, true), - ("[:^word:]", .Word, true) + ("alnum", .Alnum), + ("alpha", .Alpha), + ("ascii", .Ascii), + ("blank", .Blank), + ("cntrl", .Cntrl), + ("digit", .Digit), + ("lower", .Lower), + ("print", .Print), + ("punct", .Punct), + ("space", .Space), + ("upper", .Upper), + ("word", .Word), ] private def parsePosixCharacterClass (p : String) (x : Syntax) : ParserM ClassSetItem := do match x with | Syntax.node _ `posixCharacterClass #[Lean.Syntax.atom (.synthetic f t) name] => + let (negated, name) := + match name.data with + | '^' :: tail => (true, String.mk tail) + | _ => (false, name) match nameToClassSetItems |> List.find? (fun (n, _) => n = name) with - | some (_, cls, negated) => pure $ ClassSetItem.Ascii ⟨⟨p, f, t⟩ , cls, negated⟩ + | some (_, cls) => pure $ ClassSetItem.Ascii ⟨⟨p, f, t⟩ , cls, negated⟩ | none => Except.error s!"unkown posixCharacterClass {name}" | _ => Except.error s!"ill-formed literal syntax {x}" @@ -211,14 +203,13 @@ private def toFlags (chars : List Char) : ParserM $ Array FlagsItem := do let items ← chars |> List.mapM (fun c => match c with | '-' => pure ⟨"", FlagsItemKind.Negation⟩ - | '^' => pure ⟨"", FlagsItemKind.Negation⟩ | 'i' => pure ⟨"", FlagsItemKind.Flag .CaseInsensitive⟩ | 'm' => pure ⟨"", FlagsItemKind.Flag .MultiLine⟩ | 's' => pure ⟨"", FlagsItemKind.Flag .DotMatchesNewLine⟩ | 'u' => pure ⟨"", FlagsItemKind.Flag .Unicode⟩ | 'U' => pure ⟨"", FlagsItemKind.Flag .SwapGreed⟩ | 'R' => pure ⟨"", FlagsItemKind.Flag .CRLF⟩ - | 'x' => throw (toError "" .FeatureNotImplementedFlagExtended) + | 'x' => pure ⟨"", FlagsItemKind.Flag .Extended⟩ | _ => throw (toError "" .FlagUnrecognized)) pure items.toArray @@ -315,7 +306,7 @@ private def set_width (pattern : String) (g : Group) : ParserM Group := do pure (Group.mk span (.Lookaround (.NegativeLookbehind width)) ast) | _ => pure g -def unfoldUnion (union : ClassSetUnion) : Sum ClassSetItem ClassSetBinaryOp := +private def unfoldUnion (union : ClassSetUnion) : Sum ClassSetItem ClassSetBinaryOp := let ⟨span, items⟩ := union match items.size with | 0 => Sum.inl $ ClassSetItem.Empty span @@ -330,6 +321,7 @@ mutual private def parseClassSetItem (p : String) (x : Syntax) : ParserM ClassSetItem := do match x with + | Syntax.node _ `whitespace _ => pure $ ClassSetItem.Empty "" | Syntax.node _ `literal _ => parseLiteral p x >>= toClassSetItem | Syntax.node _ `hyphen _ => parseHyphen p x >>= toClassSetItem | Syntax.node _ `range #[a, b] => rangeToClassSetItem p a b @@ -346,6 +338,8 @@ termination_by sizeOf x private def parseVal (p : String) (x : Syntax) : ParserM Ast := do match x.getKind with + | `whitespace => pure Ast.Empty + | `comment => pure Ast.Empty | `literal => parseLiteral p x | `dot => parseDot p x | `group => @@ -488,10 +482,9 @@ private def parseSequence' (p : String) (x : Syntax) : ParserM Ast := do | Syntax.node _ `sequence arr => parseSequence p arr | _ => Except.error s!"ill-formed sequence syntax {x}" - /-- Parse the regular expression and return an abstract syntax tree. -/ -def parse (p : String) (flavor : Syntax.Flavor) : Except String Ast := do - let stx ← Regex.Grammar.parse p flavor >>= Regex.Grammar.translate +def parse (p : String) (flavor : Syntax.Flavor) (extended : Regex.Grammar.ExtendedKind := .None) : Except String Ast := do + let stx ← Regex.Grammar.parse p flavor extended >>= Regex.Grammar.translate let (ast, parser) ← parseSequence' p stx.raw flavor default if parser.capture_index < parser.max_backreference diff --git a/Regex/Syntax/Grammar/Grammar.lean b/Regex/Syntax/Grammar/Grammar.lean index 1bfb5fc..9695d3d 100644 --- a/Regex/Syntax/Grammar/Grammar.lean +++ b/Regex/Syntax/Grammar/Grammar.lean @@ -1,100 +1,29 @@ import Init.Meta import Lean.Data.Parsec +import Regex.Data.Parsec.Basic import Regex.Syntax.Flavor -open Lean Lean.Parsec Lean.Syntax +open Lean Lean.Parsec Lean.Syntax Parsec /-! Parse a regular expressions into a `Lean.Syntax` tree according to the `Syntax.Flavor`. -/ namespace Regex.Grammar -abbrev ParsecM (α : Type) := ReaderT Syntax.Flavor Parsec α +/-- https://www.pcre.org/current/doc/html/pcre2pattern.html#internaloptions -/ +inductive ExtendedKind where + | None + | Extended + | ExtendedMore +deriving BEq, Repr -/-! Parsec Utils -/ -namespace Utils +/-- State of the parser -/ +private structure Parser where + /-- Flag `x` or 'xx' -/ + extended : ExtendedKind -private def orElse (p : Parsec Syntax) (q : Unit → ParsecM Syntax) : ParsecM Syntax := do - let flavor ← read - Parsec.orElse p (fun () => q () flavor) +instance : Inhabited Parser := ⟨⟨.None⟩⟩ -instance : HOrElse (Parsec Syntax) (ParsecM Syntax) (ParsecM Syntax) where - hOrElse := orElse - -private def attemptM (p : ParsecM α) : ParsecM α := do - attempt (p (← read)) - -private def failM {α : Type} (msg : String) : ParsecM α := - fail msg - -private def withPos (p : Parsec α) : Parsec (String.Pos × String.Pos × α) := fun it => - let pos := it.pos - match p it with - | .success rem a => .success rem (pos, rem.pos, a) - | .error rem err => .error rem err - -private def withPosM (p : ParsecM α) : ParsecM (String.Pos × String.Pos × α) := do - withPos (p (← read)) - -private def withPosSkip : Parsec String.Pos := do - let (_, t, _) ← withPos skip - pure t - -private def optSkipChar (c : Char) : Parsec Unit := do - match ← peek? with - | some c' => if c = c' then skip *> pure () else pure () - | none => pure () - -/-- exec `check` on current char -/ -private def testChar (check : Char -> Bool) : Parsec Bool := do - match ← peek? with - | some c => if check c then pure true else pure false - | none => pure false - -/-- exec `check` on current char and consume char on success -/ -private def tryChar (check : Char -> Bool) : Parsec $ Option Char := do - match ← peek? with - | some c => if check c then pure $ some (← anyChar) else pure none - | none => pure none - -/-- exec `check` on current char and skip char on success -/ -private def trySkipChar (check : Char -> Bool) : Parsec Bool := do - if let some _ ← tryChar check then pure true else pure false - -/-- exec `check` on current char and then exec `f` on consumed char on success -/ -private def tryCharThen (check : Char -> Bool) (f : Char → α) (msg : String := "") : Parsec α := do - if let some c ← tryChar check then pure $ f c - else fail msg - -/-- exec `check` on current char and map `f` on consumed char on success -/ -private def tryCharMap (check : Char -> Bool) (f : Char → α) : Parsec $ Option α := do - if let some c ← tryChar check then pure $ f c - else pure none - -/-- exec `check` on current char and then exec `p` on success -/ -private def tryCharThenPWithPos (check : Char -> Bool) (p : Parsec α) - : Parsec $ Option (String.Pos × String.Pos × α) := do - match ← peek? with - | some c => if check c then pure $ some (← withPos p) else pure none - | none => pure none - -/-- exec `check` on current char and then exec `p` on success -/ -private def tryCharThenPWithPosM (check : Char -> Bool) (p : ParsecM α) - : ParsecM $ Option (String.Pos × String.Pos × α) := do - let flavor ← read - match ← peek? with - | some c => if check c then pure $ some (← withPos (p flavor)) else pure none - | none => pure none - -private def tryCharWithPos (check : Char -> Bool) : Parsec $ Option (String.Pos × String.Pos × Char) := do - tryCharThenPWithPos check anyChar - -private def tryCharWithPosMap (check : Char -> Bool) (f : Char → String.Pos → String.Pos →α) : Parsec $ Option α := do - if let some (p1, p2, c) ← tryCharWithPos check then pure $ f c p1 p2 - else pure none - -end Utils - -open Utils +abbrev ParsecM := ReaderT Syntax.Flavor $ StateT Parser Parsec /-- https://www.pcre.org/current/doc/html/pcre2pattern.html#SEC4 -/ private def isMetaCharacter : Char → Bool @@ -129,6 +58,47 @@ private def literal (inCharacterClass : Bool := false) : ParsecM Syntax := attem else fail "invalid literal character" +private def isWsChar (c : Char) (inCharacterClass : Bool) : Bool := + if inCharacterClass then c = '\x09' || c = ' ' + else c = '\x09' || c = '\x0a' || c = '\x0b' || c = '\x0c' || c = '\x0d' || c = ' ' + +private def wsChar (inCharacterClass : Bool) : Parsec Char := attempt do + let c ← anyChar + if isWsChar c inCharacterClass then return c else fail "" + +private def wsChars (inCharacterClass : Bool) : ParsecM Syntax := attemptM do + let state ← get + if inCharacterClass && state.extended == ExtendedKind.ExtendedMore + || !inCharacterClass && state.extended != ExtendedKind.None + then + let (f, t, chars) ← withPos (manyChars (wsChar inCharacterClass)) + if chars.length > 0 then + pure $ mkNodeOfKind `whitespace chars f t + else failM "" + else failM "" + +/-- https://www.pcre.org/current/doc/html/pcre2pattern.html#SEC24 -/ +private def comments : ParsecM Syntax := attemptM do + let state ← get + if state.extended == ExtendedKind.Extended + then + if let some _ ← tryChar (· = '#') then + if ← testChar (· = '\n') then + let (f, t, _) ← withPos $ skipChar '\n' + pure $ mkNodeOfKind `comment "" f t + else + let (f, t, chars) ← withPos $ manyChars (notFollowedBy (pchar '\n') *> anyChar) <* optSkipChar '\n' + if chars.length > 0 then + pure $ mkNodeOfKind `comment chars f t + else failM "" + else failM "" + else failM "" + +private def skipWsChars : ParsecM Unit := do + tryCatchT + (manyM (comments <|> wsChars false) (← read)) + (fun _ => pure ()) (fun _ => pure ()) + private def toControlChar (c : Char) (f t : String.Pos) : ParsecM Syntax := do let val ← if c.isUpper then pure (c.val - 'A'.val + 1) @@ -187,7 +157,7 @@ private def decodeDigits (l : List Nat) (base : Nat) : Char := private def parenWithChars (p : ParsecM α) (startChar : Char := '{') (endChar : Char := '}') : ParsecM $ Array α := attemptM do let _ ← pchar startChar - let arr ← manyCore (p (← read)) #[] + let arr ← manyM (p (← read)) let _ ← pchar endChar pure arr @@ -241,7 +211,7 @@ private def groupName : ParsecM String := do private def capturingGroupKind : ParsecM Syntax := attemptM do let (f, _, _) ← withPos (pchar ('?')) if ← testChar (fun c1 => c1 = '<' || c1 = '\'' || c1 = 'P') then - let (_, t, name) ← withPos (groupName (← read)) + let (_, t, name) ← withPosM (groupName (← read)) pure $ mkNodeOfKind `capturingGroup name f t else fail "invalid capturing group character" @@ -268,9 +238,13 @@ private def defineGroupKind : ParsecM Syntax := attempt do private def subroutineGroupKind : ParsecM Syntax := attempt do let (f, t, _) ← withPos (pchar ('?')) let c1 ← peek! - if c1 = '?' || c1 = '&' || c1 = '(' || c1 = 'P' || c1 = '|' || c1 = '-' || c1.isDigit then + if c1 = '?' || c1 = '&' || c1 = '(' || c1 = 'P' || c1 = '|' || c1.isDigit then let chars ← manyChars (notFollowedBy (pchar ')') *> groupLetter) pure $ mkNodeOfKind `subroutineGroupKind ⟨chars.toList⟩ f t + else if c1 = '-' then + let c ← pchar '-' + let chars ← many1Chars (notFollowedBy (pchar ')') *> digit) + pure $ mkNodeOfKind `subroutineGroupKind ⟨[c] ++ chars.toList⟩ f t else fail "" private def commentGroupKind : ParsecM Syntax := attempt do @@ -286,7 +260,7 @@ private def namedLookaroundGroupKind : ParsecM Syntax := attempt do <|> skipString "plb:" *> (pure $ mkNodeOfKind `lookaroundGroup "?<=" f t) <|> skipString "nlb:" *> (pure $ mkNodeOfKind `lookaroundGroup "? pchar '(') pure $ mkNodeOfKind `controlName ⟨chars.toList⟩ f t @@ -296,13 +270,38 @@ private def controlVerbGroupKind : ParsecM Syntax := attemptM do let (f, t, _) ← withPos (pchar ('*')) (skipString "ACCEPT" <|> skipString "COMMIT" <|> skipString "MARK"<|> skipString "PRUNE" <|> skipString "SKIP" <|> skipString "THEN") - *> controlName (← read) *> (pure $ mkNodeOfKind `controlVerbGroup "" f t) + *> controlName *> (pure $ mkNodeOfKind `controlVerbGroup "" f t) <|> controlName -private def nonCapturingGroupKind : ParsecM Syntax := attempt do +private def containsString (s m : String) : Bool := + let arr := s.splitOn m + arr.length >= 2 + +private def toExtendedKind (flags : String) : ExtendedKind := + if flags.contains '-' && flags.contains 'x' + && flags.find (· = '-') < flags.find (· = 'x') then .None + else if containsString flags "xx" then .ExtendedMore + else if flags.contains 'x' then .Extended + else .None + +private def expandFlags (flags : String) : String := + match flags.data with + | '^' :: tail => + let flagsPos := "imsx".data |> List.filter (fun c => tail.contains c) + let flagsNeg := "imsx".data |> List.filter (fun c => !tail.contains c) + ⟨flagsPos⟩ ++ "-" ++ ⟨flagsNeg⟩ + | _ => flags + +private def nonCapturingGroupKind : ParsecM Syntax := attemptM do let (f, t, _) ← withPos (pchar ('?')) let flags ← manyChars (notFollowedBy (pchar ':' <|> pchar ')') *> (asciiLetter <|> pchar '-' <|> pchar '^')) <* optSkipChar ':' + + let flags := expandFlags flags + if flags.length > 0 then + let state ← get + set {state with extended := toExtendedKind flags} + pure $ mkNodeOfKind `nonCapturingGroup flags f t private def groupKind : ParsecM Syntax := do @@ -436,7 +435,7 @@ private def escapedChar : ParsecM Syntax := attempt do private def literalChars : ParsecM Syntax := attempt do skipChar 'Q' - let chars ← manyChars (notFollowedBy (skipString "\\E") *> anyChar) <* skipString "\\E" + let chars ← manyChars (notFollowedBy (skipString "\\E") *> anyChar) <* optSkipString "\\E" let list := chars.data |> List.map (fun c => mkLiteral c 0 0) pure $ Syntax.node (SourceInfo.synthetic 0 0) `sequence list.toArray @@ -455,6 +454,7 @@ private def escapeSeq (inCharacterClass : Bool := false) : ParsecM Syntax := att end EscapeSeq private def repetitionModifier : ParsecM Syntax := do + skipWsChars match ← peek? with | some c => if c = '+' || c = '?' then skip; pure $ mkLit `repetitionModifier ⟨[c]⟩ SourceInfo.none @@ -469,6 +469,7 @@ private def toRepetitionRight (s : String) : Syntax := private def repetitionContent : ParsecM Syntax := attemptM do ws + skipWsChars let c ← peek! if c = ',' then let (f, _, _) ← withPos $ skipChar ',' @@ -505,6 +506,7 @@ private def repetitionContent : ParsecM Syntax := attemptM do /-- https://www.pcre.org/current/doc/html/pcre2pattern.html#SEC17 -/ private def repetition : ParsecM Syntax := attemptM do + skipWsChars let c ← peek! if c = '{' then let (f, t, _) ← withPos (pchar ('{')) @@ -536,19 +538,11 @@ private def repetition : ParsecM Syntax := attemptM do pure $ Syntax.node (SourceInfo.synthetic f t) `repetition #[litA, litB, modifier] else fail "" -private def toPosixCharacterClass (p : Parsec String ): Parsec Syntax := attempt do - pure $ mkNodeOfKind `posixCharacterClass (← p) 0 0 - private def posixCharacterClass : Parsec Syntax := attempt do - (pstring "[:alnum:]" <|> pstring "[:ascii:]" <|> pstring "[:alpha:]" <|> pstring "[:blank:]" - <|> pstring "[:cntrl:]" <|> pstring "[:digit:]" <|> pstring "[:lower:]" <|> pstring "[:word:]" - <|> pstring "[:print:]" <|> pstring "[:punct:]" <|> pstring "[:space:]" - <|> pstring "[:upper:]" <|> - pstring "[:^alnum:]" <|> pstring "[:^ascii:]" <|> pstring "[:^alpha:]" <|> pstring "[:^blank:]" - <|> pstring "[:^cntrl:]" <|> pstring "[:^digit:]" <|> pstring "[:^lower:]" <|> pstring "[:^word:]" - <|> pstring "[:^print:]" <|> pstring "[:^punct:]" <|> pstring "[:^space:]" - <|> pstring "[:^upper:]") - |> toPosixCharacterClass + let (f, _, _) ← withPos $ pstring "[:" + let (_, t, chars) ← withPos $ + manyChars (notFollowedBy (pstring ":]") *> (asciiLetter <|> pchar '^')) <* pstring ":]" + pure $ mkNodeOfKind `posixCharacterClass chars f t private def consumeStartOfCharacterClass : ParsecM $ Array Syntax := attemptM do if let some stx ← consumeChar? '^' then @@ -561,7 +555,7 @@ private def consumeStartOfCharacterClass : ParsecM $ Array Syntax := attemptM do private def characterClass (val : ParsecM Syntax) : ParsecM Syntax := attemptM do let (f, _, _) ← withPos (pchar ('[')) let start ← consumeStartOfCharacterClass - let arr ← manyCore (val (← read)) #[] + let arr ← manyM (val (← read)) let (_, t, _) ← withPos (pchar (']')) pure $ Syntax.node (SourceInfo.synthetic f t) `characterClass (start ++ arr) @@ -570,6 +564,7 @@ mutual private partial def valInCharacterClass : ParsecM $ Syntax := do let p ← EscapeSeq.escapeSeq true <|> posixCharacterClass + <|> wsChars true <|> (if Syntax.Flavor.Rust == (← read) then characterClass' else failM "") <|> characterClassSetOperation <|> hyphen <|> literal true @@ -580,28 +575,42 @@ private partial def characterClass' : ParsecM Syntax := end +private def getFlags (x : Syntax) : Option String := + match x with + | Syntax.node _ `nonCapturingGroup #[Lean.Syntax.atom _ s] => some s + | _ => none + /-- https://www.pcre.org/current/doc/html/pcre2pattern.html#SEC14 -/ private def group (val : ParsecM Syntax) : ParsecM Syntax := attemptM do + let state ← get let (f, _, _) ← withPos (pchar ('(')) let kind ← groupKind - let arr ← manyCore (val (← read)) #[] + let arr ← manyM val let (_, t, _) ← withPos (pchar (')')) + + let flags := Option.getD (getFlags kind) "" + if arr.size = 0 then -- set state in outer group + let state ← get + set {state with extended := toExtendedKind flags} + else set state -- set previous state + pure $ Syntax.node (SourceInfo.synthetic f t) `group (#[kind] ++ arr) private partial def val : ParsecM $ Syntax := do let p ← EscapeSeq.escapeSeq <|> (group val) <|> verticalBar <|> assertion - <|> characterClass valInCharacterClass <|> repetition <|> dot <|> literal + <|> characterClass valInCharacterClass <|> repetition <|> comments + <|> wsChars false <|> dot <|> literal pure $ p private def sequence : ParsecM $ TSyntax `sequence := do - let (f, t, arr) ← withPos (manyCore (val (← read)) #[]) + let (f, t, arr) ← withPosM (manyM val) pure $ (TSyntax.mk (Syntax.node (SourceInfo.synthetic f t) `sequence arr)) /-- Parse a PCRE2 regular expressions into a `Lean.Syntax` tree. -/ -def parse (s : String) (flavor : Syntax.Flavor) : Except String $ TSyntax `sequence := - match (sequence flavor) s.mkIterator with +def parse (s : String) (flavor : Syntax.Flavor) (extended : ExtendedKind := .None) : Except String $ TSyntax `sequence := + match (sequence flavor ⟨extended⟩ ) s.mkIterator with | Parsec.ParseResult.success it res => if it.atEnd - then Except.ok res + then Except.ok res.1 else Except.error s!"offset {repr it.i.byteIdx}: cannot parse regex" | Parsec.ParseResult.error it err => Except.error s!"offset {repr it.i.byteIdx}: {err}" diff --git a/Regex/Syntax/Hir.lean b/Regex/Syntax/Hir.lean index 24b28d0..4872c64 100644 --- a/Regex/Syntax/Hir.lean +++ b/Regex/Syntax/Hir.lean @@ -2,6 +2,7 @@ import Batteries.Data.Array.Basic import Regex.Utils import Regex.Interval import Regex.Unicode +import Regex.Syntax.Grammar.Grammar /-! ## High-level intermediate representation for a regular expression. @@ -152,7 +153,7 @@ structure Flags where dot_matches_new_line: Option Bool swap_greed: Option Bool crlf: Option Bool - extended : Option Bool + extended : Option Regex.Grammar.ExtendedKind instance : Inhabited Flags := ⟨⟨none, none, none, none, none, none⟩⟩ diff --git a/Regex/Syntax/Translate.lean b/Regex/Syntax/Translate.lean index 56d07d0..9e86242 100644 --- a/Regex/Syntax/Translate.lean +++ b/Regex/Syntax/Translate.lean @@ -48,7 +48,7 @@ private def from_ast (ast: Syntax.AstItems.Flags) : Syntax.Flags := (fun (enabled, acc) f => match f.kind with | .Negation => (false, acc) - | .Flag Flag.IgnoreWhitespace => (enabled, acc) + | .Flag Flag.Extended => (enabled, acc) | .Flag Flag.CRLF => (enabled, {acc with crlf := enabled}) | .Flag Flag.Unicode => (enabled, acc) | .Flag Flag.SwapGreed => (enabled, {acc with swap_greed := enabled}) diff --git a/scripts/nolints.json b/scripts/nolints.json index 580451b..3311f92 100644 --- a/scripts/nolints.json +++ b/scripts/nolints.json @@ -10,6 +10,14 @@ ["docBlame", "IntervalSet.intervals"], ["docBlame", "Intervals.singleton"], ["docBlame", "Nat.intAsString"], + ["docBlame", "Parsec.failM"], + ["docBlame", "Parsec.optSkipChar"], + ["docBlame", "Parsec.optSkipString"], + ["docBlame", "Parsec.tryCharWithPos"], + ["docBlame", "Parsec.tryCharWithPosMap"], + ["docBlame", "Parsec.withPos"], + ["docBlame", "Parsec.withPosM"], + ["docBlame", "Parsec.withPosSkip"], ["docBlame", "Regex.nfa"], ["docBlame", "String.asHex"], ["docBlame", "String.decodeHex"], @@ -76,7 +84,6 @@ ["docBlame", "Syntax.AstItems.toError"], ["docBlame", "Syntax.AstItems.toErrorAt"], ["docBlame", "Syntax.AstItems.toString"], - ["docBlame", "Syntax.AstItems.unfoldUnion"], ["docBlame", "Syntax.AstItems.visit_loop"], ["docBlame", "Syntax.ClassUnicode.canonicalize"], ["docBlame", "Syntax.ClassUnicode.case_fold"], @@ -146,9 +153,6 @@ ["docBlame", "NFA.Unchecked.NFA.states"], ["docBlame", "NFA.Unchecked.NFA.toString"], ["docBlame", "NFA.Unchecked.State.toString"], - ["docBlame", "Regex.Grammar.Utils.attemptM"], - ["docBlame", "Regex.Grammar.Utils.failM"], - ["docBlame", "Regex.Grammar.Utils.orElse"], ["docBlame", "Syntax.AstItems.Alternation.asts"], ["docBlame", "Syntax.AstItems.Alternation.into_ast"], ["docBlame", "Syntax.AstItems.AssertionKind.toString"], diff --git a/test/PcreLoader.lean b/test/PcreLoader.lean index 066336c..9c2ddaf 100644 --- a/test/PcreLoader.lean +++ b/test/PcreLoader.lean @@ -125,24 +125,29 @@ private def setOption (c : Char) (t : RegexTest) : RegexTest := | 's' => { t with single_line := some true } | 'm' => { t with multi_line := some true } | 'g' => { t with «global» := some true } - | 'x' => { t with extended := some true } + | 'x' => { t with extended := some .Extended } | _ => t private def toPattern (p : PcreTest) (t : RegexTest) : RegexTest := let pattern := p.pattern.trim - match (pattern.data.head?, pattern.data.getLast?) with - | (some '/', some '/') => - { t with regex := Sum.inl ⟨(pattern.data.drop 1).dropLast⟩ } - | (some '/', some c1) => - let t := setOption c1 t - let data := (pattern.data.drop 1).dropLast - match data.getLast? with - | some '/' => { t with regex := Sum.inl ⟨data.dropLast⟩ } - | some c2 => - let t := setOption c2 t - { t with regex := Sum.inl ⟨data.dropLast.dropLast⟩ } - | none => t - | (_, _) => t + if pattern.endsWith "/xx" then + { t with + regex := Sum.inl ⟨(pattern.data.drop 1).take (pattern.length - 4)⟩ + extended := some .ExtendedMore } + else + match (pattern.data.head?, pattern.data.getLast?) with + | (some '/', some '/') => + { t with regex := Sum.inl ⟨(pattern.data.drop 1).dropLast⟩ } + | (some '/', some c1) => + let t := setOption c1 t + let data := (pattern.data.drop 1).dropLast + match data.getLast? with + | some '/' => { t with regex := Sum.inl ⟨data.dropLast⟩ } + | some c2 => + let t := setOption c2 t + { t with regex := Sum.inl ⟨data.dropLast.dropLast⟩ } + | none => t + | (_, _) => t private def toRegexTest (i : Nat) (p : PcreTest) : Except String $ RegexTest := do pure <| toPattern p diff --git a/test/RegexTest.lean b/test/RegexTest.lean index ee4b9c1..f3c8f9c 100644 --- a/test/RegexTest.lean +++ b/test/RegexTest.lean @@ -70,7 +70,7 @@ structure RegexTest where multi_line : Option Bool := none single_line : Option Bool := none «global» : Option Bool := none - extended : Option Bool := none + extended : Option Regex.Grammar.ExtendedKind := none namespace RegexTest @@ -110,7 +110,8 @@ instance : ToString RegexTest where let str := str ++ (if checkFlagIsTrue s.single_line then " single_line" else "") let str := str ++ (if checkFlagIsTrue s.multi_line then " multi_line" else "") let str := str ++ (if checkFlagIsTrue s.global then " global" else "") - let str := str ++ (if checkFlagIsTrue s.extended then " extended" else "") + let str := str ++ (match s.extended with + | some .Extended => " extended" | some .ExtendedMore => " extendedMore" | _ => "") str instance : ToString RegexTests where @@ -148,12 +149,12 @@ def captures (flavor : Syntax.Flavor) (t : RegexTest) : Except String (Array Reg let flags := {flags with case_insensitive := t.«case-insensitive», dot_matches_new_line := t.single_line, - multi_line := t.multi_line - extended := t.extended} + multi_line := t.multi_line} let config := {config with unanchored_prefix := !t.anchored.getD false} + let extended := Option.getD t.extended .None let haystack := if t.unescape.getD false then unescapeStr t.haystack else t.haystack - let re ← Regex.build (Sum.val t.regex) flavor flags config + let re ← Regex.build (Sum.val t.regex) flavor flags config extended Except.ok (Regex.all_captures haystack.toSubstring re) def checkMatches (arr : Array Regex.Captures) (t : RegexTest) : Bool :=