Skip to content

Commit

Permalink
feat: add named capture groups
Browse files Browse the repository at this point in the history
  • Loading branch information
bergmannjg committed Jul 3, 2024
1 parent f7a42c6 commit 88dddfa
Show file tree
Hide file tree
Showing 13 changed files with 162 additions and 54 deletions.
15 changes: 8 additions & 7 deletions Main.lean
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ def regexShortOption : (opt : Char) → CliM PUnit
| 'f' => do let path ← takeOptArg "-f" "path"; modifyThe RegexOptions ({· with path})
| 'h' => modifyThe RegexOptions ({· with wantsHelp := true})
| 'n' => modifyThe RegexOptions ({· with unanchoredPrefix := false})
| 's' => modifyThe RegexOptions ({· with unanchoredPrefixSimulation := true})
| 'u' => modifyThe RegexOptions ({· with unescape := true})
| 'v' => modifyThe RegexOptions ({· with verbosity := Lake.Verbosity.verbose})
| opt => throw <| CliError.unknownShortOption opt
Expand Down Expand Up @@ -84,13 +85,13 @@ COMMANDS:
captures <re> <s> get first or all captures of <re> in <s>
OPTIONS:
--help, -h print help
--verbose, -v show verbose information
--all, -a get all captures
--file, -f <path> path to file with haystack
--no-unanchored-prefix, -n whether not to compile an unanchored prefix into the NFA
--unanchored-prefix-simulation whether to simulate an unanchored prefix with the backtracker
--unescape, -u unescape haystack"
--help, -h print help
--verbose, -v show verbose information
--all, -a get all captures
--file, -f <path> path to file with haystack
--no-unanchored-prefix, -n whether not to compile an unanchored prefix into the NFA
--unanchored-prefix-simulation, -s whether to simulate an unanchored prefix with the backtracker
--unescape, -u unescape haystack"

namespace regex

Expand Down
2 changes: 0 additions & 2 deletions Regex.lean
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import Regex.Syntax.Hir
import Regex.Nfa
import Regex.Unicode
import Regex.Regex
import Regex.Notation

Expand Down Expand Up @@ -36,7 +35,6 @@ The following features are not yet implemented in the Pcre flavor:
* [backtracking control](https://pcre2project.github.io/pcre2/doc/html/pcre2syntax.html#SEC28),
* [variable length look behind](https://pcre2project.github.io/pcre2/doc/html/pcre2syntax.html#SEC22),
* [extended patterns](https://perldoc.perl.org/perlre#Extended-Patterns),
* [named capture groups](https://perldoc.perl.org/perlre#(?NAME%3Epattern)),
* capture groups with a backreference should have fixed width (a restriction of the backtrack algorithm).
The following features are not implemented in the Rust flavor:
Expand Down
28 changes: 21 additions & 7 deletions Regex/Backtrack.lean
Original file line number Diff line number Diff line change
Expand Up @@ -310,6 +310,13 @@ private def toPairs (slots : Array (Nat × Nat × Option String.Pos))
if state.logEnabled then { state with msgs := state.msgs.push s!"{msg ()}"}
else state

theorem withMsg_eq {nfa : Checked.NFA} {s s1 : SearchState nfa.n} {msg : Unit -> String}
(h : withMsg msg s = s1) : s.countVisited = s1.countVisited ∧ s.stack = s1.stack := by
unfold withMsg at h
split at h <;> try simp_all
simp [SearchState.ext_iff] at h
simp_all

private def encodeChar? (c: Option Char) : String :=
match c with
| some curr => UInt32.intAsString curr.val
Expand Down Expand Up @@ -643,7 +650,7 @@ private def encodeChar? (c: Option Char) : String :=
| none =>
(withMsg
(fun _ =>
s!"{state.sid}: SparseTransitions '{encodeChar? state.at.curr?}' failed") state)
s!"{state.sid}: SparseTransitions failed at charpos {state.at}") state)

@[inline] private def step_union (alts : Array $ Fin n) (state : SearchState n) : SearchState n :=
match alts with
Expand Down Expand Up @@ -752,28 +759,34 @@ private def encodeChar? (c: Option Char) : String :=
-- countVisited is not changed in `toNextStep`
{searchState' with countVisited := searchState.countVisited}

theorem toNextStep'_eq (nfa : Checked.NFA) (state : Checked.State nfa.n) (s s1 : SearchState nfa.n)
(h : toNextStep' nfa state s = s1) : s.countVisited = s1.countVisited := by
theorem toNextStep'_eq {nfa : Checked.NFA} {state : Checked.State nfa.n} {s s1 : SearchState nfa.n}
{msg : Unit → String}
(h : toNextStep' nfa state (withMsg msg s) = s1) : s.countVisited = s1.countVisited := by
unfold toNextStep' at h
simp [SearchState.ext_iff] at h
simp_all
unfold withMsg at h
split at h <;> try simp_all

/-- execute next step in NFA if state not already visited. Returns true if steps available. -/
@[inline] private def toNextStepChecked (nfa : Checked.NFA) (state : SearchState nfa.n)
: Bool × SearchState nfa.n :=
match Visited.checkVisited' state with
| (false, state') =>
let state := nfa.states.get state'.sid
(true, toNextStep' nfa state state')
| _ => (false, state)
(true, toNextStep'
nfa
state
(withMsg (fun _ => s!"{state'.sid}: visit charpos {state'.at}") state'))
--state')
| _ => (false, (withMsg (fun _ => s!"{state.sid}: isVisited charpos {state.at}") state))

theorem toNextStepChecked_true_lt (nfa : Checked.NFA) (s s1 : SearchState nfa.n)
(h : toNextStepChecked nfa s = (true, s1)) : s.countVisited < s1.countVisited := by
unfold toNextStepChecked at h
split at h <;> simp_all
rename_i s2 hcv
have heq : s1.countVisited = s2.countVisited := by
simp [toNextStep'_eq _ _ s2 s1 h]
simp [toNextStep'_eq h]
have hlt : s.countVisited < s2.countVisited := by
simp [Visited.checkVisited'_false_lt s hcv]
rw [← heq] at hlt
Expand All @@ -784,6 +797,7 @@ theorem toNextStepChecked_false_eq (nfa : Checked.NFA) (s s1 : SearchState nfa.n
: s.countVisited = s1.countVisited ∧ s.stack = s1.stack := by
unfold toNextStepChecked at h
split at h <;> try simp_all
simp [withMsg_eq h]

@[inline] private def visitedSize (state : SearchState n) : Nat :=
(Visited.getRefValue state.visited).size
Expand Down
1 change: 1 addition & 0 deletions Regex/Regex.lean
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import Regex.Unicode
import Regex.Nfa
import Regex.Backtrack
import Regex.Utils
Expand Down
10 changes: 5 additions & 5 deletions Regex/Syntax/Ast/Ast.lean
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@ inductive ErrorKind
| GroupNameEmpty
/-- An invalid character was seen for a capture group name. -/
| GroupNameInvalid
/-- A capture group name is not found of a backreference. -/
| GroupNameNotFound
/-- A closing `>` could not be found for a capture group name. -/
| GroupNameUnexpectedEof
/-- An unclosed group, e.g., `(ab`. -/
Expand Down Expand Up @@ -96,8 +98,6 @@ inductive ErrorKind
/-- A octal literal did not correspond to a Unicode scalar value. -/
| EscapeOctalInvalid
/-- Feature not implemented. -/
| FeatureNotImplementedNamedGroups
/-- Feature not implemented. -/
| FeatureNotImplementedSubroutines
/-- Feature not implemented. -/
| FeatureNotImplementedFlagShorthand
Expand Down Expand Up @@ -137,6 +137,7 @@ def toString : ErrorKind -> String
| .GroupNameDuplicate => "duplicate capture group name"
| .GroupNameEmpty => "empty capture group name"
| .GroupNameInvalid => "invalid capture group character"
| .GroupNameNotFound => "capture group of backreference not found."
| .GroupNameUnexpectedEof => "unclosed capture group name"
| .GroupUnclosed => "unclosed group"
| .GroupUnopened => "unopened group"
Expand All @@ -159,7 +160,6 @@ def toString : ErrorKind -> String
| .UnsupportedBackreference => "backreferences are not supported"
| .UnsupportedLookAround => "look-around, including look-ahead and look-behind, is not supported"
| .UnkownAsciiClass => "ascii class unkown"
| .FeatureNotImplementedNamedGroups => "feature not implemented: NamedGroups"
| .FeatureNotImplementedSubroutines => "feature not implemented: Subroutines"
| .FeatureNotImplementedFlagShorthand => "feature not implemented: Flag ^"
| .FeatureNotImplementedAtomicGroup => "feature not implemented: AtomicGroup"
Expand Down Expand Up @@ -425,14 +425,14 @@ instance : ToString LookaroundKind where

/-- The kind of a group. -/
inductive GroupKind where
| CaptureIndex : Nat -> GroupKind
| CaptureIndex : Nat -> Option String -> GroupKind
| NonCapturing : Flags -> GroupKind
| Lookaround : LookaroundKind -> GroupKind

namespace GroupKind

def toString : GroupKind -> String
| .CaptureIndex i => s!"CaptureIndex {i}"
| .CaptureIndex i s => s!"CaptureIndex {i} name {s}"
| .NonCapturing flags => s!"NonCapturing {flags}"
| .Lookaround kind => s!"Lookaround {kind}"

Expand Down
113 changes: 99 additions & 14 deletions Regex/Syntax/Ast/Parse.lean
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ import Batteries.Data.Nat.Lemmas
import Regex.Data.Array.Basic
import Regex.Syntax.Ast.Ast
import Regex.Utils
import Regex.Unicode

namespace Syntax

Expand Down Expand Up @@ -73,6 +74,8 @@ private inductive ClassState where
private structure Parser where
/-- The current capture index. -/
capture_index : Nat
/-- The names of capture groups. -/
capture_group_names : List (Nat × String)
/-- The maximal single digit backreference. -/
max_backreference : Nat
/-- Disable pattern metacharacters from \Q until \E -/
Expand All @@ -82,7 +85,7 @@ private structure Parser where
/-- A stack of nested character classes. -/
stack_class: Array ClassState

instance : Inhabited Parser := ⟨0, 0, false, #[], #[]⟩
instance : Inhabited Parser := ⟨0, [], 0, false, #[], #[]⟩

abbrev ParserM := ReaderT Flavor $ StateT Parser (Except String)

Expand Down Expand Up @@ -253,6 +256,50 @@ private def maybe_parse_special_word_boundary (pattern : String) (i : Nat)
else
Except.ok none

private def is_start_of_group_name (c : Char) :=
c = '_'
|| Unicode.isCharOfGeneralCategory Unicode.GeneralCategory.L c

private def is_char_in_group_name (c : Char) :=
c = '_'
|| Unicode.isCharOfGeneralCategory Unicode.GeneralCategory.L c
|| Unicode.isCharOfGeneralCategory Unicode.GeneralCategory.Nd c

private def read_group_name? (pattern : String) (i : Nat) (offset : Nat) (end_marker : Char)
: Option (NChars × String) := do
let chars := (pattern.data.drop (i + offset))
|> List.takeWhile (fun c => is_char_in_group_name c || c = ' ')
if chars.length > 0 && end_marker = pattern.getAtCodepoint (i + offset + chars.length) then
let offset := offset + chars.length + 1
let name : String := ⟨chars⟩
some (offset, name.trim)
else none

private def parse_backref_name (pattern : String) (i : Nat) (end_marker : Char)
: ParserM (NChars × BackRef) := do
let state ← get
match read_group_name? pattern i 0 end_marker with
| some (offset, name) =>
match state.capture_group_names |> List.find? (fun (_, n) => n = name) with
| some (n, _) =>
if n > state.max_backreference then set {state with max_backreference := n}
let span := String.toSpan pattern (i - 1) (i + offset)
pure (offset - 1, ⟨span, n⟩)
| none => throw (toError pattern .GroupNameNotFound)
| none => throw (toError pattern .GroupNameNotFound)

private def maybe_parse_backref_name (pattern : String) (i : Nat) (end_marker : Char)
: ParserM $ Option (NatPos × Primitive) := do
let c := pattern.getAtCodepoint i
let c1 := pattern.getAtCodepoint (i + 1)
let c2 := pattern.getAtCodepoint (i + 2)
if c = '?' && c1 = 'P' && c2 = '=' then
let (offset, br) ← parse_backref_name pattern (i + 3) end_marker
pure (some (NatPos.succ (offset + 4), Primitive.BackRef br))
else if c = '?' && c1 = 'P' && c2 = '>' then
throw (toError pattern .FeatureNotImplementedSubroutines)
else pure none

private def maybe_parse_backref_num (pattern : String) (i : Nat)
: ParserM $ Option (NChars × BackRef) := do
let state ← get
Expand All @@ -273,7 +320,9 @@ private def maybe_parse_backref_num (pattern : String) (i : Nat)
if 0 < n && n ≤ state.capture_index
then pure (some (offset0 + 1, ⟨span, n⟩))
else pure none
| (_, _, _) => pure none
| (_, _, _) =>
if offset0 - 1 = 0 && is_start_of_group_name c then parse_backref_name pattern i '}'
else pure none

private def maybe_parse_backref (pattern : String) (i : Nat)
: ParserM $ Option (NChars × BackRef) := do
Expand Down Expand Up @@ -410,6 +459,18 @@ private def parse_escape (pattern : String) (i : Nat) (inSetClass : Bool := fals
else Except.error (toError pattern .EscapeUnrecognized)
| 'e' => pure (NatPos.one, toVerbatim '\x07')
| 'f' => pure (NatPos.one, toVerbatim '\x0C')
| 'k' =>
if flavor == Flavor.Pcre
then
let c1 := pattern.getAtCodepoint (i+1)
if c1 = '<' then
let (offset, br) ← parse_backref_name pattern (i + 2) '>'
pure (NatPos.succ (offset + 2), Primitive.BackRef br)
else if c1 = '{' then
let (offset, br) ← parse_backref_name pattern (i + 2) '}'
pure (NatPos.succ (offset + 2), Primitive.BackRef br)
else Except.error (toError pattern .EscapeUnrecognized)
else Except.error (toError pattern .EscapeUnrecognized)
| 'n' => pure (NatPos.one, toVerbatim '\n')
| 'o' =>
let c1 := pattern.getAtCodepoint (i+1)
Expand Down Expand Up @@ -923,6 +984,23 @@ private def parse_flags (pattern : String) (i : Nat)
let flags : Flags := ⟨(String.toSpan pattern i chars.size), items⟩
Except.ok (chars.size, flags)

private def parse_group_name (pattern : String) (i : Nat) (offset : Nat) (end_marker : Char)
: ParserM (NChars × (Sum SetFlags Group)) := do
match read_group_name? pattern i offset end_marker with
| some (offset, name) =>
let parser ← get
if parser.capture_group_names |> List.find? (fun (_, n) => n = name) |> Option.isSome
then throw (toError pattern .GroupNameDuplicate) else
let capture_index := parser.capture_index + 1
let g := Group.mk (String.toSpan pattern i (i + offset)) (.CaptureIndex capture_index (some name)) Ast.Empty
let parser := {parser with
capture_index := capture_index
capture_group_names := parser.capture_group_names ++ [(capture_index, name)]
}
set parser
pure (offset, Sum.inr g)
| none => throw (toError pattern .GroupNameInvalid)

/-- Parse a group (which contains a sub-expression) or a set of flags. -/
private def parse_group (pattern : String) (i : Nat)
: ParserM (NChars × (Sum SetFlags Group)) := do
Expand All @@ -931,12 +1009,12 @@ private def parse_group (pattern : String) (i : Nat)
let c1 := pattern.getAtCodepoint i
let c2 := pattern.getAtCodepoint (i + 1)
let c3 := pattern.getAtCodepoint (i + 2)
if c1 = '?' && c2 = '<' && c3.isAlpha then
throw (toError pattern .FeatureNotImplementedNamedGroups)
else if c1 = '?' && c2 = 'P' && (c3 = '<' || c3 = '=') then
throw (toError pattern .FeatureNotImplementedNamedGroups)
if c1 = '?' && c2 = '<' && is_start_of_group_name c3 then
parse_group_name pattern i 2 '>'
else if c1 = '?' && c2 = 'P' && c3 = '<' then
parse_group_name pattern i 3 '>'
else if c1 = '?' && c2 = '\'' then
throw (toError pattern .FeatureNotImplementedNamedGroups)
parse_group_name pattern i 2 '\''
else if c1 = '?' && c2.isDigit then
throw (toError pattern .FeatureNotImplementedSubroutines)
else if c1 = '?' && (c2 = '-' || c2 = '+') && c3.isDigit then
Expand Down Expand Up @@ -1010,7 +1088,7 @@ private def parse_group (pattern : String) (i : Nat)
pure (n + 2, Sum.inr g)
else
let parser := {parser with capture_index := parser.capture_index + 1 }
let g := Group.mk (String.toSpan pattern i (i + 1)) (.CaptureIndex parser.capture_index) Ast.Empty
let g := Group.mk (String.toSpan pattern i (i + 1)) (.CaptureIndex parser.capture_index none) Ast.Empty
set parser
pure (0, Sum.inr g)

Expand Down Expand Up @@ -1063,7 +1141,7 @@ private def get_fixed_width (pattern : String) (ast : Ast) : Except String Nat :
match rep with
| AstItems.Repetition.mk _ (RepetitionOp.mk _ (.Range (.Exactly n))) _ _ _ => pure n
| _ => throw (toError pattern .FixedWidtExcpected)
| .Group ⟨_, GroupKind.CaptureIndex _, ast⟩ =>
| .Group ⟨_, GroupKind.CaptureIndex _ _, ast⟩ =>
let width ← get_fixed_width pattern ast
pure width
| .Group ⟨_, GroupKind.Lookaround _, _⟩ =>
Expand Down Expand Up @@ -1130,7 +1208,7 @@ private def add_char_to_concat (pattern : String) (i : Nat) (c : Char) (concat :
private def checkBackRefence (b : Nat) (ast: Ast) : Except String Bool := do
let check (ast : Ast) :=
match ast with
| .Group ⟨_, AstItems.GroupKind.CaptureIndex b', _⟩ => b = b'
| .Group ⟨_, AstItems.GroupKind.CaptureIndex b' _, _⟩ => b = b'
| _ => false

match AstItems.find ast check with
Expand Down Expand Up @@ -1171,10 +1249,17 @@ def parse (pattern : String) (flavor : Syntax.Flavor) : Except String Ast := do
| '(' =>
if state.disabled_metacharacters
then loop pattern (i+1) (add_char_to_concat pattern i c concat) else
let (n, concat) ← push_group pattern (i + 1) concat
have : pattern.length - (i + n + 1) < pattern.length - i := by
simp [Nat.sum_succ_lt_of_not_gt _ _ _ h₀]
loop pattern (i+n+1) concat
match ← maybe_parse_backref_name pattern (i+1) ')' with
| some (⟨n, h₁⟩ , p) =>
let asts := concat.asts.push p.into_ast
have : pattern.length - (i + n) < pattern.length - i := by
simp [Nat.sum_lt_of_not_gt _ _ _ h₀ h₁]
loop pattern (i+n) (Concat.mk (concat.span) asts)
| none =>
let (n, concat) ← push_group pattern (i + 1) concat
have : pattern.length - (i + n + 1) < pattern.length - i := by
simp [Nat.sum_succ_lt_of_not_gt _ _ _ h₀]
loop pattern (i+n+1) concat
| ')' =>
if state.disabled_metacharacters
then loop pattern (i+1) (add_char_to_concat pattern i c concat) else
Expand Down
2 changes: 1 addition & 1 deletion Regex/Syntax/Translate.lean
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,7 @@ private def hir_assertion (ast : AstItems.Assertion) (flags : Syntax.Flags) : Hi
private def hir_capture (g : AstItems.Group) (expr: Hir) : Hir :=
let (index, name) : Option Nat × Option String :=
match g.kind with
| .CaptureIndex captureIndex => (some captureIndex, none)
| .CaptureIndex captureIndex _ => (some captureIndex, none)
| .NonCapturing _ => (none, none)
| .Lookaround _ => (none, none)

Expand Down
Loading

0 comments on commit 88dddfa

Please sign in to comment.