From 01bcdbdc631fa6b635d89b0ade53f9723bfb6023 Mon Sep 17 00:00:00 2001 From: Rudi Grinberg Date: Thu, 24 Sep 2020 12:50:00 -0700 Subject: [PATCH] Update vendored re (#3797) * Update vendored re * Go from 1.7.1 to 1.9.0 * Do not just use a subset, use all of re. This simplifies maintenance * Custom modifications are generated in the bash script. This simplifies updates. Signed-off-by: Rudi Grinberg * Fix update script Signed-off-by: Rudi Grinberg --- vendor/re/{LICENSE => LICENSE.md} | 0 vendor/re/src/{re_automata.ml => automata.ml} | 111 +- .../re/src/{re_automata.mli => automata.mli} | 31 +- vendor/re/src/category.ml | 27 + vendor/re/src/category.mli | 25 + vendor/re/src/color_map.ml | 34 + vendor/re/src/color_map.mli | 14 + vendor/re/src/core.ml | 1254 +++++++++++++++ vendor/re/src/{re.mli => core.mli} | 222 +-- vendor/re/src/{re_cset.ml => cset.ml} | 2 +- vendor/re/src/{re_cset.mli => cset.mli} | 0 vendor/re/src/dune | 3 +- vendor/re/src/emacs.ml | 124 ++ vendor/re/src/emacs.mli | 37 + vendor/re/src/fmt.ml | 47 + vendor/re/src/glob.ml | 289 ++++ vendor/re/src/glob.mli | 77 + vendor/re/src/group.ml | 73 + vendor/re/src/group.mli | 52 + vendor/re/src/pcre.ml | 119 ++ vendor/re/src/pcre.mli | 45 + vendor/re/src/perl.ml | 272 ++++ vendor/re/src/perl.mli | 41 + vendor/re/src/pmark.ml | 13 + vendor/re/src/pmark.mli | 8 + vendor/re/src/posix.ml | 156 ++ vendor/re/src/posix.mli | 98 ++ vendor/re/src/re.ml | 1351 +---------------- vendor/re/src/re_fmt.ml | 25 - vendor/re/src/str.ml | 295 ++++ vendor/re/src/str.mli | 203 +++ vendor/update-re.sh | 28 +- 32 files changed, 3526 insertions(+), 1550 deletions(-) rename vendor/re/{LICENSE => LICENSE.md} (100%) rename vendor/re/src/{re_automata.ml => automata.ml} (87%) rename vendor/re/src/{re_automata.mli => automata.mli} (76%) create mode 100644 vendor/re/src/category.ml create mode 100644 vendor/re/src/category.mli create mode 100644 vendor/re/src/color_map.ml create mode 100644 vendor/re/src/color_map.mli create mode 100644 vendor/re/src/core.ml rename vendor/re/src/{re.mli => core.mli} (71%) rename vendor/re/src/{re_cset.ml => cset.ml} (99%) rename vendor/re/src/{re_cset.mli => cset.mli} (100%) create mode 100644 vendor/re/src/emacs.ml create mode 100644 vendor/re/src/emacs.mli create mode 100644 vendor/re/src/fmt.ml create mode 100644 vendor/re/src/glob.ml create mode 100644 vendor/re/src/glob.mli create mode 100644 vendor/re/src/group.ml create mode 100644 vendor/re/src/group.mli create mode 100644 vendor/re/src/pcre.ml create mode 100644 vendor/re/src/pcre.mli create mode 100644 vendor/re/src/perl.ml create mode 100644 vendor/re/src/perl.mli create mode 100644 vendor/re/src/pmark.ml create mode 100644 vendor/re/src/pmark.mli create mode 100644 vendor/re/src/posix.ml create mode 100644 vendor/re/src/posix.mli delete mode 100644 vendor/re/src/re_fmt.ml create mode 100644 vendor/re/src/str.ml create mode 100644 vendor/re/src/str.mli diff --git a/vendor/re/LICENSE b/vendor/re/LICENSE.md similarity index 100% rename from vendor/re/LICENSE rename to vendor/re/LICENSE.md diff --git a/vendor/re/src/re_automata.ml b/vendor/re/src/automata.ml similarity index 87% rename from vendor/re/src/re_automata.ml rename to vendor/re/src/automata.ml index e8bf768ff2e..c454b7c8e65 100644 --- a/vendor/re/src/re_automata.ml +++ b/vendor/re/src/automata.ml @@ -20,33 +20,13 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *) -module Cset = Re_cset - type sem = [ `Longest | `Shortest | `First ] type rep_kind = [ `Greedy | `Non_greedy ] -type category = int type mark = int type idx = int -module Pmark : sig - type t = private int - val equal : t -> t -> bool - val compare : t -> t -> int - val gen : unit -> t - val pp : Format.formatter -> t -> unit -end -= struct - type t = int - let equal (x : int) (y : int) = x = y - let compare (x : int) (y : int) = compare x y - let r = ref 0 - let gen () = incr r ; !r - - let pp = Format.pp_print_int -end - type expr = { id : int; def : def } and def = @@ -57,20 +37,18 @@ and def = | Rep of rep_kind * sem * expr | Mark of int | Erase of int * int - | Before of category - | After of category + | Before of Category.t + | After of Category.t | Pmark of Pmark.t -module PmarkSet = Set.Make(Pmark) - let hash_combine h accu = accu * 65599 + h module Marks = struct type t = { marks : (int * int) list - ; pmarks : PmarkSet.t } + ; pmarks : Pmark.Set.t } - let empty = { marks = [] ; pmarks = PmarkSet.empty } + let empty = { marks = [] ; pmarks = Pmark.Set.empty } let rec merge_marks_offset old = function | [] -> @@ -84,7 +62,7 @@ module Marks = struct let merge old nw = { marks = merge_marks_offset old.marks nw.marks - ; pmarks = PmarkSet.union old.pmarks nw.pmarks } + ; pmarks = Pmark.Set.union old.pmarks nw.pmarks } let rec hash_marks_offset l accu = match l with @@ -127,7 +105,7 @@ let pp_rep_kind fmt = function | `Non_greedy -> Format.pp_print_string fmt "Non_greedy" let rec pp ch e = - let open Re_fmt in + let open Fmt in match e.def with Cst l -> sexp ch "cst" Cset.pp l; @@ -146,9 +124,9 @@ let rec pp ch e = | Erase (b, e) -> sexp ch "erase" (pair int int) (b, e) | Before c -> - sexp ch "before" int c + sexp ch "before" Category.pp c | After c -> - sexp ch "after" int c + sexp ch "after" Category.pp c (****) @@ -175,7 +153,7 @@ let mk_expr ids def = let empty ids = mk_expr ids (Alt []) let cst ids s = - if Re_cset.is_empty s + if Cset.is_empty s then empty ids else mk_expr ids (Cst s) @@ -228,7 +206,7 @@ let rec rename ids x = type hash = int type mark_infos = int array -type status = Failed | Match of mark_infos * PmarkSet.t | Running +type status = Failed | Match of mark_infos * Pmark.Set.t | Running module E = struct type t = @@ -300,22 +278,22 @@ end module State = struct type t = { idx: idx - ; category: category + ; category: Category.t ; desc: E.t list ; mutable status: status option ; hash: hash } let dummy = { idx = -1 - ; category = -1 + ; category = Category.dummy ; desc = [] ; status = None ; hash = -1 } let hash idx cat desc = - E.hash desc (hash_combine idx (hash_combine cat 0)) land 0x3FFFFFFF + E.hash desc (hash_combine idx (hash_combine (Category.to_int cat) 0)) land 0x3FFFFFFF - let mk idx cat desc = + let mk idx cat desc = { idx ; category = cat ; desc @@ -326,12 +304,12 @@ module State = struct let equal x y = (x.hash : int) = y.hash && (x.idx : int) = y.idx && - (x.category : int) = y.category && E.equal x.desc y.desc + Category.equal x.category y.category && E.equal x.desc y.desc let compare x y = let c = compare (x.hash : int) y.hash in if c <> 0 then c else - let c = compare (x.category : int) y.category in + let c = Category.compare x.category y.category in if c <> 0 then c else compare x.desc y.desc @@ -421,18 +399,18 @@ let rec set_idx idx = function let filter_marks b e marks = {marks with Marks.marks = List.filter (fun (i, _) -> i < b || i > e) marks.Marks.marks } -let rec delta_1 marks c cat' cat x rem = +let rec delta_1 marks c ~next_cat ~prev_cat x rem = (*Format.eprintf "%d@." x.id;*) match x.def with Cst s -> if Cset.mem c s then E.texp marks eps_expr :: rem else rem | Alt l -> - delta_2 marks c cat' cat l rem + delta_2 marks c ~next_cat ~prev_cat l rem | Seq (kind, y, z) -> - let y' = delta_1 marks c cat' cat y [] in - delta_seq c cat' cat kind y' z rem + let y' = delta_1 marks c ~next_cat ~prev_cat y [] in + delta_seq c ~next_cat ~prev_cat kind y' z rem | Rep (rep_kind, kind, y) -> - let y' = delta_1 marks c cat' cat y [] in + let y' = delta_1 marks c ~next_cat ~prev_cat y [] in let (y'', marks') = match first @@ -451,21 +429,23 @@ let rec delta_1 marks c cat' cat x rem = let marks = { marks with Marks.marks = (i, -1) :: List.remove_assq i marks.Marks.marks } in E.TMatch marks :: rem | Pmark i -> - let marks = { marks with Marks.pmarks = PmarkSet.add i marks.Marks.pmarks } in + let marks = { marks with Marks.pmarks = Pmark.Set.add i marks.Marks.pmarks } in E.TMatch marks :: rem | Erase (b, e) -> E.TMatch (filter_marks b e marks) :: rem | Before cat'' -> - if cat land cat'' <> 0 then E.TMatch marks :: rem else rem + if Category.intersect next_cat cat'' then E.TMatch marks :: rem else rem | After cat'' -> - if cat' land cat'' <> 0 then E.TMatch marks :: rem else rem + if Category.intersect prev_cat cat'' then E.TMatch marks :: rem else rem -and delta_2 marks c cat' cat l rem = +and delta_2 marks c ~next_cat ~prev_cat l rem = match l with [] -> rem - | y :: r -> delta_1 marks c cat' cat y (delta_2 marks c cat' cat r rem) + | y :: r -> + delta_1 marks c ~next_cat ~prev_cat y + (delta_2 marks c ~next_cat ~prev_cat r rem) -and delta_seq c cat' cat kind y z rem = +and delta_seq c ~next_cat ~prev_cat kind y z rem = match first (function E.TMatch marks -> Some marks | _ -> None) y with @@ -474,35 +454,42 @@ and delta_seq c cat' cat kind y z rem = | Some marks -> match kind with `Longest -> - E.tseq kind (remove_matches y) z (delta_1 marks c cat' cat z rem) + E.tseq kind (remove_matches y) z + (delta_1 marks c ~next_cat ~prev_cat z rem) | `Shortest -> - delta_1 marks c cat' cat z (E.tseq kind (remove_matches y) z rem) + delta_1 marks c ~next_cat ~prev_cat z + (E.tseq kind (remove_matches y) z rem) | `First -> let (y', y'') = split_at_match y in - E.tseq kind y' z (delta_1 marks c cat' cat z (E.tseq kind y'' z rem)) + E.tseq kind y' z + (delta_1 marks c ~next_cat ~prev_cat z (E.tseq kind y'' z rem)) -let rec delta_3 c cat' cat x rem = +let rec delta_3 c ~next_cat ~prev_cat x rem = match x with E.TSeq (y, z, kind) -> - let y' = delta_4 c cat' cat y [] in - delta_seq c cat' cat kind y' z rem + let y' = delta_4 c ~next_cat ~prev_cat y [] in + delta_seq c ~next_cat ~prev_cat kind y' z rem | E.TExp (marks, e) -> - delta_1 marks c cat' cat e rem + delta_1 marks c ~next_cat ~prev_cat e rem | E.TMatch _ -> x :: rem -and delta_4 c cat' cat l rem = +and delta_4 c ~next_cat ~prev_cat l rem = match l with [] -> rem - | y :: r -> delta_3 c cat' cat y (delta_4 c cat' cat r rem) + | y :: r -> + delta_3 c ~next_cat ~prev_cat y + (delta_4 c ~next_cat ~prev_cat r rem) -let delta tbl_ref cat' char st = +let delta tbl_ref next_cat char st = + let prev_cat = st.State.category in let (expr', _) = - remove_duplicates [] (delta_4 char st.State.category cat' st.State.desc []) + remove_duplicates [] + (delta_4 char ~next_cat ~prev_cat st.State.desc []) eps_expr in let idx = free_index tbl_ref expr' in let expr'' = set_idx idx expr' in - State.mk idx cat' expr'' + State.mk idx next_cat expr'' (****) @@ -586,7 +573,7 @@ let rec deriv_1 all_chars categories marks cat x rem = | Before cat' -> Cset.prepend (List.assq cat' categories) [E.TMatch marks] rem | After cat' -> - if cat land cat' <> 0 then Cset.prepend all_chars [E.TMatch marks] rem else rem + if Category.intersect cat cat' then Cset.prepend all_chars [E.TMatch marks] rem else rem and deriv_2 all_chars categories marks cat l rem = match l with diff --git a/vendor/re/src/re_automata.mli b/vendor/re/src/automata.mli similarity index 76% rename from vendor/re/src/re_automata.mli rename to vendor/re/src/automata.mli index 96005d21f7f..68443050d70 100644 --- a/vendor/re/src/re_automata.mli +++ b/vendor/re/src/automata.mli @@ -22,7 +22,6 @@ (* Regular expressions *) -type category = int type mark = int type sem = [ `Longest | `Shortest | `First ] @@ -31,14 +30,6 @@ type rep_kind = [ `Greedy | `Non_greedy ] val pp_sem : Format.formatter -> sem -> unit val pp_rep_kind : Format.formatter -> rep_kind -> unit -module Pmark : sig - type t = private int - val equal : t -> t -> bool - val compare : t -> t -> int - val gen : unit -> t - val pp : Format.formatter -> t -> unit -end - type expr val is_eps : expr -> bool val pp : Format.formatter -> expr -> unit @@ -46,7 +37,7 @@ val pp : Format.formatter -> expr -> unit type ids val create_ids : unit -> ids -val cst : ids -> Re_cset.t -> expr +val cst : ids -> Cset.t -> expr val empty : ids -> expr val alt : ids -> expr list -> expr val seq : ids -> sem -> expr -> expr -> expr @@ -55,22 +46,20 @@ val rep : ids -> rep_kind -> sem -> expr -> expr val mark : ids -> mark -> expr val pmark : ids -> Pmark.t -> expr val erase : ids -> mark -> mark -> expr -val before : ids -> category -> expr -val after : ids -> category -> expr +val before : ids -> Category.t -> expr +val after : ids -> Category.t -> expr val rename : ids -> expr -> expr (****) -module PmarkSet : Set.S with type elt = Pmark.t - (* States of the automata *) type idx = int module Marks : sig type t = { marks: (mark * idx) list - ; pmarks: PmarkSet.t } + ; pmarks: Pmark.Set.t } end module E : sig @@ -80,17 +69,17 @@ end type hash type mark_infos = int array -type status = Failed | Match of mark_infos * PmarkSet.t | Running +type status = Failed | Match of mark_infos * Pmark.Set.t | Running module State : sig type t = { idx: idx - ; category: category + ; category: Category.t ; desc: E.t list ; mutable status: status option ; hash: hash } val dummy : t - val create : category -> expr -> t + val create : Category.t -> expr -> t module Table : Hashtbl.S with type key = t end @@ -102,10 +91,10 @@ type working_area val create_working_area : unit -> working_area val index_count : working_area -> int -val delta : working_area -> category -> Re_cset.c -> State.t -> State.t +val delta : working_area -> Category.t -> Cset.c -> State.t -> State.t val deriv : - working_area -> Re_cset.t -> (category * Re_cset.t) list -> State.t -> - (Re_cset.t * State.t) list + working_area -> Cset.t -> (Category.t * Cset.t) list -> State.t -> + (Cset.t * State.t) list (****) diff --git a/vendor/re/src/category.ml b/vendor/re/src/category.ml new file mode 100644 index 00000000000..ff952b53bcf --- /dev/null +++ b/vendor/re/src/category.ml @@ -0,0 +1,27 @@ + +type t = int +let equal (x : int) (y : int) = x = y +let compare (x : int) (y : int) = compare x y +let to_int x = x +let pp = Format.pp_print_int + +let intersect x y = x land y <> 0 +let (++) x y = x lor y + +let dummy = -1 +let inexistant = 1 +let letter = 2 +let not_letter = 4 +let newline = 8 +let lastnewline = 16 +let search_boundary = 32 + +let from_char = function + (* Should match [cword] definition *) + | 'a'..'z' | 'A'..'Z' | '0'..'9' | '_' | '\170' | '\181' | '\186' + | '\192'..'\214' | '\216'..'\246' | '\248'..'\255' -> + letter + | '\n' -> + not_letter ++ newline + | _ -> + not_letter diff --git a/vendor/re/src/category.mli b/vendor/re/src/category.mli new file mode 100644 index 00000000000..c50deb32028 --- /dev/null +++ b/vendor/re/src/category.mli @@ -0,0 +1,25 @@ +(** Categories represent the various kinds of characters that can be tested + by look-ahead and look-behind operations. + + This is more restricted than Cset, but faster. +*) + +type t +val (++) : t -> t -> t +val from_char : char -> t + +val dummy : t +val inexistant : t +val letter : t +val not_letter : t +val newline : t +val lastnewline : t +val search_boundary : t +val to_int : t -> int +val equal : t -> t -> bool +val compare : t -> t -> int + +val intersect : t -> t -> bool + + +val pp : Format.formatter -> t -> unit diff --git a/vendor/re/src/color_map.ml b/vendor/re/src/color_map.ml new file mode 100644 index 00000000000..045ef6ec76e --- /dev/null +++ b/vendor/re/src/color_map.ml @@ -0,0 +1,34 @@ +(* In reality, this can really be represented as a bool array. + + The representation is best thought of as a list of all chars along with a + flag: + + (a, 0), (b, 1), (c, 0), (d, 0), ... + + characters belonging to the same color are represented by sequnces of + characters with the flag set to 0. +*) + +type t = Bytes.t + +let make () = Bytes.make 257 '\000' + +let flatten cm = + let c = Bytes.create 256 in + let color_repr = Bytes.create 256 in + let v = ref 0 in + Bytes.set c 0 '\000'; + Bytes.set color_repr 0 '\000'; + for i = 1 to 255 do + if Bytes.get cm i <> '\000' then incr v; + Bytes.set c i (Char.chr !v); + Bytes.set color_repr !v (Char.chr i) + done; + (c, Bytes.sub color_repr 0 (!v + 1), !v + 1) + +(* mark all the endpoints of the intervals of the char set with the 1 byte *) +let split s cm = + Cset.iter s ~f:(fun i j -> + Bytes.set cm i '\001'; + Bytes.set cm (j + 1) '\001'; + ) diff --git a/vendor/re/src/color_map.mli b/vendor/re/src/color_map.mli new file mode 100644 index 00000000000..2b642c3a54a --- /dev/null +++ b/vendor/re/src/color_map.mli @@ -0,0 +1,14 @@ +(* Color maps exists to provide an optimization for the regex engine. The fact + that some characters are entirely equivalent for some regexes means that we + can use them interchangeably. + + A color map assigns a color to every character in our character set. Any two + characters with the same color will be treated equivalently by the automaton. +*) +type t + +val make : unit -> t + +val flatten : t -> bytes * bytes * int + +val split : Cset.t -> t -> unit diff --git a/vendor/re/src/core.ml b/vendor/re/src/core.ml new file mode 100644 index 00000000000..99cf99c26e2 --- /dev/null +++ b/vendor/re/src/core.ml @@ -0,0 +1,1254 @@ +(* + RE - A regular expression library + + Copyright (C) 2001 Jerome Vouillon + email: Jerome.Vouillon@pps.jussieu.fr + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation, with + linking exception; either version 2.1 of the License, or (at + your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +*) + +let rec iter n f v = if n = 0 then v else iter (n - 1) f (f v) + +(****) + +let unknown = -2 +let break = -3 + +type match_info = + | Match of Group.t + | Failed + | Running + +type state = + { idx : int; + (* Index of the current position in the position table. + Not yet computed transitions point to a dummy state where + [idx] is set to [unknown]; + If [idx] is set to [break] for states that either always + succeed or always fail. *) + real_idx : int; + (* The real index, in case [idx] is set to [break] *) + next : state array; + (* Transition table, indexed by color *) + mutable final : + (Category.t * + (Automata.idx * Automata.status)) list; + (* Mapping from the category of the next character to + - the index where the next position should be saved + - possibly, the list of marks (and the corresponding indices) + corresponding to the best match *) + desc : Automata.State.t + (* Description of this state of the automata *) } + +(* Automata (compiled regular expression) *) +type re = + { initial : Automata.expr; + (* The whole regular expression *) + mutable initial_states : (Category.t * state) list; + (* Initial states, indexed by initial category *) + colors : Bytes.t; + (* Color table *) + color_repr : Bytes.t; + (* Table from colors to one character of this color *) + ncolor : int; + (* Number of colors. *) + lnl : int; + (* Color of the last newline. -1 if unnecessary *) + tbl : Automata.working_area; + (* Temporary table used to compute the first available index + when computing a new state *) + states : state Automata.State.Table.t; + (* States of the deterministic automata *) + group_count : int + (* Number of groups in the regular expression *) } + +let pp_re ch re = Automata.pp ch re.initial + +let print_re = pp_re + +(* Information used during matching *) +type info = + { re : re; + (* The automata *) + colors : Bytes.t; + (* Color table ([x.colors = x.re.colors]) + Shortcut used for performance reasons *) + mutable positions : int array; + (* Array of mark positions + The mark are off by one for performance reasons *) + pos : int; + (* Position where the match is started *) + last : int + (* Position where the match should stop *) } + + +(****) + +let category re ~color = + if color = -1 then + Category.inexistant + (* Special category for the last newline *) + else if color = re.lnl then + Category.(lastnewline ++ newline ++ not_letter) + else + Category.from_char (Bytes.get re.color_repr color) + +(****) + +let dummy_next = [||] + +let unknown_state = + { idx = unknown; real_idx = 0; + next = dummy_next; final = []; + desc = Automata.State.dummy } + +let mk_state ncol desc = + let break_state = + match Automata.status desc with + | Automata.Running -> false + | Automata.Failed + | Automata.Match _ -> true + in + { idx = if break_state then break else desc.Automata.State.idx; + real_idx = desc.Automata.State.idx; + next = if break_state then dummy_next else Array.make ncol unknown_state; + final = []; + desc } + +let find_state re desc = + try + Automata.State.Table.find re.states desc + with Not_found -> + let st = mk_state re.ncolor desc in + Automata.State.Table.add re.states desc st; + st + +(**** Match with marks ****) + +let delta info cat ~color st = + let desc = Automata.delta info.re.tbl cat color st.desc in + let len = Array.length info.positions in + if desc.Automata.State.idx = len && len > 0 then begin + let pos = info.positions in + info.positions <- Array.make (2 * len) 0; + Array.blit pos 0 info.positions 0 len + end; + desc + +let validate info (s:string) ~pos st = + let color = Char.code (Bytes.get info.colors (Char.code s.[pos])) in + let cat = category info.re ~color in + let desc' = delta info cat ~color st in + let st' = find_state info.re desc' in + st.next.(color) <- st' + +(* +let rec loop info s pos st = + if pos < info.last then + let st' = st.next.(Char.code info.cols.[Char.code s.[pos]]) in + let idx = st'.idx in + if idx >= 0 then begin + info.positions.(idx) <- pos; + loop info s (pos + 1) st' + end else if idx = break then begin + info.positions.(st'.real_idx) <- pos; + st' + end else begin (* Unknown *) + validate info s pos st; + loop info s pos st + end + else + st +*) + +let rec loop info (s:string) ~pos st = + if pos < info.last then + let st' = st.next.(Char.code (Bytes.get info.colors (Char.code s.[pos]))) in + loop2 info s ~pos st st' + else + st + +and loop2 info s ~pos st st' = + if st'.idx >= 0 then begin + let pos = pos + 1 in + if pos < info.last then begin + (* It is important to place these reads before the write *) + (* But then, we don't have enough registers left to store the + right position. So, we store the position plus one. *) + let st'' = + st'.next.(Char.code (Bytes.get info.colors (Char.code s.[pos]))) in + info.positions.(st'.idx) <- pos; + loop2 info s ~pos st' st'' + end else begin + info.positions.(st'.idx) <- pos; + st' + end + end else if st'.idx = break then begin + info.positions.(st'.real_idx) <- pos + 1; + st' + end else begin (* Unknown *) + validate info s ~pos st; + loop info s ~pos st + end + +let rec loop_no_mark info s ~pos ~last st = + if pos < last then + let st' = st.next.(Char.code (Bytes.get info.colors (Char.code s.[pos]))) in + if st'.idx >= 0 then + loop_no_mark info s ~pos:(pos + 1) ~last st' + else if st'.idx = break then + st' + else begin (* Unknown *) + validate info s ~pos st; + loop_no_mark info s ~pos ~last st + end + else + st + +let final info st cat = + try + List.assq cat st.final + with Not_found -> + let st' = delta info cat ~color:(-1) st in + let res = (st'.Automata.State.idx, Automata.status st') in + st.final <- (cat, res) :: st.final; + res + +let find_initial_state re cat = + try + List.assq cat re.initial_states + with Not_found -> + let st = find_state re (Automata.State.create cat re.initial) in + re.initial_states <- (cat, st) :: re.initial_states; + st + +let get_color re (s:string) pos = + if pos < 0 then + -1 + else + let slen = String.length s in + if pos >= slen then + -1 + else if pos = slen - 1 && re.lnl <> -1 && s.[pos] = '\n' then + (* Special case for the last newline *) + re.lnl + else + Char.code (Bytes.get re.colors (Char.code s.[pos])) + +let rec handle_last_newline info ~pos st ~groups = + let st' = st.next.(info.re.lnl) in + if st'.idx >= 0 then begin + if groups then info.positions.(st'.idx) <- pos + 1; + st' + end else if st'.idx = break then begin + if groups then info.positions.(st'.real_idx) <- pos + 1; + st' + end else begin (* Unknown *) + let color = info.re.lnl in + let real_c = Char.code (Bytes.get info.colors (Char.code '\n')) in + let cat = category info.re ~color in + let desc' = delta info cat ~color:real_c st in + let st' = find_state info.re desc' in + st.next.(color) <- st'; + handle_last_newline info ~pos st ~groups + end + +let rec scan_str info (s:string) initial_state ~groups = + let pos = info.pos in + let last = info.last in + if (last = String.length s + && info.re.lnl <> -1 + && last > pos + && String.get s (last - 1) = '\n') + then begin + let info = { info with last = last - 1 } in + let st = scan_str info s initial_state ~groups in + if st.idx = break then + st + else + handle_last_newline info ~pos:(last - 1) st ~groups + end else if groups then + loop info s ~pos initial_state + else + loop_no_mark info s ~pos ~last initial_state + +let match_str ~groups ~partial re s ~pos ~len = + let slen = String.length s in + let last = if len = -1 then slen else pos + len in + let info = + { re ; colors = re.colors; pos ; last + ; positions = + if groups then begin + let n = Automata.index_count re.tbl + 1 in + if n <= 10 then + [|0;0;0;0;0;0;0;0;0;0|] + else + Array.make n 0 + end else + [||] } + in + let initial_cat = + if pos = 0 then + Category.(search_boundary ++ inexistant) + else + Category.(search_boundary + ++ category re ~color:(get_color re s (pos - 1))) + in + let initial_state = find_initial_state re initial_cat in + let st = scan_str info s initial_state ~groups in + let res = + if st.idx = break || partial then + Automata.status st.desc + else + let final_cat = + if last = slen then + Category.(search_boundary ++ inexistant) + else + Category.(search_boundary ++ category re ~color:(get_color re s last)) + in + let (idx, res) = final info st final_cat in + if groups then info.positions.(idx) <- last + 1; + res + in + match res with + Automata.Match (marks, pmarks) -> + Match { s ; marks; pmarks ; gpos = info.positions; gcount = re.group_count} + | Automata.Failed -> Failed + | Automata.Running -> Running + +let mk_re ~initial ~colors ~color_repr ~ncolor ~lnl ~group_count = + { initial ; + initial_states = []; + colors; + color_repr; + ncolor; + lnl; + tbl = Automata.create_working_area (); + states = Automata.State.Table.create 97; + group_count } + +(**** Character sets ****) + +let cseq c c' = Cset.seq (Char.code c) (Char.code c') +let cadd c s = Cset.add (Char.code c) s + +let trans_set cache cm s = + match Cset.one_char s with + | Some i -> Cset.csingle (Bytes.get cm i) + | None -> + let v = (Cset.hash_rec s, s) in + try + Cset.CSetMap.find v !cache + with Not_found -> + let l = + Cset.fold_right + s + ~f:(fun (i, j) l -> Cset.union (cseq (Bytes.get cm i) + (Bytes.get cm j)) l) + ~init:Cset.empty + in + cache := Cset.CSetMap.add v l !cache; + l + +(****) + +type regexp = + Set of Cset.t + | Sequence of regexp list + | Alternative of regexp list + | Repeat of regexp * int * int option + | Beg_of_line | End_of_line + | Beg_of_word | End_of_word | Not_bound + | Beg_of_str | End_of_str + | Last_end_of_line | Start | Stop + | Sem of Automata.sem * regexp + | Sem_greedy of Automata.rep_kind * regexp + | Group of regexp | No_group of regexp | Nest of regexp + | Case of regexp | No_case of regexp + | Intersection of regexp list + | Complement of regexp list + | Difference of regexp * regexp + | Pmark of Pmark.t * regexp + +module View = struct + type t = regexp = + Set of Cset.t + | Sequence of regexp list + | Alternative of regexp list + | Repeat of regexp * int * int option + | Beg_of_line | End_of_line + | Beg_of_word | End_of_word | Not_bound + | Beg_of_str | End_of_str + | Last_end_of_line | Start | Stop + | Sem of Automata.sem * regexp + | Sem_greedy of Automata.rep_kind * regexp + | Group of regexp | No_group of regexp | Nest of regexp + | Case of regexp | No_case of regexp + | Intersection of regexp list + | Complement of regexp list + | Difference of regexp * regexp + | Pmark of Pmark.t * regexp + + let view t = t +end + +let rec pp fmt t = + let open Fmt in + let var s re = sexp fmt s pp re in + let seq s rel = sexp fmt s (list pp) rel in + match t with + | Set s -> sexp fmt "Set" Cset.pp s + | Sequence sq -> seq "Sequence" sq + | Alternative alt -> seq "Alternative" alt + | Repeat (re, start, stop) -> + let pp' fmt () = fprintf fmt "%a@ %d%a" pp re start optint stop in + sexp fmt "Repeat" pp' () + | Beg_of_line -> str fmt "Beg_of_line" + | End_of_line -> str fmt "End_of_line" + | Beg_of_word -> str fmt "Beg_of_word" + | End_of_word -> str fmt "End_of_word" + | Not_bound -> str fmt "Not_bound" + | Beg_of_str -> str fmt "Beg_of_str" + | End_of_str -> str fmt "End_of_str" + | Last_end_of_line -> str fmt "Last_end_of_line" + | Start -> str fmt "Start" + | Stop -> str fmt "Stop" + | Sem (sem, re) -> + sexp fmt "Sem" (pair Automata.pp_sem pp) (sem, re) + | Sem_greedy (k, re) -> + sexp fmt "Sem_greedy" (pair Automata.pp_rep_kind pp) (k, re) + | Group c -> var "Group" c + | No_group c -> var "No_group" c + | Nest c -> var "Nest" c + | Case c -> var "Case" c + | No_case c -> var "No_case" c + | Intersection c -> seq "Intersection" c + | Complement c -> seq "Complement" c + | Difference (a, b) -> sexp fmt "Difference" (pair pp pp) (a, b) + | Pmark (m, r) -> sexp fmt "Pmark" (pair Pmark.pp pp) (m, r) + +let rec is_charset = function + | Set _ -> + true + | Alternative l | Intersection l | Complement l -> + List.for_all is_charset l + | Difference (r, r') -> + is_charset r && is_charset r' + | Sem (_, r) | Sem_greedy (_, r) + | No_group r | Case r | No_case r -> + is_charset r + | Sequence _ | Repeat _ | Beg_of_line | End_of_line + | Beg_of_word | End_of_word | Beg_of_str | End_of_str + | Not_bound | Last_end_of_line | Start | Stop + | Group _ | Nest _ | Pmark (_,_)-> + false + +(*XXX Use a better algorithm allowing non-contiguous regions? *) + +let cupper = + Cset.union (cseq 'A' 'Z') + (Cset.union (cseq '\192' '\214') (cseq '\216' '\222')) +let clower = Cset.offset 32 cupper +let calpha = + List.fold_right cadd ['\170'; '\181'; '\186'; '\223'; '\255'] + (Cset.union clower cupper) +let cdigit = cseq '0' '9' +let calnum = Cset.union calpha cdigit +let cword = cadd '_' calnum + +let colorize c regexp = + let lnl = ref false in + let rec colorize regexp = + match regexp with + Set s -> Color_map.split s c + | Sequence l -> List.iter colorize l + | Alternative l -> List.iter colorize l + | Repeat (r, _, _) -> colorize r + | Beg_of_line | End_of_line -> Color_map.split (Cset.csingle '\n') c + | Beg_of_word | End_of_word + | Not_bound -> Color_map.split cword c + | Beg_of_str | End_of_str + | Start | Stop -> () + | Last_end_of_line -> lnl := true + | Sem (_, r) + | Sem_greedy (_, r) + | Group r | No_group r + | Nest r | Pmark (_,r) -> colorize r + | Case _ | No_case _ + | Intersection _ + | Complement _ + | Difference _ -> assert false + in + colorize regexp; + !lnl + +(**** Compilation ****) + +let rec equal x1 x2 = + match x1, x2 with + Set s1, Set s2 -> + s1 = s2 + | Sequence l1, Sequence l2 -> + eq_list l1 l2 + | Alternative l1, Alternative l2 -> + eq_list l1 l2 + | Repeat (x1', i1, j1), Repeat (x2', i2, j2) -> + i1 = i2 && j1 = j2 && equal x1' x2' + | Beg_of_line, Beg_of_line + | End_of_line, End_of_line + | Beg_of_word, Beg_of_word + | End_of_word, End_of_word + | Not_bound, Not_bound + | Beg_of_str, Beg_of_str + | End_of_str, End_of_str + | Last_end_of_line, Last_end_of_line + | Start, Start + | Stop, Stop -> + true + | Sem (sem1, x1'), Sem (sem2, x2') -> + sem1 = sem2 && equal x1' x2' + | Sem_greedy (k1, x1'), Sem_greedy (k2, x2') -> + k1 = k2 && equal x1' x2' + | Group _, Group _ -> (* Do not merge groups! *) + false + | No_group x1', No_group x2' -> + equal x1' x2' + | Nest x1', Nest x2' -> + equal x1' x2' + | Case x1', Case x2' -> + equal x1' x2' + | No_case x1', No_case x2' -> + equal x1' x2' + | Intersection l1, Intersection l2 -> + eq_list l1 l2 + | Complement l1, Complement l2 -> + eq_list l1 l2 + | Difference (x1', x1''), Difference (x2', x2'') -> + equal x1' x2' && equal x1'' x2'' + | Pmark (m1, r1), Pmark (m2, r2) -> + Pmark.equal m1 m2 && equal r1 r2 + | _ -> + false + +and eq_list l1 l2 = + match l1, l2 with + [], [] -> + true + | x1 :: r1, x2 :: r2 -> + equal x1 x2 && eq_list r1 r2 + | _ -> + false + +let sequence = function + | [x] -> x + | l -> Sequence l + +let rec merge_sequences = function + | [] -> + [] + | Alternative l' :: r -> + merge_sequences (l' @ r) + | Sequence (x :: y) :: r -> + begin match merge_sequences r with + Sequence (x' :: y') :: r' when equal x x' -> + Sequence [x; Alternative [sequence y; sequence y']] :: r' + | r' -> + Sequence (x :: y) :: r' + end + | x :: r -> + x :: merge_sequences r + +module A = Automata + +let enforce_kind ids kind kind' cr = + match kind, kind' with + `First, `First -> cr + | `First, k -> A.seq ids k cr (A.eps ids) + | _ -> cr + +(* XXX should probably compute a category mask *) +let rec translate ids kind ign_group ign_case greedy pos cache c = function + | Set s -> + (A.cst ids (trans_set cache c s), kind) + | Sequence l -> + (trans_seq ids kind ign_group ign_case greedy pos cache c l, kind) + | Alternative l -> + begin match merge_sequences l with + [r'] -> + let (cr, kind') = + translate ids kind ign_group ign_case greedy pos cache c r' in + (enforce_kind ids kind kind' cr, kind) + | merged_sequences -> + (A.alt ids + (List.map + (fun r' -> + let (cr, kind') = + translate ids kind ign_group ign_case greedy + pos cache c r' in + enforce_kind ids kind kind' cr) + merged_sequences), + kind) + end + | Repeat (r', i, j) -> + let (cr, kind') = + translate ids kind ign_group ign_case greedy pos cache c r' in + let rem = + match j with + None -> + A.rep ids greedy kind' cr + | Some j -> + let f = + match greedy with + `Greedy -> + fun rem -> + A.alt ids + [A.seq ids kind' (A.rename ids cr) rem; A.eps ids] + | `Non_greedy -> + fun rem -> + A.alt ids + [A.eps ids; A.seq ids kind' (A.rename ids cr) rem] + in + iter (j - i) f (A.eps ids) + in + (iter i (fun rem -> A.seq ids kind' (A.rename ids cr) rem) rem, kind) + | Beg_of_line -> + (A.after ids Category.(inexistant ++ newline), kind) + | End_of_line -> + (A.before ids Category.(inexistant ++ newline), kind) + | Beg_of_word -> + (A.seq ids `First + (A.after ids Category.(inexistant ++ not_letter)) + (A.before ids Category.(inexistant ++ letter)), + kind) + | End_of_word -> + (A.seq ids `First + (A.after ids Category.(inexistant ++ letter)) + (A.before ids Category.(inexistant ++ not_letter)), + kind) + | Not_bound -> + (A.alt ids [A.seq ids `First + (A.after ids Category.letter) + (A.before ids Category.letter); + A.seq ids `First + (A.after ids Category.letter) + (A.before ids Category.letter)], + kind) + | Beg_of_str -> + (A.after ids Category.inexistant, kind) + | End_of_str -> + (A.before ids Category.inexistant, kind) + | Last_end_of_line -> + (A.before ids Category.(inexistant ++ lastnewline), kind) + | Start -> + (A.after ids Category.search_boundary, kind) + | Stop -> + (A.before ids Category.search_boundary, kind) + | Sem (kind', r') -> + let (cr, kind'') = + translate ids kind' ign_group ign_case greedy pos cache c r' in + (enforce_kind ids kind' kind'' cr, + kind') + | Sem_greedy (greedy', r') -> + translate ids kind ign_group ign_case greedy' pos cache c r' + | Group r' -> + if ign_group then + translate ids kind ign_group ign_case greedy pos cache c r' + else + let p = !pos in + pos := !pos + 2; + let (cr, kind') = + translate ids kind ign_group ign_case greedy pos cache c r' in + (A.seq ids `First (A.mark ids p) ( + A.seq ids `First cr (A.mark ids (p + 1))), + kind') + | No_group r' -> + translate ids kind true ign_case greedy pos cache c r' + | Nest r' -> + let b = !pos in + let (cr, kind') = + translate ids kind ign_group ign_case greedy pos cache c r' + in + let e = !pos - 1 in + if e < b then + (cr, kind') + else + (A.seq ids `First (A.erase ids b e) cr, kind') + | Difference _ | Complement _ | Intersection _ | No_case _ | Case _ -> + assert false + | Pmark (i, r') -> + let (cr, kind') = + translate ids kind ign_group ign_case greedy pos cache c r' in + (A.seq ids `First (A.pmark ids i) cr, kind') + +and trans_seq ids kind ign_group ign_case greedy pos cache c = function + | [] -> + A.eps ids + | [r] -> + let (cr', kind') = + translate ids kind ign_group ign_case greedy pos cache c r in + enforce_kind ids kind kind' cr' + | r :: rem -> + let (cr', kind') = + translate ids kind ign_group ign_case greedy pos cache c r in + let cr'' = + trans_seq ids kind ign_group ign_case greedy pos cache c rem in + if A.is_eps cr'' then + cr' + else if A.is_eps cr' then + cr'' + else + A.seq ids kind' cr' cr'' + +(**** Case ****) + +let case_insens s = + Cset.union s (Cset.union (Cset.offset 32 (Cset.inter s cupper)) + (Cset.offset (-32) (Cset.inter s clower))) + +let as_set = function + | Set s -> s + | _ -> assert false + +(* XXX Should split alternatives into (1) charsets and (2) more + complex regular expressions; alternative should therefore probably + be flatten here *) +let rec handle_case ign_case = function + | Set s -> + Set (if ign_case then case_insens s else s) + | Sequence l -> + Sequence (List.map (handle_case ign_case) l) + | Alternative l -> + let l' = List.map (handle_case ign_case) l in + if is_charset (Alternative l') then + Set (List.fold_left (fun s r -> Cset.union s (as_set r)) Cset.empty l') + else + Alternative l' + | Repeat (r, i, j) -> + Repeat (handle_case ign_case r, i, j) + | Beg_of_line | End_of_line | Beg_of_word | End_of_word | Not_bound + | Beg_of_str | End_of_str | Last_end_of_line | Start | Stop as r -> + r + | Sem (k, r) -> + let r' = handle_case ign_case r in + if is_charset r' then r' else Sem (k, r') + | Sem_greedy (k, r) -> + let r' = handle_case ign_case r in + if is_charset r' then r' else Sem_greedy (k, r') + | Group r -> + Group (handle_case ign_case r) + | No_group r -> + let r' = handle_case ign_case r in + if is_charset r' then r' else No_group r' + | Nest r -> + let r' = handle_case ign_case r in + if is_charset r' then r' else Nest r' + | Case r -> + handle_case false r + | No_case r -> + handle_case true r + | Intersection l -> + let l' = List.map (fun r -> handle_case ign_case r) l in + Set (List.fold_left (fun s r -> Cset.inter s (as_set r)) Cset.cany l') + | Complement l -> + let l' = List.map (fun r -> handle_case ign_case r) l in + Set (Cset.diff Cset.cany + (List.fold_left (fun s r -> Cset.union s (as_set r)) + Cset.empty l')) + | Difference (r, r') -> + Set (Cset.inter (as_set (handle_case ign_case r)) + (Cset.diff Cset.cany (as_set (handle_case ign_case r')))) + | Pmark (i,r) -> Pmark (i,handle_case ign_case r) + +(****) + +let compile_1 regexp = + let regexp = handle_case false regexp in + let c = Color_map.make () in + let need_lnl = colorize c regexp in + let (colors, color_repr, ncolor) = Color_map.flatten c in + let lnl = if need_lnl then ncolor else -1 in + let ncolor = if need_lnl then ncolor + 1 else ncolor in + let ids = A.create_ids () in + let pos = ref 0 in + let (r, kind) = + translate ids + `First false false `Greedy pos (ref Cset.CSetMap.empty) colors regexp in + let r = enforce_kind ids `First kind r in + (*Format.eprintf "<%d %d>@." !ids ncol;*) + mk_re ~initial:r ~colors ~color_repr ~ncolor ~lnl ~group_count:(!pos / 2) + +(****) + +let rec anchored = function + | Sequence l -> + List.exists anchored l + | Alternative l -> + List.for_all anchored l + | Repeat (r, i, _) -> + i > 0 && anchored r + | Set _ | Beg_of_line | End_of_line | Beg_of_word | End_of_word + | Not_bound | End_of_str | Last_end_of_line | Stop + | Intersection _ | Complement _ | Difference _ -> + false + | Beg_of_str | Start -> + true + | Sem (_, r) | Sem_greedy (_, r) | Group r | No_group r | Nest r + | Case r | No_case r | Pmark (_, r) -> + anchored r + +(****) + +type t = regexp + +let str s = + let l = ref [] in + for i = String.length s - 1 downto 0 do + l := Set (Cset.csingle s.[i]) :: !l + done; + Sequence !l +let char c = Set (Cset.csingle c) + +let alt = function + | [r] -> r + | l -> Alternative l +let seq = function + | [r] -> r + | l -> Sequence l + +let empty = alt [] +let epsilon = seq [] +let repn r i j = + if i < 0 then invalid_arg "Re.repn"; + begin match j with + | Some j when j < i -> invalid_arg "Re.repn" + | _ -> () + end; + Repeat (r, i, j) +let rep r = repn r 0 None +let rep1 r = repn r 1 None +let opt r = repn r 0 (Some 1) +let bol = Beg_of_line +let eol = End_of_line +let bow = Beg_of_word +let eow = End_of_word +let word r = seq [bow; r; eow] +let not_boundary = Not_bound +let bos = Beg_of_str +let eos = End_of_str +let whole_string r = seq [bos; r; eos] +let leol = Last_end_of_line +let start = Start +let stop = Stop +let longest r = Sem (`Longest, r) +let shortest r = Sem (`Shortest, r) +let first r = Sem (`First, r) +let greedy r = Sem_greedy (`Greedy, r) +let non_greedy r = Sem_greedy (`Non_greedy, r) +let group r = Group r +let no_group r = No_group r +let nest r = Nest r +let mark r = let i = Pmark.gen () in (i,Pmark (i,r)) + +let set str = + let s = ref Cset.empty in + for i = 0 to String.length str - 1 do + s := Cset.union (Cset.csingle str.[i]) !s + done; + Set !s + +let rg c c' = Set (cseq c c') + +let inter l = + let r = Intersection l in + if is_charset r then + r + else + invalid_arg "Re.inter" + +let compl l = + let r = Complement l in + if is_charset r then + r + else + invalid_arg "Re.compl" + +let diff r r' = + let r'' = Difference (r, r') in + if is_charset r'' then + r'' + else + invalid_arg "Re.diff" + +let any = Set Cset.cany +let notnl = Set (Cset.diff Cset.cany (Cset.csingle '\n')) + +let lower = alt [rg 'a' 'z'; char '\181'; rg '\223' '\246'; rg '\248' '\255'] +let upper = alt [rg 'A' 'Z'; rg '\192' '\214'; rg '\216' '\222'] +let alpha = alt [lower; upper; char '\170'; char '\186'] +let digit = rg '0' '9' +let alnum = alt [alpha; digit] +let wordc = alt [alnum; char '_'] +let ascii = rg '\000' '\127' +let blank = set "\t " +let cntrl = alt [rg '\000' '\031'; rg '\127' '\159'] +let graph = alt [rg '\033' '\126'; rg '\160' '\255'] +let print = alt [rg '\032' '\126'; rg '\160' '\255'] +let punct = + alt [rg '\033' '\047'; rg '\058' '\064'; rg '\091' '\096'; + rg '\123' '\126'; rg '\160' '\169'; rg '\171' '\180'; + rg '\182' '\185'; rg '\187' '\191'; char '\215'; char '\247'] +let space = alt [char ' '; rg '\009' '\013'] +let xdigit = alt [digit; rg 'a' 'f'; rg 'A' 'F'] + +let case r = Case r +let no_case r = No_case r + +(****) + +let compile r = + compile_1 ( + if anchored r then + group r + else + seq [shortest (rep any); group r] + ) + +let exec_internal name ?(pos=0) ?(len = -1) ~partial ~groups re s = + if pos < 0 || len < -1 || pos + len > String.length s then + invalid_arg name; + match_str ~groups ~partial re s ~pos ~len + +let exec ?pos ?len re s = + match exec_internal "Re.exec" ?pos ?len ~groups:true ~partial:false re s with + Match substr -> substr + | _ -> raise Not_found + +let exec_opt ?pos ?len re s = + match exec_internal "Re.exec_opt" ?pos ?len ~groups:true ~partial:false + re s with + Match substr -> Some substr + | _ -> None + +let execp ?pos ?len re s = + match exec_internal ~groups:false ~partial:false "Re.execp" ?pos ?len re s with + Match _substr -> true + | _ -> false + +let exec_partial ?pos ?len re s = + match exec_internal ~groups:false ~partial:true "Re.exec_partial" + ?pos ?len re s with + Match _ -> `Full + | Running -> `Partial + | Failed -> `Mismatch + +module Mark = struct + + type t = Pmark.t + + let test (g : Group.t) p = + Pmark.Set.mem p g.pmarks + + let all (g : Group.t) = g.pmarks + + module Set = Pmark.Set + + let equal = Pmark.equal + + let compare = Pmark.compare + +end + +type split_token = + [ `Text of string + | `Delim of Group.t + ] + +module Rseq = struct + let all ?(pos=0) ?len re s : _ Seq.t = + if pos < 0 then invalid_arg "Re.all"; + (* index of the first position we do not consider. + !pos < limit is an invariant *) + let limit = match len with + | None -> String.length s + | Some l -> + if l<0 || pos+l > String.length s then invalid_arg "Re.all"; + pos+l + in + (* iterate on matches. When a match is found, search for the next + one just after its end *) + let rec aux pos () = + if pos >= limit + then Seq.Nil (* no more matches *) + else + match match_str ~groups:true ~partial:false re s + ~pos ~len:(limit - pos) with + | Match substr -> + let p1, p2 = Group.offset substr 0 in + let pos = if p1=p2 then p2+1 else p2 in + Seq.Cons (substr, aux pos) + | Running + | Failed -> Seq.Nil + in + aux pos + + let matches ?pos ?len re s : _ Seq.t = + all ?pos ?len re s + |> Seq.map (fun sub -> Group.get sub 0) + + let split_full ?(pos=0) ?len re s : _ Seq.t = + if pos < 0 then invalid_arg "Re.split"; + let limit = match len with + | None -> String.length s + | Some l -> + if l<0 || pos+l > String.length s then invalid_arg "Re.split"; + pos+l + in + (* i: start of delimited string + pos: first position after last match of [re] + limit: first index we ignore (!pos < limit is an invariant) *) + let pos0 = pos in + let rec aux state i pos () = match state with + | `Idle when pos >= limit -> + if i < limit then ( + let sub = String.sub s i (limit - i) in + Seq.Cons (`Text sub, aux state (i+1) pos) + ) else Seq.Nil + | `Idle -> + begin match match_str ~groups:true ~partial:false re s ~pos + ~len:(limit - pos) with + | Match substr -> + let p1, p2 = Group.offset substr 0 in + let pos = if p1=p2 then p2+1 else p2 in + let old_i = i in + let i = p2 in + if p1 > pos0 then ( + (* string does not start by a delimiter *) + let text = String.sub s old_i (p1 - old_i) in + let state = `Yield (`Delim substr) in + Seq.Cons (`Text text, aux state i pos) + ) else Seq.Cons (`Delim substr, aux state i pos) + | Running -> Seq.Nil + | Failed -> + if i < limit + then ( + let text = String.sub s i (limit - i) in + (* yield last string *) + Seq.Cons (`Text text, aux state limit pos) + ) else + Seq.Nil + end + | `Yield x -> + Seq.Cons (x, aux `Idle i pos) + in + aux `Idle pos pos + + let split ?pos ?len re s : _ Seq.t = + let seq = split_full ?pos ?len re s in + let rec filter seq () = match seq () with + | Seq.Nil -> Seq.Nil + | Seq.Cons (`Delim _, tl) -> filter tl () + | Seq.Cons (`Text s,tl) -> Seq.Cons (s, filter tl) + in filter seq +end + +module Rlist = struct + let list_of_seq (s:'a Seq.t) : 'a list = + Seq.fold_left (fun l x -> x :: l) [] s |> List.rev + + let all ?pos ?len re s = Rseq.all ?pos ?len re s |> list_of_seq + + let matches ?pos ?len re s = Rseq.matches ?pos ?len re s |> list_of_seq + + let split_full ?pos ?len re s = Rseq.split_full ?pos ?len re s |> list_of_seq + + let split ?pos ?len re s = Rseq.split ?pos ?len re s |> list_of_seq +end + +module Gen = struct + type 'a gen = unit -> 'a option + let gen_of_seq (s:'a Seq.t) : 'a gen = + let r = ref s in + fun () -> match !r () with + | Seq.Nil -> None + | Seq.Cons (x, tl) -> + r := tl; + Some x + + let split ?pos ?len re s : _ gen = + Rseq.split ?pos ?len re s |> gen_of_seq + + let split_full ?pos ?len re s : _ gen = + Rseq.split_full ?pos ?len re s |> gen_of_seq + + let all ?pos ?len re s = Rseq.all ?pos ?len re s |> gen_of_seq + + let matches ?pos ?len re s = Rseq.matches ?pos ?len re s |> gen_of_seq +end + +let replace ?(pos=0) ?len ?(all=true) re ~f s = + if pos < 0 then invalid_arg "Re.replace"; + let limit = match len with + | None -> String.length s + | Some l -> + if l<0 || pos+l > String.length s then invalid_arg "Re.replace"; + pos+l + in + (* buffer into which we write the result *) + let buf = Buffer.create (String.length s) in + (* iterate on matched substrings. *) + let rec iter pos = + if pos < limit + then + match match_str ~groups:true ~partial:false re s ~pos ~len:(limit-pos) with + | Match substr -> + let p1, p2 = Group.offset substr 0 in + (* add string between previous match and current match *) + Buffer.add_substring buf s pos (p1-pos); + (* what should we replace the matched group with? *) + let replacing = f substr in + Buffer.add_string buf replacing; + if all then + (* if we matched a non-char e.g. ^ we must manually advance by 1 *) + iter ( + if p1=p2 then ( + (* a non char could be past the end of string. e.g. $ *) + if p2 < limit then Buffer.add_char buf s.[p2]; + p2+1 + ) else + p2) + else + Buffer.add_substring buf s p2 (limit-p2) + | Running -> () + | Failed -> + Buffer.add_substring buf s pos (limit-pos) + in + iter pos; + Buffer.contents buf + +let replace_string ?pos ?len ?all re ~by s = + replace ?pos ?len ?all re s ~f:(fun _ -> by) + +let witness t = + let rec witness = function + | Set c -> String.make 1 (Char.chr (Cset.pick c)) + | Sequence xs -> String.concat "" (List.map witness xs) + | Alternative (x :: _) -> witness x + | Alternative [] -> assert false + | Repeat (r, from, _to) -> + let w = witness r in + let b = Buffer.create (String.length w * from) in + for _i=1 to from do + Buffer.add_string b w + done; + Buffer.contents b + | No_case r -> witness r + | Intersection _ + | Complement _ + | Difference (_, _) -> assert false + | Group r + | No_group r + | Nest r + | Sem (_, r) + | Pmark (_, r) + | Case r + | Sem_greedy (_, r) -> witness r + | Beg_of_line + | End_of_line + | Beg_of_word + | End_of_word + | Not_bound + | Beg_of_str + | Last_end_of_line + | Start + | Stop + | End_of_str -> "" in + witness (handle_case false t) + +type 'a seq = 'a Seq.t +module Seq = Rseq +module List = Rlist +module Group = Group + +(** {2 Deprecated functions} *) + +type 'a gen = 'a Gen.gen +let all_gen = Gen.all +let matches_gen = Gen.matches +let split_gen = Gen.split +let split_full_gen = Gen.split_full + +let all_seq = Seq.all +let matches_seq = Seq.matches +let split_seq = Seq.split +let split_full_seq = Seq.split_full + + +type substrings = Group.t + +let get = Group.get +let get_ofs = Group.offset +let get_all = Group.all +let get_all_ofs = Group.all_offset +let test = Group.test + +type markid = Mark.t + +let marked = Mark.test +let mark_set = Mark.all + +(**********************************) + +(* +Information about the previous character: +- does not exists +- is a letter +- is not a letter +- is a newline +- is last newline + +Beginning of word: +- previous is not a letter or does not exist +- current is a letter or does not exist + +End of word: +- previous is a letter or does not exist +- current is not a letter or does not exist + +Beginning of line: +- previous is a newline or does not exist + +Beginning of buffer: +- previous does not exist + +End of buffer +- current does not exist + +End of line +- current is a newline or does not exist +*) + +(* +Rep: e = T,e | () + - semantics of the comma (shortest/longest/first) + - semantics of the union (greedy/non-greedy) + +Bounded repetition + a{0,3} = (a,(a,a?)?)? +*) + +type groups = Group.t + +include Rlist diff --git a/vendor/re/src/re.mli b/vendor/re/src/core.mli similarity index 71% rename from vendor/re/src/re.mli rename to vendor/re/src/core.mli index e8063b7c6e7..a79464b5280 100644 --- a/vendor/re/src/re.mli +++ b/vendor/re/src/core.mli @@ -28,8 +28,39 @@ type t type re (** Compiled regular expression *) -type groups -(** Information about groups in a match. *) +(** Manipulate matching groups. *) +module Group : sig + type t + (** Information about groups in a match. *) + + val get : t -> int -> string + (** Raise [Not_found] if the group did not match *) + + val offset : t -> int -> int * int + (** Raise [Not_found] if the group did not match *) + + val start : t -> int -> int + (** Return the start of the match. Raise [Not_found] if the group did not match. *) + + val stop : t -> int -> int + (** Return the end of the match. Raise [Not_found] if the group did not match. *) + + val all : t -> string array + (** Return the empty string for each group which did not match *) + + val all_offset : t -> (int * int) array + (** Return [(-1,-1)] for each group which did not match *) + + val test : t -> int -> bool + (** Test whether a group matched *) + + val nb_groups : t -> int + (** Returns the total number of groups defined - matched or not. + This function is experimental. *) + + val pp : Format.formatter -> t -> unit +end +type groups = Group.t [@@ocaml.deprecated "Use Group.t"] (** {2 Compilation and execution of a regular expression} *) @@ -40,7 +71,7 @@ val compile : t -> re val exec : ?pos:int -> (* Default: 0 *) ?len:int -> (* Default: -1 (until end of string) *) - re -> string -> groups + re -> string -> Group.t (** [exec re str] matches [str] against the compiled expression [re], and returns the matched groups if any. @param pos optional beginning of the string (default 0) @@ -52,7 +83,7 @@ val exec : val exec_opt : ?pos:int -> (* Default: 0 *) ?len:int -> (* Default: -1 (until end of string) *) - re -> string -> groups option + re -> string -> Group.t option (** Similar to {!exec}, but returns an option instead of using an exception. *) val execp : @@ -68,41 +99,6 @@ val exec_partial : re -> string -> [ `Full | `Partial | `Mismatch ] (** More detailed version of {!exec_p} *) -(** Manipulate matching groups. *) -module Group : sig - - type t = groups - (** Information about groups in a match. *) - - val get : t -> int -> string - (** Raise [Not_found] if the group did not match *) - - val offset : t -> int -> int * int - (** Raise [Not_found] if the group did not match *) - - val start : t -> int -> int - (** Return the start of the match. Raise [Not_found] if the group did not match. *) - - val stop : t -> int -> int - (** Return the end of the match. Raise [Not_found] if the group did not match. *) - - val all : t -> string array - (** Return the empty string for each group which did not match *) - - val all_offset : t -> (int * int) array - (** Return [(-1,-1)] for each group which did not match *) - - val test : t -> int -> bool - (** Test whether a group matched *) - - val nb_groups : t -> int - (** Returns the total number of groups defined - matched or not. - This function is experimental. *) - - val pp : Format.formatter -> t -> unit - -end - (** Marks *) module Mark : sig @@ -124,62 +120,84 @@ end (** {2 High Level Operations} *) +type split_token = + [ `Text of string (** Text between delimiters *) + | `Delim of Group.t (** Delimiter *) + ] + +type 'a seq = 'a Seq.t + +module Seq : sig + val all : + ?pos:int -> (** Default: 0 *) + ?len:int -> + re -> string -> Group.t Seq.t + (** Same as {!all} but returns an iterator + @since NEXT_RELEASE *) + + val matches : + ?pos:int -> (** Default: 0 *) + ?len:int -> + re -> string -> string Seq.t + (** Same as {!matches}, but returns an iterator + @since NEXT_RELEASE *) + + val split : + ?pos:int -> (** Default: 0 *) + ?len:int -> + re -> string -> string Seq.t + (** @since NEXT_RELEASE *) + + val split_full : + ?pos:int -> (** Default: 0 *) + ?len:int -> + re -> string -> split_token Seq.t + (** @since NEXT_RELEASE *) +end + +val all : ?pos:int -> ?len:int -> re -> string -> Group.t list +(** Repeatedly calls {!exec} on the given string, starting at given position and + length.*) + type 'a gen = unit -> 'a option -val all : - ?pos:int -> (** Default: 0 *) - ?len:int -> - re -> string -> Group.t list -(** Repeatedly calls {!exec} on the given string, starting at given - position and length.*) +val all_gen : ?pos:int -> ?len:int -> re -> string -> Group.t gen +[@@ocaml.deprecated "Use Seq.all"] -val all_gen : - ?pos:int -> (** Default: 0 *) - ?len:int -> - re -> string -> Group.t gen -(** Same as {!all} but returns a generator *) +val all_seq : ?pos:int -> ?len:int -> re -> string -> Group.t seq +[@@ocaml.deprecated "Use Seq.all"] -val matches : - ?pos:int -> (** Default: 0 *) - ?len:int -> - re -> string -> string list -(** Same as {!all}, but extracts the matched substring rather than - returning the whole group. This basically iterates over matched - strings *) +val matches : ?pos:int -> ?len:int -> re -> string -> string list +(** Same as {!all}, but extracts the matched substring rather than returning + the whole group. This basically iterates over matched strings *) -val matches_gen : - ?pos:int -> (** Default: 0 *) - ?len:int -> - re -> string -> string gen -(** Same as {!matches}, but returns a generator. *) +val matches_gen : ?pos:int -> ?len:int -> re -> string -> string gen +[@@ocaml.deprecated "Use Seq.matches"] -val split : - ?pos:int -> (** Default: 0 *) - ?len:int -> - re -> string -> string list -(** [split re s] splits [s] into chunks separated by [re]. It yields - the chunks themselves, not the separator. For instance - this can be used with a whitespace-matching re such as ["[\t ]+"]. *) +val matches_seq : ?pos:int -> ?len:int -> re -> string -> string seq +[@@ocaml.deprecated "Use Seq.matches"] -val split_gen : - ?pos:int -> (** Default: 0 *) - ?len:int -> - re -> string -> string gen +val split : ?pos:int -> ?len:int -> re -> string -> string list +(** [split re s] splits [s] into chunks separated by [re]. It yields the chunks + themselves, not the separator. For instance this can be used with a + whitespace-matching re such as ["[\t ]+"]. *) -type split_token = - [ `Text of string (** Text between delimiters *) - | `Delim of Group.t (** Delimiter *) - ] +val split_gen : ?pos:int -> ?len:int -> re -> string -> string gen +[@@ocaml.deprecated "Use Seq.split"] -val split_full : - ?pos:int -> (** Default: 0 *) - ?len:int -> - re -> string -> split_token list +val split_seq : ?pos:int -> ?len:int -> re -> string -> string seq +[@@ocaml.deprecated "Use Seq.split"] -val split_full_gen : - ?pos:int -> (** Default: 0 *) - ?len:int -> - re -> string -> split_token gen +val split_full : ?pos:int -> ?len:int -> re -> string -> split_token list +(** [split re s] splits [s] into chunks separated by [re]. It yields the chunks + along with the separators. For instance this can be used with a + whitespace-matching re such as ["[\t ]+"]. *) + +val split_full_gen : ?pos:int -> ?len:int -> re -> string -> split_token gen +[@@ocaml.deprecated "Use Seq.split_full"] + +val split_full_seq : ?pos:int -> ?len:int -> re -> string -> split_token seq +[@@ocaml.deprecated "Use Seq.split_full"] val replace : ?pos:int -> (** Default: 0 *) @@ -372,6 +390,31 @@ val pp_re : Format.formatter -> re -> unit (** Alias for {!pp_re}. Deprecated *) val print_re : Format.formatter -> re -> unit +module View : sig + type outer + + (** A view of the top-level of a regex. This type is unstable and may change *) + type t = + Set of Cset.t + | Sequence of outer list + | Alternative of outer list + | Repeat of outer * int * int option + | Beg_of_line | End_of_line + | Beg_of_word | End_of_word | Not_bound + | Beg_of_str | End_of_str + | Last_end_of_line | Start | Stop + | Sem of Automata.sem * outer + | Sem_greedy of Automata.rep_kind * outer + | Group of outer | No_group of outer | Nest of outer + | Case of outer | No_case of outer + | Intersection of outer list + | Complement of outer list + | Difference of outer * outer + | Pmark of Pmark.t * outer + + val view : outer -> t +end with type outer := t + (** {2 Experimental functions}. *) val witness : t -> string @@ -381,28 +424,37 @@ val witness : t -> string (** {2 Deprecated functions} *) type substrings = Group.t +[@@ocaml.deprecated "Use Group.t"] (** Alias for {!Group.t}. Deprecated *) val get : Group.t -> int -> string +[@@ocaml.deprecated "Use Group.get"] (** Same as {!Group.get}. Deprecated *) val get_ofs : Group.t -> int -> int * int +[@@ocaml.deprecated "Use Group.offset"] (** Same as {!Group.offset}. Deprecated *) val get_all : Group.t -> string array +[@@ocaml.deprecated "Use Group.all"] (** Same as {!Group.all}. Deprecated *) val get_all_ofs : Group.t -> (int * int) array +[@@ocaml.deprecated "Use Group.all_offset"] (** Same as {!Group.all_offset}. Deprecated *) val test : Group.t -> int -> bool +[@@ocaml.deprecated "Use Group.test"] (** Same as {!Group.test}. Deprecated *) type markid = Mark.t +[@@ocaml.deprecated "Use Mark."] (** Alias for {!Mark.t}. Deprecated *) val marked : Group.t -> Mark.t -> bool +[@@ocaml.deprecated "Use Mark.test"] (** Same as {!Mark.test}. Deprecated *) val mark_set : Group.t -> Mark.Set.t +[@@ocaml.deprecated "Use Mark.all"] (** Same as {!Mark.all}. Deprecated *) diff --git a/vendor/re/src/re_cset.ml b/vendor/re/src/cset.ml similarity index 99% rename from vendor/re/src/re_cset.ml rename to vendor/re/src/cset.ml index 025690a4543..17e4d1d3008 100644 --- a/vendor/re/src/re_cset.ml +++ b/vendor/re/src/cset.ml @@ -102,7 +102,7 @@ let print_one ch (c1, c2) = else Format.fprintf ch "%d-%d" c1 c2 -let pp = Re_fmt.list print_one +let pp = Fmt.list print_one let rec iter t ~f = match t with diff --git a/vendor/re/src/re_cset.mli b/vendor/re/src/cset.mli similarity index 100% rename from vendor/re/src/re_cset.mli rename to vendor/re/src/cset.mli diff --git a/vendor/re/src/dune b/vendor/re/src/dune index 5f5c1ad5c5b..80196c91fec 100644 --- a/vendor/re/src/dune +++ b/vendor/re/src/dune @@ -1,5 +1,4 @@ (library - (name dune_re) + (name dune_re) (public_name dune-private-libs.dune_re) - (flags (:standard -w -50)) (synopsis "Internal Dune library, do not use!")) diff --git a/vendor/re/src/emacs.ml b/vendor/re/src/emacs.ml new file mode 100644 index 00000000000..df0bf5224fb --- /dev/null +++ b/vendor/re/src/emacs.ml @@ -0,0 +1,124 @@ +(* + RE - A regular expression library + + Copyright (C) 2001 Jerome Vouillon + email: Jerome.Vouillon@pps.jussieu.fr + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation, with + linking exception; either version 2.1 of the License, or (at + your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +*) + +module Re = Core + +exception Parse_error +exception Not_supported + +let parse s = + let i = ref 0 in + let l = String.length s in + let eos () = !i = l in + let test c = not (eos ()) && s.[!i] = c in + let test2 c c' = !i + 1 < l && s.[!i] = c && s.[!i + 1] = c' in + let accept c = let r = test c in if r then incr i; r in + let accept2 c c' = let r = test2 c c' in if r then i := !i + 2; r in + let get () = let r = s.[!i] in incr i; r in + + let rec regexp () = regexp' (branch ()) + and regexp' left = + if accept2 '\\' '|' then regexp' (Re.alt [left; branch ()]) else left + and branch () = branch' [] + and branch' left = + if eos () || test2 '\\' '|' || test2 '\\' ')' then Re.seq (List.rev left) + else branch' (piece () :: left) + and piece () = + let r = atom () in + if accept '*' then Re.rep r else + if accept '+' then Re.rep1 r else + if accept '?' then Re.opt r else + r + and atom () = + if accept '.' then begin + Re.notnl + end else if accept '^' then begin + Re.bol + end else if accept '$' then begin + Re.eol + end else if accept '[' then begin + if accept '^' then + Re.compl (bracket []) + else + Re.alt (bracket []) + end else if accept '\\' then begin + if accept '(' then begin + let r = regexp () in + if not (accept2 '\\' ')') then raise Parse_error; + Re.group r + end else if accept '`' then + Re.bos + else if accept '\'' then + Re.eos + else if accept '=' then + Re.start + else if accept 'b' then + Re.alt [Re.bow; Re.eow] + else if accept 'B' then + Re.not_boundary + else if accept '<' then + Re.bow + else if accept '>' then + Re.eow + else if accept 'w' then + Re.alt [Re.alnum; Re.char '_'] + else if accept 'W' then + Re.compl [Re.alnum; Re.char '_'] + else begin + if eos () then raise Parse_error; + match get () with + '*' | '+' | '?' | '[' | ']' | '.' | '^' | '$' | '\\' as c -> + Re.char c + | '0' .. '9' -> + raise Not_supported + | _ -> + raise Parse_error + end + end else begin + if eos () then raise Parse_error; + match get () with + '*' | '+' | '?' -> raise Parse_error + | c -> Re.char c + end + and bracket s = + if s <> [] && accept ']' then s else begin + let c = char () in + if accept '-' then begin + if accept ']' then Re.char c :: Re.char '-' :: s else begin + let c' = char () in + bracket (Re.rg c c' :: s) + end + end else + bracket (Re.char c :: s) + end + and char () = + if eos () then raise Parse_error; + get () + in + let res = regexp () in + if not (eos ()) then raise Parse_error; + res + +let re ?(case = true) s = let r = parse s in if case then r else Re.no_case r + +let compile = Re.compile +let compile_pat ?(case = true) s = compile (re ~case s) diff --git a/vendor/re/src/emacs.mli b/vendor/re/src/emacs.mli new file mode 100644 index 00000000000..e8b774dc43a --- /dev/null +++ b/vendor/re/src/emacs.mli @@ -0,0 +1,37 @@ +(* + RE - A regular expression library + + Copyright (C) 2001 Jerome Vouillon + email: Jerome.Vouillon@pps.jussieu.fr + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation, with + linking exception; either version 2.1 of the License, or (at + your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +*) + +(** Emacs-style regular expressions *) + +exception Parse_error +exception Not_supported +(** Errors that can be raised during the parsing of the regular expression *) + +val re : ?case:bool -> string -> Core.t +(** Parsing of an Emacs-style regular expression *) + +val compile : Core.t -> Core.re +(** Regular expression compilation *) + +val compile_pat : ?case:bool -> string -> Core.re +(** Same as [Core.compile] *) + diff --git a/vendor/re/src/fmt.ml b/vendor/re/src/fmt.ml new file mode 100644 index 00000000000..ca303f0ed61 --- /dev/null +++ b/vendor/re/src/fmt.ml @@ -0,0 +1,47 @@ +(** Very small tooling for format printers. *) + +include Format + +type 'a t = Format.formatter -> 'a -> unit + +(* Only in the stdlib since 4.02, so we copy. *) +let rec list ?(pp_sep = pp_print_cut) pp ppf = function + | [] -> () + | [v] -> pp ppf v + | v :: vs -> + pp ppf v; + pp_sep ppf (); + list ~pp_sep pp ppf vs + +(* want this name to make sure we don't use pp_print_list from stdlib + accidentally *) +let pp_print_list = list + +let str = pp_print_string +let sexp fmt s pp x = fprintf fmt "@[<3>(%s@ %a)@]" s pp x +let pair pp1 pp2 fmt (v1,v2) = + pp1 fmt v1; pp_print_space fmt () ; pp2 fmt v2 +let triple pp1 pp2 pp3 fmt (v1, v2, v3) = + pp1 fmt v1; pp_print_space fmt () ; + pp2 fmt v2; pp_print_space fmt () ; + pp3 fmt v3 +let int = pp_print_int +let optint fmt = function + | None -> () + | Some i -> fprintf fmt "@ %d" i + +let quote fmt s = Format.fprintf fmt "\"%s\"" s + +let pp_olist pp_elem fmt = + Format.fprintf fmt "@[<3>[@ %a@ ]@]" + (pp_print_list + ~pp_sep:(fun fmt () -> fprintf fmt ";@ ") + pp_elem) + +let pp_str_list = pp_olist quote + +let to_to_string pp x = + let b = Buffer.create 16 in + let fmt = Format.formatter_of_buffer b in + pp fmt x; + Buffer.contents b diff --git a/vendor/re/src/glob.ml b/vendor/re/src/glob.ml new file mode 100644 index 00000000000..86f03408a90 --- /dev/null +++ b/vendor/re/src/glob.ml @@ -0,0 +1,289 @@ +(* + RE - A regular expression library + + Copyright (C) 2001 Jerome Vouillon + email: Jerome.Vouillon@pps.jussieu.fr + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation, with + linking exception; either version 2.1 of the License, or (at + your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +*) + +module Re = Core + +exception Parse_error + +type enclosed = + | Char of char + | Range of char * char + +type piece = + | Exactly of char + | Any_of of enclosed list + | Any_but of enclosed list + | One + | Many + +type t = piece list + +let of_string s : t = + let i = ref 0 in + let l = String.length s in + let eos () = !i = l in + let read c = + let r = not (eos ()) && s.[!i] = c in + if r then incr i; + r + in + + let char () = + ignore (read '\\' : bool); + if eos () then raise Parse_error; + let r = s.[!i] in + incr i; + r + in + + let enclosed () : enclosed list = + let rec loop s = + (* This returns the list in reverse order, but order isn't important anyway *) + if s <> [] && read ']' + then s + else + let c = char () in + if not (read '-') + then loop (Char c :: s) + else if read ']' + then Char c :: Char '-' :: s + else + let c' = char () in + loop (Range (c, c') :: s) + in + loop [] + in + + let piece () = + if read '*' + then Many + else if read '?' + then One + else if not (read '[') + then Exactly (char ()) + else if read '^' || read '!' + then Any_but (enclosed ()) + else Any_of (enclosed ()) + in + + let rec loop pieces = + if eos () + then List.rev pieces + else loop (piece () :: pieces) + in + + loop [] + +let mul l l' = + List.flatten (List.map (fun s -> List.map (fun s' -> s ^ s') l') l) + +let explode str = + let l = String.length str in + let rec expl inner s i acc beg = + if i >= l then begin + if inner then raise Parse_error; + (mul beg [String.sub str s (i - s)], i) + end else + match str.[i] with + | '\\' -> expl inner s (i + 2) acc beg + | '{' -> + let (t, i') = expl true (i + 1) (i + 1) [] [""] in + expl inner i' i' acc + (mul beg (mul [String.sub str s (i - s)] t)) + | ',' when inner -> + expl inner (i + 1) (i + 1) + (mul beg [String.sub str s (i - s)] @ acc) [""] + | '}' when inner -> + (mul beg [String.sub str s (i - s)] @ acc, i + 1) + | _ -> + expl inner s (i + 1) acc beg + in + List.rev (fst (expl false 0 0 [] [""])) + +module State = struct + type t = { + re_pieces : Re.t list; (* last piece at head of list. *) + remaining : piece list; (* last piece at tail of list. *) + am_at_start_of_pattern : bool; (* true at start of pattern *) + am_at_start_of_component : bool; (* true at start of pattern or immediately + after '/' *) + pathname : bool; + period : bool; + } + + let create ~period ~pathname remaining = + { + re_pieces = []; + am_at_start_of_pattern = true; + am_at_start_of_component = true; + pathname; + period; + remaining; + } + + let explicit_period t = + t.period && ( + t.am_at_start_of_pattern || + (t.am_at_start_of_component && t.pathname) + ) + + let explicit_slash t = t.pathname + + let append ?(am_at_start_of_component=false) t piece = + { t with + re_pieces = piece :: t.re_pieces; + am_at_start_of_pattern = false; + am_at_start_of_component; + } + + let to_re t = Re.seq (List.rev t.re_pieces) + + let next t = + match t.remaining with + | [] -> None + | piece :: remaining -> Some (piece, { t with remaining }) +end + +let one ~explicit_slash ~explicit_period = + Re.compl ( + List.concat [ + if explicit_slash then [Re.char '/'] else []; + if explicit_period then [Re.char '.'] else []; + ] + ) + +let enclosed enclosed = + match enclosed with + | Char c -> Re.char c + | Range (low, high) -> Re.rg low high + +let enclosed_set ~explicit_slash ~explicit_period kind set = + let set = List.map enclosed set in + let enclosure = + match kind with + | `Any_of -> Re.alt set + | `Any_but -> Re.compl set + in + Re.inter [enclosure; one ~explicit_slash ~explicit_period] + +let exactly state c = + State.append state (Re.char c) ~am_at_start_of_component:(c = '/') + +let many (state : State.t) = + let explicit_slash = State.explicit_slash state in + let explicit_period = State.explicit_period state in + (* Whether we must explicitly match period depends on the surrounding characters, but + slashes are easy to explicit match. This conditional splits out some simple cases. + *) + if not explicit_period then begin + State.append state (Re.rep (one ~explicit_slash ~explicit_period)) + end else if not explicit_slash then begin + (* In this state, we explicitly match periods only at the very beginning *) + State.append state (Re.opt ( + Re.seq [ + one ~explicit_slash:false ~explicit_period; + Re.rep (one ~explicit_slash:false ~explicit_period:false); + ] + )) + end else begin + let not_empty = + Re.seq [ + one ~explicit_slash:true ~explicit_period:true; + Re.rep (one ~explicit_slash:true ~explicit_period:false); + ] + in + (* [maybe_empty] is the default translation of Many, except in some special cases. + *) + let maybe_empty = Re.opt not_empty in + let enclosed_set state kind set = + State.append state (Re.alt [ + enclosed_set kind set ~explicit_slash:true ~explicit_period:true; + Re.seq [ + not_empty; + (* Since [not_empty] matched, subsequent dots are not leading. *) + enclosed_set kind set ~explicit_slash:true ~explicit_period:false; + ]; + ]) + in + let rec lookahead state = + match State.next state with + | None -> State.append state maybe_empty + (* glob ** === glob * . *) + | Some (Many, state) -> lookahead state + | Some (Exactly c, state) -> + let state = + State.append state + (if c = '.' + then not_empty + else maybe_empty) + in + exactly state c + (* glob *? === glob ?* *) + | Some (One, state) -> State.append state not_empty + | Some (Any_of enclosed, state) -> enclosed_set state `Any_of enclosed + | Some (Any_but enclosed, state) -> enclosed_set state `Any_but enclosed + in + lookahead state + end + +let piece state piece = + let explicit_slash = State.explicit_slash state in + let explicit_period = State.explicit_period state in + match piece with + | One -> State.append state (one ~explicit_slash ~explicit_period) + | Many -> many state + | Any_of enclosed -> + State.append state (enclosed_set `Any_of ~explicit_slash ~explicit_period enclosed) + | Any_but enclosed -> + State.append state (enclosed_set `Any_but ~explicit_slash ~explicit_period enclosed) + | Exactly c -> exactly state c + +let glob ~pathname ~period glob = + let rec loop state = + match State.next state with + | None -> State.to_re state + | Some (p, state) -> loop (piece state p) + in + loop (State.create ~pathname ~period glob) + +let glob + ?(anchored = false) + ?(pathname = true) + ?(period = true) + ?(expand_braces = false) + s + = + let to_re s = + let re = glob ~pathname ~period (of_string s) in + if anchored + then Re.whole_string re + else re + in + if expand_braces + then Re.alt (List.map to_re (explode s)) + else to_re s + +let glob' ?anchored period s = glob ?anchored ~period s + +let globx ?anchored s = glob ?anchored ~expand_braces:true s + +let globx' ?anchored period s = glob ?anchored ~expand_braces:true ~period s diff --git a/vendor/re/src/glob.mli b/vendor/re/src/glob.mli new file mode 100644 index 00000000000..607ee48c922 --- /dev/null +++ b/vendor/re/src/glob.mli @@ -0,0 +1,77 @@ +(* + RE - A regular expression library + + Copyright (C) 2001 Jerome Vouillon + email: Jerome.Vouillon@pps.jussieu.fr + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation, with + linking exception; either version 2.1 of the License, or (at + your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +*) + +(** Shell-style regular expressions *) + +exception Parse_error + +val glob : + ?anchored:bool -> + ?pathname:bool -> + ?period:bool -> + ?expand_braces:bool -> + string -> + Core.t +(** Implements the semantics of shells patterns. The returned regular + expression is unanchored by default. + + Character '*' matches any sequence of characters and character + '?' matches a single character. + A sequence '[...]' matches any one of the enclosed characters. + A sequence '[^...]' or '[!...]' matches any character *but* the enclosed characters. + A backslash escapes the following character. The last character of the string cannot + be a backslash. + + [anchored] controls whether the regular expression will only match entire + strings. Defaults to false. + + [pathname]: If this flag is set, match a slash in string only with a slash in pattern + and not by an asterisk ('*') or a question mark ('?') metacharacter, nor by a bracket + expression ('[]') containing a slash. Defaults to true. + + [period]: If this flag is set, a leading period in string has to be matched exactly by + a period in pattern. A period is considered to be leading if it is the first + character in string, or if both [pathname] is set and the period immediately follows a + slash. Defaults to true. + + If [expand_braces] is true, braced sets will expand into multiple globs, + e.g. a\{x,y\}b\{1,2\} matches axb1, axb2, ayb1, ayb2. As specified for bash, brace + expansion is purely textual and can be nested. Defaults to false. *) + +val glob' : ?anchored:bool -> bool -> string -> Core.t +(** Same, but allows to choose whether dots at the beginning of a + file name need to be explicitly matched (true) or not (false) + + @deprecated Use [glob ~period]. +*) + +val globx : ?anchored:bool -> string -> Core.t +(** This version of [glob] also recognizes the pattern \{..,..\} + + @deprecated Prefer [glob ~expand_braces:true]. +*) + +val globx' : ?anchored:bool -> bool -> string -> Core.t +(** This version of [glob'] also recognizes the pattern \{..,..\} + + @deprecated Prefer [glob ~expand_braces:true ~period]. +*) diff --git a/vendor/re/src/group.ml b/vendor/re/src/group.ml new file mode 100644 index 00000000000..dc43a0197f7 --- /dev/null +++ b/vendor/re/src/group.ml @@ -0,0 +1,73 @@ +(* Result of a successful match. *) +type t = + { s : string + ; marks : Automata.mark_infos + ; pmarks : Pmark.Set.t + ; gpos : int array + ; gcount : int + } + +let offset t i = + if 2 * i + 1 >= Array.length t.marks then raise Not_found; + let m1 = t.marks.(2 * i) in + if m1 = -1 then raise Not_found; + let p1 = t.gpos.(m1) - 1 in + let p2 = t.gpos.(t.marks.(2 * i + 1)) - 1 in + (p1, p2) + +let get t i = + let (p1, p2) = offset t i in + String.sub t.s p1 (p2 - p1) + +let start subs i = fst (offset subs i) + +let stop subs i = snd (offset subs i) + +let test t i = + if 2 * i >= Array.length t.marks then + false + else + let idx = t.marks.(2 * i) in + idx <> -1 + +let dummy_offset = (-1, -1) + +let all_offset t = + let res = Array.make t.gcount dummy_offset in + for i = 0 to Array.length t.marks / 2 - 1 do + let m1 = t.marks.(2 * i) in + if m1 <> -1 then begin + let p1 = t.gpos.(m1) in + let p2 = t.gpos.(t.marks.(2 * i + 1)) in + res.(i) <- (p1 - 1, p2 - 1) + end + done; + res + +let dummy_string = "" + +let all t = + let res = Array.make t.gcount dummy_string in + for i = 0 to Array.length t.marks / 2 - 1 do + let m1 = t.marks.(2 * i) in + if m1 <> -1 then begin + let p1 = t.gpos.(m1) in + let p2 = t.gpos.(t.marks.(2 * i + 1)) in + res.(i) <- String.sub t.s (p1 - 1) (p2 - p1) + end + done; + res + +let pp fmt t = + let matches = + let offsets = all_offset t in + let strs = all t in + Array.to_list ( + Array.init (Array.length strs) (fun i -> strs.(i), offsets.(i)) + ) in + let open Fmt in + let pp_match fmt (str, (start, stop)) = + fprintf fmt "@[(%s (%d %d))@]" str start stop in + sexp fmt "Group" (list pp_match) matches + +let nb_groups t = t.gcount diff --git a/vendor/re/src/group.mli b/vendor/re/src/group.mli new file mode 100644 index 00000000000..7ff2ccbb35c --- /dev/null +++ b/vendor/re/src/group.mli @@ -0,0 +1,52 @@ +(* Result of a successful match. *) +type t = + { s : string + (* Input string. Matched strings are substrings of s *) + + ; marks : Automata.mark_infos + (* Mapping from group indices to positions in gpos. group i has positions 2*i + - 1, 2*i + 1 in gpos. If the group wasn't matched, then its corresponding + values in marks will be -1,-1 *) + + ; pmarks : Pmark.Set.t + (* Marks positions. i.e. those marks created with Re.marks *) + + ; gpos : int array + (* Group positions. Adjacent elements are (start, stop) of group match. + indexed by the values in marks. So group i in an re would be the substring: + + start = t.gpos.(marks.(2*i)) - 1 + stop = t.gpos.(marks.(2*i + 1)) - 1 *) + + ; gcount : int + (* Number of groups the regular expression contains. Matched or not *) + } + +(** Information about groups in a match. *) + +val get : t -> int -> string +(** Raise [Not_found] if the group did not match *) + +val offset : t -> int -> int * int +(** Raise [Not_found] if the group did not match *) + +val start : t -> int -> int +(** Return the start of the match. Raise [Not_found] if the group did not match. *) + +val stop : t -> int -> int +(** Return the end of the match. Raise [Not_found] if the group did not match. *) + +val all : t -> string array +(** Return the empty string for each group which did not match *) + +val all_offset : t -> (int * int) array +(** Return [(-1,-1)] for each group which did not match *) + +val test : t -> int -> bool +(** Test whether a group matched *) + +val nb_groups : t -> int +(** Returns the total number of groups defined - matched or not. + This function is experimental. *) + +val pp : Format.formatter -> t -> unit diff --git a/vendor/re/src/pcre.ml b/vendor/re/src/pcre.ml new file mode 100644 index 00000000000..43e60e5960d --- /dev/null +++ b/vendor/re/src/pcre.ml @@ -0,0 +1,119 @@ +module Re = Core + +type regexp = Re.re + +type flag = [ `CASELESS | `MULTILINE | `ANCHORED ] + +type split_result = + | Text of string + | Delim of string + | Group of int * string + | NoGroup + +type groups = Core.Group.t + +let re ?(flags = []) pat = + let opts = List.map (function + | `CASELESS -> `Caseless + | `MULTILINE -> `Multiline + | `ANCHORED -> `Anchored + ) flags in + Perl.re ~opts pat + +let regexp ?flags pat = Re.compile (re ?flags pat) + +let extract ~rex s = + Re.Group.all (Re.exec rex s) + +let exec ~rex ?pos s = + Re.exec rex ?pos s + +let get_substring s i = + Re.Group.get s i + +let get_substring_ofs s i = + Re.Group.offset s i + +let pmatch ~rex s = + Re.execp rex s + +let substitute ~rex ~subst str = + let b = Buffer.create 1024 in + let rec loop pos = + if pos >= String.length str then + Buffer.contents b + else if Re.execp ~pos rex str then ( + let ss = Re.exec ~pos rex str in + let start, fin = Re.Group.offset ss 0 in + let pat = Re.Group.get ss 0 in + Buffer.add_substring b str pos (start - pos); + Buffer.add_string b (subst pat); + loop fin + ) else ( + Buffer.add_substring b str pos (String.length str - pos); + loop (String.length str) + ) + in + loop 0 + +let split ~rex str = + let rec loop accu pos = + if pos >= String.length str then + List.rev accu + else if Re.execp ~pos rex str then ( + let ss = Re.exec ~pos rex str in + let start, fin = Re.Group.offset ss 0 in + let s = String.sub str pos (start - pos) in + loop (s :: accu) fin + ) else ( + let s = String.sub str pos (String.length str - pos) in + loop (s :: accu) (String.length str) + ) in + loop [] 0 + +(* From PCRE *) +let string_unsafe_sub s ofs len = + let r = Bytes.create len in + Bytes.unsafe_blit s ofs r 0 len; + Bytes.unsafe_to_string r + +let quote s = + let len = String.length s in + let buf = Bytes.create (len lsl 1) in + let pos = ref 0 in + for i = 0 to len - 1 do + match String.unsafe_get s i with + | '\\' | '^' | '$' | '.' | '[' | '|' + | '(' | ')' | '?' | '*' | '+' | '{' as c -> + Bytes.unsafe_set buf !pos '\\'; + incr pos; + Bytes.unsafe_set buf !pos c; incr pos + | c -> Bytes.unsafe_set buf !pos c; incr pos + done; + string_unsafe_sub buf 0 !pos + +let full_split ?(max=0) ~rex s = + if String.length s = 0 then [] + else if max = 1 then [Text s] + else + let results = Re.split_full rex s in + let matches = + List.map (function + | `Text s -> [Text s] + | `Delim d -> + let matches = Re.Group.all_offset d in + let delim = Re.Group.get d 0 in + (Delim delim)::( + let l = ref [] in + for i = 1 to Array.length matches - 1 do + l := + (if matches.(i) = (-1, -1) + then NoGroup + else Group (i, Re.Group.get d i)) + ::(!l) + done; + List.rev !l)) results in + List.concat matches + + +type substrings = Group.t diff --git a/vendor/re/src/pcre.mli b/vendor/re/src/pcre.mli new file mode 100644 index 00000000000..81e56519eba --- /dev/null +++ b/vendor/re/src/pcre.mli @@ -0,0 +1,45 @@ +type regexp = Core.re + +type flag = [ `CASELESS | `MULTILINE | `ANCHORED ] + +type groups = Core.Group.t + +(** Result of a {!Pcre.full_split} *) +type split_result = + | Text of string (** Text part of splitted string *) + | Delim of string (** Delimiter part of splitted string *) + | Group of int * string (** Subgroup of matched delimiter (subgroup_nr, subgroup_str) *) + | NoGroup (** Unmatched subgroup *) + +val re : ?flags:(flag list) -> string -> Core.t +(** [re ~flags s] creates the regexp [s] using the pcre syntax. *) + +val regexp : ?flags:(flag list) -> string -> regexp +(** [re ~flags s] compiles the regexp [s] using the pcre syntax. *) + +val extract : rex:regexp -> string -> string array +(** [extract ~rex s] executes [rex] on [s] and returns the matching groups. *) + +val exec : rex:regexp -> ?pos:int -> string -> groups +(** Equivalent to {!Core.exec}. *) + +val get_substring : groups -> int -> string +(** Equivalent to {!Core.Group.get}. *) + +val get_substring_ofs : groups -> int -> int * int +(** Equivalent to {!Core.Group.offset}. *) + +val pmatch : rex:regexp -> string -> bool +(** Equivalent to {!Core.execp}. *) + +val substitute : rex:Core.re -> subst:(string -> string) -> string -> string + +val full_split : ?max:int -> rex:regexp -> string -> split_result list + +val split : rex:regexp -> string -> string list + +val quote : string -> string + +(** {2 Deprecated} *) + +type substrings = Group.t diff --git a/vendor/re/src/perl.ml b/vendor/re/src/perl.ml new file mode 100644 index 00000000000..aba49958ab3 --- /dev/null +++ b/vendor/re/src/perl.ml @@ -0,0 +1,272 @@ +(* + RE - A regular expression library + + Copyright (C) 2001 Jerome Vouillon + email: Jerome.Vouillon@pps.jussieu.fr + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation, with + linking exception; either version 2.1 of the License, or (at + your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +*) + +module Re = Core + +exception Parse_error +exception Not_supported + +let posix_class_of_string = function + | "alnum" -> Re.alnum + | "ascii" -> Re.ascii + | "blank" -> Re.blank + | "cntrl" -> Re.cntrl + | "digit" -> Re.digit + | "lower" -> Re.lower + | "print" -> Re.print + | "space" -> Re.space + | "upper" -> Re.upper + | "word" -> Re.wordc + | "punct" -> Re.punct + | "graph" -> Re.graph + | "xdigit" -> Re.xdigit + | class_ -> invalid_arg ("Invalid pcre class: " ^ class_) + +let posix_class_strings = + [ "alnum" ; "ascii" ; "blank" + ; "cntrl" ; "digit" ; "lower" + ; "print" ; "space" ; "upper" + ; "word" ; "punct" ; "graph" + ; "xdigit" ] + +let parse multiline dollar_endonly dotall ungreedy s = + let i = ref 0 in + let l = String.length s in + let eos () = !i = l in + let test c = not (eos ()) && s.[!i] = c in + let accept c = let r = test c in if r then incr i; r in + let accept_s s' = + let len = String.length s' in + try + for j = 0 to len - 1 do + try if s'.[j] <> s.[!i + j] then raise Exit + with _ -> raise Exit + done; + i := !i + len; + true + with Exit -> false in + let get () = let r = s.[!i] in incr i; r in + let unget () = decr i in + let greedy_mod r = + let gr = accept '?' in + let gr = if ungreedy then not gr else gr in + if gr then Re.non_greedy r else Re.greedy r + in + let rec regexp () = regexp' (branch ()) + and regexp' left = + if accept '|' then regexp' (Re.alt [left; branch ()]) else left + and branch () = branch' [] + and branch' left = + if eos () || test '|' || test ')' then Re.seq (List.rev left) + else branch' (piece () :: left) + and piece () = + let r = atom () in + if accept '*' then greedy_mod (Re.rep r) else + if accept '+' then greedy_mod (Re.rep1 r) else + if accept '?' then greedy_mod (Re.opt r) else + if accept '{' then + match integer () with + Some i -> + let j = if accept ',' then integer () else Some i in + if not (accept '}') then raise Parse_error; + begin match j with + Some j when j < i -> raise Parse_error | _ -> () + end; + greedy_mod (Re.repn r i j) + | None -> + unget (); r + else + r + and atom () = + if accept '.' then begin + if dotall then Re.any else Re.notnl + end else if accept '(' then begin + if accept '?' then begin + if accept ':' then begin + let r = regexp () in + if not (accept ')') then raise Parse_error; + r + end else if accept '#' then begin + comment () + end else + raise Parse_error + end else begin + let r = regexp () in + if not (accept ')') then raise Parse_error; + Re.group r + end + end else + if accept '^' then begin + if multiline then Re.bol else Re.bos + end else if accept '$' then begin + if multiline then Re.eol else if dollar_endonly then Re.leol else Re.eos + end else if accept '[' then begin + if accept '^' then + Re.compl (bracket []) + else + Re.alt (bracket []) + end else if accept '\\' then begin +(* XXX + - Back-references + - \cx (control-x), \e, \f, \n, \r, \t, \xhh, \ddd +*) + if eos () then raise Parse_error; + match get () with + 'w' -> + Re.alt [Re.alnum; Re.char '_'] + | 'W' -> + Re.compl [Re.alnum; Re.char '_'] + | 's' -> + Re.space + | 'S' -> + Re.compl [Re.space] + | 'd' -> + Re.digit + | 'D' -> + Re.compl [Re.digit] + | 'b' -> + Re.alt [Re.bow; Re.eow] + | 'B' -> + Re.not_boundary + | 'A' -> + Re.bos + | 'Z' -> + Re.leol + | 'z' -> + Re.eos + | 'G' -> + Re.start + | 'a'..'z' | 'A'..'Z' -> + raise Parse_error + | '0'..'9' -> + raise Not_supported + | c -> + Re.char c + end else begin + if eos () then raise Parse_error; + match get () with + '*' | '+' | '?' | '{' | '\\' -> raise Parse_error + | c -> Re.char c + end + and integer () = + if eos () then None else + match get () with + '0'..'9' as d -> integer' (Char.code d - Char.code '0') + | _ -> unget (); None + and integer' i = + if eos () then Some i else + match get () with + '0'..'9' as d -> + let i' = 10 * i + (Char.code d - Char.code '0') in + if i' < i then raise Parse_error; + integer' i' + | _ -> + unget (); Some i + and bracket s = + if s <> [] && accept ']' then s else begin + match char () with + | `Char c -> + if accept '-' then begin + if accept ']' then Re.char c :: Re.char '-' :: s else begin + match char () with + `Char c' -> + bracket (Re.rg c c' :: s) + | `Set st' -> + bracket (Re.char c :: Re.char '-' :: st' :: s) + end + end else + bracket (Re.char c :: s) + | `Set st -> bracket (st :: s) + end + and char () = + if eos () then raise Parse_error; + let c = get () in + if c = '[' then begin + if accept '=' then raise Not_supported; + if accept ':' then + let compl = accept '^' in + let cls = + try List.find accept_s posix_class_strings + with Not_found -> raise Parse_error in + if not (accept_s ":]") then raise Parse_error; + let re = + let posix_class = posix_class_of_string cls in + if compl then Re.compl [posix_class] else posix_class in + `Set (re) + else if accept '.' then begin + if eos () then raise Parse_error; + let c = get () in + if not (accept '.') then raise Not_supported; + if not (accept ']') then raise Parse_error; + `Char c + end else + `Char c + end else if c = '\\' then begin + if eos () then raise Parse_error; + let c = get () in +(* XXX + \127, ... +*) + match c with + 'b' -> `Char '\008' + | 'n' -> `Char '\n' (*XXX*) + | 'r' -> `Char '\r' (*XXX*) + | 't' -> `Char '\t' (*XXX*) + | 'w' -> `Set (Re.alt [Re.alnum; Re.char '_']) + | 'W' -> `Set (Re.compl [Re.alnum; Re.char '_']) + | 's' -> `Set (Re.space) + | 'S' -> `Set (Re.compl [Re.space]) + | 'd' -> `Set (Re.digit) + | 'D' -> `Set (Re.compl [Re.digit]) + | 'a'..'z' | 'A'..'Z' -> + raise Parse_error + | '0'..'9' -> + raise Not_supported + | _ -> + `Char c + end else + `Char c + and comment () = + if eos () then raise Parse_error; + if accept ')' then Re.epsilon else begin incr i; comment () end + in + let res = regexp () in + if not (eos ()) then raise Parse_error; + res + +type opt = + [ `Ungreedy | `Dotall | `Dollar_endonly + | `Multiline | `Anchored | `Caseless ] + +let re ?(opts = []) s = + let r = + parse + (List.memq `Multiline opts) (List.memq `Dollar_endonly opts) + (List.memq `Dotall opts) (List.memq `Ungreedy opts) + s + in + let r = if List.memq `Anchored opts then Re.seq [Re.start; r] else r in + let r = if List.memq `Caseless opts then Re.no_case r else r in + r + +let compile = Re.compile +let compile_pat ?(opts = []) s = compile (re ~opts s) diff --git a/vendor/re/src/perl.mli b/vendor/re/src/perl.mli new file mode 100644 index 00000000000..08b1c6bdcd1 --- /dev/null +++ b/vendor/re/src/perl.mli @@ -0,0 +1,41 @@ +(* + RE - A regular expression library + + Copyright (C) 2001 Jerome Vouillon + email: Jerome.Vouillon@pps.jussieu.fr + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation, with + linking exception; either version 2.1 of the License, or (at + your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +*) + +(** Perl-style regular expressions *) + +exception Parse_error +exception Not_supported +(** Errors that can be raised during the parsing of the regular expression *) + + +type opt = + [ `Ungreedy | `Dotall | `Dollar_endonly + | `Multiline | `Anchored | `Caseless ] + +val re : ?opts:opt list -> string -> Core.t +(** Parsing of a Perl-style regular expression *) + +val compile : Core.t -> Core.re +(** (Same as [Re.compile]) *) + +val compile_pat : ?opts:opt list -> string -> Core.re +(** Regular expression compilation *) diff --git a/vendor/re/src/pmark.ml b/vendor/re/src/pmark.ml new file mode 100644 index 00000000000..db4671206d6 --- /dev/null +++ b/vendor/re/src/pmark.ml @@ -0,0 +1,13 @@ + +module Pmark = struct + type t = int + let equal (x : int) (y : int) = x = y + let compare (x : int) (y : int) = compare x y + let r = ref 0 + let gen () = incr r ; !r + + let pp = Format.pp_print_int +end + +include Pmark +module Set = Set.Make(Pmark) diff --git a/vendor/re/src/pmark.mli b/vendor/re/src/pmark.mli new file mode 100644 index 00000000000..8c5e55ff223 --- /dev/null +++ b/vendor/re/src/pmark.mli @@ -0,0 +1,8 @@ + +type t = private int +val equal : t -> t -> bool +val compare : t -> t -> int +val gen : unit -> t +val pp : Format.formatter -> t -> unit + +module Set : Set.S with type elt = t diff --git a/vendor/re/src/posix.ml b/vendor/re/src/posix.ml new file mode 100644 index 00000000000..16a311b7df5 --- /dev/null +++ b/vendor/re/src/posix.ml @@ -0,0 +1,156 @@ +(* + RE - A regular expression library + + Copyright (C) 2001 Jerome Vouillon + email: Jerome.Vouillon@pps.jussieu.fr + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation, with + linking exception; either version 2.1 of the License, or (at + your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +*) + +(* +What we could (should?) do: +- a* ==> longest ((shortest (no_group a)* ), a | ()) (!!!) +- abc understood as (ab)c +- "((a?)|b)" against "ab" should not bind the first subpattern to anything + +Note that it should be possible to handle "(((ab)c)d)e" efficiently +*) +module Re = Core + +exception Parse_error +exception Not_supported + +let parse newline s = + let i = ref 0 in + let l = String.length s in + let eos () = !i = l in + let test c = not (eos ()) && s.[!i] = c in + let accept c = let r = test c in if r then incr i; r in + let get () = let r = s.[!i] in incr i; r in + let unget () = decr i in + + let rec regexp () = regexp' (branch ()) + and regexp' left = + if accept '|' then regexp' (Re.alt [left; branch ()]) else left + and branch () = branch' [] + and branch' left = + if eos () || test '|' || test ')' then Re.seq (List.rev left) + else branch' (piece () :: left) + and piece () = + let r = atom () in + if accept '*' then Re.rep (Re.nest r) else + if accept '+' then Re.rep1 (Re.nest r) else + if accept '?' then Re.opt r else + if accept '{' then + match integer () with + Some i -> + let j = if accept ',' then integer () else Some i in + if not (accept '}') then raise Parse_error; + begin match j with + Some j when j < i -> raise Parse_error | _ -> () + end; + Re.repn (Re.nest r) i j + | None -> + unget (); r + else + r + and atom () = + if accept '.' then begin + if newline then Re.notnl else Re.any + end else if accept '(' then begin + let r = regexp () in + if not (accept ')') then raise Parse_error; + Re.group r + end else + if accept '^' then begin + if newline then Re.bol else Re.bos + end else if accept '$' then begin + if newline then Re.eol else Re.eos + end else if accept '[' then begin + if accept '^' then + Re.diff (Re.compl (bracket [])) (Re.char '\n') + else + Re.alt (bracket []) + end else + if accept '\\' then begin + if eos () then raise Parse_error; + match get () with + '|' | '(' | ')' | '*' | '+' | '?' + | '[' | '.' | '^' | '$' | '{' | '\\' as c -> Re.char c + | _ -> raise Parse_error + end else begin + if eos () then raise Parse_error; + match get () with + '*' | '+' | '?' | '{' | '\\' -> raise Parse_error + | c -> Re.char c + end + and integer () = + if eos () then None else + match get () with + '0'..'9' as d -> integer' (Char.code d - Char.code '0') + | _ -> unget (); None + and integer' i = + if eos () then Some i else + match get () with + '0'..'9' as d -> + let i' = 10 * i + (Char.code d - Char.code '0') in + if i' < i then raise Parse_error; + integer' i' + | _ -> + unget (); Some i + and bracket s = + if s <> [] && accept ']' then s else begin + let c = char () in + if accept '-' then begin + if accept ']' then Re.char c :: Re.char '-' :: s else begin + let c' = char () in + bracket (Re.rg c c' :: s) + end + end else + bracket (Re.char c :: s) + end + and char () = + if eos () then raise Parse_error; + let c = get () in + if c = '[' then begin + if accept '=' then raise Not_supported + else if accept ':' then begin + raise Not_supported (*XXX*) + end else if accept '.' then begin + if eos () then raise Parse_error; + let c = get () in + if not (accept '.') then raise Not_supported; + if not (accept ']') then raise Parse_error; + c + end else + c + end else + c + in + let res = regexp () in + if not (eos ()) then raise Parse_error; + res + +type opt = [`ICase | `NoSub | `Newline] + +let re ?(opts = []) s = + let r = parse (List.memq `Newline opts) s in + let r = if List.memq `ICase opts then Re.no_case r else r in + let r = if List.memq `NoSub opts then Re.no_group r else r in + r + +let compile re = Re.compile (Re.longest re) +let compile_pat ?(opts = []) s = compile (re ~opts s) diff --git a/vendor/re/src/posix.mli b/vendor/re/src/posix.mli new file mode 100644 index 00000000000..e77d0e465e5 --- /dev/null +++ b/vendor/re/src/posix.mli @@ -0,0 +1,98 @@ +(* + RE - A regular expression library + + Copyright (C) 2001 Jerome Vouillon + email: Jerome.Vouillon@pps.jussieu.fr + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation, with + linking exception; either version 2.1 of the License, or (at + your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +*) + +(** +References: + - {{: http://www.opengroup.org/onlinepubs/007908799/xbd/re.html} re} + - {{: http://www.opengroup.org/onlinepubs/007908799/xsh/regcomp.html} regcomp} + +Example of how to use this module (to parse some IRC logs): + +{[ +type msg = { + time:string; + author:string; + content:string; +} + +let re = Core.compile (Re_posix.re "([^:].*:[^:]*:[^:]{2})<.([^>]+)> (.+)$") + +(* parse a line *) +let match_line line = + try + let substrings = Core.exec re line in + let groups = Core.get_all substrings in + (* groups can be obtained directly by index within [substrings] *) + Some {time=groups.(1); author=groups.(2); content=groups.(3)} + with Not_found -> + None (* regex didn't match *) +]} +*) + +(** XXX Character classes *) + +exception Parse_error +exception Not_supported +(** Errors that can be raised during the parsing of the regular expression *) + +type opt = [`ICase | `NoSub | `Newline] + +val re : ?opts:(opt list) -> string -> Core.t +(** Parsing of a Posix extended regular expression *) + +val compile : Core.t -> Core.re +(** Regular expression compilation *) + +val compile_pat : ?opts:(opt list) -> string -> Core.re +(** [compile r] is defined as [Core.compile (Core.longest r)] *) + +(* +Deviation from the standard / ambiguities in the standard +--------------------------------------------------------- +We tested the behavior of the Linux library (glibc) and the Solaris +library. + +(1) An expression [efg] should be parsed as [(ef)g]. + All implementations parse it as [e(fg)]. +(2) When matching the pattern "((a)|b)*" against the string "ab", + the sub-expression "((a)|b)" should match "b", and the + sub-expression "(a)" should not match anything. + In both implementation, the sub-expression "(a)" matches "a". +(3) When matching the pattern "(aa?)*" against the string "aaa", it is + not clear whether the final match of the sub-expression "(aa?)" is + the last "a" (all matches of the sub-expression are successively + maximized), or "aa" (the final match is maximized). + Both implementations implements the first case. +(4) When matching the pattern "((a?)|b)*" against the string "ab", + the sub-expression "((a?)|b)" should match the empty string at the + end of the string (it is better to match the empty string than to + match nothing). + In both implementations, this sub-expression matches "b". + (Strangely, in the Linux implementation, the sub-expression "(a?)" + correctly matches the empty string at the end of the string) + +This library behaves the same way as the other libraries for all +points, except for (2) and (4) where it follows the standard. + +The behavior of this library in theses four cases may change in future +releases. +*) diff --git a/vendor/re/src/re.ml b/vendor/re/src/re.ml index ebaaf0db75b..e9d56978c8e 100644 --- a/vendor/re/src/re.ml +++ b/vendor/re/src/re.ml @@ -1,1343 +1,8 @@ -(* - RE - A regular expression library - - Copyright (C) 2001 Jerome Vouillon - email: Jerome.Vouillon@pps.jussieu.fr - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation, with - linking exception; either version 2.1 of the License, or (at - your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, write to the Free Software - Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -*) - -module Cset = Re_cset -module Automata = Re_automata -module MarkSet = Automata.PmarkSet - -let rec iter n f v = if n = 0 then v else iter (n - 1) f (f v) - -(****) - -let unknown = -2 -let break = -3 - -(* Result of a successful match. *) -type groups = - { s : string - (* Input string. Matched strings are substrings of s *) - - ; marks : Automata.mark_infos - (* Mapping from group indices to positions in gpos. group i has positions 2*i - - 1, 2*i + 1 in gpos. If the group wasn't matched, then its corresponding - values in marks will be -1,-1 *) - - ; pmarks : MarkSet.t - (* Marks positions. i.e. those marks created with Re.marks *) - - ; gpos : int array - (* Group positions. Adjacent elements are (start, stop) of group match. - indexed by the values in marks. So group i in an re would be the substring: - - start = t.gpos.(marks.(2*i)) - 1 - stop = t.gpos.(marks.(2*i + 1)) - 1 *) - - ; gcount : int - (* Number of groups the regular expression contains. Matched or not *) } - -type match_info = - | Match of groups - | Failed - | Running - -type state = - { idx : int; - (* Index of the current position in the position table. - Not yet computed transitions point to a dummy state where - [idx] is set to [unknown]; - If [idx] is set to [break] for states that either always - succeed or always fail. *) - real_idx : int; - (* The real index, in case [idx] is set to [break] *) - next : state array; - (* Transition table, indexed by color *) - mutable final : - (Automata.category * - (Automata.idx * Automata.status)) list; - (* Mapping from the category of the next character to - - the index where the next position should be saved - - possibly, the list of marks (and the corresponding indices) - corresponding to the best match *) - desc : Automata.State.t - (* Description of this state of the automata *) } - -(* Automata (compiled regular expression) *) -type re = - { initial : Automata.expr; - (* The whole regular expression *) - mutable initial_states : (Automata.category * state) list; - (* Initial states, indexed by initial category *) - cols : Bytes.t; - (* Color table *) - col_repr : Bytes.t; - (* Table from colors to one character of this color *) - ncol : int; - (* Number of colors. *) - lnl : int; - (* Color of the last newline *) - tbl : Automata.working_area; - (* Temporary table used to compute the first available index - when computing a new state *) - states : state Automata.State.Table.t; - (* States of the deterministic automata *) - group_count : int - (* Number of groups in the regular expression *) } - -let pp_re ch re = Automata.pp ch re.initial - -let print_re = pp_re - -(* Information used during matching *) -type info = - { re : re; - (* The automata *) - i_cols : Bytes.t; - (* Color table ([x.i_cols = x.re.cols]) - Shortcut used for performance reasons *) - mutable positions : int array; - (* Array of mark positions - The mark are off by one for performance reasons *) - pos : int; - (* Position where the match is started *) - last : int - (* Position where the match should stop *) } - - -(****) - -let cat_inexistant = 1 -let cat_letter = 2 -let cat_not_letter = 4 -let cat_newline = 8 -let cat_lastnewline = 16 -let cat_search_boundary = 32 - -let category re c = - if c = -1 then - cat_inexistant - (* Special category for the last newline *) - else if c = re.lnl then - cat_lastnewline lor cat_newline lor cat_not_letter - else - match Bytes.get re.col_repr c with - (* Should match [cword] definition *) - 'a'..'z' | 'A'..'Z' | '0'..'9' | '_' | '\170' | '\181' | '\186' - | '\192'..'\214' | '\216'..'\246' | '\248'..'\255' -> - cat_letter - | '\n' -> - cat_not_letter lor cat_newline - | _ -> - cat_not_letter - -(****) - -let dummy_next = [||] - -let unknown_state = - { idx = unknown; real_idx = 0; - next = dummy_next; final = []; - desc = Automata.State.dummy } - -let mk_state ncol desc = - let break_state = - match Automata.status desc with - | Automata.Running -> false - | Automata.Failed - | Automata.Match _ -> true - in - { idx = if break_state then break else desc.Automata.State.idx; - real_idx = desc.Automata.State.idx; - next = if break_state then dummy_next else Array.make ncol unknown_state; - final = []; - desc = desc } - -let find_state re desc = - try - Automata.State.Table.find re.states desc - with Not_found -> - let st = mk_state re.ncol desc in - Automata.State.Table.add re.states desc st; - st - -(**** Match with marks ****) - -let delta info cat c st = - let desc = Automata.delta info.re.tbl cat c st.desc in - let len = Array.length info.positions in - if desc.Automata.State.idx = len && len > 0 then begin - let pos = info.positions in - info.positions <- Array.make (2 * len) 0; - Array.blit pos 0 info.positions 0 len - end; - desc - -let validate info (s:string) pos st = - let c = Char.code (Bytes.get info.i_cols (Char.code s.[pos])) in - let cat = category info.re c in - let desc' = delta info cat c st in - let st' = find_state info.re desc' in - st.next.(c) <- st' - -(* -let rec loop info s pos st = - if pos < info.last then - let st' = st.next.(Char.code info.i_cols.[Char.code s.[pos]]) in - let idx = st'.idx in - if idx >= 0 then begin - info.positions.(idx) <- pos; - loop info s (pos + 1) st' - end else if idx = break then begin - info.positions.(st'.real_idx) <- pos; - st' - end else begin (* Unknown *) - validate info s pos st; - loop info s pos st - end - else - st -*) - -let rec loop info (s:string) pos st = - if pos < info.last then - let st' = st.next.(Char.code (Bytes.get info.i_cols (Char.code s.[pos]))) in - loop2 info s pos st st' - else - st - -and loop2 info s pos st st' = - if st'.idx >= 0 then begin - let pos = pos + 1 in - if pos < info.last then begin - (* It is important to place these reads before the write *) - (* But then, we don't have enough registers left to store the - right position. So, we store the position plus one. *) - let st'' = st'.next.(Char.code (Bytes.get info.i_cols (Char.code s.[pos]))) in - info.positions.(st'.idx) <- pos; - loop2 info s pos st' st'' - end else begin - info.positions.(st'.idx) <- pos; - st' - end - end else if st'.idx = break then begin - info.positions.(st'.real_idx) <- pos + 1; - st' - end else begin (* Unknown *) - validate info s pos st; - loop info s pos st - end - -let rec loop_no_mark info s pos last st = - if pos < last then - let st' = st.next.(Char.code (Bytes.get info.i_cols (Char.code s.[pos]))) in - if st'.idx >= 0 then - loop_no_mark info s (pos + 1) last st' - else if st'.idx = break then - st' - else begin (* Unknown *) - validate info s pos st; - loop_no_mark info s pos last st - end - else - st - -let final info st cat = - try - List.assq cat st.final - with Not_found -> - let st' = delta info cat (-1) st in - let res = (st'.Automata.State.idx, Automata.status st') in - st.final <- (cat, res) :: st.final; - res - -let find_initial_state re cat = - try - List.assq cat re.initial_states - with Not_found -> - let st = find_state re (Automata.State.create cat re.initial) in - re.initial_states <- (cat, st) :: re.initial_states; - st - -let get_color re (s:string) pos = - if pos < 0 then - -1 - else - let slen = String.length s in - if pos >= slen then - -1 - else if pos = slen - 1 && re.lnl <> -1 && s.[pos] = '\n' then - (* Special case for the last newline *) - re.lnl - else - Char.code (Bytes.get re.cols (Char.code s.[pos])) - -let rec handle_last_newline info pos st groups = - let st' = st.next.(info.re.lnl) in - if st'.idx >= 0 then begin - if groups then info.positions.(st'.idx) <- pos + 1; - st' - end else if st'.idx = break then begin - if groups then info.positions.(st'.real_idx) <- pos + 1; - st' - end else begin (* Unknown *) - let c = info.re.lnl in - let real_c = Char.code (Bytes.get info.i_cols (Char.code '\n')) in - let cat = category info.re c in - let desc' = delta info cat real_c st in - let st' = find_state info.re desc' in - st.next.(c) <- st'; - handle_last_newline info pos st groups - end - -let rec scan_str info (s:string) initial_state groups = - let pos = info.pos in - let last = info.last in - if (last = String.length s - && info.re.lnl <> -1 - && last > pos - && String.get s (last - 1) = '\n') - then begin - let info = { info with last = last - 1 } in - let st = scan_str info s initial_state groups in - if st.idx = break then - st - else - handle_last_newline info (last - 1) st groups - end else if groups then - loop info s pos initial_state - else - loop_no_mark info s pos last initial_state - -let match_str ~groups ~partial re s ~pos ~len = - let slen = String.length s in - let last = if len = -1 then slen else pos + len in - let info = - { re = re; i_cols = re.cols; pos = pos; last = last; - positions = - if groups then begin - let n = Automata.index_count re.tbl + 1 in - if n <= 10 then - [|0;0;0;0;0;0;0;0;0;0|] - else - Array.make n 0 - end else - [||] } - in - let initial_cat = - if pos = 0 then - cat_search_boundary lor cat_inexistant - else - cat_search_boundary lor category re (get_color re s (pos - 1)) in - let initial_state = find_initial_state re initial_cat in - let st = scan_str info s initial_state groups in - let res = - if st.idx = break || partial then - Automata.status st.desc - else - let final_cat = - if last = slen then - cat_search_boundary lor cat_inexistant - else - cat_search_boundary lor category re (get_color re s last) in - let (idx, res) = final info st final_cat in - if groups then info.positions.(idx) <- last + 1; - res - in - match res with - Automata.Match (marks, pmarks) -> - Match { s ; marks; pmarks ; gpos = info.positions; gcount = re.group_count} - | Automata.Failed -> Failed - | Automata.Running -> Running - -let mk_re init cols col_repr ncol lnl group_count = - { initial = init; - initial_states = []; - cols = cols; - col_repr = col_repr; - ncol = ncol; - lnl = lnl; - tbl = Automata.create_working_area (); - states = Automata.State.Table.create 97; - group_count = group_count } - -(**** Character sets ****) - -let cseq c c' = Cset.seq (Char.code c) (Char.code c') -let cadd c s = Cset.add (Char.code c) s - -let trans_set cache cm s = - match Cset.one_char s with - | Some i -> Cset.csingle (Bytes.get cm i) - | None -> - let v = (Cset.hash_rec s, s) in - try - Cset.CSetMap.find v !cache - with Not_found -> - let l = - Cset.fold_right - s - ~f:(fun (i, j) l -> Cset.union (cseq (Bytes.get cm i) - (Bytes.get cm j)) l) - ~init:Cset.empty - in - cache := Cset.CSetMap.add v l !cache; - l - -(****) - -type regexp = - Set of Cset.t - | Sequence of regexp list - | Alternative of regexp list - | Repeat of regexp * int * int option - | Beg_of_line | End_of_line - | Beg_of_word | End_of_word | Not_bound - | Beg_of_str | End_of_str - | Last_end_of_line | Start | Stop - | Sem of Automata.sem * regexp - | Sem_greedy of Automata.rep_kind * regexp - | Group of regexp | No_group of regexp | Nest of regexp - | Case of regexp | No_case of regexp - | Intersection of regexp list - | Complement of regexp list - | Difference of regexp * regexp - | Pmark of Automata.Pmark.t * regexp - -let rec pp fmt t = - let open Re_fmt in - let var s re = sexp fmt s pp re in - let seq s rel = sexp fmt s (list pp) rel in - match t with - | Set s -> sexp fmt "Set" Cset.pp s - | Sequence sq -> seq "Sequence" sq - | Alternative alt -> seq "Alternative" alt - | Repeat (re, start, stop) -> - let pp' fmt () = fprintf fmt "%a@ %d%a" pp re start optint stop in - sexp fmt "Repeat" pp' () - | Beg_of_line -> str fmt "Beg_of_line" - | End_of_line -> str fmt "End_of_line" - | Beg_of_word -> str fmt "Beg_of_word" - | End_of_word -> str fmt "End_of_word" - | Not_bound -> str fmt "Not_bound" - | Beg_of_str -> str fmt "Beg_of_str" - | End_of_str -> str fmt "End_of_str" - | Last_end_of_line -> str fmt "Last_end_of_line" - | Start -> str fmt "Start" - | Stop -> str fmt "Stop" - | Sem (sem, re) -> - sexp fmt "Sem" (pair Automata.pp_sem pp) (sem, re) - | Sem_greedy (k, re) -> - sexp fmt "Sem_greedy" (pair Automata.pp_rep_kind pp) (k, re) - | Group c -> var "Group" c - | No_group c -> var "No_group" c - | Nest c -> var "Nest" c - | Case c -> var "Case" c - | No_case c -> var "No_case" c - | Intersection c -> seq "Intersection" c - | Complement c -> seq "Complement" c - | Difference (a, b) -> sexp fmt "Difference" (pair pp pp) (a, b) - | Pmark (m, r) -> sexp fmt "Pmark" (pair Automata.Pmark.pp pp) (m, r) - -let rec is_charset = function - | Set _ -> - true - | Alternative l | Intersection l | Complement l -> - List.for_all is_charset l - | Difference (r, r') -> - is_charset r && is_charset r' - | Sem (_, r) | Sem_greedy (_, r) - | No_group r | Case r | No_case r -> - is_charset r - | Sequence _ | Repeat _ | Beg_of_line | End_of_line - | Beg_of_word | End_of_word | Beg_of_str | End_of_str - | Not_bound | Last_end_of_line | Start | Stop - | Group _ | Nest _ | Pmark (_,_)-> - false - -(**** Colormap ****) - -(*XXX Use a better algorithm allowing non-contiguous regions? *) -let split s cm = - Re_cset.iter s ~f:(fun i j -> - Bytes.set cm i '\001'; - Bytes.set cm (j + 1) '\001'; - ) - -let cupper = - Cset.union (cseq 'A' 'Z') - (Cset.union (cseq '\192' '\214') (cseq '\216' '\222')) -let clower = Cset.offset 32 cupper -let calpha = - List.fold_right cadd ['\170'; '\181'; '\186'; '\223'; '\255'] - (Cset.union clower cupper) -let cdigit = cseq '0' '9' -let calnum = Cset.union calpha cdigit -let cword = cadd '_' calnum - -let colorize c regexp = - let lnl = ref false in - let rec colorize regexp = - match regexp with - Set s -> split s c - | Sequence l -> List.iter colorize l - | Alternative l -> List.iter colorize l - | Repeat (r, _, _) -> colorize r - | Beg_of_line | End_of_line -> split (Cset.csingle '\n') c - | Beg_of_word | End_of_word - | Not_bound -> split cword c - | Beg_of_str | End_of_str - | Start | Stop -> () - | Last_end_of_line -> lnl := true - | Sem (_, r) - | Sem_greedy (_, r) - | Group r | No_group r - | Nest r | Pmark (_,r) -> colorize r - | Case _ | No_case _ - | Intersection _ - | Complement _ - | Difference _ -> assert false - in - colorize regexp; - !lnl - -let make_cmap () = Bytes.make 257 '\000' - -let flatten_cmap cm = - let c = Bytes.create 256 in - let col_repr = Bytes.create 256 in - let v = ref 0 in - Bytes.set c 0 '\000'; - Bytes.set col_repr 0 '\000'; - for i = 1 to 255 do - if Bytes.get cm i <> '\000' then incr v; - Bytes.set c i (Char.chr !v); - Bytes.set col_repr !v (Char.chr i) - done; - (c, Bytes.sub col_repr 0 (!v + 1), !v + 1) - -(**** Compilation ****) - -let rec equal x1 x2 = - match x1, x2 with - Set s1, Set s2 -> - s1 = s2 - | Sequence l1, Sequence l2 -> - eq_list l1 l2 - | Alternative l1, Alternative l2 -> - eq_list l1 l2 - | Repeat (x1', i1, j1), Repeat (x2', i2, j2) -> - i1 = i2 && j1 = j2 && equal x1' x2' - | Beg_of_line, Beg_of_line - | End_of_line, End_of_line - | Beg_of_word, Beg_of_word - | End_of_word, End_of_word - | Not_bound, Not_bound - | Beg_of_str, Beg_of_str - | End_of_str, End_of_str - | Last_end_of_line, Last_end_of_line - | Start, Start - | Stop, Stop -> - true - | Sem (sem1, x1'), Sem (sem2, x2') -> - sem1 = sem2 && equal x1' x2' - | Sem_greedy (k1, x1'), Sem_greedy (k2, x2') -> - k1 = k2 && equal x1' x2' - | Group _, Group _ -> (* Do not merge groups! *) - false - | No_group x1', No_group x2' -> - equal x1' x2' - | Nest x1', Nest x2' -> - equal x1' x2' - | Case x1', Case x2' -> - equal x1' x2' - | No_case x1', No_case x2' -> - equal x1' x2' - | Intersection l1, Intersection l2 -> - eq_list l1 l2 - | Complement l1, Complement l2 -> - eq_list l1 l2 - | Difference (x1', x1''), Difference (x2', x2'') -> - equal x1' x2' && equal x1'' x2'' - | Pmark (m1, r1), Pmark (m2, r2) -> - Automata.Pmark.equal m1 m2 && equal r1 r2 - | _ -> - false - -and eq_list l1 l2 = - match l1, l2 with - [], [] -> - true - | x1 :: r1, x2 :: r2 -> - equal x1 x2 && eq_list r1 r2 - | _ -> - false - -let sequence = function - | [x] -> x - | l -> Sequence l - -let rec merge_sequences = function - | [] -> - [] - | Alternative l' :: r -> - merge_sequences (l' @ r) - | Sequence (x :: y) :: r -> - begin match merge_sequences r with - Sequence (x' :: y') :: r' when equal x x' -> - Sequence [x; Alternative [sequence y; sequence y']] :: r' - | r' -> - Sequence (x :: y) :: r' - end - | x :: r -> - x :: merge_sequences r - -module A = Automata - -let enforce_kind ids kind kind' cr = - match kind, kind' with - `First, `First -> cr - | `First, k -> A.seq ids k cr (A.eps ids) - | _ -> cr - -(* XXX should probably compute a category mask *) -let rec translate ids kind ign_group ign_case greedy pos cache c = function - | Set s -> - (A.cst ids (trans_set cache c s), kind) - | Sequence l -> - (trans_seq ids kind ign_group ign_case greedy pos cache c l, kind) - | Alternative l -> - begin match merge_sequences l with - [r'] -> - let (cr, kind') = - translate ids kind ign_group ign_case greedy pos cache c r' in - (enforce_kind ids kind kind' cr, kind) - | merged_sequences -> - (A.alt ids - (List.map - (fun r' -> - let (cr, kind') = - translate ids kind ign_group ign_case greedy - pos cache c r' in - enforce_kind ids kind kind' cr) - merged_sequences), - kind) - end - | Repeat (r', i, j) -> - let (cr, kind') = - translate ids kind ign_group ign_case greedy pos cache c r' in - let rem = - match j with - None -> - A.rep ids greedy kind' cr - | Some j -> - let f = - match greedy with - `Greedy -> - fun rem -> - A.alt ids - [A.seq ids kind' (A.rename ids cr) rem; A.eps ids] - | `Non_greedy -> - fun rem -> - A.alt ids - [A.eps ids; A.seq ids kind' (A.rename ids cr) rem] - in - iter (j - i) f (A.eps ids) - in - (iter i (fun rem -> A.seq ids kind' (A.rename ids cr) rem) rem, kind) - | Beg_of_line -> - (A.after ids (cat_inexistant lor cat_newline), kind) - | End_of_line -> - (A.before ids (cat_inexistant lor cat_newline), kind) - | Beg_of_word -> - (A.seq ids `First - (A.after ids (cat_inexistant lor cat_not_letter)) - (A.before ids (cat_inexistant lor cat_letter)), - kind) - | End_of_word -> - (A.seq ids `First - (A.after ids (cat_inexistant lor cat_letter)) - (A.before ids (cat_inexistant lor cat_not_letter)), - kind) - | Not_bound -> - (A.alt ids [A.seq ids `First - (A.after ids cat_letter) - (A.before ids cat_letter); - A.seq ids `First - (A.after ids cat_letter) - (A.before ids cat_letter)], - kind) - | Beg_of_str -> - (A.after ids cat_inexistant, kind) - | End_of_str -> - (A.before ids cat_inexistant, kind) - | Last_end_of_line -> - (A.before ids (cat_inexistant lor cat_lastnewline), kind) - | Start -> - (A.after ids cat_search_boundary, kind) - | Stop -> - (A.before ids cat_search_boundary, kind) - | Sem (kind', r') -> - let (cr, kind'') = - translate ids kind' ign_group ign_case greedy pos cache c r' in - (enforce_kind ids kind' kind'' cr, - kind') - | Sem_greedy (greedy', r') -> - translate ids kind ign_group ign_case greedy' pos cache c r' - | Group r' -> - if ign_group then - translate ids kind ign_group ign_case greedy pos cache c r' - else - let p = !pos in - pos := !pos + 2; - let (cr, kind') = - translate ids kind ign_group ign_case greedy pos cache c r' in - (A.seq ids `First (A.mark ids p) ( - A.seq ids `First cr (A.mark ids (p + 1))), - kind') - | No_group r' -> - translate ids kind true ign_case greedy pos cache c r' - | Nest r' -> - let b = !pos in - let (cr, kind') = - translate ids kind ign_group ign_case greedy pos cache c r' - in - let e = !pos - 1 in - if e < b then - (cr, kind') - else - (A.seq ids `First (A.erase ids b e) cr, kind') - | Difference _ | Complement _ | Intersection _ | No_case _ | Case _ -> - assert false - | Pmark (i, r') -> - let (cr, kind') = - translate ids kind ign_group ign_case greedy pos cache c r' in - (A.seq ids `First (A.pmark ids i) cr, kind') - -and trans_seq ids kind ign_group ign_case greedy pos cache c = function - | [] -> - A.eps ids - | [r] -> - let (cr', kind') = - translate ids kind ign_group ign_case greedy pos cache c r in - enforce_kind ids kind kind' cr' - | r :: rem -> - let (cr', kind') = - translate ids kind ign_group ign_case greedy pos cache c r in - let cr'' = - trans_seq ids kind ign_group ign_case greedy pos cache c rem in - if A.is_eps cr'' then - cr' - else if A.is_eps cr' then - cr'' - else - A.seq ids kind' cr' cr'' - -(**** Case ****) - -let case_insens s = - Cset.union s (Cset.union (Cset.offset 32 (Cset.inter s cupper)) - (Cset.offset (-32) (Cset.inter s clower))) - -let as_set = function - | Set s -> s - | _ -> assert false - -(* XXX Should split alternatives into (1) charsets and (2) more - complex regular expressions; alternative should therefore probably - be flatten here *) -let rec handle_case ign_case = function - | Set s -> - Set (if ign_case then case_insens s else s) - | Sequence l -> - Sequence (List.map (handle_case ign_case) l) - | Alternative l -> - let l' = List.map (handle_case ign_case) l in - if is_charset (Alternative l') then - Set (List.fold_left (fun s r -> Cset.union s (as_set r)) Cset.empty l') - else - Alternative l' - | Repeat (r, i, j) -> - Repeat (handle_case ign_case r, i, j) - | Beg_of_line | End_of_line | Beg_of_word | End_of_word | Not_bound - | Beg_of_str | End_of_str | Last_end_of_line | Start | Stop as r -> - r - | Sem (k, r) -> - let r' = handle_case ign_case r in - if is_charset r' then r' else Sem (k, r') - | Sem_greedy (k, r) -> - let r' = handle_case ign_case r in - if is_charset r' then r' else Sem_greedy (k, r') - | Group r -> - Group (handle_case ign_case r) - | No_group r -> - let r' = handle_case ign_case r in - if is_charset r' then r' else No_group r' - | Nest r -> - let r' = handle_case ign_case r in - if is_charset r' then r' else Nest r' - | Case r -> - handle_case false r - | No_case r -> - handle_case true r - | Intersection l -> - let l' = List.map (fun r -> handle_case ign_case r) l in - Set (List.fold_left (fun s r -> Cset.inter s (as_set r)) Cset.cany l') - | Complement l -> - let l' = List.map (fun r -> handle_case ign_case r) l in - Set (Cset.diff Cset.cany - (List.fold_left (fun s r -> Cset.union s (as_set r)) - Cset.empty l')) - | Difference (r, r') -> - Set (Cset.inter (as_set (handle_case ign_case r)) - (Cset.diff Cset.cany (as_set (handle_case ign_case r')))) - | Pmark (i,r) -> Pmark (i,handle_case ign_case r) - -(****) - -let compile_1 regexp = - let regexp = handle_case false regexp in - let c = make_cmap () in - let need_lnl = colorize c regexp in - let (col, col_repr, ncol) = flatten_cmap c in - let lnl = if need_lnl then ncol else -1 in - let ncol = if need_lnl then ncol + 1 else ncol in - let ids = A.create_ids () in - let pos = ref 0 in - let (r, kind) = - translate ids - `First false false `Greedy pos (ref Cset.CSetMap.empty) col regexp in - let r = enforce_kind ids `First kind r in - (*Format.eprintf "<%d %d>@." !ids ncol;*) - mk_re r col col_repr ncol lnl (!pos / 2) - -(****) - -let rec anchored = function - | Sequence l -> - List.exists anchored l - | Alternative l -> - List.for_all anchored l - | Repeat (r, i, _) -> - i > 0 && anchored r - | Set _ | Beg_of_line | End_of_line | Beg_of_word | End_of_word - | Not_bound | End_of_str | Last_end_of_line | Stop - | Intersection _ | Complement _ | Difference _ -> - false - | Beg_of_str | Start -> - true - | Sem (_, r) | Sem_greedy (_, r) | Group r | No_group r | Nest r - | Case r | No_case r | Pmark (_, r) -> - anchored r - -(****) - -type t = regexp - -let str s = - let l = ref [] in - for i = String.length s - 1 downto 0 do - l := Set (Cset.csingle s.[i]) :: !l - done; - Sequence !l -let char c = Set (Cset.csingle c) - -let alt = function - | [r] -> r - | l -> Alternative l -let seq = function - | [r] -> r - | l -> Sequence l - -let empty = alt [] -let epsilon = seq [] -let repn r i j = - if i < 0 then invalid_arg "Re.repn"; - begin match j with - | Some j when j < i -> invalid_arg "Re.repn" - | _ -> () - end; - Repeat (r, i, j) -let rep r = repn r 0 None -let rep1 r = repn r 1 None -let opt r = repn r 0 (Some 1) -let bol = Beg_of_line -let eol = End_of_line -let bow = Beg_of_word -let eow = End_of_word -let word r = seq [bow; r; eow] -let not_boundary = Not_bound -let bos = Beg_of_str -let eos = End_of_str -let whole_string r = seq [bos; r; eos] -let leol = Last_end_of_line -let start = Start -let stop = Stop -let longest r = Sem (`Longest, r) -let shortest r = Sem (`Shortest, r) -let first r = Sem (`First, r) -let greedy r = Sem_greedy (`Greedy, r) -let non_greedy r = Sem_greedy (`Non_greedy, r) -let group r = Group r -let no_group r = No_group r -let nest r = Nest r -let mark r = let i = Automata.Pmark.gen () in (i,Pmark (i,r)) - -let set str = - let s = ref Cset.empty in - for i = 0 to String.length str - 1 do - s := Cset.union (Cset.csingle str.[i]) !s - done; - Set !s - -let rg c c' = Set (cseq c c') - -let inter l = - let r = Intersection l in - if is_charset r then - r - else - invalid_arg "Re.inter" - -let compl l = - let r = Complement l in - if is_charset r then - r - else - invalid_arg "Re.compl" - -let diff r r' = - let r'' = Difference (r, r') in - if is_charset r'' then - r'' - else - invalid_arg "Re.diff" - -let any = Set Cset.cany -let notnl = Set (Cset.diff Cset.cany (Cset.csingle '\n')) - -let lower = alt [rg 'a' 'z'; char '\181'; rg '\223' '\246'; rg '\248' '\255'] -let upper = alt [rg 'A' 'Z'; rg '\192' '\214'; rg '\216' '\222'] -let alpha = alt [lower; upper; char '\170'; char '\186'] -let digit = rg '0' '9' -let alnum = alt [alpha; digit] -let wordc = alt [alnum; char '_'] -let ascii = rg '\000' '\127' -let blank = set "\t " -let cntrl = alt [rg '\000' '\031'; rg '\127' '\159'] -let graph = alt [rg '\033' '\126'; rg '\160' '\255'] -let print = alt [rg '\032' '\126'; rg '\160' '\255'] -let punct = - alt [rg '\033' '\047'; rg '\058' '\064'; rg '\091' '\096'; - rg '\123' '\126'; rg '\160' '\169'; rg '\171' '\180'; - rg '\182' '\185'; rg '\187' '\191'; char '\215'; char '\247'] -let space = alt [char ' '; rg '\009' '\013'] -let xdigit = alt [digit; rg 'a' 'f'; rg 'A' 'F'] - -let case r = Case r -let no_case r = No_case r - -(****) - -let compile r = - compile_1 ( - if anchored r then - group r - else - seq [shortest (rep any); group r] - ) - -let exec_internal name ?(pos=0) ?(len = -1) ~groups re s = - if pos < 0 || len < -1 || pos + len > String.length s then - invalid_arg name; - match_str ~groups ~partial:false re s ~pos ~len - -let exec ?pos ?len re s = - match exec_internal "Re.exec" ?pos ?len ~groups:true re s with - Match substr -> substr - | _ -> raise Not_found - -let exec_opt ?pos ?len re s = - match exec_internal "Re.exec_opt" ?pos ?len ~groups:true re s with - Match substr -> Some substr - | _ -> None - -let execp ?pos ?len re s = - match exec_internal ~groups:false "Re.execp" ?pos ?len re s with - Match _substr -> true - | _ -> false - -let exec_partial ?pos ?len re s = - match exec_internal ~groups:false "Re.exec_partial" ?pos ?len re s with - Match _ -> `Full - | Running -> `Partial - | Failed -> `Mismatch - -module Group = struct - - type t = groups - - let offset t i = - if 2 * i + 1 >= Array.length t.marks then raise Not_found; - let m1 = t.marks.(2 * i) in - if m1 = -1 then raise Not_found; - let p1 = t.gpos.(m1) - 1 in - let p2 = t.gpos.(t.marks.(2 * i + 1)) - 1 in - (p1, p2) - - let get t i = - let (p1, p2) = offset t i in - String.sub t.s p1 (p2 - p1) - - let start subs i = fst (offset subs i) - - let stop subs i = snd (offset subs i) - - let test t i = - if 2 * i >= Array.length t.marks then - false - else - let idx = t.marks.(2 * i) in - idx <> -1 - - let dummy_offset = (-1, -1) - - let all_offset t = - let res = Array.make t.gcount dummy_offset in - for i = 0 to Array.length t.marks / 2 - 1 do - let m1 = t.marks.(2 * i) in - if m1 <> -1 then begin - let p1 = t.gpos.(m1) in - let p2 = t.gpos.(t.marks.(2 * i + 1)) in - res.(i) <- (p1 - 1, p2 - 1) - end - done; - res - - let dummy_string = "" - - let all t = - let res = Array.make t.gcount dummy_string in - for i = 0 to Array.length t.marks / 2 - 1 do - let m1 = t.marks.(2 * i) in - if m1 <> -1 then begin - let p1 = t.gpos.(m1) in - let p2 = t.gpos.(t.marks.(2 * i + 1)) in - res.(i) <- String.sub t.s (p1 - 1) (p2 - p1) - end - done; - res - - let pp fmt t = - let matches = - let offsets = all_offset t in - let strs = all t in - Array.to_list ( - Array.init (Array.length strs) (fun i -> strs.(i), offsets.(i)) - ) in - let open Re_fmt in - let pp_match fmt (str, (start, stop)) = - fprintf fmt "@[(%s (%d %d))@]" str start stop in - sexp fmt "Group" (list pp_match) matches - - let nb_groups t = t.gcount -end - -module Mark = struct - - type t = Automata.Pmark.t - - let test {pmarks ; _} p = - Automata.PmarkSet.mem p pmarks - - let all s = s.pmarks - - module Set = MarkSet - - let equal = Automata.Pmark.equal - - let compare = Automata.Pmark.compare - -end - -type 'a gen = unit -> 'a option - -let all_gen ?(pos=0) ?len re s = - if pos < 0 then invalid_arg "Re.all"; - (* index of the first position we do not consider. - !pos < limit is an invariant *) - let limit = match len with - | None -> String.length s - | Some l -> - if l<0 || pos+l > String.length s then invalid_arg "Re.all"; - pos+l - in - (* iterate on matches. When a match is found, search for the next - one just after its end *) - let pos = ref pos in - fun () -> - if !pos >= limit - then None (* no more matches *) - else - match match_str ~groups:true ~partial:false re s - ~pos:!pos ~len:(limit - !pos) with - | Match substr -> - let p1, p2 = Group.offset substr 0 in - pos := if p1=p2 then p2+1 else p2; - Some substr - | Running - | Failed -> None - -let all ?pos ?len re s = - let l = ref [] in - let g = all_gen ?pos ?len re s in - let rec iter () = match g() with - | None -> List.rev !l - | Some sub -> l := sub :: !l; iter () - in iter () - -let matches_gen ?pos ?len re s = - let g = all_gen ?pos ?len re s in - fun () -> - match g() with - | None -> None - | Some sub -> Some (Group.get sub 0) - -let matches ?pos ?len re s = - let l = ref [] in - let g = all_gen ?pos ?len re s in - let rec iter () = match g() with - | None -> List.rev !l - | Some sub -> l := Group.get sub 0 :: !l; iter () - in iter () - -type split_token = - [ `Text of string - | `Delim of groups - ] - -let split_full_gen ?(pos=0) ?len re s = - if pos < 0 then invalid_arg "Re.split"; - let limit = match len with - | None -> String.length s - | Some l -> - if l<0 || pos+l > String.length s then invalid_arg "Re.split"; - pos+l - in - (* i: start of delimited string - pos: first position after last match of [re] - limit: first index we ignore (!pos < limit is an invariant) *) - let pos0 = pos in - let state = ref `Idle in - let i = ref pos and pos = ref pos in - let next () = match !state with - | `Idle when !pos >= limit -> - if !i < limit then ( - let sub = String.sub s !i (limit - !i) in - incr i; - Some (`Text sub) - ) else None - | `Idle -> - begin match match_str ~groups:true ~partial:false re s ~pos:!pos - ~len:(limit - !pos) with - | Match substr -> - let p1, p2 = Group.offset substr 0 in - pos := if p1=p2 then p2+1 else p2; - let old_i = !i in - i := p2; - if p1 > pos0 then ( - (* string does not start by a delimiter *) - let text = String.sub s old_i (p1 - old_i) in - state := `Yield (`Delim substr); - Some (`Text text) - ) else Some (`Delim substr) - | Running -> None - | Failed -> - if !i < limit - then ( - let text = String.sub s !i (limit - !i) in - i := limit; - Some (`Text text) (* yield last string *) - ) else - None - end - | `Yield x -> - state := `Idle; - Some x - in next - -let split_full ?pos ?len re s = - let l = ref [] in - let g = split_full_gen ?pos ?len re s in - let rec iter () = match g() with - | None -> List.rev !l - | Some s -> l := s :: !l; iter () - in iter () - -let split_gen ?pos ?len re s = - let g = split_full_gen ?pos ?len re s in - let rec next() = match g() with - | None -> None - | Some (`Delim _) -> next() - | Some (`Text s) -> Some s - in next - -let split ?pos ?len re s = - let l = ref [] in - let g = split_full_gen ?pos ?len re s in - let rec iter () = match g() with - | None -> List.rev !l - | Some (`Delim _) -> iter() - | Some (`Text s) -> l := s :: !l; iter () - in iter () - -let replace ?(pos=0) ?len ?(all=true) re ~f s = - if pos < 0 then invalid_arg "Re.replace"; - let limit = match len with - | None -> String.length s - | Some l -> - if l<0 || pos+l > String.length s then invalid_arg "Re.replace"; - pos+l - in - (* buffer into which we write the result *) - let buf = Buffer.create (String.length s) in - (* iterate on matched substrings. *) - let rec iter pos = - if pos < limit - then - match match_str ~groups:true ~partial:false re s ~pos ~len:(limit-pos) with - | Match substr -> - let p1, p2 = Group.offset substr 0 in - (* add string between previous match and current match *) - Buffer.add_substring buf s pos (p1-pos); - (* what should we replace the matched group with? *) - let replacing = f substr in - Buffer.add_string buf replacing; - if all then - (* if we matched a non-char e.g. ^ we must manually advance by 1 *) - iter ( - if p1=p2 then ( - (* a non char could be past the end of string. e.g. $ *) - if p2 < limit then Buffer.add_char buf s.[p2]; - p2+1 - ) else - p2) - else - Buffer.add_substring buf s p2 (limit-p2) - | Running -> () - | Failed -> - Buffer.add_substring buf s pos (limit-pos) - in - iter pos; - Buffer.contents buf - -let replace_string ?pos ?len ?all re ~by s = - replace ?pos ?len ?all re s ~f:(fun _ -> by) - -let witness t = - let rec witness = function - | Set c -> String.make 1 (Char.chr (Cset.pick c)) - | Sequence xs -> String.concat "" (List.map witness xs) - | Alternative (x :: _) -> witness x - | Alternative [] -> assert false - | Repeat (r, from, _to) -> - let w = witness r in - let b = Buffer.create (String.length w * from) in - for _i=1 to from do - Buffer.add_string b w - done; - Buffer.contents b - | No_case r -> witness r - | Intersection _ - | Complement _ - | Difference (_, _) -> assert false - | Group r - | No_group r - | Nest r - | Sem (_, r) - | Pmark (_, r) - | Case r - | Sem_greedy (_, r) -> witness r - | Beg_of_line - | End_of_line - | Beg_of_word - | End_of_word - | Not_bound - | Beg_of_str - | Last_end_of_line - | Start - | Stop - | End_of_str -> "" in - witness (handle_case false t) - -(** {2 Deprecated functions} *) - -type substrings = groups - -let get = Group.get -let get_ofs = Group.offset -let get_all = Group.all -let get_all_ofs = Group.all_offset -let test = Group.test - -type markid = Mark.t - -let marked = Mark.test -let mark_set = Mark.all - -(**********************************) - -(* -Information about the previous character: -- does not exists -- is a letter -- is not a letter -- is a newline -- is last newline - -Beginning of word: -- previous is not a letter or does not exist -- current is a letter or does not exist - -End of word: -- previous is a letter or does not exist -- current is not a letter or does not exist - -Beginning of line: -- previous is a newline or does not exist - -Beginning of buffer: -- previous does not exist - -End of buffer -- current does not exist - -End of line -- current is a newline or does not exist -*) - -(* -Rep: e = T,e | () - - semantics of the comma (shortest/longest/first) - - semantics of the union (greedy/non-greedy) - -Bounded repetition - a{0,3} = (a,(a,a?)?)? -*) +include Core + +module Emacs = Emacs +module Glob = Glob +module Perl = Perl +module Pcre = Pcre +module Posix = Posix +module Str = Str diff --git a/vendor/re/src/re_fmt.ml b/vendor/re/src/re_fmt.ml deleted file mode 100644 index 0d707ecbb8a..00000000000 --- a/vendor/re/src/re_fmt.ml +++ /dev/null @@ -1,25 +0,0 @@ -(** Very small tooling for format printers. *) - -include Format - -(* Only in the stdlib since 4.02, so we copy. *) -let rec list pp ppf = function - | [] -> () - | [v] -> pp ppf v - | v :: vs -> - pp ppf v; - pp_print_space ppf (); - list pp ppf vs - -let str = pp_print_string -let sexp fmt s pp x = fprintf fmt "@[<3>(%s@ %a)@]" s pp x -let pair pp1 pp2 fmt (v1,v2) = - pp1 fmt v1; pp_print_space fmt () ; pp2 fmt v2 -let triple pp1 pp2 pp3 fmt (v1, v2, v3) = - pp1 fmt v1; pp_print_space fmt () ; - pp2 fmt v2; pp_print_space fmt () ; - pp3 fmt v3 -let int = pp_print_int -let optint fmt = function - | None -> () - | Some i -> fprintf fmt "@ %d" i diff --git a/vendor/re/src/str.ml b/vendor/re/src/str.ml new file mode 100644 index 00000000000..ff82fbd7e80 --- /dev/null +++ b/vendor/re/src/str.ml @@ -0,0 +1,295 @@ +(***********************************************************************) +(* *) +(* Objective Caml *) +(* *) +(* Xavier Leroy, projet Cristal, INRIA Rocquencourt *) +(* *) +(* Copyright 1996 Institut National de Recherche en Informatique et *) +(* en Automatique. All rights reserved. This file is distributed *) +(* under the terms of the GNU Library General Public License, with *) +(* linking exception. *) +(* *) +(***********************************************************************) + +(* Modified by Jerome.Vouillon@pps.jussieu.fr for integration in RE *) + +(* $Id: re_str.ml,v 1.3 2002/07/03 15:47:54 vouillon Exp $ *) + +module Re = Core + +type regexp = + { re: Re.t + ; mtch: Re.re Lazy.t + ; srch: Re.re Lazy.t } + +let compile_regexp s c = + let re = Emacs.re ~case:(not c) s in + { re + ; mtch = lazy (Re.compile (Re.seq [Re.start; re])) + ; srch = lazy (Re.compile re) } + +let state = ref None + +let string_match re s p = + try + state := Some (Re.exec ~pos:p (Lazy.force re.mtch) s); + true + with Not_found -> + state := None; + false + +let string_partial_match re s p = + match + Re.exec_partial ~pos:p (Lazy.force re.mtch) s + with + `Full -> string_match re s p + | `Partial -> true + | `Mismatch -> false + +let search_forward re s p = + try + let res = Re.exec ~pos:p (Lazy.force re.srch) s in + state := Some res; + fst (Re.Group.offset res 0) + with Not_found -> + state := None; + raise Not_found + +let rec search_backward re s p = + try + let res = Re.exec ~pos:p (Lazy.force re.mtch) s in + state := Some res; + p + with Not_found -> + state := None; + if p = 0 then + raise Not_found + else + search_backward re s (p - 1) + +let valid_group n = + n >= 0 && n < 10 && ( + match !state with + | None -> false + | Some m -> n < Re.Group.nb_groups m + ) + +let offset_group i = + match !state with + | Some m -> Re.Group.offset m i + | None -> raise Not_found + +let group_len i = + try + let (b, e) = offset_group i in + e - b + with Not_found -> + 0 + +let rec repl_length repl p q len = + if p < len then begin + if repl.[p] <> '\\' then + repl_length repl (p + 1) (q + 1) len + else begin + let p = p + 1 in + if p = len then failwith "Str.replace: illegal backslash sequence"; + let q = + match repl.[p] with + | '\\' -> q + 1 + | '0' .. '9' as c -> q + group_len (Char.code c - Char.code '0') + | _ -> q + 2 in + repl_length repl (p + 1) q len + end + end else + q + +let rec replace orig repl p res q len = + if p < len then begin + let c = repl.[p] in + if c <> '\\' then begin + Bytes.set res q c; + replace orig repl (p + 1) res (q + 1) len + end else begin + match repl.[p + 1] with + '\\' -> + Bytes.set res q '\\'; + replace orig repl (p + 2) res (q + 1) len + | '0' .. '9' as c -> + let d = + try + let (b, e) = offset_group (Char.code c - Char.code '0') in + let d = e - b in + if d > 0 then String.blit orig b res q d; + d + with Not_found -> + 0 + in + replace orig repl (p + 2) res (q + d) len + | c -> + Bytes.set res q '\\'; + Bytes.set res (q + 1) c; + replace orig repl (p + 2) res (q + 2) len + end + end + +let replacement_text repl orig = + let len = String.length repl in + let res = Bytes.create (repl_length repl 0 0 len) in + replace orig repl 0 res 0 (String.length repl); + Bytes.unsafe_to_string res + +let quote s = + let len = String.length s in + let buf = Buffer.create (2 * len) in + for i = 0 to len - 1 do + match s.[i] with + '[' | ']' | '*' | '.' | '\\' | '?' | '+' | '^' | '$' as c -> + Buffer.add_char buf '\\'; + Buffer.add_char buf c + | c -> Buffer.add_char buf c + done; + Buffer.contents buf + +let string_before s n = String.sub s 0 n + +let string_after s n = String.sub s n (String.length s - n) + +let first_chars s n = String.sub s 0 n + +let last_chars s n = String.sub s (String.length s - n) n + +let regexp e = compile_regexp e false + +let regexp_case_fold e = compile_regexp e true + +let regexp_string s = compile_regexp (quote s) false + +let regexp_string_case_fold s = compile_regexp (quote s) true + +let group_beginning n = + if not (valid_group n) then invalid_arg "Str.group_beginning"; + let pos = fst (offset_group n) in + if pos = -1 then + raise Not_found + else + pos + +let group_end n = + if not (valid_group n) then invalid_arg "Str.group_end"; + let pos = snd (offset_group n) in + if pos = -1 then + raise Not_found + else + pos + +let matched_group n txt = + let (b, e) = offset_group n in + String.sub txt b (e - b) + +let replace_matched repl matched = replacement_text repl matched + +let match_beginning () = group_beginning 0 +and match_end () = group_end 0 +and matched_string txt = matched_group 0 txt + +let substitute_first expr repl_fun text = + try + let pos = search_forward expr text 0 in + String.concat "" [string_before text pos; + repl_fun text; + string_after text (match_end ())] + with Not_found -> + text + +let global_substitute expr repl_fun text = + let rec replace accu start last_was_empty = + try + let startpos = if last_was_empty then start + 1 else start in + if startpos > String.length text then raise Not_found; + let pos = search_forward expr text startpos in + let end_pos = match_end () in + let repl_text = repl_fun text in + replace (repl_text :: String.sub text start (pos-start) :: accu) + end_pos (end_pos = pos) + with Not_found -> + (string_after text start) :: accu in + String.concat "" (List.rev (replace [] 0 false)) + +let global_replace expr repl text = + global_substitute expr (replacement_text repl) text +and replace_first expr repl text = + substitute_first expr (replacement_text repl) text + +let search_forward_progress re s p = + let pos = search_forward re s p in + if match_end () > p then + pos + else if p < String.length s then + search_forward re s (p + 1) + else + raise Not_found + +let bounded_split expr text num = + let start = + if string_match expr text 0 then match_end () else 0 in + let rec split accu start n = + if start >= String.length text then + accu + else if n = 1 then + (string_after text start) :: accu + else + try + let pos = search_forward_progress expr text start in + split ((String.sub text start (pos-start)) :: accu) + (match_end ()) (n - 1) + with Not_found -> + (string_after text start) :: accu in + List.rev (split [] start num) + +let split expr text = bounded_split expr text 0 + +let bounded_split_delim expr text num = + let rec split accu start n = + if start > String.length text then + accu + else if n = 1 then + (string_after text start) :: accu + else + try + let pos = search_forward_progress expr text start in + split (String.sub text start (pos-start) :: accu) + (match_end ()) (n - 1) + with Not_found -> + (string_after text start) :: accu in + if text = "" then + [] + else + List.rev (split [] 0 num) + +let split_delim expr text = bounded_split_delim expr text 0 + +type split_result = Text of string | Delim of string + +let bounded_full_split expr text num = + let rec split accu start n = + if start >= String.length text then + accu + else if n = 1 then + Text (string_after text start) :: accu + else + try + let pos = search_forward_progress expr text start in + let s = matched_string text in + if pos > start then + split (Delim (s) :: + Text (String.sub text start (pos - start)) :: + accu) + (match_end ()) (n - 1) + else + split (Delim (s) :: accu) + (match_end ()) (n - 1) + with Not_found -> + Text (string_after text start) :: accu in + List.rev (split [] 0 num) + +let full_split expr text = bounded_full_split expr text 0 diff --git a/vendor/re/src/str.mli b/vendor/re/src/str.mli new file mode 100644 index 00000000000..55917fd7dcb --- /dev/null +++ b/vendor/re/src/str.mli @@ -0,0 +1,203 @@ +(***********************************************************************) +(* *) +(* Objective Caml *) +(* *) +(* Xavier Leroy, projet Cristal, INRIA Rocquencourt *) +(* *) +(* Copyright 1996 Institut National de Recherche en Informatique et *) +(* en Automatique. All rights reserved. This file is distributed *) +(* under the terms of the GNU Library General Public License, with *) +(* linking exception. *) +(* *) +(***********************************************************************) + +(* $Id: re_str.mli,v 1.1 2002/01/16 14:16:04 vouillon Exp $ *) + +(** Module [Str]: regular expressions and high-level string processing *) + +(** {2 Regular expressions} *) + +type regexp +(** The type of compiled regular expressions. *) + +val regexp: string -> regexp +(** Compile a regular expression. The syntax for regular expressions + is the same as in Gnu Emacs. The special characters are + [$^.*+?[]]. The following constructs are recognized: + - [. ] matches any character except newline + - [* ] (postfix) matches the previous expression zero, one or + several times + - [+ ] (postfix) matches the previous expression one or + several times + - [? ] (postfix) matches the previous expression once or + not at all + - [[..] ] character set; ranges are denoted with [-], as in [[a-z]]; + an initial [^], as in [[^0-9]], complements the set + - [^ ] matches at beginning of line + - [$ ] matches at end of line + - [\| ] (infix) alternative between two expressions + - [\(..\)] grouping and naming of the enclosed expression + - [\1 ] the text matched by the first [\(...\)] expression + ([\2] for the second expression, etc) + - [\b ] matches word boundaries + - [\ ] quotes special characters. *) + +val regexp_case_fold: string -> regexp +(** Same as [regexp], but the compiled expression will match text + in a case-insensitive way: uppercase and lowercase letters will + be considered equivalent. *) + +val quote: string -> string +(** [Str.quote s] returns a regexp string that matches exactly + [s] and nothing else. *) + +val regexp_string: string -> regexp +val regexp_string_case_fold: string -> regexp +(** [Str.regexp_string s] returns a regular expression + that matches exactly [s] and nothing else. + [Str.regexp_string_case_fold] is similar, but the regexp + matches in a case-insensitive way. *) + +(** {2 String matching and searching} *) + +val string_match: regexp -> string -> int -> bool +(** [string_match r s start] tests whether the characters in [s] + starting at position [start] match the regular expression [r]. + The first character of a string has position [0], as usual. *) + +val search_forward: regexp -> string -> int -> int +(** [search_forward r s start] searches the string [s] for a substring + matching the regular expression [r]. The search starts at position + [start] and proceeds towards the end of the string. + Return the position of the first character of the matched + substring, or raise [Not_found] if no substring matches. *) + +val search_backward: regexp -> string -> int -> int +(** Same as [search_forward], but the search proceeds towards the + beginning of the string. *) + +val string_partial_match: regexp -> string -> int -> bool +(** Similar to [string_match], but succeeds whenever the argument + string is a prefix of a string that matches. This includes + the case of a true complete match. *) + +val matched_string: string -> string +(** [matched_string s] returns the substring of [s] that was matched + by the latest [string_match], [search_forward] or [search_backward]. + The user must make sure that the parameter [s] is the same string + that was passed to the matching or searching function. *) + +val match_beginning: unit -> int +val match_end: unit -> int +(** [match_beginning ()] returns the position of the first character + of the substring that was matched by [string_match], + [search_forward] or [search_backward]. [match_end ()] returns + the position of the character following the last character of + the matched substring. *) + +val matched_group: int -> string -> string +(** [matched_group n s] returns the substring of [s] that was matched + by the [n]th group [\(...\)] of the regular expression during + the latest [string_match], [search_forward] or [search_backward]. + The user must make sure that the parameter [s] is the same string + that was passed to the matching or searching function. + [matched_group n s] raises [Not_found] if the [n]th group + of the regular expression was not matched. This can happen + with groups inside alternatives [\|], options [?] + or repetitions [*]. For instance, the empty string will match + [\(a\)*], but [matched_group 1 ""] will raise [Not_found] + because the first group itself was not matched. *) + +val group_beginning: int -> int +val group_end: int -> int +(** [group_beginning n] returns the position of the first character + of the substring that was matched by the [n]th group of + the regular expression. [group_end n] returns + the position of the character following the last character of + the matched substring. Both functions raise [Not_found] + if the [n]th group of the regular expression + was not matched. *) + +(** {2 Replacement} *) + +val global_replace: regexp -> string -> string -> string +(** [global_replace regexp templ s] returns a string identical to [s], + except that all substrings of [s] that match [regexp] have been + replaced by [templ]. The replacement template [templ] can contain + [\1], [\2], etc; these sequences will be replaced by the text + matched by the corresponding group in the regular expression. + [\0] stands for the text matched by the whole regular expression. *) + +val replace_first: regexp -> string -> string -> string +(** Same as [global_replace], except that only the first substring + matching the regular expression is replaced. *) + +val global_substitute: regexp -> (string -> string) -> string -> string +(** [global_substitute regexp subst s] returns a string identical + to [s], except that all substrings of [s] that match [regexp] + have been replaced by the result of function [subst]. The + function [subst] is called once for each matching substring, + and receives [s] (the whole text) as argument. *) + +val substitute_first: regexp -> (string -> string) -> string -> string +(** Same as [global_substitute], except that only the first substring + matching the regular expression is replaced. *) + +val replace_matched : string -> string -> string +(** [replace_matched repl s] returns the replacement text [repl] + in which [\1], [\2], etc. have been replaced by the text + matched by the corresponding groups in the most recent matching + operation. [s] must be the same string that was matched during + this matching operation. *) + +(** {2 Splitting} *) + +val split: regexp -> string -> string list +(** [split r s] splits [s] into substrings, taking as delimiters + the substrings that match [r], and returns the list of substrings. + For instance, [split (regexp "[ \t]+") s] splits [s] into + blank-separated words. An occurrence of the delimiter at the + beginning and at the end of the string is ignored. *) + +val bounded_split: regexp -> string -> int -> string list +(** Same as [split], but splits into at most [n] substrings, + where [n] is the extra integer parameter. *) + +val split_delim: regexp -> string -> string list +val bounded_split_delim: regexp -> string -> int -> string list +(** Same as [split] and [bounded_split], but occurrences of the + delimiter at the beginning and at the end of the string are + recognized and returned as empty strings in the result. + For instance, [split_delim (regexp " ") " abc "] + returns [[""; "abc"; ""]], while [split] with the same + arguments returns [["abc"]]. *) + +type split_result = Text of string | Delim of string + +val full_split: regexp -> string -> split_result list +val bounded_full_split: regexp -> string -> int -> split_result list +(** Same as [split_delim] and [bounded_split_delim], but returns + the delimiters as well as the substrings contained between + delimiters. The former are tagged [Delim] in the result list; + the latter are tagged [Text]. For instance, + [full_split (regexp "[{}]") "{ab}"] returns + [[Delim "{"; Text "ab"; Delim "}"]]. *) + +(** {2 Extracting substrings} *) + +val string_before: string -> int -> string +(** [string_before s n] returns the substring of all characters of [s] + that precede position [n] (excluding the character at + position [n]). *) + +val string_after: string -> int -> string +(** [string_after s n] returns the substring of all characters of [s] + that follow position [n] (including the character at + position [n]). *) + +val first_chars: string -> int -> string +(** [first_chars s n] returns the first [n] characters of [s]. + This is the same function as [string_before]. *) + +val last_chars: string -> int -> string +(** [last_chars s n] returns the last [n] characters of [s]. *) diff --git a/vendor/update-re.sh b/vendor/update-re.sh index 789614e89f6..8f799e87700 100755 --- a/vendor/update-re.sh +++ b/vendor/update-re.sh @@ -1,6 +1,6 @@ #!/bin/bash -version=1.7.1 +version=1.9.0 set -e -o pipefail @@ -10,18 +10,24 @@ trap "rm -rf $TMP" EXIT rm -rf re mkdir -p re/src -(cd $TMP && opam source re.$version) +( + cd $TMP + git clone https://github.com/ocaml/ocaml-re.git + cd ocaml-re + git checkout $version +) -SRC=$TMP/re.$version +SRC=$TMP/ocaml-re -cp -v $SRC/LICENSE re +cp -v $SRC/LICENSE.md re +cp -v $SRC/lib/*.{ml,mli} re/src/ -for m in re re_automata re_cset re_fmt; do - for ext in ml mli; do - fn=$SRC/lib/$m.$ext - [[ -f $fn ]] && cp -v $fn re/src - done -done +echo "include Re" > re/src/dune_re.ml +cat >re/src/dune <