Skip to content

Commit

Permalink
regex filters (#432): scan, splits, split, sub, gsub
Browse files Browse the repository at this point in the history
  • Loading branch information
pkoppstein authored and nicowilliams committed Aug 8, 2014
1 parent 0d437e2 commit a696c6b
Show file tree
Hide file tree
Showing 3 changed files with 185 additions and 11 deletions.
77 changes: 66 additions & 11 deletions builtin.c
Original file line number Diff line number Diff line change
Expand Up @@ -974,23 +974,78 @@ static const char* const jq_builtins[] = {
// flatten/0: build a single flat array from the input array by appending each
// non-array element and recursively splicing in the flattened form of each
// nested array.
"def flatten: reduce .[] as $i ([]; if $i | type == \"array\" then . + ($i | flatten) else . + [$i] end);",
// flatten/1: like flatten/0 but depth-limited. A nested array is only spliced
// in while the remaining depth $x is > 0; each level of recursion uses $x-1.
// Nested arrays beyond that depth are appended unchanged.
"def flatten(x): x as $x | reduce .[] as $i ([]; if $i | type == \"array\" and $x > 0 then . + ($i | flatten($x-1)) else . + [$i] end);",
// range/1: shorthand for range(0; $x).
"def range(x): x as $x | range(0;$x);",
// regular expressions:
// match/2: emit, one at a time, the elements of the array produced by
// _match_impl(re; mode; false).
// NOTE(review): the third argument is `true` in test/2 and `false` here;
// presumably it selects "test only" behaviour -- confirm against _match_impl.
"def match(re; mode): _match_impl(re; mode; false)|.[];",
// match/1: `val` is either a regex string, or an array of the form
// [regex] or [regex, flags]. Dispatches to match/2 accordingly; any other
// value (including an empty array) raises "<type> not a string or array".
// The elif/else continuation lines used to be repeated after the terminating
// `end;`, which made the concatenated jq source unparsable.
"def match(val): (val|type) as $vt | if $vt == \"string\" then match(val; null)"
" elif $vt == \"array\" and (val | length) > 1 then match(val[0]; val[1])"
" elif $vt == \"array\" and (val | length) > 0 then match(val[0]; null)"
" else error( $vt + \" not a string or array\") end;",
// test/2: emit the result of _match_impl(re; mode; true) directly (no `.[]`
// unpacking, unlike match/2).
// NOTE(review): presumably a boolean "did it match" -- confirm against _match_impl.
"def test(re; mode): _match_impl(re; mode; true);",
// test/1: `val` is either a regex string, or an array of the form
// [regex] or [regex, flags]. Dispatches to test/2 accordingly; any other
// value (including an empty array) raises "<type> not a string or array".
// The elif/else continuation lines used to be repeated after the terminating
// `end;`, which made the concatenated jq source unparsable.
"def test(val): (val|type) as $vt | if $vt == \"string\" then test(val; null)"
" elif $vt == \"array\" and (val | length) > 1 then test(val[0]; val[1])"
" elif $vt == \"array\" and (val | length) > 0 then test(val[0]; null)"
" else error( $vt + \" not a string or array\") end;",
// Ex.: "a1" | capture( "(?<x>[a-z*])" ).x => "a"
// capture/2: for each match emitted by match(re; mods), emit one object that
// maps every *named* capture group to its captured string. Unnamed groups
// (name == null) are ignored; a match with no named groups yields {}.
"def capture(re; mods): match(re; mods) | reduce ( .captures | .[] | select(.name != null) | { (.name) : .string } ) as $pair ({}; . + $pair);",
// capture/1: `val` is either a regex string, or an array of the form
// [regex] or [regex, flags]. Dispatches to capture/2 accordingly; any other
// value (including an empty array) raises "<type> not a string or array".
// The elif/else continuation lines used to be repeated after the terminating
// `end;`, which made the concatenated jq source unparsable.
"def capture(val): (val|type) as $vt | if $vt == \"string\" then capture(val; null)"
" elif $vt == \"array\" and (val | length) > 1 then capture(val[0]; val[1])"
" elif $vt == \"array\" and (val | length) > 0 then capture(val[0]; null)"
" else error( $vt + \" not a string or array\") end;",
// scan(re; flags): emit a stream with one item per non-overlapping match of
// `re` in the input (the "g" flag is always added; repeating "g" in `flags`
// is harmless). If the regex has capture groups, the item is the array of
// captured strings; otherwise it is the matched substring.
"def scan(re; flags):"
"  match(re; \"g\" + flags)"
"  | if (.captures|length > 0)"
"    then [ .captures | .[] | .string ]"
"    else .string"
"    end ;",
// scan(re): scan with no extra flags. `"g" + null` is just "g", so this is
// exactly the original one-argument behaviour.
"def scan(re): scan(re; null);",
//
// If input is an array, then emit a stream of successive subarrays of length n (or less),
// and similarly for strings.
// nwise(a; n): if `a` has at most n items it is emitted as-is (so an empty `a`
// is emitted once); otherwise emit a[0:n] and recurse on a[n:].
// NOTE(review): with n <= 0 and a non-empty `a`, a[n:] never shrinks, so the
// recursion does not terminate -- callers must pass n >= 1.
"def nwise(a; n): if a|length <= n then a else a[0:n] , nwise(a[n:]; n) end;",
// nwise(n): apply nwise/2 to the current input.
"def nwise(n): nwise(.; n);",
//
// splits/1 produces a stream; split/1 is retained for backward compatibility.
// splits(re; flags): emit the pieces of the input string that lie between
// successive matches of `re`. The start and end offset of every match are
// collected into one flat array, bracketed by 0 and the input length, and then
// consumed pairwise as [from, to] slices of the input. With no match the
// single pair [0, length] yields the whole input.
"def splits(re; flags): . as $s"
// # multiple occurrences of "g" are acceptable
" | [ match(re; \"g\" + flags) | (.offset, .offset + .length) ]"
" | [0] + . +[$s|length]"
" | nwise(2)"
" | $s[.[0]:.[1] ] ;",
// splits(re): splits with no extra flags ("g" + null is "g").
"def splits(re): splits(re; null);",
//
// split emits an array for backward compatibility
// Both forms simply collect the stream produced by splits into an array.
// NOTE(review): split(re) treats its argument as a regex. If a literal-string
// split/1 is already defined elsewhere, this definition shadows it and changes
// results for separators containing regex metacharacters (e.g. "." or "|") --
// confirm this is intended.
"def split(re; flags): [ splits(re; flags) ];",
"def split(re): [ splits(re; null) ];",
//
// If s contains capture variables, then create a capture object and pipe it to s
//
// sub(re; s): replace the first match of `re` in the input string with `s`.
//  - No match: the input is emitted unchanged. (Previously `.[0]` was null
//    and iterating its captures raised an error.)
//  - Named captures present: `s` is evaluated with the capture object as its
//    input, so it may refer to "\(.name)". The text before and after the match
//    is preserved. (Previously only the value of `s` was emitted and the rest
//    of the input was dropped.)
//  - No named captures: `s` is evaluated against the original input string,
//    exactly as before.
"def sub(re; s):"
"  . as $in"
"  | [match(re)]"
"  | if length == 0 then $in"
"    else .[0]"
"    | . as $r"
// #  create the \"capture\" object:
"    | reduce ( $r | .captures | .[] | select(.name != null) | { (.name) : .string } ) as $pair"
"        ({}; . + $pair)"
"    | if . == {} then $in | .[0:$r.offset]+s+.[$r.offset+$r.length:]"
"      else $in[0:$r.offset] + (. | s) + $in[$r.offset+$r.length:]"
"      end"
"    end ;",
//
// repeated substitution of re (which may contain named captures)
//
// gsub(re; s): replace every non-overlapping match of `re` in the input string
// with `s`. All matches are collected first and then applied from the last one
// to the first, so the offsets of the not-yet-applied (earlier) matches stay
// valid while the string is being rebuilt.
"def gsub(re; s):"
// # _stredit(edits;s) - s is the \"to\" string, which might contain capture variables,
// # so if an edit contains captures, then create the capture object and pipe it to s
"  def _stredit(edits; s):"
"    if (edits|length) == 0 then ."
"    else . as $in"
"      | (edits|length -1) as $l"
"      | (edits[$l]) as $edit"
// # create the \"capture\" object:
"      | ($edit | reduce ( $edit | .captures | .[] | select(.name != null) | { (.name) : .string } ) as $pair"
"          ({}; . + $pair) )"
// # No named captures: splice s (evaluated against the current string) over the
// # last match, then apply the remaining, earlier edits to the result.
"      | if . == {} then $in | .[0:$edit.offset]+s+.[$edit.offset+$edit.length:] | _stredit(edits[0:$l]; s)"
// # Named captures: apply the earlier edits to the text before this match, then
// # append s (evaluated against the capture object) and the untouched text after
// # the match. Previously the text between and after matches was dropped.
"        else ($in[0:$edit.offset] | _stredit(edits[0:$l]; s)) + (. | s) + $in[$edit.offset+$edit.length:]"
"        end"
"    end ;"
"  [match(re;\"g\")] as $edits | _stredit($edits; s) ;",

//#######################################################################
// range/3, with a `by` expression argument
"def range(init; upto; by): "
" init as $init |"
Expand Down
85 changes: 85 additions & 0 deletions docs/content/3.manual/manual.yml
Original file line number Diff line number Diff line change
Expand Up @@ -1721,6 +1721,91 @@ sections:
- program: 'capture("(?<a>[a-z]+)-(?<n>[0-9]+)")'
input: '"xyzzy-14"'
output: '{ "a": "xyzzy", "n": "14" }'
- title: "`scan(regex)`, `scan(regex; flags)`"
body: |
Emit a stream of the non-overlapping substrings of the input
that match the regex in accordance with the flags, if any
have been specified. If there is no match, the stream is empty.
To capture all the matches for each input string, use the idiom
[ expr ], e.g. [ scan(regex) ].
example:
- program: 'scan("c")'
input: '"abcdefabc"'
output: ['"c"', '"c"']

- program: '[scan("b")]'
input: '""'
output: ['[]']

- title: "`split(regex)`, `split(regex; flags)`"
body: |
For backwards compatibility, `split` emits an array of the strings
corresponding to the successive segments of the input string after it
has been split at the boundaries defined by the regex and any
specified flags. The substrings corresponding to the boundaries
themselves are excluded. If regex is the empty string, then the first
match will be the empty string.

`split(regex)` can be thought of as a wrapper around `splits(regex)`,
and similarly for `split(regex; flags)`.

example:
- program: 'split(", *")'
input: '"ab,cd, ef"'
output: '["ab","cd","ef"]'


- title: "`splits(regex)`, `splits(regex; flags)`"
body: |
These provide the same results as their `split` counterparts,
but as a stream instead of an array.

example:
- program: 'splits(", *")'
input: '"ab,cd, ef, gh"'
output: ['"ab"', '"cd"', '"ef"', '"gh"']
- title: "`sub(regex; tostring)`"
body: |
Emit the string obtained by replacing the first match of regex in the
input string with `tostring`, after interpolation. `tostring` should
be a jq string, and may contain references to named captures. The
named captures are, in effect, presented as a JSON object (as
constructed by `capture`) to `tostring`, so a reference to a captured
variable named "x" would take the form: "\(.x)".
example:
- program: 'sub("^[^a-z]*(?<x>[a-z]*).*"; "Z\(.x)Z\(.x)")'
input: '"123abc456"'
output: '"ZabcZabc"'


- title: "`gsub(regex; string)`"

body: |
`gsub` is like `sub` but all the non-overlapping occurrences of the regex are
replaced by the string, after interpolation.

example:
- program: 'gsub("(?<x>.)[^a]*"; "+\(.x)-")'

input: '"Abcabc"'
output: '"+A-+a-"'


- title: Advanced features
body: |
Expand Down
34 changes: 34 additions & 0 deletions tests/all.test
Original file line number Diff line number Diff line change
Expand Up @@ -820,6 +820,40 @@ capture("(?<a>[a-z]+)-(?<n>[0-9]+)")
"xyzzy-14"
{"a":"xyzzy","n":"14"}


# jq-coded utilities built on match:
#
# The second element in these tests' inputs tests the case where the
# fromstring matches both the head and tail of the string
[.[] | sub(", "; ":")]
["a,b, c, d, e,f", ", a,b, c, d, e,f, "]
["a,b:c, d, e,f",":a,b, c, d, e,f, "]

[.[] | gsub(", "; ":")]
["a,b, c, d, e,f",", a,b, c, d, e,f, "]
["a,b:c:d:e,f",":a,b:c:d:e,f:"]

[.[] | scan(", ")]
["a,b, c, d, e,f",", a,b, c, d, e,f, "]
[", ",", ",", ",", ",", ",", ",", ",", "]

[.[] | split(", ")]
["a,b, c, d, e,f",", a,b, c, d, e,f, "]
[["a,b","c","d","e,f"],["","a,b","c","d","e,f",""]]

########################
[.[]|[[sub(", *";":")], [gsub(", *";":")], [scan(", *")], split(", *")]]
["a,b, c, d, e,f",", a,b, c, d, e,f, "]
[[["a:b, c, d, e,f"],["a:b:c:d:e:f"],[",",", ",", ",", ",","],["a","b","c","d","e","f"]],[[":a,b, c, d, e,f, "],[":a:b:c:d:e:f:"],[", ",",",", ",", ",", ",",",", "],["","a","b","c","d","e","f",""]]]

[.[]|[[sub(", +";":")], [gsub(", +";":")], [scan(", +")], split(", +")]]
["a,b, c, d, e,f",", a,b, c, d, e,f, "]
[[["a,b:c, d, e,f"],["a,b:c:d:e,f"],[", ",", ",", "],["a,b","c","d","e,f"]],[[":a,b, c, d, e,f, "],[":a,b:c:d:e,f:"],[", ",", ",", ",", ",", "],["","a,b","c","d","e,f",""]]]

# reference to named captures
gsub("(?<x>.)[^a]*"; "+\(.x)-")
"Abcabc"
"+A-+a-"

[.[]|ltrimstr("foo")]
["fo", "foo", "barfoo", "foobar", "afoo"]
["fo","","barfoo","bar","afoo"]
Expand Down

0 comments on commit a696c6b

Please sign in to comment.