def _compile_regex(expression, options=None):
    """Compile ``expression`` with the optional flag string ``options``.

    Shared by ``fn_regex``, ``fn_match`` and ``fn_match_all`` so the three
    functions cannot drift apart in how they treat flags.

    :param expression: Source text of the regular expression.
    :param options: Flag letters forwarded to ``_parse_regex_flags``.
        Any falsy value (``None`` or ``""``) means "no flags".
    :return: A pattern compiled with the ``regex`` module.
    """
    # Falsy options skip flag parsing altogether, exactly as the original
    # conditional expression did, so ``_parse_regex_flags`` never sees "".
    if not options:
        return regex.compile(expression)

    return regex.compile(expression, flags=_parse_regex_flags(options))


def fn_regex(path, expression, options=None):
    """Build a predicate telling whether the text at ``path`` matches.

    :param path: Query compiled with the enclosing ``compile`` into a getter.
    :param expression: Regular expression source text.
    :param options: Optional flag letters (see ``_compile_regex``).
    :return: A callable ``value -> bool``.
    """
    compiled_regex = _compile_regex(expression, options)
    getter = compile(path)

    # ``search`` (not ``match``): ``match`` is implicitly anchored at the
    # start of the string, which made "regex" disagree with "match" and
    # "matchAll" below, both of which scan the whole text. Patterns that need
    # whole-string semantics should anchor explicitly with ``^`` / ``$``.
    # NOTE(review): this is assumed to mirror an unanchored test in the
    # reference implementation -- confirm against the JSON Query spec.
    return lambda value: compiled_regex.search(getter(value)) is not None


def match_to_json(result):
    """Describe a match object as a plain, JSON-serialisable dict.

    ``"value"`` (the full matched text) is always present. ``"groups"`` (all
    capture groups, in order) is added when the pattern has capture groups,
    and ``"namedGroups"`` (name -> text) when it has named groups.

    :param result: A match object exposing ``group``, ``groups`` and
        ``groupdict``.
    :return: ``dict`` with the keys described above.
    """
    output = {"value": result.group()}
    groups = list(result.groups())
    named_groups = result.groupdict()

    # Named groups are also positional groups, so a non-empty
    # ``named_groups`` always comes together with a non-empty ``groups``.
    if groups:
        output["groups"] = groups
    if named_groups:
        output["namedGroups"] = named_groups

    return output


def fn_match(path, expression, options=None):
    """Build a function extracting the first match from the text at ``path``.

    :param path: Query compiled with the enclosing ``compile`` into a getter.
    :param expression: Regular expression source text.
    :param options: Optional flag letters (see ``_compile_regex``).
    :return: A callable ``value -> dict | None``; ``None`` when nothing
        matches, otherwise the ``match_to_json`` description of the match.
    """
    compiled_regex = _compile_regex(expression, options)
    getter = compile(path)

    def search(value):
        first_match = compiled_regex.search(getter(value))

        return match_to_json(first_match) if first_match else None

    return search


def fn_match_all(path, expression, options=None):
    """Build a function extracting every match from the text at ``path``.

    :param path: Query compiled with the enclosing ``compile`` into a getter.
    :param expression: Regular expression source text.
    :param options: Optional flag letters (see ``_compile_regex``).
    :return: A callable ``value -> list[dict]``; the list is empty when
        nothing matches.
    """
    compiled_regex = _compile_regex(expression, options)
    getter = compile(path)

    return lambda value: [
        match_to_json(item) for item in compiled_regex.finditer(getter(value))
    ]
fn_gt, "gte": fn_gte, @@ -384,12 +426,12 @@ def _parse_regex_flags(flags): return None all_flags = { - "A": re.A, - "I": re.I, - "M": re.M, - "S": re.S, - "X": re.X, - "L": re.L, + "A": regex.A, + "I": regex.I, + "M": regex.M, + "S": regex.S, + "X": regex.X, + "L": regex.L, } first, *rest = flags.upper() diff --git a/requirements.txt b/requirements.txt index daad4e6..e5338ab 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,6 +19,7 @@ Pygments==2.19.1 pyproject_hooks==1.2.0 pywin32-ctypes==0.2.3 readme_renderer==44.0 +regex==2025.11.3 requests==2.32.3 requests-toolbelt==1.0.0 rfc3986==2.0.0 diff --git a/setup.py b/setup.py index 5902a49..2e70c90 100644 --- a/setup.py +++ b/setup.py @@ -33,5 +33,5 @@ "Operating System :: OS Independent", ], include_package_data=True, - install_requires=[], + install_requires=["regex"], ) diff --git a/tests/test-suite/compile.test.json b/tests/test-suite/compile.test.json index 208ccf0..f540af2 100644 --- a/tests/test-suite/compile.test.json +++ b/tests/test-suite/compile.test.json @@ -2011,6 +2011,155 @@ "input": null, "query": ["regex", "Joe", "^[A-z]+$"], "output": true + }, + { + "input": null, + "query": ["regex", "2025", "^[A-z]+$"], + "output": false + } + ] + }, + { + "category": "match", + "description": "should extract a regular expression match from a string", + "tests": [ + { + "input": null, + "query": ["match", "Hello World!", "[A-z]+"], + "output": { "value": "Hello" } + }, + { + "input": null, + "query": ["match", "2025-11-05", "[A-z]+"], + "output": null + }, + { + "input": null, + "query": ["match", "Hello World!", "([A-Z])([a-z]+)"], + "output": { + "value": "Hello", + "groups": ["H", "ello"] + } + } + ] + }, + { + "category": "match", + "description": "should extract a regular expression match with groups from a string", + "tests": [ + { + "input": null, + "query": [ + "match", + "I'm on holiday from 2025-07-18 till 2025-08-01", + "(?<year>\\d{4})-(?<month>\\d{2})-(?<date>\\d{2})" + ], + "output": { + "value": 
"2025-07-18", + "groups": ["2025", "07", "18"], + "namedGroups": { "year": "2025", "month": "07", "date": "18" } + } + }, + { + "input": null, + "query": ["match", "Hello World!", "(?<year>\\d{4})-(?<month>\\d{2})-(?<date>\\d{2})"], + "output": null + } + ] + }, + { + "category": "match", + "description": "should extract a regular expression match with flags from a string", + "tests": [ + { + "input": null, + "query": ["match", "Hello World!", "world", ""], + "output": null + }, + { + "input": null, + "query": ["match", "Hello World!", "world!", "i"], + "output": { "value": "World!" } + }, + { + "input": null, + "query": ["match", "Hello World!", "(?<group1>world)!", "i"], + "output": { + "value": "World!", + "groups": ["World"], + "namedGroups": { "group1": "World" } + } + } + ] + }, + { + "category": "matchAll", + "description": "should extract all regular expression matches from a string", + "tests": [ + { + "input": null, + "query": ["matchAll", "Hello World!", "[A-z]+"], + "output": [{ "value": "Hello" }, { "value": "World" }] + }, + { + "input": null, + "query": ["matchAll", "2025-05-11", "[A-z]+"], + "output": [] + }, + { + "input": null, + "query": ["matchAll", "Hello World!", "([A-Z])([a-z]+)"], + "output": [ + { "value": "Hello", "groups": ["H", "ello"] }, + { "value": "World", "groups": ["W", "orld"] } + ] + } + ] + }, + { + "category": "matchAll", + "description": "should extract all regular expression matches with groups from a string", + "tests": [ + { + "input": null, + "query": [ + "matchAll", + "I'm on holiday from 2025-07-18 till 2025-08-01", + "(?<year>\\d{4})-(?<month>\\d{2})-(?<date>\\d{2})" + ], + "output": [ + { + "value": "2025-07-18", + "groups": ["2025", "07", "18"], + "namedGroups": { "year": "2025", "month": "07", "date": "18" } + }, + { + "value": "2025-08-01", + "groups": ["2025", "08", "01"], + "namedGroups": { "year": "2025", "month": "08", "date": "01" } + } + ] + }, + { + "input": null, + "query": ["matchAll", "Hello World!", "(?<year>\\d{4})-(?<month>\\d{2})-(?<date>\\d{2})"], + "output": [] + 
} + ] + }, + { + "category": "matchAll", + "description": "should extract all regular expression matches with a flag from a string", + "tests": [ + { + "input": null, + "query": ["matchAll", "Hello World!", "\\b[a-z]+\\b", ""], + "output": [] + }, + { + "input": null, + "query": ["matchAll", "Hello World!", "\\b[a-z]+\\b", "i"], + "output": [{ "value": "Hello" }, { "value": "World" }] } ] },