support for minimum/maximum in JSON schemas #1019

mmoskal · 2024-09-10T22:22:41Z

Currently we don't support the numerical bounds for JSON numbers. I wrote some code that compiles the bounds into regular expressions.

I don't have time right now to make it into full PR right now - maybe someone else wants to take a look.

There are two functions in the file, one for integer and one for real (float) ranges. It only supports inclusive ranges right now (not a problem for integers, but for floats would need some more work to support exclusive).

import re
import math


def mk_or(parts: list[str]) -> str:
    if len(parts) == 1:
        return parts[0]
    return "(" + "|".join(parts) + ")"


def num_digits(n: int) -> int:
    return len(str(n))


def rx_int_range(left: int, right: int) -> str:
    assert left <= right
    if left < 0:
        if right < 0:
            return "(-" + rx_int_range(-right, -left) + ")"
        else:
            return "(-" + rx_int_range(0, -left) + "|" + rx_int_range(0, right) + ")"
    else:
        if num_digits(left) == num_digits(right):
            l = str(left)
            r = str(right)
            if left == right:
                return f"({l})"

            lpref = l[:-1]
            lx = l[-1]
            rpref = r[:-1]
            rx = r[-1]

            # 1723-1728 => 172[3-8]
            if lpref == rpref:
                return f"({lpref}[{lx}-{rx}])"

            # general case 723-915 => 72[3-9]|(73-90)[0-9]|91[0-5]
            left_rec = int(lpref)
            right_rec = int(rpref)
            assert left_rec < right_rec
            parts = []

            # optimizations:
            # for 0, we have 720-915 => (72-90)[0-9]|91[0-5]
            if lx != "0":
                left_rec += 1
                parts.append(f"{lpref}[{lx}-9]")
            # for 9 we have 723-919 => 72[3-9]|(73-91)[0-9]
            if rx != "9":
                right_rec -= 1
                parts.append(f"{rpref}[0-{rx}]")

            # the middle can be empty 723-734 => 72[3-9]|73[0-4]
            if left_rec <= right_rec:
                inner = rx_int_range(left_rec, right_rec)
                parts.append(f"{inner}[0-9]")

            return mk_or(parts)
        else:
            break_point = 10 ** num_digits(left) - 1
            return mk_or(
                [rx_int_range(left, break_point), rx_int_range(break_point + 1, right)]
            )


def lexi_x_to_9(x: str) -> str:
    if x == "":
        return "[0-9]*"
    if len(x) == 1:
        return f"[{x}-9][0-9]*"
    x0 = int(x[0])
    parts = [x[0] + lexi_x_to_9(x[1:])]
    if x0 < 9:
        parts.append(f"[{x0 + 1}-9][0-9]*")
    return mk_or(parts)


def lexi_0_to_x(x: str) -> str:
    if x == "":
        return ""  # don't allow trailing zeros
    x0 = int(x[0])
    parts = [x[0] + lexi_0_to_x(x[1:])]
    if x0 > 0:
        parts.append(f"[0-{x0 - 1}][0-9]*")
    return mk_or(parts)


def lexi_range(ld: str, rd: str) -> str:
    assert len(ld) == len(rd)
    if ld == rd:
        return ld
    l0 = int(ld[0])
    r0 = int(rd[0])
    # common prefix: 137-144 => 1(37-44)
    if r0 == l0:
        return ld[0] + lexi_range(ld[1:], rd[1:])
    assert l0 < r0
    # 23470-82142 => 2(347-999)|[3-7][0-9]*|8(0000-2142)
    parts = [ld[0] + lexi_x_to_9(ld[1:].rstrip("0"))]
    # is the [3-7][0-9]* part empty?
    if l0 + 1 < r0:
        parts.append(f"[{l0 + 1}-{r0 - 1}][0-9]*")
    parts.append(rd[0] + lexi_0_to_x(rd[1:].rstrip("0")))
    return mk_or(parts)

def float_to_str(f: float) -> str:
    s = f"{f:f}"
    return s.rstrip("0").rstrip(".")

def rx_float_range(left: float, right: float) -> str:
    assert left <= right
    if left < 0:
        if right < 0:
            return "(-" + rx_float_range(-right, -left) + ")"
        else:
            return (
                "(-"
                + rx_float_range(0.0, -left)
                + "|"
                + rx_float_range(0.0, right)
                + ")"
            )
    else:
        l = float_to_str(left)
        r = float_to_str(right)
        if left == right:
            return f"({l})"

        if "e" in l or "e" in r:
            raise ValueError("Scientific notation not supported")
        if not math.isfinite(left) or not math.isfinite(right):
            raise ValueError("Infinite numbers not supported")

        left_rec = int(l.split(".")[0])
        right_rec = int(r.split(".")[0])

        ld = (l.split(".") + [""])[1]
        rd = (r.split(".") + [""])[1]

        # 17.123-17.1448 -> 17.((1230-1447)[0-9]*|1448)
        if left_rec == right_rec:
            while len(ld) < len(rd):
                ld += "0"
            while len(rd) < len(ld):
                rd += "0"
            suff = "\\." + lexi_range(ld, rd)
            if int(ld) == 0:
                return f"({left_rec}({suff})?)"
            else:
                return f"({left_rec}{suff})"

        parts = []

        # 7.321-22.123 -> 7.(321-999)|8-21(.[0-9]+)?|22.(000-123)
        if ld:
            parts.append(f"({left_rec}\\.{lexi_x_to_9(ld)})")
            left_rec += 1

        if right_rec - 1 >= left_rec:
            inner = rx_int_range(left_rec, right_rec - 1)
            parts.append(f"({inner}(\\.[0-9]+)?)")

        if rd:
            parts.append(f"({right_rec}(\\.{lexi_0_to_x(rd)})?)")
        else:
            parts.append(f"{right_rec}")

        return mk_or(parts)


def do_test_int_range(rx: str, left: int, right: int) -> None:
    print(rx)
    n = left - 1000
    while n < right + 1000:
        m = re.fullmatch(rx, str(n)) is not None
        f = left <= n <= right
        if m != f:
            print(f"Failed for {n} match={m} expected={f}")
            assert False
        n += 1


def test_int_range(left: int, right: int) -> None:
    print(f"Testing range {left}-{right}")
    rx = rx_int_range(left, right)
    do_test_int_range(rx, left, right)


def test_float_range(left: float, right: float) -> None:
    print(f"Testing range {left}-{right}")
    rx = rx_float_range(left, right)
    do_test_int_range(rx, math.ceil(left), math.floor(right))
    eps = 0.000001
    for x in [left, right, 0, int(left), int(right)]:
        for off in [0, -eps, eps, 1, -1]:
            n = x + off
            ns = float_to_str(n)
            m = re.fullmatch(rx, ns) is not None
            f = left <= n <= right
            if m != f:
                print(f"Failed float for {ns} match={m} expected={f}")
                assert False


def main():
    test_int_range(0, 9)
    test_int_range(1, 7)
    test_int_range(0, 99)
    test_int_range(13, 170)
    test_int_range(13, 17)
    test_int_range(13, 27)
    test_int_range(13, 57)
    test_int_range(72, 91)
    test_int_range(723, 915)
    test_int_range(23, 915)
    test_int_range(-1, 915)
    test_int_range(-9, 9)
    test_int_range(-3, 3)
    test_int_range(-3, 0)
    test_int_range(-72, 13)

    test_float_range(0, 10)
    test_float_range(-10, 0)

    test_float_range(0.5, 0.72)
    test_float_range(0.5, 1.72)
    test_float_range(0.5, 1.32)
    test_float_range(0.3245, 0.325)
    test_float_range(1, 2.34)
    test_float_range(1.33, 2)
    test_float_range(1, 10.34)
    test_float_range(1.33, 10)
    test_float_range(-1.33, 10)
    test_float_range(-17.23, -1.33)
    test_float_range(-1.23, -1.221)
    test_float_range(-10.2, 45293.9)


if __name__ == "__main__":
    main()

hudson-ai self-assigned this Sep 13, 2024

hudson-ai linked a pull request Sep 13, 2024 that will close this issue

[Feature] Support for JSON minimum , maximum, exclusiveMinimum, and exclusiveMaximum #1023

Open

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

support for minimum/maximum in JSON schemas #1019

support for minimum/maximum in JSON schemas #1019

mmoskal commented Sep 10, 2024

support for minimum/maximum in JSON schemas #1019

support for minimum/maximum in JSON schemas #1019

Comments

mmoskal commented Sep 10, 2024