Skip to content

Commit

Permalink
[JSON] Raise exceptions when "sibling" keywords are unhandled (#1063)
Browse files Browse the repository at this point in the history
When an applicator keyword (e.g. `anyOf`, `allOf`, `oneOf`, or `$ref`) has sibling keywords, JSON Schema mandates that both sets of keywords are respected. This PR ensures that we raise an exception early wherever this "schema intersection" behavior isn't explicitly handled.
  • Loading branch information
hudson-ai authored Oct 29, 2024
1 parent da80081 commit dc5a080
Show file tree
Hide file tree
Showing 2 changed files with 173 additions and 24 deletions.
64 changes: 52 additions & 12 deletions guidance/library/_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,17 +165,17 @@ class ObjectKeywords(str, Enum):
JSONType.OBJECT: ObjectKeywords,
}

DEFS_KEYS = {"$defs", "definitions"}

IGNORED_KEYS = {
"$anchor",
"$defs",
"$schema",
"$id",
"id",
"$comment",
"title",
"description",
"default",
"definitions",
"description",
"examples",
}

Expand All @@ -188,7 +188,7 @@ class ObjectKeywords(str, Enum):
IGNORED_KEYS.add("discriminator")

WHITESPACE = {b" ", b"\t", b"\n", b"\r"}
VALID_KEYS = set(Keyword) | IGNORED_KEYS | DEFS_KEYS | set(NumberKeywords) | set(StringKeywords) | set(ArrayKeywords) | set(ObjectKeywords)
VALID_KEYS = set(Keyword) | set(NumberKeywords) | set(StringKeywords) | set(ArrayKeywords) | set(ObjectKeywords) | IGNORED_KEYS

FORMAT_PATTERNS: dict[str, Optional[str]] = {
# https://json-schema.org/understanding-json-schema/reference/string#built-in-formats
Expand Down Expand Up @@ -398,6 +398,11 @@ def validate_json_node_keys(node: Mapping[str, Any]):
)


def get_sibling_keys(node: Mapping[str, Any], key: str) -> set[str]:
    """Return the functional (non-ignored) keys of ``node`` that are siblings of ``key``.

    A key is treated as functional when it is a member of ``VALID_KEYS`` and
    not a member of ``IGNORED_KEYS``. ``key`` itself is never part of the
    result.
    """
    present_keys = set(node.keys())
    # Every recognised key that is neither ignored nor the key under inspection.
    functional_keys = (VALID_KEYS - set(IGNORED_KEYS)) - {key}
    return present_keys & functional_keys


class GenJson:
item_separator = ", "
key_separator = ": "
Expand Down Expand Up @@ -723,7 +728,20 @@ def const(
lm,
*,
value: Union[None, bool, int, float, str, Mapping, Sequence],
instance_type: Optional[Union[str, Sequence[str]]] = None,
enum: Optional[Sequence[Union[None, bool, int, float, str, Mapping, Sequence]]] = None,
):
schema_to_validate_against: dict[str, Any] = {}
if instance_type is not None:
schema_to_validate_against["type"] = instance_type
if enum is not None:
schema_to_validate_against["enum"] = enum
if schema_to_validate_against:
# Raise a validation error if the value doesn't satisfy the sibling "type" and/or "enum" keywords
jsonschema.validate(
instance=value,
schema=schema_to_validate_against,
)
# Base case
if isinstance(value, (type(None), bool, int, float, str)):
return lm + json_dumps(value)
Expand Down Expand Up @@ -756,14 +774,18 @@ def enum(
self,
lm,
*,
options: Sequence[Mapping[str, Any]]
options: Sequence[Union[None, bool, int, float, str, Mapping, Sequence]],
instance_type: Optional[Union[str, Sequence[str]]] = None,
):
# TODO: can we support a whitespace-flexible version of this?
all_opts: list[GrammarFunction] = []
for opt in options:
all_opts.append(
self.const(value=opt)
)
for instance in options:
try:
grm = self.const(value=instance, instance_type=instance_type)
except jsonschema.ValidationError:
continue
all_opts.append(grm)
if not all_opts:
raise ValueError(f"No valid options found for enum with type {instance_type!r}: {options}")
return lm + select(options=all_opts)


Expand Down Expand Up @@ -811,29 +833,47 @@ def json(
validate_json_node_keys(json_schema)

if Keyword.ANYOF in json_schema:
sibling_keys = get_sibling_keys(json_schema, Keyword.ANYOF)
if sibling_keys:
raise NotImplementedError(f"anyOf with sibling keys is not yet supported. Got {sibling_keys}")
return lm + self.anyOf(anyof_list=json_schema[Keyword.ANYOF])

if Keyword.ALLOF in json_schema:
sibling_keys = get_sibling_keys(json_schema, Keyword.ALLOF)
if sibling_keys:
raise NotImplementedError(f"allOf with sibling keys is not yet supported. Got {sibling_keys}")
allof_list = json_schema[Keyword.ALLOF]
if len(allof_list) != 1:
raise ValueError("Only support allOf with exactly one item")
return lm + self.json(json_schema=allof_list[0])

if Keyword.ONEOF in json_schema:
sibling_keys = get_sibling_keys(json_schema, Keyword.ONEOF)
if sibling_keys:
raise NotImplementedError(f"oneOf with sibling keys is not yet supported. Got {sibling_keys}")
oneof_list = json_schema[Keyword.ONEOF]
if len(oneof_list) == 1:
return lm + self.json(json_schema=oneof_list[0])
warnings.warn("oneOf not fully supported, falling back to anyOf. This may cause validation errors in some cases.")
return lm + self.anyOf(anyof_list=oneof_list)

if Keyword.REF in json_schema:
sibling_keys = get_sibling_keys(json_schema, Keyword.REF)
if sibling_keys:
raise NotImplementedError(f"$ref with sibling keys is not yet supported. Got {sibling_keys}")
return lm + self.ref(reference=json_schema[Keyword.REF])

if Keyword.CONST in json_schema:
return lm + self.const(value=json_schema[Keyword.CONST])
sibling_keys = get_sibling_keys(json_schema, Keyword.CONST) - {Keyword.TYPE, Keyword.ENUM}
if sibling_keys:
raise NotImplementedError(f"const with sibling keys is not yet supported. Got {sibling_keys}")
return lm + self.const(value=json_schema[Keyword.CONST], instance_type=json_schema.get(Keyword.TYPE, None), enum=json_schema.get(Keyword.ENUM, None))

if Keyword.ENUM in json_schema:
return lm + self.enum(options=json_schema[Keyword.ENUM])
sibling_keys = get_sibling_keys(json_schema, Keyword.ENUM) - {Keyword.TYPE}
if sibling_keys:
raise NotImplementedError(f"enum with sibling keys is not yet supported. Got {sibling_keys}")
return lm + self.enum(options=json_schema[Keyword.ENUM], instance_type=json_schema.get(Keyword.TYPE, None))

if Keyword.TYPE in json_schema:
target_types = cast(Union[str, Sequence[str]], json_schema[Keyword.TYPE])
Expand Down
133 changes: 121 additions & 12 deletions tests/unit/library/test_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -1265,14 +1265,12 @@ def test_nested_refs(self, test_object, valid):
# ref valid, maxItems valid
({"foo": []}, True),
# ref valid, maxItems invalid
pytest.param(
*({"foo": [1, 2, 3]}, False),
marks=pytest.mark.xfail(reason="sibling keywords to ref are not yet supported"),
),
({"foo": [1, 2, 3]}, False),
# ref invalid
({"foo": "string"}, False),
],
)
@pytest.mark.xfail(reason="sibling keywords to ref are not yet supported")
def test_ref_applies_alongside_sibling_keywords(self, test_object, valid):
schema = {
"$schema": "https://json-schema.org/draft/2020-12/schema",
Expand Down Expand Up @@ -1559,12 +1557,10 @@ def test_naive_replacement_of_ref_with_its_destination_is_not_correct(
# invalid on outer field
({"foo": {"bar": "a"}, "bar": 1}, False),
# valid on both fields
pytest.param(
*({"foo": {"bar": "a"}, "bar": "a"}, True),
marks=pytest.mark.xfail(reason="refs with sibling keywords are not yet supported; foo here is being seen as an additionalProperty before bar"),
),
({"foo": {"bar": "a"}, "bar": "a"}, True),
],
)
@pytest.mark.xfail(reason="refs with sibling keywords are not yet supported")
def test_refs_with_relative_uris_and_defs(self, test_object, valid):
schema = {
"$schema": "https://json-schema.org/draft/2020-12/schema",
Expand Down Expand Up @@ -1594,12 +1590,10 @@ def test_refs_with_relative_uris_and_defs(self, test_object, valid):
# invalid on outer field
({"foo": {"bar": "a"}, "bar": 1}, False),
# valid on both fields
pytest.param(
*({"foo": {"bar": "a"}, "bar": "a"}, True),
marks=pytest.mark.xfail(reason="refs with sibling keywords are not yet supported; foo here is being seen as an additionalProperty before bar"),
),
({"foo": {"bar": "a"}, "bar": "a"}, True),
],
)
@pytest.mark.xfail(reason="refs with sibling keywords are not yet supported")
def test_relative_refs_with_absolute_uris_and_defs(self, test_object, valid):
schema = {
"$schema": "https://json-schema.org/draft/2020-12/schema",
Expand Down Expand Up @@ -2354,6 +2348,60 @@ def test_bad_prefix_enum(self, bad_obj, good_bytes, failure_byte, allowed_bytes)
schema_obj=schema_obj,
)

@pytest.mark.parametrize(
    "obj, valid",
    [
        (1, True),
        (2, False),
        ("2", False),
        ("1", False),
        (True, False),
    ],
)
def test_typed_enum_single_type(self, obj, valid):
    """Check an ``enum`` combined with a single sibling ``type``.

    Objects flagged ``valid`` must pass ``validate`` and ``generate_and_check``;
    all others must raise ``ValidationError`` and are then passed to
    ``check_match_failure``.
    """
    schema_obj = {"enum": [1, "2", True], "type": "integer"}
    if not valid:
        # The reference validator has to reject the object first.
        with pytest.raises(ValidationError):
            validate(instance=obj, schema=schema_obj)
        check_match_failure(bad_string=json_dumps(obj), schema_obj=schema_obj)
        return
    validate(instance=obj, schema=schema_obj)
    generate_and_check(obj, schema_obj)

@pytest.mark.parametrize(
    "obj, valid",
    [
        (1, True),
        (2, False),
        ("2", True),
        ("1", False),
        (True, False),
    ],
)
def test_typed_enum_multiple_types(self, obj, valid):
    """Check an ``enum`` combined with a sibling ``type`` given as a list.

    Objects flagged ``valid`` must pass ``validate`` and ``generate_and_check``;
    all others must raise ``ValidationError`` and are then passed to
    ``check_match_failure``.
    """
    schema_obj = {"enum": [1, "2", True], "type": ["integer", "string"]}
    if not valid:
        # The reference validator has to reject the object first.
        with pytest.raises(ValidationError):
            validate(instance=obj, schema=schema_obj)
        check_match_failure(bad_string=json_dumps(obj), schema_obj=schema_obj)
        return
    validate(instance=obj, schema=schema_obj)
    generate_and_check(obj, schema_obj)

def test_invalid_typed_enum(self):
    """An ``enum`` with no option matching the sibling ``type`` raises ``ValueError``."""
    schema_obj = {"enum": [1, "2"], "type": "boolean"}
    expected_message = "No valid options found for enum with type 'boolean': [1, '2']"
    with pytest.raises(ValueError) as ve:
        gen_json(schema=schema_obj)
    # Checked after the ``with`` block so the assertion actually runs.
    assert ve.value.args[0] == expected_message

class TestConst:
def test_constant_int(self):
Expand Down Expand Up @@ -2413,6 +2461,67 @@ def test_constant_precedence(self):
schema_obj=schema_obj,
)

def test_valid_typed_const(self):
    """A ``const`` whose value satisfies the sibling ``type`` validates and generates."""
    schema_obj = {"const": 1, "type": "integer"}
    expected = 1
    validate(instance=expected, schema=schema_obj)
    generate_and_check(expected, schema_obj)

def test_invalid_typed_const(self):
    """A ``const`` whose value violates the sibling ``type`` raises ``ValidationError``."""
    schema_obj = {"const": 1, "type": "boolean"}
    with pytest.raises(ValidationError):
        gen_json(schema=schema_obj)

def test_valid_enum_const(self):
    """A ``const`` whose value is listed in the sibling ``enum`` validates and generates."""
    schema_obj = {"const": 1, "enum": [1, 2, 3]}
    expected = 1
    validate(instance=expected, schema=schema_obj)
    generate_and_check(expected, schema_obj)

def test_invalid_enum_const(self):
    """A ``const`` whose value is missing from the sibling ``enum`` raises ``ValidationError``."""
    schema_obj = {"const": 1, "enum": [2, 3]}
    with pytest.raises(ValidationError):
        gen_json(schema=schema_obj)

def test_valid_typed_enum_const(self):
    """A ``const`` satisfying both sibling ``enum`` and ``type`` validates and generates."""
    schema_obj = {"const": 1, "enum": [1, "2", 3], "type": "integer"}
    expected = 1
    validate(instance=expected, schema=schema_obj)
    generate_and_check(expected, schema_obj)

@pytest.mark.parametrize(
    "const",
    [
        "2",  # right enum, wrong type
        2,  # wrong enum, right type
        "3",  # wrong enum, wrong type
    ],
)
def test_invalid_typed_enum_const(self, const):
    """A ``const`` violating the sibling ``enum`` and/or ``type`` raises ``ValidationError``."""
    schema_obj = {"const": const, "enum": [1, "2", 3], "type": "integer"}
    with pytest.raises(ValidationError):
        gen_json(schema=schema_obj)


class TestAdditionalProperties:

Expand Down

0 comments on commit dc5a080

Please sign in to comment.