-
Notifications
You must be signed in to change notification settings - Fork 34
/
Copy pathcreate_documents.py
244 lines (191 loc) · 7.34 KB
/
create_documents.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
import importlib
import importlib.util
import inspect
import json
from collections import OrderedDict
from pathlib import Path
from typing import Any, Dict, List, Set, Union, cast
import datamodel_code_generator
from event_model.basemodels import ALL_BASEMODELS
from .type_wrapper import BaseModel, to_snake
# Directory where the generated JSON schema files are written.
JSONSCHEMA = Path(__file__).parent.parent / "jsonschemas"
# Directory where the generated TypedDict document modules are written.
DOCUMENTS = Path(__file__).parent.parent / "documents"
# Used to add user written schema to autogenerated schema.
def merge_dicts(dict1: dict, dict2: dict) -> dict:
    """Merge ``dict1`` into a copy of ``dict2`` and return the result.

    On a shared key: if the values' types differ, ``dict1``'s value wins;
    nested dicts are merged recursively; lists are concatenated with
    ``dict1``'s entries first; any other same-type value keeps ``dict2``'s.
    Keys only in ``dict1`` are added as-is. Neither input is mutated.
    """
    merged = dict2.copy()
    for key, value in dict1.items():
        if key not in dict2:
            merged[key] = value
        elif not isinstance(value, type(dict2[key])):
            # Type conflict: prefer the user-written (dict1) side.
            merged[key] = value
        elif isinstance(value, dict):
            merged[key] = merge_dicts(value, dict2[key])
        elif isinstance(value, list):
            merged[key] = value + dict2[key]
    return merged
def sort_alphabetically(schema: Dict) -> Dict:
    """Return an ``OrderedDict`` copy of *schema* with its keys in
    ascending order."""
    # Keys in a dict are unique, so sorting the item tuples never falls
    # through to comparing the values.
    return OrderedDict(sorted(schema.items()))
# Preferred position of each top-level JSON-schema key when a schema is
# dumped; keys not listed here sort after all of these (see sort_schema).
SortOrder = {
    "title": 0,
    "description": 1,
    "type": 2,
    "$defs": 3,
    "properties": 4,
    "required": 5,
    "patternProperties": 6,
    "additionalProperties": 7,
}
def sort_schema(document_schema: Dict[str, Any]) -> Dict[str, Any]:
    """Arrange a JSON-schema dict into a canonical, reproducible order.

    Top-level keys follow ``SortOrder`` (unknown keys come last, keeping
    their relative order). Inside ``$defs``/``properties``/``required``:
    dict values are alphabetised and recursively canonicalised; list
    values are sorted in place.
    """
    assert isinstance(document_schema, dict)
    ordered = OrderedDict(
        sorted(
            document_schema.items(),
            key=lambda item: SortOrder.get(item[0], len(SortOrder)),
        )
    )
    for section in ("$defs", "properties", "required"):
        if section not in ordered:
            continue
        value = ordered[section]
        if isinstance(value, dict):
            ordered[section] = sort_alphabetically(value)
            for name, subschema in ordered[section].items():
                if isinstance(subschema, dict):
                    ordered[section][name] = sort_schema(subschema)
        elif isinstance(value, list):
            value.sort()
    return ordered
def dump_json(schema: Dict[str, Any], jsonschema_path: Path):
    """Canonically sort ``schema`` and write it to ``jsonschema_path``.

    NOTE(review): always returns True — the previous docstring claimed it
    reported whether the basemodel had to change, which the code never did.
    """
    sorted_schema = sort_schema(schema)
    with jsonschema_path.open(mode="w") as f:
        json.dump(sorted_schema, f, indent=4)
    return True
def remove_subschema(
    schema: Dict[str, Any], subschema: Dict[str, Any]
) -> Dict[str, Any]:
    """Strip the keys of ``subschema`` out of ``schema`` in place.

    A non-dict value removes the matching key outright; a dict value is
    pruned recursively, and the key is dropped once its dict is empty.
    Returns the (mutated) ``schema`` for convenience.
    """
    for key in list(subschema):
        if key not in schema:
            continue
        if isinstance(subschema[key], dict):
            schema[key] = remove_subschema(schema[key], subschema[key])
            if not schema[key]:
                del schema[key]
        else:
            del schema[key]
    return schema
def jsonschema_differs_from_saved(
    schema: Dict[str, Any],
    jsonschema_path: Path,
    ignore_schema: Union[Dict[str, Any], None] = None,
):
    """
    Check if the schema at the given file path differs from the one passed in,
    ignoring the extra schema.

    Returns True when ``jsonschema_path`` does not exist, or when its
    contents (after stripping ``ignore_schema`` and ``additionalProperties``)
    differ from ``schema``.

    NOTE: mutates both ``schema`` and the loaded file schema; callers
    should pass a copy if they need the original intact.
    """
    if not jsonschema_path.exists():
        # Fixed garbled debug print ("+++ PATH DOESN@T EXIST").
        print(f"Schema file does not exist: {jsonschema_path}")
        return True
    with jsonschema_path.open("r") as file:
        file_schema = json.load(file)
    # Remove extra schema keys from the file schema if extra_schema is provided
    if ignore_schema:
        file_schema = remove_subschema(file_schema, ignore_schema)
    # additionalProperties varies independently of the model, so exclude it
    # from the comparison on both sides.
    file_schema.pop("additionalProperties", None)
    schema.pop("additionalProperties", None)
    return sort_schema(file_schema) != sort_schema(schema)
def import_basemodels(path: Path) -> List[type[BaseModel]]:
    """Import the module at *path* and collect every ``BaseModel`` subclass
    it exposes (excluding ``BaseModel`` itself).

    Raises ``RuntimeError`` when a module spec/loader cannot be built.
    """
    spec = importlib.util.spec_from_file_location(path.stem, path)
    if spec is None or spec.loader is None:
        raise RuntimeError(f"Failed to import {path}")
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    basemodels: List[type[BaseModel]] = []
    for attr_name in dir(module):
        candidate = getattr(module, attr_name)
        if (
            inspect.isclass(candidate)
            and issubclass(candidate, BaseModel)
            and candidate != BaseModel
        ):
            basemodels.append(candidate)
    return basemodels
def generate_typeddict(jsonschema_path: Path, documents_path=DOCUMENTS):
    """Generate a TypedDict module from a JSON schema file.

    Writes ``<schema stem>.py`` under *documents_path*, then prepends a
    ``# ruff: noqa`` line so the autogenerated code is not linted.
    """
    output_path = documents_path / f"{jsonschema_path.stem}.py"
    datamodel_code_generator.generate(
        input_=jsonschema_path,
        input_file_type=datamodel_code_generator.InputFileType.JsonSchema,
        output=output_path,
        output_model_type=datamodel_code_generator.DataModelType.TypingTypedDict,
        use_schema_description=True,
        use_field_description=True,
        use_annotated=True,
        field_constraints=True,
        wrap_string_literal=True,
    )
    # Rewrite the file with the lint suppression on top.
    generated = output_path.read_text()
    output_path.write_text("# ruff: noqa\n" + generated)
def get_jsonschema_path(jsonschema: Dict, parent_path=JSONSCHEMA) -> Path:
    """Map a schema's ``title`` to its snake_case ``.json`` file path."""
    filename = f"{to_snake(jsonschema['title'])}.json"
    return parent_path / filename
def generate_jsonschema(
    basemodel: type[BaseModel],
    jsonschema_parent_path=JSONSCHEMA,
    documents_parent_path=DOCUMENTS,
) -> Set[Path]:
    """Write the JSON schema (and, if it changed, the TypedDict module)
    for *basemodel* and recursively for each of its BaseModel parents.

    Returns the set of schema file paths written.
    """
    all_schema = set()
    # NOTE(review): pop() permanently removes json_schema_extra from the
    # model's config so model_json_schema() below excludes it — confirm no
    # caller needs the extra schema on the class afterwards.
    schema_extra: Dict[str, Any] = cast(
        Dict[str, Any], basemodel.model_config.pop("json_schema_extra", {})
    )
    model_jsonschema = basemodel.model_json_schema()
    jsonschema_path = get_jsonschema_path(
        model_jsonschema, parent_path=jsonschema_parent_path
    )
    # Compare against the saved file, ignoring the user-maintained extras;
    # a copy is passed because the comparison mutates its input.
    if jsonschema_differs_from_saved(
        model_jsonschema.copy(),
        jsonschema_path,
        ignore_schema=schema_extra,
    ):
        print(f"Detected change in {basemodel}, updating schema.")
        # Dump with the extra schema that we want to leave out of
        # the TypedDict conversion
        dump_json(model_jsonschema, jsonschema_path=jsonschema_path)
        generate_typeddict(jsonschema_path, documents_path=documents_parent_path)
    else:
        print(f"No change in {basemodel}.")
    # Dump the schema with the extra schema, also updates extra schema
    # if it's changed.
    dump_json(
        sort_schema(merge_dicts(model_jsonschema, schema_extra)),
        jsonschema_path=jsonschema_path,
    )
    all_schema.add(jsonschema_path)
    # Recurse up the inheritance tree so parent document schemas stay in
    # sync as well.
    for parent in [parent for parent in basemodel.__bases__ if parent is not BaseModel]:
        assert issubclass(
            parent, BaseModel
        )  # Parents of BaseModel's can only be other BaseModel
        all_schema.update(
            generate_jsonschema(
                parent,
                jsonschema_parent_path=jsonschema_parent_path,
                documents_parent_path=documents_parent_path,
            )
        )
    return all_schema
def generate():
    """Regenerate every document schema/TypedDict and rewrite the package
    ``__init__.py`` with one star-import per generated module."""
    all_schema: Set[Path] = set()
    for basemodel in ALL_BASEMODELS:
        all_schema.update(generate_jsonschema(basemodel))
    # Using the schema path since it will have the same stem as the TypedDict files
    import_lines = sorted(
        f"from .{schema_path.stem} import * # noqa: F403"
        for schema_path in all_schema
    )
    with open(DOCUMENTS / "__init__.py", "w") as f:
        f.write("\n".join(import_lines) + "\n")