diff --git a/cli/.gitignore b/cli/.gitignore new file mode 100644 index 0000000..ad4a1f1 --- /dev/null +++ b/cli/.gitignore @@ -0,0 +1,176 @@ +# Created by https://www.toptal.com/developers/gitignore/api/python +# Edit at https://www.toptal.com/developers/gitignore?templates=python + +### Python ### +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. 
+#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. 
+#.idea/ + +### Python Patch ### +# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration +poetry.toml + +# ruff +.ruff_cache/ + +# LSP config files +pyrightconfig.json + +# End of https://www.toptal.com/developers/gitignore/api/python diff --git a/cli/README.md b/cli/README.md new file mode 100644 index 0000000..6b50306 --- /dev/null +++ b/cli/README.md @@ -0,0 +1,54 @@ +# OsInts: the OpenSearch Integrations CLI + +The OsInts CLI is a utility CLI for developing integrations with OpenSearch Integrations. +It provides a few convenience methods: + +- `diff`: Type check your integration given a sample data record and the appropriate SS4O schema. + +## Installation + +Use the package manager [pip](https://pip.pypa.io/en/stable/) to install the CLI. + +```bash +$ cd cli +$ pip install . +... +Successfully installed osints-0.1.1 +$ osints +``` + +If you want the installation to be editable (for development), you can either specify the editable flag: + +```bash +$ pip install --editable . +$ osints +``` + +Or you can skip the install entirely and run it as a module: + +```bash +$ python3 -m src.main +``` + +## Usage + +See `osints --help` for a summary of all commands. + +### Usage: `diff` + +Here's an example usage of `diff` on the [current (buggy) version of the Nginx integration](https://github.com/opensearch-project/dashboards-observability/tree/6d5bd478704dc7342b1471767ced7036bb23f335/server/adaptors/integrations/__data__/repository/nginx): +```bash +$ osints diff --mapping schemas/logs-1.0.0.mapping.json --data data/sample.json +- event.category: ["web"] ++ event.category: "keyword" +- event.type: ["access"] ++ event.type: "keyword" +- http.response.status_code: "200" ++ http.response.status_code: "integer" +- span_id: "abcdef1010" +- trace_id: "102981ABCD2901" +``` + +From this, we can gather: +- `event.category`, `event.type`, and `http.response.status_code` are all the wrong type. 
The first two should be a `keyword` instead of a list of strings, while the latter should be an integer `200` instead of a string `"200"`. +- `span_id` and `trace_id` are present in the data but not accounted for in the schema. This indicates that they are either redundant or incorrectly named. In this case, it turns out to be the latter, there are appropriate `spanId` and `traceId` fields. diff --git a/cli/setup.py b/cli/setup.py new file mode 100644 index 0000000..504a8ea --- /dev/null +++ b/cli/setup.py @@ -0,0 +1,14 @@ +from setuptools import setup, find_packages + +setup( + name="osints", + version="0.1.1", + packages=find_packages(), + include_package_data=True, + install_requires=[ + "beartype", + "click", + ], + python_requires=">3.10.0", + entry_points={"console_scripts": ["osints = src.main:cli"]}, +) diff --git a/cli/src/__init__.py b/cli/src/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/cli/src/diff/__init__.py b/cli/src/diff/__init__.py new file mode 100644 index 0000000..9c2ffef --- /dev/null +++ b/cli/src/diff/__init__.py @@ -0,0 +1 @@ +from .diff import diff diff --git a/cli/src/diff/diff.py b/cli/src/diff/diff.py new file mode 100644 index 0000000..65ef24e --- /dev/null +++ b/cli/src/diff/diff.py @@ -0,0 +1,157 @@ +from beartype import beartype +import click +import json +import os.path +import glob + + +@beartype +def load_mapping(mapping: str) -> dict[str, dict]: + with open(mapping, "r") as mapping_file: + data = json.load(mapping_file) + properties = data.get("template", {}).get("mappings", {}).get("properties") + if properties is None: + return {} + composed_of = data.get("composed_of", []) + curr_dir = os.path.dirname(mapping) + for item in composed_of: + item_glob = glob.glob(os.path.join(curr_dir, f"{item}*")) + if len(item_glob) == 0: + click.secho( + f"ERROR: mapping file {mapping} references component {item}, which does not exist.", + err=True, + fg="red", + ) + raise click.Abort() + if properties.get(item) is not 
None: + click.secho( + f"ERROR: mapping file {mapping} references component {item} and defines conflicting key '{item}'", + err=True, + fg="red", + ) + raise click.Abort() + # Greedily take any mapping that matches the name for now. + # Later, configuration will need to be implemented. + if len(item_glob) > 1: + click.secho( + f"WARNING: found more than one mapping for component {item}. Assuming {item_glob[0]}.", + err=True, + fg="yellow", + ) + properties.update(load_mapping(item_glob[0])) + return properties + + +@beartype +def flat_type_check(expect: str, actual: object) -> dict[str, dict]: + match expect: + case "text" | "keyword": + if not isinstance(actual, str): + return {"expected": expect, "actual": actual} + case "long" | "integer": + if not isinstance(actual, int): + return {"expected": expect, "actual": actual} + case "alias": + # We assume aliases were already unwrapped by the caller and ignore them. + return {} + case "date": + if not isinstance(actual, str) and not isinstance(actual, int): + return {"expected": expect, "actual": actual} + case _: + click.secho(f"WARNING: unknown type '{expect}'", err=True, fg="yellow") + return {} + + +@beartype +def get_type(mapping: dict) -> str | dict: + if mapping.get("properties"): + return { + key: get_type(value) for key, value in mapping.get("properties").items() + } + return mapping.get("type", "unknown") + + +@beartype +def do_check( + mapping: dict[str, dict], data: dict[str, object], show_missing: bool = False +) -> dict[str, dict]: + result = {} + for key, value in mapping.items(): + if key not in data: + if show_missing and value.get("type") != "alias": + result[key] = {"expected": get_type(value), "actual": None} + continue + elif "properties" in value and isinstance(data[key], dict): + check = do_check(value["properties"], data[key], show_missing) + if check != {}: + result[key] = check + elif value.get("type") == "alias": + # Unwrap aliases split by '.' 
+ value_path = value["path"].split(".") + curr_data = data + for step in value_path[:-1]: + if step not in curr_data: + curr_data[step] = {} + curr_data = curr_data[step] + curr_data[value_path[-1]] = data[key] + elif "type" in value: + check = flat_type_check(value["type"], data[key]) + if check != {}: + result[key] = check + for key, value in data.items(): + if key not in mapping: + result[key] = {"expected": None, "actual": value} + return result + + +@beartype +def output_diff(difference: dict[str, object], prefix: str = "") -> None: + for key, value in sorted(difference.items()): + out_key = prefix + key + if "expected" not in value and "actual" not in value: + output_diff(value, f"{prefix}{key}.") + if value.get("actual") is not None: + click.secho(f"- {out_key}: {json.dumps(value.get('actual'))}", fg="red") + if value.get("expected") is not None: + click.secho(f"+ {out_key}: {json.dumps(value.get('expected'))}", fg="green") + + +@click.command() +@click.option( + "--mapping", + type=click.Path(exists=True, readable=True), + help="The mapping for the format the data should have", +) +@click.option( + "--data", + type=click.Path(exists=True, readable=True), + help="The location of data to validate", +) +@click.option( + "--json", + "output_json", + is_flag=True, + help="Output machine-readable JSON instead of the default diff format", +) +@click.option( + "--show-missing", + "show_missing", + is_flag=True, + help="Output fields that are expected in the mappings but missing in the data", +) +def diff(mapping, data, output_json, show_missing): + """Type check your integration given a sample data record and the appropriate SS4O schema.""" + properties = load_mapping(mapping) + with open(data, "r") as data_file: + data_json = json.load(data_file) + if isinstance(data_json, list): + # Unwrap list of data, assume first record is representative + data_json = data_json[0] + check = do_check(properties, data_json, show_missing) + if output_json: + 
click.echo(json.dumps(check, sort_keys=True)) + else: + output_diff(check) + +if __name__ == "__main__": + diff() diff --git a/cli/src/main.py b/cli/src/main.py new file mode 100644 index 0000000..ebadf1a --- /dev/null +++ b/cli/src/main.py @@ -0,0 +1,14 @@ +import click +from .diff import diff + +@click.group() +def cli(): + """Various tools for working with OpenSearch Integrations.""" + pass + + +cli.add_command(diff) + + +if __name__ == "__main__": + cli()