diff --git a/.github/workflows/code-coverage.yml b/.github/workflows/code-coverage.yml index 91a98060..255e3bc2 100644 --- a/.github/workflows/code-coverage.yml +++ b/.github/workflows/code-coverage.yml @@ -3,14 +3,12 @@ name: Code Coverage Evaluation on PR on: push: branches: - - develop - - develop_* + - develop_v1 paths: - 'packages/*/src/**' pull_request: branches: - - develop - - develop_* + - develop_v1 paths: - 'packages/*/src/**' types: diff --git a/.github/workflows/code-style.yml b/.github/workflows/code-style.yml index 6c4e59e7..8860bae7 100644 --- a/.github/workflows/code-style.yml +++ b/.github/workflows/code-style.yml @@ -3,14 +3,12 @@ name: Code style compliance check on: push: branches: - - develop - - develop_* + - develop_v1 paths: - 'packages/*/src/**' pull_request: branches: - - develop - - develop_* + - develop_v1 paths: - 'packages/*/src/**' types: diff --git a/.github/workflows/integration-testing.yml b/.github/workflows/integration-testing.yml index 0e4d4a0e..74824757 100644 --- a/.github/workflows/integration-testing.yml +++ b/.github/workflows/integration-testing.yml @@ -2,17 +2,13 @@ name: Integration Testing With stix-shifter and Live Data Sources on: push: branches: - - develop - - develop_* - - release + - develop_v1 paths: - 'packages/*/src/**' - 'pyproject.toml' pull_request: branches: - - develop - - develop_* - - release + - develop_v1 paths: - 'packages/*/src/**' - 'pyproject.toml' diff --git a/.github/workflows/stixshifter-module-verification.yml b/.github/workflows/stixshifter-module-verification.yml deleted file mode 100644 index 66949595..00000000 --- a/.github/workflows/stixshifter-module-verification.yml +++ /dev/null @@ -1,31 +0,0 @@ -name: Daily STIX-shifter Connector Package Verification Test - -on: - schedule: - # Run this once per day, towards the end of the day for keeping the most - # recent data point most meaningful (hours are interpreted in UTC). - - cron: "55 02 * * *" - workflow_dispatch: # Allow for running this manually. - -jobs: - verify-stixshifter: - runs-on: ubuntu-latest - defaults: - run: - shell: bash - working-directory: ./packages/kestrel_datasource_stixshifter - steps: - - uses: actions/checkout@v4 - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: '3.10' - - name: Install Python Tools - run: pip install --upgrade pip setuptools wheel - - name: Install kestrel_core - working-directory: ./packages/kestrel_core - run: pip install . 
- - name: Install kestrel_datasource_stixshifter - run: pip install .[test] - - name: Sample STIX-shifter Connector Package Verification on PyPI - run: pytest -vv tests/test_stixshifter.py -k test_verify_package_origin diff --git a/.github/workflows/unit-testing-kestrel2.yml b/.github/workflows/unit-testing-kestrel2.yml deleted file mode 100644 index 4113a1e1..00000000 --- a/.github/workflows/unit-testing-kestrel2.yml +++ /dev/null @@ -1,69 +0,0 @@ -name: Unit testing on PR - -on: - push: - branches: - - develop - - develop_* - paths: - - 'packages-nextgen/**' - pull_request: - branches: - - develop - - develop_* - paths: - - 'packages-nextgen/**' - types: - - opened - - reopened - - synchronize - -jobs: - test-kestrel-core: - strategy: - matrix: - os: [ubuntu-latest, macos-latest] - python-version: ['3.8', '3.9', '3.10', '3.11', '3.12'] - runs-on: ${{ matrix.os }} - defaults: - run: - shell: bash - working-directory: ./packages-nextgen/kestrel_core - steps: - - uses: actions/checkout@v4 - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: ${{ matrix.python-version }} - - name: Install Python Tools - run: pip install --upgrade pip setuptools wheel pytest - - name: Install kestrel_core - run: pip install . - - name: Unit testing - run: pytest -vv - - test-kestrel-interface-opensearch: - strategy: - matrix: - os: [ubuntu-latest, macos-latest] - python-version: ['3.8', '3.9', '3.10', '3.11', '3.12'] - runs-on: ${{ matrix.os }} - defaults: - run: - shell: bash - working-directory: ./packages-nextgen/kestrel_interface_opensearch - steps: - - uses: actions/checkout@v4 - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: ${{ matrix.python-version }} - - name: Install Python Tools - run: pip install --upgrade pip setuptools wheel pytest - - name: Install kestrel_core - working-directory: ./packages-nextgen/kestrel_core - run: pip install . - - name: Install kestrel_interface_opensearch - run: pip install . - - name: Unit testing - run: pytest -vv diff --git a/.github/workflows/unit-testing.yml b/.github/workflows/unit-testing.yml index 8af6b843..d23c33b1 100644 --- a/.github/workflows/unit-testing.yml +++ b/.github/workflows/unit-testing.yml @@ -3,14 +3,12 @@ name: Unit testing on PR on: push: branches: - - develop - - develop_* + - develop_v1 paths: - 'packages/**' pull_request: branches: - - develop - - develop_* + - develop_v1 paths: - 'packages/**' types: diff --git a/.github/workflows/unused-import.yml b/.github/workflows/unused-import.yml index e1174ba5..1ef42972 100644 --- a/.github/workflows/unused-import.yml +++ b/.github/workflows/unused-import.yml @@ -3,14 +3,12 @@ name: Unused imports check on: push: branches: - - develop - - develop_* + - develop_v1 paths: - 'packages/*/src/**' pull_request: branches: - - develop - - develop_* + - develop_v1 paths: - 'packages/*/src/**' types: diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 3fd92963..eebfd6a2 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -9,6 +9,26 @@ The format is based on `Keep a Changelog`_. 
Unreleased ========== +1.8.5 (2024-05-01) +================== + +Added +----- + +- cli/diag: add start/stop/last options +- subquery generation support in stix-shifter interface based on specified time window +- configuration doc on `subquery_time_window` + +Changed +------- + +- cli/diag: change default timeframe to last 5 minutes + +Fixed +----- + +- Repeated queries when stix-shifter pagination is off + 1.8.4 (2024-04-23) ================== diff --git a/packages-nextgen/kestrel_core/README.rst b/packages-nextgen/kestrel_core/README.rst deleted file mode 120000 index c768ff7d..00000000 --- a/packages-nextgen/kestrel_core/README.rst +++ /dev/null @@ -1 +0,0 @@ -../../README.rst \ No newline at end of file diff --git a/packages-nextgen/kestrel_core/pyproject.toml b/packages-nextgen/kestrel_core/pyproject.toml deleted file mode 100644 index e57a5bca..00000000 --- a/packages-nextgen/kestrel_core/pyproject.toml +++ /dev/null @@ -1,61 +0,0 @@ -[build-system] -requires = ["setuptools >= 68.2.2", "wheel"] -build-backend = "setuptools.build_meta" - -[project] -name = "kestrel_core" -version = "2.0.0" -description = "Kestrel Threat Hunting Language" -readme = "README.rst" -requires-python = ">=3.8" -license = {text = "Apache 2.0 License"} -maintainers = [ - {name = "Xiaokui Shu", email = "xiaokui.shu@ibm.com"}, - {name = "Paul Coccoli", email = "pcoccoli@us.ibm.com"}, -] -keywords = [ - "kestrel", - "language", - "DSL", - "cybersecurity", - "threat hunting", - "huntflow", - "entity", -] -classifiers = [ - "Topic :: Security", - "Operating System :: OS Independent", - "Development Status :: 4 - Beta", - "Programming Language :: Python :: 3", -] - -dependencies = [ - "typeguard>=4.1.5", - "pyyaml>=6.0.1", - "lark>=1.1.7", - "pandas>=2.0.3", - "pyarrow>=13.0.0", - "mashumaro>=3.10", - "networkx>=3.1", # networkx==3.2.1 only for Python>=3.9 - "SQLAlchemy>=2.0.23", - "dpath>=2.1.6", -] - -[project.optional-dependencies] -dev = [ - "black", -] -test = [ - "pytest", -] - -[project.urls] -Homepage = "https://github.com/opencybersecurityalliance/kestrel-lang" -Documentation = "https://kestrel.readthedocs.io/" -Repository = "https://github.com/opencybersecurityalliance/kestrel-lang.git" - -[tool.setuptools.packages.find] -where = ["src"] - -[tool.setuptools.package-data] -"*" = ["*.lark", "*.yaml"] diff --git a/packages-nextgen/kestrel_core/src/kestrel/__future__.py b/packages-nextgen/kestrel_core/src/kestrel/__future__.py deleted file mode 100644 index efe66a26..00000000 --- a/packages-nextgen/kestrel_core/src/kestrel/__future__.py +++ /dev/null @@ -1,14 +0,0 @@ -import sys -from typeguard import typechecked - - -"""Entrance to invoke any backward compatibility patch - -This module is for developers to quickly locate backward compatibility pathes -in Kestrel code and remove them through time. 
-""" - - -@typechecked -def is_python_older_than_minor_version(minor: int) -> bool: - return sys.version_info.minor < minor diff --git a/packages-nextgen/kestrel_core/src/kestrel/__init__.py b/packages-nextgen/kestrel_core/src/kestrel/__init__.py deleted file mode 100644 index 738b8b89..00000000 --- a/packages-nextgen/kestrel_core/src/kestrel/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from kestrel.session import Session diff --git a/packages-nextgen/kestrel_core/src/kestrel/cache/__init__.py b/packages-nextgen/kestrel_core/src/kestrel/cache/__init__.py deleted file mode 100644 index 66614485..00000000 --- a/packages-nextgen/kestrel_core/src/kestrel/cache/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from kestrel.cache.base import AbstractCache -from kestrel.cache.inmemory import InMemoryCache -from kestrel.cache.sqlite import SqliteCache diff --git a/packages-nextgen/kestrel_core/src/kestrel/cache/base.py b/packages-nextgen/kestrel_core/src/kestrel/cache/base.py deleted file mode 100644 index 4d1a94bb..00000000 --- a/packages-nextgen/kestrel_core/src/kestrel/cache/base.py +++ /dev/null @@ -1,100 +0,0 @@ -from __future__ import annotations -from pandas import DataFrame -from typing import MutableMapping -from uuid import UUID -from abc import abstractmethod - -from kestrel.config.internal import CACHE_INTERFACE_IDENTIFIER -from kestrel.interface import AbstractInterface - - -class AbstractCache(AbstractInterface, MutableMapping): - """Base class for Kestrel cache - - Additional @abstractmethod from AbstractInterface: - - - evaluate_graph() - """ - - @staticmethod - def schemes() -> Iterable[str]: - return [CACHE_INTERFACE_IDENTIFIER] - - @abstractmethod - def __del__(self): - """Delete the cache and release memory/disk space""" - ... - - @abstractmethod - def __getitem__(self, instruction_id: UUID) -> DataFrame: - """Get the dataframe for the cached instruction - - This method will automatically support `uuid in cache` - - Parameters: - instruction_id: id of the instruction - - Returns: - dataframe of the given (likely Variable) instruction - """ - ... - - @abstractmethod - def __setitem__(self, instruction_id: UUID, data: DataFrame): - """Store the dataframe of an instruction into cache - - Parameters: - - instruction_id: id of the instruction - - data: data associated with the instruction - """ - ... - - @abstractmethod - def __delitem__(self, instruction_id: UUID): - """Delete cached item - - Parameters: - instruction_id: id of the instruction - """ - ... - - @abstractmethod - def get_virtual_copy(self) -> AbstractCache: - """Create a virtual cache object from this cache - - This method needs to reimplement __del__, __getitem__, __setitem__, - __delitem__ to not actually hit the store media, e.g., SQLite. - - The virtual cache is useful for the implementation of the Explain() - instruction, pretending the dependent graphs are evaluated, so the - evaluation can continue towards the Return() instruction. - - Because Python invokes special methods from class methods, replacing - the __getitem__, __setitem__, and __delitem__ in the object does not - help. It is better to derive a subclass and replace __class__ of the - object to the subclass to correctly invoke the new set of __xitem___. - - https://docs.python.org/3/reference/datamodel.html#special-lookup - - And Python garbage collector could clean up the virtual cache when - not in use, so the __del__ method should be reimplemented to make - sure the store media is not closed. - """ - ... 
- - def store(self, instruction_id: UUID, data: DataFrame): - self[instruction_id] = data - - def __iter__(self) -> UUID: - """Return UUIDs of instructions cached - - Returns: - UUIDs in iterator - """ - return iter(self.cache_catalog) - - def __len__(self) -> int: - """How many items are cached""" - return len(self.cache_catalog) diff --git a/packages-nextgen/kestrel_core/src/kestrel/cache/inmemory.py b/packages-nextgen/kestrel_core/src/kestrel/cache/inmemory.py deleted file mode 100644 index 87557222..00000000 --- a/packages-nextgen/kestrel_core/src/kestrel/cache/inmemory.py +++ /dev/null @@ -1,136 +0,0 @@ -from copy import copy -from pandas import DataFrame -from typeguard import typechecked -from uuid import UUID -from typing import ( - Mapping, - MutableMapping, - Optional, - Iterable, - Any, -) - -from kestrel.cache.base import AbstractCache -from kestrel.ir.graph import IRGraphEvaluable -from kestrel.display import GraphletExplanation, NativeQuery -from kestrel.ir.instructions import ( - Instruction, - Return, - Explain, - Variable, - Filter, - SourceInstruction, - TransformingInstruction, -) -from kestrel.interface.codegen.dataframe import ( - evaluate_source_instruction, - evaluate_transforming_instruction, -) - - -@typechecked -class InMemoryCache(AbstractCache): - def __init__( - self, - initial_cache: Mapping[UUID, DataFrame] = {}, - session_id: Optional[UUID] = None, - ): - super().__init__(session_id) - self.cache: MutableMapping[UUID, DataFrame] = {} - - # update() will call __setitem__() internally - self.update(initial_cache) - - def __del__(self): - del self.cache - - def __getitem__(self, instruction_id: UUID) -> DataFrame: - return self.cache[self.cache_catalog[instruction_id]] - - def __delitem__(self, instruction_id: UUID): - del self.cache[self.cache_catalog[instruction_id]] - del self.cache_catalog[instruction_id] - - def __setitem__( - self, - instruction_id: UUID, - data: DataFrame, - ): - self.cache_catalog[instruction_id] = instruction_id.hex - self.cache[self.cache_catalog[instruction_id]] = data - - def get_virtual_copy(self) -> AbstractCache: - v = copy(self) - v.cache_catalog = copy(self.cache_catalog) - v.__class__ = InMemoryCacheVirtual - return v - - def evaluate_graph( - self, - graph: IRGraphEvaluable, - instructions_to_evaluate: Optional[Iterable[Instruction]] = None, - ) -> Mapping[UUID, DataFrame]: - mapping = {} - if not instructions_to_evaluate: - instructions_to_evaluate = graph.get_sink_nodes() - for instruction in instructions_to_evaluate: - df = self._evaluate_instruction_in_graph(graph, instruction) - self[instruction.id] = df - mapping[instruction.id] = df - return mapping - - def explain_graph( - self, - graph: IRGraphEvaluable, - instructions_to_explain: Optional[Iterable[Instruction]] = None, - ) -> Mapping[UUID, GraphletExplanation]: - mapping = {} - if not instructions_to_evaluate: - instructions_to_evaluate = graph.get_sink_nodes() - for instruction in instructions_to_evaluate: - dep_graph = graph.duplicate_dependent_subgraph_of_node(instruction) - graph_dict = dep_graph.to_dict() - query = NativeQuery("DataFrame", "") - mapping[instruction.id] = GraphletExplanation(graph_dict, query) - return mapping - - def _evaluate_instruction_in_graph( - self, graph: IRGraphEvaluable, instruction: Instruction - ) -> DataFrame: - if instruction.id in self: - df = self[instruction.id] - elif isinstance(instruction, SourceInstruction): - df = evaluate_source_instruction(instruction) - elif isinstance(instruction, TransformingInstruction): - trunk, r2n 
= graph.get_trunk_n_branches(instruction) - df = self._evaluate_instruction_in_graph(graph, trunk) - if isinstance(instruction, (Return, Explain)): - pass - elif isinstance(instruction, Variable): - self[instruction.id] = df - else: - if isinstance(instruction, Filter): - # replace each ReferenceValue with a list of values - instruction.resolve_references( - lambda x: list( - self._evaluate_instruction_in_graph(graph, r2n[x]).iloc[ - :, 0 - ] - ) - ) - df = evaluate_transforming_instruction(instruction, df) - else: - raise NotImplementedError(f"Unknown instruction type: {instruction}") - return df - - -@typechecked -class InMemoryCacheVirtual(InMemoryCache): - def __getitem__(self, instruction_id: UUID) -> Any: - return self.cache_catalog[instruction_id] - - def __delitem__(self, instruction_id: UUID): - del self.cache_catalog[instruction_id] - - def __setitem__(self, instruction_id: UUID, data: Any): - self.cache_catalog[instruction_id] = "virtual" + instruction_id.hex diff --git a/packages-nextgen/kestrel_core/src/kestrel/cache/sqlite.py b/packages-nextgen/kestrel_core/src/kestrel/cache/sqlite.py deleted file mode 100644 index 97b8fb13..00000000 --- a/packages-nextgen/kestrel_core/src/kestrel/cache/sqlite.py +++ /dev/null @@ -1,191 +0,0 @@ -import logging -from copy import copy -from typing import Iterable, Mapping, Optional, Union, Any -from uuid import UUID - -import sqlalchemy -from dateutil.parser import parse as dt_parser -from pandas import DataFrame, read_sql -from typeguard import typechecked - -from kestrel.cache.base import AbstractCache -from kestrel.interface.codegen.sql import SqlTranslator -from kestrel.ir.graph import IRGraphEvaluable -from kestrel.display import GraphletExplanation, NativeQuery -from kestrel.ir.instructions import ( - Construct, - Instruction, - Return, - Explain, - Variable, - Filter, - SourceInstruction, - TransformingInstruction, - SolePredecessorTransformingInstruction, -) - -_logger = logging.getLogger(__name__) - - -@typechecked -class SqliteTranslator(SqlTranslator): - def __init__(self, from_obj: Union[SqlTranslator, str]): - if isinstance(from_obj, SqlTranslator): - fc = from_obj.query.subquery(name=from_obj.associated_variable) - else: # str to represent table name - fc = sqlalchemy.table(from_obj) - super().__init__( - sqlalchemy.dialects.sqlite.dialect(), dt_parser, "time", fc - ) # FIXME: need mapping for timestamp? - self.associated_variable = None - - -@typechecked -class SqliteCache(AbstractCache): - def __init__( - self, - initial_cache: Optional[Mapping[UUID, DataFrame]] = None, - session_id: Optional[UUID] = None, - ): - super().__init__() - - basename = session_id or "cache" - self.db_path = f"{basename}.db" - - # for an absolute file path, the three slashes are followed by the absolute path - # for a relative path, it's also three slashes? 
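To answer the question in the comment above: in SQLAlchemy sqlite URLs, sqlite:/// is always followed by the path, so a relative path does use three slashes, and an absolute path ends up with four in total. A quick sketch (file names are illustrative):

    import sqlalchemy

    rel = sqlalchemy.create_engine("sqlite:///cache.db")      # relative to CWD
    ab = sqlalchemy.create_engine("sqlite:////tmp/cache.db")  # absolute: three slashes + /tmp/...
    mem = sqlalchemy.create_engine("sqlite://")               # in-memory, no file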
- self.engine = sqlalchemy.create_engine(f"sqlite:///{self.db_path}") - self.connection = self.engine.connect() - - if initial_cache: - for instruction_id, data in initial_cache.items(): - self[instruction_id] = data - - def __del__(self): - self.connection.close() - - def __getitem__(self, instruction_id: UUID) -> DataFrame: - return read_sql(self.cache_catalog[instruction_id], self.connection) - - def __delitem__(self, instruction_id: UUID): - table_name = self.cache_catalog[instruction_id] - self.connection.execute(sqlalchemy.text(f'DROP TABLE "{table_name}"')) - del self.cache_catalog[instruction_id] - - def __setitem__( - self, - instruction_id: UUID, - data: DataFrame, - ): - table_name = instruction_id.hex - self.cache_catalog[instruction_id] = table_name - data.to_sql(table_name, con=self.connection, if_exists="replace", index=False) - - def get_virtual_copy(self) -> AbstractCache: - v = copy(self) - v.cache_catalog = copy(self.cache_catalog) - v.__class__ = SqliteCacheVirtual - return v - - def evaluate_graph( - self, - graph: IRGraphEvaluable, - instructions_to_evaluate: Optional[Iterable[Instruction]] = None, - ) -> Mapping[UUID, DataFrame]: - mapping = {} - if not instructions_to_evaluate: - instructions_to_evaluate = graph.get_sink_nodes() - for instruction in instructions_to_evaluate: - _logger.debug(f"evaluate instruction: {instruction}") - translator = self._evaluate_instruction_in_graph(graph, instruction) - # TODO: may catch error in case evaluation starts from incomplete SQL - _logger.debug(f"SQL query generated: {translator.result_w_literal_binds()}") - mapping[instruction.id] = read_sql(translator.result(), self.connection) - return mapping - - def explain_graph( - self, - graph: IRGraphEvaluable, - instructions_to_explain: Optional[Iterable[Instruction]] = None, - ) -> Mapping[UUID, GraphletExplanation]: - mapping = {} - if not instructions_to_explain: - instructions_to_explain = graph.get_sink_nodes() - for instruction in instructions_to_explain: - dep_graph = graph.duplicate_dependent_subgraph_of_node(instruction) - graph_dict = dep_graph.to_dict() - translator = self._evaluate_instruction_in_graph(graph, instruction) - query = NativeQuery("SQL", str(translator.result_w_literal_binds())) - mapping[instruction.id] = GraphletExplanation(graph_dict, query) - return mapping - - def _evaluate_instruction_in_graph( - self, - graph: IRGraphEvaluable, - instruction: Instruction, - ) -> SqliteTranslator: - if instruction.id in self: - # cached in sqlite - table_name = self.cache_catalog[instruction.id] - translator = SqliteTranslator(table_name) - - elif isinstance(instruction, SourceInstruction): - if isinstance(instruction, Construct): - # cache the data - self[instruction.id] = DataFrame(instruction.data) - # pull the data to start a SqliteTranslator - table_name = self.cache_catalog[instruction.id] - translator = SqliteTranslator(table_name) - else: - raise NotImplementedError(f"Unknown instruction type: {instruction}") - - elif isinstance(instruction, TransformingInstruction): - trunk, r2n = graph.get_trunk_n_branches(instruction) - translator = self._evaluate_instruction_in_graph(graph, trunk) - - if isinstance(instruction, SolePredecessorTransformingInstruction): - if isinstance(instruction, (Return, Explain)): - pass - elif isinstance(instruction, Variable): - # start a new translator and use previous one as subquery - # this allows using the variable as a dependent node - # if the variable is a sink, `SELECT * FROM (subquery)` also works - 
translator.associated_variable = instruction.name - translator = SqliteTranslator(translator) - else: - translator.add_instruction(instruction) - - elif isinstance(instruction, Filter): - # replace each ReferenceValue with a subquery - # note that this subquery will be used as a value for the .in_ operator - # we should not use .subquery() here but just `Select` class - # otherwise, will get warning: - # SAWarning: Coercing Subquery object into a select() for use in IN(); - # please pass a select() construct explicitly - instruction.resolve_references( - lambda x: self._evaluate_instruction_in_graph(graph, r2n[x]).query - ) - translator.add_instruction(instruction) - - else: - raise NotImplementedError(f"Unknown instruction type: {instruction}") - - else: - raise NotImplementedError(f"Unknown instruction type: {instruction}") - - return translator - - -@typechecked -class SqliteCacheVirtual(SqliteCache): - def __getitem__(self, instruction_id: UUID) -> Any: - return self.cache_catalog[instruction_id] - - def __delitem__(self, instruction_id: UUID): - del self.cache_catalog[instruction_id] - - def __setitem__(self, instruction_id: UUID, data: Any): - self.cache_catalog[instruction_id] = instruction_id.hex + "v" - - def __del__(self): - pass diff --git a/packages-nextgen/kestrel_core/src/kestrel/cli.py b/packages-nextgen/kestrel_core/src/kestrel/cli.py deleted file mode 100644 index e69de29b..00000000 diff --git a/packages-nextgen/kestrel_core/src/kestrel/config/__init__.py b/packages-nextgen/kestrel_core/src/kestrel/config/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/packages-nextgen/kestrel_core/src/kestrel/config/internal.py b/packages-nextgen/kestrel_core/src/kestrel/config/internal.py deleted file mode 100644 index ed9fd2b1..00000000 --- a/packages-nextgen/kestrel_core/src/kestrel/config/internal.py +++ /dev/null @@ -1 +0,0 @@ -CACHE_INTERFACE_IDENTIFIER = "cache" diff --git a/packages-nextgen/kestrel_core/src/kestrel/config/kestrel.yaml b/packages-nextgen/kestrel_core/src/kestrel/config/kestrel.yaml deleted file mode 100644 index ccdd38b1..00000000 --- a/packages-nextgen/kestrel_core/src/kestrel/config/kestrel.yaml +++ /dev/null @@ -1,70 +0,0 @@ -# syntax default values -language: - default_variable: "_" - default_sort_order: "desc" - default_datasource_schema: "stixshifter" - default_analytics_schema: "python" - -# how a Kestrel session is executed -session: - cache_directory_prefix: "kestrel-session-" # under system temp directory - local_database_path: "local.db" - log_path: "session.log" - show_execution_summary: true - -# whether/how to prefetch all records/observations for entities -prefetch: - - # enable/disable prefetch for command - # - # If prefetch is enabled, Kestrel will send additional queries to the data - # source to search for related records regarding entities retrieved from the - # user-specified pattern, collecting more complete information (attributes, - # connections to other entities) of the entities from different records. - switch_per_command: - get: true - find: true - - # declare the list of entity types to not prefetch - # - # This can be used when a user finds prefetch hinders the performance with - # large amount of results for one or more generic type of entities. For - # example, the data source may have millions of records containing - # `C:\Windows\SYSTEM32\ntdll.dll` touched by all Windows processes in a short - # amount of time. 
Executing a Kestrel command `f = FIND file LINKED p` will - # retrieve the file from a process and then start prefetch to gain - # information/connections of the file from all processes. Retrieval of - # millions records will likely result in a performance issue, thus the user - # can put `file` in this list to disable prefetch for it. - excluded_entities: - - - # - file - # - user-account - # - x-oca-asset - - # Detailed logic to identify the same process from different records is more - # complex than many data source query language can express, so Kestrel - # retrieves potential same process candidate records and perform fine-grained - # process identification in Kestrel with these parameters. - process_identification: - pid_but_name_changed_time_begin_offset: -5 # seconds - pid_but_name_changed_time_end_offset: 5 # seconds - pid_and_name_time_begin_offset: -3600 # seconds - pid_and_name_time_end_offset: 3600 # seconds - pid_and_ppid_time_begin_offset: -3600 # seconds - pid_and_ppid_time_end_offset: 3600 # seconds - pid_and_name_and_ppid_time_begin_offset: -86400 # seconds - pid_and_name_and_ppid_time_end_offset: 86400 # seconds - -# option when generating STIX query -stixquery: - timerange_start_offset: -300 # seconds - timerange_stop_offset: 300 # seconds - support_id: false # STIX 2.0 does not support unique ID - -# debug options -debug: - env_var: "KESTREL_DEBUG" # debug mode if the environment variable exists - cache_directory_prefix: "kestrel-" # under system temp directory - session_exit_marker: "session.exited" - maximum_exited_session: 3 diff --git a/packages-nextgen/kestrel_core/src/kestrel/config/utils.py b/packages-nextgen/kestrel_core/src/kestrel/config/utils.py deleted file mode 100644 index 0b912e7a..00000000 --- a/packages-nextgen/kestrel_core/src/kestrel/config/utils.py +++ /dev/null @@ -1,50 +0,0 @@ -import os -import yaml -from pathlib import Path -import logging -from typeguard import typechecked -from typing import Mapping, Union - -from kestrel.utils import update_nested_dict, load_data_file - -CONFIG_DIR_DEFAULT = Path.home() / ".config" / "kestrel" -CONFIG_PATH_DEFAULT = CONFIG_DIR_DEFAULT / "kestrel.yaml" -CONFIG_PATH_ENV_VAR = "KESTREL_CONFIG" # override CONFIG_PATH_DEFAULT if provided - -_logger = logging.getLogger(__name__) - - -@typechecked -def load_default_config() -> Mapping: - _logger.debug(f"Loading default config file...") - default_config = load_data_file("kestrel.config", "kestrel.yaml") - config_with_envvar_expanded = os.path.expandvars(default_config) - config_content = yaml.safe_load(config_with_envvar_expanded) - return config_content - - -@typechecked -def load_user_config( - config_path_env_var: str, config_path_default: Union[str, Path] -) -> Mapping: - config_path_default = config_path_default.absolute().as_posix() - config_path = os.getenv(config_path_env_var, config_path_default) - config_path = os.path.expanduser(config_path) - config = {} - if config_path: - try: - with open(config_path, "r") as fp: - _logger.debug(f"User configuration file found: {config_path}") - config = yaml.safe_load(os.path.expandvars(fp.read())) - except FileNotFoundError: - _logger.debug(f"User configuration file not exist.") - return config - - -@typechecked -def load_config() -> Mapping: - config_default = load_default_config() - config_user = load_user_config(CONFIG_PATH_ENV_VAR, CONFIG_PATH_DEFAULT) - _logger.debug(f"User configuration loaded: {config_user}") - _logger.debug(f"Updating default config with user config...") - return 
update_nested_dict(config_default, config_user) diff --git a/packages-nextgen/kestrel_core/src/kestrel/display.py b/packages-nextgen/kestrel_core/src/kestrel/display.py deleted file mode 100644 index e6729f85..00000000 --- a/packages-nextgen/kestrel_core/src/kestrel/display.py +++ /dev/null @@ -1,34 +0,0 @@ -from typing import List, Union, Mapping -from dataclasses import dataclass -from mashumaro.mixins.json import DataClassJSONMixin -from pandas import DataFrame - - -@dataclass -class NativeQuery(DataClassJSONMixin): - # which query language - language: str - # what query statement - statement: str - - -@dataclass -class GraphletExplanation(DataClassJSONMixin): - # serialized IRGraph - graph: Mapping - # data source query - query: NativeQuery - - -@dataclass -class GraphExplanation(DataClassJSONMixin): - graphlets: List[GraphletExplanation] - - -# Kestrel Display Object -Display = Union[ - str, - dict, - DataFrame, - GraphExplanation, -] diff --git a/packages-nextgen/kestrel_core/src/kestrel/exceptions.py b/packages-nextgen/kestrel_core/src/kestrel/exceptions.py deleted file mode 100644 index cd088afe..00000000 --- a/packages-nextgen/kestrel_core/src/kestrel/exceptions.py +++ /dev/null @@ -1,120 +0,0 @@ -class KestrelError(Exception): - pass - - -class InstructionNotFound(KestrelError): - pass - - -class InvalidInstruction(KestrelError): - pass - - -class InvalidSeralizedGraph(KestrelError): - pass - - -class InvalidSeralizedInstruction(KestrelError): - pass - - -class InvalidDataSource(KestrelError): - pass - - -class VariableNotFound(KestrelError): - pass - - -class ReferenceNotFound(KestrelError): - pass - - -class DataSourceNotFound(KestrelError): - pass - - -class DuplicatedVariable(KestrelError): - pass - - -class DuplicatedReference(KestrelError): - pass - - -class DuplicatedDataSource(KestrelError): - pass - - -class DuplicatedSingletonInstruction(KestrelError): - pass - - -class MultiInterfacesInGraph(KestrelError): - pass - - -class MultiSourcesInGraph(KestrelError): - pass - - -class LargerThanOneIndegreeInstruction(KestrelError): - pass - - -class DanglingReferenceInFilter(KestrelError): - pass - - -class DanglingFilter(KestrelError): - pass - - -class DuplicatedReferenceInFilter(KestrelError): - pass - - -class MissingReferenceInFilter(KestrelError): - pass - - -class InvalidSerializedDatasourceInterfaceCacheCatalog(KestrelError): - pass - - -class InevaluableInstruction(KestrelError): - pass - - -class MappingParseError(KestrelError): - pass - - -class InterfaceNotFound(KestrelError): - pass - - -class IRGraphMissingNode(KestrelError): - pass - - -class InterfaceNotConfigured(KestrelError): - pass - - -class InvalidInterfaceImplementation(KestrelError): - pass - - -class ConflictingInterfaceScheme(KestrelError): - pass - - -class DataSourceError(KestrelError): - pass - - -class UnsupportedOperatorError(KestrelError): - """The data source doesn't support this operator""" - - pass diff --git a/packages-nextgen/kestrel_core/src/kestrel/frontend/__init__.py b/packages-nextgen/kestrel_core/src/kestrel/frontend/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/packages-nextgen/kestrel_core/src/kestrel/frontend/compile.py b/packages-nextgen/kestrel_core/src/kestrel/frontend/compile.py deleted file mode 100644 index cb1f897f..00000000 --- a/packages-nextgen/kestrel_core/src/kestrel/frontend/compile.py +++ /dev/null @@ -1,398 +0,0 @@ -# Lark Transformer - -import logging -from datetime import datetime, timedelta -from functools import reduce - -from 
dateutil.parser import parse as to_datetime -from lark import Transformer, Token -from typeguard import typechecked - -from kestrel.mapping.data_model import translate_comparison_to_ocsf -from kestrel.utils import unescape_quoted_string -from kestrel.ir.filter import ( - FExpression, - FComparison, - IntComparison, - FloatComparison, - StrComparison, - ListComparison, - RefComparison, - ReferenceValue, - MultiComp, - ListOp, - NumCompOp, - StrCompOp, - ExpOp, - BoolExp, - TimeRange, -) -from kestrel.ir.graph import ( - IRGraph, - compose, -) -from kestrel.ir.instructions import ( - Construct, - DataSource, - Filter, - Limit, - Offset, - ProjectAttrs, - ProjectEntity, - Reference, - Return, - Sort, - Variable, - Explain, -) -from kestrel.exceptions import IRGraphMissingNode - - -_logger = logging.getLogger(__name__) - - -DEFAULT_VARIABLE = "_" -DEFAULT_SORT_ORDER = "DESC" - - -@typechecked -def _unescape_quoted_string(s: str): - if s.startswith("r"): - return s[2:-1] - else: - return s[1:-1].encode("utf-8").decode("unicode_escape") - - -@typechecked -def _create_comp(field: str, op_value: str, value) -> FComparison: - # TODO: implement MultiComp - - if op_value in (ListOp.IN, ListOp.NIN): - op = ListOp - comp = RefComparison if isinstance(value, ReferenceValue) else ListComparison - elif isinstance(value, int): - op = NumCompOp - comp = IntComparison - elif isinstance(value, float): - op = NumCompOp - comp = FloatComparison - elif isinstance(value, ReferenceValue): - op = ListOp - op_value = ListOp.IN if op_value in (ListOp.IN, StrCompOp.EQ) else ListOp.NIN - comp = RefComparison - else: - op = StrCompOp - comp = StrComparison - return comp(field, op(op_value), value) - - -@typechecked -def _map_filter_exp( - entity_name: str, filter_exp: FExpression, property_map: dict -) -> FExpression: - if isinstance( - filter_exp, - (IntComparison, FloatComparison, StrComparison, ListComparison, RefComparison), - ): - # get the field - field = filter_exp.field - # add entity to field if it doesn't have one already - if ":" not in field: - field = f"{entity_name}:{field}" - # map field to new syntax (e.g. STIX to OCSF) - # TODO: ECS to OCSF? Would need to merge STIX and ECS data model maps. - map_result = translate_comparison_to_ocsf( - property_map, field, filter_exp.op, filter_exp.value - ) - # Build a MultiComp if field maps to several values - if len(map_result) > 1: - filter_exp = MultiComp( - ExpOp.OR, - [_create_comp(field, op, value) for field, op, value in map_result], - ) - elif len(map_result) == 1: # it maps to a single value - mapping = map_result[0] - _logger.debug("mapping = %s", mapping) - field = mapping[0] - prefix = f"{entity_name}." 
- if field.startswith(prefix): - # Need to prune the entity name - field = field[len(prefix) :] - filter_exp.field = field - filter_exp.op = mapping[1] - filter_exp.value = mapping[2] - else: # pass-through - pass - # TODO: for RefComparison, map the attribute in value (may not be possible here) - - elif isinstance(filter_exp, BoolExp): - # recursively map boolean expressions - filter_exp = BoolExp( - _map_filter_exp(entity_name, filter_exp.lhs, property_map), - filter_exp.op, - _map_filter_exp(entity_name, filter_exp.rhs, property_map), - ) - elif isinstance(filter_exp, MultiComp): - # normally, this should be unreachable - # if this becomes a valid case, we need to change - # the definition of MultiComp to accept a MultiComp - # in addition to Comparisons in its `comps` list - filter_exp = MultiComp( - filter_exp.op, - [_map_filter_exp(entity_name, x, property_map) for x in filter_exp.comps], - ) - return filter_exp - - -@typechecked -def _add_reference_branches_for_filter(graph: IRGraph, filter_node: Filter): - if filter_node not in graph: - raise IRGraphMissingNode("Internal error: filter node expected") - else: - for refvalue in filter_node.get_references(): - r = graph.add_node(Reference(refvalue.reference)) - p = graph.add_node(ProjectAttrs([refvalue.attribute]), r) - graph.add_edge(p, filter_node) - - -class _KestrelT(Transformer): - def __init__( - self, - default_variable=DEFAULT_VARIABLE, - default_sort_order=DEFAULT_SORT_ORDER, - token_prefix="", - entity_map={}, - property_map={}, - ): - # token_prefix is the modification by Lark when using `merge_transformers()` - self.default_variable = default_variable - self.default_sort_order = default_sort_order - self.token_prefix = token_prefix - self.entity_map = entity_map - self.property_map = property_map # TODO: rename to data_model_map? - super().__init__() - - def start(self, args): - return reduce(compose, args, IRGraph()) - - def statement(self, args): - return args[0] - - def assignment(self, args): - # TODO: move the var+var into expression in Lark - variable_node = Variable(args[0].value) - graph, root = args[1] - graph.add_node(variable_node, root) - return graph - - def expression(self, args): - # TODO: add more clauses than WHERE and ATTR - # TODO: think about order of clauses when turning into nodes - graph = IRGraph() - reference = graph.add_node(args[0]) - root = reference - if len(args) > 1: - for clause in args[1:]: - graph.add_node(clause, root) - root = clause - if isinstance(clause, Filter): - # this is where_clause - _add_reference_branches_for_filter(graph, clause) - return graph, root - - def vtrans(self, args): - if len(args) == 1: - return Reference(args[0].value) - else: - # TODO: transformer support - ... - - def new(self, args): - # TODO: use entity type - - graph = IRGraph() - if len(args) == 1: - # Try to get entity type from first entity - data = args[0] - else: - data = args[1] - data_node = Construct(data) - graph.add_node(data_node) - return graph, data_node - - def var_data(self, args): - if isinstance(args[0], Token): - # TODO - ... 
- else: - v = args[0] - return v - - def json_objs(self, args): - return args - - def json_obj(self, args): - return dict(args) - - def json_pair(self, args): - v = args[0].value - if "ESCAPED_STRING" in args[0].type: - v = unescape_quoted_string(v) - return v, args[1] - - def json_value(self, args): - v = args[0].value - if args[0].type == self.token_prefix + "ESCAPED_STRING": - v = unescape_quoted_string(v) - elif args[0].type == self.token_prefix + "NUMBER": - v = float(v) if "." in v else int(v) - return v - - def get(self, args): - graph = IRGraph() - entity_name = args[0].value - mapped_entity_name = self.entity_map.get(entity_name, entity_name) - - # prepare Filter node - filter_node = args[2] - filter_node.exp = _map_filter_exp( - args[0].value, filter_node.exp, self.property_map - ) - - # add basic Source and Filter nodes - source_node = graph.add_node(args[1]) - filter_node = graph.add_node(filter_node, source_node) - - # add reference nodes if used in Filter - _add_reference_branches_for_filter(graph, filter_node) - - projection_node = graph.add_node(ProjectEntity(mapped_entity_name), filter_node) - root = projection_node - if len(args) > 3: - for arg in args[3:]: - if isinstance(arg, TimeRange): - filter_node.timerange = args[3] - elif isinstance(arg, Limit): - root = graph.add_node(arg, projection_node) - return graph, root - - def where_clause(self, args): - exp = args[0] - return Filter(exp) - - def attr_clause(self, args): - attrs = args[0].split(",") - attrs = [attr.strip() for attr in attrs] - return ProjectAttrs(attrs) - - def sort_clause(self, args): - # args[0] is Token('BY', 'BY') - return Sort(*args[1:]) - - def expression_or(self, args): - return BoolExp(args[0], ExpOp.OR, args[1]) - - def expression_and(self, args): - return BoolExp(args[0], ExpOp.AND, args[1]) - - def comparison_std(self, args): - """Emit a Comparison object for a Filter""" - field = args[0].value - op = args[1] - value = args[2] - comp = _create_comp(field, op, value) - return comp - - def op(self, args): - """Convert operator token to a plain string""" - return " ".join([arg.upper() for arg in args]) - - def op_keyword(self, args): - """Convert operator token to a plain string""" - return args[0].value - - # Literals - def advanced_string(self, args): - value = _unescape_quoted_string(args[0].value) - return value - - def reference_or_simple_string(self, args): - vname = args[0].value - attr = args[1].value if len(args) > 1 else None - return ReferenceValue(vname, attr) - - def number(self, args): - v = args[0].value - try: - return int(v) - except ValueError: - return float(v) - - def value(self, args): - return args[0] - - def literal_list(self, args): - return args - - def literal(self, args): - return args[0] - - def datasource(self, args): - return DataSource(args[0].value) - - # Timespans - def timespan_relative(self, args): - num = int(args[0]) - unit = args[1] - if unit == "DAY": - delta = timedelta(days=num) - elif unit == "HOUR": - delta = timedelta(hours=num) - elif unit == "MINUTE": - delta = timedelta(minutes=num) - elif unit == "SECOND": - delta = timedelta(seconds=num) - stop = datetime.utcnow() - start = stop - delta - return TimeRange(start, stop) - - def timespan_absolute(self, args): - start = to_datetime(args[0]) - stop = to_datetime(args[1]) - return TimeRange(start, stop) - - def day(self, _args): - return "DAY" - - def hour(self, _args): - return "HOUR" - - def minute(self, _args): - return "MINUTE" - - def second(self, _args): - return "SECOND" - - def timestamp(self, 
args): - return args[0] - - # Limit - def limit_clause(self, args): - n = int(args[0]) - return Limit(n) - - def offset_clause(self, args): - n = int(args[0]) - return Offset(n) - - def disp(self, args): - graph, root = args[0] - graph.add_node(Return(), root) - return graph - - def explain(self, args): - graph = IRGraph() - reference = graph.add_node(Reference(args[0].value)) - explain = graph.add_node(Explain(), reference) - graph.add_node(Return(), explain) - return graph diff --git a/packages-nextgen/kestrel_core/src/kestrel/frontend/completer.py b/packages-nextgen/kestrel_core/src/kestrel/frontend/completer.py deleted file mode 100644 index e69de29b..00000000 diff --git a/packages-nextgen/kestrel_core/src/kestrel/frontend/kestrel.lark b/packages-nextgen/kestrel_core/src/kestrel/frontend/kestrel.lark deleted file mode 100644 index 1e00bfc9..00000000 --- a/packages-nextgen/kestrel_core/src/kestrel/frontend/kestrel.lark +++ /dev/null @@ -1,302 +0,0 @@ -// -// Kestrel Grammar -// - -// -// A huntflow is a sequence of statements -// - -start: statement* - -statement: assignment - | command_no_result - -// If no VARIABLE is given, default to _ in post-parsing -// For assign or merge, the result variable is required -// This eliminates meaningless huntflows like `var1 var2 var3` -assignment: VARIABLE "=" expression - | VARIABLE "=" VARIABLE ("+" VARIABLE)+ - | (VARIABLE "=")? command_with_result - -// "?" at the beginning will inline command -?command_with_result: find - | get - | group - | join - | load - | new - | sort - -?command_no_result: apply - | explain - | describe - | disp - | info - | save - -// -// All commands -// - -find: "FIND"i ENTITY_TYPE RELATION (REVERSED)? VARIABLE where_clause? timespan? limit_clause? - -get: "GET"i ENTITY_TYPE ("FROM"i datasource)? where_clause timespan? limit_clause? - -group: "GROUP"i VARIABLE BY grp_spec ("WITH"i agg_list)? - -join: "JOIN"i VARIABLE "," VARIABLE (BY ATTRIBUTE "," ATTRIBUTE)? - -load: "LOAD"i stdpath ("AS"i ENTITY_TYPE)? - -new: "NEW"i ENTITY_TYPE? var_data - -sort: "SORT"i VARIABLE BY ATTRIBUTE (ASC|DESC)? - -apply: "APPLY"i analytics_uri "ON"i variables ("WITH"i args)? - -disp: "DISP"i expression - -info: "INFO"i VARIABLE - -save: "SAVE"i VARIABLE "TO"i stdpath - -describe: "DESCRIBE"i var_attr - -explain: "EXPLAIN"i VARIABLE - -// -// Variable definition -// - -variables: VARIABLE ("," VARIABLE)* - -VARIABLE: CNAME - -// -// Expression -// - -expression: vtrans where_clause? attr_clause? sort_clause? limit_clause? offset_clause? - -// not use rule name `transform` since it is a special function in Lark -// the function in transformer will mal-function in `merge_transformers()` -vtrans: transformer "(" VARIABLE ")" - | VARIABLE - -transformer: TIMESTAMPED - | ADDOBSID - | RECORDS - -TIMESTAMPED: "TIMESTAMPED"i -ADDOBSID: "ADDOBSID"i -RECORDS: "RECORDS"i - -where_clause: "WHERE"i ecg_pattern -attr_clause: "ATTR"i ATTRIBUTES -sort_clause: "SORT"i BY ATTRIBUTE (ASC|DESC)? 
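// Illustrative statement (variable and attribute names are hypothetical)
// exercising the optional clauses in the order the `expression` rule above
// requires -- WHERE, then ATTR, SORT, LIMIT, OFFSET:
//
//   DISP procs WHERE name = 'cmd.exe' ATTR name, pid SORT BY pid DESC LIMIT 10 OFFSET 5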
-limit_clause: "LIMIT"i INT -offset_clause: "OFFSET"i INT - -?ecg_pattern: disjunction - | "[" disjunction "]" // STIX compatible - -?disjunction: conjunction - | disjunction "OR"i conjunction -> expression_or - -?conjunction: comparison - | conjunction "AND"i comparison -> expression_and - -?comparison: comparison_std - | comparison_null - | "(" disjunction ")" - -comparison_std: ENTITY_ATTRIBUTE_PATH op value -comparison_null: ENTITY_ATTRIBUTE_PATH null_op NULL - -// -// Timespan -// - -?timespan: "start"i timestamp "stop"i timestamp -> timespan_absolute - | "last"i INT timeunit -> timespan_relative - -?timeunit: day - | hour - | minute - | second - -day: "days"i | "day"i | "d"i -hour: "hours"i | "hour"i | "h"i -minute: "minutes"i | "minute"i | "m"i -second: "seconds"i | "second"i | "s"i - -timestamp: ISOTIMESTAMP - | "\"" ISOTIMESTAMP "\"" - | "'" ISOTIMESTAMP "'" - | "t\"" ISOTIMESTAMP "\"" - | "t'" ISOTIMESTAMP "'" - -ISOTIMESTAMP: /\d{4}-[01]\d-[0-3]\dT[0-2]\d:[0-5]\d:[0-5]\d(\.\d+)?Z/ - -// -// FIND command constructs -// - -RELATION: WORD - -// -// GROUP command constructs -// - -grp_spec: grp_expr ("," grp_expr)* - -grp_expr: ATTRIBUTE - | bin_func - -// No other scalar funcs are supported yet -bin_func: "BIN"i "(" ATTRIBUTE "," INT timeunit? ")" - -agg_list: agg ("," agg)* - -agg: funcname "(" ATTRIBUTE ")" ("AS"i alias)? - -?funcname: (MIN|MAX|SUM|AVG|COUNT|NUNIQUE) -MIN: "MIN"i -MAX: "MAX"i -SUM: "SUM"i -AVG: "AVG"i -COUNT: "COUNT"i -NUNIQUE: "NUNIQUE"i - -?alias: ECNAME - -// -// GET command constructs -// - -datasource: DATASRC_SIMPLE - | DATASRC_ESCAPED - | VARIABLE - -DATASRC_SIMPLE: PATH_SIMPLE ("," PATH_SIMPLE)* -DATASRC_ESCAPED: PATH_ESCAPED - -// -// APPLY command constructs -// - -analytics_uri: ANALYTICS_SIMPLE - | ANALYTICS_ESCAPED - -ANALYTICS_SIMPLE: PATH_SIMPLE -ANALYTICS_ESCAPED: PATH_ESCAPED - -// -// Two-level JSON in command NEW -// - -// use terminal to load the entire var_data without parsing into it -var_data: "[" (RAW_VALUES | json_objs) "]" - -RAW_VALUES: ESCAPED_STRING_WS ("," ESCAPED_STRING_WS)* - -json_objs: json_obj ("," json_obj)* -json_obj: "{" json_pair ("," json_pair)* "}" -json_pair: ESCAPED_STRING ":" json_value -json_value: (NUMBER|ESCAPED_STRING|TRUE|FALSE|NULL) - -// -// Arguments -// - -args: arg_kv_pair ("," arg_kv_pair)* - -arg_kv_pair: ECNAME "=" value - -// -// Shared keywords -// - -BY: "BY"i -ASC: "ASC"i -DESC: "DESC"i -REVERSED: "BY"i -TRUE: "TRUE"i -FALSE: "FALSE"i -NULL: "NULL"i -IN: "IN"i -LIKE: "LIKE"i -MATCHES: "MATCHES"i -IS: "IS"i -NOT: "NOT"i -ISSUBSET: "ISSUBSET"i -ISSUPERSET: "ISSUPERSET"i - -op: OP_SIGN - | NOT? op_keyword - -OP_SIGN: /([!=]?=|[<>]=?)/ - -op_keyword: IN - | LIKE - | MATCHES - | ISSUBSET - | ISSUPERSET - -null_op: IS NOT? - -// -// Common language constructs -// - -value: literal_list - | literal - -literal: reference_or_simple_string - | string - | number - -literal_list: "(" literal ("," literal)* ")" - | "[" literal ("," literal)* "]" - -reference_or_simple_string: ECNAME ("." ATTRIBUTE)? - -var_attr: ECNAME "." ATTRIBUTE - -?string: advanced_string - -number: NUMBER - -ENTITY_ATTRIBUTE_PATH: (ENTITY_TYPE ":")? ATTRIBUTE - -ENTITY_TYPE: ECNAME - -stdpath: PATH_SIMPLE - | PATH_ESCAPED - -// TODO: support attributes without quote for dash -// x.hash.SHA-256 instead of x.hash.'SHA-256' -ATTRIBUTE: ECNAME "[*]"? ("." ECNAME_W_QUOTE)* -ATTRIBUTES: ATTRIBUTE (WS* "," WS* ATTRIBUTE)* - -ECNAME: (LETTER|"_") (LETTER|DIGIT|"_"|"-")* -ECNAME_W_QUOTE: (LETTER|DIGIT|"_"|"-"|"'")+ - -PATH_SIMPLE: (ECNAME "://")? 
(LETTER|DIGIT|"_"|"-"|"."|"/")+ - -PATH_ESCAPED: "\"" (ECNAME "://")? _STRING_ESC_INNER "\"" - | "'" (ECNAME "://")? _STRING_ESC_INNER "'" - -ESCAPED_STRING: "\"" _STRING_ESC_INNER "\"" - | "'" _STRING_ESC_INNER "'" -ESCAPED_STRING_WS: WS* ESCAPED_STRING WS* - -// nearly Python string, but no [ubf]? as prefix options -// check Lark example of Python parser for reference -advanced_string: /(r?)("(?!"").*?(? COMMENT - -%ignore WS -%ignore COMMENT diff --git a/packages-nextgen/kestrel_core/src/kestrel/frontend/parser.py b/packages-nextgen/kestrel_core/src/kestrel/frontend/parser.py deleted file mode 100644 index 0ff482c5..00000000 --- a/packages-nextgen/kestrel_core/src/kestrel/frontend/parser.py +++ /dev/null @@ -1,70 +0,0 @@ -# parse Kestrel syntax, apply frontend mapping, transform to IR - -import logging -import os -from itertools import chain - -from kestrel.frontend.compile import _KestrelT -from kestrel.mapping.data_model import reverse_mapping -from kestrel.utils import load_data_file -from lark import Lark -from typeguard import typechecked -import yaml - - -_logger = logging.getLogger(__name__) - - -frontend_mapping = {} - - -@typechecked -def get_mapping(mapping_type: str, mapping_package: str, mapping_filepath: str) -> dict: - global frontend_mapping - mapping = frontend_mapping.get(mapping_type) - if mapping is not None: - return mapping - try: - mapping_str = load_data_file(mapping_package, mapping_filepath) - mapping = yaml.safe_load(mapping_str) - if mapping_type == "property": - # New data model map is always OCSF->native - mapping = reverse_mapping(mapping) - frontend_mapping[mapping_type] = mapping - except Exception as ex: - _logger.error("Failed to load %s", mapping_str, exc_info=ex) - mapping = None # FIXME: this is not a dict - return mapping - - -@typechecked -def get_keywords(): - # TODO: this Kestrel1 code needs to be updated - grammar = load_data_file("kestrel.frontend", "kestrel.lark") - parser = Lark(grammar, parser="lalr") - alphabet_patterns = filter(lambda x: x.pattern.value.isalnum(), parser.terminals) - # keywords = [x.pattern.value for x in alphabet_patterns] + all_relations - keywords = [x.pattern.value for x in alphabet_patterns] - keywords_lower = map(lambda x: x.lower(), keywords) - keywords_upper = map(lambda x: x.upper(), keywords) - keywords_comprehensive = list(chain(keywords_lower, keywords_upper)) - return keywords_comprehensive - - -# Create a single, reusable transformer -_parser = Lark( - load_data_file("kestrel.frontend", "kestrel.lark"), - parser="lalr", - transformer=_KestrelT( - entity_map=get_mapping( - "entity", "kestrel.mapping", os.path.join("entityname", "stix.yaml") - ), - property_map=get_mapping( - "property", "kestrel.mapping", os.path.join("entityattribute", "stix.yaml") - ), - ), -) - - -def parse_kestrel(stmts): - return _parser.parse(stmts) diff --git a/packages-nextgen/kestrel_core/src/kestrel/interface/__init__.py b/packages-nextgen/kestrel_core/src/kestrel/interface/__init__.py deleted file mode 100644 index 3c4b25e5..00000000 --- a/packages-nextgen/kestrel_core/src/kestrel/interface/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from kestrel.interface.base import AbstractInterface -from kestrel.interface.manager import InterfaceManager diff --git a/packages-nextgen/kestrel_core/src/kestrel/interface/base.py b/packages-nextgen/kestrel_core/src/kestrel/interface/base.py deleted file mode 100644 index 50f5601f..00000000 --- a/packages-nextgen/kestrel_core/src/kestrel/interface/base.py +++ /dev/null @@ -1,134 +0,0 @@ -import json 
-from abc import ABC, abstractmethod -from pandas import DataFrame -from uuid import UUID -from typing import ( - Mapping, - MutableMapping, - Optional, - Iterable, -) - -from kestrel.display import GraphletExplanation -from kestrel.ir.instructions import Instruction -from kestrel.ir.graph import IRGraphEvaluable -from kestrel.exceptions import ( - InvalidSerializedDatasourceInterfaceCacheCatalog, -) - - -MODULE_PREFIX = "kestrel_interface_" - - -class AbstractInterface(ABC): - """Abstract class for datasource/analytics interface - - Concepts: - - - Think an interface as a datalake - - - Think a datasource as a table in the datalake - - Attributes: - - session_id: the optional information to derive table name in datalake - - datasources: map a datasource name to datalake table name - - cache_catalog: map a cached item (instruction.id) to datalake table/view name - """ - - def __init__( - self, - serialized_cache_catalog: Optional[str] = None, - session_id: Optional[UUID] = None, - ): - self.session_id = session_id - self.cache_catalog: MutableMapping[UUID, str] = {} - - if serialized_cache_catalog: - try: - self.cache_catalog = json.loads(serialized_cache_catalog) - except: - raise InvalidSerializedDatasourceInterfaceCacheCatalog() - - # Python 3.13 will drop chain of @classmethod and @property - # use @staticmethod instead (cannot make it a property) - @staticmethod - @abstractmethod - def schemes() -> Iterable[str]: - """The schemes to specify the interface - - Each scheme should be defined as ``("_"|LETTER) ("_"|LETTER|DIGIT)*`` - """ - ... - - @abstractmethod - def store( - self, - instruction_id: UUID, - data: DataFrame, - ): - """Create a new table in the datalake from a dataframe - - The name of the table is a function of instruction_id (and session_id) - in case there are conflicting tables in the datalake. - - The function can be implemented as a hashtable. If the hash collides - with an existing hash, figure out whether the existing hash/table is - used by the current interface and session. If yes, then replace; if - not, then generate a new random value and record in self.cache_catalog. - - This method will update self.cache_catalog. - - Parameters: - - instruction_id: the key to be placed in `self.cache_catalog` - - data: the dataframe to store - """ - ... - - @abstractmethod - def evaluate_graph( - self, - graph: IRGraphEvaluable, - instructions_to_evaluate: Optional[Iterable[Instruction]] = None, - ) -> Mapping[UUID, DataFrame]: - """Evaluate the IRGraph - - Parameters: - - graph: The evaluate IRGraph - - instructions_to_evaluate: instructions to evaluate and return; by default, it will be all Return instructions in the graph - - Returns: - - DataFrames for each instruction in instructions_to_evaluate. - """ - ... - - @abstractmethod - def explain_graph( - self, - graph: IRGraphEvaluable, - instructions_to_explain: Optional[Iterable[Instruction]] = None, - ) -> Mapping[UUID, GraphletExplanation]: - """Explain how to evaluate the IRGraph - - Parameters: - - graph: The evaluable IRGraph - - instructions_to_explain: instructions to explain and return; by default, it will be all Return instructions in the graph - - Returns: - - GraphletExplanation (a Kestrel Display object) for each instruction in instructions_to_explain. - """ - ... 
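A hedged sketch of what a concrete interface might look like under the contract above (the interface name and scheme are hypothetical): implement schemes() plus the three abstract methods, deriving table names from the instruction UUID as the store() docstring suggests.

    class MyLakeInterface(AbstractInterface):
        @staticmethod
        def schemes():
            return ["mylake"]  # must match ("_"|LETTER) ("_"|LETTER|DIGIT)*

        def store(self, instruction_id, data):
            table_name = instruction_id.hex  # derive the table name from the UUID
            self.cache_catalog[instruction_id] = table_name
            ...  # write `data` to the datalake table

        def evaluate_graph(self, graph, instructions_to_evaluate=None):
            ...  # translate the IRGraph to native queries, return DataFrames

        def explain_graph(self, graph, instructions_to_explain=None):
            ...  # same traversal, but return GraphletExplanation objects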
- - def cache_catalog_to_json(self) -> str: - """Serialize the cache catalog to a JSON string""" - return json.dumps(self.cache_catalog) diff --git a/packages-nextgen/kestrel_core/src/kestrel/interface/codegen/__init__.py b/packages-nextgen/kestrel_core/src/kestrel/interface/codegen/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/packages-nextgen/kestrel_core/src/kestrel/interface/codegen/dataframe.py b/packages-nextgen/kestrel_core/src/kestrel/interface/codegen/dataframe.py deleted file mode 100644 index 21ed706e..00000000 --- a/packages-nextgen/kestrel_core/src/kestrel/interface/codegen/dataframe.py +++ /dev/null @@ -1,143 +0,0 @@ -import sys -import inspect -import re -import operator -import functools -from typeguard import typechecked -from pandas import DataFrame, Series -from typing import Callable - -from kestrel.ir.instructions import ( - SourceInstruction, - TransformingInstruction, - Construct, - Limit, - ProjectAttrs, - ProjectEntity, - Filter, -) -from kestrel.ir.filter import ( - FExpression, - BoolExp, - MultiComp, - StrCompOp, - NumCompOp, - ExpOp, - ListOp, -) - - -@typechecked -def evaluate_source_instruction(instruction: SourceInstruction) -> DataFrame: - eval_func = _select_eval_func(instruction.instruction) - return eval_func(instruction) - - -@typechecked -def evaluate_transforming_instruction( - instruction: TransformingInstruction, dataframe: DataFrame -) -> DataFrame: - eval_func = _select_eval_func(instruction.instruction) - return eval_func(instruction, dataframe) - - -@typechecked -def _select_eval_func(instruction_name: str) -> Callable: - eval_funcs = inspect.getmembers(sys.modules[__name__], inspect.isfunction) - try: - _funcs = filter(lambda x: x[0] == "_eval_" + instruction_name, eval_funcs) - return next(_funcs)[1] - except StopIteration: - raise NotImplementedError( - f"evaluation function for {instruction_name} in dataframe cache" - ) - - -@typechecked -def _eval_Construct(instruction: Construct) -> DataFrame: - return DataFrame(instruction.data) - - -@typechecked -def _eval_Limit(instruction: Limit, dataframe: DataFrame) -> DataFrame: - return dataframe.head(instruction.num) - - -@typechecked -def _eval_ProjectAttrs(instruction: ProjectAttrs, dataframe: DataFrame) -> DataFrame: - return dataframe[instruction.attrs] - - -@typechecked -def _eval_ProjectEntity(instruction: ProjectEntity, dataframe: DataFrame) -> DataFrame: - # TODO - ... 
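A usage sketch of the reflection-based dispatch above (assuming Limit is constructed with its row count, as elsewhere in this package): _select_eval_func resolves "_eval_" + instruction.instruction within this module, so evaluating a Limit instruction ends up in _eval_Limit.

    from pandas import DataFrame
    from kestrel.ir.instructions import Limit

    df = DataFrame({"pid": [1, 2, 3]})
    assert evaluate_transforming_instruction(Limit(2), df).equals(df.head(2))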
-
-
-@typechecked
-def _eval_Filter(instruction: Filter, dataframe: DataFrame) -> DataFrame:
-    return dataframe[_eval_Filter_exp(instruction.exp, dataframe)]
-
-
-@typechecked
-def _eval_Filter_exp(exp: FExpression, dataframe: DataFrame) -> Series:
-    if isinstance(exp, BoolExp):
-        bs = _eval_Filter_exp_BoolExp(exp, dataframe)
-    elif isinstance(exp, MultiComp):
-        bss = [_eval_Filter_exp(comp, dataframe) for comp in exp.comps]
-        if exp.op == ExpOp.AND:
-            bs = functools.reduce(lambda x, y: x & y, bss)
-        elif exp.op == ExpOp.OR:
-            bs = functools.reduce(lambda x, y: x | y, bss)
-        else:
-            raise NotImplementedError("unknown kestrel.ir.filter.ExpOp type")
-    else:
-        bs = _eval_Filter_exp_Comparison(exp, dataframe)
-    return bs
-
-
-@typechecked
-def _eval_Filter_exp_BoolExp(boolexp: BoolExp, dataframe: DataFrame) -> Series:
-    if boolexp.op == ExpOp.AND:
-        bs = _eval_Filter_exp(boolexp.lhs, dataframe) & _eval_Filter_exp(
-            boolexp.rhs, dataframe
-        )
-    elif boolexp.op == ExpOp.OR:
-        bs = _eval_Filter_exp(boolexp.lhs, dataframe) | _eval_Filter_exp(
-            boolexp.rhs, dataframe
-        )
-    else:
-        raise NotImplementedError("unknown kestrel.ir.filter.ExpOp type")
-    return bs
-
-
-@typechecked
-def _eval_Filter_exp_Comparison(
-    c: FExpression,
-    dataframe: DataFrame,
-) -> Series:
-    comp2func = {
-        NumCompOp.EQ: operator.eq,
-        NumCompOp.NEQ: operator.ne,
-        NumCompOp.LT: operator.gt,  # value first in functools.partial
-        NumCompOp.LE: operator.ge,  # value first in functools.partial
-        NumCompOp.GT: operator.lt,  # value first in functools.partial
-        NumCompOp.GE: operator.le,  # value first in functools.partial
-        StrCompOp.EQ: operator.eq,
-        StrCompOp.NEQ: operator.ne,
-        StrCompOp.LIKE: lambda w, x: bool(
-            re.search(w.replace(".", r"\.").replace("%", ".*?"), x)
-        ),
-        StrCompOp.NLIKE: lambda w, x: not bool(
-            re.search(w.replace(".", r"\.").replace("%", ".*?"), x)
-        ),
-        StrCompOp.MATCHES: lambda w, x: bool(re.search(w, x)),
-        StrCompOp.NMATCHES: lambda w, x: not bool(re.search(w, x)),
-        ListOp.IN: lambda w, x: x in w,
-        ListOp.NIN: lambda w, x: x not in w,
-    }
-
-    try:
-        return dataframe[c.field].apply(functools.partial(comp2func[c.op], c.value))
-    except KeyError:
-        raise NotImplementedError(f"unknown kestrel.ir.filter.*Op type: {c.op}")
diff --git a/packages-nextgen/kestrel_core/src/kestrel/interface/codegen/kql.py b/packages-nextgen/kestrel_core/src/kestrel/interface/codegen/kql.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/packages-nextgen/kestrel_core/src/kestrel/interface/codegen/sql.py b/packages-nextgen/kestrel_core/src/kestrel/interface/codegen/sql.py
deleted file mode 100644
index 75f97608..00000000
--- a/packages-nextgen/kestrel_core/src/kestrel/interface/codegen/sql.py
+++ /dev/null
@@ -1,165 +0,0 @@
-import logging
-from functools import reduce
-from typing import Callable
-
-from sqlalchemy import and_, column, or_, select, FromClause, asc, desc
-from sqlalchemy.engine import Compiled, default
-from sqlalchemy.sql.elements import BinaryExpression, BooleanClauseList
-from sqlalchemy.sql.expression import ColumnClause, ColumnOperators
-from sqlalchemy.sql.selectable import Select
-from typeguard import typechecked
-
-from kestrel.ir.filter import (
-    BoolExp,
-    ExpOp,
-    FComparison,
-    ListOp,
-    MultiComp,
-    NumCompOp,
-    StrComparison,
-    StrCompOp,
-)
-from kestrel.ir.instructions import (
-    Filter,
-    Instruction,
-    Limit,
-    Offset,
-    ProjectAttrs,
-    ProjectEntity,
-    Sort,
-    SortDirection,
-)
-
-
-_logger = logging.getLogger(__name__)
-
-# SQLAlchemy comparison operator functions
-comp2func = {
- NumCompOp.EQ: ColumnOperators.__eq__, - NumCompOp.NEQ: ColumnOperators.__ne__, - NumCompOp.LT: ColumnOperators.__lt__, - NumCompOp.LE: ColumnOperators.__le__, - NumCompOp.GT: ColumnOperators.__gt__, - NumCompOp.GE: ColumnOperators.__ge__, - StrCompOp.EQ: ColumnOperators.__eq__, - StrCompOp.NEQ: ColumnOperators.__ne__, - StrCompOp.LIKE: ColumnOperators.like, - StrCompOp.NLIKE: ColumnOperators.not_like, - StrCompOp.MATCHES: ColumnOperators.regexp_match, - StrCompOp.NMATCHES: ColumnOperators.regexp_match, # Caller must negate - ListOp.IN: ColumnOperators.in_, - ListOp.NIN: ColumnOperators.not_in, -} - - -@typechecked -def _render_comp(comp: FComparison) -> BinaryExpression: - col: ColumnClause = column(comp.field) - if comp.op == StrCompOp.NMATCHES: - return ~comp2func[comp.op](col, comp.value) - return comp2func[comp.op](col, comp.value) - - -@typechecked -def _render_multi_comp(comps: MultiComp) -> BooleanClauseList: - op = and_ if comps.op == ExpOp.AND else or_ - return reduce(op, map(_render_comp, comps.comps)) - - -@typechecked -class SqlTranslator: - def __init__( - self, - dialect: default.DefaultDialect, - timefmt: Callable, - timestamp: str, - from_obj: FromClause, - ): - # SQLAlchemy Dialect object (e.g. from sqlalchemy.dialects import sqlite; sqlite.dialect()) - self.dialect = dialect - - # Time formatting function for datasource - self.timefmt = timefmt - - # Primary timestamp field in target table - self.timestamp = timestamp - - # SQLAlchemy statement object - self.query: Select = select("*").select_from(from_obj) - - def _render_exp(self, exp: BoolExp) -> BooleanClauseList: - if isinstance(exp.lhs, BoolExp): - lhs = self._render_exp(exp.lhs) - elif isinstance(exp.lhs, MultiComp): - lhs = _render_multi_comp(exp.lhs) - else: - lhs = _render_comp(exp.lhs) - if isinstance(exp.rhs, BoolExp): - rhs = self._render_exp(exp.rhs) - elif isinstance(exp.rhs, MultiComp): - rhs = _render_multi_comp(exp.rhs) - else: - rhs = _render_comp(exp.rhs) - return and_(lhs, rhs) if exp.op == ExpOp.AND else or_(lhs, rhs) - - def add_Filter(self, filt: Filter) -> None: - if filt.timerange.start: - # Convert the timerange to the appropriate pair of comparisons - start_comp = StrComparison( - self.timestamp, ">=", self.timefmt(filt.timerange.start) - ) - stop_comp = StrComparison( - self.timestamp, "<", self.timefmt(filt.timerange.stop) - ) - # AND them together - time_exp = BoolExp(start_comp, ExpOp.AND, stop_comp) - # AND that with any existing filter expression - exp = BoolExp(filt.exp, ExpOp.AND, time_exp) - else: - exp = filt.exp - if isinstance(exp, BoolExp): - comp = self._render_exp(exp) - elif isinstance(exp, MultiComp): - comp = _render_multi_comp(exp) - else: - comp = _render_comp(exp) - self.query = self.query.where(comp) - - def add_ProjectAttrs(self, proj: ProjectAttrs) -> None: - cols = [column(col) for col in proj.attrs] - self.query = self.query.with_only_columns(*cols) # TODO: mapping? - - def add_ProjectEntity(self, proj: ProjectEntity) -> None: - self.query = self.query.with_only_columns( - column(proj.entity_type) - ) # TODO: mapping? 
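For intuition on what `_render_comp` and the translator assemble: a `column()` clause plus a `ColumnOperators` method yields a WHERE expression that compiles per dialect. A standalone illustration using SQLAlchemy alone (toy table and column names, not the kestrel classes):

from sqlalchemy import column, select, table
from sqlalchemy.dialects import sqlite

# the same building blocks used above: column() + a ColumnOperators method
query = (
    select(column("name"), column("pid"))
    .select_from(table("processes"))
    .where(column("name").like("%.exe"))
)
print(query.compile(dialect=sqlite.dialect(), compile_kwargs={"literal_binds": True}))
# roughly: SELECT name, pid FROM processes WHERE name LIKE '%.exe'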
- - def add_Limit(self, lim: Limit) -> None: - self.query = self.query.limit(lim.num) - - def add_Offset(self, offset: Offset) -> None: - self.query = self.query.offset(offset.num) - - def add_Sort(self, sort: Sort) -> None: - col = column(sort.attribute) - order = asc(col) if sort.direction == SortDirection.ASC else desc(col) - self.query = self.query.order_by(order) - - def add_instruction(self, i: Instruction) -> None: - inst_name = i.instruction - method_name = f"add_{inst_name}" - method = getattr(self, method_name) - if not method: - raise NotImplementedError(f"SqlTranslator.{method_name}") - method(i) - - def result(self) -> Compiled: - # TODO: two projections, e.g., ProjectAttrs after ProjectEntity - return self.query.compile(dialect=self.dialect) - - def result_w_literal_binds(self) -> Compiled: - # full SQL query with literal binds showing, i.e., IN [99, 51], not IN [?, ?] - # this is for debug display, not used by an sqlalchemy driver to execute - return self.query.compile( - dialect=self.dialect, compile_kwargs={"literal_binds": True} - ) diff --git a/packages-nextgen/kestrel_core/src/kestrel/interface/manager.py b/packages-nextgen/kestrel_core/src/kestrel/interface/manager.py deleted file mode 100644 index b5fd0904..00000000 --- a/packages-nextgen/kestrel_core/src/kestrel/interface/manager.py +++ /dev/null @@ -1,112 +0,0 @@ -from __future__ import annotations -import importlib -import pkgutil -import logging -import inspect -import sys -import itertools -from copy import copy -from typeguard import typechecked -from typing import Mapping, Iterable, Type - -from kestrel.exceptions import ( - InterfaceNotConfigured, - InterfaceNotFound, - InvalidInterfaceImplementation, - ConflictingInterfaceScheme, -) -from kestrel.interface.base import MODULE_PREFIX, AbstractInterface -from kestrel.config.internal import CACHE_INTERFACE_IDENTIFIER - - -_logger = logging.getLogger(__name__) - - -# basically a scheme to interface mapping -@typechecked -class InterfaceManager(Mapping): - def __init__(self, init_interfaces: Iterable[AbstractInterface] = []): - interface_classes = _load_interface_classes() - self.interfaces = list(init_interfaces) # copy/recreate the list - for iface_cls in interface_classes: - try: - iface = iface_cls() - _logger.debug(f"Initialize interface {iface_cls.__name__}") - self.interfaces.append(iface) - except InterfaceNotConfigured as e: - _logger.debug(f"Interface {iface_cls.__name__} not configured; ignored") - - def __getitem__(self, scheme: str) -> AbstractInterface: - for interface in self.interfaces: - if scheme in interface.schemes(): - return interface - else: - raise InterfaceNotFound(f"no interface loaded for scheme {scheme}") - - def __iter__(self) -> Iterable[str]: - return itertools.chain(*[i.schemes() for i in self.interfaces]) - - def __len__(self) -> int: - return sum(1 for _ in iter(self)) - - def copy_with_virtual_cache(self) -> InterfaceManager: - im = copy(self) - # shallow copy refers to the same list, so create/copy a new one - im.interfaces = copy(im.interfaces) - # now swap in virtual cache - cache = im[CACHE_INTERFACE_IDENTIFIER] - im.interfaces.remove(cache) - im.interfaces.append(cache.get_virtual_copy()) - return im - - def del_cache(self): - cache = self[CACHE_INTERFACE_IDENTIFIER] - self.interfaces.remove(cache) - del cache - - -def _load_interface_classes(): - interface_clss = [] - for itf_pkg_name in _list_interface_pkg_names(): - mod = importlib.import_module(itf_pkg_name) - _logger.debug(f"Imported {mod} from package {itf_pkg_name}") - 
cls = inspect.getmembers( - sys.modules[itf_pkg_name], _is_class(AbstractInterface) - ) - if not cls: - raise InvalidInterfaceImplementation( - f'no interface class found in package "{itf_pkg_name}"' - ) - elif len(cls) > 1: - raise InvalidInterfaceImplementation( - f'more than one interface class found in package "{itf_pkg_name}"' - ) - else: - interface_cls = cls[0][1] - _guard_scheme_conflict(interface_cls, interface_clss) - interface_clss.append(interface_cls) - return interface_clss - - -def _list_interface_pkg_names(): - pkg_names = [x.name for x in pkgutil.iter_modules()] - itf_names = [pkg for pkg in pkg_names if pkg.startswith(MODULE_PREFIX)] - return itf_names - - -def _is_class(cls): - return lambda obj: inspect.isclass(obj) and obj.__bases__[0] == cls - - -@typechecked -def _guard_scheme_conflict( - new_interface: Type[AbstractInterface], - interfaces: Iterable[Type[AbstractInterface]], -): - for interface in interfaces: - for scheme_new in new_interface.schemes(): - for scheme_old in interface.schemes(): - if scheme_new == scheme_old: - raise ConflictingInterfaceScheme( - f"scheme: {scheme_new} conflicting between {new_interface.__name__} and {interface.__name__}" - ) diff --git a/packages-nextgen/kestrel_core/src/kestrel/interface/translation/__init__.py b/packages-nextgen/kestrel_core/src/kestrel/interface/translation/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/packages-nextgen/kestrel_core/src/kestrel/interface/translation/query/__init__.py b/packages-nextgen/kestrel_core/src/kestrel/interface/translation/query/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/packages-nextgen/kestrel_core/src/kestrel/interface/translation/result/__init__.py b/packages-nextgen/kestrel_core/src/kestrel/interface/translation/result/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/packages-nextgen/kestrel_core/src/kestrel/ir/__init__.py b/packages-nextgen/kestrel_core/src/kestrel/ir/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/packages-nextgen/kestrel_core/src/kestrel/ir/filter.py b/packages-nextgen/kestrel_core/src/kestrel/ir/filter.py deleted file mode 100644 index ebdd6856..00000000 --- a/packages-nextgen/kestrel_core/src/kestrel/ir/filter.py +++ /dev/null @@ -1,196 +0,0 @@ -from __future__ import annotations - -from typeguard import typechecked -from dataclasses import dataclass -from datetime import datetime -from enum import Enum -from typing import List, Optional, Union, Iterable, Any, Callable - -from mashumaro.mixins.json import DataClassJSONMixin - - -class NumCompOp(str, Enum): - """Numerical comparison operators (for int and float)""" - - EQ = "=" - NEQ = "!=" - LT = "<" - LE = "<=" - GT = ">" - GE = ">=" - - -@dataclass -class IntComparison(DataClassJSONMixin): - """Integer comparison expression""" - - field: str - op: NumCompOp - value: int - - -@dataclass -class FloatComparison(DataClassJSONMixin): - """Floating point comparison expression""" - - field: str - op: NumCompOp - value: float - - -class StrCompOp(str, Enum): - """String comparison operators""" - - EQ = "=" - NEQ = "!=" - LIKE = "LIKE" - NLIKE = "NOT LIKE" - MATCHES = "MATCHES" - NMATCHES = "NOT MATCHES" - - -@dataclass -class StrComparison(DataClassJSONMixin): - """String comparison expression""" - - field: str - op: StrCompOp - value: str - - -class ListOp(str, Enum): - """List membership operator""" - - IN = "IN" - NIN = "NOT IN" - - -@dataclass -class ListStrComparison(DataClassJSONMixin): - """List of 
strings membership comparison expression""" - - field: str - op: ListOp - value: List[str] - - -@dataclass -class ListIntComparison(DataClassJSONMixin): - """List of ints membership comparison expression""" - - field: str - op: ListOp - value: List[int] - - -@dataclass -class ListComparison(DataClassJSONMixin): - """List membership comparison expression""" - - field: str - op: ListOp - value: Union[List[int], List[str]] - - -# frozen=True for generating __hash__() method -@dataclass(frozen=True) -class ReferenceValue(DataClassJSONMixin): - """Value for reference""" - - reference: str - attribute: Optional[str] - - -@dataclass -class RefComparison(DataClassJSONMixin): - """Referred variable comparison""" - - field: str - op: ListOp - value: ReferenceValue - - -class ExpOp(str, Enum): - """Boolean expression operator""" - - AND = "AND" - OR = "OR" - - -@dataclass -class MultiComp(DataClassJSONMixin): - """Boolean expression of multiple comparisons. - - The single operator applies to ALL comparisons, so `OR` acts like `any` and `AND` acts like `all`. - """ - - op: ExpOp - comps: List[ - Union[ - IntComparison, FloatComparison, StrComparison, ListComparison, RefComparison - ] - ] - - -@dataclass -class BoolExp(DataClassJSONMixin): - """Binary boolean expression of comparisons""" - - lhs: FExpression - op: ExpOp - rhs: FExpression - - -@dataclass -class TimeRange(DataClassJSONMixin): - """The time range of interest""" - - start: Optional[datetime] = None - stop: Optional[datetime] = None - - -FExpression = Union[ - IntComparison, - FloatComparison, - StrComparison, - ListComparison, - RefComparison, - MultiComp, - BoolExp, -] - - -FComparison = Union[ - IntComparison, - FloatComparison, - StrComparison, - ListComparison, - RefComparison, - MultiComp, -] - - -@typechecked -def get_references_from_exp(exp: FExpression) -> Iterable[ReferenceValue]: - if isinstance(exp, RefComparison): - yield exp.value - elif isinstance(exp, BoolExp): - yield from get_references_from_exp(exp.lhs) - yield from get_references_from_exp(exp.rhs) - elif isinstance(exp, MultiComp): - for comp in exp.comps: - yield from get_references_from_exp(comp) - - -@typechecked -def resolve_reference_with_function( - exp: FExpression, f: Callable[[ReferenceValue], Any] -): - if isinstance(exp, RefComparison): - exp.value = f(exp.value) - elif isinstance(exp, BoolExp): - resolve_reference_with_function(exp.lhs, f) - resolve_reference_with_function(exp.rhs, f) - elif isinstance(exp, MultiComp): - for comp in exp.comps: - resolve_reference_with_function(comp, f) diff --git a/packages-nextgen/kestrel_core/src/kestrel/ir/graph.py b/packages-nextgen/kestrel_core/src/kestrel/ir/graph.py deleted file mode 100644 index ddc41b7d..00000000 --- a/packages-nextgen/kestrel_core/src/kestrel/ir/graph.py +++ /dev/null @@ -1,832 +0,0 @@ -from __future__ import annotations -from typeguard import typechecked -from typing import Any, Iterable, Tuple, Mapping, MutableMapping, Union, Optional -from collections import defaultdict -from itertools import combinations -from uuid import UUID -import networkx -import json -from kestrel.ir.instructions import ( - Instruction, - TransformingInstruction, - SolePredecessorTransformingInstruction, - IntermediateInstruction, - SourceInstruction, - Variable, - DataSource, - Reference, - Return, - Filter, - ProjectAttrs, - instruction_from_dict, -) -from kestrel.ir.filter import ReferenceValue -from kestrel.exceptions import ( - InstructionNotFound, - InvalidSeralizedGraph, - VariableNotFound, - ReferenceNotFound, - 
DataSourceNotFound, - DuplicatedVariable, - DuplicatedReference, - DuplicatedDataSource, - DuplicatedSingletonInstruction, - MultiInterfacesInGraph, - MultiSourcesInGraph, - InevaluableInstruction, - LargerThanOneIndegreeInstruction, - DuplicatedReferenceInFilter, - MissingReferenceInFilter, - DanglingReferenceInFilter, - DanglingFilter, -) -from kestrel.config.internal import CACHE_INTERFACE_IDENTIFIER - - -@typechecked -def compose(g: IRGraph, h: IRGraph) -> IRGraph: - g.update(h) - return g - - -@typechecked -def union(g: IRGraph, h: IRGraph) -> IRGraph: - return compose(g, h) - - -@typechecked -class IRGraph(networkx.DiGraph): - def __init__( - self, serialized_graph: Union[None, str, Mapping[str, Iterable[Mapping]]] = None - ): - super().__init__() - if serialized_graph: - if isinstance(serialized_graph, str): - graph_in_dict = json.loads(serialized_graph) - else: - graph_in_dict = serialized_graph - self._from_dict(graph_in_dict) - - def add_node( - self, - node: Instruction, - dependent_node: Optional[Instruction] = None, - deref: bool = True, - ) -> Instruction: - """General adding node/instruction operation - - Parameters: - node: the instruction to add - dependent_node: the dependent instruction if node is a TransformingInstruction - deref: whether to dereference Reference instruction (only useful for if node is Reference) - - Returns: - The node added - """ - if node not in self: - if isinstance(node, TransformingInstruction): - node = self._add_node_with_dependent_node(node, dependent_node) - else: - node = self._add_node(node, deref) - return node - - def add_nodes_from(self, nodes: Iterable[Instruction], deref: bool = True): - """Add nodes in a list - - Parameters: - nodes: the list of nodes/instructions to add - deref: whether to deref Reference node - """ - for node in nodes: - self._add_node(node, deref) - - def add_edge(self, u: Instruction, v: Instruction, deref: bool = False): - """Add edge (add node if not exist) - - Parameters: - u: the source of the edge - v: the target of the edge - deref: whether to deref Reference node - """ - ux = self._add_node(u, deref) - vx = self._add_node(v, deref) - super().add_edge(ux, vx) - - def add_edges_from( - self, edges: Iterable[Tuple[Instruction, Instruction]], deref: bool = False - ): - """Add edges in a list - - Parameters: - edges: the edges to add - deref: whether to deref Reference node - """ - for u, v in edges: - self.add_edge(u, v, deref) - - def copy(self): - """Copy the IRGraph with all nodes as reference (not deepcopy) - - Support subclass of IRGraph to be copied. - """ - g = IRGraph() - g.update(self) - - # subclass support - if type(g) != type(self): - g = type(self)(g) - - return g - - def deepcopy(self): - """Copy the IRGraph with all nodes copied as new objects - - Support subclass of IRGraph to be deep copied. 
- """ - g = IRGraph() - o2n = {n: n.deepcopy() for n in self.nodes()} - for u, v in self.edges(): - g.add_edge(o2n[u], o2n[v]) - g.add_nodes_from([o2n[n] for n in self.nodes() if self.degree(n) == 0]) - - # subclass support - if type(g) != type(self): - g = type(self)(g) - - return g - - def get_node_by_id(self, ux: Union[UUID, str]) -> Instruction: - """Get node by ID - - Parameters: - ux: node ID - - Returns: - The Kestrel instruction (node in IRGraph) - """ - u = UUID(ux) if isinstance(ux, str) else ux - try: - return next(filter(lambda n: n.id == u, self.nodes())) - except StopIteration: - raise InstructionNotFound(u) - - def get_nodes_by_type(self, ntype: type) -> Iterable[Instruction]: - """Get nodes by type - - Parameters: - ntype: node/instruction type - - Returns: - The list of nodes/instructions - """ - return [n for n in self.nodes() if isinstance(n, ntype)] - - def get_nodes_by_type_and_attributes( - self, ntype: type, attr2val: Mapping[str, Union[str, bool, int]] - ) -> Iterable[Instruction]: - """Get nodes by both type and attributes/values - - Parameters: - ntype: node/instruction type - attr2val: instruction attribute/value dictionary - - Returns: - The list of nodes/instructions - """ - nodes = self.get_nodes_by_type(ntype) - return [ - n - for n in nodes - if all([getattr(n, k, None) == v for (k, v) in attr2val.items()]) - ] - - def get_variable(self, var_name: str) -> Variable: - """Get a Kestrel variable by its name - - Parameters: - var_name: variable name - - Returns: - The Kestrel variable given its name - """ - xs = self.get_nodes_by_type_and_attributes(Variable, {"name": var_name}) - if xs: - if len({x.version for x in xs}) < len(xs): - raise DuplicatedVariable(var_name) - else: - xs.sort(key=lambda x: x.version) - return xs[-1] - else: - raise VariableNotFound(var_name) - - def get_variables(self) -> Iterable[Variable]: - """Get all variables - - This method returns a list of variables, equivalent to *Symbol Table* used in traditional (non-graph-IR) language compilers. Shadowed variables (replaced by new variables with same names) will not be returned. - - Returns: - The list of all Kestrel variables in this huntflow. 
- """ - var_names = {v.name for v in self.get_nodes_by_type(Variable)} - return [self.get_variable(var_name) for var_name in var_names] - - def add_variable( - self, vx: Union[str, Variable], dependent_node: Instruction - ) -> Variable: - """Create new variable (if needed) and add to IRGraph - - Parameters: - vx: variable name (str) or already created node (Variable) - dependent_node: the instruction to which the variable refer - - Returns: - The variable node created/added - """ - v = Variable(vx) if isinstance(vx, str) else vx - return self.add_node(v, dependent_node) - - def get_reference(self, ref_name: str) -> Reference: - """Get a Kestrel reference by its name - - Parameters: - ref_name: reference name - - Returns: - The Reference node - """ - xs = self.get_nodes_by_type_and_attributes(Reference, {"name": ref_name}) - if xs: - if len(xs) > 1: - raise DuplicatedReference(ref_name) - else: - return xs.pop() - else: - raise ReferenceNotFound(ref_name) - - def get_references(self) -> Iterable[Reference]: - """Get all references - - Returns: - The list of reference nodes - """ - ref_names = {r.name for r in self.get_nodes_by_type(Reference)} - return [self.get_reference(ref_name) for ref_name in ref_names] - - def add_reference( - self, rx: Union[str, Reference], deref: bool = True - ) -> Union[Reference, Variable]: - """Create or add new reference node to IRGraph - - The reference node will be derefed if the flag is specified. - - Parameters: - rx: reference name (str) or already created node (Reference) - deref: whether to deref when adding node - - Returns: - The reference node created/added - """ - r = Reference(rx) if isinstance(rx, str) else rx - return self.add_node(r, deref) - - def get_datasource(self, interface: str, datasource: str) -> DataSource: - """Get a Kestrel datasource by its URI - - Parameters: - interface: the datasource interface name - datasource: the datasource name under the interface - - Returns: - The datasource - """ - xs = self.get_nodes_by_type_and_attributes( - DataSource, {"interface": interface, "datasource": datasource} - ) - if xs: - if len(xs) > 1: - raise DuplicatedDataSource(interface, datasource) - else: - return xs.pop() - else: - raise DataSourceNotFound(interface, datasource) - - def get_datasources(self) -> Iterable[DataSource]: - """Get all datasources - - Returns: - The list of data sources - """ - xs = self.get_nodes_by_type(DataSource) - - # to check for duplicated datasources - - return xs - - def add_datasource( - self, sx: Union[str, DataSource], default_interface: Optional[str] = None - ) -> DataSource: - """Create new datasource (if needed) and add to IRGraph if not exist - - Parameters: - sx: the full URI of the datasource (str) or already created node (DataSource) - default_interface: default interface name - - Returns: - The DataSource node found or added - """ - s = DataSource(sx, default_interface) if isinstance(sx, str) else sx - return self.add_node(s) - - def get_returns(self) -> Iterable[Return]: - """Get all return nodes - - Returns: - The list of return nodes - """ - return sorted(self.get_nodes_by_type(Return), key=lambda x: x.sequence) - - def get_max_return_sequence(self) -> int: - """Get the largest sequence number of all Returns - - Returns: - The largest sequence number of all Return instruction - """ - return max(map(lambda x: x.sequence, self.get_returns()), default=-1) - - def add_return(self, dependent_node: Instruction) -> Return: - """Create new Return instruction and add to IRGraph - - Parameters: - 
dependent_node: the instruction the new Return depends on
-
-        Returns:
-
-            The return node created/added
-        """
-        return self.add_node(Return(), dependent_node)
-
-    def get_sink_nodes(self) -> Iterable[Instruction]:
-        """Get all sink nodes (nodes with no successors)
-
-        Returns:
-
-            The list of sink nodes
-        """
-        return [n for n in self.nodes() if self.out_degree(n) == 0]
-
-    def get_trunk_n_branches(
-        self, node: TransformingInstruction
-    ) -> Tuple[Instruction, Mapping[ReferenceValue, Instruction]]:
-        """Get the trunk and branch paths for an instruction
-
-        For the trunk path, return the tail node; for each branch, return the
-        tail node of the branch in a mapping from reference to node.
-
-        Parameters:
-
-            node: the instruction node
-
-        Returns:
-
-            (tail node for trunk, ref to branch tail node mapping)
-        """
-        ps = list(self.predecessors(node))
-        pps = [(p, pp) for p in self.predecessors(node) for pp in self.predecessors(p)]
-
-        # may need to add a patch in find_dependent_subgraphs_of_node()
-        # for each new case added in the if/elif, e.g., Filter
-        if isinstance(node, SolePredecessorTransformingInstruction):
-            if len(ps) > 1:
-                raise LargerThanOneIndegreeInstruction()
-            else:
-                return ps[0], {}
-        elif isinstance(node, Filter):
-            r2n = {}
-            for rv in node.get_references():
-                ppfs = [
-                    (p, pp)
-                    for p, pp in pps
-                    if isinstance(p, ProjectAttrs)
-                    and isinstance(pp, (Variable, Reference))
-                    and p.attrs == [rv.attribute]
-                    and pp.name == rv.reference
-                ]
-                if not ppfs:
-                    raise MissingReferenceInFilter(rv, node, pps)
-                elif len(ppfs) > 1:
-                    raise DuplicatedReferenceInFilter(rv, node, pps)
-                else:
-                    p = ppfs[0][0]
-                    r2n[rv] = p
-                    ps.remove(p)
-            if len(ps) == 0:
-                raise DanglingFilter()
-            elif len(ps) > 1:
-                raise DanglingReferenceInFilter(ps)
-            return ps[0], r2n
-        else:
-            raise NotImplementedError(f"unknown instruction type: {node}")
-
-    def update(self, ng: IRGraph):
-        """Extend the current IRGraph with a new IRGraph
-
-        Parameters:
-
-            ng: the new IRGraph to merge/combine/union
-        """
-        # After we add new variable nodes, we can no longer rely on
-        # self.get_variable() to get variables for de-referencing.
-        # Save the original variables first.
-        original_variables = {v.name: v for v in self.get_variables()}
-
-        # prepare new variables from ng before merge;
-        # should not use ng.get_variable(),
-        # which does not cover all overridden variables
-        for nv in ng.get_nodes_by_type(Variable):
-            if nv.name in original_variables:
-                nv.version += original_variables[nv.name].version + 1
-
-        # prepare return sequence from ng before merge
-        return_max_sequence = self.get_max_return_sequence()
-        for nr in ng.get_returns():
-            nr.sequence += return_max_sequence + 1
-
-        # add refs first to deref correctly;
-        # if any reference exists, it should be derefed before adding any variable
-        o2n_refs = {n: self._add_node(n) for n in ng.get_references()}
-        # add all nodes, deduplicating singleton nodes, e.g., SourceInstruction
-        o2n_nonrefs = {n: self._add_node(n) for n in ng.nodes() if n not in o2n_refs}
-
-        # overall old to new node mapping
-        o2n = {}
-        o2n.update(o2n_refs)
-        o2n.update(o2n_nonrefs)
-
-        # add all edges
-        self.add_edges_from([(o2n[u], o2n[v]) for (u, v) in ng.edges()])
-
-    def duplicate_dependent_subgraph_of_node(self, node: Instruction) -> IRGraph:
-        """Find and copy the dependent subgraph of a node (including the node)
-
-        Parameters:
-
-            node: instruction node to start from
-
-        Returns:
-
-            A copy of the dependent subgraph (including the input node)
-        """
-        nodes = networkx.ancestors(self, node)
-        nodes.add(node)
-        return self.subgraph(nodes).copy()
-
-    def find_cached_dependent_subgraph_of_node(
-        self, node: Instruction, cache: MutableMapping[UUID, Any]
-    ) -> IRGraph:
-        """Return the cached dependent graph of a node
-
-        Discard nodes and subgraphs before any cached nodes, e.g., Variables.
-
-        Parameters:
-
-            node: instruction node to start from
-
-            cache: any type of node cache, e.g., content, SQL statement
-
-        Returns:
-
-            The pruned IRGraph without nodes before cached Variable nodes
-        """
-        g = self.duplicate_dependent_subgraph_of_node(node)
-        in_edges = [g.in_edges(n) for n in g.nodes() if n.id in cache]
-        g.remove_edges_from(set().union(*in_edges))
-
-        # important last step to discard any unconnected nodes/subgraphs prior to the dropped edges
-        return g.duplicate_dependent_subgraph_of_node(node)
-
-    def find_dependent_subgraphs_of_node(
-        self,
-        node: Instruction,
-        cache: MutableMapping[UUID, Any],
-    ) -> Iterable[IRGraphEvaluable]:
-        """Find dependency subgraphs that do not have further dependencies
-
-        To evaluate a node, one needs to evaluate all nodes in its dependent
-        graph. However, not all nodes can be evaluated at once (e.g., when
-        impacted by multiple interfaces). Some require more basic dependent
-        subgraphs to be evaluated first. This method segments the dependent
-        graph of a node and returns the subgraphs that are IRGraphEvaluable.
-        One can evaluate the returned subgraphs, cache the results, and call
-        this method again. After iterations of evaluating returned dependent
-        subgraphs, the node can finally be evaluated in the last round, where
-        its dependent graph is itself an IRGraphEvaluable.
-
-        TODO: analytics node support
-
-        Parameters:
-
-            node: the instruction/node to generate dependent subgraphs for
-
-            cache: any type of node cache, e.g., content, SQL statement
-
-        Returns:
-
-            A list of subgraphs that do not have further dependencies
-        """
-        _CII = CACHE_INTERFACE_IDENTIFIER
-
-        # the base graph to segment
-        g = self.find_cached_dependent_subgraph_of_node(node, cache)
-
-        # Mapping: {interface name: [impacted nodes]}
-        a2ns = defaultdict(set)
-        for n in g.get_nodes_by_type(SourceInstruction):
-            a2ns[n.interface].add(n)
-            a2ns[n.interface].update(networkx.descendants(g, n))
-
-        # all predecessor nodes to any interface impacted nodes
-        pns = set().union(*[set(g.predecessors(n)) for ns in a2ns.values() for n in ns])
-
-        # add non-source nodes to cache as the default execution environment,
-        # e.g., a path starting from a cached Variable;
-        # nodes directly preceding an interface impacted node do not need evaluation
-        cached_nodes = set([n for n in g.nodes() if n.id in cache])
-        for n in cached_nodes - pns:
-            a2ns[_CII].add(n)
-            a2ns[_CII].update(networkx.descendants(g, n))
-
-        # find all nodes that are affected by two or more interfaces
-        shared_impacted_nodes = set().union(
-            *[a2ns[ix] & a2ns[iy] for ix, iy in combinations(a2ns.keys(), 2)]
-        )
-
-        # unshared nodes for each interface
-        a2uns = {k: v - shared_impacted_nodes for k, v in a2ns.items()}
-
-        # handle cached direct predecessor nodes:
-        # such nodes are required in building dep graphs around interfaces;
-        # such nodes could be shared by multiple interfaces, which can only be handled here
-        for interface in set(a2uns) - set([_CII]):
-            ps = set().union(*[set(g.predecessors(n)) for n in a2uns[interface]])
-            a2uns[interface].update(ps & cached_nodes)
-
-        # a patch (corner case handling) for get_trunk_n_branches():
-        # add a Variable/Reference node if succeeded by ProjectAttrs and Filter,
-        # which are in the dependent graph; the Variable is only needed by
-        # get_trunk_n_branches() as an auxiliary node
-        for interface in a2uns:
-            auxs = []
-            for n in a2uns[interface]:
-                if isinstance(n, ProjectAttrs):
-                    # need to search in `self`, not `g`, since the boundary of
-                    # `g` is cut by the cache
-                    p = next(self.predecessors(n))
-                    s = next(g.successors(n))
-                    if (
-                        isinstance(s, Filter)
-                        and isinstance(p, (Variable, Reference))
-                        and s in a2uns[interface]
-                    ):
-                        auxs.append(p)
-            a2uns[interface].update(auxs)
-
-        # remove dep graphs with only one node,
-        # e.g., `ds://a` in "y = GET file FROM ds://a WHERE x = v.x"
-        # when v.x is not in cache
-        dep_nodes = [ns for ns in a2uns.values() if len(ns) > 1]
-        # need to search in `self` due to the patch for get_trunk_n_branches()
-        dep_graphs = [
-            IRGraphEvaluable(self.subgraph(ns)).deepcopy() for ns in dep_nodes
-        ]
-
-        return dep_graphs
-
-    def find_simple_query_subgraphs(
-        self, cache: MutableMapping[UUID, Any]
-    ) -> Iterable[IRGraphSimpleQuery]:
-        """Find dependency subgraphs that are IRGraphSimpleQuery
-
-        Some interfaces, e.g., stix-shifter, build stateless queries and do not
-        support JOIN or subqueries/sub-SELECTs, so they can only evaluate a simple
-        SQL query around each source node. Use this method to prepare such tiny
-        graph segments for evaluation by the interface. The remainder of the
-        graph can be evaluated in cache.
-
-        Parameters:
-
-            cache: any type of node cache, e.g., content, SQL statement
-
-        Returns:
-
-            An iterator of simple-query subgraphs
-        """
-        for n in self.get_nodes_by_type(SourceInstruction):
-            for g in self._find_paths_from_node_to_a_variable(n, cache):
-                yield IRGraphSimpleQuery(g)
-
-    def _find_paths_from_node_to_a_variable(
-        self, node: Instruction, cache: MutableMapping[UUID, Any]
-    ) -> Iterable[IRGraph]:
-        """Find paths (linear IRGraphs with directly attached cached nodes) from
-        the starting node to its closest variables
-
-        If the linear IRGraph has a dependent branch/path longer than a cached
-        node, this linear IRGraph cannot be used to build an IRGraphSimpleQuery;
-        it needs to generate a subquery for the branch.
-
-        Parameters:
-
-            node: the node to start the path search from
-
-            cache: any type of node cache, e.g., content, SQL statement
-
-        Returns:
-
-            An iterator of paths
-        """
-        # check whether the node has other uncached incoming nodes;
-        # if not, this path can be an IRGraphSimpleQuery
-        if len([n for n in self.predecessors(node) if n.id not in cache]) <= 1:
-            # pcns: predecessor cached nodes
-            pcns = [n for n in self.predecessors(node) if n.id in cache]
-            for succ in self.successors(node):
-                if isinstance(succ, Variable):
-                    yield self.subgraph([succ, node] + pcns)
-                else:
-                    for succ_graph in self._find_paths_from_node_to_a_variable(
-                        succ, cache
-                    ):
-                        yield self.subgraph(list(succ_graph.nodes()) + [node] + pcns)
-
-    def to_dict(self) -> Mapping[str, Iterable[Mapping]]:
-        """Serialize to a Python dictionary (D3 graph format)
-
-        Returns:
-
-            The graph in a Python dictionary to be dumped as a JSON string
-        """
-        nodes = [n.to_dict() for n in self.nodes()]
-        links = [{"source": str(u.id), "target": str(v.id)} for (u, v) in self.edges()]
-        return {"nodes": nodes, "links": links}
-
-    def to_json(self) -> str:
-        """Serialize to a JSON string (D3 graph format)
-
-        Returns:
-
-            The graph in a JSON string
-        """
-        return json.dumps(self.to_dict())
-
-    def _add_node(self, node: Instruction, deref: bool = True) -> Instruction:
-        """Add just the node
-
-        Dependencies (if any) are not handled. Variable version and Return
-        sequence are intentionally not handled here (they are handled in
-        _add_node_with_dependent_node()); this is the plain node-adding
-        operation used by update().
-
-        Parameters:
-
-            node: the node/instruction to add
-
-            deref: whether to deref if the node is a Reference
-
-        Returns:
-
-            The node added, found, or derefed
-        """
-        # testing `node in self` is important:
-        # there could be a Reference node already in the graph; do not deref it
-        if node not in self:
-            if isinstance(node, IntermediateInstruction):
-                if isinstance(node, Reference):
-                    if deref:
-                        try:
-                            v = self.get_variable(node.name)
-                        except VariableNotFound:
-                            # deref failed; add the Reference node directly
-                            node = self._add_singleton_instruction(node)
-                        else:
-                            # deref succeeded; no need to add the node
-                            node = v
-                    else:
-                        node = self._add_singleton_instruction(node)
-                else:
-                    raise NotImplementedError(
-                        f"unknown IntermediateInstruction: {node}"
-                    )
-            elif isinstance(node, SourceInstruction):
-                node = self._add_singleton_instruction(node)
-            else:
-                super().add_node(node)
-        return node
-
-    def _add_singleton_instruction(self, node: Instruction) -> Instruction:
-        """Guard adding a singleton node
-
-        1. Singleton nodes are nodes that have only one copy in the graph
-
-        2.
A node that has no predecessors is a singleton node - - Parameters: - node: the node/instruction to add - - Returns: - The node added or found - """ - xs = [ - x - for x in self.nodes() - if x.has_same_content_as(node) and self.in_degree(x) == 0 - ] - if xs: - if len(xs) > 1: - raise DuplicatedSingletonInstruction(node) - else: - node = xs.pop() - else: - super().add_node(node) - return node - - def _add_node_with_dependent_node( - self, node: Instruction, dependent_node: Instruction - ) -> Instruction: - """Add node to graph with a dependent node - - Variable version and Return sequence are handled here. - - Parameters: - node: the node/instruction to add - dependent_node: the dependent node that should exist in the graph - - Return: - The node added - """ - if dependent_node not in self: - raise InstructionNotFound(dependent_node) - if node not in self: - if isinstance(node, Variable): - try: - ve = self.get_variable(node.name) - except VariableNotFound: - node.version = 0 - else: - node.version = ve.version + 1 - if isinstance(node, Return): - node.sequence = self.get_max_return_sequence() + 1 - # add_edge will add node first - self.add_edge(dependent_node, node) - return node - - def _from_dict(self, graph_in_dict: Mapping[str, Iterable[Mapping]]): - """Deserialize from a Python dictionary (D3 graph format) - - This method is an implicit constructor from a serialized graph. - - Parameters: - graph_in_dict: the serialized graph in Python dictionary - """ - nodes = graph_in_dict["nodes"] - edges = graph_in_dict["links"] - for n in nodes: - self._add_node(instruction_from_dict(n), False) - for e in edges: - try: - u = self.get_node_by_id(e["source"]) - v = self.get_node_by_id(e["target"]) - except InstructionNotFound as err: - raise InvalidSeralizedGraph() - else: - self.add_edge(u, v) - - -@typechecked -class IRGraphEvaluable(IRGraph): - """Evaluable IRGraph - - An evaluable IRGraph is an IRGraph that - - 1. Only has one interface - - 2. No IntermediateInstruction node - """ - - def __init__(self, graph: Optional[IRGraph] = None): - super().__init__() - - # need to initialize it before `self.update(graph)` below - self.interface = None - - # update() will call _add_node() internally to set self.interface - if graph: - self.update(graph) - - # all source nodes are already cached (no SourceInstruction) - if not self.interface: - self.interface = CACHE_INTERFACE_IDENTIFIER - - def _add_node(self, node: Instruction, deref: bool = True) -> Instruction: - if isinstance(node, IntermediateInstruction): - raise InevaluableInstruction(node) - elif isinstance(node, SourceInstruction): - if self.interface: - if node.interface != self.interface: - raise MultiInterfacesInGraph([self.interface, node.interface]) - else: - self.interface = node.interface - return super()._add_node(node, deref) - - -@typechecked -class IRGraphSimpleQuery(IRGraphEvaluable): - """Simple Query IRGraph - - A simple query IRGraph is an evaluable IRGraph that - - 1. It contains one source node - - 2. 
It can be compiled into a simple (not nested/joined) SQL query - """ - - def __init__(self, graph: Optional[IRGraph] = None): - if graph and len(graph.get_nodes_by_type(SourceInstruction)) > 1: - raise MultiSourcesInGraph() - super().__init__(graph) diff --git a/packages-nextgen/kestrel_core/src/kestrel/ir/instructions.py b/packages-nextgen/kestrel_core/src/kestrel/ir/instructions.py deleted file mode 100644 index 8b1aa1e3..00000000 --- a/packages-nextgen/kestrel_core/src/kestrel/ir/instructions.py +++ /dev/null @@ -1,228 +0,0 @@ -from __future__ import annotations - -import copy -import inspect -import json -import sys -import uuid -from dataclasses import InitVar, dataclass, field, fields -from enum import Enum -from typing import Any, Iterable, List, Mapping, Optional, Type, Union - -from kestrel.__future__ import is_python_older_than_minor_version -from kestrel.config.internal import CACHE_INTERFACE_IDENTIFIER -from kestrel.exceptions import ( - InvalidDataSource, - InvalidInstruction, - InvalidSeralizedInstruction, -) -from kestrel.ir.filter import ( - FExpression, - ReferenceValue, - TimeRange, - get_references_from_exp, - resolve_reference_with_function, -) -from mashumaro.mixins.json import DataClassJSONMixin -from typeguard import typechecked - -# https://stackoverflow.com/questions/70400639/how-do-i-get-python-dataclass-initvar-fields-to-work-with-typing-get-type-hints -if is_python_older_than_minor_version(11): - InitVar.__call__ = lambda *args: None - - -@dataclass -class Instruction(DataClassJSONMixin): - id: uuid.UUID = field(init=False) - instruction: str = field(init=False) - - def __post_init__(self): - # stable id during Instruction lifetime - self.id = uuid.uuid4() - self.instruction = self.__class__.__name__ - - def __eq__(self, other: Instruction): - return self.id == other.id - - def __hash__(self): - # stable hash during Instruction lifetime - return self.id.int - - def copy(self): - return copy.copy(self) - - def deepcopy(self): - return copy.deepcopy(self) - - def has_same_content_as(self, instruction: Instruction) -> bool: - if self.instruction == instruction.instruction: - flag = True - for f in fields(self): - if f.name != "id" and getattr(self, f.name) != getattr( - instruction, f.name - ): - flag = False - else: - flag = False - return flag - - -class TransformingInstruction(Instruction): - """The instruction that builds/dependent on one or more instructions""" - - pass - - -class SolePredecessorTransformingInstruction(TransformingInstruction): - """The translating instruction whose indegree==1""" - - pass - - -class SourceInstruction(Instruction): - """The instruction that does not dependent on any instruction""" - - interface: str - - -class IntermediateInstruction(Instruction): - """The instruction that aids AST to Kestrel IR compilation""" - - pass - - -@dataclass(eq=False) -class Return(SolePredecessorTransformingInstruction): - """The sink instruction that forces execution - - Return is implemented as a TransformingInstruction so it triggers - IRGraph._add_node_with_dependent_node() in IRGraph.add_node() - """ - - # the order/sequence of return instruction in huntflow (source code) - sequence: int = 0 - - -@dataclass(eq=False) -class Filter(TransformingInstruction): - exp: FExpression - timerange: TimeRange = field(default_factory=TimeRange) - - # TODO: from_json() for self.exp - - def get_references(self) -> Iterable[ReferenceValue]: - return get_references_from_exp(self.exp) - - def resolve_references(self, f: Callable[[ReferenceValue], Any]): - 
resolve_reference_with_function(self.exp, f)
-
-
-@dataclass(eq=False)
-class ProjectEntity(SolePredecessorTransformingInstruction):
-    entity_type: str
-
-
-@dataclass(eq=False)
-class ProjectAttrs(SolePredecessorTransformingInstruction):
-    # mashumaro does not support typing.Iterable, only List
-    attrs: List[str]
-
-
-@dataclass(eq=False)
-class DataSource(SourceInstruction):
-    uri: InitVar[Optional[str]] = None
-    default_interface: InitVar[Optional[str]] = None
-    interface: str = ""
-    datasource: str = ""
-
-    def __post_init__(self, uri: Optional[str], default_interface: Optional[str]):
-        super().__post_init__()
-        if uri:
-            # normal constructor, not from deserialization
-            xs = uri.split("://")
-            if len(xs) == 2:
-                self.interface = xs[0]
-                self.datasource = xs[1]
-            elif len(xs) == 1 and default_interface:
-                self.interface = default_interface
-                self.datasource = xs[0]
-            else:
-                raise InvalidDataSource(uri)
-        else:
-            # from deserialization; mashumaro will take care of it
-            pass
-
-
-@dataclass(eq=False)
-class Variable(SolePredecessorTransformingInstruction):
-    name: str
-    # required to dereference a variable that has been created multiple times;
-    # the variable with the largest version will be used by dereference
-    version: int = 0
-
-
-@dataclass(eq=False)
-class Reference(IntermediateInstruction):
-    """Referred Kestrel variable (used in AST) before de-referencing to a Kestrel variable"""
-
-    name: str
-
-
-@dataclass(eq=False)
-class Explain(SolePredecessorTransformingInstruction):
-    pass
-
-
-@dataclass(eq=False)
-class Limit(SolePredecessorTransformingInstruction):
-    num: int
-
-
-@dataclass(eq=False)
-class Offset(SolePredecessorTransformingInstruction):
-    num: int
-
-
-@dataclass(eq=False)
-class Construct(SourceInstruction):
-    data: List[Mapping[str, Union[str, int, bool]]]
-    interface: str = CACHE_INTERFACE_IDENTIFIER
-
-
-class SortDirection(str, Enum):
-    ASC = "ASC"
-    DESC = "DESC"
-
-
-@dataclass(eq=False)
-class Sort(SolePredecessorTransformingInstruction):
-    attribute: str
-    direction: SortDirection = SortDirection.DESC
-
-
-@typechecked
-def get_instruction_class(name: str) -> Type[Instruction]:
-    classes = inspect.getmembers(sys.modules[__name__], inspect.isclass)
-    instructions = [cls for _, cls in classes if issubclass(cls, Instruction)]
-    try:
-        return next(filter(lambda cls: cls.__name__ == name, instructions))
-    except StopIteration:
-        raise InvalidInstruction(name)
-
-
-@typechecked
-def instruction_from_dict(d: Mapping[str, Union[str, bool, int]]) -> Instruction:
-    instruction_class = get_instruction_class(d["instruction"])
-    try:
-        instruction = instruction_class.from_dict(d)
-        instruction.id = uuid.UUID(d["id"])
-    except Exception:
-        raise InvalidSeralizedInstruction(d)
-    else:
-        return instruction
-
-
-@typechecked
-def instruction_from_json(json_str: str) -> Instruction:
-    instruction_in_dict = json.loads(json_str)
-    return instruction_from_dict(instruction_in_dict)
diff --git a/packages-nextgen/kestrel_core/src/kestrel/ir/relation/__init__.py b/packages-nextgen/kestrel_core/src/kestrel/ir/relation/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/packages-nextgen/kestrel_core/src/kestrel/ir/relation/relation.yaml b/packages-nextgen/kestrel_core/src/kestrel/ir/relation/relation.yaml
deleted file mode 100644
index e69de29b..00000000
diff --git a/packages-nextgen/kestrel_core/src/kestrel/ir/relation/utils.py b/packages-nextgen/kestrel_core/src/kestrel/ir/relation/utils.py
deleted file mode 100644
index e69de29b..00000000
diff --git
a/packages-nextgen/kestrel_core/src/kestrel/mapping/__init__.py b/packages-nextgen/kestrel_core/src/kestrel/mapping/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/packages-nextgen/kestrel_core/src/kestrel/mapping/data_model.py b/packages-nextgen/kestrel_core/src/kestrel/mapping/data_model.py deleted file mode 100644 index d05bd943..00000000 --- a/packages-nextgen/kestrel_core/src/kestrel/mapping/data_model.py +++ /dev/null @@ -1,279 +0,0 @@ -import logging -from typing import Optional, Union - -import dpath -import numpy as np -import yaml -from pandas import DataFrame -from typeguard import typechecked - -from kestrel.mapping.transformers import ( - run_transformer, - run_transformer_on_series, -) -from kestrel.utils import list_folder_files - -_logger = logging.getLogger(__name__) - - -def _add_mapping(obj: dict, key: str, mapping: dict): - """Add `key` -> `mapping` to `obj`, appending if necessary""" - existing_mapping = obj.get(key) - if existing_mapping: - if isinstance(existing_mapping, str): - existing_mapping = [{"ocsf_field": existing_mapping}] - elif isinstance(existing_mapping, dict): - existing_mapping = [existing_mapping] - else: - existing_mapping = [] - existing_mapping.append(mapping) - obj[key] = existing_mapping - - -def _reverse_dict(obj: dict, k: str, v: dict): - """Reverse a single OCSF -> native mapping and add it to `obj`""" - key = v["native_field"] - mapping = {i: j for i, j in v.items() if i != "native_field"} - mapping["ocsf_field"] = k - _add_mapping(obj, key, mapping) - - -def _add_attr(obj: dict, key: str, value: str): - """Add `key` -> `value` to `obj`, appending if necessary""" - if key not in obj: - obj[key] = value - else: - existing = obj[key] - if isinstance(existing, str): - obj[key] = [existing, value] - else: - existing.append(value) - - -def reverse_mapping(obj: dict, prefix: str = None, result: dict = None) -> dict: - """Reverse the mapping; return native -> OCSF map""" - if result is None: - result = {} - for k, v in obj.items(): - k = ".".join((prefix, k)) if prefix else k - # Recurse if necessary - if isinstance(v, str): - _add_attr(result, v, k) - elif isinstance(v, list): - # Need to handle multiple mappings - for i in v: - if isinstance(i, str): - _add_attr(result, i, k) - elif "native_field" in i: - _reverse_dict(result, k, i) - else: - # Need to "deep" merge with current results - reverse_mapping(i, k, result) - elif isinstance(v, dict): - # First determine if this is a complex mapping or just another level - if "native_field" in v: - _reverse_dict(result, k, v) - else: - # Need to "deep" merge with current results - reverse_mapping(v, k, result) - - return result - - -def _get_map_triple(d: dict, prefix: str, op: str, value) -> tuple: - mapped_op = d.get(f"{prefix}_op") - transform = d.get(f"{prefix}_value") - new_value = run_transformer(transform, value) - new_op = mapped_op if mapped_op else op - return (d[f"{prefix}_field"], new_op, new_value) - - -def translate_comparison_to_native( - dmm: dict, field: str, op: str, value: Union[str, int, float] -) -> list: - """Translate the (`field`, `op`, `value`) triple using data model map `dmm` - - This function may be used in datasource interfaces to translate a comparison - in the OCSF data model to the native data model, according to the data model - mapping in `dmm`. - - This function translates the (`field`, `op`, `value`) triple into a list of - translated triples based on the provided data model map. 
The data model map - is a dictionary that maps fields from one data model to another. For - example, if you have a field named "user.name" in your data model, but the - corresponding field in the native data model is "username", then you can use - the data model map to translate the field name. - - Parameters: - dmm: A dictionary that maps fields from one data model to another. - field: The field name to be translated. - op: The comparison operator. - value: The value to be compared against. - - Returns: - A list of translated triples. - - Raises: - KeyError: If the field cannot be found in the data model map. - """ - _logger.debug("comp_to_native: %s %s %s", field, op, value) - result = [] - mapping = dmm.get(field) - if mapping: - if isinstance(mapping, str): - # Simple 1:1 field name mapping - result.append((mapping, op, value)) - else: - raise NotImplementedError("complex native mapping") - else: - parts = field.split(".") - tmp = dmm - for part in parts: - if isinstance(tmp, dict): - tmp = tmp.get(part, {}) - else: - break - if tmp: - if isinstance(tmp, list): - for i in tmp: - if isinstance(i, dict): - result.append(_get_map_triple(i, "native", op, value)) - else: - result.append((i, op, value)) - elif isinstance(tmp, dict): - result.append(_get_map_triple(tmp, "native", op, value)) - elif isinstance(tmp, str): - result.append((tmp, op, value)) - else: - # Pass-through - result.append((field, op, value)) - _logger.debug("comp_to_native: return %s", result) - return result - - -def translate_comparison_to_ocsf( - dmm: dict, field: str, op: str, value: Union[str, int, float] -) -> list: - """Translate the (`field`, `op`, `value`) triple using data model map `dmm` - - This function is used in the frontend to translate a comparison in - the STIX (or, in the future, ECS) data model to the OCSF data - model, according to the data model mapping in `dmm`. - - This function translates the (`field`, `op`, `value`) triple into a list of - translated triples based on the provided data model map. The data model map - is a dictionary that maps fields from one data model to another. For - example, if you have a field named "user.name" in your data model, but the - corresponding field in the native data model is "username", then you can use - the data model map to translate the field name. - - Parameters: - dmm: A dictionary that maps fields from one data model to another. - field: The field name to be translated. - op: The comparison operator. - value: The value to be compared against. - - Returns: - A list of translated triples. - - Raises: - KeyError: If the field cannot be found in the data model map. 
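A concrete call for the simplest mapping form, the 1:1 string branch of the code below (hypothetical one-entry STIX-to-OCSF map; assumes the kestrel_core tree above is installed):

from kestrel.mapping.data_model import translate_comparison_to_ocsf

dmm = {"process:name": "process.name"}  # toy map: STIX field -> OCSF field
translate_comparison_to_ocsf(dmm, "process:name", "=", "cmd.exe")
# returns [('process.name', '=', 'cmd.exe')]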
- - """ - _logger.debug("comp_to_ocsf: %s %s %s", field, op, value) - result = [] - mapping = dmm.get(field) - if isinstance(mapping, str): - # Simple 1:1 field name mapping - result.append((mapping, op, value)) - elif isinstance(mapping, list): - for i in mapping: - if isinstance(i, dict): - result.append(_get_map_triple(i, "ocsf", op, value)) - else: - result.append((i, op, value)) - return result - - -@typechecked -def load_default_mapping( - data_model_name: str, - mapping_pkg: str = "kestrel.mapping", - submodule: str = "entityattribute", -): - result = {} - entityattr_mapping_files = list_folder_files( - mapping_pkg, submodule, prefix=data_model_name, extension="yaml" - ) - for f in entityattr_mapping_files: - with open(f, "r") as fp: - result.update(yaml.safe_load(fp)) - return result - - -@typechecked -def _get_from_mapping(mapping: Union[str, list, dict], key) -> list: - result = [] - if isinstance(mapping, list): - for i in mapping: - if isinstance(i, dict): - result.append(i[key]) - else: - result.append(i) - elif isinstance(mapping, dict): - result.append(mapping[key]) - elif isinstance(mapping, str): - result.append(mapping) - return result - - -@typechecked -def translate_projection_to_native( - dmm: dict, - entity_type: Optional[str], - attrs: Optional[list], - # TODO: optional str or callable for joining entity_type and attr? -) -> list: - result = [] - if entity_type: - dmm = dmm[entity_type] - if not attrs: - for native_field, mapping in reverse_mapping(dmm).items(): - result.extend( - [(native_field, i) for i in _get_from_mapping(mapping, "ocsf_field")] - ) - attrs = [] - for attr in attrs: - mapping = dmm.get(attr) - if not mapping: - parts = attr.split(".") - tmp = dmm - for part in parts: - if isinstance(tmp, dict): - tmp = tmp.get(part, {}) - else: - break - if tmp: - mapping = tmp - if mapping: - result.extend( - [(i, attr) for i in _get_from_mapping(mapping, "native_field")] - ) - else: - # Pass-through? - result.append((attr, attr)) # FIXME: raise exception instead? 
- _logger.debug("proj_to_native: return %s", result) - return result - - -@typechecked -def translate_dataframe(df: DataFrame, dmm: dict) -> DataFrame: - # Translate results into Kestrel OCSF data model - # The column names of df are already mapped - df = df.replace({np.nan: None}) - for col in df.columns: - mapping = dpath.get(dmm, col, separator=".") - if isinstance(mapping, dict): - transformer_name = mapping.get("ocsf_value") - df[col] = run_transformer_on_series(transformer_name, df[col]) - return df diff --git a/packages-nextgen/kestrel_core/src/kestrel/mapping/entityattribute/ecs.yaml b/packages-nextgen/kestrel_core/src/kestrel/mapping/entityattribute/ecs.yaml deleted file mode 100644 index d4a1bf75..00000000 --- a/packages-nextgen/kestrel_core/src/kestrel/mapping/entityattribute/ecs.yaml +++ /dev/null @@ -1,233 +0,0 @@ -# https://schema.ocsf.io/1.1.0/objects/file -file: - accessed_time: file.accessed - attributes: file.attributes - created_time: file.created - # This "hashes" notation comes from jmespath (filter projection) - # It's much easier to use the ECS notation in this case - hashes[?algorithm_id == 1]: - value: hash.md5 - hashes[?algorithm_id == 2]: - value: hash.sha1 - hashes[?algorithm_id == 3]: - value: hash.sha256 - hashes[?algorithm_id == 4]: - value: hash.sha512 - hashes[?algorithm_id == 5]: - value: hash.ssdeep - hashes[?algorithm_id == 6]: - value: hash.tlsh - hashes[*]: - value: - - hash.md5 - - hash.sha1 - - hash.sha256 - - hash.sha512 - - hash.ssdeep - - hash.tlsh - modified_time: file.ctime - mime_type: file.mime_type - mode: file.mode - modified_time: file.mtime - name: file.name - owner: file.owner - parent_folder: file.directory - path: file.path - size: file.size - type: file.type - xattributes: - primary_group: file.gid - link_name: file.target_path - - -# https://schema.ocsf.io/1.1.0/objects/group -group: - domain: group.domain - name: group.name - uid: group.id - - -# https://schema.ocsf.io/1.1.0/objects/process -process: - cmd_line: process.command_line - name: process.name - pid: process.pid - uid: process.entity_id - file: - name: - native_field: process.executable - native_op: LIKE - native_value: endswith - ocsf_value: basename - path: process.executable - parent_folder: - native_field: process.executable - native_op: LIKE - native_value: startswith - ocsf_value: dirname - # This "hashes" notation comes from jmespath (filter projection) - # It's much easier to use the ECS notation in this case - hashes[?algorithm_id == 1]: - value: process.hash.md5 - hashes[?algorithm_id == 2]: - value: process.hash.sha1 - hashes[?algorithm_id == 3]: - value: process.hash.sha256 - hashes[?algorithm_id == 4]: - value: process.hash.sha512 - hashes[?algorithm_id == 5]: - value: process.hash.ssdeep - hashes[?algorithm_id == 6]: - value: process.hash.tlsh - hashes[*]: - value: - - process.hash.md5 - - process.hash.sha1 - - process.hash.sha256 - - process.hash.sha512 - - process.hash.ssdeep - - process.hash.tlsh - parent_process: - cmd_line: process.parent.command_line - name: process.parent.name - pid: process.parent.pid - uid: process.parent.entity_id - file: - name: - native_field: process.parent.executable - native_op: LIKE - native_value: endswith - ocsf_value: basename - path: process.parent.executable - parent_folder: - native_field: process.parent.executable - native_op: LIKE - native_value: startswith - ocsf_value: dirname - - -# src_endpoint: see https://schema.ocsf.io/1.1.0/objects/endpoint -src_endpoint: &src_ref - domain: - - client.domain - - source.domain - 
hostname: - - client.domain - - source.domain - ip: - - client.ip - - source.ip - mac: - - client.mac - - source.mac - port: - - client.port - - source.port - - -# endpoint: see https://schema.ocsf.io/1.1.0/objects/endpoint -endpoint: - domain: - - client.domain - - source.domain - - server.domain - - destination.domain - hostname: - - client.domain - - source.domain - - server.domain - - destination.domain - ip: - - client.ip - - source.ip - - server.ip - - destination.ip - mac: - - client.mac - - source.mac - - server.mac - - destination.mac - port: - - client.port - - source.port - - server.port - - destination.port - - -# dst_endpoint: see https://schema.ocsf.io/1.1.0/objects/endpoint -dst_endpoint: &dst_ref - domain: - - server.domain - - destination.domain - hostname: - - server.domain - - destination.domain - ip: - - server.ip - - destination.ip - mac: - - server.mac - - destination.mac - port: - - server.port - - destination.port - - -# https://schema.ocsf.io/1.1.0/objects/network_traffic -# should be `network_traffic`? -traffic: &traffic - bytes: network.bytes - bytes_in: - - destination.bytes - - server.bytes - bytes_out: - - client.bytes - - source.bytes - packets: network.packets - packets_in: - - destination.packets - - server.packets - packets_out: - - client.packets - - source.packets - - -# https://schema.ocsf.io/1.1.0/objects/network_connection_info -connection_info: - direction: network.direction #TODO: need transformer? - protocol_num: network.iana_number - protocol_name: network.transport - protocol_ver: network.type - protocol_ver_id: - native_field: network.type - native_value: ip_version_to_network_layer - ocsf_value: network_layer_to_ip_version - - -# https://schema.ocsf.io/1.1.0/objects/certificate -certificate: - expiration_time: x509.not_after - created_time: x509.not_before - serial_number: x509.serial_number - fingerprints[*]: - algorithm: x509.signature_algorithm - version: x509.version_number - issuer: x509.issuer.distinguished_name - subject: x509.subject.distinguished_name - #uid: - - -# https://schema.ocsf.io/1.1.0/objects/user -user: - domain: user.domain - full_name: user.full_name - name: user.name - uid: user.id - - -# https://schema.ocsf.io/1.1.0/classes/network_activity -# Network Activity [4001] Class -network_activity: - src_endpoint: *src_ref - dst_endpoint: *dst_ref - traffic: *traffic diff --git a/packages-nextgen/kestrel_core/src/kestrel/mapping/entityattribute/stix.yaml b/packages-nextgen/kestrel_core/src/kestrel/mapping/entityattribute/stix.yaml deleted file mode 100644 index 7082e6dd..00000000 --- a/packages-nextgen/kestrel_core/src/kestrel/mapping/entityattribute/stix.yaml +++ /dev/null @@ -1,143 +0,0 @@ -# https://schema.ocsf.io/1.1.0/objects/file -file: - name: file:name - size: file:size - accessed_time: file:accessed - created_time: file:created - modified_time: file:modified - # This "hashes" notation comes from jmespath (filter projection) - # It's much easier to use the ECS notation in this case - hashes[?algorithm_id == 1]: - value: file:hashes.MD5 - hashes[?algorithm_id == 2]: - value: "file:hashes.'SHA-1'" - hashes[?algorithm_id == 3]: - value: "file:hashes.'SHA-256'" - hashes[?algorithm_id == 4]: - value: "file:hashes.'SHA-512'" - hashes[?algorithm_id == 5]: - value: file:hashes.SSDEEP - hashes[?algorithm_id == 6]: - value: file:hashes.TLSH - hashes[*]: - value: - - file:hashes.MD5 - - "file:hashes.'SHA-1'" - - "file:hashes.'SHA-256'" - - "file:hashes.'SHA-512'" - - file:hashes.SSDEEP - - file:hashes.TLSH - - -# 
https://schema.ocsf.io/1.1.0/objects/group -# group: -# domain: -# name: -# uid: - - -# https://schema.ocsf.io/1.1.0/objects/process -process: - cmd_line: process:command_line - name: process:name - pid: process:pid - uid: process:x_unique_id - file: - name: process:binary_ref.name - parent_folder: process:binary_ref.parent_directory_ref.path - # This "hashes" notation comes from jmespath (filter projection) - # It's much easier to use the ECS notation in this case - hashes[?algorithm_id == 1]: - value: process:binary_ref.hashes.MD5 - hashes[?algorithm_id == 2]: - value: process:binary_ref.hashes.'SHA-1' - hashes[?algorithm_id == 3]: - value: process:binary_ref.hashes.'SHA-256' - hashes[?algorithm_id == 4]: - value: process:binary_ref.hashes.'SHA-512' - hashes[?algorithm_id == 5]: - value: process:binary_ref.hashes.SSDEEP - hashes[?algorithm_id == 6]: - value: process:binary_ref.hashes.TLSH - hashes[*]: - value: - - process:binary_ref.hashes.MD5 - - process:binary_ref.hashes.'SHA-1' - - process:binary_ref.hashes.'SHA-256' - - process:binary_ref.hashes.'SHA-512' - - process:binary_ref.hashes.SSDEEP - - process:binary_ref.hashes.TLSH - parent_process: - cmd_line: process:parent_ref.command_line - name: process:parent_ref.name - pid: process:parent_ref.pid - uid: process:parent_ref.x_unique_id - file: - name: process:parent_ref.binary_ref.name - parent_folder: process:parent_ref.binary_ref.parent_directory_ref.path - - -# dst_endpoint: see https://schema.ocsf.io/1.1.0/objects/endpoint -dst_endpoint: - ip: - - network-traffic:dst_ref.value - - ipv4-addr:value - port: network-traffic:dst_port - - -# src_endpoint: see https://schema.ocsf.io/1.1.0/objects/endpoint -src_endpoint: - ip: - - network-traffic:src_ref.value - - ipv4-addr:value - port: network-traffic:src_port - - -# https://schema.ocsf.io/1.1.0/objects/endpoint -endpoint: - ip: ipv4-addr:value - - -# https://schema.ocsf.io/1.1.0/objects/device -device: - ip: ipv4-addr:value - - -# https://schema.ocsf.io/1.1.0/objects/network_traffic -traffic: # should be `network_traffic`? - #TODO: bytes: sum of byte counts? - bytes_in: network-traffic:dst_byte_count - bytes_out: network-traffic:src_byte_count - #TODO: packets: sum of packet counts? 
- packets_in: network-traffic:dst_packets - packets_out: network-traffic:src_packets - - -# https://schema.ocsf.io/1.1.0/objects/network_connection_info -# connection_info: -# direction: -# protocol_num: -# protocol_name: -# protocol_ver: -# protocol_ver_id: - - -# https://schema.ocsf.io/1.1.0/objects/certificate -certificate: - expiration_time: x509-certificate:validity_not_after - created_time: x509-certificate:validity_not_before - serial_number: x509-certificate:serial_number - fingerprints[*]: - algorithm: x509-certificate:signature_algorithm - version: x509-certificate:version_number - issuer: x509-certificate:issuer - subject: x509-certificate:subject - #uid: - - -# https://schema.ocsf.io/1.1.0/objects/user -user: - full_name: user-account:display_name - name: user-account:account_login - type: user-account:account_type - uid: user-account:user_id diff --git a/packages-nextgen/kestrel_core/src/kestrel/mapping/entityname/alias.yaml b/packages-nextgen/kestrel_core/src/kestrel/mapping/entityname/alias.yaml deleted file mode 100644 index 306b557a..00000000 --- a/packages-nextgen/kestrel_core/src/kestrel/mapping/entityname/alias.yaml +++ /dev/null @@ -1,3 +0,0 @@ -event: base_event -activity: base_event - diff --git a/packages-nextgen/kestrel_core/src/kestrel/mapping/entityname/ecs.yaml b/packages-nextgen/kestrel_core/src/kestrel/mapping/entityname/ecs.yaml deleted file mode 100644 index 8d06636e..00000000 --- a/packages-nextgen/kestrel_core/src/kestrel/mapping/entityname/ecs.yaml +++ /dev/null @@ -1,9 +0,0 @@ -process: process -file: file -group: group -client: network_endpoint -destination: network_endpoint -server: network_endpoint -source: network_endpoint -network: network_activity -user: user diff --git a/packages-nextgen/kestrel_core/src/kestrel/mapping/entityname/stix.yaml b/packages-nextgen/kestrel_core/src/kestrel/mapping/entityname/stix.yaml deleted file mode 100644 index cd80756a..00000000 --- a/packages-nextgen/kestrel_core/src/kestrel/mapping/entityname/stix.yaml +++ /dev/null @@ -1,21 +0,0 @@ -# TODO mapping for artifact: -autonomous-system: organization -directory: file -domain-name: endpoint -email-addr: user -email-message: email -file: file -ipv4-addr: endpoint -ipv6-addr: endpoint -mac-addr: endpoint -network-traffic: network_activity -process: process -software: product -url: http_request -user-account: user -x-ibm-finding: security_finding -x-ibm-ttp-tagging: attack -x-oca-asset: device -x-oca-event: base_event -x509-certificate: certificate -windows-registry-key: win/registry_key diff --git a/packages-nextgen/kestrel_core/src/kestrel/mapping/transformers.py b/packages-nextgen/kestrel_core/src/kestrel/mapping/transformers.py deleted file mode 100644 index 82202dcb..00000000 --- a/packages-nextgen/kestrel_core/src/kestrel/mapping/transformers.py +++ /dev/null @@ -1,110 +0,0 @@ -"""Kestrel Data Model Map value transformers""" - -from datetime import datetime, timezone -from typing import Callable - -from pandas import Series - - -# Dict of "registered" transformers -_transformers = {} - - -def transformer(func: Callable) -> Callable: - """A decorator for registering a transformer""" - _transformers[func.__name__] = func - return func - - -@transformer -def to_epoch_ms(value: str) -> int: - """Convert a time value to milliseconds since the epoch""" - if "." 
in value: - time_pattern = "%Y-%m-%dT%H:%M:%S.%fZ" - else: - time_pattern = "%Y-%m-%dT%H:%M:%SZ" - dt = datetime.strptime(value, time_pattern).replace(tzinfo=timezone.utc) - return int(dt.timestamp() * 1000) - - -@transformer -def dirname(path: str) -> str: # TODO: rename to winpath_dirname? - """Get the directory part of `path`""" - path_dir, _, _ = path.rpartition("\\") - return path_dir - - -@transformer -def basename(path: str) -> str: # TODO: rename to winpath_basename? - """Get the filename part of `path`""" - _, _, path_file = path.rpartition("\\") - return path_file - - -@transformer -def startswith(value: str) -> str: # TODO: rename to winpath_startswith? - return f"{value}\\%" - - -@transformer -def endswith(value: str) -> str: # TODO: rename to winpath_endswith? - return f"%\\{value}" - - -@transformer -def to_int(value) -> int: - """Ensure `value` is an int""" - try: - return int(value) - except ValueError: - # Maybe it's a hexadecimal string? - return int(value, 16) - - -@transformer -def to_str(value) -> str: - """Ensure `value` is a str""" - return str(value) - - -@transformer -def ip_version_to_network_layer(value: int) -> str: - if value == 4: - return "ipv4" - elif value == 6: - return "ipv6" - elif value == 99: - return "other" - return "unknown" - - -@transformer -def network_layer_to_ip_version(val: str) -> int: - value = val.lower() - if value == "ipv4": - return 4 - elif value == "ipv6": - return 6 - elif value == "other": - return 99 - return 0 - - -def run_transformer(transformer_name: str, value): - """Run the registered transformer with name `transformer_name` on `value`""" - func = _transformers.get(transformer_name) - if func: - result = func(value) - else: - raise NameError(transformer_name) - return result - - -def run_transformer_on_series(transformer_name: str, value: Series): - """Run the registered transformer with name `transformer_name` on `value`""" - func = _transformers.get(transformer_name) - if func: - result = value.apply(func) - else: - raise NameError(transformer_name) - return result diff --git a/packages-nextgen/kestrel_core/src/kestrel/session.py b/packages-nextgen/kestrel_core/src/kestrel/session.py deleted file mode 100644 index 48ebf1f8..00000000 --- a/packages-nextgen/kestrel_core/src/kestrel/session.py +++ /dev/null @@ -1,130 +0,0 @@ -import logging -from contextlib import AbstractContextManager -from uuid import UUID, uuid4 -from typing import Iterable -from typeguard import typechecked - -from kestrel.display import Display, GraphExplanation -from kestrel.ir.graph import IRGraph -from kestrel.ir.instructions import Instruction, Explain -from kestrel.frontend.parser import parse_kestrel -from kestrel.cache import AbstractCache, SqliteCache -from kestrel.config.internal import CACHE_INTERFACE_IDENTIFIER -from kestrel.interface import AbstractInterface, InterfaceManager -from kestrel.exceptions import InstructionNotFound - - -_logger = logging.getLogger(__name__) - - -@typechecked -class Session(AbstractContextManager): - """Kestrel huntflow execution session""" - - def __init__(self): - self.session_id = uuid4() - self.irgraph = IRGraph() - - # load all interfaces; cache is a special interface - cache = SqliteCache() - self.interface_manager = InterfaceManager([cache]) - - def execute(self, huntflow_block: str) -> Iterable[Display]: - """Execute a Kestrel huntflow block.
- - Execute a Kestrel statement or multiple consecutive statements (a - huntflow block). This method has the context of already executed - huntflow blocks in this session, so all existing variables can be - referred to in the new huntflow block. - - Parameters: - huntflow_block: the new huntflow block to be executed - - Returns: - Evaluated result per Return instruction - """ - return list(self.execute_to_generate(huntflow_block)) - - def execute_to_generate(self, huntflow_block: str) -> Iterable[Display]: - """Execute a Kestrel huntflow block and yield results as a generator. - - Parameters: - huntflow_block: the new huntflow block to be executed - - Yields: - Evaluated result per Return instruction - """ - irgraph_new = parse_kestrel(huntflow_block) - self.irgraph.update(irgraph_new) - - for ret in irgraph_new.get_returns(): - yield self.evaluate_instruction(ret) - - def evaluate_instruction(self, ins: Instruction) -> Display: - """Evaluate a single Instruction. - - Parameters: - ins: the instruction to evaluate - - Returns: - Evaluated result (Kestrel Display object) - """ - if ins not in self.irgraph: - raise InstructionNotFound(ins.to_dict()) - - pred = self.irgraph.get_trunk_n_branches(ins)[0] - is_explain = isinstance(pred, Explain) - display = GraphExplanation([]) - - _interface_manager = ( - self.interface_manager.copy_with_virtual_cache() - if is_explain - else self.interface_manager - ) - _cache = _interface_manager[CACHE_INTERFACE_IDENTIFIER] - - # The current logic evaluates dependent subgraphs on non-cache - # interfaces first, caches their results, and lastly evaluates in the cache. - # TODO: may evaluate cache first, then push dependent variables to the - # last interface to eval; this requires priority of interfaces - while True: - for g in self.irgraph.find_dependent_subgraphs_of_node(ins, _cache): - interface = _interface_manager[g.interface] - for iid, _display in ( - interface.explain_graph(g) - if is_explain - else interface.evaluate_graph(g) - ).items(): - if is_explain: - display.graphlets.append(_display) - else: - display = _display - if interface is not _cache: - _cache[iid] = display - if iid == ins.id: - return display - - def do_complete(self, huntflow_block: str, cursor_pos: int): - """Kestrel code auto-completion. - - Parameters: - huntflow_block: Kestrel code - cursor_pos: the position to start completion (index in ``huntflow_block``) - - Returns: - A list of suggested strings to complete the code - """ - raise NotImplementedError() - - def close(self): - """Explicitly close the session. - - This may be executed by a context manager or when the program exits.
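        A sketch of the usage pattern that triggers this cleanup
        (illustrative addition, not from the original file;
        ``huntflow_block`` stands for any Kestrel code)::

            with Session() as session:
                for display in session.execute(huntflow_block):
                    ...  # render each Display object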
- """ - # Note there are two conditions that trigger this function, so it is probably executed twice - # Be careful to write the logic in this function to avoid deleting nonexist files/dirs - if CACHE_INTERFACE_IDENTIFIER in self.interface_manager: - self.interface_manager.del_cache() - - def __exit__(self, exception_type, exception_value, traceback): - self.close() diff --git a/packages-nextgen/kestrel_core/src/kestrel/utils.py b/packages-nextgen/kestrel_core/src/kestrel/utils.py deleted file mode 100644 index 02cbb5b3..00000000 --- a/packages-nextgen/kestrel_core/src/kestrel/utils.py +++ /dev/null @@ -1,77 +0,0 @@ -import collections.abc -from importlib import resources -from kestrel.__future__ import is_python_older_than_minor_version -import os -from pathlib import Path -from pkgutil import get_data -from typeguard import typechecked -from typing import Optional, Mapping, Iterable - - -@typechecked -def load_data_file(package_name: str, file_name: str) -> str: - try: - # resources.files() is introduced in Python 3.9 - content = resources.files(package_name).joinpath(file_name).read_text() - except AttributeError: - # Python 3.8; deprecation warning forward - if is_python_older_than_minor_version(9): - content = get_data(package_name, file_name).decode("utf-8") - - return content - - -@typechecked -def list_folder_files( - package_name: str, - folder_name: str, - prefix: Optional[str] = None, - extension: Optional[str] = None, -) -> Iterable[str]: - # preprocesss extension to add dot it not there - if extension and extension[0] != ".": - extension = "." + extension - try: - file_paths = resources.files(package_name).joinpath(folder_name).iterdir() - except AttributeError: - if is_python_older_than_minor_version(9): - import pkg_resources - - file_names = pkg_resources.resource_listdir(package_name, folder_name) - file_paths = [ - Path( - pkg_resources.resource_filename( - package_name, os.path.join(folder_name, filename) - ) - ) - for filename in file_names - ] - file_list = ( - f - for f in file_paths - if ( - f.is_file() - and (f.name.endswith(extension) if extension else True) - and (f.name.startswith(prefix) if prefix else True) - ) - ) - return file_list - - -@typechecked -def unescape_quoted_string(s: str) -> str: - if s.startswith("r"): - return s[2:-1] - else: - return s[1:-1].encode("utf-8").decode("unicode_escape") - - -@typechecked -def update_nested_dict(dict_old: Mapping, dict_new: Optional[Mapping]) -> Mapping: - if dict_new: - for k, v in dict_new.items(): - if isinstance(v, collections.abc.Mapping) and k in dict_old: - dict_old[k] = update_nested_dict(dict_old[k], v) - else: - dict_old[k] = v - return dict_old diff --git a/packages-nextgen/kestrel_core/tests/__init__.py b/packages-nextgen/kestrel_core/tests/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/packages-nextgen/kestrel_core/tests/test_cache_inmemory.py b/packages-nextgen/kestrel_core/tests/test_cache_inmemory.py deleted file mode 100644 index 1a0bb9ca..00000000 --- a/packages-nextgen/kestrel_core/tests/test_cache_inmemory.py +++ /dev/null @@ -1,121 +0,0 @@ -import pytest -from pandas import DataFrame -from uuid import uuid4 - -from kestrel.cache import InMemoryCache -from kestrel.cache.inmemory import InMemoryCacheVirtual -from kestrel.ir.graph import IRGraph, IRGraphEvaluable -from kestrel.frontend.parser import parse_kestrel - - -def test_inmemory_cache_set_get_del(): - c = InMemoryCache() - idx = uuid4() - df = DataFrame([1, 2, 3]) - c[idx] = df - assert df.equals(c[idx]) - del 
c[idx] - assert idx not in c - - -def test_inmemory_cache_constructor(): - ids = [uuid4() for i in range(5)] - df = DataFrame([1, 2, 3]) - c = InMemoryCache({x:df for x in ids}) - for u in ids: - assert df.equals(c[u]) - for u in ids: - del c[u] - assert u not in c - - -def test_eval_new_filter_disp(): - stmt = """ -proclist = NEW process [ {"name": "cmd.exe", "pid": 123} - , {"name": "explorer.exe", "pid": 99} - , {"name": "firefox.exe", "pid": 201} - , {"name": "chrome.exe", "pid": 205} - ] -browsers = proclist WHERE name = 'firefox.exe' OR name = 'chrome.exe' -DISP browsers ATTR name, pid -""" - graph = IRGraphEvaluable(parse_kestrel(stmt)) - c = InMemoryCache() - mapping = c.evaluate_graph(graph) - - # check the return is correct - rets = graph.get_returns() - assert len(rets) == 1 - df = mapping[rets[0].id] - assert df.to_dict("records") == [ {"name": "firefox.exe", "pid": 201} - , {"name": "chrome.exe", "pid": 205} - ] - # check whether `proclist` is cached - proclist = graph.get_variable("proclist") - assert c[proclist.id].to_dict("records") == [ {"name": "cmd.exe", "pid": 123} - , {"name": "explorer.exe", "pid": 99} - , {"name": "firefox.exe", "pid": 201} - , {"name": "chrome.exe", "pid": 205} - ] - # check whether `browsers` is cached - browsers = graph.get_variable("browsers") - assert c[browsers.id].to_dict("records") == [ {"name": "firefox.exe", "pid": 201} - , {"name": "chrome.exe", "pid": 205} - ] - - -def test_eval_filter_with_ref(): - stmt = """ -proclist = NEW process [ {"name": "cmd.exe", "pid": 123} - , {"name": "explorer.exe", "pid": 99} - , {"name": "firefox.exe", "pid": 201} - , {"name": "chrome.exe", "pid": 205} - ] -browsers = proclist WHERE name = 'firefox.exe' OR name = 'chrome.exe' -specials = proclist WHERE pid IN [123, 201] -p2 = proclist WHERE pid = browsers.pid and name = specials.name -DISP p2 ATTR name, pid -""" - graph = IRGraphEvaluable(parse_kestrel(stmt)) - c = InMemoryCache() - mapping = c.evaluate_graph(graph) - - # check the return is correct - rets = graph.get_returns() - assert len(rets) == 1 - df = mapping[rets[0].id] - assert df.to_dict("records") == [ {"name": "firefox.exe", "pid": 201} ] - -def test_get_virtual_copy(): - stmt = """ -proclist = NEW process [ {"name": "cmd.exe", "pid": 123} - , {"name": "explorer.exe", "pid": 99} - , {"name": "firefox.exe", "pid": 201} - , {"name": "chrome.exe", "pid": 205} - ] -browsers = proclist WHERE name = 'firefox.exe' OR name = 'chrome.exe' -""" - graph = IRGraphEvaluable(parse_kestrel(stmt)) - c = InMemoryCache() - mapping = c.evaluate_graph(graph) - v = c.get_virtual_copy() - new_entry = uuid4() - v[new_entry] = True - - # v[new_entry] calls the right method - assert isinstance(v, InMemoryCacheVirtual) - assert v[new_entry].startswith("virtual") - - # v[new_entry] does not hit v.cache - assert len(c.cache) == 2 - assert len(v.cache) == 2 - - # the two cache_catalog are different - assert new_entry not in c - assert new_entry in v - del v[new_entry] - assert new_entry not in v - for u in c: - del v[u] - assert len(v) == 0 - assert len(c) == 2 diff --git a/packages-nextgen/kestrel_core/tests/test_cache_sqlite.py b/packages-nextgen/kestrel_core/tests/test_cache_sqlite.py deleted file mode 100644 index 5db07fb6..00000000 --- a/packages-nextgen/kestrel_core/tests/test_cache_sqlite.py +++ /dev/null @@ -1,183 +0,0 @@ -from uuid import uuid4 -from pandas import DataFrame - -from kestrel.cache import SqliteCache -from kestrel.cache.sqlite import SqliteCacheVirtual -from kestrel.ir.graph import IRGraphEvaluable 
-from kestrel.frontend.parser import parse_kestrel - - -def test_sqlite_cache_set_get_del(): - c = SqliteCache() - idx = uuid4() - df = DataFrame({'foo': [1, 2, 3]}) - c[idx] = df - assert df.equals(c[idx]) - del c[idx] - assert idx not in c - - -def test_sqlite_cache_constructor(): - ids = [uuid4() for i in range(5)] - df = DataFrame({'foo': [1, 2, 3]}) - c = SqliteCache({x:df for x in ids}) - for u in ids: - assert df.equals(c[u]) - for u in ids: - del c[u] - assert u not in c - - -def test_eval_new_disp(): - stmt = """ -proclist = NEW process [ {"name": "cmd.exe", "pid": 123} - , {"name": "explorer.exe", "pid": 99} - , {"name": "firefox.exe", "pid": 201} - , {"name": "chrome.exe", "pid": 205} - ] -DISP proclist ATTR name -""" - graph = IRGraphEvaluable(parse_kestrel(stmt)) - c = SqliteCache() - mapping = c.evaluate_graph(graph) - - # check the return is correct - rets = graph.get_returns() - assert len(rets) == 1 - df = mapping[rets[0].id] - assert df.to_dict("records") == [ {"name": "cmd.exe"} - , {"name": "explorer.exe"} - , {"name": "firefox.exe"} - , {"name": "chrome.exe"} - ] - - -def test_eval_new_filter_disp(): - stmt = """ -proclist = NEW process [ {"name": "cmd.exe", "pid": 123} - , {"name": "explorer.exe", "pid": 99} - , {"name": "firefox.exe", "pid": 201} - , {"name": "chrome.exe", "pid": 205} - ] -browsers = proclist WHERE name = 'firefox.exe' OR name = 'chrome.exe' -DISP browsers ATTR name, pid -""" - graph = IRGraphEvaluable(parse_kestrel(stmt)) - c = SqliteCache() - mapping = c.evaluate_graph(graph) - - # check the return is correct - rets = graph.get_returns() - assert len(rets) == 1 - df = mapping[rets[0].id] - assert df.to_dict("records") == [ {"name": "firefox.exe", "pid": 201} - , {"name": "chrome.exe", "pid": 205} - ] - - -def test_eval_two_returns(): - stmt = """ -proclist = NEW process [ {"name": "cmd.exe", "pid": 123} - , {"name": "explorer.exe", "pid": 99} - , {"name": "firefox.exe", "pid": 201} - , {"name": "chrome.exe", "pid": 205} - ] -browsers = proclist WHERE name != "cmd.exe" -DISP browsers -DISP browsers ATTR pid -""" - graph = parse_kestrel(stmt) - c = SqliteCache() - rets = graph.get_returns() - - # first DISP - gs = graph.find_dependent_subgraphs_of_node(rets[0], c) - assert len(gs) == 1 - mapping = c.evaluate_graph(gs[0]) - df1 = DataFrame([ {"name": "explorer.exe", "pid": 99} - , {"name": "firefox.exe", "pid": 201} - , {"name": "chrome.exe", "pid": 205} - ]) - assert len(mapping) == 1 - assert df1.equals(mapping[rets[0].id]) - - # second DISP - gs = graph.find_dependent_subgraphs_of_node(rets[1], c) - assert len(gs) == 1 - mapping = c.evaluate_graph(gs[0]) - df2 = DataFrame([ {"pid": 99} - , {"pid": 201} - , {"pid": 205} - ]) - assert len(mapping) == 1 - assert df2.equals(mapping[rets[1].id]) - - -def test_issue_446(): - """The `WHERE name IN ...` below was raising `sqlalchemy.exc.StatementError: (builtins.KeyError) 'name_1'` - https://github.com/opencybersecurityalliance/kestrel-lang/issues/446 - """ - stmt = """ -proclist = NEW process [ {"name": "cmd.exe", "pid": 123} - , {"name": "explorer.exe", "pid": 99} - , {"name": "firefox.exe", "pid": 201} - , {"name": "chrome.exe", "pid": 205} - ] -browsers = proclist WHERE name IN ("explorer.exe", "firefox.exe", "chrome.exe") -""" - graph = IRGraphEvaluable(parse_kestrel(stmt)) - c = SqliteCache() - _ = c.evaluate_graph(graph) - - -def test_eval_filter_with_ref(): - stmt = """ -proclist = NEW process [ {"name": "cmd.exe", "pid": 123} - , {"name": "explorer.exe", "pid": 99} - , {"name": "firefox.exe", 
"pid": 201} - , {"name": "chrome.exe", "pid": 205} - ] -browsers = proclist WHERE name = 'firefox.exe' OR name = 'chrome.exe' -specials = proclist WHERE pid IN [123, 201] -p2 = proclist WHERE pid = browsers.pid and name = specials.name -DISP p2 ATTR name, pid -""" - graph = IRGraphEvaluable(parse_kestrel(stmt)) - c = SqliteCache() - mapping = c.evaluate_graph(graph) - - # check the return is correct - rets = graph.get_returns() - assert len(rets) == 1 - df = mapping[rets[0].id] - assert df.to_dict("records") == [ {"name": "firefox.exe", "pid": 201} ] - -def test_get_virtual_copy(): - stmt = """ -proclist = NEW process [ {"name": "cmd.exe", "pid": 123} - , {"name": "explorer.exe", "pid": 99} - , {"name": "firefox.exe", "pid": 201} - , {"name": "chrome.exe", "pid": 205} - ] -browsers = proclist WHERE name = 'firefox.exe' OR name = 'chrome.exe' -""" - graph = IRGraphEvaluable(parse_kestrel(stmt)) - c = SqliteCache() - mapping = c.evaluate_graph(graph) - v = c.get_virtual_copy() - new_entry = uuid4() - v[new_entry] = True - - # v[new_entry] calls the right method - assert isinstance(v, SqliteCacheVirtual) - assert v[new_entry].endswith("v") - - # the two cache_catalog are different - assert new_entry not in c - assert new_entry in v - del v[new_entry] - assert new_entry not in v - for u in c: - del v[u] - assert len(v) == 0 - assert len(c) == 1 diff --git a/packages-nextgen/kestrel_core/tests/test_config.py b/packages-nextgen/kestrel_core/tests/test_config.py deleted file mode 100644 index 2fcec65a..00000000 --- a/packages-nextgen/kestrel_core/tests/test_config.py +++ /dev/null @@ -1,60 +0,0 @@ -import kestrel.config.utils as cfg -import os - - -def test_env_vars_in_config(): - - test_config = """--- -credentials: - username: $TEST_USER - password: $TEST_PASSWORD - """ - os.environ["TEST_USER"] = "test-user" - os.environ["TEST_PASSWORD"] = "test-password" - os.environ["KESTREL_CONFIG"] = os.path.join(os.sep, "tmp", "config.yaml") - - with open(os.getenv("KESTREL_CONFIG"), "w") as fp: - fp.write(test_config) - config = cfg.load_config() - assert config["credentials"]["username"] == "test-user" - assert config["credentials"]["password"] == "test-password" - - -def test_env_vars_in_config_overwrite(): - - test_config = """--- -credentials: - username: ${TEST_USER} - password: ${TEST_PASSWORD} -debug: - cache_directory_prefix: $KESTREL_CACHE_DIRECTORY_PREFIX - """ - os.environ["TEST_USER"] = "test-user" - os.environ["TEST_PASSWORD"] = "test-password" - os.environ["KESTREL_CONFIG"] = os.path.join(os.sep, "tmp", "config.yaml") - os.environ["KESTREL_CACHE_DIRECTORY_PREFIX"] = "Kestrel2.0-" - with open(os.getenv("KESTREL_CONFIG"), "w") as fp: - fp.write(test_config) - config = cfg.load_config() - assert config["credentials"]["username"] == "test-user" - assert config["credentials"]["password"] == "test-password" - assert config["debug"]["cache_directory_prefix"] == "Kestrel2.0-" - -def test_empty_env_var_in_config(): - test_config = """--- -credentials: - username: ${TEST_USER} - password: ${TEST_PASSWORD} -debug: - cache_directory_prefix: $I_DONT_EXIST - """ - os.environ["TEST_USER"] = "test-user" - os.environ["TEST_PASSWORD"] = "test-password" - os.environ["KESTREL_CONFIG"] = os.path.join(os.sep, "tmp", "config.yaml") - os.environ["KESTREL_CACHE_DIRECTORY_PREFIX"] = "Kestrel2.0-" - with open(os.getenv("KESTREL_CONFIG"), "w") as fp: - fp.write(test_config) - config = cfg.load_config() - assert config["credentials"]["username"] == "test-user" - assert config["credentials"]["password"] == 
"test-password" - assert config["debug"]["cache_directory_prefix"] == "$I_DONT_EXIST" \ No newline at end of file diff --git a/packages-nextgen/kestrel_core/tests/test_interface_datasource_codegen_dataframe.py b/packages-nextgen/kestrel_core/tests/test_interface_datasource_codegen_dataframe.py deleted file mode 100644 index 4f9f7507..00000000 --- a/packages-nextgen/kestrel_core/tests/test_interface_datasource_codegen_dataframe.py +++ /dev/null @@ -1,107 +0,0 @@ -import pytest -from pandas import DataFrame - -from kestrel.interface.codegen.dataframe import ( - evaluate_source_instruction, - evaluate_transforming_instruction, -) - -from kestrel.ir.instructions import ( - Construct, - Variable, - Filter, - Limit, - ProjectAttrs, -) - -from kestrel.frontend.parser import parse_kestrel - - -def test_evaluate_Construct(): - data = [ {"name": "cmd.exe", "pid": 123} - , {"name": "explorer.exe", "pid": 99} - , {"name": "firefox.exe", "pid": 201} - , {"name": "chrome.exe", "pid": 205} - ] - ins = Construct(data) - df = evaluate_source_instruction(ins) - assert df.equals(DataFrame(data)) - - -def test_non_exist_eval(): - with pytest.raises(NotImplementedError): - evaluate_transforming_instruction(Variable("asdf"), DataFrame()) - - -def test_evaluate_Limit(): - data = [ {"name": "cmd.exe", "pid": 123} - , {"name": "explorer.exe", "pid": 99} - , {"name": "firefox.exe", "pid": 201} - , {"name": "chrome.exe", "pid": 205} - ] - df = DataFrame(data) - dfx = evaluate_transforming_instruction(Limit(2), df) - assert dfx.equals(df.head(2)) - - -def test_evaluate_ProjectAttrs(): - data = [ {"name": "cmd.exe", "pid": 123} - , {"name": "explorer.exe", "pid": 99} - , {"name": "firefox.exe", "pid": 201} - , {"name": "chrome.exe", "pid": 205} - ] - df = DataFrame(data) - dfx = evaluate_transforming_instruction(ProjectAttrs(["name"]), df) - assert dfx.equals(df[["name"]]) - - -def test_evaluate_Construct_Filter_ProjectAttrs(): - stmt = r""" -proclist = NEW process [ {"name": "cmd.exe", "pid": 123} - , {"name": "explorer.exe", "pid": 99} - , {"name": "firefox.exe", "pid": 201} - , {"name": "chrome.exe", "pid": 205} - ] -browsers = proclist WHERE name = 'firefox.exe' OR name = 'chrome.exe' -DISP browsers ATTR name, pid -p2 = proclist WHERE pid > 100 -p3 = proclist WHERE name LIKE "c%.exe" -p4 = proclist WHERE name MATCHES r"^c\w{2}\.exe" -""" - graph = parse_kestrel(stmt) - c = graph.get_nodes_by_type(Construct)[0] - df0 = evaluate_source_instruction(c) - assert df0.to_dict("records") == [ {"name": "cmd.exe", "pid": 123} - , {"name": "explorer.exe", "pid": 99} - , {"name": "firefox.exe", "pid": 201} - , {"name": "chrome.exe", "pid": 205} - ] - - browsers = graph.get_variable("browsers") - ft = next(graph.predecessors(browsers)) - dfx = evaluate_transforming_instruction(ft, df0) - assert dfx.to_dict("records") == [ {"name": "firefox.exe", "pid": 201} - , {"name": "chrome.exe", "pid": 205} - ] - proj = next(graph.successors(browsers)) - dfy = evaluate_transforming_instruction(proj, dfx) - assert dfx.to_dict("records") == [ {"name": "firefox.exe", "pid": 201} - , {"name": "chrome.exe", "pid": 205} - ] - - ft = next(graph.predecessors(graph.get_variable("p2"))) - dfx = evaluate_transforming_instruction(ft, df0) - assert dfx.to_dict("records") == [ {"name": "cmd.exe", "pid": 123} - , {"name": "firefox.exe", "pid": 201} - , {"name": "chrome.exe", "pid": 205} - ] - - ft = next(graph.predecessors(graph.get_variable("p3"))) - dfx = evaluate_transforming_instruction(ft, df0) - assert dfx.to_dict("records") == [ {"name": 
"cmd.exe", "pid": 123} - , {"name": "chrome.exe", "pid": 205} - ] - - ft = next(graph.predecessors(graph.get_variable("p4"))) - dfx = evaluate_transforming_instruction(ft, df0) - assert dfx.to_dict("records") == [ {"name": "cmd.exe", "pid": 123} ] diff --git a/packages-nextgen/kestrel_core/tests/test_interface_datasource_codegen_sql.py b/packages-nextgen/kestrel_core/tests/test_interface_datasource_codegen_sql.py deleted file mode 100644 index 1cc3c46c..00000000 --- a/packages-nextgen/kestrel_core/tests/test_interface_datasource_codegen_sql.py +++ /dev/null @@ -1,82 +0,0 @@ -from datetime import datetime -from dateutil import parser - -from kestrel.interface.codegen.sql import SqlTranslator -from kestrel.ir.filter import ( - BoolExp, - ExpOp, - FComparison, - IntComparison, - ListOp, - ListComparison, - MultiComp, - NumCompOp, - StrCompOp, - StrComparison, - TimeRange, -) -from kestrel.ir.instructions import ( - DataSource, - Filter, - Limit, - Offset, - ProjectAttrs, - ProjectEntity, - Sort, -) - -# Use sqlite3 for testing -import sqlalchemy - -import pytest - - -def _dt(timestr: str) -> datetime: - return parser.parse(timestr) - - -def _time2string(ts: datetime) -> str: - return ts.strftime('%Y-%m-%dT%H:%M:%S.%f') - - -def _remove_nl(s): - return s.replace('\n', '') - - -@pytest.mark.parametrize( - "iseq, sql", [ - # Try a simple filter - ([Filter(IntComparison('foo', NumCompOp.GE, 0))], - "SELECT * FROM my_table WHERE foo >= ?"), - # Try a simple filter with sorting - ([Filter(IntComparison('foo', NumCompOp.GE, 0)), Sort('bar')], - "SELECT * FROM my_table WHERE foo >= ? ORDER BY bar DESC"), - # Simple filter plus time range - ([Filter(IntComparison('foo', NumCompOp.GE, 0), timerange=TimeRange(_dt('2023-12-06T08:17:00Z'), _dt('2023-12-07T08:17:00Z')))], - "SELECT * FROM my_table WHERE foo >= ? AND timestamp >= ? AND timestamp < ?"), - # sqlalchemy's sqlite dialect seems to always add the offset - ([Limit(3), ProjectAttrs(['foo', 'bar', 'baz']), Filter(StrComparison('foo', StrCompOp.EQ, 'abc'))], - "SELECT foo, bar, baz FROM my_table WHERE foo = ? LIMIT ? OFFSET ?"), - # Same as above but reverse order - ([Filter(StrComparison('foo', StrCompOp.EQ, 'abc')), ProjectAttrs(['foo', 'bar', 'baz']), Limit(3)], - "SELECT foo, bar, baz FROM my_table WHERE foo = ? LIMIT ? OFFSET ?"), - ([Filter(ListComparison('foo', ListOp.NIN, ['abc', 'def']))], - "SELECT * FROM my_table WHERE (foo NOT IN (__[POSTCOMPILE_foo_1]))"), # POSTCOMPILE is some SQLAlchemy-ism - ([Filter(StrComparison('foo', StrCompOp.MATCHES, '.*abc.*'))], - "SELECT * FROM my_table WHERE foo REGEXP ?"), - ([Filter(StrComparison('foo', StrCompOp.NMATCHES, '.*abc.*'))], - "SELECT * FROM my_table WHERE foo NOT REGEXP ?"), - ([Filter(MultiComp(ExpOp.OR, [IntComparison('foo', NumCompOp.EQ, 1), IntComparison('bar', NumCompOp.EQ, 1)]))], - "SELECT * FROM my_table WHERE foo = ? OR bar = ?"), - ([Filter(MultiComp(ExpOp.AND, [IntComparison('foo', NumCompOp.EQ, 1), IntComparison('bar', NumCompOp.EQ, 1)]))], - "SELECT * FROM my_table WHERE foo = ? AND bar = ?"), - ([Limit(1000), Offset(2000)], - "SELECT * FROM my_table LIMIT ? 
OFFSET ?"), - ] -) -def test_sql_translator(iseq, sql): - trans = SqlTranslator(sqlalchemy.dialects.sqlite.dialect(), _time2string, "timestamp", sqlalchemy.table("my_table")) - for i in iseq: - trans.add_instruction(i) - result = trans.result() - assert _remove_nl(str(result)) == sql diff --git a/packages-nextgen/kestrel_core/tests/test_ir_filter.py b/packages-nextgen/kestrel_core/tests/test_ir_filter.py deleted file mode 100644 index 1e248df8..00000000 --- a/packages-nextgen/kestrel_core/tests/test_ir_filter.py +++ /dev/null @@ -1,144 +0,0 @@ -import json - -from kestrel.frontend.parser import parse_kestrel -from kestrel.ir.filter import ( - IntComparison, FloatComparison, StrComparison, ListComparison, - RefComparison, ReferenceValue, ListOp, NumCompOp, StrCompOp, ExpOp, - BoolExp, MultiComp, get_references_from_exp, resolve_reference_with_function, -) -from kestrel.ir.instructions import ( - Filter, - instruction_from_json, -) - -import pytest - - -@pytest.mark.parametrize( - "field, op, value", [ - ("foo", StrCompOp.EQ, "bar"), - ("foo", NumCompOp.EQ, 42), - ("foo", NumCompOp.EQ, 3.14), - ("foo", StrCompOp.NEQ, "bar"), - ("foo", NumCompOp.NEQ, 42), - ("foo", NumCompOp.NEQ, 3.14), - ("foo", StrCompOp.LIKE, "%bar"), - ("foo", StrCompOp.NLIKE, "%bar"), - ] -) -def test_comparison(field, op, value): - if isinstance(value, int): - comp = IntComparison(field=field, op=op, value=value) - elif isinstance(value, float): - comp = FloatComparison(field=field, op=op, value=value) - else: - comp = StrComparison(field=field, op=op, value=value) - assert comp.field == field - assert comp.op == op - assert comp.value == value - json_data: str = comp.to_json() - data: dict = json.loads(json_data) - assert data["field"] == field - assert data["op"] == op - assert data["value"] == value - if isinstance(value, int): - comp2 = IntComparison.from_json(json_data) - elif isinstance(value, float): - comp2 = FloatComparison.from_json(json_data) - else: - comp2 = StrComparison.from_json(json_data) - assert comp == comp2 - - -@pytest.mark.parametrize( - "field, op, value", [ - ("foo", ListOp.IN, ["a", "b", "c"]), - ("foo", ListOp.NIN, ["a", "b", "c"]), - ("foo", ListOp.IN, [1, 2, 3]), - ("foo", ListOp.NIN, [1, 2, 3]), - ] -) -def test_list_comparison(field, op, value): - comp = ListComparison(field=field, op=op, value=value) - assert comp.field == field - assert comp.op == op - assert comp.value == value - json_data: str = comp.to_json() - data: dict = json.loads(json_data) - assert data["field"] == field - assert data["op"] == op - assert data["value"] == value - comp2 = ListComparison.from_json(json_data) - assert comp == comp2 - - - - -def test_multi_comparison(): - comp1 = StrComparison("foo", StrCompOp.EQ, "X") - comp2 = StrComparison("bar", StrCompOp.EQ, "Y") - comp3 = StrComparison("baz", StrCompOp.EQ, "Z") - mcomp = MultiComp(ExpOp.OR, [comp1, comp2, comp3]) - data = mcomp.to_json() - mcomp2 = MultiComp.from_json(data) - assert mcomp == mcomp2 - - -@pytest.mark.parametrize( - "lhs, op, rhs", [ - (StrComparison("foo", StrCompOp.EQ, "bar"), ExpOp.AND, IntComparison("baz", NumCompOp.EQ, 42)), - (StrComparison("foo", StrCompOp.LIKE, "%bar%"), ExpOp.OR, IntComparison("baz", NumCompOp.LE, 42)), - (IntComparison("baz", NumCompOp.GE, 42), ExpOp.AND, StrComparison("foo", StrCompOp.NEQ, "bar")), - (IntComparison("baz", NumCompOp.NEQ, 42), ExpOp.OR, StrComparison("foo", StrCompOp.EQ, "bar")), - (StrComparison("foo", StrCompOp.EQ, "bar"), ExpOp.AND, ListComparison("baz", ListOp.IN, ["a", "b", "c"])), - 
(StrComparison("foo", StrCompOp.EQ, "bar"), ExpOp.OR, ListComparison("baz", ListOp.IN, [1, 2, 3])), - (ListComparison("baz", ListOp.IN, ["a", "b", "c"]), ExpOp.AND, StrComparison("foo", StrCompOp.EQ, "bar")), - (ListComparison("baz", ListOp.IN, [1, 2, 3]), ExpOp.OR, StrComparison("foo", StrCompOp.EQ, "bar")), - (StrComparison("foo", StrCompOp.EQ, "X"), ExpOp.AND, - MultiComp(ExpOp.OR, [StrComparison("bar", StrCompOp.EQ, "A"), StrComparison("baz", StrCompOp.EQ, "B")])), - ] -) -def test_bool_exp(lhs, op, rhs): - exp = BoolExp(lhs, op, rhs) - data = exp.to_json() - exp2 = BoolExp.from_json(data) - assert exp == exp2 - - # Also test Filter - filt = Filter(exp) - data = filt.to_json() - filt2 = instruction_from_json(data) - assert filt == filt2 - - -def test_filter_compound_exp(): - comp1 = StrComparison("foo", StrCompOp.EQ, "bar") - comp2 = IntComparison("baz", NumCompOp.EQ, 42) - exp1 = BoolExp(comp1, ExpOp.AND, comp2) - comp3 = StrComparison("thing1", StrCompOp.NEQ, "abc") - comp4 = ListComparison("thing2", ListOp.IN, [1, 2, 3]) - exp2 = BoolExp(comp3, ExpOp.OR, comp4) - exp3 = BoolExp(exp1, ExpOp.AND, exp2) - filt = Filter(exp3) - data = filt.to_json() - filt2 = instruction_from_json(data) - assert filt == filt2 - - -def test_filter_with_reference(): - stmt = "x = y WHERE foo = 'bar' OR baz = z.baz" - graph = parse_kestrel(stmt) - filter_nodes = graph.get_nodes_by_type(Filter) - exp = filter_nodes[0].exp - exp_dict = exp.to_dict() - assert exp_dict == {'lhs': {'field': 'foo', 'op': '=', 'value': 'bar'}, 'op': 'OR', 'rhs': {'field': 'baz', 'op': 'IN', 'value': {'reference': 'z', 'attribute': 'baz'}}} - - -def test_fill_references_in_exp(): - lhs = StrComparison("foo", StrCompOp.EQ, "bar") - rhs = RefComparison("baz", "=", ReferenceValue("var", "attr")) - exp = BoolExp(lhs, ExpOp.AND, rhs) - rs = get_references_from_exp(exp) - assert len(list(rs)) == 1 - resolve_reference_with_function(exp, lambda x: 5) - assert exp.rhs.value == 5 diff --git a/packages-nextgen/kestrel_core/tests/test_ir_graph.py b/packages-nextgen/kestrel_core/tests/test_ir_graph.py deleted file mode 100644 index cd77da7d..00000000 --- a/packages-nextgen/kestrel_core/tests/test_ir_graph.py +++ /dev/null @@ -1,406 +0,0 @@ -import pytest -import networkx.utils -from collections import Counter -from pandas import DataFrame - -from kestrel.ir.instructions import ( - Variable, - DataSource, - Reference, - Return, - Filter, - Construct, - ProjectAttrs, - ProjectEntity, - Instruction, - TransformingInstruction, - CACHE_INTERFACE_IDENTIFIER, -) -from kestrel.ir.filter import StrComparison, StrCompOp -from kestrel.ir.graph import IRGraph, IRGraphSimpleQuery -from kestrel.frontend.parser import parse_kestrel -from kestrel.cache import InMemoryCache - - -def test_add_get_datasource(): - g = IRGraph() - g.add_datasource("stixshifter://abc") - - s = g.add_datasource(DataSource("stixshifter://abc")) - assert len(g) == 1 - - s2 = DataSource("stixshifter://abcd") - g.add_datasource(s2) - assert len(g) == 2 - - assert set(g.get_datasources()) == {s, s2} - g.get_datasource("stixshifter", "abc") == s - - -def test_add_same_node(): - g = IRGraph() - n = Instruction() - s = g.add_node(n) - s = g.add_node(n) - assert len(g) == 1 - - -def test_get_node_by_id(): - g = IRGraph() - n = Instruction() - s = g.add_node(n) - assert g.get_node_by_id(n.id) == n - - -def test_get_nodes_by_type_and_attributes(): - g = IRGraph() - s = g.add_datasource("stixshifter://abc") - v1 = g.add_variable("asdf", s) - v2 = g.add_variable("qwer", s) - v3 = 
g.add_variable("123", s) - ns = g.get_nodes_by_type_and_attributes(Variable, {"name": "asdf"}) - assert ns == [v1] - - -def test_get_returns(): - g = IRGraph() - s = g.add_datasource("stixshifter://abc") - g.add_return(s) - g.add_return(s) - g.add_return(s) - rets = g.get_returns() - assert len(rets) == 3 - assert [ret.sequence for ret in rets] == [0, 1, 2] - assert len(g.get_sink_nodes()) == 3 - - -def test_add_variable(): - g = IRGraph() - s = g.add_datasource("stixshifter://abc") - v1 = g.add_variable("asdf", s) - assert len(g) == 2 - assert len(g.edges()) == 1 - - v2 = g.add_variable("asdf", s) - assert len(g) == 3 - assert len(g.edges()) == 2 - - v = Variable("asdf") - v3 = g.add_variable(v, s) - assert v == v3 - v4 = g.add_variable(v, s) - assert v3 == v4 - - assert v1.version == 0 - assert v2.version == 1 - assert v3.version == 2 - assert len(g) == 4 - assert len(g.edges()) == 3 - - -def test_get_variables(): - g = IRGraph() - s = g.add_datasource("stixshifter://abc") - v1 = g.add_variable("asdf", s) - v2 = g.add_variable("asdf", s) - v3 = g.add_variable("asdf", s) - vs = g.get_variables() - assert len(vs) == 1 - assert vs[0].name == "asdf" - - -def test_add_get_reference(): - g = IRGraph() - s = g.add_node(DataSource("ss://ee")) - g.add_node(Variable("asdf"), s) - g.add_node(Reference("asdf")) - q1 = g.add_node(Reference("qwer")) - q2 = g.add_node(Reference("qwer")) - g.add_node(Variable("qwer"), s) - g.add_node(Reference("qwer")) - assert len(g) == 4 - assert len(g.edges()) == 2 - - assert q1 == q2 - assert g.get_reference("qwer") == q1 - refs = g.get_references() - assert refs == [q1] - - -def test_copy_graph(): - g = IRGraph() - s = g.add_datasource("stixshifter://abc") - g2 = g.copy() - assert s in g2 - for n in g2.nodes(): - n.datasource = "eee" - assert s in g - assert s.datasource == "eee" - - -def test_deepcopy_graph(): - g = IRGraph() - s = g.add_datasource("stixshifter://abc") - g2 = g.deepcopy() - assert len(g2.nodes()) == 1 - s2 = list(g2.nodes())[0] - s2.datasource = "eee" - assert s.datasource == "abc" - assert s2.datasource == "eee" - - -def test_update_graph(): - g = IRGraph() - s = g.add_datasource("stixshifter://abc") - v1 = g.add_variable("asdf", s) - v2 = g.add_variable("asdf", s) - v3 = g.add_variable("asdf", s) - r1 = g.add_return(v3) - - g2 = IRGraph() - s2 = g2.add_datasource("stixshifter://abc") - v4 = g2.add_variable("asdf", g2.add_node(Reference("asdf"))) - v5 = g2.add_variable("asdf", g2.add_node(TransformingInstruction(), s2)) - r2 = g2.add_return(v5) - - assert v1.version == 0 - assert v2.version == 1 - assert v3.version == 2 - assert v4.version == 0 - assert v5.version == 1 - assert r1.sequence == 0 - assert r2.sequence == 0 - assert len(g) == 5 - assert len(g2) == 6 - - g.update(g2) - assert v1.version == 0 - assert v2.version == 1 - assert v3.version == 2 - assert v4.version == 3 - assert v5.version == 4 - assert r1.sequence == 0 - assert r2.sequence == 1 - assert len(g) == 9 - assert s2 not in g - assert r1 in g - assert r2 in g - assert not g.get_references() - assert (v3, v4) in g.edges() - assert g.in_degree(v4) == 1 - assert g.out_degree(v4) == 0 - - -def test_serialization_deserialization(): - g1 = IRGraph() - s = g1.add_node(DataSource("ss://ee")) - r = g1.add_node(Reference("asdf")) - v = g1.add_node(Variable("asdf"), s) - j = g1.to_json() - g2 = IRGraph(j) - assert s in g2.nodes() - assert v in g2.nodes() - assert len(g2) == 3 - assert g2.edges() == {(s,v)} - - -def test_find_cached_dependent_subgraph_of_node(): - g = IRGraph() - - a1 = 
g.add_node(DataSource("ss://ee")) - a2 = g.add_node(Variable("asdf"), a1) - a3 = g.add_node(Instruction()) - g.add_edge(a2, a3) - a4 = g.add_node(Variable("qwer"), a3) - - b1 = g.add_node(DataSource("ss://eee")) - b2 = g.add_node(Variable("asdfe"), b1) - b3 = g.add_node(Instruction()) - g.add_edge(b2, b3) - b4 = g.add_node(Variable("qwere"), b3) - - c1 = g.add_node(Instruction()) - g.add_edge(a4, c1) - g.add_edge(b4, c1) - c2 = g.add_node(Variable("zxcv"), c1) - - g2 = g.find_cached_dependent_subgraph_of_node(c2, InMemoryCache()) - assert networkx.utils.graphs_equal(g, g2) - - g3 = g.find_cached_dependent_subgraph_of_node(c2, InMemoryCache({a2.id: DataFrame(), b2.id: DataFrame()})) - g.remove_node(a1) - g.remove_node(b1) - assert networkx.utils.graphs_equal(g, g3) - - -def test_find_dependent_subgraphs_of_node_just_cache(): - huntflow = """ -p1 = NEW process [ {"name": "cmd.exe", "pid": 123} - , {"name": "explorer.exe", "pid": 99} - , {"name": "firefox.exe", "pid": 201} - , {"name": "chrome.exe", "pid": 205} - ] - -browsers = p1 WHERE name = 'firefox.exe' OR name = 'chrome.exe' - -DISP browsers ATTR name -""" - graph = parse_kestrel(huntflow) - c = InMemoryCache() - ret = graph.get_returns()[0] - gs = graph.find_dependent_subgraphs_of_node(ret, c) - assert len(gs) == 1 - assert len(gs[0]) == 6 - assert Counter(map(type, gs[0].nodes())) == Counter([Filter, Variable, Variable, Construct, ProjectAttrs, Return]) - assert gs[0].interface == CACHE_INTERFACE_IDENTIFIER - - -def test_get_trunk_n_branches_filter(): - stmt = "y = x WHERE name = z.name AND pid = w.pid" - graph = parse_kestrel(stmt) - trunk, r2n = graph.get_trunk_n_branches(graph.get_nodes_by_type(Filter)[0]) - assert trunk.name == "x" - for r,n in r2n.items(): - assert next(graph.predecessors(n)).name == r.reference - - -def test_get_trunk_n_branches_variable(): - huntflow = """ -p1 = NEW process [ {"name": "cmd.exe", "pid": 123} - , {"name": "explorer.exe", "pid": 99} - , {"name": "firefox.exe", "pid": 201} - , {"name": "chrome.exe", "pid": 205} - ] -""" - graph = parse_kestrel(huntflow) - trunk, r2n = graph.get_trunk_n_branches(graph.get_variable("p1")) - assert isinstance(trunk, Construct) - assert r2n == {} - - -def test_find_dependent_subgraphs_of_node(): - huntflow = """ -p1 = NEW process [ {"name": "cmd.exe", "pid": 123} - , {"name": "explorer.exe", "pid": 99} - , {"name": "firefox.exe", "pid": 201} - , {"name": "chrome.exe", "pid": 205} - ] - -browsers = p1 WHERE name = 'firefox.exe' OR name = 'chrome.exe' - -p2 = GET process FROM elastic://edr1 - WHERE name = "cmd.exe" - LAST 5 DAYS - -p21 = p2 WHERE parent.name = "winword.exe" - -p3 = GET process FROM stixshifter://edr2 - WHERE parent_ref.name = "powershell.exe" - LAST 24 HOURS - -p31 = p3 WHERE parent.name = "excel.exe" - -p4 = p21 WHERE pid = p1.pid -p5 = GET process FROM stixshifter://edr5 WHERE pid = p4.pid - -DISP p5 ATTR pid, name, cmd_line -""" - graph = parse_kestrel(huntflow) - - p1 = graph.get_variable("p1") - p2 = graph.get_variable("p2") - p3 = graph.get_variable("p3") - p21 = graph.get_variable("p21") - p31 = graph.get_variable("p31") - p4 = graph.get_variable("p4") - p5 = graph.get_variable("p5") - ret = graph.get_returns()[0] - - c = InMemoryCache() - gs = graph.find_dependent_subgraphs_of_node(ret, c) - assert len(gs) == 2 - p1_projattr = [n for n in graph.successors(p1) if isinstance(n, ProjectAttrs)][0] - assert len(gs[0]) == 3 - assert set(map(type, gs[0].nodes())) == {Variable, ProjectAttrs, Construct} - assert p1_projattr == 
gs[0].get_nodes_by_type(ProjectAttrs)[0] - assert len(gs[1]) == 6 - assert Counter(map(type, gs[1].nodes())) == Counter([Filter, Filter, Variable, Variable, ProjectEntity, DataSource]) - - c.evaluate_graph(gs[0]) - assert p1_projattr.id in c - assert p1.id in c - assert len(c) == 2 - gs = graph.find_dependent_subgraphs_of_node(ret, c) - assert len(gs) == 1 - assert len(gs[0]) == 11 - assert p2 in gs[0] - assert p21 in gs[0] - assert p4 in gs[0] - assert Counter(map(type, gs[0].nodes())) == Counter([Filter, Filter, Filter, Variable, Variable, Variable, Variable, ProjectEntity, DataSource, ProjectAttrs, ProjectAttrs]) - - p4_projattr = next(graph.successors(p4)) - c[p4_projattr.id] = DataFrame() - gs = graph.find_dependent_subgraphs_of_node(ret, c) - assert len(gs) == 1 - assert len(gs[0]) == 8 - assert p4_projattr.id in c - assert p4_projattr in gs[0] - assert p5 in gs[0] - assert ret in gs[0] - assert Counter(map(type, gs[0].nodes())) == Counter([Filter, Return, Variable, Variable, ProjectEntity, DataSource, ProjectAttrs, ProjectAttrs]) - - -def test_find_simple_query_subgraphs(): - huntflow = """ -p1 = GET process FROM elastic://edr1 - WHERE name = "cmd.exe" - LAST 5 DAYS - -p2 = GET process FROM elastic://edr1 - WHERE pid = 999 - LAST 30 MINUTES - -p3 = p1 WHERE pid = p2.pid - -p4 = GET process FROM elastic://edr2 WHERE name = p3.name - -DISP p4 -""" - graph = parse_kestrel(huntflow) - c = InMemoryCache() - gs = graph.find_dependent_subgraphs_of_node(graph.get_returns()[0], c) - assert len(gs) == 1 - assert networkx.utils.graphs_equal(graph, gs[0]) - - vs = set(["p1", "p2"]) - for g in gs[0].find_simple_query_subgraphs(c): - assert isinstance(g, IRGraphSimpleQuery) - assert Counter(map(type, g.nodes())) == Counter([Variable, Filter, ProjectEntity, DataSource]) - assert len(g.edges()) == 3 - varname = g.get_variables()[0].name - assert varname in vs - vs.remove(varname) - assert vs == set() - - p1 = gs[0].get_variable("p1") - c[p1.id] = DataFrame() - p2 = gs[0].get_variable("p2") - c[p2.id] = DataFrame() - - gs = graph.find_dependent_subgraphs_of_node(graph.get_returns()[0], c) - # just a dep graph in cache - assert len(gs) == 1 - assert Counter(map(type, gs[0].nodes())) == Counter([Variable, Variable, Filter, ProjectAttrs, ProjectAttrs, Variable]) - sinks = gs[0].get_sink_nodes() - assert len(sinks) == 1 - sink = sinks[0] - assert isinstance(sink, ProjectAttrs) and sink.attrs == ['name'] - c[sink.id] = DataFrame() - - gs = graph.find_dependent_subgraphs_of_node(graph.get_returns()[0], c) - assert len(gs) == 1 - assert sink in gs[0] - assert Counter(map(type, gs[0].nodes())) == Counter([Variable, Filter, ProjectAttrs, DataSource, Return, ProjectEntity, Variable]) - for g in gs[0].find_simple_query_subgraphs(c): - assert Counter(map(type, g.nodes())) == Counter([ProjectAttrs, Variable, Filter, ProjectEntity, DataSource]) - assert sink in g diff --git a/packages-nextgen/kestrel_core/tests/test_ir_instructions.py b/packages-nextgen/kestrel_core/tests/test_ir_instructions.py deleted file mode 100644 index f9a32410..00000000 --- a/packages-nextgen/kestrel_core/tests/test_ir_instructions.py +++ /dev/null @@ -1,103 +0,0 @@ -import pytest - -from kestrel.ir.instructions import ( - Variable, - DataSource, - Construct, - get_instruction_class, - instruction_from_dict, - instruction_from_json, - CACHE_INTERFACE_IDENTIFIER, -) -from kestrel.exceptions import ( - InvalidSeralizedInstruction, - InvalidDataSource, -) - - -def test_instruction_post_init(): - v = Variable("asdf") - j = v.to_dict() - 
assert "id" in j - assert "instruction" in j - assert j["instruction"] == "Variable" - - -def test_stable_id(): - v = Variable("asdf") - _id = v.id - v.name = "qwer" - assert v.id == _id - - -def test_stable_hash(): - s = DataSource("stixshifter://abc") - h1 = hash(s) - s.datasource = "abcd" - h2 = hash(s) - assert h1 == h2 - - -def test_eq(): - s1 = DataSource("stixshifter://abc") - s2 = DataSource("stixshifter://abc") - s3 = instruction_from_dict(s1.to_dict()) - assert s1 != s2 - assert s1 == s3 - - -def test_get_instruction_class(): - cls = get_instruction_class("Variable") - v = cls("asdf") - assert cls == Variable - assert isinstance(v, Variable) - - -def test_add_source(): - s = DataSource("stixshifter://abc") - j = s.to_dict() - assert j["interface"] == "stixshifter" - assert j["datasource"] == "abc" - assert "id" in j - assert "instruction" in j - assert "uri" not in j - assert "default_interface" not in j - - x = DataSource("abc", "stixshifter") - assert x.interface == "stixshifter" - assert x.datasource == "abc" - - with pytest.raises(InvalidDataSource): - DataSource("sss://eee://ccc") - - with pytest.raises(InvalidDataSource): - DataSource("sss") - - -def test_construct(): - data = [ {"name": "cmd.exe", "pid": 123} - , {"name": "explorer.exe", "pid": 99} - , {"name": "firefox.exe", "pid": 201} - , {"name": "chrome.exe", "pid": 205} - ] - c = Construct(data) - assert c.data == data - assert c.interface == CACHE_INTERFACE_IDENTIFIER - - -def test_instruction_from_dict(): - v = Variable("asdf") - d = v.to_dict() - w = instruction_from_dict(d) - assert w == v - - del d["id"] - with pytest.raises(InvalidSeralizedInstruction): - instruction_from_dict(d) - - -def test_instruction_from_json(): - v = Variable("asdf") - j = v.to_json() - w = instruction_from_json(j) - assert w == v diff --git a/packages-nextgen/kestrel_core/tests/test_mapping_data_model.py b/packages-nextgen/kestrel_core/tests/test_mapping_data_model.py deleted file mode 100644 index 93abe83e..00000000 --- a/packages-nextgen/kestrel_core/tests/test_mapping_data_model.py +++ /dev/null @@ -1,200 +0,0 @@ -import pytest - -import pandas as pd - -from kestrel.mapping.data_model import ( - load_default_mapping, - reverse_mapping, - translate_comparison_to_native, - translate_comparison_to_ocsf, - translate_dataframe, - translate_projection_to_native, -) - - -# A "custom" mapping for an opensearch/elasticsearch datasource. -# This mapping works with data from Blue Team Village's 2023 DefCon CTF, for example. 
-WINLOGBEAT_MAPPING = { - "file": { - "path": "file.path", - "name": "file.name" - }, - "process": { - "cmd_line": "winlog.event_data.CommandLine", - "pid": { - "native_field": "winlog.event_data.ProcessId", - "native_value": "to_str", - "ocsf_value": "to_int" - }, - "uid": "winlog.event_data.ProcessGuid", - "file": { - "path": "winlog.event_data.Image", - "name": [ - { - "native_field": "winlog.event_data.Image", - "native_op": "LIKE", - "native_value": "endswith", - "ocsf_value": "basename" - } - ], - "parent_folder": [ - { - "native_field": "winlog.event_data.Image", - "native_op": "LIKE", - "native_value": "startswith", - "ocsf_value": "dirname" - } - ] - }, - "parent_process": { - "cmd_line": "winlog.event_data.ParentCommandLine", - "pid": "winlog.event_data.ParentProcessId", - "uid": "winlog.event_data.ParentProcessGuid", - "file": { - "path": "winlog.event_data.ParentImage", - "name": [ - { - "native_field": "winlog.event_data.ParentImage", - "native_op": "LIKE", - "native_value": "endswith", - "ocsf_value": "basename" - } - ], - "parent_folder": [ - { - "native_field": "winlog.event_data.ParentImage", - "native_op": "LIKE", - "native_value": "startswith", - "ocsf_value": "dirname" - } - ] - } - } - }, - "dst_endpoint": { - "ip": "winlog.event_data.DestinationIp", - "port": "winlog.event_data.DestinationPort" - }, - "src_endpoint": { - "ip": "winlog.event_data.SourceIp", - "port": "winlog.event_data.SourcePort" - } -} - - -# Simplified subset of the standard mapping -STIX_MAPPING = { - "device": { - "ip": "ipv4-addr:value" - }, - "endpoint": { - "ip": "ipv4-addr:value" - }, -} - - -# This mapping is used in 2 places: -# - frontend comparison from ECS to OCSF -# - backend comparison from OCSF to ECS (datasource) -ECS_MAPPING = load_default_mapping("ecs") - - -def test_reverse_mapping_ipv4(): - reverse_map = reverse_mapping(STIX_MAPPING) - ipv4 = reverse_map["ipv4-addr:value"] - assert isinstance(ipv4, list) - assert set(ipv4) == {"device.ip", "endpoint.ip"} - - -def test_reverse_mapping_executable(): - reverse_map = reverse_mapping(ECS_MAPPING) - exe = reverse_map["process.executable"] - assert isinstance(exe, list) - assert "process.file.path" in exe - for item in exe: - if isinstance(item, dict): - assert "ocsf_field" in item - if item["ocsf_field"] == "process.file.name": - # Make sure all metadata from the mapping got reversed - assert item["native_value"] == "endswith" - assert item["native_op"] == "LIKE" - assert item["ocsf_value"] == "basename" - - - -@pytest.mark.parametrize( - "dmm, field, op, value, expected_result", - [ - (WINLOGBEAT_MAPPING, "process.file.path", "=", "C:\\TMP\\foo.exe", - [("winlog.event_data.Image", "=", "C:\\TMP\\foo.exe")]), - (WINLOGBEAT_MAPPING, "process.file.name", "=", "foo.exe", - [("winlog.event_data.Image", "LIKE", "%\\foo.exe")]), - (ECS_MAPPING, "process.file.path", "=", "C:\\TMP\\foo.exe", - [("process.executable", "=", "C:\\TMP\\foo.exe")]), - (ECS_MAPPING, "process.file.name", "=", "foo.exe", - [("process.executable", "LIKE", "%\\foo.exe")]), - ], -) -def test_translate_comparison_to_native(dmm, field, op, value, expected_result): - assert translate_comparison_to_native(dmm, field, op, value) == expected_result - - -@pytest.mark.parametrize( - "dmm, field, op, value, expected_result", - [ - (ECS_MAPPING, "process.executable", "=", "C:\\TMP\\foo.exe", - [ - ("process.file.path", "=", "C:\\TMP\\foo.exe"), - ("process.file.name", "=", "foo.exe"), - ("process.file.parent_folder", "=", "C:\\TMP"), - ]), - (ECS_MAPPING, "process.executable", 
"LIKE", "%\\foo.exe", - [ - ("process.file.path", "LIKE", "%\\foo.exe"), - ("process.file.name", "LIKE", "foo.exe"), #TODO: could optimize this to "=" - ("process.file.parent_folder", "LIKE", "%"), #TODO: could eliminate this? - ]), - (STIX_MAPPING, "ipv4-addr:value", "=", "198.51.100.13", - [ - ("device.ip", "=", "198.51.100.13"), - ("endpoint.ip", "=", "198.51.100.13"), - ]), - ], -) -def test_translate_comparison_to_ocsf(dmm, field, op, value, expected_result): - """Test the translate function.""" - reverse_dmm = reverse_mapping(dmm) # Make the dmms fixtures? - assert set(translate_comparison_to_ocsf(reverse_dmm, field, op, value)) == set(expected_result) - - -@pytest.mark.parametrize( - "dmm, entity, field, expected_result", - [ - (WINLOGBEAT_MAPPING, "process", ["file.name", "pid"], - [("winlog.event_data.Image", "file.name"), ("winlog.event_data.ProcessId", "pid")]), - (WINLOGBEAT_MAPPING, "process", None, - [("winlog.event_data.CommandLine", "cmd_line"), - ("winlog.event_data.ProcessId", "pid"), - ("winlog.event_data.ProcessGuid", "uid"), - ("winlog.event_data.Image", "file.path"), - ("winlog.event_data.Image", "file.name"), - ("winlog.event_data.Image", "file.parent_folder"), - ("winlog.event_data.ParentCommandLine", "parent_process.cmd_line"), - ("winlog.event_data.ParentProcessId", "parent_process.pid"), - ("winlog.event_data.ParentProcessGuid", "parent_process.uid"), - ("winlog.event_data.ParentImage", "parent_process.file.path"), - ("winlog.event_data.ParentImage", "parent_process.file.name"), - ("winlog.event_data.ParentImage", "parent_process.file.parent_folder"), - ]), - ], -) -def test_translate_projection_to_native(dmm, entity, field, expected_result): - assert translate_projection_to_native(dmm, entity, field) == expected_result - - -def test_translate_dataframe(): #TODO: more testing here - df = pd.DataFrame({"file.path": [r"C:\Windows\System32\cmd.exe", r"C:\TMP"], - "pid": [1, 2]}) - dmm = load_default_mapping("ecs") - df = translate_dataframe(df, dmm["process"]) - #TODO:assert df["file.name"].iloc[0] == "cmd.exe" diff --git a/packages-nextgen/kestrel_core/tests/test_mapping_transformers.py b/packages-nextgen/kestrel_core/tests/test_mapping_transformers.py deleted file mode 100644 index 9e454925..00000000 --- a/packages-nextgen/kestrel_core/tests/test_mapping_transformers.py +++ /dev/null @@ -1,35 +0,0 @@ -import pandas as pd -import pytest - -from kestrel.mapping.transformers import ( - run_transformer, - run_transformer_on_series, -) - - -@pytest.mark.parametrize( - "transform, value, expected", [ - ("dirname", r"C:\Windows\System32\cmd.exe", r"C:\Windows\System32"), - ("basename", r"C:\Windows\System32\cmd.exe", r"cmd.exe"), - ("startswith", r"C:\Windows\System32", r"C:\Windows\System32\%"), - ("endswith", "cmd.exe", r"%\cmd.exe"), - ("to_int", 1234, 1234), - ("to_int", 1234.1234, 1234), # Maybe this should fail? 
- ("to_int", "1234", 1234), - ("to_int", "0x4d2", 1234), - ("to_str", "1234", "1234"), - ("to_str", 1234, "1234"), - ("to_epoch_ms", "2024-03-29T12:57:56.926Z", 1711717076926), - ("to_epoch_ms", "2024-03-29T12:57:56.92Z", 1711717076920), - ("to_epoch_ms", "2024-03-29T12:57:56.9Z", 1711717076900), - ("to_epoch_ms", "2024-03-29T12:57:56Z", 1711717076000), - ] -) -def test_run_transformer(transform, value, expected): - assert run_transformer(transform, value) == expected - - -def test_run_series_basename(): - data = pd.Series([r"C:\Windows\System32\cmd.exe", r"C:\TMP"]) - result = list(run_transformer_on_series("basename", data)) - assert result == ["cmd.exe", "TMP"] diff --git a/packages-nextgen/kestrel_core/tests/test_parser.py b/packages-nextgen/kestrel_core/tests/test_parser.py deleted file mode 100644 index 1ca5d314..00000000 --- a/packages-nextgen/kestrel_core/tests/test_parser.py +++ /dev/null @@ -1,290 +0,0 @@ -import json -import pytest -from collections import Counter -from datetime import datetime, timedelta, timezone - -from kestrel.frontend.parser import parse_kestrel -from kestrel.ir.graph import IRGraph -from kestrel.ir.filter import ReferenceValue -from kestrel.ir.instructions import ( - Construct, - DataSource, - Filter, - Limit, - Offset, - ProjectAttrs, - ProjectEntity, - Reference, - Sort, - Variable, - Explain, - Return, -) - - -@pytest.mark.parametrize( - "stmt", [ - "x = GET thing FROM if://ds WHERE foo = 'bar'", - "x = GET thing FROM if://ds WHERE foo > 1.5", - r"x = GET thing FROM if://ds WHERE foo = r'C:\TMP'", - "x = GET thing FROM if://ds WHERE foo = 'bar' OR baz != 42", - "x = GET thing FROM if://ds WHERE foo = 'bar' AND baz IN (1, 2, 3)", - "x = GET thing FROM if://ds WHERE foo = 'bar' AND baz IN (1)", - "x = GET thing FROM if://ds WHERE foo = 'bar' AND baz IN (1) LAST 3 DAYS", - ] -) -def test_parser_get_statements(stmt): - """ - This test isn't meant to be comprehensive, but checks basic transformer functionality. 
- - This will need to be updated as we build out the new Transformer - """ - - graph = parse_kestrel(stmt) - assert len(graph) == 4 - assert len(graph.get_nodes_by_type(Variable)) == 1 - assert len(graph.get_nodes_by_type(ProjectEntity)) == 1 - assert len(graph.get_nodes_by_type(DataSource)) == 1 - assert len(graph.get_nodes_by_type(Filter)) == 1 - - # Ensure result is serializable - _ = graph.to_json() - - -def test_parser_get_timespan_relative(): - stmt = "x = GET url FROM if://ds WHERE url = 'http://example.com/' LAST 5h" - graph = parse_kestrel(stmt) - filt_list = graph.get_nodes_by_type(Filter) - assert len(filt_list) == 1 - filt = filt_list[0] - delta = filt.timerange.stop - filt.timerange.start - assert delta == timedelta(hours=5) - - -def test_parser_get_timespan_absolute(): - stmt = ("x = GET url FROM if://ds WHERE url = 'http://example.com/'" - " START '2023-11-29T00:00:00Z' STOP '2023-11-29T05:00:00Z'") - graph = parse_kestrel(stmt) - filt_list = graph.get_nodes_by_type(Filter) - assert len(filt_list) == 1 - filt = filt_list[0] - delta = filt.timerange.stop - filt.timerange.start - assert delta == timedelta(hours=5) - assert filt.timerange.start == datetime(2023, 11, 29, 0, 0, tzinfo=timezone.utc) - assert filt.timerange.stop == datetime(2023, 11, 29, 5, 0, tzinfo=timezone.utc) - - -@pytest.mark.parametrize( - "stmt, expected", [ - ("x = GET url FROM if://ds WHERE url = 'http://example.com/' LIMIT 1", 1), - ("x = GET url FROM if://ds WHERE url = 'http://example.com/' LAST 3d LIMIT 2", 2), - (("x = GET url FROM if://ds WHERE url = 'http://example.com/'" - " START '2023-11-29T00:00:00Z' STOP '2023-11-29T05:00:00Z' LIMIT 3"), 3), - ] -) -def test_parser_get_with_limit(stmt, expected): - graph = parse_kestrel(stmt) - limits = graph.get_nodes_by_type(Limit) - assert len(limits) == 1 - limit = limits[0] - assert limit.num == expected - - -def get_parsed_filter_exp(stmt): - parse_tree = parse_kestrel(stmt) - filter_node = parse_tree.get_nodes_by_type(Filter).pop() - return filter_node.exp - - -def test_parser_mapping_single_comparison_to_single_value(): - # test for attributes in the form entity_name:property_name - stmt = "x = GET process FROM if://ds WHERE process:binary_ref.name = 'foo'" - parse_filter = get_parsed_filter_exp(stmt) - assert parse_filter.field == 'file.name' - # test when entity name is not included in the attributes - stmt = "x = GET process FROM if://ds WHERE binary_ref.name = 'foo'" - parse_filter = get_parsed_filter_exp(stmt) - assert parse_filter.field == 'file.name' - - -def test_parser_mapping_single_comparison_to_multiple_values(): - stmt = "x = GET ipv4-addr FROM if://ds WHERE value = '192.168.22.3'" - parse_filter = get_parsed_filter_exp(stmt) - comps = parse_filter.comps - assert isinstance(comps, list) and len(comps) == 4 - fields = [x.field for x in comps] - assert ("dst_endpoint.ip" in fields and "src_endpoint.ip" in fields and - "device.ip" in fields and "endpoint.ip" in fields) - - -def test_parser_mapping_multiple_comparison_to_multiple_values(): - stmt = "x = GET process FROM if://ds WHERE binary_ref.name = 'foo' "\ - "OR name = 'bam' AND parent_ref.name = 'boom'" - parse_filter = get_parsed_filter_exp(stmt) - field1 = parse_filter.lhs.field - assert field1 == 'file.name' - field2 = parse_filter.rhs.lhs.field - assert field2 == 'name' # 'process.name' - field3 = parse_filter.rhs.rhs.field - assert field3 == "parent_process.name" - - -def test_parser_new_json(): - stmt = """ -proclist = NEW process [ {"name": "cmd.exe", "pid": 123} - , {"name": 
"explorer.exe", "pid": 99} - , {"name": "firefox.exe", "pid": 201} - , {"name": "chrome.exe", "pid": 205} - ] -""" - graph = parse_kestrel(stmt) - cs = graph.get_nodes_by_type(Construct) - assert len(cs) == 1 - construct = cs[0] - df = [ {"name": "cmd.exe", "pid": 123} - , {"name": "explorer.exe", "pid": 99} - , {"name": "firefox.exe", "pid": 201} - , {"name": "chrome.exe", "pid": 205} - ] - assert df == construct.data - vs = graph.get_variables() - assert len(vs) == 1 - assert vs[0].name == "proclist" - - -@pytest.mark.parametrize( - "stmt, node_cnt", [ - ("x = y WHERE foo = 'bar'", 3), - ("x = y WHERE foo > 1.5", 3), - (r"x = y WHERE foo = r'C:\TMP'", 3), - ("x = y WHERE foo = 'bar' OR baz != 42", 3), - ("x = y WHERE foo = 'bar' AND baz IN (1, 2, 3)", 3), - ("x = y WHERE foo = 'bar' AND baz IN (1)", 3), - ("x = y WHERE foo = 'bar' SORT BY foo ASC LIMIT 3", 5), - ("x = y WHERE foo = 'bar' SORT BY foo ASC LIMIT 3 OFFSET 9", 6), - ] -) -def test_parser_expression(stmt, node_cnt): - """ - This test isn't meant to be comprehensive, but checks basic transformer functionality. - - This will need to be updated as we build out the new Transformer - """ - - graph = parse_kestrel(stmt) - assert len(graph) == node_cnt - assert len(graph.get_nodes_by_type(Variable)) == 1 - assert len(graph.get_nodes_by_type(Reference)) == 1 - assert len(graph.get_nodes_by_type(Filter)) == 1 - assert len(graph.get_nodes_by_type(Sort)) in (0, 1) - assert len(graph.get_nodes_by_type(Limit)) in (0, 1) - assert len(graph.get_nodes_by_type(Offset)) in (0, 1) - - -def test_three_statements_in_a_line(): - stmt = """ -proclist = NEW process [ {"name": "cmd.exe", "pid": 123} - , {"name": "explorer.exe", "pid": 99} - , {"name": "firefox.exe", "pid": 201} - , {"name": "chrome.exe", "pid": 205} - ] -browsers = proclist WHERE name = 'firefox.exe' OR name = 'chrome.exe' -DISP browsers ATTR name, pid -""" - graph = parse_kestrel(stmt) - assert len(graph) == 6 - c = graph.get_nodes_by_type(Construct)[0] - assert {"proclist", "browsers"} == {v.name for v in graph.get_variables()} - proclist = graph.get_variable("proclist") - browsers = graph.get_variable("browsers") - proj = graph.get_nodes_by_type(ProjectAttrs)[0] - assert proj.attrs == ['name', 'pid'] - ft = graph.get_nodes_by_type(Filter)[0] - assert ft.exp.to_dict() == {"lhs": {"field": "name", "op": "=", "value": "firefox.exe"}, "op": "OR", "rhs": {"field": "name", "op": "=", "value": "chrome.exe"}} - ret = graph.get_returns()[0] - assert len(graph.edges) == 5 - assert (c, proclist) in graph.edges - assert (proclist, ft) in graph.edges - assert (ft, browsers) in graph.edges - assert (browsers, proj) in graph.edges - assert (proj, ret) in graph.edges - - -@pytest.mark.parametrize( - "stmt, node_cnt, expected", [ - ("x = y WHERE foo = z.foo", 5, [ReferenceValue("z", "foo")]), - ("x = y WHERE foo > 1.5", 3, []), - ("x = y WHERE foo = 'bar' OR baz = z.baz", 5, [ReferenceValue("z", "baz")]), - ("x = y WHERE (foo = 'bar' OR baz = z.baz) AND (fox = w.fox AND bbb = z.bbb)", 8, [ReferenceValue("z", "baz"), ReferenceValue("w", "fox"), ReferenceValue("z", "bbb")]), - ("x = GET process FROM s://x WHERE foo = z.foo", 6, [ReferenceValue("z", "foo")]), - ("x = GET file FROM s://y WHERE foo > 1.5", 4, []), - ("x = GET file FROM c://x WHERE foo = 'bar' OR baz = z.baz", 6, [ReferenceValue("z", "baz")]), - ("x = GET user FROM s://x WHERE (foo = 'bar' OR baz = z.baz) AND (fox = w.fox AND bbb = z.bbb)", 9, [ReferenceValue("z", "baz"), ReferenceValue("w", "fox"), ReferenceValue("z", "bbb")]), - ] -) 
-def test_reference_branch(stmt, node_cnt, expected): - graph = parse_kestrel(stmt) - assert len(graph) == node_cnt - filter_nodes = graph.get_nodes_by_type(Filter) - assert len(filter_nodes) == 1 - filter_node = filter_nodes[0] - for rv in expected: - r = graph.get_reference(rv.reference) - assert r - projs = [p for p in graph.successors(r) if isinstance(p, ProjectAttrs) and p.attrs == [rv.attribute]] - assert projs and len(projs) == 1 - proj = projs[0] - assert proj - assert list(graph.successors(proj)) == [filter_node] - - -def test_parser_disp_after_new(): - stmt = """ -proclist = NEW process [ {"name": "cmd.exe", "pid": 123} - , {"name": "explorer.exe", "pid": 99} - , {"name": "firefox.exe", "pid": 201} - , {"name": "chrome.exe", "pid": 205} - ] -DISP proclist ATTR name, pid LIMIT 2 OFFSET 3 -""" - graph = parse_kestrel(stmt) - assert len(graph) == 6 - c = graph.get_nodes_by_type(Construct)[0] - assert {"proclist"} == {v.name for v in graph.get_variables()} - proclist = graph.get_variable("proclist") - proj = graph.get_nodes_by_type(ProjectAttrs)[0] - assert proj.attrs == ['name', 'pid'] - limit = graph.get_nodes_by_type(Limit)[0] - assert limit.num == 2 - offset = graph.get_nodes_by_type(Offset)[0] - assert offset.num == 3 - ret = graph.get_returns()[0] - assert len(graph.edges) == 5 - assert (c, proclist) in graph.edges - assert (proclist, proj) in graph.edges - assert (proj, limit) in graph.edges - assert (limit, offset) in graph.edges - assert (offset, ret) in graph.edges - - -def test_parser_explain_alone(): - stmt = "EXPLAIN abc" - graph = parse_kestrel(stmt) - assert len(graph) == 3 - assert len(graph.edges) == 2 - assert Counter(map(type, graph.nodes())) == Counter([Reference, Explain, Return]) - - -def test_parser_explain_dereferred(): - stmt = """ -proclist = NEW process [ {"name": "cmd.exe", "pid": 123} - , {"name": "explorer.exe", "pid": 99} - , {"name": "firefox.exe", "pid": 201} - , {"name": "chrome.exe", "pid": 205} - ] -EXPLAIN proclist -""" - graph = parse_kestrel(stmt) - assert len(graph) == 4 - assert len(graph.edges) == 3 - assert Counter(map(type, graph.nodes())) == Counter([Construct, Variable, Explain, Return]) diff --git a/packages-nextgen/kestrel_core/tests/test_session.py b/packages-nextgen/kestrel_core/tests/test_session.py deleted file mode 100644 index 115154d4..00000000 --- a/packages-nextgen/kestrel_core/tests/test_session.py +++ /dev/null @@ -1,186 +0,0 @@ -import pytest -import os -from kestrel import Session -from pandas import DataFrame -from uuid import uuid4 - -from kestrel.display import GraphExplanation -from kestrel.ir.instructions import Construct -from kestrel.config.internal import CACHE_INTERFACE_IDENTIFIER -from kestrel.frontend.parser import parse_kestrel -from kestrel.cache import SqliteCache - - -def test_execute_in_cache(): - hf = """ -proclist = NEW process [ {"name": "cmd.exe", "pid": 123} - , {"name": "explorer.exe", "pid": 99} - , {"name": "firefox.exe", "pid": 201} - , {"name": "chrome.exe", "pid": 205} - ] -browsers = proclist WHERE name != "cmd.exe" -DISP browsers -cmd = proclist WHERE name = "cmd.exe" -DISP cmd ATTR pid -""" - b1 = DataFrame([ {"name": "explorer.exe", "pid": 99} - , {"name": "firefox.exe", "pid": 201} - , {"name": "chrome.exe", "pid": 205} - ]) - b2 = DataFrame([ {"pid": 123} ]) - with Session() as session: - res = session.execute_to_generate(hf) - assert b1.equals(next(res)) - assert b2.equals(next(res)) - with pytest.raises(StopIteration): - next(res) - - -def test_double_deref_in_cache(): - # When the Filter 
node is dereferenced twice,
-    # the node should be deepcopied each time to avoid issues
-    hf = """
-proclist = NEW process [ {"name": "cmd.exe", "pid": 123}
-                       , {"name": "explorer.exe", "pid": 99}
-                       , {"name": "firefox.exe", "pid": 201}
-                       , {"name": "chrome.exe", "pid": 205}
-                       ]
-px = proclist WHERE name != "cmd.exe" AND pid = 205
-chrome = proclist WHERE pid IN px.pid
-DISP chrome
-DISP chrome
-"""
-    df = DataFrame([ {"name": "chrome.exe", "pid": 205} ])
-    with Session() as session:
-        res = session.execute_to_generate(hf)
-        assert df.equals(next(res))
-        assert df.equals(next(res))
-        with pytest.raises(StopIteration):
-            next(res)
-
-
-def test_explain_in_cache():
-    hf = """
-proclist = NEW process [ {"name": "cmd.exe", "pid": 123}
-                       , {"name": "explorer.exe", "pid": 99}
-                       , {"name": "firefox.exe", "pid": 201}
-                       , {"name": "chrome.exe", "pid": 205}
-                       ]
-browsers = proclist WHERE name != "cmd.exe"
-chrome = browsers WHERE pid = 205
-EXPLAIN chrome
-"""
-    with Session() as session:
-        ress = session.execute_to_generate(hf)
-        res = next(ress)
-        assert isinstance(res, GraphExplanation)
-        assert len(res.graphlets) == 1
-        ge = res.graphlets[0]
-        assert ge.graph == session.irgraph.to_dict()
-        construct = session.irgraph.get_nodes_by_type(Construct)[0]
-        assert ge.query.language == "SQL"
-        stmt = ge.query.statement.replace('"', '')
-        assert stmt == f'SELECT * \nFROM (SELECT * \nFROM (SELECT * \nFROM (SELECT * \nFROM {construct.id.hex}v) AS proclist \nWHERE name != \'cmd.exe\') AS browsers \nWHERE pid = 205) AS chrome'
-        with pytest.raises(StopIteration):
-            next(ress)
-
-
-def test_multi_interface_explain():
-
-    class DataLake(SqliteCache):
-        @staticmethod
-        def schemes():
-            return ["datalake"]
-
-    class Gateway(SqliteCache):
-        @staticmethod
-        def schemes():
-            return ["gateway"]
-
-    extra_db = []
-    with Session() as session:
-        stmt1 = """
-procs = NEW process [ {"name": "cmd.exe", "pid": 123}
-                    , {"name": "explorer.exe", "pid": 99}
-                    , {"name": "firefox.exe", "pid": 201}
-                    , {"name": "chrome.exe", "pid": 205}
-                    ]
-DISP procs
-"""
-        session.execute(stmt1)
-        session.interface_manager[CACHE_INTERFACE_IDENTIFIER].__class__ = DataLake
-        session.irgraph.get_nodes_by_type_and_attributes(Construct, {"interface": CACHE_INTERFACE_IDENTIFIER})[0].interface = "datalake"
-
-        new_cache = SqliteCache(session_id = uuid4())
-        extra_db.append(new_cache.db_path)
-        session.interface_manager.interfaces.append(new_cache)
-        stmt2 = """
-nt = NEW network [ {"pid": 123, "source": "192.168.1.1", "destination": "1.1.1.1"}
-                 , {"pid": 205, "source": "192.168.1.1", "destination": "1.1.1.2"}
-                 ]
-DISP nt
-"""
-        session.execute(stmt2)
-        session.interface_manager[CACHE_INTERFACE_IDENTIFIER].__class__ = Gateway
-        session.irgraph.get_nodes_by_type_and_attributes(Construct, {"interface": CACHE_INTERFACE_IDENTIFIER})[0].interface = "gateway"
-
-        new_cache = SqliteCache(session_id = uuid4())
-        extra_db.append(new_cache.db_path)
-        session.interface_manager.interfaces.append(new_cache)
-        stmt3 = """
-domain = NEW domain [ {"ip": "1.1.1.1", "domain": "cloudflare.com"}
-                    , {"ip": "1.1.1.2", "domain": "xyz.cloudflare.com"}
-                    ]
-DISP domain
-"""
-        session.execute(stmt3)
-
-        stmt = """
-p2 = procs WHERE name IN ("firefox.exe", "chrome.exe")
-ntx = nt WHERE pid IN p2.pid
-d2 = domain WHERE ip IN ntx.destination
-EXPLAIN d2
-DISP d2
-"""
-        ress = session.execute_to_generate(stmt)
-        disp = next(ress)
-        df_res = next(ress)
-
-        with pytest.raises(StopIteration):
-            next(ress)
-
-        assert isinstance(disp, GraphExplanation)
-        assert len(disp.graphlets) == 
4 - - assert len(disp.graphlets[0].graph["nodes"]) == 5 - query = disp.graphlets[0].query.statement.replace('"', '') - procs = session.irgraph.get_variable("procs") - c1 = next(session.irgraph.predecessors(procs)) - assert query == f"SELECT pid \nFROM (SELECT * \nFROM (SELECT * \nFROM {c1.id.hex}) AS procs \nWHERE name IN ('firefox.exe', 'chrome.exe')) AS p2" - - assert len(disp.graphlets[1].graph["nodes"]) == 2 - query = disp.graphlets[1].query.statement.replace('"', '') - nt = session.irgraph.get_variable("nt") - c2 = next(session.irgraph.predecessors(nt)) - assert query == f"SELECT * \nFROM (SELECT * \nFROM {c2.id.hex}) AS nt" - - # the current session.execute_to_generate() logic does not store - # in cache if evaluated by cache; the behavior may change in the future - assert len(disp.graphlets[2].graph["nodes"]) == 2 - query = disp.graphlets[2].query.statement.replace('"', '') - domain = session.irgraph.get_variable("domain") - c3 = next(session.irgraph.predecessors(domain)) - assert query == f"SELECT * \nFROM (SELECT * \nFROM {c3.id.hex}) AS domain" - - assert len(disp.graphlets[3].graph["nodes"]) == 12 - print(disp.graphlets[3].graph["nodes"]) - query = disp.graphlets[3].query.statement.replace('"', '') - p2 = session.irgraph.get_variable("p2") - p2pa = next(session.irgraph.successors(p2)) - assert query == f"SELECT * \nFROM (SELECT * \nFROM (SELECT * \nFROM {c3.id.hex}) AS domain \nWHERE ip IN (SELECT destination \nFROM (SELECT * \nFROM {nt.id.hex}v \nWHERE pid IN (SELECT * \nFROM {p2pa.id.hex}v)) AS ntx)) AS d2" - - df_ref = DataFrame([{"ip": "1.1.1.2", "domain": "xyz.cloudflare.com"}]) - assert df_ref.equals(df_res) - - for db_file in extra_db: - os.remove(db_file) diff --git a/packages-nextgen/kestrel_interface_opensearch/pyproject.toml b/packages-nextgen/kestrel_interface_opensearch/pyproject.toml deleted file mode 100644 index 6270f6d0..00000000 --- a/packages-nextgen/kestrel_interface_opensearch/pyproject.toml +++ /dev/null @@ -1,36 +0,0 @@ -[build-system] -requires = ["setuptools >= 68.2.2", "wheel"] -build-backend = "setuptools.build_meta" - -[project] -name = "kestrel_interface_opensearch" -version = "2.0.0" -description = "Kestrel OpenSearch Datasource Interface" -readme = "README.rst" -requires-python = ">=3.8" -license = {text = "Apache 2.0 License"} -maintainers = [ - {name = "Xiaokui Shu", email = "xiaokui.shu@ibm.com"}, - {name = "Paul Coccoli", email = "pcoccoli@us.ibm.com"}, -] -keywords = [ - "kestrel", - "cybersecurity", - "threat hunting", -] -classifiers = [ - "Topic :: Security", - "Operating System :: OS Independent", - "Development Status :: 4 - Beta", - "Programming Language :: Python :: 3", -] - -dependencies = [ - "kestrel_core>=2.0.0", - "opensearch-py>=2.4.2", -] - -[project.urls] -Homepage = "https://github.com/opencybersecurityalliance/kestrel-lang" -Documentation = "https://kestrel.readthedocs.io/" -Repository = "https://github.com/opencybersecurityalliance/kestrel-lang.git" diff --git a/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/__init__.py b/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/__init__.py deleted file mode 100644 index 3ee389ca..00000000 --- a/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from kestrel_interface_opensearch.interface import OpenSearchInterface diff --git a/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/config.py 
b/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/config.py deleted file mode 100644 index 26d02ccf..00000000 --- a/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/config.py +++ /dev/null @@ -1,69 +0,0 @@ -import logging -from dataclasses import dataclass, field -from typing import Dict, Mapping, Optional - -import yaml -from mashumaro.mixins.json import DataClassJSONMixin - -from kestrel.config.utils import ( - CONFIG_DIR_DEFAULT, - load_user_config, -) -from kestrel.exceptions import InterfaceNotConfigured -from kestrel.mapping.data_model import load_default_mapping - - -PROFILE_PATH_DEFAULT = CONFIG_DIR_DEFAULT / "opensearch.yaml" -PROFILE_PATH_ENV_VAR = "KESTREL_OPENSEARCH_CONFIG" - -_logger = logging.getLogger(__name__) - - -@dataclass -class Auth: - username: str - password: str - - -@dataclass -class Connection(DataClassJSONMixin): - url: str - auth: Auth - verify_certs: bool = True - - def __post_init__(self): - self.auth = Auth(**self.auth) - - -@dataclass -class Index(DataClassJSONMixin): - connection: str - timestamp: str - timestamp_format: str - data_model_mapping: Optional[str] = None # Filename for mapping - data_model_map: Mapping = field(default_factory=dict) - - def __post_init__(self): - if self.data_model_mapping: - with open(self.data_model_mapping, "r") as fp: - self.data_model_map = yaml.safe_load(fp) - else: - # Default to the built-in ECS mapping - self.data_model_map = load_default_mapping("ecs") - - -@dataclass -class Config(DataClassJSONMixin): - connections: Dict[str, Connection] - indexes: Dict[str, Index] - - def __post_init__(self): - self.connections = {k: Connection(**v) for k, v in self.connections.items()} - self.indexes = {k: Index(**v) for k, v in self.indexes.items()} - - -def load_config(): - try: - return Config(**load_user_config(PROFILE_PATH_ENV_VAR, PROFILE_PATH_DEFAULT)) - except TypeError: - raise InterfaceNotConfigured() diff --git a/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/interface.py b/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/interface.py deleted file mode 100644 index 8c70eb95..00000000 --- a/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/interface.py +++ /dev/null @@ -1,211 +0,0 @@ -import logging -from typing import Iterable, Mapping, Optional -from uuid import UUID - -from opensearchpy import OpenSearch -from pandas import DataFrame, Series, concat - -from kestrel.display import GraphletExplanation -from kestrel.exceptions import DataSourceError -from kestrel.interface import AbstractInterface -from kestrel.ir.graph import IRGraphEvaluable -from kestrel.ir.instructions import ( - DataSource, - Instruction, - Return, - Variable, - Filter, - SourceInstruction, - TransformingInstruction, - SolePredecessorTransformingInstruction, -) -from kestrel.mapping.data_model import translate_dataframe - -from kestrel_interface_opensearch.config import load_config -from kestrel_interface_opensearch.ossql import OpenSearchTranslator - - -_logger = logging.getLogger(__name__) - - -def _jdbc2df(schema: dict, datarows: dict) -> DataFrame: - """Convert a JDBC query result response to a DataFrame""" - columns = [c.get("alias", c["name"]) for c in schema] - return DataFrame(datarows, columns=columns) - - -def read_sql(sql: str, conn: OpenSearch, dmm: Optional[dict] = None) -> DataFrame: - """Execute `sql` and return the results as a DataFrame, a la pandas.read_sql""" - # 
https://opensearch.org/docs/latest/search-plugins/sql/sql-ppl-api/#query-api - body = { - # Temporarily comment out fetch_size due to https://github.com/opensearch-project/sql/issues/2579 - # FIXME: "fetch_size": 10000, # Should we make this configurable? - "query": sql, - } - query_resp = conn.http.post("/_plugins/_sql?format=jdbc", body=body) - status = query_resp.get("status", 500) - if status != 200: - raise DataSourceError(f"OpenSearch query returned {status}") - _logger.debug( - "total=%d size=%d rows=%d", - query_resp["total"], - query_resp["size"], - len(query_resp["datarows"]), - ) - - # Only the first page contains the schema - # https://opensearch.org/docs/latest/search-plugins/sql/sql-ppl-api/#paginating-results - schema = query_resp["schema"] - dfs = [] - done = False - while not done: - df = _jdbc2df(schema, query_resp["datarows"]) - if dmm is not None: - # Need to use Data Model Map to do results translation - dfs.append(translate_dataframe(df, dmm)) - else: - dfs.append(df) - cursor = query_resp.get("cursor") - if not cursor: - break - query_resp = conn.http.post( - "/_plugins/_sql?format=jdbc", body={"cursor": cursor} - ) - - # Merge all pages together - return concat(dfs) - - -class OpenSearchInterface(AbstractInterface): - def __init__( - self, - serialized_cache_catalog: Optional[str] = None, - session_id: Optional[UUID] = None, - ): - super().__init__(serialized_cache_catalog, session_id) - self.config = load_config() - self.schemas: dict = {} # Schema per table (index) - self.conns: dict = {} # Map of conn name -> connection - for info in self.config.indexes.values(): - name = info.connection - if name not in self.conns: - conn = self.config.connections[name] - client = OpenSearch( - [conn.url], - http_auth=(conn.auth.username, conn.auth.password), - verify_certs=conn.verify_certs, - ) - self.conns[name] = client - - @staticmethod - def schemes() -> Iterable[str]: - return ["opensearch"] - - def store( - self, - instruction_id: UUID, - data: DataFrame, - ): - raise NotImplementedError("OpenSearchInterface.store") # TEMP - - def evaluate_graph( - self, - graph: IRGraphEvaluable, - instructions_to_evaluate: Optional[Iterable[Instruction]] = None, - ) -> Mapping[UUID, DataFrame]: - mapping = {} - if not instructions_to_evaluate: - instructions_to_evaluate = graph.get_sink_nodes() - for instruction in instructions_to_evaluate: - translator = self._evaluate_instruction_in_graph(graph, instruction) - # TODO: may catch error in case evaluation starts from incomplete SQL - sql = translator.result() - _logger.debug("SQL query generated: %s", sql) - ds = self.config.indexes[translator.table] # table == datasource - conn = self.config.connections[ds.connection] - client = OpenSearch( - [conn.url], - http_auth=(conn.auth.username, conn.auth.password), - verify_certs=conn.verify_certs, - ) - mapping[instruction.id] = read_sql( - sql, client, translator.from_ocsf_map[translator.entity] - ) - client.close() - return mapping - - def explain_graph( - self, - graph: IRGraphEvaluable, - instructions_to_explain: Optional[Iterable[Instruction]] = None, - ) -> Mapping[UUID, GraphletExplanation]: - mapping = {} - if not instructions_to_explain: - instructions_to_explain = graph.get_sink_nodes() - for instruction in instructions_to_explain: - translator = self._evaluate_instruction_in_graph(graph, instruction) - dep_graph = graph.duplicate_dependent_subgraph_of_node(instruction) - graph_dict = dep_graph.to_dict() - query_stmt = translator.result() - mapping[instruction.id] = 
GraphletExplanation(graph_dict, query_stmt) - return mapping - - def _evaluate_instruction_in_graph( - self, - graph: IRGraphEvaluable, - instruction: Instruction, - ) -> OpenSearchTranslator: - _logger.debug("instruction: %s", str(instruction)) - translator = None - if isinstance(instruction, TransformingInstruction): - trunk, _r2n = graph.get_trunk_n_branches(instruction) - translator = self._evaluate_instruction_in_graph(graph, trunk) - - if isinstance(instruction, SolePredecessorTransformingInstruction): - if isinstance(instruction, Return): - pass - elif isinstance(instruction, Variable): - pass - else: - translator.add_instruction(instruction) - - elif isinstance(instruction, Filter): - translator.add_instruction(instruction) - - else: - raise NotImplementedError(f"Unknown instruction type: {instruction}") - - elif isinstance(instruction, SourceInstruction): - if isinstance(instruction, DataSource): - ds = self.config.indexes[instruction.datasource] - schema = self.get_schema(instruction.datasource) - translator = OpenSearchTranslator( - ds.timestamp_format, - ds.timestamp, - instruction.datasource, - ds.data_model_map, - schema, - ) - else: - raise NotImplementedError(f"Unhandled instruction type: {instruction}") - - return translator - - def _get_client_for_index(self, index: str) -> OpenSearch: - conn = self.config.indexes[index].connection - _logger.debug( - "Fetching schema for %s from %s", index, self.config.connections[conn].url - ) - return self.conns[conn] - - def get_schema(self, index: str) -> dict: - client = self._get_client_for_index(index) - if index not in self.schemas: - df = read_sql(f"DESCRIBE TABLES LIKE {index}", client) - self.schemas[index] = ( - df[["TYPE_NAME", "COLUMN_NAME"]] - .set_index("COLUMN_NAME") - .T.to_dict("records")[0] - ) - _logger.debug("%s schema:\n%s", index, self.schemas[index]) - return self.schemas[index] diff --git a/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/ossql.py b/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/ossql.py deleted file mode 100644 index 018cd4c8..00000000 --- a/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/ossql.py +++ /dev/null @@ -1,249 +0,0 @@ -import logging -from functools import reduce -from typing import Optional, Union - -from typeguard import typechecked - -from kestrel.exceptions import UnsupportedOperatorError -from kestrel.ir.filter import ( - BoolExp, - ExpOp, - FComparison, - ListOp, - MultiComp, - NumCompOp, - StrComparison, - StrCompOp, -) -from kestrel.ir.instructions import ( - Filter, - Instruction, - Limit, - Offset, - ProjectAttrs, - ProjectEntity, - Sort, - SortDirection, -) -from kestrel.mapping.data_model import ( - translate_comparison_to_native, - translate_projection_to_native, -) - - -_logger = logging.getLogger(__name__) - - -Value = Union[ - int, - float, - str, - list, -] - - -@typechecked -def _and(lhs: str, rhs: Value) -> str: - return " AND ".join((lhs, rhs)) - - -@typechecked -def _or(lhs: str, rhs: Value) -> str: - return " OR ".join((lhs, rhs)) - - -# SQL comparison operator functions -comp2func = { - NumCompOp.EQ: "=", - NumCompOp.NEQ: "<>", - NumCompOp.LT: "<", - NumCompOp.LE: "<=", - NumCompOp.GT: ">", - NumCompOp.GE: ">=", - StrCompOp.EQ: "=", - StrCompOp.NEQ: "<>", - StrCompOp.LIKE: "LIKE", - StrCompOp.NLIKE: "NOT LIKE", - # UNSUPPORTED BY OpenSearch SQL: StrCompOp.MATCHES: "REGEXP", - # UNSUPPORTED BY OpenSearch SQL: StrCompOp.NMATCHES: "NOT REGEXP", - ListOp.IN: "IN", - 
ListOp.NIN: "NOT IN", -} - - -def _format_value(value): - if isinstance(value, str): - # Need to quote string values - value = f"'{value}'" - elif isinstance(value, list): - # SQL uses parens for lists - value = tuple(value) - return value - - -@typechecked -class OpenSearchTranslator: - def __init__( - self, - timefmt: str, - timestamp: str, - select_from: str, - data_model_map: dict, - schema: dict, - ): - # Time format string for datasource - self.timefmt = timefmt - - # Primary timestamp field in target table - self.timestamp = timestamp - - # Query clauses - self.table: str = select_from - self.filt: Optional[Filter] = None - self.entity: Optional[str] = None - self.project: Optional[ProjectAttrs] = None - self.limit: int = 0 - self.offset: int = 0 - self.order_by: str = "" - self.sort_dir = SortDirection.DESC - - # Data model mapping: should be ocsf -> native - self.from_ocsf_map = data_model_map - - # Index "schema" (field name -> type) - self.schema = schema - - @typechecked - def _render_comp(self, comp: FComparison) -> str: - prefix = ( - f"{self.entity}." if (self.entity and comp.field != self.timestamp) else "" - ) - ocsf_field = f"{prefix}{comp.field}" - comps = translate_comparison_to_native( - self.from_ocsf_map, ocsf_field, comp.op, comp.value - ) - try: - comps = [f"{f} {comp2func[o]} {_format_value(v)}" for f, o, v in comps] - conj = " OR ".join(comps) - result = conj if len(comps) == 1 else f"({conj})" - except KeyError: - raise UnsupportedOperatorError( - comp.op.value - ) # FIXME: need to report the mapped op, not the original - return result - - @typechecked - def _render_multi_comp(self, comps: MultiComp) -> str: - op = _and if comps.op == ExpOp.AND else _or - return reduce(op, map(self._render_comp, comps.comps)) - - @typechecked - def _render_exp(self, exp: BoolExp) -> str: - if isinstance(exp.lhs, BoolExp): - lhs = self._render_exp(exp.lhs) - elif isinstance(exp.lhs, MultiComp): - lhs = self._render_multi_comp(exp.lhs) - else: - lhs = self._render_comp(exp.lhs) - if isinstance(exp.rhs, BoolExp): - rhs = self._render_exp(exp.rhs) - elif isinstance(exp.rhs, MultiComp): - rhs = self._render_multi_comp(exp.rhs) - else: - rhs = self._render_comp(exp.rhs) - return _and(lhs, rhs) if exp.op == ExpOp.AND else _or(lhs, rhs) - - @typechecked - def _render_filter(self) -> Optional[str]: - if not self.filt: - return None - if self.filt.timerange.start: - # Convert the timerange to the appropriate pair of comparisons - start_comp = StrComparison( - self.timestamp, ">=", self.filt.timerange.start.strftime(self.timefmt) - ) - stop_comp = StrComparison( - self.timestamp, "<", self.filt.timerange.stop.strftime(self.timefmt) - ) - # AND them together - time_exp = BoolExp(start_comp, ExpOp.AND, stop_comp) - # AND that with any existing filter expression - exp = BoolExp(self.filt.exp, ExpOp.AND, time_exp) - else: - exp = self.filt.exp - if isinstance(exp, BoolExp): - comp = self._render_exp(exp) - elif isinstance(exp, MultiComp): - comp = self._render_multi_comp(exp) - else: - comp = self._render_comp(exp) - return comp - - def add_Filter(self, filt: Filter) -> None: - # Just save filter and compile it later - # Probably need the entity projection set first - self.filt = filt - - def add_ProjectAttrs(self, proj: ProjectAttrs) -> None: - # Just save projection and compile it later - self.project = proj - - def _render_proj(self): - """Get a list of native cols to project with their OCSF equivalents as SQL aliases""" - projection = self.project.attrs if self.project else None - 
name_pairs = translate_projection_to_native( - self.from_ocsf_map, self.entity, projection - ) - proj = [ - f"`{k}` AS `{v}`" if k != v else f"`{k}`" - for k, v in name_pairs - if k in self.schema # Ignore mapped attrs the index doesn't have - ] - if not proj: - # If this is still empty, then the attr projection must be for attrs "outside" to entity projection? - proj = [f"`{attr}`" for attr in self.project.attrs] - _logger.debug("Set projection to %s", proj) - return proj - - def add_ProjectEntity(self, proj: ProjectEntity) -> None: - self.entity = proj.entity_type - _logger.debug("Set base entity to '%s'", self.entity) - - def add_Limit(self, lim: Limit) -> None: - self.limit = lim.num - - def add_Offset(self, offset: Offset) -> None: - self.offset = offset.num - - def add_Sort(self, sort: Sort) -> None: - self.order_by = sort.attribute - self.sort_dir = sort.direction - - def add_instruction(self, i: Instruction) -> None: - inst_name = i.instruction - method_name = f"add_{inst_name}" - try: - method = getattr(self, method_name) - except AttributeError as e: - raise NotImplementedError(f"OpenSearchTranslator.{method_name}") - method(i) - - def result(self) -> str: - stages = ["SELECT"] - cols = ", ".join(self._render_proj()) - stages.append(f"{cols}") - stages.append(f"FROM {self.table}") - where = self._render_filter() - if where: - stages.append(f"WHERE {where}") - if self.order_by: - stages.append(f"ORDER BY {self.order_by} {self.sort_dir.value}") - if self.limit: - # https://opensearch.org/docs/latest/search-plugins/sql/sql/basic/#limit - if self.offset: - stages.append(f"LIMIT {self.offset}, {self.limit}") - else: - stages.append(f"LIMIT {self.limit}") - sql = " ".join(stages) - _logger.debug("SQL: %s", sql) - return sql diff --git a/packages-nextgen/kestrel_interface_opensearch/tests/__init__.py b/packages-nextgen/kestrel_interface_opensearch/tests/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/packages-nextgen/kestrel_interface_opensearch/tests/test_config.py b/packages-nextgen/kestrel_interface_opensearch/tests/test_config.py deleted file mode 100644 index 85241b71..00000000 --- a/packages-nextgen/kestrel_interface_opensearch/tests/test_config.py +++ /dev/null @@ -1,52 +0,0 @@ -import os - -import yaml - -from kestrel_interface_opensearch.config import ( - PROFILE_PATH_ENV_VAR, - Connection, - load_config, -) - - -def test_load_config(tmp_path): - config = { - "connections": { - "localhost": { - "url": "https://localhost:9200", - "verify_certs": False, - "auth": { - "username": "admin", - "password": "admin", - } - }, - "some-cloud-thing": { - "url": "https://www.example.com:9200", - "verify_certs": True, - "auth": { - "username": "hunter", - "password": "super_secret", - } - } - }, - "indexes": { - "some_index": { - "connection": "some-cloud-thing", - "timestamp": "@timestamp", - "timestamp_format": "%Y-%m-%d %H:%M:%S.%f", - "data_model_mapping": str(tmp_path / "mapping.yaml") - } - } - } - map_file = tmp_path / "mapping.yaml" - with open(map_file, 'w') as fp: - fp.write("some.field: other.field\n") - config_file = tmp_path / "opensearch.yaml" - with open(config_file, 'w') as fp: - yaml.dump(config, fp) - os.environ[PROFILE_PATH_ENV_VAR] = str(config_file) - read_config = load_config() - conn: Connection = read_config.connections["localhost"] - assert conn.url == config["connections"]["localhost"]["url"] - assert read_config.connections["localhost"].url == config["connections"]["localhost"]["url"] - assert read_config.indexes["some_index"].timestamp 
== config["indexes"]["some_index"]["timestamp"] diff --git a/packages-nextgen/kestrel_interface_opensearch/tests/test_ossql.py b/packages-nextgen/kestrel_interface_opensearch/tests/test_ossql.py deleted file mode 100644 index 838b57e2..00000000 --- a/packages-nextgen/kestrel_interface_opensearch/tests/test_ossql.py +++ /dev/null @@ -1,127 +0,0 @@ -from datetime import datetime -from dateutil import parser - -from kestrel_interface_opensearch.ossql import OpenSearchTranslator -from kestrel.exceptions import UnsupportedOperatorError -from kestrel.ir.filter import ( - BoolExp, - ExpOp, - FComparison, - IntComparison, - ListOp, - ListComparison, - MultiComp, - NumCompOp, - StrCompOp, - StrComparison, - TimeRange, -) -from kestrel.ir.instructions import ( - DataSource, - Filter, - Limit, - Offset, - ProjectAttrs, - ProjectEntity, - Sort, - SortDirection, -) - -import pytest - - -TIMEFMT = '%Y-%m-%dT%H:%M:%S.%fZ' - - -# A much-simplified test mapping -data_model_map = { - "process": { - "cmd_line": "CommandLine", - "file": { - "path": "Image", - # "name": [ - # { - # "native_field": "Image", - # "native_value": "basename", - # "ocsf_op": "LIKE", - # "ocsf_value": "endswith" - # } - # ] - }, - "pid": "ProcessId", - "parent_process": { - "pid": "ParentProcessId", - }, - }, -} - -schema = { - "CommandLine": "text", - "Image": "text", - "ProcessId": "text", - "ParentProcessId": "text", -} - - -def _dt(timestr: str) -> datetime: - return parser.parse(timestr) - - -def _remove_nl(s): - return s.replace('\n', '') - - -@pytest.mark.parametrize( - "iseq, sql", [ - # Try a simple filter - ([Filter(IntComparison('foo', NumCompOp.GE, 0))], - "SELECT {} FROM my_table WHERE foo >= 0"), - # Try a simple filter with sorting - ([Filter(IntComparison('foo', NumCompOp.GE, 0)), Sort('bar')], - "SELECT {} FROM my_table WHERE foo >= 0 ORDER BY bar DESC"), - # Simple filter plus time range - ([Filter(IntComparison('foo', NumCompOp.GE, 0), timerange=TimeRange(_dt('2023-12-06T08:17:00Z'), _dt('2023-12-07T08:17:00Z')))], - "SELECT {} FROM my_table WHERE foo >= 0 AND timestamp >= '2023-12-06T08:17:00.000000Z' AND timestamp < '2023-12-07T08:17:00.000000Z'"), - # Add a limit and projection - ([Limit(3), ProjectAttrs(['foo', 'bar', 'baz']), Filter(StrComparison('foo', StrCompOp.EQ, 'abc'))], - "SELECT `foo`, `bar`, `baz` FROM my_table WHERE foo = 'abc' LIMIT 3"), - # Same as above but reverse order - ([Filter(StrComparison('foo', StrCompOp.EQ, 'abc')), ProjectAttrs(['foo', 'bar', 'baz']), Limit(3)], - "SELECT `foo`, `bar`, `baz` FROM my_table WHERE foo = 'abc' LIMIT 3"), - ([Filter(ListComparison('foo', ListOp.NIN, ['abc', 'def']))], - "SELECT {} FROM my_table WHERE foo NOT IN ('abc', 'def')"), - ([Filter(MultiComp(ExpOp.OR, [IntComparison('foo', NumCompOp.EQ, 1), IntComparison('bar', NumCompOp.EQ, 1)]))], - "SELECT {} FROM my_table WHERE foo = 1 OR bar = 1"), - ([Filter(MultiComp(ExpOp.AND, [IntComparison('foo', NumCompOp.EQ, 1), IntComparison('bar', NumCompOp.EQ, 1)]))], - "SELECT {} FROM my_table WHERE foo = 1 AND bar = 1"), - ([Limit(1000), Offset(2000)], - "SELECT {} FROM my_table LIMIT 2000, 1000"), - # Test entity projection - ([Limit(3), Filter(StrComparison('cmd_line', StrCompOp.EQ, 'foo bar')), ProjectEntity('process')], - "SELECT {} FROM my_table WHERE CommandLine = 'foo bar' LIMIT 3"), - ] -) -def test_opensearch_translator(iseq, sql): - cols = '`CommandLine` AS `cmd_line`, `Image` AS `file.path`, `ProcessId` AS `pid`, `ParentProcessId` AS `parent_process.pid`' - if ProjectEntity in {type(i) for i in iseq}: - 
cols = '`CommandLine` AS `cmd_line`, `Image` AS `file.path`, `ProcessId` AS `pid`, `ParentProcessId` AS `parent_process.pid`' - else: - cols = '`CommandLine` AS `process.cmd_line`, `Image` AS `process.file.path`, `ProcessId` AS `process.pid`, `ParentProcessId` AS `process.parent_process.pid`' - trans = OpenSearchTranslator(TIMEFMT, "timestamp", "my_table", data_model_map, schema) - for i in iseq: - trans.add_instruction(i) - result = trans.result() - assert _remove_nl(str(result)) == sql.format(cols) - - -@pytest.mark.parametrize( - "instruction", [ - Filter(StrComparison('foo', StrCompOp.MATCHES, '.*abc.*')), - Filter(StrComparison('foo', StrCompOp.NMATCHES, '.*abc.*')), - ] -) -def test_opensearch_translator_unsupported(instruction): - trans = OpenSearchTranslator(TIMEFMT, "timestamp", "my_table", data_model_map, schema) - with pytest.raises(UnsupportedOperatorError): - trans.add_instruction(instruction) - _ = trans.result() diff --git a/packages-nextgen/kestrel_interface_sqlalchemy/pyproject.toml b/packages-nextgen/kestrel_interface_sqlalchemy/pyproject.toml deleted file mode 100644 index c4309e70..00000000 --- a/packages-nextgen/kestrel_interface_sqlalchemy/pyproject.toml +++ /dev/null @@ -1,35 +0,0 @@ -[build-system] -requires = ["setuptools >= 68.2.2", "wheel"] -build-backend = "setuptools.build_meta" - -[project] -name = "kestrel_interface_sqlalchemy" -version = "2.0.0" -description = "Kestrel SQLAlchemy Datasource Interface" -readme = "README.rst" -requires-python = ">=3.8" -license = {text = "Apache 2.0 License"} -maintainers = [ - {name = "Xiaokui Shu", email = "xiaokui.shu@ibm.com"}, - {name = "Paul Coccoli", email = "pcoccoli@us.ibm.com"}, -] -keywords = [ - "kestrel", - "cybersecurity", - "threat hunting", -] -classifiers = [ - "Topic :: Security", - "Operating System :: OS Independent", - "Development Status :: 4 - Beta", - "Programming Language :: Python :: 3", -] - -dependencies = [ - "kestrel_core>=2.0.0", -] - -[project.urls] -Homepage = "https://github.com/opencybersecurityalliance/kestrel-lang" -Documentation = "https://kestrel.readthedocs.io/" -Repository = "https://github.com/opencybersecurityalliance/kestrel-lang.git" diff --git a/packages-nextgen/kestrel_interface_sqlalchemy/src/kestrel_interface_sqlalchemy/__init__.py b/packages-nextgen/kestrel_interface_sqlalchemy/src/kestrel_interface_sqlalchemy/__init__.py deleted file mode 100644 index 781df021..00000000 --- a/packages-nextgen/kestrel_interface_sqlalchemy/src/kestrel_interface_sqlalchemy/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from kestrel_interface_sqlalchemy.interface import SQLAlchemyInterface diff --git a/packages-nextgen/kestrel_interface_sqlalchemy/src/kestrel_interface_sqlalchemy/config.py b/packages-nextgen/kestrel_interface_sqlalchemy/src/kestrel_interface_sqlalchemy/config.py deleted file mode 100644 index e9d148e4..00000000 --- a/packages-nextgen/kestrel_interface_sqlalchemy/src/kestrel_interface_sqlalchemy/config.py +++ /dev/null @@ -1,58 +0,0 @@ -import logging -from dataclasses import dataclass, field -from typing import Dict, Mapping, Optional - -import yaml -from mashumaro.mixins.json import DataClassJSONMixin - -from kestrel.config.utils import ( - CONFIG_DIR_DEFAULT, - load_user_config, -) -from kestrel.exceptions import InterfaceNotConfigured -from kestrel.mapping.data_model import load_default_mapping - - -PROFILE_PATH_DEFAULT = CONFIG_DIR_DEFAULT / "sqlalchemy.yaml" -PROFILE_PATH_ENV_VAR = "KESTREL_SQLALCHEMY_CONFIG" - -_logger = logging.getLogger(__name__) - - -@dataclass -class 
Connection(DataClassJSONMixin): - url: str # SQLAlchemy "connection URL" or "connection string" - - -@dataclass -class Table(DataClassJSONMixin): - connection: str - timestamp: str - timestamp_format: str - data_model_mapping: Optional[str] = None # Filename for mapping - data_model_map: Mapping = field(default_factory=dict) - - def __post_init__(self): - if self.data_model_mapping: - with open(self.data_model_mapping, "r") as fp: - self.data_model_map = yaml.safe_load(fp) - else: - # Default to the built-in ECS mapping - self.data_model_map = load_default_mapping("ecs") # FIXME: need a default? - - -@dataclass -class Config(DataClassJSONMixin): - connections: Dict[str, Connection] - tables: Dict[str, Table] - - def __post_init__(self): - self.connections = {k: Connection(**v) for k, v in self.connections.items()} - self.tables = {k: Table(**v) for k, v in self.tables.items()} - - -def load_config(): - try: - return Config(**load_user_config(PROFILE_PATH_ENV_VAR, PROFILE_PATH_DEFAULT)) - except TypeError: - raise InterfaceNotConfigured() diff --git a/packages-nextgen/kestrel_interface_sqlalchemy/src/kestrel_interface_sqlalchemy/interface.py b/packages-nextgen/kestrel_interface_sqlalchemy/src/kestrel_interface_sqlalchemy/interface.py deleted file mode 100644 index 6197ab5e..00000000 --- a/packages-nextgen/kestrel_interface_sqlalchemy/src/kestrel_interface_sqlalchemy/interface.py +++ /dev/null @@ -1,268 +0,0 @@ -import logging -from functools import reduce -from typing import Callable, Iterable, Mapping, Optional -from uuid import UUID - -from pandas import DataFrame, read_sql -import sqlalchemy -from sqlalchemy import and_, column, or_ -from sqlalchemy.sql.elements import BooleanClauseList -from sqlalchemy.sql.expression import ColumnClause -from typeguard import typechecked - -from kestrel.display import GraphletExplanation -from kestrel.interface import AbstractInterface -from kestrel.interface.codegen.sql import SqlTranslator, comp2func -from kestrel.ir.filter import ( - BoolExp, - ExpOp, - FComparison, - MultiComp, - StrComparison, - StrCompOp, -) -from kestrel.ir.graph import IRGraphEvaluable -from kestrel.ir.instructions import ( - DataSource, - Filter, - Instruction, - ProjectAttrs, - ProjectEntity, - Return, - SolePredecessorTransformingInstruction, - SourceInstruction, - TransformingInstruction, - Variable, -) -from kestrel.mapping.data_model import ( - translate_comparison_to_native, - translate_dataframe, - translate_projection_to_native, -) - -from kestrel_interface_sqlalchemy.config import load_config - - -_logger = logging.getLogger(__name__) - - -@typechecked -class SQLAlchemyTranslator(SqlTranslator): - def __init__( - self, - dialect: sqlalchemy.engine.default.DefaultDialect, - timefmt: Callable, - timestamp: str, - from_obj: sqlalchemy.FromClause, - dmm: dict, - ): - super().__init__(dialect, timefmt, timestamp, from_obj) - self.dmm = dmm - self.proj = None - self.entity_type = None - - @typechecked - def _render_comp(self, comp: FComparison): - prefix = ( - f"{self.entity_type}." 
- if (self.entity_type and comp.field != self.timestamp) - else "" - ) - ocsf_field = f"{prefix}{comp.field}" - comps = translate_comparison_to_native( - self.dmm, ocsf_field, comp.op, comp.value - ) - translated_comps = [] - for comp in comps: - field, op, value = comp - col: ColumnClause = column(field) - if op == StrCompOp.NMATCHES: - tmp = ~comp2func[op](col, value) - else: - tmp = comp2func[op](col, value) - translated_comps.append(tmp) - return reduce(or_, translated_comps) - - @typechecked - def _render_multi_comp(self, comps: MultiComp): - op = and_ if comps.op == ExpOp.AND else or_ - return reduce(op, map(self._render_comp, comps.comps)) - - # This is copied verbatim from sql.py but we need to supply our own _render_comp - def _render_exp(self, exp: BoolExp) -> BooleanClauseList: - if isinstance(exp.lhs, BoolExp): - lhs = self._render_exp(exp.lhs) - elif isinstance(exp.lhs, MultiComp): - lhs = self._render_multi_comp(exp.lhs) - else: - lhs = self._render_comp(exp.lhs) - if isinstance(exp.rhs, BoolExp): - rhs = self._render_exp(exp.rhs) - elif isinstance(exp.rhs, MultiComp): - rhs = self._render_multi_comp(exp.rhs) - else: - rhs = self._render_comp(exp.rhs) - return and_(lhs, rhs) if exp.op == ExpOp.AND else or_(lhs, rhs) - - @typechecked - def _add_filter(self) -> Optional[str]: - if not self.filt: - return None - filt = self.filt - if filt.timerange.start: - # Convert the timerange to the appropriate pair of comparisons - start_comp = StrComparison( - self.timestamp, ">=", self.timefmt(filt.timerange.start) - ) - stop_comp = StrComparison( - self.timestamp, "<", self.timefmt(filt.timerange.stop) - ) - # AND them together - time_exp = BoolExp(start_comp, ExpOp.AND, stop_comp) - # AND that with any existing filter expression - exp = BoolExp(filt.exp, ExpOp.AND, time_exp) - else: - exp = filt.exp - if isinstance(exp, BoolExp): - comp = self._render_exp(exp) - elif isinstance(exp, MultiComp): - comp = self._render_multi_comp(exp) - else: - comp = self._render_comp(exp) - self.query = self.query.where(comp) - - def add_Filter(self, filt: Filter) -> None: - # Just save filter and compile it later - # Probably need the entity projection set first - self.filt = filt - - def add_ProjectAttrs(self, proj: ProjectAttrs) -> None: - self.proj = proj - - def add_ProjectEntity(self, proj: ProjectEntity) -> None: - self.entity_type = proj.entity_type - - def result(self) -> sqlalchemy.Compiled: - proj = self.proj.attrs if self.proj else None - pairs = translate_projection_to_native(self.dmm, self.entity_type, proj) - cols = [sqlalchemy.column(i).label(j) for i, j in pairs] - self._add_filter() - self.query = self.query.with_only_columns(*cols) # TODO: mapping? 
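-        # Compiling against the dialect captured in __init__ is what renders
-        # backend-appropriate SQL here; str() of the returned Compiled object
-        # yields the final statement text that evaluate_graph() hands to
-        # read_sql().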
- return self.query.compile(dialect=self.dialect) - - -class SQLAlchemyInterface(AbstractInterface): - def __init__( - self, - serialized_cache_catalog: Optional[str] = None, - session_id: Optional[UUID] = None, - ): - _logger.debug("SQLAlchemyInterface: loading config") - super().__init__(serialized_cache_catalog, session_id) - self.config = load_config() - self.schemas: dict = {} # Schema per table (index) - self.engines: dict = {} # Map of conn name -> engine - self.conns: dict = {} # Map of conn name -> connection - for info in self.config.tables.values(): - name = info.connection - conn_info = self.config.connections[name] - if name not in self.engines: - self.engines[name] = sqlalchemy.create_engine(conn_info.url) - if name not in self.conns: - engine = self.engines[name] - self.conns[name] = engine.connect() - _logger.debug("SQLAlchemyInterface: configured %s", name) - - @staticmethod - def schemes() -> Iterable[str]: - return ["sqlalchemy"] - - def store( - self, - instruction_id: UUID, - data: DataFrame, - ): - raise NotImplementedError("SQLAlchemyInterface.store") # TEMP - - def evaluate_graph( - self, - graph: IRGraphEvaluable, - instructions_to_evaluate: Optional[Iterable[Instruction]] = None, - ) -> Mapping[UUID, DataFrame]: - mapping = {} - if not instructions_to_evaluate: - instructions_to_evaluate = graph.get_sink_nodes() - for instruction in instructions_to_evaluate: - translator = self._evaluate_instruction_in_graph(graph, instruction) - # TODO: may catch error in case evaluation starts from incomplete SQL - sql = translator.result() - _logger.debug("SQL query generated: %s", sql) - # Get the "from" table for this query - tables = translator.query.selectable.get_final_froms() - table = tables[0].name # TODO: what if there's more than 1? - # Get the data source's SQLAlchemy connection object - conn = self.conns[self.config.tables[table].connection] - df = read_sql(sql, conn) - dmm = translator.dmm[ - translator.entity_type - ] # TODO: need a method for this? 
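-            # The native result columns were already aliased to OCSF names in
-            # result(); translate_dataframe() additionally applies the entity's
-            # value transformers (e.g. deriving file.name from a full image
-            # path) before the DataFrame is returned.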
-            mapping[instruction.id] = translate_dataframe(df, dmm)
-        return mapping
-
-    def explain_graph(
-        self,
-        graph: IRGraphEvaluable,
-        instructions_to_explain: Optional[Iterable[Instruction]] = None,
-    ) -> Mapping[UUID, GraphletExplanation]:
-        mapping = {}
-        if not instructions_to_explain:
-            instructions_to_explain = graph.get_sink_nodes()
-        for instruction in instructions_to_explain:
-            translator = self._evaluate_instruction_in_graph(graph, instruction)
-            dep_graph = graph.duplicate_dependent_subgraph_of_node(instruction)
-            graph_dict = dep_graph.to_dict()
-            query_stmt = translator.result()
-            mapping[instruction.id] = GraphletExplanation(graph_dict, query_stmt)
-        return mapping
-
-    def _evaluate_instruction_in_graph(
-        self,
-        graph: IRGraphEvaluable,
-        instruction: Instruction,
-    ) -> SQLAlchemyTranslator:
-        _logger.debug("instruction: %s", str(instruction))
-        translator = None
-        if isinstance(instruction, TransformingInstruction):
-            trunk, _r2n = graph.get_trunk_n_branches(instruction)
-            translator = self._evaluate_instruction_in_graph(graph, trunk)
-
-            if isinstance(instruction, SolePredecessorTransformingInstruction):
-                if isinstance(instruction, Return):
-                    pass
-                elif isinstance(instruction, Variable):
-                    pass
-                else:
-                    translator.add_instruction(instruction)
-
-            elif isinstance(instruction, Filter):
-                translator.add_instruction(instruction)
-
-            else:
-                raise NotImplementedError(f"Unknown instruction type: {instruction}")
-
-        elif isinstance(instruction, SourceInstruction):
-            if isinstance(instruction, DataSource):
-                ds = self.config.tables[instruction.datasource]
-                connection = ds.connection
-                dialect = self.engines[connection].dialect
-                translator = SQLAlchemyTranslator(
-                    dialect,
-                    lambda dt: dt.strftime(ds.timestamp_format),
-                    ds.timestamp,
-                    sqlalchemy.table(instruction.datasource),
-                    ds.data_model_map,
-                )
-            else:
-                raise NotImplementedError(f"Unhandled instruction type: {instruction}")
-
-        return translator
diff --git a/packages-nextgen/kestrel_interface_sqlalchemy/tests/test_config.py b/packages-nextgen/kestrel_interface_sqlalchemy/tests/test_config.py
deleted file mode 100644
index a19d97a6..00000000
--- a/packages-nextgen/kestrel_interface_sqlalchemy/tests/test_config.py
+++ /dev/null
@@ -1,42 +0,0 @@
-import os
-
-import yaml
-
-from kestrel_interface_sqlalchemy.config import (
-    PROFILE_PATH_ENV_VAR,
-    Connection,
-    load_config,
-)
-
-
-def test_load_config(tmp_path):
-    config = {
-        "connections": {
-            "localhost": {
-                "url": "sqlite:////home/jdoe/test.db",
-            },
-            "some-data-lake": {
-                "url": "presto://jdoe@example.com:8889/hive",
-            }
-        },
-        "tables": {
-            "cloud_table": {
-                "connection": "some-data-lake",
-                "timestamp": "eventTime",
-                "timestamp_format": "%Y-%m-%d %H:%M:%S.%f",
-                "data_model_mapping": str(tmp_path / "mapping.yaml")
-            }
-        }
-    }
-    map_file = tmp_path / "mapping.yaml"
-    with open(map_file, 'w') as fp:
-        fp.write("some.field: other.field\n")
-    config_file = tmp_path / "sqlalchemy.yaml"
-    with open(config_file, 'w') as fp:
-        yaml.dump(config, fp)
-    os.environ[PROFILE_PATH_ENV_VAR] = str(config_file)
-    read_config = load_config()
-    conn: Connection = read_config.connections["localhost"]
-    assert conn.url == config["connections"]["localhost"]["url"]
-    assert read_config.connections["localhost"].url == config["connections"]["localhost"]["url"]
-    assert read_config.tables["cloud_table"].timestamp == config["tables"]["cloud_table"]["timestamp"]
diff --git a/packages-nextgen/kestrel_jupyter/README.rst b/packages-nextgen/kestrel_jupyter/README.rst
deleted file mode 120000
index c768ff7d..00000000
--- a/packages-nextgen/kestrel_jupyter/README.rst
+++ /dev/null
@@ -1 +0,0 @@
-../../README.rst
\ No newline at end of file
diff --git a/packages-nextgen/kestrel_jupyter/pyproject.toml b/packages-nextgen/kestrel_jupyter/pyproject.toml
deleted file mode 100644
index 3cc31435..00000000
--- a/packages-nextgen/kestrel_jupyter/pyproject.toml
+++ /dev/null
@@ -1,56 +0,0 @@
-[build-system]
-requires = ["setuptools >= 68.2.2", "wheel"]
-build-backend = "setuptools.build_meta"
-
-[project]
-name = "kestrel_jupyter"
-version = "2.0.0"
-description = "Kestrel Jupyter Kernel"
-readme = "README.rst"
-requires-python = ">=3.8"
-license = {text = "Apache 2.0 License"}
-maintainers = [
-    {name = "Xiaokui Shu", email = "xiaokui.shu@ibm.com"},
-    {name = "Paul Coccoli", email = "pcoccoli@us.ibm.com"},
-]
-keywords = [
-    "kestrel",
-    "Jupyter",
-    "kernel",
-]
-classifiers = [
-    "Topic :: Security",
-    "Operating System :: OS Independent",
-    "Development Status :: 4 - Beta",
-    "Programming Language :: Python :: 3",
-]
-
-dependencies = [
-    "kestrel_core==2.0.0",
-    "jupyterlab-server",
-    "jupyterlab",
-    "jupyter_client",
-    "nbclassic",
-    "sqlparse==0.4.4",
-    "pygments==2.17.2",
-    "matplotlib==3.8.3",
-]
-
-[project.optional-dependencies]
-test = [
-    "pytest",
-]
-
-[project.urls]
-Homepage = "https://github.com/opencybersecurityalliance/kestrel-lang"
-Documentation = "https://kestrel.readthedocs.io/"
-Repository = "https://github.com/opencybersecurityalliance/kestrel-lang.git"
-
-[project.scripts]
-kestrel_jupyter_setup = "kestrel_jupyter_kernel.setup:run"
-
-[tool.setuptools.packages.find]
-where = ["src"]
-
-[tool.setuptools.package-data]
-"*" = ["*.js"]
diff --git a/packages-nextgen/kestrel_jupyter/src/kestrel_ipython/__init__.py b/packages-nextgen/kestrel_jupyter/src/kestrel_ipython/__init__.py
deleted file mode 100644
index b79424d7..00000000
--- a/packages-nextgen/kestrel_jupyter/src/kestrel_ipython/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-import kestrel_ipython.magic
diff --git a/packages-nextgen/kestrel_jupyter/src/kestrel_ipython/magic.py b/packages-nextgen/kestrel_jupyter/src/kestrel_ipython/magic.py
deleted file mode 100644
index aeac9c38..00000000
--- a/packages-nextgen/kestrel_jupyter/src/kestrel_ipython/magic.py
+++ /dev/null
@@ -1,64 +0,0 @@
-import sys
-import re
-
-from IPython.core.magic import (
-    line_cell_magic,
-    Magics,
-    magics_class,
-)
-
-from kestrel.session import Session
-
-
-@magics_class
-class KestrelMagic(Magics):
-    def __init__(self, shell=None, config=None, user_magics=None, **traits):
-        super().__init__(shell=shell, config=config, user_magics=user_magics, **traits)
-        self.session = None
-
-    def __check_magic(self, line="", cell=None):
-        """
-        Some non-Kestrel commands to handle separately for initializing the session.
-        This likely includes how to connect to UDI, ATK, and other parameters.
- """ - # regex is a simple hack - r = r"^\s*(session)\s+(init)\s*(true|false)?\s*$" - m = re.match(r, line, re.IGNORECASE) - if m is None: - return False - stderr = m.groups()[2] is not None and m.groups()[2].lower() == "true" - self.session = Session(stderr) - return True - - @line_cell_magic - def kestrel(self, line="", cell=None): - """ - session init [true / false] - """ - if self.__check_magic(line, cell): - if len(line) > 0: - line = "" - if cell is None: - return - - if self.session is None: - self.session = Session() - if len(line) == 0 and cell is None: - sys.stderr.write("Need to provide a Kestrel query to execute") - return None - if cell is None: - # assert cell is None - return self.session.execute(line) - else: - sys.stderr.write(repr(cell)) - if len(line) != 0: - self.session.execute(line) - return self.session.execute(cell) - # indx = line.lower().find('as df') - # if indx != -1: - # return pd.DataFrame.from_records(self.session.execute(line[:indx])[0]) - # else: return self.session.execute(line) - - -ip = get_ipython() -ip.register_magics(KestrelMagic) diff --git a/packages-nextgen/kestrel_jupyter/src/kestrel_jupyter_kernel/__init__.py b/packages-nextgen/kestrel_jupyter/src/kestrel_jupyter_kernel/__init__.py deleted file mode 100644 index e25addab..00000000 --- a/packages-nextgen/kestrel_jupyter/src/kestrel_jupyter_kernel/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from kestrel_jupyter_kernel.kernel import KestrelKernel diff --git a/packages-nextgen/kestrel_jupyter/src/kestrel_jupyter_kernel/__main__.py b/packages-nextgen/kestrel_jupyter/src/kestrel_jupyter_kernel/__main__.py deleted file mode 100644 index 5eebb1a3..00000000 --- a/packages-nextgen/kestrel_jupyter/src/kestrel_jupyter_kernel/__main__.py +++ /dev/null @@ -1,5 +0,0 @@ -from ipykernel.kernelapp import IPKernelApp -from kestrel_jupyter_kernel import KestrelKernel - -if __name__ == "__main__": - IPKernelApp.launch_instance(kernel_class=KestrelKernel) diff --git a/packages-nextgen/kestrel_jupyter/src/kestrel_jupyter_kernel/codemirror/__init__.py b/packages-nextgen/kestrel_jupyter/src/kestrel_jupyter_kernel/codemirror/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/packages-nextgen/kestrel_jupyter/src/kestrel_jupyter_kernel/codemirror/kestrel_template.js b/packages-nextgen/kestrel_jupyter/src/kestrel_jupyter_kernel/codemirror/kestrel_template.js deleted file mode 100644 index 9a9ac5cf..00000000 --- a/packages-nextgen/kestrel_jupyter/src/kestrel_jupyter_kernel/codemirror/kestrel_template.js +++ /dev/null @@ -1,153 +0,0 @@ -(function(mod) { - if (typeof exports == "object" && typeof module == "object") // CommonJS - mod(require("../../lib/codemirror")); - else if (typeof define == "function" && define.amd) // AMD - define(["../../lib/codemirror"], mod); - else // Plain browser env - mod(CodeMirror); -})(function(CodeMirror) { - "use strict"; - - CodeMirror.defineMode("kestrel", function() { - - function switchState(source, setState, f) { - setState(f); - return f(source, setState); - } - - var smallRE = /[a-z_]/; - var largeRE = /[A-Z]/; - var digitRE = /[0-9]/; - var hexitRE = /[0-9A-Fa-f]/; - var octitRE = /[0-7]/; - var idRE = /[a-z_A-Z0-9\']/; - var typeRE = /[a-zA-Z0-9-]/; - var symbolRE = /[-!#$%&*+.\/<=>?@\\^|~:]/; - var specialRE = /[(),;[\]`{}]/; - var whiteCharRE = /[ \t\v\f]/; // newlines are handled in tokenizer - var isoTimestamp = /[0-9:.\-TZ]/; - - function normal() { - return function (source, setState) { - if (source.eatWhile(whiteCharRE)) { - return null; - } - - var ch = 
source.next(); - - if (ch == '#') { - source.skipToEnd(); - return "comment"; - } - - if (ch == '\'') { - return switchState(source, setState, stringLiteral); - } - - if (ch == 't') { - if (source.eat('\'')) { - source.eatWhile(isoTimestamp); - if (source.eat('\'')) { - return "string-2"; - } - } - } - - if (typeRE.test(source)) { - source.eatWhile(typeRE); - return "type"; - } - - if (largeRE.test(ch)) { - source.eatWhile(idRE); - return "error"; - } - - if (smallRE.test(ch)) { - source.eatWhile(idRE); - return "variable"; - } - - if (digitRE.test(ch)) { - if (ch == '0') { - if (source.eat(/[xX]/)) { - source.eatWhile(hexitRE); // should require at least 1 - return "integer"; - } - if (source.eat(/[oO]/)) { - source.eatWhile(octitRE); // should require at least 1 - return "number"; - } - } - source.eatWhile(digitRE); - var t = "number"; - if (source.eat('.')) { - t = "number"; - source.eatWhile(digitRE); // should require at least 1 - } - if (source.eat(/[eE]/)) { - t = "number"; - source.eat(/[-+]/); - source.eatWhile(digitRE); // should require at least 1 - } - return t; - } - - if (symbolRE.test(ch)) { - if (ch == '#') { - source.skipToEnd(); - return "comment"; - } - } - - return "error"; - } - } - - function stringLiteral(source, setState) { - while (!source.eol()) { - var ch = source.next(); - if (ch == '\'') { - setState(normal()); - return "string"; - } - // escape handling: need to test correctness - //if (ch == '\\') { - // if (source.eat('\'')) source.next(); - //} - } - setState(normal()); - return "error"; - } - - var wellKnownWords = (function() { - var wkw = {}; - - var keywords = <<>>; - - for (var i = keywords.length; i--;) - wkw[keywords[i]] = "keyword"; - - var ops = ["IN", "NOT", "LIKE", "MATCHES", "ISSUBSET", "in", "not", "like", "matches", "isubset", "=", "!=", "<", ">", "<=", ">=",]; - - for (var i = ops.length; i--;) - wkw[ops[i]] = "operator"; - - return wkw; - })(); - - return { - startState: function () { return { f: normal() }; }, - copyState: function (s) { return { f: s.f }; }, - - token: function(stream, state) { - var t = state.f(stream, function(s) { state.f = s; }); - var w = stream.current(); - return (wellKnownWords.hasOwnProperty(w)) ? 
wellKnownWords[w] : t; - } - }; - - }); - - CodeMirror.defineMIME("text/x-kestrel", "kestrel"); -}); diff --git a/packages-nextgen/kestrel_jupyter/src/kestrel_jupyter_kernel/codemirror/setup.py b/packages-nextgen/kestrel_jupyter/src/kestrel_jupyter_kernel/codemirror/setup.py deleted file mode 100644 index 944569fd..00000000 --- a/packages-nextgen/kestrel_jupyter/src/kestrel_jupyter_kernel/codemirror/setup.py +++ /dev/null @@ -1,53 +0,0 @@ -import os -import json -import nbclassic -import notebook -import pkgutil -import kestrel - - -def update_codemirror_mode(): - for codemirror_file_path in _get_codemirror_file_paths(): - src_current = "" - if os.path.isfile(codemirror_file_path): - try: - with open(codemirror_file_path) as fp: - src_current = fp.read() - except PermissionError: - pass - - src_latest = _instantiate_codemirror_mode_src() - - if src_latest != src_current: - try: - with open(codemirror_file_path, "w") as fp: - fp.write(src_latest) - except PermissionError: - pass - - -################################################################ -# Private Functions -################################################################ - - -def _get_codemirror_file_paths(): - paths = [] - for pkg_path in (notebook.__path__[0], nbclassic.__path__[0]): - codemirror_dir = os.path.join(pkg_path, "static/components/codemirror/mode") - if os.path.isdir(codemirror_dir): - kestrel_dir = os.path.join(codemirror_dir, "kestrel") - if not os.path.isdir(kestrel_dir): - try: - os.mkdir(kestrel_dir) - except PermissionError: - pass - paths.append(os.path.join(kestrel_dir, "kestrel.js")) - return paths - - -def _instantiate_codemirror_mode_src(): - keywords = json.dumps(kestrel.frontend.parser.get_keywords()) - codemirror_src = pkgutil.get_data(__name__, "kestrel_template.js").decode("utf-8") - codemirror_src = codemirror_src.replace("<<>>", keywords) - return codemirror_src diff --git a/packages-nextgen/kestrel_jupyter/src/kestrel_jupyter_kernel/config.py b/packages-nextgen/kestrel_jupyter/src/kestrel_jupyter_kernel/config.py deleted file mode 100644 index 83d6c93f..00000000 --- a/packages-nextgen/kestrel_jupyter/src/kestrel_jupyter_kernel/config.py +++ /dev/null @@ -1 +0,0 @@ -LOG_FILE_NAME = "session.log" diff --git a/packages-nextgen/kestrel_jupyter/src/kestrel_jupyter_kernel/display.py b/packages-nextgen/kestrel_jupyter/src/kestrel_jupyter_kernel/display.py deleted file mode 100644 index 21e10883..00000000 --- a/packages-nextgen/kestrel_jupyter/src/kestrel_jupyter_kernel/display.py +++ /dev/null @@ -1,68 +0,0 @@ -from pandas import DataFrame -import tempfile -import base64 -import sqlparse -from typing import Iterable, Mapping -from pygments import highlight -from pygments.lexers import guess_lexer -from pygments.lexers.sql import SqlLexer -from pygments.lexers.kusto import KustoLexer -from pygments.formatters import HtmlFormatter -import networkx as nx -import matplotlib.pyplot as plt - -from kestrel.display import Display, GraphExplanation -from kestrel.ir.graph import IRGraph -from kestrel.ir.instructions import Instruction, DataSource, Variable, Construct - - -def gen_label_mapping(g: IRGraph) -> Mapping[Instruction, str]: - d = {} - for n in g: - if isinstance(n, Variable): - d[n] = n.name - elif isinstance(n, Construct): - d[n] = n.id.hex[:4] - elif isinstance(n, DataSource): - d[n] = n.datasource - else: - d[n] = f"[{n.instruction.upper()}]" - return d - - -def to_html_blocks(d: Display) -> Iterable[str]: - if isinstance(d, DataFrame): - yield d.to_html() - elif isinstance(d, GraphExplanation): - 
for graphlet in d.graphlets: - graph = IRGraph(graphlet.graph) - plt.figure(figsize=(4, 2)) - nx.draw( - graph, - with_labels=True, - labels=gen_label_mapping(graph), - font_size=8, - node_size=260, - node_color="#bfdff5", - ) - with tempfile.NamedTemporaryFile(delete_on_close=False) as tf: - tf.close() - plt.savefig(tf.name, format="png") - with open(tf.name, "rb") as tfx: - data = tfx.read() - - img = data_uri = base64.b64encode(data).decode("utf-8") - imgx = f'' - yield imgx - - query = graphlet.query.statement - if graphlet.query.language == "SQL": - lexer = SqlLexer() - query = sqlparse.format(query, reindent=True, keyword_case="upper") - elif graphlet.query.language == "KQL": - lexer = KustoLexer() - else: - lexer = guess_lexer(query) - query = highlight(query, lexer, HtmlFormatter()) - style = "" - yield style + query diff --git a/packages-nextgen/kestrel_jupyter/src/kestrel_jupyter_kernel/kernel.py b/packages-nextgen/kestrel_jupyter/src/kestrel_jupyter_kernel/kernel.py deleted file mode 100644 index 456cde96..00000000 --- a/packages-nextgen/kestrel_jupyter/src/kestrel_jupyter_kernel/kernel.py +++ /dev/null @@ -1,60 +0,0 @@ -from ipykernel.kernelbase import Kernel -import logging -import networkx as nx - -from kestrel.session import Session -from kestrel_jupyter_kernel.display import to_html_blocks - - -_logger = logging.getLogger(__name__) - - -class KestrelKernel(Kernel): - implementation = "kestrel" - implementation_version = "2.0" - language = "kestrel" - language_version = "2.0" - # https://jupyter-client.readthedocs.io/en/stable/messaging.html#msging-kernel-info - language_info = {"name": "kestrel", "file_extension": ".hf"} - banner = "Kestrel" - - def __init__(self, **kwargs): - super().__init__(**kwargs) - self.kestrel_session = Session() - - def do_complete(self, code, cursor_pos): - return { - "matches": self.kestrel_session.do_complete(code, cursor_pos), - "cursor_end": cursor_pos, - "cursor_start": cursor_pos, - "metadata": {}, - "status": "ok", - } - - def do_execute( - self, code, silent, store_history=True, user_expressions=None, allow_stdin=False - ): - if not silent: - try: - for result in self.kestrel_session.execute_to_generate(code): - for html in to_html_blocks(result): - self.send_response( - self.iopub_socket, - "display_data", - {"data": {"text/html": html}, "metadata": {}}, - ) - # how to clear output (if needed in the future): - # self.send_response(self.iopub_socket, "clear_output") - - except Exception as e: - _logger.error("Exception occurred", exc_info=True) - self.send_response( - self.iopub_socket, "stream", {"name": "stderr", "text": str(e)} - ) - - return { - "status": "ok", - "execution_count": self.execution_count, - "payload": [], - "user_expressions": {}, - } diff --git a/packages-nextgen/kestrel_jupyter/src/kestrel_jupyter_kernel/setup.py b/packages-nextgen/kestrel_jupyter/src/kestrel_jupyter_kernel/setup.py deleted file mode 100644 index f0c884c4..00000000 --- a/packages-nextgen/kestrel_jupyter/src/kestrel_jupyter_kernel/setup.py +++ /dev/null @@ -1,50 +0,0 @@ -################################################################ -# Setup Kestrel Jupyter Kernel -# -# This module setups the Kestrel Jupyter kernel: -# 1. install the kernel to Jupyter environment (local env) -# 2. generate codemirror mode for Kestrel based on the -# installed kestrel Python package for syntax highlighting -# 3. 
install the codemirror mode into Jupyter -# -# Install: pip will install the utility `kestrel_jupyter_setup` -# -# Usage: `kestrel_jupyter_setup` -# -################################################################ - -import os -import tempfile -import json -from jupyter_client.kernelspec import KernelSpecManager -from kestrel_jupyter_kernel.codemirror.setup import update_codemirror_mode - -_KERNEL_SPEC = { - "argv": ["python3", "-m", "kestrel_jupyter_kernel", "-f", "{connection_file}"], - "display_name": "Kestrel", - "language": "kestrel", -} - - -def install_kernelspec(): - with tempfile.TemporaryDirectory() as tmp_dirname: - kernel_dirname = os.path.join(tmp_dirname, "kestrel_kernel") - os.mkdir(kernel_dirname) - kernel_filename = os.path.join(kernel_dirname, "kernel.json") - with open(kernel_filename, "w") as kf: - json.dump(_KERNEL_SPEC, kf) - - m = KernelSpecManager() - m.install_kernel_spec(kernel_dirname, "kestrel", user=True) - - -def run(): - print("Setup Kestrel Jupyter Kernel") - print(" Install new Jupyter kernel ...", end=" ") - install_kernelspec() - print("done") - - # generate and install kestrel codemirrmor mode - print(" Compute and install syntax highlighting ...", end=" ") - update_codemirror_mode() - print("done") diff --git a/packages-nextgen/kestrel_jupyter/tests/__init__.py b/packages-nextgen/kestrel_jupyter/tests/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/packages-nextgen/kestrel_jupyter/tests/test_kernel_install.py b/packages-nextgen/kestrel_jupyter/tests/test_kernel_install.py deleted file mode 100644 index faa29375..00000000 --- a/packages-nextgen/kestrel_jupyter/tests/test_kernel_install.py +++ /dev/null @@ -1,13 +0,0 @@ -from jupyter_client.kernelspec import KernelSpecManager - -from kestrel_jupyter_kernel.setup import install_kernelspec - - -def test_kernel_install(): - m = KernelSpecManager() - ks = m.get_all_specs() - if "kestrel" in ks: - m.remove_kernel_spec("kestrel") - - install_kernelspec() - assert "kestrel" in m.get_all_specs() diff --git a/packages-nextgen/kestrel_jupyter/tests/test_notebook_syntax_gen.py b/packages-nextgen/kestrel_jupyter/tests/test_notebook_syntax_gen.py deleted file mode 100644 index 8511a28a..00000000 --- a/packages-nextgen/kestrel_jupyter/tests/test_notebook_syntax_gen.py +++ /dev/null @@ -1,13 +0,0 @@ -from os.path import exists - -from kestrel_jupyter_kernel.codemirror.setup import ( - update_codemirror_mode, - _get_codemirror_file_paths, -) - - -def test_notebook_syntax_gen(): - js_paths = _get_codemirror_file_paths() - update_codemirror_mode() - for js_path in js_paths: - assert exists(js_path) diff --git a/packages/kestrel_datasource_stixshifter/pyproject.toml b/packages/kestrel_datasource_stixshifter/pyproject.toml index 05e831f7..ecc22b3e 100644 --- a/packages/kestrel_datasource_stixshifter/pyproject.toml +++ b/packages/kestrel_datasource_stixshifter/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "kestrel_datasource_stixshifter" -version = "1.8.2" +version = "1.8.3" description = "Kestrel STIX-shifter Datasource Interface" readme = "README.rst" requires-python = ">=3.8" @@ -28,7 +28,7 @@ classifiers = [ dependencies = [ "kestrel_core>=1.8.1", - "lxml>=5.2.1", + "lxml==4.9.4", # Python 3.8 on mac error >5.0.0; stackoverflow #75442675 "requests>=2.31.0", "nest-asyncio>=1.6.0", "stix-shifter==7.0.6", diff --git a/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/cli.py 
b/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/cli.py index 3cc90fb5..b4cc2df6 100644 --- a/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/cli.py +++ b/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/cli.py @@ -1,18 +1,22 @@ import argparse import datetime import logging +import sys from kestrel_datasource_stixshifter.diagnosis import Diagnosis from kestrel_datasource_stixshifter.connector import setup_connector_module from firepit.timestamp import timefmt -def default_patterns(use_now_as_stop_time: bool): - start_time = "START t'2000-01-01T00:00:00.000Z'" - stop_time = ( - f"STOP t'{timefmt(datetime.datetime.utcnow())}'" - if use_now_as_stop_time - else "STOP t'3000-01-01T00:00:00.000Z'" - ) +def default_patterns(start=None, stop=None, last_minutes=0): + if start: + start_time = f"START t'{start}'" + stop_time = f"STOP t'{stop}'" + else: + to_time = datetime.datetime.utcnow() + from_time = timefmt(to_time - datetime.timedelta(minutes=last_minutes)) + to_time = timefmt(to_time) + start_time = f"START t'{from_time}'" + stop_time = f"STOP t'{to_time}'" patterns = [ "[ipv4-addr:value != '255.255.255.255']", "[process:pid > 0]", @@ -45,9 +49,23 @@ def stix_shifter_diag(): ) parser.add_argument( "--stop-at-now", - help="use the current timestamp as the STOP time instead of default year 3000 for default patterns", + help="ignored (retained for backwards compatibility)", action="store_true", ) + parser.add_argument( + "--start", + help="start time for default pattern search (%Y-%m-%dT%H:%M:%S.%fZ)", + ) + parser.add_argument( + "--stop", + help="stop time for default pattern search (%Y-%m-%dT%H:%M:%S.%fZ)", + ) + parser.add_argument( + "--last-minutes", + help="relative timespan for default pattern searches in minutes", + default=5, + type=int, + ) parser.add_argument( "-t", "--translate-only", @@ -68,13 +86,21 @@ def stix_shifter_diag(): ch.setFormatter(formatter) logger.addHandler(ch) + if (args.start and not args.stop) or (args.stop and not args.start): + print( + "Must specify both --start and --stop for absolute time range; else use --last-minutes", + file=sys.stderr, + ) + parser.print_usage(sys.stderr) + sys.exit(1) + if args.stix_pattern: patterns = [args.stix_pattern] elif args.pattern_file: with open(args.pattern_file) as pf: patterns = [pf.read()] else: - patterns = default_patterns(args.stop_at_now) + patterns = default_patterns(args.start, args.stop, args.last_minutes) diag = Diagnosis(args.datasource) diff --git a/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/config.py b/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/config.py index 73eb8ff8..e0f4be9e 100644 --- a/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/config.py +++ b/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/config.py @@ -20,6 +20,7 @@ ENV_VAR_PREFIX = "STIXSHIFTER_" RETRIEVAL_BATCH_SIZE = 2000 SINGLE_BATCH_TIMEOUT = 60 +SUBQUERY_TIME_WINDOW_IN_SECONDS = 0 # if >0, then segment START/STOP into this Windows Size to file multiple subqueries COOL_DOWN_AFTER_TRANSMISSION = 0 ALLOW_DEV_CONNECTOR = False VERIFY_CERT = True @@ -184,6 +185,14 @@ def get_datasource_from_profiles(profile_name, profiles): profile_name, ) + subquery_time_window = _extract_param_from_connection_config( + "subquery_time_window", + int, + SUBQUERY_TIME_WINDOW_IN_SECONDS, + connection, + profile_name, + ) + return ( connector_name, connection, @@ -192,6 
+201,7 @@ def get_datasource_from_profiles(profile_name, profiles): cool_down_after_transmission, allow_dev_connector, verify_cert, + subquery_time_window, ) diff --git a/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/diagnosis.py b/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/diagnosis.py index c3631f7a..c725e543 100644 --- a/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/diagnosis.py +++ b/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/diagnosis.py @@ -30,6 +30,7 @@ def __init__(self, datasource_name): self.cool_down_after_transmission, self.allow_dev_connector, self.verify_cert, + self.subquery_time_window, ) = get_datasource_from_profiles(datasource_name, self.profiles) self.if_fast_translation = ( self.connector_name in self.kestrel_options["fast_translate"] @@ -50,6 +51,10 @@ def diagnose_config(self): print("#### Kestrel specific config") print(f"retrieval batch size: {self.retrieval_batch_size}") print(f"cool down after transmission: {self.cool_down_after_transmission}") + print(f"allow unverified connector: {self.allow_dev_connector}") + print(f"verify SSL or not: {self.verify_cert}") + print(f"split query into subquery: {bool(self.subquery_time_window)}") + print(f"subquery with time window (in seconds): {self.subquery_time_window}") print(f"enable fast translation: {self.if_fast_translation}") print() diff --git a/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/interface.py b/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/interface.py index 9435cebe..3c2c43a5 100644 --- a/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/interface.py +++ b/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/interface.py @@ -32,6 +32,7 @@ retrieval_batch_size: 10000 # set to 10000 to match default Elasticsearch page size; Kestrel default across connectors: 2000 single_batch_timeout: 120 # increase it if hit 60 seconds (Kestrel default) timeout error for each batch of retrieval cool_down_after_transmission: 2 # seconds to cool down between data source API calls, required by some API such as sentinelone; Kestrel default: 0 + subquery_time_window: 3600 # split each query into multiple subqueries with smaller time windows specified here in seconds; Kestrel default: 0 (not split query) allow_dev_connector: True # do not check version of a connector to allow custom/testing connector installed with any version; Kestrel default: False dialects: # more info: https://github.com/opencybersecurityalliance/stix-shifter/tree/develop/stix_shifter_modules/elastic_ecs#dialects - beats # need it if the index is created by Filebeat/Winlogbeat/*beat diff --git a/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/query.py b/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/query.py index 46b07b7f..603dc67e 100644 --- a/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/query.py +++ b/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/query.py @@ -11,6 +11,7 @@ from kestrel.exceptions import DataSourceError, DataSourceManagerInternalError from kestrel_datasource_stixshifter.connector import setup_connector_module from kestrel_datasource_stixshifter import multiproc +from kestrel_datasource_stixshifter.subquery import split_subquery_by_time_window from kestrel_datasource_stixshifter.config import ( 
     get_datasource_from_profiles,
     load_options,
@@ -63,16 +64,16 @@ def query_datasource(uri, pattern, session_id, config, store, limit=None):
     _logger.debug(f"prepare query with ID: {query_id}")
 
     num_records = 0
-    profile_limit = limit
+    limit_per_profile = limit
     for profile in profiles:
         if limit:
             if num_records >= limit:
                 break
             if num_records > 0:
-                profile_limit = limit - num_records
+                limit_per_profile = limit - num_records
         _logger.debug(f"entering stix-shifter data source: {profile}")
-        _logger.debug(f"profile = {profile}, profile_limit = {profile_limit}")
+        _logger.debug(f"profile = {profile}, limit_per_profile = {limit_per_profile}")
         # STIX-shifter will alter the config objects, thus making them not reusable.
         # So only give STIX-shifter a copy of the configs.
         # Check `modernize` functions in the `stix_shifter_utils` for details.
@@ -84,6 +85,7 @@ def query_datasource(uri, pattern, session_id, config, store, limit=None):
             cool_down_after_transmission,
             allow_dev_connector,
             verify_cert,
+            subquery_time_window,
         ) = map(
             copy.deepcopy, get_datasource_from_profiles(profile, config["profiles"])
         )
@@ -98,43 +100,52 @@ def query_datasource(uri, pattern, session_id, config, store, limit=None):
 
         observation_metadata = gen_observation_metadata(connector_name, query_id)
 
-        dsl = translate_query(
-            connector_name, observation_metadata, pattern, connection_dict
-        )
+        for pattern in split_subquery_by_time_window(pattern, subquery_time_window):
+
+            if limit_per_profile:
+                if num_records >= limit_per_profile:
+                    _logger.debug("skip remaining subqueries: return limit reached")
+                    break
+                if num_records > 0:
+                    limit_per_profile = limit_per_profile - num_records
 
-        raw_records_queue = Queue()
-        translated_data_queue = Queue()
+            dsl = translate_query(
+                connector_name, observation_metadata, pattern, connection_dict
+            )
 
-        exceptions = []
+            raw_records_queue = Queue()
+            translated_data_queue = Queue()
 
-        with multiproc.translate(
-            connector_name,
-            observation_metadata,
-            connection_dict.get("options", {}),
-            cache_data_path_prefix,
-            connector_name in config["options"]["fast_translate"],
-            raw_records_queue,
-            translated_data_queue,
-            config["options"]["translation_workers_count"],
-        ):
-            with multiproc.transmit(
+            exceptions = []
+
+            with multiproc.translate(
                 connector_name,
-                connection_dict,
-                configuration_dict,
-                retrieval_batch_size,
-                config["options"]["translation_workers_count"],
-                cool_down_after_transmission,
-                verify_cert,
-                dsl["queries"],
+                observation_metadata,
+                connection_dict.get("options", {}),
+                cache_data_path_prefix,
+                connector_name in config["options"]["fast_translate"],
                 raw_records_queue,
-                profile_limit,
+                translated_data_queue,
+                config["options"]["translation_workers_count"],
             ):
-                for result in multiproc.read_translated_results(
-                    translated_data_queue,
+                with multiproc.transmit(
+                    connector_name,
+                    connection_dict,
+                    configuration_dict,
+                    retrieval_batch_size,
                     config["options"]["translation_workers_count"],
+                    cool_down_after_transmission,
+                    verify_cert,
+                    dsl["queries"],
+                    raw_records_queue,
+                    limit_per_profile,
                 ):
-                    num_records += get_num_objects(result)
-                    ingest(result, observation_metadata, query_id, store)
+                    for result in multiproc.read_translated_results(
+                        translated_data_queue,
+                        config["options"]["translation_workers_count"],
+                    ):
+                        num_records += get_num_objects(result)
+                        ingest(result, observation_metadata, query_id, store)
 
     return ReturnFromStore(query_id)
diff --git a/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/subquery.py b/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/subquery.py
new file mode 100644
index 00000000..116c4c34
--- /dev/null
+++ b/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/subquery.py
@@ -0,0 +1,47 @@
+import logging
+from typeguard import typechecked
+from typing import Iterable
+from datetime import timedelta
+
+from firepit import timestamp
+
+
+_logger = logging.getLogger(__name__)
+
+
+@typechecked
+def split_subquery_by_time_window(
+    stix_pattern: str, time_win_unit_in_seconds: int
+) -> Iterable[str]:
+    if not time_win_unit_in_seconds:
+        _logger.debug("not using time-window-based subquery")
+        yield stix_pattern
+    else:
+        items = stix_pattern.split()
+        if items[-2] != "STOP" or items[-4] != "START":
+            # no timestamp in pattern
+            _logger.debug("not using subquery: no time range in pattern")
+            yield stix_pattern
+        else:
+            stop_entire = timestamp.to_datetime(items[-1][2:-1])
+            start_entire = timestamp.to_datetime(items[-3][2:-1])
+            stop = stop_entire
+            start = start_entire
+            time_window_unit = timedelta(seconds=time_win_unit_in_seconds)
+            while stop - time_window_unit > start_entire:
+                start = stop - time_window_unit
+                _items = items[:]
+                _items[-3] = f"t'{timestamp.timefmt(start)}'"
+                _items[-1] = f"t'{timestamp.timefmt(stop)}'"
+                subquery_pattern = " ".join(_items)
+                _logger.debug(f"subquery pattern generated: {subquery_pattern}")
+                yield subquery_pattern
+                stop = start
+            else:
+                start = start_entire
+                _items = items[:]
+                _items[-3] = f"t'{timestamp.timefmt(start)}'"
+                _items[-1] = f"t'{timestamp.timefmt(stop)}'"
+                subquery_pattern = " ".join(_items)
+                _logger.debug(f"subquery pattern generated: {subquery_pattern}")
+                yield subquery_pattern
diff --git a/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/worker/transmitter.py b/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/worker/transmitter.py
index 31534781..423d7566 100644
--- a/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/worker/transmitter.py
+++ b/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/worker/transmitter.py
@@ -192,9 +192,16 @@ def retrieve_data(self):
                 )
 
                 # prepare for next round retrieval
-                result_retrieval_offset += len(result_batch["data"])
+                result_len = len(result_batch["data"])
+                result_retrieval_offset += result_len
+
+                if result_len < batch_size:
+                    has_remaining_results = False
+
                 if "metadata" in result_batch:
                     metadata = result_batch["metadata"]
+                else:
+                    has_remaining_results = False
 
                 if self.limit:
                     if result_retrieval_offset >= self.limit:
diff --git a/packages/kestrel_datasource_stixshifter/tests/test_command_get.py b/packages/kestrel_datasource_stixshifter/tests/test_command_get.py
index 058205f3..624eb09c 100644
--- a/packages/kestrel_datasource_stixshifter/tests/test_command_get.py
+++ b/packages/kestrel_datasource_stixshifter/tests/test_command_get.py
@@ -166,11 +166,8 @@ def test_get_multiple_stixshifter_stix_limit_1(set_no_prefetch_kestrel_config, s
     s.execute(stmt)
     v = s.get_variable("var")
 
-    # The extended graph [ipv4-addr:value = '127.0.0.1'] is recognized and
-    # merged to prefetch query, resultsing in limited (32) processes. If
-    # not used by prefetch, the total number of process records prefetched
-    # is 240.
-    assert len(v) == 28
+    # HOST1 returns 26, which is larger than 15
+    assert len(v) == 26
     for i in range(len(v)):
         assert v[i]["type"] == "process"
         assert v[i]["name"] in [
diff --git a/packages/kestrel_datasource_stixshifter/tests/test_stixshifter.py b/packages/kestrel_datasource_stixshifter/tests/test_stixshifter.py
index 610a513c..93722c41 100644
--- a/packages/kestrel_datasource_stixshifter/tests/test_stixshifter.py
+++ b/packages/kestrel_datasource_stixshifter/tests/test_stixshifter.py
@@ -79,6 +79,7 @@ def test_yaml_profiles_refresh(tmp_path):
         cool_down_after_transmission: 5
         allow_dev_connector: True
         verify_cert: false
+        subquery_time_window: 600
         dialects:
           - beats
     config:
@@ -107,7 +108,7 @@
     ss_config = s.config["datasources"]["kestrel_datasource_stixshifter"]
     ss_profiles = ss_config["profiles"]
 
-    connector_name, connection, configuration, retrieval_batch_size, cool_down_after_transmission, allow_dev_connector, verify_cert = get_datasource_from_profiles("host101", ss_profiles)
+    connector_name, connection, configuration, retrieval_batch_size, cool_down_after_transmission, allow_dev_connector, verify_cert, subquery_time_window = get_datasource_from_profiles("host101", ss_profiles)
     assert connector_name == "elastic_ecs"
     assert configuration["auth"]["id"] == "profileA"
     assert configuration["auth"]["api_key"] == "qwer"
@@ -116,6 +117,7 @@
     assert retrieval_batch_size == 2000
     assert cool_down_after_transmission == 0
     assert verify_cert == True
+    assert subquery_time_window == 0
 
     with open(profile_file, "w") as pf:
         pf.write(profileB)
@@ -124,7 +126,7 @@
 
     # need to refresh the pointers since the dict is updated
     ss_profiles = ss_config["profiles"]
-    connector_name, connection, configuration, retrieval_batch_size, cool_down_after_transmission, allow_dev_connector, verify_cert = get_datasource_from_profiles("host101", ss_profiles)
+    connector_name, connection, configuration, retrieval_batch_size, cool_down_after_transmission, allow_dev_connector, verify_cert, subquery_time_window = get_datasource_from_profiles("host101", ss_profiles)
     assert connector_name == "elastic_ecs"
     assert configuration["auth"]["id"] == "profileB"
     assert configuration["auth"]["api_key"] == "xxxxxx"
@@ -134,5 +136,6 @@
     assert cool_down_after_transmission == 5
     assert allow_dev_connector == True
     assert verify_cert == False
+    assert subquery_time_window == 600
 
     del os.environ["KESTREL_STIXSHIFTER_CONFIG"]
diff --git a/packages/kestrel_datasource_stixshifter/tests/test_stixshifter_diagnosis.py b/packages/kestrel_datasource_stixshifter/tests/test_stixshifter_diagnosis.py
index e406306b..a382cff0 100644
--- a/packages/kestrel_datasource_stixshifter/tests/test_stixshifter_diagnosis.py
+++ b/packages/kestrel_datasource_stixshifter/tests/test_stixshifter_diagnosis.py
@@ -31,6 +31,10 @@ def test_cli(stixshifter_profile_lab101):
 #### Kestrel specific config
 retrieval batch size: 2000
 cool down after transmission: 0
+allow unverified connector: False
+verify SSL certificate: True
+split query into subqueries: False
+subquery time window (in seconds): 0
 enable fast translation: False
 
 #### Config to be passed to stix-shifter
@@ -78,7 +82,7 @@
 """
 
     result = subprocess.run(
-        args=[STIX_SHIFTER_DIAG, "lab101"],
+        args=[STIX_SHIFTER_DIAG, "--start=2000-01-01T00:00:00.000Z", "--stop=3000-01-01T00:00:00.000Z", "lab101"],
         universal_newlines=True,
         stdout=subprocess.PIPE,
     )
@@ -98,6 +102,10 @@ def test_cli_ecs(stixshifter_profile_ecs):
 #### Kestrel specific config
 retrieval batch size: 2000
 cool down after transmission: 0
+allow unverified connector: False
+verify SSL certificate: True
+split query into subqueries: False
+subquery time window (in seconds): 0
 enable fast translation: False
 
 #### Config to be passed to stix-shifter
diff --git a/packages/kestrel_jupyter/pyproject.toml b/packages/kestrel_jupyter/pyproject.toml
index 1375304b..d54f4c0e 100644
--- a/packages/kestrel_jupyter/pyproject.toml
+++ b/packages/kestrel_jupyter/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "kestrel_jupyter"
-version = "1.8.4"
+version = "1.8.5"
 description = "Kestrel Jupyter Kernel"
 readme = "README.rst"
 requires-python = ">=3.8"
@@ -28,7 +28,7 @@ classifiers = [
 dependencies = [
     "kestrel_core==1.8.2",
     "kestrel_datasource_stixbundle==1.8.0",
-    "kestrel_datasource_stixshifter==1.8.2",
+    "kestrel_datasource_stixshifter==1.8.3",
     "kestrel_analytics_python==1.8.0",
     "kestrel_analytics_docker==1.8.1",
     "jupyterlab-server",