diff --git a/.gitignore b/.gitignore index b6e4761..ca676f3 100644 --- a/.gitignore +++ b/.gitignore @@ -127,3 +127,9 @@ dmypy.json # Pyre type checker .pyre/ + +# PyCharm, IntelliJ etc +.idea/ + +# Mac +.DS_Store diff --git a/Makefile b/Makefile index 552bbc3..ee358d7 100644 --- a/Makefile +++ b/Makefile @@ -57,7 +57,7 @@ check-all: build-local isort black pylint mypy xenon pytest @echo "Running formatter, linter, typechecker and tests..." black: - @$(RUN_NO_DEPS) black --check fideslog/ + @$(RUN_NO_DEPS) black --check --exclude="sdk/python/_version\.py" fideslog/ isort: @echo "Running isort checks..." diff --git a/docker-compose.yml b/docker-compose.yml index c963635..559cb27 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -13,7 +13,7 @@ services: environment: FIDESLOG__LOGGING_LEVEL: "debug" FIDESLOG__SERVER_HOST: "fideslog" - FIDESLOG__SERVER_HOT_RELOAD: true + FIDESLOG__SERVER_HOT_RELOAD: "true" expose: - 8080 ports: diff --git a/fideslog/api/database/data_access.py b/fideslog/api/database/data_access.py index 44df8d7..4c782d8 100644 --- a/fideslog/api/database/data_access.py +++ b/fideslog/api/database/data_access.py @@ -1,25 +1,34 @@ from json import dumps from logging import getLogger +from typing import Optional +from urllib.parse import urlparse from sqlalchemy.orm import Session from ..models.analytics_event import AnalyticsEvent from .models import AnalyticsEvent as AnalyticsEventORM +EXCLUDED_ATTRIBUTES = set(("client_id", "endpoint", "extra_data", "os")) + + log = getLogger(__name__) def create_event(database: Session, event: AnalyticsEvent) -> None: """Create a new analytics event.""" - logged_event = event.dict(exclude=set(("client_id", "extra_data", "os"))) - log.debug("Creating event: %s", logged_event) + logged_event = event.dict(exclude=EXCLUDED_ATTRIBUTES) + log.debug("Creating event from: %s", logged_event) + log.debug( + "The following attributes have been excluded as PII: %s", EXCLUDED_ATTRIBUTES + ) extra_data = dumps(event.extra_data) if event.extra_data else None flags = ", ".join(event.flags) if event.flags else None resource_counts = ( dumps(event.resource_counts.dict()) if event.resource_counts else None ) + endpoint = truncate_endpoint_url(event.endpoint) database.add( AnalyticsEventORM( @@ -27,7 +36,7 @@ def create_event(database: Session, event: AnalyticsEvent) -> None: command=event.command, developer=event.developer, docker=event.docker, - endpoint=event.endpoint, + endpoint=endpoint, error=event.error, event=event.event, event_created_at=event.event_created_at, @@ -44,3 +53,17 @@ def create_event(database: Session, event: AnalyticsEvent) -> None: database.commit() log.debug("Event created: %s", logged_event) + + +def truncate_endpoint_url(endpoint: Optional[str]) -> Optional[str]: + """ + Guarantee that only the endpoint path is stored in the database. + """ + + if endpoint is None: + return None + + endpoint_components = endpoint.split(":", maxsplit=1) + http_method = endpoint_components[0].strip().upper() + url = endpoint_components[1].strip() + return f"{http_method}: {urlparse(url).path}" diff --git a/fideslog/api/models/analytics_event.py b/fideslog/api/models/analytics_event.py index 4868b56..e6c3954 100644 --- a/fideslog/api/models/analytics_event.py +++ b/fideslog/api/models/analytics_event.py @@ -2,12 +2,24 @@ from datetime import datetime, timezone from typing import Dict, List, Optional -from urllib.parse import urlparse from pydantic import BaseModel, Field, validator +from validators import url as is_valid_url from .manifest_file_counts import ManifestFileCounts +ALLOWED_HTTP_METHODS = [ + "CONNECT", + "DELETE", + "GET", + "HEAD", + "OPTIONS", + "PATCH", + "POST", + "PUT", + "TRACE", +] + class AnalyticsEvent(BaseModel): """The model for analytics events.""" @@ -30,7 +42,7 @@ class AnalyticsEvent(BaseModel): ) endpoint: Optional[str] = Field( None, - description="For events submitted as a result of making API server requests, the API endpoint path that was requested.", + description="For events submitted as a result of making API server requests, the HTTP method and full API endpoint URL included on the request, delimited by a colon. Ex: `GET: https://www.example.com/api/path`. The URL will be truncated, and only the URL path will be stored.", ) error: Optional[str] = Field( None, @@ -81,12 +93,28 @@ def check_not_an_email_address(cls, value: str) -> str: return value @validator("endpoint") - def check_no_hostname(cls, value: str) -> str: + def validate_endpoint_format(cls, value: Optional[str]) -> Optional[str]: """ - Ensure that endpoints contain only the URL path. + Ensure that `endpoint` contains the request's HTTP method and URL. """ - return urlparse(value).path + if value is None: + return None + + endpoint_components = value.split(":", maxsplit=1) + assert ( + len(endpoint_components) == 2 + ), "endpoint must contain only the HTTP method and URL, delimited by a colon" + + http_method = endpoint_components[0].strip().upper() + assert ( + http_method in ALLOWED_HTTP_METHODS + ), f"HTTP method must be one of {', '.join(ALLOWED_HTTP_METHODS)}" + + url = endpoint_components[1].strip() + assert is_valid_url(url), "endpoint URL must be a valid URL" + + return f"{http_method}: {url}" @validator("event_created_at") def check_in_the_past(cls, value: datetime) -> datetime: diff --git a/fideslog/api/requirements.txt b/fideslog/api/requirements.txt index 8653e0a..30cf9a4 100644 --- a/fideslog/api/requirements.txt +++ b/fideslog/api/requirements.txt @@ -5,3 +5,4 @@ snowflake-sqlalchemy==1.3.3 sqlalchemy==1.4.31 toml==0.10.2 uvicorn==0.17.5 +validators==0.20.0 diff --git a/fideslog/sdk/__init__.py b/fideslog/sdk/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/fideslog/sdk/python/__init__.py b/fideslog/sdk/python/__init__.py index 4d52a61..8b97719 100644 --- a/fideslog/sdk/python/__init__.py +++ b/fideslog/sdk/python/__init__.py @@ -1,3 +1,3 @@ from . import _version -__version__ = _version.get_versions()["version"] +__version__ = _version.get_versions()["version"] # type: ignore diff --git a/fideslog/sdk/python/_version.py b/fideslog/sdk/python/_version.py index 2951327..019f860 100644 --- a/fideslog/sdk/python/_version.py +++ b/fideslog/sdk/python/_version.py @@ -1,3 +1,5 @@ +# type: ignore +# pylint: skip-file # This file helps to compute a version number in source trees obtained from # git-archive tarball (such as those provided by githubs download-from-tag diff --git a/fideslog/sdk/python/event.py b/fideslog/sdk/python/event.py index 2b47d3b..339e9aa 100644 --- a/fideslog/sdk/python/event.py +++ b/fideslog/sdk/python/event.py @@ -1,11 +1,24 @@ -# pylint: disable= too-many-arguments, too-many-instance-attributes +# pylint: disable= too-many-arguments, too-many-instance-attributes, too-many-locals from datetime import datetime, timezone from typing import Dict, List, Optional -from urllib.parse import urlparse + +from validators import url as is_valid_url from .exceptions import InvalidEventError +ALLOWED_HTTP_METHODS = [ + "CONNECT", + "DELETE", + "GET", + "HEAD", + "OPTIONS", + "PATCH", + "POST", + "PUT", + "TRACE", +] + class AnalyticsEvent: """ @@ -33,7 +46,7 @@ def __init__( :param event_created_at: The UTC timestamp when the event occurred, in [ISO 8601](https://en.wikipedia.org/wiki/ISO_8601) format. Must include the UTC timezone, and represent a datetime in the past. :param command: For events submitted as a result of running CLI commands, the name of the command that was submitted. May include the subcommand name(s). :param docker: `True` if the command was submitted within a Docker container. Default: `False`. - :param endpoint: For events submitted as a result of making API server requests, the API endpoint path that was requested. If a fully-qualified URL is provided, only the URL path will be persisted. + :param endpoint: For events submitted as a result of making API server requests, the HTTP method and full API endpoint URL included on the request, delimited by a colon. Ex: `GET: https://www.example.com/api/path`. The URL will be truncated, and only the URL path will be stored. :param error: For events submitted as a result of running CLI commands that exit with a non-0 status code, or events submitted as a result of API server requests that respond with a non-2xx status code, the error type, without specific error details. :param extra_data: Any additional key/value pairs that should be associated with this event. :param flags: For events submitted as a result of running CLI commands, the flags in use when the command was submitted. Omits flag values (when they exist) by persisting only the portion of each string in this list that come before `=` or `space` characters. @@ -70,8 +83,20 @@ def __init__( self.endpoint = None if endpoint is not None: - assert urlparse(endpoint).path != "", "endpoint must include a URL path" - self.endpoint = endpoint + endpoint_components = endpoint.split(":", maxsplit=1) + assert ( + len(endpoint_components) == 2 + ), "endpoint must contain only the HTTP method and URL, delimited by a colon" + + http_method = endpoint_components[0].strip().upper() + assert ( + http_method in ALLOWED_HTTP_METHODS + ), f"HTTP method must be one of {', '.join(ALLOWED_HTTP_METHODS)}" + + url = endpoint_components[1].strip() + assert is_valid_url(url), "endpoint URL must be a valid URL" + + self.endpoint = f"{http_method}: {url}" self.command = command self.docker = docker diff --git a/fideslog/sdk/python/requirements.txt b/fideslog/sdk/python/requirements.txt index 5987fa6..4544daa 100644 --- a/fideslog/sdk/python/requirements.txt +++ b/fideslog/sdk/python/requirements.txt @@ -1,4 +1,5 @@ aiohttp[speedups]==3.8.1 bcrypt~=3.2.0 types-requests==2.27.11 +validators==0.20.0 versioneer>=0.19 diff --git a/tests/api/test_models.py b/tests/api/test_models.py index 5ff1c75..e5f456e 100644 --- a/tests/api/test_models.py +++ b/tests/api/test_models.py @@ -21,7 +21,7 @@ def test_full_payload() -> Generator: "product_name": "test_product", "production_version": "1.2.3", "status_code": 200, - "endpoint": "https://www.example.com/path/string", + "endpoint": "GET: https://www.example.com/path/string", "flags": ["-f", "-y", "--test"], "command": "apply", "error": "Internal Server Error", @@ -29,6 +29,7 @@ def test_full_payload() -> Generator: "extra_data": {"extra_value": "extra_value"}, "docker": True, "resource_counts": {"datasets": 1, "policies": 1, "systems": 27}, + "developer": True, }