s3 connector (for data detection & discovery) POC - fides #4930

Merged: 18 commits, Jun 14, 2024
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -23,6 +23,7 @@ The types of changes are:
- Added option in FidesJS SDK to only disable notice-served API [#4965](https://github.com/ethyca/fides/pull/4965)
- External ID support for consent management [#4927](https://github.com/ethyca/fides/pull/4927)
- Added access and erasure support for the Greenhouse Harvest integration [#4945](https://github.com/ethyca/fides/pull/4945)
- Add an S3 connection type (currently used for discovery and detection only) [#4930](https://github.com/ethyca/fides/pull/4930)
- Support for Limited FIDES__CELERY__* Env Vars [#4980](https://github.com/ethyca/fides/pull/4980)
- Implement sending emails via property-specific messaging templates [#4950](https://github.com/ethyca/fides/pull/4950)

34 changes: 34 additions & 0 deletions clients/admin-ui/public/images/connector-logos/s3.svg
@@ -46,6 +46,7 @@ export const CONNECTION_TYPE_LOGO_MAP = new Map<ConnectionType, string>([
[ConnectionType.MYSQL, "mysql.svg"],
[ConnectionType.POSTGRES, "postgres.svg"],
[ConnectionType.REDSHIFT, "redshift.svg"],
[ConnectionType.S3, "s3.svg"],
[ConnectionType.SCYLLA, "scylla.svg"],
[ConnectionType.SNOWFLAKE, "snowflake.svg"],
[ConnectionType.SOVRN, "sovrn.svg"],
1 change: 1 addition & 0 deletions clients/admin-ui/src/types/api/models/ConnectionType.ts
@@ -23,6 +23,7 @@ export enum ConnectionType {
MANUAL_WEBHOOK = "manual_webhook",
TIMESCALE = "timescale",
FIDES = "fides",
S3 = "s3",
SCYLLA = "scylla",
GENERIC_ERASURE_EMAIL = "generic_erasure_email",
GENERIC_CONSENT_EMAIL = "generic_consent_email",
@@ -1,7 +1,7 @@
"""property specific messaging db models

Revision ID: 2736c942faa2
Revises: efddde14da21
Revises: 52a5f1a957bc
Create Date: 2024-05-28 14:26:09.012859

"""
@@ -0,0 +1,47 @@
"""add s3 connection type

Revision ID: cb344673f633
Revises: 3304082a6cee
Create Date: 2024-05-31 20:46:08.829330

"""

import sqlalchemy as sa
from alembic import op

# revision identifiers, used by Alembic.
revision = "cb344673f633"
down_revision = "3304082a6cee"
branch_labels = None
depends_on = None


def upgrade():
# Add 's3' to ConnectionType enum
op.execute("alter type connectiontype rename to connectiontype_old")
op.execute(
"create type connectiontype as enum('mongodb', 'mysql', 'https', 'snowflake', 'redshift', 'mssql', 'mariadb', 'bigquery', 'saas', 'manual', 'manual_webhook', 'timescale', 'fides', 'sovrn', 'attentive', 'dynamodb', 'postgres', 'generic_consent_email', 'generic_erasure_email', 'scylla', 's3')"
)
op.execute(
(
"alter table connectionconfig alter column connection_type type connectiontype using "
"connection_type::text::connectiontype"
)
)
op.execute("drop type connectiontype_old")


def downgrade():
# Remove 's3' from ConnectionType enum
op.execute("delete from connectionconfig where connection_type in ('s3')")
op.execute("alter type connectiontype rename to connectiontype_old")
op.execute(
"create type connectiontype as enum('mongodb', 'mysql', 'https', 'snowflake', 'redshift', 'mssql', 'mariadb', 'bigquery', 'saas', 'manual', 'manual_webhook', 'timescale', 'fides', 'sovrn', 'attentive', 'dynamodb', 'postgres', 'generic_consent_email', 'generic_erasure_email', 'scylla')"
)
op.execute(
(
"alter table connectionconfig alter column connection_type type connectiontype using "
"connection_type::text::connectiontype"
)
)
op.execute("drop type connectiontype_old")
4 changes: 2 additions & 2 deletions src/fides/api/api/v1/endpoints/storage_endpoints.py
@@ -38,8 +38,8 @@
from fides.api.schemas.storage.data_upload_location_response import DataUpload
from fides.api.schemas.storage.storage import (
FULLY_CONFIGURED_STORAGE_TYPES,
AWSAuthMethod,
BulkPutStorageConfigResponse,
S3AuthMethod,
StorageConfigStatus,
StorageConfigStatusMessage,
StorageDestination,
@@ -420,7 +420,7 @@ def get_storage_status(
def _storage_config_requires_secrets(storage_config: StorageConfig) -> bool:
return (
storage_config.details.get(StorageDetails.AUTH_METHOD.value, None)
== S3AuthMethod.SECRET_KEYS.value
== AWSAuthMethod.SECRET_KEYS.value
)


2 changes: 2 additions & 0 deletions src/fides/api/models/connectionconfig.py
@@ -53,6 +53,7 @@ class ConnectionType(enum.Enum):
fides = "fides"
generic_erasure_email = "generic_erasure_email" # Run after the traversal
generic_consent_email = "generic_consent_email" # Run after the traversal
s3 = "s3"
scylla = "scylla"

@property
@@ -76,6 +77,7 @@ def human_readable(self) -> str:
ConnectionType.mysql.value: "MySQL",
ConnectionType.postgres.value: "PostgreSQL",
ConnectionType.redshift.value: "Amazon Redshift",
ConnectionType.s3.value: "Amazon S3",
ConnectionType.saas.value: "SaaS",
ConnectionType.scylla.value: "Scylla DB",
ConnectionType.snowflake.value: "Snowflake",
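A small usage sketch of the new enum member; this assumes the human_readable property simply returns the label from the mapping shown above (its body is cut off in this diff):

```python
from fides.api.models.connectionconfig import ConnectionType

# New enum member added by this PR
assert ConnectionType.s3.value == "s3"

# Assumes human_readable looks the value up in the mapping shown above
assert ConnectionType.s3.human_readable == "Amazon S3"
```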
8 changes: 8 additions & 0 deletions src/fides/api/schemas/connection_configuration/__init__.py
@@ -80,6 +80,12 @@
from fides.api.schemas.connection_configuration.connection_secrets_redshift import (
RedshiftSchema as RedshiftSchema,
)
from fides.api.schemas.connection_configuration.connection_secrets_s3 import (
S3DocsSchema as S3DocsSchema,
)
from fides.api.schemas.connection_configuration.connection_secrets_s3 import (
S3Schema as S3Schema,
)
from fides.api.schemas.connection_configuration.connection_secrets_saas import (
SaaSSchema as SaaSSchema,
)
@@ -127,6 +133,7 @@
ConnectionType.postgres.value: PostgreSQLSchema,
ConnectionType.redshift.value: RedshiftSchema,
ConnectionType.saas.value: SaaSSchema,
ConnectionType.s3.value: S3Schema,
ConnectionType.scylla.value: ScyllaSchema,
ConnectionType.snowflake.value: SnowflakeSchema,
ConnectionType.sovrn.value: SovrnSchema,
@@ -176,5 +183,6 @@ def get_connection_secrets_schema(
FidesDocsSchema,
SovrnDocsSchema,
DynamoDBDocsSchema,
S3DocsSchema,
ScyllaDocsSchema,
]
@@ -25,6 +25,8 @@ class DynamoDBSchema(ConnectionConfigSecretsSchema):
sensitive=True,
)

# TODO: include an aws_assume_role_arn and more closely follow the pattern in `connection_secrets_s3`
Contributor Author:
I don't want to make our changes too wide-ranging quite yet, so I've just put in a note that we should look to switch DynamoDB over to this authentication paradigm soon. DynamoDB does require a region (I think?), so I'm not sure how that changes things. But I still think that providing an option to assume a role will be the right thing to support here moving forward 👍


_required_components: List[str] = [
"region_name",
"aws_access_key_id",
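To make the TODO above concrete, here is a hypothetical sketch (not part of this PR) of what DynamoDBSchema could look like if it adopted the connection_secrets_s3 pattern: an auth_method plus an optional aws_assume_role_arn, with region_name kept required. The exact field set is an assumption.

```python
from typing import List, Optional

from pydantic import Field

from fides.api.schemas.connection_configuration.connection_secrets import (
    ConnectionConfigSecretsSchema,
)
from fides.api.schemas.storage.storage import AWSAuthMethod


class DynamoDBSchema(ConnectionConfigSecretsSchema):
    """Hypothetical reworked schema: the existing fields plus the S3-style
    auth_method / aws_assume_role_arn pair, with the key fields made optional."""

    auth_method: AWSAuthMethod = Field(
        title="Authentication Method",
        description="Determines which type of authentication method to use for connecting to DynamoDB",
    )
    region_name: str = Field(
        title="Region",
        description="The AWS region in which the DynamoDB tables live (still required).",
    )
    aws_access_key_id: Optional[str] = Field(
        title="Access Key ID",
        description="Required only when using secret key authentication.",
    )
    aws_secret_access_key: Optional[str] = Field(
        title="Secret Access Key",
        description="Required only when using secret key authentication.",
        sensitive=True,
    )
    aws_assume_role_arn: Optional[str] = Field(
        title="Assume Role ARN",
        description="If provided, the ARN of the role to assume when connecting.",
    )

    _required_components: List[str] = ["auth_method", "region_name"]
```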
@@ -0,0 +1,54 @@
from typing import Dict, List, Optional

from pydantic import Field, root_validator

from fides.api.schemas.base_class import NoValidationSchema
from fides.api.schemas.connection_configuration.connection_secrets import (
ConnectionConfigSecretsSchema,
)
from fides.api.schemas.storage.storage import AWSAuthMethod


class S3Schema(ConnectionConfigSecretsSchema):
"""Schema to validate the secrets needed to connect to Amazon S3"""

auth_method: AWSAuthMethod = Field(
title="Authentication Method",
description="Determines which type of authentication method to use for connecting to Amazon S3",
)

aws_access_key_id: Optional[str] = Field(
title="Access Key ID",
description="Part of the credentials that provide access to your AWS account. This is required if using secret key authentication.",
)
aws_secret_access_key: Optional[str] = Field(
title="Secret Access Key",
description="Part of the credentials that provide access to your AWS account. This is required if using secret key authentication.",
sensitive=True,
)

aws_assume_role_arn: Optional[str] = Field(
title="Assume Role ARN",
description="If provided, the ARN of the role that should be assumed to connect to s3.",
)
Comment on lines +20 to +33 (@daveqnet, Contributor, Jun 12, 2024):
Schema looks good! From my perspective:

  • Access Key ID: not particularly sensitive, a bit like a username
  • Secret Access Key: sensitive, treat it like a password
  • Assume Role ARN: not particularly sensitive, again a bit like a username


_required_components: List[str] = ["auth_method"]

@root_validator(pre=True)
@classmethod
def keys_provided_if_needed(cls, values: Dict) -> Dict:
"""
Validates that both access and secret access keys are provided if using a `secret_keys` auth method.
"""
if values.get("auth_method") == AWSAuthMethod.SECRET_KEYS.value and not (
values.get("aws_access_key_id") and values.get("aws_secret_access_key")
):
raise ValueError(
f"An Access Key ID and a Secret Access Key must be provided if using the `{AWSAuthMethod.SECRET_KEYS.value}` Authentication Method"
)

return values


class S3DocsSchema(S3Schema, NoValidationSchema):
"""S3 Secrets Schema for API Docs"""
4 changes: 2 additions & 2 deletions src/fides/api/schemas/storage/storage.py
@@ -48,15 +48,15 @@ class Config:
extra = Extra.forbid


class S3AuthMethod(Enum):
class AWSAuthMethod(str, Enum):
AUTOMATIC = "automatic"
SECRET_KEYS = "secret_keys"


class StorageDetailsS3(FileBasedStorageDetails):
"""The details required to represent an AWS S3 storage bucket."""

auth_method: S3AuthMethod
auth_method: AWSAuthMethod
bucket: str
max_retries: Optional[int] = 0
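
Beyond the rename, AWSAuthMethod now mixes in str, so members compare equal to and serialize as their raw values. A minimal illustration of that behavior:

```python
from fides.api.schemas.storage.storage import AWSAuthMethod

assert AWSAuthMethod.SECRET_KEYS == "secret_keys"        # str mixin: equal to the raw value
assert AWSAuthMethod.SECRET_KEYS.value == "secret_keys"  # .value comparisons keep working
assert isinstance(AWSAuthMethod.AUTOMATIC, str)          # usable anywhere a plain str is expected
```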

2 changes: 2 additions & 0 deletions src/fides/api/service/connectors/__init__.py
@@ -30,6 +30,7 @@
from fides.api.service.connectors.mongodb_connector import (
MongoDBConnector as MongoDBConnector,
)
from fides.api.service.connectors.s3_connector import S3Connector
from fides.api.service.connectors.saas_connector import SaaSConnector as SaaSConnector
from fides.api.service.connectors.scylla_connector import ScyllaConnector
from fides.api.service.connectors.sql_connector import (
@@ -75,6 +76,7 @@
ConnectionType.snowflake.value: SnowflakeConnector,
ConnectionType.sovrn.value: SovrnConnector,
ConnectionType.timescale.value: TimescaleConnector,
ConnectionType.s3.value: S3Connector,
}


73 changes: 73 additions & 0 deletions src/fides/api/service/connectors/s3_connector.py
@@ -0,0 +1,73 @@
from typing import Any, Dict, List, Optional

from loguru import logger

from fides.api.common_exceptions import ConnectionException
from fides.api.graph.execution import ExecutionNode
from fides.api.models.connectionconfig import ConnectionTestStatus
from fides.api.models.policy import Policy
from fides.api.models.privacy_request import PrivacyRequest, RequestTask
from fides.api.schemas.connection_configuration.connection_secrets_s3 import S3Schema
from fides.api.service.connectors.base_connector import BaseConnector
from fides.api.service.connectors.query_config import QueryConfig
from fides.api.util.aws_util import get_aws_session
from fides.api.util.collection_util import Row


class S3Connector(BaseConnector):
"""
AWS S3 Connector - this is currently used just to test connections to S3.

NOTE: No DSR processing is yet supported for S3.
"""

def create_client(self) -> Any: # type: ignore
"""Returns a client for s3"""
config = S3Schema(**self.configuration.secrets or {})
return get_aws_session(
auth_method=config.auth_method.value,
storage_secrets=config.dict(), # type: ignore
assume_role_arn=config.aws_assume_role_arn,
)

def query_config(self, node: ExecutionNode) -> QueryConfig[Any]:
"""DSR execution not yet supported for S3"""
raise NotImplementedError()


def test_connection(self) -> Optional[ConnectionTestStatus]:
"""
Connects to AWS S3 and gets caller identity to validate credentials.
"""
logger.info("Starting test connection to {}", self.configuration.key)
try:
session = self.client()
sts_client = session.client("sts")
sts_client.get_caller_identity()
except Exception as error:
raise ConnectionException(str(error))


return ConnectionTestStatus.succeeded


def retrieve_data(
self,
node: ExecutionNode,
policy: Policy,
privacy_request: PrivacyRequest,
request_task: RequestTask,
input_data: Dict[str, List[Any]],
) -> List[Row]:
"""DSR execution not yet supported for S3"""

def mask_data(
self,
node: ExecutionNode,
policy: Policy,
privacy_request: PrivacyRequest,
request_task: RequestTask,
rows: List[Row],
) -> int:
"""DSR execution not yet supported for S3"""

def close(self) -> None:
"""Close any held resources"""
# no held resources for S3 connector
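A hedged end-to-end sketch of exercising the new connector; the config key, secrets, and in-memory construction of ConnectionConfig are illustrative only (in practice the row comes from the database):

```python
from fides.api.models.connectionconfig import ConnectionConfig, ConnectionType
from fides.api.service.connectors import S3Connector

# Illustrative, non-persisted config object; field values are placeholders.
connection_config = ConnectionConfig(
    key="my_s3_connection",
    connection_type=ConnectionType.s3,
    secrets={
        "auth_method": "automatic",
        "aws_assume_role_arn": "arn:aws:iam::123456789012:role/fides-detection",
    },
)

connector = S3Connector(connection_config)

# test_connection() builds a boto3 session via get_aws_session and calls
# STS get_caller_identity; any failure is surfaced as a ConnectionException.
status = connector.test_connection()
print(status)  # ConnectionTestStatus.succeeded when the credentials resolve
```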
@@ -4,11 +4,11 @@

from fides.api.schemas.storage.storage import (
SUPPORTED_STORAGE_SECRETS,
S3AuthMethod,
AWSAuthMethod,
StorageSecrets,
StorageType,
)
from fides.api.util.storage_authenticator import get_s3_session
from fides.api.util.aws_util import get_aws_session


def secrets_are_valid(
Expand All @@ -31,7 +31,7 @@ def secrets_are_valid(
def _s3_authenticator(secrets: Dict[StorageSecrets, Any]) -> bool:
"""Authenticates secrets for s3, returns true if secrets are valid"""
try:
get_s3_session(S3AuthMethod.SECRET_KEYS.value, secrets.dict()) # type: ignore
get_aws_session(AWSAuthMethod.SECRET_KEYS.value, secrets.dict()) # type: ignore
return True
except ClientError:
return False
3 changes: 3 additions & 0 deletions src/fides/api/task/task_resources.py
@@ -24,6 +24,7 @@
TimescaleConnector,
)
from fides.api.service.connectors.base_email_connector import BaseEmailConnector
from fides.api.service.connectors.s3_connector import S3Connector
from fides.api.util.cache import get_cache
from fides.api.util.collection_util import Row, extract_key_for_address

@@ -74,6 +75,8 @@
return DynamoDBConnector(connection_config)
if connection_config.connection_type == ConnectionType.fides:
return FidesConnector(connection_config)
if connection_config.connection_type == ConnectionType.s3:
return S3Connector(connection_config)

raise NotImplementedError(
f"No connector available for {connection_config.connection_type}"
)