Support Dependent Identity Data and "After" Param in Saas Configs [#291] (#306)

* Allow the "after" param to be optionally specified on a saas config endpoint.

* WIP: If a collection has grouped identity data (i.e. it depends on inputs from other collections), combine the seed data with the other grouped inputs when building the data fed into the current collection.

* Add tests for the "after" param on saas configs, and for identity data being grouped with other dependent fields when it is listed as a grouped_input.

- Pull identity data from the data passed into the collection, instead of out of the cache, for consistency.
- Revert a hardcoded static portion of a query param used in local testing.

* Make corresponding change to saas dataset.

* Remove double quotes.

* Fix bad docs merge.

* Fix missed import.

* Respond to CR.
pattisdr authored Mar 29, 2022
1 parent 11eb3d8 commit f19702f
Showing 9 changed files with 122 additions and 26 deletions.
12 changes: 9 additions & 3 deletions data/saas/config/saas_example_config.yml
@@ -105,7 +105,7 @@ saas_config:
- name: On-Behalf-Of
value: <email>
- name: Token
-              value: "Custom <api_key>"
+              value: Custom <api_key>
query_params:
- name: limit
value: <limit>
@@ -134,12 +134,16 @@ saas_config:
param_values:
- name: placeholder
identity: email
-    - name: user_feedback
+    - name: users
+      after: [ saas_connector_example.projects ]
requests:
read:
method: GET
path: /api/0/projects/<organization_slug>/<project_slug>/user-feedback/
-          grouped_inputs: [ organization_slug, project_slug ]
+          query_params:
+            - name: query
+              value: <query>
+          grouped_inputs: [ organization_slug, project_slug, query ]
param_values:
- name: organization_slug
references:
@@ -151,3 +155,5 @@ saas_config:
- dataset: saas_connector_example
field: projects.slug
direction: from
- name: query
identity: email
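
To sanity-check a config like the one above, it can be loaded and validated against the pydantic schema. A minimal sketch, assuming PyYAML is installed and that the example file keeps its top-level saas_config: key:

import yaml

from fidesops.schemas.saas.saas_config import SaaSConfig

# Parse the example config; pydantic validation checks the new "after"
# entry along with the rest of the endpoint schema.
with open("data/saas/config/saas_example_config.yml") as f:
    raw = yaml.safe_load(f)

config = SaaSConfig(**raw["saas_config"])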
2 changes: 1 addition & 1 deletion data/saas/dataset/saas_example_dataset.yml
@@ -162,7 +162,7 @@ dataset:
- name: slug
fidesops_meta:
data_type: string
-      - name: user_feedback
+      - name: users
fields:
- name: id
data_categories: [ system.operations ]
1 change: 1 addition & 0 deletions docs/fidesops/docs/guides/saas_config.md
@@ -172,6 +172,7 @@ test_request:
This is where we define how we are going to access and update each collection in the corresponding Dataset. The endpoint section contains the following members:

- `name` This name corresponds to a Collection in the corresponding Dataset.
- `after` Configures this endpoint to run after other endpoints or collections. This should be a list of collection addresses; for example, `after: [ mailchimp_connector_example.member ]` causes the current endpoint to run after the `member` endpoint.
- `requests` A map of `read`, `update`, and `delete` requests for this collection. Each collection can define a way to read and a way to update the data. Each request is made up of:
- `method` The HTTP method used for the endpoint.
- `path` A static or dynamic resource path. The dynamic portions of the path are enclosed within angle brackets `<dynamic_value>` and are replaced with values from `param_values`.
15 changes: 13 additions & 2 deletions src/fidesops/schemas/saas/saas_config.py
@@ -4,8 +4,15 @@
from fidesops.service.pagination.pagination_strategy_factory import get_strategy
from pydantic import BaseModel, validator, root_validator, Extra
from fidesops.schemas.base_class import BaseSchema
-from fidesops.schemas.dataset import FidesopsDatasetReference
-from fidesops.graph.config import Collection, Dataset, FieldAddress, ScalarField
+from fidesops.schemas.dataset import FidesopsDatasetReference, FidesCollectionKey
+from fidesops.graph.config import (
+    Collection,
+    Dataset,
+    FieldAddress,
+    ScalarField,
+    CollectionAddress,
+    Field,
+)
from fidesops.schemas.saas.strategy_configuration import ConnectorParamRef
from fidesops.schemas.shared_schemas import FidesOpsKey

@@ -143,6 +150,7 @@ class Endpoint(BaseModel):

name: str
requests: Dict[Literal["read", "update", "delete"], SaaSRequest]
after: List[FidesCollectionKey] = []


class ConnectorParam(BaseModel):
@@ -214,6 +222,9 @@ def get_graph(self) -> Dataset:
name=endpoint.name,
fields=fields,
grouped_inputs=grouped_inputs,
after={
CollectionAddress(*s.split(".")) for s in endpoint.after
},
)
)
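
The dotted strings in an endpoint's `after` list are converted to graph addresses with the split shown above. A quick illustration, using a hypothetical value:

from fidesops.graph.config import CollectionAddress

# "dataset_key.collection_name" splits into the two CollectionAddress parts.
address = CollectionAddress(*"saas_connector_example.projects".split("."))
# address == CollectionAddress("saas_connector_example", "projects")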

46 changes: 46 additions & 0 deletions src/fidesops/task/graph_task.py
@@ -128,6 +128,15 @@ def grouped_fields(self) -> Set[Optional[str]]:
"""
return self.traversal_node.node.collection.grouped_inputs or set()

@property
def dependent_identity_fields(self) -> bool:
"""If the current collection needs inputs from other collections, in addition to its seed data."""
collection = self.traversal_node.node.collection
for field in self.grouped_fields:
if collection.field(FieldPath(field)).identity:
return True
return False

def build_incoming_field_path_maps(
self, group_dependent_fields: bool = False
) -> Tuple[COLLECTION_FIELD_PATH_MAP, COLLECTION_FIELD_PATH_MAP]:
@@ -168,6 +177,27 @@ def can_write_data(self) -> bool:
connection_config: ConnectionConfig = self.connector.configuration
return connection_config.access == AccessLevel.write

def _combine_seed_data(
self,
*data: List[Row],
grouped_data: Dict[str, Any],
dependent_field_mappings: COLLECTION_FIELD_PATH_MAP,
) -> Dict[str, Any]:
"""Combine the seed data with the other dependent inputs. This is used when the seed data in a collection requires
inputs from another collection to generate subsequent queries."""
# Get the identity values from the seeds that were passed into this collection.
seed_index = self.input_keys.index(ROOT_COLLECTION_ADDRESS)
seed_data = data[seed_index]

for (foreign_field_path, local_field_path) in dependent_field_mappings[
ROOT_COLLECTION_ADDRESS
]:
dependent_values: List = consolidate_query_matches(
row=seed_data, target_path=foreign_field_path
)
grouped_data[local_field_path.string_path] = dependent_values
return grouped_data
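
    # A simplified, standalone sketch of the effect (helper name and data are
    # hypothetical): identity values from the seed row are merged into a
    # grouped-input entry alongside the values contributed by other collections.
    #
    #     def combine_seed(seed_row, grouped):
    #         combined = dict(grouped)
    #         for key, value in seed_row.items():
    #             combined[key] = [value]
    #         return combined
    #
    #     combine_seed({"email": "user@example.com"},
    #                  {"project": ["abcde"], "organization": ["12345"]})
    #     # -> {"project": ["abcde"], "organization": ["12345"],
    #     #     "email": ["user@example.com"]}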

def pre_process_input_data(
self, *data: List[Row], group_dependent_fields: bool = False
) -> NodeInput:
@@ -209,6 +239,14 @@ def pre_process_input_data(
for i, rowset in enumerate(data):
collection_address = self.input_keys[i]

if (
group_dependent_fields
and self.dependent_identity_fields
and collection_address == ROOT_COLLECTION_ADDRESS
):
# Skip building data for the root collection if the seed data needs to be combined with other inputs
continue

logger.info(
f"Consolidating incoming data into {self.traversal_node.node.address} from {collection_address}."
)
@@ -234,6 +272,14 @@ def pre_process_input_data(
row=row, target_path=foreign_field_path
)
grouped_data[local_field_path.string_path] = dependent_values

if self.dependent_identity_fields:
grouped_data = self._combine_seed_data(
*data,
grouped_data=grouped_data,
dependent_field_mappings=dependent_field_mappings,
)

output[FIDESOPS_GROUPED_INPUTS].append(grouped_data)
return output
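
Based on the updated tests, the grouped output takes roughly this shape when group_dependent_fields=True and the seed data carries an identity email (values hypothetical):

expected = {
    "fidesops_grouped_inputs": [
        {"project": ["abcde"], "organization": ["12345"], "email": ["user@example.com"]},
        {"project": ["fghij"], "organization": ["54321"], "email": ["user@example.com"]},
    ],
}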

16 changes: 14 additions & 2 deletions src/fidesops/util/saas_util.py
@@ -2,8 +2,7 @@
from functools import reduce
from typing import Any, Dict, List, Optional, Set
from fidesops.common_exceptions import FidesopsException
-from fidesops.graph.config import Collection, Dataset, Field
+from fidesops.graph.config import Collection, Dataset, Field, CollectionAddress

FIDESOPS_GROUPED_INPUTS = "fidesops_grouped_inputs"

@@ -43,6 +42,18 @@ def get_collection_grouped_inputs(
return collection.grouped_inputs


def get_collection_after(
collections: List[Collection], name: str
) -> Set[CollectionAddress]:
"""If specified, return the collections that need to run before the current collection for saas configs"""
collection: Collection = next(
(collect for collect in collections if collect.name == name), {}
)
if not collection:
return set()
return collection.after
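
# Usage sketch (hypothetical collection name): given the merged collections
# from a saas config, look up the ordering dependencies for one collection:
#
#     get_collection_after(config_dataset.collections, "users")
#     # -> {CollectionAddress("saas_connector_example", "projects")} when the
#     #    "users" endpoint declared after: [ saas_connector_example.projects ];
#     #    otherwise set()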


def merge_datasets(dataset: Dataset, config_dataset: Dataset) -> Dataset:
"""
Merges all Collections and Fields from the config_dataset into the dataset.
@@ -63,6 +74,7 @@ def merge_datasets(dataset: Dataset, config_dataset: Dataset) -> Dataset:
grouped_inputs=get_collection_grouped_inputs(
config_dataset.collections, collection_name
),
after=get_collection_after(config_dataset.collections, collection_name),
)
)

14 changes: 9 additions & 5 deletions tests/models/test_saasconfig.py
@@ -2,7 +2,7 @@
import pytest
from pydantic import ValidationError

from fidesops.graph.config import FieldAddress
from fidesops.graph.config import CollectionAddress, FieldAddress
from fidesops.schemas.saas.saas_config import SaaSConfig, SaaSRequest


@@ -40,19 +40,23 @@ def test_saas_config_to_dataset(saas_configs: Dict[str, Dict]):
assert query_field.name == "email"
assert query_field.identity == "email"

-    user_feedback_collection = saas_dataset.collections[5]
-    assert user_feedback_collection.grouped_inputs == {
+    user_collection = saas_dataset.collections[5]
+    assert user_collection.after == {
+        CollectionAddress("saas_connector_example", "projects")
+    }
+    assert user_collection.grouped_inputs == {
"organization_slug",
"project_slug",
"query"
}

-    org_slug_reference, direction = user_feedback_collection.fields[0].references[0]
+    org_slug_reference, direction = user_collection.fields[0].references[0]
assert org_slug_reference == FieldAddress(
saas_config.fides_key, "projects", "organization", "slug"
)
assert direction == "from"

-    project_slug_reference, direction = user_feedback_collection.fields[1].references[0]
+    project_slug_reference, direction = user_collection.fields[1].references[0]
assert project_slug_reference == FieldAddress(
saas_config.fides_key, "projects", "slug"
)
35 changes: 24 additions & 11 deletions tests/task/test_graph_task.py
@@ -249,15 +249,14 @@ def test_pre_process_input_conversation_collection(
}

def test_pre_process_input_data_group_dependent_fields(self):
"""Test processing inputs where fields have been marked as dependent"""
"""Test processing inputs where several reference fields and an identity field have
been marked as dependent.
"""
traversal_with_grouped_inputs = traversal_paired_dependency()
n = traversal_with_grouped_inputs.traversal_node_dict[
CollectionAddress("mysql", "User")
]

-        task = MockSqlTask(
-            n, TaskResources(EMPTY_REQUEST, Policy(), connection_configs)
-        )
+        task = MockSqlTask(n, TaskResources(EMPTY_REQUEST, Policy(), connection_configs))

project_output = [
{
@@ -277,22 +276,36 @@ def test_pre_process_input_data_group_dependent_fields(self):
},
]

identity_output = [{"email": "email@gmail.com"}]
# Typical output - project ids and organization ids would be completely independent from each other
-        assert task.pre_process_input_data(project_output) == {
-            "organization": ["12345", "54321", "54321"],
+        assert task.pre_process_input_data(identity_output, project_output) == {
+            "email": ["email@gmail.com"],
             "project": ["abcde", "fghij", "klmno"],
+            "organization": ["12345", "54321", "54321"],
"fidesops_grouped_inputs": [],
}

        # With group_dependent_fields=True, grouped fields are nested under the
        # "fidesops_grouped_inputs" key, which shouldn't overlap with actual table keys
assert task.pre_process_input_data(
-            project_output, group_dependent_fields=True
+            identity_output, project_output, group_dependent_fields=True
) == {
"fidesops_grouped_inputs": [
{"organization": ["12345"], "project": ["abcde"]},
{"organization": ["54321"], "project": ["fghij"]},
{"organization": ["54321"], "project": ["klmno"]},
{
"project": ["abcde"],
"organization": ["12345"],
"email": ["email@gmail.com"],
},
{
"project": ["fghij"],
"organization": ["54321"],
"email": ["email@gmail.com"],
},
{
"project": ["klmno"],
"organization": ["54321"],
"email": ["email@gmail.com"],
},
]
}

7 changes: 5 additions & 2 deletions tests/task/traversal_data.py
@@ -610,6 +610,9 @@ def traversal_paired_dependency() -> Traversal:
)
users = Collection(
name="User",
after={
CollectionAddress("mysql", "Project"),
},
fields=[
ScalarField(
name="project",
@@ -620,11 +623,11 @@ def traversal_paired_dependency() -> Traversal:
references=[(FieldAddress("mysql", "Project", "organization_id"), "from")],
),
ScalarField(name="username"),
ScalarField(name="email"),
ScalarField(name="email", identity="email"),
ScalarField(name="position"),

],
-        grouped_inputs= {"project", "organization"}
+        grouped_inputs={"project", "organization", "email"}
)

mysql = Dataset(
