Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add server config option to disable validation of outgoing data #1530

Merged
merged 3 commits into from
Feb 27, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,12 @@ jobs:
env:
OPTIMADE_DATABASE_BACKEND: 'mongomock'

- name: Run server tests with no API validation (using `mongomock`)
run: pytest -rs -vvv --cov=./optimade/ --cov-report=xml --cov-append tests/server tests/filtertransformers
env:
OPTIMADE_DATABASE_BACKEND: 'mongomock'
OPTIMADE_VALIDATE_API_RESPONSE: false

- name: Run server tests (using a real MongoDB)
run: pytest -rs -vvv --cov=./optimade/ --cov-report=xml --cov-append tests/server tests/filtertransformers
env:
Expand Down
6 changes: 6 additions & 0 deletions optimade/server/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -292,6 +292,12 @@ class ServerConfig(BaseSettings):
description="If True, the server will check whether the query parameters given in the request are correct.",
)

validate_api_response: Optional[bool] = Field(
True,
description="""If False, data from the database will not undergo validation before being emitted by the API, and
only the mapping of aliases will occur.""",
)

@validator("implementation", pre=True)
def set_implementation_version(cls, v):
"""Set defaults and modify bypassed value(s)"""
Expand Down
36 changes: 23 additions & 13 deletions optimade/server/entry_collections/entry_collections.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ def count(self, **kwargs: Any) -> int:
def find(
self, params: Union[EntryListingQueryParams, SingleEntryQueryParams]
) -> Tuple[
Union[List[EntryResource], EntryResource],
Union[None, List[EntryResource], EntryResource, List[Dict]],
int,
bool,
Set[str],
Expand All @@ -135,6 +135,11 @@ def find(
See [`EntryListingQueryParams`][optimade.server.query_params.EntryListingQueryParams]
for more information.

Returns either the list of validated pydantic models matching the query, or simply the
mapped database reponse, depending on the value of `CONFIG.validate_api_response`.

If no results match the query, then `results` is set to `None`.

Parameters:
params: Entry listing URL query params.

Expand All @@ -151,14 +156,6 @@ def find(
criteria, single_entry
)

if single_entry:
raw_results = raw_results[0] if raw_results else None # type: ignore[assignment]

if data_returned > 1:
raise NotFound(
detail=f"Instead of a single entry, {data_returned} entries were found",
)

exclude_fields = self.all_fields - response_fields
include_fields = (
response_fields - self.resource_mapper.TOP_LEVEL_NON_ATTRIBUTES_FIELDS
Expand Down Expand Up @@ -189,10 +186,23 @@ def find(
detail=f"Unrecognised OPTIMADE field(s) in requested `response_fields`: {bad_optimade_fields}."
)

if raw_results is not None:
results = self.resource_mapper.deserialize(raw_results)
else:
results = None
results: Union[None, List[EntryResource], EntryResource, List[Dict]] = None

if raw_results:
if CONFIG.validate_api_response:
results = self.resource_mapper.deserialize(raw_results)
else:
results = [self.resource_mapper.map_back(doc) for doc in raw_results]

if single_entry:
results = results[0] # type: ignore[assignment]

if CONFIG.validate_api_response and data_returned > 1:
raise NotFound(
detail=f"Instead of a single entry, {data_returned} entries were found",
)
else:
data_returned = 1

return (
results,
Expand Down
4 changes: 4 additions & 0 deletions optimade/server/mappers/entries.py
Original file line number Diff line number Diff line change
Expand Up @@ -363,6 +363,10 @@ def map_back(cls, doc: dict) -> dict:
def deserialize(
cls, results: Union[dict, Iterable[dict]]
) -> Union[List[EntryResource], EntryResource]:
"""Converts the raw database entries for this class into serialized models,
mapping the data along the way.

"""
if isinstance(results, dict):
return cls.ENTRY_RESOURCE_CLASS(**cls.map_back(results))
Comment on lines 363 to 371
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think these lines are no longer needed for our implementation, now we always pass a list.

Suggested change
def deserialize(
cls, results: Union[dict, Iterable[dict]]
) -> Union[List[EntryResource], EntryResource]:
"""Converts the raw database entries for this class into serialized models,
mapping the data along the way.
"""
if isinstance(results, dict):
return cls.ENTRY_RESOURCE_CLASS(**cls.map_back(results))
def deserialize(
cls, results: Iterable[dict]
) -> List[EntryResource]:
"""Converts the raw database entries for this class into serialized models,
mapping the data along the way.
"""


Expand Down
30 changes: 22 additions & 8 deletions optimade/server/routers/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ def meta_values(


def handle_response_fields(
results: Union[List[EntryResource], EntryResource],
results: Union[List[EntryResource], EntryResource, List[Dict], Dict],
exclude_fields: Set[str],
include_fields: Set[str],
) -> List[Dict[str, Any]]:
Expand All @@ -115,7 +115,11 @@ def handle_response_fields(

new_results = []
while results:
new_entry = results.pop(0).dict(exclude_unset=True, by_alias=True)
new_entry = results.pop(0)
try:
new_entry = new_entry.dict(exclude_unset=True, by_alias=True) # type: ignore[union-attr]
except AttributeError:
pass
Comment on lines +119 to +122
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You should not use try and except here. Handling an exception is very slow. So you should only use it when failure is rare (< 1%).
When validation is turned off the new_entry is however always a dictionary, so failure is not rare.
It is therefore better to do:

Suggested change
try:
new_entry = new_entry.dict(exclude_unset=True, by_alias=True) # type: ignore[union-attr]
except AttributeError:
pass
if not isinstance(new_entry, dict):
new_entry = new_entry.dict(exclude_unset=True, by_alias=True) # type: ignore[union-attr]

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure this is so clear-cut; I just made an artificial benchmark with a very simple pydantic model with exception handling and isinstance checks.

If you use exception handling then the validate_api_response: true (our default) branch is about 1% faster using exceptions than not, and the isinstance check is about 2% faster when you are passing raw dictionaries, i.e., not much changes. This is also dwarfed by the difference between using dicts vs pydantic models, which is a factor of 20x.

I would rather avoid slowing down the "slower" method, i.e., using exception handling by default.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If performance is important, the database will probably turn off validation to speed things up.
If performance is less important, the database will use a slower method and leave the validation on.
So I would argue, it would be the best to make the fastest method as fast as possible.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Perhaps, though I'm not convinced that disabling validation provides any meaningful performance boost, and instead is just used to bypass some of the strict rules we have on databases where the effort is too much to apply them (e.g., NOMAD uses "X" in like 10 out of millions of chemical formulae, and trying to query them with validation on causes crashes).

Copy link
Contributor

@JPBergsma JPBergsma Mar 7, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I just did a quick try on my laptop with the test data, and it takes 25% longer to process the response request with validation. So it is not a huge performance increase, but definitively noticeable.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wow, really? I tried via the validator and could only get 1-2% difference. I'll re-investigate if I get time.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I meant that the total processing time of a request increases by 25% if I do the validation, compared to not validating.

I just did some more testing and it seems that the try except block takes about 1.5 times longer to execute than the "if" statement. Using "if" saves about 2.25 µs per entry. This is smaller than what I had expected. So for our example server we would only save 40 µs on 0.2 s so only 0.02%.
So it is probably not worth continuing this discussion.


# Remove fields excluded by their omission in `response_fields`
for field in exclude_fields:
Expand All @@ -133,7 +137,7 @@ def handle_response_fields(


def get_included_relationships(
results: Union[EntryResource, List[EntryResource]],
results: Union[EntryResource, List[EntryResource], Dict, List[Dict]],
ENTRY_COLLECTIONS: Dict[str, EntryCollection],
include_param: List[str],
) -> List[Union[EntryResource, Dict]]:
Expand Down Expand Up @@ -170,11 +174,17 @@ def get_included_relationships(
if doc is None:
continue

relationships = doc.relationships
try:
relationships = doc.relationships # type: ignore
except AttributeError:
relationships = doc.get("relationships", None)
JPBergsma marked this conversation as resolved.
Show resolved Hide resolved

if relationships is None:
continue

relationships = relationships.dict()
if not isinstance(relationships, dict):
relationships = relationships.dict()

for entry_type in ENTRY_COLLECTIONS:
# Skip entry type if it is not in `include_param`
if entry_type not in include_param:
Expand All @@ -187,7 +197,9 @@ def get_included_relationships(
if ref["id"] not in endpoint_includes[entry_type]:
endpoint_includes[entry_type][ref["id"]] = ref

included = {}
included: Dict[
str, Union[List[EntryResource], EntryResource, List[Dict], Dict]
] = {}
for entry_type in endpoint_includes:
compound_filter = " OR ".join(
['id="{}"'.format(ref_id) for ref_id in endpoint_includes[entry_type]]
Expand All @@ -203,6 +215,8 @@ def get_included_relationships(

# still need to handle pagination
ref_results, _, _, _, _ = ENTRY_COLLECTIONS[entry_type].find(params)
if ref_results is None:
ref_results = []
included[entry_type] = ref_results

# flatten dict by endpoint to list
Expand Down Expand Up @@ -273,7 +287,7 @@ def get_entries(

return response(
links=links,
data=results,
data=results if results else [],
meta=meta_values(
url=request.url,
data_returned=data_returned,
Expand Down Expand Up @@ -326,7 +340,7 @@ def get_single_entry(

return response(
links=links,
data=results,
data=results if results else None,
meta=meta_values(
url=request.url,
data_returned=data_returned,
Expand Down