Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

APIDataSet reduce number of arguments and move to Load args #1633

Merged
merged 22 commits into from
Sep 12, 2022
Merged
Show file tree
Hide file tree
Changes from 20 commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
a0a7464
initial version with load args.
daBlesr Jun 19, 2022
728b50a
changed configurations into load_args. Created BaseAuthFactory to ins…
daBlesr Jun 17, 2022
9e16c86
Custom authenticator instances from auth_type & credentials
daBlesr Jun 18, 2022
620a22f
added a bunch more tests that shows some new capabilities such as str…
daBlesr Jun 19, 2022
362b38d
linting
daBlesr Jun 19, 2022
82584e6
removed unnecessary comments
daBlesr Jun 19, 2022
e49140c
add tests for auth factory for completing coverage.
daBlesr Jun 19, 2022
68b293c
removed implementation auth_type. Moved documentation to 0.19.0 and a…
daBlesr Jun 20, 2022
78f9a2c
removed unused variable DEFAULT_CREDENTIALS.
daBlesr Jun 20, 2022
6f1eaab
changed example for api_dataset to show credentials input as a tuple.
daBlesr Jun 20, 2022
6f446e7
cert, auth, cred, timeout, should be able to be provided as lists and…
daBlesr Jun 25, 2022
0e3b5f3
lint
daBlesr Jun 25, 2022
b54e1bd
changed type of credentials in constructor.
daBlesr Jun 27, 2022
6ef284b
changed type of credentials in constructor.
daBlesr Jun 27, 2022
5c33895
prevent None objects from being logged. Updated release description.
daBlesr Jun 29, 2022
9b28fff
Merge branch 'develop' of https://github.com/kedro-org/kedro into fea…
daBlesr Jun 29, 2022
922815f
merge develop
daBlesr Jun 29, 2022
f4f2f34
Test commit
antonymilne Sep 12, 2022
4a16385
Merge branch 'develop' into feature/api-dataset-load-args
antonymilne Sep 12, 2022
b17ddbf
Add AuthBase to type hint
antonymilne Sep 12, 2022
dd028c3
Apply suggestions from code review
antonymilne Sep 12, 2022
3de2915
Apply suggestions from code review
antonymilne Sep 12, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions RELEASE.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@

## Breaking changes to the API

### DataSets
* Reduced constructor arguments for `APIDataSet` by replacing most arguments with a single constructor argument `load_args`. This makes it more consistent with other Kedro DataSets and the underlying `requests` API, and automatically enables the full configuration domain: stream, certificates, proxies, and more.

antonymilne marked this conversation as resolved.
Show resolved Hide resolved

### CLI
* Removed deprecated `kedro docs` command.

Expand All @@ -15,6 +19,10 @@

## Migration guide from Kedro 0.18.* to 0.19.*

### DataSets

* If you use `APIDataSet`, move all `requests`-specific arguments (e.g. `params`, `headers`), except for `url` and `method`, under `load_args`.

# Upcoming Release 0.18.3

## Major features and improvements
Expand Down
15 changes: 8 additions & 7 deletions docs/source/data/data_catalog.md
Original file line number Diff line number Diff line change
Expand Up @@ -271,13 +271,14 @@ us_corn_yield_data:
type: api.APIDataSet
url: https://quickstats.nass.usda.gov
credentials: usda_credentials
params:
key: SOME_TOKEN
format: JSON
commodity_desc: CORN
statisticcat_des: YIELD
agg_level_desc: STATE
year: 2000
load_args:
params:
key: SOME_TOKEN
format: JSON
commodity_desc: CORN
statisticcat_des: YIELD
agg_level_desc: STATE
year: 2000
```

Note that `usda_credentials` will be passed as the `auth` argument in the `requests` library. Specify the username and password as a list in your `credentials.yml` file as follows:
Expand Down
104 changes: 53 additions & 51 deletions kedro/extras/datasets/api/api_dataset.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
"""``APIDataSet`` loads the data from HTTP(S) APIs.
It uses the python requests library: https://requests.readthedocs.io/en/latest/
"""
from typing import Any, Dict, Iterable, List, NoReturn, Union
from typing import Any, Dict, List, NoReturn, Tuple, Union

import requests
from requests import Session, sessions
from requests.auth import AuthBase

from kedro.io.core import AbstractDataSet, DataSetError
Expand All @@ -21,83 +22,83 @@ class APIDataSet(AbstractDataSet[None, requests.Response]):
>>>
>>> data_set = APIDataSet(
>>> url="https://quickstats.nass.usda.gov",
>>> params={
>>> "key": "SOME_TOKEN",
>>> "format": "JSON",
>>> "commodity_desc": "CORN",
>>> "statisticcat_des": "YIELD",
>>> "agg_level_desc": "STATE",
>>> "year": 2000
>>> }
>>> load_args={
>>> "params": {
>>> "key": "SOME_TOKEN",
>>> "format": "JSON",
>>> "commodity_desc": "CORN",
>>> "statisticcat_des": "YIELD",
>>> "agg_level_desc": "STATE",
>>> "year": 2000
>>> }
>>> },
>>> credentials=("username", "password")
>>> )
>>> data = data_set.load()
"""

# pylint: disable=too-many-arguments
def __init__(
self,
url: str,
method: str = "GET",
data: Any = None,
params: Dict[str, Any] = None,
headers: Dict[str, Any] = None,
auth: Union[Iterable[str], AuthBase] = None,
json: Union[List, Dict[str, Any]] = None,
timeout: int = 60,
credentials: Union[Iterable[str], AuthBase] = None,
load_args: Dict[str, Any] = None,
credentials: Union[Tuple[str, str], List[str], AuthBase] = None,
) -> None:
"""Creates a new instance of ``APIDataSet`` to fetch data from an API endpoint.

Args:
url: The API URL endpoint.
method: The Method of the request, GET, POST, PUT, DELETE, HEAD, etc...
data: The request payload, used for POST, PUT, etc requests
https://requests.readthedocs.io/en/latest/user/quickstart/#more-complicated-post-requests
params: The url parameters of the API.
https://requests.readthedocs.io/en/latest/user/quickstart/#passing-parameters-in-urls
headers: The HTTP headers.
https://requests.readthedocs.io/en/latest/user/quickstart/#custom-headers
auth: Anything ``requests`` accepts. Normally it's either ``('login', 'password')``,
or ``AuthBase``, ``HTTPBasicAuth`` instance for more complex cases. Any
iterable will be cast to a tuple.
json: The request payload, used for POST, PUT, etc requests, passed in
to the json kwarg in the requests object.
https://requests.readthedocs.io/en/latest/user/quickstart/#more-complicated-post-requests
timeout: The wait time in seconds for a response, defaults to 1 minute.
https://requests.readthedocs.io/en/latest/user/quickstart/#timeouts
credentials: same as ``auth``. Allows specifying ``auth`` secrets in
credentials.yml.

load_args: Additional parameters to be fed to requests.request.
https://docs.python-requests.org/en/latest/api/
antonymilne marked this conversation as resolved.
Show resolved Hide resolved
credentials: Allows specifying secrets in credentials.yml.
daBlesr marked this conversation as resolved.
Show resolved Hide resolved
Expected format is ``('login', 'password')``.
antonymilne marked this conversation as resolved.
Show resolved Hide resolved
Raises:
ValueError: if both ``credentials`` and ``auth`` are specified.
ValueError: if both ``auth`` in ``load_args`` and ``credentials`` are specified.
"""
super().__init__()

if credentials is not None and auth is not None:
self._load_args = load_args or {}
self._load_args_auth = self._load_args.pop("auth", None)

if credentials is not None and self._load_args_auth is not None:
raise ValueError("Cannot specify both auth and credentials.")

auth = credentials or auth
self._auth = credentials or self._load_args_auth

if isinstance(auth, Iterable):
auth = tuple(auth)
if "cert" in self._load_args:
self._load_args["cert"] = self._convert_type(self._load_args["cert"])

if "timeout" in self._load_args:
self._load_args["timeout"] = self._convert_type(self._load_args["timeout"])

self._request_args: Dict[str, Any] = {
"url": url,
"method": method,
"data": data,
"params": params,
"headers": headers,
"auth": auth,
"json": json,
"timeout": timeout,
"auth": self._convert_type(self._auth),
**self._load_args,
}

@staticmethod
def _convert_type(value: Any):
"""
From the Catalog, iterations are provided as lists.
However, for some Parameters in the python Requests library,
only Tuples are allowed.
"""
antonymilne marked this conversation as resolved.
Show resolved Hide resolved
if isinstance(value, List):
return tuple(value)
return value

def _describe(self) -> Dict[str, Any]:
return dict(**self._request_args)
# prevent auth from logging
request_args_cp = self._request_args.copy()
request_args_cp.pop("auth", None)
daBlesr marked this conversation as resolved.
Show resolved Hide resolved
return request_args_cp

def _execute_request(self) -> requests.Response:
def _execute_request(self, session: Session) -> requests.Response:
try:
response = requests.request(**self._request_args)
response = session.request(**self._request_args)
response.raise_for_status()
except requests.exceptions.HTTPError as exc:
raise DataSetError("Failed to fetch data", exc) from exc
Expand All @@ -107,12 +108,13 @@ def _execute_request(self) -> requests.Response:
return response

def _load(self) -> requests.Response:
return self._execute_request()
with sessions.Session() as session:
return self._execute_request(session)
daBlesr marked this conversation as resolved.
Show resolved Hide resolved

def _save(self, data: None) -> NoReturn:
    """Reject any attempt to save, since this data set is read only.

    Args:
        data: Unused.

    Raises:
        DataSetError: Always.
    """
    dataset_name = self.__class__.__name__
    raise DataSetError(f"{dataset_name} is a read only data set type")

def _exists(self) -> bool:
response = self._execute_request()

with sessions.Session() as session:
response = self._execute_request(session)
return response.ok
Loading