Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

APIDataSet reduce number of arguments and move to Load args #1633

Merged
merged 22 commits into from
Sep 12, 2022
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
a0a7464
initial version with load args.
daBlesr Jun 19, 2022
728b50a
changed configurations into load_args. Created BaseAuthFactory to ins…
daBlesr Jun 17, 2022
9e16c86
Custom authenticator instances from auth_type & credentials
daBlesr Jun 18, 2022
620a22f
added a bunch more tests that shows some new capabilities such as str…
daBlesr Jun 19, 2022
362b38d
linting
daBlesr Jun 19, 2022
82584e6
removed unnecessary comments
daBlesr Jun 19, 2022
e49140c
add tests for auth factory for completing coverage.
daBlesr Jun 19, 2022
68b293c
removed implementation auth_type. Moved documentation to 0.19.0 and a…
daBlesr Jun 20, 2022
78f9a2c
removed unused variable DEFAULT_CREDENTIALS.
daBlesr Jun 20, 2022
6f1eaab
changed example for api_dataset to show credentials input as a tuple.
daBlesr Jun 20, 2022
6f446e7
cert, auth, cred, timeout, should be able to be provided as lists and…
daBlesr Jun 25, 2022
0e3b5f3
lint
daBlesr Jun 25, 2022
b54e1bd
changed type of credentials in constructor.
daBlesr Jun 27, 2022
6ef284b
changed type of credentials in constructor.
daBlesr Jun 27, 2022
5c33895
prevent None objects from being logged. Updated release description.
daBlesr Jun 29, 2022
9b28fff
Merge branch 'develop' of https://github.com/kedro-org/kedro into fea…
daBlesr Jun 29, 2022
922815f
merge develop
daBlesr Jun 29, 2022
f4f2f34
Test commit
antonymilne Sep 12, 2022
4a16385
Merge branch 'develop' into feature/api-dataset-load-args
antonymilne Sep 12, 2022
b17ddbf
Add AuthBase to type hint
antonymilne Sep 12, 2022
dd028c3
Apply suggestions from code review
antonymilne Sep 12, 2022
3de2915
Apply suggestions from code review
antonymilne Sep 12, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion RELEASE.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
* Bumped the upper bound for the Flake8 dependency to <5.0.
* `kedro jupyter notebook/lab` no longer reuses a Jupyter kernel.
* Required `cookiecutter>=2.1.1` to address a [known command injection vulnerability](https://security.snyk.io/vuln/SNYK-PYTHON-COOKIECUTTER-2414281).

* Reduced constructor arguments for `APIDataSet`, and made it consistent w.r.t. requests API, automatically enabling the full configuration domain: stream, certificates, proxies, and more.
daBlesr marked this conversation as resolved.
Show resolved Hide resolved

## Upcoming deprecations for Kedro 0.19.0
* `kedro.extras.ColorHandler` will be removed in 0.19.0.
Expand Down
6 changes: 3 additions & 3 deletions docs/source/data/data_catalog.md
Original file line number Diff line number Diff line change
Expand Up @@ -265,12 +265,12 @@ us_corn_yield_data:
year: 2000
```

Note that `usda_credientials` will be passed as the `auth` argument in the `requests` library. Specify the username and password as a list in your `credentials.yml` as follows:
Note that `usda_credentials` will be passed as the `auth` argument in the `requests` library. Specify the username and password as a dictionary in your `credentials.yml` as follows:

```yaml
usda_credentials:
- username
- password
username: John
password: Doe
```

Example 14: Loading data from Minio (S3 API Compatible Storage)
Expand Down
80 changes: 31 additions & 49 deletions kedro/extras/datasets/api/api_dataset.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
"""``APIDataSet`` loads the data from HTTP(S) APIs.
It uses the python requests library: https://requests.readthedocs.io/en/master/
"""
from typing import Any, Dict, Iterable, List, Union
from copy import deepcopy
from typing import Any, Dict

import requests
from requests.auth import AuthBase

from kedro.extras.datasets.api.auth_factory import create_authenticator
from kedro.io.core import AbstractDataSet, DataSetError

_DEFAULT_CREDENTIALS: Dict[str, Any] = {}


class APIDataSet(AbstractDataSet):
"""``APIDataSet`` loads the data from HTTP(S) APIs.
Expand All @@ -21,14 +24,17 @@ class APIDataSet(AbstractDataSet):
>>>
>>> data_set = APIDataSet(
>>> url="https://quickstats.nass.usda.gov",
>>> params={
>>> "key": "SOME_TOKEN",
>>> "format": "JSON",
>>> "commodity_desc": "CORN",
>>> "statisticcat_des": "YIELD",
>>> "agg_level_desc": "STATE",
>>> "year": 2000
>>> }
>>> load_args={
>>> "params": {
>>> "key": "SOME_TOKEN",
>>> "format": "JSON",
>>> "commodity_desc": "CORN",
>>> "statisticcat_des": "YIELD",
>>> "agg_level_desc": "STATE",
>>> "year": 2000
>>> }
>>> },
>>> credentials={"username": "John", "password": "Doe"}
>>> )
>>> data = data_set.load()
"""
Expand All @@ -38,58 +44,35 @@ def __init__(
self,
url: str,
method: str = "GET",
data: Any = None,
params: Dict[str, Any] = None,
headers: Dict[str, Any] = None,
auth: Union[Iterable[str], AuthBase] = None,
json: Union[List, Dict[str, Any]] = None,
timeout: int = 60,
credentials: Union[Iterable[str], AuthBase] = None,
auth_type: str = "requests.auth.HTTPBasicAuth",
load_args: Dict[str, Any] = None,
credentials: Dict[str, Any] = None,
) -> None:
"""Creates a new instance of ``APIDataSet`` to fetch data from an API endpoint.

Args:
url: The API URL endpoint.
method: The Method of the request, GET, POST, PUT, DELETE, HEAD, etc...
data: The request payload, used for POST, PUT, etc requests
https://requests.readthedocs.io/en/master/user/quickstart/#more-complicated-post-requests
params: The url parameters of the API.
https://requests.readthedocs.io/en/master/user/quickstart/#passing-parameters-in-urls
headers: The HTTP headers.
https://requests.readthedocs.io/en/master/user/quickstart/#custom-headers
auth: Anything ``requests`` accepts. Normally it's either ``('login', 'password')``,
or ``AuthBase``, ``HTTPBasicAuth`` instance for more complex cases. Any
iterable will be cast to a tuple.
json: The request payload, used for POST, PUT, etc requests, passed in
to the json kwarg in the requests object.
https://requests.readthedocs.io/en/master/user/quickstart/#more-complicated-post-requests
timeout: The wait time in seconds for a response, defaults to 1 minute.
https://requests.readthedocs.io/en/master/user/quickstart/#timeouts
credentials: same as ``auth``. Allows specifying ``auth`` secrets in
credentials.yml.

Raises:
ValueError: if both ``credentials`` and ``auth`` are specified.
load_args: Additional parameters to be fed to requests.request.
https://docs.python-requests.org/en/latest/api/
antonymilne marked this conversation as resolved.
Show resolved Hide resolved
auth_type: provide type to construct a Requests `BaseAuth` object.
credentials: Allows specifying secrets in credentials.yml.
daBlesr marked this conversation as resolved.
Show resolved Hide resolved
"""
super().__init__()

if credentials is not None and auth is not None:
raise ValueError("Cannot specify both auth and credentials.")

auth = credentials or auth
self._credentials = deepcopy(_DEFAULT_CREDENTIALS)
if credentials is not None:
self._credentials.update(credentials)

if isinstance(auth, Iterable):
auth = tuple(auth)
self._auth = None
if credentials is not None:
self._auth = create_authenticator(class_type=auth_type, **credentials)

self._request_args: Dict[str, Any] = {
**(load_args or {}),
daBlesr marked this conversation as resolved.
Show resolved Hide resolved
"url": url,
"method": method,
"data": data,
"params": params,
"headers": headers,
"auth": auth,
"json": json,
"timeout": timeout,
"auth": self._auth,
daBlesr marked this conversation as resolved.
Show resolved Hide resolved
}

def _describe(self) -> Dict[str, Any]:
Expand All @@ -114,5 +97,4 @@ def _save(self, data: Any) -> None:

def _exists(self) -> bool:
response = self._execute_request()

return response.ok
47 changes: 47 additions & 0 deletions kedro/extras/datasets/api/auth_factory.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
"""``auth_factory`` creates `requests.auth.AuthBase` instances from Catalog configuration.
"""
from requests.auth import AuthBase

from kedro.io import DataSetError
from kedro.utils import load_obj


def create_authenticator(class_type: str, **kwargs):
    """Build a ``requests`` authenticator from a dotted class path.

    Args:
        class_type: path handed to ``load_obj`` to resolve the class; the
            instance built from it must be a `requests.auth.AuthBase`.
        **kwargs: keyword arguments forwarded to the class constructor.

    Returns:
        The newly constructed instance of the resolved class.
    Raises:
        DataSetError: if the class cannot be loaded, if instantiating it
            fails, or if the instance is not a `requests.auth.AuthBase`.
    """
    # Step 1: resolve the class; any loading failure is reported uniformly.
    try:
        auth_class = load_obj(class_type)
    except Exception as load_error:
        not_found_message = (
            f"The specified class path {class_type} "
            f"for constructing an Auth object cannot be found."
        )
        raise DataSetError(not_found_message) from load_error

    # Step 2: instantiate. A TypeError gets its own message because it
    # points at constructor arguments; everything else is a generic failure.
    try:
        instance = auth_class(**kwargs)  # type: ignore
    except Exception as init_error:
        if isinstance(init_error, TypeError):
            failure_message = (
                f"\n{init_error}.\nAuthenticator Object '{class_type}' "
                f"must only contain arguments valid for the "
                f"constructor of '{auth_class.__module__}.{auth_class.__qualname__}'."
            )
        else:
            failure_message = (
                f"\n{init_error}.\nFailed to instantiate Authenticator Object '{class_type}' "
                f"of type '{auth_class.__module__}.{auth_class.__qualname__}'."
            )
        raise DataSetError(failure_message) from init_error

    # Step 3: only AuthBase instances are accepted as the result.
    if isinstance(instance, AuthBase):
        return instance
    raise DataSetError(
        f"The requests library expects {class_type} to be an instance of AuthBase."
    )
8 changes: 8 additions & 0 deletions tests/extras/datasets/api/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
import pytest
import requests_mock


@pytest.fixture
def requests_mocker():
daBlesr marked this conversation as resolved.
Show resolved Hide resolved
with requests_mock.Mocker() as mock:
yield mock
82 changes: 82 additions & 0 deletions tests/extras/datasets/api/test_api_auth.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
import base64

import pytest
import requests
from requests.auth import AuthBase

from kedro.extras.datasets.api import APIDataSet
from tests.extras.datasets.api.test_api_dataset import (
TEST_HEADERS,
TEST_METHOD,
TEST_TEXT_RESPONSE_DATA,
TEST_URL,
)


class AccessTokenAuth(AuthBase):
    """Auth hook that writes an access-token ``Authorization`` header onto a request."""

    def __init__(self, token):
        # Kept verbatim; it is interpolated into the header on each call.
        self.token = token

    def __call__(self, r):
        header_value = f"access_token {self.token}"
        r.headers["Authorization"] = header_value
        return r


def _basic_auth(username, password):
    """Return the ``Basic <base64>`` header value for ``username:password``."""
    raw_pair = f"{username}:{password}".encode("latin-1")
    token = base64.b64encode(raw_pair).decode("latin-1")
    return f"Basic {token}"


class TestApiAuth:
    """Checks that credentials are turned into authenticators applied to the request."""

    @pytest.mark.parametrize(
        "auth_type,auth_cred,auth_header_key, auth_header_value",
        [
            # HTTP basic auth -> ``Authorization`` header.
            (
                "requests.auth.HTTPBasicAuth",
                {"username": "john", "password": "doe"},
                "Authorization",
                _basic_auth("john", "doe"),
            ),
            # Proxy auth -> ``Proxy-Authorization`` header.
            (
                "requests.auth.HTTPProxyAuth",
                {"username": "john", "password": "doe"},
                "Proxy-Authorization",
                _basic_auth("john", "doe"),
            ),
            # Custom authenticator defined in this test module.
            (
                "tests.extras.datasets.api.test_api_auth.AccessTokenAuth",
                {"token": "abc"},
                "Authorization",
                "access_token abc",
            ),
        ],
    )
    def test_auth_sequence(
        self, requests_mocker, auth_cred, auth_type, auth_header_key, auth_header_value
    ):
        """
        Each parametrized case builds an ``APIDataSet`` with an ``auth_type``
        and ``credentials``, loads it against a mocked endpoint, and verifies
        that the last request carries the expected authentication header.
        """
        # Register the mocked endpoint before building the data set.
        requests_mocker.register_uri(
            TEST_METHOD, TEST_URL, headers=TEST_HEADERS, text=TEST_TEXT_RESPONSE_DATA
        )

        dataset = APIDataSet(
            url=TEST_URL,
            method=TEST_METHOD,
            auth_type=auth_type,
            load_args={"headers": TEST_HEADERS},
            credentials=auth_cred,
        )

        loaded = dataset.load()
        assert isinstance(loaded, requests.Response)
        assert loaded.text == TEST_TEXT_RESPONSE_DATA

        # The authenticator must have written its header onto the request.
        sent_headers = requests_mocker.last_request.headers
        assert sent_headers[auth_header_key] == auth_header_value
Loading