forked from amundsen-io/amundsen
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: s3 preview client (amundsen-io#1499)
This commit implements a new preview client that gets preview data from S3. The preview client is good for folks who want to persist their preview data somewhere rather than relying on an external API call that can both fail and take a while depending on the table the query is being run on. In addition to the base client, I implemented a JSON version of the preview client that my organization is using. This fetches data from S3 in a JSON format and works very nicely with marshmallow serialization to the PreviewData format. Signed-off-by: jroof88 <jack.roof@samsara.com>
- Loading branch information
Showing
3 changed files
with
141 additions
and
0 deletions.
There are no files selected for viewing
48 changes: 48 additions & 0 deletions
48
frontend/amundsen_application/base/base_s3_preview_client.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
# Copyright Contributors to the Amundsen project. | ||
# SPDX-License-Identifier: Apache-2.0 | ||
|
||
import abc | ||
import logging | ||
from http import HTTPStatus | ||
from typing import Dict | ||
|
||
from amundsen_application.base.base_preview_client import BasePreviewClient | ||
from amundsen_application.models.preview_data import (PreviewData, | ||
PreviewDataSchema) | ||
from flask import Response as FlaskResponse | ||
from flask import jsonify, make_response | ||
from marshmallow import ValidationError | ||
|
||
|
||
class BaseS3PreviewClient(BasePreviewClient):
    """
    Base class for preview clients that read precomputed preview data from S3.

    Subclasses implement `get_s3_preview_data`; `get_preview_data` serializes the returned
    PreviewData with PreviewDataSchema, validates it and wraps it in a Flask response.
    """

    def __init__(self) -> None:
        pass

    @abc.abstractmethod
    def get_s3_preview_data(self, *, params: Dict) -> PreviewData:
        """
        Returns the data from S3 in PreviewData model format
        """
        pass  # pragma: no cover

    @staticmethod
    def _empty_preview_response(status: HTTPStatus) -> FlaskResponse:
        """
        Builds a response carrying an empty 'preview_data' payload and the given status code
        """
        return make_response(jsonify({'preview_data': {}}), status)

    def get_preview_data(self, params: Dict, optionalHeaders: Dict = None) -> FlaskResponse:
        """
        Fetches the preview data through `get_s3_preview_data` and returns it as a Flask response.

        :param params: forwarded unchanged to `get_s3_preview_data`
        :param optionalHeaders: accepted for interface compatibility; not used by this client
        :return: 200 with {'preview_data': <serialized data>} on success, or 500 with an empty
                 'preview_data' payload when fetching or schema validation fails
        """
        try:
            preview_data = self.get_s3_preview_data(params=params)
            try:
                data = PreviewDataSchema().dump(preview_data)
                PreviewDataSchema().load(data)  # for validation only
                payload = jsonify({'preview_data': data})
                return make_response(payload, HTTPStatus.OK)
            except ValidationError as err:
                logging.error("PreviewDataSchema serialization error %s", err.messages)
                return self._empty_preview_response(HTTPStatus.INTERNAL_SERVER_ERROR)
        except Exception as err:
            # Any failure becomes a 500; the traceback is logged so it can be diagnosed.
            logging.exception("error getting s3 preview data %s", err)
            return self._empty_preview_response(HTTPStatus.INTERNAL_SERVER_ERROR)

    def get_feature_preview_data(self, params: Dict, optionalHeaders: Dict = None) -> FlaskResponse:
        """
        BaseS3PreviewClient only supports data preview currently but this function needs to be stubbed to
        implement the BasePreviewClient interface.

        Returns a 501 (Not Implemented) response with an empty 'preview_data' payload rather than
        None, so the result always matches the declared FlaskResponse return type.
        """
        return self._empty_preview_response(HTTPStatus.NOT_IMPLEMENTED)
73 changes: 73 additions & 0 deletions
73
frontend/amundsen_application/base/examples/example_s3_json_preview_client.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
# Copyright Contributors to the Amundsen project. | ||
# SPDX-License-Identifier: Apache-2.0 | ||
|
||
import json | ||
import os | ||
from typing import Dict | ||
|
||
import boto3 | ||
from amundsen_application.base.base_s3_preview_client import \ | ||
BaseS3PreviewClient | ||
from amundsen_application.models.preview_data import ColumnItem, PreviewData | ||
|
||
|
||
class S3JSONPreviewClient(BaseS3PreviewClient):
    """
    S3JSONPreviewClient is an S3 Preview Client that:
        1. Gets JSON files from S3 that are stored in a bucket with keys preview_data/{schema}/{table}.json
        2. Converts the JSON values to PreviewData model
        3. Returns the serialized model
    In order for this preview client to work you must:
        - Have S3 files stored in a bucket with keys 'preview_data/{schema}/{table}.json'
        - Files are formatted as list of rows as map with key being the column name and value being column value
            Ex:
            [
                {
                    'col1': 1,
                    'col2': '2'
                },
                {
                    'col1': 3,
                    'col2': '4'
                }
                ...
            ]
        - Nested fields are not supported. We suggest flattening your nested fields.
            Ex:
            [
                {
                    'col1': {
                        'col2': 1
                    }
                }
            ]
            should be:
            [
                {
                    'col1.col2': 1
                }
            ]
        - Run your frontend service with an IAM Profile that has s3:GetObject permissions on the 'preview_data/' prefix
        - Set the PREVIEW_CLIENT_S3_BUCKET environment variable to the bucket holding the files
    """

    def __init__(self) -> None:
        """
        Creates the boto3 S3 client and reads the bucket name from PREVIEW_CLIENT_S3_BUCKET.

        :raises Exception: if PREVIEW_CLIENT_S3_BUCKET is unset or empty
        """
        self.s3 = boto3.client("s3")
        bucket = os.getenv("PREVIEW_CLIENT_S3_BUCKET")
        # os.getenv returns None (not "") when the variable is unset, so reject any falsy value.
        if not bucket:
            raise Exception("When using the S3JSONPreviewClient you must set the PREVIEW_CLIENT_S3_BUCKET environment "
                            "variable to point to where your preview_data JSON files are stored.")
        self.s3_bucket = bucket

    def get_s3_preview_data(self, *, params: Dict) -> PreviewData:
        """
        Loads 'preview_data/{schema}/{table}.json' from the configured bucket as PreviewData.

        :param params: dict read for the 'schema' and 'tableName' keys
        :return: PreviewData whose columns are named after the keys of the first row; an empty
                 JSON list produces PreviewData with no columns
        :raises Exception: if the object cannot be fetched from S3 (original error chained as the cause)
        """
        schema = params.get("schema")
        table = params.get("tableName")
        key = f"preview_data/{schema}/{table}.json"

        try:
            obj = self.s3.get_object(Bucket=self.s3_bucket, Key=key)
        except Exception as e:
            raise Exception(f"Error getting object from s3. {key} Caused by: {e}") from e

        data = json.loads(obj['Body'].read().decode('utf-8'))
        # Column names come from the first row; an empty file has no row to inspect.
        first_row = data[0] if data else {}
        columns = [ColumnItem(col_name, '') for col_name in first_row]  # TODO: figure out how to do Type. Is it needed?
        return PreviewData(columns=columns, data=data)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
# Overview | ||
|
||
Amundsen's data preview feature requires that developers create a custom implementation of `base_preview_client` for requesting that data. This feature assists with data discovery by providing the end user the option to view a sample of the actual resource data so that they can verify whether or not they want to transition into exploring that data, or continue their search. | ||
|
||
[S3](https://aws.amazon.com/s3/) is AWS object storage and is used in this scenario for storing precomputed preview data. | ||
|
||
## Implementation | ||
|
||
Implement the `base_s3_preview_client` to make a request to AWS using [boto3](https://boto3.amazonaws.com/v1/documentation/api/latest/index.html) to fetch your preview data. | ||
Your preview data needs to already be stored in S3 for this to work. | ||
You can take a look at `example_s3_json_preview_client` to see a working implementation that fetches JSON files. | ||
|
||
|
||
## Usage | ||
|
||
To use a preview client set these environment variables in your deployment. | ||
|
||
- `PREVIEW_CLIENT_ENABLED`: `true` | ||
- `PREVIEW_CLIENT`: `{python path to preview client class}` (ex: `amundsen_application.base.examples.example_s3_json_preview_client.S3JSONPreviewClient` if you are using the JSON example client) | ||
- `PREVIEW_CLIENT_S3_BUCKET`: `{S3 bucket where the preview data is stored}` |