From 20bdb07aca9e697d656853ee427933048b70d47e Mon Sep 17 00:00:00 2001 From: Jack Roof Date: Fri, 1 Oct 2021 21:42:26 -0700 Subject: [PATCH] feat: s3 preview client (#1499) This commit implements a new preview client that gets preview data from S3. The preview client is good for folks who want to persist their preview data somewhere rather than relying on an external API call that can both fail and take a while depending on the table the query is being run on. In addition to the base client, I implemented a JSON version of the preview client that my organization is using. This fetches data from S3 in a JSON format and works very nicely with marshmallow serialization to the PreviewData format. Signed-off-by: jroof88 --- .../base/base_s3_preview_client.py | 48 ++++++++++++ .../example_s3_json_preview_client.py | 73 +++++++++++++++++++ frontend/docs/examples/s3_preview_client.md | 20 +++++ 3 files changed, 141 insertions(+) create mode 100644 frontend/amundsen_application/base/base_s3_preview_client.py create mode 100644 frontend/amundsen_application/base/examples/example_s3_json_preview_client.py create mode 100644 frontend/docs/examples/s3_preview_client.md diff --git a/frontend/amundsen_application/base/base_s3_preview_client.py b/frontend/amundsen_application/base/base_s3_preview_client.py new file mode 100644 index 0000000000..98552b2525 --- /dev/null +++ b/frontend/amundsen_application/base/base_s3_preview_client.py @@ -0,0 +1,48 @@ +# Copyright Contributors to the Amundsen project. 
+# SPDX-License-Identifier: Apache-2.0
+
+import abc
+import logging
+from http import HTTPStatus
+from typing import Dict, Optional
+
+from amundsen_application.base.base_preview_client import BasePreviewClient
+from amundsen_application.models.preview_data import (PreviewData,
+                                                      PreviewDataSchema)
+from flask import Response as FlaskResponse
+from flask import jsonify, make_response
+from marshmallow import ValidationError
+
+
+class BaseS3PreviewClient(BasePreviewClient):
+    """
+    Base class for preview clients that serve preview data stored in S3.
+
+    Subclasses implement get_s3_preview_data; this class serializes the returned PreviewData,
+    validates it against PreviewDataSchema and wraps it in a Flask response.
+    """
+
+    def __init__(self) -> None:
+        pass
+
+    @abc.abstractmethod
+    def get_s3_preview_data(self, *, params: Dict) -> PreviewData:
+        """
+        Returns the data from S3 in PreviewData model format
+
+        params is passed through unchanged from get_preview_data. Any exception raised here is
+        caught by get_preview_data and turned into a 500 response.
+        """
+        pass  # pragma: no cover
+
+    def get_preview_data(self, params: Dict, optionalHeaders: Optional[Dict] = None) -> FlaskResponse:
+        """
+        Fetches the preview data through get_s3_preview_data and returns it as a JSON response.
+
+        Returns a 200 response with {'preview_data': <serialized data>} on success, or a 500 response
+        with {'preview_data': {}} when fetching, serialization or validation fails.
+        optionalHeaders is accepted for interface compatibility and is not used.
+        """
+        try:
+            preview_data = self.get_s3_preview_data(params=params)
+            try:
+                schema = PreviewDataSchema()
+                data = schema.dump(preview_data)
+                schema.load(data)  # for validation only
+                payload = jsonify({'preview_data': data})
+                return make_response(payload, HTTPStatus.OK)
+            except ValidationError as err:
+                logging.error("PreviewDataSchema serialization error " + str(err.messages))
+                return make_response(jsonify({'preview_data': {}}), HTTPStatus.INTERNAL_SERVER_ERROR)
+        except Exception as err:
+            # logging.exception keeps the traceback, which the message text alone would lose
+            logging.exception("error getting s3 preview data %s", err)
+            return make_response(jsonify({'preview_data': {}}), HTTPStatus.INTERNAL_SERVER_ERROR)
+
+    def get_feature_preview_data(self, params: Dict, optionalHeaders: Optional[Dict] = None) -> FlaskResponse:
+        """
+        BaseS3PreviewClient only supports data preview currently but this function needs to be stubbed to
+        implement the BasePreviewClient interface.
+
+        Returns a 501 response with an empty preview_data payload instead of None, so the annotated
+        return type holds for callers.
+        """
+        return make_response(jsonify({'preview_data': {}}), HTTPStatus.NOT_IMPLEMENTED)
diff --git a/frontend/amundsen_application/base/examples/example_s3_json_preview_client.py b/frontend/amundsen_application/base/examples/example_s3_json_preview_client.py
new file mode 100644
index 0000000000..d6d544d1af
--- /dev/null
+++ 
b/frontend/amundsen_application/base/examples/example_s3_json_preview_client.py
@@ -0,0 +1,95 @@
+# Copyright Contributors to the Amundsen project.
+# SPDX-License-Identifier: Apache-2.0
+
+import json
+import os
+from typing import Dict
+
+import boto3
+from amundsen_application.base.base_s3_preview_client import \
+    BaseS3PreviewClient
+from amundsen_application.models.preview_data import ColumnItem, PreviewData
+
+
+class S3JSONPreviewClient(BaseS3PreviewClient):
+    """
+    S3JSONPreviewClient is an S3 Preview Client that:
+    1. Gets JSON files from S3 that are stored in a bucket with keys preview_data/{schema}/{table}.json
+    2. Converts the JSON values to PreviewData model
+    3. Returns the serialized model
+
+    In order for this preview client to work you must:
+    - Have S3 files stored in a bucket with keys 'preview_data/{schema}/{table}.json'
+    - Files are formatted as a list of rows, each row being a map from column name to column value
+        Ex:
+        [
+            {
+                "col1": 1,
+                "col2": "2"
+            },
+            {
+                "col1": 3,
+                "col2": "4"
+            }
+            ...
+        ]
+    - Nested fields are not supported. We suggest flattening your nested fields.
+        Ex:
+        [
+            {
+                "col1": {
+                    "col2": 1
+                }
+            }
+        ]
+        should be:
+        [
+            {
+                "col1.col2": 1
+            }
+        ]
+    - Run your frontend service with an IAM Profile that has s3:GetObject permissions on the 'preview_data/' prefix
+    - Set the PREVIEW_CLIENT_S3_BUCKET environment variable to the bucket that holds the files
+    """
+
+    def __init__(self) -> None:
+        """
+        Reads the bucket name from PREVIEW_CLIENT_S3_BUCKET and creates the boto3 S3 client.
+
+        Raises an Exception when the environment variable is unset or empty.
+        """
+        super().__init__()
+        bucket = os.getenv("PREVIEW_CLIENT_S3_BUCKET")
+        # os.getenv returns None, not "", when the variable is unset, so test truthiness
+        if not bucket:
+            raise Exception("When using the S3JSONPreviewClient you must set the PREVIEW_CLIENT_S3_BUCKET "
+                            "environment variable to point to where your preview_data JSON files are stored.")
+        self.s3 = boto3.client("s3")
+        self.s3_bucket = bucket
+
+    def get_s3_preview_data(self, *, params: Dict) -> PreviewData:
+        """
+        Loads preview_data/{schema}/{table}.json from the configured bucket as PreviewData.
+
+        params must carry the 'schema' and 'tableName' keys. Column names are taken from the
+        first row; a file holding an empty list yields a PreviewData with no columns.
+        Raises an Exception when a key is missing or the object cannot be fetched.
+        """
+        schema = params.get("schema")
+        table = params.get("tableName")
+        if not schema or not table:
+            raise Exception("Both 'schema' and 'tableName' are required to look up S3 preview data.")
+        key = f"preview_data/{schema}/{table}.json"
+
+        try:
+            obj = self.s3.get_object(Bucket=self.s3_bucket, Key=key)
+        except Exception as e:
+            # Chain the cause so the original boto3 traceback is preserved
+            raise Exception(f"Error getting object from s3. {key} Caused by: {e}") from e
+
+        data = json.loads(obj['Body'].read().decode('utf-8'))
+        # An empty list has no first row to read the column names from
+        first_row = data[0] if data else {}
+        # TODO: figure out how to do Type. Is it needed?
+        columns = [ColumnItem(col_name, '') for col_name in first_row]
+        return PreviewData(columns=columns, data=data)
diff --git a/frontend/docs/examples/s3_preview_client.md b/frontend/docs/examples/s3_preview_client.md
new file mode 100644
index 0000000000..3d70e15fcb
--- /dev/null
+++ b/frontend/docs/examples/s3_preview_client.md
@@ -0,0 +1,20 @@
+# Overview
+
+Amundsen's data preview feature requires that developers create a custom implementation of `base_preview_client` for requesting that data. This feature assists with data discovery by providing the end user the option to view a sample of the actual resource data so that they can verify whether or not they want to transition into exploring that data, or continue their search.
+
+[S3](https://aws.amazon.com/s3/) is AWS object storage and is used in this scenario for storing precomputed preview data.
+
+## Implementation
+
+Implement the `base_s3_preview_client` to make a request to AWS using [boto3](https://boto3.amazonaws.com/v1/documentation/api/latest/index.html) to fetch your preview data.
+Your preview data needs to already be stored in S3 for this to work.
+You can take a look at `example_s3_json_preview_client` to see a working implementation that fetches JSON files.
+
+
+## Usage
+
+To use a preview client set these environment variables in your deployment.
+
+- `PREVIEW_CLIENT_ENABLED`: `true`
+- `PREVIEW_CLIENT`: `{python path to preview client class}` (ex: `amundsen_application.base.examples.example_s3_json_preview_client.S3JSONPreviewClient` if you are using the JSON example client)
+- `PREVIEW_CLIENT_S3_BUCKET`: `{S3 bucket where the preview data is stored}`
\ No newline at end of file