diff --git a/frontend/amundsen_application/base/base_s3_preview_client.py b/frontend/amundsen_application/base/base_s3_preview_client.py
new file mode 100644
index 0000000000..98552b2525
--- /dev/null
+++ b/frontend/amundsen_application/base/base_s3_preview_client.py
@@ -0,0 +1,67 @@
+# Copyright Contributors to the Amundsen project.
+# SPDX-License-Identifier: Apache-2.0
+
+import abc
+import logging
+from http import HTTPStatus
+from typing import Dict, Optional
+
+from amundsen_application.base.base_preview_client import BasePreviewClient
+from amundsen_application.models.preview_data import (PreviewData,
+                                                      PreviewDataSchema)
+from flask import Response as FlaskResponse
+from flask import jsonify, make_response
+from marshmallow import ValidationError
+
+
+class BaseS3PreviewClient(BasePreviewClient):
+    """
+    Base class for preview clients that serve preview data stored in S3.
+
+    Subclasses implement get_s3_preview_data; get_preview_data wraps its result in a Flask JSON response.
+    """
+
+    def __init__(self) -> None:
+        pass
+
+    @abc.abstractmethod
+    def get_s3_preview_data(self, *, params: Dict) -> PreviewData:
+        """
+        Returns the data from S3 in PreviewData model format
+        """
+        pass  # pragma: no cover
+
+    def get_preview_data(self, params: Dict, optionalHeaders: Optional[Dict] = None) -> FlaskResponse:
+        """
+        Returns the S3 preview data as a JSON response of the form {'preview_data': ...}.
+
+        Responds with HTTP 200 on success. If fetching or schema validation fails, the error is logged and
+        an empty preview is returned with HTTP 500.
+
+        :param params: request parameters, passed through unchanged to get_s3_preview_data
+        :param optionalHeaders: accepted but not used by this client
+        """
+        try:
+            preview_data = self.get_s3_preview_data(params=params)
+            try:
+                data = PreviewDataSchema().dump(preview_data)
+                PreviewDataSchema().load(data)  # for validation only
+                payload = jsonify({'preview_data': data})
+                return make_response(payload, HTTPStatus.OK)
+            except ValidationError as err:
+                logging.error("PreviewDataSchema serialization error %s", err.messages)
+                return make_response(jsonify({'preview_data': {}}), HTTPStatus.INTERNAL_SERVER_ERROR)
+        except Exception as err:
+            # Request boundary: report any backend failure as a JSON 500 instead of letting it propagate.
+            logging.error("error getting s3 preview data %s", err)
+            return make_response(jsonify({'preview_data': {}}), HTTPStatus.INTERNAL_SERVER_ERROR)
+
+    def get_feature_preview_data(self, params: Dict, optionalHeaders: Optional[Dict] = None) -> FlaskResponse:
+        """
+        BaseS3PreviewClient only supports data preview currently but this function needs to be stubbed to
+        implement the BasePreviewClient interface
+
+        NOTE(review): the stub returns None although the annotation promises a FlaskResponse; confirm that no
+        caller invokes it for S3-backed previews.
+        """
+        pass
diff --git a/frontend/amundsen_application/base/examples/example_s3_json_preview_client.py b/frontend/amundsen_application/base/examples/example_s3_json_preview_client.py
new file mode 100644
index 0000000000..d6d544d1af
--- /dev/null
+++ b/frontend/amundsen_application/base/examples/example_s3_json_preview_client.py
@@ -0,0 +1,85 @@
+# Copyright Contributors to the Amundsen project.
+# SPDX-License-Identifier: Apache-2.0
+
+import json
+import os
+from typing import Dict
+
+import boto3
+from amundsen_application.base.base_s3_preview_client import \
+    BaseS3PreviewClient
+from amundsen_application.models.preview_data import ColumnItem, PreviewData
+
+
+class S3JSONPreviewClient(BaseS3PreviewClient):
+    """
+    S3JSONPreviewClient is an S3 Preview Client that:
+    1. Gets JSON files from S3 that are stored in a bucket with keys preview_data/{schema}/{table}.json
+    2. Converts the JSON values to PreviewData model
+    3. Returns the serialized model
+
+    In order for this preview client to work you must:
+    - Set the PREVIEW_CLIENT_S3_BUCKET environment variable to the bucket that holds the preview files
+    - Have S3 files stored in a bucket with keys 'preview_data/{schema}/{table}.json'
+    - Files are formatted as list of rows as map with key being the column name and value being column value
+        Ex:
+        [
+            {
+                'col1': 1,
+                'col2': '2'
+            },
+            {
+                'col1': 3,
+                'col2': '4'
+            }
+            ...
+        ]
+    - Nested fields are not supported. We suggest flattening your nested fields.
+        Ex:
+        [
+            {
+                'col1': {
+                    'col2': 1
+                }
+            }
+        ]
+        should be:
+        [
+            {
+                'col1.col2': 1
+            }
+        ]
+    - Run your frontend service with an IAM Profile that has s3:GetObject permissions on the 'preview_data/' prefix
+    """
+
+    def __init__(self) -> None:
+        self.s3 = boto3.client("s3")
+        bucket = os.getenv("PREVIEW_CLIENT_S3_BUCKET")
+        # os.getenv returns None when the variable is unset, so reject both None and the empty string.
+        if not bucket:
+            raise Exception("When using the S3JSONPreviewClient you must set the PREVIEW_CLIENT_S3_BUCKET "
+                            "environment variable to point to where your preview_data JSON files are stored.")
+        self.s3_bucket = bucket
+
+    def get_s3_preview_data(self, *, params: Dict) -> PreviewData:
+        """
+        Loads preview_data/{schema}/{table}.json from the configured bucket and converts it to PreviewData.
+
+        :param params: its 'schema' and 'tableName' entries select the S3 key to read
+        :return: PreviewData with one ColumnItem per key of the first row; no columns if the file has no rows
+        :raises Exception: if the object cannot be fetched from S3
+        """
+        schema = params.get("schema")
+        table = params.get("tableName")
+        key = f"preview_data/{schema}/{table}.json"
+
+        try:
+            obj = self.s3.get_object(Bucket=self.s3_bucket, Key=key)
+        except Exception as e:
+            raise Exception(f"Error getting object from s3. {key} Caused by: {e}") from e
+
+        data = json.loads(obj['Body'].read().decode('utf-8'))
+        # Column names come from the first row; an empty list yields an empty preview instead of an IndexError.
+        # TODO: figure out how to do Type. Is it needed?
+        columns = [ColumnItem(col_name, '') for col_name in data[0]] if data else []
+        return PreviewData(columns=columns, data=data)
diff --git a/frontend/docs/examples/s3_preview_client.md b/frontend/docs/examples/s3_preview_client.md
new file mode 100644
index 0000000000..3d70e15fcb
--- /dev/null
+++ b/frontend/docs/examples/s3_preview_client.md
@@ -0,0 +1,20 @@
+# Overview
+
+Amundsen's data preview feature requires that developers create a custom implementation of `base_preview_client` for requesting that data. This feature assists with data discovery by providing the end user the option to view a sample of the actual resource data so that they can verify whether or not they want to transition into exploring that data, or continue their search.
+
+[S3](https://aws.amazon.com/s3/) is AWS object storage and is used in this scenario for storing precomputed preview data.
+
+## Implementation
+
+Implement the `base_s3_preview_client` to make a request to AWS using [boto3](https://boto3.amazonaws.com/v1/documentation/api/latest/index.html) to fetch your preview data.
+Your preview data needs to already be stored in S3 for this to work.
+You can take a look at `example_s3_json_preview_client` to see a working implementation that fetches JSON files.
+
+
+## Usage
+
+To use a preview client set these environment variables in your deployment.
+
+- `PREVIEW_CLIENT_ENABLED`: `true`
+- `PREVIEW_CLIENT`: `{python path to preview client class}` (ex: `amundsen_application.base.examples.example_s3_json_preview_client.S3JSONPreviewClient` if you are using the JSON example client)
+- `PREVIEW_CLIENT_S3_BUCKET`: `{S3 bucket where the preview data is stored}`
\ No newline at end of file