From a09232c1e234e78ef7646eb0a67c16c62ed5c608 Mon Sep 17 00:00:00 2001 From: Phil Snyder Date: Tue, 3 Jan 2023 15:02:37 -0800 Subject: [PATCH 1/4] Refactor upload-and-deploy github workflow --- .github/workflows/upload-and-deploy.yaml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/.github/workflows/upload-and-deploy.yaml b/.github/workflows/upload-and-deploy.yaml index a971a20..b215459 100755 --- a/.github/workflows/upload-and-deploy.yaml +++ b/.github/workflows/upload-and-deploy.yaml @@ -43,7 +43,7 @@ jobs: run: echo "NAMESPACE=$GITHUB_REF_NAME" >> $GITHUB_ENV - name: Copy files to templates bucket - run: python src/scripts/manage_artifacts/artifacts.py --upload --ref $GITHUB_REF_NAME + run: python src/scripts/manage_artifacts/artifacts.py --upload --ref $NAMESPACE pytest-docker: name: Build and push testing docker image to ECR @@ -127,8 +127,6 @@ jobs: name: Deploy branch using sceptre runs-on: ubuntu-latest needs: upload-files - env: - NAMESPACE: $GITHUB_REF_NAME if: github.ref_name != 'main' && github.ref_type != 'tag' environment: develop @@ -144,7 +142,7 @@ jobs: run: mkdir -p templates/remote/ - name: Deploy sceptre stacks - run: pipenv run sceptre --var "namespace=${{ env.NAMESPACE }}" launch develop --yes + run: pipenv run sceptre --var "namespace=$GITHUB_REF_NAME" launch develop --yes integration-test: name: Deploys test events from the workflow From 2a5c39e9549c18bf3bb5b1ab53cf4f5bc38f4a47 Mon Sep 17 00:00:00 2001 From: Phil Snyder Date: Tue, 3 Jan 2023 15:03:20 -0800 Subject: [PATCH 2/4] Ignore specific non-severe Android validation errors --- src/glue/jobs/s3_to_json_s3.py | 62 ++++++++++++++++++++++++++++++++-- 1 file changed, 59 insertions(+), 3 deletions(-) diff --git a/src/glue/jobs/s3_to_json_s3.py b/src/glue/jobs/s3_to_json_s3.py index 16fa5fd..4e72e9d 100644 --- a/src/glue/jobs/s3_to_json_s3.py +++ b/src/glue/jobs/s3_to_json_s3.py @@ -271,8 +271,8 @@ def validate_data(s3_obj, archive_map, json_schemas, dataset_mapping): * assessmentRevision (str) * appId (str) * recordId (str) - * errors (dict): mapping file names to their validation errors. - See `validate_against_schema` for format. + * errors (dict): mapping file names (str) to their + validation errors (list[str]). """ logger = logging.getLogger(__name__) assessment_id = s3_obj["Metadata"]["assessmentid"] @@ -338,6 +338,57 @@ def validate_against_schema(data, schema): all_errors = [e.message for e in validator.iter_errors(data)] return all_errors +def is_expected_validation_error(validation_result, client_info): + """ + In the first year of MTB there were a number of issues with Android + data not conforming to the JSON Schema. These aren't very severe + inconsistincies (mostly missing or superfluous properties), but they + prevent almost all Android data from being processed. This function + checks whether the validation errors fall into this non-severe category. + See ETL-312 for more details. + + Args: + validation_result (dict): A dictionary containing keys + * assessmentId (str) + * assessmentRevision (str) + * appId (str) + * recordId (str) + * errors (dict): mapping file names (str) to their + validation errors (list[str]). + client_info (str): A JSON blob containing the client info from the + relevant S3 object's metadata. + + Returns: + bool: Whether the validation errors match expected, non-severe errors. + """ + if not validation_result["errors"]: + return False + else: + if "Android" not in client_info: + return False + if "metadata.json" in validation_result["errors"]: + metadata_errors = validation_result["errors"]["metadata.json"] + if not (len(metadata_errors) == 2 and + "'appName' is a required property" in metadata_errors and + "'files' is a required property" in metadata_errors): + return False + if "taskData.json" in validation_result["errors"]: + taskdata_errors = validation_result["errors"]["taskData.json"] + if not taskdata_errors == [ + "Additional properties are not allowed ('type' was unexpected)"]: + return False + if "weather.json" in validation_result["errors"]: + weather_errors = validation_result["errors"]["weather.json"] + if not weather_errors == [ + "'type' is a required property"]: + return False + if "motion.json" in validation_result["errors"]: + motion_errors = validation_result["errors"]["motion.json"] + if not motion_errors == [ + "'type' is a required property"]: + return False + return True + def get_dataset_identifier(json_schema, schema_mapping, dataset_mapping, file_metadata): """ Get a dataset identifier for a file. @@ -615,7 +666,12 @@ def main(): json_schemas=json_schemas, dataset_mapping=dataset_mapping ) - if len(validation_result["errors"]) > 0: + expected_validation_error = is_expected_validation_error( + validation_result=validation_result, + client_info=s3_obj["Metadata"]["clientinfo"]) + if validation_result["errors"] and expected_validation_error: + validation_result["errors"] = {} + if validation_result["errors"]: for file_name in validation_result["errors"]: # limit 10 errors reported per file to avoid redundandant errors validation_result["errors"][file_name] = \ From fb09f1254daaca6e8f28471d274564c58991239c Mon Sep 17 00:00:00 2001 From: Phil Snyder Date: Tue, 3 Jan 2023 15:59:37 -0800 Subject: [PATCH 3/4] Minor refactor and add test for is_expected_validation_error --- src/glue/jobs/s3_to_json_s3.py | 47 ++++++++++++------------ tests/test_s3_to_json_s3.py | 65 ++++++++++++++++++++++++++++++++++ 2 files changed, 89 insertions(+), 23 deletions(-) diff --git a/src/glue/jobs/s3_to_json_s3.py b/src/glue/jobs/s3_to_json_s3.py index 4e72e9d..4ec3c9e 100644 --- a/src/glue/jobs/s3_to_json_s3.py +++ b/src/glue/jobs/s3_to_json_s3.py @@ -363,30 +363,31 @@ def is_expected_validation_error(validation_result, client_info): """ if not validation_result["errors"]: return False - else: - if "Android" not in client_info: + if validation_result["appId"] != "mobile-toolbox": + return False + if "Android" not in client_info: + return False + if "metadata.json" in validation_result["errors"]: + metadata_errors = validation_result["errors"]["metadata.json"] + if not (len(metadata_errors) == 2 and + "'appName' is a required property" in metadata_errors and + "'files' is a required property" in metadata_errors): + return False + if "taskData.json" in validation_result["errors"]: + taskdata_errors = validation_result["errors"]["taskData.json"] + if not taskdata_errors == [ + "Additional properties are not allowed ('type' was unexpected)"]: + return False + if "weather.json" in validation_result["errors"]: + weather_errors = validation_result["errors"]["weather.json"] + if not weather_errors == [ + "'type' is a required property"]: + return False + if "motion.json" in validation_result["errors"]: + motion_errors = validation_result["errors"]["motion.json"] + if not motion_errors == [ + "'type' is a required property"]: return False - if "metadata.json" in validation_result["errors"]: - metadata_errors = validation_result["errors"]["metadata.json"] - if not (len(metadata_errors) == 2 and - "'appName' is a required property" in metadata_errors and - "'files' is a required property" in metadata_errors): - return False - if "taskData.json" in validation_result["errors"]: - taskdata_errors = validation_result["errors"]["taskData.json"] - if not taskdata_errors == [ - "Additional properties are not allowed ('type' was unexpected)"]: - return False - if "weather.json" in validation_result["errors"]: - weather_errors = validation_result["errors"]["weather.json"] - if not weather_errors == [ - "'type' is a required property"]: - return False - if "motion.json" in validation_result["errors"]: - motion_errors = validation_result["errors"]["motion.json"] - if not motion_errors == [ - "'type' is a required property"]: - return False return True def get_dataset_identifier(json_schema, schema_mapping, dataset_mapping, file_metadata): diff --git a/tests/test_s3_to_json_s3.py b/tests/test_s3_to_json_s3.py index 6c1a35e..490a17a 100644 --- a/tests/test_s3_to_json_s3.py +++ b/tests/test_s3_to_json_s3.py @@ -533,6 +533,71 @@ def test_validate_data(self, s3_obj, archive_map, dataset_mapping): assert key in validation_result assert isinstance(validation_result["errors"], dict) + def test_is_expected_validation_error(self): + validation_result = { + "assessmentId": "flanker", + "assessmentRevision": "5", + "appId": "mobile-toolbox", + "recordId": "123456789", + "errors": {} + } + client_info = "{osName:'Android'}" + # test not validation_result["errors"] + assert not s3_to_json_s3.is_expected_validation_error( + validation_result=validation_result, + client_info=client_info) + # test "Android" not in client_info: + assert not s3_to_json_s3.is_expected_validation_error( + validation_result=validation_result, + client_info="{osName:'iOS'}") + # test validation_result["appId"] != "mobile-toolbox": + assert not s3_to_json_s3.is_expected_validation_error( + validation_result = { + "assessmentId": "flanker", + "assessmentRevision": "5", + "appId": "example-app", + "recordId": "123456789", + "errors": {} + }, + client_info=client_info) + # test metadata.json + validation_result["errors"] = { + "metadata.json": [ + "'appName' is a required property", + "'files' is a required property" + ] + } + assert s3_to_json_s3.is_expected_validation_error( + validation_result=validation_result, + client_info=client_info) + # test taskData.json + validation_result["errors"] = { + "taskData.json": [ + "Additional properties are not allowed ('type' was unexpected)" + ] + } + assert s3_to_json_s3.is_expected_validation_error( + validation_result=validation_result, + client_info=client_info) + # test weather.json + validation_result["errors"] = { + "weather.json": [ + "'type' is a required property" + ] + } + assert s3_to_json_s3.is_expected_validation_error( + validation_result=validation_result, + client_info=client_info) + # test motion.json + validation_result["errors"] = { + "motion.json": [ + "'type' is a required property" + ] + } + assert s3_to_json_s3.is_expected_validation_error( + validation_result=validation_result, + client_info=client_info) + def test_write_metadata_file_to_json_dataset(self, s3_obj, namespace, monkeypatch): monkeypatch.setattr("boto3.client", lambda x : MockAWSClient()) workflow_run_properties = { From 50ab23f1a25f2825631f3780d02e5f11426c7be1 Mon Sep 17 00:00:00 2001 From: Phil Snyder Date: Wed, 4 Jan 2023 12:31:46 -0800 Subject: [PATCH 4/4] Minor syntax edit --- src/glue/jobs/s3_to_json_s3.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/glue/jobs/s3_to_json_s3.py b/src/glue/jobs/s3_to_json_s3.py index 4ec3c9e..426b839 100644 --- a/src/glue/jobs/s3_to_json_s3.py +++ b/src/glue/jobs/s3_to_json_s3.py @@ -375,17 +375,17 @@ def is_expected_validation_error(validation_result, client_info): return False if "taskData.json" in validation_result["errors"]: taskdata_errors = validation_result["errors"]["taskData.json"] - if not taskdata_errors == [ + if taskdata_errors != [ "Additional properties are not allowed ('type' was unexpected)"]: return False if "weather.json" in validation_result["errors"]: weather_errors = validation_result["errors"]["weather.json"] - if not weather_errors == [ + if weather_errors != [ "'type' is a required property"]: return False if "motion.json" in validation_result["errors"]: motion_errors = validation_result["errors"]["motion.json"] - if not motion_errors == [ + if motion_errors != [ "'type' is a required property"]: return False return True