Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ETL-312] Ignore specific non-severe Android validation errors #112

Merged
merged 4 commits into from
Jan 4, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 2 additions & 4 deletions .github/workflows/upload-and-deploy.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ jobs:
run: echo "NAMESPACE=$GITHUB_REF_NAME" >> $GITHUB_ENV

- name: Copy files to templates bucket
run: python src/scripts/manage_artifacts/artifacts.py --upload --ref $GITHUB_REF_NAME
run: python src/scripts/manage_artifacts/artifacts.py --upload --ref $NAMESPACE

pytest-docker:
name: Build and push testing docker image to ECR
Expand Down Expand Up @@ -127,8 +127,6 @@ jobs:
name: Deploy branch using sceptre
runs-on: ubuntu-latest
needs: upload-files
env:
NAMESPACE: $GITHUB_REF_NAME
if: github.ref_name != 'main' && github.ref_type != 'tag'
environment: develop

Expand All @@ -144,7 +142,7 @@ jobs:
run: mkdir -p templates/remote/

- name: Deploy sceptre stacks
run: pipenv run sceptre --var "namespace=${{ env.NAMESPACE }}" launch develop --yes
run: pipenv run sceptre --var "namespace=$GITHUB_REF_NAME" launch develop --yes

integration-test:
name: Deploys test events from the workflow
Expand Down
63 changes: 60 additions & 3 deletions src/glue/jobs/s3_to_json_s3.py
Original file line number Diff line number Diff line change
Expand Up @@ -271,8 +271,8 @@ def validate_data(s3_obj, archive_map, json_schemas, dataset_mapping):
* assessmentRevision (str)
* appId (str)
* recordId (str)
* errors (dict): mapping file names to their validation errors.
See `validate_against_schema` for format.
* errors (dict): mapping file names (str) to their
validation errors (list[str]).
"""
logger = logging.getLogger(__name__)
assessment_id = s3_obj["Metadata"]["assessmentid"]
Expand Down Expand Up @@ -338,6 +338,58 @@ def validate_against_schema(data, schema):
all_errors = [e.message for e in validator.iter_errors(data)]
return all_errors

def is_expected_validation_error(validation_result, client_info):
    """
    Check whether validation errors match known, non-severe Android issues.

    During the first year of MTB, Android data frequently failed JSON Schema
    validation for benign reasons (mostly missing or superfluous properties),
    which blocked nearly all Android data from being processed. This function
    reports whether every validation error in ``validation_result`` falls
    into that known, non-severe category. See ETL-312 for details.

    Args:
        validation_result (dict): A dictionary containing keys
            * assessmentId (str)
            * assessmentRevision (str)
            * appId (str)
            * recordId (str)
            * errors (dict): mapping file names (str) to their
              validation errors (list[str]).
        client_info (str): A JSON blob containing the client info from the
            relevant S3 object's metadata.

    Returns:
        bool: Whether the validation errors match expected, non-severe errors.
    """
    errors = validation_result["errors"]
    # A record with no errors is not an "expected error" record at all.
    if not errors:
        return False
    # Only mobile-toolbox Android data exhibits these known issues.
    if validation_result["appId"] != "mobile-toolbox":
        return False
    if "Android" not in client_info:
        return False

    # metadata.json must have exactly these two errors (any order).
    metadata_errors = errors.get("metadata.json")
    if metadata_errors is not None:
        required = {
            "'appName' is a required property",
            "'files' is a required property",
        }
        if len(metadata_errors) != 2 or not required.issubset(metadata_errors):
            return False

    # These files must match their single known error exactly.
    known_exact_errors = {
        "taskData.json": [
            "Additional properties are not allowed ('type' was unexpected)"],
        "weather.json": ["'type' is a required property"],
        "motion.json": ["'type' is a required property"],
    }
    for file_name, expected in known_exact_errors.items():
        if file_name in errors and errors[file_name] != expected:
            return False
    return True

def get_dataset_identifier(json_schema, schema_mapping, dataset_mapping, file_metadata):
"""
Get a dataset identifier for a file.
Expand Down Expand Up @@ -615,7 +667,12 @@ def main():
json_schemas=json_schemas,
dataset_mapping=dataset_mapping
)
if len(validation_result["errors"]) > 0:
expected_validation_error = is_expected_validation_error(
validation_result=validation_result,
client_info=s3_obj["Metadata"]["clientinfo"])
if validation_result["errors"] and expected_validation_error:
validation_result["errors"] = {}
if validation_result["errors"]:
for file_name in validation_result["errors"]:
# limit 10 errors reported per file to avoid redundant errors
validation_result["errors"][file_name] = \
Expand Down
65 changes: 65 additions & 0 deletions tests/test_s3_to_json_s3.py
Original file line number Diff line number Diff line change
Expand Up @@ -533,6 +533,71 @@ def test_validate_data(self, s3_obj, archive_map, dataset_mapping):
assert key in validation_result
assert isinstance(validation_result["errors"], dict)

def test_is_expected_validation_error(self):
    """
    Exercise each guard clause and each per-file error check of
    `is_expected_validation_error` in isolation.

    Note: the negative cases for `appId` and `client_info` must carry a
    set of otherwise-expected errors; with an empty `errors` dict the
    function returns False at its first check and those branches would
    never actually be exercised.
    """
    validation_result = {
        "assessmentId": "flanker",
        "assessmentRevision": "5",
        "appId": "mobile-toolbox",
        "recordId": "123456789",
        "errors": {}
    }
    client_info = "{osName:'Android'}"
    # Known non-severe errors, used to isolate the guard-clause checks below.
    expected_metadata_errors = {
        "metadata.json": [
            "'appName' is a required property",
            "'files' is a required property"
        ]
    }
    # test not validation_result["errors"]
    assert not s3_to_json_s3.is_expected_validation_error(
        validation_result=validation_result,
        client_info=client_info)
    # test "Android" not in client_info (errors are otherwise expected)
    assert not s3_to_json_s3.is_expected_validation_error(
        validation_result={
            **validation_result,
            "errors": expected_metadata_errors
        },
        client_info="{osName:'iOS'}")
    # test validation_result["appId"] != "mobile-toolbox"
    # (errors are otherwise expected)
    assert not s3_to_json_s3.is_expected_validation_error(
        validation_result={
            **validation_result,
            "appId": "example-app",
            "errors": expected_metadata_errors
        },
        client_info=client_info)
    # test metadata.json
    validation_result["errors"] = expected_metadata_errors
    assert s3_to_json_s3.is_expected_validation_error(
        validation_result=validation_result,
        client_info=client_info)
    # test taskData.json
    validation_result["errors"] = {
        "taskData.json": [
            "Additional properties are not allowed ('type' was unexpected)"
        ]
    }
    assert s3_to_json_s3.is_expected_validation_error(
        validation_result=validation_result,
        client_info=client_info)
    # test weather.json
    validation_result["errors"] = {
        "weather.json": [
            "'type' is a required property"
        ]
    }
    assert s3_to_json_s3.is_expected_validation_error(
        validation_result=validation_result,
        client_info=client_info)
    # test motion.json
    validation_result["errors"] = {
        "motion.json": [
            "'type' is a required property"
        ]
    }
    assert s3_to_json_s3.is_expected_validation_error(
        validation_result=validation_result,
        client_info=client_info)
    # test an unexpected (severe) error is not ignored
    validation_result["errors"] = {
        "motion.json": [
            "'items' is a required property"
        ]
    }
    assert not s3_to_json_s3.is_expected_validation_error(
        validation_result=validation_result,
        client_info=client_info)

def test_write_metadata_file_to_json_dataset(self, s3_obj, namespace, monkeypatch):
monkeypatch.setattr("boto3.client", lambda x : MockAWSClient())
workflow_run_properties = {
Expand Down