Skip to content

Commit

Permalink
Add task for VAT validation #3 (#7360)
Browse files Browse the repository at this point in the history
  • Loading branch information
rsasch authored Jul 21, 2021
1 parent 1592e65 commit d336359
Showing 1 changed file with 71 additions and 1 deletion.
72 changes: 71 additions & 1 deletion scripts/variantstore/wdl/GvsValidateVAT.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,19 @@ workflow GvsValidateVatTable {
last_modified_timestamp = GetBQTableLastModifiedDatetime.last_modified_timestamp
}

call SpotCheckForExpectedTranscripts {
input:
query_project_id = query_project_id,
fq_vat_table = fq_vat_table,
service_account_json_path = service_account_json_path,
last_modified_timestamp = GetBQTableLastModifiedDatetime.last_modified_timestamp
}

# the results of all validation checks are gathered into this workflow output, in the format
# [{ValidationRule1: "PASS/FAIL Extra info from this test"},
# {ValidationRule2: "PASS/FAIL Extra info from this test"}]
output {
Array[Map[String, String]] validation_results = [EnsureVatTableHasVariants.result]
Array[Map[String, String]] validation_results = [EnsureVatTableHasVariants.result, SpotCheckForExpectedTranscripts.result]
}
}

Expand Down Expand Up @@ -85,6 +93,68 @@ task EnsureVatTableHasVariants {
}
}

task SpotCheckForExpectedTranscripts {
    # Spot check of the VAT table: queries a fixed window on chr19 (positions 35740407-35740469)
    # for rows whose gene_symbol is not one of the two expected values ("IGFLR1", "AD000671.2"),
    # ignoring upstream/downstream gene variant consequences. Any row returned is reported as a
    # failure; an empty result is reported as a pass.
    input {
        # Project used to run the BigQuery query (and set as the gcloud project when a service account is used).
        String query_project_id
        # Fully-qualified name of the VAT table, interpolated directly into the FROM clause of the query.
        String fq_vat_table
        # Optional path to a service account key; when defined it is copied locally with gsutil and activated.
        String? service_account_json_path
        # Not referenced by the command below.
        # NOTE(review): presumably supplied so that a change to the table invalidates the call cache for this task — confirm.
        String last_modified_timestamp
    }

    String has_service_account_file = if (defined(service_account_json_path)) then 'true' else 'false'

    command <<<
        # Stop at the first failing command. Without this, a failed authentication or a failed
        # `bq query` would still reach the line count below, so the task would emit a PASS/FAIL
        # verdict for a query that never ran successfully.
        set -e

        if [ ~{has_service_account_file} = 'true' ]; then
            gsutil cp ~{service_account_json_path} local.service_account.json
            gcloud auth activate-service-account --key-file=local.service_account.json
            gcloud config set project ~{query_project_id}
        fi
        echo "project_id = ~{query_project_id}" > ~/.bigqueryrc

        bq query --nouse_legacy_sql --project_id=~{query_project_id} --format=csv 'SELECT
            contig,
            position,
            vid,
            gene_symbol,
            variant_consequence
        FROM
            ~{fq_vat_table},
            UNNEST(consequence) AS variant_consequence
        WHERE
            contig = "chr19" AND
            position >= 35740407 AND
            position <= 35740469 AND
            variant_consequence NOT IN ("downstream_gene_variant","upstream_gene_variant") AND
            gene_symbol NOT IN ("IGFLR1","AD000671.2")' > bq_query_output.csv

        # get number of lines in bq query output
        NUMRESULTS=$(awk 'END{print NR}' bq_query_output.csv)

        # if the result of the query has any rows, that means there were unexpected transcripts at the
        # specified location, so report those back in the output
        if [[ $NUMRESULTS = "0" ]]; then
            echo "PASS: The VAT table ~{fq_vat_table} only has the expected transcripts at the tested location ('IGFLR1' and 'AD000671.2' in chromosome 19, between positions 35,740,407 - 35,740,469)." > validation_results.txt
        else
            echo "FAIL: The VAT table ~{fq_vat_table} had unexpected transcripts at the tested location: [csv output follows] " > validation_results.txt
            cat bq_query_output.csv >> validation_results.txt
        fi
    >>>
    # ------------------------------------------------
    # Runtime settings:
    runtime {
        docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:305.0.0"
        memory: "1 GB"
        preemptible: 3
        cpu: "1"
        disks: "local-disk 100 HDD"
    }
    # ------------------------------------------------
    # Output: {"Name of validation rule": "PASS/FAIL plus additional validation results"}
    output {
        Map[String, String] result = {"SpotCheckForExpectedTranscripts": read_string('validation_results.txt')}
    }
}

task GetBQTableLastModifiedDatetime {
# because this is being used to determine if the data has changed, never use call cache
meta {
Expand Down

0 comments on commit d336359

Please sign in to comment.