diff --git a/scripts/variantstore/callset_QC/README.md b/scripts/variantstore/callset_QC/README.md index 89c6981b316..576beacc696 100644 --- a/scripts/variantstore/callset_QC/README.md +++ b/scripts/variantstore/callset_QC/README.md @@ -1,20 +1,31 @@ # Sample set QC -Sample set qc is performed after a callset has been extracted to validate the quality of the callset with respect to the VQSR filter set used. Most of these steps are manual at this point and are executed in the BQ console. +Sample set qc is performed after a callset has been extracted to validate the quality of the callset with respect to the VQSR filter set used. Most of these steps are manual at this point and are executed in the BQ console. + +**NOTE:** You will need permission to query table `spec-ops-aou:gvs_public_reference_data.gnomad_v3_sites`. ## Create a `vet_all` view We need to create a view that contains all the vet data. Run - bq mk --project_id= --use_legacy_sql=false --view .vet_all + bq mk --project_id= --use_legacy_sql=false --view ._vet_all + +where `` is of the form: -Where the query is of the form: + "SELECT * + FROM ( + SELECT * FROM .vet_001 UNION ALL + SELECT * FROM .vet_002 UNION ALL + SELECT * FROM .vet_003) unioned_vets + WHERE + unioned_vets.sample_id IN + (SELECT sample_id FROM .__SAMPLES)" - select * from .vet_001 union all select * from .vet_002 union all select * from .vet_003 +and `prefix` is the "cohort_extract_table_prefix" from `GvsExtractCallset` and there is a `SELECT * FROM .vet_` in the `` for every `vet_` table from the sample set. ## Create the sample_metrics table - Copy the contents of `filter_set_samples_create_metrics.example.sql` to the BQ console -- Replace `$FQ_DATASET` with `.` +- Replace `$FQ_PREFIX` with `..` - Replace `$NAME_OF_FILTER_SET` with the name of the filter set you used to filter the callset - Execute the query @@ -23,8 +34,10 @@ This will currently create/overwrite the sample_metrics table in the dataset. (T ## Evaluate and extract the qc data - Copy the contents of `filter_set_samples_calculate_threshold.example.sql` to the BQ console -- Replace `$FQ_DATASET` with `.` +- Replace `$FQ_PREFIX` with `..` - Replace `$NAME_OF_FILTER_SET` with the name of the filter set you used to filter the callset - Execute the query -- Click save results to export the results +- Click the SAVE RESULTS button to export the results, or you can save the query as a SQL file and then run this command locally to get a CSV file in your local environment: + + bq --project_id= --use_legacy_sql=false query --format=csv --use_legacy_sql=false "$(cat )" > diff --git a/scripts/variantstore/callset_QC/filter_set_samples_calculate_threshold.example.sql b/scripts/variantstore/callset_QC/filter_set_samples_calculate_threshold.example.sql index 96745454a0f..ff5f696a93b 100644 --- a/scripts/variantstore/callset_QC/filter_set_samples_calculate_threshold.example.sql +++ b/scripts/variantstore/callset_QC/filter_set_samples_calculate_threshold.example.sql @@ -4,7 +4,7 @@ WITH fss AS ( (ti_count / tv_count) as ti_tv_ratio, (snp_het_count / snp_homvar_count) snp_het_homvar_ratio, (indel_het_count / indel_homvar_count) as indel_het_homvar_ratio - FROM `$FQ_DATASET.sample_metrics` + FROM `$FQ_PREFIX_sample_metrics` WHERE filter_set_name = '$NAME_OF_FILTER_SET'), medians AS ( SELECT @@ -58,7 +58,7 @@ SELECT indel_het_homvar_ratio, m_indel_het_homvar_ratio, mad_indel_het_homvar_ratio, CASE WHEN indel_het_homvar_ratio BETWEEN m_indel_het_homvar_ratio - 4*mad_indel_het_homvar_ratio AND m_indel_het_homvar_ratio + 4*mad_indel_het_homvar_ratio THEN true ELSE false END pass_indel_het_homvar_ratio, FROM fss -JOIN `$FQ_DATASET.sample_info` si ON (fss.sample_id = si.sample_id) +JOIN `$FQ_PREFIX__SAMPLES` si ON (fss.sample_id = si.sample_id) CROSS JOIN medians CROSS JOIN mads -order by 1 \ No newline at end of file +order by 1 diff --git a/scripts/variantstore/callset_QC/filter_set_samples_create_metrics.example.sql b/scripts/variantstore/callset_QC/filter_set_samples_create_metrics.example.sql index 27e8a6deddc..215386a98c7 100644 --- a/scripts/variantstore/callset_QC/filter_set_samples_create_metrics.example.sql +++ b/scripts/variantstore/callset_QC/filter_set_samples_create_metrics.example.sql @@ -1,20 +1,20 @@ CREATE TEMPORARY FUNCTION titv(ref STRING, allele STRING) -RETURNS STRING +RETURNS STRING LANGUAGE js AS """ if ( ref.length > 1 || allele.length > 1) { return "other"; } else if ( (ref == "A" && allele == "G") || - (ref == "G" && allele == "A") || - (ref == "C" && allele == "T") || + (ref == "G" && allele == "A") || + (ref == "C" && allele == "T") || (ref == "T" && allele == "C") ) { - return "ti"; + return "ti"; } else { return "tv"; - } + } """; - + CREATE TEMPORARY FUNCTION type(ref STRING, allele STRING, gt_str STRING) -RETURNS STRING +RETURNS STRING LANGUAGE js AS """ alts = allele.split(",") @@ -37,20 +37,20 @@ if (alt_lengths.size > 1) { return "del" } else if (ref.length < al) { return "ins" - } else { + } else { return "other" - } -} + } +} """; --- TODO Do we eventually want to support multiple filter sets in this table? +-- TODO Do we eventually want to support multiple filter sets in this table? -- If so have separate 'create table if not exists' statement and make this an insert -CREATE OR REPLACE TABLE `$FQ_DATASET.sample_metrics` AS +CREATE OR REPLACE TABLE `$FQ_PREFIX_sample_metrics` AS SELECT "$NAME_OF_FILTER_SET" filter_set_name, - sample_id, + sample_id, count(1) variant_entries, SUM(CASE WHEN type = "del" THEN 1 ELSE 0 END) del_count, - SUM(CASE WHEN type = "ins" THEN 1 ELSE 0 END) ins_count, + SUM(CASE WHEN type = "ins" THEN 1 ELSE 0 END) ins_count, SUM(CASE WHEN type = "snp" THEN 1 ELSE 0 END) snp_count, SUM(CASE WHEN type = "snp" AND titv = "ti" THEN 1 ELSE 0 END) ti_count, # TODO: minimize alleles SUM(CASE WHEN type = "snp" AND titv = "tv" THEN 1 ELSE 0 END) tv_count, # TODO: minimize alleles @@ -60,13 +60,13 @@ SELECT "$NAME_OF_FILTER_SET" filter_set_name, SUM(CASE WHEN type IN ("ins","del") AND gt_type = "homvar" THEN 1 ELSE 0 END) indel_homvar_count, COUNTIF(not in_gnomad) singleton, null AS pass_qc -- placeholder - FROM ( - SELECT sample_id, - type(ref, alt, call_GT) as type, + FROM ( + SELECT sample_id, + type(ref, alt, call_GT) as type, CASE WHEN INSTR(call_GT, "0") > 0 THEN "het" ELSE "homvar" END as gt_type, -- if GT contains a zero, its a het titv(ref, alt) as titv, CASE WHEN gnomad.location IS NULL THEN false ELSE true END in_gnomad - FROM `$FQ_DATASET.vet_all` v + FROM `$FQ_PREFIX_vet_all` v LEFT JOIN `spec-ops-aou.gvs_public_reference_data.gnomad_v3_sites` gnomad ON (v.location = gnomad.location) WHERE call_GT != "./." -- for safety AND v.location < 23000000000000 -- autosomal only