Quoting of table names #7666

Merged
merged 11 commits into from
Feb 14, 2022
4 changes: 4 additions & 0 deletions .dockstore.yml
@@ -65,6 +65,7 @@ workflows:
branches:
- master
- ah_var_store
- kc_quoting_bug
- name: GvsAoUReblockGvcf
subclass: WDL
primaryDescriptorPath: /scripts/variantstore/wdl/GvsAoUReblockGvcf.wdl
@@ -90,6 +91,7 @@ workflows:
- ah_var_store
- rsa_split_intervals_part_2
- kc_cluster_vqsr
- kc_quoting_bug
- name: GvsCreateAltAllele
subclass: WDL
primaryDescriptorPath: /scripts/variantstore/wdl/GvsCreateAltAllele.wdl
@@ -99,6 +101,7 @@ workflows:
branches:
- master
- ah_var_store
- kc_quoting_bug
- name: GvsCreateTables
subclass: WDL
primaryDescriptorPath: /scripts/variantstore/wdl/GvsCreateTables.wdl
@@ -129,6 +132,7 @@ workflows:
branches:
- master
- ah_var_store
- kc_quoting_bug
- name: GvsPrepareCallset
subclass: WDL
primaryDescriptorPath: /scripts/variantstore/wdl/GvsPrepareCallset.wdl
12 changes: 6 additions & 6 deletions scripts/variantstore/wdl/GvsAssignIds.wdl
@@ -136,20 +136,20 @@ task AssignIds {

# add sample_name to sample_info_table
bq --project_id=~{project_id} query --use_legacy_sql=false \
'INSERT into ~{dataset_name}.~{sample_info_table} (sample_name) select sample_name from ~{dataset_name}.sample_id_assignment_lock m where m.sample_name not in (SELECT sample_name FROM ~{dataset_name}.~{sample_info_table})'
'INSERT into `~{dataset_name}.~{sample_info_table}` (sample_name) select sample_name from `~{dataset_name}.sample_id_assignment_lock` m where m.sample_name not in (SELECT sample_name FROM `~{dataset_name}.~{sample_info_table}`)'

# get the current maximum id, or 0 if there are none
bq --project_id=~{project_id} query --format=csv --use_legacy_sql=false "SELECT IFNULL(MAX(sample_id),0) FROM ~{dataset_name}.~{sample_info_table}" > maxid
bq --project_id=~{project_id} query --format=csv --use_legacy_sql=false 'SELECT IFNULL(MAX(sample_id),0) FROM `~{dataset_name}.~{sample_info_table}`' > maxid
offset=$(tail -1 maxid)

# perform actual id assignment
bq --project_id=~{project_id} query --format=csv --use_legacy_sql=false \
"UPDATE ~{dataset_name}.~{sample_info_table} m SET m.sample_id = id_assign.id FROM (SELECT sample_name, $offset + ROW_NUMBER() OVER() as id FROM ~{dataset_name}.~{sample_info_table} WHERE sample_id IS NULL) id_assign WHERE m.sample_name = id_assign.sample_name;"
bq --project_id=~{project_id} query --format=csv --use_legacy_sql=false --parameter=offset:INTEGER:$offset \
'UPDATE `~{dataset_name}.~{sample_info_table}` m SET m.sample_id = id_assign.id FROM (SELECT sample_name, @offset + ROW_NUMBER() OVER() as id FROM `~{dataset_name}.~{sample_info_table}` WHERE sample_id IS NULL) id_assign WHERE m.sample_name = id_assign.sample_name;'

# retrieve the list of assigned ids and samples to update the datamodel
echo "entity:sample_id,gvs_id" > update.tsv
bq --project_id=~{project_id} query --format=csv --use_legacy_sql=false -n $num_samples \
"SELECT sample_name, sample_id from ~{dataset_name}.~{sample_info_table} WHERE sample_id >= $offset" > update.tsv
bq --project_id=~{project_id} query --format=csv --use_legacy_sql=false -n $num_samples --parameter=offset:INTEGER:$offset \
'SELECT sample_name, sample_id from `~{dataset_name}.~{sample_info_table}` WHERE sample_id >= @offset' > update.tsv
cat update.tsv | sed -e 's/sample_id/gvs_id/' -e 's/sample_name/entity:sample_id/' -e 's/,/\t/g' > gvs_ids.tsv

# get the max id to create tables for
8 changes: 4 additions & 4 deletions scripts/variantstore/wdl/GvsCreateAltAllele.wdl
@@ -68,7 +68,7 @@ task GetVetTableNames {

echo "project_id = ~{query_project_id}" > ~/.bigqueryrc
bq query --location=US --project_id=~{query_project_id} --format=csv --use_legacy_sql=false \
"SELECT table_name FROM ~{dataset_project_id}.~{dataset_name}.INFORMATION_SCHEMA.TABLES WHERE table_name LIKE 'vet_%' ORDER BY table_name" > vet_tables.csv
'SELECT table_name FROM `~{dataset_project_id}.~{dataset_name}.INFORMATION_SCHEMA.TABLES` WHERE table_name LIKE "vet_%" ORDER BY table_name' > vet_tables.csv

# remove the header row from the CSV file
sed -i 1d vet_tables.csv
@@ -113,7 +113,7 @@ task CreateAltAlleleTable {

echo "project_id = ~{query_project_id}" > ~/.bigqueryrc
bq query --location=US --project_id=~{query_project_id} --format=csv --use_legacy_sql=false \
"CREATE OR REPLACE TABLE ~{dataset_project_id}.~{dataset_name}.alt_allele (
'CREATE OR REPLACE TABLE `~{dataset_project_id}.~{dataset_name}.alt_allele` (
location INT64,
sample_id INT64,
ref STRING,
@@ -139,7 +139,7 @@ task CreateAltAlleleTable {
ref_ad INT64,
ad INT64
) PARTITION BY RANGE_BUCKET(location, GENERATE_ARRAY(0, 25000000000000, 1000000000000))
CLUSTER BY location, sample_id;"
CLUSTER BY location, sample_id;'

>>>

@@ -193,7 +193,7 @@ task PopulateAltAlleleTable {
}

runtime {
docker: "us.gcr.io/broad-dsde-methods/variantstore:ah_var_store_20210923"
docker: "us.gcr.io/broad-dsde-methods/variantstore:ah_var_store_20220211"
memory: "3 GB"
disks: "local-disk 10 HDD"
cpu: 1
2 changes: 1 addition & 1 deletion scripts/variantstore/wdl/GvsCreateFilterSet.wdl
@@ -316,7 +316,7 @@ task GetNumSamplesLoaded {

echo "project_id = ~{project_id}" > ~/.bigqueryrc
bq query --location=US --project_id=~{project_id} --format=csv --use_legacy_sql=false \
"SELECT COUNT(*) as num_rows FROM ~{fq_sample_table} WHERE is_loaded = true" > num_rows.csv
'SELECT COUNT(*) as num_rows FROM `~{fq_sample_table}` WHERE is_loaded = true' > num_rows.csv

NUMROWS=$(python3 -c "csvObj=open('num_rows.csv','r');csvContents=csvObj.read();print(csvContents.split('\n')[1]);")

15 changes: 0 additions & 15 deletions scripts/variantstore/wdl/GvsCreateTables.wdl
@@ -11,20 +11,6 @@ workflow CreateBQTables {
String vet_schema_json = '[{"name": "sample_id", "type" :"INTEGER", "mode": "REQUIRED"},{"name": "location", "type" :"INTEGER", "mode": "REQUIRED"},{"name": "ref", "type" :"STRING", "mode": "REQUIRED"},{"name": "alt", "type" :"STRING", "mode": "REQUIRED"},{"name": "AS_RAW_MQ", "type" :"STRING", "mode": "NULLABLE"},{"name": "AS_RAW_MQRankSum", "type" :"STRING", "mode": "NULLABLE"},{"name": "QUALapprox", "type" :"STRING", "mode": "NULLABLE"},{"name": "AS_QUALapprox", "type" :"STRING", "mode": "NULLABLE"},{"name": "AS_RAW_ReadPosRankSum", "type" :"STRING", "mode": "NULLABLE"},{"name": "AS_SB_TABLE", "type" :"STRING", "mode": "NULLABLE"},{"name": "AS_VarDP", "type" :"STRING", "mode": "NULLABLE"},{"name": "call_GT", "type" :"STRING", "mode": "NULLABLE"},{"name": "call_AD", "type" :"STRING", "mode": "NULLABLE"},{"name": "call_GQ", "type" :"INTEGER", "mode": "NULLABLE"},{"name": "call_PGT", "type" :"STRING", "mode": "NULLABLE"},{"name": "call_PID", "type" :"STRING", "mode": "NULLABLE"},{"name": "call_PL", "type" :"STRING", "mode": "NULLABLE"}]'
String ref_ranges_schema_json = '[{"name": "location","type": "INTEGER","mode": "REQUIRED"},{"name": "sample_id","type": "INTEGER","mode": "REQUIRED"},{"name": "length","type": "INTEGER","mode": "REQUIRED"},{"name": "state","type": "STRING","mode": "REQUIRED"}]'
Int? preemptible_tries

}

call CreateTables as CreatePetTables {
input:
project_id = project_id,
dataset_name = dataset_name,
datatype = "pet",
max_table_id = max_table_id,
schema_json = pet_schema_json,
superpartitioned = "true",
partitioned = "true",
service_account_json_path = service_account_json_path,
preemptible_tries = preemptible_tries
}

call CreateTables as CreateVetTables {
@@ -54,7 +40,6 @@ workflow CreateBQTables {
}

output {
String petDone = CreatePetTables.done
String vetDone = CreateVetTables.done
String refDone = CreateRefRangesTables.done
}
23 changes: 12 additions & 11 deletions scripts/variantstore/wdl/GvsImportGenomes.wdl
@@ -131,14 +131,15 @@ task CheckForDuplicateData {

# check the INFORMATION_SCHEMA.PARTITIONS table to see if any of input sample names/ids have data loaded into their partitions
# this returns the list of sample names that do already have data loaded
bq --location=US --project_id=~{project_id} query --format=csv -n ~{num_samples} --use_legacy_sql=false \
"WITH items as (SELECT s.sample_id, s.sample_name, s.is_loaded FROM ${TEMP_TABLE} t left outer join ${SAMPLE_INFO_TABLE} s on (s.sample_name = t.sample_name)) " \
"SELECT i.sample_name FROM ${INFO_SCHEMA_TABLE} p JOIN items i ON (p.partition_id = CAST(i.sample_id AS STRING)) WHERE p.total_logical_bytes > 0 AND (table_name like 'ref_ranges_%' OR table_name like 'vet_%' OR table_name like 'pet_%')" \
"UNION DISTINCT " \
"SELECT i.sample_name FROM items i WHERE i.is_loaded = True " \
"UNION DISTINCT " \
"SELECT i.sample_name FROM items i WHERE i.sample_id IN (SELECT sample_id FROM ~{dataset_name}.sample_load_status) " \
| sed -e '/sample_name/d' > duplicates
echo "WITH items as (SELECT s.sample_id, s.sample_name, s.is_loaded FROM \`${TEMP_TABLE}\` t left outer join \`${SAMPLE_INFO_TABLE}\` s on (s.sample_name = t.sample_name)) " >> query.sql
Contributor Author
Had to adopt this approach of putting the SQL into a temp file and then running it, so we could get around the mix of single quotes, double quotes, the need for $ variable interpolation in bash, and the backticks required for BQ.
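Roughly the problem this works around, as a minimal sketch with made-up table and file names (not the actual GVS query): single quotes would suppress the bash `${...}` expansion, double quotes would mean escaping every BigQuery backtick inline, so the statement is accumulated in a file and piped to `bq`.

```bash
#!/bin/bash
# Illustrative fully-qualified table name assembled at runtime; needs $ expansion.
TEMP_TABLE="my-project.my_dataset.sample_names_tmp"

# Building the SQL in a file avoids juggling single quotes (BigQuery string
# literals), double quotes (bash expansion), and backticks (BigQuery
# identifiers) on a single command line.
echo "SELECT sample_name FROM \`${TEMP_TABLE}\` WHERE sample_name LIKE 'NA12%'" > query.sql

# bq reads the statement from stdin when no query argument is given.
cat query.sql | bq query --use_legacy_sql=false --format=csv
```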

echo "SELECT i.sample_name FROM \`${INFO_SCHEMA_TABLE}\` p JOIN items i ON (p.partition_id = CAST(i.sample_id AS STRING)) WHERE p.total_logical_bytes > 0 AND (table_name like 'ref_ranges_%' OR table_name like 'vet_%' OR table_name like 'pet_%')" >> query.sql
echo "UNION DISTINCT " >> query.sql
echo "SELECT i.sample_name FROM items i WHERE i.is_loaded = True " >> query.sql
echo "UNION DISTINCT " >> query.sql
echo "SELECT i.sample_name FROM items i WHERE i.sample_id IN (SELECT sample_id FROM \`~{dataset_name}.sample_load_status\`) " >> query.sql


cat query.sql | bq --location=US --project_id=~{project_id} query --format=csv -n ~{num_samples} --use_legacy_sql=false | sed -e '/sample_name/d' > duplicates

# remove the temp table
bq --project_id=~{project_id} rm -f -t ${TEMP_TABLE}
@@ -351,7 +352,7 @@ task SetIsLoadedColumn {

# set is_loaded to true if there is a corresponding vet table partition with rows for that sample_id
bq --location=US --project_id=~{project_id} query --format=csv --use_legacy_sql=false \
"UPDATE ~{dataset_name}.sample_info SET is_loaded = true WHERE sample_id IN (SELECT CAST(partition_id AS INT64) from ~{dataset_name}.INFORMATION_SCHEMA.PARTITIONS WHERE partition_id NOT LIKE \"__%\" AND total_logical_bytes > 0 AND table_name LIKE \"vet_%\") OR sample_id IN (SELECT sample_id FROM ~{dataset_name}.sample_load_status GROUP BY 1 HAVING COUNT(1) = 2)"
'UPDATE `~{dataset_name}.sample_info` SET is_loaded = true WHERE sample_id IN (SELECT CAST(partition_id AS INT64) from `~{dataset_name}.INFORMATION_SCHEMA.PARTITIONS` WHERE partition_id NOT LIKE "__%" AND total_logical_bytes > 0 AND table_name LIKE "vet_%") OR sample_id IN (SELECT sample_id FROM `~{dataset_name}.sample_load_status` GROUP BY 1 HAVING COUNT(1) = 2)'

>>>

@@ -417,7 +418,7 @@ task GetSampleIds {

# get the current maximum id, or 0 if there are none
bq --project_id=~{project_id} query --format=csv --use_legacy_sql=false \
"SELECT IFNULL(MIN(sample_id),0) as min, IFNULL(MAX(sample_id),0) as max FROM ~{dataset_name}.~{table_name} AS samples JOIN ${TEMP_TABLE} AS temp ON samples.sample_name=temp.sample_name" > results
"SELECT IFNULL(MIN(sample_id),0) as min, IFNULL(MAX(sample_id),0) as max FROM \`~{dataset_name}.~{table_name}\` AS samples JOIN \`${TEMP_TABLE}\` AS temp ON samples.sample_name=temp.sample_name" > results

# prep for being able to return min table id
min_sample_id=$(tail -1 results | cut -d, -f1)
@@ -433,7 +434,7 @@ task GetSampleIds {
python3 -c "from math import ceil; print(ceil($min_sample_id/~{samples_per_table}))" > min_sample_id

bq --project_id=~{project_id} query --format=csv --use_legacy_sql=false -n ~{num_samples} \
"SELECT sample_id, samples.sample_name FROM ~{dataset_name}.~{table_name} AS samples JOIN ${TEMP_TABLE} AS temp ON samples.sample_name=temp.sample_name" > sample_map
"SELECT sample_id, samples.sample_name FROM \`~{dataset_name}.~{table_name}\` AS samples JOIN \`${TEMP_TABLE}\` AS temp ON samples.sample_name=temp.sample_name" > sample_map
Contributor Author
An example of where we need $ interpolation (so we can't use single quotes) but also have the backticks to deal with.
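A minimal sketch of that inline option, with illustrative names rather than the real WDL inputs: keep the double quotes so the bash variables expand, and escape the BigQuery identifier backticks with backslashes.

```bash
#!/bin/bash
# Illustrative names; the real workflow derives these from WDL inputs.
DATASET_TABLE="my_dataset.sample_info"
TEMP_TABLE="my-project.my_dataset.sample_names_tmp"

# Double quotes let bash expand the variables; each \` stays a literal
# backtick, so BigQuery sees properly quoted identifiers.
bq query --use_legacy_sql=false --format=csv \
  "SELECT s.sample_id, s.sample_name FROM \`${DATASET_TABLE}\` s JOIN \`${TEMP_TABLE}\` t ON s.sample_name = t.sample_name" > sample_map.csv
```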


cut -d, -f1 sample_map > gvs_ids

6 changes: 3 additions & 3 deletions scripts/variantstore/wdl/extract/populate_alt_allele_table.py
@@ -31,10 +31,10 @@ def populate_alt_allele_table(query_project, vet_table_name, fq_dataset, sa_key_
alt_allele_temp_function = Path('alt_allele_temp_function.sql').read_text()
alt_allele_positions = Path('alt_allele_positions.sql').read_text()
fq_vet_table = f"{fq_dataset}.{vet_table_name}"
query_with = f"""INSERT INTO {fq_dataset}.alt_allele
query_with = f"""INSERT INTO `{fq_dataset}.alt_allele`
WITH
position1 as (select * from {fq_vet_table} WHERE call_GT IN ('0/1', '1/0', '1/1', '0|1', '1|0', '1|1', '0/2', '0|2','2/0', '2|0')),
position2 as (select * from {fq_vet_table} WHERE call_GT IN ('1/2', '1|2', '2/1', '2|1'))"""
position1 as (select * from `{fq_vet_table}` WHERE call_GT IN ('0/1', '1/0', '1/1', '0|1', '1|0', '1|1', '0/2', '0|2','2/0', '2|0')),
position2 as (select * from `{fq_vet_table}` WHERE call_GT IN ('1/2', '1|2', '2/1', '2|1'))"""

sql = alt_allele_temp_function + query_with + alt_allele_positions
result = utils.execute_with_retry(client, f"into alt allele from {vet_table_name}", sql)