Rename "metadata" table to "sample_info" table, fix vet schema (#7196)
* refactor CreateVariantIngestFiles metadata -> sample_info

* refactor ImportGenomes metadata -> sample_info

* add QUALapprox to default vet schema

* update sample_info for ImportArrays.wdl

* revert arrays changes

* update metadata -> sample_info inputs

* add comment for arrays constant

* update reference docs and input json files

* remove branch from dockstore yml
mmorgantaylor authored Apr 10, 2021
1 parent eabd71f commit 3a2cb47
Showing 12 changed files with 59 additions and 115 deletions.
8 changes: 4 additions & 4 deletions scripts/variantstore/tieout/README.md
@@ -158,16 +158,16 @@ gatk --java-options "-Xms8g -Xdebug -Xrunjdwp:transport=dt_socket,address=5005,s

```bash
reference="/Users/kcibul/projects/references/hg38/v0/Homo_sapiens_assembly38.fasta"
dataset="spec-ops-aou.kc_acmg_tieout_v6"
dataset="spec-ops-aou.gvs_tieout_acmg_v1"

gatk --java-options "-Xms2g -Xdebug -Xrunjdwp:transport=dt_socket,address=5005,server=y,suspend=n" \
ExtractCohort --mode GENOMES --ref-version 38 --query-mode LOCAL_SORT \
-R $reference \
-   -O acmg_35_debug.vcf \
+   -O gvs_tieout_acmg_v1.vcf \
--local-sort-max-records-in-ram 1000000 \
--print-debug-information \
-   --sample-table ${dataset}.metadata \
+   --sample-table ${dataset}.sample_info \
--project-id spec-ops-aou \
-   --cohort-extract-table ${dataset}.exported_cohort_35_test \
+   --cohort-extract-table ${dataset}.exported_cohort_all_samples \
-L chr1:55398671
```
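Note: with the table renamed, a quick sanity check of the sample list goes through `sample_info` directly. A minimal sketch, assuming the dataset above and the three columns of the new `sample_info` schema; the query is illustrative, not part of the tieout:

```bash
# Spot-check the renamed sample_info table (columns per the new schema:
# sample_name, sample_id, inferred_state).
bq query --project_id=spec-ops-aou --use_legacy_sql=false \
  'SELECT sample_name, sample_id, inferred_state
   FROM `spec-ops-aou.gvs_tieout_acmg_v1.sample_info`
   LIMIT 10'
```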
36 changes: 18 additions & 18 deletions scripts/variantstore/wdl/ImportGenomes.wdl
@@ -11,8 +11,8 @@ workflow ImportGenomes {
String project_id
String dataset_name
String? pet_schema = "location:INTEGER,sample_id:INTEGER,state:STRING"
- String? vet_schema = "sample_id:INTEGER,location:INTEGER,ref:STRING,alt:STRING,AS_RAW_MQ:STRING,AS_RAW_MQRankSum:STRING,AS_QUALapprox:STRING,AS_RAW_ReadPosRankSum:STRING,AS_SB_TABLE:STRING,AS_VarDP:STRING,call_GT:STRING,call_AD:STRING,call_GQ:INTEGER,call_PGT:STRING,call_PID:STRING,call_PL:STRING"
- String? metadata_schema = "sample_name:STRING,sample_id:INTEGER,interval_list_blob:STRING,inferred_state:STRING"
+ String? vet_schema = "sample_id:INTEGER,location:INTEGER,ref:STRING,alt:STRING,AS_RAW_MQ:STRING,AS_RAW_MQRankSum:STRING,QUALapprox:STRING,AS_QUALapprox:STRING,AS_RAW_ReadPosRankSum:STRING,AS_SB_TABLE:STRING,AS_VarDP:STRING,call_GT:STRING,call_AD:STRING,call_GQ:INTEGER,call_PGT:STRING,call_PID:STRING,call_PL:STRING"
+ String? sample_info_schema = "sample_name:STRING,sample_id:INTEGER,inferred_state:STRING"
File? service_account_json
String? drop_state
Boolean? drop_state_includes_greater_than = false
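Note: these schema strings are inline BigQuery schemas ("name:TYPE,name:TYPE"). A minimal sketch of the table creation they imply, assuming the CreateTables task (not expanded in this view) shells out to bq mk and that table names carry a zero-padded id suffix:

```bash
# Create one sample_info table from the inline schema string above.
# Project, dataset, and table names here are illustrative.
bq mk --project_id=spec-ops-aou --table \
  spec-ops-aou:gvs_tieout_acmg_v1.sample_info_001 \
  "sample_name:STRING,sample_id:INTEGER,inferred_state:STRING"
```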
@@ -36,13 +36,13 @@ workflow ImportGenomes {
sample_map = sample_map
}

- call CreateTables as CreateMetadataTables {
+ call CreateTables as CreateSampleInfoTables {
input:
project_id = project_id,
dataset_name = dataset_name,
datatype = "metadata",
datatype = "sample_info",
max_table_id = GetMaxTableId.max_table_id,
- schema = metadata_schema,
+ schema = sample_info_schema,
superpartitioned = "false",
partitioned = "false",
uuid = "",
@@ -100,17 +100,17 @@
}

scatter (i in range(GetMaxTableId.max_table_id)) {
- call LoadTable as LoadMetadataTable {
+ call LoadTable as LoadSampleInfoTable {
input:
project_id = project_id,
table_id = i + 1,
dataset_name = dataset_name,
storage_location = output_directory,
datatype = "metadata",
datatype = "sample_info",
superpartitioned = "false",
- schema = metadata_schema,
+ schema = sample_info_schema,
service_account_json = service_account_json,
- table_creation_done = CreateMetadataTables.done,
+ table_creation_done = CreateSampleInfoTables.done,
tsv_creation_done = CreateImportTsvs.done,
docker = docker_final,
run_uuid = SetLock.run_uuid
@@ -157,7 +157,7 @@
input:
run_uuid = SetLock.run_uuid,
output_directory = output_directory,
- load_metadata_done = LoadMetadataTable.done,
+ load_sample_info_done = LoadSampleInfoTable.done,
load_pet_done = LoadPetTable.done,
load_vet_done = LoadVetTable.done,
service_account_json = service_account_json,
@@ -201,7 +201,7 @@ task SetLock {
LOCKFILE="LOCKFILE"
HAS_LOCKFILE=$(gsutil ls "${DIR}${LOCKFILE}" | wc -l)
if [ $HAS_LOCKFILE -gt 0 ]; then
echo "ERROR: lock file in place. Check whether another run of ImportGenomes with this output directory is in progress or a previous run had an error.
echo "ERROR: lock file in place. Check whether another run of ImportGenomes with this output directory is in progress or a previous run had an error.
If you would like to proceed, run the following command and re-run the workflow: \
gsutil rm ${DIR}${LOCKFILE} \
" && exit 1
@@ -233,7 +233,7 @@ task ReleaseLock {
input {
String run_uuid
String output_directory
- Array[String] load_metadata_done
+ Array[String] load_sample_info_done
Array[String] load_pet_done
Array[String] load_vet_done
File? service_account_json
@@ -362,7 +362,7 @@ task CreateImportTsvs {
echo "ERROR: found mismatched lockfile containing run ${EXISTING_LOCK_ID}, which does not match this run ${CURRENT_RUN_ID}." 1>&2
exit 1
fi

gatk --java-options "-Xmx7000m" CreateVariantIngestFiles \
-V ~{updated_input_vcf} \
-L ~{interval_list} \
@@ -372,7 +372,7 @@
-SNM ~{sample_map} \
--ref-version 38

- gsutil -m cp metadata_*.tsv ~{output_directory}/metadata_tsvs/
+ gsutil -m cp sample_info_*.tsv ~{output_directory}/sample_info_tsvs/
gsutil -m cp pet_*.tsv ~{output_directory}/pet_tsvs/
gsutil -m cp vet_*.tsv ~{output_directory}/vet_tsvs/
>>>
@@ -515,7 +515,7 @@ task LoadTable {

printf -v PADDED_TABLE_ID "%03d" ~{table_id}

- # even for non-superpartitioned tables (e.g. metadata), the TSVs do have the suffix
+ # even for non-superpartitioned tables (e.g. sample_info), the TSVs do have the suffix
FILES="~{datatype}_${PADDED_TABLE_ID}_*"

NUM_FILES=$(gsutil ls "${DIR}${FILES}" | wc -l)
@@ -530,7 +530,7 @@
# get list of pet files and their byte sizes
echo "Getting file sizes(bytes), paths to each file, and determining sets for chunking."
echo -e "bytes\tfile_path\tsum_bytes\tset_number" > ~{datatype}_du_sets.txt
# tr to replace each space -> tab, squeeze (remove) "empty" tabs,
gsutil du "${DIR}${FILES}" | tr " " "\t" | tr -s "\t" | sed "/~{datatype}_tsvs\/$/d" | awk '{s+=$1}{print $1"\t"$2"\t"s"\t" (1+int(s / 16000000000000))}' >> ~{datatype}_du_sets.txt

# per set, load table
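Note: the awk one-liner above assigns each TSV to a load set by running byte total, starting a new set each time the cumulative size crosses a 16 TB boundary. A standalone sketch of the same arithmetic on made-up sizes:

```bash
# Running sum s crosses 16 TB between the 2nd and 3rd file, so the
# set number (1 + int(s / 16000000000000)) steps from 1 to 2.
printf '8000000000000\tgs://bucket/sample_info_001_a.tsv\n9000000000000\tgs://bucket/sample_info_001_b.tsv\n1000000000000\tgs://bucket/sample_info_001_c.tsv\n' \
  | awk '{s+=$1}{print $1"\t"$2"\t"s"\t" (1+int(s / 16000000000000))}'
# file 1 -> set 1; files 2 and 3 -> set 2
```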
@@ -554,15 +554,15 @@
while IFS="\t" read -r line_bq_load
do
bq wait --project_id=~{project_id} $(echo "$line_bq_load" | cut -f1) > bq_wait_status

# capture SUCCESS or FAILURE, echo to file
wait_status=$(sed '6q;d' bq_wait_status | tr " " "\t" | tr -s "\t" | cut -f3)
echo "$wait_status" >> bq_wait_details.tmp
done < bq_load_details.tmp

# combine load status and wait status into final report
paste bq_load_details.tmp bq_wait_details.tmp > bq_final_job_statuses.txt

# move files from each set into set-level "done" directories
gsutil -m mv "${DIR}set_${set}/${FILES}" "${DIR}set_${set}/done/" 2> gsutil_mv_done.log

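Note: the per-set bq load call itself is collapsed in this view. A hedged sketch of what loading one set of sample_info TSVs might look like; the flags, bucket, and table names are assumptions, not the task's literal command:

```bash
# Load one set of tab-separated sample_info TSVs into its table,
# using the same inline schema string the table was created with.
bq load --project_id=spec-ops-aou --source_format=CSV --field_delimiter=tab \
  gvs_tieout_acmg_v1.sample_info_001 \
  "gs://my-bucket/output/sample_info_tsvs/set_1/sample_info_001_*" \
  "sample_name:STRING,sample_id:INTEGER,inferred_state:STRING"
```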
4 changes: 2 additions & 2 deletions scripts/variantstore/wdl/extract/run_gvs_tieout_extract.sh
@@ -6,6 +6,6 @@ python ngs_cohort_extract.py \
--fq_temp_table_dataset ${PROJECT}.temp_tables \
--fq_destination_dataset ${PROJECT}.${DATASET} \
--destination_table exported_cohort_all_samples \
-   --fq_cohort_sample_names ${PROJECT}.${DATASET}.metadata \
+   --fq_cohort_sample_names ${PROJECT}.${DATASET}.sample_info \
--query_project ${PROJECT} \
-   --fq_sample_mapping_table ${PROJECT}.${DATASET}.metadata
+   --fq_sample_mapping_table ${PROJECT}.${DATASET}.sample_info
@@ -7,17 +7,17 @@

"NgsCohortExtract.gatk_override": "gs://broad-dsp-spec-ops/kcibul/gatk-package-4.2.0.0-300-g98a024d-SNAPSHOT-local.jar",

"NgsCohortExtract.fq_sample_table": "spec-ops-aou.gvs_tieout_acmg_v1.metadata",
"NgsCohortExtract.fq_sample_table": "spec-ops-aou.gvs_tieout_acmg_v1.sample_info",
"NgsCohortExtract.fq_cohort_extract_table": "spec-ops-aou.gvs_tieout_acmg_v1.exported_cohort_all_samples",

"NgsCohortExtract.query_project": "spec-ops-aou",
"NgsCohortExtract.output_file_base_name": "acmg_35_full_noeh",

"NgsCohortExtract.fq_filter_set_table": "spec-ops-aou.gvs_tieout_acmg_v1.filter_set_info",
"NgsCohortExtract.filter_set_name": "gvs_tieout_acmg_v4new_no_eh",

"NgsCohortExtract.excluded_intervals": "gs://broad-dsp-spec-ops/scratch/bigquery-jointcalling/warp/excess_het_sites.sorted.bed",

"NgsCohortExtract.emit_pls": "true"

}
@@ -29,10 +29,10 @@

"NgsFilterExtract.gatk_override": "gs://broad-dsp-spec-ops/scratch/bigquery-jointcalling/jars/ah_var_store_20210318/gatk-package-4.2.0.0-150-g869a173-SNAPSHOT-local.jar",

"NgsFilterExtract.fq_sample_table": "spec-ops-aou.gvs_tieout_acmg_v1.metadata",
"NgsFilterExtract.fq_sample_table": "spec-ops-aou.gvs_tieout_acmg_v1.sample_info",
"NgsFilterExtract.fq_alt_allele_table": "spec-ops-aou.gvs_tieout_acmg_v1.alt_allele",
"NgsFilterExtract.query_project": "spec-ops-aou",

"NgsFilterExtract.excluded_sites_bed": "gs://broad-dsp-spec-ops/scratch/bigquery-jointcalling/warp/excess_het_sites.sorted.bed",

"NgsFilterExtract.filter_set_name": "gvs_tieout_acmg_v4new_no_eh",
43 changes: 0 additions & 43 deletions scripts/variantstore/wdl/ngs_filter_extract.inputs.json

This file was deleted.

@@ -4,8 +4,8 @@ public class IngestConstants {

public static final char SEPARATOR = '\t';
public static final String FILETYPE = ".tsv";
- public static final String sampleMetadataFilePrefix = "sample_";
- public static final String metadataFilePrefix = "metadata_";
+ public static final String sampleMetadataFilePrefix = "sample_"; // used for arrays
+ public static final String sampleInfoFilePrefix = "sample_info_";
public static final int partitionPerTable = 4000;

}
@@ -40,7 +40,7 @@ public final class CreateVariantIngestFiles extends VariantWalker {

private PetTsvCreator petTsvCreator;
private VetTsvCreator vetTsvCreator;
- private MetadataTsvCreator metadataTsvCreator;
+ private SampleInfoTsvCreator sampleInfoTsvCreator;
private GenomeLocSortedSet intervalArgumentGenomeLocSortedSet;

private String sampleName;
@@ -49,7 +49,7 @@

// Inside the parent directory, a directory for each chromosome will be created, with a pet directory and vet directory in each one.
// Each pet and vet directory will hold all of the pet and vet tsvs for each sample
- // A metadata directory will be created, with a metadata tsv for each sample
+ // A sample_info directory will be created, with a sample_info tsv for each sample

// @Argument(fullName = "output-path",
// shortName = "VPO",
@@ -147,8 +147,8 @@ public void onTraversalStart() {

// parentDirectory = parentOutputDirectory.toPath(); // TODO do we need this? More efficient way to do this?
// final Path sampleDirectoryPath = IngestUtils.createSampleDirectory(parentDirectory, sampleDirectoryNumber);
- metadataTsvCreator = new MetadataTsvCreator(sampleName, sampleId, tableNumberPrefix, outputDir);
- metadataTsvCreator.createRow(sampleName, sampleId, userIntervals, gqStateToIgnore);
+ sampleInfoTsvCreator = new SampleInfoTsvCreator(sampleName, sampleId, tableNumberPrefix, outputDir);
+ sampleInfoTsvCreator.createRow(sampleName, sampleId, userIntervals, gqStateToIgnore);

// To set up the missing positions
SAMSequenceDictionary seqDictionary = getBestAvailableSequenceDictionary();
@@ -223,11 +223,11 @@ public void closeTool() {
if (vetTsvCreator != null) {
vetTsvCreator.closeTool();
}
- if (metadataTsvCreator != null) {
+ if (sampleInfoTsvCreator != null) {
try {
- metadataTsvCreator.closeTool();
+ sampleInfoTsvCreator.closeTool();
} catch (final Exception e) {
throw new IllegalArgumentException("Couldn't close Sample Metadata writer", e);
throw new IllegalArgumentException("Couldn't close SampleInfo writer", e);
}
}
}
@@ -14,43 +14,40 @@
import java.util.List;
import java.util.stream.Collectors;

- public class MetadataTsvCreator {
+ public class SampleInfoTsvCreator {

- private SimpleXSVWriter sampleMetadataWriter = null;
+ private SimpleXSVWriter sampleInfoWriter = null;

/**
* Expected headers for the Sample List Table
*/
public enum HeaderFieldEnum {
sample_name,
sample_id,
- interval_list_blob,
inferred_state,
}

- public MetadataTsvCreator(String sampleName, String sampleId, String tableNumberPrefix, final File outputDirectory) {
+ public SampleInfoTsvCreator(String sampleName, String sampleId, String tableNumberPrefix, final File outputDirectory) {
try {
- final File sampleMetadataFile = new File(outputDirectory, IngestConstants.metadataFilePrefix + tableNumberPrefix + sampleName + IngestConstants.FILETYPE);
+ final File sampleInfoFile = new File(outputDirectory, IngestConstants.sampleInfoFilePrefix + tableNumberPrefix + sampleName + IngestConstants.FILETYPE);
// write header to it
- List<String> sampleListHeader = MetadataTsvCreator.getHeaders();
- sampleMetadataWriter = new SimpleXSVWriter(sampleMetadataFile.toPath(), IngestConstants.SEPARATOR);
- sampleMetadataWriter.setHeaderLine(sampleListHeader);
+ List<String> sampleListHeader = SampleInfoTsvCreator.getHeaders();
+ sampleInfoWriter = new SimpleXSVWriter(sampleInfoFile.toPath(), IngestConstants.SEPARATOR);
+ sampleInfoWriter.setHeaderLine(sampleListHeader);
} catch (final IOException e) {
throw new UserException("Could not create pet outputs", e);
throw new UserException("Could not create sample info outputs", e);
}

}
private List<String> createSampleListRow(
String sampleName,
String sampleId,
- String intervalListBlob,
PetTsvCreator.GQStateEnum inferredMissingState
) {

List<String> row = new ArrayList<>();
row.add(sampleName);
row.add(sampleId);
- row.add(intervalListBlob);
if (inferredMissingState == null) {
row.add("");
} else {
@@ -61,36 +58,26 @@ private List<String> createSampleListRow(
}

public static List<String> getHeaders() {
- return Arrays.stream(MetadataTsvCreator.HeaderFieldEnum.values()).map(String::valueOf).collect(Collectors.toList());
+ return Arrays.stream(SampleInfoTsvCreator.HeaderFieldEnum.values()).map(String::valueOf).collect(Collectors.toList());
}

public void createRow(String sampleName, String sampleId, List<SimpleInterval> userIntervals, PetTsvCreator.GQStateEnum gqStateToIgnore) {
- // if the metadata tsvs don't exist yet -- create them
- // Create a metadata file to go into the metadata dir for _this_ sample
+ // if the sample_info tsvs don't exist yet -- create them
+ // Create a sample_info file to go into the sample_info dir for _this_ sample
// TODO--this should just be one file per sample set?
- String intervalListMd5 = "NA";

- if (userIntervals != null) {
-     // write values
-     List<String> intervalList = userIntervals.stream().map(interval -> interval.toString())
-             .collect(Collectors.toList());
-     String intervalListBlob = StringUtils.join(intervalList, ", ");
-     intervalListMd5 = Utils.calcMD5(intervalListBlob);
- }
- final List<String> TSVLineToCreateSampleMetadata = createSampleListRow(
+ final List<String> TSVLineToCreateSampleInfo = createSampleListRow(
sampleName,
sampleId,
- intervalListMd5,
gqStateToIgnore);
- sampleMetadataWriter.getNewLineBuilder().setRow(TSVLineToCreateSampleMetadata).write();
+ sampleInfoWriter.getNewLineBuilder().setRow(TSVLineToCreateSampleInfo).write();
}

public void closeTool() {
- if (sampleMetadataWriter != null) {
+ if (sampleInfoWriter != null) {
try {
- sampleMetadataWriter.close();
+ sampleInfoWriter.close();
} catch (final Exception e) {
throw new IllegalArgumentException("Couldn't close VET writer", e);
throw new IllegalArgumentException("Couldn't close SampleInfo writer", e);
}
}

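Note: the net effect of the renamed creator is a per-sample TSV whose filename now begins with `sample_info_` and whose columns match the trimmed header enum (interval_list_blob is gone). An illustrative file, assuming a sample NA12878 with id 17, a table-number prefix of `001_`, and no inferred state:

```bash
# sample_info_001_NA12878.tsv -- header plus one row, tab-separated;
# inferred_state is written as empty when no GQ state is being dropped.
printf 'sample_name\tsample_id\tinferred_state\n' >  sample_info_001_NA12878.tsv
printf 'NA12878\t17\t\n'                          >> sample_info_001_NA12878.tsv
```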
