Rename "metadata" table to "sample_info" table, fix vet schema (#7196)
* refactor CreateVariantIngestFiles metadata -> sample_info

* refactor ImportGenomes metadata -> sample_info

* add QUALapprox to default vet schema

* update sample_info for ImportArrays.wdl

* revert arrays changes

* update metadata -> sample_info inputs

* add comment for arrays constant

* update reference docs and input json files

* remove branch from dockstore yml
mmorgantaylor authored Apr 10, 2021
1 parent eabd71f commit 3a2cb47
Showing 12 changed files with 59 additions and 115 deletions.
8 changes: 4 additions & 4 deletions scripts/variantstore/tieout/README.md
@@ -158,16 +158,16 @@ gatk --java-options "-Xms8g -Xdebug -Xrunjdwp:transport=dt_socket,address=5005,s

```bash
reference="/Users/kcibul/projects/references/hg38/v0/Homo_sapiens_assembly38.fasta"
dataset="spec-ops-aou.kc_acmg_tieout_v6"
dataset="spec-ops-aou.gvs_tieout_acmg_v1"

gatk --java-options "-Xms2g -Xdebug -Xrunjdwp:transport=dt_socket,address=5005,server=y,suspend=n" \
ExtractCohort --mode GENOMES --ref-version 38 --query-mode LOCAL_SORT \
-R $reference \
-   -O acmg_35_debug.vcf \
+   -O gvs_tieout_acmg_v1.vcf \
--local-sort-max-records-in-ram 1000000 \
--print-debug-information \
-   --sample-table ${dataset}.metadata \
+   --sample-table ${dataset}.sample_info \
--project-id spec-ops-aou \
-   --cohort-extract-table ${dataset}.exported_cohort_35_test \
+   --cohort-extract-table ${dataset}.exported_cohort_all_samples \
-L chr1:55398671
```
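Note: with the table renamed, a quick sanity check of the sample list goes through `sample_info` directly. A minimal sketch, assuming the dataset above and the three columns of the new `sample_info` schema; the query is illustrative, not part of the tieout:

```bash
# Spot-check the renamed sample_info table (columns per the new schema:
# sample_name, sample_id, inferred_state).
bq query --project_id=spec-ops-aou --use_legacy_sql=false \
  'SELECT sample_name, sample_id, inferred_state
   FROM `spec-ops-aou.gvs_tieout_acmg_v1.sample_info`
   LIMIT 10'
```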
36 changes: 18 additions & 18 deletions scripts/variantstore/wdl/ImportGenomes.wdl
@@ -11,8 +11,8 @@ workflow ImportGenomes {
String project_id
String dataset_name
String? pet_schema = "location:INTEGER,sample_id:INTEGER,state:STRING"
- String? vet_schema = "sample_id:INTEGER,location:INTEGER,ref:STRING,alt:STRING,AS_RAW_MQ:STRING,AS_RAW_MQRankSum:STRING,AS_QUALapprox:STRING,AS_RAW_ReadPosRankSum:STRING,AS_SB_TABLE:STRING,AS_VarDP:STRING,call_GT:STRING,call_AD:STRING,call_GQ:INTEGER,call_PGT:STRING,call_PID:STRING,call_PL:STRING"
- String? metadata_schema = "sample_name:STRING,sample_id:INTEGER,interval_list_blob:STRING,inferred_state:STRING"
+ String? vet_schema = "sample_id:INTEGER,location:INTEGER,ref:STRING,alt:STRING,AS_RAW_MQ:STRING,AS_RAW_MQRankSum:STRING,QUALapprox:STRING,AS_QUALapprox:STRING,AS_RAW_ReadPosRankSum:STRING,AS_SB_TABLE:STRING,AS_VarDP:STRING,call_GT:STRING,call_AD:STRING,call_GQ:INTEGER,call_PGT:STRING,call_PID:STRING,call_PL:STRING"
+ String? sample_info_schema = "sample_name:STRING,sample_id:INTEGER,inferred_state:STRING"
File? service_account_json
String? drop_state
Boolean? drop_state_includes_greater_than = false
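Note: these schema strings are inline BigQuery schemas ("name:TYPE,name:TYPE"). A minimal sketch of the table creation they imply, assuming the CreateTables task (not expanded in this view) shells out to bq mk and that table names carry a zero-padded id suffix:

```bash
# Create one sample_info table from the inline schema string above.
# Project, dataset, and table names here are illustrative.
bq mk --project_id=spec-ops-aou --table \
  spec-ops-aou:gvs_tieout_acmg_v1.sample_info_001 \
  "sample_name:STRING,sample_id:INTEGER,inferred_state:STRING"
```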
@@ -36,13 +36,13 @@ workflow ImportGenomes {
sample_map = sample_map
}

- call CreateTables as CreateMetadataTables {
+ call CreateTables as CreateSampleInfoTables {
input:
project_id = project_id,
dataset_name = dataset_name,
datatype = "metadata",
datatype = "sample_info",
max_table_id = GetMaxTableId.max_table_id,
- schema = metadata_schema,
+ schema = sample_info_schema,
superpartitioned = "false",
partitioned = "false",
uuid = "",
@@ -100,17 +100,17 @@
}

scatter (i in range(GetMaxTableId.max_table_id)) {
- call LoadTable as LoadMetadataTable {
+ call LoadTable as LoadSampleInfoTable {
input:
project_id = project_id,
table_id = i + 1,
dataset_name = dataset_name,
storage_location = output_directory,
datatype = "metadata",
datatype = "sample_info",
superpartitioned = "false",
- schema = metadata_schema,
+ schema = sample_info_schema,
service_account_json = service_account_json,
- table_creation_done = CreateMetadataTables.done,
+ table_creation_done = CreateSampleInfoTables.done,
tsv_creation_done = CreateImportTsvs.done,
docker = docker_final,
run_uuid = SetLock.run_uuid
@@ -157,7 +157,7 @@
input:
run_uuid = SetLock.run_uuid,
output_directory = output_directory,
- load_metadata_done = LoadMetadataTable.done,
+ load_sample_info_done = LoadSampleInfoTable.done,
load_pet_done = LoadPetTable.done,
load_vet_done = LoadVetTable.done,
service_account_json = service_account_json,
@@ -201,7 +201,7 @@ task SetLock {
LOCKFILE="LOCKFILE"
HAS_LOCKFILE=$(gsutil ls "${DIR}${LOCKFILE}" | wc -l)
if [ $HAS_LOCKFILE -gt 0 ]; then
echo "ERROR: lock file in place. Check whether another run of ImportGenomes with this output directory is in progress or a previous run had an error.
echo "ERROR: lock file in place. Check whether another run of ImportGenomes with this output directory is in progress or a previous run had an error.
If you would like to proceed, run the following command and re-run the workflow: \
gsutil rm ${DIR}${LOCKFILE} \
" && exit 1
@@ -233,7 +233,7 @@ task ReleaseLock {
input {
String run_uuid
String output_directory
- Array[String] load_metadata_done
+ Array[String] load_sample_info_done
Array[String] load_pet_done
Array[String] load_vet_done
File? service_account_json
@@ -362,7 +362,7 @@ task CreateImportTsvs {
echo "ERROR: found mismatched lockfile containing run ${EXISTING_LOCK_ID}, which does not match this run ${CURRENT_RUN_ID}." 1>&2
exit 1
fi

gatk --java-options "-Xmx7000m" CreateVariantIngestFiles \
-V ~{updated_input_vcf} \
-L ~{interval_list} \
@@ -372,7 +372,7 @@
-SNM ~{sample_map} \
--ref-version 38

- gsutil -m cp metadata_*.tsv ~{output_directory}/metadata_tsvs/
+ gsutil -m cp sample_info_*.tsv ~{output_directory}/sample_info_tsvs/
gsutil -m cp pet_*.tsv ~{output_directory}/pet_tsvs/
gsutil -m cp vet_*.tsv ~{output_directory}/vet_tsvs/
>>>
@@ -515,7 +515,7 @@ task LoadTable {

printf -v PADDED_TABLE_ID "%03d" ~{table_id}

- # even for non-superpartitioned tables (e.g. metadata), the TSVs do have the suffix
+ # even for non-superpartitioned tables (e.g. sample_info), the TSVs do have the suffix
FILES="~{datatype}_${PADDED_TABLE_ID}_*"

NUM_FILES=$(gsutil ls "${DIR}${FILES}" | wc -l)
@@ -530,7 +530,7 @@
# get list of pet files and their byte sizes
echo "Getting file sizes(bytes), paths to each file, and determining sets for chunking."
echo -e "bytes\tfile_path\tsum_bytes\tset_number" > ~{datatype}_du_sets.txt
# tr to replace each space -> tab, squeeze (remove) "empty" tabs,
gsutil du "${DIR}${FILES}" | tr " " "\t" | tr -s "\t" | sed "/~{datatype}_tsvs\/$/d" | awk '{s+=$1}{print $1"\t"$2"\t"s"\t" (1+int(s / 16000000000000))}' >> ~{datatype}_du_sets.txt

# per set, load table
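Note: the awk one-liner above assigns each TSV to a load set by running byte total, starting a new set each time the cumulative size crosses a 16 TB boundary. A standalone sketch of the same arithmetic on made-up sizes:

```bash
# Running sum s crosses 16 TB between the 2nd and 3rd file, so the
# set number (1 + int(s / 16000000000000)) steps from 1 to 2.
printf '8000000000000\tgs://bucket/sample_info_001_a.tsv\n9000000000000\tgs://bucket/sample_info_001_b.tsv\n1000000000000\tgs://bucket/sample_info_001_c.tsv\n' \
  | awk '{s+=$1}{print $1"\t"$2"\t"s"\t" (1+int(s / 16000000000000))}'
# file 1 -> set 1; files 2 and 3 -> set 2
```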
@@ -554,15 +554,15 @@
while IFS="\t" read -r line_bq_load
do
bq wait --project_id=~{project_id} $(echo "$line_bq_load" | cut -f1) > bq_wait_status

# capture SUCCESS or FAILURE, echo to file
wait_status=$(sed '6q;d' bq_wait_status | tr " " "\t" | tr -s "\t" | cut -f3)
echo "$wait_status" >> bq_wait_details.tmp
done < bq_load_details.tmp

# combine load status and wait status into final report
paste bq_load_details.tmp bq_wait_details.tmp > bq_final_job_statuses.txt

# move files from each set into set-level "done" directories
gsutil -m mv "${DIR}set_${set}/${FILES}" "${DIR}set_${set}/done/" 2> gsutil_mv_done.log

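Note: the per-set bq load call itself is collapsed in this view. A hedged sketch of what loading one set of sample_info TSVs might look like; the flags, bucket, and table names are assumptions, not the task's literal command:

```bash
# Load one set of tab-separated sample_info TSVs into its table,
# using the same inline schema string the table was created with.
bq load --project_id=spec-ops-aou --source_format=CSV --field_delimiter=tab \
  gvs_tieout_acmg_v1.sample_info_001 \
  "gs://my-bucket/output/sample_info_tsvs/set_1/sample_info_001_*" \
  "sample_name:STRING,sample_id:INTEGER,inferred_state:STRING"
```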
4 changes: 2 additions & 2 deletions scripts/variantstore/wdl/extract/run_gvs_tieout_extract.sh
@@ -6,6 +6,6 @@ python ngs_cohort_extract.py \
--fq_temp_table_dataset ${PROJECT}.temp_tables \
--fq_destination_dataset ${PROJECT}.${DATASET} \
--destination_table exported_cohort_all_samples \
-   --fq_cohort_sample_names ${PROJECT}.${DATASET}.metadata \
+   --fq_cohort_sample_names ${PROJECT}.${DATASET}.sample_info \
--query_project ${PROJECT} \
-   --fq_sample_mapping_table ${PROJECT}.${DATASET}.metadata
+   --fq_sample_mapping_table ${PROJECT}.${DATASET}.sample_info
@@ -7,17 +7,17 @@

"NgsCohortExtract.gatk_override": "gs://broad-dsp-spec-ops/kcibul/gatk-package-4.2.0.0-300-g98a024d-SNAPSHOT-local.jar",

"NgsCohortExtract.fq_sample_table": "spec-ops-aou.gvs_tieout_acmg_v1.metadata",
"NgsCohortExtract.fq_sample_table": "spec-ops-aou.gvs_tieout_acmg_v1.sample_info",
"NgsCohortExtract.fq_cohort_extract_table": "spec-ops-aou.gvs_tieout_acmg_v1.exported_cohort_all_samples",

"NgsCohortExtract.query_project": "spec-ops-aou",
"NgsCohortExtract.output_file_base_name": "acmg_35_full_noeh",

"NgsCohortExtract.fq_filter_set_table": "spec-ops-aou.gvs_tieout_acmg_v1.filter_set_info",
"NgsCohortExtract.filter_set_name": "gvs_tieout_acmg_v4new_no_eh",

"NgsCohortExtract.excluded_intervals": "gs://broad-dsp-spec-ops/scratch/bigquery-jointcalling/warp/excess_het_sites.sorted.bed",

"NgsCohortExtract.emit_pls": "true"

}
@@ -29,10 +29,10 @@

"NgsFilterExtract.gatk_override": "gs://broad-dsp-spec-ops/scratch/bigquery-jointcalling/jars/ah_var_store_20210318/gatk-package-4.2.0.0-150-g869a173-SNAPSHOT-local.jar",

"NgsFilterExtract.fq_sample_table": "spec-ops-aou.gvs_tieout_acmg_v1.metadata",
"NgsFilterExtract.fq_sample_table": "spec-ops-aou.gvs_tieout_acmg_v1.sample_info",
"NgsFilterExtract.fq_alt_allele_table": "spec-ops-aou.gvs_tieout_acmg_v1.alt_allele",
"NgsFilterExtract.query_project": "spec-ops-aou",

"NgsFilterExtract.excluded_sites_bed": "gs://broad-dsp-spec-ops/scratch/bigquery-jointcalling/warp/excess_het_sites.sorted.bed",

"NgsFilterExtract.filter_set_name": "gvs_tieout_acmg_v4new_no_eh",
43 changes: 0 additions & 43 deletions scripts/variantstore/wdl/ngs_filter_extract.inputs.json

This file was deleted.

@@ -4,8 +4,8 @@ public class IngestConstants {

public static final char SEPARATOR = '\t';
public static final String FILETYPE = ".tsv";
- public static final String sampleMetadataFilePrefix = "sample_";
- public static final String metadataFilePrefix = "metadata_";
+ public static final String sampleMetadataFilePrefix = "sample_"; // used for arrays
+ public static final String sampleInfoFilePrefix = "sample_info_";
public static final int partitionPerTable = 4000;

}
@@ -40,7 +40,7 @@ public final class CreateVariantIngestFiles extends VariantWalker {

private PetTsvCreator petTsvCreator;
private VetTsvCreator vetTsvCreator;
- private MetadataTsvCreator metadataTsvCreator;
+ private SampleInfoTsvCreator sampleInfoTsvCreator;
private GenomeLocSortedSet intervalArgumentGenomeLocSortedSet;

private String sampleName;
@@ -49,7 +49,7 @@

// Inside the parent directory, a directory for each chromosome will be created, with a pet directory and vet directory in each one.
// Each pet and vet directory will hold all of the pet and vet tsvs for each sample
- // A metadata directory will be created, with a metadata tsv for each sample
+ // A sample_info directory will be created, with a sample_info tsv for each sample

// @Argument(fullName = "output-path",
// shortName = "VPO",
@@ -147,8 +147,8 @@ public void onTraversalStart() {

// parentDirectory = parentOutputDirectory.toPath(); // TODO do we need this? More efficient way to do this?
// final Path sampleDirectoryPath = IngestUtils.createSampleDirectory(parentDirectory, sampleDirectoryNumber);
- metadataTsvCreator = new MetadataTsvCreator(sampleName, sampleId, tableNumberPrefix, outputDir);
- metadataTsvCreator.createRow(sampleName, sampleId, userIntervals, gqStateToIgnore);
+ sampleInfoTsvCreator = new SampleInfoTsvCreator(sampleName, sampleId, tableNumberPrefix, outputDir);
+ sampleInfoTsvCreator.createRow(sampleName, sampleId, userIntervals, gqStateToIgnore);

// To set up the missing positions
SAMSequenceDictionary seqDictionary = getBestAvailableSequenceDictionary();
@@ -223,11 +223,11 @@ public void closeTool() {
if (vetTsvCreator != null) {
vetTsvCreator.closeTool();
}
- if (metadataTsvCreator != null) {
+ if (sampleInfoTsvCreator != null) {
try {
- metadataTsvCreator.closeTool();
+ sampleInfoTsvCreator.closeTool();
} catch (final Exception e) {
throw new IllegalArgumentException("Couldn't close Sample Metadata writer", e);
throw new IllegalArgumentException("Couldn't close SampleInfo writer", e);
}
}
}
@@ -14,43 +14,40 @@
import java.util.List;
import java.util.stream.Collectors;

- public class MetadataTsvCreator {
+ public class SampleInfoTsvCreator {

- private SimpleXSVWriter sampleMetadataWriter = null;
+ private SimpleXSVWriter sampleInfoWriter = null;

/**
* Expected headers for the Sample List Table
*/
public enum HeaderFieldEnum {
sample_name,
sample_id,
- interval_list_blob,
inferred_state,
}

- public MetadataTsvCreator(String sampleName, String sampleId, String tableNumberPrefix, final File outputDirectory) {
+ public SampleInfoTsvCreator(String sampleName, String sampleId, String tableNumberPrefix, final File outputDirectory) {
try {
- final File sampleMetadataFile = new File(outputDirectory, IngestConstants.metadataFilePrefix + tableNumberPrefix + sampleName + IngestConstants.FILETYPE);
+ final File sampleInfoFile = new File(outputDirectory, IngestConstants.sampleInfoFilePrefix + tableNumberPrefix + sampleName + IngestConstants.FILETYPE);
// write header to it
- List<String> sampleListHeader = MetadataTsvCreator.getHeaders();
- sampleMetadataWriter = new SimpleXSVWriter(sampleMetadataFile.toPath(), IngestConstants.SEPARATOR);
- sampleMetadataWriter.setHeaderLine(sampleListHeader);
+ List<String> sampleListHeader = SampleInfoTsvCreator.getHeaders();
+ sampleInfoWriter = new SimpleXSVWriter(sampleInfoFile.toPath(), IngestConstants.SEPARATOR);
+ sampleInfoWriter.setHeaderLine(sampleListHeader);
} catch (final IOException e) {
throw new UserException("Could not create pet outputs", e);
throw new UserException("Could not create sample info outputs", e);
}

}
private List<String> createSampleListRow(
String sampleName,
String sampleId,
- String intervalListBlob,
PetTsvCreator.GQStateEnum inferredMissingState
) {

List<String> row = new ArrayList<>();
row.add(sampleName);
row.add(sampleId);
- row.add(intervalListBlob);
if (inferredMissingState == null) {
row.add("");
} else {
@@ -61,36 +58,26 @@ private List<String> createSampleListRow(
}

public static List<String> getHeaders() {
- return Arrays.stream(MetadataTsvCreator.HeaderFieldEnum.values()).map(String::valueOf).collect(Collectors.toList());
+ return Arrays.stream(SampleInfoTsvCreator.HeaderFieldEnum.values()).map(String::valueOf).collect(Collectors.toList());
}

public void createRow(String sampleName, String sampleId, List<SimpleInterval> userIntervals, PetTsvCreator.GQStateEnum gqStateToIgnore) {
- // if the metadata tsvs don't exist yet -- create them
- // Create a metadata file to go into the metadata dir for _this_ sample
+ // if the sample_info tsvs don't exist yet -- create them
+ // Create a sample_info file to go into the sample_info dir for _this_ sample
// TODO--this should just be one file per sample set?
- String intervalListMd5 = "NA";

- if (userIntervals != null) {
-     // write values
-     List<String> intervalList = userIntervals.stream().map(interval -> interval.toString())
-             .collect(Collectors.toList());
-     String intervalListBlob = StringUtils.join(intervalList, ", ");
-     intervalListMd5 = Utils.calcMD5(intervalListBlob);
- }
- final List<String> TSVLineToCreateSampleMetadata = createSampleListRow(
+ final List<String> TSVLineToCreateSampleInfo = createSampleListRow(
sampleName,
sampleId,
- intervalListMd5,
gqStateToIgnore);
- sampleMetadataWriter.getNewLineBuilder().setRow(TSVLineToCreateSampleMetadata).write();
+ sampleInfoWriter.getNewLineBuilder().setRow(TSVLineToCreateSampleInfo).write();
}

public void closeTool() {
- if (sampleMetadataWriter != null) {
+ if (sampleInfoWriter != null) {
try {
- sampleMetadataWriter.close();
+ sampleInfoWriter.close();
} catch (final Exception e) {
throw new IllegalArgumentException("Couldn't close VET writer", e);
throw new IllegalArgumentException("Couldn't close SampleInfo writer", e);
}
}

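Note: the net effect of the renamed creator is a per-sample TSV whose filename now begins with `sample_info_` and whose columns match the trimmed header enum (interval_list_blob is gone). An illustrative file, assuming a sample NA12878 with id 17, a table-number prefix of `001_`, and no inferred state:

```bash
# sample_info_001_NA12878.tsv -- header plus one row, tab-separated;
# inferred_state is written as empty when no GQ state is being dropped.
printf 'sample_name\tsample_id\tinferred_state\n' >  sample_info_001_NA12878.tsv
printf 'NA12878\t17\t\n'                          >> sample_info_001_NA12878.tsv
```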
