From 4d4124cae02488196d8966ed6aad9dba7a9b209d Mon Sep 17 00:00:00 2001 From: Eric Song Date: Thu, 8 Oct 2020 00:27:38 -0400 Subject: [PATCH 1/6] optionally provide sample-map-file instead of sample-map-table --- .../variantdb/arrays/ArrayExtractCohort.java | 20 ++++++++++++++++--- .../variantdb/arrays/tables/ProbeInfo.java | 3 ++- .../variantdb/arrays/tables/SampleList.java | 15 ++++++++++++++ .../utils/bigquery/StorageAPIAvroReader.java | 3 ++- 4 files changed, 36 insertions(+), 5 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/ArrayExtractCohort.java b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/ArrayExtractCohort.java index 3368688edcb..2ee81fcb015 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/ArrayExtractCohort.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/ArrayExtractCohort.java @@ -2,6 +2,7 @@ import htsjdk.variant.variantcontext.writer.VariantContextWriter; import htsjdk.variant.vcf.VCFHeader; +import java.io.File; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.broadinstitute.barclay.argparser.Argument; @@ -64,6 +65,13 @@ public enum QueryMode { ) private String sampleTableName = null; + @Argument( + fullName = "cohort-sample-file", + doc = "CSV of sample_id,sample_name map in the cohort", + optional = true + ) + private File cohortSampleFile = null; + @Argument( fullName = "probe-info-table", doc = "Fully qualified name of a bigquery table containing probe information", @@ -191,10 +199,16 @@ protected void onStartup() { vcfWriter = createVCFWriter(IOUtils.getPath(outputVcfPathString)); - Map sampleIdMap = SampleList.getSampleIdMap(new TableReference(sampleTableName, SampleList.SAMPLE_LIST_FIELDS), printDebugInformation); + Map sampleIdMap; + if (sampleTableName != null) { + sampleIdMap = SampleList.getSampleIdMap(new TableReference(sampleTableName, 
SampleList.SAMPLE_LIST_FIELDS), printDebugInformation); + } else if (cohortSampleFile != null) { + sampleIdMap = SampleList.getSampleIdMap(cohortSampleFile); + } else { + throw new IllegalArgumentException("--cohort-sample-file or --cohort-sample-table must be provided."); + } - Collection sampleNames = sampleIdMap.values(); - VCFHeader header = CommonCode.generateRawArrayVcfHeader(new HashSet<>(sampleNames), reference.getSequenceDictionary()); + VCFHeader header = CommonCode.generateRawArrayVcfHeader(new HashSet<>(sampleIdMap.values()), reference.getSequenceDictionary()); Map probeIdMap; if (probeCsvExportFile == null) { diff --git a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/tables/ProbeInfo.java b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/tables/ProbeInfo.java index 2dfae31def1..a074acd1048 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/tables/ProbeInfo.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/tables/ProbeInfo.java @@ -101,10 +101,11 @@ public static Map getProbeIdMapFromExport(final String probeCsv probeIdMap.put(p.probeId, p); } + return probeIdMap; } catch (final Exception e) { throw new GATKException("Error processing probe CSV file", e); - } + } } public static Map getProbeIdMapWithStorageAPI(String fqProbeTableName, boolean printDebugInformation) { diff --git a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/tables/SampleList.java b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/tables/SampleList.java index b7a9b0d8bbf..ba898ee84c2 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/tables/SampleList.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/tables/SampleList.java @@ -2,6 +2,11 @@ import com.google.cloud.bigquery.FieldValueList; import com.google.cloud.bigquery.TableResult; +import java.io.File; +import java.io.IOException; +import
java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.util.stream.Collectors; import org.apache.commons.lang.StringUtils; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; @@ -49,6 +54,16 @@ public static Map getSampleIdMap(TableReference sampleTable, bo return results; } + public static Map getSampleIdMap(File cohortSampleFile) { + try { + return Files.readAllLines(cohortSampleFile.toPath(), StandardCharsets.US_ASCII).stream() + .map(s -> s.split(",")) + .collect(Collectors.toMap(tokens -> Integer.parseInt(tokens[0]), tokens -> tokens[1])); + } catch (IOException e) { + throw new IllegalArgumentException("Could not parse --cohort-sample-file", e); + } + } + private static TableResult querySampleTable(String fqSampleTableName, String whereClause, boolean printDebugInformation) { // Get the query string: final String sampleListQueryString = diff --git a/src/main/java/org/broadinstitute/hellbender/utils/bigquery/StorageAPIAvroReader.java b/src/main/java/org/broadinstitute/hellbender/utils/bigquery/StorageAPIAvroReader.java index c965a3d81a0..f4f843eb36f 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/bigquery/StorageAPIAvroReader.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/bigquery/StorageAPIAvroReader.java @@ -1,6 +1,7 @@ package org.broadinstitute.hellbender.utils.bigquery; import com.google.api.gax.rpc.ServerStream; +import com.google.cloud.bigquery.BigQueryOptions; import com.google.cloud.bigquery.storage.v1beta1.*; import com.google.cloud.bigquery.storage.v1beta1.ReadOptions.TableReadOptions.Builder; import com.google.common.base.Preconditions; @@ -50,7 +51,7 @@ public StorageAPIAvroReader(final TableReference tableRef, final String rowRestr try { this.client = BigQueryStorageClient.create(); - final String parent = String.format("projects/%s", tableRef.tableProject); + final String parent = String.format("projects/%s", BigQueryOptions.getDefaultInstance().getProjectId()); 
final TableReferenceProto.TableReference tableReference = TableReferenceProto.TableReference.newBuilder() .setProjectId(tableRef.tableProject) From 34189496a8589af0c6ab6dd9a63467f7e50d7dd3 Mon Sep 17 00:00:00 2001 From: Eric Song Date: Tue, 13 Oct 2020 14:41:57 -0400 Subject: [PATCH 2/6] fix variantstore test --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index b28605bd059..4919d39566e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -148,7 +148,7 @@ script: elif [[ $RUN_CNN_WDL == true ]]; then echo "Running CNN WDL"; travis_wait 60 sudo bash scripts/cnn_variant_cromwell_tests/run_cnn_variant_wdl.sh; - elif [[ $RUN_VARIANTSTORE_WDL == true ]]; then + elif [[ $RUN_VARIANTSTORE_WDL == true && $TRAVIS_SECURE_ENV_VARS == true ]]; then echo "Running variantstore WDL"; travis_wait 60 sudo bash scripts/variantstore_cromwell_tests/run_variantstore_wdl.sh; elif [[ $TEST_DOCKER == true ]]; then From 3f4a300d73ee5124b2cf633a0765b3dc932e3757 Mon Sep 17 00:00:00 2001 From: Megan Shand Date: Fri, 16 Oct 2020 14:13:54 -0400 Subject: [PATCH 3/6] trying to fix test --- .travis.yml | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index 4919d39566e..b704907782a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -148,9 +148,13 @@ script: elif [[ $RUN_CNN_WDL == true ]]; then echo "Running CNN WDL"; travis_wait 60 sudo bash scripts/cnn_variant_cromwell_tests/run_cnn_variant_wdl.sh; - elif [[ $RUN_VARIANTSTORE_WDL == true && $TRAVIS_SECURE_ENV_VARS == true ]]; then - echo "Running variantstore WDL"; - travis_wait 60 sudo bash scripts/variantstore_cromwell_tests/run_variantstore_wdl.sh; + elif [[ $RUN_VARIANTSTORE_WDL == true ]]; then + if [[ $TRAVIS_SECURE_ENV_VARS == true ]]; then + echo "Running variantstore WDL"; + travis_wait 60 sudo bash scripts/variantstore_cromwell_tests/run_variantstore_wdl.sh; + else + echo "Skipping variantstore tests since google cloud authentication is 
required."; + fi; elif [[ $TEST_DOCKER == true ]]; then echo "Building docker image and running appropriate tests..." ; if [ ${TRAVIS_PULL_REQUEST} != false ]; then From 4cc8731c9e423f4259182f9d3e3297bdad4a1150 Mon Sep 17 00:00:00 2001 From: Eric Song Date: Mon, 19 Oct 2020 13:01:05 -0400 Subject: [PATCH 4/6] pass in read project id --- .../tools/variantdb/arrays/ArrayExtractCohort.java | 12 ++++++------ .../variantdb/arrays/ArrayExtractCohortEngine.java | 9 ++++----- .../tools/variantdb/arrays/ExtractCohortBQ.java | 2 +- .../tools/variantdb/arrays/tables/ProbeInfo.java | 4 ++-- .../variantdb/arrays/tables/ProbeQcMetrics.java | 4 ++-- .../utils/bigquery/StorageAPIAvroReader.java | 11 +++++++---- 6 files changed, 22 insertions(+), 20 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/ArrayExtractCohort.java b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/ArrayExtractCohort.java index 2ee81fcb015..a7b03bc38a7 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/ArrayExtractCohort.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/ArrayExtractCohort.java @@ -52,11 +52,11 @@ public enum QueryMode { private String outputVcfPathString = null; @Argument( - fullName = "project-id", - doc = "ID of the Google Cloud project to use when executing queries", + fullName = "read-project-id", + doc = "ID of the Google Cloud project to use (bill) when reading the microarray data tables", optional = false ) - private String projectID = null; + private String readProjectID = null; @Argument( fullName = "cohort-sample-table", @@ -212,7 +212,7 @@ protected void onStartup() { Map probeIdMap; if (probeCsvExportFile == null) { - probeIdMap = ProbeInfo.getProbeIdMapWithStorageAPI(probeTableName, printDebugInformation); + probeIdMap = ProbeInfo.getProbeIdMapWithStorageAPI(probeTableName, printDebugInformation, readProjectID); } else { probeIdMap = 
ProbeInfo.getProbeIdMapFromExport(probeCsvExportFile); } @@ -220,13 +220,13 @@ protected void onStartup() { // if we have a qcMetrics table, augment the probeInfo map with that information Map probeQcMetricsMap = null; if (qcMetricsTableName != null) { - probeQcMetricsMap = ProbeQcMetrics.getProbeQcMetricsWithStorageAPI(qcMetricsTableName); + probeQcMetricsMap = ProbeQcMetrics.getProbeQcMetricsWithStorageAPI(qcMetricsTableName, readProjectID); } //ChromosomeEnum.setRefVersion(refVersion); engine = new ArrayExtractCohortEngine( - projectID, + readProjectID, vcfWriter, header, annotationEngine, diff --git a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/ArrayExtractCohortEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/ArrayExtractCohortEngine.java index 50d19fc29d9..3ec9ba4caab 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/ArrayExtractCohortEngine.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/ArrayExtractCohortEngine.java @@ -18,7 +18,6 @@ import org.broadinstitute.hellbender.utils.SimpleInterval; import org.broadinstitute.hellbender.utils.bigquery.*; import org.broadinstitute.hellbender.utils.localsort.SortingCollection; -import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants; import java.text.DecimalFormat; import java.util.*; @@ -44,7 +43,7 @@ public class ArrayExtractCohortEngine { private final ReferenceDataSource refSource; private final ProgressMeter progressMeter; - private final String projectID; + private final String readProjectId; /** List of sample names seen in the variant data from BigQuery. 
*/ private final Map sampleIdMap; @@ -65,7 +64,7 @@ public class ArrayExtractCohortEngine { final float callRateThreshold; final boolean filterInvariants; - public ArrayExtractCohortEngine(final String projectID, + public ArrayExtractCohortEngine(final String readProjectId, final VariantContextWriter vcfWriter, final VCFHeader vcfHeader, final VariantAnnotatorEngine annotationEngine, @@ -92,7 +91,7 @@ public ArrayExtractCohortEngine(final String projectID, this.localSortMaxRecordsInRam = localSortMaxRecordsInRam; - this.projectID = projectID; + this.readProjectId = readProjectId; this.vcfWriter = vcfWriter; this.refSource = refSource; this.sampleIdMap = sampleIdMap; @@ -133,7 +132,7 @@ public void traverse() { rowRestriction = "probe_id >= " + minProbeId + " AND probe_id <= " + maxProbeId; } - final StorageAPIAvroReader storageAPIAvroReader = new StorageAPIAvroReader(cohortTableRef, rowRestriction); + final StorageAPIAvroReader storageAPIAvroReader = new StorageAPIAvroReader(cohortTableRef, rowRestriction, readProjectId); createVariantsFromUngroupedTableResult(storageAPIAvroReader); } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/ExtractCohortBQ.java b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/ExtractCohortBQ.java index 18497ff16dd..5fde835e08a 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/ExtractCohortBQ.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/ExtractCohortBQ.java @@ -28,7 +28,7 @@ public static Set populateSampleNames(TableReference sampleTableRef, boo public static Map getProbeNameMap(String fqProbeTableName, boolean printDebugInformation) { Map results = new HashMap<>(); - for (final ProbeInfo pi : ProbeInfo.getProbeIdMapWithStorageAPI(fqProbeTableName, printDebugInformation).values()) { + for (final ProbeInfo pi : ProbeInfo.getProbeIdMapWithStorageAPI(fqProbeTableName, printDebugInformation, null).values()) { results.put(pi.name, 
pi); } return results; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/tables/ProbeInfo.java b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/tables/ProbeInfo.java index a074acd1048..f5ec524bd6b 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/tables/ProbeInfo.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/tables/ProbeInfo.java @@ -108,7 +108,7 @@ public static Map getProbeIdMapFromExport(final String probeCsv } } - public static Map getProbeIdMapWithStorageAPI(String fqProbeTableName, boolean printDebugInformation) { + public static Map getProbeIdMapWithStorageAPI(String fqProbeTableName, boolean printDebugInformation, String readProjectId) { Map results = new HashMap<>(); TableReference tableRef = new TableReference(fqProbeTableName, ProbeInfoSchema.PROBE_INFO_FIELDS); @@ -116,7 +116,7 @@ public static Map getProbeIdMapWithStorageAPI(String fqProbeTab System.out.println("Beginning probe retrieval..."); long start = System.currentTimeMillis(); - try (final StorageAPIAvroReader reader = new StorageAPIAvroReader(tableRef)) { + try (final StorageAPIAvroReader reader = new StorageAPIAvroReader(tableRef, readProjectId)) { for ( final GenericRecord row : reader ) { ProbeInfo p = new ProbeInfo( (Long) row.get(ProbeInfoSchema.PROBE_ID), diff --git a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/tables/ProbeQcMetrics.java b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/tables/ProbeQcMetrics.java index 2b3343f424a..419a6714e46 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/tables/ProbeQcMetrics.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/tables/ProbeQcMetrics.java @@ -20,7 +20,7 @@ public ProbeQcMetrics(final long probeId, final Double hwe_pval, final Double ca this.invariant = invariant; } - public static Map getProbeQcMetricsWithStorageAPI(String 
fqProbeTableName) { + public static Map getProbeQcMetricsWithStorageAPI(String fqProbeTableName, String readProjectId) { Map results = new HashMap<>(); TableReference tableRef = new TableReference(fqProbeTableName, ProbeQcMetricsSchema.PROBE_QC_METRIC_FIELDS); @@ -28,7 +28,7 @@ public static Map getProbeQcMetricsWithStorageAPI(String f System.out.println("Beginning probe QC metrics retrieval..."); long start = System.currentTimeMillis(); - try (final StorageAPIAvroReader reader = new StorageAPIAvroReader(tableRef)) { + try (final StorageAPIAvroReader reader = new StorageAPIAvroReader(tableRef, readProjectId)) { for ( final GenericRecord row : reader ) { ProbeQcMetrics p = new ProbeQcMetrics( (Long) row.get(ProbeQcMetricsSchema.PROBE_ID), diff --git a/src/main/java/org/broadinstitute/hellbender/utils/bigquery/StorageAPIAvroReader.java b/src/main/java/org/broadinstitute/hellbender/utils/bigquery/StorageAPIAvroReader.java index f4f843eb36f..2320f434133 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/bigquery/StorageAPIAvroReader.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/bigquery/StorageAPIAvroReader.java @@ -16,7 +16,6 @@ import java.io.IOException; import java.util.Iterator; -import java.util.List; import java.util.NoSuchElementException; public class StorageAPIAvroReader implements GATKAvroReader { @@ -43,15 +42,19 @@ public class StorageAPIAvroReader implements GATKAvroReader { private GenericRecord nextRow = null; public StorageAPIAvroReader(final TableReference tableRef) { - this(tableRef, null); + this(tableRef, null, ""); } - public StorageAPIAvroReader(final TableReference tableRef, final String rowRestriction) { + public StorageAPIAvroReader(final TableReference tableRef, String parentProjectId) { + this(tableRef, null, parentProjectId); + } + + public StorageAPIAvroReader(final TableReference tableRef, final String rowRestriction, String parentProjectId) { try { this.client = BigQueryStorageClient.create(); - final String 
parent = String.format("projects/%s", BigQueryOptions.getDefaultInstance().getProjectId()); + final String parent = String.format("projects/%s", parentProjectId == null || parentProjectId.isEmpty() ? tableRef.tableProject : parentProjectId); final TableReferenceProto.TableReference tableReference = TableReferenceProto.TableReference.newBuilder() .setProjectId(tableRef.tableProject) From 13bf40a2c6e2de7685654e56935f0b204ea24ec6 Mon Sep 17 00:00:00 2001 From: Eric Song Date: Mon, 19 Oct 2020 13:59:44 -0400 Subject: [PATCH 5/6] empty string -> null --- .../hellbender/utils/bigquery/StorageAPIAvroReader.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/broadinstitute/hellbender/utils/bigquery/StorageAPIAvroReader.java b/src/main/java/org/broadinstitute/hellbender/utils/bigquery/StorageAPIAvroReader.java index 2320f434133..7ce809b43dd 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/bigquery/StorageAPIAvroReader.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/bigquery/StorageAPIAvroReader.java @@ -42,7 +42,7 @@ public class StorageAPIAvroReader implements GATKAvroReader { private GenericRecord nextRow = null; public StorageAPIAvroReader(final TableReference tableRef) { - this(tableRef, null, ""); + this(tableRef, null, null); } public StorageAPIAvroReader(final TableReference tableRef, String parentProjectId) { From 0643ae961b9546b7362bc0245f3e3d9c0adda87c Mon Sep 17 00:00:00 2001 From: Eric Song Date: Mon, 19 Oct 2020 14:39:34 -0400 Subject: [PATCH 6/6] make read-project-id optional --- .../hellbender/tools/variantdb/arrays/ArrayExtractCohort.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/ArrayExtractCohort.java b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/ArrayExtractCohort.java index a7b03bc38a7..674dd83d033 100644 --- 
a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/ArrayExtractCohort.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/ArrayExtractCohort.java @@ -54,7 +54,7 @@ public enum QueryMode { @Argument( fullName = "read-project-id", doc = "ID of the Google Cloud project to use (bill) when reading the microarray data tables", - optional = false + optional = true ) private String readProjectID = null;