From c25782f7aee09c96bffae9c6700778acc538c1e6 Mon Sep 17 00:00:00 2001 From: jamesemery Date: Wed, 11 Oct 2023 10:01:21 -0400 Subject: [PATCH] Funcotator Update for Datasource Release V1.8 (#8512) --- .../cosmic/createCosmicFusionGeneTsv.py | 4 +- .../cosmic/createSqliteCosmicDb.sh | 6 ++ .../cosmic/getCosmicDataSources.sh | 15 ++- .../data_sources/downloadHgncDataSource.sh | 2 +- .../finalizeFuncotatorReleaseDirectory.sh | 28 ++++++ scripts/funcotator/data_sources/getGencode.sh | 2 +- .../data_sources/getGencodeXHGNC.sh | 4 +- .../data_sources/getGencodeXRefseq.sh | 4 +- .../FuncotatorDataSourceDownloader.java | 77 ++++++++++++--- .../dataSources/DataSourceUtils.java | 37 +++++-- .../mafOutput/MafOutputRendererConstants.java | 43 ++++---- ...orDataSourceDownloaderIntegrationTest.java | 34 +++++-- .../dataSources/DataSourceUtilsUnitTest.java | 99 +++++++++++++++++-- .../dataSources/DbSnpIntegrationTest.java | 8 +- .../mafOutput/MafOutputRendererUnitTest.java | 22 +++++ .../utils/test/FuncotatorTestUtils.java | 8 +- 16 files changed, 312 insertions(+), 81 deletions(-) create mode 100644 scripts/funcotator/data_sources/finalizeFuncotatorReleaseDirectory.sh diff --git a/scripts/funcotator/data_sources/cosmic/createCosmicFusionGeneTsv.py b/scripts/funcotator/data_sources/cosmic/createCosmicFusionGeneTsv.py index 8964c9dc294..826795e92b5 100755 --- a/scripts/funcotator/data_sources/cosmic/createCosmicFusionGeneTsv.py +++ b/scripts/funcotator/data_sources/cosmic/createCosmicFusionGeneTsv.py @@ -90,7 +90,7 @@ def renderFusionGeneDictEntry(geneKey, fusionGeneDict): tsvReader = GenericTsvReader(inputFilename) headers = tsvReader.getFieldNames() print('Found headers (input): ' + str(headers)) - if "Translocation Name" not in headers: + if "TRANSLOCATION_NAME" not in headers: raise NotImplementedError("Could not find Translocation Name column in the input file.") outputHeaders = ['gene', 'fusion_genes', 'fusion_id'] @@ -99,7 +99,7 @@ def renderFusionGeneDictEntry(geneKey, fusionGeneDict): fusionGeneDict = OrderedDict() last_i = 0 for i, line in enumerate(tsvReader): - fusion_gene_description = line['Translocation Name'] + fusion_gene_description = line['TRANSLOCATION_NAME'] if len(fusion_gene_description.strip()) == 0: # blank diff --git a/scripts/funcotator/data_sources/cosmic/createSqliteCosmicDb.sh b/scripts/funcotator/data_sources/cosmic/createSqliteCosmicDb.sh index 800242de91a..b7a756a7932 100755 --- a/scripts/funcotator/data_sources/cosmic/createSqliteCosmicDb.sh +++ b/scripts/funcotator/data_sources/cosmic/createSqliteCosmicDb.sh @@ -18,6 +18,7 @@ set -e COSMIC_FILE=CosmicCompleteTargetedScreensMutantExport.tsv OUT_DB_FILE="Cosmic.db" +OUT_TMP_FOLDER="~/tmp" ################################################################################ @@ -29,6 +30,10 @@ if [[ $# -gt 1 ]] ; then OUT_DB_FILE=$2 fi +if [[ $# -gt 2 ]] ; then + OUT_TMP_FOLDER=$3 +fi + if [ ! -f ${COSMIC_FILE} ] ; then echo "ERROR: Given COSMIC file does not exist: ${COSMIC_FILE}" 1>&2 exit 1 @@ -42,6 +47,7 @@ sqlite3 ${OUT_DB_FILE} < cosmic/metadata.txt echo "User: ${EMAIL}" >> cosmic/metadata.txt diff --git a/scripts/funcotator/data_sources/downloadHgncDataSource.sh b/scripts/funcotator/data_sources/downloadHgncDataSource.sh index aa3d120d67d..b3e86a90b81 100755 --- a/scripts/funcotator/data_sources/downloadHgncDataSource.sh +++ b/scripts/funcotator/data_sources/downloadHgncDataSource.sh @@ -2,5 +2,5 @@ # Downloads the HGNC data source from the HGNC website. -curl 'https://www.genenames.org/cgi-bin/download?col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_status&col=gd_locus_type&col=gd_locus_group&col=gd_prev_sym&col=gd_prev_name&col=gd_aliases&col=gd_name_aliases&col=gd_pub_chrom_map&col=gd_date_mod&col=gd_date_sym_change&col=gd_date_name_change&col=gd_pub_acc_ids&col=gd_enz_ids&col=gd_pub_eg_id&col=gd_pub_ensembl_id&col=gd_pubmed_ids&col=gd_pub_refseq_ids&col=family.id&col=family.name&col=gd_ccds_ids&col=gd_vega_ids&col=md_eg_id&col=md_mim_id&col=md_refseq_id&col=md_prot_id&col=md_ensembl_id&col=md_ucsc_id&status=Approved&status_opt=2&where=&order_by=gd_app_sym_sort&format=text&limit=&hgnc_dbtag=on&submit=submit' > hgnc_download_$(date +%b%d%Y).tsv +curl 'https://www.genenames.org/cgi-bin/download/custom?col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_status&col=gd_locus_type&col=gd_locus_group&col=gd_prev_sym&col=gd_prev_name&col=gd_aliases&col=gd_name_aliases&col=gd_pub_chrom_map&col=gd_date_mod&col=gd_date_sym_change&col=gd_date_name_change&col=gd_pub_acc_ids&col=gd_enz_ids&col=gd_pub_eg_id&col=gd_pub_ensembl_id&col=gd_pubmed_ids&col=gd_pub_refseq_ids&col=family.id&col=family.name&col=gd_ccds_ids&col=gd_vega_ids&col=md_eg_id&col=md_mim_id&col=md_refseq_id&col=md_prot_id&col=md_ensembl_id&col=md_ucsc_id&status=Approved&status_opt=2&where=&order_by=gd_app_sym_sort&format=text&limit=&hgnc_dbtag=on&submit=submi' > hgnc_download_$(date +%b%d%Y).tsv diff --git a/scripts/funcotator/data_sources/finalizeFuncotatorReleaseDirectory.sh b/scripts/funcotator/data_sources/finalizeFuncotatorReleaseDirectory.sh new file mode 100644 index 00000000000..5f124ef6a67 --- /dev/null +++ b/scripts/funcotator/data_sources/finalizeFuncotatorReleaseDirectory.sh @@ -0,0 +1,28 @@ +#!/usr/bin/env bash + +#NOTE: This script has been checked in to aid in the release process for future Funcotator datasource bundles. + +echo "Making Tarballs of each Datasource Directory..." + +tar -zcvf funcotator_dataSources.v1.8.hg38.20230908s.tar.gz funcotator_dataSources.v1.8.hg38.20230908s +tar -zcvf funcotator_dataSources.v1.8.hg38.20230908g.tar.gz funcotator_dataSources.v1.8.hg38.20230908g +tar -zcvf funcotator_dataSources.v1.8.hg19.20230908s.tar.gz funcotator_dataSources.v1.8.hg19.20230908s +tar -zcvf funcotator_dataSources.v1.8.hg19.20230908g.tar.gz funcotator_dataSources.v1.8.hg19.20230908g + +echo "Making the various hashfiles for release" + +find funcotator_dataSources.v1.8.hg38.20230908s -type f | xargs md5sum > funcotator_dataSources.v1.8.hg38.20230908s.dir.long.md5sum +md5sum funcotator_dataSources.v1.8.hg38.20230908s.tar.gz | awk '{print $1}' > funcotator_dataSources.v1.8.hg38.20230908s.dir.md5sum +sha256sum funcotator_dataSources.v1.8.hg38.20230908s.tar.gz > funcotator_dataSources.v1.8.hg38.20230908s.sha256 + +find funcotator_dataSources.v1.8.hg38.20230908g -type f | xargs md5sum > funcotator_dataSources.v1.8.hg38.20230908g.dir.long.md5sum +md5sum funcotator_dataSources.v1.8.hg38.20230908g.tar.gz | awk '{print $1}' > funcotator_dataSources.v1.8.hg38.20230908g.dir.md5sum +sha256sum funcotator_dataSources.v1.8.hg38.20230908g.tar.gz > funcotator_dataSources.v1.8.hg38.20230908g.sha256 + +find funcotator_dataSources.v1.8.hg19.20230908s -type f | xargs md5sum > funcotator_dataSources.v1.8.hg19.20230908s.dir.long.md5sum +md5sum funcotator_dataSources.v1.8.hg19.20230908s.tar.gz | awk '{print $1}' > funcotator_dataSources.v1.8.hg19.20230908s.dir.md5sum +sha256sum funcotator_dataSources.v1.8.hg19.20230908s.tar.gz > funcotator_dataSources.v1.8.hg19.20230908s.sha256 + +find funcotator_dataSources.v1.8.hg19.20230908g -type f | xargs md5sum > funcotator_dataSources.v1.8.hg19.20230908g.dir.long.md5sum +md5sum funcotator_dataSources.v1.8.hg19.20230908g.tar.gz | awk '{print $1}' > funcotator_dataSources.v1.8.hg19.20230908g.dir.md5sum +sha256sum funcotator_dataSources.v1.8.hg19.20230908g.tar.gz > funcotator_dataSources.v1.8.hg19.20230908g.sha256 \ No newline at end of file diff --git a/scripts/funcotator/data_sources/getGencode.sh b/scripts/funcotator/data_sources/getGencode.sh index 7ab0316787b..aaf159add43 100755 --- a/scripts/funcotator/data_sources/getGencode.sh +++ b/scripts/funcotator/data_sources/getGencode.sh @@ -15,7 +15,7 @@ MAXARGS=0 # Latest release numbers for our references. # Update these numbers when a new Gencode is released. -LATEST_RELEASE=34 +LATEST_RELEASE=43 DATA_SOURCE_NAME="Gencode" OUT_DIR_NAME='gencode' diff --git a/scripts/funcotator/data_sources/getGencodeXHGNC.sh b/scripts/funcotator/data_sources/getGencodeXHGNC.sh index 656e6c8bc45..cb4ada59b1f 100755 --- a/scripts/funcotator/data_sources/getGencodeXHGNC.sh +++ b/scripts/funcotator/data_sources/getGencodeXHGNC.sh @@ -8,10 +8,10 @@ outFileBaseName="gencode_xhgnc" outExt=".tsv" hg19db="homo_sapiens_core_75_37" -hg38db="homo_sapiens_core_90_38" +hg38db="homo_sapiens_core_110_38" hg19FileName=${outFileBaseName}_v75_37.hg19${outExt} -hg38FileName=${outFileBaseName}_v90_38.hg38${outExt} +hg38FileName=${outFileBaseName}_v110_38.hg38${outExt} ################################################################################ diff --git a/scripts/funcotator/data_sources/getGencodeXRefseq.sh b/scripts/funcotator/data_sources/getGencodeXRefseq.sh index 58c06455e4b..809fe22fcbb 100755 --- a/scripts/funcotator/data_sources/getGencodeXRefseq.sh +++ b/scripts/funcotator/data_sources/getGencodeXRefseq.sh @@ -8,10 +8,10 @@ outFileBaseName="gencode_xrefseq" outExt=".tsv" hg19db="homo_sapiens_core_75_37" -hg38db="homo_sapiens_core_90_38" +hg38db="homo_sapiens_core_110_38" hg19FileName=${outFileBaseName}_v75_37.hg19${outExt} -hg38FileName=${outFileBaseName}_v90_38.hg38${outExt} +hg38FileName=${outFileBaseName}_v110_38.hg38${outExt} ################################################################################ diff --git a/src/main/java/org/broadinstitute/hellbender/tools/funcotator/FuncotatorDataSourceDownloader.java b/src/main/java/org/broadinstitute/hellbender/tools/funcotator/FuncotatorDataSourceDownloader.java index 8468594d506..80b7a5bce40 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/funcotator/FuncotatorDataSourceDownloader.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/funcotator/FuncotatorDataSourceDownloader.java @@ -32,8 +32,8 @@ *

* To download and extract the data sources, you can invoke {@link FuncotatorDataSourceDownloader} in the following ways: *

*

* @@ -63,6 +63,8 @@ public class FuncotatorDataSourceDownloader extends CommandLineProgram { public static final String GERMLINE_ARG_LONG_NAME = "germline"; public static final String OVERWRITE_ARG_LONG_NAME = "overwrite-output-file"; public static final String EXTRACT_AFTER_DOWNLOAD = "extract-after-download"; + public static final String HG38_ARG_LONG_NAME = "hg38"; + public static final String HG19_ARG_LONG_NAME = "hg19"; //================================================================================================================== // Private Static Members: @@ -73,18 +75,27 @@ public class FuncotatorDataSourceDownloader extends CommandLineProgram { // Private Static Members: // Set to always get the latest version of the data sources: - private static final String BASE_URL = DataSourceUtils.DATA_SOURCES_BUCKET_PATH + - DataSourceUtils.DATA_SOURCES_NAME_PREFIX + "." + DataSourceUtils.getDataSourceMaxVersionString(); + private static final String HG38_BASE_URL = DataSourceUtils.DATA_SOURCES_BUCKET_PATH + + DataSourceUtils.DATA_SOURCES_NAME_PREFIX + "." + DataSourceUtils.getDataSourceMaxVersionString(38); + private static final String HG19_BASE_URL = DataSourceUtils.DATA_SOURCES_BUCKET_PATH + + DataSourceUtils.DATA_SOURCES_NAME_PREFIX + "." + DataSourceUtils.getDataSourceMaxVersionString(19); + + private static final String HG38_GERMLINE_GCLOUD_DATASOURCES_BASEURL = HG38_BASE_URL + DataSourceUtils.DS_GERMLINE_NAME_MODIFIER; + private static final String HG19_GERMLINE_GCLOUD_DATASOURCES_BASEURL = HG19_BASE_URL + DataSourceUtils.DS_GERMLINE_NAME_MODIFIER; - private static final String GERMLINE_GCLOUD_DATASOURCES_BASEURL = BASE_URL + DataSourceUtils.DS_GERMLINE_NAME_MODIFIER; @VisibleForTesting - static final Path GERMLINE_GCLOUD_DATASOURCES_PATH = IOUtils.getPath(GERMLINE_GCLOUD_DATASOURCES_BASEURL + DataSourceUtils.DS_EXTENSION); - private static final Path GERMLINE_GCLOUD_DATASOURCES_SHA256_PATH = IOUtils.getPath(GERMLINE_GCLOUD_DATASOURCES_BASEURL + DataSourceUtils.DS_CHECKSUM_EXTENSION); + static final Path HG38_GERMLINE_GCLOUD_DATASOURCES_PATH = IOUtils.getPath(HG38_GERMLINE_GCLOUD_DATASOURCES_BASEURL + DataSourceUtils.DS_EXTENSION); + static final Path HG19_GERMLINE_GCLOUD_DATASOURCES_PATH = IOUtils.getPath(HG19_GERMLINE_GCLOUD_DATASOURCES_BASEURL + DataSourceUtils.DS_EXTENSION); + private static final Path HG38_GERMLINE_GCLOUD_DATASOURCES_SHA256_PATH = IOUtils.getPath(HG38_GERMLINE_GCLOUD_DATASOURCES_BASEURL + DataSourceUtils.DS_CHECKSUM_EXTENSION); + private static final Path HG19_GERMLINE_GCLOUD_DATASOURCES_SHA256_PATH = IOUtils.getPath(HG19_GERMLINE_GCLOUD_DATASOURCES_BASEURL + DataSourceUtils.DS_CHECKSUM_EXTENSION); - public static final String SOMATIC_GCLOUD_DATASOURCES_BASEURL = BASE_URL + DataSourceUtils.DS_SOMATIC_NAME_MODIFIER;; + public static final String HG38_SOMATIC_GCLOUD_DATASOURCES_BASEURL = HG38_BASE_URL + DataSourceUtils.DS_SOMATIC_NAME_MODIFIER;; + public static final String HG19_SOMATIC_GCLOUD_DATASOURCES_BASEURL = HG19_BASE_URL + DataSourceUtils.DS_SOMATIC_NAME_MODIFIER;; - public static final Path SOMATIC_GCLOUD_DATASOURCES_PATH = IOUtils.getPath(SOMATIC_GCLOUD_DATASOURCES_BASEURL + DataSourceUtils.DS_EXTENSION); - private static final Path SOMATIC_GCLOUD_DATASOURCES_SHA256_PATH = IOUtils.getPath(SOMATIC_GCLOUD_DATASOURCES_BASEURL + DataSourceUtils.DS_CHECKSUM_EXTENSION); + public static final Path HG38_SOMATIC_GCLOUD_DATASOURCES_PATH = IOUtils.getPath(HG38_SOMATIC_GCLOUD_DATASOURCES_BASEURL + DataSourceUtils.DS_EXTENSION); + public static final Path HG19_SOMATIC_GCLOUD_DATASOURCES_PATH = IOUtils.getPath(HG19_SOMATIC_GCLOUD_DATASOURCES_BASEURL + DataSourceUtils.DS_EXTENSION); + private static final Path HG38_SOMATIC_GCLOUD_DATASOURCES_SHA256_PATH = IOUtils.getPath(HG38_SOMATIC_GCLOUD_DATASOURCES_BASEURL + DataSourceUtils.DS_CHECKSUM_EXTENSION); + private static final Path HG19_SOMATIC_GCLOUD_DATASOURCES_SHA256_PATH = IOUtils.getPath(HG19_SOMATIC_GCLOUD_DATASOURCES_BASEURL + DataSourceUtils.DS_CHECKSUM_EXTENSION); //================================================================================================================== // Private Members: @@ -129,6 +140,23 @@ public class FuncotatorDataSourceDownloader extends CommandLineProgram { optional = true) protected boolean extractDataSourcesAfterDownload = false; + @Argument( + shortName = HG38_ARG_LONG_NAME, + fullName = HG38_ARG_LONG_NAME, + mutex = {HG19_ARG_LONG_NAME, TESTING_OVERRIDE_PATH_FOR_DATA_SOURCES_SHA256_ARG}, + doc = "If set, will extract data from the HG38 data sources bucket.", + optional = true) + protected boolean getHg38Datasources = false; + + @Argument( + //TODO should these be MUTEX or should one be allowed to download either? + shortName = HG19_ARG_LONG_NAME, + fullName = HG19_ARG_LONG_NAME, + mutex = {HG38_ARG_LONG_NAME, TESTING_OVERRIDE_PATH_FOR_DATA_SOURCES_SHA256_ARG}, + doc = "If set, will extract data from the HG19 data sources bucket.", + optional = true) + protected boolean getHg19Datasources = false; + // Testing arguments: @Hidden @Advanced @@ -164,6 +192,11 @@ protected void onStartup() { throw new UserException("Must select either somatic or germline datasources."); } + // Make sure the user specified at least one reference source to download: + if ((!getHg38Datasources) && (!getHg19Datasources) && (testingOverrideDataSourcesPath == null)) { + throw new UserException("Must select either HG19 or HG38 datasources."); + } + // Make sure the testing inputs are correct: if ( ((testingOverrideDataSourcesPath == null) && (testingOverrideDataSourcesSha256Path != null)) || ((testingOverrideDataSourcesSha256Path == null) && (testingOverrideDataSourcesPath != null)) ) { @@ -184,14 +217,26 @@ protected Object doWork() { // Get the correct data source: if ( getSomaticDataSources ) { - dataSourceDescription = "Somatic"; - dataSourcesPath = SOMATIC_GCLOUD_DATASOURCES_PATH; - dataSourcesSha256Path = SOMATIC_GCLOUD_DATASOURCES_SHA256_PATH; + if (getHg38Datasources) { + dataSourceDescription = "HG38_Somatic"; + dataSourcesPath = HG38_SOMATIC_GCLOUD_DATASOURCES_PATH; + dataSourcesSha256Path = HG38_SOMATIC_GCLOUD_DATASOURCES_SHA256_PATH; + } else { // Okay because HG38 and HG19 datasources are currently MUTEX and at least one is required + dataSourceDescription = "HG19_Somatic"; + dataSourcesPath = HG19_SOMATIC_GCLOUD_DATASOURCES_PATH; + dataSourcesSha256Path = HG19_SOMATIC_GCLOUD_DATASOURCES_SHA256_PATH; + } } else if ( getGermlineDataSources ) { - dataSourceDescription = "Germline"; - dataSourcesPath = GERMLINE_GCLOUD_DATASOURCES_PATH; - dataSourcesSha256Path = GERMLINE_GCLOUD_DATASOURCES_SHA256_PATH; + if (getHg38Datasources) { + dataSourceDescription = "HG38_Germline"; + dataSourcesPath = HG38_GERMLINE_GCLOUD_DATASOURCES_PATH; + dataSourcesSha256Path = HG38_GERMLINE_GCLOUD_DATASOURCES_SHA256_PATH; + } else { + dataSourceDescription = "HG19_Germline"; + dataSourcesPath = HG19_GERMLINE_GCLOUD_DATASOURCES_PATH; + dataSourcesSha256Path = HG19_GERMLINE_GCLOUD_DATASOURCES_SHA256_PATH; + } } else { // Test case: diff --git a/src/main/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/DataSourceUtils.java b/src/main/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/DataSourceUtils.java index 48dd6560bee..01791264a01 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/DataSourceUtils.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/DataSourceUtils.java @@ -53,7 +53,9 @@ private DataSourceUtils() {} private static final String MANIFEST_SOURCE_LINE_START = "Source:"; private static final String MANIFEST_ALT_SOURCE_LINE_START = "Alternate Source:"; @VisibleForTesting - static final Pattern VERSION_PATTERN = Pattern.compile(MANIFEST_VERSION_LINE_START + "\\s+(\\d+)\\.(\\d+)\\.(\\d\\d\\d\\d)(\\d\\d)(\\d\\d)(.*)"); + static final Pattern OLD_VERSION_PATTERN = Pattern.compile(MANIFEST_VERSION_LINE_START + "\\s+(\\d+)\\.(\\d+)\\.(\\d\\d\\d\\d)(\\d\\d)(\\d\\d)(.*)"); + static final Pattern NEW_VERSION_PATTERN = Pattern.compile(MANIFEST_VERSION_LINE_START + "\\s+(\\d+)\\.(\\d+)\\.hg(\\d+)\\.(\\d\\d\\d\\d)(\\d\\d)(\\d\\d)(.*)"); + private static final Pattern SOURCE_PATTERN = Pattern.compile(MANIFEST_SOURCE_LINE_START + "\\s+(ftp.*)"); private static final Pattern ALT_SOURCE_PATTERN = Pattern.compile(MANIFEST_ALT_SOURCE_LINE_START + "\\s+(gs.*)"); @@ -69,9 +71,9 @@ private DataSourceUtils() {} @VisibleForTesting static final int MAX_MAJOR_VERSION_NUMBER = 1; @VisibleForTesting - static final int MAX_MINOR_VERSION_NUMBER = 7; + static final int MAX_MINOR_VERSION_NUMBER = 8; @VisibleForTesting - static final LocalDate MAX_DATE = LocalDate.of(2020, Month.MAY, 21); + static final LocalDate MAX_DATE = LocalDate.of(2023, Month.SEPTEMBER, 8); //================================================================================================================== // Public Static Members: @@ -80,7 +82,7 @@ private DataSourceUtils() {} public static final String CURRENT_MINIMUM_DATA_SOURCE_VERSION = getDataSourceMinVersionString(); /** The maximum supported version of the data sources for funcotator to run. */ - public static final String CURRENT_MAXIMUM_DATA_SOURCE_VERSION = getDataSourceMaxVersionString(); + public static final String CURRENT_MAXIMUM_DATA_SOURCE_VERSION = getDataSourceMaxVersionString(38); public static final String MANIFEST_FILE_NAME = "MANIFEST.txt"; public static final String DATA_SOURCES_FTP_PATH = "ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/funcotator/"; @@ -88,6 +90,8 @@ private DataSourceUtils() {} public static final String DATA_SOURCES_NAME_PREFIX = "funcotator_dataSources"; public static final String DS_SOMATIC_NAME_MODIFIER = "s"; public static final String DS_GERMLINE_NAME_MODIFIER = "g"; + public static final String DS_HG38_NAME_MODIFIER = "hg38"; + public static final String DS_HG19_NAME_MODIFIER = "hg19"; public static final String DS_EXTENSION = ".tar.gz"; public static final String DS_CHECKSUM_EXTENSION = ".sha256"; @@ -137,8 +141,8 @@ public static String getDataSourceMinVersionString() { * {@link #MAX_DATE} * @return A {@link String} representing the Max version information as it would appear in the data sources file name. */ - public static String getDataSourceMaxVersionString() { - return getDataSourceVersionString(MAX_MAJOR_VERSION_NUMBER, MAX_MINOR_VERSION_NUMBER, MAX_DATE); + public static String getDataSourceMaxVersionString(final int ref) { + return getNewDataSourceVersionString(MAX_MAJOR_VERSION_NUMBER, MAX_MINOR_VERSION_NUMBER, ref, MAX_DATE); } @@ -159,6 +163,25 @@ public static String getDataSourceVersionString(final int major, final int minor date.getDayOfMonth() ); } + /** + * Get the string representing the given version information for funcotator as it would be written in the data sources + * release files. + * @param major {@code int} representing the major version of the data sources to use. + * @param minor {@code int} representing the minor version of the data sources to use. + * @param ref {@code int} representing the hg reference number of the data sources to use. + * @param date {@link LocalDate} representing the date of the data sources to use. + * @return A {@link String} representing the given version information as it would appear in the data sources file name. + */ + public static String getNewDataSourceVersionString(final int major, final int minor, final int ref, final LocalDate date) { + return String.format("v%d.%d.hg%d.%d%02d%02d", + major, + minor, + ref, + date.getYear(), + date.getMonthValue(), + date.getDayOfMonth() + ); + } /** * Initializes the data sources for {@link Funcotator}. @@ -704,7 +727,7 @@ private static boolean logDataSourcesInfo(final Path dataSourcesPath) { while ((line != null) && ((version == null) || (source == null) || (alternateSource == null))) { if (version == null && line.startsWith(MANIFEST_VERSION_LINE_START)) { - final Matcher matcher = VERSION_PATTERN.matcher(line); + final Matcher matcher = NEW_VERSION_PATTERN.matcher(line); if ( matcher.matches() ) { versionMajor = Integer.valueOf(matcher.group(1)); versionMinor = Integer.valueOf(matcher.group(2)); diff --git a/src/main/java/org/broadinstitute/hellbender/tools/funcotator/mafOutput/MafOutputRendererConstants.java b/src/main/java/org/broadinstitute/hellbender/tools/funcotator/mafOutput/MafOutputRendererConstants.java index 359ab701153..9a7be220c34 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/funcotator/mafOutput/MafOutputRendererConstants.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/funcotator/mafOutput/MafOutputRendererConstants.java @@ -178,19 +178,20 @@ public class MafOutputRendererConstants { static final Map VariantClassificationMapInverse; // Output Field Name Map Defaults: - static final List OutputFieldNameMap_Hugo_Symbol = Arrays.asList(FieldName_Hugo_Symbol, "Gencode_19_hugoSymbol", "Gencode_27_hugoSymbol", "Gencode_28_hugoSymbol", "Gencode_34_hugoSymbol", "gene", "Gene"); + //TODO these are hardcoded to gencode versions and should be updated to generalize to any version of gencode (see https://github.com/broadinstitute/gatk/issues/8482) + static final List OutputFieldNameMap_Hugo_Symbol = Arrays.asList(FieldName_Hugo_Symbol, "Gencode_19_hugoSymbol", "Gencode_27_hugoSymbol", "Gencode_28_hugoSymbol", "Gencode_34_hugoSymbol", "Gencode_43_hugoSymbol", "gene", "Gene"); static final List OutputFieldNameMap_Entrez_Gene_Id = Arrays.asList(FieldName_Entrez_Gene_Id, "HGNC_Entrez_Gene_ID", "HGNC_Entrez Gene ID", "HGNC_Entrez_Gene_ID(supplied_by_NCBI)", "HGNC_Entrez Gene ID(supplied by NCBI)", "entrez_id", "gene_id"); static final List OutputFieldNameMap_Center = Arrays.asList(FieldName_Center, "center"); - static final List OutputFieldNameMap_NCBI_Build = Arrays.asList(FieldName_NCBI_Build, "Gencode_19_ncbiBuild", "Gencode_27_ncbiBuild", "Gencode_28_ncbiBuild", "Gencode_34_ncbiBuild", "ncbi_build"); - static final List OutputFieldNameMap_Chromosome = Arrays.asList(FieldName_Chromosome, "Gencode_19_chromosome", "Gencode_27_chromosome", "Gencode_28_chromosome", "Gencode_34_chromosome", "chr", "contig", "chromosome", "chrom", "Chrom"); - static final List OutputFieldNameMap_Start_Position = Arrays.asList(FieldName_Start_Position, "Start_position", "Gencode_19_start", "Gencode_27_start", "Gencode_28_start", "Gencode_34_start", "start", "Start", "start_pos", "pos"); - static final List OutputFieldNameMap_End_Position = Arrays.asList(FieldName_End_Position, "End_position", "Gencode_19_end", "Gencode_27_end", "Gencode_28_end", "Gencode_34_end", "end", "End", "end_pos"); + static final List OutputFieldNameMap_NCBI_Build = Arrays.asList(FieldName_NCBI_Build, "Gencode_19_ncbiBuild", "Gencode_27_ncbiBuild", "Gencode_28_ncbiBuild", "Gencode_34_ncbiBuild", "Gencode_43_ncbiBuild", "ncbi_build"); + static final List OutputFieldNameMap_Chromosome = Arrays.asList(FieldName_Chromosome, "Gencode_19_chromosome", "Gencode_27_chromosome", "Gencode_28_chromosome", "Gencode_34_chromosome", "Gencode_43_chromosome", "chr", "contig", "chromosome", "chrom", "Chrom"); + static final List OutputFieldNameMap_Start_Position = Arrays.asList(FieldName_Start_Position, "Start_position", "Gencode_19_start", "Gencode_27_start", "Gencode_28_start", "Gencode_34_start", "Gencode_43_start", "start", "Start", "start_pos", "pos"); + static final List OutputFieldNameMap_End_Position = Arrays.asList(FieldName_End_Position, "End_position", "Gencode_19_end", "Gencode_27_end", "Gencode_28_end", "Gencode_34_end", "Gencode_43_end", "end", "End", "end_pos"); static final List OutputFieldNameMap_Strand = Collections.singletonList(FieldName_Strand); - static final List OutputFieldNameMap_Variant_Classification = Arrays.asList(FieldName_Variant_Classification, "Gencode_19_variantClassification", "Gencode_27_variantClassification", "Gencode_28_variantClassification", "Gencode_34_variantClassification", "variant_classification"); - static final List OutputFieldNameMap_Variant_Type = Arrays.asList(FieldName_Variant_Type, "Gencode_19_variantType", "Gencode_27_variantType", "Gencode_28_variantType", "Gencode_34_variantType", "variant_type"); - static final List OutputFieldNameMap_Reference_Allele = Arrays.asList(FieldName_Reference_Allele, "Gencode_19_refAllele", "Gencode_27_refAllele", "Gencode_28_refAllele", "Gencode_34_refAllele", "ref", "ref_allele", "reference_allele"); - static final List OutputFieldNameMap_Tumor_Seq_Allele1 = Arrays.asList(FieldName_Tumor_Seq_Allele1, "Gencode_19_tumorSeqAllele1", "Gencode_27_tumorSeqAllele1", "Gencode_28_tumorSeqAllele1", "Gencode_34_tumorSeqAllele1", "ref", "ref_allele", "reference_allele"); - static final List OutputFieldNameMap_Tumor_Seq_Allele2 = Arrays.asList(FieldName_Tumor_Seq_Allele2, "Gencode_19_tumorSeqAllele2", "Gencode_27_tumorSeqAllele2", "Gencode_28_tumorSeqAllele2", "Gencode_34_tumorSeqAllele2", "alt", "alt_allele", "alt2", "alt_allele2", "alternate_allele2", "observed_allele2", "alternate_allele", "observed_allele", "alt1", "alt_allele1", "alternate_allele1", "observed_allele1"); + static final List OutputFieldNameMap_Variant_Classification = Arrays.asList(FieldName_Variant_Classification, "Gencode_19_variantClassification", "Gencode_27_variantClassification", "Gencode_28_variantClassification", "Gencode_34_variantClassification", "Gencode_43_variantClassification", "variant_classification"); + static final List OutputFieldNameMap_Variant_Type = Arrays.asList(FieldName_Variant_Type, "Gencode_19_variantType", "Gencode_27_variantType", "Gencode_28_variantType", "Gencode_34_variantType", "Gencode_43_variantType", "variant_type"); + static final List OutputFieldNameMap_Reference_Allele = Arrays.asList(FieldName_Reference_Allele, "Gencode_19_refAllele", "Gencode_27_refAllele", "Gencode_28_refAllele", "Gencode_34_refAllele", "Gencode_43_refAllele", "ref", "ref_allele", "reference_allele"); + static final List OutputFieldNameMap_Tumor_Seq_Allele1 = Arrays.asList(FieldName_Tumor_Seq_Allele1, "Gencode_19_tumorSeqAllele1", "Gencode_27_tumorSeqAllele1", "Gencode_28_tumorSeqAllele1", "Gencode_34_tumorSeqAllele1", "Gencode_43_tumorSeqAllele1", "ref", "ref_allele", "reference_allele"); + static final List OutputFieldNameMap_Tumor_Seq_Allele2 = Arrays.asList(FieldName_Tumor_Seq_Allele2, "Gencode_19_tumorSeqAllele2", "Gencode_27_tumorSeqAllele2", "Gencode_28_tumorSeqAllele2", "Gencode_34_tumorSeqAllele2", "Gencode_43_tumorSeqAllele2", "alt", "alt_allele", "alt2", "alt_allele2", "alternate_allele2", "observed_allele2", "alternate_allele", "observed_allele", "alt1", "alt_allele1", "alternate_allele1", "observed_allele1"); static final List OutputFieldNameMap_dbSNP_RS = Arrays.asList(FieldName_dbSNP_RS, "dbsnp_rs", "dbSNP_RSPOS"); static final List OutputFieldNameMap_dbSNP_Val_Status = Arrays.asList(FieldName_dbSNP_Val_Status, MAF_DBSNP_VAL_STATUS_FIELD, "dbsnp_val_status", DBSNP_VLD_NAME); static final List OutputFieldNameMap_Tumor_Sample_Barcode = Arrays.asList(FieldName_Tumor_Sample_Barcode, "tumor_barcode", "tumor_id", "case_barcode", "case_id", "tumor_name"); @@ -212,15 +213,15 @@ public class MafOutputRendererConstants { static final List OutputFieldNameMap_Sequencer = Arrays.asList(FieldName_Sequencer, "sequencer", "platform"); static final List OutputFieldNameMap_Tumor_Sample_UUID = Arrays.asList(FieldName_Tumor_Sample_UUID, "tumor_uuid", "case_uuid", "tumor_barcode", "tumor_id", "case_barcode", "case_id", "tumor_name", "Tumor_Sample_Barcode"); static final List OutputFieldNameMap_Matched_Norm_Sample_UUID = Arrays.asList(FieldName_Matched_Norm_Sample_UUID, "normal_uuid", "control_uuid", "normal_barcode", "normal_id", "control_barcode", "control_id", "normal_name", "sample_name", "Matched_Norm_Sample_Barcode"); - static final List OutputFieldNameMap_Genome_Change = Arrays.asList(FieldName_Genome_Change, "Gencode_19_genomeChange", "Gencode_27_genomeChange", "Gencode_28_genomeChange", "Gencode_34_genomeChange", "genome_change"); - static final List OutputFieldNameMap_Annotation_Transcript = Arrays.asList(FieldName_Annotation_Transcript, "Gencode_19_annotationTranscript", "Gencode_27_annotationTranscript", "Gencode_28_annotationTranscript", "Gencode_34_annotationTranscript", "annotation_transcript", "transcript_id"); - static final List OutputFieldNameMap_Transcript_Strand = Arrays.asList(FieldName_Transcript_Strand, "Gencode_19_transcriptStrand", "Gencode_27_transcriptStrand", "Gencode_28_transcriptStrand", "Gencode_34_transcriptStrand", "transcript_strand"); - static final List OutputFieldNameMap_Transcript_Exon = Arrays.asList(FieldName_Transcript_Exon, "Gencode_19_transcriptExon", "Gencode_27_transcriptExon", "Gencode_28_transcriptExon", "Gencode_34_transcriptExon", "transcript_exon"); - static final List OutputFieldNameMap_Transcript_Position = Arrays.asList(FieldName_Transcript_Position, "Gencode_19_transcriptPos", "Gencode_27_transcriptPos", "Gencode_28_transcriptPos", "Gencode_34_transcriptPos", "transcript_position"); - static final List OutputFieldNameMap_cDNA_Change = Arrays.asList(FieldName_cDNA_Change, "Gencode_19_cDnaChange", "Gencode_27_cDnaChange", "Gencode_28_cDnaChange", "Gencode_34_cDnaChange", "transcript_change"); - static final List OutputFieldNameMap_Codon_Change = Arrays.asList(FieldName_Codon_Change, "Gencode_19_codonChange", "Gencode_27_codonChange", "Gencode_28_codonChange", "Gencode_34_codonChange", "codon_change"); - static final List OutputFieldNameMap_Protein_Change = Arrays.asList(FieldName_Protein_Change, "Gencode_19_proteinChange", "Gencode_27_proteinChange", "Gencode_28_proteinChange", "Gencode_34_proteinChange", "protein_change"); - static final List OutputFieldNameMap_Other_Transcripts = Arrays.asList(FieldName_Other_Transcripts, "Gencode_19_otherTranscripts", "Gencode_27_otherTranscripts", "Gencode_28_otherTranscripts", "Gencode_34_otherTranscripts", "other_transcripts"); + static final List OutputFieldNameMap_Genome_Change = Arrays.asList(FieldName_Genome_Change, "Gencode_19_genomeChange", "Gencode_27_genomeChange", "Gencode_28_genomeChange", "Gencode_34_genomeChange", "Gencode_43_genomeChange", "genome_change"); + static final List OutputFieldNameMap_Annotation_Transcript = Arrays.asList(FieldName_Annotation_Transcript, "Gencode_19_annotationTranscript", "Gencode_27_annotationTranscript", "Gencode_28_annotationTranscript", "Gencode_34_annotationTranscript", "Gencode_43_annotationTranscript", "annotation_transcript", "transcript_id"); + static final List OutputFieldNameMap_Transcript_Strand = Arrays.asList(FieldName_Transcript_Strand, "Gencode_19_transcriptStrand", "Gencode_27_transcriptStrand", "Gencode_28_transcriptStrand", "Gencode_34_transcriptStrand", "Gencode_43_transcriptStrand", "transcript_strand"); + static final List OutputFieldNameMap_Transcript_Exon = Arrays.asList(FieldName_Transcript_Exon, "Gencode_19_transcriptExon", "Gencode_27_transcriptExon", "Gencode_28_transcriptExon", "Gencode_34_transcriptExon", "Gencode_43_transcriptExon", "transcript_exon"); + static final List OutputFieldNameMap_Transcript_Position = Arrays.asList(FieldName_Transcript_Position, "Gencode_19_transcriptPos", "Gencode_27_transcriptPos", "Gencode_28_transcriptPos", "Gencode_34_transcriptPos", "Gencode_43_transcriptPos", "transcript_position"); + static final List OutputFieldNameMap_cDNA_Change = Arrays.asList(FieldName_cDNA_Change, "Gencode_19_cDnaChange", "Gencode_27_cDnaChange", "Gencode_28_cDnaChange", "Gencode_34_cDnaChange", "Gencode_43_cDnaChange", "transcript_change"); + static final List OutputFieldNameMap_Codon_Change = Arrays.asList(FieldName_Codon_Change, "Gencode_19_codonChange", "Gencode_27_codonChange", "Gencode_28_codonChange", "Gencode_34_codonChange", "Gencode_43_codonChange", "codon_change"); + static final List OutputFieldNameMap_Protein_Change = Arrays.asList(FieldName_Protein_Change, "Gencode_19_proteinChange", "Gencode_27_proteinChange", "Gencode_28_proteinChange", "Gencode_34_proteinChange", "Gencode_43_proteinChange", "protein_change"); + static final List OutputFieldNameMap_Other_Transcripts = Arrays.asList(FieldName_Other_Transcripts, "Gencode_19_otherTranscripts", "Gencode_27_otherTranscripts", "Gencode_28_otherTranscripts", "Gencode_34_otherTranscripts", "Gencode_43_otherTranscripts", "other_transcripts"); static final List OutputFieldNameMap_Refseq_mRNA_Id = Arrays.asList(FieldName_Refseq_mRNA_Id, "Gencode_XRefSeq_mRNA_id", "gencode_xref_refseq_mRNA_id", "ENSEMBL_RefSeq_mRNA_accession", "RefSeq_mRNA_Id", "HGNC_RefSeq IDs"); static final List OutputFieldNameMap_Refseq_prot_Id = Arrays.asList(FieldName_Refseq_prot_Id, "Gencode_XRefSeq_prot_acc", "gencode_xref_refseq_prot_acc", "ENSEMBL_RefSeq_protein_accession", "RefSeq_prot_Id"); static final List OutputFieldNameMap_SwissProt_acc_Id = Arrays.asList(FieldName_SwissProt_acc_Id, "Simple_Uniprot_uniprot_accession", "uniprot_accession", "UniProt_uniprot_accession"); @@ -243,8 +244,8 @@ public class MafOutputRendererConstants { static final List OutputFieldNameMap_TCGAscape_Amplification_Peaks = Arrays.asList(FieldName_TCGAscape_Amplification_Peaks, "TCGAScape_Amplification_Peaks"); static final List OutputFieldNameMap_TCGAscape_Deletion_Peaks = Arrays.asList(FieldName_TCGAscape_Deletion_Peaks, "TCGAScape_Deletion_Peaks"); static final List OutputFieldNameMap_DrugBank = Arrays.asList(FieldName_DrugBank, "Simple_Uniprot_DrugBank", "UniProt_DrugBank"); - static final List OutputFieldNameMap_ref_context = Arrays.asList(FieldName_ref_context, "Gencode_19_referenceContext", "Gencode_27_referenceContext", "Gencode_28_referenceContext", "Gencode_34_referenceContext", "ref_context"); - static final List OutputFieldNameMap_gc_content = Arrays.asList(FieldName_gc_content, "Gencode_19_gcContent", "Gencode_27_gcContent", "Gencode_28_gcContent", "Gencode_34_gcContent", "gc_content"); + static final List OutputFieldNameMap_ref_context = Arrays.asList(FieldName_ref_context, "Gencode_19_referenceContext", "Gencode_27_referenceContext", "Gencode_28_referenceContext", "Gencode_34_referenceContext", "Gencode_43_referenceContext", "ref_context"); + static final List OutputFieldNameMap_gc_content = Arrays.asList(FieldName_gc_content, "Gencode_19_gcContent", "Gencode_27_gcContent", "Gencode_28_gcContent", "Gencode_34_gcContent", "Gencode_43_gcContent", "gc_content"); static final List OutputFieldNameMap_CCLE_ONCOMAP_overlapping_mutations = Arrays.asList(FieldName_CCLE_ONCOMAP_overlapping_mutations, "CCLE_By_GP_overlapping_mutations"); static final List OutputFieldNameMap_CCLE_ONCOMAP_total_mutations_in_gene = Arrays.asList(FieldName_CCLE_ONCOMAP_total_mutations_in_gene, "CCLE_By_Gene_total_mutations_in_gene"); static final List OutputFieldNameMap_CGC_Mutation_Type = Arrays.asList(FieldName_CGC_Mutation_Type, "CGC_Mutation Type"); diff --git a/src/test/java/org/broadinstitute/hellbender/tools/funcotator/FuncotatorDataSourceDownloaderIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/funcotator/FuncotatorDataSourceDownloaderIntegrationTest.java index 932cd39f087..e8f477c130c 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/funcotator/FuncotatorDataSourceDownloaderIntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/funcotator/FuncotatorDataSourceDownloaderIntegrationTest.java @@ -30,20 +30,33 @@ public class FuncotatorDataSourceDownloaderIntegrationTest extends CommandLinePr //================================================================================================================== // Helper Methods: - private Path getDataSourceRemotePath(final String dsTypeArg) { + private Path getDataSourceRemotePath(final String dsTypeArg, final String refVer) { switch (dsTypeArg) { case FuncotatorDataSourceDownloader.SOMATIC_ARG_LONG_NAME: - return FuncotatorDataSourceDownloader.SOMATIC_GCLOUD_DATASOURCES_PATH; + switch (refVer) { + case "hg19": + return FuncotatorDataSourceDownloader.HG19_SOMATIC_GCLOUD_DATASOURCES_PATH; + case "hg38": + return FuncotatorDataSourceDownloader.HG38_SOMATIC_GCLOUD_DATASOURCES_PATH; + default: throw new GATKException("Data source Reference Version does not exist: " + refVer); + } + case FuncotatorDataSourceDownloader.GERMLINE_ARG_LONG_NAME: - return FuncotatorDataSourceDownloader.GERMLINE_GCLOUD_DATASOURCES_PATH; + switch (refVer) { + case "hg19": + return FuncotatorDataSourceDownloader.HG19_GERMLINE_GCLOUD_DATASOURCES_PATH; + case "hg38": + return FuncotatorDataSourceDownloader.HG38_GERMLINE_GCLOUD_DATASOURCES_PATH; + default: throw new GATKException("Data source Reference Version does not exist: " + refVer); + } default: throw new GATKException("Data source type does not exist: " + dsTypeArg); } } - private void verifyDataSourcesExistThenDeleteThem(final String dsTypeArg, final boolean doExtract) { + private void verifyDataSourcesExistThenDeleteThem(final String dsTypeArg, final String refVer, final boolean doExtract) { // Get the path to our files: final Path currentPath = IOUtils.getPath("."); - final Path remoteDataSourcePath = getDataSourceRemotePath(dsTypeArg); + final Path remoteDataSourcePath = getDataSourceRemotePath(dsTypeArg, refVer); final Path expectedDownloadedDataSourcePath = currentPath.resolve(remoteDataSourcePath.getFileName().toString()); // Verify it exists and delete it: @@ -105,36 +118,42 @@ private Object[][] provideForTestDownload() { return new Object[][] { { FuncotatorDataSourceDownloader.SOMATIC_ARG_LONG_NAME, + FuncotatorDataSourceDownloader.HG38_ARG_LONG_NAME, true, true, false }, { FuncotatorDataSourceDownloader.SOMATIC_ARG_LONG_NAME, + FuncotatorDataSourceDownloader.HG19_ARG_LONG_NAME, true, false, false }, { FuncotatorDataSourceDownloader.GERMLINE_ARG_LONG_NAME, + FuncotatorDataSourceDownloader.HG38_ARG_LONG_NAME, true, true, false }, { FuncotatorDataSourceDownloader.GERMLINE_ARG_LONG_NAME, + FuncotatorDataSourceDownloader.HG19_ARG_LONG_NAME, true, false, false }, { FuncotatorDataSourceDownloader.SOMATIC_ARG_LONG_NAME, + FuncotatorDataSourceDownloader.HG38_ARG_LONG_NAME, true, false, true }, { FuncotatorDataSourceDownloader.SOMATIC_ARG_LONG_NAME, + FuncotatorDataSourceDownloader.HG19_ARG_LONG_NAME, true, false, true @@ -149,10 +168,11 @@ private Object[][] provideForTestDownload() { dataProvider = "provideForTestDownload", groups = {"funcotatorValidation", "bucket"} ) - void testDownloadRealDataSources(final String dsTypeArg, final boolean doOverwrite, final boolean doValidate, final boolean doExtract) { + void testDownloadRealDataSources(final String dsTypeArg, final String refVer, final boolean doOverwrite, final boolean doValidate, final boolean doExtract) { final ArgumentsBuilder arguments = new ArgumentsBuilder(); arguments.add(dsTypeArg, true); + arguments.add(refVer, true); arguments.add(FuncotatorDataSourceDownloader.OVERWRITE_ARG_LONG_NAME, doOverwrite); arguments.add(FuncotatorDataSourceDownloader.VALIDATE_INTEGRITY_ARG_LONG_NAME, doValidate); arguments.add(FuncotatorDataSourceDownloader.EXTRACT_AFTER_DOWNLOAD, doExtract); @@ -162,7 +182,7 @@ void testDownloadRealDataSources(final String dsTypeArg, final boolean doOverwri // Now verify we got the data sources and clean up the files // so we don't have up to 30 gigs of stuff lying around: - verifyDataSourcesExistThenDeleteThem(dsTypeArg, doExtract); + verifyDataSourcesExistThenDeleteThem(dsTypeArg, refVer, doExtract); } @Test(dataProvider = "provideForTestDownloadSmallDummyDataSources", diff --git a/src/test/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/DataSourceUtilsUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/DataSourceUtilsUnitTest.java index 442281e3a4d..943043724bf 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/DataSourceUtilsUnitTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/DataSourceUtilsUnitTest.java @@ -318,7 +318,7 @@ private Iterator provideForValidateVersionInformation() { } @DataProvider - private Iterator provideForTestVersionRegex() { + private Iterator provideForTestOldVersionRegex() { final ArrayList testArgs = new ArrayList<>(); @@ -351,6 +351,44 @@ private Iterator provideForTestVersionRegex() { } + @DataProvider + private Iterator provideForTestNewVersionRegex() { + + final ArrayList testArgs = new ArrayList<>(); + + final List baseArgs = createBaseTestVersionData(); + final String[] refVersions = new String[] { "hg19", "hg38"}; + + for ( final Object[] args : baseArgs ) { + for ( int whitespace = 0; whitespace < 2; ++whitespace ) { + for ( int decoratorCount = 0; decoratorCount < 10; ++decoratorCount ) { + for ( int refidx = 0; refidx < refVersions.length; ++refidx ) { + + // Some sanity checks here for proper version numbers: + if (((Integer) args[0]) < 0 || ((Integer) args[1]) < 0) { + continue; + } + + final String whitespaceString = whitespace != 0 ? "\t \t \t " : " "; + final String decoratorString = decoratorCount != 0 ? RandomStringUtils.randomAlphanumeric(decoratorCount) : ""; + + testArgs.add( + new Object[]{ + args[0], args[1], args[2], + refVersions[refidx], + decoratorString, + whitespaceString + } + ); + } + } + } + } + + return testArgs.iterator(); + + } + @DataProvider private Object[][] provideForGetDataSourceVersionString() { return new Object[][] { @@ -409,10 +447,11 @@ public void testValidateVersionInformation(final Integer major, @Test public void testGetDataSourceMaxVersionString() { Assert.assertEquals( - DataSourceUtils.getDataSourceMaxVersionString(), - DataSourceUtils.getDataSourceVersionString( + DataSourceUtils.getDataSourceMaxVersionString(38), + DataSourceUtils.getNewDataSourceVersionString( DataSourceUtils.MAX_MAJOR_VERSION_NUMBER, DataSourceUtils.MAX_MINOR_VERSION_NUMBER, + 38, DataSourceUtils.MAX_DATE ) ); @@ -430,8 +469,51 @@ public void testGetDataSourceMinVersionString() { ); } - @Test(dataProvider = "provideForTestVersionRegex") - public void testVersionRegex(final Integer major, + @Test(dataProvider = "provideForTestNewVersionRegex") + public void testNewVersionRegex(final Integer major, + final Integer minor, + final LocalDate releaseDate, + final String reference, + final String decorator, + final String leadingWhitespace ) { + + // Construct the string: + final String versionString = String.format( + "%s%s%d.%d.%s.%4d%02d%02d%s", + DataSourceUtils.MANIFEST_VERSION_LINE_START, + leadingWhitespace, + major, + minor, + reference, + releaseDate.getYear(), + releaseDate.getMonthValue(), + releaseDate.getDayOfMonth(), + decorator + ); + + final Matcher matcher = DataSourceUtils.NEW_VERSION_PATTERN.matcher(versionString); + + Assert.assertTrue(matcher.matches()); + + final Integer versionMajor = Integer.valueOf(matcher.group(1)); + final Integer versionMinor = Integer.valueOf(matcher.group(2)); + final Integer versionRef = Integer.valueOf(matcher.group(3)); + final Integer versionYear = Integer.valueOf(matcher.group(4)); + final Integer versionMonth = Integer.valueOf(matcher.group(5)); + final Integer versionDay = Integer.valueOf(matcher.group(6)); + final String versionDecorator = matcher.group(7); + + Assert.assertEquals( versionMajor, major ); + Assert.assertEquals( versionMinor, minor ); + Assert.assertEquals( "hg"+versionRef, reference ); + Assert.assertEquals( versionYear.intValue(), releaseDate.getYear() ); + Assert.assertEquals( versionMonth.intValue(), releaseDate.getMonthValue() ); + Assert.assertEquals( versionDay.intValue(), releaseDate.getDayOfMonth() ); + Assert.assertEquals( versionDecorator, decorator ); + } + + @Test(dataProvider = "provideForTestOldVersionRegex") + public void testOldVersionRegex(final Integer major, final Integer minor, final LocalDate releaseDate, final String decorator, @@ -450,7 +532,7 @@ public void testVersionRegex(final Integer major, decorator ); - final Matcher matcher = DataSourceUtils.VERSION_PATTERN.matcher(versionString); + final Matcher matcher = DataSourceUtils.OLD_VERSION_PATTERN.matcher(versionString); Assert.assertTrue(matcher.matches()); @@ -479,7 +561,10 @@ public void testCurrentDataSourcesAvailable() { "." + DataSourceUtils.getDataSourceMinVersionString(), DataSourceUtils.DATA_SOURCES_BUCKET_PATH + DataSourceUtils.DATA_SOURCES_NAME_PREFIX + - "." + DataSourceUtils.getDataSourceMaxVersionString() + "." + DataSourceUtils.getDataSourceMaxVersionString(38), + DataSourceUtils.DATA_SOURCES_BUCKET_PATH + + DataSourceUtils.DATA_SOURCES_NAME_PREFIX + + "." + DataSourceUtils.getDataSourceMaxVersionString(19) ); for (final String basePath : dataSourcesBasePaths) { for (final String useCaseModifier : Arrays.asList(DataSourceUtils.DS_SOMATIC_NAME_MODIFIER, DataSourceUtils.DS_GERMLINE_NAME_MODIFIER)) { diff --git a/src/test/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/DbSnpIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/DbSnpIntegrationTest.java index a35f84f4dcd..0c7fbf9936d 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/DbSnpIntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/DbSnpIntegrationTest.java @@ -22,20 +22,20 @@ public class DbSnpIntegrationTest extends CommandLineProgramTest { private final Path DB_SNP_HG19_FILE_PATH = IOUtils.getPath( - FuncotatorDataSourceDownloader.SOMATIC_GCLOUD_DATASOURCES_BASEURL + "/" + FuncotatorDataSourceDownloader.HG19_SOMATIC_GCLOUD_DATASOURCES_BASEURL + "/" + "dbsnp/hg19/" + "hg19_All_20180423.vcf.gz" ); private final Path DB_SNP_HG19_INDEX_FILE_PATH = IOUtils.getPath( - FuncotatorDataSourceDownloader.SOMATIC_GCLOUD_DATASOURCES_BASEURL + "/" + FuncotatorDataSourceDownloader.HG19_SOMATIC_GCLOUD_DATASOURCES_BASEURL + "/" + "dbsnp/hg19/" + "hg19_All_20180423.vcf.gz.tbi" ); private final Path DB_SNP_HG38_FILE_PATH = IOUtils.getPath( - FuncotatorDataSourceDownloader.SOMATIC_GCLOUD_DATASOURCES_BASEURL + "/" + FuncotatorDataSourceDownloader.HG38_SOMATIC_GCLOUD_DATASOURCES_BASEURL + "/" + "dbsnp/hg38/" + "hg38_All_20180418.vcf.gz" ); private final Path DB_SNP_HG38_INDEX_FILE_PATH = IOUtils.getPath( - FuncotatorDataSourceDownloader.SOMATIC_GCLOUD_DATASOURCES_BASEURL + "/" + FuncotatorDataSourceDownloader.HG38_SOMATIC_GCLOUD_DATASOURCES_BASEURL + "/" + "dbsnp/hg38/" + "hg38_All_20180418.vcf.gz.tbi" ); diff --git a/src/test/java/org/broadinstitute/hellbender/tools/funcotator/mafOutput/MafOutputRendererUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/funcotator/mafOutput/MafOutputRendererUnitTest.java index 30140b6e337..6f9bea3e279 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/funcotator/mafOutput/MafOutputRendererUnitTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/funcotator/mafOutput/MafOutputRendererUnitTest.java @@ -1220,4 +1220,26 @@ public void testCreateMafCompliantOutputMapSanitized() { maf.getRecords().forEach(r -> Assert.assertTrue(r.hasAnnotation("FAKEDATA_BAZ"))); maf.getRecords().forEach(r -> Assert.assertEquals(r.getAnnotationValue("FAKEDATA_BAZ"), "_%09_YES_%0A_")); } + + @Test + // Asserting that we can write a MAF file with a version 43 GencodeFuncotation without error + public void testVersion43GencodeFuncotation() { + final File outFile = getSafeNonExistentFile("TestMafOutputSanitized.maf"); + final String dummyTranscriptName = "FAKE00001.1"; + final VariantContext dummyVariantContext = FuncotatorTestUtils.createSimpleVariantContext(FuncotatorReferenceTestUtils.retrieveHg19Chr3Ref(),"3", 1000000, 1000000, "C", "T"); + final GencodeFuncotation dummyGencodeFuncotation = (GencodeFuncotation) FuncotatorTestUtils.createDummyGencodeFuncotation(dummyTranscriptName, dummyVariantContext, "43"); + final Set excludedFields = Collections.emptySet(); + try ( final MafOutputRenderer mafOutputRenderer = createMafOutputRenderer( outFile, FuncotatorTestConstants.REFERENCE_VERSION_HG19, excludedFields) ) { + final FuncotationMap funcotationMap = FuncotationMap.createFromGencodeFuncotations(Collections.singletonList(dummyGencodeFuncotation)); + funcotationMap.add(dummyTranscriptName, FuncotatorTestUtils.createDummyTableFuncotation()); + mafOutputRenderer.write(dummyVariantContext, funcotationMap); + } + + final AnnotatedIntervalCollection maf = AnnotatedIntervalCollection.create(outFile.toPath(), null); + Assert.assertTrue(maf.getRecords().size() > 0); + maf.getRecords().forEach(r -> Assert.assertTrue(r.hasAnnotation("FAKEDATA_FOO"))); + maf.getRecords().forEach(r -> Assert.assertTrue(r.hasAnnotation("FAKEDATA_BAR"))); + maf.getRecords().forEach(r -> Assert.assertTrue(r.hasAnnotation("FAKEDATA_BAZ"))); + maf.getRecords().forEach(r -> Assert.assertEquals(r.getAnnotationValue("FAKEDATA_BAZ"), "_%09_YES_%0A_")); + } } diff --git a/src/test/java/org/broadinstitute/hellbender/utils/test/FuncotatorTestUtils.java b/src/test/java/org/broadinstitute/hellbender/utils/test/FuncotatorTestUtils.java index b97e2d27af8..b48c5274416 100644 --- a/src/test/java/org/broadinstitute/hellbender/utils/test/FuncotatorTestUtils.java +++ b/src/test/java/org/broadinstitute/hellbender/utils/test/FuncotatorTestUtils.java @@ -461,9 +461,10 @@ public static T assertRoundTripInKryo(final T input, final Class inputCla * * @param dummyTranscriptName An aritrary string. Never {@code null} * @param dummyVariantContext An aritrary {@link VariantContext}. Never {@code null} + * @param gencodeVersion Should be a valid gencode version. Never {@code null} * @return Never {@code null} */ - public static Funcotation createDummyGencodeFuncotation(final String dummyTranscriptName, final VariantContext dummyVariantContext) { + public static Funcotation createDummyGencodeFuncotation(final String dummyTranscriptName, final VariantContext dummyVariantContext, final String gencodeVersion) { Utils.nonNull(dummyTranscriptName); Utils.nonNull(dummyVariantContext); return createGencodeFuncotation("GENE","b37", dummyVariantContext.getContig(), dummyVariantContext.getStart(),dummyVariantContext.getEnd(), @@ -474,7 +475,10 @@ public static Funcotation createDummyGencodeFuncotation(final String dummyTransc 1, 1500, 1500, " ", " ", "p.L300P", 0.5, - "ACTGATCGATCGA",Collections.singletonList("FAKE00002.5"), "27"); + "ACTGATCGATCGA",Collections.singletonList("FAKE00002.5"), gencodeVersion); + } + public static Funcotation createDummyGencodeFuncotation(final String dummyTranscriptName, final VariantContext dummyVariantContext) { + return createDummyGencodeFuncotation(dummyTranscriptName, dummyVariantContext, "27"); } /**