Skip to content

Commit

Permalink
Funcotator Update for Datasource Release V1.8 (#8512)
Browse files Browse the repository at this point in the history
  • Loading branch information
jamesemery authored and rickymagner committed Nov 28, 2023
1 parent ea6ae82 commit c25782f
Show file tree
Hide file tree
Showing 16 changed files with 312 additions and 81 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ def renderFusionGeneDictEntry(geneKey, fusionGeneDict):
tsvReader = GenericTsvReader(inputFilename)
headers = tsvReader.getFieldNames()
print('Found headers (input): ' + str(headers))
if "Translocation Name" not in headers:
if "TRANSLOCATION_NAME" not in headers:
raise NotImplementedError("Could not find Translocation Name column in the input file.")

outputHeaders = ['gene', 'fusion_genes', 'fusion_id']
Expand All @@ -99,7 +99,7 @@ def renderFusionGeneDictEntry(geneKey, fusionGeneDict):
fusionGeneDict = OrderedDict()
last_i = 0
for i, line in enumerate(tsvReader):
fusion_gene_description = line['Translocation Name']
fusion_gene_description = line['TRANSLOCATION_NAME']

if len(fusion_gene_description.strip()) == 0:
# blank
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ set -e

COSMIC_FILE=CosmicCompleteTargetedScreensMutantExport.tsv
OUT_DB_FILE="Cosmic.db"
OUT_TMP_FOLDER="~/tmp"

################################################################################

Expand All @@ -29,6 +30,10 @@ if [[ $# -gt 1 ]] ; then
OUT_DB_FILE=$2
fi

if [[ $# -gt 2 ]] ; then
OUT_TMP_FOLDER=$3
fi

if [ ! -f ${COSMIC_FILE} ] ; then
echo "ERROR: Given COSMIC file does not exist: ${COSMIC_FILE}" 1>&2
exit 1
Expand All @@ -42,6 +47,7 @@ sqlite3 ${OUT_DB_FILE} <<EOF
.echo on
.mode tabs
.import ${COSMIC_FILE} RawCosmic
pragma temp_store_directory = ${OUT_TMP_FOLDER};
CREATE TABLE Cosmic AS SELECT * FROM RawCosmic WHERE ("Mutation AA" != "" OR "Mutation genome position" != "");
DROP TABLE RawCosmic;
UPDATE Cosmic SET "Mutation genome position" = "chr"||"Mutation genome position" WHERE "Mutation genome position" != "";
Expand Down
15 changes: 6 additions & 9 deletions scripts/funcotator/data_sources/cosmic/getCosmicDataSources.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ SCRIPTNAME=$( echo $0 | sed 's#.*/##g' )

################################################################################

version="v84"
version="v98"

EMAIL=""
PASSWORD=""
Expand Down Expand Up @@ -135,15 +135,12 @@ mkdir -vp cosmic/hg19 cosmic/hg38 cosmic_fusion/hg19 cosmic_fusion/hg38 cosmic_t
# Get the data files:

echo "Getting files ... "
lftp --norc -u "${EMAIL}","${PASSWORD}" sftp://sftp-cancer.sanger.ac.uk <<EOF
AUTH_TOKEN=$(echo -n "$EMAIL:$PASSWORD" | base64)

get cosmic/grch37/cosmic/${version}/CosmicCompleteTargetedScreensMutantExport.tsv.gz -o cosmic/hg19/CosmicCompleteTargetedScreensMutantExport.tsv.gz
get cosmic/grch37/cosmic/${version}/CosmicFusionExport.tsv.gz -o cosmic_fusion/hg19/CosmicFusionExport.tsv.gz
get cosmic/grch38/cosmic/${version}/CosmicCompleteTargetedScreensMutantExport.tsv.gz -o cosmic/hg38/CosmicCompleteTargetedScreensMutantExport.tsv.gz
get cosmic/grch38/cosmic/${version}/CosmicFusionExport.tsv.gz -o cosmic_fusion/hg38/CosmicFusionExport.tsv.gz
bye
EOF
curl -sS "$(curl -H "Authorization: Basic $AUTH_TOKEN" https://cancer.sanger.ac.uk/cosmic/file_download/GRCh37/cosmic/${version}/CosmicCompleteTargetedScreensMutantExport.tsv.gz | jq -r '.url')" -o cosmic/hg19/CosmicCompleteTargetedScreensMutantExport.tsv.gz
curl -sS "$(curl -H "Authorization: Basic $AUTH_TOKEN" https://cancer.sanger.ac.uk/cosmic/file_download/GRCh37/cosmic/${version}/CosmicFusionExport.tsv.gz | jq -r '.url')" -o cosmic_fusion/hg19/CosmicFusionExport.tsv.gz
curl -sS "$(curl -H "Authorization: Basic $AUTH_TOKEN" https://cancer.sanger.ac.uk/cosmic/file_download/GRCh38/cosmic/${version}/CosmicCompleteTargetedScreensMutantExport.tsv.gz | jq -r '.url')" -o cosmic/hg38/CosmicCompleteTargetedScreensMutantExport.tsv.gz
curl -sS "$(curl -H "Authorization: Basic $AUTH_TOKEN" https://cancer.sanger.ac.uk/cosmic/file_download/GRCh38/cosmic/${version}/CosmicFusionExport.tsv.gz | jq -r '.url')" -o cosmic_fusion/hg38/CosmicFusionExport.tsv.gz

echo "Retrieved COSMIC version ${version} on $(date) from sftp-cancer.sanger.ac.uk by: ${SCRIPTNAME}:" > cosmic/metadata.txt
echo "User: ${EMAIL}" >> cosmic/metadata.txt
Expand Down
2 changes: 1 addition & 1 deletion scripts/funcotator/data_sources/downloadHgncDataSource.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@

# Downloads the HGNC data source from the HGNC website.

curl 'https://www.genenames.org/cgi-bin/download?col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_status&col=gd_locus_type&col=gd_locus_group&col=gd_prev_sym&col=gd_prev_name&col=gd_aliases&col=gd_name_aliases&col=gd_pub_chrom_map&col=gd_date_mod&col=gd_date_sym_change&col=gd_date_name_change&col=gd_pub_acc_ids&col=gd_enz_ids&col=gd_pub_eg_id&col=gd_pub_ensembl_id&col=gd_pubmed_ids&col=gd_pub_refseq_ids&col=family.id&col=family.name&col=gd_ccds_ids&col=gd_vega_ids&col=md_eg_id&col=md_mim_id&col=md_refseq_id&col=md_prot_id&col=md_ensembl_id&col=md_ucsc_id&status=Approved&status_opt=2&where=&order_by=gd_app_sym_sort&format=text&limit=&hgnc_dbtag=on&submit=submit' > hgnc_download_$(date +%b%d%Y).tsv
curl 'https://www.genenames.org/cgi-bin/download/custom?col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_status&col=gd_locus_type&col=gd_locus_group&col=gd_prev_sym&col=gd_prev_name&col=gd_aliases&col=gd_name_aliases&col=gd_pub_chrom_map&col=gd_date_mod&col=gd_date_sym_change&col=gd_date_name_change&col=gd_pub_acc_ids&col=gd_enz_ids&col=gd_pub_eg_id&col=gd_pub_ensembl_id&col=gd_pubmed_ids&col=gd_pub_refseq_ids&col=family.id&col=family.name&col=gd_ccds_ids&col=gd_vega_ids&col=md_eg_id&col=md_mim_id&col=md_refseq_id&col=md_prot_id&col=md_ensembl_id&col=md_ucsc_id&status=Approved&status_opt=2&where=&order_by=gd_app_sym_sort&format=text&limit=&hgnc_dbtag=on&submit=submit' > hgnc_download_$(date +%b%d%Y).tsv

Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#!/usr/bin/env bash

# Packages the Funcotator v1.8 datasource directories for release.
#
# For every directory named in DATASOURCE_DIRS (each must exist in the current
# working directory) the following files are written alongside it:
#   <dir>.tar.gz           - gzipped tarball of the directory
#   <dir>.dir.long.md5sum  - md5sum line for every regular file in the directory
#   <dir>.dir.md5sum       - md5 hash of the tarball (hash only, no file name)
#   <dir>.sha256           - sha256sum output for the tarball
#
#NOTE: This script has been checked in to aid in the release process for future Funcotator datasource bundles.

# Stop at the first failure so hash files are never produced for a missing or
# partially written tarball.
set -eo pipefail

# Bundles to release: hg38 and hg19, each in an 's' and a 'g' variant
# (presumably somatic / germline -- confirm against the release naming scheme).
DATASOURCE_DIRS=(
    funcotator_dataSources.v1.8.hg38.20230908s
    funcotator_dataSources.v1.8.hg38.20230908g
    funcotator_dataSources.v1.8.hg19.20230908s
    funcotator_dataSources.v1.8.hg19.20230908g
)

echo "Making Tarballs of each Datasource Directory..."

for ds in "${DATASOURCE_DIRS[@]}" ; do
    tar -zcvf "${ds}.tar.gz" "${ds}"
done

echo "Making the various hashfiles for release"

for ds in "${DATASOURCE_DIRS[@]}" ; do
    # Per-file checksums. `-exec ... {} +` hands the paths straight to md5sum, so
    # file names containing spaces or quotes are not mis-split as they would be
    # when piped through plain xargs.
    find "${ds}" -type f -exec md5sum {} + > "${ds}.dir.long.md5sum"

    # Only the hash column of the tarball's md5sum is kept in this file.
    md5sum "${ds}.tar.gz" | awk '{print $1}' > "${ds}.dir.md5sum"

    # Full sha256sum output (hash and file name) for the tarball.
    sha256sum "${ds}.tar.gz" > "${ds}.sha256"
done
2 changes: 1 addition & 1 deletion scripts/funcotator/data_sources/getGencode.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ MAXARGS=0

# Latest release numbers for our references.
# Update these numbers when a new Gencode is released.
LATEST_RELEASE=34
LATEST_RELEASE=43

DATA_SOURCE_NAME="Gencode"
OUT_DIR_NAME='gencode'
Expand Down
4 changes: 2 additions & 2 deletions scripts/funcotator/data_sources/getGencodeXHGNC.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,10 @@ outFileBaseName="gencode_xhgnc"
outExt=".tsv"

hg19db="homo_sapiens_core_75_37"
hg38db="homo_sapiens_core_90_38"
hg38db="homo_sapiens_core_110_38"

hg19FileName=${outFileBaseName}_v75_37.hg19${outExt}
hg38FileName=${outFileBaseName}_v90_38.hg38${outExt}
hg38FileName=${outFileBaseName}_v110_38.hg38${outExt}

################################################################################

Expand Down
4 changes: 2 additions & 2 deletions scripts/funcotator/data_sources/getGencodeXRefseq.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,10 @@ outFileBaseName="gencode_xrefseq"
outExt=".tsv"

hg19db="homo_sapiens_core_75_37"
hg38db="homo_sapiens_core_90_38"
hg38db="homo_sapiens_core_110_38"

hg19FileName=${outFileBaseName}_v75_37.hg19${outExt}
hg38FileName=${outFileBaseName}_v90_38.hg38${outExt}
hg38FileName=${outFileBaseName}_v110_38.hg38${outExt}

################################################################################

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,8 @@
* <p>
* To download and extract the data sources, you can invoke {@link FuncotatorDataSourceDownloader} in the following ways:
* <ul>
* <li>For <strong>somatic</strong> data sources:<br /><pre>{@code ./gatk FuncotatorDataSourceDownloader --somatic --validate-integrity --extract-after-download}</pre></li>
* <li>For <strong>germline</strong> data sources:<br /><pre>{@code ./gatk FuncotatorDataSourceDownloader --germline --validate-integrity --extract-after-download}</pre></li>
* <li>For <strong>somatic</strong> data sources:<br /><pre>{@code ./gatk FuncotatorDataSourceDownloader --somatic --validate-integrity --hg38 --extract-after-download}</pre></li>
* <li>For <strong>germline</strong> data sources:<br /><pre>{@code ./gatk FuncotatorDataSourceDownloader --germline --validate-integrity --hg19 --extract-after-download}</pre></li>
* </ul>
* </p>
*
Expand Down Expand Up @@ -63,6 +63,8 @@ public class FuncotatorDataSourceDownloader extends CommandLineProgram {
public static final String GERMLINE_ARG_LONG_NAME = "germline";
public static final String OVERWRITE_ARG_LONG_NAME = "overwrite-output-file";
public static final String EXTRACT_AFTER_DOWNLOAD = "extract-after-download";
public static final String HG38_ARG_LONG_NAME = "hg38";
public static final String HG19_ARG_LONG_NAME = "hg19";

//==================================================================================================================
// Private Static Members:
Expand All @@ -73,18 +75,27 @@ public class FuncotatorDataSourceDownloader extends CommandLineProgram {
// Private Static Members:

// Set to always get the latest version of the data sources:
private static final String BASE_URL = DataSourceUtils.DATA_SOURCES_BUCKET_PATH +
DataSourceUtils.DATA_SOURCES_NAME_PREFIX + "." + DataSourceUtils.getDataSourceMaxVersionString();
private static final String HG38_BASE_URL = DataSourceUtils.DATA_SOURCES_BUCKET_PATH +
DataSourceUtils.DATA_SOURCES_NAME_PREFIX + "." + DataSourceUtils.getDataSourceMaxVersionString(38);
private static final String HG19_BASE_URL = DataSourceUtils.DATA_SOURCES_BUCKET_PATH +
DataSourceUtils.DATA_SOURCES_NAME_PREFIX + "." + DataSourceUtils.getDataSourceMaxVersionString(19);

private static final String HG38_GERMLINE_GCLOUD_DATASOURCES_BASEURL = HG38_BASE_URL + DataSourceUtils.DS_GERMLINE_NAME_MODIFIER;
private static final String HG19_GERMLINE_GCLOUD_DATASOURCES_BASEURL = HG19_BASE_URL + DataSourceUtils.DS_GERMLINE_NAME_MODIFIER;

private static final String GERMLINE_GCLOUD_DATASOURCES_BASEURL = BASE_URL + DataSourceUtils.DS_GERMLINE_NAME_MODIFIER;
@VisibleForTesting
static final Path GERMLINE_GCLOUD_DATASOURCES_PATH = IOUtils.getPath(GERMLINE_GCLOUD_DATASOURCES_BASEURL + DataSourceUtils.DS_EXTENSION);
private static final Path GERMLINE_GCLOUD_DATASOURCES_SHA256_PATH = IOUtils.getPath(GERMLINE_GCLOUD_DATASOURCES_BASEURL + DataSourceUtils.DS_CHECKSUM_EXTENSION);
static final Path HG38_GERMLINE_GCLOUD_DATASOURCES_PATH = IOUtils.getPath(HG38_GERMLINE_GCLOUD_DATASOURCES_BASEURL + DataSourceUtils.DS_EXTENSION);
static final Path HG19_GERMLINE_GCLOUD_DATASOURCES_PATH = IOUtils.getPath(HG19_GERMLINE_GCLOUD_DATASOURCES_BASEURL + DataSourceUtils.DS_EXTENSION);
private static final Path HG38_GERMLINE_GCLOUD_DATASOURCES_SHA256_PATH = IOUtils.getPath(HG38_GERMLINE_GCLOUD_DATASOURCES_BASEURL + DataSourceUtils.DS_CHECKSUM_EXTENSION);
private static final Path HG19_GERMLINE_GCLOUD_DATASOURCES_SHA256_PATH = IOUtils.getPath(HG19_GERMLINE_GCLOUD_DATASOURCES_BASEURL + DataSourceUtils.DS_CHECKSUM_EXTENSION);

public static final String SOMATIC_GCLOUD_DATASOURCES_BASEURL = BASE_URL + DataSourceUtils.DS_SOMATIC_NAME_MODIFIER;;
public static final String HG38_SOMATIC_GCLOUD_DATASOURCES_BASEURL = HG38_BASE_URL + DataSourceUtils.DS_SOMATIC_NAME_MODIFIER;;
public static final String HG19_SOMATIC_GCLOUD_DATASOURCES_BASEURL = HG19_BASE_URL + DataSourceUtils.DS_SOMATIC_NAME_MODIFIER;;

public static final Path SOMATIC_GCLOUD_DATASOURCES_PATH = IOUtils.getPath(SOMATIC_GCLOUD_DATASOURCES_BASEURL + DataSourceUtils.DS_EXTENSION);
private static final Path SOMATIC_GCLOUD_DATASOURCES_SHA256_PATH = IOUtils.getPath(SOMATIC_GCLOUD_DATASOURCES_BASEURL + DataSourceUtils.DS_CHECKSUM_EXTENSION);
public static final Path HG38_SOMATIC_GCLOUD_DATASOURCES_PATH = IOUtils.getPath(HG38_SOMATIC_GCLOUD_DATASOURCES_BASEURL + DataSourceUtils.DS_EXTENSION);
public static final Path HG19_SOMATIC_GCLOUD_DATASOURCES_PATH = IOUtils.getPath(HG19_SOMATIC_GCLOUD_DATASOURCES_BASEURL + DataSourceUtils.DS_EXTENSION);
private static final Path HG38_SOMATIC_GCLOUD_DATASOURCES_SHA256_PATH = IOUtils.getPath(HG38_SOMATIC_GCLOUD_DATASOURCES_BASEURL + DataSourceUtils.DS_CHECKSUM_EXTENSION);
private static final Path HG19_SOMATIC_GCLOUD_DATASOURCES_SHA256_PATH = IOUtils.getPath(HG19_SOMATIC_GCLOUD_DATASOURCES_BASEURL + DataSourceUtils.DS_CHECKSUM_EXTENSION);

//==================================================================================================================
// Private Members:
Expand Down Expand Up @@ -129,6 +140,23 @@ public class FuncotatorDataSourceDownloader extends CommandLineProgram {
optional = true)
protected boolean extractDataSourcesAfterDownload = false;

@Argument(
shortName = HG38_ARG_LONG_NAME,
fullName = HG38_ARG_LONG_NAME,
mutex = {HG19_ARG_LONG_NAME, TESTING_OVERRIDE_PATH_FOR_DATA_SOURCES_SHA256_ARG},
doc = "If set, will extract data from the HG38 data sources bucket.",
optional = true)
protected boolean getHg38Datasources = false;

@Argument(
//TODO should these be MUTEX or should one be allowed to download either?
shortName = HG19_ARG_LONG_NAME,
fullName = HG19_ARG_LONG_NAME,
mutex = {HG38_ARG_LONG_NAME, TESTING_OVERRIDE_PATH_FOR_DATA_SOURCES_SHA256_ARG},
doc = "If set, will extract data from the HG19 data sources bucket.",
optional = true)
protected boolean getHg19Datasources = false;

// Testing arguments:
@Hidden
@Advanced
Expand Down Expand Up @@ -164,6 +192,11 @@ protected void onStartup() {
throw new UserException("Must select either somatic or germline datasources.");
}

// Make sure the user specified at least one reference source to download:
if ((!getHg38Datasources) && (!getHg19Datasources) && (testingOverrideDataSourcesPath == null)) {
throw new UserException("Must select either HG19 or HG38 datasources.");
}

// Make sure the testing inputs are correct:
if ( ((testingOverrideDataSourcesPath == null) && (testingOverrideDataSourcesSha256Path != null)) ||
((testingOverrideDataSourcesSha256Path == null) && (testingOverrideDataSourcesPath != null)) ) {
Expand All @@ -184,14 +217,26 @@ protected Object doWork() {

// Get the correct data source:
if ( getSomaticDataSources ) {
dataSourceDescription = "Somatic";
dataSourcesPath = SOMATIC_GCLOUD_DATASOURCES_PATH;
dataSourcesSha256Path = SOMATIC_GCLOUD_DATASOURCES_SHA256_PATH;
if (getHg38Datasources) {
dataSourceDescription = "HG38_Somatic";
dataSourcesPath = HG38_SOMATIC_GCLOUD_DATASOURCES_PATH;
dataSourcesSha256Path = HG38_SOMATIC_GCLOUD_DATASOURCES_SHA256_PATH;
} else { // Okay because HG38 and HG19 datasources are currently MUTEX and at least one is required
dataSourceDescription = "HG19_Somatic";
dataSourcesPath = HG19_SOMATIC_GCLOUD_DATASOURCES_PATH;
dataSourcesSha256Path = HG19_SOMATIC_GCLOUD_DATASOURCES_SHA256_PATH;
}
}
else if ( getGermlineDataSources ) {
dataSourceDescription = "Germline";
dataSourcesPath = GERMLINE_GCLOUD_DATASOURCES_PATH;
dataSourcesSha256Path = GERMLINE_GCLOUD_DATASOURCES_SHA256_PATH;
if (getHg38Datasources) {
dataSourceDescription = "HG38_Germline";
dataSourcesPath = HG38_GERMLINE_GCLOUD_DATASOURCES_PATH;
dataSourcesSha256Path = HG38_GERMLINE_GCLOUD_DATASOURCES_SHA256_PATH;
} else {
dataSourceDescription = "HG19_Germline";
dataSourcesPath = HG19_GERMLINE_GCLOUD_DATASOURCES_PATH;
dataSourcesSha256Path = HG19_GERMLINE_GCLOUD_DATASOURCES_SHA256_PATH;
}
}
else {
// Test case:
Expand Down
Loading

0 comments on commit c25782f

Please sign in to comment.