Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Funcotator Update for Datasource Release V1.8 #8512

Merged
merged 13 commits into from
Oct 11, 2023
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ def renderFusionGeneDictEntry(geneKey, fusionGeneDict):
tsvReader = GenericTsvReader(inputFilename)
headers = tsvReader.getFieldNames()
print('Found headers (input): ' + str(headers))
if "Translocation Name" not in headers:
# The fusion export must provide the TRANSLOCATION_NAME column; name that exact
# column in the error so the message matches what is actually being checked.
if "TRANSLOCATION_NAME" not in headers:
    raise NotImplementedError("Could not find TRANSLOCATION_NAME column in the input file.")

outputHeaders = ['gene', 'fusion_genes', 'fusion_id']
Expand All @@ -99,7 +99,7 @@ def renderFusionGeneDictEntry(geneKey, fusionGeneDict):
fusionGeneDict = OrderedDict()
last_i = 0
for i, line in enumerate(tsvReader):
fusion_gene_description = line['Translocation Name']
fusion_gene_description = line['TRANSLOCATION_NAME']

if len(fusion_gene_description.strip()) == 0:
# blank
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ set -e

# COSMIC export that is imported into the SQLite database.
COSMIC_FILE=CosmicCompleteTargetedScreensMutantExport.tsv
# SQLite database file that is created.
OUT_DB_FILE="Cosmic.db"
# Scratch directory handed to SQLite for temporary storage.
# Tilde is not expanded inside double quotes, so the home directory is spelled
# with ${HOME}; "~/tmp" would otherwise be taken as a literal directory named "~".
OUT_TMP_FOLDER="${HOME}/tmp"
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why did you decide to remove the temp dir?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Other way around... I added a tmp folder, and I needed to do so because I was running into space issues on-prem.


################################################################################

Expand All @@ -29,6 +30,10 @@ if [[ $# -gt 1 ]] ; then
OUT_DB_FILE=$2
fi

if [[ $# -gt 2 ]] ; then
OUT_TMP_FOLDER=$3
fi

if [ ! -f ${COSMIC_FILE} ] ; then
echo "ERROR: Given COSMIC file does not exist: ${COSMIC_FILE}" 1>&2
exit 1
Expand All @@ -42,6 +47,7 @@ sqlite3 ${OUT_DB_FILE} <<EOF
.echo on
.mode tabs
.import ${COSMIC_FILE} RawCosmic
-- Point SQLite's temporary storage at the scratch folder. The value is passed as a
-- quoted string literal because a bare filesystem path is not a valid pragma value.
pragma temp_store_directory = '${OUT_TMP_FOLDER}';
CREATE TABLE Cosmic AS SELECT * FROM RawCosmic WHERE ("Mutation AA" != "" OR "Mutation genome position" != "");
DROP TABLE RawCosmic;
UPDATE Cosmic SET "Mutation genome position" = "chr"||"Mutation genome position" WHERE "Mutation genome position" != "";
Expand Down
15 changes: 6 additions & 9 deletions scripts/funcotator/data_sources/cosmic/getCosmicDataSources.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ SCRIPTNAME=$( echo $0 | sed 's#.*/##g' )

################################################################################

version="v84"
version="v98"

EMAIL=""
PASSWORD=""
Expand Down Expand Up @@ -135,15 +135,12 @@ mkdir -vp cosmic/hg19 cosmic/hg38 cosmic_fusion/hg19 cosmic_fusion/hg38 cosmic_t
# Get the data files:

echo "Getting files ... "
lftp --norc -u "${EMAIL}","${PASSWORD}" sftp://sftp-cancer.sanger.ac.uk <<EOF
# Build the HTTP Basic credential ("email:password", base64-encoded).
# printf avoids echo's non-portable -n flag, and tr removes the line breaks that
# some base64 implementations insert into long output, which would corrupt the header.
AUTH_TOKEN=$(printf '%s' "${EMAIL}:${PASSWORD}" | base64 | tr -d '\n')

get cosmic/grch37/cosmic/${version}/CosmicCompleteTargetedScreensMutantExport.tsv.gz -o cosmic/hg19/CosmicCompleteTargetedScreensMutantExport.tsv.gz
get cosmic/grch37/cosmic/${version}/CosmicFusionExport.tsv.gz -o cosmic_fusion/hg19/CosmicFusionExport.tsv.gz
get cosmic/grch38/cosmic/${version}/CosmicCompleteTargetedScreensMutantExport.tsv.gz -o cosmic/hg38/CosmicCompleteTargetedScreensMutantExport.tsv.gz
get cosmic/grch38/cosmic/${version}/CosmicFusionExport.tsv.gz -o cosmic_fusion/hg38/CosmicFusionExport.tsv.gz
bye

EOF
curl -sS "$(curl -H "Authorization: Basic $AUTH_TOKEN" https://cancer.sanger.ac.uk/cosmic/file_download/GRCh37/cosmic/${version}/CosmicCompleteTargetedScreensMutantExport.tsv.gz | jq -r '.url')" -o cosmic/hg19/CosmicCompleteTargetedScreensMutantExport.tsv.gz
curl -sS "$(curl -H "Authorization: Basic $AUTH_TOKEN" https://cancer.sanger.ac.uk/cosmic/file_download/GRCh37/cosmic/${version}/CosmicFusionExport.tsv.gz | jq -r '.url')" -o cosmic_fusion/hg19/CosmicFusionExport.tsv.gz
curl -sS "$(curl -H "Authorization: Basic $AUTH_TOKEN" https://cancer.sanger.ac.uk/cosmic/file_download/GRCh38/cosmic/${version}/CosmicCompleteTargetedScreensMutantExport.tsv.gz | jq -r '.url')" -o cosmic/hg38/CosmicCompleteTargetedScreensMutantExport.tsv.gz
curl -sS "$(curl -H "Authorization: Basic $AUTH_TOKEN" https://cancer.sanger.ac.uk/cosmic/file_download/GRCh38/cosmic/${version}/CosmicFusionExport.tsv.gz | jq -r '.url')" -o cosmic_fusion/hg38/CosmicFusionExport.tsv.gz

echo "Retrieved COSMIC version ${version} on $(date) from sftp-cancer.sanger.ac.uk by: ${SCRIPTNAME}:" > cosmic/metadata.txt
echo "User: ${EMAIL}" >> cosmic/metadata.txt
Expand Down
2 changes: 1 addition & 1 deletion scripts/funcotator/data_sources/downloadHgncDataSource.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@

# Downloads the HGNC data source from the HGNC website.

curl 'https://www.genenames.org/cgi-bin/download?col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_status&col=gd_locus_type&col=gd_locus_group&col=gd_prev_sym&col=gd_prev_name&col=gd_aliases&col=gd_name_aliases&col=gd_pub_chrom_map&col=gd_date_mod&col=gd_date_sym_change&col=gd_date_name_change&col=gd_pub_acc_ids&col=gd_enz_ids&col=gd_pub_eg_id&col=gd_pub_ensembl_id&col=gd_pubmed_ids&col=gd_pub_refseq_ids&col=family.id&col=family.name&col=gd_ccds_ids&col=gd_vega_ids&col=md_eg_id&col=md_mim_id&col=md_refseq_id&col=md_prot_id&col=md_ensembl_id&col=md_ucsc_id&status=Approved&status_opt=2&where=&order_by=gd_app_sym_sort&format=text&limit=&hgnc_dbtag=on&submit=submit' > hgnc_download_$(date +%b%d%Y).tsv
# Fetch the custom HGNC table as tab-separated text into a date-stamped file.
# The URL is single-quoted so the shell passes every '&' through untouched; query
# parameters are separated by a plain '&' (not the HTML entity '&amp;'), and the final
# parameter is the complete 'submit=submit'.
curl 'https://www.genenames.org/cgi-bin/download/custom?col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_status&col=gd_locus_type&col=gd_locus_group&col=gd_prev_sym&col=gd_prev_name&col=gd_aliases&col=gd_name_aliases&col=gd_pub_chrom_map&col=gd_date_mod&col=gd_date_sym_change&col=gd_date_name_change&col=gd_pub_acc_ids&col=gd_enz_ids&col=gd_pub_eg_id&col=gd_pub_ensembl_id&col=gd_pubmed_ids&col=gd_pub_refseq_ids&col=family.id&col=family.name&col=gd_ccds_ids&col=gd_vega_ids&col=md_eg_id&col=md_mim_id&col=md_refseq_id&col=md_prot_id&col=md_ensembl_id&col=md_ucsc_id&status=Approved&status_opt=2&where=&order_by=gd_app_sym_sort&format=text&limit=&hgnc_dbtag=on&submit=submit' > hgnc_download_$(date +%b%d%Y).tsv
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The last characters before the > look to be truncated.

Is it really submit=submi?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I just checked and that link works... ask the HGNC devs? Evidently both `submi` and `submit` work there...


Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#!/usr/bin/env bash

#NOTE: This script has been checked in to aid in the release process for future Funcotator datasource bundles.

jamesemery marked this conversation as resolved.
Show resolved Hide resolved
# Package each Funcotator datasource directory and write its release checksums.
# For every bundle directory listed below, the following files are written
# alongside it:
#   <bundle>.tar.gz            gzip-compressed tarball of the directory
#   <bundle>.dir.long.md5sum   md5sum output for every regular file in the directory
#   <bundle>.dir.md5sum        bare md5 hash of the tarball
#   <bundle>.sha256            sha256sum output for the tarball
BUNDLES=(
    funcotator_dataSources.v1.8.hg38.20230908s
    funcotator_dataSources.v1.8.hg38.20230908g
    funcotator_dataSources.v1.8.hg19.20230908s
    funcotator_dataSources.v1.8.hg19.20230908g
)

echo "Making Tarballs of each Datasource Directory..."

for bundle in "${BUNDLES[@]}" ; do
    tar -zcvf "${bundle}.tar.gz" "${bundle}"
done

echo "Making the various hashfiles for release"

for bundle in "${BUNDLES[@]}" ; do
    # -exec ... {} + hands the file names to md5sum directly, so names containing
    # whitespace or quotes are hashed correctly (a plain `find | xargs` splits them).
    find "${bundle}" -type f -exec md5sum {} + > "${bundle}.dir.long.md5sum"
    # Keep only the hash column so the file holds the bare md5 of the tarball.
    md5sum "${bundle}.tar.gz" | awk '{print $1}' > "${bundle}.dir.md5sum"
    sha256sum "${bundle}.tar.gz" > "${bundle}.sha256"
done
2 changes: 1 addition & 1 deletion scripts/funcotator/data_sources/getGencode.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ MAXARGS=0

# Latest release numbers for our references.
# Update these numbers when a new Gencode is released.
LATEST_RELEASE=34
LATEST_RELEASE=43

DATA_SOURCE_NAME="Gencode"
OUT_DIR_NAME='gencode'
Expand Down
4 changes: 2 additions & 2 deletions scripts/funcotator/data_sources/getGencodeXHGNC.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,10 @@ outFileBaseName="gencode_xhgnc"
outExt=".tsv"

hg19db="homo_sapiens_core_75_37"
hg38db="homo_sapiens_core_90_38"
hg38db="homo_sapiens_core_110_38"

hg19FileName=${outFileBaseName}_v75_37.hg19${outExt}
hg38FileName=${outFileBaseName}_v90_38.hg38${outExt}
hg38FileName=${outFileBaseName}_v110_38.hg38${outExt}

################################################################################

Expand Down
4 changes: 2 additions & 2 deletions scripts/funcotator/data_sources/getGencodeXRefseq.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,10 @@ outFileBaseName="gencode_xrefseq"
outExt=".tsv"

hg19db="homo_sapiens_core_75_37"
hg38db="homo_sapiens_core_90_38"
hg38db="homo_sapiens_core_110_38"

hg19FileName=${outFileBaseName}_v75_37.hg19${outExt}
hg38FileName=${outFileBaseName}_v90_38.hg38${outExt}
hg38FileName=${outFileBaseName}_v110_38.hg38${outExt}

################################################################################

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,8 @@
* <p>
* To download and extract the data sources, you can invoke {@link FuncotatorDataSourceDownloader} in the following ways:
* <ul>
* <li>For <strong>somatic</strong> data sources:<br /><pre>{@code ./gatk FuncotatorDataSourceDownloader --somatic --validate-integrity --extract-after-download}</pre></li>
* <li>For <strong>germline</strong> data sources:<br /><pre>{@code ./gatk FuncotatorDataSourceDownloader --germline --validate-integrity --extract-after-download}</pre></li>
* <li>For <strong>somatic</strong> data sources:<br /><pre>{@code ./gatk FuncotatorDataSourceDownloader --somatic --validate-integrity --hg38 --extract-after-download}</pre></li>
* <li>For <strong>germline</strong> data sources:<br /><pre>{@code ./gatk FuncotatorDataSourceDownloader --germline --validate-integrity --hg19 --extract-after-download}</pre></li>
* </ul>
* </p>
*
Expand Down Expand Up @@ -63,6 +63,8 @@ public class FuncotatorDataSourceDownloader extends CommandLineProgram {
public static final String GERMLINE_ARG_LONG_NAME = "germline";
public static final String OVERWRITE_ARG_LONG_NAME = "overwrite-output-file";
public static final String EXTRACT_AFTER_DOWNLOAD = "extract-after-download";
public static final String HG38_ARG_LONG_NAME = "hg38";
public static final String HG19_ARG_LONG_NAME = "hg19";

//==================================================================================================================
// Private Static Members:
Expand All @@ -73,18 +75,27 @@ public class FuncotatorDataSourceDownloader extends CommandLineProgram {
// Private Static Members:

// Set to always get the latest version of the data sources:
private static final String BASE_URL = DataSourceUtils.DATA_SOURCES_BUCKET_PATH +
DataSourceUtils.DATA_SOURCES_NAME_PREFIX + "." + DataSourceUtils.getDataSourceMaxVersionString();
private static final String HG38_BASE_URL = DataSourceUtils.DATA_SOURCES_BUCKET_PATH +
DataSourceUtils.DATA_SOURCES_NAME_PREFIX + "." + DataSourceUtils.getDataSourceMaxVersionString(38);
private static final String HG19_BASE_URL = DataSourceUtils.DATA_SOURCES_BUCKET_PATH +
DataSourceUtils.DATA_SOURCES_NAME_PREFIX + "." + DataSourceUtils.getDataSourceMaxVersionString(19);

private static final String HG38_GERMLINE_GCLOUD_DATASOURCES_BASEURL = HG38_BASE_URL + DataSourceUtils.DS_GERMLINE_NAME_MODIFIER;
private static final String HG19_GERMLINE_GCLOUD_DATASOURCES_BASEURL = HG19_BASE_URL + DataSourceUtils.DS_GERMLINE_NAME_MODIFIER;

private static final String GERMLINE_GCLOUD_DATASOURCES_BASEURL = BASE_URL + DataSourceUtils.DS_GERMLINE_NAME_MODIFIER;
@VisibleForTesting
static final Path GERMLINE_GCLOUD_DATASOURCES_PATH = IOUtils.getPath(GERMLINE_GCLOUD_DATASOURCES_BASEURL + DataSourceUtils.DS_EXTENSION);
private static final Path GERMLINE_GCLOUD_DATASOURCES_SHA256_PATH = IOUtils.getPath(GERMLINE_GCLOUD_DATASOURCES_BASEURL + DataSourceUtils.DS_CHECKSUM_EXTENSION);
static final Path HG38_GERMLINE_GCLOUD_DATASOURCES_PATH = IOUtils.getPath(HG38_GERMLINE_GCLOUD_DATASOURCES_BASEURL + DataSourceUtils.DS_EXTENSION);
static final Path HG19_GERMLINE_GCLOUD_DATASOURCES_PATH = IOUtils.getPath(HG19_GERMLINE_GCLOUD_DATASOURCES_BASEURL + DataSourceUtils.DS_EXTENSION);
private static final Path HG38_GERMLINE_GCLOUD_DATASOURCES_SHA256_PATH = IOUtils.getPath(HG38_GERMLINE_GCLOUD_DATASOURCES_BASEURL + DataSourceUtils.DS_CHECKSUM_EXTENSION);
private static final Path HG19_GERMLINE_GCLOUD_DATASOURCES_SHA256_PATH = IOUtils.getPath(HG19_GERMLINE_GCLOUD_DATASOURCES_BASEURL + DataSourceUtils.DS_CHECKSUM_EXTENSION);

public static final String SOMATIC_GCLOUD_DATASOURCES_BASEURL = BASE_URL + DataSourceUtils.DS_SOMATIC_NAME_MODIFIER;;
public static final String HG38_SOMATIC_GCLOUD_DATASOURCES_BASEURL = HG38_BASE_URL + DataSourceUtils.DS_SOMATIC_NAME_MODIFIER;;
public static final String HG19_SOMATIC_GCLOUD_DATASOURCES_BASEURL = HG19_BASE_URL + DataSourceUtils.DS_SOMATIC_NAME_MODIFIER;;

public static final Path SOMATIC_GCLOUD_DATASOURCES_PATH = IOUtils.getPath(SOMATIC_GCLOUD_DATASOURCES_BASEURL + DataSourceUtils.DS_EXTENSION);
private static final Path SOMATIC_GCLOUD_DATASOURCES_SHA256_PATH = IOUtils.getPath(SOMATIC_GCLOUD_DATASOURCES_BASEURL + DataSourceUtils.DS_CHECKSUM_EXTENSION);
public static final Path HG38_SOMATIC_GCLOUD_DATASOURCES_PATH = IOUtils.getPath(HG38_SOMATIC_GCLOUD_DATASOURCES_BASEURL + DataSourceUtils.DS_EXTENSION);
public static final Path HG19_SOMATIC_GCLOUD_DATASOURCES_PATH = IOUtils.getPath(HG19_SOMATIC_GCLOUD_DATASOURCES_BASEURL + DataSourceUtils.DS_EXTENSION);
private static final Path HG38_SOMATIC_GCLOUD_DATASOURCES_SHA256_PATH = IOUtils.getPath(HG38_SOMATIC_GCLOUD_DATASOURCES_BASEURL + DataSourceUtils.DS_CHECKSUM_EXTENSION);
private static final Path HG19_SOMATIC_GCLOUD_DATASOURCES_SHA256_PATH = IOUtils.getPath(HG19_SOMATIC_GCLOUD_DATASOURCES_BASEURL + DataSourceUtils.DS_CHECKSUM_EXTENSION);

//==================================================================================================================
// Private Members:
Expand Down Expand Up @@ -129,6 +140,23 @@ public class FuncotatorDataSourceDownloader extends CommandLineProgram {
optional = true)
protected boolean extractDataSourcesAfterDownload = false;

@Argument(
shortName = HG38_ARG_LONG_NAME,
fullName = HG38_ARG_LONG_NAME,
mutex = {HG19_ARG_LONG_NAME, TESTING_OVERRIDE_PATH_FOR_DATA_SOURCES_SHA256_ARG},
doc = "If set, will extract data from the HG38 data sources bucket.",
optional = true)
protected boolean getHg38Datasources = false;

@Argument(
//TODO should these be MUTEX or should one be allowed to download either?
shortName = HG19_ARG_LONG_NAME,
fullName = HG19_ARG_LONG_NAME,
mutex = {HG38_ARG_LONG_NAME, TESTING_OVERRIDE_PATH_FOR_DATA_SOURCES_SHA256_ARG},
doc = "If set, will extract data from the HG19 data sources bucket.",
optional = true)
protected boolean getHg19Datasources = false;

// Testing arguments:
@Hidden
@Advanced
Expand Down Expand Up @@ -164,6 +192,11 @@ protected void onStartup() {
throw new UserException("Must select either somatic or germline datasources.");
}

// Make sure the user specified at least one reference source to download:
if ((!getHg38Datasources) && (!getHg19Datasources) && (testingOverrideDataSourcesPath == null)) {
throw new UserException("Must select either HG19 or HG38 datasources.");
}

// Make sure the testing inputs are correct:
if ( ((testingOverrideDataSourcesPath == null) && (testingOverrideDataSourcesSha256Path != null)) ||
((testingOverrideDataSourcesSha256Path == null) && (testingOverrideDataSourcesPath != null)) ) {
Expand All @@ -184,14 +217,26 @@ protected Object doWork() {

// Get the correct data source:
if ( getSomaticDataSources ) {
dataSourceDescription = "Somatic";
dataSourcesPath = SOMATIC_GCLOUD_DATASOURCES_PATH;
dataSourcesSha256Path = SOMATIC_GCLOUD_DATASOURCES_SHA256_PATH;
if (getHg38Datasources) {
dataSourceDescription = "HG38_Somatic";
dataSourcesPath = HG38_SOMATIC_GCLOUD_DATASOURCES_PATH;
dataSourcesSha256Path = HG38_SOMATIC_GCLOUD_DATASOURCES_SHA256_PATH;
} else { // Okay because HG38 and HG19 datasources are currently MUTEX and at least one is required
dataSourceDescription = "HG19_Somatic";
dataSourcesPath = HG19_SOMATIC_GCLOUD_DATASOURCES_PATH;
dataSourcesSha256Path = HG19_SOMATIC_GCLOUD_DATASOURCES_SHA256_PATH;
}
}
else if ( getGermlineDataSources ) {
dataSourceDescription = "Germline";
dataSourcesPath = GERMLINE_GCLOUD_DATASOURCES_PATH;
dataSourcesSha256Path = GERMLINE_GCLOUD_DATASOURCES_SHA256_PATH;
if (getHg38Datasources) {
dataSourceDescription = "HG38_Germline";
dataSourcesPath = HG38_GERMLINE_GCLOUD_DATASOURCES_PATH;
dataSourcesSha256Path = HG38_GERMLINE_GCLOUD_DATASOURCES_SHA256_PATH;
} else {
dataSourceDescription = "HG19_Germline";
dataSourcesPath = HG19_GERMLINE_GCLOUD_DATASOURCES_PATH;
dataSourcesSha256Path = HG19_GERMLINE_GCLOUD_DATASOURCES_SHA256_PATH;
}
}
else {
// Test case:
Expand Down
Loading