diff --git a/data/Makefile b/data/Makefile index 31befda..2c0e7fd 100644 --- a/data/Makefile +++ b/data/Makefile @@ -155,7 +155,7 @@ $(TMP_DIR)/ensembl_transcript_info.txt: $(TMP_DIR)/$(SPECIES).gff3.gz python ../scripts/transform_gff_to_tsv_for_exon_info_from_ensembl.py $^ $@ # Add HGNC symbols, exons, UTRs, PFAM domains and Uniprot id to Ensembl Transcript -$(TMP_DIR)/ensembl_biomart_transcripts.json.gz: $(TMP_DIR)/ensembl_biomart_transcripts.txt $(TMP_DIR)/ensembl_transcript_info.txt $(VERSION)/input/ensembl_biomart_pfam.txt $(VERSION)/input/ensembl_biomart_refseq.txt $(VERSION)/input/ensembl_biomart_ccds.txt uniprot/export/$(VERSION)_enst_to_uniprot_mapping_id.txt common_input/isoform_overrides_uniprot.txt common_input/$(MSKCC_ISOFORM_OVERRIDES_FILE_NAME) common_input/hgnc_complete_set_2023-12-26.txt +$(TMP_DIR)/ensembl_biomart_transcripts.json.gz: $(TMP_DIR)/ensembl_biomart_transcripts.txt $(TMP_DIR)/ensembl_transcript_info.txt $(VERSION)/input/ensembl_biomart_pfam.txt $(VERSION)/input/ensembl_biomart_refseq.txt $(VERSION)/input/ensembl_biomart_ccds.txt uniprot/export/$(VERSION)_enst_to_uniprot_mapping_id.txt common_input/isoform_overrides_uniprot.txt common_input/$(MSKCC_ISOFORM_OVERRIDES_FILE_NAME) common_input/hgnc_complete_set_2023-10.txt python ../scripts/add_domains_hugo_ccds_refseq_exon_info_uniprot_to_ensembl_transcript.py $^ $@ # for mouse a specific recipe without overrides @@ -165,9 +165,9 @@ $(TMP_DIR)/ensembl_biomart_transcripts_mouse.json.gz: $(TMP_DIR)/ensembl_biomart # give default/canonical geneid/transcript based on given hugo symbol takes # about 50m to run (TODO: this can be easily optimized) # isoform_overrides_genome_nexus.txt is made for genome nexus, others files are generated for vcf2maf -# Please note: we should keep hgnc_complete_set_2023-12-26 in sync with https://github.com/cBioPortal/datahub-study-curation-tools/blob/master/gene-table-update/build-input-for-importer/hgnc_complete_set.txt +# Please note: we should keep hgnc_complete_set_2023-10 in sync with https://github.com/cBioPortal/datahub-study-curation-tools/blob/master/gene-table-update/build-input-for-importer/hgnc_complete_set.txt # isoform_overrides_oncokb_grch3*.txt is a list of OncoKB transcripts and genes, it's generated by download_oncokb_isoform_overrides.py -$(TMP_DIR)/ensembl_biomart_canonical_transcripts_per_hgnc.txt: $(TMP_DIR)/ensembl_canonical_data.txt common_input/hgnc_complete_set_2023-12-26.txt common_input/isoform_overrides_uniprot.txt common_input/$(MSKCC_ISOFORM_OVERRIDES_FILE_NAME) common_input/$(GENOME_NEXUS_ISOFORM_OVERRIDES_FILE_NAME) common_input/$(ONCOKB_ISOFORM_OVERRIDES_FILE_NAME) common_input/ignored_genes.txt +$(TMP_DIR)/ensembl_biomart_canonical_transcripts_per_hgnc.txt: $(TMP_DIR)/ensembl_canonical_data.txt common_input/hgnc_complete_set_2023-10.txt common_input/isoform_overrides_uniprot.txt common_input/$(MSKCC_ISOFORM_OVERRIDES_FILE_NAME) common_input/$(GENOME_NEXUS_ISOFORM_OVERRIDES_FILE_NAME) common_input/$(ONCOKB_ISOFORM_OVERRIDES_FILE_NAME) common_input/ignored_genes.txt python ../scripts/make_one_canonical_transcript_per_gene.py $^ $@ # mouse version. A different script is called that set the canonicals based on Ensembl lookup. diff --git a/data/common_input/hgnc_complete_set_2023-12-26.txt b/data/common_input/hgnc_complete_set_2023-10.txt similarity index 100% rename from data/common_input/hgnc_complete_set_2023-12-26.txt rename to data/common_input/hgnc_complete_set_2023-10.txt diff --git a/data/common_input/version_info.txt b/data/common_input/version_info.txt index abbb272..3bf01aa 100644 --- a/data/common_input/version_info.txt +++ b/data/common_input/version_info.txt @@ -1,8 +1,8 @@ name version type id genome_build description url VEP grch37 mirrored vep grch37 VEP determines the effect of your variants(SNPs, insertions, deletions, CNVs or structural variants) on genes, transcripts, and protein sequence, as well as regulatory regions. https://grch37.ensembl.org/info/docs/tools/vep/index.html VEP grch38 mirrored vep grch38 VEP determines the effect of your variants(SNPs, insertions, deletions, CNVs or structural variants) on genes, transcripts, and protein sequence, as well as regulatory regions. https://useast.ensembl.org/info/docs/tools/vep/index.html -HGNC 2023-12 mirrored hgnc grch37 The resource for approved human gene nomenclature. Genome Nexus uses HGNC gene symbols in annotation http://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/archive/monthly/tsv/ -HGNC 2023-12 mirrored hgnc grch38 The resource for approved human gene nomenclature. Genome Nexus uses HGNC gene symbols in annotation http://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/archive/monthly/tsv/ +HGNC 2023-10 mirrored hgnc grch37 The resource for approved human gene nomenclature. Genome Nexus uses HGNC gene symbols in annotation http://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/archive/monthly/tsv/ +HGNC 2023-10 mirrored hgnc grch38 The resource for approved human gene nomenclature. Genome Nexus uses HGNC gene symbols in annotation http://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/archive/monthly/tsv/ Cancer Hotspots v2 mirrored cancer_hotspots grch37 A resource for statistically significant mutations in cancer https://www.cancerhotspots.org Cancer Hotspots v2 mirrored cancer_hotspots grch38 A resource for statistically significant mutations in cancer https://www.cancerhotspots.org 3D Hotspots v2 mirrored 3d_hotspots grch37 A resource for statistically significant mutations clustering in 3d protein structures in cancer https://www.3dhotspots.org/ diff --git a/scripts/add_domains_hugo_ccds_refseq_exon_info_uniprot_to_ensembl_transcript.py b/scripts/add_domains_hugo_ccds_refseq_exon_info_uniprot_to_ensembl_transcript.py index ebdac6d..dd1906e 100755 --- a/scripts/add_domains_hugo_ccds_refseq_exon_info_uniprot_to_ensembl_transcript.py +++ b/scripts/add_domains_hugo_ccds_refseq_exon_info_uniprot_to_ensembl_transcript.py @@ -246,7 +246,7 @@ def main(ensembl_biomart_transcripts, help="common_input/isoform_overrides_uniprot.txt") parser.add_argument("vcf2maf_isoform_overrides_mskcc", help="common_input/isoform_overrides_at_mskcc_grch37.txt or common_input/isoform_overrides_at_mskcc_grch38.txt") - parser.add_argument("hgnc_symbol_set", help="common_input/hgnc_complete_set_2023-12-26.txt") + parser.add_argument("hgnc_symbol_set", help="common_input/hgnc_complete_set_2023-10.txt") parser.add_argument("ensembl_biomart_transcripts_json", help="tmp/ensembl_biomart_transcripts.json.gz") diff --git a/scripts/make_one_canonical_transcript_per_gene.py b/scripts/make_one_canonical_transcript_per_gene.py index 858aab8..e79bef5 100644 --- a/scripts/make_one_canonical_transcript_per_gene.py +++ b/scripts/make_one_canonical_transcript_per_gene.py @@ -224,7 +224,7 @@ def main(ensembl_biomart_geneids_transcript_info, parser.add_argument("ensembl_biomart_geneids_transcript_info", help="tmp/ensembl_biomart_geneids.transcript_info.txt") parser.add_argument("hgnc_complete_set", - help="common_input/hgnc_complete_set_2023-12-26.txt") + help="common_input/hgnc_complete_set_2023-10.txt") parser.add_argument("isoform_overrides_uniprot", help="common_input/isoform_overrides_uniprot.txt") parser.add_argument("isoform_overrides_at_mskcc",