From 1ef34cf26a50389fdd881f35585df8b993c9ca68 Mon Sep 17 00:00:00 2001 From: matthijspon Date: Fri, 2 Dec 2022 12:56:36 +0100 Subject: [PATCH] implement script for mutational signature extraction --- .../mutational_signatures_annotation.csv | 139 +++++ core/src/main/scripts/mutationalSignatures.py | 562 ++++++++++++++++++ 2 files changed, 701 insertions(+) create mode 100644 core/src/main/resources/mutational_signatures_annotation.csv create mode 100644 core/src/main/scripts/mutationalSignatures.py diff --git a/core/src/main/resources/mutational_signatures_annotation.csv b/core/src/main/resources/mutational_signatures_annotation.csv new file mode 100644 index 00000000000..7bc2a9b83ce --- /dev/null +++ b/core/src/main/resources/mutational_signatures_annotation.csv @@ -0,0 +1,139 @@ +SIGNATURE,NAME,DESCRIPTION,URL +DBS1,DBS1 (UV),Exposure to ultraviolet light; associated with SBS7a/SBS7b and also with ID13 which predominantly generates single base thymine deletions at TT dinucleotides,https://cancer.sanger.ac.uk/signatures/dbs/dbs1 +DBS2,DBS2 (Smoking),"Exposure to tobacco smoking as well as other endogenous and/or exogenous mutagens (e.g., acetaldehyde); in addition to its presence in tobacco smoking induced cancers, DBS2 is also found in many cancer types unrelated to tobacco smoking; associated with SBS4 and ID3",https://cancer.sanger.ac.uk/signatures/dbs/dbs2 +DBS3,DBS3 (POLE),Polymerase epsilon exonuclease domain mutations; associated with SBS10a/SBS10b,https://cancer.sanger.ac.uk/signatures/dbs/dbs3 +DBS4,DBS4 (Unknown),Unknown etiology; its mutation burden correlates with the age of cancer diagnosis and this clock-like feature suggests that it is generated in normal human cells,https://cancer.sanger.ac.uk/signatures/dbs/dbs4 +DBS5,DBS5 (Platinum),Prior chemotherapy treatment with platinum drugs; associated with SBS31 and SBS35,https://cancer.sanger.ac.uk/signatures/dbs/dbs5 +DBS6,DBS6 (Unknown),Unknown etiology,https://cancer.sanger.ac.uk/signatures/dbs/dbs6 
+DBS7,DBS7 (MMR),"Defective DNA mismatch repair (MMR); associated with large numbers of ID1 and ID2 mutations and in certain cancers with single base substitution signatures associated with defective DNA mismatch repair including SBS6, SBS14, SBS15, SBS20, SBS21, SBS26 and SBS44",https://cancer.sanger.ac.uk/signatures/dbs/dbs7 +DBS8,DBS8 (Unknown),"Unknown; found in rare hypermutated cancers with large numbers of single base substitutions similar to SBS1 and SBS5, but not apparently DNA mismatch repair defective or polymerase epsilon mutated; associated with SBS89",https://cancer.sanger.ac.uk/signatures/dbs/dbs8 +DBS9,DBS9 (Unknown),Unknown etiology,https://cancer.sanger.ac.uk/signatures/dbs/dbs9 +DBS10,DBS10 (MMR),"Defective DNA mismatch repair (MMR); associated with some of the single base substitution mutational signatures present in DNA mismatch repair deficient cancers SBS6, SBS14, SBS15, SBS20, SBS21, SBS26, and SBS44, and with large numbers of ID1 and ID2 mutations",https://cancer.sanger.ac.uk/signatures/dbs/dbs10 +DBS11,DBS11 (Unknown),"Unknown, possibly related to APOBEC mutagenesis; found in samples with large numbers of SBS2 and SBS13 mutations",https://cancer.sanger.ac.uk/signatures/dbs/dbs11 +ID1,ID1 (Slippage),"Slippage during DNA replication of the replicated DNA strand; tends to be highly elevated in cancer samples with defective DNA mismatch repair and microsatellite instability (MSI); associated with SBS6, SBS14, SBS15, SBS20, SBS21, SBS26, and/or SBS44 and with SBS1 in non-hypermutated samples",https://cancer.sanger.ac.uk/signatures/id/id1 +ID2,ID2 (Slippage),"Slippage during DNA replication of the template DNA strand; tends to be highly elevated in cancer samples with defective DNA mismatch repair and microsatellite instability (MSI); associated with SBS6, SBS14, SBS15, SBS20, SBS21, SBS26, and/or SBS44 and with SBS1 in non-hypermutated samples",https://cancer.sanger.ac.uk/signatures/id/id2 +ID3,ID3 (Smoking),Associated with tobacco smoking; 
associated with SBS4 and DBS2,https://cancer.sanger.ac.uk/signatures/id/id3 +ID4,ID4 (Unknown),Unknown; shows large numbers of mutations in a subset of samples which do not obviously have the mutational features of defective DNA mismatch repair,https://cancer.sanger.ac.uk/signatures/id/id4 +ID5,ID5 (Unknown),Unknown; the mutation burden is correlated with the age of cancer diagnosis and this clock-like behaviour suggests that ID5 mutations may accumulate in normal cells; associated with SBS40,https://cancer.sanger.ac.uk/signatures/id/id5 +ID6,ID6 (HRD),"homologous recombination-based DNA damage repair defects (HRD), often due to inactivating BRCA1 or BRCA2 mutations, leading to non-homologous DNA end-joining activity; associated with SBS3",https://cancer.sanger.ac.uk/signatures/id/id6 +ID7,ID7 (MMR),Defective DNA mismatch repair; associated with ID1 and ID2 mutations,https://cancer.sanger.ac.uk/signatures/id/id7 +ID8,ID8 (Unknown),Appears to be caused by at least two underlying mechanisms: the features of ID8 mutations have some similarities to those of radiation induced mutations and the small number of tumors with the somatic p.K743N mutation have a form of ID8 that shows evidence of transcription-associated damage,https://cancer.sanger.ac.uk/signatures/id/id8 +ID9,ID9 (Unknown),Unknown,https://cancer.sanger.ac.uk/signatures/id/id9 +ID10,ID10 (Unknown),Unknown,https://cancer.sanger.ac.uk/signatures/id/id10 +ID11,ID11 (Unknown),Unknown; occasional samples show large numbers of ID11 mutations without obvious evidence of defective DNA mismatch repair,https://cancer.sanger.ac.uk/signatures/id/id11 +ID12,ID12 (Unknown),Unknown,https://cancer.sanger.ac.uk/signatures/id/id12 +ID13,ID13 (UV),"Found in cancers of the skin from sun exposed areas, therefore, this signature is likely to be due to exposure to ultraviolet light; associated with DBS1, SBS7a and SBS7b",https://cancer.sanger.ac.uk/signatures/id/id13 +ID14,ID14 (Unknown),Unknown; generates large numbers of indels 
in a small number of samples without obvious evidence of defective DNA mismatch repair,https://cancer.sanger.ac.uk/signatures/id/id14 +ID15,ID15 (Unknown),Unknown; generates a large number of indels in a small number of cancers without obvious evidence of defective DNA mismatch repair,https://cancer.sanger.ac.uk/signatures/id/id15 +ID16,ID16 (Unknown),Unknown; generates high number of indels in a single ovarian cancer without obvious evidence of defective DNA mismatch repair,https://cancer.sanger.ac.uk/signatures/id/id16 +ID17,ID17 (TOP2A),Appears to be caused by mutations in topoisomerase TOP2A; ID17 generates a large number of indels in a small number of cancers without obvious evidence of defective DNA mismatch repair,https://cancer.sanger.ac.uk/signatures/id/id17 +ID18,ID18 (Colibactin),"Exposure to E.coli bacteria carrying pks pathogenicity island, producing genotoxic compound colibactin; associated with SBS88",https://cancer.sanger.ac.uk/signatures/id/id18 +SBS1,SBS1 (Age),Number of mutations correlated with age; associated with endogenous process initiated by deamination of 5-methylcytosine to thymine generating C>T substitutions; closely correlated with SBS5 in many tumor types,https://cancer.sanger.ac.uk/signatures/sbs/sbs1 +SBS2,SBS2 (APOBEC),Attributed to activity of AID/APOBEC family of cytidine deaminases; commonly found in some cancers with local hypermutation (kataegis); associated with SBS13 and DBS11,https://cancer.sanger.ac.uk/signatures/sbs/sbs2 +SBS3,SBS3 (HRD),"Attributed to homologous recombination defects (HRD) in DNA damage response, most often found with germline and somatic BRCA mutations in breast, pancreatic, ovarian cancers, and in pancreatic tumors responsive to platinum therapy; associated with ID6",https://cancer.sanger.ac.uk/signatures/sbs/sbs3 +SBS4,SBS4 (Smoking),Associated with tobacco smoking likely due to direct DNA damage by tobacco smoke mutagens; frequently found in lung and head & neck cancers; associated with 
DBS2,https://cancer.sanger.ac.uk/signatures/sbs/sbs4 +SBS5,SBS5 (Unknown),Unknown etiology but is clock-like with the number of mutations in most cancers and normal cells correlated with the age,https://cancer.sanger.ac.uk/signatures/sbs/sbs5 +SBS6,SBS6 (MMR/MSI),"Associated with defective DNA mismatch repair (MMR) and is found in microsatellite unstable (MSI) tumors; one of 7 signatures associated with MMR/MSI along with SBS14, SBS15, SBS20, SBS21, SBS26, and SBS44; also associated with large numbers of ID1 and ID2 ",https://cancer.sanger.ac.uk/signatures/sbs/sbs6 +SBS7a,SBS7a (UV),"Attributed to ultraviolet light (UV) exposure and found in cancers of the skin; one of 4 signatures associated with UV exposure (SBS7b, SBS7c, and SBS7d); also associated with DBS1 and ID13",https://cancer.sanger.ac.uk/signatures/sbs/sbs7a +SBS7b,SBS7b (UV),"Attributed to ultraviolet light (UV) exposure and found in cancers of the skin; one of 4 signatures associated with UV exposure (SBS7a, SBS7c, and SBS7d); also associated with DBS1 and ID13",https://cancer.sanger.ac.uk/signatures/sbs/sbs7b +SBS7c,SBS7c (UV),"Attributed to ultraviolet light (UV) exposure and found in cancers of the skin; one of 4 signatures associated with UV exposure (SBS7a, SBS7b, and SBS7d)",https://cancer.sanger.ac.uk/signatures/sbs/sbs7c +SBS7d,SBS7d (UV),"Attributed to ultraviolet light (UV) exposure and found in cancers of the skin; one of 4 signatures associated with UV exposure (SBS7a, SBS7b, and SBS7c)",https://cancer.sanger.ac.uk/signatures/sbs/sbs7d +SBS8,SBS8 (Unknown),Unknown etiology,https://cancer.sanger.ac.uk/signatures/sbs/sbs8 +SBS9,SBS9 (POL-eta),May be due in part to mutations induced during replication by polymerase eta as part of somatic hypermutation in lymphoid cells; CLLs possessing immunoglobulin gene (IGHV) hypermutation have higher levels than those without,https://cancer.sanger.ac.uk/signatures/sbs/sbs9 +SBS10a,SBS10a (POLE),"Attributed to polymerase epsilon exonuclease domain 
mutations; usually found in hypermutated samples (>100 per Mb); associated with SBS10b, SBS28 and DBS3",https://cancer.sanger.ac.uk/signatures/sbs/sbs10a +SBS10b,SBS10b (POLE),"Attributed to polymerase epsilon exonuclease domain mutations; usually found in hypermutated samples (>100 per Mb); associated with SBS10a, SBS28 and DBS3",https://cancer.sanger.ac.uk/signatures/sbs/sbs10b +SBS10c,SBS10c (POLD1),Defective POLD1 proofreading; strong lagging replication strand bias; associated with ID1,https://cancer.sanger.ac.uk/signatures/sbs/sbs10c +SBS10d,SBS10d (POLD1),Adenoma from individuals with germline POLD1 exonuclease domain mutations; strong lagging replication strand bias; associated with SBS10c and ID1,https://cancer.sanger.ac.uk/signatures/sbs/sbs10d +SBS11,SBS11 (TMZ),Exhibits a mutational pattern resembling that of alkylating agents and associated with patient histories of temozolomide (TMZ); usually generates large number of mutations (>10 per Mb),https://cancer.sanger.ac.uk/signatures/sbs/sbs11 +SBS12,SBS12 (Unknown),Etiology unknown; usually contributes a small percentage (<20%) of mutations in liver cancer,https://cancer.sanger.ac.uk/signatures/sbs/sbs12 +SBS13,SBS13 (APOBEC),Attributed to activity of the AID/APOBEC family of cytidine deaminases; usually found with SBS2 and in samples with local hypermutation (kataegis); associated with DBS11,https://cancer.sanger.ac.uk/signatures/sbs/sbs13 +SBS14,SBS14 (MMR/MSI),"Associated with concurrent polymerase epsilon mutation, defective DNA mismatch repair (MMR), and microsatellite instability (MSI); present in very high numbers in all samples where it has been observed; one of 7 signatures associated with MMR/MSI (SBS6, SBS15, SBS20, SBS21, SBS26, and SBS44); associated with ID1 and ID2",https://cancer.sanger.ac.uk/signatures/sbs/sbs14 +SBS15,SBS15 (MMR/MSI),"Associated with defective DNA mismatch repair (MMR), and microsatellite instability (MSI); one of 7 signatures associated with MMR/MSI (SBS6, SBS14, SBS20, 
SBS21, SBS26, and SBS44); associated with ID1 and ID2",https://cancer.sanger.ac.uk/signatures/sbs/sbs15 +SBS16,SBS16 (Unknown),Unknown etiology; lower levels of nucleotide excision repair and elevated levels of DNA damage on untranscribed strands may contribute,https://cancer.sanger.ac.uk/signatures/sbs/sbs16 +SBS17a,SBS17a (Unknown),Unknown etiology; commonly found with SBS17b,https://cancer.sanger.ac.uk/signatures/sbs/sbs17a +SBS17b,SBS17b (Unknown),Unknown etiology but has been linked to fluorouracil (5FU) chemotherapy treatment and reactive oxygen species damage; has similarities to SBS28 and these two signatures can be mistaken for one another; commonly found with SBS17a,https://cancer.sanger.ac.uk/signatures/sbs/sbs17b +SBS18,SBS18 (ROS),Possibly linked to damage by reactive oxygen species (ROS); similar in profile to SBS36 associated with defective base excision repair due to MUTYH mutations,https://cancer.sanger.ac.uk/signatures/sbs/sbs18 +SBS19,SBS19 (Unknown),Unknown etiology,https://cancer.sanger.ac.uk/signatures/sbs/sbs19 +SBS20,SBS20 (MMR/MSI),"Associated with concurrent POLD1 mutations and defective DNA mismatch repair (MMR)/microsatellite instability (MSI); one of 7 MMR/MSI signatures: SBS6, SBS14, SBS15, SBS21, SBS26, and SBS44; associated with ID1 and ID2",https://cancer.sanger.ac.uk/signatures/sbs/sbs20 +SBS21,SBS21 (MMR/MSI),"DNA mismatch repair (MMR) and microsatellite instability (MSI); one of 7 MMR/MSI signatures: SBS6, SBS14, SBS15, SBS20, SBS26, and SBS44; associated with ID1 and ID2",https://cancer.sanger.ac.uk/signatures/sbs/sbs21 +SBS22,SBS22 (Aristolochic acid),Associated with aristolochic acid exposure,https://cancer.sanger.ac.uk/signatures/sbs/sbs22 +SBS23,SBS23 (Unknown),Unknown etiology,https://cancer.sanger.ac.uk/signatures/sbs/sbs23 +SBS24,SBS24 (Aflatoxin),Associated with aflatoxin exposure,https://cancer.sanger.ac.uk/signatures/sbs/sbs24 +SBS25,SBS25 (Unknown),Unknown etiology but found in some Hodgkin's cell line samples derived 
from patients exposed to chemotherapy,https://cancer.sanger.ac.uk/signatures/sbs/sbs25 +SBS26,SBS26 (MMR/MSI),"One of seven signatures with defective DNA mismatch repair (MMR) and microsatellite instability (MSI); one of 7 MMR/MSI signatures: SBS6, SBS14, SBS15, SBS20, SBS21, and SBS44; associated with ID1 and ID2",https://cancer.sanger.ac.uk/signatures/sbs/sbs26 +SBS27,SBS27 (Sequencing),Possible sequencing artefact,https://cancer.sanger.ac.uk/signatures/sbs/sbs27 +SBS28,SBS28 (Unknown),Unknown etiology; found in most samples with hypermutator signatures SBS10a/SBS10b but contributes much smaller number of mutations,https://cancer.sanger.ac.uk/signatures/sbs/sbs28 +SBS29,SBS29 (Tobacco chewing),Associated with tobacco chewing habit,https://cancer.sanger.ac.uk/signatures/sbs/sbs29 +SBS30,SBS30 (BER),Attributed to deficiency in base excision repair (BER) due to inactivating mutations in NTHL1,https://cancer.sanger.ac.uk/signatures/sbs/sbs30 +SBS31,SBS31 (Platinum),Associated with prior chemotherapy treatment with platinum drugs; exhibits mutation patterns similar to SBS35; associated with DBS5,https://cancer.sanger.ac.uk/signatures/sbs/sbs31 +SBS32,SBS32 (AZA),Associated with prior treatment with azathioprine (AZA) to induce immunosuppression,https://cancer.sanger.ac.uk/signatures/sbs/sbs32 +SBS33,SBS33 (Unknown),Unknown etiology,https://cancer.sanger.ac.uk/signatures/sbs/sbs33 +SBS34,SBS34 (Unknown),Unknown etiology,https://cancer.sanger.ac.uk/signatures/sbs/sbs34 +SBS35,SBS35 (Platinum),Associated with prior chemotherapy treatment with platinum drugs; exhibits mutation patterns similar to SBS31; associated with DBS5,https://cancer.sanger.ac.uk/signatures/sbs/sbs35 +SBS36,SBS36 (BER),"Associated with defective base excision repair (BER), including DNA damage due to reactive oxygen species and biallelic germline/somatic MUTYH mutations; similar to SBS18",https://cancer.sanger.ac.uk/signatures/sbs/sbs36 +SBS37,SBS37 (Unknown),Unknown 
etiology,https://cancer.sanger.ac.uk/signatures/sbs/sbs37 +SBS38,SBS38 (Unknown),"Unknown etiology; found only in UV-associated melanoma, suggesting indirect damage from UV light",https://cancer.sanger.ac.uk/signatures/sbs/sbs38 +SBS39,SBS39 (Unknown),Unknown etiology,https://cancer.sanger.ac.uk/signatures/sbs/sbs39 +SBS40,SBS40 (Unknown),Unknown etiology; number of mutations correlated with patient's age in some types of cancer,https://cancer.sanger.ac.uk/signatures/sbs/sbs40 +SBS41,SBS41 (Unknown),Unknown etiology,https://cancer.sanger.ac.uk/signatures/sbs/sbs41 +SBS42,SBS42 (Haloalkane),Associated with occupational exposure to haloalkanes,https://cancer.sanger.ac.uk/signatures/sbs/sbs42 +SBS43,SBS43 (Sequencing),Possible sequencing artefact,https://cancer.sanger.ac.uk/signatures/sbs/sbs43 +SBS44,SBS44 (MMR/MSI),"Associated with defective DNA mismatch repair (MMR) and is found in microsatellite unstable (MSI) tumors; one of 7 signatures associated with MMR/MSI along with SBS6, SBS14, SBS15, SBS20, SBS21, and SBS26; associated with ID1 and ID2",https://cancer.sanger.ac.uk/signatures/sbs/sbs44 +SBS45,SBS45 (Sequencing),Possible artefact due to 8-oxo-guanine introduced during sequencing,https://cancer.sanger.ac.uk/signatures/sbs/sbs45 +SBS46,SBS46 (Sequencing),Possible sequencing artefact; commonly found in colorectal cancers from early releases (prior to 2013) of TCGA,https://cancer.sanger.ac.uk/signatures/sbs/sbs46 +SBS47,SBS47 (Sequencing),Possible sequencing artefact; found in cancer samples subsequently blacklisted for poor quality of sequencing data,https://cancer.sanger.ac.uk/signatures/sbs/sbs47 +SBS48,SBS48 (Sequencing),Possible sequencing artefact; found in cancer samples subsequently blacklisted for poor quality of sequencing data,https://cancer.sanger.ac.uk/signatures/sbs/sbs48 +SBS49,SBS49 (Sequencing),Possible sequencing artefact,https://cancer.sanger.ac.uk/signatures/sbs/sbs49 +SBS50,SBS50 (Sequencing),Possible sequencing artefact; found in cancer samples 
subsequently blacklisted for poor quality of sequencing data,https://cancer.sanger.ac.uk/signatures/sbs/sbs50 +SBS51,SBS51 (Sequencing),Possible sequencing artefact,https://cancer.sanger.ac.uk/signatures/sbs/sbs51 +SBS52,SBS52 (Sequencing),Possible sequencing artefact,https://cancer.sanger.ac.uk/signatures/sbs/sbs52 +SBS53,SBS53 (Sequencing),Possible sequencing artefact; found in cancer samples subsequently blacklisted for poor quality of sequencing data,https://cancer.sanger.ac.uk/signatures/sbs/sbs53 +SBS54,SBS54 (Sequencing),Possible sequencing artefact; possible contamination with germline variants,https://cancer.sanger.ac.uk/signatures/sbs/sbs54 +SBS55,SBS55 (Sequencing),Possible sequencing artefact,https://cancer.sanger.ac.uk/signatures/sbs/sbs55 +SBS56,SBS56 (Sequencing),Possible sequencing artefact,https://cancer.sanger.ac.uk/signatures/sbs/sbs56 +SBS57,SBS57 (Sequencing),Possible sequencing artefact,https://cancer.sanger.ac.uk/signatures/sbs/sbs57 +SBS58,SBS58 (Sequencing),Possible sequencing artefact,https://cancer.sanger.ac.uk/signatures/sbs/sbs58 +SBS59,SBS59 (Sequencing),Possible sequencing artefact,https://cancer.sanger.ac.uk/signatures/sbs/sbs59 +SBS60,SBS60 (Sequencing),Possible sequencing artefact,https://cancer.sanger.ac.uk/signatures/sbs/sbs60 +SBS84,SBS84 (AID),Attributed to activity of activation-induced cytidine deaminase (AID); found in clustered mutations in immunoglobulin gene and other regions in lymphoid cancers; associated with SBS85,https://cancer.sanger.ac.uk/signatures/sbs/sbs84 +SBS85,SBS85 (AID),Attributed to indirect effects of activation-induced cytidine deaminase (AID) induced in somatic mutagenesis in lymphoid cells; found in clustered mutations in immunoglobulin gene and other regions in lymphoid cancers; associated with SBS84,https://cancer.sanger.ac.uk/signatures/sbs/sbs85 +SBS86,SBS86 (Unknown),Unknown chemotherapy treatment; found in combination with SBS87 in some relapsed ALL 
patients,https://cancer.sanger.ac.uk/signatures/sbs/sbs86 +SBS87,SBS87 (Thiopurine),"Thiopurine chemotherapy treatment, experimentally validated; found in combination with SBS86 in some relapsed ALL patients; associated with increased CG>NN doublet base substitutions",https://cancer.sanger.ac.uk/signatures/sbs/sbs87 +SBS88,SBS88 (Colibactin),"Exposure to E.coli bacteria carrying pks pathogenicity island, producing genotoxic compound colibactin; appears to be most active in the first decade of life; associated with ID18",https://cancer.sanger.ac.uk/signatures/sbs/sbs88 +SBS89,SBS89 (Unknown),"Unknown; SBS89 correlates with DBS8, suggesting that they are due to the same underlying mutational process",https://cancer.sanger.ac.uk/signatures/sbs/sbs89 +SBS90,SBS90 (Duocarmycin),Duocarmycin exposure,https://cancer.sanger.ac.uk/signatures/sbs/sbs90 +SBS91,SBS91 (Unknown),Unknown; enrichment in ALU elements,https://cancer.sanger.ac.uk/signatures/sbs/sbs91 +SBS92,SBS92 (Smoking),"Associated with tobacco smoking; found increased in bladder tumours; SBS4, ID3 and DBS2 have also been found associated with tobacco smoking",https://cancer.sanger.ac.uk/signatures/sbs/sbs92 +SBS93,SBS93 (Unknown),Unknown etiology,https://cancer.sanger.ac.uk/signatures/sbs/sbs93 +SBS94,SBS94 (Unknown),Unknown etiology,https://cancer.sanger.ac.uk/signatures/sbs/sbs94 +SBS95,SBS95 (Sequencing),Possible sequencing artefact,https://cancer.sanger.ac.uk/signatures/sbs/sbs95 +Signature_1,Signature 1 (Aging),Correlated with age of cancer diagnosis and found in all cancer types and in most cancer samples ,https://cancer.sanger.ac.uk/cosmic/signatures_v2.tt +Signature_2,Signature 2 (APOBEC),Attributed to activity of AID/APOBEC family of cytidine deaminases; usually found along with Signature 13 and in samples with local hypermutation (kataegis),https://cancer.sanger.ac.uk/cosmic/signatures_v2.tt +Signature_3,Signature 3 (HRD),"Associated with defects in homologous recombination-based DNA repair and BRCA 
mutations, especially in breast, ovarian, and pancreatic cancers",https://cancer.sanger.ac.uk/cosmic/signatures_v2.tt +Signature_4,Signature 4 (Smoking),"Associated with smoking in lung, head & neck, liver, and esophageal cancers; exhibits strong transcriptional bias for C>A mutations",https://cancer.sanger.ac.uk/cosmic/signatures_v2.tt +Signature_5,Signature 5 (Unknown),Found in all cancer types and most cancer samples; etiology unknown; exhibits transcriptional strand bias for T>C substitutions at ApTpN context,https://cancer.sanger.ac.uk/cosmic/signatures_v2.tt +Signature_6,Signature 6 (MMR),"Associated with defective DNA mismatch repair (MMR) and found in microsatellite (MSI)-unstable tumors; often accompanied by Signatures 15, 20, and 26; most common in colorectal and uterine cancers",https://cancer.sanger.ac.uk/cosmic/signatures_v2.tt +Signature_7,Signature 7 (UV),Associated with ultraviolet light (UV) exposure in skin cancers; exhibits a strong transcriptional strand bias,https://cancer.sanger.ac.uk/cosmic/signatures_v2.tt +Signature_8,Signature 8 (Unknown),Etiology unknown; found in breast cancer and medulloblastoma; shows a weak strand bias for C>A substitutions,https://cancer.sanger.ac.uk/cosmic/signatures_v2.tt +Signature_9,Signature 9 (IGHV),Attributed to polymerase eta action and the activity of AID during hypermutation in CLL and B-cell lymphomas; IGHV-mutated CLLs show high Signature 9 compared to those without them,https://cancer.sanger.ac.uk/cosmic/signatures_v2.tt +Signature_10,Signature 10 (POLE),Attributed to altered activity of error-prone polymerase epsilon (POLE) in ultra-hypermutated samples; strand bias for C>A mutations at TpCpT and T>G at TpTpT,https://cancer.sanger.ac.uk/cosmic/signatures_v2.tt +Signature_11,Signature 11 (TMZ),Signature 11 has been found in melanoma and glioblastoma. Signature 11 exhibits a mutational pattern resembling that of alkylating agents. 
Patient histories have revealed an association between treatments with the alkylating agent temozolomide and Signature 11 mutations. It exhibits a strong transcriptional strand-bias for C>T substitutions indicating that mutations occur on guanine and that these mutations are effectively repaired by transcription-coupled nucleotide excision repair.,https://cancer.sanger.ac.uk/cosmic/signatures_v2.tt +Signature_12,Signature 12 (Unknown),Signature 12 has been found in liver cancer. The aetiology of Signature 12 remains unknown. It exhibits a strong transcriptional strand-bias for T>C substitutions.,https://cancer.sanger.ac.uk/cosmic/signatures_v2.tt +Signature_13,Signature 13 (APOBEC),"Signature 13 has been found in 22 cancer types and seems to be commonest in cervical and bladder cancers. In most of these 22 cancer types, Signature 13 is present in at least 10% of samples. Signature 13 has been attributed to activity of the AID/APOBEC family of cytidine deaminases converting cytosine to uracil. On the basis of similarities in the sequence context of cytosine mutations caused by APOBEC enzymes in experimental systems, a role for APOBEC1, APOBEC3A and/or APOBEC3B in human cancer appears more likely than for other members of the family. Signature 13 causes predominantly C>G mutations. This may be due to generation of abasic sites after removal of uracil by base excision repair and replication over these abasic sites by REV1.",https://cancer.sanger.ac.uk/cosmic/signatures_v2.tt +Signature_14,Signature 14 (MMR),Signature 14 has been observed in four uterine cancers and a single adult low-grade glioma sample. The aetiology of Signature 14 remains unknown. Signature 14 generates very high numbers of somatic mutations (>200 mutations per MB) in all samples in which it has been observed.,https://cancer.sanger.ac.uk/cosmic/signatures_v2.tt +Signature_15,Signature 15 (MMR),"Signature 15 has been found in several stomach cancers and a single small cell lung carcinoma. 
Signature 15 is associated with defective DNA mismatch repair. Signature 15 is one of four mutational signatures associated with defective DNA mismatch repair and is often found in the same samples as Signatures 6, 20, and 26.",https://cancer.sanger.ac.uk/cosmic/signatures_v2.tt +Signature_16,Signature 16 (Unknown),"Signature 16 has been found in liver cancer. The aetiology of Signature 16 remains unknown. Signature 16 exhibits an extremely strong transcriptional strand bias for T>C mutations at ApTpN context, with T>C mutations occurring almost exclusively on the transcribed strand.",https://cancer.sanger.ac.uk/cosmic/signatures_v2.tt +Signature_17,Signature 17 (Unknown),"Signature 17 has been found in oesophagus cancer, breast cancer, liver cancer, lung adenocarcinoma, B-cell lymphoma, stomach cancer and melanoma. The aetiology of Signature 17 remains unknown.",https://cancer.sanger.ac.uk/cosmic/signatures_v2.tt +Signature_18,Signature 18 (Unknown),"Signature 18 has been found commonly in neuroblastoma. Additionally, Signature 18 has been also observed in breast and stomach carcinomas.",https://cancer.sanger.ac.uk/cosmic/signatures_v2.tt +Signature_19,Signature 19 (Unknown),Signature 19 has been found only in pilocytic astrocytoma. The aetiology of Signature 19 remains unknown.,https://cancer.sanger.ac.uk/cosmic/signatures_v2.tt +Signature_20,Signature 20 (MMR),"Signature 20 has been found in stomach and breast cancers. Signature 20 is believed to be associated with defective DNA mismatch repair. Signature 20 is one of four mutational signatures associated with defective DNA mismatch repair and is often found in the same samples as Signatures 6, 15, and 26.",https://cancer.sanger.ac.uk/cosmic/signatures_v2.tt +Signature_21,Signature 21 (MMR),"Signature 21 has been found only in stomach cancer. The aetiology of Signature 21 remains unknown. Signature 21 is found only in four samples all generated by the same sequencing centre. 
The mutational pattern of Signature 21 is somewhat similar to the one of Signature 26. Additionally, Signature 21 is found only in samples that also have Signatures 15 and 20. As such, Signature 21 is probably also related to microsatellite unstable tumours.",https://cancer.sanger.ac.uk/cosmic/signatures_v2.tt +Signature_22,Signature 22 (Aristolochic acid),"Signature 22 has been found in urothelial (renal pelvis) carcinoma and liver cancers. Signature 22 has been found in cancer samples with known exposures to aristolochic acid. Additionally, the pattern of mutations exhibited by the signature is consistent with the one previously observed in experimental systems exposed to aristolochic acid. Signature 22 exhibits a very strong transcriptional strand bias for T>A mutations indicating adenine damage that is being repaired by transcription-coupled nucleotide excision repair. Signature 22 has a very high mutational burden in urothelial carcinoma; however, its mutational burden is much lower in liver cancers.",https://cancer.sanger.ac.uk/cosmic/signatures_v2.tt +Signature_23,Signature 23 (Unknown),Signature 23 has been found only in a single liver cancer sample. The aetiology of Signature 23 remains unknown. Signature 23 exhibits very strong transcriptional strand bias for C>T mutations.,https://cancer.sanger.ac.uk/cosmic/signatures_v2.tt +Signature_24,Signature 24 (Aflatoxin),"Signature 24 has been observed in a subset of liver cancers. Signature 24 has been found in cancer samples with known exposures to aflatoxin. Additionally, the pattern of mutations exhibited by the signature is consistent with that previously observed in experimental systems exposed to aflatoxin. 
It exhibits a very strong transcriptional strand bias for C>A mutations indicating guanine damage that is being repaired by transcription-coupled nucleotide excision repair.",https://cancer.sanger.ac.uk/cosmic/signatures_v2.tt +Signature_25,Signature 25 (Unknown),Signature 25 has been observed in Hodgkin lymphomas. The aetiology of Signature 25 remains unknown. It exhibits transcriptional strand bias for T>A mutations. This signature has only been identified in Hodgkin’s cell lines. Data is not available from primary Hodgkin lymphomas.,https://cancer.sanger.ac.uk/cosmic/signatures_v2.tt +Signature_26,Signature 26 (MMR),"Signature 26 has been found in breast cancer, cervical cancer, stomach cancer and uterine carcinoma. Signature 26 is believed to be associated with defective DNA mismatch repair. Signature 26 is one of four mutational signatures associated with defective DNA mismatch repair and is often found in the same samples as Signatures 6, 15 and 20.",https://cancer.sanger.ac.uk/cosmic/signatures_v2.tt +Signature_27,Signature 27 (Unknown),Signature 27 has been observed in a subset of kidney clear cell carcinomas. The aetiology of Signature 27 remains unknown. It exhibits very strong transcriptional strand bias for T>A mutations. Signature 27 is associated with high numbers of small (shorter than 3bp) insertions and deletions at mono/polynucleotide repeats.,https://cancer.sanger.ac.uk/cosmic/signatures_v2.tt +Signature_28,Signature 28 (Unknown),Signature 28 has been observed in a subset of stomach cancers. The aetiology of Signature 28 remains unknown.,https://cancer.sanger.ac.uk/cosmic/signatures_v2.tt +Signature_29,Signature 29 (Tobacco),Signature 29 has been observed only in gingivo-buccal oral squamous cell carcinoma. Signature 29 has been found in cancer samples from individuals with a tobacco chewing habit. 
Signature 29 exhibits transcriptional strand bias for C>A mutations indicating guanine damage that is most likely repaired by transcription-coupled nucleotide excision repair. Signature 29 is also associated with CC>AA dinucleotide substitutions. The Signature 29 pattern of C>A mutations due to tobacco chewing appears different from the pattern of mutations due to tobacco smoking reflected by Signature 4.,https://cancer.sanger.ac.uk/cosmic/signatures_v2.tt +Signature_30,Signature 30 (Unknown),Signature 30 has been observed in a small subset of breast cancers. The aetiology of Signature 30 remains unknown.,https://cancer.sanger.ac.uk/cosmic/signatures_v2.tt diff --git a/core/src/main/scripts/mutationalSignatures.py b/core/src/main/scripts/mutationalSignatures.py new file mode 100644 index 00000000000..9e302179929 --- /dev/null +++ b/core/src/main/scripts/mutationalSignatures.py @@ -0,0 +1,562 @@ +#!/usr/bin/env python3 + +# +# Copyright (c) 2022 The Hyve B.V. +# This code is licensed under the GNU Affero General Public License (AGPL), +# version 3, or (at your option) any later version. +# + +# +# This file is part of cBioPortal. +# +# cBioPortal is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. +# + +""" +Mutational signatures script. Create mutation matrices and calculate mutational +signatures from mutational data. For usage, run the script with the -h option. 
# Example:
#     python3 mutationalSignatures.py --study-path /data/study1 --out-dir /data/out -t /app/tempoSig.R
#         -s /cosmic/mut_signatures_SBS.txt -d /cosmic/mut_signatures_DBS.txt -i /cosmic/mut_signatures_ID.txt

# imports
import argparse
import glob
import os
import re
import shutil
import subprocess
from typing import Iterable, Optional

import os.path as osp
import pandas as pd

# SigProfilerMatrixGenerator is only needed for genome installation and matrix generation.
# Guarding the import keeps `-h` and argument validation usable when the package is missing;
# the stored error is re-raised (chained) as soon as one of the two functions is actually used.
try:
    from SigProfilerMatrixGenerator import install as genInstall
    from SigProfilerMatrixGenerator.scripts import SigProfilerMatrixGeneratorFunc as matGen
    _SIGPROFILER_IMPORT_ERROR = None
except ImportError as import_error:
    genInstall = None
    matGen = None
    _SIGPROFILER_IMPORT_ERROR = import_error

# Characters that are not allowed in a stable ID and the character each one is replaced by
_STABLE_ID_TRANSLATION = str.maketrans({">": "-", ":": "_", "[": "_", "]": "_"})


def parse_args() -> argparse.Namespace:
    """Parse command line arguments

    :return: parsed arguments
    """
    parser = argparse.ArgumentParser(
        description="Extract mutational signatures from a cBioPortal study/MAF/VCFs and generate output in cBioPortal format.",
    )
    input_group = parser.add_argument_group("input (mutually exclusive)")
    input_excl = input_group.add_mutually_exclusive_group(required=True)
    input_excl.add_argument("--study-path", metavar="./path/", dest="study_path", type=existing_file,
                            help="cBioPortal study path")
    input_excl.add_argument("--maf", metavar="file.maf", type=existing_file,
                            help="MAF file (--study-id required)")
    input_excl.add_argument("--vcf-folder", metavar="./path/", dest="vcf_folder", type=existing_file,
                            help="folder containing VCF files with sampleID as filename "
                                 "(--study-id and --genome required)")
    # REQUIRED ARGUMENTS
    required_group = parser.add_argument_group("required arguments")
    required_group.add_argument("-t", "--temposig-loc", dest="temposig_loc", metavar="temposig.R", type=existing_file,
                                required=True, help="location of 'tempoSig.R' script")
    required_group.add_argument("-o", "--out-dir", metavar="./path/", dest="out_dir", required=True,
                                type=lambda x: existing_file(x, create_dir=True), help="output directory")
    # SIGNATURE FILES
    sigs = parser.add_argument_group("signature files (at least one is required)")
    sigs.add_argument("-s", "--cosmic-sbs-file", dest="sbs_file", metavar="cosmic_SBS.txt",
                      type=existing_file, help="extract single base substitution signatures with the provided file")
    sigs.add_argument("-d", "--cosmic-dbs-file", dest="dbs_file", metavar="cosmic_DBS.txt",
                      type=existing_file, help="extract double base substitution signatures with the provided file")
    sigs.add_argument("-i", "--cosmic-id-file", dest="id_file", metavar="cosmic_ID.txt",
                      type=existing_file, help="extract insertion/deletion signatures with the provided file")
    # OPTIONAL DEPENDING ON INPUT
    optional_group = parser.add_argument_group("optional arguments")
    optional_group.add_argument("--study-id", metavar="studyid", dest="study_id", type=str,
                                help="study ID (used for metafile generation)")
    optional_group.add_argument("--genome", dest="ncbi_build", metavar="{GRCh37,GRCh38,GRCm37,GRCm38}",
                                type=implemented_genome, help="NCBI build")
    # REQUIRED ONCE
    optional_group.add_argument("--install-genome", dest="install_genome", action='store_true',
                                help="install the given genome for matrix generation (only required on first run "
                                     "per genome)")
    # OPTIONAL
    optional_group.add_argument("--tmp-dir", dest="tmp_dir", metavar="./path/", default="./tmp",
                                type=lambda x: existing_file(x, create_dir=True, shouldnt_exist=True),
                                help="temporary directory for files (default: ./tmp)")
    optional_group.add_argument("--seed", metavar="N", type=int,
                                help="seed for reproducibility")
    optional_group.add_argument("-n", "--nperm", metavar="N", type=int, default=1000,
                                help="number of permutations for p-value estimation (default: 1000)")
    optional_group.add_argument("--alt-allele", dest="alt_allele", choices=["Tumor_Seq_Allele1", "Tumor_Seq_Allele2"],
                                help="Manually set alternative allele")
    optional_group.add_argument("--annotate", metavar="annotate.csv", type=existing_file,  # Location relative to repo
                                default=osp.normpath(osp.join(osp.dirname(__file__),
                                                              "../resources/mutational_signatures_annotation.csv")),
                                help="Path to signature annotation file (default: "
                                     "../resources/mutational_signatures_annotation.csv)")
    args = parser.parse_args()
    # Check some conditionally required arguments; passing the parser turns violations into
    # regular usage errors instead of uncaught exceptions
    arg_conditionals(args, parser)
    return args


def existing_file(path, create_dir=False, shouldnt_exist=False):
    """Check if file or directory exists

    :param path: file or directory path
    :type path: str
    :param create_dir: create directory (including missing parent directories) if non-existent
    :param shouldnt_exist: raise error if exists
    :return: file or directory path
    :rtype: str
    :raises argparse.ArgumentTypeError: if the path is missing and may not be created, or if it
        exists while it should not
    """
    if os.path.exists(path):
        if shouldnt_exist:
            raise argparse.ArgumentTypeError(f"{path} should not exist before running the script.")
        return path
    if not create_dir:
        raise argparse.ArgumentTypeError(f"{path} does not exist.")
    # makedirs instead of mkdir: nested paths such as ./out/run1 must work as well
    os.makedirs(path, exist_ok=True)
    return path


def implemented_genome(genome: str) -> str:
    """Check if genome has been implemented. Ambiguous options default to human genome.

    The lookup is case-insensitive, so `GRCm38`, `grcm38` and `HG19` are all accepted.

    :param genome: genome
    :return: NCBI build
    :raises argparse.ArgumentTypeError: if the genome is not implemented
    """
    # Keys are lower case; the input is normalised before the lookup
    ncbi_build_options = {
        "19": "GRCh37", "hg19": "GRCh37", "grch37": "GRCh37", "37": "GRCh37",
        "hg38": "GRCh38", "grch38": "GRCh38", "38": "GRCh38",
        "mm9": "GRCm37", "9": "GRCm37", "grcm37": "GRCm37",
        "mm10": "GRCm38", "10": "GRCm38", "grcm38": "GRCm38",
    }
    ncbi_build = ncbi_build_options.get(str(genome).strip().lower())
    if ncbi_build is None:
        raise argparse.ArgumentTypeError(f"Genome: `{genome}` has not been implemented. "
                                         f"The following genomes are implemented: {set(ncbi_build_options.values())}")
    return ncbi_build


def arg_conditionals(args: argparse.Namespace, parser: Optional[argparse.ArgumentParser] = None):
    """Check if argument conditionals are followed, else raise error

    :param args: argparse parsed arguments
    :param parser: parser used to report the problem as a usage error (exits with status 2);
        when omitted an argparse.ArgumentTypeError is raised instead
    :raises argparse.ArgumentTypeError: if a conditional is violated and no parser is given
    """
    message = None
    if (args.maf or args.vcf_folder) and not args.study_id:
        # If using maf/vcf-folder, study-ID is required
        used_command = "--maf" if args.maf else "--vcf-folder"
        parameters = "--study-id parameter is" if args.maf else "--study-id and --genome arguments are"
        message = f"If using the {used_command} command, the {parameters} required"
    elif args.vcf_folder and not args.ncbi_build:
        # If using vcf-folder, genome is required
        message = "If using the --vcf-folder argument, the --study-id and --genome arguments are required"
    elif not (args.sbs_file or args.dbs_file or args.id_file):
        # At least one of the signature files has to be given
        message = "At least one of the [-s,-d,-i] arguments is required."

    if message is None:
        return
    if parser is not None:
        parser.error(message)
    raise argparse.ArgumentTypeError(message)


def load_yaml(file: Iterable[str]) -> dict:
    """Simple load a yaml file. Only accepts key: value pairs, no multi-line statements

    Blank lines, comment lines and lines without a `:` are skipped.

    :param file: open yaml file (or any iterable of lines)
    :return: yaml key-value pairs
    """
    yaml_cont = {}
    for line in file:
        stripped = line.strip()
        if not stripped or stripped.startswith("#") or ":" not in stripped:
            continue
        key, val = stripped.split(":", 1)
        yaml_cont[key.strip()] = val.strip()
    return yaml_cont


def read_mutations_meta_file(study_path: str) -> dict:
    """Find and read mutation data meta file in cBioPortal study folder

    :param study_path: path to cBioPortal study folder
    :return: contents of mutations meta file
    :raises FileNotFoundError: if no meta file describes the MAF mutation data
    """
    # Sorted for a deterministic result; directories matching `meta*` cannot be opened
    for file in sorted(glob.glob(osp.join(study_path, "meta*"))):
        if not osp.isfile(file):
            continue
        with open(file) as f:
            yaml_cont = load_yaml(f)
        if yaml_cont.get("stable_id") == "mutations" and yaml_cont.get("datatype") == "MAF":
            # Correct file, retrieve associated datafile
            print(f"Mutation meta file found in study: '{file}'")
            return yaml_cont
    raise FileNotFoundError(f"Mutation meta file not found in study '{study_path}'.")


def acquire_maf_study_id(study_path: str) -> tuple:
    """Find meta file in study and retrieve mutations file and study ID

    :param study_path: path to cBioPortal study
    :returns: mutation file path, study ID
    :raises ValueError: if the meta file does not name a data file
    """
    # Find meta files in study
    yaml_cont = read_mutations_meta_file(study_path)
    mut_file = yaml_cont.get("data_filename")
    if not mut_file:
        raise ValueError(f"Mutation meta file in study '{study_path}' does not contain a 'data_filename'.")
    study_id = yaml_cont.get("cancer_study_identifier")
    return osp.join(study_path, mut_file), study_id


def get_ncbi_build(maf_df: pd.DataFrame) -> str:
    """Try to get the ncbi build from a MAF file

    :param maf_df: dataframe containing 'NCBI_Build' column
    :return: ncbi build
    :raises ValueError: if the column holds no build or more than one build
    """
    # Empty cells are not a build of their own and would break string handling
    genome = list(maf_df["NCBI_Build"].dropna().unique())
    if len(genome) != 1:
        raise ValueError(f"There are either no, or multiple ncbi builds present in your MAF file: '{genome}'")
    return implemented_genome(genome[0])


def determine_alternative_allele(maf: pd.DataFrame) -> str:
    """Determine which alternative allele to use from MAF file

    The first row in which one of the tumor alleles differs from the reference allele and
    consists of nucleotides only decides; `Tumor_Seq_Allele1` is checked first within a row.

    :param maf: maf dataframe
    :return: alternative allele to use
    :raises ValueError: if no row allows a decision
    """
    regexp = re.compile(r'^[ATGC]*$')

    def is_alternative(allele, reference) -> bool:
        # Empty cells are read as NaN (float) and can never be the alternative allele
        return isinstance(allele, str) and allele != reference and regexp.fullmatch(allele) is not None

    # Only one good row is needed, so stop at the first decisive one
    for reference, allele1, allele2 in zip(maf["Reference_Allele"],
                                           maf["Tumor_Seq_Allele1"],
                                           maf["Tumor_Seq_Allele2"]):
        if is_alternative(allele1, reference):
            return "Tumor_Seq_Allele1"
        if is_alternative(allele2, reference):
            return "Tumor_Seq_Allele2"
    raise ValueError("Alternative Allele in maf file could not be determined. "
                     "Please use the --alt-allele parameter.")


def preprocess_maf(mut_file: str, tmp_dir: str, args: argparse.Namespace) -> tuple:
    """Preprocess the cBioPortal mutations data file for matrix generation

    :param mut_file: path to mutations file
    :param tmp_dir: path to temp folder
    :param args: parsed argparse.ArgumentParser
    :return: path to preprocessed MAF, NCBI build
    """
    maf = pd.read_csv(mut_file, sep="\t", comment="#", dtype=str)
    # Get NCBI build from MAF file, unless given as argument
    ncbi_build = args.ncbi_build if args.ncbi_build else get_ncbi_build(maf)
    print(f"Using ncbi build: {ncbi_build}.")
    # Determine alt allele
    alt_al = args.alt_allele
    if not alt_al:
        alt_al = determine_alternative_allele(maf)
    print(f"Using '{alt_al}' as alternative allele.")
    # Remap maf file using important parts, as cbioportal MAF format does not conform to
    # GDC MAF file format (column "Consequence" is not placed on the correct location).
    headers = ["Chromosome", "Start_Position", "End_Position", "Reference_Allele", alt_al, "Tumor_Sample_Barcode"]
    maf = maf[headers]
    new_headers = ["NA"] * 16
    # SigProfilerMatrixGenerator selects alternative allele by location, so the header doesn't matter
    for position, header in zip([4, 5, 6, 10, 12, 15], headers):
        new_headers[position] = header
    maf = maf.reindex(new_headers, axis=1)
    # File needs the MAF file extension, otherwise it is not recognized by SigProfilerMatrixGenerator
    outfile = osp.join(tmp_dir, "mutations.maf")
    # Fill empty cells with NA string, otherwise they get parsed incorrectly by SigProfilerMatrixGenerator
    maf.to_csv(outfile, sep="\t", index=False, na_rep="NA")
    return outfile, ncbi_build


def preprocess_vcf(args: argparse.Namespace, tmp_dir):
    """Preprocess vcf files by copying them into the tmp dir

    :param args: argparse parsed arguments
    :param tmp_dir: temporary directory
    """
    for vcf in glob.glob(osp.join(args.vcf_folder, "*.vcf")):
        shutil.copy(vcf, osp.join(tmp_dir, osp.basename(vcf)))


def prepare_input(args: argparse.Namespace) -> tuple:
    """Wrapper function for preparing input files for matrix generation

    :param args: argparse parsed arguments
    :return: ncbi_build, study ID
    """
    # Init variables
    maf_file, study_id, ncbi_build = "", "", ""

    if args.study_path:
        # Get maf file and study ID from cBioPortal study metadata
        maf_file, study_id = acquire_maf_study_id(args.study_path)
    elif args.maf:
        maf_file = args.maf

    if maf_file:
        # Fix order of columns and write the result to the temporary directory
        maf_file, ncbi_build = preprocess_maf(maf_file, args.tmp_dir, args)
    else:
        # No MAF, so VCF as input
        preprocess_vcf(args, args.tmp_dir)

    # Values provided as argument take precedence
    if args.ncbi_build:
        ncbi_build = args.ncbi_build
    if args.study_id:
        study_id = args.study_id

    return ncbi_build, study_id


def _require_sigprofiler():
    """Raise a chained ImportError if SigProfilerMatrixGenerator could not be imported"""
    if _SIGPROFILER_IMPORT_ERROR is not None:
        raise ImportError("SigProfilerMatrixGenerator is required for genome installation and "
                          "matrix generation.") from _SIGPROFILER_IMPORT_ERROR


def genome_install(ncbi_build: str):
    """Install a genome for SigProfilerMatrixGenerator

    :param ncbi_build: ncbi build
    """
    _require_sigprofiler()
    print(f"Installing genome: {ncbi_build} for SigProfilerMatrixGenerator.")
    genInstall.install(ncbi_build)


def run_matrix_generator(ncbi_build: str, input_folder: str):
    """Run SigProfilerMatrixGenerator to generate substitution matrices in input folder

    :param ncbi_build: ncbi build
    :param input_folder: path to folder containing maf/vcf input files
    """
    _require_sigprofiler()
    print("-" * 120 + "\nGenerating nucleotide matrices: ")
    # Name of the project is irrelevant to the user, but the matrix filenames
    # (tmp.SBS96.all, ...) used later on are derived from it
    project = "tmp"
    matGen.SigProfilerMatrixGeneratorFunc(project, ncbi_build, input_folder)


def matrix_stable_id(string: str) -> str:
    """Replace non-stableID characters by - and _ in matrix names

    :param string: matrix name
    :return: stable ID with only allowed characters
    """
    return "mutational_signatures_matrix_" + string.translate(_STABLE_ID_TRANSLATION)


def matrix_to_cbioportal(matrix: str, outfile: str):
    """Convert a matrix to cBioPortal format

    :param matrix: path to input matrix
    :param outfile: path to output file
    """
    data = pd.read_csv(matrix, sep="\t", dtype=str)
    data = data.rename({"MutationType": "NAME"}, axis=1)
    data["ENTITY_STABLE_ID"] = data["NAME"].apply(matrix_stable_id)
    leading = ["ENTITY_STABLE_ID", "NAME"]
    column_order = leading + [x for x in data.columns if x not in leading]
    data[column_order].to_csv(outfile, sep="\t", index=False, na_rep="NA")


def preprocess_temposig(file: str):
    """Remove the first value on the first row of a matrix, tempoSig requires this

    The file is rewritten in place; an empty file is left untouched.

    :param file: path to matrix file
    """
    with open(file, "r") as infile:
        data = infile.readlines()
    if not data:
        return
    data[0] = "\t".join(data[0].split("\t")[1:])
    with open(file, "w") as outfile:
        outfile.writelines(data)


def run_temposig(temposig_loc: str, inf_matrix: str, out_cont: str, out_pval: str, signature_file: str, nperm=1000,
                 seed: Optional[int] = None):
    """Generate mutational signatures using tempoSig

    :param temposig_loc: path to tempoSig.R script
    :param inf_matrix: path to input mutational matrix
    :param out_cont: path to output file for contributions
    :param out_pval: path to output file for p-values
    :param signature_file: location of signature file
    :param nperm: number of permutations for p-value estimation (default: 1000)
    :param seed: random number seed for algorithm (default: None)
    """
    # Argument list without a shell: paths containing spaces or shell metacharacters are safe
    command = [temposig_loc, inf_matrix, out_cont,
               "--pvalue", "--pv.out", out_pval, "--nperm", str(nperm), "--sigfile", signature_file]
    # 0 is a valid seed, so test against None explicitly
    if seed is not None:
        command += ["--seed", str(seed)]
    result = subprocess.run(command)
    if result.returncode != 0:
        # Not fatal: the caller checks whether the output files were created
        print(f"tempoSig exited with non-zero status {result.returncode}.")


def temposig_to_generic_assay(infile: str, outfile: str, annotation_file: str, data_type: str) -> str:
    """Transform tempoSig output to the cBioPortal generic assay format

    :param infile: path to input file
    :param outfile: path to output file
    :param annotation_file: path to annotation file
    :param data_type: data type used to generate entity stable IDs
    :return: path to output file
    """
    # read in TS output
    data = pd.read_csv(infile, sep="\t", dtype=str, comment="#")
    # Number of mutations column is unnecessary
    data = data.drop("Number of Mutations", axis=1)
    data = data.fillna("NA")
    # Upper-case every value column
    for column in data.columns:
        if column != "Sample Name":
            data[column] = data[column].str.upper()
    # transpose: one row per signature, one column per sample
    data = data.set_index("Sample Name").transpose()
    # add extra data
    extra = pd.read_csv(annotation_file, dtype=str).set_index("SIGNATURE")
    data = extra.join(data, how='right')
    # Name the axis explicitly so the signature column is always called 'index'
    data = data.rename_axis("index").reset_index()
    # Generate stable IDs
    data["ENTITY_STABLE_ID"] = data["index"].apply(lambda x: f"mutational_signature_{data_type}_{x}")
    # Output signatures that were not in the annotation file
    unannotated = data["NAME"].isnull()
    sigs = list(data.loc[unannotated, "index"])
    if sigs:
        print(f"The following signatures do not have additional annotation: {sigs}")
        # Fall back to the signature itself as name; select the rows by the signature column,
        # the row labels are plain integers after reset_index
        data.loc[unannotated, "NAME"] = data.loc[unannotated, "index"]
    data = data.drop("index", axis=1)
    data = data.set_index("ENTITY_STABLE_ID")
    data.to_csv(outfile, sep="\t", na_rep="NA")
    return outfile


def calculate_mutational_signatures(tmp_dir: str, out_dir: str, args: argparse.Namespace) -> list:
    """Wrapper function for mutational signatures algorithms and generating meta file

    :param tmp_dir: path to tmp directory
    :param out_dir: path to output directory
    :param args: argparse parsed arguments
    :return: types of matrices for which mutational signatures were created
    """
    matrices_ran = []
    signature_dir = osp.join(tmp_dir, "signature_files")

    print("-" * 120)
    # These substitution matrix filenames are consistent
    for substitution_matrix, matrix_type, cosmic_file in [("output/SBS/tmp.SBS96.all", "SBS", args.sbs_file),
                                                          ("output/DBS/tmp.DBS78.all", "DBS", args.dbs_file),
                                                          ("output/ID/tmp.ID83.all", "ID", args.id_file)]:
        inf_matrix = osp.join(tmp_dir, substitution_matrix)

        # Cannot run without signature file
        if cosmic_file is None:
            print(f"No cosmic signatures file given for matrix type '{matrix_type}'. Skipping...\n")
            continue
        # Make sure the substitution matrix was created
        if not osp.exists(inf_matrix):
            print(f"Matrix type '{matrix_type}' was not created. Skipping...\n")
            continue

        base_cont, base_pval, base_mat = [f"data_mutational_signature_{value}_{matrix_type}.txt"
                                          for value in ["contribution", "pvalue", "matrix"]]
        tmp_cont = osp.join(tmp_dir, base_cont)
        tmp_pval = osp.join(tmp_dir, base_pval)

        # Preprocess mutation matrix; the cBioPortal copy has to be written before the
        # matrix is rewritten in place for tempoSig
        matrix_to_cbioportal(inf_matrix, osp.join(out_dir, base_mat))
        preprocess_temposig(inf_matrix)
        # Preprocess a copy of the signature file, the original stays untouched
        os.makedirs(signature_dir, exist_ok=True)
        tmp_cosmic = osp.join(signature_dir, osp.basename(cosmic_file))
        shutil.copy(cosmic_file, tmp_cosmic)
        preprocess_temposig(tmp_cosmic)

        print(f"Running tempoSig for {matrix_type}...")
        run_temposig(args.temposig_loc, inf_matrix, tmp_cont, tmp_pval,
                     nperm=args.nperm, seed=args.seed, signature_file=tmp_cosmic)
        if not osp.exists(tmp_cont) or not osp.exists(tmp_pval):
            print(f"!! Something went wrong with extracting {matrix_type} signatures. File not created.")
        else:
            # Rewrite tempoSig output to cBioPortal format
            temposig_to_generic_assay(tmp_cont, osp.join(out_dir, base_cont), args.annotate, "contribution")
            temposig_to_generic_assay(tmp_pval, osp.join(out_dir, base_pval), args.annotate, "pvalue")
            matrices_ran.append(matrix_type)
        print("Finished running tempoSig")
    return matrices_ran


def generate_single_meta_file(study_id: str, meta_file: str, file_type: str, algorithm_string: str, out_dir: str,
                              meta_properties: str = "NAME,DESCRIPTION,URL"):
    """Generate a single cBioPortal generic assay meta file

    :param study_id: study ID
    :param meta_file: output filename excl. 'meta_' and filetype extension
    :param file_type: `contribution`, `pvalue` or other value
    :param algorithm_string: string describing the algorithm used
    :param out_dir: path to output directory
    :param meta_properties: generic_entity_meta_properties string (default: 'NAME,DESCRIPTION,URL')
    """
    profile_name = " ".join(meta_file.split("_"))
    out_lines = f"cancer_study_identifier: {study_id}\n" \
                f"genetic_alteration_type: GENERIC_ASSAY\n" \
                f"generic_assay_type: MUTATIONAL_SIGNATURE\n" \
                f"datatype: LIMIT-VALUE\n" \
                f"stable_id: {meta_file}\n" \
                f"profile_name: {profile_name}\n" \
                f"profile_description: profile for {file_type} value of mutational signatures, generated using " \
                f"{algorithm_string}\n" \
                f"data_filename: data_{meta_file}.txt\n" \
                f"show_profile_in_analysis_tab: {str(file_type == 'contribution').lower()}\n" \
                f"generic_entity_meta_properties: {meta_properties}"
    with open(osp.join(out_dir, f"meta_{meta_file}.txt"), "w") as f:
        f.write(out_lines)


def generate_all_meta_files(args: argparse.Namespace, matrices_ran: list, study_id: str, algorithm_string: str):
    """Generate meta files for all files in list

    :param args: argparse arguments
    :param matrices_ran: files to generate meta files for
    :param study_id: study ID
    :param algorithm_string: string describing the algorithm used
    """
    for file_type in ["contribution", "pvalue", "matrix"]:
        # Matrix files only carry a NAME column, the other files carry the full annotation
        meta_properties = "NAME" if file_type == "matrix" else "NAME,DESCRIPTION,URL"
        for matrix in matrices_ran:
            meta_filename = f"mutational_signature_{file_type}_{matrix}"
            print(f"Generating meta file: {args.out_dir}/meta_{meta_filename}.txt...")
            generate_single_meta_file(study_id, meta_filename, file_type, algorithm_string, args.out_dir,
                                      meta_properties=meta_properties)


def main():
    """Main function"""
    # Parse arguments
    args = parse_args()
    # Prepare input files
    ncbi_build, study_id = prepare_input(args)
    # Install genome if asked
    if args.install_genome:
        genome_install(ncbi_build)
    # Create matrix
    run_matrix_generator(ncbi_build, args.tmp_dir)
    # Run desired algorithm (and possible preprocessing) for mutational signatures
    matrices_ran = calculate_mutational_signatures(args.tmp_dir, args.out_dir, args)
    # Create associated meta files
    generate_all_meta_files(args, matrices_ran, study_id, "tempoSig")


if __name__ == "__main__":
    main()