From 3d918d64c1f1deb1ea0acae596c2218a33bbda1e Mon Sep 17 00:00:00 2001
From: matthijspon
Date: Wed, 18 Jan 2023 16:39:33 +0100
Subject: [PATCH] add DBS/ID support, create matrix file in cbio format

---
 core/src/main/scripts/mutationalSignatures.py | 595 ++++++++++++------
 1 file changed, 387 insertions(+), 208 deletions(-)

diff --git a/core/src/main/scripts/mutationalSignatures.py b/core/src/main/scripts/mutationalSignatures.py
index 67f59bfb5b7..dafbb08d9e8 100644
--- a/core/src/main/scripts/mutationalSignatures.py
+++ b/core/src/main/scripts/mutationalSignatures.py
@@ -1,8 +1,31 @@
 #!/usr/bin/env python3
+
+#
+# Copyright (c) 2022 The Hyve B.V.
+# This code is licensed under the GNU Affero General Public License (AGPL),
+# version 3, or (at your option) any later version.
+#
+
+#
+# This file is part of cBioPortal.
+#
+# cBioPortal is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
 """
-Author(s): Matthijs Pon
-Company/Institution: The Hyve
-Email: matthijs@thehyve.nl
+Mutational signatures script. Create mutation matrices and calculate mutational
+signatures from mutational data. For usage, run the script with the -h option.
 """

 # imports
@@ -11,42 +34,80 @@
 import os
 import shutil
 import subprocess
+import re
+
 import pandas as pd
+import os.path as osp

-from rpy2 import robjects
 from SigProfilerMatrixGenerator import install as genInstall
 from SigProfilerMatrixGenerator.scripts import SigProfilerMatrixGeneratorFunc as matGen
+from SigProfilerExtractor.sigpro import sigProfilerExtractor

-
-def parse_args():
+def parse_args() -> argparse.Namespace:
     """Parse command line arguments

-    :return: object, ArgumentParser with parsed objects
+    :return: parsed arguments
     """
+    # TODO WHEN FINISHED: put all the arguments in a logical order
     parser = argparse.ArgumentParser(
         description="Extract mutational signatures from a cBioPortal study and generate output in cBioPortal format.",
     )
     subparsers = parser.add_subparsers(dest="algorithm", help="select algorithm for mutational signature extraction")
-    parser.add_argument("study_path", metavar="study-path",
-                        help="cBioPortal study path")
-    parser.add_argument("out_dir", metavar="out-dir",  # TODO this can be an optional parameter and default can be study path?
-                        help="output directory")
+
+    input_group = parser.add_mutually_exclusive_group(required=True)
+    input_group.add_argument("--study_path", metavar="study-path", type=existing_file,
+                             help="cBioPortal study path")
+    # TODO implement the following options for input files
+    input_group.add_argument("--maf", metavar="maf", type=existing_file,
+                             help="MAF file in cBioPortal format")
+    input_group.add_argument("--vcf", metavar="vcf", type=existing_file,
+                             help="a single VCF file")
+    input_group.add_argument("--vcf_folder", metavar="vcf-folder", type=existing_file,
+                             help="folder containing VCF files")
+
+    parser.add_argument("-o", "--out_dir", metavar="out-dir", type=lambda x: existing_file(x, create_dir=True),
+                        help="output directory", required=True)
+    parser.add_argument("--genome", metavar="{GRCh37, GRCh38, GRCm37, GRCm38}", type=implemented_genome,
+                        help="NCBI build")
     parser.add_argument("--install-genome", dest="install_genome", action='store_true',
-                        help="install the given genome for SigProfilerMatrixGenerator (only required on first run "
+                        help="install the given genome for matrix generation (only required on first run "
                              "per genome)")
     parser.add_argument("--tmp-dir", dest="tmp_dir", metavar="'./tmp/'", default="./tmp",
-                        help="temporary directory for files, directory will be deleted after runtime (default: ./tmp)")
-
-    ts_parser = subparsers.add_parser("tempoSig")
-    ts_parser.add_argument("--temposig", dest="temposig_loc", metavar="'./tempoSig.R'", default=None,
-                           help="location of tempoSig.R script, only necessary if file cannot be automatically found "
-                                "or installed")
-    ts_parser.add_argument("--cosmic", metavar="[2/3]", default=2, type=int,
-                           help="cosmic version to use in tempoSig (default: 2)")
-    ts_parser.add_argument("--nperm", metavar="n", type=int, default=1000,
-                           help="number of permutations for p-value estimation (default: 1000)")
+                        type=lambda x: existing_file(x, create_dir=True, shouldnt_exist=True),
+                        help="temporary directory for files (default: ./tmp)")
+    parser.add_argument("--seed", metavar="N", type=int,
+                        help="seed for algorithm reproducibility")
+    parser.add_argument("--alt-allele", dest="alt_allele", choices=["Tumor_Seq_Allele1", "Tumor_Seq_Allele2"],
+                        help="Manually set alternative allele ('Tumor_Seq_Allele1', 'Tumor_Seq_Allele2')")
+    # TODO WHEN FINISHED: make this relative to the cBioPortal repository and give a better help description
+    parser.add_argument("--transform", default="input/cosmic_transform/mutsig_cbioportal_cosmic_v3.csv",
+                        help="Path to signature transformation file (default: file included in git repo)")
+
+    # TempoSig specific parser
+    ts_parser = subparsers.add_parser("tempoSig")
+    ts_parser.add_argument("-n", "--nperm", metavar="N", type=int, default=1000,
+                           help="number of permutations for p-value estimation (default: 1000)")
+    ts_parser.add_argument("-t", "--temposig-loc", dest="temposig_loc", metavar="'./temposig.R'", required=True,
+                           type=existing_file,
+                           help="location of 'tempoSig.R' script")
+    # Either pick cosmic v2 or provide your own cosmic v3 files
+    ts_group = ts_parser.add_mutually_exclusive_group(required=True)
+    ts_group.add_argument("--cosmic-v2", dest="cosmic_v2", default=False, action="store_true",
+                          help="Use cosmic v2 for signature extraction "
+                               "(only SBS extraction, no additional files needed)")
+    ts_group.add_argument("-s", "--cosmic-SBS-file", dest="SBS_file", metavar="'./cosmic_SBS.txt'",
+                          type=existing_file,
+                          help="extract single base substitution signatures with the provided file")
+    # Additional DBS and ID files
+    ts_parser.add_argument("-d", "--cosmic-DBS-file", dest="DBS_file", metavar="'./cosmic_DBS.txt'",
+                           type=existing_file,
+                           help="extract double base substitution signatures with the provided file")
+    ts_parser.add_argument("-i", "--cosmic-ID-file", dest="ID_file", metavar="'./cosmic_ID.txt'",
+                           type=existing_file,
+                           help="extract INDEL signatures with the provided file")
+
     # This parser is unnecessary for now, but provided for easier algorithm implementation
     sp_parser = subparsers.add_parser("SigProfiler")
@@ -54,253 +115,381 @@ def parse_args():
     args = parser.parse_args()
     return args


-def load_yaml(file):
+def existing_file(path, create_dir=False, shouldnt_exist=False):
+    """Check if file or directory exists
+
+    :param path: file or directory path
+    :type path: str
+    :param create_dir: create directory if non-existent
+    :param shouldnt_exist: raise error if exists
+    :return: file or directory path
+    :rtype: str
+    """
+    if not os.path.exists(path):
+        if not create_dir:
+            raise argparse.ArgumentTypeError(f"{path} does not exist.")
+        os.mkdir(path)
+        return path
+    if shouldnt_exist:
+        raise argparse.ArgumentTypeError(f"{path} should not exist before running the script.")
+    return path
+
+
+def implemented_genome(genome: str) -> str:
+    """Check if genome has been implemented. Ambiguous options default to human genome.
+
+    :param genome: genome
+    :return: NCBI build
+    """
+    ncbi_build_options = {
+        "19": "GRCh37", "hg19": "GRCh37", "grch37": "GRCh37", "37": "GRCh37",
+        "grch38": "GRCh38", "38": "GRCh38",
+        "mm9": "GRCm37", "9": "GRCm37", "grcm37": "GRCm37",
+        "mm10": "GRCm38", "10": "GRCm38", "grcm38": "GRCm38"
+    }
+    ncbi_build = ncbi_build_options.get(genome)
+    if ncbi_build is None:
+        raise argparse.ArgumentTypeError(f"Genome: `{genome}` has not been implemented. "
+                                         f"The following genomes are implemented: {set(ncbi_build_options.values())}")
+    return ncbi_build
+
+
+def load_yaml(file: iter) -> dict:
     """Simple load a yaml file. Only accepts key: value pairs, no multi-line statements

-    :param file: object, open yaml file
-    :return: dict, yaml key-value pairs
+    :param file: open yaml file
+    :return: yaml key-value pairs
     """
     yaml_cont = {}
     for line in file:
-        split_line = line.strip().split(":")
-        key = split_line[0].strip()
-        val = "".join(split_line[1:]).strip()
-        yaml_cont[key] = val
+        key, val = line.strip().split(":", 1)
+        yaml_cont[key.strip()] = val.strip()
     return yaml_cont


-def find_mutations_data(study_path):
-    """Find mutation data (MAF) file in cBioPortal study folder
+def read_mutations_meta_file(study_path: str) -> dict:
+    """Find and read mutation data meta file in cBioPortal study folder

-    :param study_path: str, path to cBioPortal study folder
-    :return: str, name of mutations data (MAF) file
+    :param study_path: path to cBioPortal study folder
+    :return: contents of mutations meta file
     """
     meta_files = glob.glob(study_path + "/meta*")
-    if len(meta_files) == 0:
-        raise FileNotFoundError("No meta files present in study")
     for file in meta_files:
         with open(file) as f:
             yaml_cont = load_yaml(f)
         if yaml_cont.get("stable_id") == "mutations" and yaml_cont.get("datatype") == "MAF":
             # Correct file, retrieve associated datafile
             print(f"Mutation meta file found in study: \'{file}\'")
-            return yaml_cont.get("data_filename"), yaml_cont.get("cancer_study_identifier")
+            return yaml_cont
     raise FileNotFoundError(f"Mutation meta file not found in study \'{study_path}\'.")


-def acquire_maf_study_id(study_path):
+def acquire_maf_study_id(study_path: str) -> tuple:
     """Find meta file in study and retrieve mutations file and study ID

-    :param study_path: str, path to cBioPortal study
-    :return: str, mutation file path; str, study ID
+    :param study_path: path to cBioPortal study
+    :returns: mutation file path, study ID
     """
-    # Check if study exist
-    if not os.path.exists(study_path):
-        raise FileNotFoundError(f"{study_path} does not exist")
     # Find meta files in study
-    mut_file, study_id = find_mutations_data(study_path)
-    mut_file = f"{study_path}/{mut_file}"
+    yaml_cont = read_mutations_meta_file(study_path)
+    mut_file = yaml_cont.get("data_filename")
+    study_id = yaml_cont.get("cancer_study_identifier")
+    mut_file = osp.join(study_path, mut_file)
     return mut_file, study_id


-def decipher_genome(maf_df):
-    """Try to get the genome build from a MAF file
+def get_ncbi_build(maf_df: pd.DataFrame) -> str:
+    """Try to get the ncbi build from a MAF file

-    :param maf_df: pandas object, pandas dataframe containing 'NCBI_Build' column
-    :return: str, genome build (GRCh**)
+    :param maf_df: dataframe containing 'NCBI_Build' column
+    :return: ncbi build
     """
+
     genome = list(maf_df["NCBI_Build"].unique())
     if len(genome) != 1:
-        raise ValueError(f"There are either no or multiple genome builds present in your MAF file: \'{genome}\'")
-    try:
-        genome = int(genome[0])
-    except ValueError:
-        try:
-            genome = int(genome[0][-2:])
-        except ValueError:
-            genome = input("Can't decipher genome build. Please enter manually (37/38): ")
-            genome = int(genome)
-    genome = "GRCh" + str(genome)
-    return genome
-
-
-def pre_process_maf(mut_file, tmp_dir):
-    """Pre-process the cBioPortal mutations data file for matrix generation
-
-    :param mut_file: str, path to mutations file
-    :param tmp_dir: str, path to temp folder
-    :return: path to pre-processed MAF, genome build
-    """
-
-    # Create temporary directory and copy file(s)
-    try:
-        os.mkdir(tmp_dir)
-    except FileExistsError:
-        answer = input(f"\nDirectory \'{tmp_dir}\' already exists. Remove directory (including contents) "
-                       f"and continue script? (y/n): ")
-        if answer.lower() in ["y", "yes"]:
-            print(f"Removing {tmp_dir}...")
-            try:
-                shutil.rmtree(tmp_dir)
-            except NotADirectoryError:
-                answer = input(f"{tmp_dir} is not a directory. Still remove? (y/n): ")
-                if answer.lower() in ["y", "yes"]:
-                    os.remove(tmp_dir)
-                else:
-                    raise NotADirectoryError(f"Not a directory: '{tmp_dir}'")
-            os.mkdir(tmp_dir)
-        else:
-            raise FileExistsError(f"{tmp_dir} directory exists. Please give another temporary directory using the "
-                                  f"--temp-dir flag.")
+        raise ValueError(f"There are either no or multiple NCBI builds present in your MAF file: \'{genome}\'")
+    return implemented_genome(genome[0].lower())

-    # TODO block input this to avoid memory overflow?
-    # Read in maf and take relevant columns
-    maf = pd.read_csv(mut_file, sep="\t", comment="#")
-    # Get genome from MAF file
-    genome = decipher_genome(maf)
-    # TODO select which alternative allele?
-    alt_al = "Tumor_Seq_Allele2"
-    headers = ["Chromosome", "Start_Position", "End_Position", "Reference_Allele", alt_al, "Tumor_Sample_Barcode"]
-    maf = maf[headers]
+
+def determine_alternative_allele(maf: pd.DataFrame) -> str:
+    """Determine which alternative allele to use from MAF file
+
+    :param maf: maf dataframe
+    :return: alternative allele to use
+    """
+    regexp = re.compile(r'^[ATGC]*$')
+    # This is normally advised against due to speed, but we need only one good row.
+    for idx, row in maf.iterrows():
+        if row["Tumor_Seq_Allele1"] != row["Reference_Allele"] \
+                and regexp.fullmatch(row["Tumor_Seq_Allele1"]):
+            return "Tumor_Seq_Allele1"
+        elif row["Tumor_Seq_Allele2"] != row["Reference_Allele"] \
+                and regexp.fullmatch(row["Tumor_Seq_Allele2"]):
+            return "Tumor_Seq_Allele2"
+    raise ValueError("Alternative Allele in maf file could not be determined. "
+                     "Please use the --alt-allele parameter.")
+
+
+def preprocess_maf(mut_file: str, tmp_dir: str, args: argparse.Namespace) -> tuple:
+    """Preprocess the cBioPortal mutations data file for matrix generation
+
+    :param mut_file: path to mutations file
+    :param tmp_dir: path to temp folder
+    :param args: parsed argparse.ArgumentParser
+    :return: path to preprocessed MAF, NCBI build
+    """
+    maf = pd.read_csv(mut_file, sep="\t", comment="#", dtype=str)
+    # Get NCBI build from MAF file
+    if not args.genome:
+        ncbi_build = get_ncbi_build(maf)
+    else:
+        ncbi_build = args.genome
+    print(f"Using ncbi build: {ncbi_build}.")
+    # Determine alt allele
+    alt_al = args.alt_allele
+    if not alt_al:
+        alt_al = determine_alternative_allele(maf)
+    print(f"Using '{alt_al}' as alternative allele.")
+    # Remap maf file using important parts, as cbioportal MAF format does not conform to the
+    # GDC MAF file format (column "Consequence" is not placed in the correct location).
+    headers = ["Chromosome", "Start_Position", "End_Position", "Reference_Allele", alt_al, "Tumor_Sample_Barcode"]
+    maf = maf[headers]
     new_headers = ["NA" for i in range(16)]
-    for i, j in enumerate([4, 5, 6, 10, 12, 15]):
+    # SigProfilerMatrixGenerator selects alternative allele by location, so the header doesn't matter
+    for i, j in enumerate([4, 5, 6, 10, 12, 15]):
         new_headers[j] = headers[i]
     maf = maf.reindex(new_headers, axis=1)
-    # Fill empty cells with NA string, otherwise they get parsed out by SigProfilerMatrixGenerator
-    maf = maf.fillna("NA")
     # File needs the MAF file extension, otherwise it is not recognized by SigProfilerMatrixGenerator
-    outfile = tmp_dir + "/mutations.maf"
-    maf.to_csv(outfile, sep="\t", index=False)
-    return outfile, genome
+    outfile = osp.join(tmp_dir, "mutations.maf")
+    # Fill empty cells with NA string, otherwise they get parsed incorrectly by SigProfilerMatrixGenerator
+    maf.to_csv(outfile, sep="\t", index=False, na_rep="NA")
+    return outfile, ncbi_build


-def genome_install(genome):
+def genome_install(ncbi_build: str):
     """Install a genome for SigProfilerMatrixGenerator

-    :param genome: genome build (GRCh37 or GRCh38)
-    :return: None
+    :param ncbi_build: ncbi build
     """
-    genInstall.install(genome)
-    return None
+    print(f"Installing genome: {ncbi_build} for SigProfilerMatrixGenerator.")
+    genInstall.install(ncbi_build)


-def run_matrix_generator(genome_build, maf_folder):
+def run_matrix_generator(ncbi_build: str, maf_folder: str):
     """Run SigProfilerMatrixGenerator to generate substitution matrices in input folder

-    :param genome_build: str, genome build (GRCh37 or GRCh38)
-    :param maf_folder: str, path to folder containing maf input files
-    :return: None
+    :param ncbi_build: ncbi build
+    :param maf_folder: path to folder containing maf input files
     """
-    print("-" * 60 + "\nGenerating nucleotide matrices: ")
+    print("-" * 120 + "\nGenerating nucleotide matrices: ")
     # Name of the project is irrelevant, these files will be removed after MS algorithm has been run
     project = "tmp"
-    matGen.SigProfilerMatrixGeneratorFunc(project, genome_build, maf_folder)
-    return None
+    matGen.SigProfilerMatrixGeneratorFunc(project, ncbi_build, maf_folder)


-def install_temposig():
-    """Call R to install tempoSig
+def matrix_stable_id(string: str) -> str:
+    """Replace non-stableID characters by - and _ in matrix names

-    :return: None
+    :param string: matrix name
+    :return: stable ID with only allowed characters
     """
-    robjects.r('devtools::install_github("mskcc/tempoSig")')
-    return None
+    replace = {
+        ">": "-",
+        ":": "_",
+        "[": "_",
+        "]": "_"
+    }
+    new = ["mutational_signatures_matrix_"]
+    for char in string:
+        if replace.get(char):
+            new.append(replace.get(char))
+        else:
+            new.append(char)
+    return "".join(new)


-def check_install_temposig():
-    """Call R to check if tempoSig is installed and where, otherwise install using R
+def matrix_to_cbioportal(matrix: str, outfile: str):
+    """Convert a matrix to cBioPortal format

-    :return: str, location of tempoSig.R script
+    :param matrix: path to input matrix
+    :param outfile: path to output file
     """
-    temposig_location = robjects.r("system.file('exec', 'tempoSig.R', package = 'tempoSig')")[0]
-    if not temposig_location:
-        install_temposig()
-        temposig_location = robjects.r("system.file('exec', 'tempoSig.R', package = 'tempoSig')")[0]
-    if not temposig_location:
-        raise ImportError("Unable to install or locate the tempoSig R-package. Please install manually. "
" - "If this error persists, either use the --temposig parameter or use another algorithm.") - return temposig_location + data = pd.read_csv(matrix, sep="\t", dtype=str) + data = data.rename({"MutationType": "NAME"}, axis=1) + data = data.set_index("NAME") + data = data.reset_index() + data["ENTITY_STABLE_ID"] = data["NAME"].apply(matrix_stable_id) + column_order = ["ENTITY_STABLE_ID", "NAME"] + [x for x in data.columns if x not in ["ENTITY_STABLE_ID", "NAME"]] + data = data[column_order] + data.to_csv(outfile, sep="\t", index=False, na_rep="NA") -def matrix_temposig(file_name): - """Remove the first column on the first row, tempoSig requires this +def preprocess_temposig(file: str): + """Remove the first value on the first row of a matrix, tempoSig requires this - :param file_name: str, name of matrix file to alter - :return: None, file is altered on host machine + :param file: path to matrix file """ - with open(file_name, "r") as infile: + with open(file, "r") as infile: data = infile.readlines() data[0] = "\t".join(data[0].split("\t")[1:]) - with open(file_name, "w") as outfile: + with open(file, "w") as outfile: outfile.writelines(data) - return None -def run_temposig(temposig_loc, inf_matrix, outf, cosmic=2, nperm=1000): +def run_temposig(temposig_loc: str, inf_matrix: str, out_cont: str, out_pval: str, signature_file: str, nperm=1000, + seed: int=None): """Generate mutational signatures using tempoSig - :param temposig_loc: str, path to tempoSig.R script - :param inf_matrix: str, path to input mutational matrix - :param outf: str, path to output file - :param cosmic: int, cosmic version to use (2, 3) - :param nperm: int, number of permutations for p-value estimation (default: 1000) - :return: None + :param temposig_loc: path to tempoSig.R script + :param inf_matrix: path to input mutational matrix + :param out_cont: path to output file for contributions + :param out_pval: path to output file for p-values + :param signature_file: location of signature file + :param nperm: number of permutations for p-value estimation (default: 1000) + :param seed: random number seed for algorithm (default: None) """ - command = f"{temposig_loc} --cosmic_v{cosmic} --pvalue --nperm {nperm} --cbio {inf_matrix} " \ - f"{outf}_contribution_v{cosmic}.txt --pv.out {outf}_pvalue_v{cosmic}.txt" + command = f"{temposig_loc} {inf_matrix} {out_cont} " \ + f"--pvalue --pv.out {out_pval} --nperm {nperm}" + if seed: + command += f" --seed {seed}" + if signature_file: + command += f" --sigfile {signature_file}" + else: + command += " --cosmic_v2" + subprocess.run(command, shell=True) - return None -def wrapper_mut_sig(algorithm, tmp_dir, out_dir, args): +def run_sigprofilerextractor(input_file: str, output_dir: str, ncbi_build: str, seed: int=None): + """""" # TODO docstring + # TODO have a look at other functions from sigprofiler + input_type = "matrix" + if not seed: + seed = "random" + else: + seed = str(seed) # TODO THIS IS INCORRECT make a file with all the same seeds, check file structure + sigProfilerExtractor(input_type, output_dir, input_file, opportunity_genome=ncbi_build, + context_type="96", nmf_replicates=1) + + +def temposig_to_generic_assay(infile: str, outfile:str, transform_file: str, data_type: str) -> str: + """Transform tempoSig output to the cBioPortal generic assay format + + :param infile: path to input file + :param outfile: path to output file + :param transform_file: path to transformation file + :param data_type: data type used to generate entity stable IDs + :return: path to output file + 
""" + # read in TS output + data = pd.read_csv(infile, sep="\t", dtype=str, comment="#") + # Number of mutations column in unnecessary + data = data.drop("Number of Mutations", axis=1) + # Replace sample ID . by -, this is introduced by tempoSig + data["Sample Name"] = data["Sample Name"].str.replace(".", "-") + data = data.fillna("NA") + data.loc[:, data.columns != "Sample Name"] = data.loc[:, data.columns != "Sample Name"].applymap(lambda x: x.upper()) + data = data.set_index("Sample Name") + # transpose + data = data.transpose() + # add extra data + extra = pd.read_csv(transform_file, dtype=str) + extra = extra.set_index("SIGNATURE") + data = extra.join(data, how='right') + data = data.reset_index() + # Generate stable IDs + data["ENTITY_STABLE_ID"] = data["index"].apply(lambda x: f"mutational_signature_{data_type}_{x}") + # Output signatures that were not in the transformation file + sigs = list(data.loc[data['NAME'].isnull(), 'index']) + if sigs: + print(f"The following signatures do not have additional annotation: {sigs}") + data = data.drop("index", axis=1) + data = data.set_index("ENTITY_STABLE_ID") + data.to_csv(outfile, sep="\t", na_rep="NA") + return outfile + + +def calculate_mutational_signatures(algorithm: str, tmp_dir: str, out_dir: str, args: argparse.Namespace) -> list: """Wrapper function for mutational signatures algorithms and generating meta file - :param algorithm: str, algorithm to use - :param tmp_dir: str, path to tmp directory - :param out_dir: str, path to output directory - :param args: object, argparse parsed arguments - :return: list of str, types of matrices for which mutational signatures were created + :param algorithm: algorithm to use + :param tmp_dir: path to tmp directory + :param out_dir: path to output directory + :param args: argparse parsed arguments + :return: types of matrices for which mutational signatures were created """ matrices_ran = [] - print("-" * 60) + print("-" * 120) if algorithm == "tempoSig": - algorithm_string = f"{algorithm} and COSMIC version {args.cosmic}" - if not args.temposig_loc: - temposig_loc = check_install_temposig() - else: - temposig_loc = args.temposig_loc - # TODO check if TempoSig can handle DNS and INDEL # These substitution matrix filenames are consistent - for substitution_matrix, matrix_type in [("output/SBS/tmp.SBS96.all", "SNS")]: + for substitution_matrix, matrix_type, cosmic_file in [("output/SBS/tmp.SBS96.all", "SBS", args.SBS_file), + ("output/DBS/tmp.DBS78.all", "DBS", args.DBS_file), + ("output/ID/tmp.ID83.all", "ID", args.ID_file)]: inf_matrix = f"{tmp_dir}/{substitution_matrix}" + + # Cannot run without signature file + if cosmic_file is None and not args.cosmic_v2: + print(f"No cosmic signatures file given for matrix type '{matrix_type}'. Skipping...\n") # Make sure the substitution matrix was created - if not os.path.exists(inf_matrix): + elif not os.path.exists(inf_matrix): print(f"Matrix type \'{matrix_type}\' was not created. 
+            elif matrix_type != "SBS" and args.cosmic_v2:
+                # Cosmic v2 only works for SBS signatures
+                continue
             else:
-                matrices_ran.append(matrix_type)
-                matrix_temposig(inf_matrix)
-                outf = f"{out_dir}/data_mutational_signature_{matrix_type}"
-                print(f"Running tempoSig for {matrix_type}...")
-                run_temposig(temposig_loc, inf_matrix, outf, cosmic=args.cosmic, nperm=args.nperm)
+                base_cont, base_pval, base_mat = [f"data_mutational_signature_{value}_{matrix_type}.txt"
+                                                  for value in ["contribution", "pvalue", "matrix"]]
+
+                # Preprocess mutation matrix
+                matrix_to_cbioportal(inf_matrix, osp.join(out_dir, base_mat))
+                preprocess_temposig(inf_matrix)
+                # Cosmic v2 signature and transformation file is included in tempoSig
+                tmp_cosmic = None
+                if not args.cosmic_v2:
+                    # Preprocess signature file
+                    tmp_cosmic = osp.join(tmp_dir, "signature_files", osp.basename(cosmic_file))
+                    if not osp.exists(osp.join(tmp_dir, "signature_files")):
+                        os.mkdir(osp.join(tmp_dir, "signature_files"))
+                    shutil.copy(cosmic_file, tmp_cosmic)
+                    preprocess_temposig(tmp_cosmic)
+                print(f"Running tempoSig for {matrix_type}...")
+                run_temposig(args.temposig_loc, inf_matrix, osp.join(tmp_dir, base_cont), osp.join(tmp_dir, base_pval),
+                             nperm=args.nperm, seed=args.seed, signature_file=tmp_cosmic)
+                if not osp.exists(osp.join(tmp_dir, base_cont)) or not osp.exists(osp.join(tmp_dir, base_pval)):
+                    print(f"!! Something went wrong with extracting {matrix_type} signatures. File not created.")
+                else:
+                    # Rewrite tempoSig output to cBioPortal format
+                    temposig_to_generic_assay(osp.join(tmp_dir, base_cont), osp.join(out_dir, base_cont), args.transform,
+                                              "contribution")
+                    temposig_to_generic_assay(osp.join(tmp_dir, base_pval), osp.join(out_dir, base_pval), args.transform,
+                                              "pvalue")
+                    matrices_ran.append(matrix_type)
+
     elif algorithm == "SigProfiler":
         raise NotImplementedError("The SigProfiler algorithm has not been implemented yet.")
+        # run_sigprofilerextractor(seed=args.seed)
     else:
         raise NotImplementedError(f"{algorithm} is not implemented.")
     print(f"Finished running {algorithm}")
-    return matrices_ran, algorithm_string
+    return matrices_ran


-def generate_meta_file(study_id, meta_file, file_type, algorithm_string, out_dir):
-    """"""
+def generate_single_meta_file(study_id: str, meta_file: str, file_type: str, algorithm_string: str, out_dir: str,
+                              meta_properties: str = "NAME,DESCRIPTION,URL"):
+    """Generate a single cBioPortal generic assay meta file
+
+    :param study_id: study ID
+    :param meta_file: output filename excl. 'meta_' and filetype extension
+    :param file_type: `contribution`, `pvalue` or other value
+    :param algorithm_string: string describing the algorithm used
+    :param out_dir: path to output directory
+    :param meta_properties: generic_entity_meta_properties string (default: 'NAME,DESCRIPTION,URL')
+    """
     profile_name = " ".join(meta_file.split("_"))
-    # Meta files follow structure of Gaofei Zhao mutational signatures data format example:
-    # https://github.com/cBioPortal/cbioportal-frontend/tree/master/end-to-end-test/local/studies/lgg_ucsf_2014_test_generic_assay
     out_lines = f"cancer_study_identifier: {study_id}\n" \
                 f"genetic_alteration_type: GENERIC_ASSAY\n" \
                 f"generic_assay_type: MUTATIONAL_SIGNATURE\n" \
@@ -310,59 +499,49 @@ def generate_meta_file(study_id, meta_file, file_type, algorithm_string, out_dir
                 f"profile_description: profile for {file_type} value of mutational signatures, generated using " \
                 f"{algorithm_string}\n" \
                 f"data_filename: data_{meta_file}.txt\n" \
-                f"show_profile_in_analysis_tab: {'false' if file_type != 'contribution' else 'true'}\n" \
-                f"generic_entity_meta_properties: NAME,DESCRIPTION,URL"
+                f"show_profile_in_analysis_tab: {str(file_type == 'contribution').lower()}\n" \
+                f"generic_entity_meta_properties: {meta_properties}"
     with open(f"{out_dir}/meta_{meta_file}.txt", "w+") as f:
         f.write(out_lines)
-    return None
-
-
-def wrapper_meta_files(args, matrices_ran, study_id, algorithm_string):
-    """"""
-    # TODO meta file generation for matrices can be put here as well
-    for file_type in ["contribution", "pvalue"]:  # "matrix"]:
-        for matrix in matrices_ran:
-            meta_file = f"mutational_signature_{matrix}_{file_type}_v{args.cosmic}"
-            print(f"Generating Meta File {args.out_dir}/meta_{meta_file}.txt...")
-            generate_meta_file(study_id, meta_file, file_type, algorithm_string, args.out_dir)
-    return None
-
-def remove_tmp_files(tmp_dir):
-    """Remove directory tree
-    :param tmp_dir: str, path to directory
-    :return: None
+def generate_all_meta_files(args: argparse.Namespace, matrices_ran: list, study_id: str, algorithm_string: str):
+    """Generate meta files for all files in list
+
+    :param args: argparse arguments
+    :param matrices_ran: files to generate meta files for
+    :param study_id: study ID
+    :param algorithm_string: string describing the algorithm used
     """
-    print(f"\nRemoving tmp directory: {tmp_dir} ...")
-    shutil.rmtree(tmp_dir)
-    return None
+    for file_type in ["contribution", "pvalue", "matrix"]:
+        for matrix in matrices_ran:
+            meta_filename = f"mutational_signature_{file_type}_{matrix}"
+            print(f"Generating meta file: {args.out_dir}/meta_{meta_filename}.txt...")
+            if file_type == "matrix":
+                generate_single_meta_file(study_id, meta_filename, file_type, algorithm_string, args.out_dir,
                                           meta_properties="NAME")
+            else:
+                generate_single_meta_file(study_id, meta_filename, file_type, algorithm_string, args.out_dir)


 def main():
     """Main function
-
-    :Return: None
     """
     # Parse arguments
     args = parse_args()
     # Get correct meta file and study ID
     maf_file, study_id = acquire_maf_study_id(args.study_path)
     # Check if given maf file exists in study path, fix order of columns and copy to temporary directory
-    maf_file, genome_build = pre_process_maf(maf_file, args.tmp_dir)
+    maf_file, ncbi_build = preprocess_maf(maf_file, args.tmp_dir, args)
     # Install genome if asked
     if args.install_genome:
-        genome_install(genome_build)
+        genome_install(ncbi_build)
     # Create matrix
-    run_matrix_generator(genome_build, args.tmp_dir)
-    # TODO keep the relevant matrices and port them into a cBioPortal format for visualisation
-    # Run desired algorithm (and possible pre-processing) for mutational signatures
-    matrices_ran, algorithm_string = wrapper_mut_sig(args.algorithm, args.tmp_dir, args.out_dir, args)
+    run_matrix_generator(ncbi_build, args.tmp_dir)
+    # Run desired algorithm (and possible preprocessing) for mutational signatures
+    matrices_ran = calculate_mutational_signatures(args.algorithm, args.tmp_dir, args.out_dir, args)
     # Create associated meta files
-    wrapper_meta_files(args, matrices_ran, study_id, algorithm_string)
-    # cleanup temporary files
-    remove_tmp_files(args.tmp_dir)
-    return None
+    generate_all_meta_files(args, matrices_ran, study_id, args.algorithm)


 if __name__ == "__main__":
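
Example invocations of the updated CLI, for reference only: these commands are not part of the patch, all paths are placeholders, and the script is assumed to be run from the repository root with the new tempoSig subcommand.

    # COSMIC v2 reference signatures bundled with tempoSig (SBS extraction only);
    # --install-genome is needed once per reference genome
    python core/src/main/scripts/mutationalSignatures.py --study_path path/to/study -o path/to/output \
        --install-genome tempoSig -t path/to/tempoSig.R --cosmic-v2

    # User-supplied COSMIC v3 signature files; the DBS and ID files are optional,
    # matrix types without a signature file are skipped
    python core/src/main/scripts/mutationalSignatures.py --study_path path/to/study -o path/to/output \
        tempoSig -t path/to/tempoSig.R -s path/to/cosmic_SBS.txt -d path/to/cosmic_DBS.txt -i path/to/cosmic_ID.txt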