diff --git a/.version-level b/.version-level index 9eb7b90..acb503f 100644 --- a/.version-level +++ b/.version-level @@ -1 +1 @@ -patch +minor diff --git a/AnnotatorCore.py b/AnnotatorCore.py index 89d9bd0..f0b4fca 100644 --- a/AnnotatorCore.py +++ b/AnnotatorCore.py @@ -15,7 +15,6 @@ from urllib3 import Retry from datetime import date - logging.basicConfig(level=logging.INFO) logging.getLogger("requests").setLevel(logging.WARNING) logging.getLogger("urllib3").setLevel(logging.WARNING) @@ -27,7 +26,8 @@ API_REQUEST_RETRY_STATUS_FORCELIST = [429, 500, 502, 503, 504] -csv.field_size_limit(int(ct.c_ulong(-1).value // 2)) # Deal with overflow problem on Windows, https://stackoverflow.co/120m/questions/15063936/csv-error-field-larger-than-field-limit-131072 +csv.field_size_limit(int(ct.c_ulong( + -1).value // 2)) # Deal with overflow problem on Windows, https://stackoverflow.co/120m/questions/15063936/csv-error-field-larger-than-field-limit-131072 sizeLimit = csv.field_size_limit() csv.field_size_limit(sizeLimit) # for reading large files @@ -185,7 +185,8 @@ def setsampleidsfileterfile(f): HGVSG_HEADER = 'HGVSG' # columns for copy number alteration CNA_HEADERS = [ALTERATION_HEADER, 'COPY_NUMBER_ALTERATION', 'CNA', 'GISTIC'] -HGVS_HEADERS = [ALTERATION_HEADER, HGVSP_SHORT_HEADER, HGVSP_HEADER, HGVSG_HEADER, 'AMINO_ACID_CHANGE', 'FUSION'] + CNA_HEADERS +HGVS_HEADERS = [ALTERATION_HEADER, HGVSP_SHORT_HEADER, HGVSP_HEADER, HGVSG_HEADER, 'AMINO_ACID_CHANGE', + 'FUSION'] + CNA_HEADERS SAMPLE_HEADERS = ['SAMPLE_ID', 'TUMOR_SAMPLE_BARCODE'] PROTEIN_START_HEADERS = ['PROTEIN_START'] PROTEIN_END_HEADERS = ['PROTEIN_END'] @@ -210,6 +211,11 @@ def setsampleidsfileterfile(f): SV_TYPE_HEADER = ['SV_CLASS_NAME', 'SV_TYPE', 'CLASS'] SV_TYPES = ['DELETION', 'TRANSLOCATION', 'DUPLICATION', 'INSERTION', 'INVERSION', 'FUSION', 'UNKNOWN'] +DESCRIPTION_HEADERS = ['GENE_SUMMARY', 'VARIANT_SUMMARY', 'TUMOR_TYPE_SUMMARY', 'DIAGNOSTIC_SUMMARY', + 'PROGNOSTIC_SUMMARY', 
'MUTATION_EFFECT_DESCRIPTION'] + +ONCOKB_ANNOTATION_HEADERS_GC = ["ONCOKB_HUGO_SYMBOL", "ONCOKB_PROTEIN_CHANGE", "ONCOKB_CONSEQUENCE"] + UNKNOWN = 'UNKNOWN' @@ -249,7 +255,8 @@ def getOncokbInfo(): def validate_oncokb_token(): if not oncokb_annotation_api_url.startswith(DEFAULT_ONCOKB_URL): - log.warning("OncoKB base url has been specified by the user that is different from the default www.oncokb.org. The token validation is skipped.") + log.warning( + "OncoKB base url has been specified by the user that is different from the default www.oncokb.org. The token validation is skipped.") return None if oncokb_api_bearer_token is None or not oncokb_api_bearer_token: @@ -483,8 +490,17 @@ def get_reference_genome_from_row(row_reference_genome, default_reference_genome return reference_genome +def append_headers(outf, newncols, include_descriptions, genomic_change_annotation): + oncokb_annotation_headers = get_oncokb_annotation_column_headers(include_descriptions, genomic_change_annotation) + outf.write("\t".join(oncokb_annotation_headers)) + newncols += len(oncokb_annotation_headers) + + outf.write("\n") + return newncols + + def processalterationevents(eventfile, outfile, previousoutfile, defaultCancerType, cancerTypeMap, - annotatehotspots, user_input_query_type, default_reference_genome): + annotatehotspots, user_input_query_type, default_reference_genome, include_descriptions): if annotatehotspots: init_3d_hotspots() if os.path.isfile(previousoutfile): @@ -507,32 +523,30 @@ def processalterationevents(eventfile, outfile, previousoutfile, defaultCancerTy outf.write("\tIS-A-3D-HOTSPOT") newncols += 2 - oncokb_annotation_headers = get_oncokb_annotation_column_headers() - outf.write("\t") - outf.write("\t".join(oncokb_annotation_headers)) - newncols += len(oncokb_annotation_headers) - - outf.write("\n") query_type = resolve_query_type(user_input_query_type, headers) if (query_type == QueryType.HGVSP_SHORT): + newncols = append_headers(outf, newncols, include_descriptions, 
False) process_alteration(reader, outf, headers, [HGVSP_SHORT_HEADER, ALTERATION_HEADER], ncols, newncols, defaultCancerType, - cancerTypeMap, annotatehotspots, default_reference_genome) + cancerTypeMap, annotatehotspots, default_reference_genome, include_descriptions) if (query_type == QueryType.HGVSP): + newncols = append_headers(outf, newncols, include_descriptions, False) process_alteration(reader, outf, headers, [HGVSP_HEADER, ALTERATION_HEADER], ncols, newncols, defaultCancerType, - cancerTypeMap, annotatehotspots, default_reference_genome) + cancerTypeMap, annotatehotspots, default_reference_genome, include_descriptions) if (query_type == QueryType.HGVSG): + newncols = append_headers(outf, newncols, include_descriptions, True) process_hvsg(reader, outf, headers, [HGVSG_HEADER, ALTERATION_HEADER], ncols, newncols, defaultCancerType, - cancerTypeMap, annotatehotspots, default_reference_genome) + cancerTypeMap, annotatehotspots, default_reference_genome, include_descriptions) if (query_type == QueryType.GENOMIC_CHANGE): + newncols = append_headers(outf, newncols, include_descriptions, True) process_genomic_change(reader, outf, headers, ncols, newncols, defaultCancerType, cancerTypeMap, - annotatehotspots, default_reference_genome) + annotatehotspots, default_reference_genome, include_descriptions) outf.close() @@ -546,10 +560,17 @@ def get_cell_content(row, index, return_empty_string=False): return None -def get_oncokb_annotation_column_headers(): - headers = [ANNOTATED_HEADER, GENE_IN_ONCOKB_HEADER, VARIANT_IN_ONCOKB_HEADER, "MUTATION_EFFECT", - "MUTATION_EFFECT_CITATIONS", - "ONCOGENIC"] +def get_oncokb_annotation_column_headers(include_descriptions, genomic_change_annotation): + headers = [ANNOTATED_HEADER] + if genomic_change_annotation: + headers.extend(ONCOKB_ANNOTATION_HEADERS_GC) + + headers.extend([GENE_IN_ONCOKB_HEADER, + VARIANT_IN_ONCOKB_HEADER, + "MUTATION_EFFECT", + "MUTATION_EFFECT_CITATIONS", + "ONCOGENIC"]) + for level in sorted(levels): 
headers.append(level) headers.append("HIGHEST_LEVEL") @@ -566,12 +587,16 @@ def get_oncokb_annotation_column_headers(): headers.append(px_level) headers.append("HIGHEST_PX_LEVEL") headers.append("PX_CITATIONS") + + if include_descriptions: + headers.extend(DESCRIPTION_HEADERS) + return headers def process_alteration(maffilereader, outf, maf_headers, alteration_column_names, ncols, nannotationcols, defaultCancerType, cancerTypeMap, - annotatehotspots, default_reference_genome): + annotatehotspots, default_reference_genome, include_descriptions): ihugo = geIndexOfHeader(maf_headers, HUGO_HEADERS) iconsequence = geIndexOfHeader(maf_headers, CONSEQUENCE_HEADERS) ihgvs = geIndexOfHeader(maf_headers, alteration_column_names) @@ -619,7 +644,8 @@ def process_alteration(maffilereader, outf, maf_headers, alteration_column_names end = get_cell_content(row, iend) - if start is None and iproteinpos >= 0 and row[iproteinpos] != "" and row[iproteinpos] != "." and row[iproteinpos] != "-": + if start is None and iproteinpos >= 0 and row[iproteinpos] != "" and row[iproteinpos] != "." 
and \ + row[iproteinpos] != "-": poss = row[iproteinpos].split('/')[0].split('-') try: if len(poss) > 0: @@ -642,13 +668,13 @@ def process_alteration(maffilereader, outf, maf_headers, alteration_column_names rows.append(row) if len(queries) == POST_QUERIES_THRESHOLD: - annotations = pull_protein_change_info(queries, annotatehotspots) + annotations = pull_protein_change_info(queries, include_descriptions, annotatehotspots) append_annotation_to_file(outf, ncols + nannotationcols, rows, annotations) queries = [] rows = [] if len(queries) > 0: - annotations = pull_protein_change_info(queries, annotatehotspots) + annotations = pull_protein_change_info(queries, include_descriptions, annotatehotspots) append_annotation_to_file(outf, ncols + nannotationcols, rows, annotations) @@ -673,7 +699,7 @@ def get_var_allele(ref_allele, tumor_seq_allele1, tumor_seq_allele2): def process_genomic_change(maffilereader, outf, maf_headers, ncols, nannotationcols, defaultCancerType, cancerTypeMap, - annotatehotspots, default_reference_genome): + annotatehotspots, default_reference_genome, include_descriptions): ichromosome = geIndexOfHeader(maf_headers, [GC_CHROMOSOME_HEADER]) istart = geIndexOfHeader(maf_headers, [GC_START_POSITION_HEADER]) iend = geIndexOfHeader(maf_headers, [GC_END_POSITION_HEADER]) @@ -717,18 +743,18 @@ def process_genomic_change(maffilereader, outf, maf_headers, ncols, nannotationc rows.append(row) if len(queries) == POST_QUERIES_THRESHOLD_GC_HGVSG: - annotations = pull_genomic_change_info(queries, annotatehotspots) + annotations = pull_genomic_change_info(queries, include_descriptions, annotatehotspots) append_annotation_to_file(outf, ncols + nannotationcols, rows, annotations) queries = [] rows = [] if len(queries) > 0: - annotations = pull_genomic_change_info(queries, annotatehotspots) + annotations = pull_genomic_change_info(queries, include_descriptions, annotatehotspots) append_annotation_to_file(outf, ncols + nannotationcols, rows, annotations) def 
process_hvsg(maffilereader, outf, maf_headers, alteration_column_names, ncols, nannotationcols, defaultCancerType, - cancerTypeMap, annotatehotspots, default_reference_genome): + cancerTypeMap, annotatehotspots, default_reference_genome, include_descriptions): ihgvsg = geIndexOfHeader(maf_headers, alteration_column_names) isample = geIndexOfHeader(maf_headers, SAMPLE_HEADERS) icancertype = geIndexOfHeader(maf_headers, CANCER_TYPE_HEADERS) @@ -757,9 +783,9 @@ def process_hvsg(maffilereader, outf, maf_headers, alteration_column_names, ncol if hgvsg is None: if annotatehotspots: - default_cols = [['', '', GENE_IN_ONCOKB_DEFAULT, VARIANT_IN_ONCOKB_DEFAULT]] + default_cols = [['', '', 'False']] else: - default_cols = [[GENE_IN_ONCOKB_DEFAULT, VARIANT_IN_ONCOKB_DEFAULT]] + default_cols = [['False']] append_annotation_to_file(outf, ncols + nannotationcols, [row], default_cols) else: @@ -768,13 +794,13 @@ def process_hvsg(maffilereader, outf, maf_headers, alteration_column_names, ncol rows.append(row) if len(queries) == POST_QUERIES_THRESHOLD_GC_HGVSG: - annotations = pull_hgvsg_info(queries, annotatehotspots) + annotations = pull_hgvsg_info(queries, include_descriptions, annotatehotspots) append_annotation_to_file(outf, ncols + nannotationcols, rows, annotations) queries = [] rows = [] if len(queries) > 0: - annotations = pull_hgvsg_info(queries, annotatehotspots) + annotations = pull_hgvsg_info(queries, include_descriptions, annotatehotspots) append_annotation_to_file(outf, ncols + nannotationcols, rows, annotations) @@ -794,7 +820,7 @@ def getgenesfromfusion(fusion, nameregex=None): return geneA, geneB -def process_fusion(svdata, outfile, previousoutfile, defaultCancerType, cancerTypeMap, nameregex): +def process_fusion(svdata, outfile, previousoutfile, defaultCancerType, cancerTypeMap, nameregex, include_descriptions): if os.path.isfile(previousoutfile): cacheannotated(previousoutfile, defaultCancerType, cancerTypeMap) outf = open(outfile, 'w+') @@ -809,7 +835,7 @@ def 
process_fusion(svdata, outfile, previousoutfile, defaultCancerType, cancerTy return outf.write(headers['^-$']) - oncokb_annotation_headers = get_oncokb_annotation_column_headers() + oncokb_annotation_headers = get_oncokb_annotation_column_headers(include_descriptions, False) outf.write("\t") outf.write("\t".join(oncokb_annotation_headers)) outf.write("\n") @@ -853,18 +879,18 @@ def process_fusion(svdata, outfile, previousoutfile, defaultCancerType, cancerTy rows.append(row) if len(queries) == POST_QUERIES_THRESHOLD: - annotations = pull_structural_variant_info(queries) + annotations = pull_structural_variant_info(queries, include_descriptions) append_annotation_to_file(outf, newcols, rows, annotations) queries = [] rows = [] if len(queries) > 0: - annotations = pull_structural_variant_info(queries) + annotations = pull_structural_variant_info(queries, include_descriptions) append_annotation_to_file(outf, newcols, rows, annotations) outf.close() -def process_sv(svdata, outfile, previousoutfile, defaultCancerType, cancerTypeMap): +def process_sv(svdata, outfile, previousoutfile, defaultCancerType, cancerTypeMap, include_descriptions): if os.path.isfile(previousoutfile): cacheannotated(previousoutfile, defaultCancerType, cancerTypeMap) outf = open(outfile, 'w+') @@ -879,7 +905,7 @@ def process_sv(svdata, outfile, previousoutfile, defaultCancerType, cancerTypeMa return outf.write(headers['^-$']) - oncokb_annotation_headers = get_oncokb_annotation_column_headers() + oncokb_annotation_headers = get_oncokb_annotation_column_headers(include_descriptions, False) outf.write("\t") outf.write("\t".join(oncokb_annotation_headers)) outf.write("\n") @@ -926,13 +952,13 @@ def process_sv(svdata, outfile, previousoutfile, defaultCancerType, cancerTypeMa rows.append(row) if len(queries) == POST_QUERIES_THRESHOLD: - annotations = pull_structural_variant_info(queries) + annotations = pull_structural_variant_info(queries, include_descriptions) append_annotation_to_file(outf, newcols, 
rows, annotations) queries = [] rows = [] if len(queries) > 0: - annotations = pull_structural_variant_info(queries) + annotations = pull_structural_variant_info(queries, include_descriptions) append_annotation_to_file(outf, newcols, rows, annotations) outf.close() @@ -951,7 +977,8 @@ def get_cna(cell_value, annotate_gain_loss=False): return cna -def process_gistic_data(outf, gistic_data_file, defaultCancerType, cancerTypeMap, annotate_gain_loss): +def process_gistic_data(outf, gistic_data_file, defaultCancerType, cancerTypeMap, annotate_gain_loss, + include_descriptions): with open(gistic_data_file, DEFAULT_READ_FILE_MODE) as infile: reader = csv.reader(infile, delimiter='\t') headers = readheaders(reader) @@ -1000,17 +1027,19 @@ def process_gistic_data(outf, gistic_data_file, defaultCancerType, cancerTypeMap rows.append([sample, cancer_type, hugo, cna_type]) queries.append(CNAQuery(hugo, cna_type, cancer_type)) - headers = ['SAMPLE_ID', 'CANCER_TYPE', 'HUGO_SYMBOL', 'ALTERATION'] + get_oncokb_annotation_column_headers() + headers = ['SAMPLE_ID', 'CANCER_TYPE', 'HUGO_SYMBOL', 'ALTERATION'] + get_oncokb_annotation_column_headers( + include_descriptions, False) outf.write('\t'.join(headers)) outf.write('\n') return headers, rows, queries -def process_individual_cna_file(outf, cna_data_file, defaultCancerType, cancerTypeMap, annotate_gain_loss): +def process_individual_cna_file(outf, cna_data_file, defaultCancerType, cancerTypeMap, annotate_gain_loss, + include_descriptions): with open(cna_data_file, DEFAULT_READ_FILE_MODE) as infile: reader = csv.reader(infile, delimiter='\t') headers = readheaders(reader) - row_headers = headers['^-$'].split('\t') + get_oncokb_annotation_column_headers() + row_headers = headers['^-$'].split('\t') + get_oncokb_annotation_column_headers(include_descriptions, False) i = 0 rows = [] @@ -1047,7 +1076,8 @@ def process_individual_cna_file(outf, cna_data_file, defaultCancerType, cancerTy return row_headers, rows, queries -def 
process_cna_data(cnafile, outfile, previousoutfile, defaultCancerType, cancerTypeMap, annotate_gain_loss=False, +def process_cna_data(cnafile, outfile, previousoutfile, defaultCancerType, cancerTypeMap, include_descriptions, + annotate_gain_loss=False, cna_format=CNA_FILE_FORMAT_GISTIC): if os.path.isfile(previousoutfile): cacheannotated(previousoutfile, defaultCancerType, cancerTypeMap) @@ -1063,10 +1093,10 @@ def process_cna_data(cnafile, outfile, previousoutfile, defaultCancerType, cance queries = [] if cna_format == CNA_FILE_FORMAT_GISTIC: headers, rows, queries = process_gistic_data(outf, cnafile, defaultCancerType, cancerTypeMap, - annotate_gain_loss) + annotate_gain_loss, include_descriptions) else: headers, rows, queries = process_individual_cna_file(outf, cnafile, defaultCancerType, cancerTypeMap, - annotate_gain_loss) + annotate_gain_loss, include_descriptions) ncols = len(headers) @@ -1076,7 +1106,7 @@ def process_cna_data(cnafile, outfile, previousoutfile, defaultCancerType, cance log.info(i) rows_sec, rows = rows[:POST_QUERIES_THRESHOLD], rows[POST_QUERIES_THRESHOLD:] queries_sec, queries = queries[:POST_QUERIES_THRESHOLD], queries[POST_QUERIES_THRESHOLD:] - annotations = pull_cna_info(queries_sec) + annotations = pull_cna_info(queries_sec, include_descriptions) append_annotation_to_file(outf, ncols, rows_sec, annotations) outf.close() @@ -1137,8 +1167,8 @@ def process_clinical_data(annotatedmutfiles, clinicalfile, outfile): isample = geIndexOfHeader(headers, SAMPLE_HEADERS) ioncogenic = headers['ONCOGENIC'] - isfusion = (igeneA != -1 & igeneB != -1) or ifusion != -1 - ismutorcna = ihugo != -1 & ihgvs != -1 + isfusion = (igeneA != -1 and igeneB != -1) or ifusion != -1 + ismutorcna = ihugo != -1 and ihgvs != -1 if not isfusion and not ismutorcna: log.error("file " + annotatedmutfile + " missing proper header") @@ -1238,7 +1268,7 @@ def process_clinical_data(annotatedmutfiles, clinicalfile, outfile): outf = open(outfile, 'w+') - # export to anntoated 
file + # export to annotated file with open(clinicalfile, DEFAULT_READ_FILE_MODE) as clinfile: reader = csv.reader(clinfile, delimiter='\t') headers = readheaders(reader) @@ -1462,7 +1492,8 @@ def __init__(self, hugo, hgvs, cancertype, reference_genome=None, consequence=No self.referenceGenome = reference_genome.value def __repr__(self): - return ",".join([self.gene.hugoSymbol, self.alteration, self.tumorType, self.consequence, self.proteinStart, self.proteinEnd, self.referenceGenome]) + return ",".join([self.gene.hugoSymbol, self.alteration, self.tumorType, self.consequence, self.proteinStart, + self.proteinEnd, self.referenceGenome]) class HGVSgQuery: @@ -1547,7 +1578,7 @@ def __repr__(self): self.tumorType]) -def pull_protein_change_info(queries, annotate_hotspot): +def pull_protein_change_info(queries, include_descriptions, annotate_hotspot): url = oncokb_annotation_api_url + '/annotate/mutations/byProteinChange' response = makeoncokbpostrequest(url, queries) if response.status_code == 401: @@ -1579,11 +1610,12 @@ def pull_protein_change_info(queries, annotate_hotspot): processed_annotation = [] for query_annotation in annotation: - processed_annotation.append(process_oncokb_annotation(query_annotation, annotate_hotspot)) + processed_annotation.append( + process_oncokb_annotation(query_annotation, include_descriptions, False, annotate_hotspot)) return processed_annotation -def pull_hgvsg_info(queries, annotate_hotspot): +def pull_hgvsg_info(queries, include_descriptions, annotate_hotspot): url = oncokb_annotation_api_url + '/annotate/mutations/byHGVSg' response = makeoncokbpostrequest(url, queries) if response.status_code == 401: @@ -1607,11 +1639,12 @@ def pull_hgvsg_info(queries, annotate_hotspot): processed_annotation = [] for query_annotation in annotation: - processed_annotation.append(process_oncokb_annotation(query_annotation, annotate_hotspot)) + processed_annotation.append( + process_oncokb_annotation(query_annotation, include_descriptions, True, 
annotate_hotspot)) return processed_annotation -def pull_genomic_change_info(queries, annotate_hotspot): +def pull_genomic_change_info(queries, include_descriptions, annotate_hotspot): url = oncokb_annotation_api_url + '/annotate/mutations/byGenomicChange' response = makeoncokbpostrequest(url, queries) if response.status_code == 401: @@ -1635,11 +1668,12 @@ def pull_genomic_change_info(queries, annotate_hotspot): processed_annotation = [] for query_annotation in annotation: - processed_annotation.append(process_oncokb_annotation(query_annotation, annotate_hotspot)) + processed_annotation.append( + process_oncokb_annotation(query_annotation, include_descriptions, True, annotate_hotspot)) return processed_annotation -def pull_cna_info(queries): +def pull_cna_info(queries, include_descriptions): url = oncokb_annotation_api_url + '/annotate/copyNumberAlterations' response = makeoncokbpostrequest(url, queries) @@ -1665,11 +1699,12 @@ def pull_cna_info(queries): processed_annotation = [] for query_annotation in annotation: - processed_annotation.append(process_oncokb_annotation(query_annotation, annotate_hotspot=False)) + processed_annotation.append( + process_oncokb_annotation(query_annotation, include_descriptions, False, annotate_hotspot=False)) return processed_annotation -def pull_structural_variant_info(queries): +def pull_structural_variant_info(queries, include_descriptions): url = oncokb_annotation_api_url + '/annotate/structuralVariants' response = makeoncokbpostrequest(url, queries) @@ -1699,11 +1734,12 @@ def pull_structural_variant_info(queries): processed_annotation = [] for query_annotation in annotation: - processed_annotation.append(process_oncokb_annotation(query_annotation, annotate_hotspot=False)) + processed_annotation.append( + process_oncokb_annotation(query_annotation, include_descriptions, False, annotate_hotspot=False)) return processed_annotation -def process_oncokb_annotation(annotation, annotate_hotspot): +def 
process_oncokb_annotation(annotation, include_descriptions, genomic_change_annotation, annotate_hotspot): if annotation is None: return ['False'] @@ -1719,6 +1755,7 @@ def process_oncokb_annotation(annotation, annotate_hotspot): oncokbdata[VARIANT_IN_ONCOKB_HEADER] = VARIANT_IN_ONCOKB_DEFAULT oncokbdata['mutation_effect'] = "" oncokbdata['mutation_effect_citations'] = [] + oncokbdata['mutation_effect_description'] = "" oncokbdata['citations'] = [] oncokbdata['oncogenic'] = "" oncokbdata['tx_citations'] = [] @@ -1741,6 +1778,7 @@ def process_oncokb_annotation(annotation, annotate_hotspot): # mutation effect if (annotation['mutationEffect'] is not None): oncokbdata['mutation_effect'] = annotation['mutationEffect']['knownEffect'] + oncokbdata['mutation_effect_description'] = annotation['mutationEffect']['description'] oncokbdata['mutation_effect_citations'] = appendoncokbcitations(oncokbdata['mutation_effect_citations'], annotation['mutationEffect']['citations'][ 'pmids'], @@ -1794,6 +1832,17 @@ def process_oncokb_annotation(annotation, annotate_hotspot): ret.append(_3dhotspot) ret.append('True') + + if genomic_change_annotation: + query_hugo_symbol = annotation['query']['hugoSymbol'] + ret.append('' if query_hugo_symbol is None else query_hugo_symbol) + + query_alteration = annotation['query']['alteration'] + ret.append('' if query_alteration is None else query_alteration) + + query_consequence = annotation['query']['consequence'] + ret.append('' if query_consequence is None else query_consequence) + ret.append(oncokbdata[GENE_IN_ONCOKB_HEADER]) ret.append(oncokbdata[VARIANT_IN_ONCOKB_HEADER]) ret.append(oncokbdata['mutation_effect']) @@ -1816,6 +1865,14 @@ def process_oncokb_annotation(annotation, annotate_hotspot): ret.append(get_highest_dxpx_level(pxLevels, [oncokbdata['highestPrognosticImplicationLevel']])) ret.append(';'.join(oncokbdata['px_citations'])) + if include_descriptions: + ret.append(annotation['geneSummary']) + ret.append(annotation['variantSummary']) 
+ ret.append(annotation['tumorTypeSummary']) + ret.append(annotation['diagnosticSummary']) + ret.append(annotation['prognosticSummary']) + ret.append(oncokbdata['mutation_effect_description']) + return ret diff --git a/CnaAnnotator.py b/CnaAnnotator.py index e1b4bae..34d3e8f 100644 --- a/CnaAnnotator.py +++ b/CnaAnnotator.py @@ -20,7 +20,9 @@ def main(argv): if argv.help: log.info( '\n' - 'CnaAnnotator.py -i -o [-p previous results] [-c ] [-s sample list filter] [-t ] [-u oncokb-base-url] [-b oncokb_api_bear_token] [-z annotate_gain_loss] [-f CNA file formt, gistic or individual]\n' + 'CnaAnnotator.py -i -o [-p previous results] [-c ] ' + '[-s sample list filter] [-t ] [-u oncokb-base-url] [-b oncokb_api_bear_token] ' + '[-z annotate_gain_loss] [-f CNA file formt, gistic or individual] [-d include descriptions]\n' ' Input CNA file uses GISTIC output by default (https://docs.cbioportal.org/5.1-data-loading/data-loading/file-formats#data-file-1). You can also list copy number alteration individually by specifying -f=individual\n' ' Essential clinical columns:\n' ' SAMPLE_ID: sample ID\n' @@ -57,7 +59,7 @@ def main(argv): validate_oncokb_token() log.info('annotating %s ...' 
% argv.input_file) - process_cna_data(argv.input_file, argv.output_file, argv.previous_result_file, argv.default_cancer_type, cancertypemap, argv.annotate_gain_loss, argv.cna_file_format.lower()) + process_cna_data(argv.input_file, argv.output_file, argv.previous_result_file, argv.default_cancer_type, cancertypemap, argv.include_descriptions, argv.annotate_gain_loss, argv.cna_file_format.lower()) log.info('done!') @@ -75,6 +77,7 @@ def main(argv): parser.add_argument('-b', dest='oncokb_api_bearer_token', default='', type=str) parser.add_argument('-z', dest='annotate_gain_loss', action="store_true", default=False) parser.add_argument('-f', dest='cna_file_format', default=CNA_FILE_FORMAT_GISTIC) + parser.add_argument('-d', dest='include_descriptions', action="store_true", default=False) parser.set_defaults(func=main) args = parser.parse_args() diff --git a/FusionAnnotator.py b/FusionAnnotator.py index b1ef773..1cc957a 100644 --- a/FusionAnnotator.py +++ b/FusionAnnotator.py @@ -20,7 +20,10 @@ def main(argv): if argv.help: log.info( '\n' - "FusionAnnotator.py -i -o [-p previous results] [-c ] [-s sample list filter] [-t ] [-u ] [-b ] [-r ]\n" + "FusionAnnotator.py -i -o [-p previous results] " + "[-c ] [-s sample list filter] [-t ] [-u ] " + "[-b ] [-r ] " + "[-d include descriptions]\n" ' Essential Fusion columns (case insensitive):\n' ' HUGO_SYMBOL: Hugo gene symbol\n' ' VARIANT_CLASSIFICATION: Translational effect of variant allele\n' @@ -63,7 +66,7 @@ def main(argv): validate_oncokb_token() log.info('annotating %s ...' 
% argv.input_file) - process_fusion(argv.input_file, argv.output_file, argv.previous_result_file, argv.default_cancer_type, cancertypemap, argv.structural_variant_name_format) + process_fusion(argv.input_file, argv.output_file, argv.previous_result_file, argv.default_cancer_type, cancertypemap, argv.structural_variant_name_format, argv.include_descriptions) log.info('done!') @@ -82,6 +85,7 @@ def main(argv): parser.add_argument('-v', dest='cancer_hotspots_base_url', default='', type=str) parser.add_argument('-b', dest='oncokb_api_bearer_token', default='', type=str) parser.add_argument('-r', dest='structural_variant_name_format', default=None, type=str) + parser.add_argument('-d', dest='include_descriptions', action="store_true", default=False) parser.set_defaults(func=main) args = parser.parse_args() diff --git a/MafAnnotator.py b/MafAnnotator.py index 97c451e..037e311 100644 --- a/MafAnnotator.py +++ b/MafAnnotator.py @@ -23,7 +23,8 @@ def main(argv): log.info( '\n' 'MafAnnotator.py -i -o [-p previous results] [-c ] ' - '[-s sample list filter] [-t ] [-u oncokb-base-url] [-b oncokb api bear token] [-a] [-q query type] [-r default reference genome]\n' + '[-s sample list filter] [-t ] [-u oncokb-base-url] [-b oncokb api bear token] [-a] ' + '[-q query type] [-r default reference genome] [-d include descriptions]\n' 'For definitions of the MAF format, please see https://docs.gdc.cancer.gov/Data/File_Formats/MAF_Format/\n\n' 'Essential MAF columns for querying HGVSp_Short and HGVSp(case insensitive):\n' ' Hugo_Symbol: Hugo gene symbol\n' @@ -111,7 +112,8 @@ def main(argv): validate_oncokb_token() processalterationevents(argv.input_file, argv.output_file, argv.previous_result_file, argv.default_cancer_type, - cancertypemap, argv.annotate_hotspots, user_input_query_type, default_reference_genome) + cancertypemap, argv.annotate_hotspots, user_input_query_type, default_reference_genome, + argv.include_descriptions) log.info('done!') @@ -131,6 +133,7 @@ def main(argv): 
parser.add_argument('-b', dest='oncokb_api_bearer_token', default='', type=str) parser.add_argument('-q', dest='query_type', default=None, type=str) parser.add_argument('-r', dest='default_reference_genome', default=None, type=str) + parser.add_argument('-d', dest='include_descriptions', action="store_true", default=False) parser.set_defaults(func=main) args = parser.parse_args() diff --git a/README.md b/README.md index a86e1c8..b84abc1 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,11 @@ -## UPDATE: API token required, please see [OncoKB™ API section](#oncokb-api) for more information -## UPDATE: v3.3 introduces additional annotation columns, please check your pipeline before using +## UPDATE: +- v3.4 allows you to include descriptions into the annotated files with `-d` parameter. +- When annotating genomic change, HGVSg, three additional columns will be added. `ONCOKB_HUGO_SYMBOL`, `ONCOKB_PROTEIN_CHANGE` and `ONCOKB_CONSEQUENCE` +- See [Columns added section](#columns-added) for more details # oncokb-annotator +API token required, please see [OncoKB™ API section](#oncokb-api) for more information ## Status @@ -114,28 +117,38 @@ python ${FILE_NAME.py} -i ${INPUT_FILE} -o ${OUTPUT_FILE} -b ${ONCOKB_API_TOKEN} ``` -## Columns added in the annotation files using MafAnnotator/CnaAnnotator/StructuralVariantAnnotator/FusionAnnotator -| Column | Possible Values | Description | -|---------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| ANNOTATED | True, False | Whether the variant is annotated by OncoKB successfully. 
| -| GENE_IN_ONCOKB | True, False | Whether the gene has been curated by the OncoKB Team. | -| VARIANT_IN_ONCOKB | True, False | Whether the variant has been curated by the OncoKB Team. Note: when a variant does not exist, it may still have annotations. | -| MUTATION_EFFECT | Gain-of-function, Likely Gain-of-function, Loss-of-function, Likely Loss-of-function, Switch-of-function, Likely Switch-of-function, Neutral, Likely Neutral, Inconclusive, Unknown | The biological effect of a mutation/alteration on the protein function that gives rise to changes in the biological properties of cells expressing the mutant/altered protein compared to cells expressing the wildtype protein. | -| MUTATION_EFFECT_CITATIONS | PMID, Abstract, Website link | All citations related to the biological effect. | -| ONCOGENIC | Oncogenic, Likely Oncogenic, Likely Neutral, Inconclusive, Unknown, Resistance | In OncoKB™, “oncogenic” is defined as “referring to the ability to induce or cause cancer” as described in the second edition of The Biology of Cancer by Robert Weinberg (2014). | -| LEVEL_* | Therapeutic implications | The leveled therapeutic implications. | -| HIGHEST_LEVEL | LEVEL_1, LEVEL_2, LEVEL_3A, LEVEL_3B, LEVEL_4, LEVEL_R1, LEVEL_R2 | The highest level of evidence for therapeutic implications. Order: LEVEL_R1 > LEVEL_1 > LEVEL_2 > LEVEL_3A > LEVEL_3B > LEVEL_4 > LEVEL_R2 | -| HIGHEST_SENSITIVE_LEVEL | LEVEL_1, LEVEL_2, LEVEL_3A, LEVEL_3B, LEVEL_4 | The highest sensitive level of evidence for therapeutic implications. Order: LEVEL_1 > LEVEL_2 > LEVEL_3A > LEVEL_3B > LEVEL_4 | -| HIGHEST_RESISTANCE_LEVEL | LEVEL_R1, LEVEL_R2 | The highest resistance level of evidence for therapeutic implications. Order: LEVEL_R1 > LEVEL_R2 | -| TX_CITATIONS | PMID, Abstract, Website link | All citations related to therapeutic implications. | -| LEVEL_Dx* | Tumor type the level of evidence is assigned to | The leveled diagnostic implications. 
| -| HIGHEST_DX_LEVEL | LEVEL_Dx1, LEVEL_Dx2, LEVEL_Dx3 | The highest level of evidence for diagnostic implications. | -| DX_CITATIONS | PMID, Abstract, Website link | All citations related to diagnostic implications. | -| LEVEL_Px* | Tumor type the level of evidence is assigned to | The leveled prognostic implications. | -| HIGHEST_PX_LEVEL | LEVEL_Px1, LEVEL_Px2, LEVEL_Px3 | The highest level of evidence for prognostic implications. | -| PX_CITATIONS | PMID, Abstract, Website link | All citations related to prognostic implications. | - -## Columns added in the files using ClinicalDataAnnotator +## Columns added +### MafAnnotator/CnaAnnotator/StructuralVariantAnnotator/FusionAnnotator +| Column | Conditions | Possible Values | Description | +|-----------------------------|----------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| ANNOTATED | | True, False | Whether the variant is annotated by OncoKB successfully. | +| ONCOKB_HUGO_SYMBOL | Only added when annotating genomic change or HGVSg | | When annotating genomic change, we obtained gene hugo symbol from GenomeNexus. This can be cross-referenced with your own gene name. | +| ONCOKB_PROTEIN_CHANGE | Only added when annotating genomic change or HGVSg | | When annotating genomic change, we obtained alteration protein change from GenomeNexus. This can be cross-referenced with your own protein change. | +| ONCOKB_CONSEQUENCE | Only added when annotating genomic change or HGVSg | | When annotating genomic change, we obtained alteration consequence from GenomeNexus. 
This can be cross-referenced with your own consequence/Variant Class. | +| GENE_IN_ONCOKB | | True, False | Whether the gene has been curated by the OncoKB Team. | +| VARIANT_IN_ONCOKB | | True, False | Whether the variant has been curated by the OncoKB Team. Note: when a variant does not exist, it may still have annotations. | +| MUTATION_EFFECT | | Gain-of-function, Likely Gain-of-function, Loss-of-function, Likely Loss-of-function, Switch-of-function, Likely Switch-of-function, Neutral, Likely Neutral, Inconclusive, Unknown | The biological effect of a mutation/alteration on the protein function that gives rise to changes in the biological properties of cells expressing the mutant/altered protein compared to cells expressing the wildtype protein. | +| MUTATION_EFFECT_CITATIONS | | PMID, Abstract, Website link | All citations related to the biological effect. | +| ONCOGENIC | | Oncogenic, Likely Oncogenic, Likely Neutral, Inconclusive, Unknown, Resistance | In OncoKB™, “oncogenic” is defined as “referring to the ability to induce or cause cancer” as described in the second edition of The Biology of Cancer by Robert Weinberg (2014). | +| LEVEL_* | | Therapeutic implications | The leveled therapeutic implications. | +| HIGHEST_LEVEL | | LEVEL_1, LEVEL_2, LEVEL_3A, LEVEL_3B, LEVEL_4, LEVEL_R1, LEVEL_R2 | The highest level of evidence for therapeutic implications. Order: LEVEL_R1 > LEVEL_1 > LEVEL_2 > LEVEL_3A > LEVEL_3B > LEVEL_4 > LEVEL_R2 | +| HIGHEST_SENSITIVE_LEVEL | | LEVEL_1, LEVEL_2, LEVEL_3A, LEVEL_3B, LEVEL_4 | The highest sensitive level of evidence for therapeutic implications. Order: LEVEL_1 > LEVEL_2 > LEVEL_3A > LEVEL_3B > LEVEL_4 | +| HIGHEST_RESISTANCE_LEVEL | | LEVEL_R1, LEVEL_R2 | The highest resistance level of evidence for therapeutic implications. Order: LEVEL_R1 > LEVEL_R2 | +| TX_CITATIONS | | PMID, Abstract, Website link | All citations related to therapeutic implications. 
| +| LEVEL_Dx* | | Tumor type the level of evidence is assigned to | The leveled diagnostic implications. | +| HIGHEST_DX_LEVEL | | LEVEL_Dx1, LEVEL_Dx2, LEVEL_Dx3 | The highest level of evidence for diagnostic implications. | +| DX_CITATIONS | | PMID, Abstract, Website link | All citations related to diagnostic implications. | +| LEVEL_Px* | | Tumor type the level of evidence is assigned to | The leveled prognostic implications. | +| HIGHEST_PX_LEVEL | | LEVEL_Px1, LEVEL_Px2, LEVEL_Px3 | The highest level of evidence for prognostic implications. | +| PX_CITATIONS | | PMID, Abstract, Website link | All citations related to prognostic implications. | +| GENE_SUMMARY | Only when parameter -d is specified | | Brief overview of the gene and its role in cancer | +| VARIANT_SUMMARY | Only when parameter -d is specified | | Variant summary describes the variant oncogenicity, last review if it is VUS | +| TUMOR_TYPE_SUMMARY | Only when parameter -d is specified | | Tumor type summary describes the therapeutic implication that applies to the indication | +| DIAGNOSTIC_SUMMARY | Only when parameter -d is specified | | Diagnostic summary that applies to the indication, for hematologic malignancies only | +| PROGNOSTIC_SUMMARY | Only when parameter -d is specified | | Prognostic summary that applies to the indication, for hematologic malignancies only | +| MUTATION_EFFECT_DESCRIPTION | Only when parameter -d is specified | | The mutation effect description provides a brief overview of the biological and oncogenic effect of the VPS and includes appropriate references to peer-reviewed literature. | + +### ClinicalDataAnnotator Please see description above for columns LEVEL_*, HIGHEST_LEVEL, HIGHEST_SENSITIVE_LEVEL, HIGHEST_RESISTANCE_LEVEL, LEVEL_Dx*, HIGHEST_DX_LEVEL, LEVEL_Px*, HIGHEST_PX_LEVEL. Besides these columns, the following columns will also be added. @@ -151,4 +164,4 @@ Besides these columns, the following columns will also be added. 
| #MUTATIONS_WITH_PROGNOSTIC_IMPLICATIONS | Number of mutations in the sample with prognostic implications. | | #MUTATIONS | Number of mutations in the sample. | ## Questions? -The best way is to email contact@oncokb.org so all our team members can help. +The best way is to email contact@oncokb.org, so all our team members can help. diff --git a/StructuralVariantAnnotator.py b/StructuralVariantAnnotator.py index 7e4775a..8f8ca93 100644 --- a/StructuralVariantAnnotator.py +++ b/StructuralVariantAnnotator.py @@ -20,7 +20,9 @@ def main(argv): if argv.help: log.info( '\n' - 'StructuralVariantAnnotator.py -i -o [-p previous results] [-c ] [-s sample list filter] [-t ] [-u ] [-b ]\n' + 'StructuralVariantAnnotator.py -i -o ' + '[-p previous results] [-c ] [-s sample list filter] [-t ] ' + '[-u ] [-b ] [-d include descriptions]\n' ' Essential structural variant columns (case insensitive):\n' ' GENEA: Hugo gene symbol for gene A\n' ' GENEB: Hugo gene symbol for gene B\n' @@ -63,7 +65,8 @@ def main(argv): validate_oncokb_token() log.info('annotating %s ...' 
% argv.input_file) - process_sv(argv.input_file, argv.output_file, argv.previous_result_file, argv.default_cancer_type, cancertypemap) + process_sv(argv.input_file, argv.output_file, argv.previous_result_file, argv.default_cancer_type, cancertypemap, + argv.include_descriptions) log.info('done!') @@ -81,6 +84,7 @@ def main(argv): parser.add_argument('-u', dest='oncokb_api_url', default='', type=str) parser.add_argument('-v', dest='cancer_hotspots_base_url', default='', type=str) parser.add_argument('-b', dest='oncokb_api_bearer_token', default='', type=str) + parser.add_argument('-d', dest='include_descriptions', action="store_true", default=False) parser.set_defaults(func=main) args = parser.parse_args() diff --git a/flake8.ini b/flake8.ini index 28b9283..03d9cfd 100644 --- a/flake8.ini +++ b/flake8.ini @@ -1,2 +1,2 @@ [flake8] -ignore = E501,W503 \ No newline at end of file +ignore = E501,W503,E126 \ No newline at end of file diff --git a/test_Annotation.py b/test_Annotation.py index 326c7ee..8e699ae 100644 --- a/test_Annotation.py +++ b/test_Annotation.py @@ -3,7 +3,7 @@ import os import logging -from AnnotatorCore import pull_hgvsg_info +from AnnotatorCore import pull_hgvsg_info, DESCRIPTION_HEADERS, ONCOKB_ANNOTATION_HEADERS_GC from AnnotatorCore import pull_genomic_change_info from AnnotatorCore import pull_protein_change_info from AnnotatorCore import pull_structural_variant_info @@ -33,13 +33,19 @@ HIGHEST_PX_LEVEL_INDEX = HIGHEST_DX_LEVEL_INDEX + 5 UNKNOWN = 'Unknown' NUMBER_OF_ANNOTATION_COLUMNS = 27 +NUMBER_OF_DESCRIPTION_COLUMNS = len(DESCRIPTION_HEADERS) +NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS = len(ONCOKB_ANNOTATION_HEADERS_GC) +NUMBER_OF_ANNOTATION_COLUMNS_WITH_DESCRIPTIONS = NUMBER_OF_ANNOTATION_COLUMNS + NUMBER_OF_DESCRIPTION_COLUMNS +NUMBER_OF_GC_ANNOTATION_COLUMNS = NUMBER_OF_ANNOTATION_COLUMNS + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS +NUMBER_OF_GC_ANNOTATION_COLUMNS_WITH_DESCRIPTIONS = NUMBER_OF_GC_ANNOTATION_COLUMNS + 
NUMBER_OF_DESCRIPTION_COLUMNS -def fake_gene_one_query_suite(annotations): +def fake_gene_one_query_suite(annotations, include_descriptions): assert len(annotations) == 1 annotation = annotations[0] - assert len(annotation) == NUMBER_OF_ANNOTATION_COLUMNS + assert len( + annotation) == NUMBER_OF_ANNOTATION_COLUMNS if include_descriptions is False else NUMBER_OF_ANNOTATION_COLUMNS_WITH_DESCRIPTIONS assert annotation[MUTATION_EFFECT_INDEX] == UNKNOWN assert annotation[ONCOGENIC_INDEX] == UNKNOWN assert annotation[HIGHEST_LEVEL_INDEX] == '' @@ -52,7 +58,7 @@ def test_check_protein_change(): ProteinChangeQuery('ABL1', 'BCR-ABL1 Fusion', 'Acute Leukemias of Ambiguous Lineage'), ] - annotations = pull_protein_change_info(queries, False) + annotations = pull_protein_change_info(queries, False, False) assert len(annotations) == 2 annotation = annotations[0] @@ -77,7 +83,7 @@ def test_reference_genome(): GenomicChangeQuery('7', '140753336', '140753336', 'A', 'T', 'LUAD', ReferenceGenome.GRCH38) ] - annotations = pull_genomic_change_info(queries, False) + annotations = pull_genomic_change_info(queries, False, False) assert len(annotations) == 2 annotation37 = annotations[0] @@ -89,7 +95,7 @@ def test_reference_genome(): ProteinChangeQuery('MYD88', 'M219T', 'Ovarian Cancer', ReferenceGenome.GRCH38) ] - annotations = pull_protein_change_info(queries, False) + annotations = pull_protein_change_info(queries, False, False) assert len(annotations) == 2 annotation37 = annotations[0] @@ -103,8 +109,11 @@ def test_fake_gene_protein_change(): ProteinChangeQuery('test1', 'V600E', 'Ovarian Cancer') ] - annotations = pull_protein_change_info(queries, False) - fake_gene_one_query_suite(annotations) + annotations = pull_protein_change_info(queries, False, False) + fake_gene_one_query_suite(annotations, False) + + annotations = pull_protein_change_info(queries, False, False) + fake_gene_one_query_suite(annotations, True) @pytest.mark.skipif(ONCOKB_API_TOKEN in (None, ''), reason="oncokb api 
token required") @@ -116,7 +125,7 @@ def test_check_atypical_alts(): ProteinChangeQuery('TERT', 'Promoter Mutation', 'Bladder Cancer', None, '5\'Flank') ] - annotations = pull_protein_change_info(queries, False) + annotations = pull_protein_change_info(queries, False, False) assert len(annotations) == 4 annotation = annotations[0] @@ -153,26 +162,26 @@ def test_check_hgvsg(): HGVSgQuery('5:g.1295167_1295168delinsAATG', 'LUAD'), ] - annotations = pull_hgvsg_info(queries, False) + annotations = pull_hgvsg_info(queries, False, False) assert len(annotations) == 3 annotation = annotations[0] - assert len(annotation) == NUMBER_OF_ANNOTATION_COLUMNS - assert annotation[MUTATION_EFFECT_INDEX] == 'Gain-of-function' - assert annotation[ONCOGENIC_INDEX] == 'Oncogenic' - assert annotation[HIGHEST_LEVEL_INDEX] == 'LEVEL_1' + assert len(annotation) == NUMBER_OF_GC_ANNOTATION_COLUMNS + assert annotation[MUTATION_EFFECT_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'Gain-of-function' + assert annotation[ONCOGENIC_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'Oncogenic' + assert annotation[HIGHEST_LEVEL_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'LEVEL_1' annotation = annotations[1] - assert len(annotation) == NUMBER_OF_ANNOTATION_COLUMNS - assert annotation[MUTATION_EFFECT_INDEX] == 'Gain-of-function' - assert annotation[ONCOGENIC_INDEX] == 'Oncogenic' - assert annotation[HIGHEST_LEVEL_INDEX] == 'LEVEL_1' + assert len(annotation) == NUMBER_OF_GC_ANNOTATION_COLUMNS + assert annotation[MUTATION_EFFECT_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'Gain-of-function' + assert annotation[ONCOGENIC_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'Oncogenic' + assert annotation[HIGHEST_LEVEL_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'LEVEL_1' annotation = annotations[2] - assert len(annotation) == NUMBER_OF_ANNOTATION_COLUMNS - assert annotation[MUTATION_EFFECT_INDEX] == 'Likely Gain-of-function' - assert annotation[ONCOGENIC_INDEX] == 'Likely 
Oncogenic' - assert annotation[HIGHEST_LEVEL_INDEX] == '' + assert len(annotation) == NUMBER_OF_GC_ANNOTATION_COLUMNS + assert annotation[MUTATION_EFFECT_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'Likely Gain-of-function' + assert annotation[ONCOGENIC_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'Likely Oncogenic' + assert annotation[HIGHEST_LEVEL_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == '' @pytest.mark.skipif(ONCOKB_API_TOKEN in (None, ''), reason="oncokb api token required") @@ -186,26 +195,26 @@ def test_check_genomic_change(): GenomicChangeQuery('5', '1295167', '1295168', 'TC', 'AATG', 'LUAD'), ] - annotations = pull_genomic_change_info(queries, False) + annotations = pull_genomic_change_info(queries, False, False) assert len(annotations) == 3 annotation = annotations[0] - assert len(annotation) == NUMBER_OF_ANNOTATION_COLUMNS - assert annotation[MUTATION_EFFECT_INDEX] == 'Gain-of-function' - assert annotation[ONCOGENIC_INDEX] == 'Oncogenic' - assert annotation[HIGHEST_LEVEL_INDEX] == 'LEVEL_1' + assert len(annotation) == NUMBER_OF_GC_ANNOTATION_COLUMNS + assert annotation[MUTATION_EFFECT_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'Gain-of-function' + assert annotation[ONCOGENIC_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'Oncogenic' + assert annotation[HIGHEST_LEVEL_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'LEVEL_1' annotation = annotations[1] - assert len(annotation) == NUMBER_OF_ANNOTATION_COLUMNS - assert annotation[MUTATION_EFFECT_INDEX] == 'Gain-of-function' - assert annotation[ONCOGENIC_INDEX] == 'Oncogenic' - assert annotation[HIGHEST_LEVEL_INDEX] == 'LEVEL_1' + assert len(annotation) == NUMBER_OF_GC_ANNOTATION_COLUMNS + assert annotation[MUTATION_EFFECT_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'Gain-of-function' + assert annotation[ONCOGENIC_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'Oncogenic' + assert annotation[HIGHEST_LEVEL_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 
'LEVEL_1' annotation = annotations[2] - assert len(annotation) == NUMBER_OF_ANNOTATION_COLUMNS - assert annotation[MUTATION_EFFECT_INDEX] == 'Likely Gain-of-function' - assert annotation[ONCOGENIC_INDEX] == 'Likely Oncogenic' - assert annotation[HIGHEST_LEVEL_INDEX] == '' + assert len(annotation) == NUMBER_OF_GC_ANNOTATION_COLUMNS + assert annotation[MUTATION_EFFECT_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'Likely Gain-of-function' + assert annotation[ONCOGENIC_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'Likely Oncogenic' + assert annotation[HIGHEST_LEVEL_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == '' @pytest.mark.skipif(ONCOKB_API_TOKEN in (None, ''), reason="oncokb api token required") @@ -216,7 +225,7 @@ def test_check_structural_variants(): StructuralVariantQuery('BCR', 'ABL1', 'FUSION', 'Acute Leukemias of Ambiguous Lineage'), ] - annotations = pull_structural_variant_info(queries) + annotations = pull_structural_variant_info(queries, False) assert len(annotations) == 3 annotation = annotations[0] @@ -246,8 +255,11 @@ def test_fake_fusion_gene(): StructuralVariantQuery('test1', 'test2', 'FUSION', 'NSCLC'), ] - annotations = pull_structural_variant_info(queries) - fake_gene_one_query_suite(annotations) + annotations = pull_structural_variant_info(queries, False) + fake_gene_one_query_suite(annotations, False) + + annotations = pull_structural_variant_info(queries, False) + fake_gene_one_query_suite(annotations, True) @pytest.mark.skipif(ONCOKB_API_TOKEN in (None, ''), reason="oncokb api token required") @@ -259,7 +271,7 @@ def test_cna(): CNAQuery('CDKN2A', 'Deletion', 'AML with BCR-ABL1'), ] - annotations = pull_cna_info(queries) + annotations = pull_cna_info(queries, False) assert len(annotations) == 4 annotation = annotations[0] @@ -295,18 +307,27 @@ def test_fake_cna(): CNAQuery('test1', 'Amplification', 'Breast Cancer'), ] - annotations = pull_cna_info(queries) - fake_gene_one_query_suite(annotations) + annotations = 
pull_cna_info(queries, False) + fake_gene_one_query_suite(annotations, False) + annotations = pull_cna_info(queries, True) + fake_gene_one_query_suite(annotations, True) -def check_brca2_s1882_without_cancertype(annotation): - assert len(annotation) == NUMBER_OF_ANNOTATION_COLUMNS - assert annotation[MUTATION_EFFECT_INDEX] == 'Likely Loss-of-function' - assert annotation[ONCOGENIC_INDEX] == 'Likely Oncogenic' - assert annotation[HIGHEST_LEVEL_INDEX] == 'LEVEL_1' - assert annotation[LEVEL_1_INDEX] == 'Olaparib,Olaparib+Bevacizumab,Rucaparib,Olaparib+Abiraterone+Prednisone,Niraparib,Olaparib+Abiraterone+Prednisolone,Talazoparib+Enzalutamide' - assert annotation[LEVEL_2_INDEX] == 'Olaparib,Rucaparib,Niraparib' - assert annotation[LEVEL_3A_INDEX] == 'Olaparib,Talazoparib' + +def check_brca2_s1882_without_cancertype(annotation, genomic_query=False): + assert len(annotation) == NUMBER_OF_GC_ANNOTATION_COLUMNS if genomic_query else NUMBER_OF_ANNOTATION_COLUMNS + assert annotation[( + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS + MUTATION_EFFECT_INDEX) if genomic_query else MUTATION_EFFECT_INDEX] == 'Likely Loss-of-function' + assert annotation[( + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS + ONCOGENIC_INDEX) if genomic_query else ONCOGENIC_INDEX] == 'Likely Oncogenic' + assert annotation[( + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS + HIGHEST_LEVEL_INDEX) if genomic_query else HIGHEST_LEVEL_INDEX] == 'LEVEL_1' + assert annotation[( + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS + LEVEL_1_INDEX) if genomic_query else LEVEL_1_INDEX] == 'Olaparib,Olaparib+Bevacizumab,Rucaparib,Olaparib+Abiraterone+Prednisone,Niraparib,Olaparib+Abiraterone+Prednisolone,Talazoparib+Enzalutamide,Niraparib+Abiraterone Acetate+Prednisone' + assert annotation[( + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS + LEVEL_2_INDEX) if genomic_query else LEVEL_2_INDEX] == 'Olaparib,Rucaparib,Niraparib' + assert annotation[( + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS + LEVEL_3A_INDEX) if genomic_query else LEVEL_3A_INDEX] == 
'Olaparib,Talazoparib' @pytest.mark.skipif(ONCOKB_API_TOKEN in (None, ''), reason="oncokb api token required") @@ -317,7 +338,7 @@ def test_duplicated_treatments(): queries = [ ProteinChangeQuery('BRCA2', 'S1882*', ''), ] - annotations = pull_protein_change_info(queries, False) + annotations = pull_protein_change_info(queries, False, False) assert len(annotations) == 1 check_brca2_s1882_without_cancertype(annotations[0]) @@ -326,7 +347,7 @@ def test_duplicated_treatments(): queries = [ GenomicChangeQuery('13', '32914137', '32914137', 'C', 'A', ''), ] - annotations = pull_genomic_change_info(queries, False) + annotations = pull_genomic_change_info(queries, False, False) assert len(annotations) == 1 - check_brca2_s1882_without_cancertype(annotations[0]) + check_brca2_s1882_without_cancertype(annotations[0], True)