From f1b21363d4e5f354e7791c252075759762c09b20 Mon Sep 17 00:00:00 2001 From: fairliereese Date: Thu, 21 Sep 2023 12:30:42 -0700 Subject: [PATCH 01/31] update --- src/talon/post/create_anndata_from_database.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/talon/post/create_anndata_from_database.py b/src/talon/post/create_anndata_from_database.py index 515eaa6..f3ebca3 100644 --- a/src/talon/post/create_anndata_from_database.py +++ b/src/talon/post/create_anndata_from_database.py @@ -30,33 +30,27 @@ def getOptions(): parser.add_option("--db", dest = "database", help = "TALON database", metavar = "FILE", type = "string") - parser.add_option("--annot", "-a", dest = "annot", help = """Which annotation version to use. Will determine which annotation transcripts are considered known or novel relative to. Note: must be in the TALON database.""", type = "string") - parser.add_option("--pass_list", dest = "pass_list", help = "Pass list file of transcripts to include in the \ output. First column should be TALON gene ID, \ second column should be TALON transcript ID", metavar = "FILE", type = "string", default = None) - parser.add_option("--build", "-b", dest = "build", help = "Genome build to use. Note: must be in the TALON database.", type = "string") - parser.add_option('--gene', dest='gene_level', help='Output AnnData on the gene level rather than the transcript', action='store_true') - parser.add_option("--datasets", "-d", dest = "dataset_file", help = """Optional: A file indicating which datasets should be included (one dataset name per line). Default is to include all datasets.""", metavar = "FILE", type = "string", default = None) - parser.add_option("--o", dest = "ofile", help = "Output file name", metavar = "FILE", type = "string") From 417565a9fce866c065de59a357e14a3b569d2596 Mon Sep 17 00:00:00 2001 From: fairliereese Date: Thu, 21 Sep 2023 12:32:04 -0700 Subject: [PATCH 02/31] added untested functionality to distinguish fusion / readthrough transcripts --- src/talon/post/get_read_annotations.py | 102 +++++++++++--------- src/talon/talon.py | 126 ++++++++++++++++--------- 2 files changed, 137 insertions(+), 91 deletions(-) diff --git a/src/talon/post/get_read_annotations.py b/src/talon/post/get_read_annotations.py index dc18748..c7d93e4 100644 --- a/src/talon/post/get_read_annotations.py +++ b/src/talon/post/get_read_annotations.py @@ -1,7 +1,7 @@ # TALON: Techonology-Agnostic Long Read Analysis Pipeline # Author: Dana Wyman # ----------------------------------------------------------------------------- -# get_read_annotations.py is a utility that queries a TALON +# get_read_annotations.py is a utility that queries a TALON # database in order to get read-specific annotation information. import argparse @@ -34,9 +34,9 @@ def get_args(): def fetch_reads(database, build, tmp_file = None, datasets = None): """ Performs database query to fetch location and gene/transcript assignment - info for each long read in the specified datasets. + info for each long read in the specified datasets. If tmp_file is set to None (default), then the function will return - the query results in a list of lists. If an alternate value is provided, + the query results in a list of lists. 
If an alternate value is provided, then the results will be written to a tmp file of that name.""" if datasets != None: @@ -50,8 +50,8 @@ def fetch_reads(database, build, tmp_file = None, datasets = None): conn.row_factory = sqlite3.Row cursor = conn.cursor() - query = """ SELECT os.read_name, - os.dataset, + query = """ SELECT os.read_name, + os.dataset, loc1.genome_build, os.gene_ID as gene_ID, os.transcript_ID as transcript_ID, @@ -66,17 +66,17 @@ def fetch_reads(database, build, tmp_file = None, datasets = None): os.fraction_As, os.custom_label, os.allelic_label, - os.start_support, + os.start_support, os.end_support FROM observed as os - LEFT JOIN location as loc1 ON - loc1.location_ID = os.start_vertex - LEFT JOIN location as loc2 ON + LEFT JOIN location as loc1 ON + loc1.location_ID = os.start_vertex + LEFT JOIN location as loc2 ON loc2.location_ID = os.end_vertex LEFT JOIN genes ON genes.gene_ID = os.gene_ID - LEFT JOIN transcripts ON + LEFT JOIN transcripts ON transcripts.transcript_ID = os.transcript_ID - WHERE loc1.genome_build = '$build' + WHERE loc1.genome_build = '$build' AND loc2.genome_build = '$build' """ query = Template(query + dataset_str) try: @@ -86,7 +86,7 @@ def fetch_reads(database, build, tmp_file = None, datasets = None): raise RuntimeError("Problem with reads database query") if tmp_file != None: - o = open(tmp_file, 'w') + o = open(tmp_file, 'w') else: reads = [] @@ -113,16 +113,16 @@ def fetch_reads(database, build, tmp_file = None, datasets = None): read_end = entry["end_vertex_pos"] - TTS_diff else: raise ValueError("Unrecognized strand value: " + str(strand)) - + # Create entry for output out_read = (entry["read_name"], entry["dataset"], entry["genome_build"], entry["gene_ID"], - entry["transcript_ID"], entry["chrom"], + entry["transcript_ID"], entry["chrom"], read_start, read_end, strand, entry["n_exons"], entry["read_length"], entry["fraction_As"], entry["custom_label"], entry["allelic_label"], entry["start_support"], entry["end_support"]) - + if tmp_file != None: o.write("\t".join([ str(x) for x in out_read ]) + "\n") else: @@ -131,7 +131,7 @@ def fetch_reads(database, build, tmp_file = None, datasets = None): # Return results or close file if count == 0: - raise ValueError(("No reads detected. Make sure your dataset names are " + raise ValueError(("No reads detected. Make sure your dataset names are " "correct.")) if tmp_file != None: @@ -161,6 +161,13 @@ def get_gene_novelty(database): for entry in cursor: gene_novelty[entry[0]] = "Antisense" + # Fetch fusion genes + cursor.execute("""SELECT ID FROM gene_annotations + WHERE attribute = "fusion_novel" + AND value = "TRUE";""") + for entry in cursor: + gene_novelty[entry[0]] = "Fusion" + # Fetch intergenic genes cursor.execute("""SELECT ID FROM gene_annotations WHERE attribute = "intergenic_novel" @@ -172,7 +179,7 @@ def get_gene_novelty(database): def get_transcript_novelty(database): """ Given a database, get the novelty status of each transcript. 
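
        Each novelty getter in this module follows the same attribute/value
        lookup against the annotation tables; a minimal sketch of the shared
        pattern, with a hypothetical database path:

            import sqlite3

            def ids_with_attribute(db, table, attribute):
                # e.g. table = "gene_annotations", attribute = "fusion_novel",
                # or table = "transcript_annotations", attribute = "fusion_transcript"
                with sqlite3.connect(db) as conn:
                    cursor = conn.cursor()
                    cursor.execute("SELECT ID FROM {} WHERE attribute = ? "
                                   "AND value = 'TRUE';".format(table), (attribute,))
                    return [row[0] for row in cursor.fetchall()]

            fusion_genes = ids_with_attribute("talon.db", "gene_annotations",
                                              "fusion_novel")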
""" - + transcript_novelty = {} with sqlite3.connect(database) as conn: conn.row_factory = sqlite3.Row @@ -184,59 +191,66 @@ def get_transcript_novelty(database): AND value = "KNOWN";""") for entry in cursor: transcript_novelty[entry[0]] = "Known" - + # Fetch ISM transcripts cursor.execute("""SELECT ID FROM transcript_annotations WHERE attribute = "ISM_transcript" AND value = "TRUE";""") for entry in cursor: transcript_novelty[entry[0]] = "ISM" - + # Fetch NIC transcripts cursor.execute("""SELECT ID FROM transcript_annotations WHERE attribute = "NIC_transcript" AND value = "TRUE";""") for entry in cursor: transcript_novelty[entry[0]] = "NIC" - + # Fetch NNC transcripts cursor.execute("""SELECT ID FROM transcript_annotations WHERE attribute = "NNC_transcript" AND value = "TRUE";""") for entry in cursor: transcript_novelty[entry[0]] = "NNC" - + # Fetch antisense transcripts cursor.execute("""SELECT ID FROM transcript_annotations WHERE attribute = "antisense_transcript" AND value = "TRUE";""") for entry in cursor: transcript_novelty[entry[0]] = "Antisense" - + # Fetch intergenic transcripts cursor.execute("""SELECT ID FROM transcript_annotations WHERE attribute = "intergenic_transcript" AND value = "TRUE";""") for entry in cursor: transcript_novelty[entry[0]] = "Intergenic" - + # Fetch genomic transcripts cursor.execute("""SELECT ID FROM transcript_annotations WHERE attribute = "genomic_transcript" AND value = "TRUE";""") for entry in cursor: transcript_novelty[entry[0]] = "Genomic" - + + # Fetch fusion transcripts + cursor.execute("""SELECT ID FROM transcript_annotations + WHERE attribute = "fusion_transcript" + AND value = "TRUE";""") + for entry in cursor: + transcript_novelty[entry[0]] = "Fusion" + return transcript_novelty def get_ISM_novelty(database): """ Given a database, get the ISM subtype of each ISM transcript. """ - + all_ISMs = set() prefix_ISMs = set() suffix_ISMs = set() ISM_novelty = {} - + with sqlite3.connect(database) as conn: conn.row_factory = sqlite3.Row cursor = conn.cursor() @@ -275,7 +289,7 @@ def get_ISM_novelty(database): return ISM_novelty -def get_gene_annotations(database): +def get_gene_annotations(database): """ Create a dictionary linking each TALON gene ID to its human-readable name and accession ID """ @@ -286,13 +300,13 @@ def get_gene_annotations(database): conn.row_factory = sqlite3.Row cursor = conn.cursor() - cursor.execute("""SELECT ID, ga.value FROM gene_annotations as ga + cursor.execute("""SELECT ID, ga.value FROM gene_annotations as ga WHERE attribute = "gene_name";""") for entry in cursor: gene_name[entry["ID"]] = entry["value"] cursor.execute("""SELECT ID, ga.value FROM gene_annotations as ga - WHERE attribute = "gene_id";""") + WHERE attribute = "gene_id";""") for entry in cursor: gene_ID[entry["ID"]] = entry["value"] @@ -347,8 +361,8 @@ def make_read_annot_file(database, build, outprefix, datasets = "all"): 22. Start support (external assay) 23. End support (external assay) - By default, reads from all datasets in the database are included, but - this can be modified by supplying a list/tuple of dataset names to the + By default, reads from all datasets in the database are included, but + this can be modified by supplying a list/tuple of dataset names to the datasets parameter. 
""" tmp_read_file = outprefix + "_reads.tmp" @@ -356,19 +370,19 @@ def make_read_annot_file(database, build, outprefix, datasets = "all"): # Make annotation dicts gene_names, gene_IDs = get_gene_annotations(database) - transcript_names, transcript_IDs = get_transcript_annotations(database) + transcript_names, transcript_IDs = get_transcript_annotations(database) # Make novelty dicts gene_novelty = get_gene_novelty(database) transcript_novelty = get_transcript_novelty(database) - ISM_novelty = get_ISM_novelty(database) + ISM_novelty = get_ISM_novelty(database) fname = outprefix + "_talon_read_annot.tsv" o = open(fname, 'w') - colnames = [ "read_name", "dataset", "genome_build", "chrom", + colnames = [ "read_name", "dataset", "genome_build", "chrom", "read_start", "read_end", "strand", "n_exons", "read_length", "gene_ID", "transcript_ID", "annot_gene_id", "annot_transcript_id", - "annot_gene_name", "annot_transcript_name", "gene_novelty", + "annot_gene_name", "annot_transcript_name", "gene_novelty", "transcript_novelty", "ISM_subtype", "fraction_As", "custom_label", "allelic_label", "start_support", "end_support"] o.write("\t".join(colnames) + "\n") @@ -396,8 +410,8 @@ def make_read_annot_file(database, build, outprefix, datasets = "all"): if curr_transcript_novelty == "ISM": curr_ISM_novelty = ISM_novelty[transcript_ID] else: - curr_ISM_novelty = "None" - + curr_ISM_novelty = "None" + # Get annotation info try: annot_gene_id = gene_IDs[gene_ID] @@ -413,7 +427,7 @@ def make_read_annot_file(database, build, outprefix, datasets = "all"): annot_transcript_id = "None" try: annot_transcript_name = transcript_names[transcript_ID] - except: + except: annot_transcript_name = "None" gene_ID = str(gene_ID) @@ -421,9 +435,9 @@ def make_read_annot_file(database, build, outprefix, datasets = "all"): o.write("\t".join([read_name, dataset, genome_build, chrom, read_start, read_end, strand, n_exons, read_length, gene_ID, transcript_ID, - annot_gene_id, annot_transcript_id, - annot_gene_name, annot_transcript_name, - curr_gene_novelty, curr_transcript_novelty, + annot_gene_id, annot_transcript_id, + annot_gene_name, annot_transcript_name, + curr_gene_novelty, curr_transcript_novelty, curr_ISM_novelty, fraction_As, custom_label, allelic_label, start_support, end_support]) + "\n") @@ -467,10 +481,10 @@ def main(): raise ValueError("Database file '%s' does not exist!" % database) if datasets != None: - datasets = datasets.split(",") - + datasets = datasets.split(",") + make_read_annot_file(database, build, outprefix, datasets = datasets) - + if __name__ == '__main__': main() diff --git a/src/talon/talon.py b/src/talon/talon.py index e9e0a5a..bc6058e 100644 --- a/src/talon/talon.py +++ b/src/talon/talon.py @@ -950,30 +950,13 @@ def process_NIC(chrom, positions, strand, edge_IDs, vertex_IDs, transcript_dict, this, look up each vertex in the vertex_2_gene dict, and keep track of all same-strand genes. 
""" - gene_matches = [] start_end_info = {} - for vertex in vertex_IDs: - if vertex in vertex_2_gene: - curr_matches = vertex_2_gene[vertex] - - # Make sure the gene is on the correct strand - gene_matches += [x[0] - for x in list(curr_matches) if x[1] == strand] - - - # Now count up how often we see each gene - gene_tally = dict((x, gene_matches.count(x)) for x in set(gene_matches)) - - # print(gene_matches) - # print(gene_tally) - # TODO: deal with fusions - - # For the main assignment, pick the gene that is observed the most - if len(gene_tally) == 0: - return None, None, [], None - - gene_ID = max(gene_tally, key=gene_tally.get) + gene_ID, fusion = find_gene_match_on_vertex_basis(vertex_IDs, + strand, + vertex_2_gene) + if gene_ID == None: + return None, None, [], None, fusion # Get matches for the ends start_vertex, start_exon, start_novelty, known_start, diff_5p = process_5p(chrom, @@ -1009,35 +992,60 @@ def process_NIC(chrom, positions, strand, edge_IDs, vertex_IDs, transcript_dict, transcript_ID = novel_transcript["transcript_ID"] novelty = [(transcript_ID, run_info.idprefix, "TALON", "NIC_transcript", "TRUE")] + fusion = False - return gene_ID, transcript_ID, novelty, start_end_info + return gene_ID, transcript_ID, novelty, start_end_info, fusion def find_gene_match_on_vertex_basis(vertex_IDs, strand, vertex_2_gene): """ Use vertices in a transcript to try to pinpoint the gene it belongs to. + + Parameters: + vertex_IDs (list of int): List of vertices in the read + strand (str): Strand of read + vertex_2_gene (dict): Dict. w/ keys = vertices, items = gene IDs of genes + that use this vertex + + Returns: + gene_ID (str or None): Gene ID of matching gene, or None if novel gene + needs to be created + fusion (bool): Whether gene read is from might be fusion / read through """ gene_matches = [] + n_gene_matches = [] + for vertex in vertex_IDs: if vertex in vertex_2_gene: curr_matches = vertex_2_gene[vertex] - # Make sure the gene is on the correct strand - gene_matches += [x[0] for x in curr_matches if x[1] == strand] + # enforce same strandedness + matches = [m for m in list(curr_matches) if m[1] == strand] - if len(gene_matches) == 0: - return None + gene_matches += [x[0] for x in list(matches)] - # Now count up how often we see each gene + # how many genes have this splice site? 
+            n_gene_matches.append(len(matches))
+
+    # how many splice sites are from each gene
     gene_tally = dict((x, gene_matches.count(x)) for x in set(gene_matches))

-    # print(gene_matches)
-    # print(gene_tally)
-    # TODO: deal with fusions
+    # no shared splice junctions
+    if len(gene_matches) == 0:
+        return None, False
+
+    # if we hit more than one gene, but no single splice site is shared
+    # between the gene hits (i.e. the matched genes do not overlap),
+    # the read likely spans multiple genes and we need to make a new
+    # fusion / readthrough gene
+    elif max(n_gene_matches) <= 1 and len(gene_tally) > 1:
+        return None, True

     # For the main assignment, pick the gene that is observed the most
-    gene_ID = max(gene_tally, key=gene_tally.get)
+    else:
+        gene_ID = max(gene_tally, key=gene_tally.get)
+        fusion = False

-    return gene_ID
+    return gene_ID, fusion


 def process_NNC(chrom, positions, strand, edge_IDs, vertex_IDs, transcript_dict,
@@ -1047,10 +1055,10 @@ def process_NNC(chrom, positions, strand, edge_IDs, vertex_IDs, transcript_dict,
     novelty = []
     start_end_info = {}

-    gene_ID = find_gene_match_on_vertex_basis(
+    gene_ID, fusion = find_gene_match_on_vertex_basis(
         vertex_IDs, strand, vertex_2_gene)
     if gene_ID == None:
-        return None, None, [], None
+        return None, None, [], None, fusion

     # Get matches for the ends
     start_vertex, start_exon, start_novelty, known_start, diff_5p = process_5p(chrom,
@@ -1085,8 +1093,9 @@ def process_NNC(chrom, positions, strand, edge_IDs, vertex_IDs, transcript_dict,
     novelty.append((transcript_ID, run_info.idprefix,
                     "TALON", "NNC_transcript", "TRUE"))
+    fusion = False

-    return gene_ID, transcript_ID, novelty, start_end_info
+    return gene_ID, transcript_ID, novelty, start_end_info, fusion


 def process_spliced_antisense(chrom, positions, strand, edge_IDs, vertex_IDs,
@@ -1102,7 +1111,7 @@ def process_spliced_antisense(chrom, positions, strand, edge_IDs, vertex_IDs,
         anti_strand = "-"
     else:
         anti_strand = "+"
-    anti_gene_ID = find_gene_match_on_vertex_basis(vertex_IDs, anti_strand,
+    anti_gene_ID, fusion = find_gene_match_on_vertex_basis(vertex_IDs, anti_strand,
                                                    vertex_2_gene)
     if anti_gene_ID == None:
         return None, None, gene_novelty, transcript_novelty, start_end_info
@@ -1155,7 +1164,8 @@ def process_spliced_antisense(chrom, positions, strand, edge_IDs, vertex_IDs,

 def process_remaining_mult_cases(chrom, positions, strand, edge_IDs, vertex_IDs,
                                  transcript_dict, gene_starts, gene_ends, edge_dict,
-                                 locations, vertex_2_gene, run_info, cursor, tmp_gene):
+                                 locations, vertex_2_gene, run_info, cursor, tmp_gene,
+                                 fusion):
     """ This function is a catch-all for multiexonic transcripts that were not
         FSM, ISM, NIC, NNC, or spliced antisense.
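
        The fusion flag passed in here comes from find_gene_match_on_vertex_basis
        above; a self-contained sketch of that tally heuristic, with hypothetical
        vertex and gene IDs:

            def sketch_gene_match(vertex_IDs, strand, vertex_2_gene):
                # tally same-strand gene hits per vertex, mirroring the logic above
                gene_matches, n_gene_matches = [], []
                for vertex in vertex_IDs:
                    matches = [m for m in vertex_2_gene.get(vertex, set())
                               if m[1] == strand]
                    gene_matches += [m[0] for m in matches]
                    n_gene_matches.append(len(matches))
                if len(gene_matches) == 0:
                    return None, False      # novel gene, not fusion
                tally = {g: gene_matches.count(g) for g in set(gene_matches)}
                if max(n_gene_matches) <= 1 and len(tally) > 1:
                    return None, True       # >1 gene, no shared vertex: fusion
                return max(tally, key=tally.get), False

            # splice sites split between genes 10 and 20 -> (None, True)
            v2g = {1: {(10, "+")}, 2: {(10, "+")}, 3: {(20, "+")}, 4: {(20, "+")}}
            print(sketch_gene_match([1, 2, 3, 4], "+", v2g))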
""" @@ -1199,17 +1209,25 @@ def process_remaining_mult_cases(chrom, positions, strand, edge_IDs, vertex_IDs, start_end_info["vertex_IDs"] = vertex_IDs if gene_ID == None: + + if fusion: + t_nov = 'fusion_transcript' + g_nov = 'fusion_novel' + else: + t_nov = 'intergenic_transcript' + g_nov = 'intergenic_novel' + gene_ID = create_gene(chrom, positions[0], positions[-1], strand, cursor, tmp_gene) gene_novelty.append((gene_ID, run_info.idprefix, "TALON", - "intergenic_novel", "TRUE")) + g_nov, "TRUE")) transcript_ID = create_transcript(chrom, positions[0], positions[-1], gene_ID, edge_IDs, vertex_IDs, transcript_dict)["transcript_ID"] transcript_novelty.append((transcript_ID, run_info.idprefix, "TALON", - "intergenic_transcript", "TRUE")) + t_nov, "TRUE")) elif match_strand != strand: anti_gene_ID = gene_ID @@ -1291,6 +1309,7 @@ def identify_transcript(chrom, positions, strand, cursor, location_dict, edge_di all_exons_known = check_all_exons_known(e_novelty) splice_vertices_known = (sum(v_novelty) == 0) all_exons_novel = (reduce(operator.mul, e_novelty, 1) == 1) + fusion = False # Look for FSM or ISM. if all_SJs_known: @@ -1298,6 +1317,7 @@ def identify_transcript(chrom, positions, strand, cursor, location_dict, edge_di all_matches = search_for_ISM(edge_IDs, transcript_dict) if all_matches != None: # Look for FSM first + print('looking for fsm') gene_ID, transcript_ID, transcript_novelty, start_end_info = process_FSM(chrom, positions, strand, edge_IDs, vertex_IDs, all_matches, @@ -1306,6 +1326,7 @@ def identify_transcript(chrom, positions, strand, cursor, location_dict, edge_di location_dict, run_info) if gene_ID == None: # Now look for ISM + print('looking for ism') gene_ID, transcript_ID, transcript_novelty, start_end_info = process_ISM(chrom, positions, strand, edge_IDs, @@ -1317,7 +1338,8 @@ def identify_transcript(chrom, positions, strand, cursor, location_dict, edge_di run_info) # Look for NIC if gene_ID == None: - gene_ID, transcript_ID, transcript_novelty, start_end_info = process_NIC(chrom, + print('looking for nic') + gene_ID, transcript_ID, transcript_novelty, start_end_info, fusion = process_NIC(chrom, positions, strand, edge_IDs, vertex_IDs, transcript_dict, @@ -1328,7 +1350,8 @@ def identify_transcript(chrom, positions, strand, cursor, location_dict, edge_di # Novel in catalog transcripts have known splice donors and acceptors, # but new connections between them. elif splice_vertices_known and gene_ID == None: - gene_ID, transcript_ID, transcript_novelty, start_end_info = process_NIC(chrom, + print('looking for nic (again?)') + gene_ID, transcript_ID, transcript_novelty, start_end_info, fusion = process_NIC(chrom, positions, strand, edge_IDs, vertex_IDs, transcript_dict, @@ -1337,7 +1360,8 @@ def identify_transcript(chrom, positions, strand, cursor, location_dict, edge_di vertex_2_gene, run_info) # Antisense transcript with splice junctions matching known gene - if splice_vertices_known and gene_ID == None: + if splice_vertices_known and gene_ID == None and not fusion: + print('looking for spliced antisese') gene_ID, transcript_ID, gene_novelty, transcript_novelty, start_end_info = \ process_spliced_antisense(chrom, positions, strand, edge_IDs, @@ -1351,8 +1375,9 @@ def identify_transcript(chrom, positions, strand, cursor, location_dict, edge_di # Novel not in catalog transcripts contain new splice donors/acceptors # and contain at least one splice junction. 
- elif not(splice_vertices_known): - gene_ID, transcript_ID, transcript_novelty, start_end_info = process_NNC(chrom, + elif not(splice_vertices_known) and not fusion: + print('lookign for NNCs') + gene_ID, transcript_ID, transcript_novelty, start_end_info, fusion = process_NNC(chrom, positions, strand, edge_IDs, vertex_IDs, transcript_dict, @@ -1361,6 +1386,7 @@ def identify_transcript(chrom, positions, strand, cursor, location_dict, edge_di vertex_2_gene, run_info) # Transcripts that don't match the previous categories end up here if gene_ID == None: + print('looking for this other stuff') gene_ID, transcript_ID, gene_novelty, transcript_novelty, start_end_info = \ process_remaining_mult_cases(chrom, positions, strand, edge_IDs, @@ -1369,7 +1395,12 @@ def identify_transcript(chrom, positions, strand, cursor, location_dict, edge_di gene_starts, gene_ends, edge_dict, location_dict, vertex_2_gene, run_info, - cursor, tmp_gene) + cursor, tmp_gene, + fusion) + + print(gene_ID) + print(gene_novelty) + print(transcript_novelty) # Add all novel vertices to vertex_2_gene now that we have the gene ID vertex_IDs = start_end_info["vertex_IDs"] @@ -1398,7 +1429,6 @@ def identify_transcript(chrom, positions, strand, cursor, location_dict, edge_di "transcript_name", talon_transcript_name)) transcript_novelty.append((transcript_ID, run_info.idprefix, "TALON", "transcript_id", talon_transcript_name)) - # Add annotation entries for any novel exons exon_novelty = [] exons = edge_IDs[::2] @@ -2466,6 +2496,8 @@ def annotate_read(sam_record: pysam.AlignedSegment, cursor, run_info, """ # Parse attributes to determine the chromosome, positions, and strand of the transcript read_ID = sam_record.query_name + print() + print(read_ID) if not run_info.use_cb_tag: dataset = sam_record.get_tag("RG") else: From 0519f32d9681b26f455f5f8bbccf4f011e4902a0 Mon Sep 17 00:00:00 2001 From: fairliereese Date: Thu, 21 Sep 2023 12:32:54 -0700 Subject: [PATCH 03/31] added tests for option to filter known transcripts for all known, regardless of whether observed --- testing_suite/filtering/optparse_mock_filt.py | 1 + testing_suite/filtering/test_fetch_known.py | 27 ++++++++++++++++--- 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/testing_suite/filtering/optparse_mock_filt.py b/testing_suite/filtering/optparse_mock_filt.py index ef5c7cb..75aebe0 100644 --- a/testing_suite/filtering/optparse_mock_filt.py +++ b/testing_suite/filtering/optparse_mock_filt.py @@ -11,3 +11,4 @@ def __init__(self, database, annot, max_frac_A = 0.5, self.outprefix = outprefix self.allow_genomic = allow_genomic self.exclude_ISMs = exclude_ISMs + self.include_annot = False diff --git a/testing_suite/filtering/test_fetch_known.py b/testing_suite/filtering/test_fetch_known.py index 27fcc39..2d883e4 100644 --- a/testing_suite/filtering/test_fetch_known.py +++ b/testing_suite/filtering/test_fetch_known.py @@ -7,36 +7,55 @@ def test_get_known_transcripts_all_datasets(): transcripts when datasets are not speicified """ database = "scratch/filter/test.db" - known = filt.get_known_transcripts(database, "toy", datasets = None) + include_annot = False + known = filt.get_known_transcripts(database, "toy", + include_annot, + datasets = None) + assert list(known.gene_ID) == [1, 1] + assert list(known.transcript_ID) == [1, 2] + +def test_get_known_transcripts_dataset_1_include_annot(): + """ Make sure the get_known_transcripts function returns all known + transcripts with the include_annot function """ + + database = "scratch/filter/test.db" + include_annot 
= True
+    known = filt.get_known_transcripts(database, "toy",
+                                       include_annot,
+                                       datasets = None)
     assert list(known.gene_ID) == [1, 1]
     assert list(known.transcript_ID) == [1, 2]

 def test_get_known_transcripts_specific_dataset():
-    """ Now make sure the correct transcript is returned when the dataset is
+    """ Now make sure the correct transcript is returned when the dataset is
         specified. """

     database = "scratch/filter/test.db"
+    include_annot = False

     # Both datasets
-    known = filt.get_known_transcripts(database, "toy",
+    known = filt.get_known_transcripts(database, "toy",
+                                       include_annot,
                                        datasets = ["dataset_1", "dataset_2"])
     assert list(known.gene_ID) == [1, 1]
     assert list(known.transcript_ID) == [1, 2]

     # Dataset 1
     known = filt.get_known_transcripts(database, "toy",
+                                       include_annot,
                                        datasets = ["dataset_1"])
     assert list(known.iloc[0]) == [1, 1]
     assert len(known) == 1

     # Dataset 2
     known = filt.get_known_transcripts(database, "toy",
+                                       include_annot,
                                        datasets = ["dataset_2"])
     assert list(known.iloc[0]) == [1, 2]
     assert len(known) == 1

     # Dataset 3
     known = filt.get_known_transcripts(database, "toy",
+                                       include_annot,
                                        datasets = ["dataset_3"])
     assert len(known) == 0
-

From c64fd572404cbe61b4dcf396750f0b13cb059643 Mon Sep 17 00:00:00 2001
From: fairliereese
Date: Thu, 21 Sep 2023 12:34:18 -0700
Subject: [PATCH 04/31] updated existing tests to support new function
 signatures

---
 testing_suite/test_NIC_identification.py      | 32 +++++++++----------
 testing_suite/test_NNC_identification.py      |  3 +-
 .../test_find_gene_match_on_vertex_basis.py   |  9 ++++--
 .../test_process_remaining_mult_cases.py      | 22 ++++++++-----
 4 files changed, 38 insertions(+), 28 deletions(-)

diff --git a/testing_suite/test_NIC_identification.py b/testing_suite/test_NIC_identification.py
index 33eb664..92e10fe 100644
--- a/testing_suite/test_NIC_identification.py
+++ b/testing_suite/test_NIC_identification.py
@@ -6,7 +6,7 @@ class TestIdentifyNIC(object):

     def test_NIC_match(self):
-        """ Example where the transcript is an NIC match to an existing one by
+        """ Example where the transcript is an NIC match to an existing one by
            virtue of skipping an exon.
""" conn, cursor = get_db_cursor() @@ -28,22 +28,23 @@ def test_NIC_match(self): strand = "+" v_novelty = [0, 0] - gene_ID, transcript_ID, novelty, start_end_info = talon.process_NIC(chrom, - positions, - strand, edge_IDs, + gene_ID, transcript_ID, novelty, start_end_info, fusion = talon.process_NIC(chrom, + positions, + strand, edge_IDs, vertex_IDs, transcript_dict, - gene_starts, gene_ends, - edge_dict, location_dict, + gene_starts, gene_ends, + edge_dict, location_dict, vertex_2_gene, run_info) correct_gene_ID = fetch_correct_ID("TG1", "gene", cursor) assert gene_ID == correct_gene_ID assert start_end_info["vertex_IDs"] == [1,2,5,6] assert transcript_dict[frozenset(start_end_info["edge_IDs"])] != None + assert fusion == False conn.close() def test_antisense(self): - """ Example where the vertices are known but there is no same-strand + """ Example where the vertices are known but there is no same-strand match """ conn, cursor = get_db_cursor() @@ -64,7 +65,7 @@ def test_antisense(self): chrom = "chr1" start = 1000 end = 1 - edge_IDs = [ talon.edge_counter.value() + 1 ] + edge_IDs = [ talon.edge_counter.value() + 1 ] positions = [ 1000, 900, 100, 1] vertex_IDs = [ 5, 2 ] strand = "-" @@ -73,16 +74,16 @@ def test_antisense(self): # Find antisense match gene_ID, transcript_ID, gene_novelty, transcript_novelty, start_end_info = \ - talon.process_spliced_antisense(chrom, positions, - strand, edge_IDs, - vertex_IDs, + talon.process_spliced_antisense(chrom, positions, + strand, edge_IDs, + vertex_IDs, transcript_dict, - gene_starts, - gene_ends, - edge_dict, locations, + gene_starts, + gene_ends, + edge_dict, locations, vertex_2_gene, run_info, cursor, "temp_gene") - #anti_gene_ID = talon.find_gene_match_on_vertex_basis(vertex_IDs, + #anti_gene_ID = talon.find_gene_match_on_vertex_basis(vertex_IDs, # anti_strand, # vertex_2_gene) @@ -92,4 +93,3 @@ def test_antisense(self): assert start_end_info["vertex_IDs"] == [6, 5, 2, 1] conn.close() - diff --git a/testing_suite/test_NNC_identification.py b/testing_suite/test_NNC_identification.py index 87c426e..63f1cd6 100644 --- a/testing_suite/test_NNC_identification.py +++ b/testing_suite/test_NNC_identification.py @@ -28,7 +28,7 @@ def test_NNC_match(self): strand = "+" v_novelty = [0, 0] - gene_ID, transcript_ID, transcript_novelty, start_end_info = talon.process_NNC(chrom, + gene_ID, transcript_ID, transcript_novelty, start_end_info, fusion = talon.process_NNC(chrom, positions, strand, edge_IDs, vertex_IDs, transcript_dict, @@ -40,4 +40,5 @@ def test_NNC_match(self): assert gene_ID == correct_gene_ID assert start_end_info["vertex_IDs"] == [1] + vertex_IDs + [6] assert transcript_dict[frozenset(start_end_info["edge_IDs"])] != None + assert fusion == False conn.close() diff --git a/testing_suite/test_find_gene_match_on_vertex_basis.py b/testing_suite/test_find_gene_match_on_vertex_basis.py index 730e482..b7c625e 100644 --- a/testing_suite/test_find_gene_match_on_vertex_basis.py +++ b/testing_suite/test_find_gene_match_on_vertex_basis.py @@ -19,10 +19,11 @@ def test_perfect_match(self): vertex_IDs = (1, 2, 3, 4, 5, 6) strand = "+" - gene_ID = talon.find_gene_match_on_vertex_basis(vertex_IDs, strand, vertex2gene) + gene_ID, fusion = talon.find_gene_match_on_vertex_basis(vertex_IDs, strand, vertex2gene) correct_gene_ID = fetch_correct_ID("TG1", "gene", cursor) assert gene_ID == correct_gene_ID + assert fusion == False conn.close() def test_NNC_type_match(self): @@ -38,10 +39,11 @@ def test_NNC_type_match(self): vertex_IDs = (1, 200, 3, 4, 5, 6) strand = 
"+" - gene_ID = talon.find_gene_match_on_vertex_basis(vertex_IDs, strand, vertex2gene) + gene_ID, fusion = talon.find_gene_match_on_vertex_basis(vertex_IDs, strand, vertex2gene) correct_gene_ID = fetch_correct_ID("TG1", "gene", cursor) assert gene_ID == correct_gene_ID + assert fusion == False conn.close() def test_no_match(self): @@ -56,7 +58,8 @@ def test_no_match(self): vertex_IDs = (1000, 2000, 3000, 4000) strand = "+" - gene_ID = talon.find_gene_match_on_vertex_basis(vertex_IDs, strand, vertex2gene) + gene_ID, fusion = talon.find_gene_match_on_vertex_basis(vertex_IDs, strand, vertex2gene) assert gene_ID == None + assert fusion == False conn.close() diff --git a/testing_suite/test_process_remaining_mult_cases.py b/testing_suite/test_process_remaining_mult_cases.py index a912741..cc63516 100644 --- a/testing_suite/test_process_remaining_mult_cases.py +++ b/testing_suite/test_process_remaining_mult_cases.py @@ -30,16 +30,18 @@ def test_intergenic(self): edge_IDs = [ talon.edge_counter.value() + 1, talon.edge_counter.value() + 2 ] vertex_IDs = [ talon.vertex_counter.value() + 1, talon.vertex_counter.value() + 2 ] strand = "+" + fusion = False gene_ID, transcript_ID, gene_novelty, transcript_novelty, start_end_info = \ - talon.process_remaining_mult_cases(chrom, positions, - strand, edge_IDs, - vertex_IDs, + talon.process_remaining_mult_cases(chrom, positions, + strand, edge_IDs, + vertex_IDs, transcript_dict, - gene_starts, gene_ends, + gene_starts, gene_ends, edge_dict, location_dict, - vertex_2_gene, run_info, - cursor, "temp_gene") + vertex_2_gene, run_info, + cursor, "temp_gene", + fusion) assert gene_ID == correct_gene_ID assert transcript_dict[frozenset(start_end_info["edge_IDs"])] != None @@ -72,6 +74,7 @@ def test_antisense(self): edge_IDs = [ talon.edge_counter.value() + 1, talon.edge_counter.value() + 2 ] vertex_IDs = [ talon.vertex_counter.value() + 1, talon.vertex_counter.value() + 2 ] strand = "-" + fusion = False gene_ID, transcript_ID, gene_novelty, transcript_novelty, start_end_info = \ talon.process_remaining_mult_cases(chrom, positions, @@ -81,7 +84,8 @@ def test_antisense(self): gene_starts, gene_ends, edge_dict, location_dict, vertex_2_gene, run_info, - cursor, "temp_gene") + cursor, "temp_gene", + fusion) assert gene_ID == correct_gene_ID assert transcript_dict[frozenset(start_end_info["edge_IDs"])] != None assert gene_novelty[0][-2] == "antisense_gene" @@ -111,6 +115,7 @@ def test_genomic(self): edge_IDs = [ talon.edge_counter.value() + 1, talon.edge_counter.value() + 2 ] vertex_IDs = [ talon.vertex_counter.value() + 1, talon.vertex_counter.value() + 2 ] strand = "-" + fusion = False gene_ID, transcript_ID, gene_novelty, transcript_novelty, start_end_info = \ talon.process_remaining_mult_cases(chrom, positions, @@ -120,7 +125,8 @@ def test_genomic(self): gene_starts, gene_ends, edge_dict, location_dict, vertex_2_gene, run_info, - cursor, "temp_gene") + cursor, "temp_gene", + fusion) correct_gene_ID = fetch_correct_ID("TG3", "gene", cursor) assert gene_ID == correct_gene_ID assert transcript_dict[frozenset(start_end_info["edge_IDs"])] != None From 5c2e1970e8c40ae1b07b86c9cc5b5748fa38f61b Mon Sep 17 00:00:00 2001 From: fairliereese Date: Thu, 21 Sep 2023 13:18:12 -0700 Subject: [PATCH 05/31] added test for low-level fusion gene det. 
and fixed relevant function

---
 src/talon/talon.py                            |  2 +-
 .../test_process_remaining_mult_cases.py      | 46 +++++++++++++++++++
 2 files changed, 47 insertions(+), 1 deletion(-)

diff --git a/src/talon/talon.py b/src/talon/talon.py
index bc6058e..8c7809d 100644
--- a/src/talon/talon.py
+++ b/src/talon/talon.py
@@ -1173,7 +1173,7 @@ def process_remaining_mult_cases(chrom, positions, strand, edge_IDs, vertex_IDs,
     transcript_novelty = []
     start_end_info = {}

-    if not run_info.create_novel_spliced_genes:
+    if not run_info.create_novel_spliced_genes and not fusion:
         gene_ID, match_strand = search_for_overlap_with_gene(chrom, positions[0],
                                                              positions[-1], strand,
                                                              cursor, run_info, tmp_gene)
diff --git a/testing_suite/test_process_remaining_mult_cases.py b/testing_suite/test_process_remaining_mult_cases.py
index cc63516..36966bb 100644
--- a/testing_suite/test_process_remaining_mult_cases.py
+++ b/testing_suite/test_process_remaining_mult_cases.py
@@ -5,6 +5,52 @@

 class TestIdentifyRemaining(object):

+    def test_fusion(self):
+        """ Example where the transcript shares splice junctions between
+            two different genes
+        """
+        conn, cursor = get_db_cursor()
+        build = "toy_build"
+        db = "scratch/toy.db"
+        talon.get_counters(db)
+        edge_dict = init_refs.make_edge_dict(cursor)
+        location_dict = init_refs.make_location_dict(build, cursor)
+        run_info = talon.init_run_info(db, build)
+        transcript_dict = init_refs.make_transcript_dict(cursor, build)
+        vertex_2_gene = init_refs.make_vertex_2_gene_dict(cursor)
+        gene_starts = init_refs.make_gene_start_or_end_dict(cursor, build, "start")
+        gene_ends = init_refs.make_gene_start_or_end_dict(cursor, build, "end")
+        correct_gene_ID = talon.gene_counter.value() + 1
+
+        chrom = "chr1"
+        positions = [1, 100, 500, 600, 900, 1010, 5000, 5550, 6000]
+        strand = "+"
+        edge_IDs = [2, 3, 4]+[ talon.edge_counter.value() + 1, talon.edge_counter.value() + 2 ]
+        vertex_IDs = [2, 3, 4, 5, 9, 10]
+        v_novelty = [0, 0, 0, 0, 0, 0]
+
+        # Construct temp novel gene db
+        init_refs.make_temp_novel_gene_table(cursor, "toy_build")
+        fusion = True
+
+        gene_ID, transcript_ID, gene_novelty, transcript_novelty, start_end_info = \
+            talon.process_remaining_mult_cases(chrom, positions,
+                                               strand, edge_IDs,
+                                               vertex_IDs,
+                                               transcript_dict,
+                                               gene_starts, gene_ends,
+                                               edge_dict, location_dict,
+                                               vertex_2_gene, run_info,
+                                               cursor, "temp_gene",
+                                               fusion)
+
+        assert gene_ID == correct_gene_ID
+        assert transcript_dict[frozenset(start_end_info["edge_IDs"])] != None
+        assert gene_novelty[0][-2] == "fusion_novel"
+        conn.close()
+        conn.close()
+
     def test_intergenic(self):
         """ Example where the transcript is an NIC match to an existing one by
            virtue of a new splice donor.
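
A condensed sketch of the control flow that patch 05 changes in
process_remaining_mult_cases (assuming, as in the surrounding code, that
gene_ID simply stays None when the overlap search is skipped):

    # fusion reads now skip the gene-overlap search entirely, so gene_ID
    # remains None and a brand-new gene is created further down with the
    # "fusion_novel" / "fusion_transcript" attributes instead of the
    # "intergenic" ones
    if not run_info.create_novel_spliced_genes and not fusion:
        gene_ID, match_strand = search_for_overlap_with_gene(
            chrom, positions[0], positions[-1], strand,
            cursor, run_info, tmp_gene)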
From 04894a196d31407a50129018a7fd84a7f3c47696 Mon Sep 17 00:00:00 2001 From: fairliereese Date: Thu, 21 Sep 2023 14:16:11 -0700 Subject: [PATCH 06/31] added examples for readthrough --- .../input_files/readthrough/config.csv | 1 + .../hl60_1_1_subset_remapped_sorted.bam | Bin 0 -> 6521 bytes .../input_files/readthrough/readthrough.gtf | 294 ++++++++++++++++++ 3 files changed, 295 insertions(+) create mode 100644 testing_suite/input_files/readthrough/config.csv create mode 100644 testing_suite/input_files/readthrough/hl60_1_1_subset_remapped_sorted.bam create mode 100644 testing_suite/input_files/readthrough/readthrough.gtf diff --git a/testing_suite/input_files/readthrough/config.csv b/testing_suite/input_files/readthrough/config.csv new file mode 100644 index 0000000..974c858 --- /dev/null +++ b/testing_suite/input_files/readthrough/config.csv @@ -0,0 +1 @@ +hl60_1_1,hl60,SequelII,input_files/readthrough/hl60_1_1_subset_remapped_sorted.bam diff --git a/testing_suite/input_files/readthrough/hl60_1_1_subset_remapped_sorted.bam b/testing_suite/input_files/readthrough/hl60_1_1_subset_remapped_sorted.bam new file mode 100644 index 0000000000000000000000000000000000000000..f33a4b84b7f0927fc5be26c7b840261a1f010e43 GIT binary patch literal 6521 zcmZXZRZtv^mV}2vgF}G81ef6M1h){}-2x;KT!!E>I0X0LE`z(n4DK2v5Zr?d8XWF_ z?`~~v?Vg7|&;8I}RaaAe$3Ox6`(Drh(lS5*4tBPIL$s+JvG@bW$_dFG6Ocs8B!D!? z#RK6xRSB|%P%ot>4jR&a@mpD49(7!=?_g&~`>DWXQJ}%%FL`T=N=Ji6$Uu!bgQ7A( z`p6^|EiFfA$z=ny;5r!Z^f>iFWTX2pl}t>ETThOAP5yA(%Q3it;K*=xr`VjJPf@MZ zyJV>Rshdbsc0yc}SbLnDa!gejcHGU@O}s5_K^R+E9to0M@EZr?!C@ z0Jqfl#o83{et8?p-bH9-82Tk4yD?-Uz>?Zf*Am=x3?{}RbXlBQg7~Oc6`|Eg9Kc>o zMjT#u=@=@I697gS+j>sp5hzR*KV~XbvE%;Q9lmgxFou1&EEWiL zYo6qeTkz3MPic1?Ug*{X(_m3?@^91&!JQrHD_;C%vhq?KNyRSnzqg)?9{SmTsq64* zZkfhI*5NBCN#w;5hGW4K(jB1N@vgerYRKBEZ zK@&6~g796YMJ3sVXk_^wG~=!TvSXN|{#?nY`ZykX^u6G$xoIca`P%)nS@l`<2EVYW z(43Gg^yi}MkL`9B5#!~YuL+tqvHcm#A!;myk!{Va_QBiqLzzg_#`&Nem2R6PO`-yq zXZ?XFySmr_Yn!3#J)nqtI zvfVEJ82J#U%2DU-vUR>W>wBS3M9FsT195Ld&+9dnnvJy%BPV zfuSj1)UYJ%3N>T~*GsYl)puD2)5;KH7b(DGa1r0=+0AU|Va}DShueFUMzF_bDvp}oG~AQOco`aY`MIt044qWT#-xoDlsAWqEh_yy zDy)yRS&)VBytEL(uVt2QoicKc)RY!LB)`%-j6d58ZR*8J^Bq@Du8@k>Cklotya!`y zltE>fFl8YYIwZ4P5jw>E-STd&p+xeEGMr`L=tp1V^I0s=P$Y{YCrf^OlD7M#@L+{d zBbORe4%47io~$d)+_YS8v1n*%+8+u#|4kjBrmhzegDpeScjC1kl~75GH!I0#S%a5a zlzQnBM)#bu?|W6g_HbgHrL!Dq(j*8;8$h>g2cK|BARZ^ZA~L$pf52#GY}=3i$<13wsp2xbnklmjyZ@vyp?T@^bK<) z5f?JXCn(=5C!mO28GHOoh-&C~FFgwwO7zSo-T6D0m}?5lay^}j_YBWi0+nm~mSqbn z?Vu6%`Vb`y&s0d_E)FiuZoIW_Xk!-Tq$Y6{$9i|2JLK%G>BJ_$x!Oq2c2%G#DKGVO z=y`{Nab3E75{G9hVS<7aU7ehp-Z%3}DIVwvSnHW`oxYMhZ7f{&j%DKYh0P znBKGgen*3D%+Hp6vv_hinKSEpeL$VogazeUfQ#p@KP>=NMMr10g+#tYgLvDNichC_ zrI4G;4Wx5X-WE&+kuRP-MaBUYC<=rMkPn8=GItY;Zpf$Dg90M0x!9l=cG#VvnOEgo zoSP*SLdCdUHS1%7hCQlhzy4s~a-L@NqaHP1S>M0jWB(1_T$0;Y<2+vOvdixXk8@ZS zy7l^Q?VUIlDQ9MLx8!xiYUOL;l@+4~zKP>J`LW*Q=jdyJh&_*tEN(mLT5VVzKt`;u zzReXMuj9*J$4Uvw{x{{5$B^3d zo)L=G^X$v^^iR)-*)<0xx3OnEr7^vd7!LP0sXkqI#kDi=sZ3Djmn3DC`r1u;?(VMl zJzVd_>#C}Jzqm|JE%OS#Ka_jvd2069({ntjx+g<8j8p_X`8gbMRNgZa#M}m~`9$lA ze!J>-g&o4+rXUQE-&N%H7^En(=651ml)qbVja~|}ikg||545nKy`@8=km4b{)uwuK zq5Gx2{c2+k4#Vjkl^04CI(a8E59v^s_M3$O?esiE)>VHLUn2PFL&vxRvh{tJuZz zz18o}SMD9GG;a^!O|nkR^J(GLe;s9d+Dnt*!OxU2W=T_&;Ohg%7xK{!Dc-m>-b8ql z9+~%fsq&1+$R{ZP_&`u zIZ-d+>tEIjZ@}m}+uIZ{4Zz9oSrRe*QzP7T=~^n*^ZZwe^RX&vs>bVS3Y~e#?o#50 zJ#g`F%OCt{Gq=aNb&7FxPSDlp3^6e-CC>r$?G)7c%vb8=-k~%s?b7b_ZP!67c8!?s z^eUeAvy>AN*xrwE$nMe1lX2HGX8q=gbiC&LVcH$=kFIqsiw_VqEux?O~ z)*@2p;QI9YiU!miVw)Ge2;G~_K{p_U(JO$H;E7xyW3(d(HPAYUl41nnjB?O60XWMn 
zY^rdh-qC1B2mw3=4!QK{iUhrJ?FH#|y!}Z5;?8Ay0GLlmiPL!C(rszxrO807go6QU zpyXDtCE(1okf3O^BeT7YqOL-%0s@T(9Jq4PEuY(-mMy_bo%ZPmC&ivP_F4WOAG#K? zEV(eN6IXYGCo1U3@H05BP-;cNyN3u?0RHh3SQ9s-@WWnIJ~;+Gr6)DI^h$QlxXyt8 z`)k*4mWA1>D3aixIKn4&O{y)s#sfJb*%@xcnTOyZT|me{5BLisasm3fpGZ0Z?RkO+ zfZF-`C#ZH6c50@E$_qSu65N%%x+8|c)G-fExBoHgyo}e1Ir2Sc0-57m&*9py@!+~6 zm7D#|*DI@FM@sR!DU}DLll(6Eb(EE+^D-J8$m-G5IAr=2y$4_%1158`CwcHR6Q%`= zHs7G|QjK1HGal?j`&RI>cqCBDAEqaxSGVqE2xq-%yZ{?o}s2Z+1dZN8wT&7{Nql7HwQW2 z0UIY;!xaRuNX`OcM{CpzPl?y@E6cz%@=xUTApFw2>CM0~%UjW?FDQz!fum%go-3+G z&xnd42*o)zr5rENnIwaNqz5#C@qCNAtfT*kW@VnY6K5l@?mj;P(}2UohJ!!pQK3sw zv^Oxt1_{h0s3mua}21Z4$j8&RnD=oD~yq%#gUXp~D{!0DOhVpKG`= ze<4^44*BIPB3BYj3$7LC>;M4O|7ioNI48zqan^Fg-D1d0yfoY8+R}0nn?%G_A%M9{ zD`!4!cZO6_jJilH0Xkgsh(50TkH!Z|$#uZ8vhAW2}P8D79~qeT?5Hb$6y?F*P`iZA{5% z?BbQN}zVBq1-`~2Mt->^}ez$U(2m1ly%&3Wh40EmKthtCf zuPtp_gX6N*MiRU7+3>s=ieDcs6hAtT^B8(F(p(-43=)N_8Qal@n+|6%?fe-|)(2|v zj8~J%Z>^|aGoV2dc(3;TuH16X-v)H^Odq1<7hQgGekR+85=}uy^JO;;sf-bxP-Q)34 ztQ6Rq&!jpN*ocrZt!qQ_n2tozU)Go}iSA<6GrVur3YvoQI_e<32nXyJlg;ya^o6Bm zy}9l$Y+)~<>#N)4gU@8nsy^kK;-Z)@$EwRcBu21#LGY#7`e=HP_vW|HdRouz)dXJl zFMUT=4~q9L$CKAGL#d@6@Dn8afWuyApMNN|8#|U3TK5q9`Qb%vV?T0n?ZYiQ&Vj}C zBFfgQeytK;XVeW}GT7Z0)O|*e;LCNS$roRAp@47bS&MPi0hrpmq8ArNzjkJ} z!*$KCd)FLlz5jeetyo_gtM$7Mh5zx7zcI6W=S3N)NH4PlN<+fl!NJ)|uSB*5`0^kp zvEanECGg@7!(l*rUpf!0v&hsy*PttIWVAy0sA9|^(3 zBERf$Rm2i+TF<#|Kc#SMchr(o=1GLuG{USfsWi2s?&5{dZ%41PpP#X;^1#+vTs0#LL(6t@mq4#l8bpm8XUulUxPQ%gknz;iLK`HuFnSBWtFi2-)=1KW=iSl@pC#R7=Wy2|7 z`)U{KRtnnSH8zusFb;;Gq8O~ffB;rHy+J;*L!Qq5d!2<0QU5%7P%G$SK^gUSvNoB! z{hK+84IP~ii-Y_5a;DU>60`SXs809@pJ0^Cx&GuxT1eZ%ir}MbKQV;=%=JhG+XA{c-J4uas{m&u`U=z9APr=B)RLKZ zxCzy26BWI(eH)Dd3<)M^{yQLe#vZ4d7m*asojROrxdMW#-5z?m0s(a%c7B)bPe$ParoX!T+_&<1 z-{!Vg{dPYXRuOa{H<76Kje9l;5OKcvdGXOc;J){ov8$_izLc@sT?48*BVt)(v>zT= zM1r-T=>57PD2{tE2eWJm>b*sKP|n^3`4o&TP$xsyp1!s4YOKiNroNo%*j&on~r3s)T+`& zRglI)y~S2v@Xile!lXnW;1NaMJ@XuZwHZF7uG`X;ND?I(%Uw)_W5h$CyqpP4#Y{a& zt_{Mk9<``UZaiurh8w>AQNA#oUmuvb4bo+0Y2sl-r&J}N?=~7oF5-R11xDDGo)(5L zWVDF!fVEX{!lcx);}olK3a#<);O6}(tMt<@g}XszzgIJGca69{C6Pw!NJ6ulnG4^s z*TlQ({^@JrF~>CDPG>e^#Z4XMUZ72Fg3jZBe9AJwx9@Bux>ta)+2whVscxC_{IA?J zUw^O=cX5OCc5%tJ-Y>6^gEMwiLUx<@Eb=I7M$a0(#RqzSHusL!YFkuMm`9_6DuJA3 zMQ=Fi2s0JZsqBHT!37F3nsZPMHZqetdE(!1w{2?ENDMUPsnl8Zg|0D@cHV_G;H`M* z&woW-JhWBU)eFtF9SBvx_FdTO#TjCd^9u|!o;#wt?h0icP|ooPd^+Ql7^7Q#HKcYLy$E}f6ics*|OfuN9FcYb~?M~ zhg9)S!x^O1pcq&IkwI04BC6ax`1uizG8j{BpgMm~#%YY4 z^TF!;71oFLU51&3DcD-{qIEnJDT-M-xPT& z8GcY1cURC~wx(L;thKn?6t21<{PemX?^hUp^o7UODDrq9LzKZra;`g~)zdeti zoBSAGe2e`}A16)Bn0y}EN_Ov>KDX_PQVHMB?WgT>-aee^tWa^j4{8u4Wg;#2V3Ybv zZNtHQgpObt37ox#HG1E`TlG`0ilN#o5hE2X?orr&1VwL+gdj*Gq#O5d4pSOv>$Ef5 zq8;0;?IfYW-JerX*=Gv5Q?JI_@84m4sdPHf$Chrh`a@dz=1INa$b4ePYxd1}2I2ha zjz!0Ga$F98gap>>@`SNG)*%t}moBf@|H*z+A`r6zWBty4vu=Bg>aRU=@9z39G!)sJ zi`Aw}q#*w)=eDaZsg0+K-G%54_O%80FC^zB%}D{5NSjE?k7ms~l}=nFlJUCsWWkc9 zt>QOJmNN)kTq=2|?2oK1vfr>NFM~y?$bX$5)uHvoc0Ao@o*mU34LzCqwU|+fyV%^l W>2(jGRj~a3b?Pku`9E9BzvW-7#D!S^ literal 0 HcmV?d00001 diff --git a/testing_suite/input_files/readthrough/readthrough.gtf b/testing_suite/input_files/readthrough/readthrough.gtf new file mode 100644 index 0000000..7891fb5 --- /dev/null +++ b/testing_suite/input_files/readthrough/readthrough.gtf @@ -0,0 +1,294 @@ +chr1 HAVANA gene 10430102 10452153 . + . gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; tag "overlapping_locus"; +chr1 HAVANA transcript 10430102 10452153 . + . 
gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000400900.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-201"; transcript_support_level "2"; tag "basic,appris_principal_4,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000368904.1"; protein_id "ENSP00000383692.2"; ccdsid "CCDS114.1"; +chr1 HAVANA exon 10430102 10430568 . + . gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000400900.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-201"; transcript_support_level "2"; tag "basic,appris_principal_4,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000368904.1"; exon_number "1"; exon_id "ENSE00003720917.1"; protein_id "ENSP00000383692.2"; ccdsid "CCDS114.1"; +chr1 HAVANA CDS 10430518 10430568 . + 0 gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000400900.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-201"; transcript_support_level "2"; tag "basic,appris_principal_4,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000368904.1"; exon_number "1"; exon_id "ENSE00003720917.1"; protein_id "ENSP00000383692.2"; ccdsid "CCDS114.1"; +chr1 HAVANA start_codon 10430518 10430520 . + 0 gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000400900.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-201"; transcript_support_level "2"; tag "basic,appris_principal_4,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000368904.1"; exon_number "1"; exon_id "ENSE00003720917.1"; protein_id "ENSP00000383692.2"; ccdsid "CCDS114.1"; +chr1 HAVANA exon 10433842 10433965 . + . gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000400900.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-201"; transcript_support_level "2"; tag "basic,appris_principal_4,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000368904.1"; exon_number "2"; exon_id "ENSE00003626111.1"; protein_id "ENSP00000383692.2"; ccdsid "CCDS114.1"; +chr1 HAVANA CDS 10433842 10433965 . + 0 gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000400900.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-201"; transcript_support_level "2"; tag "basic,appris_principal_4,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000368904.1"; exon_number "2"; exon_id "ENSE00003626111.1"; protein_id "ENSP00000383692.2"; ccdsid "CCDS114.1"; +chr1 HAVANA exon 10434657 10434690 . + . gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000400900.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-201"; transcript_support_level "2"; tag "basic,appris_principal_4,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000368904.1"; exon_number "3"; exon_id "ENSE00003530260.1"; protein_id "ENSP00000383692.2"; ccdsid "CCDS114.1"; +chr1 HAVANA CDS 10434657 10434690 . 
+ 2 gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000400900.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-201"; transcript_support_level "2"; tag "basic,appris_principal_4,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000368904.1"; exon_number "3"; exon_id "ENSE00003530260.1"; protein_id "ENSP00000383692.2"; ccdsid "CCDS114.1"; +chr1 HAVANA exon 10440347 10440413 . + . gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000400900.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-201"; transcript_support_level "2"; tag "basic,appris_principal_4,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000368904.1"; exon_number "4"; exon_id "ENSE00003654668.1"; protein_id "ENSP00000383692.2"; ccdsid "CCDS114.1"; +chr1 HAVANA CDS 10440347 10440413 . + 1 gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000400900.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-201"; transcript_support_level "2"; tag "basic,appris_principal_4,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000368904.1"; exon_number "4"; exon_id "ENSE00003654668.1"; protein_id "ENSP00000383692.2"; ccdsid "CCDS114.1"; +chr1 HAVANA exon 10451377 10452153 . + . gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000400900.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-201"; transcript_support_level "2"; tag "basic,appris_principal_4,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000368904.1"; exon_number "5"; exon_id "ENSE00003482235.1"; protein_id "ENSP00000383692.2"; ccdsid "CCDS114.1"; +chr1 HAVANA CDS 10451377 10451592 . + 0 gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000400900.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-201"; transcript_support_level "2"; tag "basic,appris_principal_4,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000368904.1"; exon_number "5"; exon_id "ENSE00003482235.1"; protein_id "ENSP00000383692.2"; ccdsid "CCDS114.1"; +chr1 HAVANA stop_codon 10451593 10451595 . + 0 gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000400900.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-201"; transcript_support_level "2"; tag "basic,appris_principal_4,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000368904.1"; exon_number "5"; exon_id "ENSE00003482235.1"; protein_id "ENSP00000383692.2"; ccdsid "CCDS114.1"; +chr1 HAVANA UTR 10430102 10430517 . + . gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000400900.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-201"; transcript_support_level "2"; tag "basic,appris_principal_4,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000368904.1"; exon_number "1"; exon_id "ENSE00003720917.1"; protein_id "ENSP00000383692.2"; ccdsid "CCDS114.1"; +chr1 HAVANA UTR 10451593 10452153 . + . 
gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000400900.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-201"; transcript_support_level "2"; tag "basic,appris_principal_4,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000368904.1"; exon_number "5"; exon_id "ENSE00003482235.1"; protein_id "ENSP00000383692.2"; ccdsid "CCDS114.1"; +chr1 HAVANA transcript 10430384 10451659 . + . gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000470413.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-203"; transcript_support_level "2"; tag "basic,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000130799.1"; protein_id "ENSP00000433615.1"; ccdsid "CCDS53266.1"; +chr1 HAVANA exon 10430384 10430568 . + . gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000470413.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-203"; transcript_support_level "2"; tag "basic,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000130799.1"; exon_number "1"; exon_id "ENSE00002253437.1"; protein_id "ENSP00000433615.1"; ccdsid "CCDS53266.1"; +chr1 HAVANA CDS 10430518 10430568 . + 0 gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000470413.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-203"; transcript_support_level "2"; tag "basic,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000130799.1"; exon_number "1"; exon_id "ENSE00002253437.1"; protein_id "ENSP00000433615.1"; ccdsid "CCDS53266.1"; +chr1 HAVANA start_codon 10430518 10430520 . + 0 gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000470413.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-203"; transcript_support_level "2"; tag "basic,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000130799.1"; exon_number "1"; exon_id "ENSE00002253437.1"; protein_id "ENSP00000433615.1"; ccdsid "CCDS53266.1"; +chr1 HAVANA exon 10433842 10433965 . + . gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000470413.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-203"; transcript_support_level "2"; tag "basic,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000130799.1"; exon_number "2"; exon_id "ENSE00003626111.1"; protein_id "ENSP00000433615.1"; ccdsid "CCDS53266.1"; +chr1 HAVANA CDS 10433842 10433965 . + 0 gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000470413.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-203"; transcript_support_level "2"; tag "basic,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000130799.1"; exon_number "2"; exon_id "ENSE00003626111.1"; protein_id "ENSP00000433615.1"; ccdsid "CCDS53266.1"; +chr1 HAVANA exon 10434657 10434690 . + . 
gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000470413.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-203"; transcript_support_level "2"; tag "basic,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000130799.1"; exon_number "3"; exon_id "ENSE00003530260.1"; protein_id "ENSP00000433615.1"; ccdsid "CCDS53266.1"; +chr1 HAVANA CDS 10434657 10434690 . + 2 gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000470413.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-203"; transcript_support_level "2"; tag "basic,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000130799.1"; exon_number "3"; exon_id "ENSE00003530260.1"; protein_id "ENSP00000433615.1"; ccdsid "CCDS53266.1"; +chr1 HAVANA exon 10451377 10451659 . + . gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000470413.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-203"; transcript_support_level "2"; tag "basic,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000130799.1"; exon_number "4"; exon_id "ENSE00003665651.1"; protein_id "ENSP00000433615.1"; ccdsid "CCDS53266.1"; +chr1 HAVANA CDS 10451377 10451398 . + 1 gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000470413.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-203"; transcript_support_level "2"; tag "basic,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000130799.1"; exon_number "4"; exon_id "ENSE00003665651.1"; protein_id "ENSP00000433615.1"; ccdsid "CCDS53266.1"; +chr1 HAVANA stop_codon 10451399 10451401 . + 0 gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000470413.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-203"; transcript_support_level "2"; tag "basic,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000130799.1"; exon_number "4"; exon_id "ENSE00003665651.1"; protein_id "ENSP00000433615.1"; ccdsid "CCDS53266.1"; +chr1 HAVANA UTR 10430384 10430517 . + . gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000470413.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-203"; transcript_support_level "2"; tag "basic,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000130799.1"; exon_number "1"; exon_id "ENSE00002253437.1"; protein_id "ENSP00000433615.1"; ccdsid "CCDS53266.1"; +chr1 HAVANA UTR 10451399 10451659 . + . gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000470413.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-203"; transcript_support_level "2"; tag "basic,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000130799.1"; exon_number "4"; exon_id "ENSE00003665651.1"; protein_id "ENSP00000433615.1"; ccdsid "CCDS53266.1"; +chr1 HAVANA transcript 10430435 10451411 . + . 
gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000602787.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-205"; transcript_support_level "3"; tag "basic,appris_alternative_2,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000467583.2"; protein_id "ENSP00000473509.2"; ccdsid "CCDS115.1"; +chr1 HAVANA exon 10430435 10430568 . + . gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000602787.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-205"; transcript_support_level "3"; tag "basic,appris_alternative_2,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000467583.2"; exon_number "1"; exon_id "ENSE00003784866.1"; protein_id "ENSP00000473509.2"; ccdsid "CCDS115.1"; +chr1 HAVANA CDS 10430518 10430568 . + 0 gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000602787.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-205"; transcript_support_level "3"; tag "basic,appris_alternative_2,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000467583.2"; exon_number "1"; exon_id "ENSE00003784866.1"; protein_id "ENSP00000473509.2"; ccdsid "CCDS115.1"; +chr1 HAVANA start_codon 10430518 10430520 . + 0 gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000602787.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-205"; transcript_support_level "3"; tag "basic,appris_alternative_2,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000467583.2"; exon_number "1"; exon_id "ENSE00003784866.1"; protein_id "ENSP00000473509.2"; ccdsid "CCDS115.1"; +chr1 HAVANA exon 10433842 10433965 . + . gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000602787.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-205"; transcript_support_level "3"; tag "basic,appris_alternative_2,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000467583.2"; exon_number "2"; exon_id "ENSE00003626111.1"; protein_id "ENSP00000473509.2"; ccdsid "CCDS115.1"; +chr1 HAVANA CDS 10433842 10433965 . + 0 gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000602787.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-205"; transcript_support_level "3"; tag "basic,appris_alternative_2,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000467583.2"; exon_number "2"; exon_id "ENSE00003626111.1"; protein_id "ENSP00000473509.2"; ccdsid "CCDS115.1"; +chr1 HAVANA exon 10434657 10434690 . + . gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000602787.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-205"; transcript_support_level "3"; tag "basic,appris_alternative_2,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000467583.2"; exon_number "3"; exon_id "ENSE00003530260.1"; protein_id "ENSP00000473509.2"; ccdsid "CCDS115.1"; +chr1 HAVANA CDS 10434657 10434690 . 
+ 2 gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000602787.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-205"; transcript_support_level "3"; tag "basic,appris_alternative_2,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000467583.2"; exon_number "3"; exon_id "ENSE00003530260.1"; protein_id "ENSP00000473509.2"; ccdsid "CCDS115.1"; +chr1 HAVANA exon 10440347 10440413 . + . gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000602787.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-205"; transcript_support_level "3"; tag "basic,appris_alternative_2,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000467583.2"; exon_number "4"; exon_id "ENSE00003654668.1"; protein_id "ENSP00000473509.2"; ccdsid "CCDS115.1"; +chr1 HAVANA CDS 10440347 10440413 . + 1 gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000602787.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-205"; transcript_support_level "3"; tag "basic,appris_alternative_2,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000467583.2"; exon_number "4"; exon_id "ENSE00003654668.1"; protein_id "ENSP00000473509.2"; ccdsid "CCDS115.1"; +chr1 HAVANA exon 10442265 10442441 . + . gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000602787.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-205"; transcript_support_level "3"; tag "basic,appris_alternative_2,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000467583.2"; exon_number "5"; exon_id "ENSE00003232254.1"; protein_id "ENSP00000473509.2"; ccdsid "CCDS115.1"; +chr1 HAVANA CDS 10442265 10442402 . + 0 gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000602787.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-205"; transcript_support_level "3"; tag "basic,appris_alternative_2,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000467583.2"; exon_number "5"; exon_id "ENSE00003232254.1"; protein_id "ENSP00000473509.2"; ccdsid "CCDS115.1"; +chr1 HAVANA stop_codon 10442403 10442405 . + 0 gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000602787.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-205"; transcript_support_level "3"; tag "basic,appris_alternative_2,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000467583.2"; exon_number "5"; exon_id "ENSE00003232254.1"; protein_id "ENSP00000473509.2"; ccdsid "CCDS115.1"; +chr1 HAVANA exon 10451377 10451411 . + . gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000602787.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-205"; transcript_support_level "3"; tag "basic,appris_alternative_2,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000467583.2"; exon_number "6"; exon_id "ENSE00003573719.3"; protein_id "ENSP00000473509.2"; ccdsid "CCDS115.1"; +chr1 HAVANA UTR 10430435 10430517 . + . 
gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000602787.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-205"; transcript_support_level "3"; tag "basic,appris_alternative_2,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000467583.2"; exon_number "1"; exon_id "ENSE00003784866.1"; protein_id "ENSP00000473509.2"; ccdsid "CCDS115.1"; +chr1 HAVANA UTR 10442403 10442441 . + . gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000602787.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-205"; transcript_support_level "3"; tag "basic,appris_alternative_2,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000467583.2"; exon_number "5"; exon_id "ENSE00003232254.1"; protein_id "ENSP00000473509.2"; ccdsid "CCDS115.1"; +chr1 HAVANA UTR 10451377 10451411 . + . gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000602787.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-205"; transcript_support_level "3"; tag "basic,appris_alternative_2,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000467583.2"; exon_number "6"; exon_id "ENSE00003573719.3"; protein_id "ENSP00000473509.2"; ccdsid "CCDS115.1"; +chr1 HAVANA transcript 10430804 10452003 . + . gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000602296.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-204"; transcript_support_level "3"; tag "overlapping_uORF,upstream_uORF,basic,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000488626.1"; protein_id "ENSP00000473401.2"; ccdsid "CCDS72699.1"; +chr1 HAVANA exon 10430804 10431401 . + . gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000602296.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-204"; transcript_support_level "3"; tag "overlapping_uORF,upstream_uORF,basic,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000488626.1"; exon_number "1"; exon_id "ENSE00003725534.2"; protein_id "ENSP00000473401.2"; ccdsid "CCDS72699.1"; +chr1 HAVANA CDS 10431290 10431401 . + 0 gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000602296.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-204"; transcript_support_level "3"; tag "overlapping_uORF,upstream_uORF,basic,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000488626.1"; exon_number "1"; exon_id "ENSE00003725534.2"; protein_id "ENSP00000473401.2"; ccdsid "CCDS72699.1"; +chr1 HAVANA start_codon 10431290 10431292 . + 0 gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000602296.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-204"; transcript_support_level "3"; tag "overlapping_uORF,upstream_uORF,basic,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000488626.1"; exon_number "1"; exon_id "ENSE00003725534.2"; protein_id "ENSP00000473401.2"; ccdsid "CCDS72699.1"; +chr1 HAVANA exon 10434657 10434690 . + . 
gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000602296.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-204"; transcript_support_level "3"; tag "overlapping_uORF,upstream_uORF,basic,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000488626.1"; exon_number "2"; exon_id "ENSE00003530260.1"; protein_id "ENSP00000473401.2"; ccdsid "CCDS72699.1"; +chr1 HAVANA CDS 10434657 10434690 . + 2 gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000602296.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-204"; transcript_support_level "3"; tag "overlapping_uORF,upstream_uORF,basic,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000488626.1"; exon_number "2"; exon_id "ENSE00003530260.1"; protein_id "ENSP00000473401.2"; ccdsid "CCDS72699.1"; +chr1 HAVANA exon 10440347 10440413 . + . gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000602296.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-204"; transcript_support_level "3"; tag "overlapping_uORF,upstream_uORF,basic,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000488626.1"; exon_number "3"; exon_id "ENSE00003654668.1"; protein_id "ENSP00000473401.2"; ccdsid "CCDS72699.1"; +chr1 HAVANA CDS 10440347 10440413 . + 1 gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000602296.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-204"; transcript_support_level "3"; tag "overlapping_uORF,upstream_uORF,basic,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000488626.1"; exon_number "3"; exon_id "ENSE00003654668.1"; protein_id "ENSP00000473401.2"; ccdsid "CCDS72699.1"; +chr1 HAVANA exon 10451377 10452003 . + . gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000602296.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-204"; transcript_support_level "3"; tag "overlapping_uORF,upstream_uORF,basic,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000488626.1"; exon_number "4"; exon_id "ENSE00003613950.2"; protein_id "ENSP00000473401.2"; ccdsid "CCDS72699.1"; +chr1 HAVANA CDS 10451377 10451592 . + 0 gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000602296.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-204"; transcript_support_level "3"; tag "overlapping_uORF,upstream_uORF,basic,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000488626.1"; exon_number "4"; exon_id "ENSE00003613950.2"; protein_id "ENSP00000473401.2"; ccdsid "CCDS72699.1"; +chr1 HAVANA stop_codon 10451593 10451595 . 
+ 0 gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000602296.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-204"; transcript_support_level "3"; tag "overlapping_uORF,upstream_uORF,basic,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000488626.1"; exon_number "4"; exon_id "ENSE00003613950.2"; protein_id "ENSP00000473401.2"; ccdsid "CCDS72699.1"; +chr1 HAVANA UTR 10430804 10431289 . + . gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000602296.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-204"; transcript_support_level "3"; tag "overlapping_uORF,upstream_uORF,basic,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000488626.1"; exon_number "1"; exon_id "ENSE00003725534.2"; protein_id "ENSP00000473401.2"; ccdsid "CCDS72699.1"; +chr1 HAVANA UTR 10451593 10452003 . + . gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000602296.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-204"; transcript_support_level "3"; tag "overlapping_uORF,upstream_uORF,basic,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000488626.1"; exon_number "4"; exon_id "ENSE00003613950.2"; protein_id "ENSP00000473401.2"; ccdsid "CCDS72699.1"; +chr1 HAVANA transcript 10431258 10451659 . + . gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000465026.1"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-202"; transcript_support_level "3"; tag "basic,readthrough_transcript"; havana_transcript "OTTHUMT00000130794.2"; protein_id "ENSP00000489060.1"; +chr1 HAVANA exon 10431258 10431401 . + . gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000465026.1"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-202"; transcript_support_level "3"; tag "basic,readthrough_transcript"; havana_transcript "OTTHUMT00000130794.2"; exon_number "1"; exon_id "ENSE00001928985.1"; protein_id "ENSP00000489060.1"; +chr1 HAVANA CDS 10431290 10431401 . + 0 gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000465026.1"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-202"; transcript_support_level "3"; tag "basic,readthrough_transcript"; havana_transcript "OTTHUMT00000130794.2"; exon_number "1"; exon_id "ENSE00001928985.1"; protein_id "ENSP00000489060.1"; +chr1 HAVANA start_codon 10431290 10431292 . + 0 gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000465026.1"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-202"; transcript_support_level "3"; tag "basic,readthrough_transcript"; havana_transcript "OTTHUMT00000130794.2"; exon_number "1"; exon_id "ENSE00001928985.1"; protein_id "ENSP00000489060.1"; +chr1 HAVANA exon 10440347 10440413 . + . 
gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000465026.1"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-202"; transcript_support_level "3"; tag "basic,readthrough_transcript"; havana_transcript "OTTHUMT00000130794.2"; exon_number "2"; exon_id "ENSE00003589213.1"; protein_id "ENSP00000489060.1"; +chr1 HAVANA CDS 10440347 10440381 . + 2 gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000465026.1"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-202"; transcript_support_level "3"; tag "basic,readthrough_transcript"; havana_transcript "OTTHUMT00000130794.2"; exon_number "2"; exon_id "ENSE00003589213.1"; protein_id "ENSP00000489060.1"; +chr1 HAVANA stop_codon 10440382 10440384 . + 0 gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000465026.1"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-202"; transcript_support_level "3"; tag "basic,readthrough_transcript"; havana_transcript "OTTHUMT00000130794.2"; exon_number "2"; exon_id "ENSE00003589213.1"; protein_id "ENSP00000489060.1"; +chr1 HAVANA exon 10451377 10451659 . + . gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000465026.1"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-202"; transcript_support_level "3"; tag "basic,readthrough_transcript"; havana_transcript "OTTHUMT00000130794.2"; exon_number "3"; exon_id "ENSE00003497845.1"; protein_id "ENSP00000489060.1"; +chr1 HAVANA UTR 10431258 10431289 . + . gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000465026.1"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-202"; transcript_support_level "3"; tag "basic,readthrough_transcript"; havana_transcript "OTTHUMT00000130794.2"; exon_number "1"; exon_id "ENSE00001928985.1"; protein_id "ENSP00000489060.1"; +chr1 HAVANA UTR 10440382 10440413 . + . gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000465026.1"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-202"; transcript_support_level "3"; tag "basic,readthrough_transcript"; havana_transcript "OTTHUMT00000130794.2"; exon_number "2"; exon_id "ENSE00003589213.1"; protein_id "ENSP00000489060.1"; +chr1 HAVANA UTR 10451377 10451659 . + . gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000465026.1"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-202"; transcript_support_level "3"; tag "basic,readthrough_transcript"; havana_transcript "OTTHUMT00000130794.2"; exon_number "3"; exon_id "ENSE00003497845.1"; protein_id "ENSP00000489060.1"; +chr1 HAVANA gene 10430443 10442809 . + . gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "1"; havana_gene "OTTHUMG00000059085.10"; tag "overlapping_locus"; +chr1 HAVANA transcript 10430443 10442809 . + . 
gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "2"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000309048.7"; transcript_type "protein_coding"; transcript_name "CENPS-201"; transcript_support_level "1"; tag "basic,appris_principal_1,CCDS"; havana_transcript "OTTHUMT00000130797.2"; protein_id "ENSP00000308583.2"; ccdsid "CCDS115.1"; +chr1 HAVANA exon 10430443 10430568 . + . gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "2"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000309048.7"; transcript_type "protein_coding"; transcript_name "CENPS-201"; transcript_support_level "1"; tag "basic,appris_principal_1,CCDS"; havana_transcript "OTTHUMT00000130797.2"; exon_number "1"; exon_id "ENSE00001849117.1"; protein_id "ENSP00000308583.2"; ccdsid "CCDS115.1"; +chr1 HAVANA CDS 10430518 10430568 . + 0 gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "2"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000309048.7"; transcript_type "protein_coding"; transcript_name "CENPS-201"; transcript_support_level "1"; tag "basic,appris_principal_1,CCDS"; havana_transcript "OTTHUMT00000130797.2"; exon_number "1"; exon_id "ENSE00001849117.1"; protein_id "ENSP00000308583.2"; ccdsid "CCDS115.1"; +chr1 HAVANA start_codon 10430518 10430520 . + 0 gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "2"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000309048.7"; transcript_type "protein_coding"; transcript_name "CENPS-201"; transcript_support_level "1"; tag "basic,appris_principal_1,CCDS"; havana_transcript "OTTHUMT00000130797.2"; exon_number "1"; exon_id "ENSE00001849117.1"; protein_id "ENSP00000308583.2"; ccdsid "CCDS115.1"; +chr1 HAVANA exon 10433842 10433965 . + . gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "2"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000309048.7"; transcript_type "protein_coding"; transcript_name "CENPS-201"; transcript_support_level "1"; tag "basic,appris_principal_1,CCDS"; havana_transcript "OTTHUMT00000130797.2"; exon_number "2"; exon_id "ENSE00003666910.1"; protein_id "ENSP00000308583.2"; ccdsid "CCDS115.1"; +chr1 HAVANA CDS 10433842 10433965 . + 0 gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "2"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000309048.7"; transcript_type "protein_coding"; transcript_name "CENPS-201"; transcript_support_level "1"; tag "basic,appris_principal_1,CCDS"; havana_transcript "OTTHUMT00000130797.2"; exon_number "2"; exon_id "ENSE00003666910.1"; protein_id "ENSP00000308583.2"; ccdsid "CCDS115.1"; +chr1 HAVANA exon 10434657 10434690 . + . gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "2"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000309048.7"; transcript_type "protein_coding"; transcript_name "CENPS-201"; transcript_support_level "1"; tag "basic,appris_principal_1,CCDS"; havana_transcript "OTTHUMT00000130797.2"; exon_number "3"; exon_id "ENSE00003664823.1"; protein_id "ENSP00000308583.2"; ccdsid "CCDS115.1"; +chr1 HAVANA CDS 10434657 10434690 . 
+ 2 gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "2"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000309048.7"; transcript_type "protein_coding"; transcript_name "CENPS-201"; transcript_support_level "1"; tag "basic,appris_principal_1,CCDS"; havana_transcript "OTTHUMT00000130797.2"; exon_number "3"; exon_id "ENSE00003664823.1"; protein_id "ENSP00000308583.2"; ccdsid "CCDS115.1"; +chr1 HAVANA exon 10440347 10440413 . + . gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "2"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000309048.7"; transcript_type "protein_coding"; transcript_name "CENPS-201"; transcript_support_level "1"; tag "basic,appris_principal_1,CCDS"; havana_transcript "OTTHUMT00000130797.2"; exon_number "4"; exon_id "ENSE00003599799.1"; protein_id "ENSP00000308583.2"; ccdsid "CCDS115.1"; +chr1 HAVANA CDS 10440347 10440413 . + 1 gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "2"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000309048.7"; transcript_type "protein_coding"; transcript_name "CENPS-201"; transcript_support_level "1"; tag "basic,appris_principal_1,CCDS"; havana_transcript "OTTHUMT00000130797.2"; exon_number "4"; exon_id "ENSE00003599799.1"; protein_id "ENSP00000308583.2"; ccdsid "CCDS115.1"; +chr1 HAVANA exon 10442265 10442809 . + . gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "2"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000309048.7"; transcript_type "protein_coding"; transcript_name "CENPS-201"; transcript_support_level "1"; tag "basic,appris_principal_1,CCDS"; havana_transcript "OTTHUMT00000130797.2"; exon_number "5"; exon_id "ENSE00001833855.2"; protein_id "ENSP00000308583.2"; ccdsid "CCDS115.1"; +chr1 HAVANA CDS 10442265 10442402 . + 0 gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "2"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000309048.7"; transcript_type "protein_coding"; transcript_name "CENPS-201"; transcript_support_level "1"; tag "basic,appris_principal_1,CCDS"; havana_transcript "OTTHUMT00000130797.2"; exon_number "5"; exon_id "ENSE00001833855.2"; protein_id "ENSP00000308583.2"; ccdsid "CCDS115.1"; +chr1 HAVANA stop_codon 10442403 10442405 . + 0 gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "2"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000309048.7"; transcript_type "protein_coding"; transcript_name "CENPS-201"; transcript_support_level "1"; tag "basic,appris_principal_1,CCDS"; havana_transcript "OTTHUMT00000130797.2"; exon_number "5"; exon_id "ENSE00001833855.2"; protein_id "ENSP00000308583.2"; ccdsid "CCDS115.1"; +chr1 HAVANA UTR 10430443 10430517 . + . gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "2"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000309048.7"; transcript_type "protein_coding"; transcript_name "CENPS-201"; transcript_support_level "1"; tag "basic,appris_principal_1,CCDS"; havana_transcript "OTTHUMT00000130797.2"; exon_number "1"; exon_id "ENSE00001849117.1"; protein_id "ENSP00000308583.2"; ccdsid "CCDS115.1"; +chr1 HAVANA UTR 10442403 10442809 . + . 
gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "2"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000309048.7"; transcript_type "protein_coding"; transcript_name "CENPS-201"; transcript_support_level "1"; tag "basic,appris_principal_1,CCDS"; havana_transcript "OTTHUMT00000130797.2"; exon_number "5"; exon_id "ENSE00001833855.2"; protein_id "ENSP00000308583.2"; ccdsid "CCDS115.1"; +chr1 HAVANA transcript 10430512 10431130 . + . gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "2"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000602486.1"; transcript_type "processed_transcript"; transcript_name "CENPS-205"; transcript_support_level "3"; havana_transcript "OTTHUMT00000467584.1"; +chr1 HAVANA exon 10430512 10430568 . + . gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "2"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000602486.1"; transcript_type "processed_transcript"; transcript_name "CENPS-205"; transcript_support_level "3"; havana_transcript "OTTHUMT00000467584.1"; exon_number "1"; exon_id "ENSE00003405083.1"; +chr1 HAVANA exon 10430876 10431130 . + . gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "2"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000602486.1"; transcript_type "processed_transcript"; transcript_name "CENPS-205"; transcript_support_level "3"; havana_transcript "OTTHUMT00000467584.1"; exon_number "2"; exon_id "ENSE00003344661.1"; +chr1 HAVANA transcript 10430783 10442334 . + . gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "1"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000477755.1"; transcript_type "nonsense_mediated_decay"; transcript_name "CENPS-204"; transcript_support_level "2"; tag "overlapping_uORF,upstream_uORF,exp_conf"; havana_transcript "OTTHUMT00000130800.6"; protein_id "ENSP00000468629.2"; +chr1 HAVANA exon 10430783 10431401 . + . gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "1"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000477755.1"; transcript_type "nonsense_mediated_decay"; transcript_name "CENPS-204"; transcript_support_level "2"; tag "overlapping_uORF,upstream_uORF,exp_conf"; havana_transcript "OTTHUMT00000130800.6"; exon_number "1"; exon_id "ENSE00001850449.1"; protein_id "ENSP00000468629.2"; +chr1 HAVANA CDS 10431290 10431401 . + 0 gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "1"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000477755.1"; transcript_type "nonsense_mediated_decay"; transcript_name "CENPS-204"; transcript_support_level "2"; tag "overlapping_uORF,upstream_uORF,exp_conf"; havana_transcript "OTTHUMT00000130800.6"; exon_number "1"; exon_id "ENSE00001850449.1"; protein_id "ENSP00000468629.2"; +chr1 HAVANA start_codon 10431290 10431292 . + 0 gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "1"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000477755.1"; transcript_type "nonsense_mediated_decay"; transcript_name "CENPS-204"; transcript_support_level "2"; tag "overlapping_uORF,upstream_uORF,exp_conf"; havana_transcript "OTTHUMT00000130800.6"; exon_number "1"; exon_id "ENSE00001850449.1"; protein_id "ENSP00000468629.2"; +chr1 HAVANA exon 10433842 10433965 . + . 
gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "1"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000477755.1"; transcript_type "nonsense_mediated_decay"; transcript_name "CENPS-204"; transcript_support_level "2"; tag "overlapping_uORF,upstream_uORF,exp_conf"; havana_transcript "OTTHUMT00000130800.6"; exon_number "2"; exon_id "ENSE00003505048.1"; protein_id "ENSP00000468629.2"; +chr1 HAVANA CDS 10433842 10433963 . + 2 gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "1"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000477755.1"; transcript_type "nonsense_mediated_decay"; transcript_name "CENPS-204"; transcript_support_level "2"; tag "overlapping_uORF,upstream_uORF,exp_conf"; havana_transcript "OTTHUMT00000130800.6"; exon_number "2"; exon_id "ENSE00003505048.1"; protein_id "ENSP00000468629.2"; +chr1 HAVANA exon 10434657 10434690 . + . gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "1"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000477755.1"; transcript_type "nonsense_mediated_decay"; transcript_name "CENPS-204"; transcript_support_level "2"; tag "overlapping_uORF,upstream_uORF,exp_conf"; havana_transcript "OTTHUMT00000130800.6"; exon_number "3"; exon_id "ENSE00003290091.2"; protein_id "ENSP00000468629.2"; +chr1 HAVANA stop_codon 10433964 10433965 . + 0 gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "1"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000477755.1"; transcript_type "nonsense_mediated_decay"; transcript_name "CENPS-204"; transcript_support_level "2"; tag "overlapping_uORF,upstream_uORF,exp_conf"; havana_transcript "OTTHUMT00000130800.6"; exon_number "2"; exon_id "ENSE00003290091.2"; protein_id "ENSP00000468629.2"; +chr1 HAVANA stop_codon 10434657 10434657 . + 1 gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "1"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000477755.1"; transcript_type "nonsense_mediated_decay"; transcript_name "CENPS-204"; transcript_support_level "2"; tag "overlapping_uORF,upstream_uORF,exp_conf"; havana_transcript "OTTHUMT00000130800.6"; exon_number "3"; exon_id "ENSE00003290091.2"; protein_id "ENSP00000468629.2"; +chr1 HAVANA exon 10440347 10440413 . + . gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "1"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000477755.1"; transcript_type "nonsense_mediated_decay"; transcript_name "CENPS-204"; transcript_support_level "2"; tag "overlapping_uORF,upstream_uORF,exp_conf"; havana_transcript "OTTHUMT00000130800.6"; exon_number "4"; exon_id "ENSE00003786199.1"; protein_id "ENSP00000468629.2"; +chr1 HAVANA exon 10442265 10442334 . + . gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "1"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000477755.1"; transcript_type "nonsense_mediated_decay"; transcript_name "CENPS-204"; transcript_support_level "2"; tag "overlapping_uORF,upstream_uORF,exp_conf"; havana_transcript "OTTHUMT00000130800.6"; exon_number "5"; exon_id "ENSE00001930747.1"; protein_id "ENSP00000468629.2"; +chr1 HAVANA UTR 10430783 10431289 . + . 
gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "1"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000477755.1"; transcript_type "nonsense_mediated_decay"; transcript_name "CENPS-204"; transcript_support_level "2"; tag "overlapping_uORF,upstream_uORF,exp_conf"; havana_transcript "OTTHUMT00000130800.6"; exon_number "1"; exon_id "ENSE00001850449.1"; protein_id "ENSP00000468629.2"; +chr1 HAVANA UTR 10433964 10433965 . + . gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "1"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000477755.1"; transcript_type "nonsense_mediated_decay"; transcript_name "CENPS-204"; transcript_support_level "2"; tag "overlapping_uORF,upstream_uORF,exp_conf"; havana_transcript "OTTHUMT00000130800.6"; exon_number "2"; exon_id "ENSE00003505048.1"; protein_id "ENSP00000468629.2"; +chr1 HAVANA UTR 10434657 10434690 . + . gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "1"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000477755.1"; transcript_type "nonsense_mediated_decay"; transcript_name "CENPS-204"; transcript_support_level "2"; tag "overlapping_uORF,upstream_uORF,exp_conf"; havana_transcript "OTTHUMT00000130800.6"; exon_number "3"; exon_id "ENSE00003290091.2"; protein_id "ENSP00000468629.2"; +chr1 HAVANA UTR 10440347 10440413 . + . gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "1"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000477755.1"; transcript_type "nonsense_mediated_decay"; transcript_name "CENPS-204"; transcript_support_level "2"; tag "overlapping_uORF,upstream_uORF,exp_conf"; havana_transcript "OTTHUMT00000130800.6"; exon_number "4"; exon_id "ENSE00003786199.1"; protein_id "ENSP00000468629.2"; +chr1 HAVANA UTR 10442265 10442334 . + . gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "1"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000477755.1"; transcript_type "nonsense_mediated_decay"; transcript_name "CENPS-204"; transcript_support_level "2"; tag "overlapping_uORF,upstream_uORF,exp_conf"; havana_transcript "OTTHUMT00000130800.6"; exon_number "5"; exon_id "ENSE00001930747.1"; protein_id "ENSP00000468629.2"; +chr1 HAVANA transcript 10431123 10442502 . + . gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "2"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000462462.1"; transcript_type "protein_coding"; transcript_name "CENPS-202"; transcript_support_level "3"; tag "basic"; havana_transcript "OTTHUMT00000130802.2"; protein_id "ENSP00000489524.1"; +chr1 HAVANA exon 10431123 10431401 . + . gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "2"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000462462.1"; transcript_type "protein_coding"; transcript_name "CENPS-202"; transcript_support_level "3"; tag "basic"; havana_transcript "OTTHUMT00000130802.2"; exon_number "1"; exon_id "ENSE00001858020.1"; protein_id "ENSP00000489524.1"; +chr1 HAVANA CDS 10431290 10431401 . 
+ 0 gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "2"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000462462.1"; transcript_type "protein_coding"; transcript_name "CENPS-202"; transcript_support_level "3"; tag "basic"; havana_transcript "OTTHUMT00000130802.2"; exon_number "1"; exon_id "ENSE00001858020.1"; protein_id "ENSP00000489524.1"; +chr1 HAVANA start_codon 10431290 10431292 . + 0 gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "2"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000462462.1"; transcript_type "protein_coding"; transcript_name "CENPS-202"; transcript_support_level "3"; tag "basic"; havana_transcript "OTTHUMT00000130802.2"; exon_number "1"; exon_id "ENSE00001858020.1"; protein_id "ENSP00000489524.1"; +chr1 HAVANA exon 10440347 10440413 . + . gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "2"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000462462.1"; transcript_type "protein_coding"; transcript_name "CENPS-202"; transcript_support_level "3"; tag "basic"; havana_transcript "OTTHUMT00000130802.2"; exon_number "2"; exon_id "ENSE00003788145.1"; protein_id "ENSP00000489524.1"; +chr1 HAVANA CDS 10440347 10440381 . + 2 gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "2"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000462462.1"; transcript_type "protein_coding"; transcript_name "CENPS-202"; transcript_support_level "3"; tag "basic"; havana_transcript "OTTHUMT00000130802.2"; exon_number "2"; exon_id "ENSE00003788145.1"; protein_id "ENSP00000489524.1"; +chr1 HAVANA stop_codon 10440382 10440384 . + 0 gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "2"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000462462.1"; transcript_type "protein_coding"; transcript_name "CENPS-202"; transcript_support_level "3"; tag "basic"; havana_transcript "OTTHUMT00000130802.2"; exon_number "2"; exon_id "ENSE00003788145.1"; protein_id "ENSP00000489524.1"; +chr1 HAVANA exon 10442265 10442502 . + . gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "2"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000462462.1"; transcript_type "protein_coding"; transcript_name "CENPS-202"; transcript_support_level "3"; tag "basic"; havana_transcript "OTTHUMT00000130802.2"; exon_number "3"; exon_id "ENSE00001872482.1"; protein_id "ENSP00000489524.1"; +chr1 HAVANA UTR 10431123 10431289 . + . gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "2"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000462462.1"; transcript_type "protein_coding"; transcript_name "CENPS-202"; transcript_support_level "3"; tag "basic"; havana_transcript "OTTHUMT00000130802.2"; exon_number "1"; exon_id "ENSE00001858020.1"; protein_id "ENSP00000489524.1"; +chr1 HAVANA UTR 10440382 10440413 . + . gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "2"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000462462.1"; transcript_type "protein_coding"; transcript_name "CENPS-202"; transcript_support_level "3"; tag "basic"; havana_transcript "OTTHUMT00000130802.2"; exon_number "2"; exon_id "ENSE00003788145.1"; protein_id "ENSP00000489524.1"; +chr1 HAVANA UTR 10442265 10442502 . + . 
gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "2"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000462462.1"; transcript_type "protein_coding"; transcript_name "CENPS-202"; transcript_support_level "3"; tag "basic"; havana_transcript "OTTHUMT00000130802.2"; exon_number "3"; exon_id "ENSE00001872482.1"; protein_id "ENSP00000489524.1"; +chr1 HAVANA transcript 10440123 10442657 . + . gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "2"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000464507.1"; transcript_type "retained_intron"; transcript_name "CENPS-203"; transcript_support_level "2"; havana_transcript "OTTHUMT00000130801.1"; +chr1 HAVANA exon 10440123 10440413 . + . gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "2"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000464507.1"; transcript_type "retained_intron"; transcript_name "CENPS-203"; transcript_support_level "2"; havana_transcript "OTTHUMT00000130801.1"; exon_number "1"; exon_id "ENSE00001880184.1"; +chr1 HAVANA exon 10442265 10442657 . + . gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "2"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000464507.1"; transcript_type "retained_intron"; transcript_name "CENPS-203"; transcript_support_level "2"; havana_transcript "OTTHUMT00000130801.1"; exon_number "2"; exon_id "ENSE00001818886.1"; +chr1 HAVANA gene 10449719 10451902 . + . gene_id "ENSG00000241563.3"; gene_type "protein_coding"; gene_name "CORT"; level "2"; havana_gene "OTTHUMG00000001906.4"; tag "overlapping_locus"; +chr1 HAVANA transcript 10449719 10451902 . + . gene_id "ENSG00000241563.3"; gene_type "protein_coding"; gene_name "CORT"; level "2"; havana_gene "OTTHUMG00000001906.4"; transcript_id "ENST00000377049.3"; transcript_type "protein_coding"; transcript_name "CORT-201"; transcript_support_level "1"; tag "upstream_ATG,basic,appris_principal_1,CCDS"; havana_transcript "OTTHUMT00000005410.3"; protein_id "ENSP00000366248.3"; ccdsid "CCDS117.2"; +chr1 HAVANA exon 10449719 10450322 . + . gene_id "ENSG00000241563.3"; gene_type "protein_coding"; gene_name "CORT"; level "2"; havana_gene "OTTHUMG00000001906.4"; transcript_id "ENST00000377049.3"; transcript_type "protein_coding"; transcript_name "CORT-201"; transcript_support_level "1"; tag "upstream_ATG,basic,appris_principal_1,CCDS"; havana_transcript "OTTHUMT00000005410.3"; exon_number "1"; exon_id "ENSE00001472620.3"; protein_id "ENSP00000366248.3"; ccdsid "CCDS117.2"; +chr1 HAVANA CDS 10450224 10450322 . + 0 gene_id "ENSG00000241563.3"; gene_type "protein_coding"; gene_name "CORT"; level "2"; havana_gene "OTTHUMG00000001906.4"; transcript_id "ENST00000377049.3"; transcript_type "protein_coding"; transcript_name "CORT-201"; transcript_support_level "1"; tag "upstream_ATG,basic,appris_principal_1,CCDS"; havana_transcript "OTTHUMT00000005410.3"; exon_number "1"; exon_id "ENSE00001472620.3"; protein_id "ENSP00000366248.3"; ccdsid "CCDS117.2"; +chr1 HAVANA start_codon 10450224 10450226 . 
+ 0 gene_id "ENSG00000241563.3"; gene_type "protein_coding"; gene_name "CORT"; level "2"; havana_gene "OTTHUMG00000001906.4"; transcript_id "ENST00000377049.3"; transcript_type "protein_coding"; transcript_name "CORT-201"; transcript_support_level "1"; tag "upstream_ATG,basic,appris_principal_1,CCDS"; havana_transcript "OTTHUMT00000005410.3"; exon_number "1"; exon_id "ENSE00001472620.3"; protein_id "ENSP00000366248.3"; ccdsid "CCDS117.2"; +chr1 HAVANA exon 10451377 10451902 . + . gene_id "ENSG00000241563.3"; gene_type "protein_coding"; gene_name "CORT"; level "2"; havana_gene "OTTHUMG00000001906.4"; transcript_id "ENST00000377049.3"; transcript_type "protein_coding"; transcript_name "CORT-201"; transcript_support_level "1"; tag "upstream_ATG,basic,appris_principal_1,CCDS"; havana_transcript "OTTHUMT00000005410.3"; exon_number "2"; exon_id "ENSE00001887610.1"; protein_id "ENSP00000366248.3"; ccdsid "CCDS117.2"; +chr1 HAVANA CDS 10451377 10451592 . + 0 gene_id "ENSG00000241563.3"; gene_type "protein_coding"; gene_name "CORT"; level "2"; havana_gene "OTTHUMG00000001906.4"; transcript_id "ENST00000377049.3"; transcript_type "protein_coding"; transcript_name "CORT-201"; transcript_support_level "1"; tag "upstream_ATG,basic,appris_principal_1,CCDS"; havana_transcript "OTTHUMT00000005410.3"; exon_number "2"; exon_id "ENSE00001887610.1"; protein_id "ENSP00000366248.3"; ccdsid "CCDS117.2"; +chr1 HAVANA stop_codon 10451593 10451595 . + 0 gene_id "ENSG00000241563.3"; gene_type "protein_coding"; gene_name "CORT"; level "2"; havana_gene "OTTHUMG00000001906.4"; transcript_id "ENST00000377049.3"; transcript_type "protein_coding"; transcript_name "CORT-201"; transcript_support_level "1"; tag "upstream_ATG,basic,appris_principal_1,CCDS"; havana_transcript "OTTHUMT00000005410.3"; exon_number "2"; exon_id "ENSE00001887610.1"; protein_id "ENSP00000366248.3"; ccdsid "CCDS117.2"; +chr1 HAVANA UTR 10449719 10450223 . + . gene_id "ENSG00000241563.3"; gene_type "protein_coding"; gene_name "CORT"; level "2"; havana_gene "OTTHUMG00000001906.4"; transcript_id "ENST00000377049.3"; transcript_type "protein_coding"; transcript_name "CORT-201"; transcript_support_level "1"; tag "upstream_ATG,basic,appris_principal_1,CCDS"; havana_transcript "OTTHUMT00000005410.3"; exon_number "1"; exon_id "ENSE00001472620.3"; protein_id "ENSP00000366248.3"; ccdsid "CCDS117.2"; +chr1 HAVANA UTR 10451593 10451902 . + . gene_id "ENSG00000241563.3"; gene_type "protein_coding"; gene_name "CORT"; level "2"; havana_gene "OTTHUMG00000001906.4"; transcript_id "ENST00000377049.3"; transcript_type "protein_coding"; transcript_name "CORT-201"; transcript_support_level "1"; tag "upstream_ATG,basic,appris_principal_1,CCDS"; havana_transcript "OTTHUMT00000005410.3"; exon_number "2"; exon_id "ENSE00001887610.1"; protein_id "ENSP00000366248.3"; ccdsid "CCDS117.2"; +chr1 HAVANA gene 23691742 23696835 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; +chr1 HAVANA transcript 23691742 23696417 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000374550.8"; transcript_type "protein_coding"; transcript_name "RPL11-201"; transcript_support_level "1"; tag "NAGNAG_splice_site,basic,appris_alternative_1"; havana_transcript "OTTHUMT00000494315.1"; protein_id "ENSP00000363676.4"; +chr1 HAVANA exon 23691742 23691829 . + . 
gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000374550.8"; transcript_type "protein_coding"; transcript_name "RPL11-201"; transcript_support_level "1"; tag "NAGNAG_splice_site,basic,appris_alternative_1"; havana_transcript "OTTHUMT00000494315.1"; exon_number "1"; exon_id "ENSE00003819571.1"; protein_id "ENSP00000363676.4"; +chr1 HAVANA CDS 23691824 23691829 . + 0 gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000374550.8"; transcript_type "protein_coding"; transcript_name "RPL11-201"; transcript_support_level "1"; tag "NAGNAG_splice_site,basic,appris_alternative_1"; havana_transcript "OTTHUMT00000494315.1"; exon_number "1"; exon_id "ENSE00003819571.1"; protein_id "ENSP00000363676.4"; +chr1 HAVANA start_codon 23691824 23691826 . + 0 gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000374550.8"; transcript_type "protein_coding"; transcript_name "RPL11-201"; transcript_support_level "1"; tag "NAGNAG_splice_site,basic,appris_alternative_1"; havana_transcript "OTTHUMT00000494315.1"; exon_number "1"; exon_id "ENSE00003819571.1"; protein_id "ENSP00000363676.4"; +chr1 HAVANA exon 23692612 23692759 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000374550.8"; transcript_type "protein_coding"; transcript_name "RPL11-201"; transcript_support_level "1"; tag "NAGNAG_splice_site,basic,appris_alternative_1"; havana_transcript "OTTHUMT00000494315.1"; exon_number "2"; exon_id "ENSE00003826575.1"; protein_id "ENSP00000363676.4"; +chr1 HAVANA CDS 23692612 23692759 . + 0 gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000374550.8"; transcript_type "protein_coding"; transcript_name "RPL11-201"; transcript_support_level "1"; tag "NAGNAG_splice_site,basic,appris_alternative_1"; havana_transcript "OTTHUMT00000494315.1"; exon_number "2"; exon_id "ENSE00003826575.1"; protein_id "ENSP00000363676.4"; +chr1 HAVANA exon 23693807 23693913 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000374550.8"; transcript_type "protein_coding"; transcript_name "RPL11-201"; transcript_support_level "1"; tag "NAGNAG_splice_site,basic,appris_alternative_1"; havana_transcript "OTTHUMT00000494315.1"; exon_number "3"; exon_id "ENSE00003529048.1"; protein_id "ENSP00000363676.4"; +chr1 HAVANA CDS 23693807 23693913 . + 2 gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000374550.8"; transcript_type "protein_coding"; transcript_name "RPL11-201"; transcript_support_level "1"; tag "NAGNAG_splice_site,basic,appris_alternative_1"; havana_transcript "OTTHUMT00000494315.1"; exon_number "3"; exon_id "ENSE00003529048.1"; protein_id "ENSP00000363676.4"; +chr1 HAVANA exon 23694660 23694791 . + . 
gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000374550.8"; transcript_type "protein_coding"; transcript_name "RPL11-201"; transcript_support_level "1"; tag "NAGNAG_splice_site,basic,appris_alternative_1"; havana_transcript "OTTHUMT00000494315.1"; exon_number "4"; exon_id "ENSE00003542504.1"; protein_id "ENSP00000363676.4"; +chr1 HAVANA CDS 23694660 23694791 . + 0 gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000374550.8"; transcript_type "protein_coding"; transcript_name "RPL11-201"; transcript_support_level "1"; tag "NAGNAG_splice_site,basic,appris_alternative_1"; havana_transcript "OTTHUMT00000494315.1"; exon_number "4"; exon_id "ENSE00003542504.1"; protein_id "ENSP00000363676.4"; +chr1 HAVANA exon 23695798 23695908 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000374550.8"; transcript_type "protein_coding"; transcript_name "RPL11-201"; transcript_support_level "1"; tag "NAGNAG_splice_site,basic,appris_alternative_1"; havana_transcript "OTTHUMT00000494315.1"; exon_number "5"; exon_id "ENSE00003586241.1"; protein_id "ENSP00000363676.4"; +chr1 HAVANA CDS 23695798 23695908 . + 0 gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000374550.8"; transcript_type "protein_coding"; transcript_name "RPL11-201"; transcript_support_level "1"; tag "NAGNAG_splice_site,basic,appris_alternative_1"; havana_transcript "OTTHUMT00000494315.1"; exon_number "5"; exon_id "ENSE00003586241.1"; protein_id "ENSP00000363676.4"; +chr1 HAVANA exon 23696344 23696417 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000374550.8"; transcript_type "protein_coding"; transcript_name "RPL11-201"; transcript_support_level "1"; tag "NAGNAG_splice_site,basic,appris_alternative_1"; havana_transcript "OTTHUMT00000494315.1"; exon_number "6"; exon_id "ENSE00003823475.1"; protein_id "ENSP00000363676.4"; +chr1 HAVANA CDS 23696344 23696370 . + 0 gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000374550.8"; transcript_type "protein_coding"; transcript_name "RPL11-201"; transcript_support_level "1"; tag "NAGNAG_splice_site,basic,appris_alternative_1"; havana_transcript "OTTHUMT00000494315.1"; exon_number "6"; exon_id "ENSE00003823475.1"; protein_id "ENSP00000363676.4"; +chr1 HAVANA stop_codon 23696371 23696373 . + 0 gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000374550.8"; transcript_type "protein_coding"; transcript_name "RPL11-201"; transcript_support_level "1"; tag "NAGNAG_splice_site,basic,appris_alternative_1"; havana_transcript "OTTHUMT00000494315.1"; exon_number "6"; exon_id "ENSE00003823475.1"; protein_id "ENSP00000363676.4"; +chr1 HAVANA UTR 23691742 23691823 . + . 
gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000374550.8"; transcript_type "protein_coding"; transcript_name "RPL11-201"; transcript_support_level "1"; tag "NAGNAG_splice_site,basic,appris_alternative_1"; havana_transcript "OTTHUMT00000494315.1"; exon_number "1"; exon_id "ENSE00003819571.1"; protein_id "ENSP00000363676.4"; +chr1 HAVANA UTR 23696371 23696417 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000374550.8"; transcript_type "protein_coding"; transcript_name "RPL11-201"; transcript_support_level "1"; tag "NAGNAG_splice_site,basic,appris_alternative_1"; havana_transcript "OTTHUMT00000494315.1"; exon_number "6"; exon_id "ENSE00003823475.1"; protein_id "ENSP00000363676.4"; +chr1 HAVANA transcript 23691806 23696412 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000443624.6"; transcript_type "retained_intron"; transcript_name "RPL11-202"; transcript_support_level "2"; havana_transcript "OTTHUMT00000008171.3"; +chr1 HAVANA exon 23691806 23691829 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000443624.6"; transcript_type "retained_intron"; transcript_name "RPL11-202"; transcript_support_level "2"; havana_transcript "OTTHUMT00000008171.3"; exon_number "1"; exon_id "ENSE00003826268.1"; +chr1 HAVANA exon 23692609 23692759 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000443624.6"; transcript_type "retained_intron"; transcript_name "RPL11-202"; transcript_support_level "2"; havana_transcript "OTTHUMT00000008171.3"; exon_number "2"; exon_id "ENSE00003605024.1"; +chr1 HAVANA exon 23693807 23693913 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000443624.6"; transcript_type "retained_intron"; transcript_name "RPL11-202"; transcript_support_level "2"; havana_transcript "OTTHUMT00000008171.3"; exon_number "3"; exon_id "ENSE00003648500.1"; +chr1 HAVANA exon 23694660 23695908 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000443624.6"; transcript_type "retained_intron"; transcript_name "RPL11-202"; transcript_support_level "2"; havana_transcript "OTTHUMT00000008171.3"; exon_number "4"; exon_id "ENSE00001629265.2"; +chr1 HAVANA exon 23696344 23696412 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000443624.6"; transcript_type "retained_intron"; transcript_name "RPL11-202"; transcript_support_level "2"; havana_transcript "OTTHUMT00000008171.3"; exon_number "5"; exon_id "ENSE00003823259.1"; +chr1 HAVANA transcript 23691806 23696835 . + . 
gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000643754.2"; transcript_type "protein_coding"; transcript_name "RPL11-206"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008168.5"; protein_id "ENSP00000496250.1"; ccdsid "CCDS238.1"; +chr1 HAVANA exon 23691806 23691829 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000643754.2"; transcript_type "protein_coding"; transcript_name "RPL11-206"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008168.5"; exon_number "1"; exon_id "ENSE00001463805.4"; protein_id "ENSP00000496250.1"; ccdsid "CCDS238.1"; +chr1 HAVANA CDS 23691824 23691829 . + 0 gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000643754.2"; transcript_type "protein_coding"; transcript_name "RPL11-206"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008168.5"; exon_number "1"; exon_id "ENSE00001463805.4"; protein_id "ENSP00000496250.1"; ccdsid "CCDS238.1"; +chr1 HAVANA start_codon 23691824 23691826 . + 0 gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000643754.2"; transcript_type "protein_coding"; transcript_name "RPL11-206"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008168.5"; exon_number "1"; exon_id "ENSE00001463805.4"; protein_id "ENSP00000496250.1"; ccdsid "CCDS238.1"; +chr1 HAVANA exon 23692609 23692759 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000643754.2"; transcript_type "protein_coding"; transcript_name "RPL11-206"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008168.5"; exon_number "2"; exon_id "ENSE00003655446.1"; protein_id "ENSP00000496250.1"; ccdsid "CCDS238.1"; +chr1 HAVANA CDS 23692609 23692759 . + 0 gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000643754.2"; transcript_type "protein_coding"; transcript_name "RPL11-206"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008168.5"; exon_number "2"; exon_id "ENSE00003655446.1"; protein_id "ENSP00000496250.1"; ccdsid "CCDS238.1"; +chr1 HAVANA exon 23693807 23693913 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000643754.2"; transcript_type "protein_coding"; transcript_name "RPL11-206"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008168.5"; exon_number "3"; exon_id "ENSE00003529048.1"; protein_id "ENSP00000496250.1"; ccdsid "CCDS238.1"; +chr1 HAVANA CDS 23693807 23693913 . + 2 gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000643754.2"; transcript_type "protein_coding"; transcript_name "RPL11-206"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008168.5"; exon_number "3"; exon_id "ENSE00003529048.1"; protein_id "ENSP00000496250.1"; ccdsid "CCDS238.1"; +chr1 HAVANA exon 23694660 23694791 . + . 
gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000643754.2"; transcript_type "protein_coding"; transcript_name "RPL11-206"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008168.5"; exon_number "4"; exon_id "ENSE00003542504.1"; protein_id "ENSP00000496250.1"; ccdsid "CCDS238.1"; +chr1 HAVANA CDS 23694660 23694791 . + 0 gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000643754.2"; transcript_type "protein_coding"; transcript_name "RPL11-206"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008168.5"; exon_number "4"; exon_id "ENSE00003542504.1"; protein_id "ENSP00000496250.1"; ccdsid "CCDS238.1"; +chr1 HAVANA exon 23695798 23695908 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000643754.2"; transcript_type "protein_coding"; transcript_name "RPL11-206"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008168.5"; exon_number "5"; exon_id "ENSE00003586241.1"; protein_id "ENSP00000496250.1"; ccdsid "CCDS238.1"; +chr1 HAVANA CDS 23695798 23695908 . + 0 gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000643754.2"; transcript_type "protein_coding"; transcript_name "RPL11-206"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008168.5"; exon_number "5"; exon_id "ENSE00003586241.1"; protein_id "ENSP00000496250.1"; ccdsid "CCDS238.1"; +chr1 HAVANA exon 23696344 23696835 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000643754.2"; transcript_type "protein_coding"; transcript_name "RPL11-206"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008168.5"; exon_number "6"; exon_id "ENSE00003827991.1"; protein_id "ENSP00000496250.1"; ccdsid "CCDS238.1"; +chr1 HAVANA CDS 23696344 23696370 . + 0 gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000643754.2"; transcript_type "protein_coding"; transcript_name "RPL11-206"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008168.5"; exon_number "6"; exon_id "ENSE00003827991.1"; protein_id "ENSP00000496250.1"; ccdsid "CCDS238.1"; +chr1 HAVANA stop_codon 23696371 23696373 . + 0 gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000643754.2"; transcript_type "protein_coding"; transcript_name "RPL11-206"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008168.5"; exon_number "6"; exon_id "ENSE00003827991.1"; protein_id "ENSP00000496250.1"; ccdsid "CCDS238.1"; +chr1 HAVANA UTR 23691806 23691823 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000643754.2"; transcript_type "protein_coding"; transcript_name "RPL11-206"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008168.5"; exon_number "1"; exon_id "ENSE00001463805.4"; protein_id "ENSP00000496250.1"; ccdsid "CCDS238.1"; +chr1 HAVANA UTR 23696371 23696835 . + . 
gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000643754.2"; transcript_type "protein_coding"; transcript_name "RPL11-206"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008168.5"; exon_number "6"; exon_id "ENSE00003827991.1"; protein_id "ENSP00000496250.1"; ccdsid "CCDS238.1"; +chr1 HAVANA transcript 23691821 23696425 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000467075.2"; transcript_type "nonsense_mediated_decay"; transcript_name "RPL11-204"; transcript_support_level "3"; tag "RNA_Seq_supported_partial"; havana_transcript "OTTHUMT00000008169.2"; protein_id "ENSP00000493634.1"; +chr1 HAVANA exon 23691821 23692060 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000467075.2"; transcript_type "nonsense_mediated_decay"; transcript_name "RPL11-204"; transcript_support_level "3"; tag "RNA_Seq_supported_partial"; havana_transcript "OTTHUMT00000008169.2"; exon_number "1"; exon_id "ENSE00001914158.2"; protein_id "ENSP00000493634.1"; +chr1 HAVANA CDS 23691824 23691955 . + 0 gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000467075.2"; transcript_type "nonsense_mediated_decay"; transcript_name "RPL11-204"; transcript_support_level "3"; tag "RNA_Seq_supported_partial"; havana_transcript "OTTHUMT00000008169.2"; exon_number "1"; exon_id "ENSE00001914158.2"; protein_id "ENSP00000493634.1"; +chr1 HAVANA start_codon 23691824 23691826 . + 0 gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000467075.2"; transcript_type "nonsense_mediated_decay"; transcript_name "RPL11-204"; transcript_support_level "3"; tag "RNA_Seq_supported_partial"; havana_transcript "OTTHUMT00000008169.2"; exon_number "1"; exon_id "ENSE00001914158.2"; protein_id "ENSP00000493634.1"; +chr1 HAVANA stop_codon 23691956 23691958 . + 0 gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000467075.2"; transcript_type "nonsense_mediated_decay"; transcript_name "RPL11-204"; transcript_support_level "3"; tag "RNA_Seq_supported_partial"; havana_transcript "OTTHUMT00000008169.2"; exon_number "1"; exon_id "ENSE00001914158.2"; protein_id "ENSP00000493634.1"; +chr1 HAVANA exon 23692609 23692759 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000467075.2"; transcript_type "nonsense_mediated_decay"; transcript_name "RPL11-204"; transcript_support_level "3"; tag "RNA_Seq_supported_partial"; havana_transcript "OTTHUMT00000008169.2"; exon_number "2"; exon_id "ENSE00003605024.1"; protein_id "ENSP00000493634.1"; +chr1 HAVANA exon 23693807 23693913 . + . 
gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000467075.2"; transcript_type "nonsense_mediated_decay"; transcript_name "RPL11-204"; transcript_support_level "3"; tag "RNA_Seq_supported_partial"; havana_transcript "OTTHUMT00000008169.2"; exon_number "3"; exon_id "ENSE00003648500.1"; protein_id "ENSP00000493634.1"; +chr1 HAVANA exon 23694660 23694791 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000467075.2"; transcript_type "nonsense_mediated_decay"; transcript_name "RPL11-204"; transcript_support_level "3"; tag "RNA_Seq_supported_partial"; havana_transcript "OTTHUMT00000008169.2"; exon_number "4"; exon_id "ENSE00003606435.1"; protein_id "ENSP00000493634.1"; +chr1 HAVANA exon 23695798 23695908 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000467075.2"; transcript_type "nonsense_mediated_decay"; transcript_name "RPL11-204"; transcript_support_level "3"; tag "RNA_Seq_supported_partial"; havana_transcript "OTTHUMT00000008169.2"; exon_number "5"; exon_id "ENSE00003568710.1"; protein_id "ENSP00000493634.1"; +chr1 HAVANA exon 23696344 23696425 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000467075.2"; transcript_type "nonsense_mediated_decay"; transcript_name "RPL11-204"; transcript_support_level "3"; tag "RNA_Seq_supported_partial"; havana_transcript "OTTHUMT00000008169.2"; exon_number "6"; exon_id "ENSE00001044698.3"; protein_id "ENSP00000493634.1"; +chr1 HAVANA UTR 23691821 23691823 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000467075.2"; transcript_type "nonsense_mediated_decay"; transcript_name "RPL11-204"; transcript_support_level "3"; tag "RNA_Seq_supported_partial"; havana_transcript "OTTHUMT00000008169.2"; exon_number "1"; exon_id "ENSE00001914158.2"; protein_id "ENSP00000493634.1"; +chr1 HAVANA UTR 23691956 23692060 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000467075.2"; transcript_type "nonsense_mediated_decay"; transcript_name "RPL11-204"; transcript_support_level "3"; tag "RNA_Seq_supported_partial"; havana_transcript "OTTHUMT00000008169.2"; exon_number "1"; exon_id "ENSE00001914158.2"; protein_id "ENSP00000493634.1"; +chr1 HAVANA UTR 23692609 23692759 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000467075.2"; transcript_type "nonsense_mediated_decay"; transcript_name "RPL11-204"; transcript_support_level "3"; tag "RNA_Seq_supported_partial"; havana_transcript "OTTHUMT00000008169.2"; exon_number "2"; exon_id "ENSE00003605024.1"; protein_id "ENSP00000493634.1"; +chr1 HAVANA UTR 23693807 23693913 . + . 
gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000467075.2"; transcript_type "nonsense_mediated_decay"; transcript_name "RPL11-204"; transcript_support_level "3"; tag "RNA_Seq_supported_partial"; havana_transcript "OTTHUMT00000008169.2"; exon_number "3"; exon_id "ENSE00003648500.1"; protein_id "ENSP00000493634.1"; +chr1 HAVANA UTR 23694660 23694791 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000467075.2"; transcript_type "nonsense_mediated_decay"; transcript_name "RPL11-204"; transcript_support_level "3"; tag "RNA_Seq_supported_partial"; havana_transcript "OTTHUMT00000008169.2"; exon_number "4"; exon_id "ENSE00003606435.1"; protein_id "ENSP00000493634.1"; +chr1 HAVANA UTR 23695798 23695908 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000467075.2"; transcript_type "nonsense_mediated_decay"; transcript_name "RPL11-204"; transcript_support_level "3"; tag "RNA_Seq_supported_partial"; havana_transcript "OTTHUMT00000008169.2"; exon_number "5"; exon_id "ENSE00003568710.1"; protein_id "ENSP00000493634.1"; +chr1 HAVANA UTR 23696344 23696425 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000467075.2"; transcript_type "nonsense_mediated_decay"; transcript_name "RPL11-204"; transcript_support_level "3"; tag "RNA_Seq_supported_partial"; havana_transcript "OTTHUMT00000008169.2"; exon_number "6"; exon_id "ENSE00001044698.3"; protein_id "ENSP00000493634.1"; +chr1 HAVANA transcript 23692306 23696418 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000458455.2"; transcript_type "protein_coding"; transcript_name "RPL11-203"; transcript_support_level "1"; tag "basic"; havana_transcript "OTTHUMT00000008170.2"; protein_id "ENSP00000398888.2"; +chr1 HAVANA exon 23692306 23692759 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000458455.2"; transcript_type "protein_coding"; transcript_name "RPL11-203"; transcript_support_level "1"; tag "basic"; havana_transcript "OTTHUMT00000008170.2"; exon_number "1"; exon_id "ENSE00001874656.1"; protein_id "ENSP00000398888.2"; +chr1 HAVANA CDS 23692636 23692759 . + 0 gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000458455.2"; transcript_type "protein_coding"; transcript_name "RPL11-203"; transcript_support_level "1"; tag "basic"; havana_transcript "OTTHUMT00000008170.2"; exon_number "1"; exon_id "ENSE00001874656.1"; protein_id "ENSP00000398888.2"; +chr1 HAVANA start_codon 23692636 23692638 . + 0 gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000458455.2"; transcript_type "protein_coding"; transcript_name "RPL11-203"; transcript_support_level "1"; tag "basic"; havana_transcript "OTTHUMT00000008170.2"; exon_number "1"; exon_id "ENSE00001874656.1"; protein_id "ENSP00000398888.2"; +chr1 HAVANA exon 23693807 23693913 . + . 
gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000458455.2"; transcript_type "protein_coding"; transcript_name "RPL11-203"; transcript_support_level "1"; tag "basic"; havana_transcript "OTTHUMT00000008170.2"; exon_number "2"; exon_id "ENSE00003529048.1"; protein_id "ENSP00000398888.2"; +chr1 HAVANA CDS 23693807 23693913 . + 2 gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000458455.2"; transcript_type "protein_coding"; transcript_name "RPL11-203"; transcript_support_level "1"; tag "basic"; havana_transcript "OTTHUMT00000008170.2"; exon_number "2"; exon_id "ENSE00003529048.1"; protein_id "ENSP00000398888.2"; +chr1 HAVANA exon 23694660 23694791 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000458455.2"; transcript_type "protein_coding"; transcript_name "RPL11-203"; transcript_support_level "1"; tag "basic"; havana_transcript "OTTHUMT00000008170.2"; exon_number "3"; exon_id "ENSE00003542504.1"; protein_id "ENSP00000398888.2"; +chr1 HAVANA CDS 23694660 23694791 . + 0 gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000458455.2"; transcript_type "protein_coding"; transcript_name "RPL11-203"; transcript_support_level "1"; tag "basic"; havana_transcript "OTTHUMT00000008170.2"; exon_number "3"; exon_id "ENSE00003542504.1"; protein_id "ENSP00000398888.2"; +chr1 HAVANA exon 23695798 23695908 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000458455.2"; transcript_type "protein_coding"; transcript_name "RPL11-203"; transcript_support_level "1"; tag "basic"; havana_transcript "OTTHUMT00000008170.2"; exon_number "4"; exon_id "ENSE00003586241.1"; protein_id "ENSP00000398888.2"; +chr1 HAVANA CDS 23695798 23695908 . + 0 gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000458455.2"; transcript_type "protein_coding"; transcript_name "RPL11-203"; transcript_support_level "1"; tag "basic"; havana_transcript "OTTHUMT00000008170.2"; exon_number "4"; exon_id "ENSE00003586241.1"; protein_id "ENSP00000398888.2"; +chr1 HAVANA exon 23696344 23696418 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000458455.2"; transcript_type "protein_coding"; transcript_name "RPL11-203"; transcript_support_level "1"; tag "basic"; havana_transcript "OTTHUMT00000008170.2"; exon_number "5"; exon_id "ENSE00001857063.1"; protein_id "ENSP00000398888.2"; +chr1 HAVANA CDS 23696344 23696370 . + 0 gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000458455.2"; transcript_type "protein_coding"; transcript_name "RPL11-203"; transcript_support_level "1"; tag "basic"; havana_transcript "OTTHUMT00000008170.2"; exon_number "5"; exon_id "ENSE00001857063.1"; protein_id "ENSP00000398888.2"; +chr1 HAVANA stop_codon 23696371 23696373 . 
+ 0 gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000458455.2"; transcript_type "protein_coding"; transcript_name "RPL11-203"; transcript_support_level "1"; tag "basic"; havana_transcript "OTTHUMT00000008170.2"; exon_number "5"; exon_id "ENSE00001857063.1"; protein_id "ENSP00000398888.2"; +chr1 HAVANA UTR 23692306 23692635 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000458455.2"; transcript_type "protein_coding"; transcript_name "RPL11-203"; transcript_support_level "1"; tag "basic"; havana_transcript "OTTHUMT00000008170.2"; exon_number "1"; exon_id "ENSE00001874656.1"; protein_id "ENSP00000398888.2"; +chr1 HAVANA UTR 23696371 23696418 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000458455.2"; transcript_type "protein_coding"; transcript_name "RPL11-203"; transcript_support_level "1"; tag "basic"; havana_transcript "OTTHUMT00000008170.2"; exon_number "5"; exon_id "ENSE00001857063.1"; protein_id "ENSP00000398888.2"; +chr1 HAVANA transcript 23692609 23695946 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000482370.2"; transcript_type "retained_intron"; transcript_name "RPL11-205"; transcript_support_level "1"; havana_transcript "OTTHUMT00000008574.2"; +chr1 HAVANA exon 23692609 23692759 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000482370.2"; transcript_type "retained_intron"; transcript_name "RPL11-205"; transcript_support_level "1"; havana_transcript "OTTHUMT00000008574.2"; exon_number "1"; exon_id "ENSE00003605024.1"; +chr1 HAVANA exon 23693807 23693913 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000482370.2"; transcript_type "retained_intron"; transcript_name "RPL11-205"; transcript_support_level "1"; havana_transcript "OTTHUMT00000008574.2"; exon_number "2"; exon_id "ENSE00003648500.1"; +chr1 HAVANA exon 23694660 23694791 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000482370.2"; transcript_type "retained_intron"; transcript_name "RPL11-205"; transcript_support_level "1"; havana_transcript "OTTHUMT00000008574.2"; exon_number "3"; exon_id "ENSE00003606435.1"; +chr1 HAVANA exon 23695798 23695946 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000482370.2"; transcript_type "retained_intron"; transcript_name "RPL11-205"; transcript_support_level "1"; havana_transcript "OTTHUMT00000008574.2"; exon_number "4"; exon_id "ENSE00001619539.2"; +chr1 HAVANA gene 23743155 23762059 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; +chr1 HAVANA transcript 23743155 23762059 . + . 
gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000418390.6"; transcript_type "protein_coding"; transcript_name "ELOA-201"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008230.2"; protein_id "ENSP00000395574.2"; ccdsid "CCDS239.2"; +chr1 HAVANA exon 23743155 23743578 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000418390.6"; transcript_type "protein_coding"; transcript_name "ELOA-201"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008230.2"; exon_number "1"; exon_id "ENSE00001731717.2"; protein_id "ENSP00000395574.2"; ccdsid "CCDS239.2"; +chr1 HAVANA CDS 23743426 23743578 . + 0 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000418390.6"; transcript_type "protein_coding"; transcript_name "ELOA-201"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008230.2"; exon_number "1"; exon_id "ENSE00001731717.2"; protein_id "ENSP00000395574.2"; ccdsid "CCDS239.2"; +chr1 HAVANA start_codon 23743426 23743428 . + 0 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000418390.6"; transcript_type "protein_coding"; transcript_name "ELOA-201"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008230.2"; exon_number "1"; exon_id "ENSE00001731717.2"; protein_id "ENSP00000395574.2"; ccdsid "CCDS239.2"; +chr1 HAVANA exon 23749021 23749077 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000418390.6"; transcript_type "protein_coding"; transcript_name "ELOA-201"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008230.2"; exon_number "2"; exon_id "ENSE00003559637.1"; protein_id "ENSP00000395574.2"; ccdsid "CCDS239.2"; +chr1 HAVANA CDS 23749021 23749077 . + 0 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000418390.6"; transcript_type "protein_coding"; transcript_name "ELOA-201"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008230.2"; exon_number "2"; exon_id "ENSE00003559637.1"; protein_id "ENSP00000395574.2"; ccdsid "CCDS239.2"; +chr1 HAVANA exon 23749842 23749948 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000418390.6"; transcript_type "protein_coding"; transcript_name "ELOA-201"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008230.2"; exon_number "3"; exon_id "ENSE00003565857.1"; protein_id "ENSP00000395574.2"; ccdsid "CCDS239.2"; +chr1 HAVANA CDS 23749842 23749948 . 
+ 0 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000418390.6"; transcript_type "protein_coding"; transcript_name "ELOA-201"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008230.2"; exon_number "3"; exon_id "ENSE00003565857.1"; protein_id "ENSP00000395574.2"; ccdsid "CCDS239.2"; +chr1 HAVANA exon 23750845 23752030 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000418390.6"; transcript_type "protein_coding"; transcript_name "ELOA-201"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008230.2"; exon_number "4"; exon_id "ENSE00000560760.1"; protein_id "ENSP00000395574.2"; ccdsid "CCDS239.2"; +chr1 HAVANA CDS 23750845 23752030 . + 1 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000418390.6"; transcript_type "protein_coding"; transcript_name "ELOA-201"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008230.2"; exon_number "4"; exon_id "ENSE00000560760.1"; protein_id "ENSP00000395574.2"; ccdsid "CCDS239.2"; +chr1 HAVANA exon 23752407 23752518 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000418390.6"; transcript_type "protein_coding"; transcript_name "ELOA-201"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008230.2"; exon_number "5"; exon_id "ENSE00000758354.1"; protein_id "ENSP00000395574.2"; ccdsid "CCDS239.2"; +chr1 HAVANA CDS 23752407 23752518 . + 0 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000418390.6"; transcript_type "protein_coding"; transcript_name "ELOA-201"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008230.2"; exon_number "5"; exon_id "ENSE00000758354.1"; protein_id "ENSP00000395574.2"; ccdsid "CCDS239.2"; +chr1 HAVANA exon 23754100 23754255 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000418390.6"; transcript_type "protein_coding"; transcript_name "ELOA-201"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008230.2"; exon_number "6"; exon_id "ENSE00000388637.1"; protein_id "ENSP00000395574.2"; ccdsid "CCDS239.2"; +chr1 HAVANA CDS 23754100 23754255 . + 2 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000418390.6"; transcript_type "protein_coding"; transcript_name "ELOA-201"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008230.2"; exon_number "6"; exon_id "ENSE00000388637.1"; protein_id "ENSP00000395574.2"; ccdsid "CCDS239.2"; +chr1 HAVANA exon 23754363 23754460 . + . 
gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000418390.6"; transcript_type "protein_coding"; transcript_name "ELOA-201"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008230.2"; exon_number "7"; exon_id "ENSE00000388638.1"; protein_id "ENSP00000395574.2"; ccdsid "CCDS239.2"; +chr1 HAVANA CDS 23754363 23754460 . + 2 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000418390.6"; transcript_type "protein_coding"; transcript_name "ELOA-201"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008230.2"; exon_number "7"; exon_id "ENSE00000388638.1"; protein_id "ENSP00000395574.2"; ccdsid "CCDS239.2"; +chr1 HAVANA exon 23755843 23756023 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000418390.6"; transcript_type "protein_coding"; transcript_name "ELOA-201"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008230.2"; exon_number "8"; exon_id "ENSE00000388639.1"; protein_id "ENSP00000395574.2"; ccdsid "CCDS239.2"; +chr1 HAVANA CDS 23755843 23756023 . + 0 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000418390.6"; transcript_type "protein_coding"; transcript_name "ELOA-201"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008230.2"; exon_number "8"; exon_id "ENSE00000388639.1"; protein_id "ENSP00000395574.2"; ccdsid "CCDS239.2"; +chr1 HAVANA exon 23756274 23756385 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000418390.6"; transcript_type "protein_coding"; transcript_name "ELOA-201"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008230.2"; exon_number "9"; exon_id "ENSE00000758350.1"; protein_id "ENSP00000395574.2"; ccdsid "CCDS239.2"; +chr1 HAVANA CDS 23756274 23756385 . + 2 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000418390.6"; transcript_type "protein_coding"; transcript_name "ELOA-201"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008230.2"; exon_number "9"; exon_id "ENSE00000758350.1"; protein_id "ENSP00000395574.2"; ccdsid "CCDS239.2"; +chr1 HAVANA exon 23756953 23757125 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000418390.6"; transcript_type "protein_coding"; transcript_name "ELOA-201"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008230.2"; exon_number "10"; exon_id "ENSE00000758349.1"; protein_id "ENSP00000395574.2"; ccdsid "CCDS239.2"; +chr1 HAVANA CDS 23756953 23757125 . 
+ 1 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000418390.6"; transcript_type "protein_coding"; transcript_name "ELOA-201"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008230.2"; exon_number "10"; exon_id "ENSE00000758349.1"; protein_id "ENSP00000395574.2"; ccdsid "CCDS239.2"; +chr1 HAVANA exon 23759512 23762059 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000418390.6"; transcript_type "protein_coding"; transcript_name "ELOA-201"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008230.2"; exon_number "11"; exon_id "ENSE00001596651.2"; protein_id "ENSP00000395574.2"; ccdsid "CCDS239.2"; +chr1 HAVANA CDS 23759512 23759570 . + 2 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000418390.6"; transcript_type "protein_coding"; transcript_name "ELOA-201"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008230.2"; exon_number "11"; exon_id "ENSE00001596651.2"; protein_id "ENSP00000395574.2"; ccdsid "CCDS239.2"; +chr1 HAVANA stop_codon 23759571 23759573 . + 0 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000418390.6"; transcript_type "protein_coding"; transcript_name "ELOA-201"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008230.2"; exon_number "11"; exon_id "ENSE00001596651.2"; protein_id "ENSP00000395574.2"; ccdsid "CCDS239.2"; +chr1 HAVANA UTR 23743155 23743425 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000418390.6"; transcript_type "protein_coding"; transcript_name "ELOA-201"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008230.2"; exon_number "1"; exon_id "ENSE00001731717.2"; protein_id "ENSP00000395574.2"; ccdsid "CCDS239.2"; +chr1 HAVANA UTR 23759571 23762059 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000418390.6"; transcript_type "protein_coding"; transcript_name "ELOA-201"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008230.2"; exon_number "11"; exon_id "ENSE00001596651.2"; protein_id "ENSP00000395574.2"; ccdsid "CCDS239.2"; +chr1 ENSEMBL transcript 23743366 23762040 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "3"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000613537.4"; transcript_type "protein_coding"; transcript_name "ELOA-204"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; protein_id "ENSP00000484196.1"; ccdsid "CCDS239.2"; +chr1 ENSEMBL exon 23743366 23743578 . + . 
gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "3"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000613537.4"; transcript_type "protein_coding"; transcript_name "ELOA-204"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; exon_number "1"; exon_id "ENSE00003753896.1"; protein_id "ENSP00000484196.1"; ccdsid "CCDS239.2"; +chr1 ENSEMBL CDS 23743426 23743578 . + 0 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "3"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000613537.4"; transcript_type "protein_coding"; transcript_name "ELOA-204"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; exon_number "1"; exon_id "ENSE00003753896.1"; protein_id "ENSP00000484196.1"; ccdsid "CCDS239.2"; +chr1 ENSEMBL start_codon 23743426 23743428 . + 0 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "3"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000613537.4"; transcript_type "protein_coding"; transcript_name "ELOA-204"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; exon_number "1"; exon_id "ENSE00003753896.1"; protein_id "ENSP00000484196.1"; ccdsid "CCDS239.2"; +chr1 ENSEMBL exon 23749021 23749077 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "3"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000613537.4"; transcript_type "protein_coding"; transcript_name "ELOA-204"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; exon_number "2"; exon_id "ENSE00003559637.1"; protein_id "ENSP00000484196.1"; ccdsid "CCDS239.2"; +chr1 ENSEMBL CDS 23749021 23749077 . + 0 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "3"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000613537.4"; transcript_type "protein_coding"; transcript_name "ELOA-204"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; exon_number "2"; exon_id "ENSE00003559637.1"; protein_id "ENSP00000484196.1"; ccdsid "CCDS239.2"; +chr1 ENSEMBL exon 23749842 23749948 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "3"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000613537.4"; transcript_type "protein_coding"; transcript_name "ELOA-204"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; exon_number "3"; exon_id "ENSE00003565857.1"; protein_id "ENSP00000484196.1"; ccdsid "CCDS239.2"; +chr1 ENSEMBL CDS 23749842 23749948 . + 0 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "3"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000613537.4"; transcript_type "protein_coding"; transcript_name "ELOA-204"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; exon_number "3"; exon_id "ENSE00003565857.1"; protein_id "ENSP00000484196.1"; ccdsid "CCDS239.2"; +chr1 ENSEMBL exon 23750845 23752030 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "3"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000613537.4"; transcript_type "protein_coding"; transcript_name "ELOA-204"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; exon_number "4"; exon_id "ENSE00000560760.1"; protein_id "ENSP00000484196.1"; ccdsid "CCDS239.2"; +chr1 ENSEMBL CDS 23750845 23752030 . 
+ 1 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "3"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000613537.4"; transcript_type "protein_coding"; transcript_name "ELOA-204"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; exon_number "4"; exon_id "ENSE00000560760.1"; protein_id "ENSP00000484196.1"; ccdsid "CCDS239.2"; +chr1 ENSEMBL exon 23752407 23752518 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "3"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000613537.4"; transcript_type "protein_coding"; transcript_name "ELOA-204"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; exon_number "5"; exon_id "ENSE00000758354.1"; protein_id "ENSP00000484196.1"; ccdsid "CCDS239.2"; +chr1 ENSEMBL CDS 23752407 23752518 . + 0 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "3"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000613537.4"; transcript_type "protein_coding"; transcript_name "ELOA-204"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; exon_number "5"; exon_id "ENSE00000758354.1"; protein_id "ENSP00000484196.1"; ccdsid "CCDS239.2"; +chr1 ENSEMBL exon 23754100 23754255 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "3"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000613537.4"; transcript_type "protein_coding"; transcript_name "ELOA-204"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; exon_number "6"; exon_id "ENSE00000388637.1"; protein_id "ENSP00000484196.1"; ccdsid "CCDS239.2"; +chr1 ENSEMBL CDS 23754100 23754255 . + 2 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "3"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000613537.4"; transcript_type "protein_coding"; transcript_name "ELOA-204"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; exon_number "6"; exon_id "ENSE00000388637.1"; protein_id "ENSP00000484196.1"; ccdsid "CCDS239.2"; +chr1 ENSEMBL exon 23754363 23754460 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "3"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000613537.4"; transcript_type "protein_coding"; transcript_name "ELOA-204"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; exon_number "7"; exon_id "ENSE00000388638.1"; protein_id "ENSP00000484196.1"; ccdsid "CCDS239.2"; +chr1 ENSEMBL CDS 23754363 23754460 . + 2 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "3"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000613537.4"; transcript_type "protein_coding"; transcript_name "ELOA-204"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; exon_number "7"; exon_id "ENSE00000388638.1"; protein_id "ENSP00000484196.1"; ccdsid "CCDS239.2"; +chr1 ENSEMBL exon 23755843 23756023 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "3"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000613537.4"; transcript_type "protein_coding"; transcript_name "ELOA-204"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; exon_number "8"; exon_id "ENSE00000388639.1"; protein_id "ENSP00000484196.1"; ccdsid "CCDS239.2"; +chr1 ENSEMBL CDS 23755843 23756023 . 
+ 0 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "3"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000613537.4"; transcript_type "protein_coding"; transcript_name "ELOA-204"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; exon_number "8"; exon_id "ENSE00000388639.1"; protein_id "ENSP00000484196.1"; ccdsid "CCDS239.2"; +chr1 ENSEMBL exon 23756274 23756385 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "3"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000613537.4"; transcript_type "protein_coding"; transcript_name "ELOA-204"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; exon_number "9"; exon_id "ENSE00000758350.1"; protein_id "ENSP00000484196.1"; ccdsid "CCDS239.2"; +chr1 ENSEMBL CDS 23756274 23756385 . + 2 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "3"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000613537.4"; transcript_type "protein_coding"; transcript_name "ELOA-204"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; exon_number "9"; exon_id "ENSE00000758350.1"; protein_id "ENSP00000484196.1"; ccdsid "CCDS239.2"; +chr1 ENSEMBL exon 23756953 23757125 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "3"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000613537.4"; transcript_type "protein_coding"; transcript_name "ELOA-204"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; exon_number "10"; exon_id "ENSE00000758349.1"; protein_id "ENSP00000484196.1"; ccdsid "CCDS239.2"; +chr1 ENSEMBL CDS 23756953 23757125 . + 1 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "3"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000613537.4"; transcript_type "protein_coding"; transcript_name "ELOA-204"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; exon_number "10"; exon_id "ENSE00000758349.1"; protein_id "ENSP00000484196.1"; ccdsid "CCDS239.2"; +chr1 ENSEMBL exon 23759512 23762040 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "3"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000613537.4"; transcript_type "protein_coding"; transcript_name "ELOA-204"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; exon_number "11"; exon_id "ENSE00003748060.1"; protein_id "ENSP00000484196.1"; ccdsid "CCDS239.2"; +chr1 ENSEMBL CDS 23759512 23759570 . + 2 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "3"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000613537.4"; transcript_type "protein_coding"; transcript_name "ELOA-204"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; exon_number "11"; exon_id "ENSE00003748060.1"; protein_id "ENSP00000484196.1"; ccdsid "CCDS239.2"; +chr1 ENSEMBL stop_codon 23759571 23759573 . + 0 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "3"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000613537.4"; transcript_type "protein_coding"; transcript_name "ELOA-204"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; exon_number "11"; exon_id "ENSE00003748060.1"; protein_id "ENSP00000484196.1"; ccdsid "CCDS239.2"; +chr1 ENSEMBL UTR 23743366 23743425 . + . 
gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "3"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000613537.4"; transcript_type "protein_coding"; transcript_name "ELOA-204"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; exon_number "1"; exon_id "ENSE00003753896.1"; protein_id "ENSP00000484196.1"; ccdsid "CCDS239.2"; +chr1 ENSEMBL UTR 23759571 23762040 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "3"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000613537.4"; transcript_type "protein_coding"; transcript_name "ELOA-204"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; exon_number "11"; exon_id "ENSE00003748060.1"; protein_id "ENSP00000484196.1"; ccdsid "CCDS239.2"; +chr1 HAVANA transcript 23743472 23759893 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000609199.1"; transcript_type "protein_coding"; transcript_name "ELOA-203"; transcript_support_level "1"; tag "CAGE_supported_TSS,basic,appris_alternative_2"; havana_transcript "OTTHUMT00000471733.1"; protein_id "ENSP00000476781.1"; +chr1 HAVANA exon 23743472 23743578 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000609199.1"; transcript_type "protein_coding"; transcript_name "ELOA-203"; transcript_support_level "1"; tag "CAGE_supported_TSS,basic,appris_alternative_2"; havana_transcript "OTTHUMT00000471733.1"; exon_number "1"; exon_id "ENSE00003703844.1"; protein_id "ENSP00000476781.1"; +chr1 HAVANA CDS 23743504 23743578 . + 0 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000609199.1"; transcript_type "protein_coding"; transcript_name "ELOA-203"; transcript_support_level "1"; tag "CAGE_supported_TSS,basic,appris_alternative_2"; havana_transcript "OTTHUMT00000471733.1"; exon_number "1"; exon_id "ENSE00003703844.1"; protein_id "ENSP00000476781.1"; +chr1 HAVANA start_codon 23743504 23743506 . + 0 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000609199.1"; transcript_type "protein_coding"; transcript_name "ELOA-203"; transcript_support_level "1"; tag "CAGE_supported_TSS,basic,appris_alternative_2"; havana_transcript "OTTHUMT00000471733.1"; exon_number "1"; exon_id "ENSE00003703844.1"; protein_id "ENSP00000476781.1"; +chr1 HAVANA exon 23749021 23749077 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000609199.1"; transcript_type "protein_coding"; transcript_name "ELOA-203"; transcript_support_level "1"; tag "CAGE_supported_TSS,basic,appris_alternative_2"; havana_transcript "OTTHUMT00000471733.1"; exon_number "2"; exon_id "ENSE00003559637.1"; protein_id "ENSP00000476781.1"; +chr1 HAVANA CDS 23749021 23749077 . 
+ 0 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000609199.1"; transcript_type "protein_coding"; transcript_name "ELOA-203"; transcript_support_level "1"; tag "CAGE_supported_TSS,basic,appris_alternative_2"; havana_transcript "OTTHUMT00000471733.1"; exon_number "2"; exon_id "ENSE00003559637.1"; protein_id "ENSP00000476781.1"; +chr1 HAVANA exon 23749842 23749948 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000609199.1"; transcript_type "protein_coding"; transcript_name "ELOA-203"; transcript_support_level "1"; tag "CAGE_supported_TSS,basic,appris_alternative_2"; havana_transcript "OTTHUMT00000471733.1"; exon_number "3"; exon_id "ENSE00003565857.1"; protein_id "ENSP00000476781.1"; +chr1 HAVANA CDS 23749842 23749948 . + 0 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000609199.1"; transcript_type "protein_coding"; transcript_name "ELOA-203"; transcript_support_level "1"; tag "CAGE_supported_TSS,basic,appris_alternative_2"; havana_transcript "OTTHUMT00000471733.1"; exon_number "3"; exon_id "ENSE00003565857.1"; protein_id "ENSP00000476781.1"; +chr1 HAVANA exon 23750845 23752030 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000609199.1"; transcript_type "protein_coding"; transcript_name "ELOA-203"; transcript_support_level "1"; tag "CAGE_supported_TSS,basic,appris_alternative_2"; havana_transcript "OTTHUMT00000471733.1"; exon_number "4"; exon_id "ENSE00000560760.1"; protein_id "ENSP00000476781.1"; +chr1 HAVANA CDS 23750845 23752030 . + 1 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000609199.1"; transcript_type "protein_coding"; transcript_name "ELOA-203"; transcript_support_level "1"; tag "CAGE_supported_TSS,basic,appris_alternative_2"; havana_transcript "OTTHUMT00000471733.1"; exon_number "4"; exon_id "ENSE00000560760.1"; protein_id "ENSP00000476781.1"; +chr1 HAVANA exon 23752407 23752518 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000609199.1"; transcript_type "protein_coding"; transcript_name "ELOA-203"; transcript_support_level "1"; tag "CAGE_supported_TSS,basic,appris_alternative_2"; havana_transcript "OTTHUMT00000471733.1"; exon_number "5"; exon_id "ENSE00000758354.1"; protein_id "ENSP00000476781.1"; +chr1 HAVANA CDS 23752407 23752518 . + 0 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000609199.1"; transcript_type "protein_coding"; transcript_name "ELOA-203"; transcript_support_level "1"; tag "CAGE_supported_TSS,basic,appris_alternative_2"; havana_transcript "OTTHUMT00000471733.1"; exon_number "5"; exon_id "ENSE00000758354.1"; protein_id "ENSP00000476781.1"; +chr1 HAVANA exon 23754100 23754255 . + . 
gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000609199.1"; transcript_type "protein_coding"; transcript_name "ELOA-203"; transcript_support_level "1"; tag "CAGE_supported_TSS,basic,appris_alternative_2"; havana_transcript "OTTHUMT00000471733.1"; exon_number "6"; exon_id "ENSE00000388637.1"; protein_id "ENSP00000476781.1"; +chr1 HAVANA CDS 23754100 23754255 . + 2 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000609199.1"; transcript_type "protein_coding"; transcript_name "ELOA-203"; transcript_support_level "1"; tag "CAGE_supported_TSS,basic,appris_alternative_2"; havana_transcript "OTTHUMT00000471733.1"; exon_number "6"; exon_id "ENSE00000388637.1"; protein_id "ENSP00000476781.1"; +chr1 HAVANA exon 23754363 23754460 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000609199.1"; transcript_type "protein_coding"; transcript_name "ELOA-203"; transcript_support_level "1"; tag "CAGE_supported_TSS,basic,appris_alternative_2"; havana_transcript "OTTHUMT00000471733.1"; exon_number "7"; exon_id "ENSE00000388638.1"; protein_id "ENSP00000476781.1"; +chr1 HAVANA CDS 23754363 23754460 . + 2 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000609199.1"; transcript_type "protein_coding"; transcript_name "ELOA-203"; transcript_support_level "1"; tag "CAGE_supported_TSS,basic,appris_alternative_2"; havana_transcript "OTTHUMT00000471733.1"; exon_number "7"; exon_id "ENSE00000388638.1"; protein_id "ENSP00000476781.1"; +chr1 HAVANA exon 23755843 23756023 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000609199.1"; transcript_type "protein_coding"; transcript_name "ELOA-203"; transcript_support_level "1"; tag "CAGE_supported_TSS,basic,appris_alternative_2"; havana_transcript "OTTHUMT00000471733.1"; exon_number "8"; exon_id "ENSE00000388639.1"; protein_id "ENSP00000476781.1"; +chr1 HAVANA CDS 23755843 23756023 . + 0 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000609199.1"; transcript_type "protein_coding"; transcript_name "ELOA-203"; transcript_support_level "1"; tag "CAGE_supported_TSS,basic,appris_alternative_2"; havana_transcript "OTTHUMT00000471733.1"; exon_number "8"; exon_id "ENSE00000388639.1"; protein_id "ENSP00000476781.1"; +chr1 HAVANA exon 23756274 23756385 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000609199.1"; transcript_type "protein_coding"; transcript_name "ELOA-203"; transcript_support_level "1"; tag "CAGE_supported_TSS,basic,appris_alternative_2"; havana_transcript "OTTHUMT00000471733.1"; exon_number "9"; exon_id "ENSE00000758350.1"; protein_id "ENSP00000476781.1"; +chr1 HAVANA CDS 23756274 23756385 . 
+ 2 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000609199.1"; transcript_type "protein_coding"; transcript_name "ELOA-203"; transcript_support_level "1"; tag "CAGE_supported_TSS,basic,appris_alternative_2"; havana_transcript "OTTHUMT00000471733.1"; exon_number "9"; exon_id "ENSE00000758350.1"; protein_id "ENSP00000476781.1"; +chr1 HAVANA exon 23756953 23757125 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000609199.1"; transcript_type "protein_coding"; transcript_name "ELOA-203"; transcript_support_level "1"; tag "CAGE_supported_TSS,basic,appris_alternative_2"; havana_transcript "OTTHUMT00000471733.1"; exon_number "10"; exon_id "ENSE00000758349.1"; protein_id "ENSP00000476781.1"; +chr1 HAVANA CDS 23756953 23757125 . + 1 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000609199.1"; transcript_type "protein_coding"; transcript_name "ELOA-203"; transcript_support_level "1"; tag "CAGE_supported_TSS,basic,appris_alternative_2"; havana_transcript "OTTHUMT00000471733.1"; exon_number "10"; exon_id "ENSE00000758349.1"; protein_id "ENSP00000476781.1"; +chr1 HAVANA exon 23759512 23759893 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000609199.1"; transcript_type "protein_coding"; transcript_name "ELOA-203"; transcript_support_level "1"; tag "CAGE_supported_TSS,basic,appris_alternative_2"; havana_transcript "OTTHUMT00000471733.1"; exon_number "11"; exon_id "ENSE00003710448.1"; protein_id "ENSP00000476781.1"; +chr1 HAVANA CDS 23759512 23759570 . + 2 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000609199.1"; transcript_type "protein_coding"; transcript_name "ELOA-203"; transcript_support_level "1"; tag "CAGE_supported_TSS,basic,appris_alternative_2"; havana_transcript "OTTHUMT00000471733.1"; exon_number "11"; exon_id "ENSE00003710448.1"; protein_id "ENSP00000476781.1"; +chr1 HAVANA stop_codon 23759571 23759573 . + 0 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000609199.1"; transcript_type "protein_coding"; transcript_name "ELOA-203"; transcript_support_level "1"; tag "CAGE_supported_TSS,basic,appris_alternative_2"; havana_transcript "OTTHUMT00000471733.1"; exon_number "11"; exon_id "ENSE00003710448.1"; protein_id "ENSP00000476781.1"; +chr1 HAVANA UTR 23743472 23743503 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000609199.1"; transcript_type "protein_coding"; transcript_name "ELOA-203"; transcript_support_level "1"; tag "CAGE_supported_TSS,basic,appris_alternative_2"; havana_transcript "OTTHUMT00000471733.1"; exon_number "1"; exon_id "ENSE00003703844.1"; protein_id "ENSP00000476781.1"; +chr1 HAVANA UTR 23759571 23759893 . + . 
gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000609199.1"; transcript_type "protein_coding"; transcript_name "ELOA-203"; transcript_support_level "1"; tag "CAGE_supported_TSS,basic,appris_alternative_2"; havana_transcript "OTTHUMT00000471733.1"; exon_number "11"; exon_id "ENSE00003710448.1"; protein_id "ENSP00000476781.1"; +chr1 HAVANA transcript 23744034 23750944 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000487554.1"; transcript_type "processed_transcript"; transcript_name "ELOA-202"; transcript_support_level "3"; havana_transcript "OTTHUMT00000008231.1"; +chr1 HAVANA exon 23744034 23744270 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000487554.1"; transcript_type "processed_transcript"; transcript_name "ELOA-202"; transcript_support_level "3"; havana_transcript "OTTHUMT00000008231.1"; exon_number "1"; exon_id "ENSE00001849808.1"; +chr1 HAVANA exon 23749021 23749077 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000487554.1"; transcript_type "processed_transcript"; transcript_name "ELOA-202"; transcript_support_level "3"; havana_transcript "OTTHUMT00000008231.1"; exon_number "2"; exon_id "ENSE00003591316.1"; +chr1 HAVANA exon 23749842 23749948 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000487554.1"; transcript_type "processed_transcript"; transcript_name "ELOA-202"; transcript_support_level "3"; havana_transcript "OTTHUMT00000008231.1"; exon_number "3"; exon_id "ENSE00003523840.1"; +chr1 HAVANA exon 23750845 23750944 . + . 
gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000487554.1"; transcript_type "processed_transcript"; transcript_name "ELOA-202"; transcript_support_level "3"; havana_transcript "OTTHUMT00000008231.1"; exon_number "4"; exon_id "ENSE00001846969.1"; From 7d74daccae52c3e02810b5429712a5901fea3d59 Mon Sep 17 00:00:00 2001 From: fairliereese Date: Thu, 21 Sep 2023 15:42:00 -0700 Subject: [PATCH 07/31] added more readthrough tests --- testing_suite/build_test_databases.py | 43 ++- .../test_assignment_readthrough_examples.py | 273 ++++++++++++++++++ .../test_find_gene_match_on_vertex_basis.py | 20 ++ 3 files changed, 333 insertions(+), 3 deletions(-) create mode 100644 testing_suite/test_assignment_readthrough_examples.py diff --git a/testing_suite/build_test_databases.py b/testing_suite/build_test_databases.py index 5210c4e..4baed17 100644 --- a/testing_suite/build_test_databases.py +++ b/testing_suite/build_test_databases.py @@ -112,7 +112,7 @@ "--3p", "300", "--idprefix", "ENCODEH", "--l", "300", - "--g", "hg38", "--o", + "--g", "hg38", "--o", "scratch/multiexon_read_overlapping_monoexon_transcript/talon"]) except Exception as e: @@ -160,14 +160,14 @@ "--o", "scratch/intergenic_GM12878" ]) except Exception as e: print(e) - sys.exit("TALON run failed on chr11_and_Tcf3") + sys.exit("TALON run failed on chr22") # Actually perform the chr11_and_Tcf3 TALON run try: subprocess.check_output( ["talon", "--f", "input_files/chr11_and_Tcf3/config.csv", - "--db", "scratch/chr11_and_Tcf3.db", + "--db", "scratch/chr11_and_Tcf3.db", "--build", "mm10", "--cov", "0", "--identity", "0", @@ -213,3 +213,40 @@ except Exception as e: print(e) sys.exit("Problem creating mock database for filtering tests") + + +# code to get cenps-cort and rpl11-eloa only gtf +# import pyranges as pr +# df = pr.read_gtf('/Users/fairliereese/Documents/programming/mortazavi_lab/ref/gencode.v29/gencode.v29.annotation.gtf', duplicate_attr=True).df +# gnames = ['CENPS', 'CORT', 'CENPS-CORT', 'RPL11', 'ELOA'] +# df = df.loc[df.gene_name.isin(gnames)] +# df = pr.PyRanges(df) +# df.to_gtf('input_files/readthrough/readthrough.gtf') + +try: + subprocess.check_output( + ["talon_initialize_database", + "--f", "input_files/readthrough/readthrough.gtf", + "--a", "gencode_v29", + "--5p", "500", + "--3p", "300", + "--idprefix", "TALON", + "--l", "0", + "--g", "hg38", "--o", "scratch/readthrough"]) +except Exception as e: + print(e) + sys.exit("Database initialization failed on readthrough annotation") + +# Actually perform the readthrough TALON run +try: + subprocess.check_output( + ["talon", + "--f", "input_files/readthrough/config.csv", + "--db", "scratch/readthrough.db", + "--build", "hg38", + "--cov", "0", + "--identity", "0", + "--o", "scratch/readthrough" ]) +except Exception as e: + print(e) + sys.exit("TALON run failed on readthrough") diff --git a/testing_suite/test_assignment_readthrough_examples.py b/testing_suite/test_assignment_readthrough_examples.py new file mode 100644 index 0000000..879d458 --- /dev/null +++ b/testing_suite/test_assignment_readthrough_examples.py @@ -0,0 +1,273 @@ +import pytest +import sqlite3 +from .helper_fns import fetch_correct_ID + +@pytest.mark.integration + +# All data comes from hl60_1_1 from the ENCODE data + +class TestAssignments(object): + """ The objective here is to make sure that each transcript in the + readthrough example set was assigned the expected identity. 
""" + + def test_FSM_of_annot_rt(self): + """ cenps_cort_fsm is a FSM to the annotated readthrough locus of + CENPS-CORT. Comes from ENCODE hl60 data""" + + conn = sqlite3.connect("scratch/readthrough.db") + conn.row_factory = sqlite3.Row + cursor = conn.cursor() + + dataset = "hl60_1_1" + read_ID = "cenps_cort_fsm" + + # Fetch observed entry from table + query = """SELECT * from observed WHERE dataset = ? AND read_name = ?""" + assignment = cursor.execute(query, [dataset, read_ID]).fetchall()[0] + + correct_gene_ID = fetch_correct_ID("CENPS-CORT", "gene", cursor) + assert assignment['gene_ID'] == correct_gene_ID + + annot_dict = make_annot_dict(cursor, assignment['transcript_ID']) + assert annot_dict["transcript_status"] == "KNOWN" + conn.close() + + def test_ISM_of_annot_rt(self): + """ cenps_cort_ism is an ISM of readthrough locus of CENPS-CORT""" + + conn = sqlite3.connect("scratch/readthrough.db") + conn.row_factory = sqlite3.Row + cursor = conn.cursor() + + dataset = "hl60_1_1" + read_ID = "cenps_cort_ism" + + # Fetch observed entry from table + query = """SELECT * from observed WHERE dataset = ? AND read_name = ?""" + assignment = cursor.execute(query, [dataset, read_ID]).fetchall()[0] + + correct_gene_ID = fetch_correct_ID("CENPS-CORT", "gene", cursor) + assert assignment['gene_ID'] == correct_gene_ID + + # Now make sure that the novel transcript was annotated correctly + annot_dict = make_annot_dict(cursor, assignment['transcript_ID']) + assert annot_dict["ISM_transcript"] == "TRUE" + conn.close() + + def test_NNC_of_annot_rt(self): + """ cenps_cort_nnc shares most sjs with CENPS-CORT redthrough locus """ + + conn = sqlite3.connect("scratch/readthrough.db") + conn.row_factory = sqlite3.Row + cursor = conn.cursor() + + dataset = "hl60_1_1" + read_ID = "cenps_cort_nnc" + + # Fetch observed entry from table + query = """SELECT * from observed WHERE dataset = ? AND read_name = ?""" + assignment = cursor.execute(query, [dataset, read_ID]).fetchall()[0] + correct_gene_ID = fetch_correct_ID("CENPS-CORT", "gene", cursor) + assert assignment['gene_ID'] == correct_gene_ID + + # Now make sure that the novel transcript was annotated correctly + annot_dict = make_annot_dict(cursor, assignment['transcript_ID']) + assert annot_dict["NNC_transcript"] == "TRUE" + assert annot_dict["transcript_status"] == "NOVEL" + conn.close() + + def test_NIC_of_annot_rt(self): + """ cenps_cort_nnc shares all ss, but has a novel sj with CENPS-CORT redthrough locus """ + + conn = sqlite3.connect("scratch/readthrough.db") + conn.row_factory = sqlite3.Row + cursor = conn.cursor() + + dataset = "hl60_1_1" + read_ID = "cenps_cort_nic" + + # Fetch observed entry from table + query = """SELECT * from observed WHERE dataset = ? 
+    def test_NNC_of_annot_rt(self):
+        """ cenps_cort_nnc shares most sjs with the CENPS-CORT readthrough locus """
+
+        conn = sqlite3.connect("scratch/readthrough.db")
+        conn.row_factory = sqlite3.Row
+        cursor = conn.cursor()
+
+        dataset = "hl60_1_1"
+        read_ID = "cenps_cort_nnc"
+
+        # Fetch observed entry from table
+        query = """SELECT * from observed WHERE dataset = ? AND read_name = ?"""
+        assignment = cursor.execute(query, [dataset, read_ID]).fetchall()[0]
+        correct_gene_ID = fetch_correct_ID("CENPS-CORT", "gene", cursor)
+        assert assignment['gene_ID'] == correct_gene_ID
+
+        # Now make sure that the novel transcript was annotated correctly
+        annot_dict = make_annot_dict(cursor, assignment['transcript_ID'])
+        assert annot_dict["NNC_transcript"] == "TRUE"
+        assert annot_dict["transcript_status"] == "NOVEL"
+        conn.close()
+
+    def test_NIC_of_annot_rt(self):
+        """ cenps_cort_nic shares all ss, but has a novel sj, with the
+            CENPS-CORT readthrough locus """
+
+        conn = sqlite3.connect("scratch/readthrough.db")
+        conn.row_factory = sqlite3.Row
+        cursor = conn.cursor()
+
+        dataset = "hl60_1_1"
+        read_ID = "cenps_cort_nic"
+
+        # Fetch observed entry from table
+        query = """SELECT * from observed WHERE dataset = ?
+                       AND read_name = ?"""
+        assignment = cursor.execute(query, [dataset, read_ID]).fetchall()[0]
+        correct_gene_ID = fetch_correct_ID("CENPS-CORT", "gene", cursor)
+        assert assignment['gene_ID'] == correct_gene_ID
+
+        # Now make sure that the novel transcript was annotated correctly
+        annot_dict = make_annot_dict(cursor, assignment['transcript_ID'])
+        assert annot_dict["NIC_transcript"] == "TRUE"
+        assert annot_dict["transcript_status"] == "NOVEL"
+        conn.close()
+
+    def test_FSM_of_novel_rt_1(self):
+        """ eloa_rpl11_fsm_1 is an FSM to 2 different genes, ELOA and RPL11"""
+        conn = sqlite3.connect("scratch/readthrough.db")
+        conn.row_factory = sqlite3.Row
+        cursor = conn.cursor()
+
+        dataset = "hl60_1_1"
+        read_ID = "eloa_rpl11_fsm_1"
+
+        # Fetch observed entry from table
+        query = """SELECT * from observed WHERE dataset = ? AND read_name = ?"""
+        assignment = cursor.execute(query, [dataset, read_ID]).fetchall()[0]
+
+        # we had 5 annotated genes (CENPS, CORT, CENPS-CORT, ELOA, and RPL11)
+        # so new gene should be 6
+        correct_gene_ID = 6
+        assert assignment['gene_ID'] == correct_gene_ID
+
+        annot_dict = make_annot_dict(cursor, assignment['transcript_ID'])
+        assert annot_dict['transcript_status'] == 'NOVEL'
+        assert annot_dict["fusion_transcript"] == "TRUE"
+
+        annot_dict = make_annot_dict_gene(cursor, assignment['gene_ID'])
+        assert annot_dict['gene_status'] == 'NOVEL'
+        assert annot_dict['fusion_novel'] == 'TRUE'
+
+        conn.close()
+
+    def test_FSM_of_novel_rt_2(self):
+        """ eloa_rpl11_fsm_2 is an FSM to 2 different genes, ELOA and RPL11"""
+        conn = sqlite3.connect("scratch/readthrough.db")
+        conn.row_factory = sqlite3.Row
+        cursor = conn.cursor()
+
+        dataset = "hl60_1_1"
+        read_ID = "eloa_rpl11_fsm_2"
+
+        # Fetch observed entry from table
+        query = """SELECT * from observed WHERE dataset = ? AND read_name = ?"""
+        assignment = cursor.execute(query, [dataset, read_ID]).fetchall()[0]
+
+        # we had 5 annotated genes (CENPS, CORT, CENPS-CORT, ELOA, and RPL11)
+        # so new gene should be 6
+        correct_gene_ID = 6
+        assert assignment['gene_ID'] == correct_gene_ID
+
+        annot_dict = make_annot_dict(cursor, assignment['transcript_ID'])
+        assert annot_dict['transcript_status'] == 'NOVEL'
+        assert annot_dict["ISM_transcript"] == "TRUE"
+        assert annot_dict['ISM-suffix_transcript'] == 'TRUE'
+
+        annot_dict = make_annot_dict_gene(cursor, assignment['gene_ID'])
+        assert annot_dict['gene_status'] == 'NOVEL'
+        assert annot_dict['fusion_novel'] == 'TRUE'
+
+        conn.close()
+
+    def test_FSM_of_overlapping_single_gene(self):
+        """ rpl11_fsm is an FSM of an annotated gene that is subsumed by the
+            RPL11-ELOA readthrough locus. However, it should just
+            be annotated to RPL11"""
+
+        conn = sqlite3.connect("scratch/readthrough.db")
+        conn.row_factory = sqlite3.Row
+        cursor = conn.cursor()
+
+        dataset = "hl60_1_1"
+        read_ID = "rpl11_fsm"
+
+        # Fetch observed entry from table
+        query = """SELECT * from observed WHERE dataset = ? AND read_name = ?"""
+        assignment = cursor.execute(query, [dataset, read_ID]).fetchall()[0]
+
+        correct_gene_ID = fetch_correct_ID("RPL11", "gene", cursor)
+        assert assignment['gene_ID'] == correct_gene_ID
+
+        annot_dict = make_annot_dict(cursor, assignment['transcript_ID'])
+        assert annot_dict["transcript_status"] == "KNOWN"
+        conn.close()
+
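    # Each test above follows the same lookup pattern; condensed, assuming
    # the scratch/readthrough.db schema built earlier in this series
    # (hypothetical helper, not part of the test class):
    #
    #     def fetch_assignment(cursor, dataset, read_ID):
    #         query = """SELECT * from observed WHERE dataset = ? AND read_name = ?"""
    #         return cursor.execute(query, [dataset, read_ID]).fetchall()[0]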
+    def test_ISM_of_overlapping_single_gene(self):
+        """ rpl11_ism is an ISM of an annotated gene that is subsumed by the
+            RPL11-ELOA readthrough locus. However, it should just
+            be annotated to RPL11"""
+
+        conn = sqlite3.connect("scratch/readthrough.db")
+        conn.row_factory = sqlite3.Row
+        cursor = conn.cursor()
+
+        dataset = "hl60_1_1"
+        read_ID = "rpl11_ism"
+
+        # Fetch observed entry from table
+        query = """SELECT * from observed WHERE dataset = ? AND read_name = ?"""
+        assignment = cursor.execute(query, [dataset, read_ID]).fetchall()[0]
+
+        correct_gene_ID = fetch_correct_ID("RPL11", "gene", cursor)
+        assert assignment['gene_ID'] == correct_gene_ID
+
+        annot_dict = make_annot_dict(cursor, assignment['transcript_ID'])
+        assert annot_dict['transcript_status'] == 'NOVEL'
+        assert annot_dict["ISM_transcript"] == "TRUE"
+        assert annot_dict['ISM-suffix_transcript'] == 'TRUE'
+        conn.close()
+
+    def test_NNC_of_novel_rt(self):
+        """ eloa_rpl11_nnc shares most sjs with the novel ELOA-RPL11 rt locus """
+
+        conn = sqlite3.connect("scratch/readthrough.db")
+        conn.row_factory = sqlite3.Row
+        cursor = conn.cursor()
+
+        dataset = "hl60_1_1"
+        read_ID = "eloa_rpl11_nnc"
+
+        # Fetch observed entry from table
+        query = """SELECT * from observed WHERE dataset = ? AND read_name = ?"""
+        assignment = cursor.execute(query, [dataset, read_ID]).fetchall()[0]
+        correct_gene_ID = 6
+        assert assignment['gene_ID'] == correct_gene_ID
+
+        # Now make sure that the novel transcript was annotated correctly
+        annot_dict = make_annot_dict(cursor, assignment['transcript_ID'])
+        assert annot_dict["NNC_transcript"] == "TRUE"
+        assert annot_dict["transcript_status"] == "NOVEL"
+        conn.close()
+
+    def test_NIC_of_novel_rt(self):
+        """ eloa_rpl11_nic shares all sjs but one new sj w/ the novel ELOA-RPL11 rt locus """
+
+        conn = sqlite3.connect("scratch/readthrough.db")
+        conn.row_factory = sqlite3.Row
+        cursor = conn.cursor()
+
+        dataset = "hl60_1_1"
+        read_ID = "eloa_rpl11_nic"
+
+        # Fetch observed entry from table
+        query = """SELECT * from observed WHERE dataset = ?
+                       AND read_name = ?"""
+        assignment = cursor.execute(query, [dataset, read_ID]).fetchall()[0]
+        correct_gene_ID = 6
+        assert assignment['gene_ID'] == correct_gene_ID
+
+        # Now make sure that the novel transcript was annotated correctly
+        annot_dict = make_annot_dict(cursor, assignment['transcript_ID'])
+        assert annot_dict["NIC_transcript"] == "TRUE"
+        assert annot_dict["transcript_status"] == "NOVEL"
+        conn.close()
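# The module-level helpers below mirror make_annot_dict_gene / make_annot_dict
# in test_assignment_chr11_and_Tcf3_examples.py; each test module carries its
# own copy.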
+
+
+def make_annot_dict_gene(cursor, gene_ID):
+    """ Extracts all gene annotations for the given gene ID and puts
+        them in a dict """
+    query = """SELECT * from gene_annotations WHERE ID = ?"""
+    annotations = cursor.execute(query, [gene_ID]).fetchall()
+    annot_dict = {}
+    for annot in annotations:
+        attribute = annot["attribute"]
+        value = annot["value"]
+        annot_dict[attribute] = value
+    return annot_dict
+
+def make_annot_dict(cursor, transcript_ID):
+    """ Extracts all transcript annotations for the transcript ID and puts
+        them in a dict """
+    query = """SELECT * from transcript_annotations WHERE ID = ?"""
+    annotations = cursor.execute(query, [transcript_ID]).fetchall()
+    annot_dict = {}
+    for annot in annotations:
+        attribute = annot["attribute"]
+        value = annot["value"]
+        annot_dict[attribute] = value
+    return annot_dict
diff --git a/testing_suite/test_find_gene_match_on_vertex_basis.py b/testing_suite/test_find_gene_match_on_vertex_basis.py
index b7c625e..d095fa8 100644
--- a/testing_suite/test_find_gene_match_on_vertex_basis.py
+++ b/testing_suite/test_find_gene_match_on_vertex_basis.py
@@ -26,6 +26,26 @@ def test_perfect_match(self):
         assert fusion == False
         conn.close()
 
+    def test_fusion_match(self):
+        """ Example where the vertices overlap multiple genes.
+        """
+        conn, cursor = get_db_cursor()
+        db = "scratch/toy.db"
+        build = "toy_build"
+        init_refs.make_temp_novel_gene_table(cursor, "toy_build")
+        run_info = talon.init_run_info(db, build)
+        vertex2gene = init_refs.make_vertex_2_gene_dict(cursor)
+
+        vertex_IDs = (1, 2, 3, 4, 5, 9, 10, 11)
+        strand = "+"
+
+        gene_ID, fusion = talon.find_gene_match_on_vertex_basis(vertex_IDs, strand, vertex2gene)
+
+        correct_gene_ID = None
+        assert gene_ID == correct_gene_ID
+        assert fusion == True
+        conn.close()
+
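    # Why (1, 2, 3, 4, 5, 9, 10, 11) yields a fusion call: the vertex set
    # spans two different toy genes, and no single gene accounts for splice
    # sites on both sides of the junction chain. A toy version of the mapping
    # the function consumes (sets of (gene_ID, strand) tuples per vertex;
    # illustrative values only, not the real toy_build contents):
    #
    #     toy_v2g = {v: {(1, "+")} for v in (1, 2, 3, 4, 5)}
    #     toy_v2g.update({v: {(2, "+")} for v in (9, 10, 11)})
    #
    # Each splice site then matches at most one gene while two genes are hit
    # overall, so find_gene_match_on_vertex_basis returns (None, True).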
""" From 91dce6f3a4b549b5e345ebc40667294d24bd6fa4 Mon Sep 17 00:00:00 2001 From: fairliereese Date: Fri, 22 Sep 2023 10:31:00 -0700 Subject: [PATCH 08/31] added more tests for readthrough calls --- src/talon/talon.py | 4 - .../hl60_1_1_subset_remapped_sorted.bam | Bin 6521 -> 7230 bytes ...test_assignment_chr11_and_Tcf3_examples.py | 18 ++--- .../test_assignment_readthrough_examples.py | 73 ++++++++++++++++++ 4 files changed, 82 insertions(+), 13 deletions(-) diff --git a/src/talon/talon.py b/src/talon/talon.py index 8c7809d..8601ba1 100644 --- a/src/talon/talon.py +++ b/src/talon/talon.py @@ -1398,10 +1398,6 @@ def identify_transcript(chrom, positions, strand, cursor, location_dict, edge_di cursor, tmp_gene, fusion) - print(gene_ID) - print(gene_novelty) - print(transcript_novelty) - # Add all novel vertices to vertex_2_gene now that we have the gene ID vertex_IDs = start_end_info["vertex_IDs"] edge_IDs = start_end_info["edge_IDs"] diff --git a/testing_suite/input_files/readthrough/hl60_1_1_subset_remapped_sorted.bam b/testing_suite/input_files/readthrough/hl60_1_1_subset_remapped_sorted.bam index f33a4b84b7f0927fc5be26c7b840261a1f010e43..b8e0088bef55f87cfceacf8c452023c9d597a506 100644 GIT binary patch delta 4142 zcmV+}5Yg}XGQK#l9t?jI5ba&v%jd^cpJ#Xbl3nBWi~|{L(VdgQ4urJ(>}yh^Wab%g ztfT}t2v%Ve*Q5kD+mI*Vg=GpxA_kEd}Gv|EH`J9>W_xX(vn2mq@lQ8}^1DI98xg^LqkU9PTFXQRu-g_37V zohjs+$l;4NZ0~=m)h1&K&T+GN$$@ef9CHC@vX@xG9Mz?qCg;#vCXZIg*(BuKER=&p zqHII(-CCh6LS;1^7am_q>moWifKWYyC{}KGW{H=Ep zerpH1IQ?q@@dDx{OWwNzaQ+~``BMtcUpfrV{{e9RueX5nPjA6__u}r)UA%O51JHkX z;#PnL_?oXDyFEZZeQlEuzxcg(BM)8xx&U;^`$~Vd+2o%b17KY`3|QX=V2J|OH2~Iy z7v8*?O(q0e zozSkr6Yf)!tCHqyMF_ZMy-Z3aQ0Z;rT%XZM>KL=PgI!A@v7>-HNW+jv_OXKjcwEta~YCt5oCF} z#*vvv>j}Z2<``S4HZ)|60jD*d&LeOD7gH)6nUaW)edL6$IAnoyqKiOeABQKTg2PT+ z+*~TPcn&j8p2LSyVg+{BqUfJ;woO<=L;=1cvTy|cW89r`wN+yaMd(6Y(ne}-NN9f% z2dMRsbigt?qd!R$f)URG@kCngG)h_fzK?q=t}I!yozkWDMUezcW zP9RElL`e;KH)}V$yBE#bBSPfojt-GGu9?R{lU}s%a){g%(UGjzIiooI%i;>(bC_TQ zIdbd>fwj~bD+vpZq^3bG!TuN1hAgyO4ol#g!q}f z)$Zf8lB*t`6?bu7j}<~>3$s-A6S;%f3H>_+K?Hv=EA$2dYT}O87sNYa3$hf@H;-Tl zt(Snw0Fy%K21~OZ0Wl!NAq>wl9Aaw1Fp&mWhpI(1pHk}OcIf-tKo1IFDnFgo1Z z4IU*Xjhi?;lXcBG*4ud19khS!GNqYaT@|S%OP0LTvW=NfY0UhK!!h%5jhP=mGG?B; zWL`gh5Hnx-j4pKijr$-zf9&=P9Z&B+WByuKp^sSHxZ^MkmV)TISybpwsZT*%Mz}~g zt5rlG$voJV&_D;lI=#z?Hic`hg6`=mhC@_mMq?iF9cQ680;=b-1<`**cM!h@ZW^c~ zlm!k#AX_mw={|$bY4OcB-(6T7X~}J6TM4>btNq{FIaKX`4et7HJ8;)e!(Ff6_1&A* z{tIV!H`V^%wg(=2dOW>D4_!4M!{$|&J^1r%k9*0Z6an_h; zk@W6oW4;DI|H>zfdH8?B#$5TCyUk00fxr6X!S}}>Kob0EB*FLKPDqOX{@0KkEhOaSHACTiBIm#A} z6Lo>Fz|F`G8H2(QnYB7NDLg45orDwNf+k3yQwX&n{Y9S=x>^)903~BqN3|t#O)EJ*H{`!W$dFUWJ#9)FZA$eFC$~QORGWu?i)AWa znoqW}JI@_UbE!a8Gj$GKw9M+;y#*hK2NYjPqa5{Gp%7=gYDO&KPwwZ051LcStPzo-$ z%1~Rp}8?)8#aYPOcoE9JQT=WH&ITF z7A|4IazZQ?rs~mj(iJ72Q_^rKjc~kLUGd2@VU<+~O$UvTRAPaDh@Tji@--NF0uux= zg5xm46@xNzfxA@3#ZmnOnjdx ztYG@#NR^<3A$8Ne4wnh7D!OERt*H(aQyqU$XNo8YN=wi`@{$I?Xa!B6%TST9#SqjB z3eFE9O=M2noQxHAwY`a4s-Z6(_nB?@UNCzN#@(H54o4@lnhsB3=cUiy<>5OWK$k38 zvSi7UB}j2F{d0TYIM^Nk;B(jae&Uh(*>isv z9(Z8y;_e0Wd-Ol`r_&EzH9wBeKXv>1lkP^+>eI`=bw{dB$$4vKIL}i+QmX@NCEX&^ zz{f&pBUezdP4&bl&{J1O2g3##?Bb>}bW3UAdMmo9SLdvGfRhE#T=26#$b-;>{7roB z5j4xr`GfB!y?g^|{S5tLUC|I$JoMPQY&D_AO6qF2HSLbxf5 z1*N1zHy%ajYzl!|OCh!(pyarR(~3%KA!H98P6;G_IE(|NGi5Mz_HEGM#L$0QXE#!S zj+$-J2GCQlOV+vCfR;hzX8RId!(IzA}!QKvmgAhl!nVO%)JPenbkD+HRg!rJ( zdTc7N&ZC>U`ZK%?8=ZfhUAu(o%1)hkV78swWGWR})6EqQ6JC?e&2|fw_5n>53ybnp zFt`#e73jGFD+Z}`F>HWG%W*{A&!L4uOrzD_>MlUI1JR&7a2h76Z_HOKev(FH*P9Pz zcFQ=3xh_Blp9L+cA{^``2|l$taEX-(4~3Va4IF4X>9cUq3xa>swKN!A}5=yXSw+^KU6Ko!%CizVSBuer{ab 
zl$c(?DK9_#LtA31;(80gu1eFYG_6Y0sx+-i)2cKrS+Zowk|j%)ELpPTwz91|{jt`a z?mt|2`b(`leNyXAfBL_kH#@(nJKeqgRV;g7czWmLL;rG*{%XR<4!xo5`}>;AS9gqg z_=uMr{quj%8uPFC_xX<=yu9oJu3|7yFjDqGqS8RnTvFB#-c_>&r5iEO=Gh19Q&TRK zn^J8nwrrl(sWSCJkYQa784O6~1lPYqkDOIWwYd`N94?IFEJ$_Kr>u&;P7^I!>quWa ztxvLG#QKDeLno9==7G+#=r39E1FSiDCVjxHD^Y(PUhcFBEqVXR_N9fF^(AO;9)1bh z9|09k>90Kb&s*PFc=6I@^AE=k0O79RIRFHBxB24EQGlRtD>NT?dk}c{YPqezc`Lg% zL-7Ed;HVUrLrpjnEs{QL8Hzr^SOGy=u?^@18pZv?NF!B!@8IM(M<0Q-a!cgq40FAZ s=e;5$Tf3e+u+=tuGye@@b|GtocavKg9*g$!WdHyG delta 3427 zcmV-p4V?16IQcTL9t?j@4eeaZk7q>{zC8mdGa=)tR6<3F;S?3EgaFg?WNo960&Zp8bE=KE8KZ-rNyAcKY74 z=eACre(<4(Pu;im!1Ch9m*t7yF3YPQTb9GuEz31~mgUjA*Or5qEXzw<%kl|aW9x=J z%d@mBF$AAWC`3J&SbWZgefF{U;-i-^#B-@3`)tJ(n~KfFnqX75oYRQ0dE!(eS?s9Y zv=(cr(ubgp9oc_LVoFJH%raVfCn59b#l{qePh4vvhhE5B$KE(Qmoe;sUyh)YY<*viv!j*Nfs;*yLcA4Upe289#iG3r#a8Uhg$tFy-h0ka zD0EvACzrk8K%a?_*`;KM%|4-oOiPzpo_$>|SV)fSq;t9l4 zrhIS(;Cvat`DO*@CwGJM-vQ45dUIUI-`LBref5RG>VA;-Uu!0L9)L%lb12?-XH!CZ;6@P)Pm#z1Re>m$9q<*cFdFd@ z2h4~a!!dwcT$HJiiR@Vja2VK1a{wPN*zc-qR`GAjl=I6@#9pBh``X&BhdR>U4TxW06Z>J%Q++?TWMKdM4Uf&!?N6h#AEYwS1gYM27Yzp_Wr#$ArZM1iO7|>5*Xs| zzXqm=^IpQSYw&aWD~Nw|_}qc*<6GA*%kTCr%e7A~%j0fY0(K*;dWc9mu=@7S-C=*F zE{-i29D>duaDpz<8DU1IOYg*8WE3p%k`PHTg@7Q!30085MCOvM@v+v?nQ}{wBXW)& zN8uh4pa+;SvUI?Uu_M@$6*T}QeH2Tjf_2thQgx{|WGuu8Xig1Df*D9Qq&Q?e#<<{O zsSfci#uF%MbbjX_AK+1n(Fe%_dbd4(zuMf?&1Z) z%E9^8knkXTE1HNIAR5IOQtl}M>kOc>cT>6p&>?lhP7rYEifY|J1*Q+6R!n~t;N}`Y zPzI3C!(gKtu7F5R(n3$(G-w+=V=z!u1~(tw74seFSRslblYJ1(lu}K=_2S_NH7R@&)(9BUL)92HRurH!l+MrtVV%Jq z6MR&qR;)K@545%x$tT8y1q6Rt-oOF?DlMA}?@UnrS8tXIz7aA)kHY|ON z1Y`Mbqk{syFRN8fofaWs23mz|;HnN?#|8&=DMnomr4o)8(=Fye173ewgwQqX3Q0L2 z%g*AEu*0dWI(bA2Kd{DC_#6`GfKxKUrFfhiG(MpDuID(U#vL7#uGc|h6_zThIV{-4 z4a_MZp-!%|MXFw_>u_7Fb>ds^(1c3@9>Ka=B9K!x14Bh1CpZzD*@~|Co?DEy54Zus z41OpLc5s}fA^fq(J41h6*G$A#0zv6CKqN5PQQ0R11*xi%53~y2AQ@gP4TJ|qE%2TM zQ-$1uZ4r7&m=@xn86MTFXE;=1WD!Yn#1(-x@N(--3}UL+(883dB1pk26luUy;23}^ z{8K~KZTE#!xja1dY8#9Q0)&GRw)Vw0@GXx{pop41jD;Cg{I)h2+WT_Dg-5TDXZZ%gbdfB=#sEoQynO#SfI`jQ4o|Gp?~Bh z6@XDQy1}92eO)m zNbn$kZL}u0TILrGG)q?DgULsRWIJ3HaFeAcUPY_f3bIKee=5gTZcAY z*}lVd;-NDepSrEwb?C@VH*FkUKeBv}{#}1MaPrjhNo;@m!u2P;y`|NsC%$!2s!q{5 zW29T>hk&FO3)V_{6RU!cG1s07sMxA{;v(p&ExmzZgA9K*u|pZUq1bW0kwkHnF-q#- zWDJ@Me%3lU5PFcmf$bJRv&@LD<~9;IlM=odz44&T!`PO1LGA8V&cPtm8u_DboNew!z0xR=2&97K%WQF{Km@TQU($ z+M5EF3YLH9l(2#juF7Jpl(g%!6~S5)nW@&up|XII;~G{gDz!2v2OdrlB)(hB1Ee!1 zFm&%~r|zSN)|$6HGIUf;4TeEaosUYb4Wnle`N3es1l=UiaiisesNCI}4&_@gVUrk) zRY0npwke|~Ew7MrR_epEGGGc|*G1h+~mbI$Y4H|o>nsoRkXg|K&0*;-j zpn=c6L#d%@F(GN0Diysk8dv2G8R9$I+Z=S7Hd-a*uBCsPrX^Sx9GnRAtp%Rjy)#`9AmjtWOx@4G z4&BD;0=J}v5XXAIc2$A34#U*DKix?;!J5%@NHABj8oV9Ww(65H7icX7k3SjPCRNi6 zRN6W;l?^<~QO4wQFqEO^0<7qy-sRx~zSk6b(EI6H7=#&(j+V1@gac8h)X@xvYAb*D z)rz0Q9=vVUaURWh9E7bi(7`1^PpSwDe~E&vS~^ZKG7zDNQuKiXRgFHfXFY+f-b;hS zA*4Bt=!1drBjRghz5gZ20Z?>!PD`p*6kUq$?oEADQjHP^V#hc75S{==b5V?1CSCp2E)OfaZU5K=V1E z`5e%E4(P-F9MF8mdCHV2Q>ILrGG)q?53KAInV#1oQ#kK)Ki8i3xu0VvHjb_zULJl& zk?Furk?EE9c<$%y=~ap85j1(?mJe--sfcZr+007QtTfF^)2uYjO4F<~O_?%f%9JTn zrc9YK<$|(Pclxo`ovzKLpS{ZOLoV9n8a(ihB{64mbHV!P0kkDdS) z7Qn)B{RrC2yFY^VBcQ^5{cPbsfC{S!-}!9evE$1h_G|;f<=@!`1Vs1p-L>-pf_|-V zx%9n3;Qg!R){NjS-qu&a*#U6aQqDGUiD-!)Nnf_)tZy(DK#*2!8G}HlxIXvTW6|#p z4j)Euok=UVM0N}w^MgDe9NF>fu?D}6yZ!o`@awPZ$NgS?w_hJVacKD$!gYiqNMn;A F93I*WlT82s diff --git a/testing_suite/test_assignment_chr11_and_Tcf3_examples.py b/testing_suite/test_assignment_chr11_and_Tcf3_examples.py index 75e4392..bd8c2bd 100644 --- a/testing_suite/test_assignment_chr11_and_Tcf3_examples.py +++ b/testing_suite/test_assignment_chr11_and_Tcf3_examples.py @@ -4,7 +4,7 @@ 
diff --git a/testing_suite/test_assignment_chr11_and_Tcf3_examples.py b/testing_suite/test_assignment_chr11_and_Tcf3_examples.py
index 75e4392..bd8c2bd 100644
--- a/testing_suite/test_assignment_chr11_and_Tcf3_examples.py
+++ b/testing_suite/test_assignment_chr11_and_Tcf3_examples.py
@@ -4,7 +4,7 @@
 @pytest.mark.integration
 class TestAssignments(object):
-    """ The objective here is to make sure that each transcript in the
+    """ The objective here is to make sure that each transcript in the
         chr11_and_Tcf3 example set was assigned the expected identity. """
 
     def test_ISM_of_Canx(self):
@@ -37,7 +37,7 @@ def test_ISM_of_Canx(self):
         conn.close()
 
     def test_prefix_ISM_of_Canx(self):
-        """ m54284_180814_002203/18677911/ccs is an ISM transcript of Canx at
+        """ m54284_180814_002203/18677911/ccs is an ISM transcript of Canx at
            first glance that has known 5' and 3' ends. Comes from BC017 data.
        """
         conn = sqlite3.connect("scratch/chr11_and_Tcf3.db")
@@ -55,7 +55,7 @@ def test_prefix_ISM_of_Canx(self):
         assert assignment['gene_ID'] == correct_gene_ID
         assert assignment['transcript_ID'] == 8462
         assert assignment['start_delta'] == 30
-        assert assignment['end_delta'] == -290
+        assert assignment['end_delta'] == -290
 
         # Now make sure that the novel transcript was annotated correctly
         annot_dict = make_annot_dict(cursor, assignment['transcript_ID'])
@@ -117,7 +117,7 @@ def test_suffix_ISM_of_Tcf3(self):
         conn.close()
 
     def test_NIC_of_Drg1(self):
-        """ For this example, the same read was planted in two different
+        """ For this example, the same read was planted in two different
            datasets (m54284_180814_002203/49414590/ccs) """
 
         conn = sqlite3.connect("scratch/chr11_and_Tcf3.db")
@@ -129,7 +129,7 @@ def test_NIC_of_Drg1(self):
         read_ID = "m54284_180814_002203/49414590/ccs"
 
         # Fetch observed entry from table
-        query = """SELECT * from observed WHERE dataset IN
+        query = """SELECT * from observed WHERE dataset IN
                        ('PB65_B017', 'PB65_B018') AND read_name = ?"""
         cursor.execute(query, [read_ID])
         correct_gene_ID = fetch_correct_ID("Drg1", "gene", cursor)
@@ -148,14 +148,14 @@ def test_NIC_of_Drg1(self):
 
     def test_FSM_of_Drg1(self):
         """ Read m54284_180814_002203/40042763/ccs is an FSM of the Drg1 gene
            (BC017) """
-
+
         conn = sqlite3.connect("scratch/chr11_and_Tcf3.db")
         conn.row_factory = sqlite3.Row
         cursor = conn.cursor()
 
         dataset = "PB65_B017"
         read_ID = "m54284_180814_002203/40042763/ccs"
-
+
         # Fetch observed entry from table
         query = """SELECT * from observed WHERE dataset = ? AND read_name = ?"""
         assignment = cursor.execute(query, [dataset, read_ID]).fetchall()[0]
@@ -199,8 +199,8 @@ def antisense_to_Grb10(self):
         assert annot_dict["antisense_transcript"] == "TRUE"
         assert annot_dict["transcript_status"] == "NOVEL"
         conn.close()
-
-
+
+
 def make_annot_dict_gene(cursor, gene_ID):
     """ Extracts all gene annotations for the transcript ID and puts
diff --git a/testing_suite/test_assignment_readthrough_examples.py b/testing_suite/test_assignment_readthrough_examples.py
index 879d458..d1fd28c 100644
--- a/testing_suite/test_assignment_readthrough_examples.py
+++ b/testing_suite/test_assignment_readthrough_examples.py
@@ -247,6 +247,73 @@ def test_NIC_of_novel_rt(self):
         assert annot_dict["transcript_status"] == "NOVEL"
         conn.close()
 
+    def test_FSM_of_overlapping_single_gene_cenps(self):
+        """ cenps_fsm is an FSM of an annotated gene that is subsumed by the
+            CENPS-CORT readthrough locus. However, it should just
+            be annotated to CENPS"""
+
+        conn = sqlite3.connect("scratch/readthrough.db")
+        conn.row_factory = sqlite3.Row
+        cursor = conn.cursor()
+
+        dataset = "hl60_1_1"
+        read_ID = "cenps_fsm"
+
+        # Fetch observed entry from table
+        query = """SELECT * from observed WHERE dataset = ?
+                       AND read_name = ?"""
+        assignment = cursor.execute(query, [dataset, read_ID]).fetchall()[0]
+
+        correct_gene_ID = fetch_correct_ID("CENPS", "gene", cursor)
+        assert assignment['gene_ID'] == correct_gene_ID
+
+        annot_dict = make_annot_dict(cursor, assignment['transcript_ID'])
+        assert annot_dict["transcript_status"] == "KNOWN"
+        conn.close()
+
+    def test_NIC_of_annot_single_gene_2(self):
+        """ cenps_nic shares all sss w/ annotated CENPS models, combined in a
+        novel way; validated by looking for the shared sss of the weird exon
+        w/ known models (CENPS-204 and CENPS-205) """
+
+        conn = sqlite3.connect("scratch/readthrough.db")
+        conn.row_factory = sqlite3.Row
+        cursor = conn.cursor()
+
+        dataset = "hl60_1_1"
+        read_ID = "cenps_nic"
+
+        # Fetch observed entry from table
+        query = """SELECT * from observed WHERE dataset = ? AND read_name = ?"""
+        assignment = cursor.execute(query, [dataset, read_ID]).fetchall()[0]
+        correct_gene_ID = fetch_correct_ID("CENPS", "gene", cursor)
+        assert assignment['gene_ID'] == correct_gene_ID
+
+        # Now make sure that the novel transcript was annotated correctly
+        annot_dict = make_annot_dict(cursor, assignment['transcript_ID'])
+        assert annot_dict["NIC_transcript"] == "TRUE"
+        assert annot_dict["transcript_status"] == "NOVEL"
+        conn.close()
+
+    def test_NNC_of_annot_single_gene_2(self):
+        """ cenps_nnc introduces a novel splice site relative to the annotated
+        CENPS models (the read was originally misnamed); validated by looking
+        for the shared sss of the weird exon w/ known models (CENPS-204 and
+        CENPS-205) """
+
+        conn = sqlite3.connect("scratch/readthrough.db")
+        conn.row_factory = sqlite3.Row
+        cursor = conn.cursor()
+
+        dataset = "hl60_1_1"
+        read_ID = "cenps_nnc"
+
+        # Fetch observed entry from table
+        query = """SELECT * from observed WHERE dataset = ? AND read_name = ?"""
+        assignment = cursor.execute(query, [dataset, read_ID]).fetchall()[0]
+        correct_gene_ID = fetch_correct_ID("CENPS", "gene", cursor)
+        assert assignment['gene_ID'] == correct_gene_ID
+
+        # Now make sure that the novel transcript was annotated correctly
+        annot_dict = make_annot_dict(cursor, assignment['transcript_ID'])
+        assert annot_dict["NNC_transcript"] == "TRUE"
+        assert annot_dict["transcript_status"] == "NOVEL"
+        conn.close()
+
+
+
 def make_annot_dict_gene(cursor, gene_ID):
     """ Extracts all gene annotations for the given gene ID and puts

From 364cde4941b4effe83896fa901e8fb287fcd52df Mon Sep 17 00:00:00 2001
From: fairliereese
Date: Mon, 25 Sep 2023 14:58:09 -0700
Subject: [PATCH 09/31] added code to pick which gene for possible fusion
 transcripts (that does not look like it will work)

---
 src/talon/talon.py | 60 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 60 insertions(+)

diff --git a/src/talon/talon.py b/src/talon/talon.py
index 8601ba1..2948b55 100644
--- a/src/talon/talon.py
+++ b/src/talon/talon.py
@@ -996,6 +996,32 @@ def process_NIC(chrom, positions, strand, edge_IDs, vertex_IDs, transcript_dict,
 
     return gene_ID, transcript_ID, novelty, start_end_info, fusion
 
+def get_vertex_2_gene_df(vertex_2_gene):
+    """
+    Get a DataFrame mapping each unique combination of vertex:gene_ID
+
+    Parameters:
+        vertex_2_gene (dict): Dictionary mapping each vertex ID to a list of
+            gene IDs that the vertex is found in
+
+    Returns:
+        df (pandas DataFrame): DataFrame of unique vertex:gene combinations
+    """
+    gids = []
+    vids = []
+    for key, item in vertex_2_gene.items():
+        for item2 in item:
+            gids.append(item2[0])
+            vids.append(key)
+    # df = pd.DataFrame.from_dict(vertex_2_gene, orient='index')
+    df = pd.DataFrame()
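    # (the gids/vids lists built above give one row per (gene ID, vertex ID)
    #  pair, so a vertex shared by several genes contributes several rows)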
df['gid'] = gids + df['vid'] = vids + print(df.head()) + print(len(df.index)) + print(len(df.vid.unique().tolist())) + print(df.loc[df.vid.duplicated(keep=False)].sort_values(by='vid')) + return df def find_gene_match_on_vertex_basis(vertex_IDs, strand, vertex_2_gene): """ Use vertices in a transcript to try to pinpoint the gene it belongs to. @@ -1025,9 +1051,22 @@ def find_gene_match_on_vertex_basis(vertex_IDs, strand, vertex_2_gene): # how many genes have this splice site? n_gene_matches.append(len(matches)) + print('curr_matches)') + print(curr_matches) + + df = get_vertex_2_gene_df(vertex_2_gene) + + print('eeps epps') + print(vertex_IDs) + print(df.head()) # how many splice sites are from each gene gene_tally = dict((x, gene_matches.count(x)) for x in set(gene_matches)) + print(gene_tally) + print(len(gene_tally)) + print(n_gene_matches) + print(' genes') + print(gene_matches) # no shared splice junctions if len(gene_matches) == 0: @@ -1038,8 +1077,29 @@ def find_gene_match_on_vertex_basis(vertex_IDs, strand, vertex_2_gene): # when there are no shared splice sites between gene hits but we did # hit more than one gene elif max(n_gene_matches) <= 1 and len(gene_tally) > 1: + print(' went here') return None, True + # if we hit more than one gene and they have overlapping sjs, + # tiebreak based on % of SJs from each + # gene that we hit. pick gene w/ greatest percentage + elif len(gene_tally) > 1: + temp = df.loc[df.gid.isin(gene_matches)].copy(deep=True) + temp = temp.drop_duplicates() + + # get total # vertices / gene + temp1 = temp.groupby('gid').count().reset_index().rename({'vid': 'n_vert'}, axis=1) + + # get total # detected vertices / gene + temp2 = temp.loc[temp.vid.isin(vertex_IDs)].copy(deep=True) + temp2 = temp2.groupby('gid').count().reset_index().rename({'vid': 'n_vert_in_t'}, axis=1) + + # merge + temp3 = temp1.merge(temp2, on='gid') + print(temp3) + + + # For the main assignment, pick the gene that is observed the most else: gene_ID = max(gene_tally, key=gene_tally.get) From bbf00348babd01afe328d2c671e1897be1204ebd Mon Sep 17 00:00:00 2001 From: fairliereese Date: Mon, 25 Sep 2023 16:28:21 -0700 Subject: [PATCH 10/31] other attempt at tiebreaking based on % ref overlap... also doesnt entirely work --- src/talon/init_refs.py | 12 ++--- src/talon/talon.py | 118 +++++++++++++++++++++++++++-------------- 2 files changed, 83 insertions(+), 47 deletions(-) diff --git a/src/talon/init_refs.py b/src/talon/init_refs.py index b6fd3ce..7c36710 100644 --- a/src/talon/init_refs.py +++ b/src/talon/init_refs.py @@ -1,7 +1,7 @@ # TALON: Techonology-Agnostic Long Read Analysis Pipeline # Author: Dana Wyman # ----------------------------------------------------------------------------- -# Contains functions that query the database to initialize various data +# Contains functions that query the database to initialize various data # structures for the TALON run. 
# --------------------------------------------------------------------- # make_temp_novel_gene_table @@ -13,8 +13,9 @@ # make_gene_start_and_end_dict from string import Template +import pandas as pd -def make_temp_novel_gene_table(cursor, build, chrom = None, start = None, +def make_temp_novel_gene_table(cursor, build, chrom = None, start = None, end = None, tmp_tab = "temp_gene"): """ Attaches a temporary database with a table that has the following fields: - gene_ID @@ -137,7 +138,7 @@ def make_temp_monoexonic_transcript_table(cursor, build, chrom = None, OR (max_pos >= $start AND max_pos <= $end))""") command = command.substitute({'build':build, 'chrom':chrom, - 'start':start, 'end':end, + 'start':start, 'end':end, 'tmp_tab':tmp_tab}) cursor.execute(command) @@ -292,8 +293,8 @@ def make_vertex_2_gene_dict(cursor, build = None, chrom = None, start = None, en return vertex_2_gene def make_gene_start_or_end_dict(cursor, build, mode, chrom = None, start = None, end = None): - """ Select the starts (or ends) of known genes in the database and store - in a dict. + """ Select the starts (or ends) of known genes in the database and store + in a dict. Format of dict: Key: gene ID from database Value: dict mapping positions to start vertices (or end vertices) of @@ -350,4 +351,3 @@ def make_gene_start_or_end_dict(cursor, build, mode, chrom = None, start = None, output_dict[gene_ID][pos] = vertex return output_dict - diff --git a/src/talon/talon.py b/src/talon/talon.py index 2948b55..14be17d 100644 --- a/src/talon/talon.py +++ b/src/talon/talon.py @@ -611,10 +611,12 @@ def search_for_overlap_with_gene(chromosome, start, end, strand, If there is more than one same-strand option, prioritize amount of overlap. Antisense matches may be returned if there is no same strand match. """ - + print('in search for overlap with gene') min_start = min(start, end) max_end = max(start, end) query_interval = [min_start, max_end] + print('query interval') + print(query_interval) query = Template(""" SELECT gene_ID, chromosome, @@ -637,6 +639,9 @@ def search_for_overlap_with_gene(chromosome, start, end, strand, # Among multiple matches, preferentially return the same-strand gene with # the greatest amount of overlap + # print('start+end') + # print(start) + # print(end) same_strand_matches = len([x for x in matches if x["strand"] == strand]) # for m in matches: # print() @@ -663,16 +668,28 @@ def get_best_match(matches, query_interval): """ Given a set of gene matches and a query interval, return the match that has the greatest amount of overlap with the query.""" + print('matching based on overlap') max_overlap = 0 + max_perc_overlap = 0 best_match = None - for match in matches: + print(match['gene_ID']) match_interval = [match['start'], match['end']] - overlap = get_overlap(query_interval, match_interval) - if overlap >= max_overlap: + overlap, perc_overlap = get_overlap(query_interval, match_interval) + print(overlap) + print(perc_overlap) + if overlap > max_overlap: max_overlap = overlap + max_perc_overlap = perc_overlap best_match = match + elif overlap == max_overlap: + if perc_overlap > max_perc_overlap: + max_overlap = overlap + max_perc_overlap = perc_overlap + best_match = match + print('best match') + print(best_match['gene_ID']) return best_match @@ -682,11 +699,15 @@ def get_overlap(a, b): ends of each interval as inclusive, meaning that if a = b = [10, 20], the overlap reported would be 11, not 10. 
Args: - a: First interval, formattted as a list - b: Second interval, formatted as a list + a: First interval, formattted as a list (query) + b: Second interval, formatted as a list (reference) + perc_overlap: Percent overlap from the reference interval that the + query interval consumed """ overlap = max(0, min(a[1], b[1]) - max(a[0], b[0]) + 1) - return overlap + ref_len = abs(b[1]-b[0]) + perc_overlap = (overlap/ref_len)*100 + return overlap, perc_overlap def search_for_transcript(edge_IDs, transcript_dict): @@ -1017,10 +1038,10 @@ def get_vertex_2_gene_df(vertex_2_gene): df = pd.DataFrame() df['gid'] = gids df['vid'] = vids - print(df.head()) - print(len(df.index)) - print(len(df.vid.unique().tolist())) - print(df.loc[df.vid.duplicated(keep=False)].sort_values(by='vid')) + # print(df.head()) + # print(len(df.index)) + # print(len(df.vid.unique().tolist())) + # print(df.loc[df.vid.duplicated(keep=False)].sort_values(by='vid')) return df def find_gene_match_on_vertex_basis(vertex_IDs, strand, vertex_2_gene): @@ -1051,22 +1072,23 @@ def find_gene_match_on_vertex_basis(vertex_IDs, strand, vertex_2_gene): # how many genes have this splice site? n_gene_matches.append(len(matches)) - print('curr_matches)') - print(curr_matches) + # print('curr_matches)') + # print(curr_matches) df = get_vertex_2_gene_df(vertex_2_gene) - print('eeps epps') - print(vertex_IDs) - print(df.head()) + # print('eeps epps') + # print(vertex_IDs) + # print(df.head()) # how many splice sites are from each gene gene_tally = dict((x, gene_matches.count(x)) for x in set(gene_matches)) - print(gene_tally) - print(len(gene_tally)) - print(n_gene_matches) - print(' genes') - print(gene_matches) + # print('tally') + # print(gene_tally) + # print(len(gene_tally)) + # print(n_gene_matches) + # print(' genes') + # print(gene_matches) # no shared splice junctions if len(gene_matches) == 0: @@ -1081,22 +1103,23 @@ def find_gene_match_on_vertex_basis(vertex_IDs, strand, vertex_2_gene): return None, True # if we hit more than one gene and they have overlapping sjs, - # tiebreak based on % of SJs from each - # gene that we hit. pick gene w/ greatest percentage + # tie break based on ????? 
elif len(gene_tally) > 1: - temp = df.loc[df.gid.isin(gene_matches)].copy(deep=True) - temp = temp.drop_duplicates() - - # get total # vertices / gene - temp1 = temp.groupby('gid').count().reset_index().rename({'vid': 'n_vert'}, axis=1) - - # get total # detected vertices / gene - temp2 = temp.loc[temp.vid.isin(vertex_IDs)].copy(deep=True) - temp2 = temp2.groupby('gid').count().reset_index().rename({'vid': 'n_vert_in_t'}, axis=1) - - # merge - temp3 = temp1.merge(temp2, on='gid') - print(temp3) + print('i am here') + return None, False + # temp = df.loc[df.gid.isin(gene_matches)].copy(deep=True) + # temp = temp.drop_duplicates() + # + # # get total # vertices / gene + # temp1 = temp.groupby('gid').count().reset_index().rename({'vid': 'n_vert'}, axis=1) + # + # # get total # detected vertices / gene + # temp2 = temp.loc[temp.vid.isin(vertex_IDs)].copy(deep=True) + # temp2 = temp2.groupby('gid').count().reset_index().rename({'vid': 'n_vert_in_t'}, axis=1) + # + # # merge + # temp3 = temp1.merge(temp2, on='gid') + # print(temp3) @@ -1109,16 +1132,26 @@ def find_gene_match_on_vertex_basis(vertex_IDs, strand, vertex_2_gene): def process_NNC(chrom, positions, strand, edge_IDs, vertex_IDs, transcript_dict, - gene_starts, gene_ends, edge_dict, locations, vertex_2_gene, run_info): + gene_starts, gene_ends, edge_dict, locations, vertex_2_gene, run_info, + cursor, tmp_gene): """ Novel not in catalog case """ novelty = [] start_end_info = {} + # first try to assign gene based on vertex concordance gene_ID, fusion = find_gene_match_on_vertex_basis( vertex_IDs, strand, vertex_2_gene) + + # otherwise look for genomic overlap with existing genes if gene_ID == None: - return None, None, [], None, False + gene_ID, match_strand = search_for_overlap_with_gene(chrom, positions[0], + positions[-1], strand, + cursor, run_info, tmp_gene) + print('geneid from search for overlap with gene') + print(gene_ID) + if gene_ID == None: + return None, None, [], None, False # Get matches for the ends start_vertex, start_exon, start_novelty, known_start, diff_5p = process_5p(chrom, @@ -1232,7 +1265,6 @@ def process_remaining_mult_cases(chrom, positions, strand, edge_IDs, vertex_IDs, gene_novelty = [] transcript_novelty = [] start_end_info = {} - if not run_info.create_novel_spliced_genes and not fusion: gene_ID, match_strand = search_for_overlap_with_gene(chrom, positions[0], positions[-1], strand, @@ -1443,7 +1475,9 @@ def identify_transcript(chrom, positions, strand, cursor, location_dict, edge_di vertex_IDs, transcript_dict, gene_starts, gene_ends, edge_dict, location_dict, - vertex_2_gene, run_info) + vertex_2_gene, run_info, + cursor, tmp_gene) + print(f'geneID from process_nnc: {gene_ID}') # Transcripts that don't match the previous categories end up here if gene_ID == None: print('looking for this other stuff') @@ -1458,6 +1492,8 @@ def identify_transcript(chrom, positions, strand, cursor, location_dict, edge_di cursor, tmp_gene, fusion) + print('this is the gene id it decided on') + print(gene_ID) # Add all novel vertices to vertex_2_gene now that we have the gene ID vertex_IDs = start_end_info["vertex_IDs"] edge_IDs = start_end_info["edge_IDs"] @@ -1885,7 +1921,7 @@ def identify_monoexon_transcript(chrom, positions, strand, cursor, location_dict for match in matches: # get overlap and compare match_interval = [match['start'], match['end']] - overlap = get_overlap([start, end], match_interval) + overlap, perc_overlap = get_overlap([start, end], match_interval) if overlap >= best_overlap: best_overlap = overlap 
best_match = match From b73d6aea380e7e271f39ac07d25c8ee81f30b785 Mon Sep 17 00:00:00 2001 From: fairliereese Date: Mon, 25 Sep 2023 17:04:36 -0700 Subject: [PATCH 11/31] changed gene overlap heuristic to be % overlap --- src/talon/talon.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/talon/talon.py b/src/talon/talon.py index 14be17d..682998b 100644 --- a/src/talon/talon.py +++ b/src/talon/talon.py @@ -678,15 +678,15 @@ def get_best_match(matches, query_interval): overlap, perc_overlap = get_overlap(query_interval, match_interval) print(overlap) print(perc_overlap) - if overlap > max_overlap: + if perc_overlap > max_perc_overlap: max_overlap = overlap max_perc_overlap = perc_overlap best_match = match - elif overlap == max_overlap: - if perc_overlap > max_perc_overlap: - max_overlap = overlap - max_perc_overlap = perc_overlap - best_match = match + # elif overlap == max_overlap: + # if perc_overlap > max_perc_overlap: + # max_overlap = overlap + # max_perc_overlap = perc_overlap + # best_match = match print('best match') print(best_match['gene_ID']) From ac800cee95a15ef1fe88647d22d7c1f1ba04c35a Mon Sep 17 00:00:00 2001 From: fairliereese Date: Tue, 26 Sep 2023 13:48:04 -0700 Subject: [PATCH 12/31] changed tiebreaking b/w genes to sum of 3' + 5' distance of read ends to gene end --- src/talon/talon.py | 111 ++++++++++++++++++++++++++++++++------------- 1 file changed, 79 insertions(+), 32 deletions(-) diff --git a/src/talon/talon.py b/src/talon/talon.py index 682998b..2983695 100644 --- a/src/talon/talon.py +++ b/src/talon/talon.py @@ -604,7 +604,8 @@ def search_for_ISM(edge_IDs, transcript_dict): def search_for_overlap_with_gene(chromosome, start, end, strand, - cursor, run_info, tmp_gene): + cursor, run_info, tmp_gene, + gene_starts, gene_ends): """ Given a start and an end value for an interval, query the database to determine whether the interval overlaps with any genes. If it there is more than one match, prioritize same-strand first and foremost. @@ -615,8 +616,8 @@ def search_for_overlap_with_gene(chromosome, start, end, strand, min_start = min(start, end) max_end = max(start, end) query_interval = [min_start, max_end] - print('query interval') - print(query_interval) + # print('query interval') + # print(query_interval) query = Template(""" SELECT gene_ID, chromosome, @@ -655,44 +656,76 @@ def search_for_overlap_with_gene(chromosome, start, end, strand, strand == "-" and same_strand_matches == 0: matches = [x for x in matches if x["strand"] == "+"] - best_match = get_best_match(matches, query_interval) + # best_match = get_best_match(matches, query_interval) + best_match = get_best_match(matches, start, end, + gene_starts, gene_ends) else: matches = [x for x in matches if x["strand"] == "-"] - best_match = get_best_match(matches, query_interval) + # best_match = get_best_match(matches, query_interval) + best_match = get_best_match(matches, start, end, + gene_starts, gene_ends) return best_match['gene_ID'], best_match['strand'] +def get_best_match(matches, start, end, + gene_starts, gene_ends): + """ + Get the best gene match based on distances of start and end of + read to starts and ends from transcripts of genes. The gene with the + lowest absolute genomic distance between 5' ends and 3' ends will win. 
+ """ + min_dist = sys.maxsize + best_match = None -def get_best_match(matches, query_interval): - """ Given a set of gene matches and a query interval, return the match - that has the greatest amount of overlap with the query.""" + print(f'read start: {start}') + print(f'read end: {end}') - print('matching based on overlap') - max_overlap = 0 - max_perc_overlap = 0 - best_match = None + # TODO - maybe don't need gene_starts + gene_ends? for match in matches: - print(match['gene_ID']) - match_interval = [match['start'], match['end']] - overlap, perc_overlap = get_overlap(query_interval, match_interval) - print(overlap) - print(perc_overlap) - if perc_overlap > max_perc_overlap: - max_overlap = overlap - max_perc_overlap = perc_overlap + print() + print(f"gene: {match['gene_ID']}") + end_dist = abs(match['end']-end) + start_dist = abs(match['start']-start) + + print(f"gene start: {match['start']}") + print(f"gene end: {match['end']}") + dist = end_dist+start_dist + print(f'dist: {dist}') + if dist < min_dist: + min_dist = dist best_match = match - # elif overlap == max_overlap: - # if perc_overlap > max_perc_overlap: - # max_overlap = overlap - # max_perc_overlap = perc_overlap - # best_match = match print('best match') print(best_match['gene_ID']) return best_match + +# def get_best_match(matches, query_interval): +# """ Given a set of gene matches and a query interval, return the match +# that has the greatest amount of overlap with the query.""" +# +# print('matching based on overlap') +# max_overlap = 0 +# max_perc_overlap = 0 +# best_match = None +# for match in matches: +# print(match['gene_ID']) +# match_interval = [match['start'], match['end']] +# overlap, perc_overlap = get_overlap(query_interval, match_interval) +# print(overlap) +# print(perc_overlap) +# if overlap > max_overlap: +# max_overlap = overlap +# max_perc_overlap = perc_overlap +# best_match = match +# +# print('best match') +# print(best_match['gene_ID']) +# return best_match + + def get_overlap(a, b): """ Computes the amount of overlap between two intervals. Returns 0 if there is no overlap. The function treats the start and @@ -965,7 +998,8 @@ def process_ISM(chrom, positions, strand, edge_IDs, vertex_IDs, all_matches, tra def process_NIC(chrom, positions, strand, edge_IDs, vertex_IDs, transcript_dict, - gene_starts, gene_ends, edge_dict, locations, vertex_2_gene, run_info): + gene_starts, gene_ends, edge_dict, locations, vertex_2_gene, run_info, + cursor, tmp_gene): """ For a transcript that has been determined to be novel in catalog, find the proper gene match (documenting fusion event if applicable). 
To do this, look up each vertex in the vertex_2_gene dict, and keep track of all @@ -976,6 +1010,14 @@ def process_NIC(chrom, positions, strand, edge_IDs, vertex_IDs, transcript_dict, gene_ID, fusion = find_gene_match_on_vertex_basis(vertex_IDs, strand, vertex_2_gene) + # otherwise look for closest gene based on end differences + if gene_ID == None: + gene_ID, match_strand = search_for_overlap_with_gene(chrom, positions[0], + positions[-1], strand, + cursor, run_info, tmp_gene, + gene_starts, gene_ends) + print('geneid from search for overlap with gene 9NIC)') + print(gene_ID) if gene_ID == None: return None, None, [], None, fusion @@ -1143,11 +1185,12 @@ def process_NNC(chrom, positions, strand, edge_IDs, vertex_IDs, transcript_dict, gene_ID, fusion = find_gene_match_on_vertex_basis( vertex_IDs, strand, vertex_2_gene) - # otherwise look for genomic overlap with existing genes + # otherwise look for closest gene based on end differences if gene_ID == None: gene_ID, match_strand = search_for_overlap_with_gene(chrom, positions[0], positions[-1], strand, - cursor, run_info, tmp_gene) + cursor, run_info, tmp_gene, + gene_starts, gene_ends) print('geneid from search for overlap with gene') print(gene_ID) if gene_ID == None: @@ -1268,7 +1311,8 @@ def process_remaining_mult_cases(chrom, positions, strand, edge_IDs, vertex_IDs, if not run_info.create_novel_spliced_genes and not fusion: gene_ID, match_strand = search_for_overlap_with_gene(chrom, positions[0], positions[-1], strand, - cursor, run_info, tmp_gene) + cursor, run_info, tmp_gene, + gene_starts, gene_ends) else: gene_ID = None match_strand = None @@ -1437,7 +1481,8 @@ def identify_transcript(chrom, positions, strand, cursor, location_dict, edge_di vertex_IDs, transcript_dict, gene_starts, gene_ends, edge_dict, location_dict, - vertex_2_gene, run_info) + vertex_2_gene, run_info, + cursor, tmp_gene) # Novel in catalog transcripts have known splice donors and acceptors, # but new connections between them. 
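The tie-break this patch threads through the callers reduces to choosing the
candidate gene whose annotated span lies closest to the read's ends. A minimal
sketch of that rule, assuming hypothetical candidate records (the real
get_best_match above operates on sqlite3.Row matches and also considers
strand):

def closest_gene(candidates, read_start, read_end):
    # candidates: list of dicts with 'gene_ID', 'start', and 'end' keys.
    # Picks the gene minimizing |gene_start - read_start| + |gene_end - read_end|,
    # mirroring the min_dist loop in get_best_match.
    return min(
        candidates,
        key=lambda g: abs(g["start"] - read_start) + abs(g["end"] - read_end),
    )["gene_ID"]

# e.g. a read spanning 100-5000 goes to gene B, whose ends sit closer overall
genes = [{"gene_ID": "A", "start": 90, "end": 2000},
         {"gene_ID": "B", "start": 80, "end": 5100}]
assert closest_gene(genes, 100, 5000) == "B"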
@@ -1449,7 +1494,8 @@
                                                        vertex_IDs, transcript_dict,
                                                        gene_starts, gene_ends,
                                                        edge_dict, location_dict,
-                                                       vertex_2_gene, run_info)
+                                                       vertex_2_gene, run_info,
+                                                       cursor, tmp_gene)
 
     # Antisense transcript with splice junctions matching known gene
     if splice_vertices_known and gene_ID == None and not fusion:

From accf79524cdc7e5174b70ce8c05a2588a1747725 Mon Sep 17 00:00:00 2001
From: fairliereese
Date: Tue, 26 Sep 2023 19:07:33 -0700
Subject: [PATCH 13/31] made sure that NIC assignment tiebreaking doesn't run
 when finding readthrough loci

---
 src/talon/talon.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/talon/talon.py b/src/talon/talon.py
index 2983695..a3db0c2 100644
--- a/src/talon/talon.py
+++ b/src/talon/talon.py
@@ -1010,8 +1010,9 @@ def process_NIC(chrom, positions, strand, edge_IDs, vertex_IDs, transcript_dict,
     gene_ID, fusion = find_gene_match_on_vertex_basis(vertex_IDs,
                                                       strand,
                                                       vertex_2_gene)
-    # otherwise look for closest gene based on end differences
-    if gene_ID == None:
+    # otherwise look for closest gene based on end differences,
+    # only if it wasn't previously labeled as fusion
+    if gene_ID == None and fusion == False:
         gene_ID, match_strand = search_for_overlap_with_gene(chrom, positions[0],
                                                              positions[-1], strand,
                                                              cursor, run_info, tmp_gene,

From 49eb8d149a3fc3b52f8a1e44b1704ad99c762c26 Mon Sep 17 00:00:00 2001
From: fairliereese
Date: Wed, 27 Sep 2023 17:04:37 -0700
Subject: [PATCH 14/31] added more edge case fixes for annotating fusion
 transcripts

---
 src/talon/talon.py | 160 ++++++++++++++++++++++++++++++++++-----------
 1 file changed, 121 insertions(+), 39 deletions(-)

diff --git a/src/talon/talon.py b/src/talon/talon.py
index a3db0c2..c409043 100644
--- a/src/talon/talon.py
+++ b/src/talon/talon.py
@@ -456,7 +456,9 @@ def create_transcript(chromosome, start_pos, end_pos, gene_ID, edge_IDs, vertex_
                       transcript_dict):
     """Creates a novel transcript and adds it to the transcript data structure.
     """
+    print('creating new transcript')
     new_ID = transcript_counter.increment()
+    print(f'new tid:{new_ID}')
     if len(edge_IDs) > 1:
         jn_path = ",".join(map(str, edge_IDs[1:-1]))
     else:
@@ -605,13 +607,17 @@ def search_for_ISM(edge_IDs, transcript_dict):
 
 def search_for_overlap_with_gene(chromosome, start, end, strand,
-                                 cursor, run_info, tmp_gene):
+                                 cursor, run_info, tmp_gene,
+                                 gene_starts, gene_ends, gene_IDs=None):
     """ Given a start and an end value for an interval, query the database
         to determine whether the interval overlaps with any genes. If it
         there is more than one match, prioritize same-strand first and foremost.
-        If there is more than one same-strand option, prioritize amount of
-        overlap. Antisense matches may be returned if there is no same strand
-        match. """
+        If there is more than one same-strand option, prioritize distance from 3' / 5'.
+        Antisense matches may be returned if there is no same strand
+        match.
+ + Parameters: + gene_ID (list of str or None): Restrict results to genes in this list + """ print('in search for overlap with gene') min_start = min(start, end) max_end = max(start, end) @@ -638,6 +644,11 @@ def search_for_overlap_with_gene(chromosome, start, end, strand, if len(matches) == 0: return None, None + # restrict to just the genes we care about + if gene_IDs: + print(f'restricting just to {gene_IDs}') + matches = [match for match in matches if match['gene_ID'] in gene_IDs] + # Among multiple matches, preferentially return the same-strand gene with # the greatest amount of overlap # print('start+end') @@ -666,6 +677,8 @@ def search_for_overlap_with_gene(chromosome, start, end, strand, best_match = get_best_match(matches, start, end, gene_starts, gene_ends) + print(f"but right here it says {best_match['gene_ID']}") + return best_match['gene_ID'], best_match['strand'] def get_best_match(matches, start, end, @@ -886,7 +899,8 @@ def process_3p(chrom, positions, strand, vertex_IDs, gene_ID, gene_ends, edge_di def process_ISM(chrom, positions, strand, edge_IDs, vertex_IDs, all_matches, transcript_dict, - gene_starts, gene_ends, edge_dict, locations, run_info): + gene_starts, gene_ends, edge_dict, locations, run_info, + cursor, tmp_gene): """ Given a transcript, try to find an ISM match for it. If the best match is an ISM with known ends, that will be promoted to NIC. """ @@ -899,7 +913,26 @@ def process_ISM(chrom, positions, strand, edge_IDs, vertex_IDs, all_matches, tra ISM = [] suffix = [] prefix = [] - gene_ID = all_matches[0]['gene_ID'] + + # choose gene to assign it to + gene_matches = list(set([match['gene_ID'] for match in all_matches])) + print(gene_matches) + + # tie break based on distance to 5' / 3' ends + if len(gene_matches) > 1: + gene_ID, _ = search_for_overlap_with_gene(chrom, positions[0], + positions[-1], strand, cursor, run_info, tmp_gene, + gene_starts, gene_ends, gene_IDs=gene_matches) + all_matches = [m for m in all_matches if m['gene_ID'] == gene_ID] + else: + gene_ID = all_matches[0]['gene_ID'] + + # print('edge IDs') + # print(edge_IDs) + # for match in all_matches: + # print(f"gene id:{match['gene_ID']}") + # print(match['jn_path']) + # Get matches for the ends if n_exons > 1: start_vertex, start_exon, start_novelty, known_start, diff_5p = process_5p(chrom, @@ -932,7 +965,7 @@ def process_ISM(chrom, positions, strand, edge_IDs, vertex_IDs, all_matches, tra known_start = 0 known_end = 0 - # Iterate over matches to characterize ISMs + # Iterate over all matches from assigned gene to characterize ISMs for match in all_matches: # Add ISM @@ -996,6 +1029,36 @@ def process_ISM(chrom, positions, strand, edge_IDs, vertex_IDs, all_matches, tra return gene_ID, transcript_ID, novelty, start_end_info +def assign_gene(vertex_IDs, strand, vertex_2_gene, + chrom, start, end, cursor, run_info, + tmp_gene, gene_starts, gene_ends): + """ + Assign a gene to a transcript. First do this on the basis of splice site + matching. If this yields more than one gene, then choose the gene with the + closest 5' / 3' ends. If the splice site matching returns multiple matches + between non-overlapping genes, mark as fusion and do not assign a gene. 
+ + Returns: + gene_ID (str or None): Gene ID of assigned gene, None if not fount + fusion (bool): Whether read appears to come from a novel fusion gene + """ + + # first attempt to assign based on matching vertices + gene_ID, fusion = find_gene_match_on_vertex_basis(vertex_IDs, + strand, + vertex_2_gene) + + # if previous function returned more than one gene that we need to tiebreak, + # look for closest gene based on end differences, out of candidate genes + # only if it wasn't previously labeled as fusion + if type(gene_ID) == list and fusion == False: + gene_ID, match_strand = search_for_overlap_with_gene(chrom, start, + end, strand, + cursor, run_info, tmp_gene, + gene_starts, gene_ends, + gene_IDs=gene_ID) + return gene_ID, fusion + def process_NIC(chrom, positions, strand, edge_IDs, vertex_IDs, transcript_dict, gene_starts, gene_ends, edge_dict, locations, vertex_2_gene, run_info, @@ -1006,19 +1069,23 @@ def process_NIC(chrom, positions, strand, edge_IDs, vertex_IDs, transcript_dict, same-strand genes. """ start_end_info = {} + gene_ID, fusion = assign_gene(vertex_IDs, strand, vertex_2_gene, + chrom, positions[0], positions[-1], cursor, run_info, + tmp_gene, gene_starts, gene_ends) + + # gene_ID, fusion = find_gene_match_on_vertex_basis(vertex_IDs, + # strand, + # vertex_2_gene) + # # otherwise look for closest gene based on end differences, + # # only if it wasn't previously labeled as fusion + # if gene_ID == None and fusion == False: + # gene_ID, match_strand = search_for_overlap_with_gene(chrom, positions[0], + # positions[-1], strand, + # cursor, run_info, tmp_gene, + # gene_starts, gene_ends) + # print('geneid from search for overlap with gene 9NIC)') + # print(gene_ID) - gene_ID, fusion = find_gene_match_on_vertex_basis(vertex_IDs, - strand, - vertex_2_gene) - # otherwise look for closest gene based on end differences, - # only if it wasn't previously labeled as fusion - if gene_ID == None and fusion == False: - gene_ID, match_strand = search_for_overlap_with_gene(chrom, positions[0], - positions[-1], strand, - cursor, run_info, tmp_gene, - gene_starts, gene_ends) - print('geneid from search for overlap with gene 9NIC)') - print(gene_ID) if gene_ID == None: return None, None, [], None, fusion @@ -1148,8 +1215,10 @@ def find_gene_match_on_vertex_basis(vertex_IDs, strand, vertex_2_gene): # if we hit more than one gene and they have overlapping sjs, # tie break based on ????? 
elif len(gene_tally) > 1: - print('i am here') - return None, False + print('i found more than one gene') + print(gene_tally) + print(n_gene_matches) + return list(gene_tally.keys()), False # temp = df.loc[df.gid.isin(gene_matches)].copy(deep=True) # temp = temp.drop_duplicates() # @@ -1182,20 +1251,27 @@ def process_NNC(chrom, positions, strand, edge_IDs, vertex_IDs, transcript_dict, novelty = [] start_end_info = {} - # first try to assign gene based on vertex concordance - gene_ID, fusion = find_gene_match_on_vertex_basis( - vertex_IDs, strand, vertex_2_gene) + # # first try to assign gene based on vertex concordance + # gene_ID, fusion = find_gene_match_on_vertex_basis( + # vertex_IDs, strand, vertex_2_gene) + # + # # otherwise look for closest gene based on end differences + # if gene_ID == None and fusion == False: + # gene_ID, match_strand = search_for_overlap_with_gene(chrom, positions[0], + # positions[-1], strand, + # cursor, run_info, tmp_gene, + # gene_starts, gene_ends) + # print('geneid from search for overlap with gene') + # print(gene_ID) + gene_ID, fusion = assign_gene(vertex_IDs, strand, vertex_2_gene, + chrom, positions[0], positions[-1], cursor, run_info, + tmp_gene, gene_starts, gene_ends) + print('gene id process_nnc') + print(gene_ID) + print(fusion) - # otherwise look for closest gene based on end differences if gene_ID == None: - gene_ID, match_strand = search_for_overlap_with_gene(chrom, positions[0], - positions[-1], strand, - cursor, run_info, tmp_gene, - gene_starts, gene_ends) - print('geneid from search for overlap with gene') - print(gene_ID) - if gene_ID == None: - return None, None, [], None, False + return None, None, [], None, fusion # Get matches for the ends start_vertex, start_exon, start_novelty, known_start, diff_5p = process_5p(chrom, @@ -1346,11 +1422,13 @@ def process_remaining_mult_cases(chrom, positions, strand, edge_IDs, vertex_IDs, start_end_info["vertex_IDs"] = vertex_IDs if gene_ID == None: - + print(f'fusion: {fusion}') if fusion: + print('i should be here') t_nov = 'fusion_transcript' g_nov = 'fusion_novel' else: + print('but I think im going here') t_nov = 'intergenic_transcript' g_nov = 'intergenic_novel' @@ -1446,6 +1524,7 @@ def identify_transcript(chrom, positions, strand, cursor, location_dict, edge_di all_exons_known = check_all_exons_known(e_novelty) splice_vertices_known = (sum(v_novelty) == 0) all_exons_novel = (reduce(operator.mul, e_novelty, 1) == 1) + print(f'all exons novel : {all_exons_novel}') fusion = False # Look for FSM or ISM. @@ -1472,7 +1551,9 @@ def identify_transcript(chrom, positions, strand, cursor, location_dict, edge_di transcript_dict, gene_starts, gene_ends, edge_dict, location_dict, - run_info) + run_info, cursor, tmp_gene) + print(f'gene id from process ism {gene_ID}') + # Look for NIC if gene_ID == None: print('looking for nic') @@ -1513,8 +1594,9 @@ def identify_transcript(chrom, positions, strand, cursor, location_dict, edge_di cursor, tmp_gene) # Novel not in catalog transcripts contain new splice donors/acceptors - # and contain at least one splice junction. - elif not(splice_vertices_known) and not fusion: + # and contain at least one splice junction. 
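The novelty flags that gate these branches are plain arithmetic over 0/1 indicators, where 1 marks a novel exon or vertex. A worked illustration with hypothetical flag vectors:

    import operator
    from functools import reduce

    e_novelty = [0, 1, 0]   # one novel exon
    v_novelty = [0, 0, 0]   # every splice vertex is known

    splice_vertices_known = (sum(v_novelty) == 0)                # True: no novel vertices
    all_exons_novel = (reduce(operator.mul, e_novelty, 1) == 1)  # product is 1 only if all flags are 1
    print(splice_vertices_known, all_exons_novel)                # True False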
There should also be at least + # one shared exon from existing transcripts to even try assigning a gene + elif not(splice_vertices_known) and not fusion and not all_exons_novel: print('lookign for NNCs') gene_ID, transcript_ID, transcript_novelty, start_end_info, fusion = process_NNC(chrom, positions, @@ -1524,7 +1606,7 @@ def identify_transcript(chrom, positions, strand, cursor, location_dict, edge_di edge_dict, location_dict, vertex_2_gene, run_info, cursor, tmp_gene) - print(f'geneID from process_nnc: {gene_ID}') + print(f'geneID from process_nnc: {gene_ID}') # Transcripts that don't match the previous categories end up here if gene_ID == None: print('looking for this other stuff') @@ -2006,7 +2088,7 @@ def identify_monoexon_transcript(chrom, positions, strand, cursor, location_dict transcript_dict, gene_starts, gene_ends, edge_dict, location_dict, - run_info) + run_info, cursor, tmp_gene) if gene_ID == None: # Find best gene match using overlap search if the ISM/NIC check didn't work gene_ID, match_strand = search_for_overlap_with_gene(chrom, positions[0], From 200fd72aa9bcf7a3103c3214575b9a851828d5e3 Mon Sep 17 00:00:00 2001 From: fairliereese Date: Wed, 27 Sep 2023 20:23:34 -0700 Subject: [PATCH 15/31] fixed (hopefully) antisense multimatch gene case --- src/talon/talon.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/talon/talon.py b/src/talon/talon.py index c409043..0ca09b1 100644 --- a/src/talon/talon.py +++ b/src/talon/talon.py @@ -1326,6 +1326,12 @@ def process_spliced_antisense(chrom, positions, strand, edge_IDs, vertex_IDs, anti_strand = "+" anti_gene_ID, fusion = find_gene_match_on_vertex_basis(vertex_IDs, anti_strand, vertex_2_gene) + if type(anti_gene_ID) == list and fusion == False: + anti_gene_ID, match_strand = search_for_overlap_with_gene(chrom, positions[0], + positions[-1], strand, + cursor, run_info, tmp_gene, + gene_starts, gene_ends, + gene_IDs=gene_ID) if anti_gene_ID == None: return None, None, gene_novelty, transcript_novelty, start_end_info From 939ee26e9a7115f2104736678871342f8ce062ed Mon Sep 17 00:00:00 2001 From: fairliereese Date: Wed, 27 Sep 2023 21:03:22 -0700 Subject: [PATCH 16/31] fixed typo --- src/talon/talon.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/talon/talon.py b/src/talon/talon.py index 0ca09b1..7fd1d46 100644 --- a/src/talon/talon.py +++ b/src/talon/talon.py @@ -1330,6 +1330,8 @@ def process_spliced_antisense(chrom, positions, strand, edge_IDs, vertex_IDs, anti_gene_ID, match_strand = search_for_overlap_with_gene(chrom, positions[0], positions[-1], strand, cursor, run_info, tmp_gene, + gene_starts, gene_ends, + gene_IDs=anti_gene_ID) gene_starts, gene_ends, gene_IDs=gene_ID) if anti_gene_ID == None: From 00705c7b4561df734d52cb6a7ce42b538d483ef0 Mon Sep 17 00:00:00 2001 From: fairliereese Date: Wed, 27 Sep 2023 21:08:15 -0700 Subject: [PATCH 17/31] fixed typo --- src/talon/talon.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/talon/talon.py b/src/talon/talon.py index 7fd1d46..417bc36 100644 --- a/src/talon/talon.py +++ b/src/talon/talon.py @@ -1332,8 +1332,6 @@ def process_spliced_antisense(chrom, positions, strand, edge_IDs, vertex_IDs, cursor, run_info, tmp_gene, gene_starts, gene_ends, gene_IDs=anti_gene_ID) - gene_starts, gene_ends, - gene_IDs=gene_ID) if anti_gene_ID == None: return None, None, gene_novelty, transcript_novelty, start_end_info From d6f8d3a39526a541c78d9765995deaa92e9deaf6 Mon Sep 17 00:00:00 2001 From: fairliereese Date: Thu, 28 Sep 2023 12:03:17 -0700 Subject: [PATCH 18/31] hopefully 
fixed an edge case --- src/talon/talon.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/talon/talon.py b/src/talon/talon.py index 417bc36..46ae61c 100644 --- a/src/talon/talon.py +++ b/src/talon/talon.py @@ -641,14 +641,15 @@ def search_for_overlap_with_gene(chromosome, start, end, strand, cursor.execute(query) matches = cursor.fetchall() - if len(matches) == 0: - return None, None - # restrict to just the genes we care about if gene_IDs: print(f'restricting just to {gene_IDs}') matches = [match for match in matches if match['gene_ID'] in gene_IDs] + + if len(matches) == 0: + return None, None + # Among multiple matches, preferentially return the same-strand gene with # the greatest amount of overlap # print('start+end') From 025f4632c23cea311385cb45c8554e58f7634d6a Mon Sep 17 00:00:00 2001 From: fairliereese Date: Fri, 29 Sep 2023 10:50:46 -0700 Subject: [PATCH 19/31] changed behavior to only make a novel gene for reads w/ no known splice sites only if it doesn't overlap just one gene (ie if it is a fusion candidate) --- src/talon/talon.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/talon/talon.py b/src/talon/talon.py index 46ae61c..c37a9c7 100644 --- a/src/talon/talon.py +++ b/src/talon/talon.py @@ -648,6 +648,7 @@ def search_for_overlap_with_gene(chromosome, start, end, strand, if len(matches) == 0: + print('uwu here') return None, None # Among multiple matches, preferentially return the same-strand gene with @@ -928,6 +929,10 @@ def process_ISM(chrom, positions, strand, edge_IDs, vertex_IDs, all_matches, tra else: gene_ID = all_matches[0]['gene_ID'] + # if we didn't assign a gene ID + if gene_ID == None: + return None, None, [], None + # print('edge IDs') # print(edge_IDs) # for match in all_matches: @@ -1040,7 +1045,7 @@ def assign_gene(vertex_IDs, strand, vertex_2_gene, between non-overlapping genes, mark as fusion and do not assign a gene. 
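The reordering in the hunk above (moving the emptiness check after the gene_IDs filter) matters because the filter can itself empty the match list; under the old order an empty post-filter list reached get_best_match, whose loop never assigns best_match, so indexing into the returned None raised a TypeError. A condensed illustration with stand-in data:

    # Every overlap match belongs to a gene outside the tie-break candidates
    matches = [{'gene_ID': 'G3', 'strand': '+'}]
    gene_IDs = ['G1', 'G2']

    # Old order: emptiness was tested before the filter ran ...
    if len(matches) == 0:
        raise SystemExit('no overlap')   # not triggered here
    matches = [m for m in matches if m['gene_ID'] in gene_IDs]

    # ... leaving an empty list for the tie-break step downstream
    assert matches == []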
Returns: - gene_ID (str or None): Gene ID of assigned gene, None if not fount + gene_ID (str or None): Gene ID of assigned gene, None if not found fusion (bool): Whether read appears to come from a novel fusion gene """ @@ -1392,7 +1397,8 @@ def process_remaining_mult_cases(chrom, positions, strand, edge_IDs, vertex_IDs, gene_novelty = [] transcript_novelty = [] start_end_info = {} - if not run_info.create_novel_spliced_genes and not fusion: + if not run_info.create_novel_spliced_genes or not fusion: + print('did i get here?') gene_ID, match_strand = search_for_overlap_with_gene(chrom, positions[0], positions[-1], strand, cursor, run_info, tmp_gene, From 4fa3a1f09e34a8a61093ba65706b9f86ad2416f3 Mon Sep 17 00:00:00 2001 From: fairliereese Date: Mon, 2 Oct 2023 15:59:28 -0700 Subject: [PATCH 20/31] changed tie breaking to work based on 3'/5' distance to any transcript from gene, rather than min/max 3'/5' of all transcripts from gene --- src/talon/init_refs.py | 61 +++++++++++++++++++ src/talon/talon.py | 133 ++++++++++++++++++++++++++--------------- 2 files changed, 145 insertions(+), 49 deletions(-) diff --git a/src/talon/init_refs.py b/src/talon/init_refs.py index 7c36710..df7f86f 100644 --- a/src/talon/init_refs.py +++ b/src/talon/init_refs.py @@ -72,6 +72,67 @@ def make_temp_novel_gene_table(cursor, build, chrom = None, start = None, return tmp_tab +def make_temp_transcript_table(cursor, build, chrom = None, + start = None, end = None, + tmp_tab = "temp_transcript"): + """ Attaches a temporary database with a table that has the following fields: + - gene_ID + - transcript_ID + - chromosome + - start (min position) + - end (max position) + - strand + The purpose is to allow location-based matching tiebreaking + transcripts. """ + + if any(val == None for val in [chrom, start, end]): + command = Template(""" CREATE TEMPORARY TABLE IF NOT EXISTS $tmp_tab AS + SELECT t.gene_ID, + t.transcript_ID, + loc1.chromosome, + genes.strand, + MIN(loc1.position, loc2.position) as min_pos, + MAX(loc1.position, loc2.position) as max_pos + FROM transcripts as t + LEFT JOIN location as loc1 + ON loc1.location_ID = t.start_vertex + LEFT JOIN location as loc2 + ON loc2.location_ID = t.end_vertex + LEFT JOIN genes + ON genes.gene_ID = t.gene_ID + WHERE loc1.genome_build = '$build' + AND loc2.genome_build = '$build' """) + else: + command = Template(""" CREATE TEMPORARY TABLE IF NOT EXISTS $tmp_tab AS + SELECT t.gene_ID, + t.transcript_ID, + loc1.chromosome, + genes.strand, + t.start_exon as exon_ID, + MIN(loc1.position, loc2.position) as min_pos, + MAX(loc1.position, loc2.position) as max_pos + FROM transcripts as t + LEFT JOIN location as loc1 + ON loc1.location_ID = t.start_vertex + LEFT JOIN location as loc2 + ON loc2.location_ID = t.end_vertex + LEFT JOIN genes + ON genes.gene_ID = t.gene_ID + WHERE loc1.genome_build = '$build' + AND loc2.genome_build = '$build' + AND loc1.chromosome = '$chrom' + AND ((min_pos <= $start AND max_pos >= $end) + OR (min_pos >= $start AND max_pos <= $end) + OR (min_pos >= $start AND min_pos <= $end) + OR (max_pos >= $start AND max_pos <= $end))""") + + command = command.substitute({'build':build, 'chrom':chrom, + 'start':start, 'end':end, + 'tmp_tab':tmp_tab}) + cursor.execute(command) + + return tmp_tab + def make_temp_monoexonic_transcript_table(cursor, build, chrom = None, start = None, end = None, tmp_tab = "temp_monoexon"): diff --git a/src/talon/talon.py b/src/talon/talon.py index c37a9c7..418ee2e 100644 --- a/src/talon/talon.py +++ b/src/talon/talon.py @@ -606,8 +606,8 
@@ def search_for_ISM(edge_IDs, transcript_dict): def search_for_overlap_with_gene(chromosome, start, end, strand, - cursor, run_info, tmp_gene, - gene_starts, gene_ends, gene_IDs=None): + cursor, run_info, tmp_gene, tmp_t, + gene_IDs=None): """ Given a start and an end value for an interval, query the database to determine whether the interval overlaps with any genes. If it there is more than one match, prioritize same-strand first and foremost. @@ -625,21 +625,46 @@ def search_for_overlap_with_gene(chromosome, start, end, strand, # print('query interval') # print(query_interval) - query = Template(""" SELECT gene_ID, - chromosome, - MIN(start) AS start, - MAX(end) AS end, - strand - FROM $tmp_gene - WHERE (chromosome = '$chrom') AND - ((start <= $min_start AND end >= $max_end) OR - (start >= $min_start AND end <= $max_end) OR - (start >= $min_start AND start <= $max_end) OR - (end >= $min_start AND end <= $max_end)) - GROUP BY gene_ID;""").substitute({'tmp_gene': tmp_gene, 'chrom': chromosome, - 'min_start': min_start, 'max_end': max_end}) + # query = Template(""" SELECT gene_ID, + # chromosome, + # MIN(start) AS start, + # MAX(end) AS end, + # strand + # FROM $tmp_gene + # WHERE (chromosome = '$chrom') AND + # ((start <= $min_start AND end >= $max_end) OR + # (start >= $min_start AND end <= $max_end) OR + # (start >= $min_start AND start <= $max_end) OR + # (end >= $min_start AND end <= $max_end)) + # GROUP BY gene_ID;""").substitute({'tmp_gene': tmp_gene, 'chrom': chromosome, + # 'min_start': min_start, 'max_end': max_end}) + if isinstance(gene_IDs, list): + query = Template("""SELECT gene_ID, + chromosome, + min_pos, + max_pos, + strand + FROM $tmp_t + WHERE gene_ID IN $gene_ids""").substitute({'tmp_t': tmp_t, \ + 'gene_ids': qutils.format_for_IN(gene_IDs)}) + elif not gene_IDs: + query = Template("""SELECT gene_ID, + chromosome, + min_pos, + max_pos, + strand + FROM $tmp_t + WHERE (chromosome = '$chrom') AND + ((start <= $min_start AND end >= $max_end) OR + (start >= $min_start AND end <= $max_end) OR + (start >= $min_start AND start <= $max_end) OR + (end >= $min_start AND end <= $max_end)) + GROUP BY gene_ID;""").substitute({'tmp_t': tmp_t, 'chrom': chromosome, + 'min_start': min_start, 'max_end': max_end}) cursor.execute(query) matches = cursor.fetchall() + print('quwewy:') + print(query) # restrict to just the genes we care about if gene_IDs: @@ -648,7 +673,7 @@ def search_for_overlap_with_gene(chromosome, start, end, strand, if len(matches) == 0: - print('uwu here') + print('herere here') return None, None # Among multiple matches, preferentially return the same-strand gene with @@ -670,21 +695,18 @@ def search_for_overlap_with_gene(chromosome, start, end, strand, matches = [x for x in matches if x["strand"] == "+"] # best_match = get_best_match(matches, query_interval) - best_match = get_best_match(matches, start, end, - gene_starts, gene_ends) + best_match = get_best_match(matches, min_start, max_end) else: matches = [x for x in matches if x["strand"] == "-"] # best_match = get_best_match(matches, query_interval) - best_match = get_best_match(matches, start, end, - gene_starts, gene_ends) + best_match = get_best_match(matches, min_start, max_end) print(f"but right here it says {best_match['gene_ID']}") return best_match['gene_ID'], best_match['strand'] -def get_best_match(matches, start, end, - gene_starts, gene_ends): +def get_best_match(matches, min_end, max_end): """ Get the best gene match based on distances of start and end of read to starts and ends from transcripts of 
genes. The gene with the @@ -693,18 +715,17 @@ def get_best_match(matches, start, end, min_dist = sys.maxsize best_match = None - print(f'read start: {start}') - print(f'read end: {end}') + print(f'read min: {min_end}') + print(f'read end: {max_end}') - # TODO - maybe don't need gene_starts + gene_ends? for match in matches: print() print(f"gene: {match['gene_ID']}") - end_dist = abs(match['end']-end) - start_dist = abs(match['start']-start) + end_dist = abs(match['max_pos']-max_end) + start_dist = abs(match['min_pos']-min_end) - print(f"gene start: {match['start']}") - print(f"gene end: {match['end']}") + print(f"gene start: {match['min_pos']}") + print(f"gene end: {match['max_pos']}") dist = end_dist+start_dist print(f'dist: {dist}') if dist < min_dist: @@ -902,7 +923,7 @@ def process_3p(chrom, positions, strand, vertex_IDs, gene_ID, gene_ends, edge_di def process_ISM(chrom, positions, strand, edge_IDs, vertex_IDs, all_matches, transcript_dict, gene_starts, gene_ends, edge_dict, locations, run_info, - cursor, tmp_gene): + cursor, tmp_gene, tmp_t): """ Given a transcript, try to find an ISM match for it. If the best match is an ISM with known ends, that will be promoted to NIC. """ @@ -924,7 +945,7 @@ def process_ISM(chrom, positions, strand, edge_IDs, vertex_IDs, all_matches, tra if len(gene_matches) > 1: gene_ID, _ = search_for_overlap_with_gene(chrom, positions[0], positions[-1], strand, cursor, run_info, tmp_gene, - gene_starts, gene_ends, gene_IDs=gene_matches) + tmp_t, gene_IDs=gene_matches) all_matches = [m for m in all_matches if m['gene_ID'] == gene_ID] else: gene_ID = all_matches[0]['gene_ID'] @@ -1037,7 +1058,7 @@ def process_ISM(chrom, positions, strand, edge_IDs, vertex_IDs, all_matches, tra def assign_gene(vertex_IDs, strand, vertex_2_gene, chrom, start, end, cursor, run_info, - tmp_gene, gene_starts, gene_ends): + tmp_gene, tmp_t, gene_starts, gene_ends): """ Assign a gene to a transcript. First do this on the basis of splice site matching. If this yields more than one gene, then choose the gene with the @@ -1061,14 +1082,14 @@ def assign_gene(vertex_IDs, strand, vertex_2_gene, gene_ID, match_strand = search_for_overlap_with_gene(chrom, start, end, strand, cursor, run_info, tmp_gene, - gene_starts, gene_ends, + tmp_t, gene_IDs=gene_ID) return gene_ID, fusion def process_NIC(chrom, positions, strand, edge_IDs, vertex_IDs, transcript_dict, gene_starts, gene_ends, edge_dict, locations, vertex_2_gene, run_info, - cursor, tmp_gene): + cursor, tmp_gene, tmp_t): """ For a transcript that has been determined to be novel in catalog, find the proper gene match (documenting fusion event if applicable). 
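Worth noting about the range conditions in these queries: the four OR-ed clauses are an expansion of the standard interval-overlap test and collapse to a single conjunction. A quick exhaustive check in plain Python (coordinates are arbitrary):

    import itertools

    def overlaps_4clause(min_pos, max_pos, min_start, max_end):
        return ((min_pos <= min_start and max_pos >= max_end) or
                (min_pos >= min_start and max_pos <= max_end) or
                (min_start <= min_pos <= max_end) or
                (min_start <= max_pos <= max_end))

    def overlaps_simple(min_pos, max_pos, min_start, max_end):
        return min_pos <= max_end and max_pos >= min_start

    for a, b, c, d in itertools.product(range(5), repeat=4):
        if a <= b and c <= d:  # only well-formed intervals
            assert overlaps_4clause(a, b, c, d) == overlaps_simple(a, b, c, d)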
To do this, look up each vertex in the vertex_2_gene dict, and keep track of all @@ -1077,7 +1098,7 @@ def process_NIC(chrom, positions, strand, edge_IDs, vertex_IDs, transcript_dict, start_end_info = {} gene_ID, fusion = assign_gene(vertex_IDs, strand, vertex_2_gene, chrom, positions[0], positions[-1], cursor, run_info, - tmp_gene, gene_starts, gene_ends) + tmp_gene, tmp_t, gene_starts, gene_ends) # gene_ID, fusion = find_gene_match_on_vertex_basis(vertex_IDs, # strand, @@ -1251,7 +1272,7 @@ def find_gene_match_on_vertex_basis(vertex_IDs, strand, vertex_2_gene): def process_NNC(chrom, positions, strand, edge_IDs, vertex_IDs, transcript_dict, gene_starts, gene_ends, edge_dict, locations, vertex_2_gene, run_info, - cursor, tmp_gene): + cursor, tmp_gene, tmp_t): """ Novel not in catalog case """ novelty = [] @@ -1271,7 +1292,7 @@ def process_NNC(chrom, positions, strand, edge_IDs, vertex_IDs, transcript_dict, # print(gene_ID) gene_ID, fusion = assign_gene(vertex_IDs, strand, vertex_2_gene, chrom, positions[0], positions[-1], cursor, run_info, - tmp_gene, gene_starts, gene_ends) + tmp_gene, tmp_t, gene_starts, gene_ends) print('gene id process_nnc') print(gene_ID) print(fusion) @@ -1319,7 +1340,8 @@ def process_NNC(chrom, positions, strand, edge_IDs, vertex_IDs, transcript_dict, def process_spliced_antisense(chrom, positions, strand, edge_IDs, vertex_IDs, transcript_dict, gene_starts, gene_ends, edge_dict, - locations, vertex_2_gene, run_info, cursor, tmp_gene): + locations, vertex_2_gene, run_info, cursor, tmp_gene, + tmp_t): """ Annotate a transcript as antisense with splice junctions """ gene_novelty = [] @@ -1336,7 +1358,7 @@ def process_spliced_antisense(chrom, positions, strand, edge_IDs, vertex_IDs, anti_gene_ID, match_strand = search_for_overlap_with_gene(chrom, positions[0], positions[-1], strand, cursor, run_info, tmp_gene, - gene_starts, gene_ends, + tmp_t, gene_IDs=anti_gene_ID) if anti_gene_ID == None: return None, None, gene_novelty, transcript_novelty, start_end_info @@ -1390,6 +1412,7 @@ def process_spliced_antisense(chrom, positions, strand, edge_IDs, vertex_IDs, def process_remaining_mult_cases(chrom, positions, strand, edge_IDs, vertex_IDs, transcript_dict, gene_starts, gene_ends, edge_dict, locations, vertex_2_gene, run_info, cursor, tmp_gene, + tmp_t, fusion): """ This function is a catch-all for multiexonic transcripts that were not FSM, ISM, NIC, NNC, or spliced antisense. 
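The per-transcript bounds table added earlier in this patch by make_temp_transcript_table comes down to CREATE TEMPORARY TABLE ... AS SELECT with SQLite's scalar two-argument MIN/MAX. A toy version against a flattened stand-in schema (the real query joins transcripts to a location table):

    import sqlite3

    conn = sqlite3.connect(':memory:')
    conn.executescript("""
        CREATE TABLE transcripts (gene_ID, transcript_ID, start_pos, end_pos);
        INSERT INTO transcripts VALUES ('G1', 'T1', 100, 900), ('G1', 'T2', 1100, 150);
        CREATE TEMPORARY TABLE temp_t AS
            SELECT gene_ID, transcript_ID,
                   MIN(start_pos, end_pos) AS min_pos,
                   MAX(start_pos, end_pos) AS max_pos
            FROM transcripts;
    """)
    print(conn.execute('SELECT * FROM temp_t').fetchall())
    # [('G1', 'T1', 100, 900), ('G1', 'T2', 150, 1100)]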
@@ -1402,7 +1425,7 @@ def process_remaining_mult_cases(chrom, positions, strand, edge_IDs, vertex_IDs, gene_ID, match_strand = search_for_overlap_with_gene(chrom, positions[0], positions[-1], strand, cursor, run_info, tmp_gene, - gene_starts, gene_ends) + tmp_t) else: gene_ID = None match_strand = None @@ -1496,7 +1519,7 @@ def update_vertex_2_gene(gene_ID, vertex_IDs, strand, vertex_2_gene): def identify_transcript(chrom, positions, strand, cursor, location_dict, edge_dict, transcript_dict, vertex_2_gene, gene_starts, gene_ends, - run_info, tmp_gene): + run_info, tmp_gene, tmp_t): """ Inputs: - Information about the query transcript - chromosome @@ -1564,7 +1587,7 @@ def identify_transcript(chrom, positions, strand, cursor, location_dict, edge_di transcript_dict, gene_starts, gene_ends, edge_dict, location_dict, - run_info, cursor, tmp_gene) + run_info, cursor, tmp_gene, tmp_t) print(f'gene id from process ism {gene_ID}') # Look for NIC @@ -1577,7 +1600,8 @@ def identify_transcript(chrom, positions, strand, cursor, location_dict, edge_di gene_starts, gene_ends, edge_dict, location_dict, vertex_2_gene, run_info, - cursor, tmp_gene) + cursor, tmp_gene, + tmp_t) # Novel in catalog transcripts have known splice donors and acceptors, # but new connections between them. @@ -1590,7 +1614,8 @@ def identify_transcript(chrom, positions, strand, cursor, location_dict, edge_di gene_starts, gene_ends, edge_dict, location_dict, vertex_2_gene, run_info, - cursor, tmp_gene) + cursor, tmp_gene, + tmp_t) # Antisense transcript with splice junctions matching known gene if splice_vertices_known and gene_ID == None and not fusion: @@ -1604,7 +1629,7 @@ def identify_transcript(chrom, positions, strand, cursor, location_dict, edge_di gene_ends, edge_dict, location_dict, vertex_2_gene, run_info, - cursor, tmp_gene) + cursor, tmp_gene, tmp_t) # Novel not in catalog transcripts contain new splice donors/acceptors # and contain at least one splice junction. 
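One reason tmp_t is threaded through every call alongside the cursor in these hunks: SQLite temporary tables are visible only to the connection that created them, so the table and the cursor that queries it must belong together. A minimal demonstration (the file name is arbitrary):

    import sqlite3

    conn_a = sqlite3.connect('demo.db')
    conn_b = sqlite3.connect('demo.db')

    conn_a.execute('CREATE TEMPORARY TABLE temp_t (gene_ID TEXT)')
    conn_a.execute("INSERT INTO temp_t VALUES ('G1')")
    print(conn_a.execute('SELECT * FROM temp_t').fetchall())  # [('G1',)]

    try:
        conn_b.execute('SELECT * FROM temp_t')  # same file, different connection
    except sqlite3.OperationalError as err:
        print(err)  # no such table: temp_t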
There should also be at least @@ -1618,7 +1643,8 @@ def identify_transcript(chrom, positions, strand, cursor, location_dict, edge_di gene_starts, gene_ends, edge_dict, location_dict, vertex_2_gene, run_info, - cursor, tmp_gene) + cursor, tmp_gene, + tmp_t) print(f'geneID from process_nnc: {gene_ID}') # Transcripts that don't match the previous categories end up here if gene_ID == None: @@ -1632,6 +1658,7 @@ def identify_transcript(chrom, positions, strand, cursor, location_dict, edge_di edge_dict, location_dict, vertex_2_gene, run_info, cursor, tmp_gene, + tmp_t, fusion) print('this is the gene id it decided on') @@ -1970,6 +1997,11 @@ def prepare_data_structures(cursor, run_info, chrom=None, start=None, start=start, end=end, tmp_tab="temp_monoexon_" + tmp_id) + struct_collection.tmp_t = init_refs.make_temp_transcript_table(cursor, + build, chrom=chrom, + start=start, end=end, + tmp_tab="temp_t_" + tmp_id) + location_dict = init_refs.make_location_dict(build, cursor, chrom=chrom, start=start, end=end) @@ -2027,6 +2059,7 @@ def compute_delta(orig_pos, new_pos, strand): def identify_monoexon_transcript(chrom, positions, strand, cursor, location_dict, edge_dict, transcript_dict, vertex_2_gene, gene_starts, gene_ends, run_info, tmp_gene, + tmp_t, tmp_monoexon): gene_novelty = [] transcript_novelty = [] @@ -2101,13 +2134,13 @@ def identify_monoexon_transcript(chrom, positions, strand, cursor, location_dict transcript_dict, gene_starts, gene_ends, edge_dict, location_dict, - run_info, cursor, tmp_gene) + run_info, cursor, tmp_gene, tmp_t) if gene_ID == None: # Find best gene match using overlap search if the ISM/NIC check didn't work gene_ID, match_strand = search_for_overlap_with_gene(chrom, positions[0], positions[1], strand, cursor, run_info, tmp_gene, - gene_starts, gene_ends) + tmp_t) # Intergenic case if gene_ID == None: gene_ID = create_gene(chrom, positions[0], positions[-1], @@ -2776,7 +2809,8 @@ def annotate_read(sam_record: pysam.AlignedSegment, cursor, run_info, vertex_2_gene, gene_starts, gene_ends, run_info, - struct_collection.tmp_gene) + struct_collection.tmp_gene, + struct_collection.tmp_t) else: annotation_info = identify_monoexon_transcript(chrom, positions, strand, cursor, location_dict, @@ -2784,6 +2818,7 @@ def annotate_read(sam_record: pysam.AlignedSegment, cursor, run_info, vertex_2_gene, gene_starts, gene_ends, run_info, struct_collection.tmp_gene, + struct_collection.tmp_t, struct_collection.tmp_monoexon) annotation_info.read_ID = read_ID From 34491fc65e501d7a0870307ac35fcc5a319ecc54 Mon Sep 17 00:00:00 2001 From: fairliereese Date: Mon, 2 Oct 2023 16:09:50 -0700 Subject: [PATCH 21/31] fixed incorrect sql col names --- src/talon/talon.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/talon/talon.py b/src/talon/talon.py index 418ee2e..0fadc48 100644 --- a/src/talon/talon.py +++ b/src/talon/talon.py @@ -655,10 +655,10 @@ def search_for_overlap_with_gene(chromosome, start, end, strand, strand FROM $tmp_t WHERE (chromosome = '$chrom') AND - ((start <= $min_start AND end >= $max_end) OR - (start >= $min_start AND end <= $max_end) OR - (start >= $min_start AND start <= $max_end) OR - (end >= $min_start AND end <= $max_end)) + ((min_pos <= $min_start AND max_pos >= $max_end) OR + (min_pos >= $min_start AND max_pos <= $max_end) OR + (min_pos >= $min_start AND min_pos <= $max_end) OR + (max_pos >= $min_start AND max_pos <= $max_end)) GROUP BY gene_ID;""").substitute({'tmp_t': tmp_t, 'chrom': chromosome, 'min_start': min_start, 'max_end': max_end}) 
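For reference, the Template-built IN clause used by these queries just substitutes a pre-formatted parenthesized list into the SQL string. A sketch of that mechanism; format_for_IN below is a stand-in for the query_utils helper the code imports, not its exact implementation:

    from string import Template

    def format_for_IN(ids):
        # Stand-in: render a Python list as a SQL IN list
        return '(' + ','.join(f"'{i}'" for i in ids) + ')'

    query = Template("""SELECT gene_ID, min_pos, max_pos, strand
                        FROM $tmp_t
                        WHERE gene_ID IN $gene_ids""")
    print(query.substitute({'tmp_t': 'temp_t_abc123',
                            'gene_ids': format_for_IN(['G1', 'G2'])}))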
cursor.execute(query) From 94b4e954014e97b2221e41ace5fe0fd60dee650e Mon Sep 17 00:00:00 2001 From: fairliereese Date: Tue, 3 Oct 2023 11:06:46 -0700 Subject: [PATCH 22/31] added tmp transcript db update when new transcripts are created --- src/talon/talon.py | 55 +++++++++++++++++++++++++++------------------- 1 file changed, 32 insertions(+), 23 deletions(-) diff --git a/src/talon/talon.py b/src/talon/talon.py index 0fadc48..9a14db0 100644 --- a/src/talon/talon.py +++ b/src/talon/talon.py @@ -452,13 +452,16 @@ def create_gene(chromosome, start, end, strand, memory_cursor, tmp_gene): return new_ID -def create_transcript(chromosome, start_pos, end_pos, gene_ID, edge_IDs, vertex_IDs, - transcript_dict): - """Creates a novel transcript and adds it to the transcript data structure. +def create_transcript(strand, chromosome, start_pos, end_pos, gene_ID, edge_IDs, vertex_IDs, + transcript_dict, tmp_t, memory_cursor): + """Creates a novel transcript, add it to the transcript data structure, + and add to tmp_t """ print('creating new transcript') new_ID = transcript_counter.increment() print(f'new tid:{new_ID}') + + # updating the dict if len(edge_IDs) > 1: jn_path = ",".join(map(str, edge_IDs[1:-1])) else: @@ -479,6 +482,12 @@ def create_transcript(chromosome, start_pos, end_pos, gene_ID, edge_IDs, vertex_ path_key = frozenset(edge_IDs) transcript_dict[path_key] = new_transcript + # updating tmp_t + new_t = (gene_ID, new_ID, chromosome, strand, min(start_pos, end_pos), max(start_pos, end_pos)) + cols = ' ("gene_ID", "transcript_ID", "chromosome", "strand", "min_pos", "max_pos")' + command = 'INSERT INTO ' + tmp_t + cols + ' VALUES ' + '(?,?,?,?,?,?)' + memory_cursor.execute(command, new_t) + return new_transcript @@ -1030,9 +1039,9 @@ def process_ISM(chrom, positions, strand, edge_IDs, vertex_IDs, all_matches, tra gene_ID = match['gene_ID'] suffix.append(str(match['transcript_ID'])) - novel_transcript = create_transcript(chrom, positions[0], positions[-1], + novel_transcript = create_transcript(strand, chrom, positions[0], positions[-1], gene_ID, edge_IDs, vertex_IDs, - transcript_dict) + transcript_dict, tmp_t, cursor) transcript_ID = novel_transcript['transcript_ID'] @@ -1144,9 +1153,9 @@ def process_NIC(chrom, positions, strand, edge_IDs, vertex_IDs, transcript_dict, start_end_info["vertex_IDs"] = vertex_IDs # Create a new transcript of that gene - novel_transcript = create_transcript(chrom, positions[0], positions[-1], + novel_transcript = create_transcript(strand, chrom, positions[0], positions[-1], gene_ID, edge_IDs, vertex_IDs, - transcript_dict) + transcript_dict, tmp_t, cursor) transcript_ID = novel_transcript["transcript_ID"] novelty = [(transcript_ID, run_info.idprefix, "TALON", "NIC_transcript", "TRUE")] @@ -1327,9 +1336,9 @@ def process_NNC(chrom, positions, strand, edge_IDs, vertex_IDs, transcript_dict, start_end_info["edge_IDs"] = edge_IDs start_end_info["vertex_IDs"] = vertex_IDs - transcript_ID = create_transcript(chrom, positions[0], positions[-1], + transcript_ID = create_transcript(strand, chrom, positions[0], positions[-1], gene_ID, edge_IDs, vertex_IDs, - transcript_dict)["transcript_ID"] + transcript_dict, tmp_t, cursor)["transcript_ID"] novelty.append((transcript_ID, run_info.idprefix, "TALON", "NNC_transcript", "TRUE")) @@ -1392,9 +1401,9 @@ def process_spliced_antisense(chrom, positions, strand, edge_IDs, vertex_IDs, gene_ID = create_gene(chrom, positions[0], positions[-1], strand, cursor, tmp_gene) - transcript_ID = create_transcript(chrom, positions[0], positions[-1], + 
transcript_ID = create_transcript(strand, chrom, positions[0], positions[-1], gene_ID, edge_IDs, vertex_IDs, - transcript_dict)["transcript_ID"] + transcript_dict, tmp_t, cursor)["transcript_ID"] # Handle gene annotations gene_novelty.append((gene_ID, run_info.idprefix, "TALON", @@ -1474,9 +1483,9 @@ def process_remaining_mult_cases(chrom, positions, strand, edge_IDs, vertex_IDs, gene_novelty.append((gene_ID, run_info.idprefix, "TALON", g_nov, "TRUE")) - transcript_ID = create_transcript(chrom, positions[0], positions[-1], + transcript_ID = create_transcript(strand, chrom, positions[0], positions[-1], gene_ID, edge_IDs, vertex_IDs, - transcript_dict)["transcript_ID"] + transcript_dict, tmp_t, cursor)["transcript_ID"] transcript_novelty.append((transcript_ID, run_info.idprefix, "TALON", t_nov, "TRUE")) @@ -1484,9 +1493,9 @@ def process_remaining_mult_cases(chrom, positions, strand, edge_IDs, vertex_IDs, anti_gene_ID = gene_ID gene_ID = create_gene(chrom, positions[0], positions[-1], strand, cursor, tmp_gene) - transcript_ID = create_transcript(chrom, positions[0], positions[-1], + transcript_ID = create_transcript(strand, chrom, positions[0], positions[-1], gene_ID, edge_IDs, vertex_IDs, - transcript_dict)["transcript_ID"] + transcript_dict, tmp_t, cursor)["transcript_ID"] gene_novelty.append((gene_ID, run_info.idprefix, "TALON", "antisense_gene", "TRUE")) @@ -1495,9 +1504,9 @@ def process_remaining_mult_cases(chrom, positions, strand, edge_IDs, vertex_IDs, transcript_novelty.append((transcript_ID, run_info.idprefix, "TALON", "antisense_transcript", "TRUE")) else: - transcript_ID = create_transcript(chrom, positions[0], positions[-1], + transcript_ID = create_transcript(strand, chrom, positions[0], positions[-1], gene_ID, edge_IDs, vertex_IDs, - transcript_dict)["transcript_ID"] + transcript_dict, tmp_t, cursor)["transcript_ID"] transcript_novelty.append((transcript_ID, run_info.idprefix, "TALON", "genomic_transcript", "TRUE")) @@ -2148,9 +2157,9 @@ def identify_monoexon_transcript(chrom, positions, strand, cursor, location_dict gene_novelty.append((gene_ID, run_info.idprefix, "TALON", "intergenic_novel", "TRUE")) - transcript_ID = create_transcript(chrom, positions[0], positions[-1], + transcript_ID = create_transcript(strand, chrom, positions[0], positions[-1], gene_ID, edge_IDs, vertex_IDs, - transcript_dict)["transcript_ID"] + transcript_dict, tmp_t, cursor)["transcript_ID"] transcript_novelty.append((transcript_ID, run_info.idprefix, "TALON", "intergenic_transcript", "TRUE")) # Antisense case @@ -2158,9 +2167,9 @@ def identify_monoexon_transcript(chrom, positions, strand, cursor, location_dict anti_gene_ID = gene_ID gene_ID = create_gene(chrom, positions[0], positions[-1], strand, cursor, tmp_gene) - transcript_ID = create_transcript(chrom, positions[0], positions[-1], + transcript_ID = create_transcript(strand, chrom, positions[0], positions[-1], gene_ID, edge_IDs, vertex_IDs, - transcript_dict)["transcript_ID"] + transcript_dict, tmp_t, cursor)["transcript_ID"] gene_novelty.append((gene_ID, run_info.idprefix, "TALON", "antisense_gene", "TRUE")) @@ -2171,9 +2180,9 @@ def identify_monoexon_transcript(chrom, positions, strand, cursor, location_dict # Same strand else: - transcript_ID = create_transcript(chrom, positions[0], positions[-1], + transcript_ID = create_transcript(strand, chrom, positions[0], positions[-1], gene_ID, edge_IDs, vertex_IDs, - transcript_dict)["transcript_ID"] + transcript_dict, tmp_t, cursor)["transcript_ID"] transcript_novelty.append((transcript_ID, 
run_info.idprefix, "TALON", "genomic_transcript", "TRUE")) From 8135a67f4a278034faeba381a27e92b396eac8f3 Mon Sep 17 00:00:00 2001 From: fairliereese Date: Mon, 9 Oct 2023 10:45:48 -0700 Subject: [PATCH 23/31] started to update with verbosity-tuneable logger --- src/talon/logger.py | 17 +++ src/talon/process_sams.py | 32 +++-- src/talon/talon.py | 240 +++++++++++++++++++++----------------- 3 files changed, 170 insertions(+), 119 deletions(-) create mode 100644 src/talon/logger.py diff --git a/src/talon/logger.py b/src/talon/logger.py new file mode 100644 index 0000000..9d485f0 --- /dev/null +++ b/src/talon/logger.py @@ -0,0 +1,17 @@ +import logging + + +def _init_logger(verbosity): + # https://coralogix.com/blog/python-logging-best-practices-tips/ + # https://stackoverflow.com/questions/14097061/easier-way-to-enable-verbose-logging + + levels = [logging.WARNING, logging.INFO, logging.DEBUG] + level = levels[min(verbosity, len(levels) - 1)] # cap to last level index + + # set defaults + msg_fmt = "%(asctime)s : %(levelname)s : [%(filename)s:%(lineno)d] : %(message)s" + date_fmt = "[ %Y-%m-%d %H:%M:%S ]" + + logging.basicConfig(level=level, + format=msg_fmt, + datefmt=date_fmt) diff --git a/src/talon/process_sams.py b/src/talon/process_sams.py index 4960198..84c8071 100644 --- a/src/talon/process_sams.py +++ b/src/talon/process_sams.py @@ -8,6 +8,7 @@ import pysam import os import time +import logging save = pysam.set_verbosity(0) # pysam.set_verbosity(save) @@ -23,8 +24,11 @@ def convert_to_bam(sam, bam, threads): outfile.write(s) except Exception as e: - print(e) - raise RuntimeError("Problem converting sam file '%s' to bam." % (sam)) + logging.error(e) + msg = f'Problem converting SAM file {sam} to BAM' + logging.error(msg) + raise RuntimeError(msg) + # raise RuntimeError("Problem converting sam file '%s' to bam." % (sam)) def preprocess_sam(sam_files, datasets, use_cb_tag, @@ -90,13 +94,18 @@ def preprocess_sam(sam_files, datasets, use_cb_tag, sorted_bam = tmp_dir + "merged_sorted.bam" pysam.sort("-@", str(n_threads), "-o", sorted_bam, merged_bam) pysam.index(sorted_bam) - ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) - print("[ %s ] Merged input SAM/BAM files" % (ts)) + # ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) + # print("[ %s ] Merged input SAM/BAM files" % (ts)) + logging.info('Merged input SAM/BAM files') except: - raise RuntimeError(("Problem merging and indexing SAM/BAM files. " - "Check your file paths and make sure that all " - "files have headers.")) - + # raise RuntimeError(("Problem merging and indexing SAM/BAM files. " + # "Check your file paths and make sure that all " + # "files have headers.")) + msg = "Problem merging and indexing SAM/BAM files. "+\ + "Check your file paths and make sure that all "+\ + "files have headers." + logging.error(msg) + raise RuntimeError(msg) return sorted_bam @@ -117,8 +126,11 @@ def partition_reads(sam_files, datasets, use_cb_tag, try: gr = pr.read_bam(merged_bam) except Exception as e: - print(e) - raise RuntimeError("Problem opening sam file %s" % (merged_bam)) + # print(e) + logging.error(e) + msg = f'Problem opening SAM file {merged_bam}' + logging.error(msg) + raise RuntimeError(msg) gr = gr.merge(slack=100000000, strand=False) diff --git a/src/talon/talon.py b/src/talon/talon.py index 9a14db0..d85dace 100644 --- a/src/talon/talon.py +++ b/src/talon/talon.py @@ -13,11 +13,14 @@ from pathlib import Path import pandas as pd import warnings +import logging + from . import dstruct from . 
import process_sams as procsams from . import transcript_utils as tutils from . import query_utils as qutils from . import init_refs as init_refs +from . import logger as logger from talon.post import get_read_annotations import pysam from string import Template @@ -120,6 +123,8 @@ def get_args(): parser.add_argument("--tmpDir", dest="tmp_dir", help="Path to directory for tmp files. Default = `talon_tmp/`", type=str, default="talon_tmp/") + parser.add_argument("--verbosity", "-v", type=int, default=1, + help="Verbosity of TALON output. Higher numbers = more verbose.") parser.add_argument("--o", dest="outprefix", help="Prefix for output files", type=str) @@ -294,9 +299,13 @@ def permissive_match_with_gene_priority(chromosome, position, strand, sj_pos, """ # Check inputs if pos_type != "start" and pos_type != "end": - raise ValueError("Please set pos_type to either 'start' or 'end'.") + msg = "Please set pos_type to either 'start' or 'end'." + logging.error(msg) + raise ValueError(msg) if strand != "+" and strand != "-": - raise ValueError("Invalid strand specified: %s" % strand) + msg = f'Invalid strand specified: {strand}' + logging.error(msg) + raise ValueError(msg) # Try exact match first if chromosome in locations and position in locations[chromosome]: @@ -365,9 +374,13 @@ def permissive_vertex_search(chromosome, position, strand, sj_pos, pos_type, return match['location_ID'], dist if pos_type != "start" and pos_type != "end": - raise ValueError("Please set pos_type to either 'start' or 'end'.") + msg = "Please set pos_type to either 'start' or 'end'." + logging.error(msg) + raise ValueError(msg) if strand != "+" and strand != "-": - raise ValueError("Invalid strand specified: %s" % strand) + msg = f"Invalid strand specified: {s}" + logging.error(msg) + raise ValueError(msg) # If there is no strict match, look for vertices that are # (1) On the correct chromosome @@ -627,28 +640,14 @@ def search_for_overlap_with_gene(chromosome, start, end, strand, Parameters: gene_ID (list of str or None): Restrict results to genes in this list """ - print('in search for overlap with gene') + logging.debug('Tiebreaking for gene assignment') min_start = min(start, end) max_end = max(start, end) query_interval = [min_start, max_end] - # print('query interval') - # print(query_interval) - - # query = Template(""" SELECT gene_ID, - # chromosome, - # MIN(start) AS start, - # MAX(end) AS end, - # strand - # FROM $tmp_gene - # WHERE (chromosome = '$chrom') AND - # ((start <= $min_start AND end >= $max_end) OR - # (start >= $min_start AND end <= $max_end) OR - # (start >= $min_start AND start <= $max_end) OR - # (end >= $min_start AND end <= $max_end)) - # GROUP BY gene_ID;""").substitute({'tmp_gene': tmp_gene, 'chrom': chromosome, - # 'min_start': min_start, 'max_end': max_end}) + if isinstance(gene_IDs, list): query = Template("""SELECT gene_ID, + transcript_ID, chromosome, min_pos, max_pos, @@ -658,6 +657,7 @@ def search_for_overlap_with_gene(chromosome, start, end, strand, 'gene_ids': qutils.format_for_IN(gene_IDs)}) elif not gene_IDs: query = Template("""SELECT gene_ID, + transcript_ID, chromosome, min_pos, max_pos, @@ -672,47 +672,33 @@ def search_for_overlap_with_gene(chromosome, start, end, strand, 'min_start': min_start, 'max_end': max_end}) cursor.execute(query) matches = cursor.fetchall() - print('quwewy:') - print(query) # restrict to just the genes we care about if gene_IDs: - print(f'restricting just to {gene_IDs}') + # print(f'restricting just to {gene_IDs}') + logging.debug(f'Restricing gene 
tiebreak to {gene_IDs}') matches = [match for match in matches if match['gene_ID'] in gene_IDs] if len(matches) == 0: - print('herere here') + # print('herere here') + logging.debug(f'Unable to tiebreak') return None, None # Among multiple matches, preferentially return the same-strand gene with # the greatest amount of overlap - # print('start+end') - # print(start) - # print(end) same_strand_matches = len([x for x in matches if x["strand"] == strand]) - # for m in matches: - # print() - # print(m['gene_ID']) - # print(m['start']) - # print(m['end']) - # - # print(same_strand_matches) if strand == "+" and same_strand_matches > 0 or \ strand == "-" and same_strand_matches == 0: matches = [x for x in matches if x["strand"] == "+"] - # best_match = get_best_match(matches, query_interval) best_match = get_best_match(matches, min_start, max_end) else: matches = [x for x in matches if x["strand"] == "-"] - # best_match = get_best_match(matches, query_interval) best_match = get_best_match(matches, min_start, max_end) - print(f"but right here it says {best_match['gene_ID']}") - return best_match['gene_ID'], best_match['strand'] def get_best_match(matches, min_end, max_end): @@ -724,25 +710,24 @@ def get_best_match(matches, min_end, max_end): min_dist = sys.maxsize best_match = None - print(f'read min: {min_end}') - print(f'read end: {max_end}') + # print(f'read min: {min_end}') + # print(f'read end: {max_end}') + logging.debug(f'Read start / end: ({min_end}, {min_end})') for match in matches: - print() - print(f"gene: {match['gene_ID']}") + logging.debug(f"Matching with transcripts from gene {match['gene_ID']}, transcript {match['transcript_ID']}") end_dist = abs(match['max_pos']-max_end) start_dist = abs(match['min_pos']-min_end) - print(f"gene start: {match['min_pos']}") - print(f"gene end: {match['max_pos']}") + logging.debug(f"Transcript start / end: ({match['min_pos']}, {match['max_pos']})") dist = end_dist+start_dist - print(f'dist: {dist}') + logging.debug(f'Distance between read and transcript ends: {dist}') if dist < min_dist: min_dist = dist best_match = match - print('best match') - print(best_match['gene_ID']) + logging.debug(f"Best gene match: {best_match['gene_ID']}") + # print(best_match['gene_ID']) return best_match @@ -1747,7 +1732,9 @@ def check_inputs(options): # Make sure that the input database exists! database = options.database if not Path(database).exists(): - raise ValueError("Database file '%s' does not exist!" % database) + msg = f"Database file '{s}' does not exist!" + logging.error(msg) + raise ValueError(msg) # Make sure that the genome build exists in the provided TALON database. with sqlite3.connect(database) as conn: @@ -1756,8 +1743,10 @@ def check_inputs(options): builds = [str(x[0]) for x in cursor.fetchall()] if options.build not in builds: build_names = ", ".join(list(builds)) - raise ValueError("Please specify a genome build that exists in the" + - " database. The choices are: " + build_names) + msg = "Please specify a genome build that exists in the" +\ + " database. The choices are: " + build_names + logging.error(msg) + raise ValueError(msg) # Make sure that each input dataset is not already in the database, and # also make sure that each dataset name is unique @@ -1774,33 +1763,37 @@ def check_inputs(options): line = line.strip().split(',') curr_sam = line[3] if len(line) != 4: - raise ValueError('Incorrect number of comma-separated fields' + - ' in config file. 
There should be four: ' + - '(dataset name, sample description, ' + - 'platform, associated sam/bam file).') + msg = 'Incorrect number of comma-separated fields' +\ + ' in config file. There should be four: ' +\ + '(dataset name, sample description, ' +\ + 'platform, associated sam/bam file).' + logging.error(msg) + raise ValueError(msg) # Make sure that the sam file exists if not Path(curr_sam).exists(): - raise ValueError( - "SAM/BAM file '%s' does not exist!" % curr_sam) + msg = f"SAM/BAM file '{curr_sam}' does not exist!" + logging.error(msg) + raise ValueError(msg) metadata = (line[0], line[1], line[2]) dataname = metadata[0] if dataname in existing_datasets: - warnings.warn("Ignoring dataset with name '" + dataname + + logging.warning("Ignoring dataset with name '" + dataname + "' because it is already in the database.") elif dataname in curr_datasets: - warnings.warn("Skipping duplicated instance of dataset '" + + logging.warning("Skipping duplicated instance of dataset '" + dataname + "'.") elif curr_sam in sam_files: - warnings.warn("Skipping duplicated instance of sam file '" + + logging.warning("Skipping duplicated instance of sam file '" + curr_sam + "'.") else: dataset_metadata.append(metadata) curr_datasets.append(dataname) if not curr_sam.endswith(".sam") and not curr_sam.endswith(".bam"): - raise ValueError( - 'Last field in config file must be a .sam/.bam file') + msg = 'Last field in config file must be a .sam/.bam file' + logging.error(msg) + raise ValueError(msg) sam_files.append(curr_sam) # if we are using the RG tag, check that the config file adheres to the @@ -1825,15 +1818,18 @@ def check_inputs(options): line = line.strip().split(',') curr_sam = line[2] if len(line) != 3: - raise ValueError('Incorrect number of comma-separated fields' + - ' in config file. There should be three: ' + - '(sample description, ' + - 'platform, associated sam/bam file).') + msg = 'Incorrect number of comma-separated fields' +\ + ' in config file. There should be three: ' +\ + '(sample description, ' +\ + 'platform, associated sam/bam file).' + logging.error(msg) + raise ValueError(msg) # Make sure that the sam file exists if not Path(curr_sam).exists(): - raise ValueError( - "SAM/BAM file '%s' does not exist!" % curr_sam) + msg = f"SAM/BAM file '{curr_sam}' does not exist!" + logging.error(msg) + raise ValueError(msg) metadata = ['', line[0], line[1]] # get list of dataset names from the CB tag in the sam file @@ -1854,8 +1850,9 @@ def check_inputs(options): names=['cb_tag'], engine='python') # is the df empty? if df.empty: - raise RuntimeError( - "SAM/BAM file contains no CB tags") + msg = 'SAM/BAM file contains no CB tags' + logging.error(msg) + raise RuntimeError(msg) df['dataset'] = df.cb_tag.str.split( pat='\t', n=1, expand=True)[0] datasets = df.dataset.unique().tolist() @@ -1876,24 +1873,29 @@ def check_inputs(options): metadata[0] = dataname if dataname in existing_datasets: - raise RuntimeError((f"Dataset for read group {f} " + - "already in database.")) + msg = f"Dataset for read group {f} " +\ + "already in database." + logging.error(msg) + raise RuntimeError(msg) # warnings.warn("Ignoring dataset with name '" + dataname + \ # "' because it is already in the database.") elif dataname in curr_datasets: - raise RuntimeError((f"Dataset for read group {f} " + - "already in config file.")) + msg = f"Dataset for read group {f} " +\ + "already in config file." 
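The verbosity handling introduced with the new logger module maps the integer CLI flag onto logging levels and caps the index, so any --verbosity above 2 still selects DEBUG, while the default of 1 gives INFO. A compact illustration of that mapping:

    import logging

    levels = [logging.WARNING, logging.INFO, logging.DEBUG]
    for verbosity in range(5):
        level = levels[min(verbosity, len(levels) - 1)]  # cap at the last index
        print(verbosity, logging.getLevelName(level))
    # 0 WARNING / 1 INFO / 2 DEBUG / 3 DEBUG / 4 DEBUG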
+ logging.error(msg) + raise RuntimeError(msg) # warnings.warn("Skipping duplicated instance of dataset '" + \ # dataname + "'.") else: dataset_metadata.append(tuple(metadata)) curr_datasets.append(dataname) if curr_sam in sam_files: - warnings.warn("Skipping duplicated instance of sam/bam file '" + + logging.warning("Skipping duplicated instance of sam/bam file '" + curr_sam + "'.") if not curr_sam.endswith(".sam") and not curr_sam.endswith(".bam"): - raise ValueError( - 'Last field in config file must be a .sam/.bam file') + msg = 'Last field in config file must be a .sam/.bam file' + logging.error(msg) + raise ValueError(msg) sam_files.append(curr_sam) # else: @@ -1904,8 +1906,10 @@ def check_inputs(options): # sam_files.append(curr_sam) if sam_files == []: - raise RuntimeError(("All of the provided dataset names are already in " - "the database. Please check your config file.")) + msg = "All of the provided dataset names are already in "+\ + "the database. Please check your config file." + logging.error(msg) + raise RuntimeError(msg) return sam_files, dataset_metadata @@ -2062,7 +2066,9 @@ def compute_delta(orig_pos, new_pos, strand): else: return -1*abs_dist else: - raise ValueError("Strand must be either + or -") + msg = 'Strand must be either + or -' + logging.error(msg) + raise ValueError(msg) def identify_monoexon_transcript(chrom, positions, strand, cursor, location_dict, @@ -2445,8 +2451,10 @@ def batch_add_annotations(cursor, annot_file, annot_type, batch_size): """ batch_size = 1 if annot_type not in ["gene", "transcript", "exon"]: - raise ValueError("When running batch annot update, must specify " + - "annot_type as 'gene', 'exon', or 'transcript'.") + msg = "When running batch annot update, must specify " +\ + "annot_type as 'gene', 'exon', or 'transcript'." + logging.error(msg) + raise ValueError(msg) with open(annot_file, 'r') as f: while True: @@ -2566,8 +2574,9 @@ def batch_add_abundance(cursor, entries, batch_size): def check_database_integrity(cursor): """ Perform some checks on the database. Run before committing changes""" - ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) - print("[ %s ] Validating database........" % (ts)) + # ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) + # print("[ %s ] Validating database........" % (ts)) + logging.info('Validating database') # For each category, check that the number of table entries matches the counter counter_query = "SELECT * FROM counters" @@ -2588,15 +2597,17 @@ def check_database_integrity(cursor): if actual_count != curr_counter: fail = 1 - print("Database counter for '" + table_name + + logging.error("Database counter for '" + table_name + "' does not match the number of entries in the table." + " Discarding changes to database and exiting...") - print("table_count: " + str(actual_count)) - print("counter_value: " + str(curr_counter)) + logging.debug("table_count: " + str(actual_count)) + logging.debug("counter_value: " + str(curr_counter)) if fail == 1: - raise RuntimeError("Discrepancy found in database. " + - "Discarding changes to database and exiting...") + msg = "Discrepancy found in database. " +\ + "Discarding changes to database and exiting..." + logging.error(msg) + raise RuntimeError(msg) return @@ -2609,9 +2620,10 @@ def parallel_talon(read_file, interval, database, run_info, queue): added to the database, OR alternately, pickle them and write to file where they can be accessed later. 
""" - ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) - print("[ %s ] Annotating reads in interval %s:%d-%d..." % - (ts, interval[0], interval[1], interval[2])) + # ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) + # print("[ %s ] Annotating reads in interval %s:%d-%d..." % + # (ts, interval[0], interval[1], interval[2])) + logging.info(f'Annotating reads in interval {interval[0]}:{interval[1]}-{interval[2]}...') with sqlite3.connect(database) as conn: conn.row_factory = sqlite3.Row @@ -2887,8 +2899,9 @@ def listener(queue, outfiles, QC_header, timeout=72): msg_fname = msg[0] msg_value = msg[1] if datetime.now() > wait_until or msg_value == 'complete': - ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) - print("[ %s ] Shutting down message queue..." % (ts)) + # ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) + # print("[ %s ] Shutting down message queue..." % (ts)) + logging.info('Shutting down message queue...') for f in open_files.values(): f.close() break @@ -2914,10 +2927,14 @@ def make_QC_header(coverage, identity, length): def main(): """ Runs program """ - ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) - print("[ %s ] Started TALON run" % (ts)) - options = get_args() + logger._init_logger(options.verbosity) + + # ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) + # print("[ %s ] Started TALON run" % (ts)) + logging.info('Started TALON run') + + sam_files, dset_metadata = check_inputs(options) # print(sam_files) # print(dset_metadata[:5]) @@ -2969,8 +2986,8 @@ def main(): read_files = procsams.write_reads_to_file( read_groups, intervals, header_file, tmp_dir=tmp_dir) - ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) - print("[ %s ] Split reads into %d intervals" % (ts, len(read_groups))) + # ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) + logging.info(f'Split reads into {len(read_groups)} intervals') # Set up a queue specifically for writing to outfiles manager = mp.Manager() @@ -2981,8 +2998,9 @@ def main(): for read_file, interval in zip(read_files, intervals): jobs.append((read_file, interval, database, run_info, queue)) - ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) - print("[ %s ] Launching parallel annotation jobs" % (ts)) + # ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) + # print("[ %s ] Launching parallel annotation jobs" % (ts)) + logging.info('Launching parallel annotation jobs') # Start running listener, which will monitor queue for messages QC_header = make_QC_header(run_info.min_coverage, run_info.min_identity, @@ -2998,19 +3016,22 @@ def main(): pool.close() pool.join() - ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) - print("[ %s ] All jobs complete. Starting database update." % (ts)) + # ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) + # print("[ %s ] All jobs complete. Starting database update." % (ts)) + logging.info('All jobs complete. Starting database update') # Update the database batch_size = 10000 update_database(database, batch_size, run_info.outfiles, dataset_db_entries) - ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) - print("[ %s ] Database update complete." % (ts)) + # ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) + # print("[ %s ] Database update complete." % (ts)) + logging.info('Database update complete.') # Write output reads file - ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) - print("[ %s ] Creating read-wise annotation file." 
% (ts)) + # ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) + # print("[ %s ] Creating read-wise annotation file." % (ts)) + logging.info('Creating read-wise annotation file') get_read_annotations.make_read_annot_file(database, build, outprefix, datasets=datasets) @@ -3019,8 +3040,9 @@ def main(): #print("Transcripts: %d" % transcript_counter.value()) #print("Observed: %d" % observed_counter.value()) - ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) - print("[ %s ] DONE" % (ts)) + # ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) + # print("[ %s ] DONE" % (ts)) + logging.info('DONE') if __name__ == '__main__': From da547a1cd3d4c704eede27e384db77958555448c Mon Sep 17 00:00:00 2001 From: fairliereese Date: Mon, 9 Oct 2023 10:50:01 -0700 Subject: [PATCH 24/31] formatting changes --- src/talon/dstruct.py | 7 +- src/talon/edge.py | 125 +- src/talon/gene.py | 136 +- src/talon/init_refs.py | 233 +- src/talon/initialize_talon_database.py | 518 ++-- src/talon/length_utils.py | 34 +- src/talon/logger.py | 4 +- src/talon/post/ab_utils.py | 64 +- src/talon/post/call_longest_ends.py | 293 ++- .../create_GTF_abundance_from_database.py | 111 +- src/talon/post/create_GTF_from_database.py | 326 ++- .../create_abundance_file_from_database.py | 165 +- .../post/create_anndata_from_database.py | 254 +- src/talon/post/filter_talon_transcripts.py | 357 +-- src/talon/post/generate_talon_report.py | 6 +- src/talon/post/get_read_annotations.py | 395 ++- src/talon/post/get_transcript_sjs.py | 712 ++--- .../post/map_antisense_genes_to_sense.py | 50 +- src/talon/post/post_utils.py | 22 +- src/talon/post/summarize_datasets.py | 113 +- src/talon/process_sams.py | 90 +- src/talon/query_utils.py | 304 ++- src/talon/reformat_gtf.py | 300 ++- src/talon/talon.py | 2339 +++++++++-------- src/talon/talon_label_reads.py | 234 +- src/talon/transcript.py | 253 +- src/talon/transcript_utils.py | 179 +- 27 files changed, 4194 insertions(+), 3430 deletions(-) diff --git a/src/talon/dstruct.py b/src/talon/dstruct.py index da32d7e..b922922 100644 --- a/src/talon/dstruct.py +++ b/src/talon/dstruct.py @@ -3,10 +3,11 @@ class Struct(dict): Make a dict behave as a struct. Example: - + test = Struct(a=1, b=2, c=3) """ - def __init__(self,**kw): - dict.__init__(self,kw) + + def __init__(self, **kw): + dict.__init__(self, kw) self.__dict__ = self diff --git a/src/talon/edge.py b/src/talon/edge.py index 0fccecb..8f2ebd9 100644 --- a/src/talon/edge.py +++ b/src/talon/edge.py @@ -1,29 +1,29 @@ # TALON: Techonology-Agnostic Long Read Analysis Pipeline # Author: Dana Wyman -#------------------------------------------------------------------------------ +# ------------------------------------------------------------------------------ + class Edge(object): """Stores information about an edge, including its location - and the gene/transcript(s) it belongs to. - Attributes: - identifier: Accession ID of the edge - gene: Accession ID of the gene that the edge belongs to - transcript_ids: Set of transcript accession IDs that the edge - belongs to - chromosome: Chromosome that the transcript is located on - (format "chr1") - start: The start position of the edge with respect to the - forward strand - end: The end position of the edge with respect to the - forward strand - strand: "+" if the edge is on the forward strand, and "-" if - it is on the reverse strand - - length: The length of the edge + and the gene/transcript(s) it belongs to. 
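The Struct class shown above makes attribute access and key access interchangeable by aliasing the instance's __dict__ to itself. A short runnable example, repeating the class so the snippet stands alone:

    class Struct(dict):
        def __init__(self, **kw):
            dict.__init__(self, kw)
            self.__dict__ = self  # attribute and key access share storage

    test = Struct(a=1, b=2, c=3)
    print(test.a, test['a'])  # 1 1
    test.d = 4                # attribute writes also add dict keys
    print(test['d'])          # 4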
+ Attributes: + identifier: Accession ID of the edge + gene: Accession ID of the gene that the edge belongs to + transcript_ids: Set of transcript accession IDs that the edge + belongs to + chromosome: Chromosome that the transcript is located on + (format "chr1") + start: The start position of the edge with respect to the + forward strand + end: The end position of the edge with respect to the + forward strand + strand: "+" if the edge is on the forward strand, and "-" if + it is on the reverse strand + + length: The length of the edge """ - def __init__(self, identifier, chromosome, start, end, strand, gene_id, - transcript_id, annotations): + def __init__(self, identifier, chromosome, start, end, strand, gene_id, transcript_id, annotations): self.chromosome = str(chromosome) self.gene_id = gene_id self.start = int(start) @@ -40,27 +40,27 @@ def __init__(self, identifier, chromosome, start, end, strand, gene_id, self.v2 = None def print_edge(self): - """ Prints a string representation of the edge""" - print(self.identifier + ": " + self.chromosome + ":" + \ - str(self.start) + "-" + str(self.end)) + """Prints a string representation of the edge""" + print(self.identifier + ": " + self.chromosome + ":" + str(self.start) + "-" + str(self.end)) print(self.transcript_ids) return + def create_edge_from_gtf(edge_info): - """ Creates an edge object using information from a GTF entry - Args: - edge_info: A list containing fields from a GTF file edge entry. - Example: - ['chr1', 'HAVANA', 'exon', '11869', '12227', '.', '+', '.', - 'gene_id "ENSG00000223972.5"; transcript_id "ENST00000456328.2"; - gene_type "transcribed_unprocessed_pseudogene"; - gene_status "KNOWN"; gene_name "DDX11L1"; - transcript_type "processed_transcript"; - transcript_status "KNOWN"; transcript_name "DDX11L1-002"; - edge_number 1; edge_id "ENSE00002234944.1"; level 2; - tag "basic"; transcript_support_level "1"; - havana_gene "OTTHUMG00000000961.2"; - havana_transcript "OTTHUMT00000362751.1";'] + """Creates an edge object using information from a GTF entry + Args: + edge_info: A list containing fields from a GTF file edge entry. 
+ Example: + ['chr1', 'HAVANA', 'exon', '11869', '12227', '.', '+', '.', + 'gene_id "ENSG00000223972.5"; transcript_id "ENST00000456328.2"; + gene_type "transcribed_unprocessed_pseudogene"; + gene_status "KNOWN"; gene_name "DDX11L1"; + transcript_type "processed_transcript"; + transcript_status "KNOWN"; transcript_name "DDX11L1-002"; + edge_number 1; edge_id "ENSE00002234944.1"; level 2; + tag "basic"; transcript_support_level "1"; + havana_gene "OTTHUMG00000000961.2"; + havana_transcript "OTTHUMT00000362751.1";'] """ description = edge_info[-1] start = int(edge_info[3]) @@ -71,38 +71,39 @@ def create_edge_from_gtf(edge_info): annotations = extract_edge_annotations_from_GTF(edge_info) if "exon_id" not in annotations: annotations["exon_id"] = "_".join([chromosome, str(start), str(end), strand]) - gene_id = annotations['gene_id'] - transcript_id = annotations['transcript_id'] + gene_id = annotations["gene_id"] + transcript_id = annotations["transcript_id"] edge_id = "_".join([chromosome, str(start), str(end), strand]) if "gene_id" in description: gene_id = (description.split("gene_id ")[1]).split('"')[1] if "transcript_id" in description: transcript_id = (description.split("transcript_id ")[1]).split('"')[1] - - edge = Edge(edge_id, chromosome, start, end, strand, gene_id, transcript_id, - annotations) + + edge = Edge(edge_id, chromosome, start, end, strand, gene_id, transcript_id, annotations) return edge + def extract_edge_annotations_from_GTF(tab_fields): - """ Extracts key-value annotations from the GTF description field - """ + """Extracts key-value annotations from the GTF description field""" attributes = {} # remove trailing newline and split by semicolon - description = tab_fields[-1].strip('\n') - description = description.split(';') + description = tab_fields[-1].strip("\n") + description = description.split(";") # Parse description for fields in description: - if fields == "" or fields == " ": continue + if fields == "" or fields == " ": + continue fields = fields.split() - if fields[0] == '': fields = fields[1:] + if fields[0] == "": + fields = fields[1:] + + key = fields[0].replace('"', "") + val = " ".join(fields[1:]).replace('"', "") - key = fields[0].replace('"', '') - val = ' '.join(fields[1:]).replace('"', '') - attributes[key] = val # Put in placeholders for important attributes (such as gene_id) if they @@ -116,29 +117,29 @@ def extract_edge_annotations_from_GTF(tab_fields): return attributes + def get_edge_from_db(vertex_info_1, vertex_info_2): - """ Uses information from a database edge entry to create an edge object. 
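# Usage sketch for create_edge_from_gtf() above: it expects the nine
# tab-separated fields of a GTF exon line, pre-split by the caller. The
# entry reuses the docstring's example values; the import path is assumed:
from talon.edge import create_edge_from_gtf

gtf_fields = ("chr1\tHAVANA\texon\t11869\t12227\t.\t+\t.\t"
              'gene_id "ENSG00000223972.5"; transcript_id "ENST00000456328.2";'
              ).split("\t")
edge = create_edge_from_gtf(gtf_fields)
print(edge.identifier, edge.gene_id)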
- """ + """Uses information from a database edge entry to create an edge object.""" if vertex_info_1["edge_id"] != vertex_info_2["edge_id"]: - raise ValueError('Tried to create edge from endpoints with different IDs') + raise ValueError("Tried to create edge from endpoints with different IDs") edge_id = vertex_info_1["edge_id"] - chromosome = vertex_info_1['chromosome'] - start = min(vertex_info_1['position'], vertex_info_2['position']) - end = max(vertex_info_1['position'], vertex_info_2['position']) - strand = vertex_info_1['strand'] - gene_id = vertex_info_1['gene_id'] + chromosome = vertex_info_1["chromosome"] + start = min(vertex_info_1["position"], vertex_info_2["position"]) + end = max(vertex_info_1["position"], vertex_info_2["position"]) + strand = vertex_info_1["strand"] + gene_id = vertex_info_1["gene_id"] edge = Edge(edge_id, chromosome, start, end, strand, gene_id, None, None) edge.v1 = str(vertex_info_1["vertex_ID"]) edge.v2 = str(vertex_info_2["vertex_ID"]) return edge + def create_novel_edge(chromosome, start, end, strand, gene_id, transcript_id, counter): - """ Creates a novel edge with a unique identifier (obtained using - counter). Returns the edge object as well as the updated counter. + """Creates a novel edge with a unique identifier (obtained using + counter). Returns the edge object as well as the updated counter. """ counter["edges"] += 1 curr_novel = counter["edges"] - edge = Edge(curr_novel, chromosome, start, end, strand, gene_id, transcript_id, - None) + edge = Edge(curr_novel, chromosome, start, end, strand, gene_id, transcript_id, None) return edge diff --git a/src/talon/gene.py b/src/talon/gene.py index 52121ea..1f8188f 100644 --- a/src/talon/gene.py +++ b/src/talon/gene.py @@ -1,23 +1,24 @@ # TALON: Techonology-Agnostic Long Read Analysis Pipeline # Author: Dana Wyman -#------------------------------------------------------------------------------ +# ------------------------------------------------------------------------------ + class Gene(object): - """ Contains high-level information about a gene, such as its identifiers, - genomic location, and transcripts. Does not contain exon information. - Attributes: - - identifier: Accession ID of gene, i.e. an Ensembl ID. Required. - - name: Human-readable name of the gene. This attribute can be left - empty if the gene does not have an assigned name. - - chromosome: Chromosome that the gene is located on (format "chr1") - - start: The start position of the gene with respect to the forward - strand (int). Should always be less than or equal to end. - - end: The end position of the gene with respect to the forward strand - (int). Should always be greater than or equal to start. - - strand: "+" if the gene is on the forward strand, "-" if it is on - the reverse strand - - annotations: a dictionary of miscellaneous annotation categories - extracted from a GTF + """Contains high-level information about a gene, such as its identifiers, + genomic location, and transcripts. Does not contain exon information. + Attributes: + - identifier: Accession ID of gene, i.e. an Ensembl ID. Required. + - name: Human-readable name of the gene. This attribute can be left + empty if the gene does not have an assigned name. + - chromosome: Chromosome that the gene is located on (format "chr1") + - start: The start position of the gene with respect to the forward + strand (int). Should always be less than or equal to end. + - end: The end position of the gene with respect to the forward strand + (int). 
Should always be greater than or equal to start. + - strand: "+" if the gene is on the forward strand, "-" if it is on + the reverse strand + - annotations: a dictionary of miscellaneous annotation categories + extracted from a GTF """ @@ -35,51 +36,48 @@ def __init__(self, identifier, chromosome, start, end, strand, annotations): self.annotations = annotations if start > end: - raise ValueError("""Plus strand gene start must be less than or - equal to end.""") + raise ValueError( + """Plus strand gene start must be less than or + equal to end.""" + ) def set_name(self, name): - """ Sets the name attribute of the Gene to the provided value. - """ - self.annotations['name'] = name + """Sets the name attribute of the Gene to the provided value.""" + self.annotations["name"] = name return def add_transcript(self, transcript): - """ Adds a key-value pair (transcript identifier -> Transcript oject) - to the gene's transcript dictionary - Args: - transcript: object of type Transcript. Must overlap with the - location of the gene. + """Adds a key-value pair (transcript identifier -> Transcript oject) + to the gene's transcript dictionary + Args: + transcript: object of type Transcript. Must overlap with the + location of the gene. """ if transcript.start >= self.end or transcript.end <= self.start: - # only throw the error if we have a multi-bp transcript if transcript.start != transcript.end: transcript_id = transcript.identifier gene_id = transcript.gene_id - raise ValueError(f'Transcript ({transcript_id}) must overlap the gene ({gene_id}) it is assigned to') + raise ValueError(f"Transcript ({transcript_id}) must overlap the gene ({gene_id}) it is assigned to") if transcript.gene_id == self.identifier: # In order to belong to a gene, the transcript gene_id must match transcript_id = transcript.identifier self.transcripts[transcript_id] = transcript else: - raise ValueError('Gene ID of transcript must match gene ' + \ - 'in order for assignment to be made.') + raise ValueError("Gene ID of transcript must match gene " + "in order for assignment to be made.") return - def print_gene(self): - """ Print a string representation of the Gene. Good for debugging. """ + """Print a string representation of the Gene. Good for debugging.""" if "name" in self.annotations != "": # Include name in output if there is one - print(self.identifier + " (" + self.annotations['name'] + "):") + print(self.identifier + " (" + self.annotations["name"] + "):") else: print(self.identifier + ":") - print("\tLocation: " + self.chromosome + ":" + str(self.start) + "-" + \ - str(self.end) + "(" + self.strand + ")") + print("\tLocation: " + self.chromosome + ":" + str(self.start) + "-" + str(self.end) + "(" + self.strand + ")") # Print transcripts in shorthand for transcript in self.transcripts: @@ -87,37 +85,38 @@ def print_gene(self): return + def get_gene_from_db(gene_start_row, gene_end_row): - """ Uses information from a database gene entry to create a + """Uses information from a database gene entry to create a Gene object. 
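# A short sketch of the constraints Gene() above enforces (IDs are
# hypothetical; the import path assumes this package layout):
from talon.gene import Gene

gene = Gene("gene1", "chr1", 1000, 5000, "+", {})
gene.set_name("MYGENE")

# The constructor rejects coordinates where start > end:
try:
    Gene("gene2", "chr1", 5000, 1000, "+", {})
except ValueError as err:
    print(err)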
Args: gene_row: Tuple-formatted row from 'genes' table of a TALON database """ - if gene_start_row['gene_id'] != gene_end_row['gene_id']: - raise ValueError("get_gene_from_db: provided start and stop " + \ - "come from different genes") - gene_id = gene_start_row['gene_ID'] - chromosome = gene_start_row['chromosome'] + if gene_start_row["gene_id"] != gene_end_row["gene_id"]: + raise ValueError("get_gene_from_db: provided start and stop " + "come from different genes") + gene_id = gene_start_row["gene_ID"] + chromosome = gene_start_row["chromosome"] start = gene_start_row[2] end = gene_end_row[2] - strand = gene_start_row['strand'] + strand = gene_start_row["strand"] - #transcripts = {} #gene_row['transcript_ids'].split(",") + # transcripts = {} #gene_row['transcript_ids'].split(",") gene = Gene(gene_id, chromosome, start, end, strand, {}) return gene + def get_gene_from_gtf(gene_info): - """ Creates a Gene object from a GTF file entry - Args: - gene_info: A list containing fields from a GTF file gene entry. - Example: - ['chr1', 'HAVANA', 'gene', '11869', '14409', '.', '+', '.', - 'gene_id "ENSG00000223972.5"; - gene_type "transcribed_unprocessed_pseudogene"; - gene_status "KNOWN"; gene_name "DDX11L1"; level 2; - havana_gene "OTTHUMG00000000961.2";'] + """Creates a Gene object from a GTF file entry + Args: + gene_info: A list containing fields from a GTF file gene entry. + Example: + ['chr1', 'HAVANA', 'gene', '11869', '14409', '.', '+', '.', + 'gene_id "ENSG00000223972.5"; + gene_type "transcribed_unprocessed_pseudogene"; + gene_status "KNOWN"; gene_name "DDX11L1"; level 2; + havana_gene "OTTHUMG00000000961.2";'] """ chromosome = gene_info[0] start = int(gene_info[3]) @@ -125,31 +124,34 @@ def get_gene_from_gtf(gene_info): strand = gene_info[6] annotations = extract_gene_annotations_from_GTF(gene_info) if "gene_id" not in gene_info[-1]: - raise ValueError('GTF entry lacks a gene_id field') - gene_id = annotations['gene_id'] + raise ValueError("GTF entry lacks a gene_id field") + gene_id = annotations["gene_id"] gene = Gene(gene_id, chromosome, start, end, strand, annotations) return gene + def extract_gene_annotations_from_GTF(tab_fields): """Parses the description field of a gene GTF in order to organize the - information therein into a dictionary. + information therein into a dictionary. """ attributes = {} # remove trailing newline and split by semicolon - description = tab_fields[-1].strip('\n') - description = description.split(';') + description = tab_fields[-1].strip("\n") + description = description.split(";") # Parse description for fields in description: - if fields == "" or fields == " ": continue + if fields == "" or fields == " ": + continue fields = fields.split() - if fields[0] == '': fields = fields[1:] + if fields[0] == "": + fields = fields[1:] - key = fields[0].replace('"', '') - val = ' '.join(fields[1:]).replace('"', '') + key = fields[0].replace('"', "") + val = " ".join(fields[1:]).replace('"', "") attributes[key] = val @@ -157,10 +159,11 @@ def extract_gene_annotations_from_GTF(tab_fields): return attributes + def get_gene_from_exon(exon, gene_id): - """ In rare cases, GTF exons are listed with gene and transcript IDs that - do not have corresponding entries. In this case, we create a gene - for this exon for bookkeeping purposes.""" + """In rare cases, GTF exons are listed with gene and transcript IDs that + do not have corresponding entries. 
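# Usage sketch for get_gene_from_gtf() above, reusing its docstring's
# example entry; like the edge parser, it takes a GTF line pre-split into
# its nine tab-separated fields (import path assumed):
from talon.gene import get_gene_from_gtf

gene_fields = ("chr1\tHAVANA\tgene\t11869\t14409\t.\t+\t.\t"
               'gene_id "ENSG00000223972.5"; gene_name "DDX11L1";').split("\t")
gene = get_gene_from_gtf(gene_fields)
print(gene.identifier, gene.chromosome, gene.start, gene.end, gene.strand)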
In this case, we create a gene + for this exon for bookkeeping purposes.""" gene_name = gene_id chromosome = exon.chromosome @@ -170,9 +173,10 @@ def get_gene_from_exon(exon, gene_id): gene = Gene(gene_id, gene_name, None, chromosome, start, end, strand) return gene + def create_novel_gene(chromosome, start, end, strand, counter): - """ Creates a novel gene with a unique identifier (obtained using - counter). Returns the gene object as well as the updated counter. + """Creates a novel gene with a unique identifier (obtained using + counter). Returns the gene object as well as the updated counter. """ gene_id = str(counter["genes"] + 1) counter["genes"] += 1 diff --git a/src/talon/init_refs.py b/src/talon/init_refs.py index df7f86f..735a174 100644 --- a/src/talon/init_refs.py +++ b/src/talon/init_refs.py @@ -13,21 +13,23 @@ # make_gene_start_and_end_dict from string import Template + import pandas as pd -def make_temp_novel_gene_table(cursor, build, chrom = None, start = None, - end = None, tmp_tab = "temp_gene"): - """ Attaches a temporary database with a table that has the following fields: - - gene_ID - - chromosome - - start - - end - - strand - The purpose is to track novel genes from this run in order to match - transcripts to them when other forms of gene assignment have failed. + +def make_temp_novel_gene_table(cursor, build, chrom=None, start=None, end=None, tmp_tab="temp_gene"): + """Attaches a temporary database with a table that has the following fields: + - gene_ID + - chromosome + - start + - end + - strand + The purpose is to track novel genes from this run in order to match + transcripts to them when other forms of gene assignment have failed. """ if any(val == None for val in [chrom, start, end]): - command = Template(""" CREATE TEMPORARY TABLE IF NOT EXISTS $tmp_tab AS + command = Template( + """ CREATE TEMPORARY TABLE IF NOT EXISTS $tmp_tab AS SELECT gene_ID, chromosome, start, @@ -42,9 +44,11 @@ def make_temp_novel_gene_table(cursor, build, chrom = None, start = None, LEFT JOIN vertex as v ON g.gene_ID = v.gene_ID LEFT JOIN location as loc ON loc.location_ID = v.vertex_ID WHERE loc.genome_build = '$build' - GROUP BY g.gene_ID); """) + GROUP BY g.gene_ID); """ + ) else: - command = Template(""" CREATE TEMPORARY TABLE IF NOT EXISTS $tmp_tab AS + command = Template( + """ CREATE TEMPORARY TABLE IF NOT EXISTS $tmp_tab AS SELECT gene_ID, chromosome, start, @@ -64,29 +68,29 @@ def make_temp_novel_gene_table(cursor, build, chrom = None, start = None, AND ((start <= $start AND end >= $end) OR (start >= $start AND end <= $end) OR (start >= $start AND start <= $end) - OR (end >= $start AND end <= $end)); """) + OR (end >= $start AND end <= $end)); """ + ) - command = command.substitute({'tmp_tab':tmp_tab, 'build':build, 'chrom':chrom, - 'start':start, 'end':end}) + command = command.substitute({"tmp_tab": tmp_tab, "build": build, "chrom": chrom, "start": start, "end": end}) cursor.execute(command) return tmp_tab -def make_temp_transcript_table(cursor, build, chrom = None, - start = None, end = None, - tmp_tab = "temp_transcript"): - """ Attaches a temporary database with a table that has the following fields: - - gene_ID - - transcript_ID - - chromosome - - start (min position) - - end (max position) - - strand - The purpose is to allow location-based matching tiebreaking - transcripts. 
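# create_novel_gene() above draws sequential novel IDs from a shared mutable
# counter dict; create_novel_edge() in edge.py increments the same dict
# under its "edges" key. A sketch of the calling convention (only the
# counter side effect is shown, since the constructor call is elided above):
counter = {"genes": 0, "edges": 0}
create_novel_gene("chr1", 2000, 3000, "+", counter)
print(counter["genes"])  # -> 1
# Aside: get_gene_from_exon() above passes seven arguments to Gene(), whose
# constructor takes (identifier, chromosome, start, end, strand,
# annotations); that call would raise a TypeError if it were ever reached.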
""" + +def make_temp_transcript_table(cursor, build, chrom=None, start=None, end=None, tmp_tab="temp_transcript"): + """Attaches a temporary database with a table that has the following fields: + - gene_ID + - transcript_ID + - chromosome + - start (min position) + - end (max position) + - strand + The purpose is to allow location-based matching tiebreaking + transcripts.""" if any(val == None for val in [chrom, start, end]): - command = Template(""" CREATE TEMPORARY TABLE IF NOT EXISTS $tmp_tab AS + command = Template( + """ CREATE TEMPORARY TABLE IF NOT EXISTS $tmp_tab AS SELECT t.gene_ID, t.transcript_ID, loc1.chromosome, @@ -101,9 +105,11 @@ def make_temp_transcript_table(cursor, build, chrom = None, LEFT JOIN genes ON genes.gene_ID = t.gene_ID WHERE loc1.genome_build = '$build' - AND loc2.genome_build = '$build' """) + AND loc2.genome_build = '$build' """ + ) else: - command = Template(""" CREATE TEMPORARY TABLE IF NOT EXISTS $tmp_tab AS + command = Template( + """ CREATE TEMPORARY TABLE IF NOT EXISTS $tmp_tab AS SELECT t.gene_ID, t.transcript_ID, loc1.chromosome, @@ -124,30 +130,29 @@ def make_temp_transcript_table(cursor, build, chrom = None, AND ((min_pos <= $start AND max_pos >= $end) OR (min_pos >= $start AND max_pos <= $end) OR (min_pos >= $start AND min_pos <= $end) - OR (max_pos >= $start AND max_pos <= $end))""") + OR (max_pos >= $start AND max_pos <= $end))""" + ) - command = command.substitute({'build':build, 'chrom':chrom, - 'start':start, 'end':end, - 'tmp_tab':tmp_tab}) + command = command.substitute({"build": build, "chrom": chrom, "start": start, "end": end, "tmp_tab": tmp_tab}) cursor.execute(command) return tmp_tab -def make_temp_monoexonic_transcript_table(cursor, build, chrom = None, - start = None, end = None, - tmp_tab = "temp_monoexon"): - """ Attaches a temporary database with a table that has the following fields: - - gene_ID - - transcript_ID - - chromosome - - start (min position) - - end (max position) - - strand - The purpose is to allow location-based matching for monoexonic query - transcripts. 
""" + +def make_temp_monoexonic_transcript_table(cursor, build, chrom=None, start=None, end=None, tmp_tab="temp_monoexon"): + """Attaches a temporary database with a table that has the following fields: + - gene_ID + - transcript_ID + - chromosome + - start (min position) + - end (max position) + - strand + The purpose is to allow location-based matching for monoexonic query + transcripts.""" if any(val == None for val in [chrom, start, end]): - command = Template(""" CREATE TEMPORARY TABLE IF NOT EXISTS $tmp_tab AS + command = Template( + """ CREATE TEMPORARY TABLE IF NOT EXISTS $tmp_tab AS SELECT t.gene_ID, t.transcript_ID, loc1.chromosome, @@ -168,9 +173,11 @@ def make_temp_monoexonic_transcript_table(cursor, build, chrom = None, ON genes.gene_ID = t.gene_ID WHERE n_exons = 1 AND loc1.genome_build = '$build' - AND loc2.genome_build = '$build' """) + AND loc2.genome_build = '$build' """ + ) else: - command = Template(""" CREATE TEMPORARY TABLE IF NOT EXISTS $tmp_tab AS + command = Template( + """ CREATE TEMPORARY TABLE IF NOT EXISTS $tmp_tab AS SELECT t.gene_ID, t.transcript_ID, loc1.chromosome, @@ -196,35 +203,36 @@ def make_temp_monoexonic_transcript_table(cursor, build, chrom = None, AND ((min_pos <= $start AND max_pos >= $end) OR (min_pos >= $start AND max_pos <= $end) OR (min_pos >= $start AND min_pos <= $end) - OR (max_pos >= $start AND max_pos <= $end))""") + OR (max_pos >= $start AND max_pos <= $end))""" + ) - command = command.substitute({'build':build, 'chrom':chrom, - 'start':start, 'end':end, - 'tmp_tab':tmp_tab}) + command = command.substitute({"build": build, "chrom": chrom, "start": start, "end": end, "tmp_tab": tmp_tab}) cursor.execute(command) return tmp_tab -def make_location_dict(genome_build, cursor, chrom = None, start = None, end = None): - """ Format of dict: - chromosome -> dict(position -> SQLite3 row from location table) - old: - Key: chromosome, pos - Value: SQLite3 row from location table +def make_location_dict(genome_build, cursor, chrom=None, start=None, end=None): + """Format of dict: + chromosome -> dict(position -> SQLite3 row from location table) + + old: + Key: chromosome, pos + Value: SQLite3 row from location table """ location_dict = {} - if any(val == None for val in [chrom, start,end]): + if any(val == None for val in [chrom, start, end]): query = Template("""SELECT * FROM location WHERE genome_build = '$build' """) else: - query = Template("""SELECT * FROM location + query = Template( + """SELECT * FROM location WHERE genome_build = '$build' AND chromosome = '$chrom' AND position >= $start - AND position <= $end""") - query = query.substitute({'build':genome_build, 'chrom':chrom, - 'start':start, 'end':end}) + AND position <= $end""" + ) + query = query.substitute({"build": genome_build, "chrom": chrom, "start": start, "end": end}) cursor.execute(query) for location in cursor.fetchall(): chromosome = location["chromosome"] @@ -236,16 +244,18 @@ def make_location_dict(genome_build, cursor, chrom = None, start = None, end = N return location_dict -def make_edge_dict(cursor, build = None, chrom = None, start = None, end = None): - """ Format of dict: - Key: vertex1_vertex2_type - Value: SQLite3 row from edge table + +def make_edge_dict(cursor, build=None, chrom=None, start=None, end=None): + """Format of dict: + Key: vertex1_vertex2_type + Value: SQLite3 row from edge table """ edge_dict = {} if any(val == None for val in [chrom, start, end, build]): query = """SELECT * FROM edge""" else: - query = Template("""SELECT e.* + query = Template( + 
"""SELECT e.* FROM edge AS e LEFT JOIN location as loc1 ON e.v1 = loc1.location_ID LEFT JOIN location as loc2 ON e.v2 = loc2.location_ID @@ -253,9 +263,9 @@ def make_edge_dict(cursor, build = None, chrom = None, start = None, end = None) AND loc1.chromosome = "$chrom" AND (loc1.position >= $start AND loc1.position <= $end) AND (loc2.position >= $start AND loc2.position <= $end); - """) - query = query.substitute({'build':build, 'chrom':chrom, - 'start':start, 'end':end}) + """ + ) + query = query.substitute({"build": build, "chrom": chrom, "start": start, "end": end}) cursor.execute(query) for edge in cursor.fetchall(): vertex_1 = edge["v1"] @@ -266,14 +276,16 @@ def make_edge_dict(cursor, build = None, chrom = None, start = None, end = None) return edge_dict -def make_transcript_dict(cursor, build, chrom = None, start = None, end = None): - """ Format of dict: - Key: tuple consisting of edges in transcript path - Value: SQLite3 row from transcript table + +def make_transcript_dict(cursor, build, chrom=None, start=None, end=None): + """Format of dict: + Key: tuple consisting of edges in transcript path + Value: SQLite3 row from transcript table """ transcript_dict = {} if any(val == None for val in [chrom, start, end]): - query = Template("""SELECT t.*, + query = Template( + """SELECT t.*, loc1.chromosome as chromosome, loc1.position as start_pos, loc2.position as end_pos @@ -281,10 +293,12 @@ def make_transcript_dict(cursor, build, chrom = None, start = None, end = None): LEFT JOIN location as loc1 ON t.start_vertex = loc1.location_ID LEFT JOIN location as loc2 ON t.end_vertex = loc2.location_ID WHERE loc1.genome_build = '$build' AND loc2.genome_build = '$build'; - """) + """ + ) else: - query = Template("""SELECT t.*, + query = Template( + """SELECT t.*, loc1.chromosome as chrom, loc1.position as start_pos, loc2.position as end_pos, @@ -298,26 +312,25 @@ def make_transcript_dict(cursor, build, chrom = None, start = None, end = None): AND ((min_pos <= $start AND max_pos >= $end) OR (min_pos >= $start AND max_pos <= $end) OR (min_pos >= $start AND min_pos <= $end) - OR (max_pos >= $start AND max_pos <= $end))""") + OR (max_pos >= $start AND max_pos <= $end))""" + ) - query = query.substitute({'build':build, 'chrom':chrom, - 'start':start, 'end':end}) + query = query.substitute({"build": build, "chrom": chrom, "start": start, "end": end}) cursor.execute(query) for transcript in cursor.fetchall(): transcript_path = transcript["jn_path"] if transcript_path != None: - transcript_path = transcript_path.split(",") + \ - [transcript["start_exon"], transcript["end_exon"]] - transcript_path = frozenset([ int(x) for x in transcript_path]) + transcript_path = transcript_path.split(",") + [transcript["start_exon"], transcript["end_exon"]] + transcript_path = frozenset([int(x) for x in transcript_path]) else: transcript_path = frozenset([transcript["start_exon"]]) transcript_dict[transcript_path] = transcript return transcript_dict -def make_vertex_2_gene_dict(cursor, build = None, chrom = None, start = None, end = None): - """ Create a dictionary that maps vertices to the genes that they belong to. 
- """ + +def make_vertex_2_gene_dict(cursor, build=None, chrom=None, start=None, end=None): + """Create a dictionary that maps vertices to the genes that they belong to.""" vertex_2_gene = {} if any(val == None for val in [chrom, start, end, build]): query = """SELECT vertex_ID, @@ -326,7 +339,8 @@ def make_vertex_2_gene_dict(cursor, build = None, chrom = None, start = None, en FROM vertex LEFT JOIN genes ON vertex.gene_ID = genes.gene_ID""" else: - query = Template("""SELECT vertex_ID, + query = Template( + """SELECT vertex_ID, vertex.gene_ID, strand FROM vertex @@ -335,9 +349,9 @@ def make_vertex_2_gene_dict(cursor, build = None, chrom = None, start = None, en WHERE loc.genome_build = '$build' AND loc.chromosome = '$chrom' AND (loc.position >= $start AND loc.position <= $end) - """) - query = query.substitute({'build':build, 'chrom':chrom, - 'start':start, 'end':end}) + """ + ) + query = query.substitute({"build": build, "chrom": chrom, "start": start, "end": end}) cursor.execute(query) for vertex_line in cursor.fetchall(): @@ -353,20 +367,20 @@ def make_vertex_2_gene_dict(cursor, build = None, chrom = None, start = None, en return vertex_2_gene -def make_gene_start_or_end_dict(cursor, build, mode, chrom = None, start = None, end = None): - """ Select the starts (or ends) of known genes in the database and store - in a dict. - Format of dict: - Key: gene ID from database - Value: dict mapping positions to start vertices (or end vertices) of - KNOWN transcripts from that gene + +def make_gene_start_or_end_dict(cursor, build, mode, chrom=None, start=None, end=None): + """Select the starts (or ends) of known genes in the database and store + in a dict. + Format of dict: + Key: gene ID from database + Value: dict mapping positions to start vertices (or end vertices) of + KNOWN transcripts from that gene """ if mode not in ["start", "end"]: - raise ValueError(("Incorrect mode supplied to 'make_gene_start_or_end_dict'." - " Expected 'start' or 'end'.")) + raise ValueError(("Incorrect mode supplied to 'make_gene_start_or_end_dict'." 
" Expected 'start' or 'end'.")) output_dict = {} - if any(val == None for val in [chrom, start,end]): + if any(val == None for val in [chrom, start, end]): query = """SELECT gene_ID, %s_vertex as vertex, loc1.position as %s @@ -381,7 +395,8 @@ def make_gene_start_or_end_dict(cursor, build, mode, chrom = None, start = None, cursor.execute(query % (mode, mode, mode, build)) else: - query = Template("""SELECT gene_ID, + query = Template( + """SELECT gene_ID, ${mode}_vertex as vertex, loc1.chromosome as chrom, loc1.position as $mode @@ -395,14 +410,14 @@ def make_gene_start_or_end_dict(cursor, build, mode, chrom = None, start = None, AND ta.value = 'KNOWN' AND loc1.genome_build = '$build' AND chrom = '$chrom' - AND ($mode >= $start AND $mode <= $end)""") - query = query.substitute({'build':build, 'chrom':chrom, - 'start':start, 'end':end, 'mode':mode}) + AND ($mode >= $start AND $mode <= $end)""" + ) + query = query.substitute({"build": build, "chrom": chrom, "start": start, "end": end, "mode": mode}) cursor.execute(query) for entry in cursor.fetchall(): - gene_ID = entry['gene_ID'] - vertex = entry['vertex'] + gene_ID = entry["gene_ID"] + vertex = entry["vertex"] pos = entry[mode] try: diff --git a/src/talon/initialize_talon_database.py b/src/talon/initialize_talon_database.py index d3ced4d..fe18d81 100644 --- a/src/talon/initialize_talon_database.py +++ b/src/talon/initialize_talon_database.py @@ -6,51 +6,65 @@ # This database is used by the TALON pipeline to maintain a registry of # known annotations as well as novel discoveries. +import os import sqlite3 -from sqlite3 import Error +import time from optparse import OptionParser +from sqlite3 import Error + +from . import edge as Edge from . import gene as Gene from . import transcript as Transcript -from . 
import edge as Edge -import os -import time + def getOptions(): parser = OptionParser() - parser.add_option("--f", dest = "gtf", - help = "GTF annotation containing genes, transcripts, and edges.", - metavar = "FILE", type = str) - parser.add_option("--g", dest = "genome_build", - help = "Name of genome build that the GTF file is based on (ie hg38)", - type = str) - parser.add_option("--a", dest = "annot_name", - help = "Name of supplied annotation (will be used to label data)", - type = str) - parser.add_option("--l", dest = "min_length", - help = "Minimum required transcript length (default = 0 bp) ", - type = int, default = 0) - parser.add_option("--idprefix", dest = "idprefix", - help = "Prefix for naming novel discoveries in eventual TALON runs", - type = str, default = "TALON") - parser.add_option("--5p", dest = "cutoff_5p", - help = "Maximum allowable distance (bp) at the 5' end during annotation", - type = int, default = "500") - parser.add_option("--3p", dest = "cutoff_3p", - help = "Maximum allowable distance (bp) at the 3' end during annotation", - type = int, default = "300") - - parser.add_option("--o", dest = "outprefix", - help = "Outprefix for the annotation files", - metavar = "FILE", type = "string") + parser.add_option( + "--f", dest="gtf", help="GTF annotation containing genes, transcripts, and edges.", metavar="FILE", type=str + ) + parser.add_option( + "--g", dest="genome_build", help="Name of genome build that the GTF file is based on (ie hg38)", type=str + ) + parser.add_option( + "--a", dest="annot_name", help="Name of supplied annotation (will be used to label data)", type=str + ) + parser.add_option( + "--l", dest="min_length", help="Minimum required transcript length (default = 0 bp) ", type=int, default=0 + ) + parser.add_option( + "--idprefix", + dest="idprefix", + help="Prefix for naming novel discoveries in eventual TALON runs", + type=str, + default="TALON", + ) + parser.add_option( + "--5p", + dest="cutoff_5p", + help="Maximum allowable distance (bp) at the 5' end during annotation", + type=int, + default="500", + ) + parser.add_option( + "--3p", + dest="cutoff_3p", + help="Maximum allowable distance (bp) at the 3' end during annotation", + type=int, + default="300", + ) + + parser.add_option("--o", dest="outprefix", help="Outprefix for the annotation files", metavar="FILE", type="string") (options, args) = parser.parse_args() return options + ############### Database initialization section ############################# + def create_database(path): - """ Creates an SQLite database with the provided name. If a database - of the name already exists, an error is generated. """ + """Creates an SQLite database with the provided name. If a database + of the name already exists, an error is generated.""" if os.path.isfile(path): raise ValueError("Database with name '" + path + "' already exists!") @@ -65,40 +79,38 @@ def create_database(path): return + def init_run_info(database, idprefix, min_length, cutoff_5p, cutoff_3p): - """ Initializes a table that keeps track of important run information - such as the prefix for novel identifiers and the 5 prime and 3 prime - distance cutoffs. Affects how downstream TALON runs are done""" + """Initializes a table that keeps track of important run information + such as the prefix for novel identifiers and the 5 prime and 3 prime + distance cutoffs. 
Affects how downstream TALON runs are done""" # Connecting to the database file conn = sqlite3.connect(database) c = conn.cursor() # Add table and set primary key column, which will be the gene ID - c.execute("""CREATE TABLE "run_info" ("item" TEXT PRIMARY KEY, - "value" TEXT)""") + c.execute( + """CREATE TABLE "run_info" ("item" TEXT PRIMARY KEY, + "value" TEXT)""" + ) # Add rows cols = " (" + ", ".join([str_wrap_double(x) for x in ["item", "value"]]) + ") " - c.execute('INSERT INTO run_info ' + cols + ' VALUES ' + '(?,?)', - ('schema_version', "v5.0")) - c.execute('INSERT INTO run_info ' + cols + ' VALUES ' + '(?,?)', - ('idprefix', idprefix)) - c.execute('INSERT INTO run_info ' + cols + ' VALUES ' + '(?,?)', - ('cutoff_5p', cutoff_5p)) - c.execute('INSERT INTO run_info ' + cols + ' VALUES ' + '(?,?)', - ('cutoff_3p', cutoff_3p)) - c.execute('INSERT INTO run_info ' + cols + ' VALUES ' + '(?,?)', - ('min_length', min_length)) - c.execute('INSERT INTO run_info ' + cols + ' VALUES ' + '(?,?)', - ('n_places', 9)) + c.execute("INSERT INTO run_info " + cols + " VALUES " + "(?,?)", ("schema_version", "v5.0")) + c.execute("INSERT INTO run_info " + cols + " VALUES " + "(?,?)", ("idprefix", idprefix)) + c.execute("INSERT INTO run_info " + cols + " VALUES " + "(?,?)", ("cutoff_5p", cutoff_5p)) + c.execute("INSERT INTO run_info " + cols + " VALUES " + "(?,?)", ("cutoff_3p", cutoff_3p)) + c.execute("INSERT INTO run_info " + cols + " VALUES " + "(?,?)", ("min_length", min_length)) + c.execute("INSERT INTO run_info " + cols + " VALUES " + "(?,?)", ("n_places", 9)) conn.commit() conn.close() return + def add_gene_table(database): - """ Add a table to the database to track genes. Attributes are: - - Primary Key: Gene ID (interally assigned by database) + """Add a table to the database to track genes. Attributes are: + - Primary Key: Gene ID (interally assigned by database) """ # Connecting to the database file @@ -106,18 +118,21 @@ def add_gene_table(database): c = conn.cursor() # Add table and set primary key column, which will be the gene ID - c.execute("""CREATE TABLE "genes" ("gene_ID" INTEGER PRIMARY KEY, - "strand" TEXT)""") + c.execute( + """CREATE TABLE "genes" ("gene_ID" INTEGER PRIMARY KEY, + "strand" TEXT)""" + ) conn.commit() conn.close() return + def add_transcript_table(database): - """ Add a table to the database to track transcripts. Attributes are: - - Primary Key: Transcript ID (interally assigned by database) - - Gene ID - - Path (Edges) + """Add a table to the database to track transcripts. Attributes are: + - Primary Key: Transcript ID (interally assigned by database) + - Gene ID + - Path (Edges) """ # Connecting to the database file @@ -146,12 +161,13 @@ def add_transcript_table(database): conn.close() return + def add_edge_table(database): - """ Add a table to the database to track edges linking vertices. - Attributes are: - - Primary Key: ID (interally assigned by database) - - Donor ID - - Acceptor ID + """Add a table to the database to track edges linking vertices. + Attributes are: + - Primary Key: ID (interally assigned by database) + - Donor ID + - Acceptor ID """ # Connecting to the database file @@ -180,11 +196,12 @@ def add_edge_table(database): conn.close() return + def add_edgetype_table(database): - """ Add a table to the database to track permitted edge types. We start - with "edge" and "intron" - Attributes are: - - Primary Key: Type + """Add a table to the database to track permitted edge types. 
We start + with "edge" and "intron" + Attributes are: + - Primary Key: Type """ # Connecting to the database file conn = sqlite3.connect(database) @@ -198,27 +215,26 @@ def add_edgetype_table(database): # Add entries for 'exon' and 'intron' for t in ["exon", "intron"]: cols = "(type)" - vals = [ t ] - command = 'INSERT OR IGNORE INTO "edge_type"' + cols + "VALUES " + \ - '(?)' - c.execute(command,vals) + vals = [t] + command = 'INSERT OR IGNORE INTO "edge_type"' + cols + "VALUES " + "(?)" + c.execute(command, vals) conn.commit() conn.close() return + def add_vertex_table(database): - """ Add a table to the database to track vertices. - Attributes are: - - Vertex_ID: ID (interally assigned by database) - - Gene ID + """Add a table to the database to track vertices. + Attributes are: + - Vertex_ID: ID (interally assigned by database) + - Gene ID """ # Connecting to the database file conn = sqlite3.connect(database) c = conn.cursor() - # Add table and set primary key column, which will be the transcript ID # Also include relationship to the gene table command = """ CREATE TABLE IF NOT EXISTS vertex ( @@ -235,9 +251,10 @@ def add_vertex_table(database): conn.close() return + def add_genome_table(database, build): - """ Add a table that tracks the genome builds in use, then add the provided - genome build to it. + """Add a table that tracks the genome builds in use, then add the provided + genome build to it. """ # Connecting to the database file @@ -245,9 +262,11 @@ def add_genome_table(database, build): c = conn.cursor() # Add table and set primary key column, which will be the edge ID - c.execute("""CREATE TABLE genome_build ( + c.execute( + """CREATE TABLE genome_build ( build_ID INTEGER PRIMARY KEY, - name TEXT)""") + name TEXT)""" + ) # Get value of genome_build counter c.execute('SELECT "count" FROM "counters" WHERE "category" = "genome_build"') @@ -257,10 +276,9 @@ def add_genome_table(database, build): # Add entry for current genome build cols = "(build_ID, name)" - vals = [ db_id, build ] - command = 'INSERT OR IGNORE INTO "genome_build"' + cols + "VALUES " + \ - '(?,?)' - c.execute(command,vals) + vals = [db_id, build] + command = 'INSERT OR IGNORE INTO "genome_build"' + cols + "VALUES " + "(?,?)" + c.execute(command, vals) # Update the counter update_counter = 'UPDATE "counters" SET "count" = ? WHERE "category" = ?' @@ -270,36 +288,40 @@ def add_genome_table(database, build): conn.close() return + def add_dataset_table(database): - """ Add a table that tracks the datasets added to the database. - """ + """Add a table that tracks the datasets added to the database.""" # Connecting to the database file conn = sqlite3.connect(database) c = conn.cursor() # Add table and set primary key column - c.execute("""CREATE TABLE dataset ( + c.execute( + """CREATE TABLE dataset ( dataset_ID INTEGER PRIMARY KEY, dataset_name TEXT, sample TEXT, platform TEXT - )""") + )""" + ) conn.commit() conn.close() return + def add_observed_table(database): - """ Add a table that tracks attributes of observed transcripts, including - 5' and 3' end deltas, as well as the read length. 
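# add_genome_table() above shows the counters idiom this module uses
# everywhere: read the running count from the "counters" table, use
# count + 1 as the new primary key, then write the incremented value back
# (cursor c and build name as elsewhere in this file):
c.execute('SELECT "count" FROM "counters" WHERE "category" = "genome_build"')
build_id = int(c.fetchone()[0]) + 1
c.execute('INSERT OR IGNORE INTO "genome_build" (build_ID, name) VALUES (?,?)',
          [build_id, "hg38"])
c.execute('UPDATE "counters" SET "count" = ? WHERE "category" = ?',
          [build_id, "genome_build"])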
""" + """Add a table that tracks attributes of observed transcripts, including + 5' and 3' end deltas, as well as the read length.""" # Connecting to the database file conn = sqlite3.connect(database) c = conn.cursor() # Add table and set primary key column - c.execute("""CREATE TABLE observed ( + c.execute( + """CREATE TABLE observed ( obs_ID INTEGER PRIMARY KEY, gene_ID INTEGER, transcript_ID INTEGER, @@ -325,18 +347,20 @@ def add_observed_table(database): FOREIGN KEY(end_vertex) REFERENCES vertex(vertex_ID), FOREIGN KEY(start_exon) REFERENCES edge(edge_ID), FOREIGN KEY(end_exon) REFERENCES edge(edge_ID) - )""") + )""" + ) conn.commit() conn.close() return + def add_abundance_table(database): - """ Add a table to the database to track transcript abundance over - all datasets. - - Transcript ID - - Dataset - - Count + """Add a table to the database to track transcript abundance over + all datasets. + - Transcript ID + - Dataset + - Count """ # Connecting to the database file @@ -344,7 +368,8 @@ def add_abundance_table(database): c = conn.cursor() # Add table and set primary key column, which will be the edge ID - c.execute("""CREATE TABLE abundance ( + c.execute( + """CREATE TABLE abundance ( transcript_ID INTEGER, dataset INTEGER, count INTEGER, @@ -352,16 +377,18 @@ def add_abundance_table(database): PRIMARY KEY(transcript_ID, dataset), FOREIGN KEY(transcript_ID) REFERENCES transcripts(transcript_ID), FOREIGN KEY(dataset) REFERENCES dataset(dataset_ID) - )""") + )""" + ) conn.commit() conn.close() return + def add_counter_table(database): - """ Add a table to the database to track novel events. Attributes are: - - Category (gene, transcript, edge) - - Count (number of items in that category so far) + """Add a table to the database to track novel events. 
Attributes are: + - Category (gene, transcript, edge) + - Count (number of items in that category so far) """ # Connecting to the database file @@ -374,42 +401,39 @@ def add_counter_table(database): # Add novel column default_val = 0 - c.execute("ALTER TABLE {tn} ADD COLUMN '{cn}' {ct}"\ - .format(tn=table_name, cn="count", ct="INTEGER", df=default_val)) + c.execute("ALTER TABLE {tn} ADD COLUMN '{cn}' {ct}".format(tn=table_name, cn="count", ct="INTEGER", df=default_val)) # Add rows - c.execute("INSERT INTO {tn} ({idf}, {cn}) VALUES ('genes', 0)".\ - format(tn=table_name, idf="category", cn="count")) - c.execute("INSERT INTO {tn} ({idf}, {cn}) VALUES ('transcripts', 0)".\ - format(tn=table_name, idf="category", cn="count")) - c.execute("INSERT INTO {tn} ({idf}, {cn}) VALUES ('vertex', 0)".\ - format(tn=table_name, idf="category", cn="count")) - c.execute("INSERT INTO {tn} ({idf}, {cn}) VALUES ('edge', 0)".\ - format(tn=table_name, idf="category", cn="count")) - c.execute("INSERT INTO {tn} ({idf}, {cn}) VALUES ('genome_build', 0)".\ - format(tn=table_name, idf="category", cn="count")) - c.execute("INSERT INTO {tn} ({idf}, {cn}) VALUES ('dataset', 0)".\ - format(tn=table_name, idf="category", cn="count")) - c.execute("INSERT INTO {tn} ({idf}, {cn}) VALUES ('observed', 0)".\ - format(tn=table_name, idf="category", cn="count")) + c.execute("INSERT INTO {tn} ({idf}, {cn}) VALUES ('genes', 0)".format(tn=table_name, idf="category", cn="count")) + c.execute( + "INSERT INTO {tn} ({idf}, {cn}) VALUES ('transcripts', 0)".format(tn=table_name, idf="category", cn="count") + ) + c.execute("INSERT INTO {tn} ({idf}, {cn}) VALUES ('vertex', 0)".format(tn=table_name, idf="category", cn="count")) + c.execute("INSERT INTO {tn} ({idf}, {cn}) VALUES ('edge', 0)".format(tn=table_name, idf="category", cn="count")) + c.execute( + "INSERT INTO {tn} ({idf}, {cn}) VALUES ('genome_build', 0)".format(tn=table_name, idf="category", cn="count") + ) + c.execute("INSERT INTO {tn} ({idf}, {cn}) VALUES ('dataset', 0)".format(tn=table_name, idf="category", cn="count")) + c.execute("INSERT INTO {tn} ({idf}, {cn}) VALUES ('observed', 0)".format(tn=table_name, idf="category", cn="count")) conn.commit() conn.close() return + def add_annotation_table(database, table_name, key_table, fk_id): - """ Add a table to keep track of annotation attributes for genes, - transcripts, etc. The table will be given the provided table name. A - foreign key will be created to link the ID column of the annotation - table to the fk_id column of the key_table. - - Attributes: - - Item ID - - Annotation name: user-provided name for annotation - - Source (in case of an object from a GTF, this comes from the 2nd col) - - Feature type - - Attribute - - Value + """Add a table to keep track of annotation attributes for genes, + transcripts, etc. The table will be given the provided table name. A + foreign key will be created to link the ID column of the annotation + table to the fk_id column of the key_table. 
+ + Attributes: + - Item ID + - Annotation name: user-provided name for annotation + - Source (in case of an object from a GTF, this comes from the 2nd col) + - Feature type + - Attribute + - Value """ # Connecting to the database file @@ -420,28 +444,34 @@ def add_annotation_table(database, table_name, key_table, fk_id): if key_table == "exon": fk_statement = "" else: - fk_statement = ", FOREIGN KEY (ID) REFERENCES "+ key_table + "(" + fk_id + ")" - command = " CREATE TABLE IF NOT EXISTS " + table_name + \ - """ (ID INTEGER, + fk_statement = ", FOREIGN KEY (ID) REFERENCES " + key_table + "(" + fk_id + ")" + command = ( + " CREATE TABLE IF NOT EXISTS " + + table_name + + """ (ID INTEGER, annot_name text, source text, attribute text, value text, - PRIMARY KEY (ID, source, attribute)""" + fk_statement + """); """ + PRIMARY KEY (ID, source, attribute)""" + + fk_statement + + """); """ + ) c.execute(command) conn.commit() conn.close() return + def add_location_table(database): - """ Add a table to the database to track the locations of objects across - the different genome builds. Attributes are: - - Vertex ID - - Genome build - - Chromosome - - Position (1-based) - - Strand + """Add a table to the database to track the locations of objects across + the different genome builds. Attributes are: + - Vertex ID + - Genome build + - Chromosome + - Position (1-based) + - Strand """ # Connecting to the database file @@ -467,15 +497,16 @@ def add_location_table(database): ####################### GTF parsing section ################################# + def read_gtf_file(gtf_file): - """ Reads gene, transcript, and edge information from a GTF file. - Args: - gtf_file: Path to the GTF file - Returns: - genes: A dictionary mapping gene IDs to corresponding gene objects - transcripts: A dictionary mapping gene IDs to corresponding - transcript objects - exons: A dictionary mapping exon IDs to corresponding edge objects + """Reads gene, transcript, and edge information from a GTF file. 
+ Args: + gtf_file: Path to the GTF file + Returns: + genes: A dictionary mapping gene IDs to corresponding gene objects + transcripts: A dictionary mapping gene IDs to corresponding + transcript objects + exons: A dictionary mapping exon IDs to corresponding edge objects """ genes = {} transcripts = {} @@ -533,10 +564,11 @@ def read_gtf_file(gtf_file): return genes, transcripts, exons + def filter_by_length(genes, transcripts, min_length): - """ Given a minimum transcript length, this function - - Iterates over transcripts and keeps the ones with length >= min_length - - Removes genes not represented in the transcript set + """Given a minimum transcript length, this function + - Iterates over transcripts and keeps the ones with length >= min_length + - Removes genes not represented in the transcript set """ filtered_transcripts = {} filtered_genes = {} @@ -553,8 +585,9 @@ def filter_by_length(genes, transcripts, min_length): return filtered_genes, filtered_transcripts + def organize_by_chromosome(genes, transcripts): - """ Iterate through genes and transcripts and group them by chromosome """ + """Iterate through genes and transcripts and group them by chromosome""" gene_dict = {} transcript_dict = {} @@ -579,11 +612,13 @@ def organize_by_chromosome(genes, transcripts): return gene_dict, transcript_dict + ######################### Populate the database ############################ + def populate_db(database, annot_name, chrom_genes, chrom_transcripts, edges, genome_build): - """ Iterate over GTF-derived gene, transcript, and edge entries in order - to add a record for each in the database. + """Iterate over GTF-derived gene, transcript, and edge entries in order + to add a record for each in the database. """ # Connecting to the database file conn = sqlite3.connect(database) @@ -605,8 +640,8 @@ def populate_db(database, annot_name, chrom_genes, chrom_transcripts, edges, gen return -def add_genes(c, genes, annot_name): +def add_genes(c, genes, annot_name): bulk_genes = [] bulk_annotations = [] gene_id_map = {} @@ -641,14 +676,15 @@ def add_genes(c, genes, annot_name): bulk_update_gene_annotations(c, bulk_annotations) return gene_id_map + def bulk_update_genes(c, genes, gene_counter): """ - Given a list of tuple-formatted gene entries, this function inserts them - into the database at the provided cursor (c). + Given a list of tuple-formatted gene entries, this function inserts them + into the database at the provided cursor (c). """ # Insert entries into database in bulk cols = " (" + ", ".join([str_wrap_double(x) for x in ["gene_id", "strand"]]) + ") " - g_command = 'INSERT INTO "genes"' + cols + "VALUES " + '(?,?)' + g_command = 'INSERT INTO "genes"' + cols + "VALUES " + "(?,?)" c.executemany(g_command, genes) # Update counter @@ -657,22 +693,21 @@ def bulk_update_genes(c, genes, gene_counter): return + def bulk_update_gene_annotations(c, bulk_annotations): """ - Given a list of tuple-formatted gene annotation entries, this function - inserts them into the database at the provided cursor (c). + Given a list of tuple-formatted gene annotation entries, this function + inserts them into the database at the provided cursor (c). 
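# The bulk_update_* helpers all follow the shape of bulk_update_genes()
# above: accumulate tuples in Python, then insert them with one
# executemany() call. A self-contained sketch of the pattern:
import sqlite3

conn = sqlite3.connect(":memory:")
c = conn.cursor()
c.execute('CREATE TABLE "genes" ("gene_ID" INTEGER PRIMARY KEY, "strand" TEXT)')
bulk_genes = [(1, "+"), (2, "-"), (3, "+")]
c.executemany('INSERT INTO "genes" ("gene_ID", "strand") VALUES (?,?)', bulk_genes)
conn.commit()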
""" - cols = " (" + ", ".join([str_wrap_double(x) for x in ["ID","annot_name", - "source", "attribute", "value"]]) + ") " - command = 'INSERT INTO "gene_annotations"' + cols + "VALUES " + \ - '(?,?,?,?,?)' + cols = " (" + ", ".join([str_wrap_double(x) for x in ["ID", "annot_name", "source", "attribute", "value"]]) + ") " + command = 'INSERT INTO "gene_annotations"' + cols + "VALUES " + "(?,?,?,?,?)" c.executemany(command, bulk_annotations) return -def add_transcripts(c, transcripts, annot_name, gene_id_map, genome_build): +def add_transcripts(c, transcripts, annot_name, gene_id_map, genome_build): bulk_transcripts = [] bulk_annotations = [] @@ -683,11 +718,11 @@ def add_transcripts(c, transcripts, annot_name, gene_id_map, genome_build): # Get vertex and edge counters from database c.execute('SELECT "count" FROM "counters" WHERE "category" = "vertex"') v_counter = int(c.fetchone()[0]) - vertices['counter'] = v_counter + vertices["counter"] = v_counter c.execute('SELECT "count" FROM "counters" WHERE "category" = "edge"') e_counter = int(c.fetchone()[0]) - edges['counter'] = e_counter + edges["counter"] = e_counter # Get transcript counter c.execute('SELECT "count" FROM "counters" WHERE "category" = "transcripts"') @@ -710,9 +745,9 @@ def add_transcripts(c, transcripts, annot_name, gene_id_map, genome_build): db_gene_id = "NULL" # Process exons to create vertices and edges - transcript_tuple = process_transcript(c, transcript, db_transcript_id, - db_gene_id, genome_build, - annot_name, vertices, edges) + transcript_tuple = process_transcript( + c, transcript, db_transcript_id, db_gene_id, genome_build, annot_name, vertices, edges + ) bulk_transcripts.append(transcript_tuple) # Create annotation entries @@ -736,40 +771,56 @@ def add_transcripts(c, transcripts, annot_name, gene_id_map, genome_build): return + def bulk_update_transcripts(c, transcripts, counter): """ - Given a list of tuple-formatted transcript entries, this function inserts them - into the database at the provided cursor (c). + Given a list of tuple-formatted transcript entries, this function inserts them + into the database at the provided cursor (c). """ - cols = " (" + ", ".join([str_wrap_double(x) for x in ["transcript_ID", - "gene_ID", "start_exon", "jn_path", "end_exon", "start_vertex", "end_vertex", - "n_exons"]]) + ") " - g_command = 'INSERT INTO "transcripts"' + cols + "VALUES " + \ - '(?,?,?,?,?,?,?,?)' - c.executemany(g_command,transcripts) + cols = ( + " (" + + ", ".join( + [ + str_wrap_double(x) + for x in [ + "transcript_ID", + "gene_ID", + "start_exon", + "jn_path", + "end_exon", + "start_vertex", + "end_vertex", + "n_exons", + ] + ] + ) + + ") " + ) + g_command = 'INSERT INTO "transcripts"' + cols + "VALUES " + "(?,?,?,?,?,?,?,?)" + c.executemany(g_command, transcripts) update_counter = 'UPDATE "counters" SET "count" = ? WHERE "category" = ?' c.execute(update_counter, [counter, "transcripts"]) return + def bulk_update_transcript_annotations(c, bulk_annotations): """ - Given a list of tuple-formatted transcript annotation entries, this - function inserts them into the database at the provided cursor (c). + Given a list of tuple-formatted transcript annotation entries, this + function inserts them into the database at the provided cursor (c). 
""" - cols = " (" + ", ".join([str_wrap_double(x) for x in ["ID","annot_name", - "source", "attribute", "value"]]) + ") " - command = 'INSERT INTO "transcript_annotations"' + cols + "VALUES " + \ - '(?,?,?,?,?)' + cols = " (" + ", ".join([str_wrap_double(x) for x in ["ID", "annot_name", "source", "attribute", "value"]]) + ") " + command = 'INSERT INTO "transcript_annotations"' + cols + "VALUES " + "(?,?,?,?,?)" c.executemany(command, bulk_annotations) return + def bulk_update_vertices(c, vertices): """ - Given a list of tuple-formatted vertex entries, this - function inserts them into the database at the provided cursor (c). + Given a list of tuple-formatted vertex entries, this + function inserts them into the database at the provided cursor (c). """ # Extract the counter counter = vertices.pop("counter") @@ -779,21 +830,20 @@ def bulk_update_vertices(c, vertices): location_list = [] for vertex in list(vertices.values()): gene_IDs = list(vertex[-1]) - vertex_list += [ (vertex[0], x) for x in gene_IDs ] + vertex_list += [(vertex[0], x) for x in gene_IDs] location_list.append(vertex[0:4]) # Bulk entry of vertices - cols = " (" + ", ".join([str_wrap_double(x) for x in ["vertex_ID","gene_id"]]) + ") " - command = 'INSERT INTO "vertex"' + cols + "VALUES " + \ - '(?,?)' + cols = " (" + ", ".join([str_wrap_double(x) for x in ["vertex_ID", "gene_id"]]) + ") " + command = 'INSERT INTO "vertex"' + cols + "VALUES " + "(?,?)" c.executemany(command, vertex_list) # Bulk entry of locations - cols = " (" + ", ".join([str_wrap_double(x) for x in ["location_ID", - "genome_build", "chromosome", "position"]]) + ") " - command = 'INSERT INTO "location"' + cols + "VALUES " + \ - '(?,?,?,?)' + cols = ( + " (" + ", ".join([str_wrap_double(x) for x in ["location_ID", "genome_build", "chromosome", "position"]]) + ") " + ) + command = 'INSERT INTO "location"' + cols + "VALUES " + "(?,?,?,?)" c.executemany(command, location_list) # Counter update @@ -802,18 +852,17 @@ def bulk_update_vertices(c, vertices): return + def bulk_update_edges(c, edges): """ - Given a list of tuple-formatted edge entries, this - function inserts them into the database at the provided cursor (c). + Given a list of tuple-formatted edge entries, this + function inserts them into the database at the provided cursor (c). """ # Extract the counter counter = edges.pop("counter") - cols = " (" + ", ".join([str_wrap_double(x) for x in ["edge_ID","v1", - "v2", "edge_type", "strand"]]) + ") " - command = 'INSERT INTO "edge"' + cols + "VALUES " + \ - '(?,?,?,?,?)' + cols = " (" + ", ".join([str_wrap_double(x) for x in ["edge_ID", "v1", "v2", "edge_type", "strand"]]) + ") " + command = 'INSERT INTO "edge"' + cols + "VALUES " + "(?,?,?,?,?)" c.executemany(command, edges.values()) update_counter = 'UPDATE "counters" SET "count" = ? WHERE "category" = ?' 
@@ -821,9 +870,8 @@ def bulk_update_edges(c, edges): return -def process_transcript(c, transcript, transcript_id, gene_id, genome_build, - annot_name, vertices, edges): +def process_transcript(c, transcript, transcript_id, gene_id, genome_build, annot_name, vertices, edges): exons = transcript.exons strand = transcript.strand transcript_vertices = [] @@ -833,19 +881,17 @@ def process_transcript(c, transcript, transcript_id, gene_id, genome_build, exon = exons[i] left = exon.start right = exon.end - v1, vertices = create_vertex(c, gene_id, genome_build, - exon.chromosome, left, vertices) + v1, vertices = create_vertex(c, gene_id, genome_build, exon.chromosome, left, vertices) transcript_vertices.append(v1) - v2, vertices = create_vertex(c, gene_id, genome_build, - exon.chromosome, right, vertices) + v2, vertices = create_vertex(c, gene_id, genome_build, exon.chromosome, right, vertices) transcript_vertices.append(v2) # Iterate over vertices in order to create edges. If the transcript is on the # minus strand, reverse the vertex and edge lists if strand == "-": - transcript_vertices = transcript_vertices[::-1] - exons = exons[::-1] + transcript_vertices = transcript_vertices[::-1] + exons = exons[::-1] # Keep track of start vertex, end vertex, and n_exons start_vertex = transcript_vertices[0] @@ -854,9 +900,9 @@ def process_transcript(c, transcript, transcript_id, gene_id, genome_build, prev_edge_type = None exon_index = 0 - for i in range(0,len(transcript_vertices) - 1): + for i in range(0, len(transcript_vertices) - 1): vertex_1 = transcript_vertices[i] - vertex_2 = transcript_vertices[i+1] + vertex_2 = transcript_vertices[i + 1] # Try to create an edge between vertex 1 and 2 if prev_edge_type == None or prev_edge_type == "intron": @@ -875,48 +921,57 @@ def process_transcript(c, transcript, transcript_id, gene_id, genome_build, prev_edge_type = edge_type if len(transcript_edges) > 1: - transcript_path = ",".join(map(str,transcript_edges[1:-1])) + transcript_path = ",".join(map(str, transcript_edges[1:-1])) else: transcript_path = None start_exon = transcript_edges[0] end_exon = transcript_edges[-1] - transcript_tuple = (transcript_id, gene_id, start_exon, transcript_path, - end_exon, start_vertex, end_vertex, n_exons) + transcript_tuple = ( + transcript_id, + gene_id, + start_exon, + transcript_path, + end_exon, + start_vertex, + end_vertex, + n_exons, + ) return transcript_tuple def add_exon_annotations_to_db(c, exon, exon_id, annot_name): - """ Adds annotations from edge object to the database""" + """Adds annotations from edge object to the database""" ignore = ["gene_id", "gene_name"] attributes = exon.annotations - source = attributes['source'] + source = attributes["source"] if "exon_status" not in attributes: - attributes["exon_status"] = "KNOWN" + attributes["exon_status"] = "KNOWN" for att in attributes.keys(): if (att in ignore) or ("gene" in att) or ("transcript" in att): continue value = attributes[att] - cols = " (" + ", ".join([str_wrap_double(x) for x in ["ID","annot_name", - "source", "attribute", "value"]]) + ") " + cols = ( + " (" + ", ".join([str_wrap_double(x) for x in ["ID", "annot_name", "source", "attribute", "value"]]) + ") " + ) vals = [exon_id, annot_name, source, att, value] - command = 'INSERT OR IGNORE INTO "exon_annotations"' + cols + "VALUES " + \ - '(?,?,?,?,?)' - c.execute(command,vals) + command = 'INSERT OR IGNORE INTO "exon_annotations"' + cols + "VALUES " + "(?,?,?,?,?)" + c.execute(command, vals) return + def create_edge(vertex_1, vertex_2, edge_type, 
strand, edges): """ - Creates a new edge with the provided information, unless a duplicate - already exists in the 'edges' dict. + Creates a new edge with the provided information, unless a duplicate + already exists in the 'edges' dict. """ # Check if the edge exists, and return the ID if it does - query = ",".join([str(vertex_1), str(vertex_2), edge_type,strand]) + query = ",".join([str(vertex_1), str(vertex_2), edge_type, strand]) if query in edges.keys(): existing_edge_id = edges[query][0] return existing_edge_id, edges @@ -931,10 +986,11 @@ def create_edge(vertex_1, vertex_2, edge_type, strand, edges): return edge_id, edges + def create_vertex(c, gene_id, genome_build, chromosome, pos, vertices): """ - Creates a new vertex with the provided information, unless a duplicate - already exists in the database. + Creates a new vertex with the provided information, unless a duplicate + already exists in the database. """ # Check if the vertex exists. If yes, add current gene ID to it query = ",".join([genome_build, chromosome, str(pos)]) @@ -957,18 +1013,21 @@ def create_vertex(c, gene_id, genome_build, chromosome, pos, vertices): def str_wrap_double(s): - """ Adds double quotes around the input string """ + """Adds double quotes around the input string""" s = str(s) return '"' + s + '"' + def hms_string(sec_elapsed): h = int(sec_elapsed / (60 * 60)) m = int((sec_elapsed % (60 * 60)) / 60) - s = sec_elapsed % 60. + s = sec_elapsed % 60.0 return "{}:{:>02}:{:>05.2f}".format(h, m, s) + ########################### Main ########################################### + def main(): options = getOptions() gtf_file = options.gtf @@ -993,8 +1052,7 @@ def main(): add_genome_table(db_name, genome_build) add_location_table(db_name) add_annotation_table(db_name, "gene_annotations", "genes", "gene_ID") - add_annotation_table(db_name, "transcript_annotations", "transcripts", - "transcript_ID") + add_annotation_table(db_name, "transcript_annotations", "transcripts", "transcript_ID") add_annotation_table(db_name, "exon_annotations", "exon", "ID") add_dataset_table(db_name) add_abundance_table(db_name) @@ -1015,5 +1073,5 @@ def main(): populate_db(db_name, annot_name, chrom_genes, chrom_transcripts, exons, genome_build) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/src/talon/length_utils.py b/src/talon/length_utils.py index ebc8d04..769c813 100644 --- a/src/talon/length_utils.py +++ b/src/talon/length_utils.py @@ -3,11 +3,13 @@ # ----------------------------------------------------------------------------- # Queries for working with exon and transcript lengths + def get_all_exon_lengths(cursor, build): - """ Compute all exon lengths and store in a dict """ + """Compute all exon lengths and store in a dict""" exon_lengths = {} - cursor.execute(""" SELECT edge_ID, + cursor.execute( + """ SELECT edge_ID, loc1.position AS pos1, loc2.position AS pos2, abs(loc1.position - loc2.position) + 1 AS diff @@ -16,35 +18,35 @@ def get_all_exon_lengths(cursor, build): LEFT JOIN location AS loc2 ON edge.v2 = loc2.location_ID WHERE edge_type = 'exon' AND loc1.genome_build = '%s' - AND loc2.genome_build = '%s' """ % (build, build)) + AND loc2.genome_build = '%s' """ + % (build, build) + ) for exon in cursor.fetchall(): - exon_ID = exon['edge_ID'] - length = exon['diff'] + exon_ID = exon["edge_ID"] + length = exon["diff"] exon_lengths[exon_ID] = length return exon_lengths + def get_transcript_length(transcript_row, exon_lengths): - """ Compute the length of the supplied transcript model based on its - 
exons. Expected input format consists of a transcript row from a - TALON database. """ + """Compute the length of the supplied transcript model based on its + exons. Expected input format consists of a transcript row from a + TALON database.""" length = 0 - start_exon = transcript_row['start_exon'] - end_exon = transcript_row['end_exon'] - n_exons = transcript_row['n_exons'] + start_exon = transcript_row["start_exon"] + end_exon = transcript_row["end_exon"] + n_exons = transcript_row["n_exons"] if n_exons == 1: return exon_lengths[start_exon] else: - jn_path = transcript_row['jn_path'].split(",") + jn_path = transcript_row["jn_path"].split(",") all_exons = [start_exon] + [int(x) for x in jn_path[1::2]] + [end_exon] - + for exon in all_exons: length += exon_lengths[exon] return length - - - diff --git a/src/talon/logger.py b/src/talon/logger.py index 9d485f0..31103d7 100644 --- a/src/talon/logger.py +++ b/src/talon/logger.py @@ -12,6 +12,4 @@ def _init_logger(verbosity): msg_fmt = "%(asctime)s : %(levelname)s : [%(filename)s:%(lineno)d] : %(message)s" date_fmt = "[ %Y-%m-%d %H:%M:%S ]" - logging.basicConfig(level=level, - format=msg_fmt, - datefmt=date_fmt) + logging.basicConfig(level=level, format=msg_fmt, datefmt=date_fmt) diff --git a/src/talon/post/ab_utils.py b/src/talon/post/ab_utils.py index 2a2b8e6..1818d23 100644 --- a/src/talon/post/ab_utils.py +++ b/src/talon/post/ab_utils.py @@ -1,20 +1,22 @@ -import sqlite3 import itertools import operator +import sqlite3 from optparse import OptionParser from pathlib import Path -import scanpy + import numpy as np +import scanpy -from . import filter_talon_transcripts as filt from .. import dstruct as dstruct from .. import length_utils as lu -from . import post_utils as putils from .. import query_utils as qutils from .. import talon as talon +from . import filter_talon_transcripts as filt +from . import post_utils as putils + def check_annot_validity(annot, database): - """ Make sure that the user has entered a correct annotation name """ + """Make sure that the user has entered a correct annotation name""" conn = sqlite3.connect(database) cursor = conn.cursor() @@ -27,21 +29,25 @@ def check_annot_validity(annot, database): annotations.remove("TALON") if annot == None: - message = "Please provide a valid annotation name. " + \ - "In this database, your options are: " + \ - ", ".join(annotations) + message = ( + "Please provide a valid annotation name. " + "In this database, your options are: " + ", ".join(annotations) + ) raise ValueError(message) if annot not in annotations: - message = "Annotation name '" + annot + \ - "' not found in this database. Try one of the following: " + \ - ", ".join(annotations) + message = ( + "Annotation name '" + + annot + + "' not found in this database. Try one of the following: " + + ", ".join(annotations) + ) raise ValueError(message) return + def check_build_validity(build, database): - """ Make sure that the user has entered a correct build name """ + """Make sure that the user has entered a correct build name""" conn = sqlite3.connect(database) cursor = conn.cursor() @@ -51,21 +57,22 @@ def check_build_validity(build, database): conn.close() if build == None: - message = "Please provide a valid genome build name. " + \ - "In this database, your options are: " + \ - ", ".join(builds) + message = ( + "Please provide a valid genome build name. 
" + "In this database, your options are: " + ", ".join(builds) + ) raise ValueError(message) if build not in builds: - message = "Build name '" + build + \ - "' not found in this database. Try one of the following: " + \ - ", ".join(builds) + message = ( + "Build name '" + build + "' not found in this database. Try one of the following: " + ", ".join(builds) + ) raise ValueError(message) return + def fetch_naming_prefix(database): - """ Get naming prefix from the database run_info table """ + """Get naming prefix from the database run_info table""" conn = sqlite3.connect(database) conn.row_factory = sqlite3.Row cursor = conn.cursor() @@ -75,8 +82,9 @@ def fetch_naming_prefix(database): conn.close() return prefix + def fetch_n_places(database): - """ Get length of name field from the database run_info table """ + """Get length of name field from the database run_info table""" conn = sqlite3.connect(database) conn.row_factory = sqlite3.Row cursor = conn.cursor() @@ -86,9 +94,10 @@ def fetch_n_places(database): conn.close() return int(n_places) + def get_transcript_lengths(database, build): - """ Read the transcripts from the database. Then compute the lengths. - Store in a dictionary """ + """Read the transcripts from the database. Then compute the lengths. + Store in a dictionary""" transcript_lengths = {} @@ -101,15 +110,16 @@ def get_transcript_lengths(database, build): cursor.execute("SELECT * FROM transcripts") for transcript_row in cursor.fetchall(): - transcript_ID = transcript_row['transcript_ID'] + transcript_ID = transcript_row["transcript_ID"] length = lu.get_transcript_length(transcript_row, exon_lens) transcript_lengths[transcript_ID] = length conn.close() return transcript_lengths + def fetch_dataset_list(dataset_file, database): - """ Gets a list of all datasets in the database """ + """Gets a list of all datasets in the database""" conn = sqlite3.connect(database) cursor = conn.cursor() @@ -117,17 +127,15 @@ def fetch_dataset_list(dataset_file, database): conn.close() if dataset_file == None: - return all_db_datasets else: datasets = [] - with open(dataset_file, 'r') as f: + with open(dataset_file, "r") as f: for line in f: dataset = line.strip() if dataset not in all_db_datasets: - raise ValueError("Dataset name '%s' not found in database" \ - % (dataset)) + raise ValueError("Dataset name '%s' not found in database" % (dataset)) datasets.append(dataset) return datasets diff --git a/src/talon/post/call_longest_ends.py b/src/talon/post/call_longest_ends.py index f570683..7611e10 100644 --- a/src/talon/post/call_longest_ends.py +++ b/src/talon/post/call_longest_ends.py @@ -1,82 +1,95 @@ import pandas as pd -pd.options.mode.chained_assignment = None + +pd.options.mode.chained_assignment = None import argparse -import numpy as np import csv -def get_args(): +import numpy as np - desc = ('Replaces the starts or ends of transcripts in a GTF with the' - ' longest alternatives (similar to the GENCODE model of' - 'calling transcripts)') + +def get_args(): + desc = ( + "Replaces the starts or ends of transcripts in a GTF with the" + " longest alternatives (similar to the GENCODE model of" + "calling transcripts)" + ) parser = argparse.ArgumentParser(description=desc) - parser.add_argument('-gtf', dest='gtf', - help='TALON GTF to serve as the template to modify') - parser.add_argument('-read_annot', dest='annot', - help='Read annot file from TALON to extract raw read ends from') - parser.add_argument("--datasets", "--d", dest = "datasets_file", - help = """A file indicating which 
datasets should be + parser.add_argument("-gtf", dest="gtf", help="TALON GTF to serve as the template to modify") + parser.add_argument("-read_annot", dest="annot", help="Read annot file from TALON to extract raw read ends from") + parser.add_argument( + "--datasets", + "--d", + dest="datasets_file", + help="""A file indicating which datasets should be included (one dataset name per line). Default is to include - all datasets.""", default='all') - parser.add_argument('--mode', dest='mode', default='tss', + all datasets.""", + default="all", + ) + parser.add_argument( + "--mode", + dest="mode", + default="tss", help="Modify TSSs or TESs, 'tss', 'tes', 'both'. Default: 'tss'", - choices={'tss', 'tes', 'both'}) - parser.add_argument('--novelty', dest='novelty', - help="Whether to only modify ends from novel or all models, "+\ - "'all' or 'novel'. Default: 'all'", choices={'all', 'novel'}, default='all') - parser.add_argument('-outprefix', '-o', dest='outprefix', - help='Prefix for output file', default='talon') - parser.add_argument('--verbose', '-v', action='store_true', - default=False, help="Display in progress output") + choices={"tss", "tes", "both"}, + ) + parser.add_argument( + "--novelty", + dest="novelty", + help="Whether to only modify ends from novel or all models, " + "'all' or 'novel'. Default: 'all'", + choices={"all", "novel"}, + default="all", + ) + parser.add_argument("-outprefix", "-o", dest="outprefix", help="Prefix for output file", default="talon") + parser.add_argument("--verbose", "-v", action="store_true", default=False, help="Display in progress output") args = parser.parse_args() return args + # df: TALON read annotation dataframe # how: 'tes' or 'tss' for calling ends or starts respectively # novelty: 'all' or 'novel' based on which transcript models # you want to modify the ends of # datasets: string file path of file with datasets to use when # calling longest ends from reads -def get_longest_ends(df, how='tes', novelty='novel', datasets='all'): +def get_longest_ends(df, how="tes", novelty="novel", datasets="all"): + if novelty == "novel": + df = df.loc[df.transcript_novelty != "Known"] - if novelty == 'novel': - df = df.loc[df.transcript_novelty != 'Known'] - - if datasets != 'all': + if datasets != "all": df = df.loc[df.dataset.isin(datasets)] - fwd = df.loc[df.strand == '+'] - rev = df.loc[df.strand == '-'] + fwd = df.loc[df.strand == "+"] + rev = df.loc[df.strand == "-"] # furthest downstream for tes # if + strand, max coord of read end # if - strand, min coord of read end - if how == 'tes': - fwd = fwd[['transcript_ID', 'read_end']] - fwd = fwd.groupby('transcript_ID').max().reset_index() - rev = rev[['transcript_ID', 'read_end']] - rev = rev.groupby('transcript_ID').min().reset_index() - + if how == "tes": + fwd = fwd[["transcript_ID", "read_end"]] + fwd = fwd.groupby("transcript_ID").max().reset_index() + rev = rev[["transcript_ID", "read_end"]] + rev = rev.groupby("transcript_ID").min().reset_index() # furthest upstream for tss: # if + strand, min coord of read start # if - strand, max coord of read start - elif how == 'tss': - fwd = fwd[['transcript_ID', 'read_start']] - fwd = fwd.groupby('transcript_ID').min().reset_index() - rev = rev[['transcript_ID', 'read_start']] - rev = rev.groupby('transcript_ID').max().reset_index() + elif how == "tss": + fwd = fwd[["transcript_ID", "read_start"]] + fwd = fwd.groupby("transcript_ID").min().reset_index() + rev = rev[["transcript_ID", "read_start"]] + rev = rev.groupby("transcript_ID").max().reset_index() # concat 
fwd and rev df = pd.concat([fwd, rev]) - df = df.sort_values(by='transcript_ID', ascending='True') + df = df.sort_values(by="transcript_ID", ascending="True") return df + # get the longest ends from the read annotation file # annot: TALON read annotation file path # how: 'tss' or 'tes', tss will find start ends and tes will find stop ends @@ -86,136 +99,155 @@ def get_longest_ends(df, how='tes', novelty='novel', datasets='all'): # opref: output file prefix # verbose: display processing progress # test: print out dataframe before and after editing -def replace_gtf_end_coords(gtf_df, ends, how='tes', test=False, verbose=False): - - if how == 'tes': - ends.columns = ['transcript_id', 'tes'] - elif how == 'tss': - ends.columns = ['transcript_id', 'tss'] +def replace_gtf_end_coords(gtf_df, ends, how="tes", test=False, verbose=False): + if how == "tes": + ends.columns = ["transcript_id", "tes"] + elif how == "tss": + ends.columns = ["transcript_id", "tss"] # merge gtf_df with end information -# ends.transcript_id = ends.transcript_id.astype('str') + # ends.transcript_id = ends.transcript_id.astype('str') df = gtf_df.loc[gtf_df.transcript_id.notnull()] - ends.transcript_id = ends.transcript_id.astype('str') - gtf_df.transcript_id = gtf_df.transcript_id.astype('str') - gtf_df = gtf_df.merge(ends, how='left', on='transcript_id') - df.transcript_id = df.transcript_id.astype('str') - df = df.merge(ends, how='inner') + ends.transcript_id = ends.transcript_id.astype("str") + gtf_df.transcript_id = gtf_df.transcript_id.astype("str") + gtf_df = gtf_df.merge(ends, how="left", on="transcript_id") + df.transcript_id = df.transcript_id.astype("str") + df = df.merge(ends, how="inner") if test: - print('Before editing') - print(gtf_df[['transcript_id', 'entry_type', 'strand', 'start', 'stop', how]]) + print("Before editing") + print(gtf_df[["transcript_id", "entry_type", "strand", "start", "stop", how]]) # swap out read starts or ends for the longest ones tids = df.transcript_id.unique() for t, tid in enumerate(tids): if t % 1000 == 0 and verbose: - print('Processing transcript {} of {}'.format(t, len(tids))) + print("Processing transcript {} of {}".format(t, len(tids))) # fwd: swap out transcript "stop" and last exon "stop" # rev: swap out transcript "start" and last exon "start" - if how == 'tes': + if how == "tes": # tes fwd - ind = gtf_df.loc[(gtf_df.strand=='+')&(gtf_df.transcript_id==tid)].index.tolist() + ind = gtf_df.loc[(gtf_df.strand == "+") & (gtf_df.transcript_id == tid)].index.tolist() if ind: # stop of transcript for fwd i = ind[0] - gtf_df.loc[i, 'stop'] = gtf_df.loc[i, 'tes'] + gtf_df.loc[i, "stop"] = gtf_df.loc[i, "tes"] # stop of last exon for fwd i = ind[-1] - gtf_df.loc[i, 'stop'] = gtf_df.loc[i, 'tes'] + gtf_df.loc[i, "stop"] = gtf_df.loc[i, "tes"] # tes rev - ind = gtf_df.loc[(gtf_df.strand=='-')&(gtf_df.transcript_id==tid)].index.tolist() + ind = gtf_df.loc[(gtf_df.strand == "-") & (gtf_df.transcript_id == tid)].index.tolist() if ind: # start of trancscript for rev i = ind[0] - gtf_df.loc[i, 'start'] = gtf_df.loc[i, 'tes'] + gtf_df.loc[i, "start"] = gtf_df.loc[i, "tes"] # start of last exon for rev i = ind[-1] - gtf_df.loc[i, 'start'] = gtf_df.loc[i, 'tes'] + gtf_df.loc[i, "start"] = gtf_df.loc[i, "tes"] # fwd: swap out transcript "start" and first exon "start" # rev: swap out transcript "stop" and first exon "stop" - elif how == 'tss': + elif how == "tss": # tss fwd - ind = gtf_df.loc[(gtf_df.strand=='+')&(gtf_df.transcript_id==tid)].index.tolist() + ind = gtf_df.loc[(gtf_df.strand == 
"+") & (gtf_df.transcript_id == tid)].index.tolist() if ind: # start of transcript for fwd i = ind[0] - gtf_df.loc[i, 'start'] = gtf_df.loc[i, 'tss'] + gtf_df.loc[i, "start"] = gtf_df.loc[i, "tss"] # start of first exon for fwd i = ind[1] - gtf_df.loc[i, 'start'] = gtf_df.loc[i, 'tss'] + gtf_df.loc[i, "start"] = gtf_df.loc[i, "tss"] # tss rev - ind = gtf_df.loc[(gtf_df.strand=='-')&(gtf_df.transcript_id==tid)].index.tolist() + ind = gtf_df.loc[(gtf_df.strand == "-") & (gtf_df.transcript_id == tid)].index.tolist() if ind: # stop of transcript for rev i = ind[0] - gtf_df.loc[i, 'stop'] = gtf_df.loc[i, 'tss'] + gtf_df.loc[i, "stop"] = gtf_df.loc[i, "tss"] # stop of first exon for rev i = ind[1] - gtf_df.loc[i, 'stop'] = gtf_df.loc[i, 'tss'] + gtf_df.loc[i, "stop"] = gtf_df.loc[i, "tss"] # now fix gene coordinates # tes - if how == 'tes': + if how == "tes": # fwd: replace "stop" of the gene with the maximum of the "stops" # gene_ind = gtf_df.loc[(gtf_df.strand == '+')&(gtf_df.entry_type=='gene')&(gtf_df.tes.notnull())].index.tolist() # gene_ind = gtf_df.loc[(gtf_df.strand == '+')&(gtf_df.entry_type=='gene')].index.tolist() - genes = gtf_df.loc[(gtf_df.strand == '+')&(gtf_df.entry_type=='transcript')&(gtf_df.tes.notnull())].gene_id.unique().tolist() - gene_ind = gtf_df.loc[(gtf_df.gene_id.isin(genes)&(gtf_df.entry_type=='gene'))].index.tolist() + genes = ( + gtf_df.loc[(gtf_df.strand == "+") & (gtf_df.entry_type == "transcript") & (gtf_df.tes.notnull())] + .gene_id.unique() + .tolist() + ) + gene_ind = gtf_df.loc[(gtf_df.gene_id.isin(genes) & (gtf_df.entry_type == "gene"))].index.tolist() if gene_ind: - fwd = gtf_df.loc[(gtf_df.strand == '+')&(gtf_df.entry_type=='transcript')] + fwd = gtf_df.loc[(gtf_df.strand == "+") & (gtf_df.entry_type == "transcript")] if test: - print('fwd') + print("fwd") print(gtf_df.loc[gene_ind]) - gtf_df.loc[gene_ind, 'stop'] = gtf_df.loc[gene_ind].apply(lambda x: \ - fwd.loc[fwd.gene_id==x.gene_id, 'stop'].max(), axis=1) + gtf_df.loc[gene_ind, "stop"] = gtf_df.loc[gene_ind].apply( + lambda x: fwd.loc[fwd.gene_id == x.gene_id, "stop"].max(), axis=1 + ) # rev: replace "start" of the gene with the minimum of the "starts" # gene_ind = gtf_df.loc[(gtf_df.strand == '-')&(gtf_df.entry_type=='gene')&(gtf_df.tes.notnull())].index.tolist() - genes = gtf_df.loc[(gtf_df.strand == '-')&(gtf_df.entry_type=='transcript')&(gtf_df.tes.notnull())].gene_id.unique().tolist() - gene_ind = gtf_df.loc[(gtf_df.gene_id.isin(genes)&(gtf_df.entry_type=='gene'))].index.tolist() + genes = ( + gtf_df.loc[(gtf_df.strand == "-") & (gtf_df.entry_type == "transcript") & (gtf_df.tes.notnull())] + .gene_id.unique() + .tolist() + ) + gene_ind = gtf_df.loc[(gtf_df.gene_id.isin(genes) & (gtf_df.entry_type == "gene"))].index.tolist() if gene_ind: - rev = gtf_df.loc[(gtf_df.strand == '-')&(gtf_df.entry_type=='transcript')] + rev = gtf_df.loc[(gtf_df.strand == "-") & (gtf_df.entry_type == "transcript")] if test: - print('rev') + print("rev") print(gtf_df.loc[gene_ind]) - gtf_df.loc[gene_ind, 'start'] = gtf_df.loc[gene_ind].apply(lambda x: \ - rev.loc[rev.gene_id==x.gene_id, 'start'].min(), axis=1) + gtf_df.loc[gene_ind, "start"] = gtf_df.loc[gene_ind].apply( + lambda x: rev.loc[rev.gene_id == x.gene_id, "start"].min(), axis=1 + ) # tss - elif how == 'tss': + elif how == "tss": # fwd: replace "start" of the gene with the minimum of the "starts" # gene_ind = gtf_df.loc[(gtf_df.strand == '+')&(gtf_df.entry_type=='gene')&(gtf_df.tss.notnull())].index.tolist() - genes = gtf_df.loc[(gtf_df.strand == 
'+')&(gtf_df.entry_type=='transcript')&(gtf_df.tss.notnull())].gene_id.unique().tolist() - gene_ind = gtf_df.loc[(gtf_df.gene_id.isin(genes)&(gtf_df.entry_type=='gene'))].index.tolist() + genes = ( + gtf_df.loc[(gtf_df.strand == "+") & (gtf_df.entry_type == "transcript") & (gtf_df.tss.notnull())] + .gene_id.unique() + .tolist() + ) + gene_ind = gtf_df.loc[(gtf_df.gene_id.isin(genes) & (gtf_df.entry_type == "gene"))].index.tolist() if gene_ind: - fwd = gtf_df.loc[(gtf_df.strand == '+')&(gtf_df.entry_type=='transcript')] - gtf_df.loc[gene_ind, 'start'] = gtf_df.loc[gene_ind].apply(lambda x: \ - fwd.loc[fwd.gene_id==x.gene_id, 'start'].min(), axis=1) + fwd = gtf_df.loc[(gtf_df.strand == "+") & (gtf_df.entry_type == "transcript")] + gtf_df.loc[gene_ind, "start"] = gtf_df.loc[gene_ind].apply( + lambda x: fwd.loc[fwd.gene_id == x.gene_id, "start"].min(), axis=1 + ) # rev: replace "stop" of the gene with the maximum of the "stops" # gene_ind = gtf_df.loc[(gtf_df.strand == '-')&(gtf_df.entry_type=='gene')&(gtf_df.tss.notnull())].index.tolist() - genes = gtf_df.loc[(gtf_df.strand == '-')&(gtf_df.entry_type=='transcript')&(gtf_df.tss.notnull())].gene_id.unique().tolist() - gene_ind = gtf_df.loc[(gtf_df.gene_id.isin(genes)&(gtf_df.entry_type=='gene'))].index.tolist() + genes = ( + gtf_df.loc[(gtf_df.strand == "-") & (gtf_df.entry_type == "transcript") & (gtf_df.tss.notnull())] + .gene_id.unique() + .tolist() + ) + gene_ind = gtf_df.loc[(gtf_df.gene_id.isin(genes) & (gtf_df.entry_type == "gene"))].index.tolist() if gene_ind: - rev = gtf_df.loc[(gtf_df.strand == '-')&(gtf_df.entry_type=='transcript')] - gtf_df.loc[gene_ind, 'stop'] = gtf_df.loc[gene_ind].apply(lambda x: \ - rev.loc[rev.gene_id==x.gene_id, 'stop'].max(), axis=1) + rev = gtf_df.loc[(gtf_df.strand == "-") & (gtf_df.entry_type == "transcript")] + gtf_df.loc[gene_ind, "stop"] = gtf_df.loc[gene_ind].apply( + lambda x: rev.loc[rev.gene_id == x.gene_id, "stop"].max(), axis=1 + ) if test: print() - print('After editing') - print(gtf_df[['transcript_id', 'entry_type', 'strand', 'start', 'stop', how]]) + print("After editing") + print(gtf_df[["transcript_id", "entry_type", "strand", "start", "stop", how]]) # cols=['chr', 'source', 'entry_type', \ # 'start', 'stop', 'score', 'strand',\ # 'frame', 'fields'] # gtf_df = gtf_df[cols] - gtf_df['start'] = gtf_df['start'].astype('int') - gtf_df['stop'] = gtf_df['stop'].astype('int') + gtf_df["start"] = gtf_df["start"].astype("int") + gtf_df["stop"] = gtf_df["stop"].astype("int") # if test: # fname = '{}_revised_{}_test.gtf'.format(opref, how) # else: @@ -223,12 +255,13 @@ def replace_gtf_end_coords(gtf_df, ends, how='tes', test=False, verbose=False): # gtf_df.to_csv(fname, sep='\t', header=None, index=False, quoting=csv.QUOTE_NONE) return gtf_df + # return a list of datasets from read_annot file # subset by a list of datasets given from a datasets file -def get_datasets_from_read_annot(df, datasets='all'): - if datasets != 'all': - dataset_df = pd.read_csv(datasets, header=None, names=['dataset']) - dataset_list = dataset_df['dataset'].tolist() +def get_datasets_from_read_annot(df, datasets="all"): + if datasets != "all": + dataset_df = pd.read_csv(datasets, header=None, names=["dataset"]) + dataset_list = dataset_df["dataset"].tolist() for d in dataset_list: if d not in df.dataset.unique().tolist(): raise ValueError("Dataset name {} not found in read_annot".format(d)) @@ -237,7 +270,6 @@ def get_datasets_from_read_annot(df, datasets='all'): def main(): - args = get_args() gtf = args.gtf annot = 
args.annot
@@ -252,9 +284,9 @@ def main():

     # read in read_annot file
     try:
-        df = pd.read_csv(annot, sep='\t')
+        df = pd.read_csv(annot, sep="\t")
     except:
-        raise Error('Problem loading read annot file {}'.format(annot))
+        raise RuntimeError("Problem loading read annot file {}".format(annot))

     # make sure datasets are valid
     # if datasets != 'all':
@@ -263,39 +295,42 @@ def main():
     datasets = get_datasets_from_read_annot(df, datasets)

     # read gtf
-    gtf_df = pd.read_csv(gtf, sep='\t', header=None, \
-                names=['chr', 'source', 'entry_type', \
-                    'start', 'stop', 'score', 'strand',\
-                    'frame', 'fields'], comment='#')
+    gtf_df = pd.read_csv(
+        gtf,
+        sep="\t",
+        header=None,
+        names=["chr", "source", "entry_type", "start", "stop", "score", "strand", "frame", "fields"],
+        comment="#",
+    )

     # get relevant values from fields
-    gtf_df['transcript_id'] = np.nan
-    gtf_df.loc[gtf_df.entry_type!='gene', 'transcript_id'] = gtf_df.loc[gtf_df.entry_type!='gene'].fields.str.split(pat='talon_transcript "', n=1, expand=True)[1]
-    gtf_df.loc[gtf_df.entry_type!='gene', 'transcript_id'] = gtf_df.loc[gtf_df.entry_type!='gene'].transcript_id.str.split(pat='"', n=1, expand=True)[0]
-    gtf_df['gene_id'] = gtf_df.loc[gtf_df.entry_type!='gene'].fields.str.split(pat='talon_gene "', n=1, expand=True)[1]
-    gtf_df['gene_id'] = gtf_df.loc[gtf_df.entry_type!='gene'].gene_id.str.split(pat='"', n=1, expand=True)[0]
+    gtf_df["transcript_id"] = np.nan
+    gtf_df.loc[gtf_df.entry_type != "gene", "transcript_id"] = gtf_df.loc[gtf_df.entry_type != "gene"].fields.str.split(
+        pat='talon_transcript "', n=1, expand=True
+    )[1]
+    gtf_df.loc[gtf_df.entry_type != "gene", "transcript_id"] = gtf_df.loc[
+        gtf_df.entry_type != "gene"
+    ].transcript_id.str.split(pat='"', n=1, expand=True)[0]
+    gtf_df["gene_id"] = gtf_df.loc[gtf_df.entry_type != "gene"].fields.str.split(pat='talon_gene "', n=1, expand=True)[
+        1
+    ]
+    gtf_df["gene_id"] = gtf_df.loc[gtf_df.entry_type != "gene"].gene_id.str.split(pat='"', n=1, expand=True)[0]

     # first, call ends from the read annot file
-    if mode == 'both':
-
+    if mode == "both":
         # tss first
-        ends = get_longest_ends(df, how='tss', novelty=novelty, datasets=datasets)
-        gtf_df = replace_gtf_end_coords(gtf_df, ends,
-            how='tss', verbose=verbose)
+        ends = get_longest_ends(df, how="tss", novelty=novelty, datasets=datasets)
+        gtf_df = replace_gtf_end_coords(gtf_df, ends, how="tss", verbose=verbose)

         # tes
-        ends = get_longest_ends(df, how='tes', novelty=novelty, datasets=datasets)
-        gtf_df = replace_gtf_end_coords(gtf_df, ends,
-            how='tes', verbose=verbose)
+        ends = get_longest_ends(df, how="tes", novelty=novelty, datasets=datasets)
+        gtf_df = replace_gtf_end_coords(gtf_df, ends, how="tes", verbose=verbose)
     else:
         ends = get_longest_ends(df, how=mode, novelty=novelty, datasets=datasets)
-        gtf_df = replace_gtf_end_coords(gtf_df, ends,
-            how=mode, verbose=verbose)
+        gtf_df = replace_gtf_end_coords(gtf_df, ends, how=mode, verbose=verbose)

-    cols=['chr', 'source', 'entry_type', \
-        'start', 'stop', 'score', 'strand',\
-        'frame', 'fields']
+    cols = ["chr", "source", "entry_type", "start", "stop", "score", "strand", "frame", "fields"]
     gtf_df = gtf_df[cols]

-    fname = '{}_revised_{}.gtf'.format(opref, mode)
-    gtf_df.to_csv(fname, sep='\t', header=None, index=False, quoting=csv.QUOTE_NONE)
+    fname = "{}_revised_{}.gtf".format(opref, mode)
+    gtf_df.to_csv(fname, sep="\t", header=None, index=False, quoting=csv.QUOTE_NONE)
diff --git a/src/talon/post/create_GTF_abundance_from_database.py b/src/talon/post/create_GTF_abundance_from_database.py
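[Editor's note] The diff below reformats create_GTF_abundance_from_database.py, a wrapper that drives three other TALON post-processing tools by rebuilding sys.argv and then calling each imported entry point (create_abundance_file_main, filter_transcripts_main, gtf_from_db_main). A minimal sketch of that invocation pattern follows; stand_in_main and the option values are hypothetical placeholders, not TALON code:

    import shlex
    import sys

    def stand_in_main():
        # Hypothetical stand-in for an imported entry point such as
        # create_abundance_file_main; the real TALON mains parse their
        # options from sys.argv after it has been rebuilt below.
        print(sys.argv)

    # Rebuild argv as if the tool had been launched from the command line
    arguments = "--db talon.db --annot gencode_v29 --build hg38 --o my_prefix"
    sys.argv = ["create_abundance_file_from_database.py"] + shlex.split(arguments)
    stand_in_main()

As the script's own TODO notes, passing arguments to a function directly would be cleaner than mutating sys.argv; the sketch only mirrors the script's current approach.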
index 6f25510..3943f1f 100644 --- a/src/talon/post/create_GTF_abundance_from_database.py +++ b/src/talon/post/create_GTF_abundance_from_database.py @@ -2,45 +2,60 @@ # Author: Dana Wyman # ----------------------------------------------------------------------------- # create_GTF_abundance_from_database.py is designed to generate a GTF -# as well as an abundance file with the same filtering options. +# as well as an abundance file with the same filtering options. -from optparse import OptionParser -import subprocess import os -import sys import shlex +import subprocess +import sys +from optparse import OptionParser + from .create_abundance_file_from_database import main as create_abundance_file_main -from .filter_talon_transcripts import main as filter_transcripts_main from .create_GTF_from_database import main as gtf_from_db_main +from .filter_talon_transcripts import main as filter_transcripts_main -parser = OptionParser(description="""A script to generate a GTF and abundance file - with the same filtering options.""") +parser = OptionParser( + description="""A script to generate a GTF and abundance file + with the same filtering options.""" +) -parser.add_option("--db", dest = "database", - help = "TALON database", metavar = "FILE", type = "string") -parser.add_option("--annot", "-a", dest = "annot", - help = """Which annotation version to use. Will determine which +parser.add_option("--db", dest="database", help="TALON database", metavar="FILE", type="string") +parser.add_option( + "--annot", + "-a", + dest="annot", + help="""Which annotation version to use. Will determine which annotation transcripts are considered known or novel relative to. Note: must be in the TALON database.""", - type = "string") -parser.add_option("--build", "-b", dest = "build", - help = "Genome build to use. Note: must be in the TALON database.", - type = "string") -parser.add_option("--filter", dest ="filtering", action='store_true', - help = "If this option is set, the transcripts in the \ + type="string", +) +parser.add_option( + "--build", "-b", dest="build", help="Genome build to use. Note: must be in the TALON database.", type="string" +) +parser.add_option( + "--filter", + dest="filtering", + action="store_true", + help="If this option is set, the transcripts in the \ database will be filtered prior to GTF creation \ - (for more information, see filter_talon_transcripts.py)") -parser.add_option("--pairings", "-p", dest = "pairings_file", - help = """Optional (only relevant if filter = true): A file indicating + (for more information, see filter_talon_transcripts.py)", +) +parser.add_option( + "--pairings", + "-p", + dest="pairings_file", + help="""Optional (only relevant if filter = true): A file indicating which datasets should be considered together when filtering novel transcripts (i.e. biological replicates). Format: Each line of the file constitutes a group, with member datasets separated by commas. 
If no file is provided, then novel transcripts appearing in any two datasets will be accepted.""", - metavar = "FILE", type = "string", default = None) -parser.add_option("--o", dest = "outprefix", help = "Prefix for output file", - metavar = "FILE", type = "string") + metavar="FILE", + type="string", + default=None, +) +parser.add_option("--o", dest="outprefix", help="Prefix for output file", metavar="FILE", type="string") (opt, args) = parser.parse_args() @@ -51,47 +66,43 @@ # make abundance file db = opt.database annot = opt.annot -build = opt.build +build = opt.build o = opt.outprefix filtering = opt.filtering pairings = opt.pairings_file -create_abundance_file_arguments = ( - "--db {} --annot {} --build {} --o {}".format(tpath, db, annot, build, o)) +create_abundance_file_arguments = "--db {} --annot {} --build {} --o {}".format(tpath, db, annot, build, o) if filtering: - create_abundance_file_arguments+= ' --filter' - if pairings != None: - create_abundance_file_arguments+= ' --pairings {}'.format(pairings) + create_abundance_file_arguments += " --filter" + if pairings != None: + create_abundance_file_arguments += " --pairings {}".format(pairings) # TODO: Call a function with argument instead of using argv and calling main -sys.argv = ["create_abundance_file_from_database.py"] + shlex.split( - create_abundance_file_arguments) +sys.argv = ["create_abundance_file_from_database.py"] + shlex.split(create_abundance_file_arguments) create_abundance_file_main() # make whitelist file for GTF if filtering: - outfile = o+'_whitelist' - filter_arguments = "--db {} --annot {} --o {}".format( - tpath, db, annot, outfile) - if pairings != None: - filter_arguments+=' --pairings {}'.format(pairings) - sys.argv = ["filter_talon_transcripts.py"] + shlex.split(filter_arguments) - filter_transcripts_main() + outfile = o + "_whitelist" + filter_arguments = "--db {} --annot {} --o {}".format(tpath, db, annot, outfile) + if pairings != None: + filter_arguments += " --pairings {}".format(pairings) + sys.argv = ["filter_talon_transcripts.py"] + shlex.split(filter_arguments) + filter_transcripts_main() # make GTF -gtf_from_db_arguments = "--db {} --build {} --annot {} --o {}".format( - tpath, db, build, annot, o) +gtf_from_db_arguments = "--db {} --build {} --annot {} --o {}".format(tpath, db, build, annot, o) if filtering: - gtf_from_db_arguments +=' --whitelist {}'.format(outfile) + gtf_from_db_arguments += " --whitelist {}".format(outfile) else: - gtf_from_db_arguments +=' --observed' - # pfile = open(pairings, 'r') - # pairing_str = pfile.read() - # pfile.close() - # pairing_str.replace(',', '\n') - # ofile = o+'_datasets' - # ofile = open(ofile, 'w') - # ofile.write(pairing_str) - # cmd+=' --datasets {}'.format(o+'_datasets') + gtf_from_db_arguments += " --observed" + # pfile = open(pairings, 'r') + # pairing_str = pfile.read() + # pfile.close() + # pairing_str.replace(',', '\n') + # ofile = o+'_datasets' + # ofile = open(ofile, 'w') + # ofile.write(pairing_str) + # cmd+=' --datasets {}'.format(o+'_datasets') sys.argv = ["create_GTF_from_database.py"] + shlex.split(gtf_from_db_arguments) gtf_from_db_main() diff --git a/src/talon/post/create_GTF_from_database.py b/src/talon/post/create_GTF_from_database.py index 922398a..b72af34 100644 --- a/src/talon/post/create_GTF_from_database.py +++ b/src/talon/post/create_GTF_from_database.py @@ -7,76 +7,94 @@ import copy import itertools import operator -from optparse import OptionParser import sqlite3 - -from . import post_utils as putils -from . 
import ab_utils as autils +from optparse import OptionParser from pathlib import Path from .. import query_utils as qutils +from . import ab_utils as autils +from . import post_utils as putils + def getOptions(): parser = OptionParser() - parser.add_option("--db", dest = "database", - help = "TALON database", metavar = "FILE", type = "string") + parser.add_option("--db", dest="database", help="TALON database", metavar="FILE", type="string") - parser.add_option("--build", "-b", dest = "build", - help = "Genome build to use. Note: must be in the TALON database.", - type = "string") + parser.add_option( + "--build", "-b", dest="build", help="Genome build to use. Note: must be in the TALON database.", type="string" + ) - parser.add_option("--annot", "-a", dest = "annot", - help = """Which annotation version to use. Will determine which + parser.add_option( + "--annot", + "-a", + dest="annot", + help="""Which annotation version to use. Will determine which annotation transcripts are considered known or novel relative to. Note: must be in the TALON database.""", - type = "string") + type="string", + ) - parser.add_option("--whitelist", dest = "whitelist", - help = "Whitelist file of transcripts to include in the \ + parser.add_option( + "--whitelist", + dest="whitelist", + help="Whitelist file of transcripts to include in the \ output. First column should be TALON gene ID, \ second column should be TALON transcript ID", - metavar = "FILE", type = "string", default = None) - - parser.add_option("--observed", dest ="observed", action='store_true', - help = "If this option is set, the GTF file will only \ + metavar="FILE", + type="string", + default=None, + ) + + parser.add_option( + "--observed", + dest="observed", + action="store_true", + help="If this option is set, the GTF file will only \ include transcripts that were observed in at least one \ - dataset (redundant if dataset file provided).") - - parser.add_option("--datasets", "-d", dest = "datasets_file", - help = """Optional: A file indicating which datasets should be + dataset (redundant if dataset file provided).", + ) + + parser.add_option( + "--datasets", + "-d", + dest="datasets_file", + help="""Optional: A file indicating which datasets should be included (one dataset name per line). Default is to include all datasets.""", - metavar = "FILE", type = "string", default = None) - - parser.add_option("--o", dest = "outprefix", help = "Prefix for output GTF", - metavar = "FILE", type = "string") + metavar="FILE", + type="string", + default=None, + ) + parser.add_option("--o", dest="outprefix", help="Prefix for output GTF", metavar="FILE", type="string") (options, args) = parser.parse_args() return options + def create_outname(options): - """ Creates filename for the output GTF that reflects the input options that - were used. """ + """Creates filename for the output GTF that reflects the input options that + were used.""" outname = options.outprefix + "_talon" if options.observed == True: - outname = "_".join([ outname, "observedOnly" ]) + outname = "_".join([outname, "observedOnly"]) outname += ".gtf" return outname -def get_annotations(database, feat_type, annot, whitelist = None): - """ Extracts annotations from the gene/transcript/exon annotation table of - the database (depending on choice of feat_type). Limited to rows where - the annot_name column matches the value of annot. 
- Returns: - annotation_dict: dictionary data structure in which the keys are - gene/transcript/exon TALON IDs (depending on - choice of feat_type) and the value is a list of - annotation tuples. +def get_annotations(database, feat_type, annot, whitelist=None): + """Extracts annotations from the gene/transcript/exon annotation table of + the database (depending on choice of feat_type). Limited to rows where + the annot_name column matches the value of annot. + + Returns: + annotation_dict: dictionary data structure in which the keys are + gene/transcript/exon TALON IDs (depending on + choice of feat_type) and the value is a list of + annotation tuples. """ # Fetch the annotations conn = sqlite3.connect(database) @@ -85,12 +103,17 @@ def get_annotations(database, feat_type, annot, whitelist = None): table_name = feat_type + "_annotations" if whitelist == None: - query = "SELECT * FROM " + table_name + " WHERE annot_name = '" + annot + \ - "' OR source = 'TALON'" + query = "SELECT * FROM " + table_name + " WHERE annot_name = '" + annot + "' OR source = 'TALON'" else: - whitelist_string = "(" + ','.join([str(x) for x in whitelist]) + ")" - query = "SELECT * FROM " + table_name + " WHERE (annot_name = '" + annot + \ - "' OR source = 'TALON') AND ID IN " + whitelist_string + whitelist_string = "(" + ",".join([str(x) for x in whitelist]) + ")" + query = ( + "SELECT * FROM " + + table_name + + " WHERE (annot_name = '" + + annot + + "' OR source = 'TALON') AND ID IN " + + whitelist_string + ) cursor.execute(query) annotation_tuples = cursor.fetchall() @@ -100,29 +123,31 @@ def get_annotations(database, feat_type, annot, whitelist = None): # Group by ID and store in a dictionary ID_groups = {} - for key,group in itertools.groupby(sorted_annotations,operator.itemgetter(0)): + for key, group in itertools.groupby(sorted_annotations, operator.itemgetter(0)): ID_groups[key] = list(group) return ID_groups + def get_gene_2_transcripts(database, genome_build, whitelist): - """ Creates a dictionary mapping gene IDs to the transcripts that belong to - them. The columns in each tuple are: - 0: gene ID - 1: transcript ID - 2: chromosome - 3: start position (min of 5' and 3') - 4: end position (max of 5' and 3') - 5: strand - 6: edge path - 7. n_exons - """ + """Creates a dictionary mapping gene IDs to the transcripts that belong to + them. The columns in each tuple are: + 0: gene ID + 1: transcript ID + 2: chromosome + 3: start position (min of 5' and 3') + 4: end position (max of 5' and 3') + 5: strand + 6: edge path + 7. 
n_exons + """ conn = sqlite3.connect(database) conn.row_factory = sqlite3.Row cursor = conn.cursor() - whitelist_string = "(" + ','.join([str(x) for x in whitelist]) + ")" - query = """ + whitelist_string = "(" + ",".join([str(x) for x in whitelist]) + ")" + query = ( + """ SELECT t.gene_ID, t.transcript_ID, @@ -138,9 +163,14 @@ def get_gene_2_transcripts(database, genome_build, whitelist): LEFT JOIN location loc1 ON t.start_vertex = loc1.location_ID LEFT JOIN location loc2 ON t.end_vertex = loc2.location_ID LEFT JOIN genes ON t.gene_ID = genes.gene_ID - WHERE loc1.genome_build = '""" + genome_build + """' AND - loc2.genome_build = '""" + genome_build + \ - """' AND t.transcript_ID IN """ + whitelist_string + WHERE loc1.genome_build = '""" + + genome_build + + """' AND + loc2.genome_build = '""" + + genome_build + + """' AND t.transcript_ID IN """ + + whitelist_string + ) cursor.execute(query) transcript_tuples = cursor.fetchall() @@ -148,21 +178,23 @@ def get_gene_2_transcripts(database, genome_build, whitelist): sorted_transcript_tuples = sorted(transcript_tuples, key=lambda x: x["gene_ID"]) gene_groups = {} - for key,group in itertools.groupby(sorted_transcript_tuples,operator.itemgetter(0)): + for key, group in itertools.groupby(sorted_transcript_tuples, operator.itemgetter(0)): # Sort by transcript start position gene_groups[key] = sorted(list(group), key=lambda x: x["min_pos"]) conn.close() return gene_groups + def fetch_exon_locations(database, genome_build): - """ Queries the database to create a dictionary mapping exon IDs to - the chromosome, start, end, and strand of the exon """ + """Queries the database to create a dictionary mapping exon IDs to + the chromosome, start, end, and strand of the exon""" conn = sqlite3.connect(database) cursor = conn.cursor() - query = """ + query = ( + """ SELECT e.edge_ID, loc1.chromosome, @@ -172,9 +204,13 @@ def fetch_exon_locations(database, genome_build): FROM edge e LEFT JOIN location loc1 ON e.v1 = loc1.location_ID LEFT JOIN location loc2 ON e.v2 = loc2.location_ID - WHERE loc1.genome_build = '""" + genome_build + """' AND - loc2.genome_build = '""" + genome_build + \ - """' AND e.edge_type = 'exon';""" + WHERE loc1.genome_build = '""" + + genome_build + + """' AND + loc2.genome_build = '""" + + genome_build + + """' AND e.edge_type = 'exon';""" + ) cursor.execute(query) exon_location_tuples = cursor.fetchall() @@ -188,34 +224,30 @@ def fetch_exon_locations(database, genome_build): conn.close() return exon_locations -def create_gtf(database, annot, genome_build, whitelist, outfile): +def create_gtf(database, annot, genome_build, whitelist, outfile): # Create separate gene and transcript whitelists gene_whitelist = [] transcript_whitelist = [] - for key,group in itertools.groupby(whitelist,operator.itemgetter(0)): + for key, group in itertools.groupby(whitelist, operator.itemgetter(0)): gene_whitelist.append(key) for id_tuple in list(group): transcript_whitelist.append(id_tuple[1]) # Get gene, transcript, and exon annotations - gene_annotations = get_annotations(database, "gene", annot, - whitelist = gene_whitelist) - transcript_annotations = get_annotations(database, "transcript", annot, - whitelist = transcript_whitelist) + gene_annotations = get_annotations(database, "gene", annot, whitelist=gene_whitelist) + transcript_annotations = get_annotations(database, "transcript", annot, whitelist=transcript_whitelist) exon_annotations = get_annotations(database, "exon", annot) - # Get transcript data from the database - gene_2_transcripts = 
get_gene_2_transcripts(database, genome_build, - transcript_whitelist) + gene_2_transcripts = get_gene_2_transcripts(database, genome_build, transcript_whitelist) # Get exon location info from database exon_ID_2_location = fetch_exon_locations(database, genome_build) # ------------------------------------------------------------- - o = open(outfile, 'w') + o = open(outfile, "w") # Create a GTF entry for every gene for gene_ID, transcript_tuples in gene_2_transcripts.items(): @@ -225,8 +257,7 @@ def create_gtf(database, annot, genome_build, whitelist, outfile): attribute = annot[3] value = annot[4] gene_annotation_dict[attribute] = value - gene_GTF_line = get_gene_GTF_entry(gene_ID, transcript_tuples, - copy.copy(gene_annotation_dict)) + gene_GTF_line = get_gene_GTF_entry(gene_ID, transcript_tuples, copy.copy(gene_annotation_dict)) o.write(gene_GTF_line + "\n") # Create a GTF entry for every transcript of this gene @@ -239,14 +270,16 @@ def create_gtf(database, annot, genome_build, whitelist, outfile): attribute = annot[3] value = annot[4] transcript_annotation_dict[attribute] = value - transcript_GTF_line = get_transcript_GTF_entry(transcript_entry, - copy.copy(gene_annotation_dict), - copy.copy(transcript_annotation_dict)) + transcript_GTF_line = get_transcript_GTF_entry( + transcript_entry, copy.copy(gene_annotation_dict), copy.copy(transcript_annotation_dict) + ) o.write(transcript_GTF_line + "\n") if transcript_entry["n_exons"] != 1: - transcript_edges = [str(transcript_entry["start_exon"])] + \ - str(transcript_entry["jn_path"]).split(",")+ \ - [str(transcript_entry["end_exon"])] + transcript_edges = ( + [str(transcript_entry["start_exon"])] + + str(transcript_entry["jn_path"]).split(",") + + [str(transcript_entry["end_exon"])] + ) else: transcript_edges = [transcript_entry["start_exon"]] @@ -262,30 +295,35 @@ def create_gtf(database, annot, genome_build, whitelist, outfile): value = annot[4] exon_annotation_dict[attribute] = value - - exon_GTF_line = get_exon_GTF_entry(gene_ID, transcript_ID, - exon_ID, exon_num, - exon_ID_2_location, - copy.copy(gene_annotation_dict), - copy.copy(transcript_annotation_dict), - exon_annotation_dict) + exon_GTF_line = get_exon_GTF_entry( + gene_ID, + transcript_ID, + exon_ID, + exon_num, + exon_ID_2_location, + copy.copy(gene_annotation_dict), + copy.copy(transcript_annotation_dict), + exon_annotation_dict, + ) o.write(exon_GTF_line + "\n") exon_num += 1 o.close() return + def make_descriptor_string(attribute, value): - """ Create a key-value string to form part of a GTF entry. - Example: gene_id and ENSG00000117676.13 - becomes - gene_id "ENSG00000117676.13"; + """Create a key-value string to form part of a GTF entry. 
+ Example: gene_id and ENSG00000117676.13 + becomes + gene_id "ENSG00000117676.13"; """ return str(attribute) + ' "' + str(value) + '";' + def format_GTF_tag_values_for_gene(gene_ID, annotation_dict): - """ Parses the annotations for this gene, and supplements them where - necessary for novel transcripts """ + """Parses the annotations for this gene, and supplements them where + necessary for novel transcripts""" attributes = [] @@ -323,15 +361,15 @@ def format_GTF_tag_values_for_gene(gene_ID, annotation_dict): attributes.append(make_descriptor_string("talon_gene", gene_ID)) # Add any remaining annotations - for attribute,value in sorted(annotation_dict.items()): + for attribute, value in sorted(annotation_dict.items()): attributes.append(make_descriptor_string(attribute, value)) return attributes -def format_GTF_tag_values_for_transcript(gene_ID, transcript_ID, gene_annot_dict, - transcript_annot_dict): - """ Parses the annotations for this transcript, and supplements them where - necessary for novel transcripts """ + +def format_GTF_tag_values_for_transcript(gene_ID, transcript_ID, gene_annot_dict, transcript_annot_dict): + """Parses the annotations for this transcript, and supplements them where + necessary for novel transcripts""" attributes = [] @@ -388,16 +426,17 @@ def format_GTF_tag_values_for_transcript(gene_ID, transcript_ID, gene_annot_dict attributes.append(make_descriptor_string("talon_transcript", transcript_ID)) # Add any remaining annotations - for attribute,value in sorted(transcript_annot_dict.items()): + for attribute, value in sorted(transcript_annot_dict.items()): attributes.append(make_descriptor_string(attribute, value)) return attributes -def format_GTF_tag_values_for_exon(gene_ID, transcript_ID, exon_ID, exon_number, - gene_annot_dict, transcript_annot_dict, - exon_annot_dict): - """ Parses the annotations for this exon, and supplements them where - necessary for novel exons """ + +def format_GTF_tag_values_for_exon( + gene_ID, transcript_ID, exon_ID, exon_number, gene_annot_dict, transcript_annot_dict, exon_annot_dict +): + """Parses the annotations for this exon, and supplements them where + necessary for novel exons""" attributes = [] @@ -469,13 +508,14 @@ def format_GTF_tag_values_for_exon(gene_ID, transcript_ID, exon_ID, exon_number, exon_annot_dict.pop("exon_number") # Add any remaining annotations - for attribute,value in sorted(exon_annot_dict.items()): + for attribute, value in sorted(exon_annot_dict.items()): attributes.append(make_descriptor_string(attribute, value)) return attributes + def get_gene_GTF_entry(gene_ID, associated_transcript_tuples, annotation_dict): - """ Creates a GTF annotation entry for the given gene """ + """Creates a GTF annotation entry for the given gene""" if "source" in annotation_dict: source = annotation_dict["source"] @@ -492,13 +532,12 @@ def get_gene_GTF_entry(gene_ID, associated_transcript_tuples, annotation_dict): frame = "." 
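# [Editor's sketch -- illustrative only, not part of the patch] Below, the
# attribute descriptors assembled above are space-joined, and the nine GTF
# columns (chromosome through attributes) are then tab-joined into a single
# gene entry. With assumed example values, the end result looks like:
example_fields = ["chr1", "TALON", "gene", "100", "900", ".", "+", ".",
                  'gene_id "ENSG00000117676.13"; talon_gene "1";']
example_gtf_line = "\t".join(example_fields)  # one syntactically valid GTF gene line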
attributes = " ".join(format_GTF_tag_values_for_gene(gene_ID, annotation_dict)) - GTF = '\t'.join([chromosome, source, feature, start, end, score, strand, - frame, attributes]) + GTF = "\t".join([chromosome, source, feature, start, end, score, strand, frame, attributes]) return GTF def get_transcript_GTF_entry(transcript_entry, curr_gene_annot_dict, curr_transcript_annot_dict): - """ Creates a GTF annotation entry for the given transcript """ + """Creates a GTF annotation entry for the given transcript""" if "source" in curr_transcript_annot_dict: source = curr_transcript_annot_dict["source"] @@ -516,19 +555,25 @@ def get_transcript_GTF_entry(transcript_entry, curr_gene_annot_dict, curr_transc score = "." strand = transcript_entry["strand"] frame = "." - attributes = " ".join(format_GTF_tag_values_for_transcript(gene_ID, - transcript_ID, - curr_gene_annot_dict, - curr_transcript_annot_dict)) + attributes = " ".join( + format_GTF_tag_values_for_transcript(gene_ID, transcript_ID, curr_gene_annot_dict, curr_transcript_annot_dict) + ) - GTF = '\t'.join([chromosome, source, feature, start, end, score, strand, - frame, attributes]) + GTF = "\t".join([chromosome, source, feature, start, end, score, strand, frame, attributes]) return GTF -def get_exon_GTF_entry(gene_ID, transcript_ID, exon_ID, exon_num, exon_ID_2_location, - curr_gene_annot_dict, curr_transcript_annot_dict, - curr_exon_annot_dict): - """ Creates a GTF annotation entry for the given exon """ + +def get_exon_GTF_entry( + gene_ID, + transcript_ID, + exon_ID, + exon_num, + exon_ID_2_location, + curr_gene_annot_dict, + curr_transcript_annot_dict, + curr_exon_annot_dict, +): + """Creates a GTF annotation entry for the given exon""" if "source" in curr_exon_annot_dict: source = curr_exon_annot_dict["source"] @@ -543,17 +588,22 @@ def get_exon_GTF_entry(gene_ID, transcript_ID, exon_ID, exon_num, exon_ID_2_loca score = "." strand = curr_exon_location[3] frame = "." - attributes = " ".join(format_GTF_tag_values_for_exon(gene_ID, - transcript_ID, - exon_ID, exon_num, - curr_gene_annot_dict, - curr_transcript_annot_dict, - curr_exon_annot_dict)) - - GTF = '\t'.join([chromosome, source, feature, start, end, score, strand, - frame, attributes]) + attributes = " ".join( + format_GTF_tag_values_for_exon( + gene_ID, + transcript_ID, + exon_ID, + exon_num, + curr_gene_annot_dict, + curr_transcript_annot_dict, + curr_exon_annot_dict, + ) + ) + + GTF = "\t".join([chromosome, source, feature, start, end, score, strand, frame, attributes]) return GTF + # def check_annot_validity(annot, database): # """ Make sure that the user has entered a correct annotation name """ # @@ -605,6 +655,7 @@ def get_exon_GTF_entry(gene_ID, transcript_ID, exon_ID, exon_num, exon_ID_2_loca # # return + def main(): options = getOptions() database = options.database @@ -622,18 +673,13 @@ def main(): if not Path(database).exists(): raise ValueError("Database file '%s' does not exist!" 
% database) - # Determine which transcripts to include - whitelist = putils.handle_filtering(database, - annot, - observed, - whitelist_file, - dataset_file) + whitelist = putils.handle_filtering(database, annot, observed, whitelist_file, dataset_file) # Sort on gene ID sorted_whitelist = sorted(whitelist, key=lambda x: x[0]) create_gtf(database, annot, build, whitelist, outfile) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/src/talon/post/create_abundance_file_from_database.py b/src/talon/post/create_abundance_file_from_database.py index 73a3b39..1447d20 100644 --- a/src/talon/post/create_abundance_file_from_database.py +++ b/src/talon/post/create_abundance_file_from_database.py @@ -5,67 +5,81 @@ # for each transcript in the TALON database across datasets. Modified by # filtering option. -import sqlite3 import itertools import operator +import sqlite3 from optparse import OptionParser from pathlib import Path -from . import filter_talon_transcripts as filt from .. import dstruct as dstruct from .. import length_utils as lu -from . import post_utils as putils -from . import ab_utils as autils from .. import query_utils as qutils from .. import talon as talon +from . import ab_utils as autils +from . import filter_talon_transcripts as filt +from . import post_utils as putils def getOptions(): parser = OptionParser() - parser.add_option("--db", dest = "database", - help = "TALON database", metavar = "FILE", type = "string") + parser.add_option("--db", dest="database", help="TALON database", metavar="FILE", type="string") - parser.add_option("--annot", "-a", dest = "annot", - help = """Which annotation version to use. Will determine which + parser.add_option( + "--annot", + "-a", + dest="annot", + help="""Which annotation version to use. Will determine which annotation transcripts are considered known or novel relative to. Note: must be in the TALON database.""", - type = "string") + type="string", + ) - parser.add_option("--whitelist", dest = "whitelist", - help = "Whitelist file of transcripts to include in the \ + parser.add_option( + "--whitelist", + dest="whitelist", + help="Whitelist file of transcripts to include in the \ output. First column should be TALON gene ID, \ second column should be TALON transcript ID", - metavar = "FILE", type = "string", default = None) - - parser.add_option("--build", "-b", dest = "build", - help = "Genome build to use. Note: must be in the TALON database.", - type = "string") - - parser.add_option("--datasets", "-d", dest = "datasets_file", - help = """Optional: A file indicating which datasets should be + metavar="FILE", + type="string", + default=None, + ) + + parser.add_option( + "--build", "-b", dest="build", help="Genome build to use. Note: must be in the TALON database.", type="string" + ) + + parser.add_option( + "--datasets", + "-d", + dest="datasets_file", + help="""Optional: A file indicating which datasets should be included (one dataset name per line). Default is to include all datasets.""", - metavar = "FILE", type = "string", default = None) - - parser.add_option("--o", dest = "outprefix", help = "Prefix for output file", - metavar = "FILE", type = "string") + metavar="FILE", + type="string", + default=None, + ) + parser.add_option("--o", dest="outprefix", help="Prefix for output file", metavar="FILE", type="string") (options, args) = parser.parse_args() return options + def create_outname(options): - """ Creates filename for the output abundance that reflects the input options that - were used. 
""" + """Creates filename for the output abundance that reflects the input options that + were used.""" outname = options.outprefix + "_talon_abundance" if options.whitelist != None: - outname = "_".join([ outname, "filtered" ]) + outname = "_".join([outname, "filtered"]) outname += ".tsv" return outname + # def fetch_dataset_list(dataset_file, database): # """ Gets a list of all datasets in the database """ # @@ -90,10 +104,11 @@ def create_outname(options): # # return datasets + def create_abundance_dict(database, datasets): """Process the abundance table by dataset in order to create a dictionary - data structure organized like this: - transcript_ID -> dataset -> abundance in that dataset + data structure organized like this: + transcript_ID -> dataset -> abundance in that dataset """ abundance = {} @@ -102,8 +117,11 @@ def create_abundance_dict(database, datasets): cursor = conn.cursor() for dataset in datasets: - query = """ SELECT transcript_ID, count FROM abundance - WHERE dataset = '%s' """ % dataset + query = ( + """ SELECT transcript_ID, count FROM abundance + WHERE dataset = '%s' """ + % dataset + ) cursor.execute(query) for transcript in cursor.fetchall(): @@ -119,18 +137,19 @@ def create_abundance_dict(database, datasets): conn.close() return abundance + def fetch_abundances(database, datasets, annot, whitelist): """Constructs a query to get the following information for every - whitelisted transcript: - 1) TALON gene ID - 2) TALON transcript ID - 3) Gene ID (from annotation specified in 'annot', None otherwise) - 4) Transcript ID (from annotation specified in 'annot', None otherwise) - 5) Gene name (from annotation specified in 'annot', None otherwise) - 6) Transcript name (from annotation specified in 'annot', None otherwise) - 7) number of exons in transcript - - Returns a list of tuples (one tuple per transcript) + whitelisted transcript: + 1) TALON gene ID + 2) TALON transcript ID + 3) Gene ID (from annotation specified in 'annot', None otherwise) + 4) Transcript ID (from annotation specified in 'annot', None otherwise) + 5) Gene name (from annotation specified in 'annot', None otherwise) + 6) Transcript name (from annotation specified in 'annot', None otherwise) + 7) number of exons in transcript + + Returns a list of tuples (one tuple per transcript) """ # datasets = fetch_dataset_list(database) @@ -149,7 +168,7 @@ def fetch_abundances(database, datasets, annot, whitelist): conn.row_factory = sqlite3.Row cursor = conn.cursor() - whitelist_string = "WHERE t.transcript_ID IN (" + ','.join(whitelist) + ");" + whitelist_string = "WHERE t.transcript_ID IN (" + ",".join(whitelist) + ");" name_status_query = """ FROM transcripts t @@ -165,7 +184,12 @@ def fetch_abundances(database, datasets, annot, whitelist): LEFT JOIN transcript_annotations ta_name ON t.transcript_ID = ta_name.ID AND ta_name.annot_name = '%s' AND ta_name.attribute = 'transcript_name' - """ % (annot, annot, annot, annot) + """ % ( + annot, + annot, + annot, + annot, + ) full_query = "\n".join([col_query, name_status_query, whitelist_string]) @@ -201,11 +225,11 @@ def fetch_abundances(database, datasets, annot, whitelist): return final_abundance, colnames -def write_abundance_file(abundances, col_names, prefix, n_places, datasets, - novelty_types, transcript_lengths, outfile): - """ Writes abundances and metadata to an output file """ - o = open(outfile, 'w') +def write_abundance_file(abundances, col_names, prefix, n_places, datasets, novelty_types, transcript_lengths, outfile): + """Writes abundances and 
metadata to an output file""" + + o = open(outfile, "w") novelty_type_cols = ["gene_novelty", "transcript_novelty", "ISM_subtype"] @@ -224,22 +248,24 @@ def write_abundance_file(abundances, col_names, prefix, n_places, datasets, annot_transcript_ID_index = all_colnames.index("annot_transcript_id") gene_name_index = all_colnames.index("annot_gene_name") transcript_name_index = all_colnames.index("annot_transcript_name") - dataset_indices = [i for i,s in enumerate(all_colnames) if s in set(datasets)] + dataset_indices = [i for i, s in enumerate(all_colnames) if s in set(datasets)] # Iterate over abundances, fixing Nones, and write to file for transcript in abundances: - curr_novelty = get_gene_and_transcript_novelty_types(transcript[gene_ID_index], - transcript[transcript_ID_index], - novelty_types) + curr_novelty = get_gene_and_transcript_novelty_types( + transcript[gene_ID_index], transcript[transcript_ID_index], novelty_types + ) transcript = list(transcript) - transcript = transcript[0:first_dataset_index] + \ - [transcript_lengths[transcript[transcript_ID_index]]] + \ - [ curr_novelty[x] for x in novelty_type_cols] + \ - transcript[first_dataset_index:] + transcript = ( + transcript[0:first_dataset_index] + + [transcript_lengths[transcript[transcript_ID_index]]] + + [curr_novelty[x] for x in novelty_type_cols] + + transcript[first_dataset_index:] + ) - alt_gene_name, alt_transcript_name = talon.construct_names(transcript[gene_ID_index], \ - transcript[transcript_ID_index], \ - prefix, n_places) + alt_gene_name, alt_transcript_name = talon.construct_names( + transcript[gene_ID_index], transcript[transcript_ID_index], prefix, n_places + ) if transcript[annot_gene_ID_index] == None: transcript[annot_gene_ID_index] = alt_gene_name @@ -263,8 +289,8 @@ def write_abundance_file(abundances, col_names, prefix, n_places, datasets, def get_gene_and_transcript_novelty_types(gene_ID, transcript_ID, novelty_type): - """ Look up gene and transcript IDs in data structure to determine which types - of novelty are present """ + """Look up gene and transcript IDs in data structure to determine which types + of novelty are present""" curr_novel = {} @@ -297,8 +323,7 @@ def get_gene_and_transcript_novelty_types(gene_ID, transcript_ID, novelty_type): print("Warning: Could not locate novelty type for transcript %s" % transcript_ID) # Look for ISM subtype - if transcript_ID in novelty_type.ISM_prefix and \ - transcript_ID in novelty_type.ISM_suffix: + if transcript_ID in novelty_type.ISM_prefix and transcript_ID in novelty_type.ISM_suffix: curr_novel["ISM_subtype"] = "Both" elif transcript_ID in novelty_type.ISM_prefix: curr_novel["ISM_subtype"] = "Prefix" @@ -309,6 +334,7 @@ def get_gene_and_transcript_novelty_types(gene_ID, transcript_ID, novelty_type): return curr_novel + # def check_annot_validity(annot, database): # """ Make sure that the user has entered a correct annotation name """ # @@ -360,9 +386,10 @@ def get_gene_and_transcript_novelty_types(gene_ID, transcript_ID, novelty_type): # # return + def make_novelty_type_struct(database, datasets): - """ Create a data structure where it is possible to look up whether a gene - or transcript belongs to a particular category of novelty""" + """Create a data structure where it is possible to look up whether a gene + or transcript belongs to a particular category of novelty""" conn = sqlite3.connect(database) conn.row_factory = sqlite3.Row @@ -385,6 +412,7 @@ def make_novelty_type_struct(database, datasets): conn.close() return novelty_type + # def 
fetch_naming_prefix(database): # """ Get naming prefix from the database run_info table """ # conn = sqlite3.connect(database) @@ -448,15 +476,11 @@ def main(): autils.check_build_validity(build, database) # Determine which transcripts to include - whitelist = putils.handle_filtering(database, - annot, - False, - whitelist_file, - dataset_file) + whitelist = putils.handle_filtering(database, annot, False, whitelist_file, dataset_file) # create transcript whitelist transcript_whitelist = [] - for key,group in itertools.groupby(whitelist,operator.itemgetter(0)): + for key, group in itertools.groupby(whitelist, operator.itemgetter(0)): for id_tuple in list(group): transcript_whitelist.append(str(id_tuple[1])) @@ -471,5 +495,6 @@ def main(): n_places = autils.fetch_n_places(database) write_abundance_file(abundances, colnames, prefix, n_places, datasets, novelty_type, transcript_lengths, outfile) -if __name__ == '__main__': + +if __name__ == "__main__": main() diff --git a/src/talon/post/create_anndata_from_database.py b/src/talon/post/create_anndata_from_database.py index f3ebca3..2c96c76 100644 --- a/src/talon/post/create_anndata_from_database.py +++ b/src/talon/post/create_anndata_from_database.py @@ -4,60 +4,76 @@ # create_anndata_from_database.py is a utility that outputs the abundance # for each transcript in the TALON database across datasets in AnnData format. -import sqlite3 import itertools import operator +import sqlite3 from optparse import OptionParser from pathlib import Path -import scanpy + +import anndata import numpy as np import pandas as pd -import anndata +import scanpy from scipy.sparse import csr_matrix - -from . import filter_talon_transcripts as filt from .. import dstruct as dstruct from .. import length_utils as lu -from . import post_utils as putils -from . import ab_utils as autils from .. import query_utils as qutils from .. import talon as talon +from . import ab_utils as autils +from . import filter_talon_transcripts as filt +from . import post_utils as putils def getOptions(): parser = OptionParser() - parser.add_option("--db", dest = "database", - help = "TALON database", metavar = "FILE", type = "string") - parser.add_option("--annot", "-a", dest = "annot", - help = """Which annotation version to use. Will determine which + parser.add_option("--db", dest="database", help="TALON database", metavar="FILE", type="string") + parser.add_option( + "--annot", + "-a", + dest="annot", + help="""Which annotation version to use. Will determine which annotation transcripts are considered known or novel relative to. Note: must be in the TALON database.""", - type = "string") - parser.add_option("--pass_list", dest = "pass_list", - help = "Pass list file of transcripts to include in the \ + type="string", + ) + parser.add_option( + "--pass_list", + dest="pass_list", + help="Pass list file of transcripts to include in the \ output. First column should be TALON gene ID, \ second column should be TALON transcript ID", - metavar = "FILE", type = "string", default = None) - parser.add_option("--build", "-b", dest = "build", - help = "Genome build to use. 
Note: must be in the TALON database.", - type = "string") - parser.add_option('--gene', dest='gene_level', - help='Output AnnData on the gene level rather than the transcript', - action='store_true') - parser.add_option("--datasets", "-d", dest = "dataset_file", - help = """Optional: A file indicating which datasets should be + metavar="FILE", + type="string", + default=None, + ) + parser.add_option( + "--build", "-b", dest="build", help="Genome build to use. Note: must be in the TALON database.", type="string" + ) + parser.add_option( + "--gene", + dest="gene_level", + help="Output AnnData on the gene level rather than the transcript", + action="store_true", + ) + parser.add_option( + "--datasets", + "-d", + dest="dataset_file", + help="""Optional: A file indicating which datasets should be included (one dataset name per line). Default is to include all datasets.""", - metavar = "FILE", type = "string", default = None) - parser.add_option("--o", dest = "ofile", help = "Output file name", - metavar = "FILE", type = "string") - + metavar="FILE", + type="string", + default=None, + ) + parser.add_option("--o", dest="ofile", help="Output file name", metavar="FILE", type="string") (options, args) = parser.parse_args() return options + def assign_novelties(df, d, order, how): """ Assign novelty types based on a priority order @@ -76,17 +92,17 @@ def assign_novelties(df, d, order, how): df (pandas DataFrame): DataFrame indexed by gene / transcript ID with novelty information """ - if how == 'gene': - nov_col = 'gene_novelty' + if how == "gene": + nov_col = "gene_novelty" cols = [nov_col] - elif how == 'transcript': - nov_col = 'transcript_novelty' - cols = [nov_col, 'ISM_subtype'] + elif how == "transcript": + nov_col = "transcript_novelty" + cols = [nov_col, "ISM_subtype"] # assign gene or transcript novelty - df = df.pivot(index='ID', columns=['attribute'], values=['value']) + df = df.pivot(index="ID", columns=["attribute"], values=["value"]) df = df.droplevel(0, axis=1) - df.columns.name = '' + df.columns.name = "" for key, value in d.items(): df[key] = False @@ -94,20 +110,20 @@ def assign_novelties(df, d, order, how): # in cases where we're filtering out a lot, # not all novelty types will be represented if value[0] in df.columns: - df.loc[df[value[0]]==value[1], key] = True + df.loc[df[value[0]] == value[1], key] = True df.drop(value[0], axis=1, inplace=True) df[nov_col] = np.nan for o in order: - df.loc[(df[nov_col].isnull())&(df[o]==True), nov_col] = o + df.loc[(df[nov_col].isnull()) & (df[o] == True), nov_col] = o # assign ism subtype if needed - if how == 'transcript': - df['ISM_subtype'] = np.nan - df.loc[(df.ISM_subtype.isnull())&(df['ISM-prefix'])&(df['ISM-suffix']), 'ISM_subtype'] = 'Both' - df.loc[(df.ISM_subtype.isnull())&(df['ISM-prefix']), 'ISM_subtype'] = 'Prefix' - df.loc[(df.ISM_subtype.isnull())&(df['ISM-suffix']), 'ISM_subtype'] = 'Suffix' - df.loc[df.ISM_subtype.isnull(), 'ISM_subtype'] = 'None' + if how == "transcript": + df["ISM_subtype"] = np.nan + df.loc[(df.ISM_subtype.isnull()) & (df["ISM-prefix"]) & (df["ISM-suffix"]), "ISM_subtype"] = "Both" + df.loc[(df.ISM_subtype.isnull()) & (df["ISM-prefix"]), "ISM_subtype"] = "Prefix" + df.loc[(df.ISM_subtype.isnull()) & (df["ISM-suffix"]), "ISM_subtype"] = "Suffix" + df.loc[df.ISM_subtype.isnull(), "ISM_subtype"] = "None" # reduce cols df = df[cols] @@ -115,6 +131,7 @@ def assign_novelties(df, d, order, how): return df + def get_transcript_novs(db, tids): """ Get transcript novelties and ISM subtypes from a TALON db @@ 
-129,17 +146,18 @@ def get_transcript_novs(db, tids): """ # attributes to search for - nov_col_dict = {'Known': ('transcript_status', 'KNOWN'), - 'ISM': ('ISM_transcript', 'TRUE'), - 'ISM-prefix': ('ISM-prefix_transcript', 'TRUE'), - 'ISM-suffix': ('ISM-suffix_transcript', 'TRUE'), - 'NIC': ('NIC_transcript', 'TRUE'), - 'NNC': ('NNC_transcript', 'TRUE'), - 'Antisense': ('antisense_transcript', 'TRUE'), - 'Intergenic': ('intergenic_transcript', 'TRUE'), - 'Genomic': ('genomic_transcript', 'TRUE')} - order = ['ISM', 'NIC', 'NNC', 'Antisense', - 'Intergenic', 'Genomic', 'Known'] + nov_col_dict = { + "Known": ("transcript_status", "KNOWN"), + "ISM": ("ISM_transcript", "TRUE"), + "ISM-prefix": ("ISM-prefix_transcript", "TRUE"), + "ISM-suffix": ("ISM-suffix_transcript", "TRUE"), + "NIC": ("NIC_transcript", "TRUE"), + "NNC": ("NNC_transcript", "TRUE"), + "Antisense": ("antisense_transcript", "TRUE"), + "Intergenic": ("intergenic_transcript", "TRUE"), + "Genomic": ("genomic_transcript", "TRUE"), + } + order = ["ISM", "NIC", "NNC", "Antisense", "Intergenic", "Genomic", "Known"] attr_list = [val[0] for key, val in nov_col_dict.items()] attrs = qutils.format_for_IN(attr_list) @@ -154,10 +172,11 @@ def get_transcript_novs(db, tids): """ df = pd.read_sql_query(query, conn) - df = assign_novelties(df, nov_col_dict, order, 'transcript') + df = assign_novelties(df, nov_col_dict, order, "transcript") return df + def get_gene_novs(db, gids): """ Get gene novelties from a TALON db @@ -171,10 +190,12 @@ def get_gene_novs(db, gids): """ # attributes to search for - nov_col_dict = {'Known': ('gene_status', 'KNOWN'), - 'Intergenic': ('intergenic_novel', 'TRUE'), - 'Antisense': ('antisense_gene', 'TRUE')} - order = ['Antisense', 'Intergenic', 'Known'] + nov_col_dict = { + "Known": ("gene_status", "KNOWN"), + "Intergenic": ("intergenic_novel", "TRUE"), + "Antisense": ("antisense_gene", "TRUE"), + } + order = ["Antisense", "Intergenic", "Known"] attr_list = [val[0] for key, val in nov_col_dict.items()] attrs = qutils.format_for_IN(attr_list) @@ -188,10 +209,11 @@ def get_gene_novs(db, gids): AND ID IN {gene_query} """ df = pd.read_sql_query(query, conn) - df = assign_novelties(df, nov_col_dict, order, 'gene') + df = assign_novelties(df, nov_col_dict, order, "gene") return df + def get_g_t_names(db, annot, tids): """ Get names / IDs of genes / transcripts from TALON db @@ -238,6 +260,7 @@ def get_g_t_names(db, annot, tids): return df + def get_var_info(db, annot, build, tids=None, gids=None, gene_level=False): """ Get info about names, IDs, novelty categories, etc. 
for each gene @@ -264,22 +287,21 @@ def get_var_info(db, annot, build, tids=None, gids=None, gene_level=False): # make names for novel genes / transcripts # determine how many missing digits there are # repeat '0' for that many spaces for each gene / transcript ID - df['zero'] = '0' - df['n_gid_zero_to_add'] = n_places-df.gene_ID.astype(str).str.len() - df['temp_gid'] = prefix+'G'+df['zero'].str.repeat(df['n_gid_zero_to_add'])+df['gene_ID'].astype(str) - df['n_tid_zero_to_add'] = n_places-df.transcript_ID.astype(str).str.len() - df['temp_tid'] = prefix+'T'+df['zero'].str.repeat(df['n_tid_zero_to_add'])+df['transcript_ID'].astype(str) - - df['temp'] = df.temp_gid.str.len() - if len(df['temp'].unique().tolist()) != 1: - raise ValueError('Problem naming genes') - df['temp'] = df.temp_tid.str.len() - if len(df['temp'].unique().tolist()) != 1: - raise ValueError('Problem naming transcripts') + df["zero"] = "0" + df["n_gid_zero_to_add"] = n_places - df.gene_ID.astype(str).str.len() + df["temp_gid"] = prefix + "G" + df["zero"].str.repeat(df["n_gid_zero_to_add"]) + df["gene_ID"].astype(str) + df["n_tid_zero_to_add"] = n_places - df.transcript_ID.astype(str).str.len() + df["temp_tid"] = prefix + "T" + df["zero"].str.repeat(df["n_tid_zero_to_add"]) + df["transcript_ID"].astype(str) + + df["temp"] = df.temp_gid.str.len() + if len(df["temp"].unique().tolist()) != 1: + raise ValueError("Problem naming genes") + df["temp"] = df.temp_tid.str.len() + if len(df["temp"].unique().tolist()) != 1: + raise ValueError("Problem naming transcripts") # drop extra stuff - drop_cols = ['zero', 'n_gid_zero_to_add', - 'n_tid_zero_to_add', 'temp'] + drop_cols = ["zero", "n_gid_zero_to_add", "n_tid_zero_to_add", "temp"] df.drop(drop_cols, axis=1, inplace=True) # # add gene / transcript names / ids @@ -291,54 +313,67 @@ def get_var_info(db, annot, build, tids=None, gids=None, gene_level=False): # replace null gene names / ids inds = df.loc[df.annot_gene_id.isnull()].index - df.loc[inds, 'annot_gene_id'] = df.loc[inds, 'temp_gid'] + df.loc[inds, "annot_gene_id"] = df.loc[inds, "temp_gid"] inds = df.loc[df.annot_gene_name.isnull()].index - df.loc[inds, 'annot_gene_name'] = df.loc[inds, 'temp_gid'] + df.loc[inds, "annot_gene_name"] = df.loc[inds, "temp_gid"] # replace null transcript names / ids inds = df.loc[df.annot_transcript_id.isnull()].index - df.loc[inds, 'annot_transcript_id'] = df.loc[inds, 'temp_tid'] + df.loc[inds, "annot_transcript_id"] = df.loc[inds, "temp_tid"] inds = df.loc[df.annot_transcript_name.isnull()].index - df.loc[inds, 'annot_transcript_name'] = df.loc[inds, 'temp_tid'] + df.loc[inds, "annot_transcript_name"] = df.loc[inds, "temp_tid"] # remove temp cols - df.drop(['temp_gid', 'temp_tid'], axis=1, inplace=True) + df.drop(["temp_gid", "temp_tid"], axis=1, inplace=True) # add transcript len - t_lens = pd.DataFrame.from_dict(autils.get_transcript_lengths(db, build), - orient='index', - columns=['length']) - df = df.merge(t_lens, how='left', left_on='transcript_ID', right_index=True) + t_lens = pd.DataFrame.from_dict(autils.get_transcript_lengths(db, build), orient="index", columns=["length"]) + df = df.merge(t_lens, how="left", left_on="transcript_ID", right_index=True) # add gene novelty g_df = get_gene_novs(db, gids) - df = df.merge(g_df, how='left', left_on='gene_ID', right_on='ID') - df.drop('ID', axis=1, inplace=True) + df = df.merge(g_df, how="left", left_on="gene_ID", right_on="ID") + df.drop("ID", axis=1, inplace=True) # add transcript novelty / ism subtype t_df = get_transcript_novs(db, tids) - 
df = df.merge(t_df, how='left', left_on='transcript_ID', right_on='ID') - df.drop('ID', axis=1, inplace=True) + df = df.merge(t_df, how="left", left_on="transcript_ID", right_on="ID") + df.drop("ID", axis=1, inplace=True) # column order - order = ['gene_ID', 'transcript_ID', 'annot_gene_id', - 'annot_transcript_id', 'annot_gene_name', - 'annot_transcript_name', 'n_exons', 'length', - 'gene_novelty', 'transcript_novelty', 'ISM_subtype'] + order = [ + "gene_ID", + "transcript_ID", + "annot_gene_id", + "annot_transcript_id", + "annot_gene_name", + "annot_transcript_name", + "n_exons", + "length", + "gene_novelty", + "transcript_novelty", + "ISM_subtype", + ] df = df[order] # gene level -- drop columns that are only relevant to transcripts # and drop duplicated entries if gene_level: - drop_cols = ['transcript_ID', 'annot_transcript_id', - 'annot_transcript_name', 'length', - 'transcript_novelty', 'ISM_subtype', - 'n_exons'] + drop_cols = [ + "transcript_ID", + "annot_transcript_id", + "annot_transcript_name", + "length", + "transcript_novelty", + "ISM_subtype", + "n_exons", + ] df.drop(drop_cols, axis=1, inplace=True) df.drop_duplicates(inplace=True) return df + def get_obs_info(db, dataset_file): """ Get metadata table for each dataset in TALON @@ -360,9 +395,10 @@ def get_obs_info(db, dataset_file): FROM dataset WHERE dataset_name IN {datasets_query} """ df = pd.read_sql_query(query, conn) - df.rename({'dataset_name': 'dataset'}, axis=1, inplace=True) + df.rename({"dataset_name": "dataset"}, axis=1, inplace=True) return df + def get_X_info(db, obs, var, gene_level=False): """ Get sparse matrix representation of gene or transcript counts @@ -381,7 +417,7 @@ def get_X_info(db, obs, var, gene_level=False): # filter on genes if gene_level: - var_col = 'gene_ID' + var_col = "gene_ID" feat_str = qutils.format_for_IN(var[var_col].unique().tolist()) query = f"""SELECT t.gene_ID, ab.transcript_ID, ab.dataset, ab.count FROM abundance as ab @@ -393,7 +429,7 @@ def get_X_info(db, obs, var, gene_level=False): # filter on transcripts else: - var_col = 'transcript_ID' + var_col = "transcript_ID" feat_str = qutils.format_for_IN(var[var_col].unique().tolist()) query = f"""SELECT transcript_ID, dataset, count FROM abundance WHERE transcript_ID in {feat_str} @@ -407,25 +443,23 @@ def get_X_info(db, obs, var, gene_level=False): # sum over transcripts from the same gene / dataset if gene_level: - df.drop('transcript_ID', axis=1, inplace=True) - df = df.groupby(['gene_ID', 'dataset']).sum().reset_index() + df.drop("transcript_ID", axis=1, inplace=True) + df = df.groupby(["gene_ID", "dataset"]).sum().reset_index() # make categories based on ordering of obs and var tables - obs_col = 'dataset' + obs_col = "dataset" obs_cat = pd.api.types.CategoricalDtype(obs[obs_col], ordered=True) if obs_cat.categories.tolist() != obs[obs_col].tolist(): - raise ValueError('Problem with dataset names') + raise ValueError("Problem with dataset names") var_cat = pd.api.types.CategoricalDtype(var[var_col], ordered=True) if var_cat.categories.tolist() != var[var_col].tolist(): - raise ValueError('Problem with feature IDs') + raise ValueError("Problem with feature IDs") # create sparse matrix representation without # inflating row = df[obs_col].astype(obs_cat).cat.codes col = df[var_col].astype(var_cat).cat.codes - X = csr_matrix((df['count'], (row, col)), \ - shape=(obs_cat.categories.size, - var_cat.categories.size)) + X = csr_matrix((df["count"], (row, col)), shape=(obs_cat.categories.size, var_cat.categories.size)) # # code to 
inflate matrix # dfs = pd.SparseDataFrame(X, \ @@ -435,6 +469,7 @@ def get_X_info(db, obs, var, gene_level=False): return X + def main(): options = getOptions() db = options.database @@ -454,11 +489,7 @@ def main(): autils.check_build_validity(build, db) # determine which transcripts to include - pass_list = putils.handle_filtering(db, - annot, - True, - pass_list_file, - dataset_file) + pass_list = putils.handle_filtering(db, annot, True, pass_list_file, dataset_file) gids = [i[0] for i in list(set(pass_list))] tids = [i[1] for i in list(set(pass_list))] @@ -471,5 +502,6 @@ def main(): adata = anndata.AnnData(X=X, obs=obs, var=var) adata.write(ofile) -if __name__ == '__main__': + +if __name__ == "__main__": main() diff --git a/src/talon/post/filter_talon_transcripts.py b/src/talon/post/filter_talon_transcripts.py index 01f5518..6ed4b8e 100644 --- a/src/talon/post/filter_talon_transcripts.py +++ b/src/talon/post/filter_talon_transcripts.py @@ -6,76 +6,119 @@ # used by downstream analysis tools to determine which transcripts and other # features should be reported (for example in a GTF file). -from optparse import OptionParser +import os import sqlite3 +import warnings +from optparse import OptionParser from pathlib import Path + +import pandas as pd + +from talon.post import get_read_annotations as read_annot + from .. import query_utils as qutils from . import ab_utils as autils -from talon.post import get_read_annotations as read_annot -import pandas as pd -import os -import warnings + def getOptions(): - parser = OptionParser(description = ("talon_filter_transcripts is a " - "utility that filters the transcripts inside " - "a TALON database to produce a transcript pass list. " - "This list can then be used by downstream analysis " - "tools to determine which transcripts and other " - "features should be reported (for example in a GTF file)")) - parser.add_option("--db", dest = "database", - help = "TALON database", metavar = "FILE", type = str) - parser.add_option("--annot", "-a", dest = "annot", - help = """Which annotation version to use. Will determine which + parser = OptionParser( + description=( + "talon_filter_transcripts is a " + "utility that filters the transcripts inside " + "a TALON database to produce a transcript pass list. " + "This list can then be used by downstream analysis " + "tools to determine which transcripts and other " + "features should be reported (for example in a GTF file)" + ) + ) + parser.add_option("--db", dest="database", help="TALON database", metavar="FILE", type=str) + parser.add_option( + "--annot", + "-a", + dest="annot", + help="""Which annotation version to use. Will determine which annotation transcripts are considered known or novel relative to. Note: must be in the TALON database.""", - type = "string") - parser.add_option("--datasets", dest = "datasets", default = None, - help = ("Datasets to include. Can be provided as a " - "comma-delimited list on the command line, " - "or as a file with one dataset per line. " - "If this option is omitted, all datasets will " - "be included.")) - parser.add_option("--includeAnnot", dest = "include_annot", action= "store_true", - help = ("Include all transcripts from the annotation, regardless " - "of if they were observed in the data.")) - parser.add_option("--maxFracA", dest = "max_frac_A", default = 0.5, - help = ("Maximum fraction of As to allow in the window " - "located immediately after any read assigned to " - "a novel transcript (helps to filter out internal " - "priming artifacts). Default = 0.5. 
Use 1 if you prefer"
-                              "to not filter out internal priming events."),
-                      type = float)
-    parser.add_option("--minCount", dest = "min_count", default = 5,
-                      type = int,
-                      help = ("Number of minimum occurrences required for a "
-                              "novel transcript PER dataset. Default = 5"))
-    parser.add_option("--minDatasets", dest = "min_datasets", default = None,
-                      type = int,
-                      help = ("Minimum number of datasets novel transcripts "
-                              "must be found in. Default = all datasets provided"))
-    parser.add_option("--allowGenomic", dest ="allow_genomic", action='store_true',
-                      help = ("If this option is set, transcripts from the Genomic "
-                              "novelty category will be permitted in the output "
-                              "(provided they pass the thresholds). Default "
-                              "behavior is to filter out genomic transcripts "
-                              "since they are unlikely to be real novel isoforms."),
-                      default = False)
-    parser.add_option("--excludeISM", dest = "exclude_ISMs", action='store_true',
-                      help = ("If this option is set, transcripts from the ISM "
-                              "novelty category will be excluded from the output. "
-                              "Default behavior is to include those that pass other "
-                              "filtering thresholds."))
-    parser.add_option("--o", dest = "outfile", help = "Outfile name",
-                      metavar = "FILE", type = "string")
-
+        type="string",
+    )
+    parser.add_option(
+        "--datasets",
+        dest="datasets",
+        default=None,
+        help=(
+            "Datasets to include. Can be provided as a "
+            "comma-delimited list on the command line, "
+            "or as a file with one dataset per line. "
+            "If this option is omitted, all datasets will "
+            "be included."
+        ),
+    )
+    parser.add_option(
+        "--includeAnnot",
+        dest="include_annot",
+        action="store_true",
+        help=("Include all transcripts from the annotation, regardless " "of whether they were observed in the data."),
+    )
+    parser.add_option(
+        "--maxFracA",
+        dest="max_frac_A",
+        default=0.5,
+        help=(
+            "Maximum fraction of As to allow in the window "
+            "located immediately after any read assigned to "
+            "a novel transcript (helps to filter out internal "
+            "priming artifacts). Default = 0.5. Use 1 if you prefer "
+            "to not filter out internal priming events."
+        ),
+        type=float,
+    )
+    parser.add_option(
+        "--minCount",
+        dest="min_count",
+        default=5,
+        type=int,
+        help=("Number of minimum occurrences required for a " "novel transcript PER dataset. Default = 5"),
+    )
+    parser.add_option(
+        "--minDatasets",
+        dest="min_datasets",
+        default=None,
+        type=int,
+        help=("Minimum number of datasets novel transcripts " "must be found in. Default = all datasets provided"),
+    )
+    parser.add_option(
+        "--allowGenomic",
+        dest="allow_genomic",
+        action="store_true",
+        help=(
+            "If this option is set, transcripts from the Genomic "
+            "novelty category will be permitted in the output "
+            "(provided they pass the thresholds). Default "
+            "behavior is to filter out genomic transcripts "
+            "since they are unlikely to be real novel isoforms."
+        ),
+        default=False,
+    )
+    parser.add_option(
+        "--excludeISM",
+        dest="exclude_ISMs",
+        action="store_true",
+        help=(
+            "If this option is set, transcripts from the ISM "
+            "novelty category will be excluded from the output. "
+            "Default behavior is to include those that pass other "
+            "filtering thresholds."
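+            # Note: adjacent string literals in these help texts are joined by
+            # Python's implicit concatenation, so each fragment must carry its
+            # own trailing space (as in "... you prefer " above); a missing
+            # space makes words run together in the --help output.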
+ ), + ) + parser.add_option("--o", dest="outfile", help="Outfile name", metavar="FILE", type="string") (options, args) = parser.parse_args() return options -def get_known_transcripts(database, annot, include_annot, datasets = None): - """ Fetch gene ID and transcript ID of all known transcripts detected in - the specified datasets """ + +def get_known_transcripts(database, annot, include_annot, datasets=None): + """Fetch gene ID and transcript ID of all known transcripts detected in + the specified datasets""" with sqlite3.connect(database) as conn: # pull from observed table @@ -85,7 +128,9 @@ def get_known_transcripts(database, annot, include_annot, datasets = None): ON ta.ID = observed.transcript_ID WHERE (ta.attribute = 'transcript_status' AND ta.value = 'KNOWN' - AND ta.annot_name = '%s')""" % (annot) + AND ta.annot_name = '%s')""" % ( + annot + ) # pull from normal transcripts table elif include_annot: @@ -108,40 +153,47 @@ def get_known_transcripts(database, annot, include_annot, datasets = None): return known + def fetch_reads_in_datasets_fracA_cutoff(database, datasets, max_frac_A): - """ Selects reads from the database that are from the specified datasets - and which pass the following cutoffs: - - fraction_As <= max_frac_A - Reads with fraction_As value of None will not be included. - If datasets == None, then all datasets are permitted""" + """Selects reads from the database that are from the specified datasets + and which pass the following cutoffs: + - fraction_As <= max_frac_A + Reads with fraction_As value of None will not be included. + If datasets == None, then all datasets are permitted""" -# convert non-iterable datasets to an iterable + # convert non-iterable datasets to an iterable if datasets == None: - with sqlite3.connect(database) as conn: - query = """SELECT dataset_name + with sqlite3.connect(database) as conn: + query = """SELECT dataset_name FROM dataset""" - iter_datasets = pd.read_sql_query(query, conn).dataset_name.tolist() + iter_datasets = pd.read_sql_query(query, conn).dataset_name.tolist() else: - iter_datasets = datasets + iter_datasets = datasets # first check if we have non-null fraction_As columns at all # (one dataset at a time) for dataset in iter_datasets: - with sqlite3.connect(database) as conn: - query = """SELECT read_name, gene_ID, transcript_ID, dataset, fraction_As - FROM observed WHERE dataset='{}' LIMIT 0, 10""".format(dataset) + with sqlite3.connect(database) as conn: + query = """SELECT read_name, gene_ID, transcript_ID, dataset, fraction_As + FROM observed WHERE dataset='{}' LIMIT 0, 10""".format( + dataset + ) - data = pd.read_sql_query(query, conn) - nans = all(data.fraction_As.isna().tolist()) + data = pd.read_sql_query(query, conn) + nans = all(data.fraction_As.isna().tolist()) - if nans and max_frac_A != 1: - print("Reads in dataset {} appear to be unlabelled. " - "Only known transcripts will pass the filter.".format(dataset)) + if nans and max_frac_A != 1: + print( + "Reads in dataset {} appear to be unlabelled. 
" + "Only known transcripts will pass the filter.".format(dataset) + ) with sqlite3.connect(database) as conn: query = """SELECT read_name, gene_ID, transcript_ID, dataset, fraction_As FROM observed - WHERE fraction_As <= %f""" % (max_frac_A) + WHERE fraction_As <= %f""" % ( + max_frac_A + ) if datasets != None: datasets = qutils.format_for_IN(datasets) query += " AND dataset IN " + datasets @@ -150,7 +202,7 @@ def fetch_reads_in_datasets_fracA_cutoff(database, datasets, max_frac_A): # warn the user if no novel models passed filtering if len(data.index) == 0: - print('No reads passed maxFracA cutoff. Is this expected?') + print("No reads passed maxFracA cutoff. Is this expected?") return data @@ -182,8 +234,9 @@ def fetch_reads_in_datasets_fracA_cutoff(database, datasets, max_frac_A): # # return + def check_db_version(database): - """ Make sure the user is using a v5 database """ + """Make sure the user is using a v5 database""" conn = sqlite3.connect(database) cursor = conn.cursor() @@ -194,19 +247,19 @@ def check_db_version(database): ver = pd.read_sql_query(query, conn) if ver.empty: - message = "Database version is not compatible with v5.0 filtering." - raise ValueError(message) + message = "Database version is not compatible with v5.0 filtering." + raise ValueError(message) + def parse_datasets(dataset_option, database): - """ Parses dataset names from command line. Valid forms of input: - - None (returns None) - - Comma-delimited list of names - - File of names (One per line) - Also checks to make sure that the datasets are in the database. + """Parses dataset names from command line. Valid forms of input: + - None (returns None) + - Comma-delimited list of names + - File of names (One per line) + Also checks to make sure that the datasets are in the database. """ if dataset_option == None: - print(("No dataset names specified, so filtering process will use all " - "datasets present in the database.")) + print(("No dataset names specified, so filtering process will use all " "datasets present in the database.")) return None elif os.path.isfile(dataset_option): @@ -228,103 +281,106 @@ def parse_datasets(dataset_option, database): if dset not in valid_datasets: invalid_datasets.append(dset) if len(invalid_datasets) > 0: - raise ValueError(("Problem parsing datasets. The following names are " - "not in the database: '%s'. \nValid dataset names: '%s'") - % (", ".join(invalid_datasets), - ", ".join(valid_datasets))) + raise ValueError( + ( + "Problem parsing datasets. The following names are " + "not in the database: '%s'. 
\nValid dataset names: '%s'" + ) + % (", ".join(invalid_datasets), ", ".join(valid_datasets)) + ) else: - print("Parsed the following dataset names successfully: %s" % \ - (", ".join(datasets))) + print("Parsed the following dataset names successfully: %s" % (", ".join(datasets))) return datasets + def get_novelty_df(database): - """ Get the novelty category assignment of each transcript and - store in a data frame """ + """Get the novelty category assignment of each transcript and + store in a data frame""" transcript_novelty_dict = read_annot.get_transcript_novelty(database) - transcript_novelty = pd.DataFrame.from_dict(transcript_novelty_dict, - orient='index') + transcript_novelty = pd.DataFrame.from_dict(transcript_novelty_dict, orient="index") transcript_novelty = transcript_novelty.reset_index() - transcript_novelty.columns = ['transcript_ID', 'transcript_novelty'] + transcript_novelty.columns = ["transcript_ID", "transcript_novelty"] return transcript_novelty + def merge_reads_with_novelty(reads, novelty): - """ Given a data frame of reads and a transcript novelty data frame, - perform a left merge to annotate the reads with their novelty status. + """Given a data frame of reads and a transcript novelty data frame, + perform a left merge to annotate the reads with their novelty status. """ - merged = pd.merge(reads, novelty, on = "transcript_ID", how = "left") + merged = pd.merge(reads, novelty, on="transcript_ID", how="left") return merged + def filter_on_min_count(reads, min_count): - """ Given a reads data frame, compute the number of times that each - transcript ID occurs per dataset. - Keep the rows that meet the min_count threshold and return them. """ + """Given a reads data frame, compute the number of times that each + transcript ID occurs per dataset. + Keep the rows that meet the min_count threshold and return them.""" - cols = ['gene_ID', 'transcript_ID', 'dataset'] + cols = ["gene_ID", "transcript_ID", "dataset"] counts_df = reads[cols].groupby(cols).size() counts_df = counts_df.reset_index() counts_df.columns = cols + ["count"] - filtered = counts_df.loc[counts_df['count'] >= min_count] + filtered = counts_df.loc[counts_df["count"] >= min_count] return filtered + def filter_on_n_datasets(counts_in_datasets, min_datasets): - """ Given a data frame with columns gene_ID, transcript_ID, dataset, - and count (in that dataset), count the number of datasets that each - transcript appears in. Then, filter the data such that only transcripts - found in at least 'min_datasets' remain. """ + """Given a data frame with columns gene_ID, transcript_ID, dataset, + and count (in that dataset), count the number of datasets that each + transcript appears in. Then, filter the data such that only transcripts + found in at least 'min_datasets' remain.""" - cols = ['gene_ID', 'transcript_ID'] + cols = ["gene_ID", "transcript_ID"] dataset_count_df = counts_in_datasets[cols].groupby(cols).size() dataset_count_df = dataset_count_df.reset_index() dataset_count_df.columns = cols + ["n_datasets"] - filtered = dataset_count_df.loc[dataset_count_df['n_datasets'] >= min_datasets] + filtered = dataset_count_df.loc[dataset_count_df["n_datasets"] >= min_datasets] return filtered + def filter_talon_transcripts(database, annot, datasets, options): - """ Filter transcripts belonging to the specified datasets in a TALON - database. The 'annot' parameter specifies which annotation transcripts - are known relative to. 
Can be tuned with the following options:
-        - options.include_annot: Include all annotated transcripts regardless
-          of whether they are expressed
-        - options.max_frac_A: maximum allowable fraction of As recorded for
-          region after the read (0-1)
-        - options.allow_genomic: Removes genomic transcripts if set to False
-        - options.exlude_ISMs: Removes ISM transcripts if set to True
-        - options.min_count: Transcripts must appear at least this many times
-                             to count as present in a dataset
-        - options.min_datasets: After the min_count threshold has been
-                                applied, the transcript must be found in at
-                                least this many datasets to pass the filter.
-                                If this option is set to None, then it will
-                                default to the total number of datasets in the
-                                reads.
-        Please note that known transcripts are allowed through independently
-        of these parameters.
-    """
+    """Filter transcripts belonging to the specified datasets in a TALON
+    database. The 'annot' parameter specifies which annotation transcripts
+    are known relative to. Can be tuned with the following options:
+    - options.include_annot: Include all annotated transcripts regardless
+    of whether they are expressed
+    - options.max_frac_A: maximum allowable fraction of As recorded for
+    region after the read (0-1)
+    - options.allow_genomic: Removes genomic transcripts if set to False
+    - options.exclude_ISMs: Removes ISM transcripts if set to True
+    - options.min_count: Transcripts must appear at least this many times
+    to count as present in a dataset
+    - options.min_datasets: After the min_count threshold has been
+    applied, the transcript must be found in at
+    least this many datasets to pass the filter.
+    If this option is set to None, then it will
+    default to the total number of datasets in the
+    reads.
+    Please note that known transcripts are allowed through independently
+    of these parameters.
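+
+    A minimal usage sketch (the database path and annotation name below are
+    hypothetical, for illustration only):
+
+        options = getOptions()
+        datasets = parse_datasets(options.datasets, "talon.db")
+        passed = filter_talon_transcripts("talon.db", "gencode_v29", datasets, options)
+        # 'passed' is a DataFrame with gene_ID and transcript_ID columns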
+ """ # Known transcripts automatically pass the filter - known = get_known_transcripts(database, annot, - options.include_annot, - datasets = datasets) + known = get_known_transcripts(database, annot, options.include_annot, datasets=datasets) # Get reads that pass fraction A cutoff - reads = fetch_reads_in_datasets_fracA_cutoff(database, datasets, - options.max_frac_A) + reads = fetch_reads_in_datasets_fracA_cutoff(database, datasets, options.max_frac_A) # Fetch novelty information and merge with reads reads = merge_reads_with_novelty(reads, get_novelty_df(database)) # Drop genomic transcripts if desired if options.allow_genomic == False: - reads = reads.loc[reads.transcript_novelty != 'Genomic'] + reads = reads.loc[reads.transcript_novelty != "Genomic"] # Drop ISMs if desired if options.exclude_ISMs == True: - reads = reads.loc[reads.transcript_novelty != 'ISM'] + reads = reads.loc[reads.transcript_novelty != "ISM"] # Perform counts-based filtering filtered_counts = filter_on_min_count(reads, options.min_count) @@ -336,12 +392,15 @@ def filter_talon_transcripts(database, annot, datasets, options): # Join the known transcripts with the filtered ones and return if len(dataset_filtered.index) != 0: - final_filtered = pd.concat([known[["gene_ID", "transcript_ID"]], - dataset_filtered[["gene_ID", "transcript_ID"]]]).drop_duplicates() - else: final_filtered = known + final_filtered = pd.concat( + [known[["gene_ID", "transcript_ID"]], dataset_filtered[["gene_ID", "transcript_ID"]]] + ).drop_duplicates() + else: + final_filtered = known return final_filtered + def main(): options = getOptions() database = options.database @@ -360,16 +419,18 @@ def main(): # Parse datasets datasets = parse_datasets(options.datasets, database) if datasets != None and len(datasets) == 1: - warnings.warn("Only one dataset provided. For best performance, please " - "run TALON with at least 2 biological replicates if possible.") + warnings.warn( + "Only one dataset provided. For best performance, please " + "run TALON with at least 2 biological replicates if possible." + ) # Perform the filtering filtered = filter_talon_transcripts(database, annot, datasets, options) # Write gene and transcript IDs to file print("Writing gene-transcript TALON ID pairs that passed filtering to " + options.outfile + "...") - filtered.to_csv(options.outfile, sep = ",", header = False, index = False) + filtered.to_csv(options.outfile, sep=",", header=False, index=False) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/src/talon/post/generate_talon_report.py b/src/talon/post/generate_talon_report.py index a78254c..f43c685 100644 --- a/src/talon/post/generate_talon_report.py +++ b/src/talon/post/generate_talon_report.py @@ -2,8 +2,7 @@ import sys from pathlib import Path -R_SCRIPT_FILE = Path(__file__).parent / Path("r_scripts") / Path( - "generate_talon_report.R") +R_SCRIPT_FILE = Path(__file__).parent / Path("r_scripts") / Path("generate_talon_report.R") def main(): @@ -11,8 +10,7 @@ def main(): try: subprocess.run(args, stdout=sys.stdout, stderr=sys.stderr) except FileNotFoundError as e: - print("This is a wrapper command for an R script. " - "Do you have GNU R installed?\n" + str(e)) + print("This is a wrapper command for an R script. 
" "Do you have GNU R installed?\n" + str(e)) if __name__ == "__main__": diff --git a/src/talon/post/get_read_annotations.py b/src/talon/post/get_read_annotations.py index c7d93e4..3be8f97 100644 --- a/src/talon/post/get_read_annotations.py +++ b/src/talon/post/get_read_annotations.py @@ -5,39 +5,51 @@ # database in order to get read-specific annotation information. import argparse -import sqlite3 import os +import sqlite3 from pathlib import Path -from .. import query_utils as qutils from string import Template +from .. import query_utils as qutils + + def get_args(): - """ Fetches the arguments for the program """ + """Fetches the arguments for the program""" - program_desc = ("This utility queries a TALON database in order to get " - "read-specific annotation information.") + program_desc = "This utility queries a TALON database in order to get " "read-specific annotation information." parser = argparse.ArgumentParser(description=program_desc) - parser.add_argument('--db', dest = 'database', metavar='FILE,', type = str, - help='TALON database') - parser.add_argument('--build', dest = 'build', metavar='STRING,', type = str, - help='Genome build (i.e. hg38) to use. Must be in the database.') - parser.add_argument('--datasets', dest = 'datasets', metavar='STRING,', type = str, - help=('Optional: Comma-delimited list of datasets to include. Default ' - 'behavior is to include all datasets in the database.'), - default = None) - parser.add_argument("--o", dest = "outprefix", help = "Prefix for output files", - type = str) + parser.add_argument("--db", dest="database", metavar="FILE,", type=str, help="TALON database") + parser.add_argument( + "--build", + dest="build", + metavar="STRING,", + type=str, + help="Genome build (i.e. hg38) to use. Must be in the database.", + ) + parser.add_argument( + "--datasets", + dest="datasets", + metavar="STRING,", + type=str, + help=( + "Optional: Comma-delimited list of datasets to include. Default " + "behavior is to include all datasets in the database." + ), + default=None, + ) + parser.add_argument("--o", dest="outprefix", help="Prefix for output files", type=str) args = parser.parse_args() return args -def fetch_reads(database, build, tmp_file = None, datasets = None): - """ Performs database query to fetch location and gene/transcript assignment - info for each long read in the specified datasets. - If tmp_file is set to None (default), then the function will return - the query results in a list of lists. If an alternate value is provided, - then the results will be written to a tmp file of that name.""" + +def fetch_reads(database, build, tmp_file=None, datasets=None): + """Performs database query to fetch location and gene/transcript assignment + info for each long read in the specified datasets. + If tmp_file is set to None (default), then the function will return + the query results in a list of lists. 
If an alternate value is provided, + then the results will be written to a tmp file of that name.""" if datasets != None: # Format as a string for query @@ -86,7 +98,7 @@ def fetch_reads(database, build, tmp_file = None, datasets = None): raise RuntimeError("Problem with reads database query") if tmp_file != None: - o = open(tmp_file, 'w') + o = open(tmp_file, "w") else: reads = [] @@ -115,32 +127,43 @@ def fetch_reads(database, build, tmp_file = None, datasets = None): raise ValueError("Unrecognized strand value: " + str(strand)) # Create entry for output - out_read = (entry["read_name"], entry["dataset"], - entry["genome_build"], entry["gene_ID"], - entry["transcript_ID"], entry["chrom"], - read_start, read_end, strand, entry["n_exons"], - entry["read_length"], entry["fraction_As"], - entry["custom_label"], entry["allelic_label"], - entry["start_support"], entry["end_support"]) + out_read = ( + entry["read_name"], + entry["dataset"], + entry["genome_build"], + entry["gene_ID"], + entry["transcript_ID"], + entry["chrom"], + read_start, + read_end, + strand, + entry["n_exons"], + entry["read_length"], + entry["fraction_As"], + entry["custom_label"], + entry["allelic_label"], + entry["start_support"], + entry["end_support"], + ) if tmp_file != None: - o.write("\t".join([ str(x) for x in out_read ]) + "\n") + o.write("\t".join([str(x) for x in out_read]) + "\n") else: reads.append(out_read) count += 1 # Return results or close file if count == 0: - raise ValueError(("No reads detected. Make sure your dataset names are " - "correct.")) + raise ValueError(("No reads detected. Make sure your dataset names are " "correct.")) if tmp_file != None: o.close() else: return reads + def get_gene_novelty(database): - """ Given a database, get the novelty status of each gene. """ + """Given a database, get the novelty status of each gene.""" gene_novelty = {} with sqlite3.connect(database) as conn: @@ -148,37 +171,46 @@ def get_gene_novelty(database): cursor = conn.cursor() # Fetch known genes - cursor.execute("""SELECT ID FROM gene_annotations + cursor.execute( + """SELECT ID FROM gene_annotations WHERE attribute = "gene_status" - AND value = "KNOWN";""") + AND value = "KNOWN";""" + ) for entry in cursor: gene_novelty[entry[0]] = "Known" # Fetch antisense genes - cursor.execute("""SELECT ID FROM gene_annotations + cursor.execute( + """SELECT ID FROM gene_annotations WHERE attribute = "antisense_gene" - AND value = "TRUE";""") + AND value = "TRUE";""" + ) for entry in cursor: gene_novelty[entry[0]] = "Antisense" # Fetch fusion genes - cursor.execute("""SELECT ID FROM gene_annotations + cursor.execute( + """SELECT ID FROM gene_annotations WHERE attribute = "fusion_novel" - AND value = "TRUE";""") + AND value = "TRUE";""" + ) for entry in cursor: gene_novelty[entry[0]] = "Fusion" # Fetch intergenic genes - cursor.execute("""SELECT ID FROM gene_annotations + cursor.execute( + """SELECT ID FROM gene_annotations WHERE attribute = "intergenic_novel" - AND value = "TRUE";""") + AND value = "TRUE";""" + ) for entry in cursor: gene_novelty[entry[0]] = "Intergenic" return gene_novelty + def get_transcript_novelty(database): - """ Given a database, get the novelty status of each transcript. 
""" + """Given a database, get the novelty status of each transcript.""" transcript_novelty = {} with sqlite3.connect(database) as conn: @@ -186,65 +218,82 @@ def get_transcript_novelty(database): cursor = conn.cursor() # Fetch known transcripts - cursor.execute("""SELECT ID FROM transcript_annotations + cursor.execute( + """SELECT ID FROM transcript_annotations WHERE attribute = "transcript_status" - AND value = "KNOWN";""") + AND value = "KNOWN";""" + ) for entry in cursor: transcript_novelty[entry[0]] = "Known" # Fetch ISM transcripts - cursor.execute("""SELECT ID FROM transcript_annotations + cursor.execute( + """SELECT ID FROM transcript_annotations WHERE attribute = "ISM_transcript" - AND value = "TRUE";""") + AND value = "TRUE";""" + ) for entry in cursor: transcript_novelty[entry[0]] = "ISM" # Fetch NIC transcripts - cursor.execute("""SELECT ID FROM transcript_annotations + cursor.execute( + """SELECT ID FROM transcript_annotations WHERE attribute = "NIC_transcript" - AND value = "TRUE";""") + AND value = "TRUE";""" + ) for entry in cursor: transcript_novelty[entry[0]] = "NIC" # Fetch NNC transcripts - cursor.execute("""SELECT ID FROM transcript_annotations + cursor.execute( + """SELECT ID FROM transcript_annotations WHERE attribute = "NNC_transcript" - AND value = "TRUE";""") + AND value = "TRUE";""" + ) for entry in cursor: transcript_novelty[entry[0]] = "NNC" # Fetch antisense transcripts - cursor.execute("""SELECT ID FROM transcript_annotations + cursor.execute( + """SELECT ID FROM transcript_annotations WHERE attribute = "antisense_transcript" - AND value = "TRUE";""") + AND value = "TRUE";""" + ) for entry in cursor: transcript_novelty[entry[0]] = "Antisense" # Fetch intergenic transcripts - cursor.execute("""SELECT ID FROM transcript_annotations + cursor.execute( + """SELECT ID FROM transcript_annotations WHERE attribute = "intergenic_transcript" - AND value = "TRUE";""") + AND value = "TRUE";""" + ) for entry in cursor: transcript_novelty[entry[0]] = "Intergenic" # Fetch genomic transcripts - cursor.execute("""SELECT ID FROM transcript_annotations + cursor.execute( + """SELECT ID FROM transcript_annotations WHERE attribute = "genomic_transcript" - AND value = "TRUE";""") + AND value = "TRUE";""" + ) for entry in cursor: transcript_novelty[entry[0]] = "Genomic" # Fetch fusion transcripts - cursor.execute("""SELECT ID FROM transcript_annotations + cursor.execute( + """SELECT ID FROM transcript_annotations WHERE attribute = "fusion_transcript" - AND value = "TRUE";""") + AND value = "TRUE";""" + ) for entry in cursor: transcript_novelty[entry[0]] = "Fusion" return transcript_novelty + def get_ISM_novelty(database): - """ Given a database, get the ISM subtype of each ISM transcript. 
""" + """Given a database, get the ISM subtype of each ISM transcript.""" all_ISMs = set() prefix_ISMs = set() @@ -256,23 +305,29 @@ def get_ISM_novelty(database): cursor = conn.cursor() # Fetch ISM transcripts - cursor.execute("""SELECT ID FROM transcript_annotations + cursor.execute( + """SELECT ID FROM transcript_annotations WHERE attribute = "ISM_transcript" - AND value = "TRUE";""") + AND value = "TRUE";""" + ) for entry in cursor: all_ISMs.add(entry[0]) # Fetch Prefix ISMs - cursor.execute("""SELECT ID FROM transcript_annotations + cursor.execute( + """SELECT ID FROM transcript_annotations WHERE attribute = "ISM-prefix_transcript" - AND value = "TRUE";""") + AND value = "TRUE";""" + ) for entry in cursor: prefix_ISMs.add(entry[0]) # Fetch Suffix ISMs - cursor.execute("""SELECT ID FROM transcript_annotations + cursor.execute( + """SELECT ID FROM transcript_annotations WHERE attribute = "ISM-suffix_transcript" - AND value = "TRUE";""") + AND value = "TRUE";""" + ) for entry in cursor: suffix_ISMs.add(entry[0]) @@ -289,9 +344,10 @@ def get_ISM_novelty(database): return ISM_novelty + def get_gene_annotations(database): - """ Create a dictionary linking each TALON gene ID to its human-readable - name and accession ID """ + """Create a dictionary linking each TALON gene ID to its human-readable + name and accession ID""" gene_name = {} gene_ID = {} @@ -300,21 +356,26 @@ def get_gene_annotations(database): conn.row_factory = sqlite3.Row cursor = conn.cursor() - cursor.execute("""SELECT ID, ga.value FROM gene_annotations as ga - WHERE attribute = "gene_name";""") + cursor.execute( + """SELECT ID, ga.value FROM gene_annotations as ga + WHERE attribute = "gene_name";""" + ) for entry in cursor: gene_name[entry["ID"]] = entry["value"] - cursor.execute("""SELECT ID, ga.value FROM gene_annotations as ga - WHERE attribute = "gene_id";""") + cursor.execute( + """SELECT ID, ga.value FROM gene_annotations as ga + WHERE attribute = "gene_id";""" + ) for entry in cursor: gene_ID[entry["ID"]] = entry["value"] return gene_name, gene_ID + def get_transcript_annotations(database): - """ Create a dictionary linking each TALON transcript ID to its human-readable - name and accession ID """ + """Create a dictionary linking each TALON transcript ID to its human-readable + name and accession ID""" transcript_name = {} transcript_ID = {} @@ -323,50 +384,55 @@ def get_transcript_annotations(database): conn.row_factory = sqlite3.Row cursor = conn.cursor() - cursor.execute("""SELECT ID, ta.value FROM transcript_annotations as ta - WHERE attribute = "transcript_name";""") + cursor.execute( + """SELECT ID, ta.value FROM transcript_annotations as ta + WHERE attribute = "transcript_name";""" + ) for entry in cursor: transcript_name[entry["ID"]] = entry["value"] - cursor.execute("""SELECT ID, ta.value FROM transcript_annotations as ta - WHERE attribute = "transcript_id";""") + cursor.execute( + """SELECT ID, ta.value FROM transcript_annotations as ta + WHERE attribute = "transcript_id";""" + ) for entry in cursor: transcript_ID[entry["ID"]] = entry["value"] return transcript_name, transcript_ID -def make_read_annot_file(database, build, outprefix, datasets = "all"): - """ Creates an output file with the following columns: - 1. read_name - 2. dataset - 3. genome_build - 4. chrom - 5. read_start - 6. read_end - 7. strand - 8. n_exons - 9. read_length - 10. gene_ID (TALON) - 11. transcript_ID (TALON) - 12. annot_gene_id - 13. annot_transcript_id - 14. annot_gene_name - 15. annot_transcript_name - 16. gene_novelty - 17. 
transcript_novelty - 18. ISM_subtype - 19. Fraction As (following the alignment) - 20. Custom label - 21. Allelic label - 22. Start support (external assay) - 23. End support (external assay) - - By default, reads from all datasets in the database are included, but - this can be modified by supplying a list/tuple of dataset names to the - datasets parameter. + +def make_read_annot_file(database, build, outprefix, datasets="all"): + """Creates an output file with the following columns: + 1. read_name + 2. dataset + 3. genome_build + 4. chrom + 5. read_start + 6. read_end + 7. strand + 8. n_exons + 9. read_length + 10. gene_ID (TALON) + 11. transcript_ID (TALON) + 12. annot_gene_id + 13. annot_transcript_id + 14. annot_gene_name + 15. annot_transcript_name + 16. gene_novelty + 17. transcript_novelty + 18. ISM_subtype + 19. Fraction As (following the alignment) + 20. Custom label + 21. Allelic label + 22. Start support (external assay) + 23. End support (external assay) + + By default, reads from all datasets in the database are included, but + this can be modified by supplying a list/tuple of dataset names to the + datasets parameter. """ tmp_read_file = outprefix + "_reads.tmp" - fetch_reads(database, build, tmp_file = tmp_read_file, datasets = datasets) + fetch_reads(database, build, tmp_file=tmp_read_file, datasets=datasets) # Make annotation dicts gene_names, gene_IDs = get_gene_annotations(database) @@ -378,21 +444,54 @@ def make_read_annot_file(database, build, outprefix, datasets = "all"): ISM_novelty = get_ISM_novelty(database) fname = outprefix + "_talon_read_annot.tsv" - o = open(fname, 'w') - colnames = [ "read_name", "dataset", "genome_build", "chrom", - "read_start", "read_end", "strand", "n_exons", "read_length", - "gene_ID", "transcript_ID", "annot_gene_id", "annot_transcript_id", - "annot_gene_name", "annot_transcript_name", "gene_novelty", - "transcript_novelty", "ISM_subtype", "fraction_As", "custom_label", - "allelic_label", "start_support", "end_support"] + o = open(fname, "w") + colnames = [ + "read_name", + "dataset", + "genome_build", + "chrom", + "read_start", + "read_end", + "strand", + "n_exons", + "read_length", + "gene_ID", + "transcript_ID", + "annot_gene_id", + "annot_transcript_id", + "annot_gene_name", + "annot_transcript_name", + "gene_novelty", + "transcript_novelty", + "ISM_subtype", + "fraction_As", + "custom_label", + "allelic_label", + "start_support", + "end_support", + ] o.write("\t".join(colnames) + "\n") - with open(tmp_read_file, 'r') as f: + with open(tmp_read_file, "r") as f: for read_entry in f: - read_name, dataset, genome_build, gene_ID, \ - transcript_ID, chrom, read_start, read_end, \ - strand, n_exons, read_length, fraction_As, custom_label, \ - allelic_label, start_support, end_support = read_entry.strip().split("\t") + ( + read_name, + dataset, + genome_build, + gene_ID, + transcript_ID, + chrom, + read_start, + read_end, + strand, + n_exons, + read_length, + fraction_As, + custom_label, + allelic_label, + start_support, + end_support, + ) = read_entry.strip().split("\t") gene_ID = int(gene_ID) transcript_ID = int(transcript_ID) @@ -432,20 +531,43 @@ def make_read_annot_file(database, build, outprefix, datasets = "all"): gene_ID = str(gene_ID) transcript_ID = str(transcript_ID) - o.write("\t".join([read_name, dataset, genome_build, chrom, - read_start, read_end, strand, n_exons, read_length, - gene_ID, transcript_ID, - annot_gene_id, annot_transcript_id, - annot_gene_name, annot_transcript_name, - curr_gene_novelty, 
curr_transcript_novelty, - curr_ISM_novelty, fraction_As, custom_label, - allelic_label, start_support, end_support]) + "\n") + o.write( + "\t".join( + [ + read_name, + dataset, + genome_build, + chrom, + read_start, + read_end, + strand, + n_exons, + read_length, + gene_ID, + transcript_ID, + annot_gene_id, + annot_transcript_id, + annot_gene_name, + annot_transcript_name, + curr_gene_novelty, + curr_transcript_novelty, + curr_ISM_novelty, + fraction_As, + custom_label, + allelic_label, + start_support, + end_support, + ] + ) + + "\n" + ) o.close() os.system("rm " + tmp_read_file) + def check_build_validity(build, database): - """ Make sure that the user has entered a correct build name """ + """Make sure that the user has entered a correct build name""" conn = sqlite3.connect(database) cursor = conn.cursor() @@ -455,19 +577,20 @@ def check_build_validity(build, database): conn.close() if build == None: - message = "Please provide a valid genome build name. " + \ - "In this database, your options are: " + \ - ", ".join(builds) + message = ( + "Please provide a valid genome build name. " + "In this database, your options are: " + ", ".join(builds) + ) raise ValueError(message) if build not in builds: - message = "Build name '" + build + \ - "' not found in this database. Try one of the following: " + \ - ", ".join(builds) + message = ( + "Build name '" + build + "' not found in this database. Try one of the following: " + ", ".join(builds) + ) raise ValueError(message) return + def main(): options = get_args() database = options.database @@ -483,8 +606,8 @@ def main(): if datasets != None: datasets = datasets.split(",") - make_read_annot_file(database, build, outprefix, datasets = datasets) + make_read_annot_file(database, build, outprefix, datasets=datasets) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/src/talon/post/get_transcript_sjs.py b/src/talon/post/get_transcript_sjs.py index 6b16913..57c51da 100644 --- a/src/talon/post/get_transcript_sjs.py +++ b/src/talon/post/get_transcript_sjs.py @@ -1,56 +1,63 @@ -import os -import pandas as pd import argparse +import os import sqlite3 + import numpy as np +import pandas as pd def get_args(): - - desc = ('Extracts the locations, novelty, and transcript assignments of' - ' exons/introns in a TALON database or GTF file. All positions ' - 'are 1-based.') - parser = argparse.ArgumentParser(description=desc) - - parser.add_argument('--gtf', dest='gtf', default=None, - help = 'TALON GTF file from which to extract exons/introns') - parser.add_argument('--db', dest='db', default=None, - help = 'TALON database from which to extract exons/introns') - parser.add_argument('--ref', dest='ref_gtf', - help = ('GTF reference file (ie GENCODE). Will be used to ' - 'label novelty.')) - parser.add_argument('--mode', dest='mode', - help= ("Choices are 'intron' or 'exon' (default is 'intron'). " - "Determines whether to include introns or exons in the " - "output"), default='intron') - parser.add_argument('--outprefix', dest='outprefix', - help = 'Prefix for output file') + desc = ( + "Extracts the locations, novelty, and transcript assignments of" + " exons/introns in a TALON database or GTF file. All positions " + "are 1-based." 
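+        # --gtf and --db are intended to be mutually exclusive; this script
+        # enforces that manually after parse_args() (see the check below)
+        # rather than with argparse's add_mutually_exclusive_group().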
+    )
+    parser = argparse.ArgumentParser(description=desc)
+
+    parser.add_argument("--gtf", dest="gtf", default=None, help="TALON GTF file from which to extract exons/introns")
+    parser.add_argument("--db", dest="db", default=None, help="TALON database from which to extract exons/introns")
+    parser.add_argument(
+        "--ref", dest="ref_gtf", help=("GTF reference file (e.g. GENCODE). Will be used to " "label novelty.")
+    )
+    parser.add_argument(
+        "--mode",
+        dest="mode",
+        help=(
+            "Choices are 'intron' or 'exon' (default is 'intron'). "
+            "Determines whether to include introns or exons in the "
+            "output"
+        ),
+        default="intron",
+    )
+    parser.add_argument("--outprefix", dest="outprefix", help="Prefix for output file")
+
+    args = parser.parse_args()
+
+    if args.gtf and args.db:
+        raise Exception("Provide either --gtf or --db, not both")
+
+    return args

-    args = parser.parse_args()
-
-    if args.gtf and args.db:
-        raise Exception('only input gtf or db')
-
-    return args
-

 # creates a dictionary of the last field of a gtf
 # adapted from Dana Wyman
 def get_fields(tab_fields):

     attributes = {}

     # remove trailing newline and split by semicolon
-    description = tab_fields[-1].strip('\n')
-    description = description.split(';')
+    description = tab_fields[-1].strip("\n")
+    description = description.split(";")

     # Parse description
     for fields in description:
-        if fields == "" or fields == " ": continue
+        if fields == "" or fields == " ":
+            continue
         fields = fields.split()
-        if fields[0] == '': fields = fields[1:]
+        if fields[0] == "":
+            fields = fields[1:]

-        key = fields[0].replace('"', '')
-        val = ' '.join(fields[1:]).replace('"', '')
+        key = fields[0].replace('"', "")
+        val = " ".join(fields[1:]).replace('"', "")

         attributes[key] = val

@@ -59,55 +66,49 @@
     if "gene_id" not in attributes:
         attributes["gene_id"] = "NULL"

-    return attributes
+    return attributes
+

 # create loc_df (for nodes), edge_df (for edges), and t_df (for paths)
 def create_dfs_db(db):
+    # make sure file exists
+    if not os.path.exists(db):
+        raise Exception("TALON db file not found. Check path.")

-    # make sure file exists
-    if not os.path.exists(db):
-        raise Exception('TALON db file not found. 
Check path.') + # open db connection + conn = sqlite3.connect(db) + c = conn.cursor() - # open db connection - conn = sqlite3.connect(db) - c = conn.cursor() + # loc_df + q = "SELECT loc.* FROM location loc" - # loc_df - q = 'SELECT loc.* FROM location loc' + c.execute(q) + locs = c.fetchall() - c.execute(q) - locs = c.fetchall() + loc_df = pd.DataFrame(locs, columns=["location_ID", "genome_build", "chrom", "position"]) - loc_df = pd.DataFrame(locs, - columns=['location_ID', 'genome_build', - 'chrom', 'position']) + # do some df reformatting, add strand + loc_df.drop("genome_build", axis=1, inplace=True) + loc_df.rename({"location_ID": "vertex_id", "position": "coord"}, inplace=True, axis=1) + loc_df.vertex_id = loc_df.vertex_id.map(int) - # do some df reformatting, add strand - loc_df.drop('genome_build', axis=1, inplace=True) - loc_df.rename({'location_ID': 'vertex_id', - 'position': 'coord'}, - inplace=True, axis=1) - loc_df.vertex_id = loc_df.vertex_id.map(int) + # edge_df + q = """SELECT * FROM edge """ - # edge_df - q = """SELECT * FROM edge """ + c.execute(q) + edges = c.fetchall() - c.execute(q) - edges = c.fetchall() + edge_df = pd.DataFrame(edges, columns=["edge_id", "v1", "v2", "edge_type", "strand"]) + edge_df.v1 = edge_df.v1.map(int) + edge_df.v2 = edge_df.v2.map(int) + edge_df["talon_edge_id"] = edge_df.edge_id + edge_df["edge_id"] = edge_df.apply(lambda x: (int(x.v1), int(x.v2)), axis=1) - edge_df = pd.DataFrame(edges, - columns=['edge_id', 'v1', 'v2', - 'edge_type', 'strand']) - edge_df.v1 = edge_df.v1.map(int) - edge_df.v2 = edge_df.v2.map(int) - edge_df['talon_edge_id'] = edge_df.edge_id - edge_df['edge_id'] = edge_df.apply(lambda x: (int(x.v1), int(x.v2)), axis=1) + # t_df + t_df = pd.DataFrame() - # t_df - t_df = pd.DataFrame() - - # get tid, gid, gname, and paths - q = """SELECT ga.value, ta.value, + # get tid, gid, gname, and paths + q = """SELECT ga.value, ta.value, t.start_exon, t.jn_path, t.end_exon, t.start_vertex, t.end_vertex FROM gene_annotations ga @@ -118,325 +119,334 @@ def create_dfs_db(db): OR ga.attribute='gene_id') """ - c.execute(q) - data = c.fetchall() + c.execute(q) + data = c.fetchall() + + # get fields from each transcript and add to dataframe + gids, tids, paths = zip(*[(i[0], i[1], i[2:]) for i in data[::2]]) + gnames = [i[0] for i in data[1::2]] + paths = get_db_edge_paths(paths) + + t_df["tid"] = np.asarray(tids) + t_df["path"] = np.asarray(paths) - # get fields from each transcript and add to dataframe - gids, tids, paths = zip(*[(i[0], i[1], i[2:]) for i in data[::2]]) - gnames = [i[0] for i in data[1::2]] - paths = get_db_edge_paths(paths) + t_df = create_dupe_index(t_df, "tid") + t_df = set_dupe_index(t_df, "tid") - t_df['tid'] = np.asarray(tids) - t_df['path'] = np.asarray(paths) + # furnish the last bit of info in each df + t_df["path"] = [[int(n) for n in path] for path in get_db_vertex_paths(paths, edge_df)] + loc_df = create_dupe_index(loc_df, "vertex_id") + loc_df = set_dupe_index(loc_df, "vertex_id") - t_df = create_dupe_index(t_df, 'tid') - t_df = set_dupe_index(t_df, 'tid') + edge_df.drop("talon_edge_id", axis=1, inplace=True) + edge_df = create_dupe_index(edge_df, "edge_id") + edge_df = set_dupe_index(edge_df, "edge_id") - # furnish the last bit of info in each df - t_df['path'] = [[int(n) for n in path] - for path in get_db_vertex_paths(paths, edge_df)] - loc_df = create_dupe_index(loc_df, 'vertex_id') - loc_df = set_dupe_index(loc_df, 'vertex_id') + return loc_df, edge_df, t_df - edge_df.drop('talon_edge_id', axis=1, 
inplace=True) - edge_df = create_dupe_index(edge_df, 'edge_id') - edge_df = set_dupe_index(edge_df, 'edge_id') + # create loc_df (nodes), edge_df (edges), and t_df (transcripts) from gtf + # adapted from Dana Wyman and TALON - return loc_df, edge_df, t_df - # create loc_df (nodes), edge_df (edges), and t_df (transcripts) from gtf - # adapted from Dana Wyman and TALON def create_dfs_gtf(gtf_file): + # make sure file exists + if not os.path.exists(gtf_file): + raise Exception("GTF file not found. Check path.") + + # depending on the strand, determine the stard and stop + # coords of an intron or exon + def find_edge_start_stop(v1, v2, strand): + if strand == "-": + start = max([v1, v2]) + stop = min([v1, v2]) + elif strand == "+": + start = min([v1, v2]) + stop = max([v1, v2]) + return start, stop + + # dictionaries to hold unique edges and transcripts + transcripts = {} + exons = {} + + with open(gtf_file) as gtf: + for line in gtf: + # ignore header lines + if line.startswith("#"): + continue + + # split each entry + line = line.strip().split("\t") + + # get some fields from gtf that we care about + chrom = line[0] + entry_type = line[2] + start = int(line[3]) + stop = int(line[4]) + strand = line[6] + fields = line[-1] + + # transcript entry + if entry_type == "transcript": + attributes = get_fields(line) + tid = attributes["transcript_id"] + gid = attributes["gene_id"] + + # add transcript to dictionary + transcript = {tid: {"gid": gid, "tid": tid, "strand": strand, "exons": []}} + transcripts.update(transcript) + + # exon entry + elif entry_type == "exon": + attributes = get_fields(line) + start, stop = find_edge_start_stop(start, stop, strand) + eid = "{}_{}_{}_{}_exon".format(chrom, start, stop, strand) + tid = attributes["transcript_id"] + + # add novel exon to dictionary + if eid not in exons: + edge = {eid: {"eid": eid, "chrom": chrom, "v1": start, "v2": stop, "strand": strand}} + exons.update(edge) + + # add this exon to the transcript's list of exons + if tid in transcripts: + transcripts[tid]["exons"].append(eid) + + # once we have all transcripts, make loc_df + locs = {} + vertex_id = 0 + for edge_id, edge in exons.items(): + chrom = edge["chrom"] + strand = edge["strand"] + + v1 = edge["v1"] + v2 = edge["v2"] + + # exon start + key = (chrom, v1) + if key not in locs: + locs[key] = vertex_id + vertex_id += 1 + # exon end + key = (chrom, v2) + if key not in locs: + locs[key] = vertex_id + vertex_id += 1 + + # add locs-indexed path to transcripts, and populate edges + edges = {} + for _, t in transcripts.items(): + t["path"] = [] + strand = t["strand"] + t_exons = t["exons"] + + for i, exon_id in enumerate(t_exons): + # pull some information from exon dict + exon = exons[exon_id] + chrom = exon["chrom"] + v1 = exon["v1"] + v2 = exon["v2"] + strand = exon["strand"] + + # add current exon and subsequent intron + # (if not the last exon) for each exon to edges + key = (chrom, v1, v2, strand) + v1_key = (chrom, v1) + v2_key = (chrom, v2) + edge_id = (locs[v1_key], locs[v2_key]) + if key not in edges: + edges[key] = {"edge_id": edge_id, "edge_type": "exon"} + + # add exon locs to path + t["path"] += list(edge_id) + + # if this isn't the last exon, we also needa add an intron + # this consists of v2 of the prev exon and v1 of the next exon + if i < len(t_exons) - 1: + next_exon = exons[t_exons[i + 1]] + v1 = next_exon["v1"] + key = (chrom, v2, v1, strand) + v1_key = (chrom, v1) + edge_id = (locs[v2_key], locs[v1_key]) + if key not in edges: + edges[key] = {"edge_id": edge_id, 
"edge_type": "intron"} + + # turn transcripts, edges, and locs into dataframes + locs = [{"chrom": key[0], "coord": key[1], "vertex_id": vertex_id} for key, vertex_id in locs.items()] + loc_df = pd.DataFrame(locs) + + edges = [ + { + "v1": item["edge_id"][0], + "v2": item["edge_id"][1], + "strand": key[3], + "edge_id": item["edge_id"], + "edge_type": item["edge_type"], + } + for key, item in edges.items() + ] + edge_df = pd.DataFrame(edges) + + transcripts = [{"tid": key, "gid": item["gid"], "path": item["path"]} for key, item in transcripts.items()] + t_df = pd.DataFrame(transcripts) + + # final df formatting + loc_df = create_dupe_index(loc_df, "vertex_id") + loc_df = set_dupe_index(loc_df, "vertex_id") + edge_df = create_dupe_index(edge_df, "edge_id") + edge_df = set_dupe_index(edge_df, "edge_id") + t_df = create_dupe_index(t_df, "tid") + t_df = set_dupe_index(t_df, "tid") + + return loc_df, edge_df, t_df - # make sure file exists - if not os.path.exists(gtf_file): - raise Exception('GTF file not found. Check path.') - - # depending on the strand, determine the stard and stop - # coords of an intron or exon - def find_edge_start_stop(v1, v2, strand): - if strand == '-': - start = max([v1, v2]) - stop = min([v1, v2]) - elif strand == '+': - start = min([v1, v2]) - stop = max([v1, v2]) - return start, stop - - # dictionaries to hold unique edges and transcripts - transcripts = {} - exons = {} - - with open(gtf_file) as gtf: - for line in gtf: - - # ignore header lines - if line.startswith('#'): - continue - - # split each entry - line = line.strip().split('\t') - - # get some fields from gtf that we care about - chrom = line[0] - entry_type = line[2] - start = int(line[3]) - stop = int(line[4]) - strand = line[6] - fields = line[-1] - - # transcript entry - if entry_type == "transcript": - attributes = get_fields(line) - tid = attributes['transcript_id'] - gid = attributes['gene_id'] - - # add transcript to dictionary - transcript = {tid: {'gid': gid, - 'tid': tid, - 'strand': strand, - 'exons': []}} - transcripts.update(transcript) - - # exon entry - elif entry_type == "exon": - attributes = get_fields(line) - start, stop = find_edge_start_stop(start, stop, strand) - eid = '{}_{}_{}_{}_exon'.format(chrom, start, stop, strand) - tid = attributes['transcript_id'] - - # add novel exon to dictionary - if eid not in exons: - edge = {eid: {'eid': eid, - 'chrom': chrom, - 'v1': start, - 'v2': stop, - 'strand': strand}} - exons.update(edge) - - # add this exon to the transcript's list of exons - if tid in transcripts: - transcripts[tid]['exons'].append(eid) - - # once we have all transcripts, make loc_df - locs = {} - vertex_id = 0 - for edge_id, edge in exons.items(): - chrom = edge['chrom'] - strand = edge['strand'] - - v1 = edge['v1'] - v2 = edge['v2'] - - # exon start - key = (chrom, v1) - if key not in locs: - locs[key] = vertex_id - vertex_id += 1 - # exon end - key = (chrom, v2) - if key not in locs: - locs[key] = vertex_id - vertex_id += 1 - - # add locs-indexed path to transcripts, and populate edges - edges = {} - for _,t in transcripts.items(): - t['path'] = [] - strand = t['strand'] - t_exons = t['exons'] - - for i, exon_id in enumerate(t_exons): - - # pull some information from exon dict - exon = exons[exon_id] - chrom = exon['chrom'] - v1 = exon['v1'] - v2 = exon['v2'] - strand = exon['strand'] - - # add current exon and subsequent intron - # (if not the last exon) for each exon to edges - key = (chrom, v1, v2, strand) - v1_key = (chrom, v1) - v2_key = (chrom, v2) - edge_id = 
(locs[v1_key], locs[v2_key]) - if key not in edges: - edges[key] = {'edge_id': edge_id, 'edge_type': 'exon'} - - # add exon locs to path - t['path'] += list(edge_id) - - # if this isn't the last exon, we also needa add an intron - # this consists of v2 of the prev exon and v1 of the next exon - if i < len(t_exons)-1: - next_exon = exons[t_exons[i+1]] - v1 = next_exon['v1'] - key = (chrom, v2, v1, strand) - v1_key = (chrom, v1) - edge_id = (locs[v2_key], locs[v1_key]) - if key not in edges: - edges[key] = {'edge_id': edge_id, 'edge_type': 'intron'} - - # turn transcripts, edges, and locs into dataframes - locs = [{'chrom': key[0], - 'coord': key[1], - 'vertex_id': vertex_id} for key, vertex_id in locs.items()] - loc_df = pd.DataFrame(locs) - - edges = [{'v1': item['edge_id'][0], - 'v2': item['edge_id'][1], - 'strand': key[3], - 'edge_id': item['edge_id'], - 'edge_type': item['edge_type']} for key, item in edges.items()] - edge_df = pd.DataFrame(edges) - - transcripts = [{'tid': key, - 'gid': item['gid'], - 'path': item['path']} for key, item in transcripts.items()] - t_df = pd.DataFrame(transcripts) - - # final df formatting - loc_df = create_dupe_index(loc_df, 'vertex_id') - loc_df = set_dupe_index(loc_df, 'vertex_id') - edge_df = create_dupe_index(edge_df, 'edge_id') - edge_df = set_dupe_index(edge_df, 'edge_id') - t_df = create_dupe_index(t_df, 'tid') - t_df = set_dupe_index(t_df, 'tid') - - return loc_df, edge_df, t_df # convert talon query into edge path def get_db_edge_paths(paths): - edge_paths = [] - for p in paths: - if p[1] == None: - edge_paths.append([p[0]]) - else: - edge_paths.append( - [p[0], *[int(i) for i in p[1].split(',')], p[2]]) - return edge_paths + edge_paths = [] + for p in paths: + if p[1] == None: + edge_paths.append([p[0]]) + else: + edge_paths.append([p[0], *[int(i) for i in p[1].split(",")], p[2]]) + return edge_paths + # convert edge path to vertex path def get_db_vertex_paths(paths, edge_df): - vertex_paths = [] - for p in paths: - path = [] - for i, e in enumerate(p): - entry = edge_df.loc[edge_df.talon_edge_id == e] - if i == 0: - path.extend([entry.v1.values[0], entry.v2.values[0]]) - else: path.append(entry.v2.values[0]) - vertex_paths.append(path) - return vertex_paths + vertex_paths = [] + for p in paths: + path = [] + for i, e in enumerate(p): + entry = edge_df.loc[edge_df.talon_edge_id == e] + if i == 0: + path.extend([entry.v1.values[0], entry.v2.values[0]]) + else: + path.append(entry.v2.values[0]) + vertex_paths.append(path) + return vertex_paths # creates the duplicate index def create_dupe_index(df, ind_name): - df[ind_name+'_back'] = df[ind_name] - return df + df[ind_name + "_back"] = df[ind_name] + return df + def add_coord_info(edge_df, loc_df): - edge_df['chrom'] = edge_df.apply(lambda x: loc_df.loc[x.v1, 'chrom'], axis=1) - edge_df['start'] = edge_df.apply(lambda x: loc_df.loc[x.v1, 'coord'], axis=1) - edge_df['stop'] = edge_df.apply(lambda x: loc_df.loc[x.v2, 'coord'], axis=1) + edge_df["chrom"] = edge_df.apply(lambda x: loc_df.loc[x.v1, "chrom"], axis=1) + edge_df["start"] = edge_df.apply(lambda x: loc_df.loc[x.v1, "coord"], axis=1) + edge_df["stop"] = edge_df.apply(lambda x: loc_df.loc[x.v2, "coord"], axis=1) + + return edge_df + - return edge_df +def subset_edges(edge_df, mode="intron"): + sjs = edge_df[edge_df.apply(lambda x: True if x.edge_type == mode else False, axis=1)] + return sjs -def subset_edges(edge_df, mode='intron'): - sjs = edge_df[edge_df.apply( - lambda x: True if x.edge_type == mode else False, axis=1)] - return sjs def 
determine_sj_novelty(ref_edge_df, edge_df): + # Merge known starts from ref_edge_df with the query edges + ref_edge_df["start_known"] = True + edge_df = edge_df.merge( + ref_edge_df[["chrom", "start", "strand", "start_known"]], how="left", on=["chrom", "strand", "start"] + ) + edge_df.fillna(value=False, inplace=True) + + # Merge known ends from ref_edge_df with the query edges + ref_edge_df["stop_known"] = True + edge_df = edge_df.merge( + ref_edge_df[["chrom", "stop", "strand", "stop_known"]], how="left", on=["chrom", "strand", "stop"] + ) + edge_df.fillna(value=False, inplace=True) + + # Now determine whether the edge in whole has been seen before + ref_edge_df["combination_known"] = True + edge_df = edge_df.merge( + ref_edge_df[["chrom", "start", "stop", "strand", "combination_known"]], + how="left", + on=["chrom", "strand", "start", "stop"], + ) + edge_df.fillna(value=False, inplace=True) + + return edge_df - # Merge known starts from ref_edge_df with the query edges - ref_edge_df['start_known'] = True - edge_df = edge_df.merge(ref_edge_df[['chrom', 'start', 'strand', 'start_known']], - how = 'left', - on = ['chrom', 'strand', 'start']) - edge_df.fillna(value=False, inplace=True) - - # Merge known ends from ref_edge_df with the query edges - ref_edge_df['stop_known'] = True - edge_df = edge_df.merge(ref_edge_df[['chrom', 'stop', 'strand', 'stop_known']], - how = 'left', - on = ['chrom', 'strand', 'stop']) - edge_df.fillna(value=False, inplace=True) - - # Now determine whether the edge in whole has been seen before - ref_edge_df['combination_known'] = True - edge_df = edge_df.merge(ref_edge_df[['chrom', 'start', 'stop', 'strand', - 'combination_known']], - how = 'left', on = ['chrom', 'strand', 'start', 'stop']) - edge_df.fillna(value=False, inplace=True) - - return edge_df # renames old index dupe column in df and resets the index def reset_dupe_index(df, ind_name): - df.rename({ind_name: ind_name+'_back'}, inplace=True, axis=1) - df.reset_index(inplace=True) - return(df) + df.rename({ind_name: ind_name + "_back"}, inplace=True, axis=1) + df.reset_index(inplace=True) + return df + # set index, rename dupe index in df def set_dupe_index(df, ind_name): - df.set_index(ind_name, inplace=True) - df.rename({ind_name+'_back': ind_name}, inplace=True, axis=1) - return(df) + df.set_index(ind_name, inplace=True) + df.rename({ind_name + "_back": ind_name}, inplace=True, axis=1) + return df -def format_edge_df(edge_df): - edge_df.reset_index(drop=True, inplace=True) - edge_df.drop(['edge_type', 'v1', 'v2'], axis=1, inplace=True) - return edge_df - -def find_tids_from_sj(edge_df, t_df, mode='intron'): - if mode == 'exon': - t_df['edges'] = t_df.apply( - lambda x: [(x.path[i], x.path[i+1]) for i in range(len(x.path[:-1]))][::2], - axis=1) - elif mode == 'intron': - t_df['edges'] = t_df.apply( - lambda x: [(x.path[i], x.path[i+1]) for i in range(len(x.path[:-1]))][1::2], - axis=1) - edge_df['tids'] = edge_df.apply(lambda x: add_tids_to_sj(x, t_df), axis=1) - edge_df.reset_index(drop=True, inplace=True) - edge_df.drop('edge_id', inplace=True, axis=1) - - return edge_df -def add_tids_to_sj(x, t_df): - return ','.join([tid for tid, edges in zip(t_df.tid, t_df.edges) if x.edge_id in edges]) - - -def main(): - args = get_args() +def format_edge_df(edge_df): + edge_df.reset_index(drop=True, inplace=True) + edge_df.drop(["edge_type", "v1", "v2"], axis=1, inplace=True) + return edge_df - ref_loc_df, ref_edge_df, ref_t_df = create_dfs_gtf(args.ref_gtf) - ref_edge_df = add_coord_info(ref_edge_df, ref_loc_df) 
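For orientation, the novelty labelling in determine_sj_novelty above is three pandas left-merges against the reference junction table, with fillna(False) marking anything the reference never saw. A minimal sketch of one of those merges, on invented toy junctions rather than real TALON output:

import pandas as pd

# toy reference and query junction tables (coordinates invented)
ref = pd.DataFrame({"chrom": ["chr1"], "start": [100], "stop": [200], "strand": ["+"]})
qry = pd.DataFrame({"chrom": ["chr1", "chr1"], "start": [100, 100], "stop": [200, 250], "strand": ["+", "+"]})

# a left merge flags query junctions whose start coordinate is known;
# unmatched rows come back NaN, which fillna turns into False
ref["start_known"] = True
qry = qry.merge(ref[["chrom", "start", "strand", "start_known"]], how="left", on=["chrom", "strand", "start"])
qry.fillna(value=False, inplace=True)

# both toy junctions reuse the known start, so start_known is True for each;
# repeating the merge on stop and on (start, stop) yields stop_known and
# combination_known, as in the function above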
- ref_edge_df = subset_edges(ref_edge_df, mode=args.mode) - ref_edge_df = format_edge_df(ref_edge_df) - if args.db: - loc_df, edge_df, t_df = create_dfs_db(args.db) +def find_tids_from_sj(edge_df, t_df, mode="intron"): + if mode == "exon": + t_df["edges"] = t_df.apply(lambda x: [(x.path[i], x.path[i + 1]) for i in range(len(x.path[:-1]))][::2], axis=1) + elif mode == "intron": + t_df["edges"] = t_df.apply( + lambda x: [(x.path[i], x.path[i + 1]) for i in range(len(x.path[:-1]))][1::2], axis=1 + ) + edge_df["tids"] = edge_df.apply(lambda x: add_tids_to_sj(x, t_df), axis=1) + edge_df.reset_index(drop=True, inplace=True) + edge_df.drop("edge_id", inplace=True, axis=1) - elif args.gtf: - loc_df, edge_df, t_df = create_dfs_gtf(args.gtf) + return edge_df - edge_df = add_coord_info(edge_df, loc_df) - edge_df = subset_edges(edge_df, mode=args.mode) - edge_df = format_edge_df(edge_df) - edge_df = determine_sj_novelty(ref_edge_df, edge_df) - edge_df = find_tids_from_sj(edge_df, t_df, mode=args.mode) - edge_df = edge_df.rename(columns={'tids': 'transcript_ids'}) - edge_df.to_csv('{}_{}s.tsv'.format(args.outprefix, args.mode), - sep='\t', index=False, columns=["chrom","start","stop", - "strand", "start_known", - "stop_known", - "combination_known", - "transcript_ids"]) +def add_tids_to_sj(x, t_df): + return ",".join([tid for tid, edges in zip(t_df.tid, t_df.edges) if x.edge_id in edges]) -if __name__ == '__main__': - main() +def main(): + args = get_args() + + ref_loc_df, ref_edge_df, ref_t_df = create_dfs_gtf(args.ref_gtf) + ref_edge_df = add_coord_info(ref_edge_df, ref_loc_df) + ref_edge_df = subset_edges(ref_edge_df, mode=args.mode) + ref_edge_df = format_edge_df(ref_edge_df) + + if args.db: + loc_df, edge_df, t_df = create_dfs_db(args.db) + + elif args.gtf: + loc_df, edge_df, t_df = create_dfs_gtf(args.gtf) + + edge_df = add_coord_info(edge_df, loc_df) + edge_df = subset_edges(edge_df, mode=args.mode) + edge_df = format_edge_df(edge_df) + edge_df = determine_sj_novelty(ref_edge_df, edge_df) + edge_df = find_tids_from_sj(edge_df, t_df, mode=args.mode) + + edge_df = edge_df.rename(columns={"tids": "transcript_ids"}) + edge_df.to_csv( + "{}_{}s.tsv".format(args.outprefix, args.mode), + sep="\t", + index=False, + columns=[ + "chrom", + "start", + "stop", + "strand", + "start_known", + "stop_known", + "combination_known", + "transcript_ids", + ], + ) + + +if __name__ == "__main__": + main() diff --git a/src/talon/post/map_antisense_genes_to_sense.py b/src/talon/post/map_antisense_genes_to_sense.py index 1a506ec..bdd4276 100644 --- a/src/talon/post/map_antisense_genes_to_sense.py +++ b/src/talon/post/map_antisense_genes_to_sense.py @@ -4,27 +4,32 @@ # map_antisense_genes_to_sense.py is a utility that outputs the ID of the # corresponding sense gene for every antisense gene in the database -from optparse import OptionParser import sqlite3 +from optparse import OptionParser from pathlib import Path from . import ab_utils as autils + def getOptions(): parser = OptionParser() - parser.add_option("--db", dest = "database", - help = "TALON database", metavar = "FILE", type = "string") - parser.add_option("--annot", "-a", dest = "annot", - help = """Which annotation version to use. Will determine which + parser.add_option("--db", dest="database", help="TALON database", metavar="FILE", type="string") + parser.add_option( + "--annot", + "-a", + dest="annot", + help="""Which annotation version to use. Will determine which annotation is used to fetch gene names. 
Note: - Must be in the TALON database.""", type = "string") - parser.add_option("--o", dest = "outprefix", help = "Prefix for output GTF", - metavar = "FILE", type = "string") + Must be in the TALON database.""", + type="string", + ) + parser.add_option("--o", dest="outprefix", help="Prefix for output GTF", metavar="FILE", type="string") (options, args) = parser.parse_args() return options + # def check_annot_validity(annot, database): # """ Make sure that the user has entered a correct annotation name """ # @@ -52,16 +57,19 @@ def getOptions(): # # return + def create_gene_name_dict(cursor, annot): - """ Create a dictionary mapping TALON gene IDs to their names in the - annot annotation""" + """Create a dictionary mapping TALON gene IDs to their names in the + annot annotation""" - cursor.execute("""SELECT ga.ID, + cursor.execute( + """SELECT ga.ID, ga.value AS gene_name FROM gene_annotations AS ga WHERE ga.attribute = 'gene_name' - AND (ga.annot_name = '%s' OR ga.source = 'TALON')""" \ - % (annot)) + AND (ga.annot_name = '%s' OR ga.source = 'TALON')""" + % (annot) + ) gene_names = {} for entry in cursor.fetchall(): @@ -71,6 +79,7 @@ def create_gene_name_dict(cursor, annot): return gene_names + def main(): options = getOptions() database = options.database @@ -89,10 +98,12 @@ def main(): conn = sqlite3.connect(database) conn.row_factory = sqlite3.Row cursor = conn.cursor() - cursor.execute("""SELECT ga.ID As antisense_talon_ID, + cursor.execute( + """SELECT ga.ID As antisense_talon_ID, ga.value AS sense_talon_ID FROM gene_annotations AS ga - WHERE ga.attribute = 'gene_antisense_to_IDs'""") + WHERE ga.attribute = 'gene_antisense_to_IDs'""" + ) antisense_rows = cursor.fetchall() # Create a dict of gene names @@ -100,16 +111,15 @@ def main(): # Write antisense-sense pairs to file. When there is more than one sense match, # create separate lines - o = open(outfile, 'w') + o = open(outfile, "w") o.write(",".join(["antisense_talon_ID", "sense_talon_ID", "sense_gene_name"]) + "\n") for entry in antisense_rows: sense_IDs = entry["sense_talon_ID"].split(",") for sense_ID in sense_IDs: - o.write(",".join([str(entry["antisense_talon_ID"]), - str(sense_ID), - gene_name_dict[int(sense_ID)]]) + "\n") + o.write(",".join([str(entry["antisense_talon_ID"]), str(sense_ID), gene_name_dict[int(sense_ID)]]) + "\n") o.close() -if __name__ == '__main__': + +if __name__ == "__main__": main() diff --git a/src/talon/post/post_utils.py b/src/talon/post/post_utils.py index 0805f65..097d9a0 100644 --- a/src/talon/post/post_utils.py +++ b/src/talon/post/post_utils.py @@ -3,13 +3,14 @@ from .. import query_utils as qutils + def handle_filtering(database, annot, observed, whitelist_file, dataset_file): - """ Determines which transcripts to allow in the analysis. This can be done - in two different ways. If no whitelist is included, then all of the - transcripts in the database are included (modified by 'observed' - option). If a whitelist is provided, then transcripts on that list - will be included (modified by 'observed' option). This can be - tuned further by providing a dataset file, but this is optional. """ + """Determines which transcripts to allow in the analysis. This can be done + in two different ways. If no whitelist is included, then all of the + transcripts in the database are included (modified by 'observed' + option). If a whitelist is provided, then transcripts on that list + will be included (modified by 'observed' option). 
This can be + tuned further by providing a dataset file, but this is optional.""" conn = sqlite3.connect(database) conn.row_factory = sqlite3.Row @@ -31,14 +32,17 @@ def handle_filtering(database, annot, observed, whitelist_file, dataset_file): if datasets != None: # Limit the whitelist to transcripts detected in the datasets - transcripts = [ x[1] for x in whitelist ] + transcripts = [x[1] for x in whitelist] transcript_str = qutils.format_for_IN(transcripts) dataset_str = qutils.format_for_IN(datasets) query = """ SELECT DISTINCT gene_ID, transcript_ID FROM observed WHERE transcript_ID IN %s - AND dataset in %s """ % (transcript_str, dataset_str) + AND dataset in %s """ % ( + transcript_str, + dataset_str, + ) cursor.execute(query) whitelist = cursor.fetchall() @@ -46,6 +50,6 @@ def handle_filtering(database, annot, observed, whitelist_file, dataset_file): # check if the pass list has any transcripts if len(whitelist) == 0: - raise ValueError('No transcripts found with the given filtering settings') + raise ValueError("No transcripts found with the given filtering settings") return whitelist diff --git a/src/talon/post/summarize_datasets.py b/src/talon/post/summarize_datasets.py index 2bdd885..e0bbfac 100644 --- a/src/talon/post/summarize_datasets.py +++ b/src/talon/post/summarize_datasets.py @@ -4,61 +4,76 @@ from .. import query_utils as qutils + def get_args(): - """ Fetches the arguments for the program """ + """Fetches the arguments for the program""" program_desc = """Generates a tab-delimited file of gene and transcript counts for each dataset in the database (broken down by category).""" parser = argparse.ArgumentParser(description=program_desc) - parser.add_argument('--db', dest = 'database', metavar='FILE,', type = str, - help='TALON database') - parser.add_argument('--groups', dest = 'groups', metavar='FILE,', type = str, - help='Optional: file of comma-delimited dataset groups to process together', default = None) - parser.add_argument("--verbose", - help = "Verbose mode: print out the counts in terminal", - action="store_true") - parser.add_argument("--o", dest = "outprefix", - help = "Prefix for output file", type = str) + parser.add_argument("--db", dest="database", metavar="FILE,", type=str, help="TALON database") + parser.add_argument( + "--groups", + dest="groups", + metavar="FILE,", + type=str, + help="Optional: file of comma-delimited dataset groups to process together", + default=None, + ) + parser.add_argument("--verbose", help="Verbose mode: print out the counts in terminal", action="store_true") + parser.add_argument("--o", dest="outprefix", help="Prefix for output file", type=str) args = parser.parse_args() return args -def write_counts_file(cursor, outprefix, datasets, verbose = False): - """ Create a log file with the following columns: - - dataset name - - Number of reads annotated - - Number of known genes detected (total) - - Number of novel genes detected (total) - - Number of known transcripts detected (total) - - Number of novel transcripts detected (total) - Breakdowns by category - - Number of antisense genes detected - - Number of intergenic genes detected - - Number of known transcripts - - Number of FSM transcripts detected (perfect + with novelty) - - Number of total ISM transcripts detected - - Number of suffix ISMs detected - - Number of antisense transcripts detected - - Number of genomic transcripts detected + +def write_counts_file(cursor, outprefix, datasets, verbose=False): + """Create a log file with the following columns: + - dataset name + 
- Number of reads annotated + - Number of known genes detected (total) + - Number of novel genes detected (total) + - Number of known transcripts detected (total) + - Number of novel transcripts detected (total) + Breakdowns by category + - Number of antisense genes detected + - Number of intergenic genes detected + - Number of known transcripts + - Number of FSM transcripts detected (perfect + with novelty) + - Number of total ISM transcripts detected + - Number of suffix ISMs detected + - Number of antisense transcripts detected + - Number of genomic transcripts detected """ - o = open(outprefix + "_talon_summary.tsv", 'w') + o = open(outprefix + "_talon_summary.tsv", "w") d = dict() - columns = [ "dataset", "reads_annotated", "known_genes", "antisense_genes", - "other_novel_genes", "known_transcripts", "novel_transcripts", - "ISMs", "prefix_ISMs", "suffix_ISMs", "NICs", "NNCs", - "antisense_transcripts", "genomic_transcripts" ] + columns = [ + "dataset", + "reads_annotated", + "known_genes", + "antisense_genes", + "other_novel_genes", + "known_transcripts", + "novel_transcripts", + "ISMs", + "prefix_ISMs", + "suffix_ISMs", + "NICs", + "NNCs", + "antisense_transcripts", + "genomic_transcripts", + ] o.write("\t".join(columns) + "\n") # Get dataset names if datasets == None: cursor.execute(""" SELECT dataset_name FROM dataset """) - datasets = [ str(x[0]) for x in cursor.fetchall() ] + datasets = [str(x[0]) for x in cursor.fetchall()] for dataset in datasets: - # Get number of reads in the dataset reads = qutils.count_observed_reads(cursor, dataset) @@ -97,10 +112,22 @@ def write_counts_file(cursor, outprefix, datasets, verbose = False): # Get genomic novel transcripts genomic_transcripts = len(qutils.fetch_genomic_transcripts(cursor, dataset)) - outputs = [ dataset, reads, known_genes, antisense_genes, - intergenic_genes, known_transcripts, novel_transcripts, ISMs, prefix_ISMs, - suffix_ISMs, NICs, NNCs, antisense_transcripts, - genomic_transcripts ] + outputs = [ + dataset, + reads, + known_genes, + antisense_genes, + intergenic_genes, + known_transcripts, + novel_transcripts, + ISMs, + prefix_ISMs, + suffix_ISMs, + NICs, + NNCs, + antisense_transcripts, + genomic_transcripts, + ] if verbose == True: print("---------------%s---------------" % dataset) @@ -123,18 +150,20 @@ def write_counts_file(cursor, outprefix, datasets, verbose = False): o.close() + def process_groups(group_file): - """ Read in a comma-delimited file of dataset groups and format them - as a list of lists """ + """Read in a comma-delimited file of dataset groups and format them + as a list of lists""" datasets = [] - with open(group_file, 'r') as f: + with open(group_file, "r") as f: for line in f: line = line.strip() datasets.append(line.split(",")) return datasets + def main(): options = get_args() @@ -157,5 +186,5 @@ def main(): conn.close() -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/src/talon/process_sams.py b/src/talon/process_sams.py index 84c8071..80720c0 100644 --- a/src/talon/process_sams.py +++ b/src/talon/process_sams.py @@ -4,39 +4,39 @@ # Functions related to processing the input SAM files and partitioning them # for processing in parallel -import pyranges as pr -import pysam +import logging import os import time -import logging + +import pyranges as pr +import pysam save = pysam.set_verbosity(0) # pysam.set_verbosity(save) + def convert_to_bam(sam, bam, threads): - """ Convert provided sam file to bam file (provided name). 
""" + """Convert provided sam file to bam file (provided name).""" try: infile = pysam.AlignmentFile(sam, "r", threads=threads) - outfile = pysam.AlignmentFile( - bam, "wb", template=infile, threads=threads) + outfile = pysam.AlignmentFile(bam, "wb", template=infile, threads=threads) for s in infile: outfile.write(s) except Exception as e: logging.error(e) - msg = f'Problem converting SAM file {sam} to BAM' + msg = f"Problem converting SAM file {sam} to BAM" logging.error(msg) raise RuntimeError(msg) # raise RuntimeError("Problem converting sam file '%s' to bam." % (sam)) -def preprocess_sam(sam_files, datasets, use_cb_tag, - tmp_dir="talon_tmp/", n_threads=0): - """ Copy and rename the provided SAM/BAM file(s), merge them, and index. - This is necessary in order to use following commands on the reads. - The renaming is necessary in order to label the reads according to - their dataset.""" +def preprocess_sam(sam_files, datasets, use_cb_tag, tmp_dir="talon_tmp/", n_threads=0): + """Copy and rename the provided SAM/BAM file(s), merge them, and index. + This is necessary in order to use following commands on the reads. + The renaming is necessary in order to label the reads according to + their dataset.""" # Create the tmp dir os.system("mkdir -p %s " % (tmp_dir)) @@ -57,8 +57,7 @@ def preprocess_sam(sam_files, datasets, use_cb_tag, renamed_sams.append(sorted_bam) merged_bam = tmp_dir + "merged.bam" - merge_args = [merged_bam] + renamed_sams + \ - ["-f", "-r", "-@", str(n_threads)] + merge_args = [merged_bam] + renamed_sams + ["-f", "-r", "-@", str(n_threads)] # index_args = [merged_bam, "-@", str(n_threads)] # # Merge datasets and use -r option to include a read group tag @@ -76,12 +75,12 @@ def preprocess_sam(sam_files, datasets, use_cb_tag, elif use_cb_tag: for i, sam in enumerate(sam_files): fname_split = sam.split(".") - suffix = "."+fname_split[-1] + suffix = "." + fname_split[-1] if suffix == ".sam": - bam_copy = '{}{}_unsorted.bam'.format(tmp_dir, i) + bam_copy = "{}{}_unsorted.bam".format(tmp_dir, i) convert_to_bam(sam, bam_copy, n_threads) sam = bam_copy - sorted_bam = '{}{}.bam'.format(tmp_dir, i) + sorted_bam = "{}{}.bam".format(tmp_dir, i) pysam.sort("-@", str(n_threads), "-o", sorted_bam, sam) renamed_sams.append(sorted_bam) @@ -96,39 +95,39 @@ def preprocess_sam(sam_files, datasets, use_cb_tag, pysam.index(sorted_bam) # ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) # print("[ %s ] Merged input SAM/BAM files" % (ts)) - logging.info('Merged input SAM/BAM files') + logging.info("Merged input SAM/BAM files") except: # raise RuntimeError(("Problem merging and indexing SAM/BAM files. " # "Check your file paths and make sure that all " # "files have headers.")) - msg = "Problem merging and indexing SAM/BAM files. "+\ - "Check your file paths and make sure that all "+\ - "files have headers." + msg = ( + "Problem merging and indexing SAM/BAM files. " + + "Check your file paths and make sure that all " + + "files have headers." + ) logging.error(msg) raise RuntimeError(msg) return sorted_bam -def partition_reads(sam_files, datasets, use_cb_tag, - tmp_dir="talon_tmp/", n_threads=0): - """ Use bedtools merge to create non-overlapping intervals from all of the - transcripts in a series of SAM/BAM files. Then, iterate over the intervals - to extract all reads inside of them from the pysam object. 
+def partition_reads(sam_files, datasets, use_cb_tag, tmp_dir="talon_tmp/", n_threads=0): + """Use bedtools merge to create non-overlapping intervals from all of the + transcripts in a series of SAM/BAM files. Then, iterate over the intervals + to extract all reads inside of them from the pysam object. - Returns: - - List of lists: sublists contain pysam reads from a given interval - - List of tuple intervals - - filename of merged bam file (to keep track of the header) - """ - merged_bam = preprocess_sam(sam_files, datasets, use_cb_tag, - tmp_dir=tmp_dir, n_threads=n_threads) + Returns: + - List of lists: sublists contain pysam reads from a given interval + - List of tuple intervals + - filename of merged bam file (to keep track of the header) + """ + merged_bam = preprocess_sam(sam_files, datasets, use_cb_tag, tmp_dir=tmp_dir, n_threads=n_threads) try: gr = pr.read_bam(merged_bam) except Exception as e: # print(e) logging.error(e) - msg = f'Problem opening SAM file {merged_bam}' + msg = f"Problem opening SAM file {merged_bam}" logging.error(msg) raise RuntimeError(msg) @@ -139,20 +138,17 @@ def partition_reads(sam_files, datasets, use_cb_tag, read_groups = [] with pysam.AlignmentFile(merged_bam) as bam: # type: pysam.AlignmentFile for _, interval in gr.df.iterrows(): - reads = get_reads_in_interval(bam, interval.Chromosome, - interval.Start, interval.End) + reads = get_reads_in_interval(bam, interval.Chromosome, interval.Start, interval.End) read_groups.append(reads) - coords.append((interval.Chromosome, - interval.Start + 1, interval.End)) + coords.append((interval.Chromosome, interval.Start + 1, interval.End)) return read_groups, coords, merged_bam -def write_reads_to_file(read_groups, intervals, - header_template, tmp_dir="talon_tmp/"): - """ For each read group, iterate over the reads and write them to a file - named for the interval they belong to. This step is necessary because - Pysam objects cannot be pickled. """ +def write_reads_to_file(read_groups, intervals, header_template, tmp_dir="talon_tmp/"): + """For each read group, iterate over the reads and write them to a file + named for the interval they belong to. This step is necessary because + Pysam objects cannot be pickled.""" tmp_dir = tmp_dir + "interval_files/" if not os.path.exists(tmp_dir): @@ -171,9 +167,9 @@ def write_reads_to_file(read_groups, intervals, def get_reads_in_interval(sam, chrom, start, end): - """ Given an open pysam.AlignmentFile, return only the reads that overlap - the provided interval. Note that this means there may be reads that - extend beyond the bounds of the interval. """ + """Given an open pysam.AlignmentFile, return only the reads that overlap + the provided interval. 
Note that this means there may be reads that + extend beyond the bounds of the interval.""" iterator = sam.fetch(chrom, start, end) reads = [x for x in iterator] return reads diff --git a/src/talon/query_utils.py b/src/talon/query_utils.py index 992bd2c..c9946c8 100644 --- a/src/talon/query_utils.py +++ b/src/talon/query_utils.py @@ -5,12 +5,14 @@ import sqlite3 + def fetch_reproducible_intergenic(cursor, datasets): - """ Return the gene and transcript ID of any intergenic transcripts that were - found in at least two of the supplied datasets """ + """Return the gene and transcript ID of any intergenic transcripts that were + found in at least two of the supplied datasets""" datasets = format_for_IN(datasets) - query = """SELECT gene_ID, + query = ( + """SELECT gene_ID, a.transcript_ID FROM abundance as a LEFT JOIN transcript_annotations as ta @@ -18,20 +20,24 @@ def fetch_reproducible_intergenic(cursor, datasets): LEFT JOIN transcripts ON transcripts.transcript_ID = a.transcript_ID WHERE ta.attribute = 'intergenic_transcript' - AND a.dataset IN """ + datasets + \ - """ GROUP BY a.transcript_ID + AND a.dataset IN """ + + datasets + + """ GROUP BY a.transcript_ID HAVING count(*) > 1;""" + ) cursor.execute(query) intergenic = [(x[0], x[1], "intergenic_transcript") for x in cursor.fetchall()] return intergenic + def fetch_reproducible_antisense(cursor, datasets): - """ Return the gene and transcript ID of any antisense transcripts that were - found in at least two of the supplied datasets """ + """Return the gene and transcript ID of any antisense transcripts that were + found in at least two of the supplied datasets""" datasets = format_for_IN(datasets) - query = """SELECT gene_ID, + query = ( + """SELECT gene_ID, a.transcript_ID FROM abundance as a LEFT JOIN transcript_annotations as ta @@ -39,20 +45,24 @@ def fetch_reproducible_antisense(cursor, datasets): LEFT JOIN transcripts ON transcripts.transcript_ID = a.transcript_ID WHERE ta.attribute = 'antisense_transcript' - AND a.dataset IN """ + datasets + \ - """ GROUP BY a.transcript_ID + AND a.dataset IN """ + + datasets + + """ GROUP BY a.transcript_ID HAVING count(*) > 1;""" + ) cursor.execute(query) antisense = [(x[0], x[1], "antisense_transcript") for x in cursor.fetchall()] return antisense + def fetch_reproducible_NNCs(cursor, datasets): - """ Return the gene and transcript ID of any NNC transcripts that were - found in at least two of the supplied datasets """ + """Return the gene and transcript ID of any NNC transcripts that were + found in at least two of the supplied datasets""" datasets = format_for_IN(datasets) - query = """SELECT gene_ID, + query = ( + """SELECT gene_ID, a.transcript_ID FROM abundance as a LEFT JOIN transcript_annotations as ta @@ -60,20 +70,24 @@ def fetch_reproducible_NNCs(cursor, datasets): LEFT JOIN transcripts ON transcripts.transcript_ID = a.transcript_ID WHERE ta.attribute = 'NNC_transcript' - AND a.dataset IN """ + datasets + \ - """ GROUP BY a.transcript_ID + AND a.dataset IN """ + + datasets + + """ GROUP BY a.transcript_ID HAVING count(*) > 1;""" + ) cursor.execute(query) NNC = [(x[0], x[1], "NNC_transcript") for x in cursor.fetchall()] return NNC + def fetch_reproducible_NICs(cursor, datasets): - """ Return the gene and transcript ID of any NIC transcripts that were - found in at least two of the supplied datasets """ + """Return the gene and transcript ID of any NIC transcripts that were + found in at least two of the supplied datasets""" datasets = format_for_IN(datasets) - query = """SELECT 
gene_ID, + query = ( + """SELECT gene_ID, a.transcript_ID FROM abundance as a LEFT JOIN transcript_annotations as ta @@ -81,24 +95,28 @@ def fetch_reproducible_NICs(cursor, datasets): LEFT JOIN transcripts ON transcripts.transcript_ID = a.transcript_ID WHERE ta.attribute = 'NIC_transcript' - AND a.dataset IN """ + datasets + \ - """ GROUP BY a.transcript_ID + AND a.dataset IN """ + + datasets + + """ GROUP BY a.transcript_ID HAVING count(*) > 1;""" + ) cursor.execute(query) NIC = [(x[0], x[1], "NIC_transcript") for x in cursor.fetchall()] return NIC + def fetch_reproducible_ISMs(cursor, datasets): - """ Return the gene and transcript ID of any ISM transcripts that were - found in at least two of the supplied datasets """ + """Return the gene and transcript ID of any ISM transcripts that were + found in at least two of the supplied datasets""" datasets = format_for_IN(datasets) transcripts_seen = {} # To label novelty, perform queries separately for suffix, prefix, and # regular ISMs - query = """SELECT gene_ID, + query = ( + """SELECT gene_ID, a.transcript_ID FROM abundance as a LEFT JOIN transcript_annotations as ta @@ -106,9 +124,11 @@ def fetch_reproducible_ISMs(cursor, datasets): LEFT JOIN transcripts ON transcripts.transcript_ID = a.transcript_ID WHERE ta.attribute = 'ISM-prefix_transcript' - AND a.dataset IN """ + datasets + \ - """ GROUP BY a.transcript_ID + AND a.dataset IN """ + + datasets + + """ GROUP BY a.transcript_ID HAVING count(*) > 1;""" + ) cursor.execute(query) ISMs = [(x[0], x[1], "ISM-prefix_transcript") for x in cursor.fetchall()] @@ -116,7 +136,8 @@ def fetch_reproducible_ISMs(cursor, datasets): for entry in ISMs: transcripts_seen[entry[1]] = 1 - query = """SELECT gene_ID, + query = ( + """SELECT gene_ID, a.transcript_ID FROM abundance as a LEFT JOIN transcript_annotations as ta @@ -124,9 +145,11 @@ def fetch_reproducible_ISMs(cursor, datasets): LEFT JOIN transcripts ON transcripts.transcript_ID = a.transcript_ID WHERE ta.attribute = 'ISM-suffix_transcript' - AND a.dataset IN """ + datasets + \ - """ GROUP BY a.transcript_ID + AND a.dataset IN """ + + datasets + + """ GROUP BY a.transcript_ID HAVING count(*) > 1;""" + ) cursor.execute(query) suffix_ISMs = [(x[0], x[1], "ISM-suffix_transcript") for x in cursor.fetchall()] @@ -136,7 +159,8 @@ def fetch_reproducible_ISMs(cursor, datasets): ISMs.append(entry) transcripts_seen[entry[1]] = 1 - query = """SELECT gene_ID, + query = ( + """SELECT gene_ID, a.transcript_ID FROM abundance as a LEFT JOIN transcript_annotations as ta @@ -144,9 +168,11 @@ def fetch_reproducible_ISMs(cursor, datasets): LEFT JOIN transcripts ON transcripts.transcript_ID = a.transcript_ID WHERE ta.attribute = 'ISM_transcript' - AND a.dataset IN """ + datasets + \ - """ GROUP BY a.transcript_ID + AND a.dataset IN """ + + datasets + + """ GROUP BY a.transcript_ID HAVING count(*) > 1;""" + ) cursor.execute(query) all_ISMs = [(x[0], x[1], "other_ISM_transcript") for x in cursor.fetchall()] @@ -158,32 +184,41 @@ def fetch_reproducible_ISMs(cursor, datasets): return ISMs + def fetch_known_transcripts_with_gene_label(cursor, datasets): - """ Fetch known transcripts along with the gene they belong to """ + """Fetch known transcripts along with the gene they belong to""" datasets = format_for_IN(datasets) - query = """SELECT DISTINCT gene_ID,transcript_ID FROM observed + query = ( + """SELECT DISTINCT gene_ID,transcript_ID FROM observed LEFT JOIN transcript_annotations AS ta ON ta.ID = observed.transcript_ID WHERE (ta.attribute = 'transcript_status' AND 
ta.value = 'KNOWN') - AND observed.dataset IN """ + datasets + AND observed.dataset IN """ + + datasets + ) cursor.execute(query) known_transcripts = [(x[0], x[1], "FSM_transcript") for x in cursor.fetchall()] return known_transcripts + def fetch_NIC_transcripts_with_gene_label(cursor, datasets): - """ Fetch NIC transcripts along with the gene they belong to """ + """Fetch NIC transcripts along with the gene they belong to""" datasets = format_for_IN(datasets) - query = """SELECT DISTINCT gene_ID,transcript_ID FROM observed + query = ( + """SELECT DISTINCT gene_ID,transcript_ID FROM observed LEFT JOIN transcript_annotations AS ta ON ta.ID = observed.transcript_ID WHERE (ta.attribute = 'NIC_transcript') - AND observed.dataset IN """ + datasets + AND observed.dataset IN """ + + datasets + ) cursor.execute(query) known_transcripts = [(x[0], x[1], "NIC_transcript") for x in cursor.fetchall()] return known_transcripts + def count_observed_reads(cursor, datasets): - """ Count the number of observed reads for the provided datasets """ + """Count the number of observed reads for the provided datasets""" datasets = format_for_IN(datasets) query = "SELECT COUNT(obs_ID) FROM observed WHERE dataset IN " + datasets @@ -191,61 +226,76 @@ def count_observed_reads(cursor, datasets): reads = cursor.fetchone()[0] return reads + def fetch_all_known_genes_detected(cursor, datasets): - """ Get the IDs of all known genes found in a particular dataset (no - filter with respect to the type of transcript detected). """ + """Get the IDs of all known genes found in a particular dataset (no + filter with respect to the type of transcript detected).""" datasets = format_for_IN(datasets) - query = """SELECT DISTINCT(gene_ID) FROM observed + query = ( + """SELECT DISTINCT(gene_ID) FROM observed LEFT JOIN gene_annotations AS ga ON ga.ID = observed.gene_ID WHERE (ga.attribute = 'gene_status' AND ga.value = 'KNOWN') - AND observed.dataset IN """ + datasets + AND observed.dataset IN """ + + datasets + ) cursor.execute(query) known_genes = [x[0] for x in cursor.fetchall()] return known_genes + def count_known_genes_detected(cursor, dataset): - """ Count the number of known genes detected in the dataset (no filter - with respect to the type of transcript detected). """ + """Count the number of known genes detected in the dataset (no filter + with respect to the type of transcript detected).""" known_genes = fetch_all_known_genes_detected(cursor, dataset) return len(known_genes) + def count_novel_genes_detected(cursor, dataset): - """ Count the number of novel genes detected in the dataset (no filter - with respect to the type of transcript detected). """ + """Count the number of novel genes detected in the dataset (no filter + with respect to the type of transcript detected).""" novel_genes = fetch_all_novel_genes_detected(cursor, dataset) return len(novel_genes) + def fetch_all_novel_genes_detected(cursor, datasets): - """ Get the IDs of all novel genes found in a particular dataset (no - filter with respect to the type of transcript detected). 
""" + """Get the IDs of all novel genes found in a particular dataset (no + filter with respect to the type of transcript detected).""" datasets = format_for_IN(datasets) - query = """SELECT DISTINCT(gene_ID) FROM observed + query = ( + """SELECT DISTINCT(gene_ID) FROM observed LEFT JOIN gene_annotations AS ga ON ga.ID = observed.gene_ID WHERE (ga.attribute = 'gene_status' AND ga.value = 'NOVEL') - AND observed.dataset IN """ + datasets + AND observed.dataset IN """ + + datasets + ) cursor.execute(query) novel_genes = [x[0] for x in cursor.fetchall()] return novel_genes + def fetch_all_known_transcripts_detected(cursor, datasets): - """ Get the IDs of all transcripts annotated as known. Does not include - novel FSMs """ - + """Get the IDs of all transcripts annotated as known. Does not include + novel FSMs""" + datasets = format_for_IN(datasets) - query = """SELECT DISTINCT(transcript_ID) FROM observed + query = ( + """SELECT DISTINCT(transcript_ID) FROM observed LEFT JOIN transcript_annotations AS ta ON ta.ID = observed.transcript_ID WHERE (ta.attribute = 'transcript_status' AND ta.value = 'KNOWN') - AND observed.dataset IN """ + datasets + AND observed.dataset IN """ + + datasets + ) cursor.execute(query) known_transcripts = [x[0] for x in cursor.fetchall()] return known_transcripts + def fetch_FSM_novel_transcripts(cursor, dataset): - """ Fetch IDs of novel FSMs observed in the current dataset """ + """Fetch IDs of novel FSMs observed in the current dataset""" query = """SELECT DISTINCT(transcript_ID) FROM observed LEFT JOIN transcript_annotations AS ta ON ta.ID = observed.transcript_ID @@ -255,167 +305,213 @@ def fetch_FSM_novel_transcripts(cursor, dataset): FSM_transcripts = [x[0] for x in cursor.fetchall()] return FSM_transcripts + def fetch_novel_transcripts(cursor, datasets): - """ Fetch IDs of novel transcripts observed in the current dataset """ + """Fetch IDs of novel transcripts observed in the current dataset""" datasets = format_for_IN(datasets) - query = """SELECT DISTINCT(transcript_ID) FROM observed + query = ( + """SELECT DISTINCT(transcript_ID) FROM observed LEFT JOIN transcript_annotations AS ta ON ta.ID = observed.transcript_ID WHERE (ta.attribute = 'transcript_status' AND ta.value = 'NOVEL') - AND observed.dataset IN """ + datasets + AND observed.dataset IN """ + + datasets + ) cursor.execute(query) transcripts = [x[0] for x in cursor.fetchall()] return transcripts + def fetch_antisense_genes(cursor, datasets): - """ Fetch IDs of antisense genes observed in the dataset(s) """ + """Fetch IDs of antisense genes observed in the dataset(s)""" datasets = format_for_IN(datasets) - query = """SELECT DISTINCT(gene_ID) FROM observed + query = ( + """SELECT DISTINCT(gene_ID) FROM observed LEFT JOIN gene_annotations AS ga ON ga.ID = observed.gene_ID WHERE (ga.attribute = 'antisense_gene') - AND observed.dataset IN """ + datasets + AND observed.dataset IN """ + + datasets + ) cursor.execute(query) genes = [x[0] for x in cursor.fetchall()] return genes + def fetch_intergenic_novel_genes(cursor, datasets): - """ Fetch IDs of novel genes denoted as intergenic """ + """Fetch IDs of novel genes denoted as intergenic""" datasets = format_for_IN(datasets) - query = """SELECT DISTINCT(gene_ID) FROM observed + query = ( + """SELECT DISTINCT(gene_ID) FROM observed LEFT JOIN gene_annotations AS ga ON ga.ID = observed.gene_ID WHERE (ga.attribute = 'intergenic_novel') - AND observed.dataset IN """ + datasets + AND observed.dataset IN """ + + datasets + ) cursor.execute(query) genes = [x[0] 
for x in cursor.fetchall()] return genes + def fetch_all_ISM_transcripts(cursor, datasets): - """ Fetch IDs of all ISM transcripts """ - + """Fetch IDs of all ISM transcripts""" + datasets = format_for_IN(datasets) - query = """SELECT DISTINCT(transcript_ID) FROM observed + query = ( + """SELECT DISTINCT(transcript_ID) FROM observed LEFT JOIN transcript_annotations AS ta ON ta.ID = observed.transcript_ID WHERE (ta.attribute = 'ISM_transcript') - AND observed.dataset IN """ + datasets + AND observed.dataset IN """ + + datasets + ) cursor.execute(query) transcripts = [x[0] for x in cursor.fetchall()] return transcripts + def fetch_prefix_ISM_transcripts(cursor, datasets): - """ Fetch IDs of all ISM prefix transcripts """ + """Fetch IDs of all ISM prefix transcripts""" datasets = format_for_IN(datasets) - query = """SELECT DISTINCT(transcript_ID) FROM observed + query = ( + """SELECT DISTINCT(transcript_ID) FROM observed LEFT JOIN transcript_annotations AS ta ON ta.ID = observed.transcript_ID WHERE (ta.attribute = 'ISM-prefix_transcript') - AND observed.dataset IN """ + datasets + AND observed.dataset IN """ + + datasets + ) cursor.execute(query) transcripts = [x[0] for x in cursor.fetchall()] return transcripts + def fetch_suffix_ISM_transcripts(cursor, datasets): - """ Fetch IDs of all ISM suffix transcripts """ + """Fetch IDs of all ISM suffix transcripts""" datasets = format_for_IN(datasets) - query = """SELECT DISTINCT(transcript_ID) FROM observed + query = ( + """SELECT DISTINCT(transcript_ID) FROM observed LEFT JOIN transcript_annotations AS ta ON ta.ID = observed.transcript_ID WHERE (ta.attribute = 'ISM-suffix_transcript') - AND observed.dataset IN """ + datasets + AND observed.dataset IN """ + + datasets + ) cursor.execute(query) transcripts = [x[0] for x in cursor.fetchall()] return transcripts + def fetch_NIC_transcripts(cursor, datasets): - """ Fetch IDs of all NIC transcripts """ + """Fetch IDs of all NIC transcripts""" datasets = format_for_IN(datasets) - query = """SELECT DISTINCT(transcript_ID) FROM observed + query = ( + """SELECT DISTINCT(transcript_ID) FROM observed LEFT JOIN transcript_annotations AS ta ON ta.ID = observed.transcript_ID WHERE (ta.attribute = 'NIC_transcript') - AND observed.dataset IN """ + datasets + AND observed.dataset IN """ + + datasets + ) cursor.execute(query) transcripts = [x[0] for x in cursor.fetchall()] return transcripts + def fetch_NNC_transcripts(cursor, datasets): - """ Fetch IDs of all NNC transcripts """ + """Fetch IDs of all NNC transcripts""" datasets = format_for_IN(datasets) - query = """SELECT DISTINCT(transcript_ID) FROM observed + query = ( + """SELECT DISTINCT(transcript_ID) FROM observed LEFT JOIN transcript_annotations AS ta ON ta.ID = observed.transcript_ID WHERE (ta.attribute = 'NNC_transcript') - AND observed.dataset IN """ + datasets + AND observed.dataset IN """ + + datasets + ) cursor.execute(query) transcripts = [x[0] for x in cursor.fetchall()] return transcripts + def fetch_antisense_transcripts(cursor, datasets): - """ Fetch IDs of all antisense transcripts """ + """Fetch IDs of all antisense transcripts""" datasets = format_for_IN(datasets) - query = """SELECT DISTINCT(transcript_ID) FROM observed + query = ( + """SELECT DISTINCT(transcript_ID) FROM observed LEFT JOIN transcript_annotations AS ta ON ta.ID = observed.transcript_ID WHERE (ta.attribute = 'antisense_transcript') - AND observed.dataset IN """ + datasets + AND observed.dataset IN """ + + datasets + ) cursor.execute(query) transcripts = [x[0] for x in 
cursor.fetchall()]
     return transcripts
 
+
 def fetch_intergenic_transcripts(cursor, datasets):
-    """ Fetch IDs of all intergenic transcripts """
+    """Fetch IDs of all intergenic transcripts"""
 
     datasets = format_for_IN(datasets)
 
-    query = """SELECT DISTINCT(transcript_ID) FROM observed
+    query = (
+        """SELECT DISTINCT(transcript_ID) FROM observed
                LEFT JOIN transcript_annotations AS ta ON ta.ID = observed.transcript_ID
                WHERE (ta.attribute = 'intergenic_transcript')
-               AND observed.dataset IN """ + datasets
+               AND observed.dataset IN """
+        + datasets
+    )
     cursor.execute(query)
 
     transcripts = [x[0] for x in cursor.fetchall()]
     return transcripts
 
 
 def fetch_genomic_transcripts(cursor, datasets):
-    """ Fetch IDs of all genomic transcripts """
+    """Fetch IDs of all genomic transcripts"""
 
     datasets = format_for_IN(datasets)
 
-    query = """SELECT DISTINCT(transcript_ID) FROM observed
+    query = (
+        """SELECT DISTINCT(transcript_ID) FROM observed
                LEFT JOIN transcript_annotations AS ta ON ta.ID = observed.transcript_ID
                WHERE (ta.attribute = 'genomic_transcript')
-               AND observed.dataset IN """ + datasets
+               AND observed.dataset IN """
+        + datasets
+    )
     cursor.execute(query)
 
     transcripts = [x[0] for x in cursor.fetchall()]
     return transcripts
 
+
 def fetch_all_transcript_gene_pairs(cursor):
-    """ Return gene_ID - transcript_ID tuples from database """
+    """Return gene_ID - transcript_ID tuples from database"""
 
     query = """ SELECT gene_ID, transcript_ID FROM transcripts """
     cursor.execute(query)
-
+
     pairs = cursor.fetchall()
     return pairs
-
+
+
 def fetch_all_datasets(cursor):
-    """ Return a list of all datasets in database """
+    """Return a list of all datasets in database"""
 
     cursor.execute("SELECT dataset_name FROM dataset")
     datasets = [str(x[0]) for x in cursor.fetchall()]
     return datasets
 
+
 def parse_whitelist(whitelist_file):
-    """ From the whitelist file, obtain a list of acccepted gene and
-        transcript IDs tuples"""
+    """From the whitelist file, obtain a list of accepted gene and
+    transcript ID tuples"""
 
     whitelist = set()
-    with open(whitelist_file, 'r') as f:
+    with open(whitelist_file, "r") as f:
         for line in f:
             line = line.strip()
             fields = line.split(",")
@@ -427,13 +523,14 @@ def parse_whitelist(whitelist_file):
                 raise ValueError("Gene/Transcript IDs in whitelist must be integer TALON IDs")
     return whitelist
 
+
 def parse_datasets(dataset_file, cursor):
-    """ From the dataset file, obtain a list of acccepted dataset names"""
+    """From the dataset file, obtain a list of accepted dataset names"""
 
     # Get datasets in this database
     db_datasets = fetch_all_datasets(cursor)
 
     dataset_list = set()
-    with open(dataset_file, 'r') as f:
+    with open(dataset_file, "r") as f:
         for line in f:
             line = line.strip()
             fields = line.split()
@@ -443,13 +540,14 @@ def parse_datasets(dataset_file, cursor):
                 dataset_list.add(dataset)
     return dataset_list
 
-#-------------------------------------------------------------------------------
+
+# -------------------------------------------------------------------------------
def format_for_IN(l):
-    """ Converts input to string that can be used for IN database query """
-
+    """Converts input to string that can be used for IN database query"""
+
     if type(l) is tuple:
         l = list(l)
     if type(l) is str:
         l = [l]
 
-    return "(" + ','.join(['"' + str(x) + '"' for x in l]) + ")"
+    return "(" + ",".join(['"' + str(x) + '"' for x in l]) + ")"
diff --git a/src/talon/reformat_gtf.py b/src/talon/reformat_gtf.py
index 5e2912e..3188492 100644
--- a/src/talon/reformat_gtf.py
+++ b/src/talon/reformat_gtf.py
@@ -1,218 +1,212 @@
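+# (Editor's note, illustrative sketch only, not part of the original patch.)
+# This script synthesizes the missing gene/transcript rows for a GTF that
+# contains only exon entries. For a hypothetical input such as
+#
+#     chr1  HAVANA  exon  100  200  .  +  .  gene_id "G1"; transcript_id "T1";
+#     chr1  HAVANA  exon  400  500  .  +  .  gene_id "G1"; transcript_id "T1";
+#
+# it writes <input>_reformatted.gtf with gene and transcript rows spanning the
+# min/max exon coordinates (here 100-500), built by construct_new_entry below.
+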
import argparse + import pandas as pd + def get_args(): + desc = "Fixes a GTF with no genes" + parser = argparse.ArgumentParser(description=desc) - desc = 'Fixes a GTF with no genes' - parser = argparse.ArgumentParser(description=desc) + parser.add_argument("-gtf", "-g", dest="gtf", help="gtf to fix") + args = parser.parse_args() - parser.add_argument('-gtf', '-g', dest='gtf', - help='gtf to fix') - args = parser.parse_args() + return args - return args # check what entries are missing in the gtf def is_bad_gtf(gtffile): + missing_gene = False + missing_trans = False - missing_gene = False - missing_trans = False + # how many lines are useless lines + with open(gtffile, "r") as infile: + for i, line in enumerate(infile): + if "##" not in line: + break + skiprows = [j for j in range(0, i)] - # how many lines are useless lines - with open(gtffile, 'r') as infile: - for i, line in enumerate(infile): - if '##' not in line: - break - skiprows = [j for j in range(0, i)] + df = pd.read_csv(gtffile, sep="\t", usecols=[2], skiprows=skiprows) + categories = df.iloc[:, 0].unique() - df = pd.read_csv(gtffile, sep='\t', usecols=[2], skiprows=skiprows) - categories = df.iloc[:,0].unique() + # print(categories) - # print(categories) + # what are we missing? + if "gene" not in categories: + missing_gene = True + if "transcript" not in categories: + missing_trans = True - # what are we missing? - if 'gene' not in categories: - missing_gene = True - if 'transcript' not in categories: - missing_trans = True + return (missing_gene, missing_trans) - return (missing_gene, missing_trans) # get value associated with keyword in the 9th column of gtf def get_field_value(key, fields): if key not in fields: return None else: - return fields.split(key+' "')[1].split()[0].replace('";','') + return fields.split(key + ' "')[1].split()[0].replace('";', "") + def construct_new_entry(prev_line, coords, entry_type): + # print('Constructing new {} entry'.format(entry_type)) - # print('Constructing new {} entry'.format(entry_type)) - - # add gene or transcript type, coords, and len - prev_line[2] = entry_type - prev_line[3] = min(coords) - prev_line[4] = max(coords) - prev_line[7] = '.' - - # change the fields to reflect what type we are now - new_fields = '' - fields = prev_line[-1] - gid = get_field_value('gene_id', fields) - new_fields += 'gene_id "{}";'.format(gid) - - # if there's a gene name add it too - gname = get_field_value('gene_name', fields) - if gname: - new_fields += 'gene_name "{}";'.format(gname) - - if entry_type == 'transcript': - tid = get_field_value('transcript_id', fields) - new_fields += ' transcript_id "{}";'.format(tid) - - prev_line[-1] = new_fields - prev_line = format_to_write(prev_line) - - return prev_line + # add gene or transcript type, coords, and len + prev_line[2] = entry_type + prev_line[3] = min(coords) + prev_line[4] = max(coords) + prev_line[7] = "." 
-def make_ofile_name(matfile, prefix=None): - fname = matfile.split('.gtf')[0] - if prefix: - fname += '_' - fname += prefix - fname += '_reformatted.gtf' - return fname + # change the fields to reflect what type we are now + new_fields = "" + fields = prev_line[-1] + gid = get_field_value("gene_id", fields) + new_fields += 'gene_id "{}";'.format(gid) -def format_to_write(line): - return ''.join('\t'.join([str(i) for i in line])+'\n') + # if there's a gene name add it too + gname = get_field_value("gene_name", fields) + if gname: + new_fields += 'gene_name "{}";'.format(gname) -def main(): + if entry_type == "transcript": + tid = get_field_value("transcript_id", fields) + new_fields += ' transcript_id "{}";'.format(tid) - args = get_args() - gtffile = args.gtf + prev_line[-1] = new_fields + prev_line = format_to_write(prev_line) - (missing_gene, missing_transcript) = is_bad_gtf(gtffile) + return prev_line - print('Missing transcript : {}'.format(missing_transcript)) - # if nothing is missing, you good! - if not missing_gene and not missing_transcript: - print('GTF has both gene and transcript entries. Nothing to add.') - return +def make_ofile_name(matfile, prefix=None): + fname = matfile.split(".gtf")[0] + if prefix: + fname += "_" + fname += prefix + fname += "_reformatted.gtf" + return fname + + +def format_to_write(line): + return "".join("\t".join([str(i) for i in line]) + "\n") - # loop through this thing - infile = open(gtffile, 'r') - outfile = open(make_ofile_name(gtffile), 'w') - curr_gid = '' - curr_gid_coords = [] +def main(): + args = get_args() + gtffile = args.gtf + + (missing_gene, missing_transcript) = is_bad_gtf(gtffile) - curr_tid = '' - curr_tid_coords = [] + print("Missing transcript : {}".format(missing_transcript)) - first_transcript = True - first_exon = True + # if nothing is missing, you good! + if not missing_gene and not missing_transcript: + print("GTF has both gene and transcript entries. 
Nothing to add.") + return - gene_list = [] - transcript_list = [] + # loop through this thing + infile = open(gtffile, "r") + outfile = open(make_ofile_name(gtffile), "w") - prev_line = '' + curr_gid = "" + curr_gid_coords = [] - # relevant entries - entries = ['exon'] + curr_tid = "" + curr_tid_coords = [] - if missing_gene: - entries.append('transcript') + first_transcript = True + first_exon = True - if missing_gene or missing_transcript: + gene_list = [] + transcript_list = [] - for line in infile: + prev_line = "" - # skip the dumb header lines - if line.startswith('#'): - continue + # relevant entries + entries = ["exon"] - line = line.strip().split('\t') - fields = line[-1] + if missing_gene: + entries.append("transcript") - gid = get_field_value('gene_id', fields) - tid = get_field_value('transcript_id', fields) + if missing_gene or missing_transcript: + for line in infile: + # skip the dumb header lines + if line.startswith("#"): + continue - if line[2] in entries: + line = line.strip().split("\t") + fields = line[-1] - # set variables if first entry - if first_exon: - curr_gid = gid - curr_tid = tid + gid = get_field_value("gene_id", fields) + tid = get_field_value("transcript_id", fields) - curr_gid_coords = [int(line[3]), int(line[4])] - curr_tid_coords = [int(line[3]), int(line[4])] + if line[2] in entries: + # set variables if first entry + if first_exon: + curr_gid = gid + curr_tid = tid - first_exon = False + curr_gid_coords = [int(line[3]), int(line[4])] + curr_tid_coords = [int(line[3]), int(line[4])] - prev_line = line + first_exon = False - # found a new transcript - elif missing_transcript and tid != curr_tid: + prev_line = line - # create transcript entry and dump to current gene list - new_entry = construct_new_entry( - prev_line, curr_tid_coords, 'transcript') - transcript_list = new_entry+''.join(transcript_list) + # found a new transcript + elif missing_transcript and tid != curr_tid: + # create transcript entry and dump to current gene list + new_entry = construct_new_entry(prev_line, curr_tid_coords, "transcript") + transcript_list = new_entry + "".join(transcript_list) - gene_list += transcript_list - transcript_list = '' - curr_tid_coords = [] + gene_list += transcript_list + transcript_list = "" + curr_tid_coords = [] - if missing_gene and gid != curr_gid: - - # create gene entry and write current gene list - new_entry = construct_new_entry( - prev_line, curr_gid_coords, 'gene') - gene_list = new_entry+''.join(gene_list) + if missing_gene and gid != curr_gid: + # create gene entry and write current gene list + new_entry = construct_new_entry(prev_line, curr_gid_coords, "gene") + gene_list = new_entry + "".join(gene_list) - gene_list += ''.join(transcript_list) - transcript_list = '' - curr_tid_coords = [] + gene_list += "".join(transcript_list) + transcript_list = "" + curr_tid_coords = [] - outfile.write(gene_list) - gene_list = '' - curr_gid_coords = [] + outfile.write(gene_list) + gene_list = "" + curr_gid_coords = [] - # update loop vars - curr_gid = gid - curr_tid = tid - curr_gid_coords.append(int(line[3])) - curr_gid_coords.append(int(line[4])) - curr_tid_coords.append(int(line[3])) - curr_tid_coords.append(int(line[4])) + # update loop vars + curr_gid = gid + curr_tid = tid + curr_gid_coords.append(int(line[3])) + curr_gid_coords.append(int(line[4])) + curr_tid_coords.append(int(line[3])) + curr_tid_coords.append(int(line[4])) - prev_line = line + prev_line = line - # regardless, append to list of entries to write - transcript_list += 
format_to_write(line) + # regardless, append to list of entries to write + transcript_list += format_to_write(line) - # if we've reached the end of the file - # create transcript entry and dump to current gene list - if missing_transcript: - new_entry = construct_new_entry( - prev_line, curr_tid_coords, 'transcript') - transcript_list = new_entry+''.join(transcript_list) + # if we've reached the end of the file + # create transcript entry and dump to current gene list + if missing_transcript: + new_entry = construct_new_entry(prev_line, curr_tid_coords, "transcript") + transcript_list = new_entry + "".join(transcript_list) - gene_list += transcript_list - transcript_list = '' + gene_list += transcript_list + transcript_list = "" - # create gene entry and write current gene list - if missing_gene: - new_entry = construct_new_entry( - prev_line, curr_gid_coords, 'gene') - gene_list = new_entry+''.join(gene_list) - outfile.write(gene_list) - gene_list = '' + # create gene entry and write current gene list + if missing_gene: + new_entry = construct_new_entry(prev_line, curr_gid_coords, "gene") + gene_list = new_entry + "".join(gene_list) + outfile.write(gene_list) + gene_list = "" + infile.close() + outfile.close() - infile.close() - outfile.close() -if __name__ == '__main__': - main() +if __name__ == "__main__": + main() diff --git a/src/talon/talon.py b/src/talon/talon.py index d85dace..79daa18 100644 --- a/src/talon/talon.py +++ b/src/talon/talon.py @@ -5,39 +5,41 @@ # assigns them transcript and gene identifiers based on a GTF annotation. # Novel transcripts are assigned new identifiers. import argparse -from functools import reduce -import sqlite3 -import sys +import logging +import multiprocessing as mp import operator import os +import queue +import sqlite3 +import sys +import time +import warnings +from datetime import datetime, timedelta +from functools import reduce +from itertools import islice, repeat from pathlib import Path +from string import Template + import pandas as pd -import warnings -import logging +import pysam + +from talon.post import get_read_annotations from . import dstruct -from . import process_sams as procsams -from . import transcript_utils as tutils -from . import query_utils as qutils from . import init_refs as init_refs from . import logger as logger -from talon.post import get_read_annotations -import pysam -from string import Template -import multiprocessing as mp -import queue -from datetime import datetime, timedelta -import time -from itertools import repeat, islice - +from . import process_sams as procsams +from . import query_utils as qutils +from . 
import transcript_utils as tutils # set verbosity for pysam save = pysam.set_verbosity(0) # pysam.set_verbosity(save) + class Counter(object): def __init__(self, initval=0): - self.val = mp.Value('i', initval) + self.val = mp.Value("i", initval) self.lock = mp.Lock() def increment(self): @@ -51,8 +53,8 @@ def value(self): def get_counters(database): - """ Fetch counter values from the database and create counter objects - that will be accessible to all of the threads during the parallel run + """Fetch counter values from the database and create counter objects + that will be accessible to all of the threads during the parallel run """ with sqlite3.connect(database) as conn: @@ -62,34 +64,33 @@ def get_counters(database): # Fetch counter values cursor.execute("SELECT * FROM counters WHERE category == 'genes'") global gene_counter - gene_counter = Counter(initval=cursor.fetchone()['count']) + gene_counter = Counter(initval=cursor.fetchone()["count"]) - cursor.execute( - "SELECT * FROM counters WHERE category == 'transcripts'") + cursor.execute("SELECT * FROM counters WHERE category == 'transcripts'") global transcript_counter - transcript_counter = Counter(initval=cursor.fetchone()['count']) + transcript_counter = Counter(initval=cursor.fetchone()["count"]) cursor.execute("SELECT * FROM counters WHERE category == 'vertex'") global vertex_counter - vertex_counter = Counter(initval=cursor.fetchone()['count']) + vertex_counter = Counter(initval=cursor.fetchone()["count"]) cursor.execute("SELECT * FROM counters WHERE category == 'edge'") global edge_counter - edge_counter = Counter(initval=cursor.fetchone()['count']) + edge_counter = Counter(initval=cursor.fetchone()["count"]) cursor.execute("SELECT * FROM counters WHERE category == 'observed'") global observed_counter - observed_counter = Counter(initval=cursor.fetchone()['count']) + observed_counter = Counter(initval=cursor.fetchone()["count"]) cursor.execute("SELECT * FROM counters WHERE category == 'dataset'") global dataset_counter - dataset_counter = Counter(initval=cursor.fetchone()['count']) + dataset_counter = Counter(initval=cursor.fetchone()["count"]) return def get_args(): - """ Fetches the arguments for the program """ + """Fetches the arguments for the program""" program_desc = """TALON takes transcripts from one or more long read datasets (SAM format) and assigns them transcript and gene @@ -97,51 +98,88 @@ def get_args(): Novel events are assigned new identifiers.""" parser = argparse.ArgumentParser(description=program_desc) - parser.add_argument("--f", dest="config_file", - help="Dataset config file: dataset name, sample description, " + - "platform, sam file (comma-delimited)", type=str) - parser.add_argument("--cb", dest='use_cb_tag', action='store_true', - help="Use CB tag in input SAM file instead of including " + - "a dataset name in your config file", default=False) - parser.add_argument('--db', dest='database', metavar='FILE,', type=str, - help='TALON database. Created using talon_initialize_database') - parser.add_argument('--build', dest='build', metavar='STRING,', type=str, - help='Genome build (i.e. hg38) to use. Must be in the database.') - parser.add_argument("--threads", "-t", dest="threads", - help="Number of threads to run program with.", - type=int, default=2) - parser.add_argument("--cov", "-c", dest="min_coverage", - help="Minimum alignment coverage in order to use a SAM entry. 
Default = 0.9", - type=float, default=0.9) - parser.add_argument("--identity", "-i", dest="min_identity", - help="Minimum alignment identity in order to use a SAM entry. Default = 0.8", - type=float, default=0.8) - parser.add_argument("--nsg", "--create_novel_spliced_genes", dest='create_novel_spliced_genes', action='store_true', - help="Make novel genes with the intergenic novelty label " + - "for transcripts that don't share " + - "splice junctions with any other models", default=False) - parser.add_argument("--tmpDir", dest="tmp_dir", - help="Path to directory for tmp files. Default = `talon_tmp/`", - type=str, default="talon_tmp/") - parser.add_argument("--verbosity", "-v", type=int, default=1, - help="Verbosity of TALON output. Higher numbers = more verbose.") - parser.add_argument("--o", dest="outprefix", - help="Prefix for output files", type=str) + parser.add_argument( + "--f", + dest="config_file", + help="Dataset config file: dataset name, sample description, " + "platform, sam file (comma-delimited)", + type=str, + ) + parser.add_argument( + "--cb", + dest="use_cb_tag", + action="store_true", + help="Use CB tag in input SAM file instead of including " + "a dataset name in your config file", + default=False, + ) + parser.add_argument( + "--db", + dest="database", + metavar="FILE,", + type=str, + help="TALON database. Created using talon_initialize_database", + ) + parser.add_argument( + "--build", + dest="build", + metavar="STRING,", + type=str, + help="Genome build (i.e. hg38) to use. Must be in the database.", + ) + parser.add_argument( + "--threads", "-t", dest="threads", help="Number of threads to run program with.", type=int, default=2 + ) + parser.add_argument( + "--cov", + "-c", + dest="min_coverage", + help="Minimum alignment coverage in order to use a SAM entry. Default = 0.9", + type=float, + default=0.9, + ) + parser.add_argument( + "--identity", + "-i", + dest="min_identity", + help="Minimum alignment identity in order to use a SAM entry. Default = 0.8", + type=float, + default=0.8, + ) + parser.add_argument( + "--nsg", + "--create_novel_spliced_genes", + dest="create_novel_spliced_genes", + action="store_true", + help="Make novel genes with the intergenic novelty label " + + "for transcripts that don't share " + + "splice junctions with any other models", + default=False, + ) + parser.add_argument( + "--tmpDir", + dest="tmp_dir", + help="Path to directory for tmp files. Default = `talon_tmp/`", + type=str, + default="talon_tmp/", + ) + parser.add_argument( + "--verbosity", "-v", type=int, default=1, help="Verbosity of TALON output. Higher numbers = more verbose." + ) + parser.add_argument("--o", dest="outprefix", help="Prefix for output files", type=str) args = parser.parse_args() return args def str_wrap_double(s): - """ Adds double quotes around the input string """ + """Adds double quotes around the input string""" s = str(s) return '"' + s + '"' def search_for_vertex_at_pos(chromosome, position, location_dict): - """ Given a chromosome and a position (1-based), this function queries the - location dict to determine whether a vertex - fitting those criteria exists. Returns the row if yes, and __ if no. + """Given a chromosome and a position (1-based), this function queries the + location dict to determine whether a vertex + fitting those criteria exists. Returns the row if yes, and __ if no. 
""" try: return location_dict[chromosome][position] @@ -150,7 +188,7 @@ def search_for_vertex_at_pos(chromosome, position, location_dict): def search_for_edge(vertex_1, vertex_2, edge_type, edge_dict): - """ Search the edge dict for an edge linking vertex_1 and vertex_2""" + """Search the edge dict for an edge linking vertex_1 and vertex_2""" query_key = (vertex_1, vertex_2, edge_type) try: return edge_dict[query_key] @@ -158,12 +196,11 @@ def search_for_edge(vertex_1, vertex_2, edge_type, edge_dict): return None -def match_monoexon_vertices(chromosome, positions, strand, location_dict, - run_info): - """ Given the start and end of a single-exon transcript, this function looks - for a matching vertex for each position. Also returns a list where each - index indicates whether that vertex is novel to the data structure - (0 for known, 1 for novel) """ +def match_monoexon_vertices(chromosome, positions, strand, location_dict, run_info): + """Given the start and end of a single-exon transcript, this function looks + for a matching vertex for each position. Also returns a list where each + index indicates whether that vertex is novel to the data structure + (0 for known, 1 for novel)""" # Returned by function vertex_matches = [] @@ -183,20 +220,19 @@ def match_monoexon_vertices(chromosome, positions, strand, location_dict, if curr_index == start: sj_pos = positions[curr_index + 1] pos_type = "start" - vertex_match, diff_5p = permissive_vertex_search(chromosome, position, - strand, sj_pos, pos_type, - location_dict, run_info) + vertex_match, diff_5p = permissive_vertex_search( + chromosome, position, strand, sj_pos, pos_type, location_dict, run_info + ) elif curr_index == end: sj_pos = positions[curr_index - 1] pos_type = "end" - vertex_match, diff_3p = permissive_vertex_search(chromosome, position, - strand, sj_pos, pos_type, - location_dict, run_info) + vertex_match, diff_3p = permissive_vertex_search( + chromosome, position, strand, sj_pos, pos_type, location_dict, run_info + ) if vertex_match == None: # If no vertex matches the position, one is created. - vertex_match = create_vertex(chromosome, position, location_dict, run_info)[ - "location_ID"] + vertex_match = create_vertex(chromosome, position, location_dict, run_info)["location_ID"] novelty.append(1) else: novelty.append(0) @@ -208,42 +244,39 @@ def match_monoexon_vertices(chromosome, positions, strand, location_dict, def match_splice_vertices(chromosome, positions, strand, location_dict, run_info): - """ Given a chromosome and a list of positions from the transcript in 5' to - 3' end order, this function looks for a matching vertex for each splice - junction position (so it ignores the ends). Also returns a list where - each index indicates whether that vertex is novel to the data structure - (0 for known, 1 for novel) """ + """Given a chromosome and a list of positions from the transcript in 5' to + 3' end order, this function looks for a matching vertex for each splice + junction position (so it ignores the ends). 
Also returns a list where + each index indicates whether that vertex is novel to the data structure + (0 for known, 1 for novel)""" # Returned by function vertex_matches = [] novelty = [] # Iterate over positions - for curr_index in range(1, len(positions)-1): + for curr_index in range(1, len(positions) - 1): position = positions[curr_index] - vertex_match = search_for_vertex_at_pos( - chromosome, position, location_dict) + vertex_match = search_for_vertex_at_pos(chromosome, position, location_dict) if vertex_match == None: # If no vertex matches the position, one is created. - vertex_match = create_vertex( - chromosome, position, location_dict, run_info) + vertex_match = create_vertex(chromosome, position, location_dict, run_info) novelty.append(1) else: novelty.append(0) # Add to running list of matches - vertex_matches.append(vertex_match['location_ID']) + vertex_matches.append(vertex_match["location_ID"]) return vertex_matches, novelty -def match_all_transcript_vertices(chromosome, positions, strand, location_dict, - run_info): - """ Given a chromosome and a list of positions from the transcript in 5' to - 3' end order, this function looks for a matching vertex for each - position. Also returns a list where each index indicates whether that - vertex is novel to the data structure (0 for known, 1 for novel) """ +def match_all_transcript_vertices(chromosome, positions, strand, location_dict, run_info): + """Given a chromosome and a list of positions from the transcript in 5' to + 3' end order, this function looks for a matching vertex for each + position. Also returns a list where each index indicates whether that + vertex is novel to the data structure (0 for known, 1 for novel)""" # Returned by function vertex_matches = [] @@ -263,39 +296,38 @@ def match_all_transcript_vertices(chromosome, positions, strand, location_dict, if curr_index == start: sj_pos = positions[curr_index + 1] pos_type = "start" - vertex_match, diff_5p = permissive_vertex_search(chromosome, position, - strand, sj_pos, pos_type, - location_dict, run_info) + vertex_match, diff_5p = permissive_vertex_search( + chromosome, position, strand, sj_pos, pos_type, location_dict, run_info + ) elif curr_index == end: sj_pos = positions[curr_index - 1] pos_type = "end" - vertex_match, diff_3p = permissive_vertex_search(chromosome, position, - strand, sj_pos, pos_type, - location_dict, run_info) + vertex_match, diff_3p = permissive_vertex_search( + chromosome, position, strand, sj_pos, pos_type, location_dict, run_info + ) # Remaining mid-transcript positions go through strict matching process else: - vertex_match = search_for_vertex_at_pos( - chromosome, position, location_dict) + vertex_match = search_for_vertex_at_pos(chromosome, position, location_dict) if vertex_match == None: # If no vertex matches the position, one is created. - vertex_match = create_vertex( - chromosome, position, location_dict, run_info) + vertex_match = create_vertex(chromosome, position, location_dict, run_info) novelty.append(1) else: novelty.append(0) # Add to running list of matches - vertex_matches.append(vertex_match['location_ID']) + vertex_matches.append(vertex_match["location_ID"]) return tuple(vertex_matches), tuple(novelty), diff_5p, diff_3p -def permissive_match_with_gene_priority(chromosome, position, strand, sj_pos, - pos_type, gene_ID, gene_locs, locations, run_info): - """ Tries to match a position to a known start/end vertex from the same - gene. If none is found, the normal permissive match procedure is - invoked. 
+def permissive_match_with_gene_priority( + chromosome, position, strand, sj_pos, pos_type, gene_ID, gene_locs, locations, run_info +): + """Tries to match a position to a known start/end vertex from the same + gene. If none is found, the normal permissive match procedure is + invoked. """ # Check inputs if pos_type != "start" and pos_type != "end": @@ -303,7 +335,7 @@ def permissive_match_with_gene_priority(chromosome, position, strand, sj_pos, logging.error(msg) raise ValueError(msg) if strand != "+" and strand != "-": - msg = f'Invalid strand specified: {strand}' + msg = f"Invalid strand specified: {strand}" logging.error(msg) raise ValueError(msg) @@ -313,23 +345,21 @@ def permissive_match_with_gene_priority(chromosome, position, strand, sj_pos, dist = 0 if gene_ID in gene_locs: if position in gene_locs[gene_ID]: - return match['location_ID'], dist, 1 + return match["location_ID"], dist, 1 else: - return match['location_ID'], dist, 0 + return match["location_ID"], dist, 0 else: - return match['location_ID'], dist, 0 + return match["location_ID"], dist, 0 # This approach only works when there are known starts/ends for this gene if gene_ID in gene_locs: - # Get cutoff distance if pos_type == "start": max_dist = run_info.cutoff_5p else: max_dist = run_info.cutoff_3p - if (strand == "+" and pos_type == "start") or \ - (strand == "-" and pos_type == "end"): + if (strand == "+" and pos_type == "start") or (strand == "-" and pos_type == "end"): search_window_start = position - max_dist search_window_end = sj_pos else: @@ -355,23 +385,20 @@ def permissive_match_with_gene_priority(chromosome, position, strand, sj_pos, return closest_vertex, best_dist, 1 # Otherwise, revert to permissive match approach. - match, dist = permissive_vertex_search(chromosome, position, strand, - sj_pos, pos_type, - locations, run_info) + match, dist = permissive_vertex_search(chromosome, position, strand, sj_pos, pos_type, locations, run_info) return match, dist, 0 -def permissive_vertex_search(chromosome, position, strand, sj_pos, pos_type, - locations, run_info): - """ Given a position, this function tries to find a vertex match within the - cutoff distance that also comes before the splice junction begins. - If no vertex is found, the function returns None. """ +def permissive_vertex_search(chromosome, position, strand, sj_pos, pos_type, locations, run_info): + """Given a position, this function tries to find a vertex match within the + cutoff distance that also comes before the splice junction begins. + If no vertex is found, the function returns None.""" # Try a strict match first if chromosome in locations and position in locations[chromosome]: match = locations[chromosome][position] dist = 0 - return match['location_ID'], dist + return match["location_ID"], dist if pos_type != "start" and pos_type != "end": msg = "Please set pos_type to either 'start' or 'end'." @@ -396,8 +423,7 @@ def permissive_vertex_search(chromosome, position, strand, sj_pos, pos_type, # position first (since degradtion is more biologically likely). 
# For the + strand, this would be a negative delta, and for the - strand, # it would be a positive delta - if (strand == "+" and pos_type == "start") or \ - (strand == "-" and pos_type == "end"): + if (strand == "+" and pos_type == "start") or (strand == "-" and pos_type == "end"): direction_priority = -1 search_window_start = position - max_dist search_window_end = sj_pos @@ -407,30 +433,27 @@ def permissive_vertex_search(chromosome, position, strand, sj_pos, pos_type, search_window_end = position + max_dist for dist in range(1, max_dist): - curr_pos = position + dist*direction_priority + curr_pos = position + dist * direction_priority if curr_pos > search_window_start and curr_pos < search_window_end: match = search_for_vertex_at_pos(chromosome, curr_pos, locations) if match != None: dist = compute_delta(curr_pos, position, strand) - return match['location_ID'], dist + return match["location_ID"], dist - curr_pos = position - dist*direction_priority + curr_pos = position - dist * direction_priority if curr_pos > search_window_start and curr_pos < search_window_end: match = search_for_vertex_at_pos(chromosome, curr_pos, locations) if match != None: dist = compute_delta(curr_pos, position, strand) - return match['location_ID'], dist + return match["location_ID"], dist return None, None def create_vertex(chromosome, position, location_dict, run_info): - """ Creates a novel vertex and adds it to the location data structure. """ + """Creates a novel vertex and adds it to the location data structure.""" new_ID = vertex_counter.increment() - new_vertex = {'location_ID': new_ID, - 'genome_build': run_info.build, - 'chromosome': chromosome, - 'position': position} + new_vertex = {"location_ID": new_ID, "genome_build": run_info.build, "chromosome": chromosome, "position": position} try: location_dict[chromosome][position] = new_vertex @@ -441,38 +464,34 @@ def create_vertex(chromosome, position, location_dict, run_info): def create_edge(vertex_1, vertex_2, edge_type, strand, edge_dict): - """ Creates a novel edge and adds it to the edge data structure. """ + """Creates a novel edge and adds it to the edge data structure.""" new_ID = edge_counter.increment() - new_edge = {'edge_ID': new_ID, - 'v1': vertex_1, - 'v2': vertex_2, - 'edge_type': edge_type, - 'strand': strand} + new_edge = {"edge_ID": new_ID, "v1": vertex_1, "v2": vertex_2, "edge_type": edge_type, "strand": strand} edge_dict[(vertex_1, vertex_2, edge_type)] = new_edge return new_edge def create_gene(chromosome, start, end, strand, memory_cursor, tmp_gene): - """ Create a novel gene and add it to the temporary table. 
- """ + """Create a novel gene and add it to the temporary table.""" new_ID = gene_counter.increment() new_gene = (new_ID, chromosome, min(start, end), max(start, end), strand) cols = ' ("gene_ID", "chromosome", "start", "end", "strand")' - command = 'INSERT INTO ' + tmp_gene + cols + ' VALUES ' + '(?,?,?,?,?)' + command = "INSERT INTO " + tmp_gene + cols + " VALUES " + "(?,?,?,?,?)" memory_cursor.execute(command, new_gene) return new_ID -def create_transcript(strand, chromosome, start_pos, end_pos, gene_ID, edge_IDs, vertex_IDs, - transcript_dict, tmp_t, memory_cursor): +def create_transcript( + strand, chromosome, start_pos, end_pos, gene_ID, edge_IDs, vertex_IDs, transcript_dict, tmp_t, memory_cursor +): """Creates a novel transcript, add it to the transcript data structure, - and add to tmp_t + and add to tmp_t """ - print('creating new transcript') + print("creating new transcript") new_ID = transcript_counter.increment() - print(f'new tid:{new_ID}') + print(f"new tid:{new_ID}") # updating the dict if len(edge_IDs) > 1: @@ -480,17 +499,19 @@ def create_transcript(strand, chromosome, start_pos, end_pos, gene_ID, edge_IDs, else: jn_path = None - new_transcript = {'transcript_ID': new_ID, - 'gene_ID': gene_ID, - 'jn_path': jn_path, - 'start_exon': edge_IDs[0], - 'end_exon': edge_IDs[-1], - 'start_vertex': vertex_IDs[0], - 'end_vertex': vertex_IDs[-1], - 'n_exons': int((len(edge_IDs) + 1)/2), - 'chromosome': chromosome, - 'start_pos': start_pos, - 'end_pos': end_pos} + new_transcript = { + "transcript_ID": new_ID, + "gene_ID": gene_ID, + "jn_path": jn_path, + "start_exon": edge_IDs[0], + "end_exon": edge_IDs[-1], + "start_vertex": vertex_IDs[0], + "end_vertex": vertex_IDs[-1], + "n_exons": int((len(edge_IDs) + 1) / 2), + "chromosome": chromosome, + "start_pos": start_pos, + "end_pos": end_pos, + } path_key = frozenset(edge_IDs) transcript_dict[path_key] = new_transcript @@ -498,17 +519,17 @@ def create_transcript(strand, chromosome, start_pos, end_pos, gene_ID, edge_IDs, # updating tmp_t new_t = (gene_ID, new_ID, chromosome, strand, min(start_pos, end_pos), max(start_pos, end_pos)) cols = ' ("gene_ID", "transcript_ID", "chromosome", "strand", "min_pos", "max_pos")' - command = 'INSERT INTO ' + tmp_t + cols + ' VALUES ' + '(?,?,?,?,?,?)' + command = "INSERT INTO " + tmp_t + cols + " VALUES " + "(?,?,?,?,?,?)" memory_cursor.execute(command, new_t) return new_transcript def check_all_exons_known(novelty): - """ Given a list in which each element represents the novelty (1) or - known-ness of a transcript edge (0), determine whether all of the - exons are known or not. Return True if all are known, and False - otherwise. Input should not include first or last exon. """ + """Given a list in which each element represents the novelty (1) or + known-ness of a transcript edge (0), determine whether all of the + exons are known or not. Return True if all are known, and False + otherwise. Input should not include first or last exon.""" if len(novelty) == 1: return novelty[0] == 0 @@ -522,11 +543,11 @@ def check_all_exons_known(novelty): def check_all_SJs_known(novelty): - """ Given a list in which each element represents the novelty (1) or - known-ness of a transcript edge (0), determine whether all of the - introns are known or not. Return True if all are known, and False - otherwise. Input should not include first or last exon. 
If there is - only one entry, then that means there is one splice junction (two exons)""" + """Given a list in which each element represents the novelty (1) or + known-ness of a transcript edge (0), determine whether all of the + introns are known or not. Return True if all are known, and False + otherwise. Input should not include first or last exon. If there is + only one entry, then that means there is one splice junction (two exons)""" if len(novelty) == 1: return novelty[0] == 0 @@ -540,9 +561,9 @@ def check_all_SJs_known(novelty): def match_all_splice_edges(vertices, strand, edge_dict, run_info): - """ Given a list of splice junction-only vertex IDs from the transcript in 5' to - 3' end order, this function looks for a matching edge ID for each - position. If none exists, it creates one. """ + """Given a list of splice junction-only vertex IDs from the transcript in 5' to + 3' end order, this function looks for a matching edge ID for each + position. If none exists, it creates one.""" edge_matches = [] novelty = [] @@ -559,9 +580,7 @@ def match_all_splice_edges(vertices, strand, edge_dict, run_info): vertex_1 = vertices[index_1] vertex_2 = vertices[index_2] - edge_match, curr_novelty = match_or_create_edge(vertex_1, vertex_2, - edge_type, strand, - edge_dict) + edge_match, curr_novelty = match_or_create_edge(vertex_1, vertex_2, edge_type, strand, edge_dict) edge_matches.append(edge_match) novelty.append(curr_novelty) @@ -569,23 +588,22 @@ def match_all_splice_edges(vertices, strand, edge_dict, run_info): def match_or_create_edge(vertex_1, vertex_2, edge_type, strand, edge_dict): - """ Searches for edge match to provided set of vertices. If none found, - creates a new edge. """ + """Searches for edge match to provided set of vertices. If none found, + creates a new edge.""" novelty = 0 edge_match = search_for_edge(vertex_1, vertex_2, edge_type, edge_dict) if edge_match == None: # If no edge matches the position, one is created. - edge_match = create_edge(vertex_1, vertex_2, edge_type, strand, - edge_dict) + edge_match = create_edge(vertex_1, vertex_2, edge_type, strand, edge_dict) novelty = 1 return edge_match["edge_ID"], novelty def match_all_transcript_edges(vertices, strand, edge_dict, run_info): - """ Given a list of vertex IDs from the transcript in 5' to - 3' end order, this function looks for a matching edge ID for each - position. If none exists, it creates one. Only used for monoexon case""" + """Given a list of vertex IDs from the transcript in 5' to + 3' end order, this function looks for a matching edge ID for each + position. If none exists, it creates one. Only used for monoexon case""" edge_matches = [] novelty = [] @@ -602,9 +620,7 @@ def match_all_transcript_edges(vertices, strand, edge_dict, run_info): vertex_1 = vertices[index_1] vertex_2 = vertices[index_2] - edge_match, curr_novelty = match_or_create_edge(vertex_1, vertex_2, - edge_type, strand, - edge_dict) + edge_match, curr_novelty = match_or_create_edge(vertex_1, vertex_2, edge_type, strand, edge_dict) edge_matches.append(edge_match) novelty.append(curr_novelty) @@ -612,14 +628,13 @@ def match_all_transcript_edges(vertices, strand, edge_dict, run_info): def search_for_ISM(edge_IDs, transcript_dict): - """ Given a list of edges in a query transcript, determine whether it is an - incomplete splice match (ISM) of any transcript in the dict. 
Will also
-        return FSM matches if they're there"""
+    """Given a list of edges in a query transcript, determine whether it is an
+    incomplete splice match (ISM) of any transcript in the dict. Will also
+    return FSM matches if they're there"""
 
     edges = frozenset(edge_IDs)
 
-    ISM_matches = [transcript_dict[x]
-                   for x in transcript_dict if edges.issubset(x)]
+    ISM_matches = [transcript_dict[x] for x in transcript_dict if edges.issubset(x)]
 
     if len(ISM_matches) > 0:
         return ISM_matches
@@ -627,36 +642,36 @@ def search_for_ISM(edge_IDs, transcript_dict):
         return None
 
 
-def search_for_overlap_with_gene(chromosome, start, end, strand,
-                                 cursor, run_info, tmp_gene, tmp_t,
-                                 gene_IDs=None):
-    """ Given a start and an end value for an interval, query the database to
-        determine whether the interval overlaps with any genes. If it there is
-        more than one match, prioritize same-strand first and foremost.
-        If there is more than one same-strand option, prioritize distance from 3' / 5'.
-        Antisense matches may be returned if there is no same strand
-        match.
+def search_for_overlap_with_gene(chromosome, start, end, strand, cursor, run_info, tmp_gene, tmp_t, gene_IDs=None):
+    """Given a start and an end value for an interval, query the database to
+    determine whether the interval overlaps with any genes. If there is
+    more than one match, prioritize same-strand first and foremost.
+    If there is more than one same-strand option, prioritize distance from 3' / 5'.
+    Antisense matches may be returned if there is no same strand
+    match.
 
-        Parameters:
-            gene_ID (list of str or None): Restrict results to genes in this list
+    Parameters:
+        gene_ID (list of str or None): Restrict results to genes in this list
     """
-    logging.debug('Tiebreaking for gene assignment')
+    logging.debug("Tiebreaking for gene assignment")
     min_start = min(start, end)
     max_end = max(start, end)
     query_interval = [min_start, max_end]
 
     if isinstance(gene_IDs, list):
-        query = Template("""SELECT gene_ID,
+        query = Template(
+            """SELECT gene_ID,
                transcript_ID,
                chromosome,
                min_pos,
                max_pos,
                strand
            FROM $tmp_t
-           WHERE gene_ID IN $gene_ids""").substitute({'tmp_t': tmp_t, \
-                                                      'gene_ids': qutils.format_for_IN(gene_IDs)})
+           WHERE gene_ID IN $gene_ids"""
+        ).substitute({"tmp_t": tmp_t, "gene_ids": qutils.format_for_IN(gene_IDs)})
     elif not gene_IDs:
-        query = Template("""SELECT gene_ID,
+        query = Template(
+            """SELECT gene_ID,
                transcript_ID,
                chromosome,
                min_pos,
                max_pos,
                strand
            FROM $tmp_t
            WHERE (chromosome = '$chrom') AND
                  ((min_pos <= $min_start AND max_pos >= $max_end) OR
                  (min_pos >= $min_start AND max_pos <= $max_end) OR
                  (min_pos >= $min_start AND min_pos <= $max_end) OR
                  (max_pos >= $min_start AND max_pos <= $max_end))
-           GROUP BY gene_ID;""").substitute({'tmp_t': tmp_t, 'chrom': chromosome,
-                                             'min_start': min_start, 'max_end': max_end})
+           GROUP BY gene_ID;"""
+        ).substitute({"tmp_t": tmp_t, "chrom": chromosome, "min_start": min_start, "max_end": max_end})
 
     cursor.execute(query)
     matches = cursor.fetchall()
 
     # restrict to just the genes we care about
     if gene_IDs:
         # print(f'restricting just to {gene_IDs}')
-        logging.debug(f'Restricing gene tiebreak to {gene_IDs}')
-        matches = [match for match in matches if match['gene_ID'] in gene_IDs]
-
+        logging.debug(f"Restricting gene tiebreak to {gene_IDs}")
+        matches = [match for match in matches if match["gene_ID"] in gene_IDs]
 
     if len(matches) == 0:
         # print('herere here')
-        logging.debug(f'Unable to tiebreak')
+        logging.debug("Unable to tiebreak")
         return None, None
 
     # Among multiple matches, preferentially return the same-strand gene with
     # the greatest amount of overlap
     same_strand_matches = len([x for x in matches if x["strand"] == strand])
 
-    if strand == "+" and same_strand_matches > 0 or \
-       strand == "-" and same_strand_matches == 0:
-
+    if strand == "+" and same_strand_matches > 0 or strand == "-" and same_strand_matches == 0:
         matches = [x for x in matches if x["strand"] == "+"]
         best_match = get_best_match(matches, min_start, max_end)
 
@@ -699,7 +711,8 @@
     else:
         matches = [x for x in matches if x["strand"] == "-"]
         best_match = get_best_match(matches, min_start, max_end)
 
-    return best_match['gene_ID'], best_match['strand']
+    return best_match["gene_ID"], best_match["strand"]
+
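+# (Editor's note, illustrative sketch only, not part of the original patch.)
+# The tiebreak below keeps the candidate whose annotated span lies closest to
+# the read, scored as |min_pos - read_start| + |max_pos - read_end|; a
+# doctest-style sketch with made-up matches:
+#
+#   >>> matches = [{"gene_ID": 1, "min_pos": 100, "max_pos": 900},
+#   ...            {"gene_ID": 2, "min_pos": 140, "max_pos": 1050}]
+#   >>> read_start, read_end = 150, 1000
+#   >>> min(matches, key=lambda m: abs(m["min_pos"] - read_start)
+#   ...                            + abs(m["max_pos"] - read_end))["gene_ID"]
+#   2
+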
 def get_best_match(matches, min_end, max_end):
     """
@@ -712,16 +725,16 @@ def get_best_match(matches, min_end, max_end):
     # print(f'read min: {min_end}')
     # print(f'read end: {max_end}')
 
-    logging.debug(f'Read start / end: ({min_end}, {min_end})')
+    logging.debug(f"Read start / end: ({min_end}, {max_end})")
 
     for match in matches:
         logging.debug(f"Matching with transcripts from gene {match['gene_ID']}, transcript {match['transcript_ID']}")
-        end_dist = abs(match['max_pos']-max_end)
-        start_dist = abs(match['min_pos']-min_end)
+        end_dist = abs(match["max_pos"] - max_end)
+        start_dist = abs(match["min_pos"] - min_end)
         logging.debug(f"Transcript start / end: ({match['min_pos']}, {match['max_pos']})")
-        dist = end_dist+start_dist
-        logging.debug(f'Distance between read and transcript ends: {dist}')
+        dist = end_dist + start_dist
+        logging.debug(f"Distance between read and transcript ends: {dist}")
         if dist < min_dist:
             min_dist = dist
             best_match = match
@@ -731,7 +744,6 @@ def get_best_match(matches, min_end, max_end):
     return best_match
 
-
 # def get_best_match(matches, query_interval):
 #     """ Given a set of gene matches and a query interval, return the match
 #            that has the greatest amount of overlap with the query."""
@@ -757,26 +769,26 @@
 def get_overlap(a, b):
-    """ Computes the amount of overlap between two intervals.
-        Returns 0 if there is no overlap. The function treats the start and
-        ends of each interval as inclusive, meaning that if a = b = [10, 20],
-        the overlap reported would be 11, not 10.
-        Args:
-            a: First interval, formattted as a list (query)
-            b: Second interval, formatted as a list (reference)
-            perc_overlap: Percent overlap from the reference interval that the
-                query interval consumed
+    """Computes the amount of overlap between two intervals.
+    Returns 0 if there is no overlap. The function treats the start and
+    ends of each interval as inclusive, meaning that if a = b = [10, 20],
+    the overlap reported would be 11, not 10.
+    Args:
+        a: First interval, formatted as a list (query)
+        b: Second interval, formatted as a list (reference)
+    Returns:
+        overlap: Amount of overlap between a and b
+        perc_overlap: Percent overlap from the reference interval that the
+            query interval consumed
     """
     overlap = max(0, min(a[1], b[1]) - max(a[0], b[0]) + 1)
-    ref_len = abs(b[1]-b[0])
-    perc_overlap = (overlap/ref_len)*100
+    ref_len = abs(b[1] - b[0])
+    perc_overlap = (overlap / ref_len) * 100
     return overlap, perc_overlap
 
 
 def search_for_transcript(edge_IDs, transcript_dict):
-    """ Given the edge IDs (in set data structure) that make up a query
-        transcript, look for a match in the transcript dict.
-        Return gene ID and transcript ID if found, and None if not. """
+    """Given the edge IDs (in set data structure) that make up a query
+    transcript, look for a match in the transcript dict.
+ Return gene ID and transcript ID if found, and None if not.""" try: transcript = transcript_dict[edge_IDs] @@ -787,9 +799,10 @@ def search_for_transcript(edge_IDs, transcript_dict): return None, None -def process_FSM(chrom, positions, strand, edge_IDs, vertex_IDs, all_matches, gene_starts, gene_ends, - edge_dict, locations, run_info): - """ Given a transcript, try to find an FSM match for it """ +def process_FSM( + chrom, positions, strand, edge_IDs, vertex_IDs, all_matches, gene_starts, gene_ends, edge_dict, locations, run_info +): + """Given a transcript, try to find an FSM match for it""" gene_ID = None transcript_ID = None novelty = [] @@ -803,150 +816,146 @@ def process_FSM(chrom, positions, strand, edge_IDs, vertex_IDs, all_matches, gen # Check if any of the matches have the same number of exons as the query. # Such a match should be prioritized because it's an FSM - n_exons = int(len(positions)/2) - FSM_matches = [x for x in all_matches if x['n_exons'] == n_exons] + n_exons = int(len(positions) / 2) + FSM_matches = [x for x in all_matches if x["n_exons"] == n_exons] if len(FSM_matches) == 0: return None, None, [], None else: transcript_match = FSM_matches[0] - gene_ID = transcript_match['gene_ID'] - transcript_ID = transcript_match['transcript_ID'] + gene_ID = transcript_match["gene_ID"] + transcript_ID = transcript_match["transcript_ID"] # Check whether the query's 5' and 3' ends are within range of those of # the match. If not, perform a permissive match assignment - curr_5p_diff = compute_delta(transcript_match['start_pos'], positions[0], - strand) - curr_3p_diff = compute_delta(transcript_match['end_pos'], positions[-1], - strand) + curr_5p_diff = compute_delta(transcript_match["start_pos"], positions[0], strand) + curr_3p_diff = compute_delta(transcript_match["end_pos"], positions[-1], strand) # ---------------- 5' end --------------------------------- if abs(curr_5p_diff) <= run_info.cutoff_5p: - start_vertex = transcript_match['start_vertex'] - start_exon = transcript_match['start_exon'] + start_vertex = transcript_match["start_vertex"] + start_exon = transcript_match["start_exon"] diff_5p = curr_5p_diff start_novelty = 0 else: # First get a permissively matched start vertex - start_vertex, start_exon, start_novelty, known_start, diff_5p = process_5p(chrom, - positions, strand, - vertex_IDs, - gene_ID, gene_starts, - edge_dict, - locations, run_info) + start_vertex, start_exon, start_novelty, known_start, diff_5p = process_5p( + chrom, positions, strand, vertex_IDs, gene_ID, gene_starts, edge_dict, locations, run_info + ) # ---------------- 3' end --------------------------------- if abs(curr_3p_diff) <= run_info.cutoff_3p: - end_vertex = transcript_match['end_vertex'] - end_exon = transcript_match['end_exon'] + end_vertex = transcript_match["end_vertex"] + end_exon = transcript_match["end_exon"] diff_3p = curr_3p_diff end_novelty = 0 else: # First get a permissively matched end vertex - end_vertex, end_exon, end_novelty, known_end, diff_3p = process_3p(chrom, - positions, strand, - vertex_IDs, - gene_ID, gene_ends, - edge_dict, - locations, run_info) + end_vertex, end_exon, end_novelty, known_end, diff_3p = process_3p( + chrom, positions, strand, vertex_IDs, gene_ID, gene_ends, edge_dict, locations, run_info + ) edge_IDs = [start_exon] + edge_IDs + [end_exon] vertex_IDs = [start_vertex] + vertex_IDs + [end_vertex] # Package information for output - start_end_info = {"start_vertex": start_vertex, - "end_vertex": end_vertex, - "start_exon": start_exon, - "end_exon": 
end_exon, - "diff_5p": diff_5p, - "diff_3p": diff_3p, - "start_novelty": start_novelty, - "end_novelty": end_novelty, - "vertex_IDs": vertex_IDs, - "edge_IDs": edge_IDs} + start_end_info = { + "start_vertex": start_vertex, + "end_vertex": end_vertex, + "start_exon": start_exon, + "end_exon": end_exon, + "diff_5p": diff_5p, + "diff_3p": diff_3p, + "start_novelty": start_novelty, + "end_novelty": end_novelty, + "vertex_IDs": vertex_IDs, + "edge_IDs": edge_IDs, + } return gene_ID, transcript_ID, novelty, start_end_info -def process_5p(chrom, positions, strand, vertex_IDs, gene_ID, gene_starts, edge_dict, - locations, run_info): - """ Conduct permissive match for 5' end and return assigned vertex, - edge, and distance """ +def process_5p(chrom, positions, strand, vertex_IDs, gene_ID, gene_starts, edge_dict, locations, run_info): + """Conduct permissive match for 5' end and return assigned vertex, + edge, and distance""" # First get a permissively matched start vertex - start_vertex, diff_5p, known_start = permissive_match_with_gene_priority(chrom, - positions[0], strand, positions[1], - "start", gene_ID, gene_starts, - locations, run_info) + start_vertex, diff_5p, known_start = permissive_match_with_gene_priority( + chrom, positions[0], strand, positions[1], "start", gene_ID, gene_starts, locations, run_info + ) if start_vertex == None: - start_vertex = create_vertex(chrom, positions[0], locations, run_info)[ - 'location_ID'] + start_vertex = create_vertex(chrom, positions[0], locations, run_info)["location_ID"] # Then get the start exon - start_exon, start_novelty = match_or_create_edge(start_vertex, - vertex_IDs[0], - "exon", strand, - edge_dict) + start_exon, start_novelty = match_or_create_edge(start_vertex, vertex_IDs[0], "exon", strand, edge_dict) # If known_start == 1, the start vertex is a known startpoint of this gene. # start novelty refers to the novelty of the first exon (1 if yes, 0 if no) return start_vertex, start_exon, start_novelty, known_start, diff_5p -def process_3p(chrom, positions, strand, vertex_IDs, gene_ID, gene_ends, edge_dict, - locations, run_info): - """ Conduct permissive match for 3' end and return assigned vertex, - edge, and distance """ +def process_3p(chrom, positions, strand, vertex_IDs, gene_ID, gene_ends, edge_dict, locations, run_info): + """Conduct permissive match for 3' end and return assigned vertex, + edge, and distance""" # First get a permissively matched end vertex - end_vertex, diff_3p, known_end = permissive_match_with_gene_priority(chrom, - positions[-1], strand, positions[-2], - "end", gene_ID, gene_ends, - locations, run_info) + end_vertex, diff_3p, known_end = permissive_match_with_gene_priority( + chrom, positions[-1], strand, positions[-2], "end", gene_ID, gene_ends, locations, run_info + ) if end_vertex == None: - end_vertex = create_vertex( - chrom, positions[-1], locations, run_info)['location_ID'] + end_vertex = create_vertex(chrom, positions[-1], locations, run_info)["location_ID"] # Then get the end exon - end_exon, end_novelty = match_or_create_edge(vertex_IDs[-1], - end_vertex, - "exon", strand, - edge_dict) + end_exon, end_novelty = match_or_create_edge(vertex_IDs[-1], end_vertex, "exon", strand, edge_dict) # If known_end == 1, the end vertex is a known endpoint of this gene. 
# end novelty refers to the novelty of the final exon (1 if yes, 0 if no) return end_vertex, end_exon, end_novelty, known_end, diff_3p -def process_ISM(chrom, positions, strand, edge_IDs, vertex_IDs, all_matches, transcript_dict, - gene_starts, gene_ends, edge_dict, locations, run_info, - cursor, tmp_gene, tmp_t): - """ Given a transcript, try to find an ISM match for it. If the - best match is an ISM with known ends, that will be promoted to NIC. """ +def process_ISM( + chrom, + positions, + strand, + edge_IDs, + vertex_IDs, + all_matches, + transcript_dict, + gene_starts, + gene_ends, + edge_dict, + locations, + run_info, + cursor, + tmp_gene, + tmp_t, +): + """Given a transcript, try to find an ISM match for it. If the + best match is an ISM with known ends, that will be promoted to NIC.""" gene_ID = None transcript_ID = None novelty = [] start_end_info = {} - n_exons = int(len(positions)/2) + n_exons = int(len(positions) / 2) ISM = [] suffix = [] prefix = [] # choose gene to assign it to - gene_matches = list(set([match['gene_ID'] for match in all_matches])) + gene_matches = list(set([match["gene_ID"] for match in all_matches])) print(gene_matches) # tie break based on distance to 5' / 3' ends if len(gene_matches) > 1: - gene_ID, _ = search_for_overlap_with_gene(chrom, positions[0], - positions[-1], strand, cursor, run_info, tmp_gene, - tmp_t, gene_IDs=gene_matches) - all_matches = [m for m in all_matches if m['gene_ID'] == gene_ID] + gene_ID, _ = search_for_overlap_with_gene( + chrom, positions[0], positions[-1], strand, cursor, run_info, tmp_gene, tmp_t, gene_IDs=gene_matches + ) + all_matches = [m for m in all_matches if m["gene_ID"] == gene_ID] else: - gene_ID = all_matches[0]['gene_ID'] + gene_ID = all_matches[0]["gene_ID"] # if we didn't assign a gene ID if gene_ID == None: - return None, None, [], None + return None, None, [], None # print('edge IDs') # print(edge_IDs) @@ -956,18 +965,12 @@ def process_ISM(chrom, positions, strand, edge_IDs, vertex_IDs, all_matches, tra # Get matches for the ends if n_exons > 1: - start_vertex, start_exon, start_novelty, known_start, diff_5p = process_5p(chrom, - positions, strand, - vertex_IDs, - gene_ID, gene_starts, - edge_dict, - locations, run_info) - end_vertex, end_exon, end_novelty, known_end, diff_3p = process_3p(chrom, - positions, strand, - vertex_IDs, - gene_ID, gene_ends, - edge_dict, - locations, run_info) + start_vertex, start_exon, start_novelty, known_start, diff_5p = process_5p( + chrom, positions, strand, vertex_IDs, gene_ID, gene_starts, edge_dict, locations, run_info + ) + end_vertex, end_exon, end_novelty, known_end, diff_3p = process_3p( + chrom, positions, strand, vertex_IDs, gene_ID, gene_ends, edge_dict, locations, run_info + ) # Update info edge_IDs = [start_exon] + edge_IDs + [end_exon] vertex_IDs = [start_vertex] + vertex_IDs + [end_vertex] @@ -988,9 +991,8 @@ def process_ISM(chrom, positions, strand, edge_IDs, vertex_IDs, all_matches, tra # Iterate over all matches from assigned gene to characterize ISMs for match in all_matches: - # Add ISM - ISM.append(str(match['transcript_ID'])) + ISM.append(str(match["transcript_ID"])) # Single-exon case if n_exons == 1: @@ -1001,58 +1003,53 @@ def process_ISM(chrom, positions, strand, edge_IDs, vertex_IDs, all_matches, tra novelty = [] return gene_ID, transcript_ID, novelty, start_end_info - match_path = match['jn_path'] + match_path = match["jn_path"] exon = str(edge_IDs[0]) # Look for prefix if match_path.startswith(exon): - prefix.append(str(match['transcript_ID'])) + 
prefix.append(str(match["transcript_ID"])) # Look for suffix if match_path.endswith(exon): - suffix.append(str(match['transcript_ID'])) - gene_ID = match['gene_ID'] + suffix.append(str(match["transcript_ID"])) + gene_ID = match["gene_ID"] continue # Multi-exon case edge_str = ",".join([str(x) for x in edge_IDs[1:-1]]) # Look for prefix - if match['jn_path'].startswith(edge_str): - prefix.append(str(match['transcript_ID'])) + if match["jn_path"].startswith(edge_str): + prefix.append(str(match["transcript_ID"])) # Look for suffix - if match['jn_path'].endswith(edge_str): - gene_ID = match['gene_ID'] - suffix.append(str(match['transcript_ID'])) + if match["jn_path"].endswith(edge_str): + gene_ID = match["gene_ID"] + suffix.append(str(match["transcript_ID"])) - novel_transcript = create_transcript(strand, chrom, positions[0], positions[-1], - gene_ID, edge_IDs, vertex_IDs, - transcript_dict, tmp_t, cursor) + novel_transcript = create_transcript( + strand, chrom, positions[0], positions[-1], gene_ID, edge_IDs, vertex_IDs, transcript_dict, tmp_t, cursor + ) - transcript_ID = novel_transcript['transcript_ID'] + transcript_ID = novel_transcript["transcript_ID"] ISM_str = ",".join(ISM) - novelty.append((transcript_ID, run_info.idprefix, "TALON", - "ISM_transcript", "TRUE")) - novelty.append((transcript_ID, run_info.idprefix, "TALON", - "ISM_to_IDs", ISM_str)) + novelty.append((transcript_ID, run_info.idprefix, "TALON", "ISM_transcript", "TRUE")) + novelty.append((transcript_ID, run_info.idprefix, "TALON", "ISM_to_IDs", ISM_str)) if prefix != []: prefix_str = ",".join(prefix) - novelty.append((transcript_ID, run_info.idprefix, "TALON", - "ISM-prefix_transcript", "TRUE")) - novelty.append((transcript_ID, run_info.idprefix, "TALON", - "ISM-prefix_to_IDs", prefix_str)) + novelty.append((transcript_ID, run_info.idprefix, "TALON", "ISM-prefix_transcript", "TRUE")) + novelty.append((transcript_ID, run_info.idprefix, "TALON", "ISM-prefix_to_IDs", prefix_str)) if suffix != []: suffix_str = ",".join(suffix) - novelty.append((transcript_ID, run_info.idprefix, "TALON", - "ISM-suffix_transcript", "TRUE")) - novelty.append((transcript_ID, run_info.idprefix, "TALON", - "ISM-suffix_to_IDs", suffix_str)) + novelty.append((transcript_ID, run_info.idprefix, "TALON", "ISM-suffix_transcript", "TRUE")) + novelty.append((transcript_ID, run_info.idprefix, "TALON", "ISM-suffix_to_IDs", suffix_str)) return gene_ID, transcript_ID, novelty, start_end_info -def assign_gene(vertex_IDs, strand, vertex_2_gene, - chrom, start, end, cursor, run_info, - tmp_gene, tmp_t, gene_starts, gene_ends): + +def assign_gene( + vertex_IDs, strand, vertex_2_gene, chrom, start, end, cursor, run_info, tmp_gene, tmp_t, gene_starts, gene_ends +): """ Assign a gene to a transcript. First do this on the basis of splice site matching. 
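Splice-site matching tallies, via the vertex_2_gene lookup, how many of the
transcript's vertices each candidate gene shares.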
If this yields more than one gene, then choose the gene with the @@ -1065,34 +1062,55 @@ def assign_gene(vertex_IDs, strand, vertex_2_gene, """ # first attempt to assign based on matching vertices - gene_ID, fusion = find_gene_match_on_vertex_basis(vertex_IDs, - strand, - vertex_2_gene) + gene_ID, fusion = find_gene_match_on_vertex_basis(vertex_IDs, strand, vertex_2_gene) # if previous function returned more than one gene that we need to tiebreak, # look for closest gene based on end differences, out of candidate genes # only if it wasn't previously labeled as fusion if type(gene_ID) == list and fusion == False: - gene_ID, match_strand = search_for_overlap_with_gene(chrom, start, - end, strand, - cursor, run_info, tmp_gene, - tmp_t, - gene_IDs=gene_ID) + gene_ID, match_strand = search_for_overlap_with_gene( + chrom, start, end, strand, cursor, run_info, tmp_gene, tmp_t, gene_IDs=gene_ID + ) return gene_ID, fusion -def process_NIC(chrom, positions, strand, edge_IDs, vertex_IDs, transcript_dict, - gene_starts, gene_ends, edge_dict, locations, vertex_2_gene, run_info, - cursor, tmp_gene, tmp_t): - """ For a transcript that has been determined to be novel in catalog, find - the proper gene match (documenting fusion event if applicable). To do - this, look up each vertex in the vertex_2_gene dict, and keep track of all - same-strand genes. """ +def process_NIC( + chrom, + positions, + strand, + edge_IDs, + vertex_IDs, + transcript_dict, + gene_starts, + gene_ends, + edge_dict, + locations, + vertex_2_gene, + run_info, + cursor, + tmp_gene, + tmp_t, +): + """For a transcript that has been determined to be novel in catalog, find + the proper gene match (documenting fusion event if applicable). To do + this, look up each vertex in the vertex_2_gene dict, and keep track of all + same-strand genes.""" start_end_info = {} - gene_ID, fusion = assign_gene(vertex_IDs, strand, vertex_2_gene, - chrom, positions[0], positions[-1], cursor, run_info, - tmp_gene, tmp_t, gene_starts, gene_ends) + gene_ID, fusion = assign_gene( + vertex_IDs, + strand, + vertex_2_gene, + chrom, + positions[0], + positions[-1], + cursor, + run_info, + tmp_gene, + tmp_t, + gene_starts, + gene_ends, + ) # gene_ID, fusion = find_gene_match_on_vertex_basis(vertex_IDs, # strand, @@ -1108,21 +1126,15 @@ def process_NIC(chrom, positions, strand, edge_IDs, vertex_IDs, transcript_dict, # print(gene_ID) if gene_ID == None: - return None, None, [], None, fusion + return None, None, [], None, fusion # Get matches for the ends - start_vertex, start_exon, start_novelty, known_start, diff_5p = process_5p(chrom, - positions, strand, - vertex_IDs, - gene_ID, gene_starts, - edge_dict, - locations, run_info) - end_vertex, end_exon, end_novelty, known_end, diff_3p = process_3p(chrom, - positions, strand, - vertex_IDs, - gene_ID, gene_ends, - edge_dict, - locations, run_info) + start_vertex, start_exon, start_novelty, known_start, diff_5p = process_5p( + chrom, positions, strand, vertex_IDs, gene_ID, gene_starts, edge_dict, locations, run_info + ) + end_vertex, end_exon, end_novelty, known_end, diff_3p = process_3p( + chrom, positions, strand, vertex_IDs, gene_ID, gene_ends, edge_dict, locations, run_info + ) # Update info edge_IDs = [start_exon] + edge_IDs + [end_exon] vertex_IDs = [start_vertex] + vertex_IDs + [end_vertex] @@ -1138,16 +1150,16 @@ def process_NIC(chrom, positions, strand, edge_IDs, vertex_IDs, transcript_dict, start_end_info["vertex_IDs"] = vertex_IDs # Create a new transcript of that gene - novel_transcript = 
create_transcript(strand, chrom, positions[0], positions[-1], - gene_ID, edge_IDs, vertex_IDs, - transcript_dict, tmp_t, cursor) + novel_transcript = create_transcript( + strand, chrom, positions[0], positions[-1], gene_ID, edge_IDs, vertex_IDs, transcript_dict, tmp_t, cursor + ) transcript_ID = novel_transcript["transcript_ID"] - novelty = [(transcript_ID, run_info.idprefix, "TALON", - "NIC_transcript", "TRUE")] + novelty = [(transcript_ID, run_info.idprefix, "TALON", "NIC_transcript", "TRUE")] fusion = False return gene_ID, transcript_ID, novelty, start_end_info, fusion + def get_vertex_2_gene_df(vertex_2_gene): """ Get a DataFrame mapping each unique combination of vertex:gene_ID @@ -1167,16 +1179,17 @@ def get_vertex_2_gene_df(vertex_2_gene): vids.append(key) # df = pd.DataFrame.from_dict(vertex_2_gene, orient='index') df = pd.DataFrame() - df['gid'] = gids - df['vid'] = vids + df["gid"] = gids + df["vid"] = vids # print(df.head()) # print(len(df.index)) # print(len(df.vid.unique().tolist())) # print(df.loc[df.vid.duplicated(keep=False)].sort_values(by='vid')) return df + def find_gene_match_on_vertex_basis(vertex_IDs, strand, vertex_2_gene): - """ Use vertices in a transcript to try to pinpoint the gene it belongs to. + """Use vertices in a transcript to try to pinpoint the gene it belongs to. Parameters: vertex_IDs (list of int): List of vertices in the read @@ -1230,13 +1243,13 @@ def find_gene_match_on_vertex_basis(vertex_IDs, strand, vertex_2_gene): # when there are no shared splice sites between gene hits but we did # hit more than one gene elif max(n_gene_matches) <= 1 and len(gene_tally) > 1: - print(' went here') + print(" went here") return None, True # if we hit more than one gene and they have overlapping sjs, # tie break based on ????? 
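# (the full candidate list is returned below; the caller, e.g. assign_gene,
# breaks the tie downstream via search_for_overlap_with_gene, which favors the
# candidate whose ends lie closest to the read's)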
elif len(gene_tally) > 1: - print('i found more than one gene') + print("i found more than one gene") print(gene_tally) print(n_gene_matches) return list(gene_tally.keys()), False @@ -1254,8 +1267,6 @@ def find_gene_match_on_vertex_basis(vertex_IDs, strand, vertex_2_gene): # temp3 = temp1.merge(temp2, on='gid') # print(temp3) - - # For the main assignment, pick the gene that is observed the most else: gene_ID = max(gene_tally, key=gene_tally.get) @@ -1264,10 +1275,24 @@ def find_gene_match_on_vertex_basis(vertex_IDs, strand, vertex_2_gene): return gene_ID, fusion -def process_NNC(chrom, positions, strand, edge_IDs, vertex_IDs, transcript_dict, - gene_starts, gene_ends, edge_dict, locations, vertex_2_gene, run_info, - cursor, tmp_gene, tmp_t): - """ Novel not in catalog case """ +def process_NNC( + chrom, + positions, + strand, + edge_IDs, + vertex_IDs, + transcript_dict, + gene_starts, + gene_ends, + edge_dict, + locations, + vertex_2_gene, + run_info, + cursor, + tmp_gene, + tmp_t, +): + """Novel not in catalog case""" novelty = [] start_end_info = {} @@ -1284,10 +1309,21 @@ def process_NNC(chrom, positions, strand, edge_IDs, vertex_IDs, transcript_dict, # gene_starts, gene_ends) # print('geneid from search for overlap with gene') # print(gene_ID) - gene_ID, fusion = assign_gene(vertex_IDs, strand, vertex_2_gene, - chrom, positions[0], positions[-1], cursor, run_info, - tmp_gene, tmp_t, gene_starts, gene_ends) - print('gene id process_nnc') + gene_ID, fusion = assign_gene( + vertex_IDs, + strand, + vertex_2_gene, + chrom, + positions[0], + positions[-1], + cursor, + run_info, + tmp_gene, + tmp_t, + gene_starts, + gene_ends, + ) + print("gene id process_nnc") print(gene_ID) print(fusion) @@ -1295,18 +1331,12 @@ def process_NNC(chrom, positions, strand, edge_IDs, vertex_IDs, transcript_dict, return None, None, [], None, fusion # Get matches for the ends - start_vertex, start_exon, start_novelty, known_start, diff_5p = process_5p(chrom, - positions, strand, - vertex_IDs, - gene_ID, gene_starts, - edge_dict, - locations, run_info) - end_vertex, end_exon, end_novelty, known_end, diff_3p = process_3p(chrom, - positions, strand, - vertex_IDs, - gene_ID, gene_ends, - edge_dict, - locations, run_info) + start_vertex, start_exon, start_novelty, known_start, diff_5p = process_5p( + chrom, positions, strand, vertex_IDs, gene_ID, gene_starts, edge_dict, locations, run_info + ) + end_vertex, end_exon, end_novelty, known_end, diff_3p = process_3p( + chrom, positions, strand, vertex_IDs, gene_ID, gene_ends, edge_dict, locations, run_info + ) # Update info edge_IDs = [start_exon] + edge_IDs + [end_exon] vertex_IDs = [start_vertex] + vertex_IDs + [end_vertex] @@ -1321,22 +1351,34 @@ def process_NNC(chrom, positions, strand, edge_IDs, vertex_IDs, transcript_dict, start_end_info["edge_IDs"] = edge_IDs start_end_info["vertex_IDs"] = vertex_IDs - transcript_ID = create_transcript(strand, chrom, positions[0], positions[-1], - gene_ID, edge_IDs, vertex_IDs, - transcript_dict, tmp_t, cursor)["transcript_ID"] + transcript_ID = create_transcript( + strand, chrom, positions[0], positions[-1], gene_ID, edge_IDs, vertex_IDs, transcript_dict, tmp_t, cursor + )["transcript_ID"] - novelty.append((transcript_ID, run_info.idprefix, "TALON", - "NNC_transcript", "TRUE")) + novelty.append((transcript_ID, run_info.idprefix, "TALON", "NNC_transcript", "TRUE")) fusion = False return gene_ID, transcript_ID, novelty, start_end_info, fusion -def process_spliced_antisense(chrom, positions, strand, edge_IDs, vertex_IDs, - 
transcript_dict, gene_starts, gene_ends, edge_dict, - locations, vertex_2_gene, run_info, cursor, tmp_gene, - tmp_t): - """ Annotate a transcript as antisense with splice junctions """ +def process_spliced_antisense( + chrom, + positions, + strand, + edge_IDs, + vertex_IDs, + transcript_dict, + gene_starts, + gene_ends, + edge_dict, + locations, + vertex_2_gene, + run_info, + cursor, + tmp_gene, + tmp_t, +): + """Annotate a transcript as antisense with splice junctions""" gene_novelty = [] transcript_novelty = [] @@ -1346,30 +1388,21 @@ def process_spliced_antisense(chrom, positions, strand, edge_IDs, vertex_IDs, anti_strand = "-" else: anti_strand = "+" - anti_gene_ID, fusion = find_gene_match_on_vertex_basis(vertex_IDs, anti_strand, - vertex_2_gene) + anti_gene_ID, fusion = find_gene_match_on_vertex_basis(vertex_IDs, anti_strand, vertex_2_gene) if type(anti_gene_ID) == list and fusion == False: - anti_gene_ID, match_strand = search_for_overlap_with_gene(chrom, positions[0], - positions[-1], strand, - cursor, run_info, tmp_gene, - tmp_t, - gene_IDs=anti_gene_ID) + anti_gene_ID, match_strand = search_for_overlap_with_gene( + chrom, positions[0], positions[-1], strand, cursor, run_info, tmp_gene, tmp_t, gene_IDs=anti_gene_ID + ) if anti_gene_ID == None: return None, None, gene_novelty, transcript_novelty, start_end_info # Take care of ends - start_vertex, start_exon, start_novelty, known_start, diff_5p = process_5p(chrom, - positions, strand, - vertex_IDs, - anti_gene_ID, gene_ends, - edge_dict, - locations, run_info) - end_vertex, end_exon, end_novelty, known_end, diff_3p = process_3p(chrom, - positions, strand, - vertex_IDs, - anti_gene_ID, gene_starts, - edge_dict, - locations, run_info) + start_vertex, start_exon, start_novelty, known_start, diff_5p = process_5p( + chrom, positions, strand, vertex_IDs, anti_gene_ID, gene_ends, edge_dict, locations, run_info + ) + end_vertex, end_exon, end_novelty, known_end, diff_3p = process_3p( + chrom, positions, strand, vertex_IDs, anti_gene_ID, gene_starts, edge_dict, locations, run_info + ) # Update info edge_IDs = [start_exon] + edge_IDs + [end_exon] vertex_IDs = [start_vertex] + vertex_IDs + [end_vertex] @@ -1384,59 +1417,61 @@ def process_spliced_antisense(chrom, positions, strand, edge_IDs, vertex_IDs, start_end_info["edge_IDs"] = edge_IDs start_end_info["vertex_IDs"] = vertex_IDs - gene_ID = create_gene(chrom, positions[0], positions[-1], - strand, cursor, tmp_gene) - transcript_ID = create_transcript(strand, chrom, positions[0], positions[-1], - gene_ID, edge_IDs, vertex_IDs, - transcript_dict, tmp_t, cursor)["transcript_ID"] + gene_ID = create_gene(chrom, positions[0], positions[-1], strand, cursor, tmp_gene) + transcript_ID = create_transcript( + strand, chrom, positions[0], positions[-1], gene_ID, edge_IDs, vertex_IDs, transcript_dict, tmp_t, cursor + )["transcript_ID"] # Handle gene annotations - gene_novelty.append((gene_ID, run_info.idprefix, "TALON", - "antisense_gene", "TRUE")) - gene_novelty.append((gene_ID, run_info.idprefix, "TALON", - "gene_antisense_to_IDs", anti_gene_ID)) + gene_novelty.append((gene_ID, run_info.idprefix, "TALON", "antisense_gene", "TRUE")) + gene_novelty.append((gene_ID, run_info.idprefix, "TALON", "gene_antisense_to_IDs", anti_gene_ID)) # Handle transcript annotations - transcript_novelty.append((transcript_ID, run_info.idprefix, "TALON", - "antisense_transcript", "TRUE")) + transcript_novelty.append((transcript_ID, run_info.idprefix, "TALON", "antisense_transcript", "TRUE")) return gene_ID, transcript_ID, 
gene_novelty, transcript_novelty, start_end_info -def process_remaining_mult_cases(chrom, positions, strand, edge_IDs, vertex_IDs, - transcript_dict, gene_starts, gene_ends, edge_dict, - locations, vertex_2_gene, run_info, cursor, tmp_gene, - tmp_t, - fusion): - """ This function is a catch-all for multiexonic transcripts that were not - FSM, ISM, NIC, NNC, or spliced antisense. +def process_remaining_mult_cases( + chrom, + positions, + strand, + edge_IDs, + vertex_IDs, + transcript_dict, + gene_starts, + gene_ends, + edge_dict, + locations, + vertex_2_gene, + run_info, + cursor, + tmp_gene, + tmp_t, + fusion, +): + """This function is a catch-all for multiexonic transcripts that were not + FSM, ISM, NIC, NNC, or spliced antisense. """ gene_novelty = [] transcript_novelty = [] start_end_info = {} if not run_info.create_novel_spliced_genes or not fusion: - print('did i get here?') - gene_ID, match_strand = search_for_overlap_with_gene(chrom, positions[0], - positions[-1], strand, - cursor, run_info, tmp_gene, - tmp_t) + print("did i get here?") + gene_ID, match_strand = search_for_overlap_with_gene( + chrom, positions[0], positions[-1], strand, cursor, run_info, tmp_gene, tmp_t + ) else: gene_ID = None match_strand = None # We don't care about the gene when making these assignments - start_vertex, start_exon, start_novelty, known_start, diff_5p = process_5p(chrom, - positions, strand, - vertex_IDs, - gene_ID, gene_starts, - edge_dict, - locations, run_info) - end_vertex, end_exon, end_novelty, known_end, diff_3p = process_3p(chrom, - positions, strand, - vertex_IDs, - gene_ID, gene_ends, - edge_dict, - locations, run_info) + start_vertex, start_exon, start_novelty, known_start, diff_5p = process_5p( + chrom, positions, strand, vertex_IDs, gene_ID, gene_starts, edge_dict, locations, run_info + ) + end_vertex, end_exon, end_novelty, known_end, diff_3p = process_3p( + chrom, positions, strand, vertex_IDs, gene_ID, gene_ends, edge_dict, locations, run_info + ) # Update info edge_IDs = [start_exon] + edge_IDs + [end_exon] vertex_IDs = [start_vertex] + vertex_IDs + [end_vertex] @@ -1452,54 +1487,46 @@ def process_remaining_mult_cases(chrom, positions, strand, edge_IDs, vertex_IDs, start_end_info["vertex_IDs"] = vertex_IDs if gene_ID == None: - print(f'fusion: {fusion}') + print(f"fusion: {fusion}") if fusion: - print('i should be here') - t_nov = 'fusion_transcript' - g_nov = 'fusion_novel' + print("i should be here") + t_nov = "fusion_transcript" + g_nov = "fusion_novel" else: - print('but I think im going here') - t_nov = 'intergenic_transcript' - g_nov = 'intergenic_novel' + print("but I think im going here") + t_nov = "intergenic_transcript" + g_nov = "intergenic_novel" - gene_ID = create_gene(chrom, positions[0], positions[-1], - strand, cursor, tmp_gene) + gene_ID = create_gene(chrom, positions[0], positions[-1], strand, cursor, tmp_gene) - gene_novelty.append((gene_ID, run_info.idprefix, "TALON", - g_nov, "TRUE")) + gene_novelty.append((gene_ID, run_info.idprefix, "TALON", g_nov, "TRUE")) - transcript_ID = create_transcript(strand, chrom, positions[0], positions[-1], - gene_ID, edge_IDs, vertex_IDs, - transcript_dict, tmp_t, cursor)["transcript_ID"] - transcript_novelty.append((transcript_ID, run_info.idprefix, "TALON", - t_nov, "TRUE")) + transcript_ID = create_transcript( + strand, chrom, positions[0], positions[-1], gene_ID, edge_IDs, vertex_IDs, transcript_dict, tmp_t, cursor + )["transcript_ID"] + transcript_novelty.append((transcript_ID, run_info.idprefix, "TALON", t_nov, "TRUE")) 
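# An overlap on the opposite strand becomes a novel antisense gene instead.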
elif match_strand != strand: anti_gene_ID = gene_ID - gene_ID = create_gene(chrom, positions[0], positions[-1], strand, - cursor, tmp_gene) - transcript_ID = create_transcript(strand, chrom, positions[0], positions[-1], - gene_ID, edge_IDs, vertex_IDs, - transcript_dict, tmp_t, cursor)["transcript_ID"] - - gene_novelty.append((gene_ID, run_info.idprefix, "TALON", - "antisense_gene", "TRUE")) - gene_novelty.append((gene_ID, run_info.idprefix, "TALON", - "gene_antisense_to_IDs", anti_gene_ID)) - transcript_novelty.append((transcript_ID, run_info.idprefix, "TALON", - "antisense_transcript", "TRUE")) + gene_ID = create_gene(chrom, positions[0], positions[-1], strand, cursor, tmp_gene) + transcript_ID = create_transcript( + strand, chrom, positions[0], positions[-1], gene_ID, edge_IDs, vertex_IDs, transcript_dict, tmp_t, cursor + )["transcript_ID"] + + gene_novelty.append((gene_ID, run_info.idprefix, "TALON", "antisense_gene", "TRUE")) + gene_novelty.append((gene_ID, run_info.idprefix, "TALON", "gene_antisense_to_IDs", anti_gene_ID)) + transcript_novelty.append((transcript_ID, run_info.idprefix, "TALON", "antisense_transcript", "TRUE")) else: - transcript_ID = create_transcript(strand, chrom, positions[0], positions[-1], - gene_ID, edge_IDs, vertex_IDs, - transcript_dict, tmp_t, cursor)["transcript_ID"] - transcript_novelty.append((transcript_ID, run_info.idprefix, "TALON", - "genomic_transcript", "TRUE")) + transcript_ID = create_transcript( + strand, chrom, positions[0], positions[-1], gene_ID, edge_IDs, vertex_IDs, transcript_dict, tmp_t, cursor + )["transcript_ID"] + transcript_novelty.append((transcript_ID, run_info.idprefix, "TALON", "genomic_transcript", "TRUE")) return gene_ID, transcript_ID, gene_novelty, transcript_novelty, start_end_info def update_vertex_2_gene(gene_ID, vertex_IDs, strand, vertex_2_gene): - """ Add all vertices with gene pairings to vertex_2_gene dict """ + """Add all vertices with gene pairings to vertex_2_gene dict""" for vertex in vertex_IDs: if vertex in vertex_2_gene: @@ -1511,50 +1538,60 @@ def update_vertex_2_gene(gene_ID, vertex_IDs, strand, vertex_2_gene): return -def identify_transcript(chrom, positions, strand, cursor, location_dict, edge_dict, - transcript_dict, vertex_2_gene, gene_starts, gene_ends, - run_info, tmp_gene, tmp_t): - """ Inputs: - - Information about the query transcript - - chromosome - - list of positions - - strand - - Data structures - - location_dict (position --> vertex) - - edge_dict (v1_v2_edgetype --> edge) - - transcript_dict - - vertex_2_gene (maps vertices to the gene(s) they are part of) - - gene_starts (maps gene IDs to known start vertices) - - gene_ends (maps gene IDs to known end vertices) - - run_info - - Outputs: - - Assigned gene ID - - Assigned transcript ID - - gene and transcript novelty entries (to be added to database) - - IDs of start and end vertices - - 5' and 3' deltas from assigned start/end vertices +def identify_transcript( + chrom, + positions, + strand, + cursor, + location_dict, + edge_dict, + transcript_dict, + vertex_2_gene, + gene_starts, + gene_ends, + run_info, + tmp_gene, + tmp_t, +): + """Inputs: + - Information about the query transcript + - chromosome + - list of positions + - strand + - Data structures + - location_dict (position --> vertex) + - edge_dict (v1_v2_edgetype --> edge) + - transcript_dict + - vertex_2_gene (maps vertices to the gene(s) they are part of) + - gene_starts (maps gene IDs to known start vertices) + - gene_ends (maps gene IDs to known end vertices) + - run_info + + 
Outputs: + - Assigned gene ID + - Assigned transcript ID + - gene and transcript novelty entries (to be added to database) + - IDs of start and end vertices + - 5' and 3' deltas from assigned start/end vertices """ gene_novelty = [] transcript_novelty = [] - n_exons = int(len(positions)/2.0) + n_exons = int(len(positions) / 2.0) gene_ID = None # Get vertex matches for the transcript positions - vertex_IDs, v_novelty = match_splice_vertices(chrom, positions, strand, - location_dict, run_info) + vertex_IDs, v_novelty = match_splice_vertices(chrom, positions, strand, location_dict, run_info) # Get edge matches for transcript exons and introns based on the vertices - edge_IDs, e_novelty = match_all_splice_edges( - vertex_IDs, strand, edge_dict, run_info) + edge_IDs, e_novelty = match_all_splice_edges(vertex_IDs, strand, edge_dict, run_info) # Check novelty of exons and splice jns. This will help us categorize # what type of novelty the transcript has all_SJs_known = check_all_SJs_known(e_novelty) all_exons_known = check_all_exons_known(e_novelty) - splice_vertices_known = (sum(v_novelty) == 0) - all_exons_novel = (reduce(operator.mul, e_novelty, 1) == 1) - print(f'all exons novel : {all_exons_novel}') + splice_vertices_known = sum(v_novelty) == 0 + all_exons_novel = reduce(operator.mul, e_novelty, 1) == 1 + print(f"all exons novel : {all_exons_novel}") fusion = False # Look for FSM or ISM. @@ -1563,127 +1600,172 @@ def identify_transcript(chrom, positions, strand, cursor, location_dict, edge_di all_matches = search_for_ISM(edge_IDs, transcript_dict) if all_matches != None: # Look for FSM first - print('looking for fsm') - gene_ID, transcript_ID, transcript_novelty, start_end_info = process_FSM(chrom, - positions, strand, edge_IDs, - vertex_IDs, all_matches, - gene_starts, gene_ends, - edge_dict, - location_dict, run_info) + print("looking for fsm") + gene_ID, transcript_ID, transcript_novelty, start_end_info = process_FSM( + chrom, + positions, + strand, + edge_IDs, + vertex_IDs, + all_matches, + gene_starts, + gene_ends, + edge_dict, + location_dict, + run_info, + ) if gene_ID == None: # Now look for ISM - print('looking for ism') - gene_ID, transcript_ID, transcript_novelty, start_end_info = process_ISM(chrom, - positions, - strand, edge_IDs, - vertex_IDs, - all_matches, - transcript_dict, - gene_starts, gene_ends, - edge_dict, location_dict, - run_info, cursor, tmp_gene, tmp_t) - print(f'gene id from process ism {gene_ID}') + print("looking for ism") + gene_ID, transcript_ID, transcript_novelty, start_end_info = process_ISM( + chrom, + positions, + strand, + edge_IDs, + vertex_IDs, + all_matches, + transcript_dict, + gene_starts, + gene_ends, + edge_dict, + location_dict, + run_info, + cursor, + tmp_gene, + tmp_t, + ) + print(f"gene id from process ism {gene_ID}") # Look for NIC if gene_ID == None: - print('looking for nic') - gene_ID, transcript_ID, transcript_novelty, start_end_info, fusion = process_NIC(chrom, - positions, - strand, edge_IDs, - vertex_IDs, transcript_dict, - gene_starts, gene_ends, - edge_dict, location_dict, - vertex_2_gene, run_info, - cursor, tmp_gene, - tmp_t) + print("looking for nic") + gene_ID, transcript_ID, transcript_novelty, start_end_info, fusion = process_NIC( + chrom, + positions, + strand, + edge_IDs, + vertex_IDs, + transcript_dict, + gene_starts, + gene_ends, + edge_dict, + location_dict, + vertex_2_gene, + run_info, + cursor, + tmp_gene, + tmp_t, + ) # Novel in catalog transcripts have known splice donors and acceptors, # but new connections between 
them.
    elif splice_vertices_known and gene_ID == None:
-        print('looking for nic (again?)')
-        gene_ID, transcript_ID, transcript_novelty, start_end_info, fusion = process_NIC(chrom,
-                                                                                         positions,
-                                                                                         strand, edge_IDs,
-                                                                                         vertex_IDs, transcript_dict,
-                                                                                         gene_starts, gene_ends,
-                                                                                         edge_dict, location_dict,
-                                                                                         vertex_2_gene, run_info,
-                                                                                         cursor, tmp_gene,
-                                                                                         tmp_t)
+        print("looking for nic (again?)")
+        gene_ID, transcript_ID, transcript_novelty, start_end_info, fusion = process_NIC(
+            chrom,
+            positions,
+            strand,
+            edge_IDs,
+            vertex_IDs,
+            transcript_dict,
+            gene_starts,
+            gene_ends,
+            edge_dict,
+            location_dict,
+            vertex_2_gene,
+            run_info,
+            cursor,
+            tmp_gene,
+            tmp_t,
+        )

    # Antisense transcript with splice junctions matching known gene
    if splice_vertices_known and gene_ID == None and not fusion:
-        print('looking for spliced antisese')
-        gene_ID, transcript_ID, gene_novelty, transcript_novelty, start_end_info = \
-            process_spliced_antisense(chrom, positions,
-                                      strand, edge_IDs,
-                                      vertex_IDs,
-                                      transcript_dict,
-                                      gene_starts,
-                                      gene_ends,
-                                      edge_dict, location_dict,
-                                      vertex_2_gene, run_info,
-                                      cursor, tmp_gene, tmp_t)
+        print("looking for spliced antisense")
+        gene_ID, transcript_ID, gene_novelty, transcript_novelty, start_end_info = process_spliced_antisense(
+            chrom,
+            positions,
+            strand,
+            edge_IDs,
+            vertex_IDs,
+            transcript_dict,
+            gene_starts,
+            gene_ends,
+            edge_dict,
+            location_dict,
+            vertex_2_gene,
+            run_info,
+            cursor,
+            tmp_gene,
+            tmp_t,
+        )

    # Novel not in catalog transcripts contain new splice donors/acceptors
    # and contain at least one splice junction. There should also be at least
    # one shared exon from existing transcripts to even try assigning a gene
-    elif not(splice_vertices_known) and not fusion and not all_exons_novel:
-        print('lookign for NNCs')
-        gene_ID, transcript_ID, transcript_novelty, start_end_info, fusion = process_NNC(chrom,
-                                                                                         positions,
-                                                                                         strand, edge_IDs,
-                                                                                         vertex_IDs, transcript_dict,
-                                                                                         gene_starts, gene_ends,
-                                                                                         edge_dict, location_dict,
-                                                                                         vertex_2_gene, run_info,
-                                                                                         cursor, tmp_gene,
-                                                                                         tmp_t)
-        print(f'geneID from process_nnc: {gene_ID}')
+    elif not (splice_vertices_known) and not fusion and not all_exons_novel:
+        print("looking for NNCs")
+        gene_ID, transcript_ID, transcript_novelty, start_end_info, fusion = process_NNC(
+            chrom,
+            positions,
+            strand,
+            edge_IDs,
+            vertex_IDs,
+            transcript_dict,
+            gene_starts,
+            gene_ends,
+            edge_dict,
+            location_dict,
+            vertex_2_gene,
+            run_info,
+            cursor,
+            tmp_gene,
+            tmp_t,
+        )
+        print(f"geneID from process_nnc: {gene_ID}")

    # Transcripts that don't match the previous categories end up here
    if gene_ID == None:
-        print('looking for this other stuff')
-        gene_ID, transcript_ID, gene_novelty, transcript_novelty, start_end_info = \
-            process_remaining_mult_cases(chrom, positions,
-                                         strand, edge_IDs,
-                                         vertex_IDs,
-                                         transcript_dict,
-                                         gene_starts, gene_ends,
-                                         edge_dict, location_dict,
-                                         vertex_2_gene, run_info,
-                                         cursor, tmp_gene,
-                                         tmp_t,
-                                         fusion)
-
-    print('this is the gene id it decided on')
+        print("looking for this other stuff")
+        gene_ID, transcript_ID, gene_novelty, transcript_novelty, start_end_info = process_remaining_mult_cases(
+            chrom,
+            positions,
+            strand,
+            edge_IDs,
+            vertex_IDs,
+            transcript_dict,
+            gene_starts,
+            gene_ends,
+            edge_dict,
+            location_dict,
+            vertex_2_gene,
+            run_info,
+            cursor,
+            tmp_gene,
+            tmp_t,
+            fusion,
+        )
+
+    print("this is the gene id it decided on")
    print(gene_ID)
    # Add all novel vertices to vertex_2_gene now that we have the gene ID
    vertex_IDs = start_end_info["vertex_IDs"]
    edge_IDs =
start_end_info["edge_IDs"] - e_novelty = [start_end_info["start_novelty"]] + e_novelty + \ - [start_end_info["end_novelty"]] + e_novelty = [start_end_info["start_novelty"]] + e_novelty + [start_end_info["end_novelty"]] update_vertex_2_gene(gene_ID, vertex_IDs, strand, vertex_2_gene) # For novel genes and transcripts, add names to novelty entries - talon_gene_name, talon_transcript_name = construct_names(gene_ID, - transcript_ID, - run_info.idprefix, - run_info.n_places) + talon_gene_name, talon_transcript_name = construct_names( + gene_ID, transcript_ID, run_info.idprefix, run_info.n_places + ) if len(gene_novelty) > 0: - gene_novelty.append((gene_ID, run_info.idprefix, "TALON", - "gene_status", "NOVEL")) - gene_novelty.append((gene_ID, run_info.idprefix, "TALON", - "gene_name", talon_gene_name)) - gene_novelty.append((gene_ID, run_info.idprefix, "TALON", - "gene_id", talon_gene_name)) + gene_novelty.append((gene_ID, run_info.idprefix, "TALON", "gene_status", "NOVEL")) + gene_novelty.append((gene_ID, run_info.idprefix, "TALON", "gene_name", talon_gene_name)) + gene_novelty.append((gene_ID, run_info.idprefix, "TALON", "gene_id", talon_gene_name)) if len(transcript_novelty) > 0: - transcript_novelty.append((transcript_ID, run_info.idprefix, "TALON", - "transcript_status", "NOVEL")) - transcript_novelty.append((transcript_ID, run_info.idprefix, "TALON", - "transcript_name", talon_transcript_name)) - transcript_novelty.append((transcript_ID, run_info.idprefix, "TALON", - "transcript_id", talon_transcript_name)) + transcript_novelty.append((transcript_ID, run_info.idprefix, "TALON", "transcript_status", "NOVEL")) + transcript_novelty.append((transcript_ID, run_info.idprefix, "TALON", "transcript_name", talon_transcript_name)) + transcript_novelty.append((transcript_ID, run_info.idprefix, "TALON", "transcript_id", talon_transcript_name)) # Add annotation entries for any novel exons exon_novelty = [] exons = edge_IDs[::2] @@ -1692,8 +1774,7 @@ def identify_transcript(chrom, positions, strand, cursor, location_dict, edge_di if sum(e_novelty) > 0: for exon, is_novel in zip(exons, e_novelty): if is_novel: - exon_novelty.append((exon, run_info.idprefix, "TALON", - "exon_status", "NOVEL")) + exon_novelty.append((exon, run_info.idprefix, "TALON", "exon_status", "NOVEL")) # Package up information for output annotations = dstruct.Struct() @@ -1712,9 +1793,9 @@ def identify_transcript(chrom, positions, strand, cursor, location_dict, edge_di def construct_names(gene_ID, transcript_ID, prefix, n_places): - """ Create a gene and transcript name using the TALON IDs. - The n_places variable indicates how many characters long the numeric - part of the name should be. """ + """Create a gene and transcript name using the TALON IDs. + The n_places variable indicates how many characters long the numeric + part of the name should be.""" gene_ID_str = str(gene_ID).zfill(n_places) gene_name = prefix + "G" + gene_ID_str @@ -1726,8 +1807,8 @@ def construct_names(gene_ID, transcript_ID, prefix, n_places): def check_inputs(options): - """ Checks the input options provided by the user and makes sure that - they are valid. Throw an error with descriptive help message if not.""" + """Checks the input options provided by the user and makes sure that + they are valid. Throw an error with descriptive help message if not.""" # Make sure that the input database exists! 
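# (after this existence check, check_inputs validates the genome build against
# the database and then parses the config file, whose comma-separated fields are
# dataset name, sample description, platform, and the associated sam/bam file)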
database = options.database @@ -1743,8 +1824,7 @@ def check_inputs(options): builds = [str(x[0]) for x in cursor.fetchall()] if options.build not in builds: build_names = ", ".join(list(builds)) - msg = "Please specify a genome build that exists in the" +\ - " database. The choices are: " + build_names + msg = "Please specify a genome build that exists in the" + " database. The choices are: " + build_names logging.error(msg) raise ValueError(msg) @@ -1758,15 +1838,17 @@ def check_inputs(options): cursor.execute(""" SELECT dataset_name FROM dataset """) existing_datasets = [str(x[0]) for x in cursor.fetchall()] - with open(options.config_file, 'r') as f: + with open(options.config_file, "r") as f: for line in f: - line = line.strip().split(',') + line = line.strip().split(",") curr_sam = line[3] if len(line) != 4: - msg = 'Incorrect number of comma-separated fields' +\ - ' in config file. There should be four: ' +\ - '(dataset name, sample description, ' +\ - 'platform, associated sam/bam file).' + msg = ( + "Incorrect number of comma-separated fields" + + " in config file. There should be four: " + + "(dataset name, sample description, " + + "platform, associated sam/bam file)." + ) logging.error(msg) raise ValueError(msg) @@ -1779,19 +1861,18 @@ def check_inputs(options): metadata = (line[0], line[1], line[2]) dataname = metadata[0] if dataname in existing_datasets: - logging.warning("Ignoring dataset with name '" + dataname + - "' because it is already in the database.") + logging.warning( + "Ignoring dataset with name '" + dataname + "' because it is already in the database." + ) elif dataname in curr_datasets: - logging.warning("Skipping duplicated instance of dataset '" + - dataname + "'.") + logging.warning("Skipping duplicated instance of dataset '" + dataname + "'.") elif curr_sam in sam_files: - logging.warning("Skipping duplicated instance of sam file '" + - curr_sam + "'.") + logging.warning("Skipping duplicated instance of sam file '" + curr_sam + "'.") else: dataset_metadata.append(metadata) curr_datasets.append(dataname) if not curr_sam.endswith(".sam") and not curr_sam.endswith(".bam"): - msg = 'Last field in config file must be a .sam/.bam file' + msg = "Last field in config file must be a .sam/.bam file" logging.error(msg) raise ValueError(msg) sam_files.append(curr_sam) @@ -1805,7 +1886,7 @@ def check_inputs(options): cursor.execute(""" SELECT dataset_name FROM dataset """) existing_datasets = [str(x[0]) for x in cursor.fetchall()] - with open(options.config_file, 'r') as f: + with open(options.config_file, "r") as f: n_lines = 0 for line in f: n_lines += 1 @@ -1815,13 +1896,15 @@ def check_inputs(options): # raise ValueError('Using more than one sam file with RG '+\ # 'tag is currently unsupported') - line = line.strip().split(',') + line = line.strip().split(",") curr_sam = line[2] if len(line) != 3: - msg = 'Incorrect number of comma-separated fields' +\ - ' in config file. There should be three: ' +\ - '(sample description, ' +\ - 'platform, associated sam/bam file).' + msg = ( + "Incorrect number of comma-separated fields" + + " in config file. There should be three: " + + "(sample description, " + + "platform, associated sam/bam file)." + ) logging.error(msg) raise ValueError(msg) @@ -1830,41 +1913,44 @@ def check_inputs(options): msg = f"SAM/BAM file '{curr_sam}' does not exist!" 
logging.error(msg) raise ValueError(msg) - metadata = ['', line[0], line[1]] + metadata = ["", line[0], line[1]] # get list of dataset names from the CB tag in the sam file - if curr_sam.endswith('.sam'): - + if curr_sam.endswith(".sam"): # which rows are comment rows? - with open(curr_sam, 'r') as infile: + with open(curr_sam, "r") as infile: skip_rows = [] for i, line in enumerate(infile): - if line.startswith('@'): + if line.startswith("@"): skip_rows.append(i) else: break # read just the cb tags - df = pd.read_csv(curr_sam, sep='\tCB:Z:', - skiprows=skip_rows, - usecols=[1], header=None, - names=['cb_tag'], engine='python') + df = pd.read_csv( + curr_sam, + sep="\tCB:Z:", + skiprows=skip_rows, + usecols=[1], + header=None, + names=["cb_tag"], + engine="python", + ) # is the df empty? if df.empty: - msg = 'SAM/BAM file contains no CB tags' + msg = "SAM/BAM file contains no CB tags" logging.error(msg) raise RuntimeError(msg) - df['dataset'] = df.cb_tag.str.split( - pat='\t', n=1, expand=True)[0] + df["dataset"] = df.cb_tag.str.split(pat="\t", n=1, expand=True)[0] datasets = df.dataset.unique().tolist() - elif curr_sam.endswith('.bam'): + elif curr_sam.endswith(".bam"): datasets = [] # make an index so we can use the pysam fetch function pysam.index(curr_sam) - infile = pysam.AlignmentFile(curr_sam, 'rb') + infile = pysam.AlignmentFile(curr_sam, "rb") for read in infile.fetch(): - tag = read.get_tag('CB') + tag = read.get_tag("CB") datasets.append(tag) # only unique dataset datasets = list(set(datasets)) @@ -1873,15 +1959,13 @@ def check_inputs(options): metadata[0] = dataname if dataname in existing_datasets: - msg = f"Dataset for read group {f} " +\ - "already in database." + msg = f"Dataset for read group {f} " + "already in database." logging.error(msg) raise RuntimeError(msg) # warnings.warn("Ignoring dataset with name '" + dataname + \ # "' because it is already in the database.") elif dataname in curr_datasets: - msg = f"Dataset for read group {f} " +\ - "already in config file." + msg = f"Dataset for read group {f} " + "already in config file." logging.error(msg) raise RuntimeError(msg) # warnings.warn("Skipping duplicated instance of dataset '" + \ @@ -1890,10 +1974,9 @@ def check_inputs(options): dataset_metadata.append(tuple(metadata)) curr_datasets.append(dataname) if curr_sam in sam_files: - logging.warning("Skipping duplicated instance of sam/bam file '" + - curr_sam + "'.") + logging.warning("Skipping duplicated instance of sam/bam file '" + curr_sam + "'.") if not curr_sam.endswith(".sam") and not curr_sam.endswith(".bam"): - msg = 'Last field in config file must be a .sam/.bam file' + msg = "Last field in config file must be a .sam/.bam file" logging.error(msg) raise ValueError(msg) sam_files.append(curr_sam) @@ -1906,19 +1989,25 @@ def check_inputs(options): # sam_files.append(curr_sam) if sam_files == []: - msg = "All of the provided dataset names are already in "+\ - "the database. Please check your config file." + msg = "All of the provided dataset names are already in " + "the database. Please check your config file." logging.error(msg) raise RuntimeError(msg) return sam_files, dataset_metadata -def init_run_info(database, genome_build, min_coverage=0.9, min_identity=0, - use_cb_tag=False, create_novel_spliced_genes=False, tmp_dir="talon_tmp/"): - """ Initializes a dictionary that keeps track of important run information - such as the desired genome build, the prefix for novel identifiers, - and the novel counters for the run. 
""" +def init_run_info( + database, + genome_build, + min_coverage=0.9, + min_identity=0, + use_cb_tag=False, + create_novel_spliced_genes=False, + tmp_dir="talon_tmp/", +): + """Initializes a dictionary that keeps track of important run information + such as the desired genome build, the prefix for novel identifiers, + and the novel counters for the run.""" with sqlite3.connect(database) as conn: conn.row_factory = sqlite3.Row @@ -1936,8 +2025,8 @@ def init_run_info(database, genome_build, min_coverage=0.9, min_identity=0, # Fetch information from run_info table cursor.execute("""SELECT * FROM run_info""") for info in cursor.fetchall(): - info_name = info['item'] - value = info['value'] + info_name = info["item"] + value = info["value"] if info_name not in ["idprefix", "schema_version"]: value = int(value) run_info[info_name] = value @@ -1945,14 +2034,14 @@ def init_run_info(database, genome_build, min_coverage=0.9, min_identity=0, # Fetch dataset counter query = "SELECT * FROM counters WHERE category == 'dataset'" cursor.execute(query) - run_info.dataset = cursor.fetchone()['count'] + run_info.dataset = cursor.fetchone()["count"] return run_info def init_outfiles(outprefix, tmp_dir="talon_tmp/"): - """ Initialize output files for the run that all processes will be able to - write to via the queue. """ + """Initialize output files for the run that all processes will be able to + write to via the queue.""" # If there is a tmp dir there already, remove it if os.path.exists(tmp_dir): @@ -1985,59 +2074,42 @@ def init_outfiles(outprefix, tmp_dir="talon_tmp/"): for fname in outfiles: # Replace with handle to open file - open(outfiles[fname], 'w').close() + open(outfiles[fname], "w").close() return outfiles -def prepare_data_structures(cursor, run_info, chrom=None, start=None, - end=None, tmp_id="1"): - """ Initializes data structures needed for the run and organizes them - in a dictionary for more ease of use when passing them between functions +def prepare_data_structures(cursor, run_info, chrom=None, start=None, end=None, tmp_id="1"): + """Initializes data structures needed for the run and organizes them + in a dictionary for more ease of use when passing them between functions """ build = run_info.build min_coverage = run_info.min_coverage min_identity = run_info.min_identity struct_collection = dstruct.Struct() - struct_collection.tmp_gene = init_refs.make_temp_novel_gene_table(cursor, - build, chrom=chrom, - start=start, end=end, - tmp_tab="temp_gene_" + tmp_id) - - struct_collection.tmp_monoexon = init_refs.make_temp_monoexonic_transcript_table(cursor, - build, chrom=chrom, - start=start, end=end, - tmp_tab="temp_monoexon_" + tmp_id) - - struct_collection.tmp_t = init_refs.make_temp_transcript_table(cursor, - build, chrom=chrom, - start=start, end=end, - tmp_tab="temp_t_" + tmp_id) - - location_dict = init_refs.make_location_dict(build, cursor, chrom=chrom, - start=start, end=end) - - edge_dict = init_refs.make_edge_dict(cursor, build=build, chrom=chrom, - start=start, end=end) - - transcript_dict = init_refs.make_transcript_dict(cursor, build, chrom=chrom, - start=start, end=end) - - vertex_2_gene = init_refs.make_vertex_2_gene_dict(cursor, build=build, - chrom=chrom, - start=start, end=end) - - gene_starts = init_refs.make_gene_start_or_end_dict(cursor, - build, "start", - chrom=chrom, - start=start, - end=end) - gene_ends = init_refs.make_gene_start_or_end_dict(cursor, - build, "end", - chrom=chrom, - start=start, - end=end) + struct_collection.tmp_gene = 
init_refs.make_temp_novel_gene_table( + cursor, build, chrom=chrom, start=start, end=end, tmp_tab="temp_gene_" + tmp_id + ) + + struct_collection.tmp_monoexon = init_refs.make_temp_monoexonic_transcript_table( + cursor, build, chrom=chrom, start=start, end=end, tmp_tab="temp_monoexon_" + tmp_id + ) + + struct_collection.tmp_t = init_refs.make_temp_transcript_table( + cursor, build, chrom=chrom, start=start, end=end, tmp_tab="temp_t_" + tmp_id + ) + + location_dict = init_refs.make_location_dict(build, cursor, chrom=chrom, start=start, end=end) + + edge_dict = init_refs.make_edge_dict(cursor, build=build, chrom=chrom, start=start, end=end) + + transcript_dict = init_refs.make_transcript_dict(cursor, build, chrom=chrom, start=start, end=end) + + vertex_2_gene = init_refs.make_vertex_2_gene_dict(cursor, build=build, chrom=chrom, start=start, end=end) + + gene_starts = init_refs.make_gene_start_or_end_dict(cursor, build, "start", chrom=chrom, start=start, end=end) + gene_ends = init_refs.make_gene_start_or_end_dict(cursor, build, "end", chrom=chrom, start=start, end=end) struct_collection.location_dict = location_dict struct_collection.edge_dict = edge_dict @@ -2050,32 +2122,43 @@ def prepare_data_structures(cursor, run_info, chrom=None, start=None, def compute_delta(orig_pos, new_pos, strand): - """ Given a starting position and a new position, compute the distance - between them. The sign indicates whether the second point is - upstream or downstream of the original with respect to strand. """ + """Given a starting position and a new position, compute the distance + between them. The sign indicates whether the second point is + upstream or downstream of the original with respect to strand.""" abs_dist = abs(orig_pos - new_pos) if strand == "+": if new_pos < orig_pos: - return -1*abs_dist + return -1 * abs_dist else: return abs_dist elif strand == "-": if new_pos < orig_pos: return abs_dist else: - return -1*abs_dist + return -1 * abs_dist else: - msg = 'Strand must be either + or -' + msg = "Strand must be either + or -" logging.error(msg) raise ValueError(msg) -def identify_monoexon_transcript(chrom, positions, strand, cursor, location_dict, - edge_dict, transcript_dict, vertex_2_gene, - gene_starts, gene_ends, run_info, tmp_gene, - tmp_t, - tmp_monoexon): +def identify_monoexon_transcript( + chrom, + positions, + strand, + cursor, + location_dict, + edge_dict, + transcript_dict, + vertex_2_gene, + gene_starts, + gene_ends, + run_info, + tmp_gene, + tmp_t, + tmp_monoexon, +): gene_novelty = [] transcript_novelty = [] exon_novelty = [] @@ -2087,7 +2170,8 @@ def identify_monoexon_transcript(chrom, positions, strand, cursor, location_dict end = positions[-1] # First, look for a monoexonic transcript match that overlaps the current # transcript - query = Template(""" SELECT * + query = Template( + """ SELECT * FROM $tmp_monoexon AS tm WHERE tm.chromosome = '$chrom' AND tm.strand = '$strand' @@ -2095,10 +2179,16 @@ def identify_monoexon_transcript(chrom, positions, strand, cursor, location_dict OR (min_pos >= $start AND max_pos <= $end) OR (min_pos >= $start AND min_pos <= $end) OR (max_pos >= $start AND max_pos <= $end)) - """).substitute({"tmp_monoexon": tmp_monoexon, - "chrom": chrom, "strand": strand, - "start": min(start, end), - "end": max(start, end)}) + """ + ).substitute( + { + "tmp_monoexon": tmp_monoexon, + "chrom": chrom, + "strand": strand, + "start": min(start, end), + "end": max(start, end), + } + ) cursor.execute(query) matches = cursor.fetchall() @@ -2110,32 +2200,28 @@ def 
identify_monoexon_transcript(chrom, positions, strand, cursor, location_dict best_match = None for match in matches: # get overlap and compare - match_interval = [match['start'], match['end']] + match_interval = [match["start"], match["end"]] overlap, perc_overlap = get_overlap([start, end], match_interval) if overlap >= best_overlap: best_overlap = overlap best_match = match - gene_ID = best_match['gene_ID'] - transcript_ID = best_match['transcript_ID'] - vertex_IDs = (best_match['start_vertex'], best_match['end_vertex']) - edge_IDs = [best_match['exon_ID']] - diff_5p = compute_delta(best_match['start'], start, strand) - diff_3p = compute_delta(best_match['end'], end, strand) + gene_ID = best_match["gene_ID"] + transcript_ID = best_match["transcript_ID"] + vertex_IDs = (best_match["start_vertex"], best_match["end_vertex"]) + edge_IDs = [best_match["exon_ID"]] + diff_5p = compute_delta(best_match["start"], start, strand) + diff_3p = compute_delta(best_match["end"], end, strand) # If there is no match, proceed to genomic/antisense style matching. else: # Start by performing vertex match vertex_IDs, v_novelty, diff_5p, diff_3p = match_monoexon_vertices( - chrom, - positions, - strand, - location_dict, - run_info) + chrom, positions, strand, location_dict, run_info + ) # Get edge match (or create new edge) - edge_IDs, e_novelty = match_all_transcript_edges(vertex_IDs, strand, - edge_dict, run_info) + edge_IDs, e_novelty = match_all_transcript_edges(vertex_IDs, strand, edge_dict, run_info) # If the exon is known, then this transcript must be ISM or NIC gene_ID = None @@ -2143,94 +2229,125 @@ def identify_monoexon_transcript(chrom, positions, strand, cursor, location_dict all_matches = search_for_ISM(edge_IDs, transcript_dict) if all_matches != None: - gene_ID, transcript_ID, transcript_novelty, info = process_ISM(chrom, positions, - strand, edge_IDs, - vertex_IDs, all_matches, - transcript_dict, - gene_starts, gene_ends, - edge_dict, location_dict, - run_info, cursor, tmp_gene, tmp_t) + gene_ID, transcript_ID, transcript_novelty, info = process_ISM( + chrom, + positions, + strand, + edge_IDs, + vertex_IDs, + all_matches, + transcript_dict, + gene_starts, + gene_ends, + edge_dict, + location_dict, + run_info, + cursor, + tmp_gene, + tmp_t, + ) if gene_ID == None: # Find best gene match using overlap search if the ISM/NIC check didn't work - gene_ID, match_strand = search_for_overlap_with_gene(chrom, positions[0], - positions[1], strand, - cursor, run_info, tmp_gene, - tmp_t) + gene_ID, match_strand = search_for_overlap_with_gene( + chrom, positions[0], positions[1], strand, cursor, run_info, tmp_gene, tmp_t + ) # Intergenic case if gene_ID == None: - gene_ID = create_gene(chrom, positions[0], positions[-1], - strand, cursor, tmp_gene) - - gene_novelty.append((gene_ID, run_info.idprefix, "TALON", - "intergenic_novel", "TRUE")) - transcript_ID = create_transcript(strand, chrom, positions[0], positions[-1], - gene_ID, edge_IDs, vertex_IDs, - transcript_dict, tmp_t, cursor)["transcript_ID"] - transcript_novelty.append((transcript_ID, run_info.idprefix, "TALON", - "intergenic_transcript", "TRUE")) + gene_ID = create_gene(chrom, positions[0], positions[-1], strand, cursor, tmp_gene) + + gene_novelty.append((gene_ID, run_info.idprefix, "TALON", "intergenic_novel", "TRUE")) + transcript_ID = create_transcript( + strand, + chrom, + positions[0], + positions[-1], + gene_ID, + edge_IDs, + vertex_IDs, + transcript_dict, + tmp_t, + cursor, + )["transcript_ID"] + transcript_novelty.append((transcript_ID, 
run_info.idprefix, "TALON", "intergenic_transcript", "TRUE")) # Antisense case elif match_strand != strand: anti_gene_ID = gene_ID - gene_ID = create_gene(chrom, positions[0], positions[-1], - strand, cursor, tmp_gene) - transcript_ID = create_transcript(strand, chrom, positions[0], positions[-1], - gene_ID, edge_IDs, vertex_IDs, - transcript_dict, tmp_t, cursor)["transcript_ID"] - - gene_novelty.append((gene_ID, run_info.idprefix, "TALON", - "antisense_gene", "TRUE")) - gene_novelty.append((gene_ID, run_info.idprefix, "TALON", - "gene_antisense_to_IDs", anti_gene_ID)) - transcript_novelty.append((transcript_ID, run_info.idprefix, "TALON", - "antisense_transcript", "TRUE")) + gene_ID = create_gene(chrom, positions[0], positions[-1], strand, cursor, tmp_gene) + transcript_ID = create_transcript( + strand, + chrom, + positions[0], + positions[-1], + gene_ID, + edge_IDs, + vertex_IDs, + transcript_dict, + tmp_t, + cursor, + )["transcript_ID"] + + gene_novelty.append((gene_ID, run_info.idprefix, "TALON", "antisense_gene", "TRUE")) + gene_novelty.append((gene_ID, run_info.idprefix, "TALON", "gene_antisense_to_IDs", anti_gene_ID)) + transcript_novelty.append((transcript_ID, run_info.idprefix, "TALON", "antisense_transcript", "TRUE")) # Same strand else: - transcript_ID = create_transcript(strand, chrom, positions[0], positions[-1], - gene_ID, edge_IDs, vertex_IDs, - transcript_dict, tmp_t, cursor)["transcript_ID"] - transcript_novelty.append((transcript_ID, run_info.idprefix, "TALON", - "genomic_transcript", "TRUE")) + transcript_ID = create_transcript( + strand, + chrom, + positions[0], + positions[-1], + gene_ID, + edge_IDs, + vertex_IDs, + transcript_dict, + tmp_t, + cursor, + )["transcript_ID"] + transcript_novelty.append((transcript_ID, run_info.idprefix, "TALON", "genomic_transcript", "TRUE")) # Add all novel vertices to vertex_2_gene now that we have the gene ID update_vertex_2_gene(gene_ID, vertex_IDs, strand, vertex_2_gene) - talon_gene_name, talon_transcript_name = construct_names(gene_ID, - transcript_ID, - run_info.idprefix, - run_info.n_places) + talon_gene_name, talon_transcript_name = construct_names( + gene_ID, transcript_ID, run_info.idprefix, run_info.n_places + ) # Add novel gene annotation attributes if len(gene_novelty) > 0: - gene_novelty.append((gene_ID, run_info.idprefix, "TALON", - "gene_status", "NOVEL")) - gene_novelty.append((gene_ID, run_info.idprefix, "TALON", - "gene_name", talon_gene_name)) - gene_novelty.append((gene_ID, run_info.idprefix, "TALON", - "gene_id", talon_gene_name)) + gene_novelty.append((gene_ID, run_info.idprefix, "TALON", "gene_status", "NOVEL")) + gene_novelty.append((gene_ID, run_info.idprefix, "TALON", "gene_name", talon_gene_name)) + gene_novelty.append((gene_ID, run_info.idprefix, "TALON", "gene_id", talon_gene_name)) # Add novel transcript annotation attributes - transcript_novelty.append((transcript_ID, run_info.idprefix, "TALON", - "transcript_status", "NOVEL")) - transcript_novelty.append((transcript_ID, run_info.idprefix, "TALON", - "transcript_name", talon_transcript_name)) - transcript_novelty.append((transcript_ID, run_info.idprefix, "TALON", - "transcript_id", talon_transcript_name)) + transcript_novelty.append((transcript_ID, run_info.idprefix, "TALON", "transcript_status", "NOVEL")) + transcript_novelty.append((transcript_ID, run_info.idprefix, "TALON", "transcript_name", talon_transcript_name)) + transcript_novelty.append((transcript_ID, run_info.idprefix, "TALON", "transcript_id", talon_transcript_name)) # Add annotation entries 
for any novel exons if e_novelty[0] == 1: - exon_novelty.append((edge_IDs[0], run_info.idprefix, "TALON", - "exon_status", "NOVEL")) + exon_novelty.append((edge_IDs[0], run_info.idprefix, "TALON", "exon_status", "NOVEL")) # Add the novel transcript to the temporary monoexon table - new_mono = (gene_ID, transcript_ID, chrom, start, end, strand, - vertex_IDs[0], vertex_IDs[-1], edge_IDs[0], - min(start, end), max(start, end)) - cols = '("gene_ID", "transcript_ID", "chromosome", "start", "end",' + \ - '"strand", "start_vertex", "end_vertex", "exon_ID", "min_pos",' + \ - '"max_pos")' - command = 'INSERT INTO ' + tmp_monoexon + ' ' + cols + ' VALUES ' + \ - '(?,?,?,?,?,?,?,?,?,?,?)' + new_mono = ( + gene_ID, + transcript_ID, + chrom, + start, + end, + strand, + vertex_IDs[0], + vertex_IDs[-1], + edge_IDs[0], + min(start, end), + max(start, end), + ) + cols = ( + '("gene_ID", "transcript_ID", "chromosome", "start", "end",' + + '"strand", "start_vertex", "end_vertex", "exon_ID", "min_pos",' + + '"max_pos")' + ) + command = "INSERT INTO " + tmp_monoexon + " " + cols + " VALUES " + "(?,?,?,?,?,?,?,?,?,?,?)" cursor.execute(command, new_mono) # Package annotation information @@ -2251,7 +2368,7 @@ def identify_monoexon_transcript(chrom, positions, strand, cursor, location_dict def update_database(database, batch_size, outfiles, datasets): - """ Adds new entries to the database. """ + """Adds new entries to the database.""" conn = sqlite3.connect(database) conn.row_factory = sqlite3.Row @@ -2266,8 +2383,7 @@ def update_database(database, batch_size, outfiles, datasets): batch_add_observed(cursor, outfiles.observed, batch_size) update_counter(cursor) batch_add_annotations(cursor, outfiles.gene_annot, "gene", batch_size) - batch_add_annotations(cursor, outfiles.transcript_annot, "transcript", - batch_size) + batch_add_annotations(cursor, outfiles.transcript_annot, "transcript", batch_size) batch_add_annotations(cursor, outfiles.exon_annot, "exon", batch_size) check_database_integrity(cursor) @@ -2278,7 +2394,7 @@ def update_database(database, batch_size, outfiles, datasets): def update_counter(cursor): # , n_datasets): - """ Update the database counter using the global counter variables """ + """Update the database counter using the global counter variables""" update_g = 'UPDATE "counters" SET "count" = ? 
WHERE "category" = "genes"' cursor.execute(update_g, [gene_counter.value()]) @@ -2303,21 +2419,18 @@ def update_counter(cursor): # , n_datasets): def batch_add_vertex2gene(cursor, v2g_file, batch_size): - """ Add new vertex-gene relationships to the vertex table """ + """Add new vertex-gene relationships to the vertex table""" - with open(v2g_file, 'r') as f: + with open(v2g_file, "r") as f: while True: - batch = [tuple(x.strip().split("\t")) - for x in islice(f, batch_size)] + batch = [tuple(x.strip().split("\t")) for x in islice(f, batch_size)] if batch == []: break try: - cols = " (" + ", ".join([str_wrap_double(x) for x in - ["vertex_ID", "gene_ID"]]) + ") " - command = 'INSERT OR IGNORE INTO "vertex"' + cols + "VALUES " + \ - '(?,?)' + cols = " (" + ", ".join([str_wrap_double(x) for x in ["vertex_ID", "gene_ID"]]) + ") " + command = 'INSERT OR IGNORE INTO "vertex"' + cols + "VALUES " + "(?,?)" cursor.executemany(command, batch) except Exception as e: @@ -2327,21 +2440,22 @@ def batch_add_vertex2gene(cursor, v2g_file, batch_size): def batch_add_locations(cursor, location_file, batch_size): - """ Add new locations to database """ + """Add new locations to database""" - with open(location_file, 'r') as f: + with open(location_file, "r") as f: while True: - batch = [tuple(x.strip().split("\t")) - for x in islice(f, batch_size)] + batch = [tuple(x.strip().split("\t")) for x in islice(f, batch_size)] if batch == []: break try: - cols = " (" + ", ".join([str_wrap_double(x) for x in - ["location_ID", "genome_build", "chromosome", "position"]]) + ") " - command = 'INSERT INTO "location"' + cols + "VALUES " + \ - '(?,?,?,?)' + cols = ( + " (" + + ", ".join([str_wrap_double(x) for x in ["location_ID", "genome_build", "chromosome", "position"]]) + + ") " + ) + command = 'INSERT INTO "location"' + cols + "VALUES " + "(?,?,?,?)" cursor.executemany(command, batch) except Exception as e: @@ -2351,21 +2465,22 @@ def batch_add_locations(cursor, location_file, batch_size): def batch_add_edges(cursor, edge_file, batch_size): - """ Add new edges to database """ + """Add new edges to database""" - with open(edge_file, 'r') as f: + with open(edge_file, "r") as f: while True: - batch = [tuple(x.strip().split("\t")) - for x in islice(f, batch_size)] + batch = [tuple(x.strip().split("\t")) for x in islice(f, batch_size)] if batch == []: break try: - cols = " (" + ", ".join([str_wrap_double(x) for x in - ["edge_ID", "v1", "v2", "edge_type", "strand"]]) + ") " - command = 'INSERT INTO "edge"' + \ - cols + "VALUES " + '(?,?,?,?,?)' + cols = ( + " (" + + ", ".join([str_wrap_double(x) for x in ["edge_ID", "v1", "v2", "edge_type", "strand"]]) + + ") " + ) + command = 'INSERT INTO "edge"' + cols + "VALUES " + "(?,?,?,?,?)" cursor.executemany(command, batch) except Exception as e: @@ -2376,15 +2491,15 @@ def batch_add_edges(cursor, edge_file, batch_size): def batch_add_transcripts(cursor, transcript_file, batch_size): - """ Add new transcripts to database """ + """Add new transcripts to database""" - with open(transcript_file, 'r') as f: + with open(transcript_file, "r") as f: while True: batch_lines = islice(f, batch_size) batch = [] for line in batch_lines: transcript = line.strip().split("\t") - if transcript[3] == 'None': + if transcript[3] == "None": transcript[3] = None batch.append(transcript) @@ -2392,11 +2507,26 @@ def batch_add_transcripts(cursor, transcript_file, batch_size): break try: - cols = " (" + ", ".join([str_wrap_double(x) for x in - ["transcript_id", "gene_id", "start_exon", "jn_path", - 
"end_exon", "start_vertex", "end_vertex", "n_exons"]]) + ") " - command = 'INSERT INTO "transcripts"' + \ - cols + "VALUES " + '(?,?,?,?,?,?,?,?)' + cols = ( + " (" + + ", ".join( + [ + str_wrap_double(x) + for x in [ + "transcript_id", + "gene_id", + "start_exon", + "jn_path", + "end_exon", + "start_vertex", + "end_vertex", + "n_exons", + ] + ] + ) + + ") " + ) + command = 'INSERT INTO "transcripts"' + cols + "VALUES " + "(?,?,?,?,?,?,?,?)" cursor.executemany(command, batch) except Exception as e: @@ -2407,21 +2537,18 @@ def batch_add_transcripts(cursor, transcript_file, batch_size): def batch_add_genes(cursor, gene_file, batch_size): - """ Add genes to the database gene table """ + """Add genes to the database gene table""" - with open(gene_file, 'r') as f: + with open(gene_file, "r") as f: while True: - batch = [tuple(x.strip().split("\t")) - for x in islice(f, batch_size)] + batch = [tuple(x.strip().split("\t")) for x in islice(f, batch_size)] if batch == []: break try: - cols = " (" + ", ".join([str_wrap_double(x) for x in - ["gene_ID", "strand"]]) + ") " - command = 'INSERT OR IGNORE INTO genes' + \ - cols + "VALUES " + '(?,?)' + cols = " (" + ", ".join([str_wrap_double(x) for x in ["gene_ID", "strand"]]) + ") " + command = "INSERT OR IGNORE INTO genes" + cols + "VALUES " + "(?,?)" cursor.executemany(command, batch) except Exception as e: @@ -2431,13 +2558,13 @@ def batch_add_genes(cursor, gene_file, batch_size): def add_datasets(cursor, datasets): - """ Add dataset records to database """ + """Add dataset records to database""" try: - cols = " (" + ", ".join([str_wrap_double(x) for x in - ["dataset_ID", "dataset_name", "sample", "platform"]]) + ") " - command = 'INSERT INTO "dataset"' + cols + \ - "VALUES " + '(?,?,?,?)' + cols = ( + " (" + ", ".join([str_wrap_double(x) for x in ["dataset_ID", "dataset_name", "sample", "platform"]]) + ") " + ) + command = 'INSERT INTO "dataset"' + cols + "VALUES " + "(?,?,?,?)" cursor.executemany(command, datasets) except Exception as e: @@ -2447,28 +2574,27 @@ def add_datasets(cursor, datasets): def batch_add_annotations(cursor, annot_file, annot_type, batch_size): - """ Add gene/transcript/exon annotations to the appropriate annotation table - """ + """Add gene/transcript/exon annotations to the appropriate annotation table""" batch_size = 1 if annot_type not in ["gene", "transcript", "exon"]: - msg = "When running batch annot update, must specify " +\ - "annot_type as 'gene', 'exon', or 'transcript'." + msg = "When running batch annot update, must specify " + "annot_type as 'gene', 'exon', or 'transcript'." 
logging.error(msg) raise ValueError(msg) - with open(annot_file, 'r') as f: + with open(annot_file, "r") as f: while True: - batch = [tuple(x.strip().split("\t")) - for x in islice(f, batch_size)] + batch = [tuple(x.strip().split("\t")) for x in islice(f, batch_size)] if batch == []: break try: - cols = " (" + ", ".join([str_wrap_double(x) for x in - ["ID", "annot_name", "source", "attribute", "value"]]) + ") " - command = 'INSERT OR IGNORE INTO "' + annot_type + \ - '_annotations" ' + cols + "VALUES " + '(?,?,?,?,?)' + cols = ( + " (" + + ", ".join([str_wrap_double(x) for x in ["ID", "annot_name", "source", "attribute", "value"]]) + + ") " + ) + command = 'INSERT OR IGNORE INTO "' + annot_type + '_annotations" ' + cols + "VALUES " + "(?,?,?,?,?)" cursor.executemany(command, batch) except Exception as e: @@ -2478,12 +2604,12 @@ def batch_add_annotations(cursor, annot_file, annot_type, batch_size): def batch_add_observed(cursor, observed_file, batch_size): - """ Adds observed tuples (obs_ID, gene_ID, transcript_ID, read_name, - dataset, start_vertex_ID, end_vertex_ID, start_exon, end_exon, - start_delta, end_delta, read_length) to observed table of database. """ + """Adds observed tuples (obs_ID, gene_ID, transcript_ID, read_name, + dataset, start_vertex_ID, end_vertex_ID, start_exon, end_exon, + start_delta, end_delta, read_length) to observed table of database.""" abundance = {} - with open(observed_file, 'r') as f: + with open(observed_file, "r") as f: while True: batch = [] for observed in islice(f, batch_size): @@ -2524,14 +2650,35 @@ def batch_add_observed(cursor, observed_file, batch_size): # Add to database try: - cols = " (" + ", ".join([str_wrap_double(x) for x in - ["obs_ID", "gene_ID", "transcript_ID", "read_name", - "dataset", "start_vertex", "end_vertex", - "start_exon", "end_exon", "start_delta", "end_delta", - "read_length", "fraction_As", "custom_label", - "allelic_label", "start_support", "end_support"]]) + ") " - command = 'INSERT INTO "observed"' + cols + \ - "VALUES " + '(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)' + cols = ( + " (" + + ", ".join( + [ + str_wrap_double(x) + for x in [ + "obs_ID", + "gene_ID", + "transcript_ID", + "read_name", + "dataset", + "start_vertex", + "end_vertex", + "start_exon", + "end_exon", + "start_delta", + "end_delta", + "read_length", + "fraction_As", + "custom_label", + "allelic_label", + "start_support", + "end_support", + ] + ] + ) + + ") " + ) + command = 'INSERT INTO "observed"' + cols + "VALUES " + "(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)" cursor.executemany(command, batch) except Exception as e: @@ -2549,21 +2696,20 @@ def batch_add_observed(cursor, observed_file, batch_size): def batch_add_abundance(cursor, entries, batch_size): - """ Reads abundance tuples (transcript_ID, dataset, count) and - adds to the abundance table of the database """ + """Reads abundance tuples (transcript_ID, dataset, count) and + adds to the abundance table of the database""" index = 0 while index < len(entries): try: - batch = entries[index:index + batch_size] + batch = entries[index : index + batch_size] except: batch = entries[index:] index += batch_size try: - cols = " (" + ", ".join([str_wrap_double(x) for x in - ["transcript_id", "dataset", "count"]]) + ") " - command = 'INSERT INTO "abundance"' + cols + "VALUES " + '(?,?,?)' + cols = " (" + ", ".join([str_wrap_double(x) for x in ["transcript_id", "dataset", "count"]]) + ") " + command = 'INSERT INTO "abundance"' + cols + "VALUES " + "(?,?,?)" cursor.executemany(command, batch) except Exception as e: print(e) 
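All of the batch_add_* helpers above share one loading pattern: stream a tab-separated tmp file with itertools.islice and flush each slice to SQLite with a single executemany() call. The sketch below is an editor's illustration of that pattern, not part of the patch; the database path, tmp-file name, and abundance table used here are hypothetical stand-ins.

import sqlite3
from itertools import islice

def batch_insert_tsv(cursor, tsv_path, batch_size=10000):
    # Assumes an existing table: abundance(transcript_id, dataset, count)
    with open(tsv_path) as f:
        while True:
            # Pull up to batch_size lines; islice yields fewer (or none) at EOF
            batch = [tuple(line.rstrip("\n").split("\t")) for line in islice(f, batch_size)]
            if not batch:
                break
            # One multi-row flush per slice instead of one INSERT per line
            cursor.executemany(
                'INSERT INTO "abundance" ("transcript_id", "dataset", "count") VALUES (?,?,?)', batch
            )

conn = sqlite3.connect("talon.db")  # hypothetical database
batch_insert_tsv(conn.cursor(), "abundance_tuples.tsv")  # hypothetical tmp file
conn.commit()

Slicing the file rather than reading it whole keeps memory flat no matter how many reads were annotated, which is why the batch_size of 10000 recurs throughout update_database().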
@@ -2572,11 +2718,11 @@ def batch_add_abundance(cursor, entries, batch_size): def check_database_integrity(cursor): - """ Perform some checks on the database. Run before committing changes""" + """Perform some checks on the database. Run before committing changes""" # ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) # print("[ %s ] Validating database........" % (ts)) - logging.info('Validating database') + logging.info("Validating database") # For each category, check that the number of table entries matches the counter counter_query = "SELECT * FROM counters" @@ -2597,15 +2743,17 @@ def check_database_integrity(cursor): if actual_count != curr_counter: fail = 1 - logging.error("Database counter for '" + table_name + - "' does not match the number of entries in the table." + - " Discarding changes to database and exiting...") + logging.error( + "Database counter for '" + + table_name + + "' does not match the number of entries in the table." + + " Discarding changes to database and exiting..." + ) logging.debug("table_count: " + str(actual_count)) logging.debug("counter_value: " + str(curr_counter)) if fail == 1: - msg = "Discrepancy found in database. " +\ - "Discarding changes to database and exiting..." + msg = "Discrepancy found in database. " + "Discarding changes to database and exiting..." logging.error(msg) raise RuntimeError(msg) @@ -2613,28 +2761,26 @@ def check_database_integrity(cursor): def parallel_talon(read_file, interval, database, run_info, queue): - """ Manage TALON processing of a single chunk of the input. Initialize - reference data structures covering only the provided interval region, - then send the read file to the annotation step. Once annotation is - complete, return the data tuples generated so that they can be - added to the database, OR alternately, pickle them and write to file - where they can be accessed later. """ + """Manage TALON processing of a single chunk of the input. Initialize + reference data structures covering only the provided interval region, + then send the read file to the annotation step. Once annotation is + complete, return the data tuples generated so that they can be + added to the database, OR alternately, pickle them and write to file + where they can be accessed later.""" # ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) # print("[ %s ] Annotating reads in interval %s:%d-%d..." 
% # (ts, interval[0], interval[1], interval[2])) - logging.info(f'Annotating reads in interval {interval[0]}:{interval[1]}-{interval[2]}...') + logging.info(f"Annotating reads in interval {interval[0]}:{interval[1]}-{interval[2]}...") with sqlite3.connect(database) as conn: conn.row_factory = sqlite3.Row cursor = conn.cursor() tmp_id = str(os.getpid()) - struct_collection = prepare_data_structures(cursor, run_info, - chrom=interval[0], - start=interval[1], - end=interval[2], - tmp_id=tmp_id) + struct_collection = prepare_data_structures( + cursor, run_info, chrom=interval[0], start=interval[1], end=interval[2], tmp_id=tmp_id + ) interval_id = "%s_%d_%d" % interval @@ -2644,37 +2790,29 @@ def parallel_talon(read_file, interval, database, run_info, queue): qc_metrics = tutils.check_read_quality(record, run_info) passed_qc = qc_metrics[2] - qc_msg = (run_info.outfiles.qc, "\t".join( - [str(x) for x in qc_metrics])) + qc_msg = (run_info.outfiles.qc, "\t".join([str(x) for x in qc_metrics])) queue.put(qc_msg) if passed_qc: - annotation_info = annotate_read(record, cursor, run_info, - struct_collection) - unpack_observed(annotation_info, queue, - run_info.outfiles.observed) + annotation_info = annotate_read(record, cursor, run_info, struct_collection) + unpack_observed(annotation_info, queue, run_info.outfiles.observed) # Update annotation records # TODO: there is no need for entry to be a list/tuple for entry in annotation_info.gene_novelty: - msg = (run_info.outfiles.gene_annot, - "\t".join([str(x) for x in entry])) + msg = (run_info.outfiles.gene_annot, "\t".join([str(x) for x in entry])) queue.put(msg) for entry in annotation_info.transcript_novelty: - msg = (run_info.outfiles.transcript_annot, - "\t".join([str(x) for x in entry])) + msg = (run_info.outfiles.transcript_annot, "\t".join([str(x) for x in entry])) queue.put(msg) for entry in annotation_info.exon_novelty: - msg = (run_info.outfiles.exon_annot, - "\t".join([str(x) for x in entry])) + msg = (run_info.outfiles.exon_annot, "\t".join([str(x) for x in entry])) queue.put(msg) # Write the temp_gene table to file - cursor.execute("SELECT gene_ID, strand FROM " + - struct_collection.tmp_gene) + cursor.execute("SELECT gene_ID, strand FROM " + struct_collection.tmp_gene) for row in cursor.fetchall(): - msg = ((run_info.outfiles.genes, str( - row['gene_ID'])+"\t" + row['strand'])) + msg = (run_info.outfiles.genes, str(row["gene_ID"]) + "\t" + row["strand"]) queue.put(msg) # Pass messages to output files @@ -2684,23 +2822,30 @@ def parallel_talon(read_file, interval, database, run_info, queue): for transcript in list(transcripts.values()): # Only write novel transcripts to file if type(transcript) is dict: - entry = "\t".join([str(x) for x in (transcript['transcript_ID'], - transcript['gene_ID'], - transcript['start_exon'], - transcript['jn_path'], - transcript['end_exon'], - transcript['start_vertex'], - transcript['end_vertex'], - transcript['n_exons'])]) + entry = "\t".join( + [ + str(x) + for x in ( + transcript["transcript_ID"], + transcript["gene_ID"], + transcript["start_exon"], + transcript["jn_path"], + transcript["end_exon"], + transcript["start_vertex"], + transcript["end_vertex"], + transcript["n_exons"], + ) + ] + ) queue.put((run_info.outfiles.transcripts, entry)) # Write new edges to file edges = struct_collection.edge_dict for edge in list(edges.values()): if type(edge) is dict: - entry = "\t".join([str(x) for x in [edge['edge_ID'], edge['v1'], - edge['v2'], edge['edge_type'], - edge['strand']]]) + entry = "\t".join( + [str(x) 
for x in [edge["edge_ID"], edge["v1"], edge["v2"], edge["edge_type"], edge["strand"]]]
+            )
             queue.put((run_info.outfiles.edges, entry))
 
     # Write locations to file
@@ -2708,18 +2853,18 @@ def parallel_talon(read_file, interval, database, run_info, queue):
     for chrom_dict in location_dict.values():
         for loc in list(chrom_dict.values()):
             if type(loc) is dict:
-                msg = (run_info.outfiles.location,
-                       "\t".join([str(x) for x in (loc['location_ID'],
-                                                   loc['genome_build'],
-                                                   loc['chromosome'],
-                                                   loc['position'])]))
+                msg = (
+                    run_info.outfiles.location,
+                    "\t".join(
+                        [str(x) for x in (loc["location_ID"], loc["genome_build"], loc["chromosome"], loc["position"])]
+                    ),
+                )
                 queue.put(msg)
 
     # Write new vertex-gene combos to file
     for vertex_ID, gene_set in struct_collection.vertex_2_gene.items():
         for gene in gene_set:
-            msg = (run_info.outfiles.v2g,
-                   "\t".join([str(x) for x in (vertex_ID, gene[0])]))
+            msg = (run_info.outfiles.v2g, "\t".join([str(x) for x in (vertex_ID, gene[0])]))
             queue.put(msg)
 
     struct_collection = None
@@ -2728,13 +2873,13 @@ def parallel_talon(read_file, interval, database, run_info, queue):
 
 
 def parse_custom_SAM_tags(sam_record: pysam.AlignedSegment):
-    """ Looks for the following tags in the read. Will be set to None if no tag
-    is found
-    fA: fraction As in the 10-bp interval following the alignment end
-    lC: custom label (type = string)
-    lA: custom allele label (type = string)
-    tS: flag indicating start site support (type = string)
-    tE: flag indicating end site support (typ = string)
+    """Looks for the following tags in the read. Will be set to None if no tag
+    is found
+    fA: fraction As in the 10-bp interval following the alignment end
+    lC: custom label (type = string)
+    lA: custom allele label (type = string)
+    tS: flag indicating start site support (type = string)
+    tE: flag indicating end site support (type = string)
     """
     try:
         fraction_As = sam_record.get_tag("fA")
@@ -2760,28 +2905,27 @@ def parse_custom_SAM_tags(sam_record: pysam.AlignedSegment):
     return fraction_As, custom_label, allelic_label, start_support, end_support
 
 
-def annotate_read(sam_record: pysam.AlignedSegment, cursor, run_info,
-                  struct_collection, mode=1):
-    """ Accepts a pysam-formatted read as input, and compares it to the
-        annotations in struct_collection to assign it a gene and transcript
-        identity. Returns annotation_info, which is a dict that has the
-        following attributes:
-            gene_ID
-            transcript_ID
-            gene_novelty
-            transcript_novelty
-            exon_novelty
-            start_vertex
-            end_vertex
-            start_exon
-            end_exon
-            start_delta
-            end_delta
-            fraction_As (following the end of the alignment)
-            custom_label
-            allelic_label
-            start_support
-            end_support
+def annotate_read(sam_record: pysam.AlignedSegment, cursor, run_info, struct_collection, mode=1):
+    """Accepts a pysam-formatted read as input, and compares it to the
+    annotations in struct_collection to assign it a gene and transcript
+    identity.
Returns annotation_info, which is a dict that has the + following attributes: + gene_ID + transcript_ID + gene_novelty + transcript_novelty + exon_novelty + start_vertex + end_vertex + start_exon + end_exon + start_delta + end_delta + fraction_As (following the end of the alignment) + custom_label + allelic_label + start_support + end_support """ # Parse attributes to determine the chromosome, positions, and strand of the transcript read_ID = sam_record.query_name @@ -2800,14 +2944,12 @@ def annotate_read(sam_record: pysam.AlignedSegment, cursor, run_info, cigar = sam_record.cigarstring # Parse custom TALON tags - fraction_As, custom_label, allelic_label, start_support, \ - end_support = parse_custom_SAM_tags(sam_record) + fraction_As, custom_label, allelic_label, start_support, end_support = parse_custom_SAM_tags(sam_record) intron_list = tutils.get_introns(sam_record, sam_start, cigar) # Adjust intron positions by 1 to get splice sites in exon terms - splice_sites = [x + 1 if i % 2 == 1 else x - 1 for i, x in - enumerate(intron_list)] + splice_sites = [x + 1 if i % 2 == 1 else x - 1 for i, x in enumerate(intron_list)] positions = [sam_start] + splice_sites + [sam_end] # Flip the positions' order if the read is on the minus strand @@ -2822,25 +2964,40 @@ def annotate_read(sam_record: pysam.AlignedSegment, cursor, run_info, gene_starts = struct_collection.gene_starts gene_ends = struct_collection.gene_ends - n_exons = int(len(positions)/2) + n_exons = int(len(positions) / 2) if n_exons > 1: - annotation_info = identify_transcript(chrom, positions, strand, - cursor, location_dict, - edge_dict, transcript_dict, - vertex_2_gene, - gene_starts, gene_ends, - run_info, - struct_collection.tmp_gene, - struct_collection.tmp_t) + annotation_info = identify_transcript( + chrom, + positions, + strand, + cursor, + location_dict, + edge_dict, + transcript_dict, + vertex_2_gene, + gene_starts, + gene_ends, + run_info, + struct_collection.tmp_gene, + struct_collection.tmp_t, + ) else: - annotation_info = identify_monoexon_transcript(chrom, positions, strand, - cursor, location_dict, - edge_dict, transcript_dict, - vertex_2_gene, - gene_starts, gene_ends, - run_info, struct_collection.tmp_gene, - struct_collection.tmp_t, - struct_collection.tmp_monoexon) + annotation_info = identify_monoexon_transcript( + chrom, + positions, + strand, + cursor, + location_dict, + edge_dict, + transcript_dict, + vertex_2_gene, + gene_starts, + gene_ends, + run_info, + struct_collection.tmp_gene, + struct_collection.tmp_t, + struct_collection.tmp_monoexon, + ) annotation_info.read_ID = read_ID annotation_info.dataset = dataset @@ -2858,19 +3015,30 @@ def annotate_read(sam_record: pysam.AlignedSegment, cursor, run_info, def unpack_observed(annotation_info, queue, obs_file): - """ Now that transcript has been annotated, unpack values and - create an observed entry. Send the observed entry to the queue - for output to obs_file.""" + """Now that transcript has been annotated, unpack values and + create an observed entry. 
Send the observed entry to the queue + for output to obs_file.""" obs_ID = observed_counter.increment() - observed = (obs_ID, annotation_info.gene_ID, annotation_info.transcript_ID, - annotation_info.read_ID, annotation_info.dataset, - annotation_info.start_vertex, annotation_info.end_vertex, - annotation_info.start_exon, annotation_info.end_exon, - annotation_info.start_delta, annotation_info.end_delta, - annotation_info.read_length, annotation_info.fraction_As, - annotation_info.custom_label, annotation_info.allelic_label, - annotation_info.start_support, annotation_info.end_support) + observed = ( + obs_ID, + annotation_info.gene_ID, + annotation_info.transcript_ID, + annotation_info.read_ID, + annotation_info.dataset, + annotation_info.start_vertex, + annotation_info.end_vertex, + annotation_info.start_exon, + annotation_info.end_exon, + annotation_info.start_delta, + annotation_info.end_delta, + annotation_info.read_length, + annotation_info.fraction_As, + annotation_info.custom_label, + annotation_info.allelic_label, + annotation_info.start_support, + annotation_info.end_support, + ) msg = (obs_file, "\t".join([str(x) for x in observed])) queue.put(msg) @@ -2878,14 +3046,14 @@ def unpack_observed(annotation_info, queue, obs_file): def listener(queue, outfiles, QC_header, timeout=72): - """ During the run, this function listens for messages on the provided - queue. When a message is received (consisting of a filename and a - string), it writes the string to that file. Timeout unit is in hours""" + """During the run, this function listens for messages on the provided + queue. When a message is received (consisting of a filename and a + string), it writes the string to that file. Timeout unit is in hours""" # Open all of the outfiles open_files = {} for fpath in outfiles.values(): - open_files[fpath] = open(fpath, 'w') + open_files[fpath] = open(fpath, "w") # Add a header to the QC file QC_file = open_files[outfiles.qc] @@ -2898,10 +3066,10 @@ def listener(queue, outfiles, QC_header, timeout=72): msg = queue.get() msg_fname = msg[0] msg_value = msg[1] - if datetime.now() > wait_until or msg_value == 'complete': + if datetime.now() > wait_until or msg_value == "complete": # ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) # print("[ %s ] Shutting down message queue..." 
% (ts)) - logging.info('Shutting down message queue...') + logging.info("Shutting down message queue...") for f in open_files.values(): f.close() break @@ -2911,29 +3079,33 @@ def listener(queue, outfiles, QC_header, timeout=72): def make_QC_header(coverage, identity, length): - """ Create a header for the read QC file """ - - cols = "\t".join(["dataset", "read_ID", "passed_QC", "primary_mapped", - "read_length", "fraction_aligned", "identity"]) - header = "\n".join(["# TALON run filtering settings:", - "# Min fraction read aligned: %f " % coverage, - "# Min read identity to reference: %f" % identity, - "# Min transcript length: %d" % length, - "# -------------------------------------------", - cols]) + """Create a header for the read QC file""" + + cols = "\t".join( + ["dataset", "read_ID", "passed_QC", "primary_mapped", "read_length", "fraction_aligned", "identity"] + ) + header = "\n".join( + [ + "# TALON run filtering settings:", + "# Min fraction read aligned: %f " % coverage, + "# Min read identity to reference: %f" % identity, + "# Min transcript length: %d" % length, + "# -------------------------------------------", + cols, + ] + ) return header def main(): - """ Runs program """ + """Runs program""" options = get_args() logger._init_logger(options.verbosity) # ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) # print("[ %s ] Started TALON run" % (ts)) - logging.info('Started TALON run') - + logging.info("Started TALON run") sam_files, dset_metadata = check_inputs(options) # print(sam_files) @@ -2955,19 +3127,18 @@ def main(): create_novel_spliced_genes = bool(options.create_novel_spliced_genes) # format tmp_dir if missing fwd slash - if not tmp_dir.endswith('/'): - tmp_dir += '/' + if not tmp_dir.endswith("/"): + tmp_dir += "/" # Set globally accessible counters get_counters(database) # Initialize worker pool with mp.Pool(processes=threads) as pool: - run_info = init_run_info(database, build, min_coverage, min_identity, - use_cb_tag, create_novel_spliced_genes, - tmp_dir=tmp_dir) - run_info.outfiles = init_outfiles(options.outprefix, - tmp_dir=tmp_dir) + run_info = init_run_info( + database, build, min_coverage, min_identity, use_cb_tag, create_novel_spliced_genes, tmp_dir=tmp_dir + ) + run_info.outfiles = init_outfiles(options.outprefix, tmp_dir=tmp_dir) # Create annotation entry for each dataset datasets = [] @@ -2978,16 +3149,13 @@ def main(): dataset_db_entries.append((d_id, d_name, description, platform)) # Partition the reads - read_groups, intervals, header_file = procsams.partition_reads(sam_files, - datasets, - use_cb_tag, - tmp_dir=tmp_dir, - n_threads=threads) - - read_files = procsams.write_reads_to_file( - read_groups, intervals, header_file, tmp_dir=tmp_dir) + read_groups, intervals, header_file = procsams.partition_reads( + sam_files, datasets, use_cb_tag, tmp_dir=tmp_dir, n_threads=threads + ) + + read_files = procsams.write_reads_to_file(read_groups, intervals, header_file, tmp_dir=tmp_dir) # ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) - logging.info(f'Split reads into {len(read_groups)} intervals') + logging.info(f"Split reads into {len(read_groups)} intervals") # Set up a queue specifically for writing to outfiles manager = mp.Manager() @@ -3000,50 +3168,47 @@ def main(): # ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) # print("[ %s ] Launching parallel annotation jobs" % (ts)) - logging.info('Launching parallel annotation jobs') + logging.info("Launching parallel annotation jobs") # Start running listener, which will monitor queue for 
messages - QC_header = make_QC_header(run_info.min_coverage, run_info.min_identity, - run_info.min_length) + QC_header = make_QC_header(run_info.min_coverage, run_info.min_identity, run_info.min_length) pool.apply_async(listener, (queue, run_info.outfiles, QC_header)) # Now launch the parallel TALON jobs pool.starmap(parallel_talon, jobs) # Now we are done, kill the listener - msg_done = (None, 'complete') + msg_done = (None, "complete") queue.put(msg_done) pool.close() pool.join() # ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) # print("[ %s ] All jobs complete. Starting database update." % (ts)) - logging.info('All jobs complete. Starting database update') + logging.info("All jobs complete. Starting database update") # Update the database batch_size = 10000 - update_database(database, batch_size, - run_info.outfiles, dataset_db_entries) + update_database(database, batch_size, run_info.outfiles, dataset_db_entries) # ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) # print("[ %s ] Database update complete." % (ts)) - logging.info('Database update complete.') + logging.info("Database update complete.") # Write output reads file # ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) # print("[ %s ] Creating read-wise annotation file." % (ts)) - logging.info('Creating read-wise annotation file') - get_read_annotations.make_read_annot_file(database, build, - outprefix, datasets=datasets) + logging.info("Creating read-wise annotation file") + get_read_annotations.make_read_annot_file(database, build, outprefix, datasets=datasets) # For debugging - #print("Genes: %d" % gene_counter.value()) - #print("Transcripts: %d" % transcript_counter.value()) - #print("Observed: %d" % observed_counter.value()) + # print("Genes: %d" % gene_counter.value()) + # print("Transcripts: %d" % transcript_counter.value()) + # print("Observed: %d" % observed_counter.value()) # ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) # print("[ %s ] DONE" % (ts)) - logging.info('DONE') + logging.info("DONE") -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/src/talon/talon_label_reads.py b/src/talon/talon_label_reads.py index 0915e69..8a565db 100644 --- a/src/talon/talon_label_reads.py +++ b/src/talon/talon_label_reads.py @@ -1,59 +1,72 @@ # TALON: Techonology-Agnostic Long Read Analysis Pipeline # Author: Dana Wyman # ----------------------------------------------------------------------------- -# This program reads in SAM-formatted long read alignments and adds a custom +# This program reads in SAM-formatted long read alignments and adds a custom # tag to reflect the fraction of As in the sequence immediately following the -# alignment. This can help indicate the likelihood of an internal priming -# artifact. +# alignment. This can help indicate the likelihood of an internal priming +# artifact. -import pyfaidx -import pysam +import glob import multiprocessing as mp -from datetime import datetime, timedelta -import time import os -import glob +import time +from datetime import datetime, timedelta from optparse import OptionParser +import pyfaidx +import pysam + + def get_options(): - """ Read input args """ - - parser = OptionParser(description=("This program reads in SAM-formatted " - "long read alignments and adds a custom tag to " - "reflect the fraction of As in the sequence " - "immediately following the alignment. 
This can help " - "indicate the likelihood of an internal priming " - "artifact.")) - parser.add_option("--f", dest = "sam_file", - help = "SAM file of transcripts") - parser.add_option("--g", dest = "genome_file", - help = "Reference genome fasta file") - parser.add_option("--t", dest = "threads", type = int, - help = "Number of threads to run", default = 1) - parser.add_option("--ar", dest = "fracA_range_size", type = int, - help = ("Size of post-transcript interval to compute " - "fraction As on. Default = 20"), default = 20) - parser.add_option("--tmpDir", dest = "tmp_dir", - help = ("Path to directory for tmp files. " - "Default = tmp_label_reads"), - default = "tmp_label_reads") - parser.add_option("--deleteTmp", dest = "delete_tmp", - action='store_true', - help = ("If this option is set, the temporary directory " - "generated by the program will be " - "removed at the end of the run.")) - parser.add_option("--o", dest = "outprefix", default = "talon_prelabels", - help = "Prefix for outfiles") + """Read input args""" + + parser = OptionParser( + description=( + "This program reads in SAM-formatted " + "long read alignments and adds a custom tag to " + "reflect the fraction of As in the sequence " + "immediately following the alignment. This can help " + "indicate the likelihood of an internal priming " + "artifact." + ) + ) + parser.add_option("--f", dest="sam_file", help="SAM file of transcripts") + parser.add_option("--g", dest="genome_file", help="Reference genome fasta file") + parser.add_option("--t", dest="threads", type=int, help="Number of threads to run", default=1) + parser.add_option( + "--ar", + dest="fracA_range_size", + type=int, + help=("Size of post-transcript interval to compute " "fraction As on. Default = 20"), + default=20, + ) + parser.add_option( + "--tmpDir", + dest="tmp_dir", + help=("Path to directory for tmp files. " "Default = tmp_label_reads"), + default="tmp_label_reads", + ) + parser.add_option( + "--deleteTmp", + dest="delete_tmp", + action="store_true", + help=( + "If this option is set, the temporary directory " + "generated by the program will be " + "removed at the end of the run." + ), + ) + parser.add_option("--o", dest="outprefix", default="talon_prelabels", help="Prefix for outfiles") (opts, args) = parser.parse_args() return opts -def fetch_seq(chrom: str, start: int, stop: int, strand: str, genome: pyfaidx.Fasta, - indexing=0): - """ Given a genomic interval, return the sequence with respect to the - strand supplied. - If 1-based indexing is specified, then 1 will be subtracted from the - position to convert to the Python indexing. """ + +def fetch_seq(chrom: str, start: int, stop: int, strand: str, genome: pyfaidx.Fasta, indexing=0): + """Given a genomic interval, return the sequence with respect to the + strand supplied. 
+ If 1-based indexing is specified, then 1 will be subtracted from the + position to convert to the Python indexing.""" if start > stop: raise ValueError("Start must be less than or equal to stop") @@ -71,32 +84,34 @@ def fetch_seq(chrom: str, start: int, stop: int, strand: str, genome: pyfaidx.Fa return str(seq) + def compute_frac_As(seq: str): - """ Compute fraction of sequence made up of As """ + """Compute fraction of sequence made up of As""" - a = seq.count('A') + a = seq.count("A") n = len(seq) if n == 0: return 0 else: - return float(a)/n + return float(a) / n + def fetch_range_after_transcript(transcript_end: int, strand: str, length: int): - """ Given the 1-based stop position of a transcript and its strand, - return a 1-based genomic range of the specified length that starts with - the base just after the end position. The smaller position is always - reported first. - Example: - fetch_range_after_transcript(4, '+', 2) would yield (5, 6) - fetch_range_after_transcript(4, '-', 2) would yield (2, 3) + """Given the 1-based stop position of a transcript and its strand, + return a 1-based genomic range of the specified length that starts with + the base just after the end position. The smaller position is always + reported first. + Example: + fetch_range_after_transcript(4, '+', 2) would yield (5, 6) + fetch_range_after_transcript(4, '-', 2) would yield (2, 3) """ if length < 1: raise ValueError("Length must be greater than or equal to 1") - if strand == '+': + if strand == "+": range_start = transcript_end + 1 range_end = range_start + length - 1 - elif strand == '-': + elif strand == "-": range_start = transcript_end - 1 range_end = range_start - length + 1 else: @@ -104,49 +119,49 @@ def fetch_range_after_transcript(transcript_end: int, strand: str, length: int): return (min(range_start, range_end), max(range_start, range_end)) + def compute_transcript_end(transcript: pysam.AlignedSegment): - """ Compute the position of the final transcript base relative to the genome, - taking strand into account. Position is 1-based. """ + """Compute the position of the final transcript base relative to the genome, + taking strand into account. 
Position is 1-based.""" strand = "-" if transcript.is_reverse else "+" - if strand == '+': + if strand == "+": return transcript.reference_end - if strand == '-': - return transcript.reference_start + 1 # (make 1-based) + if strand == "-": + return transcript.reference_start + 1 # (make 1-based) + -def compute_frac_as_after_transcript(chrom: str, transcript_end: int, strand: str, - range_size: int, genome: pyfaidx.Fasta): - """ Given a transcript end, strand, range size, and genome object, - compute the fraction of sequence in the range immediately after - the transcript end that is made up of As.""" +def compute_frac_as_after_transcript( + chrom: str, transcript_end: int, strand: str, range_size: int, genome: pyfaidx.Fasta +): + """Given a transcript end, strand, range size, and genome object, + compute the fraction of sequence in the range immediately after + the transcript end that is made up of As.""" # Get sequence of range immediately after transcript - range_start, range_end = fetch_range_after_transcript(transcript_end, - strand, range_size) - range_seq = fetch_seq(chrom, range_start, range_end, strand, genome, - indexing = 1) + range_start, range_end = fetch_range_after_transcript(transcript_end, strand, range_size) + range_seq = fetch_seq(chrom, range_start, range_end, strand, genome, indexing=1) # Get fraction As in sequence return compute_frac_As(range_seq) - -def split_reads_by_chrom(sam_file, tmp_dir = "tmp_label_reads", n_threads = 1): - """ Reads a SAM/BAM file and splits the reads into one file per chromosome. - Returns a list of the resulting filenames.""" + +def split_reads_by_chrom(sam_file, tmp_dir="tmp_label_reads", n_threads=1): + """Reads a SAM/BAM file and splits the reads into one file per chromosome. + Returns a list of the resulting filenames.""" ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) print("[ %s ] Splitting SAM by chromosome..." % (ts)) tmp_dir = tmp_dir + "/raw" - os.system("mkdir -p %s" %(tmp_dir)) + os.system("mkdir -p %s" % (tmp_dir)) if sam_file.endswith(".sam"): # Convert to bam ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) print("[ %s ] -----Converting to bam...." % (ts)) bam_file = tmp_dir + "/all_reads.bam" - pysam.view("-b", "-S", "-@", str(n_threads), "-o", bam_file, sam_file, - catch_stdout=False) + pysam.view("-b", "-S", "-@", str(n_threads), "-o", bam_file, sam_file, catch_stdout=False) elif sam_file.endswith(".bam"): bam_file = sam_file else: @@ -160,24 +175,23 @@ def split_reads_by_chrom(sam_file, tmp_dir = "tmp_label_reads", n_threads = 1): pysam.sort("-@", str(n_threads), "-o", sorted_bam, bam_file) bam_file = sorted_bam pysam.index(bam_file) - + # Open bam file tmp_dir += "/chroms" - os.system("mkdir -p %s" %(tmp_dir)) + os.system("mkdir -p %s" % (tmp_dir)) read_files = [] ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) print("[ %s ] -----Writing chrom files..." 
% (ts)) with pysam.AlignmentFile(bam_file, "rb") as bam: # Iterate over chromosomes and write a reads file for each - chromosomes = [ x.contig for x in bam.get_index_statistics() \ - if x.mapped > 0 ] + chromosomes = [x.contig for x in bam.get_index_statistics() if x.mapped > 0] for chrom in chromosomes: - records = bam.fetch(chrom) - fname = tmp_dir + "/" + chrom + ".sam" - with pysam.AlignmentFile(fname, "w", template = bam) as o: - for record in records: - o.write(record) - read_files.append(fname) + records = bam.fetch(chrom) + fname = tmp_dir + "/" + chrom + ".sam" + with pysam.AlignmentFile(fname, "w", template=bam) as o: + for record in records: + o.write(record) + read_files.append(fname) return read_files @@ -185,16 +199,15 @@ def split_reads_by_chrom(sam_file, tmp_dir = "tmp_label_reads", n_threads = 1): def run_chrom_thread(sam_file, options): """ """ outname = sam_file.split("/")[-1].split(".sam")[0] - genome = pyfaidx.Fasta(options.genome_file, sequence_always_upper=True, - one_based_attributes=False) + genome = pyfaidx.Fasta(options.genome_file, sequence_always_upper=True, one_based_attributes=False) - os.system("mkdir -p %s" % (options.tmp_dir + "/labeled")) + os.system("mkdir -p %s" % (options.tmp_dir + "/labeled")) out_log_fname = options.tmp_dir + "/labeled/" + outname + "_read_labels.tsv" out_sam_fname = options.tmp_dir + "/labeled/" + outname + ".sam" # Iterate over reads - out_log = open(out_log_fname, 'w') - pos_seen_fracA = {} # Store fraction As for previously seen positions + out_log = open(out_log_fname, "w") + pos_seen_fracA = {} # Store fraction As for previously seen positions with pysam.AlignmentFile(sam_file) as sam: out_sam = pysam.AlignmentFile(out_sam_fname, "w", template=sam) @@ -211,50 +224,51 @@ def run_chrom_thread(sam_file, options): if location_str in pos_seen_fracA: frac_As = pos_seen_fracA[location_str] else: - frac_As = compute_frac_as_after_transcript(chrom, transcript_end, - strand, - options.fracA_range_size, - genome) + frac_As = compute_frac_as_after_transcript( + chrom, transcript_end, strand, options.fracA_range_size, genome + ) pos_seen_fracA[location_str] = frac_As - record.tags += [('fA', round(frac_As,3))] + record.tags += [("fA", round(frac_As, 3))] # TODO: Add other labels to the read, i.e. CAGE, canonical polyA # Write to output files out_sam.write(record) - out_log.write("\t".join([read_id, str(frac_As)]) + '\n') + out_log.write("\t".join([read_id, str(frac_As)]) + "\n") out_sam.close() out_log.close() return + def pool_outputs(indir, outprefix): - """ Given an input directory containing SAM files and log files, - concatenate them to form the final output. 
""" + """Given an input directory containing SAM files and log files, + concatenate them to form the final output.""" sam_fname = outprefix + "_labeled.sam" - log_fname = outprefix + "_read_labels.tsv" - + log_fname = outprefix + "_read_labels.tsv" + # Get list of files to combine sam_files = glob.glob(indir + "/*.sam") log_files = glob.glob(indir + "/*_read_labels.tsv") # Add headers - with open(log_fname, 'w') as f: - f.write("\t".join(["read_name", "fraction_As"]) + '\n') + with open(log_fname, "w") as f: + f.write("\t".join(["read_name", "fraction_As"]) + "\n") - os.system('cp %s %s' % (sam_files[0], sam_fname)) + os.system("cp %s %s" % (sam_files[0], sam_fname)) - # Concatenate + # Concatenate for sam in sam_files[1:]: os.system('grep -v "^@" %s >> %s' % (sam, sam_fname)) for logfile in log_files: - os.system('cat %s >> %s' % (logfile, log_fname)) + os.system("cat %s >> %s" % (logfile, log_fname)) return + def main(options=None): if options == None: options = get_options() @@ -270,8 +284,7 @@ def main(options=None): os.system("rm -r %s" % (options.tmp_dir)) # Partition reads by chromosome - read_files = split_reads_by_chrom(options.sam_file, tmp_dir = options.tmp_dir, - n_threads = options.threads) + read_files = split_reads_by_chrom(options.sam_file, tmp_dir=options.tmp_dir, n_threads=options.threads) # Now launch the parallel TALON read label jobs ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) @@ -290,10 +303,11 @@ def main(options=None): # Delete tmp_dir if desired if options.delete_tmp: os.system("rm -r %s" % (options.tmp_dir)) - + ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) print("[ %s ] Run complete" % (ts)) -if __name__ == '__main__': + +if __name__ == "__main__": options = get_options() - main(options) + main(options) diff --git a/src/talon/transcript.py b/src/talon/transcript.py index b1c32d7..b64cb32 100644 --- a/src/talon/transcript.py +++ b/src/talon/transcript.py @@ -1,31 +1,30 @@ # TALON: Techonology-Agnostic Long Read Analysis Pipeline # Author: Dana Wyman -#------------------------------------------------------------------------------ +# ------------------------------------------------------------------------------ + class Transcript(object): """Stores information about a gene transcript, including its location - and constitutive exons. - Attributes: - identifier: Accession ID of transcript, i.e. an Ensembl ID. Must - be unique. - name: Human-readable name of the transcript. Does not have to be - unique - chromosome: Chromosome that the transcript is located on - (format "chr1") - start: The start position of the transcript with respect to the - forward strand - end: The end position of the transcript with respect to the - forward strand - strand: "+" if the transcript is on the forward strand, and "-" if - it is on the reverse strand - gene_id: unique ID of the gene that this transcript belongs to - exons: List of exon objects belonging to this transcript, in sorted - order. + and constitutive exons. + Attributes: + identifier: Accession ID of transcript, i.e. an Ensembl ID. Must + be unique. + name: Human-readable name of the transcript. 
Does not have to be
+        unique
+        chromosome: Chromosome that the transcript is located on
+        (format "chr1")
+        start: The start position of the transcript with respect to the
+        forward strand
+        end: The end position of the transcript with respect to the
+        forward strand
+        strand: "+" if the transcript is on the forward strand, and "-" if
+        it is on the reverse strand
+        gene_id: unique ID of the gene that this transcript belongs to
+        exons: List of exon objects belonging to this transcript, in sorted
+        order.
     """
 
-    def __init__(self, identifier, chromosome, start, end, strand, gene_id,
-                 annotations):
-
+    def __init__(self, identifier, chromosome, start, end, strand, gene_id, annotations):
         self.identifier = str(identifier)
         self.gene_id = str(gene_id)
 
@@ -39,7 +38,7 @@ def __init__(self, identifier, chromosome, start, end, strand, gene_id,
         self.annotations = annotations
 
     def get_5prime_vertex(self):
-        """ Returns ID of 5' end vertex """
+        """Returns ID of 5' end vertex"""
 
         if self.strand == "+":
             return self.exons[0].v1
@@ -47,7 +46,7 @@ def get_5prime_vertex(self):
         return self.exons[-1].v2
 
     def get_3prime_vertex(self):
-        """ Returns ID of 5' end vertex """
+        """Returns ID of 3' end vertex"""
 
         if self.strand == "+":
             return self.exons[-1].v2
@@ -58,7 +57,7 @@ def get_edge_path(self):
         edges = self.get_all_edges()
         if len(edges) == 0:
             return None
-        path = [ x.identifier for x in edges]
+        path = [x.identifier for x in edges]
 
         # Must reverse the path if the transcript is on the '-' strand
         if self.strand == "-":
@@ -67,30 +66,29 @@ def get_all_edges(self):
         all_edges = []
-        for i in range(0,self.n_exons):
+        for i in range(0, self.n_exons):
             all_edges.append(self.exons[i])
             try:
                 all_edges.append(self.introns[i])
             except:
                 pass
-
+
         return all_edges
 
     def get_length(self):
-        """ Computes the length of the transcript by summing the lengths of
-            its exons """
+        """Computes the length of the transcript by summing the lengths of
+        its exons"""
 
         if len(self.exons) == 0:
-            raise ValueError('Cannot compute length: Transcript does not ' + \
-                             'have any exons')
-
+            raise ValueError("Cannot compute length: Transcript does not " + "have any exons")
+
         transcript_length = 0
         for exon in self.exons:
             transcript_length += exon.length
         return transcript_length
 
     def get_exon_coords(self):
-        """ Returns a list of the exon coordinates in order """
+        """Returns a list of the exon coordinates in order"""
         exon_coords = []
         for exon in self.exons:
             exon_coords.append(int(exon.start))
@@ -101,11 +99,17 @@ def add_exon(self, exon):
         """Adds an exon object to the transcript."""
 
         if exon.start > exon.end:
-            raise ValueError('Exon start (' + str(exon.start) + ') ' + \
-                'is supposed to be before the exon end (' + str(exon.end) + ')')
+            raise ValueError(
+                "Exon start ("
+                + str(exon.start)
+                + ") "
+                + "is supposed to be before the exon end ("
+                + str(exon.end)
+                + ")"
+            )
 
         # Check where in the list the exon should be added
-        for i in range(0,len(self.exons)):
+        for i in range(0, len(self.exons)):
             existing_exon = self.exons[i]
             if exon.end < existing_exon.start:
                 self.exons = self.exons[0:i] + [exon] + self.exons[i:]
@@ -121,44 +125,53 @@ def add_intron(self, intron):
         """Adds an edge object to the transcript."""
 
         if intron.start > intron.end:
-            raise ValueError('Intron start (' + str(intron.start) + ')' + \
-                'is supposed to be before the intron end (' + str(intron.end) + ')')
+            raise ValueError(
+                "Intron start ("
+                + str(intron.start)
+                + ") "
+                + "is supposed to be before the intron end ("
+                + str(intron.end)
+                + ")"
+            )
 
         # Check
where in the list the intron should be added - for i in range(0,len(self.introns)): + for i in range(0, len(self.introns)): existing_intron = self.introns[i] if intron.end < existing_intron.start: self.introns = self.introns[0:i] + [intron] + self.introns[i:] return self.introns.append(intron) return - + def check_exon_validity(self): - """ The transcript's exons are valid if: - 1) Exons are in sorted order (ascending) - 2) Exon bounds do not exceed transcript start and end - 3) Exons are all on the appropriate chromosome - If these conditions are violated, this function raises an error. + """The transcript's exons are valid if: + 1) Exons are in sorted order (ascending) + 2) Exon bounds do not exceed transcript start and end + 3) Exons are all on the appropriate chromosome + If these conditions are violated, this function raises an error. """ prev = 0 for exon in self.exons: if exon.chromosome != self.chromosome: - raise ValueError('Invalid exon in transcript ' + \ - self.identifier + ': wrong chromosome') + raise ValueError("Invalid exon in transcript " + self.identifier + ": wrong chromosome") if exon.start < self.start or exon.end > self.end: print("self.start: " + str(self.start)) print("self.end: " + str(self.end)) print("exon.start: " + str(exon.start)) print("exon.end: " + str(exon.end)) - raise ValueError('Invalid exon in transcript ' + \ - self.identifier + ': (' + str(exon.start) + "-" + \ - str(exon.end) + \ - ') is located beyond start or end of transcript') + raise ValueError( + "Invalid exon in transcript " + + self.identifier + + ": (" + + str(exon.start) + + "-" + + str(exon.end) + + ") is located beyond start or end of transcript" + ) if exon.start <= prev: # This error would indicate a TALON bug rather than user error, - # so we shouldn't see it. - raise ValueError('Exons of transcript ' + \ - self.identifier + ' are not stored in ascending order.') + # so we shouldn't see it. + raise ValueError("Exons of transcript " + self.identifier + " are not stored in ascending order.") prev = exon.end return @@ -170,7 +183,7 @@ def get_introns(self): intron_list = [] i = 1 - while (i < len(exon_coords) - 1): + while i < len(exon_coords) - 1: j = i + 1 intron_list.append(exon_coords[i] + 1) @@ -179,32 +192,30 @@ def get_introns(self): return intron_list - def print_transcript(self): - """ Print a string representation of the Transcript. Good for debugging - """ + """Print a string representation of the Transcript. Good for debugging""" transcript_id = self.identifier if transcript_id == None: transcript_id = "Transcript" - print("\tLocation: " + self.chromosome + ":" + str(self.start) + "-" + \ - str(self.end) + "(" + self.strand + ")") + print("\tLocation: " + self.chromosome + ":" + str(self.start) + "-" + str(self.end) + "(" + self.strand + ")") # Print exons print("\tExons: " + "\n".join([str(x.start) + "-" + str(x.end) for x in self.exons])) - return + return + def get_transcript_from_db(transcript_row, exon_tree, intron_tree): - """ Uses information from a database transcript entry to create a + """Uses information from a database transcript entry to create a Transcript object. 
Args: - transcript_row: Tuple-formatted row from transcripts table of a + transcript_row: Tuple-formatted row from transcripts table of a TALON database """ - transcript_id = str(transcript_row['transcript_id']) - gene_id = str(transcript_row['gene_id']) + transcript_id = str(transcript_row["transcript_id"]) + gene_id = str(transcript_row["gene_id"]) - edges = transcript_row['path'].split(",") + edges = transcript_row["path"].split(",") # Check strand sample_edge = str(edges[0]) @@ -212,7 +223,7 @@ def get_transcript_from_db(transcript_row, exon_tree, intron_tree): # Reverse the edge list if the transcript is on the - strand if strand == "-": - edges = edges[::-1] + edges = edges[::-1] # Get start and end of transcript if edges[0] in exon_tree.edges and edges[-1] in exon_tree.edges: @@ -220,30 +231,40 @@ def get_transcript_from_db(transcript_row, exon_tree, intron_tree): start = (exon_tree.edges[edges[0]]).start end = (exon_tree.edges[edges[-1]]).end else: - raise RuntimeError("Ignoring transcript with ID " + transcript_id +\ - " because first or last exon not found in exon tree.") - - transcript = Transcript(transcript_id, chromosome, start, end, strand, - gene_id,{}) + raise RuntimeError( + "Ignoring transcript with ID " + transcript_id + " because first or last exon not found in exon tree." + ) - # Make sure that all of the exons and introns in this transcript have a + transcript = Transcript(transcript_id, chromosome, start, end, strand, gene_id, {}) + + # Make sure that all of the exons and introns in this transcript have a # non-zero length. Otherwise, return None - for i in range(0,len(edges)): + for i in range(0, len(edges)): # Even indices are exons if i % 2 == 0: curr_exon_id = str(edges[i]) if curr_exon_id not in exon_tree.edges: - raise RuntimeError("Ignoring transcript with ID " + transcript_id +\ - " because exon " + curr_exon_id + " not found in exon tree.") - + raise RuntimeError( + "Ignoring transcript with ID " + + transcript_id + + " because exon " + + curr_exon_id + + " not found in exon tree." + ) + else: curr_intron_id = str(edges[i]) if curr_intron_id not in intron_tree.edges: - print("Warning: Ignoring transcript with ID " + transcript_id +\ - " because intron " + curr_intron_id + " not found in intron tree.") + print( + "Warning: Ignoring transcript with ID " + + transcript_id + + " because intron " + + curr_intron_id + + " not found in intron tree." + ) return None - for i in range(0,len(edges)): + for i in range(0, len(edges)): # Even indices are exons if i % 2 == 0: curr_exon_id = str(edges[i]) @@ -257,23 +278,23 @@ def get_transcript_from_db(transcript_row, exon_tree, intron_tree): (curr_intron.transcript_ids).add(transcript_id) return transcript - + def get_transcript_from_gtf(transcript_info): - """ Uses information from a GTF-formatted transcript entry to create a + """Uses information from a GTF-formatted transcript entry to create a Transcript object. Args: - transcript_info: A list containing fields from a GTF file gene + transcript_info: A list containing fields from a GTF file gene entry. Example: - + chr1 HAVANA transcript 12010 13670 . + - . gene_id "ENSG00000223972.5"; transcript_id "ENST00000450305.2"; - gene_type "transcribed_unprocessed_pseudogene"; - gene_status "KNOWN"; gene_name "DDX11L1"; - transcript_type "transcribed_unprocessed_pseudogene"; - transcript_status "KNOWN"; transcript_name "DDX11L1-001"; - level 2; ont "PGO:0000005"; ont "PGO:0000019"; tag "basic"; - transcript_support_level "NA"; havana_gene "OTTHUMG00000000961.2"; + . 
gene_id "ENSG00000223972.5"; transcript_id "ENST00000450305.2"; + gene_type "transcribed_unprocessed_pseudogene"; + gene_status "KNOWN"; gene_name "DDX11L1"; + transcript_type "transcribed_unprocessed_pseudogene"; + transcript_status "KNOWN"; transcript_name "DDX11L1-001"; + level 2; ont "PGO:0000005"; ont "PGO:0000019"; tag "basic"; + transcript_support_level "NA"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000002844.2"; """ chromosome = transcript_info[0] @@ -282,36 +303,36 @@ def get_transcript_from_gtf(transcript_info): strand = transcript_info[6] if "transcript_id" not in transcript_info[-1]: - raise ValueError('GTF entry lacks a transcript_id field') + raise ValueError("GTF entry lacks a transcript_id field") annotations = extract_transcript_annotations_from_GTF(transcript_info) + gene_id = annotations["gene_id"] + transcript_id = annotations["transcript_id"] - gene_id = annotations['gene_id'] - transcript_id = annotations['transcript_id'] - - transcript = Transcript(transcript_id, chromosome, start, end, strand, - gene_id, annotations) + transcript = Transcript(transcript_id, chromosome, start, end, strand, gene_id, annotations) return transcript + def extract_transcript_annotations_from_GTF(tab_fields): - """ Extracts key-value annotations from the GTF description field - """ + """Extracts key-value annotations from the GTF description field""" attributes = {} # remove trailing newline and split by semicolon - description = tab_fields[-1].strip('\n') - description = description.split(';') + description = tab_fields[-1].strip("\n") + description = description.split(";") # Parse description for fields in description: - if fields == "" or fields == " ": continue + if fields == "" or fields == " ": + continue fields = fields.split() - if fields[0] == '': fields = fields[1:] + if fields[0] == "": + fields = fields[1:] - key = fields[0].replace('"', '') - val = ' '.join(fields[1:]).replace('"', '') + key = fields[0].replace('"', "") + val = " ".join(fields[1:]).replace('"', "") attributes[key] = val @@ -322,33 +343,31 @@ def extract_transcript_annotations_from_GTF(tab_fields): attributes["source"] = tab_fields[1] - return attributes + return attributes def get_transcript_from_exon(exon, gene_id, transcript_id): - """ In rare cases, GTF exons are listed with gene and transcript IDs that - do not have corresponding entries. In this case, we create a transcript - for this exon for bookkeeping purposes.""" + """In rare cases, GTF exons are listed with gene and transcript IDs that + do not have corresponding entries. In this case, we create a transcript + for this exon for bookkeeping purposes.""" name = transcript_id chromosome = exon.chromosome start = exon.start end = exon.end strand = exon.strand - transcript = Transcript(transcript_id, name, None, chromosome, start, end, - strand, gene_id) + transcript = Transcript(transcript_id, name, None, chromosome, start, end, strand, gene_id) return transcript -def create_novel_transcript(chromosome, start, end, strand, gene_id, counter, - exons, introns): - """ Creates a novel transcript with a unique identifier (obtained using - counter). Returns the transcript object as well as the updated counter. + +def create_novel_transcript(chromosome, start, end, strand, gene_id, counter, exons, introns): + """Creates a novel transcript with a unique identifier (obtained using + counter). Returns the transcript object as well as the updated counter. 
""" counter["transcripts"] += 1 transcript_id = str(counter["transcripts"]) - - transcript = Transcript(transcript_id, chromosome, start, end, strand, - gene_id, None) + + transcript = Transcript(transcript_id, chromosome, start, end, strand, gene_id, None) for exon in exons: transcript.add_exon(exon) diff --git a/src/talon/transcript_utils.py b/src/talon/transcript_utils.py index 3a0ec1f..ce37048 100644 --- a/src/talon/transcript_utils.py +++ b/src/talon/transcript_utils.py @@ -5,10 +5,12 @@ import itertools import re + import pysam + def check_read_quality(sam_record: pysam.AlignedSegment, run_info): - """ Process an individual sam read and return quality attributes. """ + """Process an individual sam read and return quality attributes.""" read_ID = sam_record.query_name flag = sam_record.flag cigar = sam_record.cigarstring @@ -16,9 +18,9 @@ def check_read_quality(sam_record: pysam.AlignedSegment, run_info): read_length = sam_record.query_length if not run_info.use_cb_tag: - dataset = sam_record.get_tag('RG') + dataset = sam_record.get_tag("RG") elif run_info.use_cb_tag: - dataset = sam_record.get_tag('CB') + dataset = sam_record.get_tag("CB") # Only use uniquely mapped transcripts if flag not in [0, 16]: @@ -30,7 +32,7 @@ def check_read_quality(sam_record: pysam.AlignedSegment, run_info): # Locate the MD field of the sam transcript try: - md_tag = sam_record.get_tag('MD') + md_tag = sam_record.get_tag("MD") except KeyError: raise ValueError("SAM transcript %s lacks an MD tag" % read_ID) @@ -39,120 +41,125 @@ def check_read_quality(sam_record: pysam.AlignedSegment, run_info): coverage = compute_alignment_coverage(cigar) identity = compute_alignment_identity(md_tag, seq) - if coverage < run_info.min_coverage or \ - identity < run_info.min_identity: + if coverage < run_info.min_coverage or identity < run_info.min_identity: return [dataset, read_ID, 0, 1, read_length, coverage, identity] # At this point, the read has passed the quality control return [dataset, read_ID, 1, 1, read_length, coverage, identity] + def compute_alignment_coverage(CIGAR): - """ This function computes what fraction of the read is actually aligned to - the genome by excluding hard or soft-clipped bases.""" + """This function computes what fraction of the read is actually aligned to + the genome by excluding hard or soft-clipped bases.""" total_bases = 0.0 unaligned_bases = 0.0 ops, counts = split_cigar(CIGAR) - for op,ct in zip(ops, counts): + for op, ct in zip(ops, counts): if op == "N": continue if op == "H" or op == "S": unaligned_bases += ct total_bases += ct - return (total_bases - unaligned_bases)/total_bases + return (total_bases - unaligned_bases) / total_bases + def compute_alignment_identity(MD_tag, SEQ): - """ This function computes what fraction of the read matches the reference - genome.""" + """This function computes what fraction of the read matches the reference + genome.""" total_bases = len(SEQ) matches = 0.0 ops, counts = splitMD(MD_tag) - for op,ct in zip(ops, counts): + for op, ct in zip(ops, counts): if op == "M": matches += ct if op == "D": total_bases += ct - return matches/total_bases + return matches / total_bases + def splitMD(MD): - """ Takes MD tag and splits into two lists: - one with capital letters (match operators), and one with - the number of bases that each operation applies to. """ - - operations = [] - - # Split MD string where type changes. - # Digits are separated from base changes. - # Deletions (with ^) are captured together. 
- counts = ["".join(x) for _, x in itertools.groupby(MD, key=str.isdigit)] - - # Get operations - for i in range(0,len(counts)): - curr = counts[i] - try: - counts[i] = int(curr) - operations.append("M") - except ValueError: - # Handle deletion - if curr.startswith("^"): - operations.append("D") - counts[i] = len(counts[i]) - 1 - else: - operations.append("X") - counts[i] = len(counts[i]) - - return operations, counts + """Takes MD tag and splits into two lists: + one with capital letters (match operators), and one with + the number of bases that each operation applies to.""" + + operations = [] + + # Split MD string where type changes. + # Digits are separated from base changes. + # Deletions (with ^) are captured together. + counts = ["".join(x) for _, x in itertools.groupby(MD, key=str.isdigit)] + + # Get operations + for i in range(0, len(counts)): + curr = counts[i] + try: + counts[i] = int(curr) + operations.append("M") + except ValueError: + # Handle deletion + if curr.startswith("^"): + operations.append("D") + counts[i] = len(counts[i]) - 1 + else: + operations.append("X") + counts[i] = len(counts[i]) + + return operations, counts + def split_cigar(cigar): - """ Takes CIGAR string from SAM and splits it into two lists: - one with capital letters (match operators), and one with - the number of bases that each operation applies to. """ + """Takes CIGAR string from SAM and splits it into two lists: + one with capital letters (match operators), and one with + the number of bases that each operation applies to.""" - alignTypes = re.sub('[0-9]', " ", cigar).split() - counts = re.sub('[=A-Z]', " ", cigar).split() + alignTypes = re.sub("[0-9]", " ", cigar).split() + counts = re.sub("[=A-Z]", " ", cigar).split() counts = [int(i) for i in counts] return alignTypes, counts + def compute_transcript_end(start, cigar): - """ Given the start position and CIGAR string of a mapped SAM transcript, - compute the end position in the reference genome. - Args: - start: The start position of the transcript with respect to the - forward strand + """Given the start position and CIGAR string of a mapped SAM transcript, + compute the end position in the reference genome. + Args: + start: The start position of the transcript with respect to the + forward strand - cigar: SAM CIGAR string describing match operations to the reference - genome + cigar: SAM CIGAR string describing match operations to the reference + genome - Returns: - end position of the transcript. + Returns: + end position of the transcript. """ end = start ops, counts = split_cigar(cigar) - for op,ct in zip(ops, counts): + for op, ct in zip(ops, counts): if op in ["=", "H", "M", "N", "D"]: end += ct return end - 1 -def compute_jI(start, cigar): - """ If the input sam file doesn't have the custom STARlong-derived jI tag, - we need to compute it. This is done by stepping through the CIGAR - string, where introns are represented by the N operation. - - start: The start position of the transcript with respect to the - forward strand - cigar: SAM CIGAR string describing match operations to the reference - genome - Returns: jI string representation of intron start and end positions. - Example jI strings: - no introns: jI:B:i,-1 - two introns: jI:B:i,167936516,167951806,167951862,167966628 +def compute_jI(start, cigar): + """If the input sam file doesn't have the custom STARlong-derived jI tag, + we need to compute it. This is done by stepping through the CIGAR + string, where introns are represented by the N operation. 
+ + start: The start position of the transcript with respect to the + forward strand + cigar: SAM CIGAR string describing match operations to the reference + genome + Returns: jI string representation of intron start and end positions. + + Example jI strings: + no introns: jI:B:i,-1 + two introns: jI:B:i,167936516,167951806,167951862,167966628 """ operations, counts = split_cigar(cigar) @@ -160,7 +167,7 @@ def compute_jI(start, cigar): genomePos = start # Iterate over cigar operations - for op,ct in zip(operations, counts): + for op, ct in zip(operations, counts): if op == "N": # This is an intron intronStart = genomePos @@ -181,22 +188,22 @@ def compute_jI(start, cigar): def get_introns(sam_record: pysam.AlignedSegment, start, cigar): - """ Locates the jI field in a list of SAM fields or computes - it from the CIGAR string and start position if it isn't found. - Note that positions refer to start and endpoints of introns, not exons, - so adjustments are needed to avoid an off-by-one error if you want exons. - - Example jI strings: - no introns: jI:B:i,-1 - two introns: jI:B:i,167936516,167951806,167951862,167966628 - Args: - sam_record: a pysam AlignedSegment - start: The start position of the transcript with respect to the - forward strand - cigar: SAM CIGAR string describing match operations to the reference - genome - Returns: - intron_list: intron starts and ends in a list (sorted order) + """Locates the jI field in a list of SAM fields or computes + it from the CIGAR string and start position if it isn't found. + Note that positions refer to start and endpoints of introns, not exons, + so adjustments are needed to avoid an off-by-one error if you want exons. + + Example jI strings: + no introns: jI:B:i,-1 + two introns: jI:B:i,167936516,167951806,167951862,167966628 + Args: + sam_record: a pysam AlignedSegment + start: The start position of the transcript with respect to the + forward strand + cigar: SAM CIGAR string describing match operations to the reference + genome + Returns: + intron_list: intron starts and ends in a list (sorted order) """ try: intron_list = sam_record.get_tag("jI").tolist() From e8f4ca55b9af7fe3d99d08b0c5af251399ee62a7 Mon Sep 17 00:00:00 2001 From: fairliereese Date: Mon, 9 Oct 2023 11:15:15 -0700 Subject: [PATCH 25/31] added more read assignment info to the logger --- src/talon/logger.py | 2 +- src/talon/talon.py | 91 ++++++++++++++++++++++++++------------------- 2 files changed, 54 insertions(+), 39 deletions(-) diff --git a/src/talon/logger.py b/src/talon/logger.py index 31103d7..0d2ede7 100644 --- a/src/talon/logger.py +++ b/src/talon/logger.py @@ -9,7 +9,7 @@ def _init_logger(verbosity): level = levels[min(verbosity, len(levels) - 1)] # cap to last level index # set defaults - msg_fmt = "%(asctime)s : %(levelname)s : [%(filename)s:%(lineno)d] : %(message)s" + msg_fmt = "%(asctime)s : %(levelname)s : [%(filename)s:%(lineno)d:%(funcName)s] : %(message)s" date_fmt = "[ %Y-%m-%d %H:%M:%S ]" logging.basicConfig(level=level, format=msg_fmt, datefmt=date_fmt) diff --git a/src/talon/talon.py b/src/talon/talon.py index 79daa18..da96c93 100644 --- a/src/talon/talon.py +++ b/src/talon/talon.py @@ -475,6 +475,8 @@ def create_edge(vertex_1, vertex_2, edge_type, strand, edge_dict): def create_gene(chromosome, start, end, strand, memory_cursor, tmp_gene): """Create a novel gene and add it to the temporary table.""" new_ID = gene_counter.increment() + logging.debug(f'Creating new gene with id {new_ID}') + new_gene = (new_ID, chromosome, min(start, end), max(start, 
end), strand) cols = ' ("gene_ID", "chromosome", "start", "end", "strand")' @@ -489,9 +491,10 @@ def create_transcript( """Creates a novel transcript, add it to the transcript data structure, and add to tmp_t """ - print("creating new transcript") + # print("creating new transcript") new_ID = transcript_counter.increment() - print(f"new tid:{new_ID}") + # print(f"new tid:{new_ID}") + logging.debug(f'Creating new transcript with id {new_ID}') # updating the dict if len(edge_IDs) > 1: @@ -942,7 +945,8 @@ def process_ISM( # choose gene to assign it to gene_matches = list(set([match["gene_ID"] for match in all_matches])) - print(gene_matches) + logging.debug(f'Genes with matching vertices: {gene_matches}') + # print(gene_matches) # tie break based on distance to 5' / 3' ends if len(gene_matches) > 1: @@ -1202,6 +1206,8 @@ def find_gene_match_on_vertex_basis(vertex_IDs, strand, vertex_2_gene): needs to be created fusion (bool): Whether gene read is from might be fusion / read through """ + logging.debug('Attempting to assign gene based on vertex concordance') + gene_matches = [] n_gene_matches = [] @@ -1243,15 +1249,14 @@ def find_gene_match_on_vertex_basis(vertex_IDs, strand, vertex_2_gene): # when there are no shared splice sites between gene hits but we did # hit more than one gene elif max(n_gene_matches) <= 1 and len(gene_tally) > 1: - print(" went here") + logging.debug('Found a potential fusion transcript') return None, True # if we hit more than one gene and they have overlapping sjs, - # tie break based on ????? elif len(gene_tally) > 1: - print("i found more than one gene") - print(gene_tally) - print(n_gene_matches) + logging.debug('Found more than one gene w/ overlapping vertices') + # print(gene_tally) + # print(n_gene_matches) return list(gene_tally.keys()), False # temp = df.loc[df.gid.isin(gene_matches)].copy(deep=True) # temp = temp.drop_duplicates() @@ -1271,6 +1276,8 @@ def find_gene_match_on_vertex_basis(vertex_IDs, strand, vertex_2_gene): else: gene_ID = max(gene_tally, key=gene_tally.get) fusion = False + logging.debug(f'Assigning this read to gene {gene_ID}') + return gene_ID, fusion @@ -1323,13 +1330,15 @@ def process_NNC( gene_starts, gene_ends, ) - print("gene id process_nnc") - print(gene_ID) - print(fusion) + # print("gene id process_nnc") + # print(gene_ID) + # print(fusion) if gene_ID == None: return None, None, [], None, fusion + logging.debug(f'Assigning this read to gene {gene_ID}') + # Get matches for the ends start_vertex, start_exon, start_novelty, known_start, diff_5p = process_5p( chrom, positions, strand, vertex_IDs, gene_ID, gene_starts, edge_dict, locations, run_info @@ -1457,7 +1466,7 @@ def process_remaining_mult_cases( transcript_novelty = [] start_end_info = {} if not run_info.create_novel_spliced_genes or not fusion: - print("did i get here?") + # print("did i get here?") gene_ID, match_strand = search_for_overlap_with_gene( chrom, positions[0], positions[-1], strand, cursor, run_info, tmp_gene, tmp_t ) @@ -1487,13 +1496,13 @@ def process_remaining_mult_cases( start_end_info["vertex_IDs"] = vertex_IDs if gene_ID == None: - print(f"fusion: {fusion}") + # print(f"fusion: {fusion}") if fusion: - print("i should be here") + # print("i should be here") t_nov = "fusion_transcript" g_nov = "fusion_novel" else: - print("but I think im going here") + # print("but I think im going here") t_nov = "intergenic_transcript" g_nov = "intergenic_novel" @@ -1591,7 +1600,8 @@ def identify_transcript( all_exons_known = check_all_exons_known(e_novelty) 
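    # (Editor's note, an illustrative aside: by the convention of the
    #  check_all_* helpers, exons occupy the odd indices of this novelty list
    #  (novelty[1::2]) and splice junctions the even ones, with 1 meaning
    #  novel. A read with e_novelty = [0, 1, 0], known junctions flanking one
    #  novel internal exon, therefore gets all_exons_known == False while
    #  all_SJs_known stays True.)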
splice_vertices_known = sum(v_novelty) == 0 all_exons_novel = reduce(operator.mul, e_novelty, 1) == 1 - print(f"all exons novel : {all_exons_novel}") + # print(f"all exons novel : {all_exons_novel}") + logging.debug(f'All exons novel?: {all_exons_novel}') fusion = False # Look for FSM or ISM. @@ -1600,7 +1610,8 @@ def identify_transcript( all_matches = search_for_ISM(edge_IDs, transcript_dict) if all_matches != None: # Look for FSM first - print("looking for fsm") + # print("looking for fsm") + logging.debug('Looking for FSMs') gene_ID, transcript_ID, transcript_novelty, start_end_info = process_FSM( chrom, positions, @@ -1616,7 +1627,8 @@ def identify_transcript( ) if gene_ID == None: # Now look for ISM - print("looking for ism") + # print("looking for ism") + logging.debug('Looking for ISM') gene_ID, transcript_ID, transcript_novelty, start_end_info = process_ISM( chrom, positions, @@ -1634,11 +1646,11 @@ def identify_transcript( tmp_gene, tmp_t, ) - print(f"gene id from process ism {gene_ID}") # Look for NIC if gene_ID == None: - print("looking for nic") + # print("looking for nic") + logging.debug('Looking for NIC') gene_ID, transcript_ID, transcript_novelty, start_end_info, fusion = process_NIC( chrom, positions, @@ -1660,7 +1672,8 @@ def identify_transcript( # Novel in catalog transcripts have known splice donors and acceptors, # but new connections between them. elif splice_vertices_known and gene_ID == None: - print("looking for nic (again?)") + # print("looking for nic (again?)") + logging.info('Looking for NIC (2)') gene_ID, transcript_ID, transcript_novelty, start_end_info, fusion = process_NIC( chrom, positions, @@ -1681,7 +1694,8 @@ def identify_transcript( # Antisense transcript with splice junctions matching known gene if splice_vertices_known and gene_ID == None and not fusion: - print("looking for spliced antisese") + # print("looking for spliced antisese") + logging.debug('Looking for splice antisense') gene_ID, transcript_ID, gene_novelty, transcript_novelty, start_end_info = process_spliced_antisense( chrom, positions, @@ -1704,7 +1718,8 @@ def identify_transcript( # and contain at least one splice junction. 
There should also be at least # one shared exon from existing transcripts to even try assigning a gene elif not (splice_vertices_known) and not fusion and not all_exons_novel: - print("lookign for NNCs") + # print("lookign for NNCs") + logging.debug('Looking for NNC') gene_ID, transcript_ID, transcript_novelty, start_end_info, fusion = process_NNC( chrom, positions, @@ -1722,10 +1737,11 @@ def identify_transcript( tmp_gene, tmp_t, ) - print(f"geneID from process_nnc: {gene_ID}") + # print(f"geneID from process_nnc: {gene_ID}") # Transcripts that don't match the previous categories end up here if gene_ID == None: - print("looking for this other stuff") + # print("looking for this other stuff") + logging.debug('Looking for everything else') gene_ID, transcript_ID, gene_novelty, transcript_novelty, start_end_info = process_remaining_mult_cases( chrom, positions, @@ -1745,8 +1761,8 @@ def identify_transcript( fusion, ) - print("this is the gene id it decided on") - print(gene_ID) + logging.debug(f'Gene ID for this read: {gene_ID}') + # Add all novel vertices to vertex_2_gene now that we have the gene ID vertex_IDs = start_end_info["vertex_IDs"] edge_IDs = start_end_info["edge_IDs"] @@ -2434,7 +2450,7 @@ def batch_add_vertex2gene(cursor, v2g_file, batch_size): cursor.executemany(command, batch) except Exception as e: - print(e) + logging.error(e) sys.exit(1) return @@ -2459,7 +2475,7 @@ def batch_add_locations(cursor, location_file, batch_size): cursor.executemany(command, batch) except Exception as e: - print(e) + logging.error(e) sys.exit(1) return @@ -2484,7 +2500,7 @@ def batch_add_edges(cursor, edge_file, batch_size): cursor.executemany(command, batch) except Exception as e: - print(e) + logging.error(e) sys.exit(1) return @@ -2530,7 +2546,7 @@ def batch_add_transcripts(cursor, transcript_file, batch_size): cursor.executemany(command, batch) except Exception as e: - print(e) + logging.error(e) sys.exit(1) return @@ -2552,7 +2568,7 @@ def batch_add_genes(cursor, gene_file, batch_size): cursor.executemany(command, batch) except Exception as e: - print(e) + logging.error(e) sys.exit(1) return @@ -2568,7 +2584,7 @@ def add_datasets(cursor, datasets): cursor.executemany(command, datasets) except Exception as e: - print(e) + logging.error(e) sys.exit(1) return @@ -2598,7 +2614,7 @@ def batch_add_annotations(cursor, annot_file, annot_type, batch_size): cursor.executemany(command, batch) except Exception as e: - print(e) + logging.error(e) sys.exit(1) return @@ -2682,7 +2698,7 @@ def batch_add_observed(cursor, observed_file, batch_size): cursor.executemany(command, batch) except Exception as e: - print(e) + logging.error(e) sys.exit(1) # Now create abundance tuples and add to DB @@ -2712,7 +2728,7 @@ def batch_add_abundance(cursor, entries, batch_size): command = 'INSERT INTO "abundance"' + cols + "VALUES " + "(?,?,?)" cursor.executemany(command, batch) except Exception as e: - print(e) + logging.error(e) sys.exit(1) return @@ -2929,8 +2945,7 @@ def annotate_read(sam_record: pysam.AlignedSegment, cursor, run_info, struct_col """ # Parse attributes to determine the chromosome, positions, and strand of the transcript read_ID = sam_record.query_name - print() - print(read_ID) + logging.debug(read_ID) if not run_info.use_cb_tag: dataset = sam_record.get_tag("RG") else: From 129a8ff3f91592d21962053bbbf1beea37e63475 Mon Sep 17 00:00:00 2001 From: fairliereese Date: Mon, 9 Oct 2023 14:54:31 -0700 Subject: [PATCH 26/31] added adjustments to tests to accomodate new function signatures --- 
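Editor's note: the recurring pattern these adjustments introduce is condensed
below. This is a sketch assembled from the diffs that follow, not runnable
outside the testing suite (it assumes the scratch toy database and the
get_db_cursor helper from helper_fns): every identification test now builds a
temp transcript table next to the temp gene table and passes both names
through the call.

    from talon import talon, init_refs
    from .helper_fns import get_db_cursor

    conn, cursor = get_db_cursor()
    build = "toy_build"
    talon.get_counters("scratch/toy.db")
    run_info = talon.init_run_info("scratch/toy.db", build)

    # Reference structures, built exactly as in the tests below
    location_dict = init_refs.make_location_dict(build, cursor)
    edge_dict = init_refs.make_edge_dict(cursor)
    transcript_dict = init_refs.make_transcript_dict(cursor, build)
    vertex_2_gene = init_refs.make_vertex_2_gene_dict(cursor)
    gene_starts = init_refs.make_gene_start_or_end_dict(cursor, build, "start")
    gene_ends = init_refs.make_gene_start_or_end_dict(cursor, build, "end")

    # Both temp tables must now exist before identification...
    init_refs.make_temp_novel_gene_table(cursor, build)
    init_refs.make_temp_transcript_table(cursor, build)

    # ...and their names travel with the call as two trailing arguments.
    chrom, strand, positions = "chr1", "+", (1, 100, 900, 1000)
    annotation = talon.identify_transcript(chrom, positions, strand, cursor,
                                           location_dict, edge_dict,
                                           transcript_dict, vertex_2_gene,
                                           gene_starts, gene_ends, run_info,
                                           "temp_gene", "temp_transcript")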
src/talon/talon.py | 52 ++++++++---- testing_suite/build_test_databases.py | 5 +- testing_suite/test_ISM_identification.py | 35 ++++---- testing_suite/test_NIC_identification.py | 15 +++- testing_suite/test_NNC_identification.py | 6 +- testing_suite/test_all_exons_known.py | 12 ++- testing_suite/test_all_exons_novel.py | 43 ++++++++++ testing_suite/test_database_updates.py | 25 +++--- testing_suite/test_get_overlap.py | 14 ++-- testing_suite/test_identification_flow.py | 81 +++++++++++++------ testing_suite/test_monoexonic.py | 17 ++-- ...on_read_overlapping_monoexon_transcript.py | 17 ++-- .../test_process_remaining_mult_cases.py | 22 ++++- .../test_search_for_overlap_with_gene.py | 53 +++++++----- 14 files changed, 275 insertions(+), 122 deletions(-) create mode 100644 testing_suite/test_all_exons_novel.py diff --git a/src/talon/talon.py b/src/talon/talon.py index da96c93..b94626e 100644 --- a/src/talon/talon.py +++ b/src/talon/talon.py @@ -534,6 +534,7 @@ def check_all_exons_known(novelty): exons are known or not. Return True if all are known, and False otherwise. Input should not include first or last exon.""" + if len(novelty) == 1: return novelty[0] == 0 @@ -544,6 +545,22 @@ def check_all_exons_known(novelty): else: return True +def check_all_exons_novel(novelty): + """Given a list in which each element represents the novelty (1) or + known-ness of a transcript edge (0), determine whether all of the + exons are novel or not. Return True if all are novel, and False + otherwise. Input should not include first or last exon.""" + + if len(novelty) == 1: + return 0 # we have no exons to analyze + + exons = novelty[1::2] + + if sum(exons) != len(exons): + return False + else: + return True + def check_all_SJs_known(novelty): """Given a list in which each element represents the novelty (1) or @@ -694,7 +711,7 @@ def search_for_overlap_with_gene(chromosome, start, end, strand, cursor, run_inf # restrict to just the genes we care about if gene_IDs: # print(f'restricting just to {gene_IDs}') - logging.debug(f"Restricing gene tiebreak to {gene_IDs}") + logging.debug(f"Restricting gene tiebreak to {gene_IDs}") matches = [match for match in matches if match["gene_ID"] in gene_IDs] if len(matches) == 0: @@ -728,9 +745,10 @@ def get_best_match(matches, min_end, max_end): # print(f'read min: {min_end}') # print(f'read end: {max_end}') - logging.debug(f"Read start / end: ({min_end}, {min_end})") + logging.debug(f"Read start / end: ({min_end}, {max_end})") for match in matches: + logging.debug('') logging.debug(f"Matching with transcripts from gene {match['gene_ID']}, transcript {match['transcript_ID']}") end_dist = abs(match["max_pos"] - max_end) start_dist = abs(match["min_pos"] - min_end) @@ -1052,8 +1070,7 @@ def process_ISM( def assign_gene( - vertex_IDs, strand, vertex_2_gene, chrom, start, end, cursor, run_info, tmp_gene, tmp_t, gene_starts, gene_ends -): + vertex_IDs, strand, vertex_2_gene, chrom, start, end, cursor, run_info, tmp_gene, tmp_t): """ Assign a gene to a transcript. First do this on the basis of splice site matching. 
If this yields more than one gene, then choose the gene with the @@ -1111,9 +1128,7 @@ def process_NIC( cursor, run_info, tmp_gene, - tmp_t, - gene_starts, - gene_ends, + tmp_t ) # gene_ID, fusion = find_gene_match_on_vertex_basis(vertex_IDs, @@ -1326,9 +1341,7 @@ def process_NNC( cursor, run_info, tmp_gene, - tmp_t, - gene_starts, - gene_ends, + tmp_t ) # print("gene id process_nnc") # print(gene_ID) @@ -1466,7 +1479,6 @@ def process_remaining_mult_cases( transcript_novelty = [] start_end_info = {} if not run_info.create_novel_spliced_genes or not fusion: - # print("did i get here?") gene_ID, match_strand = search_for_overlap_with_gene( chrom, positions[0], positions[-1], strand, cursor, run_info, tmp_gene, tmp_t ) @@ -1496,7 +1508,7 @@ def process_remaining_mult_cases( start_end_info["vertex_IDs"] = vertex_IDs if gene_ID == None: - # print(f"fusion: {fusion}") + logging.debug(f"Fusion: {fusion}") if fusion: # print("i should be here") t_nov = "fusion_transcript" @@ -1590,18 +1602,24 @@ def identify_transcript( # Get vertex matches for the transcript positions vertex_IDs, v_novelty = match_splice_vertices(chrom, positions, strand, location_dict, run_info) + logging.debug(f'Vertex IDs: {vertex_IDs}') + logging.debug(f'Vertex novelties: {v_novelty}') + # Get edge matches for transcript exons and introns based on the vertices edge_IDs, e_novelty = match_all_splice_edges(vertex_IDs, strand, edge_dict, run_info) + logging.debug(f'Edge IDs: {edge_IDs}') + logging.debug(f'Exon novelty: {e_novelty}') # Check novelty of exons and splice jns. This will help us categorize # what type of novelty the transcript has all_SJs_known = check_all_SJs_known(e_novelty) all_exons_known = check_all_exons_known(e_novelty) - splice_vertices_known = sum(v_novelty) == 0 - all_exons_novel = reduce(operator.mul, e_novelty, 1) == 1 + all_exons_novel = check_all_exons_novel(e_novelty) + splice_vertices_known = (sum(v_novelty) == 0) + # all_exons_novel = reduce(operator.mul, e_novelty, 1) == 1 # print(f"all exons novel : {all_exons_novel}") - logging.debug(f'All exons novel?: {all_exons_novel}') + logging.debug(f'All internal exons novel?: {all_exons_novel}') fusion = False # Look for FSM or ISM. @@ -1673,7 +1691,7 @@ def identify_transcript( # but new connections between them. 
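    # (Editor's note, a concrete case: the toy-genome read at positions
    #  (1, 100, 900, 1000) skips the middle exon of TG1, so every splice
    #  vertex is known but the skipping intron edge is not. all_SJs_known
    #  comes out False while splice_vertices_known stays True, which is
    #  exactly the combination this branch catches and labels NIC.)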
elif splice_vertices_known and gene_ID == None: # print("looking for nic (again?)") - logging.info('Looking for NIC (2)') + logging.debug('Looking for NIC (2)') gene_ID, transcript_ID, transcript_novelty, start_end_info, fusion = process_NIC( chrom, positions, @@ -2945,6 +2963,8 @@ def annotate_read(sam_record: pysam.AlignedSegment, cursor, run_info, struct_col """ # Parse attributes to determine the chromosome, positions, and strand of the transcript read_ID = sam_record.query_name + logging.debug('') + logging.debug('') logging.debug(read_ID) if not run_info.use_cb_tag: dataset = sam_record.get_tag("RG") diff --git a/testing_suite/build_test_databases.py b/testing_suite/build_test_databases.py index 4baed17..f5dcff4 100644 --- a/testing_suite/build_test_databases.py +++ b/testing_suite/build_test_databases.py @@ -232,7 +232,8 @@ "--3p", "300", "--idprefix", "TALON", "--l", "0", - "--g", "hg38", "--o", "scratch/readthrough"]) + "--g", "hg38", + "--o", "scratch/readthrough"]) except Exception as e: print(e) sys.exit("Database initialization failed on readthrough annotation") @@ -246,6 +247,8 @@ "--build", "hg38", "--cov", "0", "--identity", "0", + "--create_novel_spliced_genes", + '-v', '2', "--o", "scratch/readthrough" ]) except Exception as e: print(e) diff --git a/testing_suite/test_ISM_identification.py b/testing_suite/test_ISM_identification.py index e920215..7685d58 100644 --- a/testing_suite/test_ISM_identification.py +++ b/testing_suite/test_ISM_identification.py @@ -13,7 +13,8 @@ def test_ISM_suffix(self): database = "scratch/toy.db" run_info = talon.init_run_info(database, build) talon.get_counters(database) - + init_refs.make_temp_novel_gene_table(cursor, build) + init_refs.make_temp_transcript_table(cursor, build) edge_dict = init_refs.make_edge_dict(cursor) location_dict = init_refs.make_location_dict(build, cursor) transcript_dict = init_refs.make_transcript_dict(cursor, build) @@ -28,17 +29,20 @@ def test_ISM_suffix(self): v_novelty = [0, 0] all_matches = talon.search_for_ISM(edge_IDs, transcript_dict) - gene_ID, transcript_ID, novelty, start_end_info = talon.process_ISM(chrom, - positions, + gene_ID, transcript_ID, novelty, start_end_info = talon.process_ISM(chrom, + positions, strand, edge_IDs, - vertex_IDs, - all_matches, + vertex_IDs, + all_matches, transcript_dict, - gene_starts, gene_ends, - edge_dict, location_dict, - run_info) + gene_starts, gene_ends, + edge_dict, location_dict, + run_info, + cursor, + "temp_gene", + "temp_transcript") - correct_gene_ID = fetch_correct_ID("TG1", "gene", cursor) + correct_gene_ID = fetch_correct_ID("TG1", "gene", cursor) assert gene_ID == correct_gene_ID assert start_end_info["vertex_IDs"] == [3, 4, 5, 6] @@ -56,7 +60,8 @@ def test_ISM_prefix(self): database = "scratch/toy.db" run_info = talon.init_run_info(database, build) talon.get_counters(database) - + init_refs.make_temp_novel_gene_table(cursor, build) + init_refs.make_temp_transcript_table(cursor, build) edge_dict = init_refs.make_edge_dict(cursor) location_dict = init_refs.make_location_dict(build, cursor) transcript_dict = init_refs.make_transcript_dict(cursor, build) @@ -79,12 +84,15 @@ def test_ISM_prefix(self): transcript_dict, gene_starts, gene_ends, edge_dict, location_dict, - run_info) + run_info, + cursor, + "temp_gene", + "temp_transcript") correct_gene_ID = fetch_correct_ID("TG1", "gene", cursor) assert gene_ID == correct_gene_ID assert start_end_info["vertex_IDs"] == [1, 2, 3, 4] - assert start_end_info["edge_IDs"] == [1, 2, 3] + assert start_end_info["edge_IDs"] == 
[1, 2, 3] conn.close() @@ -112,5 +120,4 @@ def test_no_match(self): all_matches = talon.search_for_ISM(edge_IDs, transcript_dict) assert all_matches == None - conn.close() - + conn.close() diff --git a/testing_suite/test_NIC_identification.py b/testing_suite/test_NIC_identification.py index 92e10fe..1137eb9 100644 --- a/testing_suite/test_NIC_identification.py +++ b/testing_suite/test_NIC_identification.py @@ -16,6 +16,8 @@ def test_NIC_match(self): edge_dict = init_refs.make_edge_dict(cursor) location_dict = init_refs.make_location_dict(build, cursor) run_info = talon.init_run_info(database, build) + init_refs.make_temp_novel_gene_table(cursor, build) + init_refs.make_temp_transcript_table(cursor, build) transcript_dict = init_refs.make_transcript_dict(cursor, build) vertex_2_gene = init_refs.make_vertex_2_gene_dict(cursor) gene_starts = init_refs.make_gene_start_or_end_dict(cursor, build, "start") @@ -32,9 +34,12 @@ def test_NIC_match(self): positions, strand, edge_IDs, vertex_IDs, transcript_dict, - gene_starts, gene_ends, - edge_dict, location_dict, - vertex_2_gene, run_info) + gene_starts, gene_ends, edge_dict, location_dict, + vertex_2_gene, + run_info, + cursor, + "temp_gene", + "temp_transcript") correct_gene_ID = fetch_correct_ID("TG1", "gene", cursor) assert gene_ID == correct_gene_ID @@ -54,6 +59,8 @@ def test_antisense(self): edge_dict = init_refs.make_edge_dict(cursor) locations = init_refs.make_location_dict(build, cursor) run_info = talon.init_run_info(database, build) + init_refs.make_temp_novel_gene_table(cursor, build) + init_refs.make_temp_transcript_table(cursor, build) transcript_dict = init_refs.make_transcript_dict(cursor, build) vertex_2_gene = init_refs.make_vertex_2_gene_dict(cursor) gene_starts = init_refs.make_gene_start_or_end_dict(cursor, build, "start") @@ -82,7 +89,7 @@ def test_antisense(self): gene_ends, edge_dict, locations, vertex_2_gene, run_info, - cursor, "temp_gene") + cursor, "temp_gene", "temp_transcript") #anti_gene_ID = talon.find_gene_match_on_vertex_basis(vertex_IDs, # anti_strand, # vertex_2_gene) diff --git a/testing_suite/test_NNC_identification.py b/testing_suite/test_NNC_identification.py index 63f1cd6..7b9e852 100644 --- a/testing_suite/test_NNC_identification.py +++ b/testing_suite/test_NNC_identification.py @@ -16,6 +16,8 @@ def test_NNC_match(self): edge_dict = init_refs.make_edge_dict(cursor) location_dict = init_refs.make_location_dict(build, cursor) run_info = talon.init_run_info(database, build) + init_refs.make_temp_novel_gene_table(cursor, build) + init_refs.make_temp_transcript_table(cursor, build) transcript_dict = init_refs.make_transcript_dict(cursor, build) vertex_2_gene = init_refs.make_vertex_2_gene_dict(cursor) gene_starts = init_refs.make_gene_start_or_end_dict(cursor, build, "start") @@ -34,7 +36,9 @@ def test_NNC_match(self): vertex_IDs, transcript_dict, gene_starts, gene_ends, edge_dict, location_dict, - vertex_2_gene, run_info) + vertex_2_gene, run_info, + cursor, "temp_gene", + "temp_transcript") correct_gene_ID = fetch_correct_ID("TG1", "gene", cursor) assert gene_ID == correct_gene_ID diff --git a/testing_suite/test_all_exons_known.py b/testing_suite/test_all_exons_known.py index 831e433..52f358f 100644 --- a/testing_suite/test_all_exons_known.py +++ b/testing_suite/test_all_exons_known.py @@ -8,7 +8,7 @@ def test_find_true(self): """ Example where all of the exons are known. 
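        (Editor's note: throughout this file the novelty list excludes the
        first and last exon, so entries run intron, exon, intron, ... and
        the exon checks read the odd indices, novelty[1::2].)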
""" # Remember that first pos is first intron, last is last intron - novelty = [0, 0, 0, 0, 0 ] + novelty = [0, 0, 0, 0, 0 ] # Make sure that no match got returned assert talon.check_all_exons_known(novelty) == True @@ -21,17 +21,23 @@ def test_find_true_with_novel_exons(self): # Make sure that no match got returned assert talon.check_all_exons_known(novelty) == True - + def test_find_false(self): """ Example with novel exons """ novelty = [0, 1, 0, 1, 0] # Make sure that no match got returned - assert talon.check_all_exons_known(novelty) == False + assert talon.check_all_exons_known(novelty) == False def test_monoexonic(self): """ Monoexonic known exon """ novelty = [0] assert talon.check_all_exons_known(novelty) == True + + def test_no_internal_exons(self): + """ No internal exons """ + novelty = [0, 0, 0] + + assert talon.check_all_exons_known(novelty) == True diff --git a/testing_suite/test_all_exons_novel.py b/testing_suite/test_all_exons_novel.py new file mode 100644 index 0000000..004f165 --- /dev/null +++ b/testing_suite/test_all_exons_novel.py @@ -0,0 +1,43 @@ +import pytest +from talon import talon +@pytest.mark.dbunit + +class TestAllExonsNovel(object): + + def test_find_true(self): + """ Example where all of the exons are known. + """ + # Remember that first pos is first intron, last is last intron + novelty = [0, 0, 0, 0, 0 ] + + # Make sure that no match got returned + assert talon.check_all_exons_novel(novelty) == False + + def test_find_true_with_novel_exons(self): + """ Example where all of the exons are known, but the introns are not. + Note: This is not necessarily realistic biologically. + """ + novelty = [1, 0, 1, 0, 1] + + # Make sure that no match got returned + assert talon.check_all_exons_novel(novelty) == False + + def test_find_false(self): + """ Example with novel exons + """ + novelty = [0, 1, 0, 1, 0] + + # Make sure that no match got returned + assert talon.check_all_exons_novel(novelty) == True + + def test_monoexonic(self): + """ Monoexonic known exon """ + novelty = [0] + + assert talon.check_all_exons_novel(novelty) == False + + def test_no_internal_exons(self): + """ No internal exons """ + novelty = [0, 0, 0] + + assert talon.check_all_exons_novel(novelty) == False diff --git a/testing_suite/test_database_updates.py b/testing_suite/test_database_updates.py index d4927c2..36cffbf 100644 --- a/testing_suite/test_database_updates.py +++ b/testing_suite/test_database_updates.py @@ -55,7 +55,7 @@ def test_observed(self): with open("scratch/db_updates/observed.tsv", 'w') as f: for obs in observed: f.write("\t".join([str(x) for x in obs]) + "\n") - + batch_size = 1 talon.batch_add_observed(cursor, "scratch/db_updates/observed.tsv", batch_size) @@ -100,7 +100,7 @@ def test_gene_annot(self): f.write("\t".join([str(x) for x in entry]) + "\n") batch_size = 1 - talon.batch_add_annotations(cursor, "scratch/db_updates/gene_annot.tsv", + talon.batch_add_annotations(cursor, "scratch/db_updates/gene_annot.tsv", "gene", batch_size) # Test if items are there @@ -125,7 +125,7 @@ def test_transcript_annot(self): f.write("\t".join([str(x) for x in entry]) + "\n") batch_size = 2 - talon.batch_add_annotations(cursor, "scratch/db_updates/transcript_annot.tsv", + talon.batch_add_annotations(cursor, "scratch/db_updates/transcript_annot.tsv", "transcript", batch_size) # Test if items are there @@ -150,7 +150,7 @@ def test_exon_annot(self): f.write("\t".join([str(x) for x in entry]) + "\n") batch_size = 3 - talon.batch_add_annotations(cursor, "scratch/db_updates/exon_annot.tsv", + 
talon.batch_add_annotations(cursor, "scratch/db_updates/exon_annot.tsv", "exon", batch_size) # Test if items are there @@ -180,7 +180,7 @@ def test_gene_update(self): talon.batch_add_genes(cursor, "scratch/db_updates/genes.tsv", 10) - # Test if gene with ID 6 is there, but make sure we didn't add + # Test if gene with ID 6 is there, but make sure we didn't add # duplicates of the other genes query = "SELECT * FROM genes" gene_IDs = [ x['gene_ID'] for x in cursor.execute(query)] @@ -189,15 +189,18 @@ def test_gene_update(self): conn.close() def test_transcript_update(self): - """ Try to add novel transcript entries to database while ignoring + """ Try to add novel transcript entries to database while ignoring duplicates """ conn, cursor = get_db_cursor() build = "toy_build" transcript_dict = init_refs.make_transcript_dict(cursor, build) + init_refs.make_temp_transcript_table(cursor, build) + database = "scratch/toy.db" talon.get_counters(database) - talon.create_transcript("chr1", 1, 1000, 1, (1,), (1,2), transcript_dict) + talon.create_transcript('+', "chr1", 1, 1000, 1, (1,), (1,2), transcript_dict, + "temp_transcript", cursor) # Write to file os.system("mkdir -p scratch/db_updates/") @@ -257,7 +260,7 @@ def test_edge_update(self): batch_size = 10 talon.batch_add_edges(cursor, "scratch/db_updates/edges.tsv", batch_size) - + # Test if the edge table has the correct number of edges now query = "SELECT * FROM edge" cursor.execute(query) @@ -277,7 +280,7 @@ def test_location_update(self): orig_n_pos = talon.vertex_counter.value() talon.create_vertex("chr4", 2000, location_dict, run_info) - + # Write to file os.system("mkdir -p scratch/db_updates/") with open("scratch/db_updates/loc.tsv", 'w') as f: @@ -307,12 +310,12 @@ def test_vertex2gene_update(self): build = "toy_build" vertex_2_gene = init_refs.make_vertex_2_gene_dict(cursor) - talon.update_vertex_2_gene(2, (1,2), "-", vertex_2_gene) + talon.update_vertex_2_gene(2, (1,2), "-", vertex_2_gene) talon.update_vertex_2_gene(1, (1,2,3,4,5,6), "+", vertex_2_gene) # Write to file os.system("mkdir -p scratch/db_updates/") - with open("scratch/db_updates/v2g.tsv", 'w') as f: + with open("scratch/db_updates/v2g.tsv", 'w') as f: for vertex_ID, gene_set in vertex_2_gene.items(): for gene in gene_set: entry = "\t".join([ str(x) for x in (vertex_ID, gene[0])]) diff --git a/testing_suite/test_get_overlap.py b/testing_suite/test_get_overlap.py index f647892..1a52868 100644 --- a/testing_suite/test_get_overlap.py +++ b/testing_suite/test_get_overlap.py @@ -5,11 +5,11 @@ class TestGetOverlap(object): def test_1(self): """ Example where intervals of size 11 match exactly. So the answer - should be 11. + should be 11. """ - a = [ 10, 20 ] + a = [ 10, 20 ] b = [ 10, 20 ] - assert talon.get_overlap(a, b) == 11 + assert talon.get_overlap(a, b)[0] == 11 def test_2(self): """ Example where interval a is contained within interval b. The answer @@ -17,18 +17,18 @@ def test_2(self): """ a = [ 12, 18 ] b = [ 10, 20 ] - assert talon.get_overlap(a, b) == 7 + assert talon.get_overlap(a, b)[0] == 7 def test_3(self): """ Example where interval a starts and ends earlier than b. """ a = [ 10, 20 ] b = [ 15, 25 ] - assert talon.get_overlap(a, b) == 6 + assert talon.get_overlap(a, b)[0] == 6 def test_4(self): - """ Example with no overlap. + """ Example with no overlap. 
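        (Editor's note: the [0] indexing added throughout this file tracks
        the patched get_overlap, which now returns a tuple whose first
        element is the overlap length.)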
""" a = [ 10, 20 ] b = [ 30, 40 ] - assert talon.get_overlap(a, b) == 0 + assert talon.get_overlap(a, b)[0] == 0 diff --git a/testing_suite/test_identification_flow.py b/testing_suite/test_identification_flow.py index a9fb5fd..bf5358a 100644 --- a/testing_suite/test_identification_flow.py +++ b/testing_suite/test_identification_flow.py @@ -14,9 +14,12 @@ def test_FSM_perfect(self): database = "scratch/toy.db" talon.get_counters(database) init_refs.make_temp_novel_gene_table(cursor, build) + init_refs.make_temp_transcript_table(cursor, build) + + init_refs.make_temp_transcript_table(cursor, build) edge_dict = init_refs.make_edge_dict(cursor) location_dict = init_refs.make_location_dict(build, cursor) - run_info = talon.init_run_info(database, build) + run_info = talon.init_run_info(database, build) transcript_dict = init_refs.make_transcript_dict(cursor, build) vertex_2_gene = init_refs.make_vertex_2_gene_dict(cursor) gene_starts = init_refs.make_gene_start_or_end_dict(cursor, build, "start") @@ -27,13 +30,13 @@ def test_FSM_perfect(self): positions = [ 1, 100, 500, 600, 900, 1000 ] - annotation = talon.identify_transcript(chrom, positions, strand, cursor, - location_dict, edge_dict, - transcript_dict, vertex_2_gene, + annotation = talon.identify_transcript(chrom, positions, strand, cursor, + location_dict, edge_dict, + transcript_dict, vertex_2_gene, gene_starts, gene_ends, run_info, - "temp_gene") + "temp_gene", "temp_transcript") - correct_gene_ID = fetch_correct_ID("TG1", "gene", cursor) + correct_gene_ID = fetch_correct_ID("TG1", "gene", cursor) correct_transcript_ID = fetch_correct_ID("TG1-001", "transcript", cursor) assert annotation['gene_ID'] == correct_gene_ID assert annotation['transcript_ID'] == correct_transcript_ID @@ -50,6 +53,8 @@ def test_FSM_end_diff(self): talon.get_counters(database) init_refs.make_temp_novel_gene_table(cursor, build) + init_refs.make_temp_transcript_table(cursor, build) + edge_dict = init_refs.make_edge_dict(cursor) location_dict = init_refs.make_location_dict(build, cursor) run_info = talon.init_run_info(database, build) @@ -67,7 +72,7 @@ def test_FSM_end_diff(self): location_dict, edge_dict, transcript_dict, vertex_2_gene, gene_starts, gene_ends, run_info, - "temp_gene") + "temp_gene", "temp_transcript") correct_gene_ID = fetch_correct_ID("TG2", "gene", cursor) novelty_types = [ x[-2] for x in annotation['transcript_novelty']] @@ -76,7 +81,7 @@ def test_FSM_end_diff(self): conn.close() def test_NIC_instead_of_ISM(self): - """ Test case where the transcript looks like an ISM, but has known + """ Test case where the transcript looks like an ISM, but has known starts and ends. 
In past TALON versions, this was considered NIC, but expected behavior is now ISM """ conn, cursor = get_db_cursor() @@ -85,6 +90,8 @@ def test_NIC_instead_of_ISM(self): talon.get_counters(database) init_refs.make_temp_novel_gene_table(cursor, build) + init_refs.make_temp_transcript_table(cursor, build) + edge_dict = init_refs.make_edge_dict(cursor) location_dict = init_refs.make_location_dict(build, cursor) run_info = talon.init_run_info(database, build) @@ -101,7 +108,7 @@ def test_NIC_instead_of_ISM(self): location_dict, edge_dict, transcript_dict, vertex_2_gene, gene_starts, gene_ends, run_info, - "tmp_gene") + "tmp_gene", "temp_transcript") correct_gene_ID = fetch_correct_ID("TG5", "gene", cursor) novelty_types = [ x[-2] for x in annotation['transcript_novelty']] @@ -118,6 +125,8 @@ def test_ISM_suffix(self): talon.get_counters(database) init_refs.make_temp_novel_gene_table(cursor, build) + init_refs.make_temp_transcript_table(cursor, build) + edge_dict = init_refs.make_edge_dict(cursor) location_dict = init_refs.make_location_dict(build, cursor) run_info = talon.init_run_info(database, build) @@ -134,12 +143,12 @@ def test_ISM_suffix(self): location_dict, edge_dict, transcript_dict, vertex_2_gene, gene_starts, gene_ends, run_info, - "temp_gene") + "temp_gene", "temp_transcript") correct_gene_ID = fetch_correct_ID("TG1", "gene", cursor) novelty_types = [ x[-2] for x in annotation['transcript_novelty']] assert annotation['gene_ID'] == correct_gene_ID - assert "ISM_transcript" in novelty_types + assert "ISM_transcript" in novelty_types assert "ISM-suffix_transcript" in novelty_types assert annotation['start_delta'] == 50 conn.close() @@ -153,6 +162,8 @@ def test_ISM_prefix(self): talon.get_counters(database) init_refs.make_temp_novel_gene_table(cursor, build) + init_refs.make_temp_transcript_table(cursor, build) + edge_dict = init_refs.make_edge_dict(cursor) location_dict = init_refs.make_location_dict(build, cursor) run_info = talon.init_run_info(database, build) @@ -168,7 +179,8 @@ def test_ISM_prefix(self): annotation = talon.identify_transcript(chrom, positions, strand, cursor, location_dict, edge_dict, transcript_dict, vertex_2_gene, - gene_starts, gene_ends, run_info, "temp_gene") + gene_starts, gene_ends, run_info, + "temp_gene", "temp_transcript") correct_gene_ID = fetch_correct_ID("TG1", "gene", cursor) novelty_types = [ x[-2] for x in annotation['transcript_novelty']] @@ -187,6 +199,8 @@ def test_ISM_internal(self): talon.get_counters(database) init_refs.make_temp_novel_gene_table(cursor, build) + init_refs.make_temp_transcript_table(cursor, build) + init_refs.make_temp_monoexonic_transcript_table(cursor, build) edge_dict = init_refs.make_edge_dict(cursor) location_dict = init_refs.make_location_dict(build, cursor) @@ -203,12 +217,13 @@ def test_ISM_internal(self): annotation = talon.identify_monoexon_transcript(chrom, positions, strand, cursor, location_dict, edge_dict, transcript_dict, vertex_2_gene, - gene_starts, gene_ends, run_info, "temp_gene", "temp_monoexon") + gene_starts, gene_ends, run_info, + "temp_gene", "temp_transcript", "temp_monoexon") correct_gene_ID = fetch_correct_ID("TG1", "gene", cursor) novelty_types = [ x[-2] for x in annotation['transcript_novelty']] assert annotation['gene_ID'] == correct_gene_ID - assert "ISM_transcript" in novelty_types + assert "ISM_transcript" in novelty_types assert annotation['start_delta'] == annotation['end_delta'] == 0 conn.close() @@ -221,6 +236,8 @@ def test_NIC(self): talon.get_counters(database) 
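        # (Editor's note: talon.get_counters appears in each test's setup
        #  because novel gene, transcript, and edge IDs are drawn from these
        #  counters, e.g. gene_counter.increment() in create_gene.)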
init_refs.make_temp_novel_gene_table(cursor, build) + init_refs.make_temp_transcript_table(cursor, build) + edge_dict = init_refs.make_edge_dict(cursor) location_dict = init_refs.make_location_dict(build, cursor) run_info = talon.init_run_info(database, build) @@ -231,12 +248,13 @@ def test_NIC(self): chrom = "chr1" strand = "+" - positions = ( 1, 100, 900, 1000 ) + positions = ( 1, 100, 900, 1000 ) annotation = talon.identify_transcript(chrom, positions, strand, cursor, location_dict, edge_dict, transcript_dict, vertex_2_gene, - gene_starts, gene_ends, run_info, "temp_gene") + gene_starts, gene_ends, run_info, + "temp_gene", "temp_transcript") correct_gene_ID = fetch_correct_ID("TG1", "gene", cursor) novelty_types = [ x[-2] for x in annotation['transcript_novelty']] @@ -247,14 +265,16 @@ def test_NIC(self): def test_NNC(self): """ Example where the transcript skips an exon and has a novel splice - donor + donor at the first exon """ conn, cursor = get_db_cursor() build = "toy_build" database = "scratch/toy.db" - talon.get_counters(database) + talon.get_counters(database) init_refs.make_temp_novel_gene_table(cursor, build) + init_refs.make_temp_transcript_table(cursor, build) + edge_dict = init_refs.make_edge_dict(cursor) location_dict = init_refs.make_location_dict(build, cursor) run_info = talon.init_run_info(database, build) @@ -270,7 +290,8 @@ def test_NNC(self): annotation = talon.identify_transcript(chrom, positions, strand, cursor, location_dict, edge_dict, transcript_dict, vertex_2_gene, - gene_starts, gene_ends, run_info, "temp_gene") + gene_starts, gene_ends, run_info, + "temp_gene", "temp_transcript") correct_gene_ID = fetch_correct_ID("TG1", "gene", cursor) novelty_types = [ x[-2] for x in annotation['transcript_novelty']] @@ -288,6 +309,8 @@ def test_spliced_antisense(self): talon.get_counters(database) init_refs.make_temp_novel_gene_table(cursor, build) + init_refs.make_temp_transcript_table(cursor, build) + edge_dict = init_refs.make_edge_dict(cursor) location_dict = init_refs.make_location_dict(build, cursor) run_info = talon.init_run_info(database, build) @@ -303,7 +326,8 @@ def test_spliced_antisense(self): annotation = talon.identify_transcript(chrom, positions, strand, cursor, location_dict, edge_dict, transcript_dict, vertex_2_gene, - gene_starts, gene_ends, run_info, "temp_gene") + gene_starts, gene_ends, run_info, + "temp_gene", "temp_transcript") anti_gene_ID = fetch_correct_ID("TG2", "gene", cursor) gene_novelty_types = [ x[-2] for x in annotation['gene_novelty']] @@ -322,6 +346,8 @@ def test_genomic_unspliced(self): talon.get_counters(database) init_refs.make_temp_novel_gene_table(cursor, build) + init_refs.make_temp_transcript_table(cursor, build) + init_refs.make_temp_monoexonic_transcript_table(cursor, build) edge_dict = init_refs.make_edge_dict(cursor) location_dict = init_refs.make_location_dict(build, cursor) @@ -338,14 +364,15 @@ def test_genomic_unspliced(self): annotation = talon.identify_monoexon_transcript(chrom, positions, strand, cursor, location_dict, edge_dict, transcript_dict, vertex_2_gene, - gene_starts, gene_ends, run_info, "temp_gene", "temp_monoexon") + gene_starts, gene_ends, run_info, + "temp_gene", "temp_transcript", "temp_monoexon") correct_gene_ID = fetch_correct_ID("TG1", "gene", cursor) novelty_types = [ x[-2] for x in annotation['transcript_novelty']] assert annotation['gene_ID'] == correct_gene_ID assert "genomic_transcript" in novelty_types assert annotation['end_delta'] == -10 - conn.close() + conn.close() def 
test_NIC_with_all_known_edges(self): """ Test case derived from a real mouse Map2k4 read. All of edges are @@ -355,9 +382,11 @@ def test_NIC_with_all_known_edges(self): talon.get_counters(database) conn = sqlite3.connect(database) conn.row_factory = sqlite3.Row - cursor = conn.cursor() + cursor = conn.cursor() build = "mm10" init_refs.make_temp_novel_gene_table(cursor, build) + init_refs.make_temp_transcript_table(cursor, build) + edge_dict = init_refs.make_edge_dict(cursor) location_dict = init_refs.make_location_dict(build, cursor) run_info = talon.init_run_info(database, build) @@ -369,11 +398,12 @@ def test_NIC_with_all_known_edges(self): chrom = "chr11" strand = "-" positions = [65788254, 65788136, 65775765, 65775733, 65756371, 65756269, 65735366, 65735192, 65719603, 65719484, 65712297, 65712178, 65709983, 65709932, 65707111, 65706984, 65696365, 65696288, 65693570, 65693422, 65691773, 65691728, 65690804, 65689322] - + annotation = talon.identify_transcript(chrom, positions, strand, cursor, location_dict, edge_dict, transcript_dict, vertex_2_gene, - gene_starts, gene_ends, run_info, "temp_gene") + gene_starts, gene_ends, run_info, + "temp_gene", "temp_transcript") assert annotation['gene_ID'] == 1 assert annotation['transcript_ID'] == 8 @@ -381,4 +411,3 @@ def test_NIC_with_all_known_edges(self): assert "NIC_transcript" in novelty_types conn.close() - diff --git a/testing_suite/test_monoexonic.py b/testing_suite/test_monoexonic.py index d0ebf64..92ad9c6 100644 --- a/testing_suite/test_monoexonic.py +++ b/testing_suite/test_monoexonic.py @@ -14,6 +14,7 @@ def test_match(self): talon.get_counters(database) init_refs.make_temp_novel_gene_table(cursor, build) init_refs.make_temp_monoexonic_transcript_table(cursor, build) + init_refs.make_temp_transcript_table(cursor, build) edge_dict = init_refs.make_edge_dict(cursor) location_dict = init_refs.make_location_dict(build, cursor) run_info = talon.init_run_info(database, build) @@ -26,12 +27,12 @@ def test_match(self): strand = "-" positions = ( 3900, 1100 ) - annotation = talon.identify_monoexon_transcript(chrom, positions, + annotation = talon.identify_monoexon_transcript(chrom, positions, strand, cursor, location_dict, edge_dict, transcript_dict, vertex_2_gene, gene_starts, gene_ends, run_info, - 'temp_gene', 'temp_monoexon') + 'temp_gene', 'temp_transcript', 'temp_monoexon') correct_gene_ID = fetch_correct_ID("TG6", "gene", cursor) correct_transcript_ID = fetch_correct_ID("TG6-001", "transcript", cursor) @@ -43,7 +44,7 @@ def test_match(self): def test_partial_match(self): """ Example where the transcript overlaps a single-exon transcript, - but is shorter. In the past, the start would be assigned to the + but is shorter. In the past, the start would be assigned to the annotated start, and the end would be novel. This is no longer the case- at this time, the transcript will be assigned to the annotated match. 
""" @@ -53,6 +54,7 @@ def test_partial_match(self): database = "scratch/toy.db" talon.get_counters(database) init_refs.make_temp_novel_gene_table(cursor, build) + init_refs.make_temp_transcript_table(cursor, build) init_refs.make_temp_monoexonic_transcript_table(cursor, build) edge_dict = init_refs.make_edge_dict(cursor) location_dict = init_refs.make_location_dict(build, cursor) @@ -71,7 +73,7 @@ def test_partial_match(self): location_dict, edge_dict, transcript_dict, vertex_2_gene, gene_starts, gene_ends, run_info, - 'temp_gene', 'temp_monoexon') + 'temp_gene', 'temp_transcript', 'temp_monoexon') correct_gene_ID = fetch_correct_ID("TG6", "gene", cursor) correct_transcript_ID = fetch_correct_ID("TG6-001", "transcript", cursor) @@ -82,7 +84,7 @@ def test_partial_match(self): conn.close() -# Commenting out these tests for now because they are redundant. But saving in +# Commenting out these tests for now because they are redundant. But saving in # case they might be useful down the line. # def test_partial_match_3prime(self): @@ -176,6 +178,7 @@ def test_antisense(self): database = "scratch/toy.db" talon.get_counters(database) init_refs.make_temp_novel_gene_table(cursor, build) + init_refs.make_temp_transcript_table(cursor, build) init_refs.make_temp_monoexonic_transcript_table(cursor, build) edge_dict = init_refs.make_edge_dict(cursor) location_dict = init_refs.make_location_dict(build, cursor) @@ -194,7 +197,7 @@ def test_antisense(self): location_dict, edge_dict, transcript_dict, vertex_2_gene, gene_starts, gene_ends, run_info, - 'temp_gene', 'temp_monoexon') + 'temp_gene', 'temp_transcript', 'temp_monoexon') anti_gene_ID = fetch_correct_ID("TG6", "gene", cursor) gene_novelty_types = [ x[-2] for x in annotation['gene_novelty']] @@ -203,4 +206,4 @@ def test_antisense(self): assert "antisense_gene" in gene_novelty_types assert "antisense_transcript" in t_novelty_types - conn.close() + conn.close() diff --git a/testing_suite/test_multiexon_read_overlapping_monoexon_transcript.py b/testing_suite/test_multiexon_read_overlapping_monoexon_transcript.py index 7ff6a21..e419548 100644 --- a/testing_suite/test_multiexon_read_overlapping_monoexon_transcript.py +++ b/testing_suite/test_multiexon_read_overlapping_monoexon_transcript.py @@ -14,7 +14,7 @@ def test_transcript_assigned_intergenic(self): when it was actually supposed to be genomic """ # Set up references - database = "scratch/multiexon_read_overlapping_monoexon_transcript/talon.db" + database = "scratch/multiexon_read_overlapping_monoexon_transcript/talon.db" conn = sqlite3.connect(database) conn.row_factory = sqlite3.Row cursor = conn.cursor() @@ -23,6 +23,7 @@ def test_transcript_assigned_intergenic(self): talon.get_counters(database) run_info = talon.init_run_info(database, build) struct_collection = talon.prepare_data_structures(cursor, run_info) + init_refs.make_temp_transcript_table(cursor, "toy_build") # Use pysam to get the read from the SAM file sam_file = "input_files/multiexon_read_overlapping_monoexon_transcript/read.sam" @@ -34,21 +35,21 @@ def test_transcript_assigned_intergenic(self): # Get read attributes chrom = sam_record.reference_name strand = "-" if sam_record.is_reverse else "+" - sam_start = sam_record.reference_start + sam_start = sam_record.reference_start sam_end = sam_record.reference_end # Do we get any overlap with the reference gene? 
best_gene, match_strand = talon.search_for_overlap_with_gene(chrom, min(sam_start, sam_end), - max(sam_start, sam_end), strand, - cursor, run_info, - struct_collection.tmp_gene) + max(sam_start, sam_end), strand, + cursor, run_info, + struct_collection.tmp_gene, + struct_collection.tmp_t) assert best_gene == 1 assert match_strand == "-" - annotation_info = talon.annotate_read(sam_record, cursor, run_info, + annotation_info = talon.annotate_read(sam_record, cursor, run_info, struct_collection, mode = 0) - + assert annotation_info['gene_ID'] == 1 assert annotation_info['transcript_ID'] == 2 assert 'genomic_transcript' in annotation_info['transcript_novelty'][0] - diff --git a/testing_suite/test_process_remaining_mult_cases.py b/testing_suite/test_process_remaining_mult_cases.py index 36966bb..dbb01fa 100644 --- a/testing_suite/test_process_remaining_mult_cases.py +++ b/testing_suite/test_process_remaining_mult_cases.py @@ -1,6 +1,10 @@ import pytest from talon import talon, init_refs from .helper_fns import fetch_correct_ID, get_db_cursor +import logging + +logging.basicConfig(level=logging.DEBUG) + @pytest.mark.integration class TestIdentifyRemaining(object): @@ -15,7 +19,9 @@ def test_fusion(self): talon.get_counters(db) edge_dict = init_refs.make_edge_dict(cursor) location_dict = init_refs.make_location_dict(build, cursor) - run_info = talon.init_run_info(db, build) + run_info = talon.init_run_info(db, build, create_novel_spliced_genes=True) + init_refs.make_temp_novel_gene_table(cursor, "toy_build") + init_refs.make_temp_transcript_table(cursor, "toy_build") transcript_dict = init_refs.make_transcript_dict(cursor, build) vertex_2_gene = init_refs.make_vertex_2_gene_dict(cursor) gene_starts = init_refs.make_gene_start_or_end_dict(cursor, build, "start") @@ -23,10 +29,10 @@ def test_fusion(self): correct_gene_ID = talon.gene_counter.value() + 1 chrom = "chr1" - positions = [1, 100, 500, 600, 900, 1010, 5000, 5550, 6000] + positions = [100, 500, 600, 900, 1010, 5000, 5550, 6000] strand = "+" edge_IDs = [2, 3, 4]+[ talon.edge_counter.value() + 1, talon.edge_counter.value() + 2 ] - vertex_IDs = [2, 3, 4, 5, 9, 10] + vertex_IDs = [2, 3, 4, 5, 9, 10, 11] v_novelty = [0, 0, 0, 0, 0, 0] # Construct temp novel gene db @@ -42,6 +48,7 @@ def test_fusion(self): edge_dict, location_dict, vertex_2_gene, run_info, cursor, "temp_gene", + "temp_transcript", fusion) assert gene_ID == correct_gene_ID @@ -62,6 +69,8 @@ def test_intergenic(self): edge_dict = init_refs.make_edge_dict(cursor) location_dict = init_refs.make_location_dict(build, cursor) run_info = talon.init_run_info(database, build) + init_refs.make_temp_novel_gene_table(cursor, "toy_build") + init_refs.make_temp_transcript_table(cursor, "toy_build") transcript_dict = init_refs.make_transcript_dict(cursor, build) vertex_2_gene = init_refs.make_vertex_2_gene_dict(cursor) gene_starts = init_refs.make_gene_start_or_end_dict(cursor, build, "start") @@ -87,6 +96,7 @@ def test_intergenic(self): edge_dict, location_dict, vertex_2_gene, run_info, cursor, "temp_gene", + "temp_transcript", fusion) assert gene_ID == correct_gene_ID @@ -106,6 +116,8 @@ def test_antisense(self): edge_dict = init_refs.make_edge_dict(cursor) location_dict = init_refs.make_location_dict(build, cursor) run_info = talon.init_run_info(database, build) + init_refs.make_temp_novel_gene_table(cursor, "toy_build") + init_refs.make_temp_transcript_table(cursor, "toy_build") transcript_dict = init_refs.make_transcript_dict(cursor, build) vertex_2_gene = 
init_refs.make_vertex_2_gene_dict(cursor) gene_starts = init_refs.make_gene_start_or_end_dict(cursor, build, "start") @@ -131,6 +143,7 @@ def test_antisense(self): edge_dict, location_dict, vertex_2_gene, run_info, cursor, "temp_gene", + "temp_transcript", fusion) assert gene_ID == correct_gene_ID assert transcript_dict[frozenset(start_end_info["edge_IDs"])] != None @@ -148,6 +161,8 @@ def test_genomic(self): edge_dict = init_refs.make_edge_dict(cursor) location_dict = init_refs.make_location_dict(build, cursor) run_info = talon.init_run_info(database, build) + init_refs.make_temp_novel_gene_table(cursor, "toy_build") + init_refs.make_temp_transcript_table(cursor, "toy_build") transcript_dict = init_refs.make_transcript_dict(cursor, build) vertex_2_gene = init_refs.make_vertex_2_gene_dict(cursor) gene_starts = init_refs.make_gene_start_or_end_dict(cursor, build, "start") @@ -172,6 +187,7 @@ def test_genomic(self): edge_dict, location_dict, vertex_2_gene, run_info, cursor, "temp_gene", + "temp_transcript", fusion) correct_gene_ID = fetch_correct_ID("TG3", "gene", cursor) assert gene_ID == correct_gene_ID diff --git a/testing_suite/test_search_for_overlap_with_gene.py b/testing_suite/test_search_for_overlap_with_gene.py index 2d3ff48..44f78bd 100644 --- a/testing_suite/test_search_for_overlap_with_gene.py +++ b/testing_suite/test_search_for_overlap_with_gene.py @@ -14,6 +14,7 @@ def test_no_match(self): database = "scratch/toy.db" run_info = talon.init_run_info(database, build, tmp_dir = "scratch/tmp/") init_refs.make_temp_novel_gene_table(cursor, "toy_build") + init_refs.make_temp_transcript_table(cursor, "toy_build") location_dict = init_refs.make_location_dict(build, cursor) run_info = talon.init_run_info(database, build, tmp_dir = "scratch/tmp/") @@ -23,16 +24,18 @@ def test_no_match(self): gene_ID, match_strand = talon.search_for_overlap_with_gene(chrom, pos[0], pos[1], strand, cursor, - run_info, - "temp_gene") + run_info, + "temp_gene", + "temp_transcript") assert gene_ID == None # Should get same results for flipped interval gene_ID, match_strand = talon.search_for_overlap_with_gene(chrom, pos[0], pos[1], strand, cursor, - run_info, - "temp_gene") + run_info, + "temp_gene", + "temp_transcript") assert gene_ID == None conn.close() @@ -42,6 +45,7 @@ def test_single_match(self): database = "scratch/toy.db" build = "toy_build" init_refs.make_temp_novel_gene_table(cursor, "toy_build") + init_refs.make_temp_transcript_table(cursor, "toy_build") location_dict = init_refs.make_location_dict(build, cursor) run_info = talon.init_run_info(database, build, tmp_dir = "scratch/tmp/") @@ -53,7 +57,8 @@ def test_single_match(self): pos[1], strand, cursor, run_info, - "temp_gene") + "temp_gene", + "temp_transcript") assert gene_ID == fetch_correct_ID("TG1", "gene", cursor) @@ -61,13 +66,14 @@ def test_single_match(self): conn.close() def test_same_strand_match_with_two_genes(self): - """ Example where interval overlaps two genes, one of which is on the + """ Example where interval overlaps two genes, one of which is on the same strand. 
""" - + database = "scratch/toy.db" conn, cursor = get_db_cursor() build = "toy_build" init_refs.make_temp_novel_gene_table(cursor, "toy_build") + init_refs.make_temp_transcript_table(cursor, "toy_build") location_dict = init_refs.make_location_dict(build, cursor) run_info = talon.init_run_info(database, build) @@ -75,24 +81,26 @@ def test_same_strand_match_with_two_genes(self): pos = [1500, 910] strand = "-" - gene_ID, match_strand = talon.search_for_overlap_with_gene(chrom, pos[0], + gene_ID, match_strand = talon.search_for_overlap_with_gene(chrom, pos[0], pos[1], - strand, cursor, + strand, cursor, run_info, - "temp_gene") + "temp_gene", + "temp_transcript") assert gene_ID == fetch_correct_ID("TG3", "gene", cursor) assert match_strand == strand conn.close() def test_same_strand_match_left_overlap(self): - """ Example where the overlap is on the same strand. Query start is to + """ Example where the overlap is on the same strand. Query start is to the left of the gene, and query end is before the end of the gene. """ database = "scratch/toy.db" conn, cursor = get_db_cursor() build = "toy_build" init_refs.make_temp_novel_gene_table(cursor, "toy_build") + init_refs.make_temp_transcript_table(cursor, "toy_build") location_dict = init_refs.make_location_dict(build, cursor) run_info = talon.init_run_info(database, build) @@ -104,7 +112,8 @@ def test_same_strand_match_left_overlap(self): pos[1], strand, cursor, run_info, - "temp_gene") + "temp_gene", + "temp_transcript") assert gene_ID == fetch_correct_ID("TG3", "gene", cursor) assert match_strand == strand @@ -118,31 +127,34 @@ def test_antisense_match(self): conn, cursor = get_db_cursor() build = "toy_build" init_refs.make_temp_novel_gene_table(cursor, "toy_build") + init_refs.make_temp_transcript_table(cursor, "toy_build") location_dict = init_refs.make_location_dict(build, cursor) run_info = talon.init_run_info(database, build) chrom = "chr1" pos = [1400, 2100] strand = "+" - + gene_ID, match_strand = talon.search_for_overlap_with_gene(chrom, pos[0], pos[1], strand, cursor, run_info, - "temp_gene") + "temp_gene", + "temp_transcript") assert gene_ID == fetch_correct_ID("TG3", "gene", cursor) assert match_strand == "-" conn.close() def test_2_genes_same_strand(self): - """ Example where query overlaps two genes. Must choose the one with + """ Example where query overlaps two genes. 
Must choose the one with more overlap """ - + database = "scratch/toy.db" conn, cursor = get_db_cursor() - build = "toy_build" + build = "toy_build" init_refs.make_temp_novel_gene_table(cursor, "toy_build") + init_refs.make_temp_transcript_table(cursor, "toy_build") location_dict = init_refs.make_location_dict(build, cursor) run_info = talon.init_run_info(database, build) @@ -154,10 +166,9 @@ def test_2_genes_same_strand(self): pos[1], strand, cursor, run_info, - "temp_gene") + "temp_gene", + "temp_transcript") assert gene_ID == fetch_correct_ID("TG1", "gene", cursor) assert match_strand == "+" - conn.close() - - + conn.close() From a5fb07bcac13b5392fb1caaf65e22f8db001074b Mon Sep 17 00:00:00 2001 From: fairliereese Date: Mon, 9 Oct 2023 15:59:44 -0700 Subject: [PATCH 27/31] added sam version of test input file --- .../hl60_1_1_subset_remapped_sorted.sam | 213 ++++++++++++++++++ 1 file changed, 213 insertions(+) create mode 100644 testing_suite/input_files/readthrough/hl60_1_1_subset_remapped_sorted.sam diff --git a/testing_suite/input_files/readthrough/hl60_1_1_subset_remapped_sorted.sam b/testing_suite/input_files/readthrough/hl60_1_1_subset_remapped_sorted.sam new file mode 100644 index 0000000..47feae6 --- /dev/null +++ b/testing_suite/input_files/readthrough/hl60_1_1_subset_remapped_sorted.sam @@ -0,0 +1,213 @@ +@HD VN:1.6 SO:coordinate +@SQ SN:chr1 LN:248956422 +@SQ SN:chr2 LN:242193529 +@SQ SN:chr3 LN:198295559 +@SQ SN:chr4 LN:190214555 +@SQ SN:chr5 LN:181538259 +@SQ SN:chr6 LN:170805979 +@SQ SN:chr7 LN:159345973 +@SQ SN:chr8 LN:145138636 +@SQ SN:chr9 LN:138394717 +@SQ SN:chr10 LN:133797422 +@SQ SN:chr11 LN:135086622 +@SQ SN:chr12 LN:133275309 +@SQ SN:chr13 LN:114364328 +@SQ SN:chr14 LN:107043718 +@SQ SN:chr15 LN:101991189 +@SQ SN:chr16 LN:90338345 +@SQ SN:chr17 LN:83257441 +@SQ SN:chr18 LN:80373285 +@SQ SN:chr19 LN:58617616 +@SQ SN:chr20 LN:64444167 +@SQ SN:chr21 LN:46709983 +@SQ SN:chr22 LN:50818468 +@SQ SN:chrX LN:156040895 +@SQ SN:chrY LN:57227415 +@SQ SN:chrM LN:16569 +@SQ SN:chr1_KI270706v1_random LN:175055 +@SQ SN:chr1_KI270707v1_random LN:32032 +@SQ SN:chr1_KI270708v1_random LN:127682 +@SQ SN:chr1_KI270709v1_random LN:66860 +@SQ SN:chr1_KI270710v1_random LN:40176 +@SQ SN:chr1_KI270711v1_random LN:42210 +@SQ SN:chr1_KI270712v1_random LN:176043 +@SQ SN:chr1_KI270713v1_random LN:40745 +@SQ SN:chr1_KI270714v1_random LN:41717 +@SQ SN:chr2_KI270715v1_random LN:161471 +@SQ SN:chr2_KI270716v1_random LN:153799 +@SQ SN:chr3_GL000221v1_random LN:155397 +@SQ SN:chr4_GL000008v2_random LN:209709 +@SQ SN:chr5_GL000208v1_random LN:92689 +@SQ SN:chr9_KI270717v1_random LN:40062 +@SQ SN:chr9_KI270718v1_random LN:38054 +@SQ SN:chr9_KI270719v1_random LN:176845 +@SQ SN:chr9_KI270720v1_random LN:39050 +@SQ SN:chr11_KI270721v1_random LN:100316 +@SQ SN:chr14_GL000009v2_random LN:201709 +@SQ SN:chr14_GL000225v1_random LN:211173 +@SQ SN:chr14_KI270722v1_random LN:194050 +@SQ SN:chr14_GL000194v1_random LN:191469 +@SQ SN:chr14_KI270723v1_random LN:38115 +@SQ SN:chr14_KI270724v1_random LN:39555 +@SQ SN:chr14_KI270725v1_random LN:172810 +@SQ SN:chr14_KI270726v1_random LN:43739 +@SQ SN:chr15_KI270727v1_random LN:448248 +@SQ SN:chr16_KI270728v1_random LN:1872759 +@SQ SN:chr17_GL000205v2_random LN:185591 +@SQ SN:chr17_KI270729v1_random LN:280839 +@SQ SN:chr17_KI270730v1_random LN:112551 +@SQ SN:chr22_KI270731v1_random LN:150754 +@SQ SN:chr22_KI270732v1_random LN:41543 +@SQ SN:chr22_KI270733v1_random LN:179772 +@SQ SN:chr22_KI270734v1_random LN:165050 +@SQ SN:chr22_KI270735v1_random LN:42811 +@SQ 
SN:chr22_KI270736v1_random LN:181920 +@SQ SN:chr22_KI270737v1_random LN:103838 +@SQ SN:chr22_KI270738v1_random LN:99375 +@SQ SN:chr22_KI270739v1_random LN:73985 +@SQ SN:chrY_KI270740v1_random LN:37240 +@SQ SN:chrUn_KI270302v1 LN:2274 +@SQ SN:chrUn_KI270304v1 LN:2165 +@SQ SN:chrUn_KI270303v1 LN:1942 +@SQ SN:chrUn_KI270305v1 LN:1472 +@SQ SN:chrUn_KI270322v1 LN:21476 +@SQ SN:chrUn_KI270320v1 LN:4416 +@SQ SN:chrUn_KI270310v1 LN:1201 +@SQ SN:chrUn_KI270316v1 LN:1444 +@SQ SN:chrUn_KI270315v1 LN:2276 +@SQ SN:chrUn_KI270312v1 LN:998 +@SQ SN:chrUn_KI270311v1 LN:12399 +@SQ SN:chrUn_KI270317v1 LN:37690 +@SQ SN:chrUn_KI270412v1 LN:1179 +@SQ SN:chrUn_KI270411v1 LN:2646 +@SQ SN:chrUn_KI270414v1 LN:2489 +@SQ SN:chrUn_KI270419v1 LN:1029 +@SQ SN:chrUn_KI270418v1 LN:2145 +@SQ SN:chrUn_KI270420v1 LN:2321 +@SQ SN:chrUn_KI270424v1 LN:2140 +@SQ SN:chrUn_KI270417v1 LN:2043 +@SQ SN:chrUn_KI270422v1 LN:1445 +@SQ SN:chrUn_KI270423v1 LN:981 +@SQ SN:chrUn_KI270425v1 LN:1884 +@SQ SN:chrUn_KI270429v1 LN:1361 +@SQ SN:chrUn_KI270442v1 LN:392061 +@SQ SN:chrUn_KI270466v1 LN:1233 +@SQ SN:chrUn_KI270465v1 LN:1774 +@SQ SN:chrUn_KI270467v1 LN:3920 +@SQ SN:chrUn_KI270435v1 LN:92983 +@SQ SN:chrUn_KI270438v1 LN:112505 +@SQ SN:chrUn_KI270468v1 LN:4055 +@SQ SN:chrUn_KI270510v1 LN:2415 +@SQ SN:chrUn_KI270509v1 LN:2318 +@SQ SN:chrUn_KI270518v1 LN:2186 +@SQ SN:chrUn_KI270508v1 LN:1951 +@SQ SN:chrUn_KI270516v1 LN:1300 +@SQ SN:chrUn_KI270512v1 LN:22689 +@SQ SN:chrUn_KI270519v1 LN:138126 +@SQ SN:chrUn_KI270522v1 LN:5674 +@SQ SN:chrUn_KI270511v1 LN:8127 +@SQ SN:chrUn_KI270515v1 LN:6361 +@SQ SN:chrUn_KI270507v1 LN:5353 +@SQ SN:chrUn_KI270517v1 LN:3253 +@SQ SN:chrUn_KI270529v1 LN:1899 +@SQ SN:chrUn_KI270528v1 LN:2983 +@SQ SN:chrUn_KI270530v1 LN:2168 +@SQ SN:chrUn_KI270539v1 LN:993 +@SQ SN:chrUn_KI270538v1 LN:91309 +@SQ SN:chrUn_KI270544v1 LN:1202 +@SQ SN:chrUn_KI270548v1 LN:1599 +@SQ SN:chrUn_KI270583v1 LN:1400 +@SQ SN:chrUn_KI270587v1 LN:2969 +@SQ SN:chrUn_KI270580v1 LN:1553 +@SQ SN:chrUn_KI270581v1 LN:7046 +@SQ SN:chrUn_KI270579v1 LN:31033 +@SQ SN:chrUn_KI270589v1 LN:44474 +@SQ SN:chrUn_KI270590v1 LN:4685 +@SQ SN:chrUn_KI270584v1 LN:4513 +@SQ SN:chrUn_KI270582v1 LN:6504 +@SQ SN:chrUn_KI270588v1 LN:6158 +@SQ SN:chrUn_KI270593v1 LN:3041 +@SQ SN:chrUn_KI270591v1 LN:5796 +@SQ SN:chrUn_KI270330v1 LN:1652 +@SQ SN:chrUn_KI270329v1 LN:1040 +@SQ SN:chrUn_KI270334v1 LN:1368 +@SQ SN:chrUn_KI270333v1 LN:2699 +@SQ SN:chrUn_KI270335v1 LN:1048 +@SQ SN:chrUn_KI270338v1 LN:1428 +@SQ SN:chrUn_KI270340v1 LN:1428 +@SQ SN:chrUn_KI270336v1 LN:1026 +@SQ SN:chrUn_KI270337v1 LN:1121 +@SQ SN:chrUn_KI270363v1 LN:1803 +@SQ SN:chrUn_KI270364v1 LN:2855 +@SQ SN:chrUn_KI270362v1 LN:3530 +@SQ SN:chrUn_KI270366v1 LN:8320 +@SQ SN:chrUn_KI270378v1 LN:1048 +@SQ SN:chrUn_KI270379v1 LN:1045 +@SQ SN:chrUn_KI270389v1 LN:1298 +@SQ SN:chrUn_KI270390v1 LN:2387 +@SQ SN:chrUn_KI270387v1 LN:1537 +@SQ SN:chrUn_KI270395v1 LN:1143 +@SQ SN:chrUn_KI270396v1 LN:1880 +@SQ SN:chrUn_KI270388v1 LN:1216 +@SQ SN:chrUn_KI270394v1 LN:970 +@SQ SN:chrUn_KI270386v1 LN:1788 +@SQ SN:chrUn_KI270391v1 LN:1484 +@SQ SN:chrUn_KI270383v1 LN:1750 +@SQ SN:chrUn_KI270393v1 LN:1308 +@SQ SN:chrUn_KI270384v1 LN:1658 +@SQ SN:chrUn_KI270392v1 LN:971 +@SQ SN:chrUn_KI270381v1 LN:1930 +@SQ SN:chrUn_KI270385v1 LN:990 +@SQ SN:chrUn_KI270382v1 LN:4215 +@SQ SN:chrUn_KI270376v1 LN:1136 +@SQ SN:chrUn_KI270374v1 LN:2656 +@SQ SN:chrUn_KI270372v1 LN:1650 +@SQ SN:chrUn_KI270373v1 LN:1451 +@SQ SN:chrUn_KI270375v1 LN:2378 +@SQ SN:chrUn_KI270371v1 LN:2805 +@SQ SN:chrUn_KI270448v1 LN:7992 +@SQ SN:chrUn_KI270521v1 LN:7642 +@SQ 
SN:chrUn_GL000195v1 LN:182896 +@SQ SN:chrUn_GL000219v1 LN:179198 +@SQ SN:chrUn_GL000220v1 LN:161802 +@SQ SN:chrUn_GL000224v1 LN:179693 +@SQ SN:chrUn_KI270741v1 LN:157432 +@SQ SN:chrUn_GL000226v1 LN:15008 +@SQ SN:chrUn_GL000213v1 LN:164239 +@SQ SN:chrUn_KI270743v1 LN:210658 +@SQ SN:chrUn_KI270744v1 LN:168472 +@SQ SN:chrUn_KI270745v1 LN:41891 +@SQ SN:chrUn_KI270746v1 LN:66486 +@SQ SN:chrUn_KI270747v1 LN:198735 +@SQ SN:chrUn_KI270748v1 LN:93321 +@SQ SN:chrUn_KI270749v1 LN:158759 +@SQ SN:chrUn_KI270750v1 LN:148850 +@SQ SN:chrUn_KI270751v1 LN:150742 +@SQ SN:chrUn_KI270752v1 LN:27745 +@SQ SN:chrUn_KI270753v1 LN:62944 +@SQ SN:chrUn_KI270754v1 LN:40191 +@SQ SN:chrUn_KI270755v1 LN:36723 +@SQ SN:chrUn_KI270756v1 LN:79590 +@SQ SN:chrUn_KI270757v1 LN:71251 +@SQ SN:chrUn_GL000214v1 LN:137718 +@SQ SN:chrUn_KI270742v1 LN:186739 +@SQ SN:chrUn_GL000216v2 LN:176608 +@SQ SN:chrUn_GL000218v1 LN:161147 +@SQ SN:chrEBV LN:171823 +@PG ID:minimap2 PN:minimap2 VN:2.24-r1122 CL:minimap2 -t 1 -ax splice -uf --secondary=no -C5 /data/homezvol1/freese/mortazavi_lab/ref/hg38/hg38.fa hl60_1_1_subset.fastq +@PG ID:samtools PN:samtools PP:minimap2 VN:1.15.1 CL:samtools calmd -b hl60_1_1_subset_remapped.sam /data/homezvol1/freese/mortazavi_lab/ref/hg38/hg38.fa +@PG ID:samtools.1 PN:samtools PP:samtools VN:1.15.1 CL:samtools sort hl60_1_1_subset_remapped.bam +@PG ID:samtools.2 PN:samtools PP:samtools.1 VN:1.15.1 CL:samtools view -h readthrough/hl60_1_1_subset_remapped_sorted.bam +cenps_cort_fsm 0 chr1 10430430 60 139M3273N124M691N34M5656N67M1851N177M8935N282M * 0 0 CTCGCGCCGCGGCGGGAAAATCCGACCTGGCCGCGCACCACCGCCCCTTCTCGGCCCTCCTGCGTTTGCCCAGGGTCGGCCCGCAGTGATGGAGGAGGAGGCGGAGACCGAGGAGCAGCAGCGATTCTCTTACCAACAGAGGCTAAAGGCAGCAGTTCACTATACTGTGGGTTGTCTTTGCGAGGAAGTTGCATTGGACAAAGAGATGCAGTTCAGCAAACAGACCATTGCGGCCATTTCGGAGCTGACTTTCCGACAGTGTGAAAATTTTGCCAAAGACCTTGAAATGTTTGCAAGACATGCGAAAAGAACCACAATTAACACTGAAGATGTGAAGCTCTTAGCCAGGAGGAGTAATTCACTGCTAAAATACATCACAGACAAAAGTGAAGAGATTGCTCAGATTAACCTAGAACGAAAAGCACAGAAGAAAAAGAAGTCAGAGGATGGAAGCAAAAATTCAAGGCAGCCAGCAGAGGCTGGAGTGGTGGAAAGTGAGAATTAAAGTCCCTCGCCGCTTGGAAAGTGCAGCCTTCTACAGCATATGCAGGAAGCGGCAGGAATAAGGAAAAGCAGCCTCCTGACTTTCCTCGCTTGGTGGTTTGAGTGGACCTCCCAGGCCAGTGCCGGGCCCCTCATAGGAGAGGAAGCCCGGGAGGTGGCCAGGCGGCAGGAAGGCGCACCCCCCCAGCAATCCGCGCGCCGGGACAGAATGCCCTGCAGGAACTTCTTCTGGAAGACCTTCTCCTCCTGCAAATAAAACCTCACCCATGAATGCTCACGCAAGTGTAATGACAGACCTGAATAAAATGTATTAAGCAGC 5555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555 NM:i:1 ms:i:820 AS:i:658 nn:i:0 ts:A:+ tp:A:P cm:i:249 s1:i:759 s2:i:159 de:f:0.0012 rl:i:0 MD:Z:696T126 +cenps_cort_nnc 0 chr1 10430430 60 149M2I5M2I1M3257N124M691N34M5656N67M1851N177M8935N282M * 0 0 
CTCGCGCCGCGGCGGGAAAATCCGACCTGGCCGCGCACCACCGCCCCTTCTCGGCCCTCCTGCGTTTGCCCAGGGTCGGCCCGCAGTGATGGAGGAGGAGGCGGAGACCGAGGAGCAGCAGCGATTCTCTTACCAACAGGTACAGGAAAGTACAGGAAGAGGCTAAAGGCAGCAGTTCACTATACTGTGGGTTGTCTTTGCGAGGAAGTTGCATTGGACAAAGAGATGCAGTTCAGCAAACAGACCATTGCGGCCATTTCGGAGCTGACTTTCCGACAGTGTGAAAATTTTGCCAAAGACCTTGAAATGTTTGCAAGACATGCGAAAAGAACCACAATTAACACTGAAGATGTGAAGCTCTTAGCCAGGAGGAGTAATTCACTGCTAAAATACATCACAGACAAAAGTGAAGAGATTGCTCAGATTAACCTAGAACGAAAAGCACAGAAGAAAAAGAAGTCAGAGGATGGAAGCAAAAATTCAAGGCAGCCAGCAGAGGCTGGAGTGGTGGAAAGTGAGAATTAAAGTCCCTCGCCGCTTGGAAAGTGCAGCCTTCTACAGCATATGCAGGAAGCGGCAGGAATAAGGAAAAGCAGCCTCCTGACTTTCCTCGCTTGGTGGTTTGAGTGGACCTCCCAGGCCAGTGCCGGGCCCCTCATAGGAGAGGAAGCCCGGGAGGTGGCCAGGCGGCAGGAAGGCGCACCCCCCCAGCAATCCGCGCGCCGGGACAGAATGCCCTGCAGGAACTTCTTCTGGAAGACCTTCTCCTCCTGCAAATAAAACCTCACCCATGAATGCTCACGCAAGTGTAATGACAGACCTGAATAAAATGTATTAAGCAGC 555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555 NM:i:6 ms:i:826 AS:i:661 nn:i:0 ts:A:+ tp:A:P cm:i:250 s1:i:766 s2:i:159 de:f:0.0048 rl:i:0 MD:Z:151G560T126 +cenps_cort_nic 0 chr1 10430430 60 139M4088N34M5656N67M1851N177M8935N282M * 0 0 CTCGCGCCGCGGCGGGAAAATCCGACCTGGCCGCGCACCACCGCCCCTTCTCGGCCCTCCTGCGTTTGCCCAGGGTCGGCCCGCAGTGATGGAGGAGGAGGCGGAGACCGAGGAGCAGCAGCGATTCTCTTACCAACAGAAAATTTTGCCAAAGACCTTGAAATGTTTGCAAGACATGCGAAAAGAACCACAATTAACACTGAAGATGTGAAGCTCTTAGCCAGGAGGAGTAATTCACTGCTAAAATACATCACAGACAAAAGTGAAGAGATTGCTCAGATTAACCTAGAACGAAAAGCACAGAAGAAAAAGAAGTCAGAGGATGGAAGCAAAAATTCAAGGCAGCCAGCAGAGGCTGGAGTGGTGGAAAGTGAGAATTAAAGTCCCTCGCCGCTTGGAAAGTGCAGCCTTCTACAGCATATGCAGGAAGCGGCAGGAATAAGGAAAAGCAGCCTCCTGACTTTCCTCGCTTGGTGGTTTGAGTGGACCTCCCAGGCCAGTGCCGGGCCCCTCATAGGAGAGGAAGCCCGGGAGGTGGCCAGGCGGCAGGAAGGCGCACCCCCCCAGCAATCCGCGCGCCGGGACAGAATGCCCTGCAGGAACTTCTTCTGGAAGACCTTCTCCTCCTGCAAATAAAACCTCACCCATGAATGCTCACGCAAGTGTAATGACAGACCTGAATAAAATGTATTAAGCAGC 555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555 NM:i:1 ms:i:696 AS:i:566 nn:i:0 ts:A:+ tp:A:P cm:i:209 s1:i:644 s2:i:99 de:f:0.0014 rl:i:0 MD:Z:572T126 +cenps_fsm 0 chr1 10430430 60 139M3273N124M691N34M5656N67M1851N546M * 0 0 
CTCGCGCCGCGGCGGGAAAATCCGACCTGGCCGCGCACCACCGCCCCTTCTCGGCCCTCCTGCGTTTGCCCAGGGTCGGCCCGCAGTGATGGAGGAGGAGGCGGAGACCGAGGAGCAGCAGCGATTCTCTTACCAACAGAGGCTAAAGGCAGCAGTTCACTATACTGTGGGTTGTCTTTGCGAGGAAGTTGCATTGGACAAAGAGATGCAGTTCAGCAAACAGACCATTGCGGCCATTTCGGAGCTGACTTTCCGACAGTGTGAAAATTTTGCCAAAGACCTTGAAATGTTTGCAAGACATGCGAAAAGAACCACAATTAACACTGAAGATGTGAAGCTCTTAGCCAGGAGGAGTAATTCACTGCTAAAATACATCACAGACAAAAGTGAAGAGATTGCTCAGATTAACCTAGAACGAAAAGCACAGAAGAAAAAGAAGTCAGAGGATGGAAGCAAAAATTCAAGGCAGCCAGCAGAGGCTGGAGTGGTGGAAAGTGAGAATTAAAGTCCCTCGCCGCTTGGAAAGTGCAGCCTTCTACAGGTAGAGCCACCTAGAAATGCATATGGCTGCAAAGGAAACTTTGAAGGGTTAAATAGAGATTTAAAAAAATAAAATAAAAAGGCTGGGCTAGGGTGCTTTTTGTGCTGAATTCTCCACATTGTTAACTGCCAAAGCTAGTTTTAGAGAATGAGAAAGTCTTAAGCAAAATACTCCCAGGTCTCACTCCAGAACATAAAAATGGTGTGTGATCGAATGGTATATATTAGAAATTACATCTGTTGTAATTAAAATTGTGTGAGCAATTAAACATGGTTGACTTTTTCAAGCAAAAATCAGTTCATCTTTTGATGTAATTTTCTAGGCTAAATGGCAATCTCTGAAAGATGAATAAAGCTATATTTATTTAGC 5555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555 NM:i:1 ms:i:907 AS:i:779 nn:i:0 ts:A:+ tp:A:P cm:i:280 s1:i:857 s2:i:174 de:f:0.0011 rl:i:20 MD:Z:752A157 +cenps_nic 0 chr1 10430430 60 139M307N526M2440N124M691N34M5656N67M1851N225M1S * 0 0 CTCGCGCCGCGGCGGGAAAATCCGACCTGGCCGCGCACCACCGCCCCTTCTCGGCCCTCCTGCGTTTGCCCAGGGTCGGCCCGCAGTGATGGAGGAGGAGGCGGAGACCGAGGAGCAGCAGCGATTCTCTTACCAACAGATGGGGTTTTCGTGAGGGTACAACGTCGGCATTAGACATTCCAGGTGACGCCCGTACGCGGTGGGCGGTTCGGGCCGGAGCTCTGGAACGCTGGCCCTGGAGGCGTCGACCCCTCGTTACTGATGCAGGGACGCGGTGCGGACCAGTCAGGCCCAGAGCTCGTCCTTAGATGTGGGTTCGAATCTCTGCCCCGCCAACTTGTGATCGTATCGACTCGGCCCAGACGCAATTTTCTTCTCTGCAAAATCGTCATAAGAATAATCACTTGTCAGGGTAGCTGCGGGCATCCCATTCGTTCCTTTCATCAGCGCCGGGCATATGGGGCGTCAGAGGCTGAGAACGTTGCCGTGAAGAGGCTTAAAAGCAAGACCCGGAGTGGCGACCTTAAAGAGGACGGACTGAAGAAACGCGGGAATGAGCTCCAGACGCGGGAGTTTCCTCTCTACAAAGTTACACTGCAGCAGCTGTCTACCCTGCCCCTTGTCTTTTGAGAAGTTCAAACCTTCAGAAAAGTTGCAAGAACACGAGGCTAAAGGCAGCAGTTCACTATACTGTGGGTTGTCTTTGCGAGGAAGTTGCATTGGACAAAGAGATGCAGTTCAGCAAACAGACCATTGCGGCCATTTCGGAGCTGACTTTCCGACAGTGTGAAAATTTTGCCAAAGACCTTGAAATGTTTGCAAGACATGCGAAAAGAACCACAATTAACACTGAAGATGTGAAGCTCTTAGCCAGGAGGAGTAATTCACTGCTAAAATACATCACAGACAAAAGTGAAGAGATTGCTCAGATTAACCTAGAACGAAAAGCACAGAAGAAAAAGAAGTCAGAGGATGGAAGCAAAAATTCAAGGCAGCCAGCAGAGGCTGGAGTGGTGGAAAGTGAGAATTAAAGTCCCTCGCCGCTTGGAAAGTGCAGCCTTCTACAGGTAGAGCCACCTAGAAATGCATATGGCTGCAAAGGAAACTTTGAAGGGC 
555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555 NM:i:0 ms:i:1115 AS:i:955 nn:i:0 ts:A:+ tp:A:P cm:i:351 s1:i:1057 s2:i:129 de:f:0 rl:i:0 MD:Z:1115 +cenps_nnc 0 chr1 10430430 60 150M296N526M2440N124M691N34M5656N67M1851N225M1S * 0 0 CTCGCGCCGCGGCGGGAAAATCCGACCTGGCCGCGCACCACCGCCCCTTCTCGGCCCTCCTGCGTTTGCCCAGGGTCGGCCCGCAGTGATGGAGGAGGAGGCGGAGACCGAGGAGCAGCAGCGATTCTCTTACCAACAGGTACAGGAAAAATGGGGTTTTCGTGAGGGTACAACGTCGGCATTAGACATTCCAGGTGACGCCCGTACGCGGTGGGCGGTTCGGGCCGGAGCTCTGGAACGCTGGCCCTGGAGGCGTCGACCCCTCGTTACTGATGCAGGGACGCGGTGCGGACCAGTCAGGCCCAGAGCTCGTCCTTAGATGTGGGTTCGAATCTCTGCCCCGCCAACTTGTGATCGTATCGACTCGGCCCAGACGCAATTTTCTTCTCTGCAAAATCGTCATAAGAATAATCACTTGTCAGGGTAGCTGCGGGCATCCCATTCGTTCCTTTCATCAGCGCCGGGCATATGGGGCGTCAGAGGCTGAGAACGTTGCCGTGAAGAGGCTTAAAAGCAAGACCCGGAGTGGCGACCTTAAAGAGGACGGACTGAAGAAACGCGGGAATGAGCTCCAGACGCGGGAGTTTCCTCTCTACAAAGTTACACTGCAGCAGCTGTCTACCCTGCCCCTTGTCTTTTGAGAAGTTCAAACCTTCAGAAAAGTTGCAAGAACACGAGGCTAAAGGCAGCAGTTCACTATACTGTGGGTTGTCTTTGCGAGGAAGTTGCATTGGACAAAGAGATGCAGTTCAGCAAACAGACCATTGCGGCCATTTCGGAGCTGACTTTCCGACAGTGTGAAAATTTTGCCAAAGACCTTGAAATGTTTGCAAGACATGCGAAAAGAACCACAATTAACACTGAAGATGTGAAGCTCTTAGCCAGGAGGAGTAATTCACTGCTAAAATACATCACAGACAAAAGTGAAGAGATTGCTCAGATTAACCTAGAACGAAAAGCACAGAAGAAAAAGAAGTCAGAGGATGGAAGCAAAAATTCAAGGCAGCCAGCAGAGGCTGGAGTGGTGGAAAGTGAGAATTAAAGTCCCTCGCCGCTTGGAAAGTGCAGCCTTCTACAGGTAGAGCCACCTAGAAATGCATATGGCTGCAAAGGAAACTTTGAAGGGC 
55555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555 NM:i:0 ms:i:1126 AS:i:961 nn:i:0 ts:A:+ tp:A:P cm:i:352 s1:i:1063 s2:i:129 de:f:0 rl:i:0 MD:Z:1126 +cenps_cort_ism 0 chr1 10440345 60 12S69M1851N177M8935N282M * 0 0 GAAATGTTTGCAAGACATGCGAAAAGAACCACAATTAACACTGAAGATGTGAAGCTCTTAGCCAGGAGGAGTAATTCACTGCTAAAATACATCACAGACAAAAGTGAAGAGATTGCTCAGATTAACCTAGAACGAAAAGCACAGAAGAAAAAGAAGTCAGAGGATGGAAGCAAAAATTCAAGGCAGCCAGCAGAGGCTGGAGTGGTGGAAAGTGAGAATTAAAGTCCCTCGCCGCTTGGAAAGTGCAGCCTTCTACAGCATATGCAGGAAGCGGCAGGAATAAGGAAAAGCAGCCTCCTGACTTTCCTCGCTTGGTGGTTTGAGTGGACCTCCCAGGCCAGTGCCGGGCCCCTCATAGGAGAGGAAGCCCGGGAGGTGGCCAGGCGGCAGGAAGGCGCACCCCCCCAGCAATCCGCGCGCCGGGACAGAATGCCCTGCAGGAACTTCTTCTGGAAGACCTTCTCCTCCTGCAAATAAAACCTCACCCATGAATGCTCACGCAAGTGTAATGACAGACCTGAATAAAATGTATTAAGCAGC 555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555 NM:i:1 ms:i:525 AS:i:459 nn:i:0 ts:A:+ tp:A:P cm:i:160 s1:i:498 s2:i:74 de:f:0.0019 rl:i:0 MD:Z:401T126 +eloa_rpl11_fsm_1 0 chr1 23691792 60 38M779N151M1047N107M746N132M54229N57M764N107M896N1186M376N112M1581N156M107N98M1382N181M250N112M567N173M2386N393M2S * 0 0 
GGAAGCTCCGCTTTCTCTTCCTGCTCTCCATCATGGCGCAGGATCAAGGTGAAAAGGAGAACCCCATGCGGGAACTTCGCATCCGCAAACTCTGTCTCAACATCTGTGTTGGGGAGAGTGGAGACAGACTGACGCGAGCAGCCAAGGTGTTGGAGCAGCTCACAGGGCAGACCCCTGTGTTTTCCAAAGCTAGATACACTGTCAGATCCTTTGGCATCCGGAGAAATGAAAAGATTGCTGTCCACTGCACAGTTCGAGGGGCCAAGGCAGAAGAAATCTTGGAGAAGGGTCTAAAGGTGCGGGAGTATGAGTTAAGAAAAAACAACTTCTCAGATACTGGAAACTTTGGTTTTGGGATCCAGGAACACATCGATCTGGGTATCAAATATGACCCAAGCATTGGTATCTACGGCCTGGACTTCTATGTGCTATTGAAATATTTGAAGAAACTCTCCACCCTGCCTATTACAGTAGACATTCTTGCGGAGACTGGGGTTGGGAAAACAGTAAATAGCTTGCGAAAACACGAGCATGTTGGAAGCTTTGCCAGGGACCTAGTGGCCCAGTGGAAGAAGCTGGTTCCTGTGGAACGAAATGCTGAGCCTGATGAACAGGACTTTGAGAAGAGCAATTCCCGAAAGCGCCCTCGGGATGCCCTGCAGAAGGAGGAGGAGATGGAGGGGGACTACCAAGAAACCTGGAAAGCCACGGGGAGCCGATCCTATAGCCCTGACCACAGGCAGAAGAAACATAGGAAACTCTCGGAGCTCGAGAGACCTCACAAAGTGTCTCACGGTCATGAGAGGAGAGATGAGAGAAAGAGGTGTCACAGAATGTCACCAACTTACTCTTCAGACCCTGAGTCTTCTGATTATGGCCATGTTCAATCCCCTCCATCTTGTACCAGTCCTCATCAGATGTACGTCGACCACTACAGATCCCTGGAGGAGGACCAGGAGCCCATTGTTTCACACCAGAAGCCTGGGAAAGGCCACAGCAATGCCTTTCAGGACAGACTCGGGGCCAGCCAAGAACGACACCTGGGTGAACCCCATGGGAAAGGGGTTGTGAGTCAAAACAAGGAGCACAAATCTTCCCACAAGGACAAACGCCCCGTGGATGCCAAGAGTGATGAGAAGGCCTCTGTGGTGAGCAGAGAGAAATCACACAAGGCCCTCTCCAAAGAGGAGAACCGAAGGCCACCCTCAGGGGACAATGCAAGGGAGAAACCGCCCTCTAGTGGCGTAAAGAAAGAGAAGGACAGAGAGGGCAGCAGCCTGAAGAAGAAGTGTTTGCCTCCCTCAGAGGCCGCTTCAGACAACCACCTGAAAAAGCCAAAGCACAGAGACCCAGAGAAAGCCAAATTGGACAAAAGCAAGCAAGGTCTGGACAGCTTTGACACAGGAAAAGGAGCAGGAGACCTGTTGCCCAAGGTAAAAGAGAAGGGTTCTAACAACCTAAAGACTCCAGAAGGGAAAGTCAAAACTAATTTGGATAGAAAGTCACTGGGCTCCCTCCCTAAAGTTGAGGAGACAGATATGGAGGATGAATTCGAGCAGCCAACCATGTCTTTTGAATCCTACCTCAGCTATGACCAGCCCCGGAAGAAAAAGAAAAAGATTGTGAAAACTTCAGCCACGGCACTTGGAGATAAAGGACTTAAAAAAAATGACTCTAAAAGCACTGGTAAAAACTTGGACTCAGTTCAGAAATTACCCAAGGTGAACAAAACCAAGTCAGAGAAGCCGGCTGGAGCTGATTTAGCCAAGCTGAGAAAGGTGCCTGATGTGTTGCCAGTGTTGCCAGACCTCCCGTTACCCGCGATACAGGCCAATTACCGTCCACTGCCTTCCCTCGAGCTGATATCCTCCTTCCAGCCAAAGCGAAAAGCGTTCTCTTCACCCCAGGAAGAAGAAGAAGCTGGATTTACTGGGCGCAGAATGAATTCCAAGATGCAGGTGTATTCTGGTTCCAAGTGTGCCTATCTCCCTAAAATGATGACCTTGCACCAGCAATGCATCCGAGTACTTAAAAACAACATCGATTCAATCTTTGAAGTGGGAGGAGTCCCATACTCTGTTCTTGAACCCGTTTTGGAGAGGTGTACACCTGATCAGCTGTATCGCATAGAGGAATACAATCATGTATTAATTGAAGAAACAGATCAATTATGGAAAGTTCATTGTCACCGAGACTTTAAGGAAGAAAGACCCGAAGAGTATGAGTCGTGGCGAGAGATGTACCTGCGGCTTCAGGACGCCCGAGAGCAGCGGCTACGAGTACTAACAAAGAATATCCAGTTCGCACATGCCAATAAGCCCAAAGGCCGACAAGCAAAGATGGCCTTTGTCAACTCTGTGGCCAAGCCACCTCGTGACGTCCGGAGGAGGCAGGAAAAGTTTGGAACGGGAGGAGCAGCTGTCCCTGAGAAAATCAAGATCAAGCCAGCCCCGTACCCCATGGGAAGCAGCCATGCTTCCGCCAGTAGTATCAGCTTTAACCCCAGCCCTGAGGAGCCGGCCTATGATGGCCCAAGCACCAGCAGTGCCCACTTGGCACCAGTGGTCAGCAGCACTGTTTCCTATGATCCTAGGAAACCCACTGTGAAGAAAATTGCCCCAATGATGGCCAAGACAATTAAAGCTTTCAAGAACAGATTCTCCCGACGATAAACTGAGGACTTGCCTTGGAAATGGAATCTGGGGAGGCAGGAATACAAGGACAGTGGGGGTTGGGGAATGGAATTCTACAGGAGACTGGAGTCTTGCTTTGTGGATCCTTTTGGTCTCCGAGTCCTGCAGTCTGCAGGTGCTGCCCCTGGGAACCTGCGTGCCACAGCCCCGCCTCCCTGCCTGGAGCACACTTTAGAATTCTGAAGATGTGAAGCCTCTGTCTCACTGAGGATTTTAAAGGTCAATTATACTTTTGTTGTTCATTAGCATCTTTGTAAACTATAAGACGTAGTTTTAATTAATAAATATTGCCCCCAGATTGTATTTATATGG 
55555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555 NM:i:1 ms:i:3000 AS:i:2584 nn:i:0 ts:A:+ tp:A:P cm:i:959 s1:i:2859 s2:i:188 de:f:0.0003 rl:i:30 MD:Z:2488C514 +eloa_rpl11_nic 0 chr1 23691792 60 38M1977N107M746N132M54229N57M764N107M896N1186M376N112M1581N156M107N98M1382N181M250N112M567N173M2386N393M2S * 0 0 
GGAAGCTCCGCTTTCTCTTCCTGCTCTCCATCATGGCGCTAGATACACTGTCAGATCCTTTGGCATCCGGAGAAATGAAAAGATTGCTGTCCACTGCACAGTTCGAGGGGCCAAGGCAGAAGAAATCTTGGAGAAGGGTCTAAAGGTGCGGGAGTATGAGTTAAGAAAAAACAACTTCTCAGATACTGGAAACTTTGGTTTTGGGATCCAGGAACACATCGATCTGGGTATCAAATATGACCCAAGCATTGGTATCTACGGCCTGGACTTCTATGTGCTATTGAAATATTTGAAGAAACTCTCCACCCTGCCTATTACAGTAGACATTCTTGCGGAGACTGGGGTTGGGAAAACAGTAAATAGCTTGCGAAAACACGAGCATGTTGGAAGCTTTGCCAGGGACCTAGTGGCCCAGTGGAAGAAGCTGGTTCCTGTGGAACGAAATGCTGAGCCTGATGAACAGGACTTTGAGAAGAGCAATTCCCGAAAGCGCCCTCGGGATGCCCTGCAGAAGGAGGAGGAGATGGAGGGGGACTACCAAGAAACCTGGAAAGCCACGGGGAGCCGATCCTATAGCCCTGACCACAGGCAGAAGAAACATAGGAAACTCTCGGAGCTCGAGAGACCTCACAAAGTGTCTCACGGTCATGAGAGGAGAGATGAGAGAAAGAGGTGTCACAGAATGTCACCAACTTACTCTTCAGACCCTGAGTCTTCTGATTATGGCCATGTTCAATCCCCTCCATCTTGTACCAGTCCTCATCAGATGTACGTCGACCACTACAGATCCCTGGAGGAGGACCAGGAGCCCATTGTTTCACACCAGAAGCCTGGGAAAGGCCACAGCAATGCCTTTCAGGACAGACTCGGGGCCAGCCAAGAACGACACCTGGGTGAACCCCATGGGAAAGGGGTTGTGAGTCAAAACAAGGAGCACAAATCTTCCCACAAGGACAAACGCCCCGTGGATGCCAAGAGTGATGAGAAGGCCTCTGTGGTGAGCAGAGAGAAATCACACAAGGCCCTCTCCAAAGAGGAGAACCGAAGGCCACCCTCAGGGGACAATGCAAGGGAGAAACCGCCCTCTAGTGGCGTAAAGAAAGAGAAGGACAGAGAGGGCAGCAGCCTGAAGAAGAAGTGTTTGCCTCCCTCAGAGGCCGCTTCAGACAACCACCTGAAAAAGCCAAAGCACAGAGACCCAGAGAAAGCCAAATTGGACAAAAGCAAGCAAGGTCTGGACAGCTTTGACACAGGAAAAGGAGCAGGAGACCTGTTGCCCAAGGTAAAAGAGAAGGGTTCTAACAACCTAAAGACTCCAGAAGGGAAAGTCAAAACTAATTTGGATAGAAAGTCACTGGGCTCCCTCCCTAAAGTTGAGGAGACAGATATGGAGGATGAATTCGAGCAGCCAACCATGTCTTTTGAATCCTACCTCAGCTATGACCAGCCCCGGAAGAAAAAGAAAAAGATTGTGAAAACTTCAGCCACGGCACTTGGAGATAAAGGACTTAAAAAAAATGACTCTAAAAGCACTGGTAAAAACTTGGACTCAGTTCAGAAATTACCCAAGGTGAACAAAACCAAGTCAGAGAAGCCGGCTGGAGCTGATTTAGCCAAGCTGAGAAAGGTGCCTGATGTGTTGCCAGTGTTGCCAGACCTCCCGTTACCCGCGATACAGGCCAATTACCGTCCACTGCCTTCCCTCGAGCTGATATCCTCCTTCCAGCCAAAGCGAAAAGCGTTCTCTTCACCCCAGGAAGAAGAAGAAGCTGGATTTACTGGGCGCAGAATGAATTCCAAGATGCAGGTGTATTCTGGTTCCAAGTGTGCCTATCTCCCTAAAATGATGACCTTGCACCAGCAATGCATCCGAGTACTTAAAAACAACATCGATTCAATCTTTGAAGTGGGAGGAGTCCCATACTCTGTTCTTGAACCCGTTTTGGAGAGGTGTACACCTGATCAGCTGTATCGCATAGAGGAATACAATCATGTATTAATTGAAGAAACAGATCAATTATGGAAAGTTCATTGTCACCGAGACTTTAAGGAAGAAAGACCCGAAGAGTATGAGTCGTGGCGAGAGATGTACCTGCGGCTTCAGGACGCCCGAGAGCAGCGGCTACGAGTACTAACAAAGAATATCCAGTTCGCACATGCCAATAAGCCCAAAGGCCGACAAGCAAAGATGGCCTTTGTCAACTCTGTGGCCAAGCCACCTCGTGACGTCCGGAGGAGGCAGGAAAAGTTTGGAACGGGAGGAGCAGCTGTCCCTGAGAAAATCAAGATCAAGCCAGCCCCGTACCCCATGGGAAGCAGCCATGCTTCCGCCAGTAGTATCAGCTTTAACCCCAGCCCTGAGGAGCCGGCCTATGATGGCCCAAGCACCAGCAGTGCCCACTTGGCACCAGTGGTCAGCAGCACTGTTTCCTATGATCCTAGGAAACCCACTGTGAAGAAAATTGCCCCAATGATGGCCAAGACAATTAAAGCTTTCAAGAACAGATTCTCCCGACGATAAACTGAGGACTTGCCTTGGAAATGGAATCTGGGGAGGCAGGAATACAAGGACAGTGGGGGTTGGGGAATGGAATTCTACAGGAGACTGGAGTCTTGCTTTGTGGATCCTTTTGGTCTCCGAGTCCTGCAGTCTGCAGGTGCTGCCCCTGGGAACCTGCGTGCCACAGCCCCGCCTCCCTGCCTGGAGCACACTTTAGAATTCTGAAGATGTGAAGCCTCTGTCTCACTGAGGATTTTAAAGGTCAATTATACTTTTGTTGTTCATTAGCATCTTTGTAAACTATAAGACGTAGTTTTAATTAATAAATATTGCCCCCAGATTGTATTTATATGG 
5555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555 NM:i:1 ms:i:2849 AS:i:2465 nn:i:0 ts:A:+ tp:A:P cm:i:912 s1:i:2717 s2:i:129 de:f:0.0004 rl:i:30 MD:Z:2337C514 +eloa_rpl11_fsm_2 0 chr1 23692590 60 5S170M1047N107M746N132M54229N57M764N107M896N1186M376N112M1581N156M107N98M1382N181M250N112M567N173M2386N392M2S * 0 0 
CTCTTCTGCTCTTCCCTGTTGCAGCAGGATCAAGGTGAAAAGGAGAACCCCATGCGGGAACTTCGCATCCGCAAACTCTGTCTCAACATCTGTGTTGGGGAGAGTGGAGACAGACTGACGCGAGCAGCCAAGGTGTTGGAGCAGCTCACAGGGCAGACCCCTGTGTTTTCCAAAGCTAGATACACTGTCAGATCCTTTGGCATCCGGAGAAATGAAAAGATTGCTGTCCACTGCACAGTTCGAGGGGCCAAGGCAGAAGAAATCTTGGAGAAGGGTCTAAAGGTGCGGGAGTATGAGTTAAGAAAAAACAACTTCTCAGATACTGGAAACTTTGGTTTTGGGATCCAGGAACACATCGATCTGGGTATCAAATATGACCCAAGCATTGGTATCTACGGCCTGGACTTCTATGTGCTATTGAAATATTTGAAGAAACTCTCCACCCTGCCTATTACAGTAGACATTCTTGCGGAGACTGGGGTTGGGAAAACAGTAAATAGCTTGCGAAAACACGAGCATGTTGGAAGCTTTGCCAGGGACCTAGTGGCCCAGTGGAAGAAGCTGGTTCCTGTGGAACGAAATGCTGAGCCTGATGAACAGGACTTTGAGAAGAGCAATTCCCGAAAGCGCCCTCGGGATGCCCTGCAGAAGGAGGAGGAGATGGAGGGGGACTACCAAGAAACCTGGAAAGCCACGGGGAGCCGATCCTATAGCCCTGACCACAGGCAGAAGAAACATAGGAAACTCTCGGAGCTCGAGAGACCTCACAAAGTGTCTCACGGTCATGAGAGGAGAGATGAGAGAAAGAGGTGTCACAGAATGTCACCAACTTACTCTTCAGACCCTGAGTCTTCTGATTATGGCCATGTTCAATCCCCTCCATCTTGTACCAGTCCTCATCAGATGTACGTCGACCACTACAGATCCCTGGAGGAGGACCAGGAGCCCATTGTTTCACACCAGAAGCCTGGGAAAGGCCACAGCAATGCCTTTCAGGACAGACTCGGGGCCAGCCAAGAACGACACCTGGGTGAACCCCATGGGAAAGGGGTTGTGAGTCAAAACAAGGAGCACAAATCTTCCCACAAGGACAAACGCCCCGTGGATGCCAAGAGTGATGAGAAGGCCTCTGTGGTGAGCAGAGAGAAATCACACAAGGCCCTCTCCAAAGAGGAGAACCGAAGGCCACCCTCAGGGGACAATGCAAGGGAGAAACCGCCCTCTAGTGGCGTAAAGAAAGAGAAGGACAGAGAGGGCAGCAGCCTGAAGAAGAAGTGTTTGCCTCCCTCAGAGGCCGCTTCAGACAACCACCTGAAAAAGCCAAAGCACAGAGACCCAGAGAAAGCCAAATTGGACAAAAGCAAGCAAGGTCTGGACAGCTTTGACACAGGAAAAGGAGCAGGAGACCTGTTGCCCAAGGTAAAAGAGAAGGGTTCTAACAACCTAAAGACTCCAGAAGGGAAAGTCAAAACTAATTTGGATAGAAAGTCACTGGGCTCCCTCCCTAAAGTTGAGGAGACAGATATGGAGGATGAATTCGAGCAGCCAACCATGTCTTTTGAATCCTACCTCAGCTATGACCAGCCCCGGAAGAAAAAGAAAAAGATTGTGAAAACTTCAGCCACGGCACTTGGAGATAAAGGACTTAAAAAAAATGACTCTAAAAGCACTGGTAAAAACTTGGACTCAGTTCAGAAATTACCCAAGGTGAACAAAACCAAGTCAGAGAAGCCGGCTGGAGCTGATTTAGCCAAGCTGAGAAAGGTGCCTGATGTGTTGCCAGTGTTGCCAGACCTCCCGTTACCCGCGATACAGGCCAATTACCGTCCACTGCCTTCCCTCGAGCTGATATCCTCCTTCCAGCCAAAGCGAAAAGCGTTCTCTTCACCCCAGGAAGAAGAAGAAGCTGGATTTACTGGGCGCAGAATGAATTCCAAGATGCAGGTGTATTCTGGTTCCAAGTGTGCCTATCTCCCTAAAATGATGACCTTGCACCAGCAATGCATCCGAGTACTTAAAAACAACATCGATTCAATCTTTGAAGTGGGAGGAGTCCCATACTCTGTTCTTGAACCCGTTTTGGAGAGGTGTACACCTGATCAGCTGTATCGCATAGAGGAATACAATCATGTATTAATTGAAGAAACAGATCAATTATGGAAAGTTCATTGTCACCGAGACTTTAAGGAAGAAAGACCCGAAGAGTATGAGTCGTGGCGAGAGATGTACCTGCGGCTTCAGGACGCCCGAGAGCAGCGGCTACGAGTACTAACAAAGAATATCCAGTTCGCACATGCCAATAAGCCCAAAGGCCGACAAGCAAAGATGGCCTTTGTCAACTCTGTGGCCAAGCCACCTCGTGACGTCCGGAGGAGGCAGGAAAAGTTTGGAACGGGAGGAGCAGCTGTCCCTGAGAAAATCAAGATCAAGCCAGCCCCGTACCCCATGGGAAGCAGCCATGCTTCCGCCAGTAGTATCAGCTTTAACCCCAGCCCTGAGGAGCCGGCCTATGATGGCCCAAGCACCAGCAGTGCCCACTTGGCACCAGTGGTCAGCAGCACTGTTTCCTATGATCCTAGGAAACCCACTGTGAAGAAAATTGCCCCAATGATGGCCAAGACAATTAAAGCTTTCAAGAACAGATTCTCCCGACGATAAACTGAGGACTTGCCTTGGAAATGGAATCTGGGGAGGCAGGAATACAAGGACAGTGGGGGTTGGGGAATGGAATTCTACAGGAGACTGGAGTCTTGCTTTGTGGATCCTTTTGGTCTCCGAGTCCTGCAGTCTGCAGGTGCTGCCCCTGGGAACCTGCGTGCCACAGCCCCGCCTCCCTGCCTGGAGCACACTTTAGAATTCTGAAGATGTGAAGCCTCTGTCTCACTGAGGATTTTAAAGGTCAATTATACTTTTGTTGTTCATTAGCATCTTTGTAAACTATAAGACGTAGTTTTAATTAATAAATATTGCCCCCAGATTGTATTTATAGC 
55555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555 NM:i:1 ms:i:2980 AS:i:2596 nn:i:0 ts:A:+ tp:A:P cm:i:959 s1:i:2851 s2:i:188 de:f:0.0003 rl:i:30 MD:Z:2469C513 +eloa_rpl11_nnc 0 chr1 23692590 60 5S184M1033N107M746N132M54229N57M764N107M896N1186M376N112M1581N156M107N98M1382N181M250N112M567N173M2386N392M2S * 0 0 
CTCTTCTGCTCTTCCCTGTTGCAGCAGGATCAAGGTGAAAAGGAGAACCCCATGCGGGAACTTCGCATCCGCAAACTCTGTCTCAACATCTGTGTTGGGGAGAGTGGAGACAGACTGACGCGAGCAGCCAAGGTGTTGGAGCAGCTCACAGGGCAGACCCCTGTGTTTTCCAAAGGTGAGTAGTCACAACTAGATACACTGTCAGATCCTTTGGCATCCGGAGAAATGAAAAGATTGCTGTCCACTGCACAGTTCGAGGGGCCAAGGCAGAAGAAATCTTGGAGAAGGGTCTAAAGGTGCGGGAGTATGAGTTAAGAAAAAACAACTTCTCAGATACTGGAAACTTTGGTTTTGGGATCCAGGAACACATCGATCTGGGTATCAAATATGACCCAAGCATTGGTATCTACGGCCTGGACTTCTATGTGCTATTGAAATATTTGAAGAAACTCTCCACCCTGCCTATTACAGTAGACATTCTTGCGGAGACTGGGGTTGGGAAAACAGTAAATAGCTTGCGAAAACACGAGCATGTTGGAAGCTTTGCCAGGGACCTAGTGGCCCAGTGGAAGAAGCTGGTTCCTGTGGAACGAAATGCTGAGCCTGATGAACAGGACTTTGAGAAGAGCAATTCCCGAAAGCGCCCTCGGGATGCCCTGCAGAAGGAGGAGGAGATGGAGGGGGACTACCAAGAAACCTGGAAAGCCACGGGGAGCCGATCCTATAGCCCTGACCACAGGCAGAAGAAACATAGGAAACTCTCGGAGCTCGAGAGACCTCACAAAGTGTCTCACGGTCATGAGAGGAGAGATGAGAGAAAGAGGTGTCACAGAATGTCACCAACTTACTCTTCAGACCCTGAGTCTTCTGATTATGGCCATGTTCAATCCCCTCCATCTTGTACCAGTCCTCATCAGATGTACGTCGACCACTACAGATCCCTGGAGGAGGACCAGGAGCCCATTGTTTCACACCAGAAGCCTGGGAAAGGCCACAGCAATGCCTTTCAGGACAGACTCGGGGCCAGCCAAGAACGACACCTGGGTGAACCCCATGGGAAAGGGGTTGTGAGTCAAAACAAGGAGCACAAATCTTCCCACAAGGACAAACGCCCCGTGGATGCCAAGAGTGATGAGAAGGCCTCTGTGGTGAGCAGAGAGAAATCACACAAGGCCCTCTCCAAAGAGGAGAACCGAAGGCCACCCTCAGGGGACAATGCAAGGGAGAAACCGCCCTCTAGTGGCGTAAAGAAAGAGAAGGACAGAGAGGGCAGCAGCCTGAAGAAGAAGTGTTTGCCTCCCTCAGAGGCCGCTTCAGACAACCACCTGAAAAAGCCAAAGCACAGAGACCCAGAGAAAGCCAAATTGGACAAAAGCAAGCAAGGTCTGGACAGCTTTGACACAGGAAAAGGAGCAGGAGACCTGTTGCCCAAGGTAAAAGAGAAGGGTTCTAACAACCTAAAGACTCCAGAAGGGAAAGTCAAAACTAATTTGGATAGAAAGTCACTGGGCTCCCTCCCTAAAGTTGAGGAGACAGATATGGAGGATGAATTCGAGCAGCCAACCATGTCTTTTGAATCCTACCTCAGCTATGACCAGCCCCGGAAGAAAAAGAAAAAGATTGTGAAAACTTCAGCCACGGCACTTGGAGATAAAGGACTTAAAAAAAATGACTCTAAAAGCACTGGTAAAAACTTGGACTCAGTTCAGAAATTACCCAAGGTGAACAAAACCAAGTCAGAGAAGCCGGCTGGAGCTGATTTAGCCAAGCTGAGAAAGGTGCCTGATGTGTTGCCAGTGTTGCCAGACCTCCCGTTACCCGCGATACAGGCCAATTACCGTCCACTGCCTTCCCTCGAGCTGATATCCTCCTTCCAGCCAAAGCGAAAAGCGTTCTCTTCACCCCAGGAAGAAGAAGAAGCTGGATTTACTGGGCGCAGAATGAATTCCAAGATGCAGGTGTATTCTGGTTCCAAGTGTGCCTATCTCCCTAAAATGATGACCTTGCACCAGCAATGCATCCGAGTACTTAAAAACAACATCGATTCAATCTTTGAAGTGGGAGGAGTCCCATACTCTGTTCTTGAACCCGTTTTGGAGAGGTGTACACCTGATCAGCTGTATCGCATAGAGGAATACAATCATGTATTAATTGAAGAAACAGATCAATTATGGAAAGTTCATTGTCACCGAGACTTTAAGGAAGAAAGACCCGAAGAGTATGAGTCGTGGCGAGAGATGTACCTGCGGCTTCAGGACGCCCGAGAGCAGCGGCTACGAGTACTAACAAAGAATATCCAGTTCGCACATGCCAATAAGCCCAAAGGCCGACAAGCAAAGATGGCCTTTGTCAACTCTGTGGCCAAGCCACCTCGTGACGTCCGGAGGAGGCAGGAAAAGTTTGGAACGGGAGGAGCAGCTGTCCCTGAGAAAATCAAGATCAAGCCAGCCCCGTACCCCATGGGAAGCAGCCATGCTTCCGCCAGTAGTATCAGCTTTAACCCCAGCCCTGAGGAGCCGGCCTATGATGGCCCAAGCACCAGCAGTGCCCACTTGGCACCAGTGGTCAGCAGCACTGTTTCCTATGATCCTAGGAAACCCACTGTGAAGAAAATTGCCCCAATGATGGCCAAGACAATTAAAGCTTTCAAGAACAGATTCTCCCGACGATAAACTGAGGACTTGCCTTGGAAATGGAATCTGGGGAGGCAGGAATACAAGGACAGTGGGGGTTGGGGAATGGAATTCTACAGGAGACTGGAGTCTTGCTTTGTGGATCCTTTTGGTCTCCGAGTCCTGCAGTCTGCAGGTGCTGCCCCTGGGAACCTGCGTGCCACAGCCCCGCCTCCCTGCCTGGAGCACACTTTAGAATTCTGAAGATGTGAAGCCTCTGTCTCACTGAGGATTTTAAAGGTCAATTATACTTTTGTTGTTCATTAGCATCTTTGTAAACTATAAGACGTAGTTTTAATTAATAAATATTGCCCCCAGATTGTATTTATAGC 
5555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555 NM:i:1 ms:i:2994 AS:i:2605 nn:i:0 ts:A:+ tp:A:P cm:i:962 s1:i:2863 s2:i:186 de:f:0.0003 rl:i:30 MD:Z:2483C513 +rpl11_fsm 0 chr1 23692612 60 26S148M1047N107M746N132M1006N111M435N69M3S * 0 0 
CCTTCTCTCCTCCTCCATCATGCGCAGATCAAGGTGAAAAGGAGAACCCCATGCGGGAACTTCGCATCCGCAAACTCTGTCTCAACATCTGTGTTGGGGAGAGTGGAGACAGACTGACGCGAGCAGCCAAGGTGTTGGAGCAGCTCACAGGGCAGACCCCTGTGTTTTCCAAAGCTAGATACACTGTCAGATCCTTTGGCATCCGGAGAAATGAAAAGATTGCTGTCCACTGCACAGTTCGAGGGGCCAAGGCAGAAGAAATCTTGGAGAAGGGTCTAAAGGTGCGGGAGTATGAGTTAAGAAAAAACAACTTCTCAGATACTGGAAACTTTGGTTTTGGGATCCAGGAACACATCGATCTGGGTATCAAATATGACCCAAGCATTGGTATCTACGGCCTGGACTTCTATGTGGTGCTGGGTAGGCCAGGTTTCAGCATCGCAGACAAGAAGCGCAGGACAGGCTGCATTGGGGCCAAACACAGAATCAGCAAAGAGGAGGCCATGCGCTGGTTCCAGCAGAAGTATGATGGGATCATCCTTCCTGGCAAATAAATTCCCGTTTCTATCCAAAAGAGCAATAAAAAGTTTTCATTG 55555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555 NM:i:0 ms:i:567 AS:i:439 nn:i:0 ts:A:+ tp:A:P cm:i:173 s1:i:523 s2:i:251 de:f:0 rl:i:0 MD:Z:567 +rpl11_ism 0 chr1 23693834 60 80M746N132M1006N111M435N69M3S * 0 0 TCCGGAGAAATGAAAAGATTGCTGTCCACTGCACAGTTCGAGGGGCCAAGGCAGAAGAAATCTTGGAGAAGGGTCTAAAGGTGCGGGAGTATGAGTTAAGAAAAAACAACTTCTCAGATACTGGAAACTTTGGTTTTGGGATCCAGGAACACATCGATCTGGGTATCAAATATGACCCAAGCATTGGTATCTACGGCCTGGACTTCTATGTGGTGCTGGGTAGGCCAGGTTTCAGCATCGCAGACAAGAAGCGCAGGACAGGCTGCATTGGGGCCAAACACAGAATCAGCAAAGAGGAGGCCATGCGCTGGTTCCAGCAGAAGTATGATGGGATCATCCTTCCTGGCAAATAAATTCCCGTTTCTATCCAAAAGAGCAATAAAAAGTTTTCATTG 55555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555 NM:i:0 ms:i:392 AS:i:296 nn:i:0 ts:A:+ tp:A:P cm:i:119 s1:i:359 s2:i:192 de:f:0 rl:i:0 MD:Z:392 From 1e77c98e35c35414a93f9abc743b7312501130c2 Mon Sep 17 00:00:00 2001 From: fairliereese Date: Mon, 9 Oct 2023 16:02:15 -0700 Subject: [PATCH 28/31] added index for test input bam file --- .../hl60_1_1_subset_remapped_sorted.bam.bai | Bin 0 -> 13296 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 testing_suite/input_files/readthrough/hl60_1_1_subset_remapped_sorted.bam.bai diff --git a/testing_suite/input_files/readthrough/hl60_1_1_subset_remapped_sorted.bam.bai b/testing_suite/input_files/readthrough/hl60_1_1_subset_remapped_sorted.bam.bai new file mode 100644 index 0000000000000000000000000000000000000000..9e1c7e0235c254aaf8fdd6b1ea24313bd3134fee GIT binary patch literal 13296 zcmeI$u?@m75CzZ+f(~eDsA!l2ISFbfqDBV-H)Vg-CyCqze!^diMh9NWd$`4u@H zPQxQ2=Iy@CB8JvSte4qN``$k{ef)mu?-9Fgejl$9ZArVn$5-#MPjhaMxo_=P`%H_R zQ)~5^hCI}PI&cm+zyS_$fCC)h00%h00S<701048g2kIZ7zSprF Date: Thu, 12 Oct 2023 10:18:35 -0700 Subject: [PATCH 29/31] added check for reference lengths == 0 in get_overlap --- src/talon/talon.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/talon/talon.py b/src/talon/talon.py index b94626e..1f688fc 100644 --- a/src/talon/talon.py +++ b/src/talon/talon.py @@ -802,7 +802,10 @@ def get_overlap(a, b): """ overlap = max(0, min(a[1], b[1]) - 
                                 max(a[0], b[0]) + 1)
     ref_len = abs(b[1] - b[0])
-    perc_overlap = (overlap / ref_len) * 100
+    if ref_len != 0:
+        perc_overlap = (overlap / ref_len) * 100
+    else:
+        perc_overlap = None

     return overlap, perc_overlap


From e9ca9675cc38dc453fa1e2398100c157c79f52b5 Mon Sep 17 00:00:00 2001
From: fairliereese
Date: Mon, 16 Oct 2023 12:29:02 -0700
Subject: [PATCH 30/31] removed pybedtools as a dependency

---
 .travis.yml      | 1 -
 requirements.txt | 1 -
 setup.py         | 1 -
 tox.ini          | 1 -
 4 files changed, 4 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index ef002ad..0b88f89 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -7,7 +7,6 @@ python:

 install:
   - sudo apt update && sudo apt install bedtools
-  - pip install pybedtools
   - pip install tox

 script:
diff --git a/requirements.txt b/requirements.txt
index 404beb3..c2f1349 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,3 @@
 pandas
 pyfaidx
 pysam==0.15.4
-pybedtools
diff --git a/setup.py b/setup.py
index 943debb..94fcce9 100644
--- a/setup.py
+++ b/setup.py
@@ -52,7 +52,6 @@
         "pyranges",
         "bamread>=0.0.11",
         "pysam>=0.15.4",
-        "pybedtools",
         "pyfaidx",
         "scanpy"
     ],
diff --git a/tox.ini b/tox.ini
index 4bb6066..67c93d2 100644
--- a/tox.ini
+++ b/tox.ini
@@ -4,7 +4,6 @@ envlist=py3

 [testenv]
 deps=pytest
-     pybedtools
 whitelist_externals = make
 commands =
     make -C testing_suite test

From f6164972c4cfcecded81f59d283e89c5323643e9 Mon Sep 17 00:00:00 2001
From: fairliereese
Date: Mon, 16 Oct 2023 12:44:02 -0700
Subject: [PATCH 31/31] added python <3.8 requirement, as higher versions
 change how multiprocessing works

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 94fcce9..5de86d0 100644
--- a/setup.py
+++ b/setup.py
@@ -46,7 +46,7 @@
         "License :: OSI Approved :: MIT License",
         "Topic :: Scientific/Engineering :: Bio-Informatics"
     ],
-    python_requires=">=3.6",
+    python_requires=">=3.6,<3.8",
     install_requires=[
         "pandas",
         "pyranges",
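
For reference, the get_overlap() change in PATCH 29/31 guards the percent-overlap calculation against zero-length reference intervals, which previously raised a ZeroDivisionError. The sketch below reproduces the patched function in isolation so its behavior can be checked directly; the docstring is paraphrased and the example coordinates are made up for illustration only:

def get_overlap(a, b):
    """Compute the absolute overlap between closed intervals a and b,
    along with the overlap as a percentage of the reference interval b.
    After PATCH 29, perc_overlap is None when b spans zero length."""
    overlap = max(0, min(a[1], b[1]) -
                     max(a[0], b[0]) + 1)
    ref_len = abs(b[1] - b[0])
    if ref_len != 0:
        perc_overlap = (overlap / ref_len) * 100
    else:
        perc_overlap = None
    return overlap, perc_overlap

# Ordinary case: 6 bp of overlap covering 40% of the reference interval
print(get_overlap((10, 20), (15, 30)))  # (6, 40.0)

# Zero-length reference: previously a ZeroDivisionError, now (1, None)
print(get_overlap((10, 20), (15, 15)))

One consequence for callers: perc_overlap can now be None as well as a float, so any downstream code that compares it numerically needs to tolerate the None case.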