From f1b21363d4e5f354e7791c252075759762c09b20 Mon Sep 17 00:00:00 2001 From: fairliereese Date: Thu, 21 Sep 2023 12:30:42 -0700 Subject: [PATCH 01/31] update --- src/talon/post/create_anndata_from_database.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/talon/post/create_anndata_from_database.py b/src/talon/post/create_anndata_from_database.py index 515eaa6..f3ebca3 100644 --- a/src/talon/post/create_anndata_from_database.py +++ b/src/talon/post/create_anndata_from_database.py @@ -30,33 +30,27 @@ def getOptions(): parser.add_option("--db", dest = "database", help = "TALON database", metavar = "FILE", type = "string") - parser.add_option("--annot", "-a", dest = "annot", help = """Which annotation version to use. Will determine which annotation transcripts are considered known or novel relative to. Note: must be in the TALON database.""", type = "string") - parser.add_option("--pass_list", dest = "pass_list", help = "Pass list file of transcripts to include in the \ output. First column should be TALON gene ID, \ second column should be TALON transcript ID", metavar = "FILE", type = "string", default = None) - parser.add_option("--build", "-b", dest = "build", help = "Genome build to use. Note: must be in the TALON database.", type = "string") - parser.add_option('--gene', dest='gene_level', help='Output AnnData on the gene level rather than the transcript', action='store_true') - parser.add_option("--datasets", "-d", dest = "dataset_file", help = """Optional: A file indicating which datasets should be included (one dataset name per line). Default is to include all datasets.""", metavar = "FILE", type = "string", default = None) - parser.add_option("--o", dest = "ofile", help = "Output file name", metavar = "FILE", type = "string") From 417565a9fce866c065de59a357e14a3b569d2596 Mon Sep 17 00:00:00 2001 From: fairliereese Date: Thu, 21 Sep 2023 12:32:04 -0700 Subject: [PATCH 02/31] added untested functionality to distinguish fusion / readthrough transcripts --- src/talon/post/get_read_annotations.py | 102 +++++++++++--------- src/talon/talon.py | 126 ++++++++++++++++--------- 2 files changed, 137 insertions(+), 91 deletions(-) diff --git a/src/talon/post/get_read_annotations.py b/src/talon/post/get_read_annotations.py index dc18748..c7d93e4 100644 --- a/src/talon/post/get_read_annotations.py +++ b/src/talon/post/get_read_annotations.py @@ -1,7 +1,7 @@ # TALON: Techonology-Agnostic Long Read Analysis Pipeline # Author: Dana Wyman # ----------------------------------------------------------------------------- -# get_read_annotations.py is a utility that queries a TALON +# get_read_annotations.py is a utility that queries a TALON # database in order to get read-specific annotation information. import argparse @@ -34,9 +34,9 @@ def get_args(): def fetch_reads(database, build, tmp_file = None, datasets = None): """ Performs database query to fetch location and gene/transcript assignment - info for each long read in the specified datasets. + info for each long read in the specified datasets. If tmp_file is set to None (default), then the function will return - the query results in a list of lists. If an alternate value is provided, + the query results in a list of lists. 
If an alternate value is provided, then the results will be written to a tmp file of that name.""" if datasets != None: @@ -50,8 +50,8 @@ def fetch_reads(database, build, tmp_file = None, datasets = None): conn.row_factory = sqlite3.Row cursor = conn.cursor() - query = """ SELECT os.read_name, - os.dataset, + query = """ SELECT os.read_name, + os.dataset, loc1.genome_build, os.gene_ID as gene_ID, os.transcript_ID as transcript_ID, @@ -66,17 +66,17 @@ def fetch_reads(database, build, tmp_file = None, datasets = None): os.fraction_As, os.custom_label, os.allelic_label, - os.start_support, + os.start_support, os.end_support FROM observed as os - LEFT JOIN location as loc1 ON - loc1.location_ID = os.start_vertex - LEFT JOIN location as loc2 ON + LEFT JOIN location as loc1 ON + loc1.location_ID = os.start_vertex + LEFT JOIN location as loc2 ON loc2.location_ID = os.end_vertex LEFT JOIN genes ON genes.gene_ID = os.gene_ID - LEFT JOIN transcripts ON + LEFT JOIN transcripts ON transcripts.transcript_ID = os.transcript_ID - WHERE loc1.genome_build = '$build' + WHERE loc1.genome_build = '$build' AND loc2.genome_build = '$build' """ query = Template(query + dataset_str) try: @@ -86,7 +86,7 @@ def fetch_reads(database, build, tmp_file = None, datasets = None): raise RuntimeError("Problem with reads database query") if tmp_file != None: - o = open(tmp_file, 'w') + o = open(tmp_file, 'w') else: reads = [] @@ -113,16 +113,16 @@ def fetch_reads(database, build, tmp_file = None, datasets = None): read_end = entry["end_vertex_pos"] - TTS_diff else: raise ValueError("Unrecognized strand value: " + str(strand)) - + # Create entry for output out_read = (entry["read_name"], entry["dataset"], entry["genome_build"], entry["gene_ID"], - entry["transcript_ID"], entry["chrom"], + entry["transcript_ID"], entry["chrom"], read_start, read_end, strand, entry["n_exons"], entry["read_length"], entry["fraction_As"], entry["custom_label"], entry["allelic_label"], entry["start_support"], entry["end_support"]) - + if tmp_file != None: o.write("\t".join([ str(x) for x in out_read ]) + "\n") else: @@ -131,7 +131,7 @@ def fetch_reads(database, build, tmp_file = None, datasets = None): # Return results or close file if count == 0: - raise ValueError(("No reads detected. Make sure your dataset names are " + raise ValueError(("No reads detected. Make sure your dataset names are " "correct.")) if tmp_file != None: @@ -161,6 +161,13 @@ def get_gene_novelty(database): for entry in cursor: gene_novelty[entry[0]] = "Antisense" + # Fetch fusion genes + cursor.execute("""SELECT ID FROM gene_annotations + WHERE attribute = "fusion_novel" + AND value = "TRUE";""") + for entry in cursor: + gene_novelty[entry[0]] = "Fusion" + # Fetch intergenic genes cursor.execute("""SELECT ID FROM gene_annotations WHERE attribute = "intergenic_novel" @@ -172,7 +179,7 @@ def get_gene_novelty(database): def get_transcript_novelty(database): """ Given a database, get the novelty status of each transcript. 
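
        Each novelty getter in this module follows the same attribute/value
        lookup against the annotation tables; a minimal sketch of the shared
        pattern, with a hypothetical database path:

            import sqlite3

            def ids_with_attribute(db, table, attribute):
                # e.g. table = "gene_annotations", attribute = "fusion_novel",
                # or table = "transcript_annotations", attribute = "fusion_transcript"
                with sqlite3.connect(db) as conn:
                    cursor = conn.cursor()
                    cursor.execute("SELECT ID FROM {} WHERE attribute = ? "
                                   "AND value = 'TRUE';".format(table), (attribute,))
                    return [row[0] for row in cursor.fetchall()]

            fusion_genes = ids_with_attribute("talon.db", "gene_annotations",
                                              "fusion_novel")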
""" - + transcript_novelty = {} with sqlite3.connect(database) as conn: conn.row_factory = sqlite3.Row @@ -184,59 +191,66 @@ def get_transcript_novelty(database): AND value = "KNOWN";""") for entry in cursor: transcript_novelty[entry[0]] = "Known" - + # Fetch ISM transcripts cursor.execute("""SELECT ID FROM transcript_annotations WHERE attribute = "ISM_transcript" AND value = "TRUE";""") for entry in cursor: transcript_novelty[entry[0]] = "ISM" - + # Fetch NIC transcripts cursor.execute("""SELECT ID FROM transcript_annotations WHERE attribute = "NIC_transcript" AND value = "TRUE";""") for entry in cursor: transcript_novelty[entry[0]] = "NIC" - + # Fetch NNC transcripts cursor.execute("""SELECT ID FROM transcript_annotations WHERE attribute = "NNC_transcript" AND value = "TRUE";""") for entry in cursor: transcript_novelty[entry[0]] = "NNC" - + # Fetch antisense transcripts cursor.execute("""SELECT ID FROM transcript_annotations WHERE attribute = "antisense_transcript" AND value = "TRUE";""") for entry in cursor: transcript_novelty[entry[0]] = "Antisense" - + # Fetch intergenic transcripts cursor.execute("""SELECT ID FROM transcript_annotations WHERE attribute = "intergenic_transcript" AND value = "TRUE";""") for entry in cursor: transcript_novelty[entry[0]] = "Intergenic" - + # Fetch genomic transcripts cursor.execute("""SELECT ID FROM transcript_annotations WHERE attribute = "genomic_transcript" AND value = "TRUE";""") for entry in cursor: transcript_novelty[entry[0]] = "Genomic" - + + # Fetch fusion transcripts + cursor.execute("""SELECT ID FROM transcript_annotations + WHERE attribute = "fusion_transcript" + AND value = "TRUE";""") + for entry in cursor: + transcript_novelty[entry[0]] = "Fusion" + return transcript_novelty def get_ISM_novelty(database): """ Given a database, get the ISM subtype of each ISM transcript. """ - + all_ISMs = set() prefix_ISMs = set() suffix_ISMs = set() ISM_novelty = {} - + with sqlite3.connect(database) as conn: conn.row_factory = sqlite3.Row cursor = conn.cursor() @@ -275,7 +289,7 @@ def get_ISM_novelty(database): return ISM_novelty -def get_gene_annotations(database): +def get_gene_annotations(database): """ Create a dictionary linking each TALON gene ID to its human-readable name and accession ID """ @@ -286,13 +300,13 @@ def get_gene_annotations(database): conn.row_factory = sqlite3.Row cursor = conn.cursor() - cursor.execute("""SELECT ID, ga.value FROM gene_annotations as ga + cursor.execute("""SELECT ID, ga.value FROM gene_annotations as ga WHERE attribute = "gene_name";""") for entry in cursor: gene_name[entry["ID"]] = entry["value"] cursor.execute("""SELECT ID, ga.value FROM gene_annotations as ga - WHERE attribute = "gene_id";""") + WHERE attribute = "gene_id";""") for entry in cursor: gene_ID[entry["ID"]] = entry["value"] @@ -347,8 +361,8 @@ def make_read_annot_file(database, build, outprefix, datasets = "all"): 22. Start support (external assay) 23. End support (external assay) - By default, reads from all datasets in the database are included, but - this can be modified by supplying a list/tuple of dataset names to the + By default, reads from all datasets in the database are included, but + this can be modified by supplying a list/tuple of dataset names to the datasets parameter. 
""" tmp_read_file = outprefix + "_reads.tmp" @@ -356,19 +370,19 @@ def make_read_annot_file(database, build, outprefix, datasets = "all"): # Make annotation dicts gene_names, gene_IDs = get_gene_annotations(database) - transcript_names, transcript_IDs = get_transcript_annotations(database) + transcript_names, transcript_IDs = get_transcript_annotations(database) # Make novelty dicts gene_novelty = get_gene_novelty(database) transcript_novelty = get_transcript_novelty(database) - ISM_novelty = get_ISM_novelty(database) + ISM_novelty = get_ISM_novelty(database) fname = outprefix + "_talon_read_annot.tsv" o = open(fname, 'w') - colnames = [ "read_name", "dataset", "genome_build", "chrom", + colnames = [ "read_name", "dataset", "genome_build", "chrom", "read_start", "read_end", "strand", "n_exons", "read_length", "gene_ID", "transcript_ID", "annot_gene_id", "annot_transcript_id", - "annot_gene_name", "annot_transcript_name", "gene_novelty", + "annot_gene_name", "annot_transcript_name", "gene_novelty", "transcript_novelty", "ISM_subtype", "fraction_As", "custom_label", "allelic_label", "start_support", "end_support"] o.write("\t".join(colnames) + "\n") @@ -396,8 +410,8 @@ def make_read_annot_file(database, build, outprefix, datasets = "all"): if curr_transcript_novelty == "ISM": curr_ISM_novelty = ISM_novelty[transcript_ID] else: - curr_ISM_novelty = "None" - + curr_ISM_novelty = "None" + # Get annotation info try: annot_gene_id = gene_IDs[gene_ID] @@ -413,7 +427,7 @@ def make_read_annot_file(database, build, outprefix, datasets = "all"): annot_transcript_id = "None" try: annot_transcript_name = transcript_names[transcript_ID] - except: + except: annot_transcript_name = "None" gene_ID = str(gene_ID) @@ -421,9 +435,9 @@ def make_read_annot_file(database, build, outprefix, datasets = "all"): o.write("\t".join([read_name, dataset, genome_build, chrom, read_start, read_end, strand, n_exons, read_length, gene_ID, transcript_ID, - annot_gene_id, annot_transcript_id, - annot_gene_name, annot_transcript_name, - curr_gene_novelty, curr_transcript_novelty, + annot_gene_id, annot_transcript_id, + annot_gene_name, annot_transcript_name, + curr_gene_novelty, curr_transcript_novelty, curr_ISM_novelty, fraction_As, custom_label, allelic_label, start_support, end_support]) + "\n") @@ -467,10 +481,10 @@ def main(): raise ValueError("Database file '%s' does not exist!" % database) if datasets != None: - datasets = datasets.split(",") - + datasets = datasets.split(",") + make_read_annot_file(database, build, outprefix, datasets = datasets) - + if __name__ == '__main__': main() diff --git a/src/talon/talon.py b/src/talon/talon.py index e9e0a5a..bc6058e 100644 --- a/src/talon/talon.py +++ b/src/talon/talon.py @@ -950,30 +950,13 @@ def process_NIC(chrom, positions, strand, edge_IDs, vertex_IDs, transcript_dict, this, look up each vertex in the vertex_2_gene dict, and keep track of all same-strand genes. 
""" - gene_matches = [] start_end_info = {} - for vertex in vertex_IDs: - if vertex in vertex_2_gene: - curr_matches = vertex_2_gene[vertex] - - # Make sure the gene is on the correct strand - gene_matches += [x[0] - for x in list(curr_matches) if x[1] == strand] - - - # Now count up how often we see each gene - gene_tally = dict((x, gene_matches.count(x)) for x in set(gene_matches)) - - # print(gene_matches) - # print(gene_tally) - # TODO: deal with fusions - - # For the main assignment, pick the gene that is observed the most - if len(gene_tally) == 0: - return None, None, [], None - - gene_ID = max(gene_tally, key=gene_tally.get) + gene_ID, fusion = find_gene_match_on_vertex_basis(vertex_IDs, + strand, + vertex_2_gene) + if gene_ID == None: + return None, None, [], None, fusion # Get matches for the ends start_vertex, start_exon, start_novelty, known_start, diff_5p = process_5p(chrom, @@ -1009,35 +992,60 @@ def process_NIC(chrom, positions, strand, edge_IDs, vertex_IDs, transcript_dict, transcript_ID = novel_transcript["transcript_ID"] novelty = [(transcript_ID, run_info.idprefix, "TALON", "NIC_transcript", "TRUE")] + fusion = False - return gene_ID, transcript_ID, novelty, start_end_info + return gene_ID, transcript_ID, novelty, start_end_info, fusion def find_gene_match_on_vertex_basis(vertex_IDs, strand, vertex_2_gene): """ Use vertices in a transcript to try to pinpoint the gene it belongs to. + + Parameters: + vertex_IDs (list of int): List of vertices in the read + strand (str): Strand of read + vertex_2_gene (dict): Dict. w/ keys = vertices, items = gene IDs of genes + that use this vertex + + Returns: + gene_ID (str or None): Gene ID of matching gene, or None if novel gene + needs to be created + fusion (bool): Whether gene read is from might be fusion / read through """ gene_matches = [] + n_gene_matches = [] + for vertex in vertex_IDs: if vertex in vertex_2_gene: curr_matches = vertex_2_gene[vertex] - # Make sure the gene is on the correct strand - gene_matches += [x[0] for x in curr_matches if x[1] == strand] + # enforce same strandedness + matches = [m for m in list(curr_matches) if m[1] == strand] - if len(gene_matches) == 0: - return None + gene_matches += [x[0] for x in list(matches)] - # Now count up how often we see each gene + # how many genes have this splice site? 
+            n_gene_matches.append(len(matches))
+
+    # how many splice sites are from each gene
     gene_tally = dict((x, gene_matches.count(x)) for x in set(gene_matches))

-    # print(gene_matches)
-    # print(gene_tally)
-    # TODO: deal with fusions
+    # no shared splice junctions
+    if len(gene_matches) == 0:
+        return None, False
+
+    # if we hit more than one gene, but no single splice site is shared
+    # between the gene hits (i.e. the matched genes do not overlap),
+    # the read likely spans multiple genes and we need to make a new
+    # fusion / readthrough gene
+    elif max(n_gene_matches) <= 1 and len(gene_tally) > 1:
+        return None, True

     # For the main assignment, pick the gene that is observed the most
-    gene_ID = max(gene_tally, key=gene_tally.get)
+    else:
+        gene_ID = max(gene_tally, key=gene_tally.get)
+        fusion = False

-    return gene_ID
+    return gene_ID, fusion


 def process_NNC(chrom, positions, strand, edge_IDs, vertex_IDs, transcript_dict,
@@ -1047,10 +1055,10 @@ def process_NNC(chrom, positions, strand, edge_IDs, vertex_IDs, transcript_dict,
     novelty = []
     start_end_info = {}

-    gene_ID = find_gene_match_on_vertex_basis(
+    gene_ID, fusion = find_gene_match_on_vertex_basis(
         vertex_IDs, strand, vertex_2_gene)
     if gene_ID == None:
-        return None, None, [], None
+        return None, None, [], None, fusion

     # Get matches for the ends
     start_vertex, start_exon, start_novelty, known_start, diff_5p = process_5p(chrom,
@@ -1085,8 +1093,9 @@ def process_NNC(chrom, positions, strand, edge_IDs, vertex_IDs, transcript_dict,
     novelty.append((transcript_ID, run_info.idprefix,
                     "TALON", "NNC_transcript", "TRUE"))
+    fusion = False

-    return gene_ID, transcript_ID, novelty, start_end_info
+    return gene_ID, transcript_ID, novelty, start_end_info, fusion


 def process_spliced_antisense(chrom, positions, strand, edge_IDs, vertex_IDs,
@@ -1102,7 +1111,7 @@ def process_spliced_antisense(chrom, positions, strand, edge_IDs, vertex_IDs,
         anti_strand = "-"
     else:
         anti_strand = "+"
-    anti_gene_ID = find_gene_match_on_vertex_basis(vertex_IDs, anti_strand,
+    anti_gene_ID, fusion = find_gene_match_on_vertex_basis(vertex_IDs, anti_strand,
                                                    vertex_2_gene)
     if anti_gene_ID == None:
         return None, None, gene_novelty, transcript_novelty, start_end_info
@@ -1155,7 +1164,8 @@ def process_spliced_antisense(chrom, positions, strand, edge_IDs, vertex_IDs,

 def process_remaining_mult_cases(chrom, positions, strand, edge_IDs, vertex_IDs,
                                  transcript_dict, gene_starts, gene_ends, edge_dict,
-                                 locations, vertex_2_gene, run_info, cursor, tmp_gene):
+                                 locations, vertex_2_gene, run_info, cursor, tmp_gene,
+                                 fusion):
     """ This function is a catch-all for multiexonic transcripts that were not
         FSM, ISM, NIC, NNC, or spliced antisense.
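
        The fusion flag passed in here comes from find_gene_match_on_vertex_basis
        above; a self-contained sketch of that tally heuristic, with hypothetical
        vertex and gene IDs:

            def sketch_gene_match(vertex_IDs, strand, vertex_2_gene):
                # tally same-strand gene hits per vertex, mirroring the logic above
                gene_matches, n_gene_matches = [], []
                for vertex in vertex_IDs:
                    matches = [m for m in vertex_2_gene.get(vertex, set())
                               if m[1] == strand]
                    gene_matches += [m[0] for m in matches]
                    n_gene_matches.append(len(matches))
                if len(gene_matches) == 0:
                    return None, False      # novel gene, not fusion
                tally = {g: gene_matches.count(g) for g in set(gene_matches)}
                if max(n_gene_matches) <= 1 and len(tally) > 1:
                    return None, True       # >1 gene, no shared vertex: fusion
                return max(tally, key=tally.get), False

            # splice sites split between genes 10 and 20 -> (None, True)
            v2g = {1: {(10, "+")}, 2: {(10, "+")}, 3: {(20, "+")}, 4: {(20, "+")}}
            print(sketch_gene_match([1, 2, 3, 4], "+", v2g))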
""" @@ -1199,17 +1209,25 @@ def process_remaining_mult_cases(chrom, positions, strand, edge_IDs, vertex_IDs, start_end_info["vertex_IDs"] = vertex_IDs if gene_ID == None: + + if fusion: + t_nov = 'fusion_transcript' + g_nov = 'fusion_novel' + else: + t_nov = 'intergenic_transcript' + g_nov = 'intergenic_novel' + gene_ID = create_gene(chrom, positions[0], positions[-1], strand, cursor, tmp_gene) gene_novelty.append((gene_ID, run_info.idprefix, "TALON", - "intergenic_novel", "TRUE")) + g_nov, "TRUE")) transcript_ID = create_transcript(chrom, positions[0], positions[-1], gene_ID, edge_IDs, vertex_IDs, transcript_dict)["transcript_ID"] transcript_novelty.append((transcript_ID, run_info.idprefix, "TALON", - "intergenic_transcript", "TRUE")) + t_nov, "TRUE")) elif match_strand != strand: anti_gene_ID = gene_ID @@ -1291,6 +1309,7 @@ def identify_transcript(chrom, positions, strand, cursor, location_dict, edge_di all_exons_known = check_all_exons_known(e_novelty) splice_vertices_known = (sum(v_novelty) == 0) all_exons_novel = (reduce(operator.mul, e_novelty, 1) == 1) + fusion = False # Look for FSM or ISM. if all_SJs_known: @@ -1298,6 +1317,7 @@ def identify_transcript(chrom, positions, strand, cursor, location_dict, edge_di all_matches = search_for_ISM(edge_IDs, transcript_dict) if all_matches != None: # Look for FSM first + print('looking for fsm') gene_ID, transcript_ID, transcript_novelty, start_end_info = process_FSM(chrom, positions, strand, edge_IDs, vertex_IDs, all_matches, @@ -1306,6 +1326,7 @@ def identify_transcript(chrom, positions, strand, cursor, location_dict, edge_di location_dict, run_info) if gene_ID == None: # Now look for ISM + print('looking for ism') gene_ID, transcript_ID, transcript_novelty, start_end_info = process_ISM(chrom, positions, strand, edge_IDs, @@ -1317,7 +1338,8 @@ def identify_transcript(chrom, positions, strand, cursor, location_dict, edge_di run_info) # Look for NIC if gene_ID == None: - gene_ID, transcript_ID, transcript_novelty, start_end_info = process_NIC(chrom, + print('looking for nic') + gene_ID, transcript_ID, transcript_novelty, start_end_info, fusion = process_NIC(chrom, positions, strand, edge_IDs, vertex_IDs, transcript_dict, @@ -1328,7 +1350,8 @@ def identify_transcript(chrom, positions, strand, cursor, location_dict, edge_di # Novel in catalog transcripts have known splice donors and acceptors, # but new connections between them. elif splice_vertices_known and gene_ID == None: - gene_ID, transcript_ID, transcript_novelty, start_end_info = process_NIC(chrom, + print('looking for nic (again?)') + gene_ID, transcript_ID, transcript_novelty, start_end_info, fusion = process_NIC(chrom, positions, strand, edge_IDs, vertex_IDs, transcript_dict, @@ -1337,7 +1360,8 @@ def identify_transcript(chrom, positions, strand, cursor, location_dict, edge_di vertex_2_gene, run_info) # Antisense transcript with splice junctions matching known gene - if splice_vertices_known and gene_ID == None: + if splice_vertices_known and gene_ID == None and not fusion: + print('looking for spliced antisese') gene_ID, transcript_ID, gene_novelty, transcript_novelty, start_end_info = \ process_spliced_antisense(chrom, positions, strand, edge_IDs, @@ -1351,8 +1375,9 @@ def identify_transcript(chrom, positions, strand, cursor, location_dict, edge_di # Novel not in catalog transcripts contain new splice donors/acceptors # and contain at least one splice junction. 
- elif not(splice_vertices_known): - gene_ID, transcript_ID, transcript_novelty, start_end_info = process_NNC(chrom, + elif not(splice_vertices_known) and not fusion: + print('lookign for NNCs') + gene_ID, transcript_ID, transcript_novelty, start_end_info, fusion = process_NNC(chrom, positions, strand, edge_IDs, vertex_IDs, transcript_dict, @@ -1361,6 +1386,7 @@ def identify_transcript(chrom, positions, strand, cursor, location_dict, edge_di vertex_2_gene, run_info) # Transcripts that don't match the previous categories end up here if gene_ID == None: + print('looking for this other stuff') gene_ID, transcript_ID, gene_novelty, transcript_novelty, start_end_info = \ process_remaining_mult_cases(chrom, positions, strand, edge_IDs, @@ -1369,7 +1395,12 @@ def identify_transcript(chrom, positions, strand, cursor, location_dict, edge_di gene_starts, gene_ends, edge_dict, location_dict, vertex_2_gene, run_info, - cursor, tmp_gene) + cursor, tmp_gene, + fusion) + + print(gene_ID) + print(gene_novelty) + print(transcript_novelty) # Add all novel vertices to vertex_2_gene now that we have the gene ID vertex_IDs = start_end_info["vertex_IDs"] @@ -1398,7 +1429,6 @@ def identify_transcript(chrom, positions, strand, cursor, location_dict, edge_di "transcript_name", talon_transcript_name)) transcript_novelty.append((transcript_ID, run_info.idprefix, "TALON", "transcript_id", talon_transcript_name)) - # Add annotation entries for any novel exons exon_novelty = [] exons = edge_IDs[::2] @@ -2466,6 +2496,8 @@ def annotate_read(sam_record: pysam.AlignedSegment, cursor, run_info, """ # Parse attributes to determine the chromosome, positions, and strand of the transcript read_ID = sam_record.query_name + print() + print(read_ID) if not run_info.use_cb_tag: dataset = sam_record.get_tag("RG") else: From 0519f32d9681b26f455f5f8bbccf4f011e4902a0 Mon Sep 17 00:00:00 2001 From: fairliereese Date: Thu, 21 Sep 2023 12:32:54 -0700 Subject: [PATCH 03/31] added tests for option to filter known transcripts for all known, regardless of whether observed --- testing_suite/filtering/optparse_mock_filt.py | 1 + testing_suite/filtering/test_fetch_known.py | 27 ++++++++++++++++--- 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/testing_suite/filtering/optparse_mock_filt.py b/testing_suite/filtering/optparse_mock_filt.py index ef5c7cb..75aebe0 100644 --- a/testing_suite/filtering/optparse_mock_filt.py +++ b/testing_suite/filtering/optparse_mock_filt.py @@ -11,3 +11,4 @@ def __init__(self, database, annot, max_frac_A = 0.5, self.outprefix = outprefix self.allow_genomic = allow_genomic self.exclude_ISMs = exclude_ISMs + self.include_annot = False diff --git a/testing_suite/filtering/test_fetch_known.py b/testing_suite/filtering/test_fetch_known.py index 27fcc39..2d883e4 100644 --- a/testing_suite/filtering/test_fetch_known.py +++ b/testing_suite/filtering/test_fetch_known.py @@ -7,36 +7,55 @@ def test_get_known_transcripts_all_datasets(): transcripts when datasets are not speicified """ database = "scratch/filter/test.db" - known = filt.get_known_transcripts(database, "toy", datasets = None) + include_annot = False + known = filt.get_known_transcripts(database, "toy", + include_annot, + datasets = None) + assert list(known.gene_ID) == [1, 1] + assert list(known.transcript_ID) == [1, 2] + +def test_get_known_transcripts_dataset_1_include_annot(): + """ Make sure the get_known_transcripts function returns all known + transcripts with the include_annot function """ + + database = "scratch/filter/test.db" + include_annot 
= True
+    known = filt.get_known_transcripts(database, "toy",
+                                       include_annot,
+                                       datasets = None)
     assert list(known.gene_ID) == [1, 1]
     assert list(known.transcript_ID) == [1, 2]

 def test_get_known_transcripts_specific_dataset():
-    """ Now make sure the correct transcript is returned when the dataset is
+    """ Now make sure the correct transcript is returned when the dataset is
         specified. """

     database = "scratch/filter/test.db"
+    include_annot = False

     # Both datasets
-    known = filt.get_known_transcripts(database, "toy",
+    known = filt.get_known_transcripts(database, "toy",
+                                       include_annot,
                                        datasets = ["dataset_1", "dataset_2"])
     assert list(known.gene_ID) == [1, 1]
     assert list(known.transcript_ID) == [1, 2]

     # Dataset 1
     known = filt.get_known_transcripts(database, "toy",
+                                       include_annot,
                                        datasets = ["dataset_1"])
     assert list(known.iloc[0]) == [1, 1]
     assert len(known) == 1

     # Dataset 2
     known = filt.get_known_transcripts(database, "toy",
+                                       include_annot,
                                        datasets = ["dataset_2"])
     assert list(known.iloc[0]) == [1, 2]
     assert len(known) == 1

     # Dataset 3
     known = filt.get_known_transcripts(database, "toy",
+                                       include_annot,
                                        datasets = ["dataset_3"])
     assert len(known) == 0
-

From c64fd572404cbe61b4dcf396750f0b13cb059643 Mon Sep 17 00:00:00 2001
From: fairliereese
Date: Thu, 21 Sep 2023 12:34:18 -0700
Subject: [PATCH 04/31] updated existing tests to support new function
 signatures

---
 testing_suite/test_NIC_identification.py      | 32 +++++++++----------
 testing_suite/test_NNC_identification.py      |  3 +-
 .../test_find_gene_match_on_vertex_basis.py   |  9 ++++--
 .../test_process_remaining_mult_cases.py      | 22 ++++++++-----
 4 files changed, 38 insertions(+), 28 deletions(-)

diff --git a/testing_suite/test_NIC_identification.py b/testing_suite/test_NIC_identification.py
index 33eb664..92e10fe 100644
--- a/testing_suite/test_NIC_identification.py
+++ b/testing_suite/test_NIC_identification.py
@@ -6,7 +6,7 @@ class TestIdentifyNIC(object):

     def test_NIC_match(self):
-        """ Example where the transcript is an NIC match to an existing one by
+        """ Example where the transcript is an NIC match to an existing one by
            virtue of skipping an exon.
""" conn, cursor = get_db_cursor() @@ -28,22 +28,23 @@ def test_NIC_match(self): strand = "+" v_novelty = [0, 0] - gene_ID, transcript_ID, novelty, start_end_info = talon.process_NIC(chrom, - positions, - strand, edge_IDs, + gene_ID, transcript_ID, novelty, start_end_info, fusion = talon.process_NIC(chrom, + positions, + strand, edge_IDs, vertex_IDs, transcript_dict, - gene_starts, gene_ends, - edge_dict, location_dict, + gene_starts, gene_ends, + edge_dict, location_dict, vertex_2_gene, run_info) correct_gene_ID = fetch_correct_ID("TG1", "gene", cursor) assert gene_ID == correct_gene_ID assert start_end_info["vertex_IDs"] == [1,2,5,6] assert transcript_dict[frozenset(start_end_info["edge_IDs"])] != None + assert fusion == False conn.close() def test_antisense(self): - """ Example where the vertices are known but there is no same-strand + """ Example where the vertices are known but there is no same-strand match """ conn, cursor = get_db_cursor() @@ -64,7 +65,7 @@ def test_antisense(self): chrom = "chr1" start = 1000 end = 1 - edge_IDs = [ talon.edge_counter.value() + 1 ] + edge_IDs = [ talon.edge_counter.value() + 1 ] positions = [ 1000, 900, 100, 1] vertex_IDs = [ 5, 2 ] strand = "-" @@ -73,16 +74,16 @@ def test_antisense(self): # Find antisense match gene_ID, transcript_ID, gene_novelty, transcript_novelty, start_end_info = \ - talon.process_spliced_antisense(chrom, positions, - strand, edge_IDs, - vertex_IDs, + talon.process_spliced_antisense(chrom, positions, + strand, edge_IDs, + vertex_IDs, transcript_dict, - gene_starts, - gene_ends, - edge_dict, locations, + gene_starts, + gene_ends, + edge_dict, locations, vertex_2_gene, run_info, cursor, "temp_gene") - #anti_gene_ID = talon.find_gene_match_on_vertex_basis(vertex_IDs, + #anti_gene_ID = talon.find_gene_match_on_vertex_basis(vertex_IDs, # anti_strand, # vertex_2_gene) @@ -92,4 +93,3 @@ def test_antisense(self): assert start_end_info["vertex_IDs"] == [6, 5, 2, 1] conn.close() - diff --git a/testing_suite/test_NNC_identification.py b/testing_suite/test_NNC_identification.py index 87c426e..63f1cd6 100644 --- a/testing_suite/test_NNC_identification.py +++ b/testing_suite/test_NNC_identification.py @@ -28,7 +28,7 @@ def test_NNC_match(self): strand = "+" v_novelty = [0, 0] - gene_ID, transcript_ID, transcript_novelty, start_end_info = talon.process_NNC(chrom, + gene_ID, transcript_ID, transcript_novelty, start_end_info, fusion = talon.process_NNC(chrom, positions, strand, edge_IDs, vertex_IDs, transcript_dict, @@ -40,4 +40,5 @@ def test_NNC_match(self): assert gene_ID == correct_gene_ID assert start_end_info["vertex_IDs"] == [1] + vertex_IDs + [6] assert transcript_dict[frozenset(start_end_info["edge_IDs"])] != None + assert fusion == False conn.close() diff --git a/testing_suite/test_find_gene_match_on_vertex_basis.py b/testing_suite/test_find_gene_match_on_vertex_basis.py index 730e482..b7c625e 100644 --- a/testing_suite/test_find_gene_match_on_vertex_basis.py +++ b/testing_suite/test_find_gene_match_on_vertex_basis.py @@ -19,10 +19,11 @@ def test_perfect_match(self): vertex_IDs = (1, 2, 3, 4, 5, 6) strand = "+" - gene_ID = talon.find_gene_match_on_vertex_basis(vertex_IDs, strand, vertex2gene) + gene_ID, fusion = talon.find_gene_match_on_vertex_basis(vertex_IDs, strand, vertex2gene) correct_gene_ID = fetch_correct_ID("TG1", "gene", cursor) assert gene_ID == correct_gene_ID + assert fusion == False conn.close() def test_NNC_type_match(self): @@ -38,10 +39,11 @@ def test_NNC_type_match(self): vertex_IDs = (1, 200, 3, 4, 5, 6) strand = 
"+" - gene_ID = talon.find_gene_match_on_vertex_basis(vertex_IDs, strand, vertex2gene) + gene_ID, fusion = talon.find_gene_match_on_vertex_basis(vertex_IDs, strand, vertex2gene) correct_gene_ID = fetch_correct_ID("TG1", "gene", cursor) assert gene_ID == correct_gene_ID + assert fusion == False conn.close() def test_no_match(self): @@ -56,7 +58,8 @@ def test_no_match(self): vertex_IDs = (1000, 2000, 3000, 4000) strand = "+" - gene_ID = talon.find_gene_match_on_vertex_basis(vertex_IDs, strand, vertex2gene) + gene_ID, fusion = talon.find_gene_match_on_vertex_basis(vertex_IDs, strand, vertex2gene) assert gene_ID == None + assert fusion == False conn.close() diff --git a/testing_suite/test_process_remaining_mult_cases.py b/testing_suite/test_process_remaining_mult_cases.py index a912741..cc63516 100644 --- a/testing_suite/test_process_remaining_mult_cases.py +++ b/testing_suite/test_process_remaining_mult_cases.py @@ -30,16 +30,18 @@ def test_intergenic(self): edge_IDs = [ talon.edge_counter.value() + 1, talon.edge_counter.value() + 2 ] vertex_IDs = [ talon.vertex_counter.value() + 1, talon.vertex_counter.value() + 2 ] strand = "+" + fusion = False gene_ID, transcript_ID, gene_novelty, transcript_novelty, start_end_info = \ - talon.process_remaining_mult_cases(chrom, positions, - strand, edge_IDs, - vertex_IDs, + talon.process_remaining_mult_cases(chrom, positions, + strand, edge_IDs, + vertex_IDs, transcript_dict, - gene_starts, gene_ends, + gene_starts, gene_ends, edge_dict, location_dict, - vertex_2_gene, run_info, - cursor, "temp_gene") + vertex_2_gene, run_info, + cursor, "temp_gene", + fusion) assert gene_ID == correct_gene_ID assert transcript_dict[frozenset(start_end_info["edge_IDs"])] != None @@ -72,6 +74,7 @@ def test_antisense(self): edge_IDs = [ talon.edge_counter.value() + 1, talon.edge_counter.value() + 2 ] vertex_IDs = [ talon.vertex_counter.value() + 1, talon.vertex_counter.value() + 2 ] strand = "-" + fusion = False gene_ID, transcript_ID, gene_novelty, transcript_novelty, start_end_info = \ talon.process_remaining_mult_cases(chrom, positions, @@ -81,7 +84,8 @@ def test_antisense(self): gene_starts, gene_ends, edge_dict, location_dict, vertex_2_gene, run_info, - cursor, "temp_gene") + cursor, "temp_gene", + fusion) assert gene_ID == correct_gene_ID assert transcript_dict[frozenset(start_end_info["edge_IDs"])] != None assert gene_novelty[0][-2] == "antisense_gene" @@ -111,6 +115,7 @@ def test_genomic(self): edge_IDs = [ talon.edge_counter.value() + 1, talon.edge_counter.value() + 2 ] vertex_IDs = [ talon.vertex_counter.value() + 1, talon.vertex_counter.value() + 2 ] strand = "-" + fusion = False gene_ID, transcript_ID, gene_novelty, transcript_novelty, start_end_info = \ talon.process_remaining_mult_cases(chrom, positions, @@ -120,7 +125,8 @@ def test_genomic(self): gene_starts, gene_ends, edge_dict, location_dict, vertex_2_gene, run_info, - cursor, "temp_gene") + cursor, "temp_gene", + fusion) correct_gene_ID = fetch_correct_ID("TG3", "gene", cursor) assert gene_ID == correct_gene_ID assert transcript_dict[frozenset(start_end_info["edge_IDs"])] != None From 5c2e1970e8c40ae1b07b86c9cc5b5748fa38f61b Mon Sep 17 00:00:00 2001 From: fairliereese Date: Thu, 21 Sep 2023 13:18:12 -0700 Subject: [PATCH 05/31] added test for low-level fusion gene det. 
and fixed relevant function

---
 src/talon/talon.py                            |  2 +-
 .../test_process_remaining_mult_cases.py      | 46 +++++++++++++++++++
 2 files changed, 47 insertions(+), 1 deletion(-)

diff --git a/src/talon/talon.py b/src/talon/talon.py
index bc6058e..8c7809d 100644
--- a/src/talon/talon.py
+++ b/src/talon/talon.py
@@ -1173,7 +1173,7 @@ def process_remaining_mult_cases(chrom, positions, strand, edge_IDs, vertex_IDs,
     transcript_novelty = []
     start_end_info = {}

-    if not run_info.create_novel_spliced_genes:
+    if not run_info.create_novel_spliced_genes and not fusion:
         gene_ID, match_strand = search_for_overlap_with_gene(chrom, positions[0],
                                                              positions[-1], strand,
                                                              cursor, run_info, tmp_gene)
diff --git a/testing_suite/test_process_remaining_mult_cases.py b/testing_suite/test_process_remaining_mult_cases.py
index cc63516..36966bb 100644
--- a/testing_suite/test_process_remaining_mult_cases.py
+++ b/testing_suite/test_process_remaining_mult_cases.py
@@ -5,6 +5,52 @@

 class TestIdentifyRemaining(object):

+    def test_fusion(self):
+        """ Example where the transcript shares splice junctions between
+            two different genes
+        """
+        conn, cursor = get_db_cursor()
+        build = "toy_build"
+        db = "scratch/toy.db"
+        talon.get_counters(db)
+        edge_dict = init_refs.make_edge_dict(cursor)
+        location_dict = init_refs.make_location_dict(build, cursor)
+        run_info = talon.init_run_info(db, build)
+        transcript_dict = init_refs.make_transcript_dict(cursor, build)
+        vertex_2_gene = init_refs.make_vertex_2_gene_dict(cursor)
+        gene_starts = init_refs.make_gene_start_or_end_dict(cursor, build, "start")
+        gene_ends = init_refs.make_gene_start_or_end_dict(cursor, build, "end")
+        correct_gene_ID = talon.gene_counter.value() + 1
+
+        chrom = "chr1"
+        positions = [1, 100, 500, 600, 900, 1010, 5000, 5550, 6000]
+        strand = "+"
+        edge_IDs = [2, 3, 4]+[ talon.edge_counter.value() + 1, talon.edge_counter.value() + 2 ]
+        vertex_IDs = [2, 3, 4, 5, 9, 10]
+        v_novelty = [0, 0, 0, 0, 0, 0]
+
+        # Construct temp novel gene db
+        init_refs.make_temp_novel_gene_table(cursor, "toy_build")
+        fusion = True
+
+        gene_ID, transcript_ID, gene_novelty, transcript_novelty, start_end_info = \
+            talon.process_remaining_mult_cases(chrom, positions,
+                                               strand, edge_IDs,
+                                               vertex_IDs,
+                                               transcript_dict,
+                                               gene_starts, gene_ends,
+                                               edge_dict, location_dict,
+                                               vertex_2_gene, run_info,
+                                               cursor, "temp_gene",
+                                               fusion)
+
+        assert gene_ID == correct_gene_ID
+        assert transcript_dict[frozenset(start_end_info["edge_IDs"])] != None
+        assert gene_novelty[0][-2] == "fusion_novel"
+        conn.close()
+        conn.close()
+
     def test_intergenic(self):
         """ Example where the transcript is an NIC match to an existing one by
            virtue of a new splice donor.
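
A condensed sketch of the control flow that patch 05 changes in
process_remaining_mult_cases (assuming, as in the surrounding code, that
gene_ID simply stays None when the overlap search is skipped):

    # fusion reads now skip the gene-overlap search entirely, so gene_ID
    # remains None and a brand-new gene is created further down with the
    # "fusion_novel" / "fusion_transcript" attributes instead of the
    # "intergenic" ones
    if not run_info.create_novel_spliced_genes and not fusion:
        gene_ID, match_strand = search_for_overlap_with_gene(
            chrom, positions[0], positions[-1], strand,
            cursor, run_info, tmp_gene)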
From 04894a196d31407a50129018a7fd84a7f3c47696 Mon Sep 17 00:00:00 2001 From: fairliereese Date: Thu, 21 Sep 2023 14:16:11 -0700 Subject: [PATCH 06/31] added examples for readthrough --- .../input_files/readthrough/config.csv | 1 + .../hl60_1_1_subset_remapped_sorted.bam | Bin 0 -> 6521 bytes .../input_files/readthrough/readthrough.gtf | 294 ++++++++++++++++++ 3 files changed, 295 insertions(+) create mode 100644 testing_suite/input_files/readthrough/config.csv create mode 100644 testing_suite/input_files/readthrough/hl60_1_1_subset_remapped_sorted.bam create mode 100644 testing_suite/input_files/readthrough/readthrough.gtf diff --git a/testing_suite/input_files/readthrough/config.csv b/testing_suite/input_files/readthrough/config.csv new file mode 100644 index 0000000..974c858 --- /dev/null +++ b/testing_suite/input_files/readthrough/config.csv @@ -0,0 +1 @@ +hl60_1_1,hl60,SequelII,input_files/readthrough/hl60_1_1_subset_remapped_sorted.bam diff --git a/testing_suite/input_files/readthrough/hl60_1_1_subset_remapped_sorted.bam b/testing_suite/input_files/readthrough/hl60_1_1_subset_remapped_sorted.bam new file mode 100644 index 0000000000000000000000000000000000000000..f33a4b84b7f0927fc5be26c7b840261a1f010e43 GIT binary patch literal 6521 zcmZXZRZtv^mV}2vgF}G81ef6M1h){}-2x;KT!!E>I0X0LE`z(n4DK2v5Zr?d8XWF_ z?`~~v?Vg7|&;8I}RaaAe$3Ox6`(Drh(lS5*4tBPIL$s+JvG@bW$_dFG6Ocs8B!D!? z#RK6xRSB|%P%ot>4jR&a@mpD49(7!=?_g&~`>DWXQJ}%%FL`T=N=Ji6$Uu!bgQ7A( z`p6^|EiFfA$z=ny;5r!Z^f>iFWTX2pl}t>ETThOAP5yA(%Q3it;K*=xr`VjJPf@MZ zyJV>Rshdbsc0yc}SbLnDa!gejcHGU@O}s5_K^R+E9to0M@EZr?!C@ z0Jqfl#o83{et8?p-bH9-82Tk4yD?-Uz>?Zf*Am=x3?{}RbXlBQg7~Oc6`|Eg9Kc>o zMjT#u=@=@I697gS+j>sp5hzR*KV~XbvE%;Q9lmgxFou1&EEWiL zYo6qeTkz3MPic1?Ug*{X(_m3?@^91&!JQrHD_;C%vhq?KNyRSnzqg)?9{SmTsq64* zZkfhI*5NBCN#w;5hGW4K(jB1N@vgerYRKBEZ zK@&6~g796YMJ3sVXk_^wG~=!TvSXN|{#?nY`ZykX^u6G$xoIca`P%)nS@l`<2EVYW z(43Gg^yi}MkL`9B5#!~YuL+tqvHcm#A!;myk!{Va_QBiqLzzg_#`&Nem2R6PO`-yq zXZ?XFySmr_Yn!3#J)nqtI zvfVEJ82J#U%2DU-vUR>W>wBS3M9FsT195Ld&+9dnnvJy%BPV zfuSj1)UYJ%3N>T~*GsYl)puD2)5;KH7b(DGa1r0=+0AU|Va}DShueFUMzF_bDvp}oG~AQOco`aY`MIt044qWT#-xoDlsAWqEh_yy zDy)yRS&)VBytEL(uVt2QoicKc)RY!LB)`%-j6d58ZR*8J^Bq@Du8@k>Cklotya!`y zltE>fFl8YYIwZ4P5jw>E-STd&p+xeEGMr`L=tp1V^I0s=P$Y{YCrf^OlD7M#@L+{d zBbORe4%47io~$d)+_YS8v1n*%+8+u#|4kjBrmhzegDpeScjC1kl~75GH!I0#S%a5a zlzQnBM)#bu?|W6g_HbgHrL!Dq(j*8;8$h>g2cK|BARZ^ZA~L$pf52#GY}=3i$<13wsp2xbnklmjyZ@vyp?T@^bK<) z5f?JXCn(=5C!mO28GHOoh-&C~FFgwwO7zSo-T6D0m}?5lay^}j_YBWi0+nm~mSqbn z?Vu6%`Vb`y&s0d_E)FiuZoIW_Xk!-Tq$Y6{$9i|2JLK%G>BJ_$x!Oq2c2%G#DKGVO z=y`{Nab3E75{G9hVS<7aU7ehp-Z%3}DIVwvSnHW`oxYMhZ7f{&j%DKYh0P znBKGgen*3D%+Hp6vv_hinKSEpeL$VogazeUfQ#p@KP>=NMMr10g+#tYgLvDNichC_ zrI4G;4Wx5X-WE&+kuRP-MaBUYC<=rMkPn8=GItY;Zpf$Dg90M0x!9l=cG#VvnOEgo zoSP*SLdCdUHS1%7hCQlhzy4s~a-L@NqaHP1S>M0jWB(1_T$0;Y<2+vOvdixXk8@ZS zy7l^Q?VUIlDQ9MLx8!xiYUOL;l@+4~zKP>J`LW*Q=jdyJh&_*tEN(mLT5VVzKt`;u zzReXMuj9*J$4Uvw{x{{5$B^3d zo)L=G^X$v^^iR)-*)<0xx3OnEr7^vd7!LP0sXkqI#kDi=sZ3Djmn3DC`r1u;?(VMl zJzVd_>#C}Jzqm|JE%OS#Ka_jvd2069({ntjx+g<8j8p_X`8gbMRNgZa#M}m~`9$lA ze!J>-g&o4+rXUQE-&N%H7^En(=651ml)qbVja~|}ikg||545nKy`@8=km4b{)uwuK zq5Gx2{c2+k4#Vjkl^04CI(a8E59v^s_M3$O?esiE)>VHLUn2PFL&vxRvh{tJuZz zz18o}SMD9GG;a^!O|nkR^J(GLe;s9d+Dnt*!OxU2W=T_&;Ohg%7xK{!Dc-m>-b8ql z9+~%fsq&1+$R{ZP_&`u zIZ-d+>tEIjZ@}m}+uIZ{4Zz9oSrRe*QzP7T=~^n*^ZZwe^RX&vs>bVS3Y~e#?o#50 zJ#g`F%OCt{Gq=aNb&7FxPSDlp3^6e-CC>r$?G)7c%vb8=-k~%s?b7b_ZP!67c8!?s z^eUeAvy>AN*xrwE$nMe1lX2HGX8q=gbiC&LVcH$=kFIqsiw_VqEux?O~ z)*@2p;QI9YiU!miVw)Ge2;G~_K{p_U(JO$H;E7xyW3(d(HPAYUl41nnjB?O60XWMn 
zY^rdh-qC1B2mw3=4!QK{iUhrJ?FH#|y!}Z5;?8Ay0GLlmiPL!C(rszxrO807go6QU zpyXDtCE(1okf3O^BeT7YqOL-%0s@T(9Jq4PEuY(-mMy_bo%ZPmC&ivP_F4WOAG#K? zEV(eN6IXYGCo1U3@H05BP-;cNyN3u?0RHh3SQ9s-@WWnIJ~;+Gr6)DI^h$QlxXyt8 z`)k*4mWA1>D3aixIKn4&O{y)s#sfJb*%@xcnTOyZT|me{5BLisasm3fpGZ0Z?RkO+ zfZF-`C#ZH6c50@E$_qSu65N%%x+8|c)G-fExBoHgyo}e1Ir2Sc0-57m&*9py@!+~6 zm7D#|*DI@FM@sR!DU}DLll(6Eb(EE+^D-J8$m-G5IAr=2y$4_%1158`CwcHR6Q%`= zHs7G|QjK1HGal?j`&RI>cqCBDAEqaxSGVqE2xq-%yZ{?o}s2Z+1dZN8wT&7{Nql7HwQW2 z0UIY;!xaRuNX`OcM{CpzPl?y@E6cz%@=xUTApFw2>CM0~%UjW?FDQz!fum%go-3+G z&xnd42*o)zr5rENnIwaNqz5#C@qCNAtfT*kW@VnY6K5l@?mj;P(}2UohJ!!pQK3sw zv^Oxt1_{h0s3mua}21Z4$j8&RnD=oD~yq%#gUXp~D{!0DOhVpKG`= ze<4^44*BIPB3BYj3$7LC>;M4O|7ioNI48zqan^Fg-D1d0yfoY8+R}0nn?%G_A%M9{ zD`!4!cZO6_jJilH0Xkgsh(50TkH!Z|$#uZ8vhAW2}P8D79~qeT?5Hb$6y?F*P`iZA{5% z?BbQN}zVBq1-`~2Mt->^}ez$U(2m1ly%&3Wh40EmKthtCf zuPtp_gX6N*MiRU7+3>s=ieDcs6hAtT^B8(F(p(-43=)N_8Qal@n+|6%?fe-|)(2|v zj8~J%Z>^|aGoV2dc(3;TuH16X-v)H^Odq1<7hQgGekR+85=}uy^JO;;sf-bxP-Q)34 ztQ6Rq&!jpN*ocrZt!qQ_n2tozU)Go}iSA<6GrVur3YvoQI_e<32nXyJlg;ya^o6Bm zy}9l$Y+)~<>#N)4gU@8nsy^kK;-Z)@$EwRcBu21#LGY#7`e=HP_vW|HdRouz)dXJl zFMUT=4~q9L$CKAGL#d@6@Dn8afWuyApMNN|8#|U3TK5q9`Qb%vV?T0n?ZYiQ&Vj}C zBFfgQeytK;XVeW}GT7Z0)O|*e;LCNS$roRAp@47bS&MPi0hrpmq8ArNzjkJ} z!*$KCd)FLlz5jeetyo_gtM$7Mh5zx7zcI6W=S3N)NH4PlN<+fl!NJ)|uSB*5`0^kp zvEanECGg@7!(l*rUpf!0v&hsy*PttIWVAy0sA9|^(3 zBERf$Rm2i+TF<#|Kc#SMchr(o=1GLuG{USfsWi2s?&5{dZ%41PpP#X;^1#+vTs0#LL(6t@mq4#l8bpm8XUulUxPQ%gknz;iLK`HuFnSBWtFi2-)=1KW=iSl@pC#R7=Wy2|7 z`)U{KRtnnSH8zusFb;;Gq8O~ffB;rHy+J;*L!Qq5d!2<0QU5%7P%G$SK^gUSvNoB! z{hK+84IP~ii-Y_5a;DU>60`SXs809@pJ0^Cx&GuxT1eZ%ir}MbKQV;=%=JhG+XA{c-J4uas{m&u`U=z9APr=B)RLKZ zxCzy26BWI(eH)Dd3<)M^{yQLe#vZ4d7m*asojROrxdMW#-5z?m0s(a%c7B)bPe$ParoX!T+_&<1 z-{!Vg{dPYXRuOa{H<76Kje9l;5OKcvdGXOc;J){ov8$_izLc@sT?48*BVt)(v>zT= zM1r-T=>57PD2{tE2eWJm>b*sKP|n^3`4o&TP$xsyp1!s4YOKiNroNo%*j&on~r3s)T+`& zRglI)y~S2v@Xile!lXnW;1NaMJ@XuZwHZF7uG`X;ND?I(%Uw)_W5h$CyqpP4#Y{a& zt_{Mk9<``UZaiurh8w>AQNA#oUmuvb4bo+0Y2sl-r&J}N?=~7oF5-R11xDDGo)(5L zWVDF!fVEX{!lcx);}olK3a#<);O6}(tMt<@g}XszzgIJGca69{C6Pw!NJ6ulnG4^s z*TlQ({^@JrF~>CDPG>e^#Z4XMUZ72Fg3jZBe9AJwx9@Bux>ta)+2whVscxC_{IA?J zUw^O=cX5OCc5%tJ-Y>6^gEMwiLUx<@Eb=I7M$a0(#RqzSHusL!YFkuMm`9_6DuJA3 zMQ=Fi2s0JZsqBHT!37F3nsZPMHZqetdE(!1w{2?ENDMUPsnl8Zg|0D@cHV_G;H`M* z&woW-JhWBU)eFtF9SBvx_FdTO#TjCd^9u|!o;#wt?h0icP|ooPd^+Ql7^7Q#HKcYLy$E}f6ics*|OfuN9FcYb~?M~ zhg9)S!x^O1pcq&IkwI04BC6ax`1uizG8j{BpgMm~#%YY4 z^TF!;71oFLU51&3DcD-{qIEnJDT-M-xPT& z8GcY1cURC~wx(L;thKn?6t21<{PemX?^hUp^o7UODDrq9LzKZra;`g~)zdeti zoBSAGe2e`}A16)Bn0y}EN_Ov>KDX_PQVHMB?WgT>-aee^tWa^j4{8u4Wg;#2V3Ybv zZNtHQgpObt37ox#HG1E`TlG`0ilN#o5hE2X?orr&1VwL+gdj*Gq#O5d4pSOv>$Ef5 zq8;0;?IfYW-JerX*=Gv5Q?JI_@84m4sdPHf$Chrh`a@dz=1INa$b4ePYxd1}2I2ha zjz!0Ga$F98gap>>@`SNG)*%t}moBf@|H*z+A`r6zWBty4vu=Bg>aRU=@9z39G!)sJ zi`Aw}q#*w)=eDaZsg0+K-G%54_O%80FC^zB%}D{5NSjE?k7ms~l}=nFlJUCsWWkc9 zt>QOJmNN)kTq=2|?2oK1vfr>NFM~y?$bX$5)uHvoc0Ao@o*mU34LzCqwU|+fyV%^l W>2(jGRj~a3b?Pku`9E9BzvW-7#D!S^ literal 0 HcmV?d00001 diff --git a/testing_suite/input_files/readthrough/readthrough.gtf b/testing_suite/input_files/readthrough/readthrough.gtf new file mode 100644 index 0000000..7891fb5 --- /dev/null +++ b/testing_suite/input_files/readthrough/readthrough.gtf @@ -0,0 +1,294 @@ +chr1 HAVANA gene 10430102 10452153 . + . gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; tag "overlapping_locus"; +chr1 HAVANA transcript 10430102 10452153 . + . 
gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000400900.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-201"; transcript_support_level "2"; tag "basic,appris_principal_4,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000368904.1"; protein_id "ENSP00000383692.2"; ccdsid "CCDS114.1"; +chr1 HAVANA exon 10430102 10430568 . + . gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000400900.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-201"; transcript_support_level "2"; tag "basic,appris_principal_4,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000368904.1"; exon_number "1"; exon_id "ENSE00003720917.1"; protein_id "ENSP00000383692.2"; ccdsid "CCDS114.1"; +chr1 HAVANA CDS 10430518 10430568 . + 0 gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000400900.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-201"; transcript_support_level "2"; tag "basic,appris_principal_4,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000368904.1"; exon_number "1"; exon_id "ENSE00003720917.1"; protein_id "ENSP00000383692.2"; ccdsid "CCDS114.1"; +chr1 HAVANA start_codon 10430518 10430520 . + 0 gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000400900.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-201"; transcript_support_level "2"; tag "basic,appris_principal_4,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000368904.1"; exon_number "1"; exon_id "ENSE00003720917.1"; protein_id "ENSP00000383692.2"; ccdsid "CCDS114.1"; +chr1 HAVANA exon 10433842 10433965 . + . gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000400900.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-201"; transcript_support_level "2"; tag "basic,appris_principal_4,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000368904.1"; exon_number "2"; exon_id "ENSE00003626111.1"; protein_id "ENSP00000383692.2"; ccdsid "CCDS114.1"; +chr1 HAVANA CDS 10433842 10433965 . + 0 gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000400900.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-201"; transcript_support_level "2"; tag "basic,appris_principal_4,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000368904.1"; exon_number "2"; exon_id "ENSE00003626111.1"; protein_id "ENSP00000383692.2"; ccdsid "CCDS114.1"; +chr1 HAVANA exon 10434657 10434690 . + . gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000400900.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-201"; transcript_support_level "2"; tag "basic,appris_principal_4,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000368904.1"; exon_number "3"; exon_id "ENSE00003530260.1"; protein_id "ENSP00000383692.2"; ccdsid "CCDS114.1"; +chr1 HAVANA CDS 10434657 10434690 . 
+ 2 gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000400900.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-201"; transcript_support_level "2"; tag "basic,appris_principal_4,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000368904.1"; exon_number "3"; exon_id "ENSE00003530260.1"; protein_id "ENSP00000383692.2"; ccdsid "CCDS114.1"; +chr1 HAVANA exon 10440347 10440413 . + . gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000400900.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-201"; transcript_support_level "2"; tag "basic,appris_principal_4,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000368904.1"; exon_number "4"; exon_id "ENSE00003654668.1"; protein_id "ENSP00000383692.2"; ccdsid "CCDS114.1"; +chr1 HAVANA CDS 10440347 10440413 . + 1 gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000400900.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-201"; transcript_support_level "2"; tag "basic,appris_principal_4,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000368904.1"; exon_number "4"; exon_id "ENSE00003654668.1"; protein_id "ENSP00000383692.2"; ccdsid "CCDS114.1"; +chr1 HAVANA exon 10451377 10452153 . + . gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000400900.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-201"; transcript_support_level "2"; tag "basic,appris_principal_4,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000368904.1"; exon_number "5"; exon_id "ENSE00003482235.1"; protein_id "ENSP00000383692.2"; ccdsid "CCDS114.1"; +chr1 HAVANA CDS 10451377 10451592 . + 0 gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000400900.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-201"; transcript_support_level "2"; tag "basic,appris_principal_4,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000368904.1"; exon_number "5"; exon_id "ENSE00003482235.1"; protein_id "ENSP00000383692.2"; ccdsid "CCDS114.1"; +chr1 HAVANA stop_codon 10451593 10451595 . + 0 gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000400900.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-201"; transcript_support_level "2"; tag "basic,appris_principal_4,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000368904.1"; exon_number "5"; exon_id "ENSE00003482235.1"; protein_id "ENSP00000383692.2"; ccdsid "CCDS114.1"; +chr1 HAVANA UTR 10430102 10430517 . + . gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000400900.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-201"; transcript_support_level "2"; tag "basic,appris_principal_4,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000368904.1"; exon_number "1"; exon_id "ENSE00003720917.1"; protein_id "ENSP00000383692.2"; ccdsid "CCDS114.1"; +chr1 HAVANA UTR 10451593 10452153 . + . 
gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000400900.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-201"; transcript_support_level "2"; tag "basic,appris_principal_4,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000368904.1"; exon_number "5"; exon_id "ENSE00003482235.1"; protein_id "ENSP00000383692.2"; ccdsid "CCDS114.1"; +chr1 HAVANA transcript 10430384 10451659 . + . gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000470413.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-203"; transcript_support_level "2"; tag "basic,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000130799.1"; protein_id "ENSP00000433615.1"; ccdsid "CCDS53266.1"; +chr1 HAVANA exon 10430384 10430568 . + . gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000470413.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-203"; transcript_support_level "2"; tag "basic,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000130799.1"; exon_number "1"; exon_id "ENSE00002253437.1"; protein_id "ENSP00000433615.1"; ccdsid "CCDS53266.1"; +chr1 HAVANA CDS 10430518 10430568 . + 0 gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000470413.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-203"; transcript_support_level "2"; tag "basic,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000130799.1"; exon_number "1"; exon_id "ENSE00002253437.1"; protein_id "ENSP00000433615.1"; ccdsid "CCDS53266.1"; +chr1 HAVANA start_codon 10430518 10430520 . + 0 gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000470413.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-203"; transcript_support_level "2"; tag "basic,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000130799.1"; exon_number "1"; exon_id "ENSE00002253437.1"; protein_id "ENSP00000433615.1"; ccdsid "CCDS53266.1"; +chr1 HAVANA exon 10433842 10433965 . + . gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000470413.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-203"; transcript_support_level "2"; tag "basic,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000130799.1"; exon_number "2"; exon_id "ENSE00003626111.1"; protein_id "ENSP00000433615.1"; ccdsid "CCDS53266.1"; +chr1 HAVANA CDS 10433842 10433965 . + 0 gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000470413.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-203"; transcript_support_level "2"; tag "basic,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000130799.1"; exon_number "2"; exon_id "ENSE00003626111.1"; protein_id "ENSP00000433615.1"; ccdsid "CCDS53266.1"; +chr1 HAVANA exon 10434657 10434690 . + . 
gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000470413.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-203"; transcript_support_level "2"; tag "basic,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000130799.1"; exon_number "3"; exon_id "ENSE00003530260.1"; protein_id "ENSP00000433615.1"; ccdsid "CCDS53266.1"; +chr1 HAVANA CDS 10434657 10434690 . + 2 gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000470413.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-203"; transcript_support_level "2"; tag "basic,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000130799.1"; exon_number "3"; exon_id "ENSE00003530260.1"; protein_id "ENSP00000433615.1"; ccdsid "CCDS53266.1"; +chr1 HAVANA exon 10451377 10451659 . + . gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000470413.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-203"; transcript_support_level "2"; tag "basic,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000130799.1"; exon_number "4"; exon_id "ENSE00003665651.1"; protein_id "ENSP00000433615.1"; ccdsid "CCDS53266.1"; +chr1 HAVANA CDS 10451377 10451398 . + 1 gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000470413.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-203"; transcript_support_level "2"; tag "basic,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000130799.1"; exon_number "4"; exon_id "ENSE00003665651.1"; protein_id "ENSP00000433615.1"; ccdsid "CCDS53266.1"; +chr1 HAVANA stop_codon 10451399 10451401 . + 0 gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000470413.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-203"; transcript_support_level "2"; tag "basic,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000130799.1"; exon_number "4"; exon_id "ENSE00003665651.1"; protein_id "ENSP00000433615.1"; ccdsid "CCDS53266.1"; +chr1 HAVANA UTR 10430384 10430517 . + . gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000470413.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-203"; transcript_support_level "2"; tag "basic,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000130799.1"; exon_number "1"; exon_id "ENSE00002253437.1"; protein_id "ENSP00000433615.1"; ccdsid "CCDS53266.1"; +chr1 HAVANA UTR 10451399 10451659 . + . gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000470413.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-203"; transcript_support_level "2"; tag "basic,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000130799.1"; exon_number "4"; exon_id "ENSE00003665651.1"; protein_id "ENSP00000433615.1"; ccdsid "CCDS53266.1"; +chr1 HAVANA transcript 10430435 10451411 . + . 
gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000602787.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-205"; transcript_support_level "3"; tag "basic,appris_alternative_2,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000467583.2"; protein_id "ENSP00000473509.2"; ccdsid "CCDS115.1"; +chr1 HAVANA exon 10430435 10430568 . + . gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000602787.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-205"; transcript_support_level "3"; tag "basic,appris_alternative_2,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000467583.2"; exon_number "1"; exon_id "ENSE00003784866.1"; protein_id "ENSP00000473509.2"; ccdsid "CCDS115.1"; +chr1 HAVANA CDS 10430518 10430568 . + 0 gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000602787.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-205"; transcript_support_level "3"; tag "basic,appris_alternative_2,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000467583.2"; exon_number "1"; exon_id "ENSE00003784866.1"; protein_id "ENSP00000473509.2"; ccdsid "CCDS115.1"; +chr1 HAVANA start_codon 10430518 10430520 . + 0 gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000602787.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-205"; transcript_support_level "3"; tag "basic,appris_alternative_2,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000467583.2"; exon_number "1"; exon_id "ENSE00003784866.1"; protein_id "ENSP00000473509.2"; ccdsid "CCDS115.1"; +chr1 HAVANA exon 10433842 10433965 . + . gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000602787.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-205"; transcript_support_level "3"; tag "basic,appris_alternative_2,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000467583.2"; exon_number "2"; exon_id "ENSE00003626111.1"; protein_id "ENSP00000473509.2"; ccdsid "CCDS115.1"; +chr1 HAVANA CDS 10433842 10433965 . + 0 gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000602787.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-205"; transcript_support_level "3"; tag "basic,appris_alternative_2,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000467583.2"; exon_number "2"; exon_id "ENSE00003626111.1"; protein_id "ENSP00000473509.2"; ccdsid "CCDS115.1"; +chr1 HAVANA exon 10434657 10434690 . + . gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000602787.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-205"; transcript_support_level "3"; tag "basic,appris_alternative_2,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000467583.2"; exon_number "3"; exon_id "ENSE00003530260.1"; protein_id "ENSP00000473509.2"; ccdsid "CCDS115.1"; +chr1 HAVANA CDS 10434657 10434690 . 
+ 2 gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000602787.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-205"; transcript_support_level "3"; tag "basic,appris_alternative_2,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000467583.2"; exon_number "3"; exon_id "ENSE00003530260.1"; protein_id "ENSP00000473509.2"; ccdsid "CCDS115.1"; +chr1 HAVANA exon 10440347 10440413 . + . gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000602787.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-205"; transcript_support_level "3"; tag "basic,appris_alternative_2,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000467583.2"; exon_number "4"; exon_id "ENSE00003654668.1"; protein_id "ENSP00000473509.2"; ccdsid "CCDS115.1"; +chr1 HAVANA CDS 10440347 10440413 . + 1 gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000602787.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-205"; transcript_support_level "3"; tag "basic,appris_alternative_2,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000467583.2"; exon_number "4"; exon_id "ENSE00003654668.1"; protein_id "ENSP00000473509.2"; ccdsid "CCDS115.1"; +chr1 HAVANA exon 10442265 10442441 . + . gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000602787.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-205"; transcript_support_level "3"; tag "basic,appris_alternative_2,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000467583.2"; exon_number "5"; exon_id "ENSE00003232254.1"; protein_id "ENSP00000473509.2"; ccdsid "CCDS115.1"; +chr1 HAVANA CDS 10442265 10442402 . + 0 gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000602787.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-205"; transcript_support_level "3"; tag "basic,appris_alternative_2,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000467583.2"; exon_number "5"; exon_id "ENSE00003232254.1"; protein_id "ENSP00000473509.2"; ccdsid "CCDS115.1"; +chr1 HAVANA stop_codon 10442403 10442405 . + 0 gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000602787.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-205"; transcript_support_level "3"; tag "basic,appris_alternative_2,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000467583.2"; exon_number "5"; exon_id "ENSE00003232254.1"; protein_id "ENSP00000473509.2"; ccdsid "CCDS115.1"; +chr1 HAVANA exon 10451377 10451411 . + . gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000602787.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-205"; transcript_support_level "3"; tag "basic,appris_alternative_2,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000467583.2"; exon_number "6"; exon_id "ENSE00003573719.3"; protein_id "ENSP00000473509.2"; ccdsid "CCDS115.1"; +chr1 HAVANA UTR 10430435 10430517 . + . 
gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000602787.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-205"; transcript_support_level "3"; tag "basic,appris_alternative_2,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000467583.2"; exon_number "1"; exon_id "ENSE00003784866.1"; protein_id "ENSP00000473509.2"; ccdsid "CCDS115.1"; +chr1 HAVANA UTR 10442403 10442441 . + . gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000602787.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-205"; transcript_support_level "3"; tag "basic,appris_alternative_2,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000467583.2"; exon_number "5"; exon_id "ENSE00003232254.1"; protein_id "ENSP00000473509.2"; ccdsid "CCDS115.1"; +chr1 HAVANA UTR 10451377 10451411 . + . gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000602787.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-205"; transcript_support_level "3"; tag "basic,appris_alternative_2,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000467583.2"; exon_number "6"; exon_id "ENSE00003573719.3"; protein_id "ENSP00000473509.2"; ccdsid "CCDS115.1"; +chr1 HAVANA transcript 10430804 10452003 . + . gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000602296.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-204"; transcript_support_level "3"; tag "overlapping_uORF,upstream_uORF,basic,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000488626.1"; protein_id "ENSP00000473401.2"; ccdsid "CCDS72699.1"; +chr1 HAVANA exon 10430804 10431401 . + . gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000602296.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-204"; transcript_support_level "3"; tag "overlapping_uORF,upstream_uORF,basic,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000488626.1"; exon_number "1"; exon_id "ENSE00003725534.2"; protein_id "ENSP00000473401.2"; ccdsid "CCDS72699.1"; +chr1 HAVANA CDS 10431290 10431401 . + 0 gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000602296.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-204"; transcript_support_level "3"; tag "overlapping_uORF,upstream_uORF,basic,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000488626.1"; exon_number "1"; exon_id "ENSE00003725534.2"; protein_id "ENSP00000473401.2"; ccdsid "CCDS72699.1"; +chr1 HAVANA start_codon 10431290 10431292 . + 0 gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000602296.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-204"; transcript_support_level "3"; tag "overlapping_uORF,upstream_uORF,basic,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000488626.1"; exon_number "1"; exon_id "ENSE00003725534.2"; protein_id "ENSP00000473401.2"; ccdsid "CCDS72699.1"; +chr1 HAVANA exon 10434657 10434690 . + . 
gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000602296.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-204"; transcript_support_level "3"; tag "overlapping_uORF,upstream_uORF,basic,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000488626.1"; exon_number "2"; exon_id "ENSE00003530260.1"; protein_id "ENSP00000473401.2"; ccdsid "CCDS72699.1"; +chr1 HAVANA CDS 10434657 10434690 . + 2 gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000602296.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-204"; transcript_support_level "3"; tag "overlapping_uORF,upstream_uORF,basic,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000488626.1"; exon_number "2"; exon_id "ENSE00003530260.1"; protein_id "ENSP00000473401.2"; ccdsid "CCDS72699.1"; +chr1 HAVANA exon 10440347 10440413 . + . gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000602296.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-204"; transcript_support_level "3"; tag "overlapping_uORF,upstream_uORF,basic,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000488626.1"; exon_number "3"; exon_id "ENSE00003654668.1"; protein_id "ENSP00000473401.2"; ccdsid "CCDS72699.1"; +chr1 HAVANA CDS 10440347 10440413 . + 1 gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000602296.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-204"; transcript_support_level "3"; tag "overlapping_uORF,upstream_uORF,basic,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000488626.1"; exon_number "3"; exon_id "ENSE00003654668.1"; protein_id "ENSP00000473401.2"; ccdsid "CCDS72699.1"; +chr1 HAVANA exon 10451377 10452003 . + . gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000602296.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-204"; transcript_support_level "3"; tag "overlapping_uORF,upstream_uORF,basic,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000488626.1"; exon_number "4"; exon_id "ENSE00003613950.2"; protein_id "ENSP00000473401.2"; ccdsid "CCDS72699.1"; +chr1 HAVANA CDS 10451377 10451592 . + 0 gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000602296.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-204"; transcript_support_level "3"; tag "overlapping_uORF,upstream_uORF,basic,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000488626.1"; exon_number "4"; exon_id "ENSE00003613950.2"; protein_id "ENSP00000473401.2"; ccdsid "CCDS72699.1"; +chr1 HAVANA stop_codon 10451593 10451595 . 
+ 0 gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000602296.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-204"; transcript_support_level "3"; tag "overlapping_uORF,upstream_uORF,basic,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000488626.1"; exon_number "4"; exon_id "ENSE00003613950.2"; protein_id "ENSP00000473401.2"; ccdsid "CCDS72699.1"; +chr1 HAVANA UTR 10430804 10431289 . + . gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000602296.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-204"; transcript_support_level "3"; tag "overlapping_uORF,upstream_uORF,basic,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000488626.1"; exon_number "1"; exon_id "ENSE00003725534.2"; protein_id "ENSP00000473401.2"; ccdsid "CCDS72699.1"; +chr1 HAVANA UTR 10451593 10452003 . + . gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000602296.6"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-204"; transcript_support_level "3"; tag "overlapping_uORF,upstream_uORF,basic,readthrough_transcript,CCDS"; havana_transcript "OTTHUMT00000488626.1"; exon_number "4"; exon_id "ENSE00003613950.2"; protein_id "ENSP00000473401.2"; ccdsid "CCDS72699.1"; +chr1 HAVANA transcript 10431258 10451659 . + . gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000465026.1"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-202"; transcript_support_level "3"; tag "basic,readthrough_transcript"; havana_transcript "OTTHUMT00000130794.2"; protein_id "ENSP00000489060.1"; +chr1 HAVANA exon 10431258 10431401 . + . gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000465026.1"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-202"; transcript_support_level "3"; tag "basic,readthrough_transcript"; havana_transcript "OTTHUMT00000130794.2"; exon_number "1"; exon_id "ENSE00001928985.1"; protein_id "ENSP00000489060.1"; +chr1 HAVANA CDS 10431290 10431401 . + 0 gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000465026.1"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-202"; transcript_support_level "3"; tag "basic,readthrough_transcript"; havana_transcript "OTTHUMT00000130794.2"; exon_number "1"; exon_id "ENSE00001928985.1"; protein_id "ENSP00000489060.1"; +chr1 HAVANA start_codon 10431290 10431292 . + 0 gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000465026.1"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-202"; transcript_support_level "3"; tag "basic,readthrough_transcript"; havana_transcript "OTTHUMT00000130794.2"; exon_number "1"; exon_id "ENSE00001928985.1"; protein_id "ENSP00000489060.1"; +chr1 HAVANA exon 10440347 10440413 . + . 
gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000465026.1"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-202"; transcript_support_level "3"; tag "basic,readthrough_transcript"; havana_transcript "OTTHUMT00000130794.2"; exon_number "2"; exon_id "ENSE00003589213.1"; protein_id "ENSP00000489060.1"; +chr1 HAVANA CDS 10440347 10440381 . + 2 gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000465026.1"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-202"; transcript_support_level "3"; tag "basic,readthrough_transcript"; havana_transcript "OTTHUMT00000130794.2"; exon_number "2"; exon_id "ENSE00003589213.1"; protein_id "ENSP00000489060.1"; +chr1 HAVANA stop_codon 10440382 10440384 . + 0 gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000465026.1"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-202"; transcript_support_level "3"; tag "basic,readthrough_transcript"; havana_transcript "OTTHUMT00000130794.2"; exon_number "2"; exon_id "ENSE00003589213.1"; protein_id "ENSP00000489060.1"; +chr1 HAVANA exon 10451377 10451659 . + . gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000465026.1"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-202"; transcript_support_level "3"; tag "basic,readthrough_transcript"; havana_transcript "OTTHUMT00000130794.2"; exon_number "3"; exon_id "ENSE00003497845.1"; protein_id "ENSP00000489060.1"; +chr1 HAVANA UTR 10431258 10431289 . + . gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000465026.1"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-202"; transcript_support_level "3"; tag "basic,readthrough_transcript"; havana_transcript "OTTHUMT00000130794.2"; exon_number "1"; exon_id "ENSE00001928985.1"; protein_id "ENSP00000489060.1"; +chr1 HAVANA UTR 10440382 10440413 . + . gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000465026.1"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-202"; transcript_support_level "3"; tag "basic,readthrough_transcript"; havana_transcript "OTTHUMT00000130794.2"; exon_number "2"; exon_id "ENSE00003589213.1"; protein_id "ENSP00000489060.1"; +chr1 HAVANA UTR 10451377 10451659 . + . gene_id "ENSG00000251503.8"; gene_type "protein_coding"; gene_name "CENPS-CORT"; level "2"; havana_gene "OTTHUMG00000162436.4"; transcript_id "ENST00000465026.1"; transcript_type "protein_coding"; transcript_name "CENPS-CORT-202"; transcript_support_level "3"; tag "basic,readthrough_transcript"; havana_transcript "OTTHUMT00000130794.2"; exon_number "3"; exon_id "ENSE00003497845.1"; protein_id "ENSP00000489060.1"; +chr1 HAVANA gene 10430443 10442809 . + . gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "1"; havana_gene "OTTHUMG00000059085.10"; tag "overlapping_locus"; +chr1 HAVANA transcript 10430443 10442809 . + . 
gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "2"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000309048.7"; transcript_type "protein_coding"; transcript_name "CENPS-201"; transcript_support_level "1"; tag "basic,appris_principal_1,CCDS"; havana_transcript "OTTHUMT00000130797.2"; protein_id "ENSP00000308583.2"; ccdsid "CCDS115.1"; +chr1 HAVANA exon 10430443 10430568 . + . gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "2"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000309048.7"; transcript_type "protein_coding"; transcript_name "CENPS-201"; transcript_support_level "1"; tag "basic,appris_principal_1,CCDS"; havana_transcript "OTTHUMT00000130797.2"; exon_number "1"; exon_id "ENSE00001849117.1"; protein_id "ENSP00000308583.2"; ccdsid "CCDS115.1"; +chr1 HAVANA CDS 10430518 10430568 . + 0 gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "2"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000309048.7"; transcript_type "protein_coding"; transcript_name "CENPS-201"; transcript_support_level "1"; tag "basic,appris_principal_1,CCDS"; havana_transcript "OTTHUMT00000130797.2"; exon_number "1"; exon_id "ENSE00001849117.1"; protein_id "ENSP00000308583.2"; ccdsid "CCDS115.1"; +chr1 HAVANA start_codon 10430518 10430520 . + 0 gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "2"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000309048.7"; transcript_type "protein_coding"; transcript_name "CENPS-201"; transcript_support_level "1"; tag "basic,appris_principal_1,CCDS"; havana_transcript "OTTHUMT00000130797.2"; exon_number "1"; exon_id "ENSE00001849117.1"; protein_id "ENSP00000308583.2"; ccdsid "CCDS115.1"; +chr1 HAVANA exon 10433842 10433965 . + . gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "2"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000309048.7"; transcript_type "protein_coding"; transcript_name "CENPS-201"; transcript_support_level "1"; tag "basic,appris_principal_1,CCDS"; havana_transcript "OTTHUMT00000130797.2"; exon_number "2"; exon_id "ENSE00003666910.1"; protein_id "ENSP00000308583.2"; ccdsid "CCDS115.1"; +chr1 HAVANA CDS 10433842 10433965 . + 0 gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "2"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000309048.7"; transcript_type "protein_coding"; transcript_name "CENPS-201"; transcript_support_level "1"; tag "basic,appris_principal_1,CCDS"; havana_transcript "OTTHUMT00000130797.2"; exon_number "2"; exon_id "ENSE00003666910.1"; protein_id "ENSP00000308583.2"; ccdsid "CCDS115.1"; +chr1 HAVANA exon 10434657 10434690 . + . gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "2"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000309048.7"; transcript_type "protein_coding"; transcript_name "CENPS-201"; transcript_support_level "1"; tag "basic,appris_principal_1,CCDS"; havana_transcript "OTTHUMT00000130797.2"; exon_number "3"; exon_id "ENSE00003664823.1"; protein_id "ENSP00000308583.2"; ccdsid "CCDS115.1"; +chr1 HAVANA CDS 10434657 10434690 . 
+ 2 gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "2"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000309048.7"; transcript_type "protein_coding"; transcript_name "CENPS-201"; transcript_support_level "1"; tag "basic,appris_principal_1,CCDS"; havana_transcript "OTTHUMT00000130797.2"; exon_number "3"; exon_id "ENSE00003664823.1"; protein_id "ENSP00000308583.2"; ccdsid "CCDS115.1"; +chr1 HAVANA exon 10440347 10440413 . + . gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "2"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000309048.7"; transcript_type "protein_coding"; transcript_name "CENPS-201"; transcript_support_level "1"; tag "basic,appris_principal_1,CCDS"; havana_transcript "OTTHUMT00000130797.2"; exon_number "4"; exon_id "ENSE00003599799.1"; protein_id "ENSP00000308583.2"; ccdsid "CCDS115.1"; +chr1 HAVANA CDS 10440347 10440413 . + 1 gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "2"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000309048.7"; transcript_type "protein_coding"; transcript_name "CENPS-201"; transcript_support_level "1"; tag "basic,appris_principal_1,CCDS"; havana_transcript "OTTHUMT00000130797.2"; exon_number "4"; exon_id "ENSE00003599799.1"; protein_id "ENSP00000308583.2"; ccdsid "CCDS115.1"; +chr1 HAVANA exon 10442265 10442809 . + . gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "2"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000309048.7"; transcript_type "protein_coding"; transcript_name "CENPS-201"; transcript_support_level "1"; tag "basic,appris_principal_1,CCDS"; havana_transcript "OTTHUMT00000130797.2"; exon_number "5"; exon_id "ENSE00001833855.2"; protein_id "ENSP00000308583.2"; ccdsid "CCDS115.1"; +chr1 HAVANA CDS 10442265 10442402 . + 0 gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "2"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000309048.7"; transcript_type "protein_coding"; transcript_name "CENPS-201"; transcript_support_level "1"; tag "basic,appris_principal_1,CCDS"; havana_transcript "OTTHUMT00000130797.2"; exon_number "5"; exon_id "ENSE00001833855.2"; protein_id "ENSP00000308583.2"; ccdsid "CCDS115.1"; +chr1 HAVANA stop_codon 10442403 10442405 . + 0 gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "2"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000309048.7"; transcript_type "protein_coding"; transcript_name "CENPS-201"; transcript_support_level "1"; tag "basic,appris_principal_1,CCDS"; havana_transcript "OTTHUMT00000130797.2"; exon_number "5"; exon_id "ENSE00001833855.2"; protein_id "ENSP00000308583.2"; ccdsid "CCDS115.1"; +chr1 HAVANA UTR 10430443 10430517 . + . gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "2"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000309048.7"; transcript_type "protein_coding"; transcript_name "CENPS-201"; transcript_support_level "1"; tag "basic,appris_principal_1,CCDS"; havana_transcript "OTTHUMT00000130797.2"; exon_number "1"; exon_id "ENSE00001849117.1"; protein_id "ENSP00000308583.2"; ccdsid "CCDS115.1"; +chr1 HAVANA UTR 10442403 10442809 . + . 
gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "2"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000309048.7"; transcript_type "protein_coding"; transcript_name "CENPS-201"; transcript_support_level "1"; tag "basic,appris_principal_1,CCDS"; havana_transcript "OTTHUMT00000130797.2"; exon_number "5"; exon_id "ENSE00001833855.2"; protein_id "ENSP00000308583.2"; ccdsid "CCDS115.1"; +chr1 HAVANA transcript 10430512 10431130 . + . gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "2"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000602486.1"; transcript_type "processed_transcript"; transcript_name "CENPS-205"; transcript_support_level "3"; havana_transcript "OTTHUMT00000467584.1"; +chr1 HAVANA exon 10430512 10430568 . + . gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "2"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000602486.1"; transcript_type "processed_transcript"; transcript_name "CENPS-205"; transcript_support_level "3"; havana_transcript "OTTHUMT00000467584.1"; exon_number "1"; exon_id "ENSE00003405083.1"; +chr1 HAVANA exon 10430876 10431130 . + . gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "2"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000602486.1"; transcript_type "processed_transcript"; transcript_name "CENPS-205"; transcript_support_level "3"; havana_transcript "OTTHUMT00000467584.1"; exon_number "2"; exon_id "ENSE00003344661.1"; +chr1 HAVANA transcript 10430783 10442334 . + . gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "1"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000477755.1"; transcript_type "nonsense_mediated_decay"; transcript_name "CENPS-204"; transcript_support_level "2"; tag "overlapping_uORF,upstream_uORF,exp_conf"; havana_transcript "OTTHUMT00000130800.6"; protein_id "ENSP00000468629.2"; +chr1 HAVANA exon 10430783 10431401 . + . gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "1"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000477755.1"; transcript_type "nonsense_mediated_decay"; transcript_name "CENPS-204"; transcript_support_level "2"; tag "overlapping_uORF,upstream_uORF,exp_conf"; havana_transcript "OTTHUMT00000130800.6"; exon_number "1"; exon_id "ENSE00001850449.1"; protein_id "ENSP00000468629.2"; +chr1 HAVANA CDS 10431290 10431401 . + 0 gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "1"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000477755.1"; transcript_type "nonsense_mediated_decay"; transcript_name "CENPS-204"; transcript_support_level "2"; tag "overlapping_uORF,upstream_uORF,exp_conf"; havana_transcript "OTTHUMT00000130800.6"; exon_number "1"; exon_id "ENSE00001850449.1"; protein_id "ENSP00000468629.2"; +chr1 HAVANA start_codon 10431290 10431292 . + 0 gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "1"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000477755.1"; transcript_type "nonsense_mediated_decay"; transcript_name "CENPS-204"; transcript_support_level "2"; tag "overlapping_uORF,upstream_uORF,exp_conf"; havana_transcript "OTTHUMT00000130800.6"; exon_number "1"; exon_id "ENSE00001850449.1"; protein_id "ENSP00000468629.2"; +chr1 HAVANA exon 10433842 10433965 . + . 
gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "1"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000477755.1"; transcript_type "nonsense_mediated_decay"; transcript_name "CENPS-204"; transcript_support_level "2"; tag "overlapping_uORF,upstream_uORF,exp_conf"; havana_transcript "OTTHUMT00000130800.6"; exon_number "2"; exon_id "ENSE00003505048.1"; protein_id "ENSP00000468629.2"; +chr1 HAVANA CDS 10433842 10433963 . + 2 gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "1"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000477755.1"; transcript_type "nonsense_mediated_decay"; transcript_name "CENPS-204"; transcript_support_level "2"; tag "overlapping_uORF,upstream_uORF,exp_conf"; havana_transcript "OTTHUMT00000130800.6"; exon_number "2"; exon_id "ENSE00003505048.1"; protein_id "ENSP00000468629.2"; +chr1 HAVANA exon 10434657 10434690 . + . gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "1"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000477755.1"; transcript_type "nonsense_mediated_decay"; transcript_name "CENPS-204"; transcript_support_level "2"; tag "overlapping_uORF,upstream_uORF,exp_conf"; havana_transcript "OTTHUMT00000130800.6"; exon_number "3"; exon_id "ENSE00003290091.2"; protein_id "ENSP00000468629.2"; +chr1 HAVANA stop_codon 10433964 10433965 . + 0 gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "1"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000477755.1"; transcript_type "nonsense_mediated_decay"; transcript_name "CENPS-204"; transcript_support_level "2"; tag "overlapping_uORF,upstream_uORF,exp_conf"; havana_transcript "OTTHUMT00000130800.6"; exon_number "2"; exon_id "ENSE00003290091.2"; protein_id "ENSP00000468629.2"; +chr1 HAVANA stop_codon 10434657 10434657 . + 1 gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "1"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000477755.1"; transcript_type "nonsense_mediated_decay"; transcript_name "CENPS-204"; transcript_support_level "2"; tag "overlapping_uORF,upstream_uORF,exp_conf"; havana_transcript "OTTHUMT00000130800.6"; exon_number "3"; exon_id "ENSE00003290091.2"; protein_id "ENSP00000468629.2"; +chr1 HAVANA exon 10440347 10440413 . + . gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "1"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000477755.1"; transcript_type "nonsense_mediated_decay"; transcript_name "CENPS-204"; transcript_support_level "2"; tag "overlapping_uORF,upstream_uORF,exp_conf"; havana_transcript "OTTHUMT00000130800.6"; exon_number "4"; exon_id "ENSE00003786199.1"; protein_id "ENSP00000468629.2"; +chr1 HAVANA exon 10442265 10442334 . + . gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "1"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000477755.1"; transcript_type "nonsense_mediated_decay"; transcript_name "CENPS-204"; transcript_support_level "2"; tag "overlapping_uORF,upstream_uORF,exp_conf"; havana_transcript "OTTHUMT00000130800.6"; exon_number "5"; exon_id "ENSE00001930747.1"; protein_id "ENSP00000468629.2"; +chr1 HAVANA UTR 10430783 10431289 . + . 
gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "1"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000477755.1"; transcript_type "nonsense_mediated_decay"; transcript_name "CENPS-204"; transcript_support_level "2"; tag "overlapping_uORF,upstream_uORF,exp_conf"; havana_transcript "OTTHUMT00000130800.6"; exon_number "1"; exon_id "ENSE00001850449.1"; protein_id "ENSP00000468629.2"; +chr1 HAVANA UTR 10433964 10433965 . + . gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "1"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000477755.1"; transcript_type "nonsense_mediated_decay"; transcript_name "CENPS-204"; transcript_support_level "2"; tag "overlapping_uORF,upstream_uORF,exp_conf"; havana_transcript "OTTHUMT00000130800.6"; exon_number "2"; exon_id "ENSE00003505048.1"; protein_id "ENSP00000468629.2"; +chr1 HAVANA UTR 10434657 10434690 . + . gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "1"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000477755.1"; transcript_type "nonsense_mediated_decay"; transcript_name "CENPS-204"; transcript_support_level "2"; tag "overlapping_uORF,upstream_uORF,exp_conf"; havana_transcript "OTTHUMT00000130800.6"; exon_number "3"; exon_id "ENSE00003290091.2"; protein_id "ENSP00000468629.2"; +chr1 HAVANA UTR 10440347 10440413 . + . gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "1"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000477755.1"; transcript_type "nonsense_mediated_decay"; transcript_name "CENPS-204"; transcript_support_level "2"; tag "overlapping_uORF,upstream_uORF,exp_conf"; havana_transcript "OTTHUMT00000130800.6"; exon_number "4"; exon_id "ENSE00003786199.1"; protein_id "ENSP00000468629.2"; +chr1 HAVANA UTR 10442265 10442334 . + . gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "1"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000477755.1"; transcript_type "nonsense_mediated_decay"; transcript_name "CENPS-204"; transcript_support_level "2"; tag "overlapping_uORF,upstream_uORF,exp_conf"; havana_transcript "OTTHUMT00000130800.6"; exon_number "5"; exon_id "ENSE00001930747.1"; protein_id "ENSP00000468629.2"; +chr1 HAVANA transcript 10431123 10442502 . + . gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "2"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000462462.1"; transcript_type "protein_coding"; transcript_name "CENPS-202"; transcript_support_level "3"; tag "basic"; havana_transcript "OTTHUMT00000130802.2"; protein_id "ENSP00000489524.1"; +chr1 HAVANA exon 10431123 10431401 . + . gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "2"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000462462.1"; transcript_type "protein_coding"; transcript_name "CENPS-202"; transcript_support_level "3"; tag "basic"; havana_transcript "OTTHUMT00000130802.2"; exon_number "1"; exon_id "ENSE00001858020.1"; protein_id "ENSP00000489524.1"; +chr1 HAVANA CDS 10431290 10431401 . 
+ 0 gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "2"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000462462.1"; transcript_type "protein_coding"; transcript_name "CENPS-202"; transcript_support_level "3"; tag "basic"; havana_transcript "OTTHUMT00000130802.2"; exon_number "1"; exon_id "ENSE00001858020.1"; protein_id "ENSP00000489524.1"; +chr1 HAVANA start_codon 10431290 10431292 . + 0 gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "2"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000462462.1"; transcript_type "protein_coding"; transcript_name "CENPS-202"; transcript_support_level "3"; tag "basic"; havana_transcript "OTTHUMT00000130802.2"; exon_number "1"; exon_id "ENSE00001858020.1"; protein_id "ENSP00000489524.1"; +chr1 HAVANA exon 10440347 10440413 . + . gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "2"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000462462.1"; transcript_type "protein_coding"; transcript_name "CENPS-202"; transcript_support_level "3"; tag "basic"; havana_transcript "OTTHUMT00000130802.2"; exon_number "2"; exon_id "ENSE00003788145.1"; protein_id "ENSP00000489524.1"; +chr1 HAVANA CDS 10440347 10440381 . + 2 gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "2"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000462462.1"; transcript_type "protein_coding"; transcript_name "CENPS-202"; transcript_support_level "3"; tag "basic"; havana_transcript "OTTHUMT00000130802.2"; exon_number "2"; exon_id "ENSE00003788145.1"; protein_id "ENSP00000489524.1"; +chr1 HAVANA stop_codon 10440382 10440384 . + 0 gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "2"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000462462.1"; transcript_type "protein_coding"; transcript_name "CENPS-202"; transcript_support_level "3"; tag "basic"; havana_transcript "OTTHUMT00000130802.2"; exon_number "2"; exon_id "ENSE00003788145.1"; protein_id "ENSP00000489524.1"; +chr1 HAVANA exon 10442265 10442502 . + . gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "2"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000462462.1"; transcript_type "protein_coding"; transcript_name "CENPS-202"; transcript_support_level "3"; tag "basic"; havana_transcript "OTTHUMT00000130802.2"; exon_number "3"; exon_id "ENSE00001872482.1"; protein_id "ENSP00000489524.1"; +chr1 HAVANA UTR 10431123 10431289 . + . gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "2"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000462462.1"; transcript_type "protein_coding"; transcript_name "CENPS-202"; transcript_support_level "3"; tag "basic"; havana_transcript "OTTHUMT00000130802.2"; exon_number "1"; exon_id "ENSE00001858020.1"; protein_id "ENSP00000489524.1"; +chr1 HAVANA UTR 10440382 10440413 . + . gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "2"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000462462.1"; transcript_type "protein_coding"; transcript_name "CENPS-202"; transcript_support_level "3"; tag "basic"; havana_transcript "OTTHUMT00000130802.2"; exon_number "2"; exon_id "ENSE00003788145.1"; protein_id "ENSP00000489524.1"; +chr1 HAVANA UTR 10442265 10442502 . + . 
gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "2"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000462462.1"; transcript_type "protein_coding"; transcript_name "CENPS-202"; transcript_support_level "3"; tag "basic"; havana_transcript "OTTHUMT00000130802.2"; exon_number "3"; exon_id "ENSE00001872482.1"; protein_id "ENSP00000489524.1"; +chr1 HAVANA transcript 10440123 10442657 . + . gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "2"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000464507.1"; transcript_type "retained_intron"; transcript_name "CENPS-203"; transcript_support_level "2"; havana_transcript "OTTHUMT00000130801.1"; +chr1 HAVANA exon 10440123 10440413 . + . gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "2"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000464507.1"; transcript_type "retained_intron"; transcript_name "CENPS-203"; transcript_support_level "2"; havana_transcript "OTTHUMT00000130801.1"; exon_number "1"; exon_id "ENSE00001880184.1"; +chr1 HAVANA exon 10442265 10442657 . + . gene_id "ENSG00000175279.21"; gene_type "protein_coding"; gene_name "CENPS"; level "2"; havana_gene "OTTHUMG00000059085.10"; transcript_id "ENST00000464507.1"; transcript_type "retained_intron"; transcript_name "CENPS-203"; transcript_support_level "2"; havana_transcript "OTTHUMT00000130801.1"; exon_number "2"; exon_id "ENSE00001818886.1"; +chr1 HAVANA gene 10449719 10451902 . + . gene_id "ENSG00000241563.3"; gene_type "protein_coding"; gene_name "CORT"; level "2"; havana_gene "OTTHUMG00000001906.4"; tag "overlapping_locus"; +chr1 HAVANA transcript 10449719 10451902 . + . gene_id "ENSG00000241563.3"; gene_type "protein_coding"; gene_name "CORT"; level "2"; havana_gene "OTTHUMG00000001906.4"; transcript_id "ENST00000377049.3"; transcript_type "protein_coding"; transcript_name "CORT-201"; transcript_support_level "1"; tag "upstream_ATG,basic,appris_principal_1,CCDS"; havana_transcript "OTTHUMT00000005410.3"; protein_id "ENSP00000366248.3"; ccdsid "CCDS117.2"; +chr1 HAVANA exon 10449719 10450322 . + . gene_id "ENSG00000241563.3"; gene_type "protein_coding"; gene_name "CORT"; level "2"; havana_gene "OTTHUMG00000001906.4"; transcript_id "ENST00000377049.3"; transcript_type "protein_coding"; transcript_name "CORT-201"; transcript_support_level "1"; tag "upstream_ATG,basic,appris_principal_1,CCDS"; havana_transcript "OTTHUMT00000005410.3"; exon_number "1"; exon_id "ENSE00001472620.3"; protein_id "ENSP00000366248.3"; ccdsid "CCDS117.2"; +chr1 HAVANA CDS 10450224 10450322 . + 0 gene_id "ENSG00000241563.3"; gene_type "protein_coding"; gene_name "CORT"; level "2"; havana_gene "OTTHUMG00000001906.4"; transcript_id "ENST00000377049.3"; transcript_type "protein_coding"; transcript_name "CORT-201"; transcript_support_level "1"; tag "upstream_ATG,basic,appris_principal_1,CCDS"; havana_transcript "OTTHUMT00000005410.3"; exon_number "1"; exon_id "ENSE00001472620.3"; protein_id "ENSP00000366248.3"; ccdsid "CCDS117.2"; +chr1 HAVANA start_codon 10450224 10450226 . 
+ 0 gene_id "ENSG00000241563.3"; gene_type "protein_coding"; gene_name "CORT"; level "2"; havana_gene "OTTHUMG00000001906.4"; transcript_id "ENST00000377049.3"; transcript_type "protein_coding"; transcript_name "CORT-201"; transcript_support_level "1"; tag "upstream_ATG,basic,appris_principal_1,CCDS"; havana_transcript "OTTHUMT00000005410.3"; exon_number "1"; exon_id "ENSE00001472620.3"; protein_id "ENSP00000366248.3"; ccdsid "CCDS117.2"; +chr1 HAVANA exon 10451377 10451902 . + . gene_id "ENSG00000241563.3"; gene_type "protein_coding"; gene_name "CORT"; level "2"; havana_gene "OTTHUMG00000001906.4"; transcript_id "ENST00000377049.3"; transcript_type "protein_coding"; transcript_name "CORT-201"; transcript_support_level "1"; tag "upstream_ATG,basic,appris_principal_1,CCDS"; havana_transcript "OTTHUMT00000005410.3"; exon_number "2"; exon_id "ENSE00001887610.1"; protein_id "ENSP00000366248.3"; ccdsid "CCDS117.2"; +chr1 HAVANA CDS 10451377 10451592 . + 0 gene_id "ENSG00000241563.3"; gene_type "protein_coding"; gene_name "CORT"; level "2"; havana_gene "OTTHUMG00000001906.4"; transcript_id "ENST00000377049.3"; transcript_type "protein_coding"; transcript_name "CORT-201"; transcript_support_level "1"; tag "upstream_ATG,basic,appris_principal_1,CCDS"; havana_transcript "OTTHUMT00000005410.3"; exon_number "2"; exon_id "ENSE00001887610.1"; protein_id "ENSP00000366248.3"; ccdsid "CCDS117.2"; +chr1 HAVANA stop_codon 10451593 10451595 . + 0 gene_id "ENSG00000241563.3"; gene_type "protein_coding"; gene_name "CORT"; level "2"; havana_gene "OTTHUMG00000001906.4"; transcript_id "ENST00000377049.3"; transcript_type "protein_coding"; transcript_name "CORT-201"; transcript_support_level "1"; tag "upstream_ATG,basic,appris_principal_1,CCDS"; havana_transcript "OTTHUMT00000005410.3"; exon_number "2"; exon_id "ENSE00001887610.1"; protein_id "ENSP00000366248.3"; ccdsid "CCDS117.2"; +chr1 HAVANA UTR 10449719 10450223 . + . gene_id "ENSG00000241563.3"; gene_type "protein_coding"; gene_name "CORT"; level "2"; havana_gene "OTTHUMG00000001906.4"; transcript_id "ENST00000377049.3"; transcript_type "protein_coding"; transcript_name "CORT-201"; transcript_support_level "1"; tag "upstream_ATG,basic,appris_principal_1,CCDS"; havana_transcript "OTTHUMT00000005410.3"; exon_number "1"; exon_id "ENSE00001472620.3"; protein_id "ENSP00000366248.3"; ccdsid "CCDS117.2"; +chr1 HAVANA UTR 10451593 10451902 . + . gene_id "ENSG00000241563.3"; gene_type "protein_coding"; gene_name "CORT"; level "2"; havana_gene "OTTHUMG00000001906.4"; transcript_id "ENST00000377049.3"; transcript_type "protein_coding"; transcript_name "CORT-201"; transcript_support_level "1"; tag "upstream_ATG,basic,appris_principal_1,CCDS"; havana_transcript "OTTHUMT00000005410.3"; exon_number "2"; exon_id "ENSE00001887610.1"; protein_id "ENSP00000366248.3"; ccdsid "CCDS117.2"; +chr1 HAVANA gene 23691742 23696835 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; +chr1 HAVANA transcript 23691742 23696417 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000374550.8"; transcript_type "protein_coding"; transcript_name "RPL11-201"; transcript_support_level "1"; tag "NAGNAG_splice_site,basic,appris_alternative_1"; havana_transcript "OTTHUMT00000494315.1"; protein_id "ENSP00000363676.4"; +chr1 HAVANA exon 23691742 23691829 . + . 
gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000374550.8"; transcript_type "protein_coding"; transcript_name "RPL11-201"; transcript_support_level "1"; tag "NAGNAG_splice_site,basic,appris_alternative_1"; havana_transcript "OTTHUMT00000494315.1"; exon_number "1"; exon_id "ENSE00003819571.1"; protein_id "ENSP00000363676.4"; +chr1 HAVANA CDS 23691824 23691829 . + 0 gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000374550.8"; transcript_type "protein_coding"; transcript_name "RPL11-201"; transcript_support_level "1"; tag "NAGNAG_splice_site,basic,appris_alternative_1"; havana_transcript "OTTHUMT00000494315.1"; exon_number "1"; exon_id "ENSE00003819571.1"; protein_id "ENSP00000363676.4"; +chr1 HAVANA start_codon 23691824 23691826 . + 0 gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000374550.8"; transcript_type "protein_coding"; transcript_name "RPL11-201"; transcript_support_level "1"; tag "NAGNAG_splice_site,basic,appris_alternative_1"; havana_transcript "OTTHUMT00000494315.1"; exon_number "1"; exon_id "ENSE00003819571.1"; protein_id "ENSP00000363676.4"; +chr1 HAVANA exon 23692612 23692759 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000374550.8"; transcript_type "protein_coding"; transcript_name "RPL11-201"; transcript_support_level "1"; tag "NAGNAG_splice_site,basic,appris_alternative_1"; havana_transcript "OTTHUMT00000494315.1"; exon_number "2"; exon_id "ENSE00003826575.1"; protein_id "ENSP00000363676.4"; +chr1 HAVANA CDS 23692612 23692759 . + 0 gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000374550.8"; transcript_type "protein_coding"; transcript_name "RPL11-201"; transcript_support_level "1"; tag "NAGNAG_splice_site,basic,appris_alternative_1"; havana_transcript "OTTHUMT00000494315.1"; exon_number "2"; exon_id "ENSE00003826575.1"; protein_id "ENSP00000363676.4"; +chr1 HAVANA exon 23693807 23693913 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000374550.8"; transcript_type "protein_coding"; transcript_name "RPL11-201"; transcript_support_level "1"; tag "NAGNAG_splice_site,basic,appris_alternative_1"; havana_transcript "OTTHUMT00000494315.1"; exon_number "3"; exon_id "ENSE00003529048.1"; protein_id "ENSP00000363676.4"; +chr1 HAVANA CDS 23693807 23693913 . + 2 gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000374550.8"; transcript_type "protein_coding"; transcript_name "RPL11-201"; transcript_support_level "1"; tag "NAGNAG_splice_site,basic,appris_alternative_1"; havana_transcript "OTTHUMT00000494315.1"; exon_number "3"; exon_id "ENSE00003529048.1"; protein_id "ENSP00000363676.4"; +chr1 HAVANA exon 23694660 23694791 . + . 
gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000374550.8"; transcript_type "protein_coding"; transcript_name "RPL11-201"; transcript_support_level "1"; tag "NAGNAG_splice_site,basic,appris_alternative_1"; havana_transcript "OTTHUMT00000494315.1"; exon_number "4"; exon_id "ENSE00003542504.1"; protein_id "ENSP00000363676.4"; +chr1 HAVANA CDS 23694660 23694791 . + 0 gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000374550.8"; transcript_type "protein_coding"; transcript_name "RPL11-201"; transcript_support_level "1"; tag "NAGNAG_splice_site,basic,appris_alternative_1"; havana_transcript "OTTHUMT00000494315.1"; exon_number "4"; exon_id "ENSE00003542504.1"; protein_id "ENSP00000363676.4"; +chr1 HAVANA exon 23695798 23695908 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000374550.8"; transcript_type "protein_coding"; transcript_name "RPL11-201"; transcript_support_level "1"; tag "NAGNAG_splice_site,basic,appris_alternative_1"; havana_transcript "OTTHUMT00000494315.1"; exon_number "5"; exon_id "ENSE00003586241.1"; protein_id "ENSP00000363676.4"; +chr1 HAVANA CDS 23695798 23695908 . + 0 gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000374550.8"; transcript_type "protein_coding"; transcript_name "RPL11-201"; transcript_support_level "1"; tag "NAGNAG_splice_site,basic,appris_alternative_1"; havana_transcript "OTTHUMT00000494315.1"; exon_number "5"; exon_id "ENSE00003586241.1"; protein_id "ENSP00000363676.4"; +chr1 HAVANA exon 23696344 23696417 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000374550.8"; transcript_type "protein_coding"; transcript_name "RPL11-201"; transcript_support_level "1"; tag "NAGNAG_splice_site,basic,appris_alternative_1"; havana_transcript "OTTHUMT00000494315.1"; exon_number "6"; exon_id "ENSE00003823475.1"; protein_id "ENSP00000363676.4"; +chr1 HAVANA CDS 23696344 23696370 . + 0 gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000374550.8"; transcript_type "protein_coding"; transcript_name "RPL11-201"; transcript_support_level "1"; tag "NAGNAG_splice_site,basic,appris_alternative_1"; havana_transcript "OTTHUMT00000494315.1"; exon_number "6"; exon_id "ENSE00003823475.1"; protein_id "ENSP00000363676.4"; +chr1 HAVANA stop_codon 23696371 23696373 . + 0 gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000374550.8"; transcript_type "protein_coding"; transcript_name "RPL11-201"; transcript_support_level "1"; tag "NAGNAG_splice_site,basic,appris_alternative_1"; havana_transcript "OTTHUMT00000494315.1"; exon_number "6"; exon_id "ENSE00003823475.1"; protein_id "ENSP00000363676.4"; +chr1 HAVANA UTR 23691742 23691823 . + . 
gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000374550.8"; transcript_type "protein_coding"; transcript_name "RPL11-201"; transcript_support_level "1"; tag "NAGNAG_splice_site,basic,appris_alternative_1"; havana_transcript "OTTHUMT00000494315.1"; exon_number "1"; exon_id "ENSE00003819571.1"; protein_id "ENSP00000363676.4"; +chr1 HAVANA UTR 23696371 23696417 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000374550.8"; transcript_type "protein_coding"; transcript_name "RPL11-201"; transcript_support_level "1"; tag "NAGNAG_splice_site,basic,appris_alternative_1"; havana_transcript "OTTHUMT00000494315.1"; exon_number "6"; exon_id "ENSE00003823475.1"; protein_id "ENSP00000363676.4"; +chr1 HAVANA transcript 23691806 23696412 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000443624.6"; transcript_type "retained_intron"; transcript_name "RPL11-202"; transcript_support_level "2"; havana_transcript "OTTHUMT00000008171.3"; +chr1 HAVANA exon 23691806 23691829 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000443624.6"; transcript_type "retained_intron"; transcript_name "RPL11-202"; transcript_support_level "2"; havana_transcript "OTTHUMT00000008171.3"; exon_number "1"; exon_id "ENSE00003826268.1"; +chr1 HAVANA exon 23692609 23692759 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000443624.6"; transcript_type "retained_intron"; transcript_name "RPL11-202"; transcript_support_level "2"; havana_transcript "OTTHUMT00000008171.3"; exon_number "2"; exon_id "ENSE00003605024.1"; +chr1 HAVANA exon 23693807 23693913 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000443624.6"; transcript_type "retained_intron"; transcript_name "RPL11-202"; transcript_support_level "2"; havana_transcript "OTTHUMT00000008171.3"; exon_number "3"; exon_id "ENSE00003648500.1"; +chr1 HAVANA exon 23694660 23695908 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000443624.6"; transcript_type "retained_intron"; transcript_name "RPL11-202"; transcript_support_level "2"; havana_transcript "OTTHUMT00000008171.3"; exon_number "4"; exon_id "ENSE00001629265.2"; +chr1 HAVANA exon 23696344 23696412 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000443624.6"; transcript_type "retained_intron"; transcript_name "RPL11-202"; transcript_support_level "2"; havana_transcript "OTTHUMT00000008171.3"; exon_number "5"; exon_id "ENSE00003823259.1"; +chr1 HAVANA transcript 23691806 23696835 . + . 
gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000643754.2"; transcript_type "protein_coding"; transcript_name "RPL11-206"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008168.5"; protein_id "ENSP00000496250.1"; ccdsid "CCDS238.1"; +chr1 HAVANA exon 23691806 23691829 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000643754.2"; transcript_type "protein_coding"; transcript_name "RPL11-206"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008168.5"; exon_number "1"; exon_id "ENSE00001463805.4"; protein_id "ENSP00000496250.1"; ccdsid "CCDS238.1"; +chr1 HAVANA CDS 23691824 23691829 . + 0 gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000643754.2"; transcript_type "protein_coding"; transcript_name "RPL11-206"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008168.5"; exon_number "1"; exon_id "ENSE00001463805.4"; protein_id "ENSP00000496250.1"; ccdsid "CCDS238.1"; +chr1 HAVANA start_codon 23691824 23691826 . + 0 gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000643754.2"; transcript_type "protein_coding"; transcript_name "RPL11-206"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008168.5"; exon_number "1"; exon_id "ENSE00001463805.4"; protein_id "ENSP00000496250.1"; ccdsid "CCDS238.1"; +chr1 HAVANA exon 23692609 23692759 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000643754.2"; transcript_type "protein_coding"; transcript_name "RPL11-206"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008168.5"; exon_number "2"; exon_id "ENSE00003655446.1"; protein_id "ENSP00000496250.1"; ccdsid "CCDS238.1"; +chr1 HAVANA CDS 23692609 23692759 . + 0 gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000643754.2"; transcript_type "protein_coding"; transcript_name "RPL11-206"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008168.5"; exon_number "2"; exon_id "ENSE00003655446.1"; protein_id "ENSP00000496250.1"; ccdsid "CCDS238.1"; +chr1 HAVANA exon 23693807 23693913 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000643754.2"; transcript_type "protein_coding"; transcript_name "RPL11-206"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008168.5"; exon_number "3"; exon_id "ENSE00003529048.1"; protein_id "ENSP00000496250.1"; ccdsid "CCDS238.1"; +chr1 HAVANA CDS 23693807 23693913 . + 2 gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000643754.2"; transcript_type "protein_coding"; transcript_name "RPL11-206"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008168.5"; exon_number "3"; exon_id "ENSE00003529048.1"; protein_id "ENSP00000496250.1"; ccdsid "CCDS238.1"; +chr1 HAVANA exon 23694660 23694791 . + . 
gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000643754.2"; transcript_type "protein_coding"; transcript_name "RPL11-206"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008168.5"; exon_number "4"; exon_id "ENSE00003542504.1"; protein_id "ENSP00000496250.1"; ccdsid "CCDS238.1"; +chr1 HAVANA CDS 23694660 23694791 . + 0 gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000643754.2"; transcript_type "protein_coding"; transcript_name "RPL11-206"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008168.5"; exon_number "4"; exon_id "ENSE00003542504.1"; protein_id "ENSP00000496250.1"; ccdsid "CCDS238.1"; +chr1 HAVANA exon 23695798 23695908 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000643754.2"; transcript_type "protein_coding"; transcript_name "RPL11-206"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008168.5"; exon_number "5"; exon_id "ENSE00003586241.1"; protein_id "ENSP00000496250.1"; ccdsid "CCDS238.1"; +chr1 HAVANA CDS 23695798 23695908 . + 0 gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000643754.2"; transcript_type "protein_coding"; transcript_name "RPL11-206"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008168.5"; exon_number "5"; exon_id "ENSE00003586241.1"; protein_id "ENSP00000496250.1"; ccdsid "CCDS238.1"; +chr1 HAVANA exon 23696344 23696835 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000643754.2"; transcript_type "protein_coding"; transcript_name "RPL11-206"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008168.5"; exon_number "6"; exon_id "ENSE00003827991.1"; protein_id "ENSP00000496250.1"; ccdsid "CCDS238.1"; +chr1 HAVANA CDS 23696344 23696370 . + 0 gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000643754.2"; transcript_type "protein_coding"; transcript_name "RPL11-206"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008168.5"; exon_number "6"; exon_id "ENSE00003827991.1"; protein_id "ENSP00000496250.1"; ccdsid "CCDS238.1"; +chr1 HAVANA stop_codon 23696371 23696373 . + 0 gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000643754.2"; transcript_type "protein_coding"; transcript_name "RPL11-206"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008168.5"; exon_number "6"; exon_id "ENSE00003827991.1"; protein_id "ENSP00000496250.1"; ccdsid "CCDS238.1"; +chr1 HAVANA UTR 23691806 23691823 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000643754.2"; transcript_type "protein_coding"; transcript_name "RPL11-206"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008168.5"; exon_number "1"; exon_id "ENSE00001463805.4"; protein_id "ENSP00000496250.1"; ccdsid "CCDS238.1"; +chr1 HAVANA UTR 23696371 23696835 . + . 
gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000643754.2"; transcript_type "protein_coding"; transcript_name "RPL11-206"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008168.5"; exon_number "6"; exon_id "ENSE00003827991.1"; protein_id "ENSP00000496250.1"; ccdsid "CCDS238.1"; +chr1 HAVANA transcript 23691821 23696425 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000467075.2"; transcript_type "nonsense_mediated_decay"; transcript_name "RPL11-204"; transcript_support_level "3"; tag "RNA_Seq_supported_partial"; havana_transcript "OTTHUMT00000008169.2"; protein_id "ENSP00000493634.1"; +chr1 HAVANA exon 23691821 23692060 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000467075.2"; transcript_type "nonsense_mediated_decay"; transcript_name "RPL11-204"; transcript_support_level "3"; tag "RNA_Seq_supported_partial"; havana_transcript "OTTHUMT00000008169.2"; exon_number "1"; exon_id "ENSE00001914158.2"; protein_id "ENSP00000493634.1"; +chr1 HAVANA CDS 23691824 23691955 . + 0 gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000467075.2"; transcript_type "nonsense_mediated_decay"; transcript_name "RPL11-204"; transcript_support_level "3"; tag "RNA_Seq_supported_partial"; havana_transcript "OTTHUMT00000008169.2"; exon_number "1"; exon_id "ENSE00001914158.2"; protein_id "ENSP00000493634.1"; +chr1 HAVANA start_codon 23691824 23691826 . + 0 gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000467075.2"; transcript_type "nonsense_mediated_decay"; transcript_name "RPL11-204"; transcript_support_level "3"; tag "RNA_Seq_supported_partial"; havana_transcript "OTTHUMT00000008169.2"; exon_number "1"; exon_id "ENSE00001914158.2"; protein_id "ENSP00000493634.1"; +chr1 HAVANA stop_codon 23691956 23691958 . + 0 gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000467075.2"; transcript_type "nonsense_mediated_decay"; transcript_name "RPL11-204"; transcript_support_level "3"; tag "RNA_Seq_supported_partial"; havana_transcript "OTTHUMT00000008169.2"; exon_number "1"; exon_id "ENSE00001914158.2"; protein_id "ENSP00000493634.1"; +chr1 HAVANA exon 23692609 23692759 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000467075.2"; transcript_type "nonsense_mediated_decay"; transcript_name "RPL11-204"; transcript_support_level "3"; tag "RNA_Seq_supported_partial"; havana_transcript "OTTHUMT00000008169.2"; exon_number "2"; exon_id "ENSE00003605024.1"; protein_id "ENSP00000493634.1"; +chr1 HAVANA exon 23693807 23693913 . + . 
gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000467075.2"; transcript_type "nonsense_mediated_decay"; transcript_name "RPL11-204"; transcript_support_level "3"; tag "RNA_Seq_supported_partial"; havana_transcript "OTTHUMT00000008169.2"; exon_number "3"; exon_id "ENSE00003648500.1"; protein_id "ENSP00000493634.1"; +chr1 HAVANA exon 23694660 23694791 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000467075.2"; transcript_type "nonsense_mediated_decay"; transcript_name "RPL11-204"; transcript_support_level "3"; tag "RNA_Seq_supported_partial"; havana_transcript "OTTHUMT00000008169.2"; exon_number "4"; exon_id "ENSE00003606435.1"; protein_id "ENSP00000493634.1"; +chr1 HAVANA exon 23695798 23695908 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000467075.2"; transcript_type "nonsense_mediated_decay"; transcript_name "RPL11-204"; transcript_support_level "3"; tag "RNA_Seq_supported_partial"; havana_transcript "OTTHUMT00000008169.2"; exon_number "5"; exon_id "ENSE00003568710.1"; protein_id "ENSP00000493634.1"; +chr1 HAVANA exon 23696344 23696425 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000467075.2"; transcript_type "nonsense_mediated_decay"; transcript_name "RPL11-204"; transcript_support_level "3"; tag "RNA_Seq_supported_partial"; havana_transcript "OTTHUMT00000008169.2"; exon_number "6"; exon_id "ENSE00001044698.3"; protein_id "ENSP00000493634.1"; +chr1 HAVANA UTR 23691821 23691823 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000467075.2"; transcript_type "nonsense_mediated_decay"; transcript_name "RPL11-204"; transcript_support_level "3"; tag "RNA_Seq_supported_partial"; havana_transcript "OTTHUMT00000008169.2"; exon_number "1"; exon_id "ENSE00001914158.2"; protein_id "ENSP00000493634.1"; +chr1 HAVANA UTR 23691956 23692060 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000467075.2"; transcript_type "nonsense_mediated_decay"; transcript_name "RPL11-204"; transcript_support_level "3"; tag "RNA_Seq_supported_partial"; havana_transcript "OTTHUMT00000008169.2"; exon_number "1"; exon_id "ENSE00001914158.2"; protein_id "ENSP00000493634.1"; +chr1 HAVANA UTR 23692609 23692759 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000467075.2"; transcript_type "nonsense_mediated_decay"; transcript_name "RPL11-204"; transcript_support_level "3"; tag "RNA_Seq_supported_partial"; havana_transcript "OTTHUMT00000008169.2"; exon_number "2"; exon_id "ENSE00003605024.1"; protein_id "ENSP00000493634.1"; +chr1 HAVANA UTR 23693807 23693913 . + . 
gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000467075.2"; transcript_type "nonsense_mediated_decay"; transcript_name "RPL11-204"; transcript_support_level "3"; tag "RNA_Seq_supported_partial"; havana_transcript "OTTHUMT00000008169.2"; exon_number "3"; exon_id "ENSE00003648500.1"; protein_id "ENSP00000493634.1"; +chr1 HAVANA UTR 23694660 23694791 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000467075.2"; transcript_type "nonsense_mediated_decay"; transcript_name "RPL11-204"; transcript_support_level "3"; tag "RNA_Seq_supported_partial"; havana_transcript "OTTHUMT00000008169.2"; exon_number "4"; exon_id "ENSE00003606435.1"; protein_id "ENSP00000493634.1"; +chr1 HAVANA UTR 23695798 23695908 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000467075.2"; transcript_type "nonsense_mediated_decay"; transcript_name "RPL11-204"; transcript_support_level "3"; tag "RNA_Seq_supported_partial"; havana_transcript "OTTHUMT00000008169.2"; exon_number "5"; exon_id "ENSE00003568710.1"; protein_id "ENSP00000493634.1"; +chr1 HAVANA UTR 23696344 23696425 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000467075.2"; transcript_type "nonsense_mediated_decay"; transcript_name "RPL11-204"; transcript_support_level "3"; tag "RNA_Seq_supported_partial"; havana_transcript "OTTHUMT00000008169.2"; exon_number "6"; exon_id "ENSE00001044698.3"; protein_id "ENSP00000493634.1"; +chr1 HAVANA transcript 23692306 23696418 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000458455.2"; transcript_type "protein_coding"; transcript_name "RPL11-203"; transcript_support_level "1"; tag "basic"; havana_transcript "OTTHUMT00000008170.2"; protein_id "ENSP00000398888.2"; +chr1 HAVANA exon 23692306 23692759 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000458455.2"; transcript_type "protein_coding"; transcript_name "RPL11-203"; transcript_support_level "1"; tag "basic"; havana_transcript "OTTHUMT00000008170.2"; exon_number "1"; exon_id "ENSE00001874656.1"; protein_id "ENSP00000398888.2"; +chr1 HAVANA CDS 23692636 23692759 . + 0 gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000458455.2"; transcript_type "protein_coding"; transcript_name "RPL11-203"; transcript_support_level "1"; tag "basic"; havana_transcript "OTTHUMT00000008170.2"; exon_number "1"; exon_id "ENSE00001874656.1"; protein_id "ENSP00000398888.2"; +chr1 HAVANA start_codon 23692636 23692638 . + 0 gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000458455.2"; transcript_type "protein_coding"; transcript_name "RPL11-203"; transcript_support_level "1"; tag "basic"; havana_transcript "OTTHUMT00000008170.2"; exon_number "1"; exon_id "ENSE00001874656.1"; protein_id "ENSP00000398888.2"; +chr1 HAVANA exon 23693807 23693913 . + . 
gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000458455.2"; transcript_type "protein_coding"; transcript_name "RPL11-203"; transcript_support_level "1"; tag "basic"; havana_transcript "OTTHUMT00000008170.2"; exon_number "2"; exon_id "ENSE00003529048.1"; protein_id "ENSP00000398888.2"; +chr1 HAVANA CDS 23693807 23693913 . + 2 gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000458455.2"; transcript_type "protein_coding"; transcript_name "RPL11-203"; transcript_support_level "1"; tag "basic"; havana_transcript "OTTHUMT00000008170.2"; exon_number "2"; exon_id "ENSE00003529048.1"; protein_id "ENSP00000398888.2"; +chr1 HAVANA exon 23694660 23694791 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000458455.2"; transcript_type "protein_coding"; transcript_name "RPL11-203"; transcript_support_level "1"; tag "basic"; havana_transcript "OTTHUMT00000008170.2"; exon_number "3"; exon_id "ENSE00003542504.1"; protein_id "ENSP00000398888.2"; +chr1 HAVANA CDS 23694660 23694791 . + 0 gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000458455.2"; transcript_type "protein_coding"; transcript_name "RPL11-203"; transcript_support_level "1"; tag "basic"; havana_transcript "OTTHUMT00000008170.2"; exon_number "3"; exon_id "ENSE00003542504.1"; protein_id "ENSP00000398888.2"; +chr1 HAVANA exon 23695798 23695908 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000458455.2"; transcript_type "protein_coding"; transcript_name "RPL11-203"; transcript_support_level "1"; tag "basic"; havana_transcript "OTTHUMT00000008170.2"; exon_number "4"; exon_id "ENSE00003586241.1"; protein_id "ENSP00000398888.2"; +chr1 HAVANA CDS 23695798 23695908 . + 0 gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000458455.2"; transcript_type "protein_coding"; transcript_name "RPL11-203"; transcript_support_level "1"; tag "basic"; havana_transcript "OTTHUMT00000008170.2"; exon_number "4"; exon_id "ENSE00003586241.1"; protein_id "ENSP00000398888.2"; +chr1 HAVANA exon 23696344 23696418 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000458455.2"; transcript_type "protein_coding"; transcript_name "RPL11-203"; transcript_support_level "1"; tag "basic"; havana_transcript "OTTHUMT00000008170.2"; exon_number "5"; exon_id "ENSE00001857063.1"; protein_id "ENSP00000398888.2"; +chr1 HAVANA CDS 23696344 23696370 . + 0 gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000458455.2"; transcript_type "protein_coding"; transcript_name "RPL11-203"; transcript_support_level "1"; tag "basic"; havana_transcript "OTTHUMT00000008170.2"; exon_number "5"; exon_id "ENSE00001857063.1"; protein_id "ENSP00000398888.2"; +chr1 HAVANA stop_codon 23696371 23696373 . 
+ 0 gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000458455.2"; transcript_type "protein_coding"; transcript_name "RPL11-203"; transcript_support_level "1"; tag "basic"; havana_transcript "OTTHUMT00000008170.2"; exon_number "5"; exon_id "ENSE00001857063.1"; protein_id "ENSP00000398888.2"; +chr1 HAVANA UTR 23692306 23692635 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000458455.2"; transcript_type "protein_coding"; transcript_name "RPL11-203"; transcript_support_level "1"; tag "basic"; havana_transcript "OTTHUMT00000008170.2"; exon_number "1"; exon_id "ENSE00001874656.1"; protein_id "ENSP00000398888.2"; +chr1 HAVANA UTR 23696371 23696418 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000458455.2"; transcript_type "protein_coding"; transcript_name "RPL11-203"; transcript_support_level "1"; tag "basic"; havana_transcript "OTTHUMT00000008170.2"; exon_number "5"; exon_id "ENSE00001857063.1"; protein_id "ENSP00000398888.2"; +chr1 HAVANA transcript 23692609 23695946 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000482370.2"; transcript_type "retained_intron"; transcript_name "RPL11-205"; transcript_support_level "1"; havana_transcript "OTTHUMT00000008574.2"; +chr1 HAVANA exon 23692609 23692759 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000482370.2"; transcript_type "retained_intron"; transcript_name "RPL11-205"; transcript_support_level "1"; havana_transcript "OTTHUMT00000008574.2"; exon_number "1"; exon_id "ENSE00003605024.1"; +chr1 HAVANA exon 23693807 23693913 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000482370.2"; transcript_type "retained_intron"; transcript_name "RPL11-205"; transcript_support_level "1"; havana_transcript "OTTHUMT00000008574.2"; exon_number "2"; exon_id "ENSE00003648500.1"; +chr1 HAVANA exon 23694660 23694791 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000482370.2"; transcript_type "retained_intron"; transcript_name "RPL11-205"; transcript_support_level "1"; havana_transcript "OTTHUMT00000008574.2"; exon_number "3"; exon_id "ENSE00003606435.1"; +chr1 HAVANA exon 23695798 23695946 . + . gene_id "ENSG00000142676.14"; gene_type "protein_coding"; gene_name "RPL11"; level "2"; havana_gene "OTTHUMG00000002926.8"; transcript_id "ENST00000482370.2"; transcript_type "retained_intron"; transcript_name "RPL11-205"; transcript_support_level "1"; havana_transcript "OTTHUMT00000008574.2"; exon_number "4"; exon_id "ENSE00001619539.2"; +chr1 HAVANA gene 23743155 23762059 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; +chr1 HAVANA transcript 23743155 23762059 . + . 
gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000418390.6"; transcript_type "protein_coding"; transcript_name "ELOA-201"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008230.2"; protein_id "ENSP00000395574.2"; ccdsid "CCDS239.2"; +chr1 HAVANA exon 23743155 23743578 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000418390.6"; transcript_type "protein_coding"; transcript_name "ELOA-201"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008230.2"; exon_number "1"; exon_id "ENSE00001731717.2"; protein_id "ENSP00000395574.2"; ccdsid "CCDS239.2"; +chr1 HAVANA CDS 23743426 23743578 . + 0 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000418390.6"; transcript_type "protein_coding"; transcript_name "ELOA-201"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008230.2"; exon_number "1"; exon_id "ENSE00001731717.2"; protein_id "ENSP00000395574.2"; ccdsid "CCDS239.2"; +chr1 HAVANA start_codon 23743426 23743428 . + 0 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000418390.6"; transcript_type "protein_coding"; transcript_name "ELOA-201"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008230.2"; exon_number "1"; exon_id "ENSE00001731717.2"; protein_id "ENSP00000395574.2"; ccdsid "CCDS239.2"; +chr1 HAVANA exon 23749021 23749077 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000418390.6"; transcript_type "protein_coding"; transcript_name "ELOA-201"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008230.2"; exon_number "2"; exon_id "ENSE00003559637.1"; protein_id "ENSP00000395574.2"; ccdsid "CCDS239.2"; +chr1 HAVANA CDS 23749021 23749077 . + 0 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000418390.6"; transcript_type "protein_coding"; transcript_name "ELOA-201"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008230.2"; exon_number "2"; exon_id "ENSE00003559637.1"; protein_id "ENSP00000395574.2"; ccdsid "CCDS239.2"; +chr1 HAVANA exon 23749842 23749948 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000418390.6"; transcript_type "protein_coding"; transcript_name "ELOA-201"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008230.2"; exon_number "3"; exon_id "ENSE00003565857.1"; protein_id "ENSP00000395574.2"; ccdsid "CCDS239.2"; +chr1 HAVANA CDS 23749842 23749948 . 
+ 0 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000418390.6"; transcript_type "protein_coding"; transcript_name "ELOA-201"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008230.2"; exon_number "3"; exon_id "ENSE00003565857.1"; protein_id "ENSP00000395574.2"; ccdsid "CCDS239.2"; +chr1 HAVANA exon 23750845 23752030 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000418390.6"; transcript_type "protein_coding"; transcript_name "ELOA-201"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008230.2"; exon_number "4"; exon_id "ENSE00000560760.1"; protein_id "ENSP00000395574.2"; ccdsid "CCDS239.2"; +chr1 HAVANA CDS 23750845 23752030 . + 1 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000418390.6"; transcript_type "protein_coding"; transcript_name "ELOA-201"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008230.2"; exon_number "4"; exon_id "ENSE00000560760.1"; protein_id "ENSP00000395574.2"; ccdsid "CCDS239.2"; +chr1 HAVANA exon 23752407 23752518 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000418390.6"; transcript_type "protein_coding"; transcript_name "ELOA-201"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008230.2"; exon_number "5"; exon_id "ENSE00000758354.1"; protein_id "ENSP00000395574.2"; ccdsid "CCDS239.2"; +chr1 HAVANA CDS 23752407 23752518 . + 0 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000418390.6"; transcript_type "protein_coding"; transcript_name "ELOA-201"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008230.2"; exon_number "5"; exon_id "ENSE00000758354.1"; protein_id "ENSP00000395574.2"; ccdsid "CCDS239.2"; +chr1 HAVANA exon 23754100 23754255 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000418390.6"; transcript_type "protein_coding"; transcript_name "ELOA-201"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008230.2"; exon_number "6"; exon_id "ENSE00000388637.1"; protein_id "ENSP00000395574.2"; ccdsid "CCDS239.2"; +chr1 HAVANA CDS 23754100 23754255 . + 2 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000418390.6"; transcript_type "protein_coding"; transcript_name "ELOA-201"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008230.2"; exon_number "6"; exon_id "ENSE00000388637.1"; protein_id "ENSP00000395574.2"; ccdsid "CCDS239.2"; +chr1 HAVANA exon 23754363 23754460 . + . 
gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000418390.6"; transcript_type "protein_coding"; transcript_name "ELOA-201"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008230.2"; exon_number "7"; exon_id "ENSE00000388638.1"; protein_id "ENSP00000395574.2"; ccdsid "CCDS239.2"; +chr1 HAVANA CDS 23754363 23754460 . + 2 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000418390.6"; transcript_type "protein_coding"; transcript_name "ELOA-201"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008230.2"; exon_number "7"; exon_id "ENSE00000388638.1"; protein_id "ENSP00000395574.2"; ccdsid "CCDS239.2"; +chr1 HAVANA exon 23755843 23756023 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000418390.6"; transcript_type "protein_coding"; transcript_name "ELOA-201"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008230.2"; exon_number "8"; exon_id "ENSE00000388639.1"; protein_id "ENSP00000395574.2"; ccdsid "CCDS239.2"; +chr1 HAVANA CDS 23755843 23756023 . + 0 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000418390.6"; transcript_type "protein_coding"; transcript_name "ELOA-201"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008230.2"; exon_number "8"; exon_id "ENSE00000388639.1"; protein_id "ENSP00000395574.2"; ccdsid "CCDS239.2"; +chr1 HAVANA exon 23756274 23756385 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000418390.6"; transcript_type "protein_coding"; transcript_name "ELOA-201"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008230.2"; exon_number "9"; exon_id "ENSE00000758350.1"; protein_id "ENSP00000395574.2"; ccdsid "CCDS239.2"; +chr1 HAVANA CDS 23756274 23756385 . + 2 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000418390.6"; transcript_type "protein_coding"; transcript_name "ELOA-201"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008230.2"; exon_number "9"; exon_id "ENSE00000758350.1"; protein_id "ENSP00000395574.2"; ccdsid "CCDS239.2"; +chr1 HAVANA exon 23756953 23757125 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000418390.6"; transcript_type "protein_coding"; transcript_name "ELOA-201"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008230.2"; exon_number "10"; exon_id "ENSE00000758349.1"; protein_id "ENSP00000395574.2"; ccdsid "CCDS239.2"; +chr1 HAVANA CDS 23756953 23757125 . 
+ 1 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000418390.6"; transcript_type "protein_coding"; transcript_name "ELOA-201"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008230.2"; exon_number "10"; exon_id "ENSE00000758349.1"; protein_id "ENSP00000395574.2"; ccdsid "CCDS239.2"; +chr1 HAVANA exon 23759512 23762059 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000418390.6"; transcript_type "protein_coding"; transcript_name "ELOA-201"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008230.2"; exon_number "11"; exon_id "ENSE00001596651.2"; protein_id "ENSP00000395574.2"; ccdsid "CCDS239.2"; +chr1 HAVANA CDS 23759512 23759570 . + 2 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000418390.6"; transcript_type "protein_coding"; transcript_name "ELOA-201"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008230.2"; exon_number "11"; exon_id "ENSE00001596651.2"; protein_id "ENSP00000395574.2"; ccdsid "CCDS239.2"; +chr1 HAVANA stop_codon 23759571 23759573 . + 0 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000418390.6"; transcript_type "protein_coding"; transcript_name "ELOA-201"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008230.2"; exon_number "11"; exon_id "ENSE00001596651.2"; protein_id "ENSP00000395574.2"; ccdsid "CCDS239.2"; +chr1 HAVANA UTR 23743155 23743425 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000418390.6"; transcript_type "protein_coding"; transcript_name "ELOA-201"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008230.2"; exon_number "1"; exon_id "ENSE00001731717.2"; protein_id "ENSP00000395574.2"; ccdsid "CCDS239.2"; +chr1 HAVANA UTR 23759571 23762059 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000418390.6"; transcript_type "protein_coding"; transcript_name "ELOA-201"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; havana_transcript "OTTHUMT00000008230.2"; exon_number "11"; exon_id "ENSE00001596651.2"; protein_id "ENSP00000395574.2"; ccdsid "CCDS239.2"; +chr1 ENSEMBL transcript 23743366 23762040 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "3"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000613537.4"; transcript_type "protein_coding"; transcript_name "ELOA-204"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; protein_id "ENSP00000484196.1"; ccdsid "CCDS239.2"; +chr1 ENSEMBL exon 23743366 23743578 . + . 
gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "3"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000613537.4"; transcript_type "protein_coding"; transcript_name "ELOA-204"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; exon_number "1"; exon_id "ENSE00003753896.1"; protein_id "ENSP00000484196.1"; ccdsid "CCDS239.2"; +chr1 ENSEMBL CDS 23743426 23743578 . + 0 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "3"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000613537.4"; transcript_type "protein_coding"; transcript_name "ELOA-204"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; exon_number "1"; exon_id "ENSE00003753896.1"; protein_id "ENSP00000484196.1"; ccdsid "CCDS239.2"; +chr1 ENSEMBL start_codon 23743426 23743428 . + 0 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "3"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000613537.4"; transcript_type "protein_coding"; transcript_name "ELOA-204"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; exon_number "1"; exon_id "ENSE00003753896.1"; protein_id "ENSP00000484196.1"; ccdsid "CCDS239.2"; +chr1 ENSEMBL exon 23749021 23749077 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "3"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000613537.4"; transcript_type "protein_coding"; transcript_name "ELOA-204"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; exon_number "2"; exon_id "ENSE00003559637.1"; protein_id "ENSP00000484196.1"; ccdsid "CCDS239.2"; +chr1 ENSEMBL CDS 23749021 23749077 . + 0 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "3"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000613537.4"; transcript_type "protein_coding"; transcript_name "ELOA-204"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; exon_number "2"; exon_id "ENSE00003559637.1"; protein_id "ENSP00000484196.1"; ccdsid "CCDS239.2"; +chr1 ENSEMBL exon 23749842 23749948 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "3"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000613537.4"; transcript_type "protein_coding"; transcript_name "ELOA-204"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; exon_number "3"; exon_id "ENSE00003565857.1"; protein_id "ENSP00000484196.1"; ccdsid "CCDS239.2"; +chr1 ENSEMBL CDS 23749842 23749948 . + 0 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "3"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000613537.4"; transcript_type "protein_coding"; transcript_name "ELOA-204"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; exon_number "3"; exon_id "ENSE00003565857.1"; protein_id "ENSP00000484196.1"; ccdsid "CCDS239.2"; +chr1 ENSEMBL exon 23750845 23752030 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "3"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000613537.4"; transcript_type "protein_coding"; transcript_name "ELOA-204"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; exon_number "4"; exon_id "ENSE00000560760.1"; protein_id "ENSP00000484196.1"; ccdsid "CCDS239.2"; +chr1 ENSEMBL CDS 23750845 23752030 . 
+ 1 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "3"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000613537.4"; transcript_type "protein_coding"; transcript_name "ELOA-204"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; exon_number "4"; exon_id "ENSE00000560760.1"; protein_id "ENSP00000484196.1"; ccdsid "CCDS239.2"; +chr1 ENSEMBL exon 23752407 23752518 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "3"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000613537.4"; transcript_type "protein_coding"; transcript_name "ELOA-204"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; exon_number "5"; exon_id "ENSE00000758354.1"; protein_id "ENSP00000484196.1"; ccdsid "CCDS239.2"; +chr1 ENSEMBL CDS 23752407 23752518 . + 0 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "3"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000613537.4"; transcript_type "protein_coding"; transcript_name "ELOA-204"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; exon_number "5"; exon_id "ENSE00000758354.1"; protein_id "ENSP00000484196.1"; ccdsid "CCDS239.2"; +chr1 ENSEMBL exon 23754100 23754255 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "3"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000613537.4"; transcript_type "protein_coding"; transcript_name "ELOA-204"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; exon_number "6"; exon_id "ENSE00000388637.1"; protein_id "ENSP00000484196.1"; ccdsid "CCDS239.2"; +chr1 ENSEMBL CDS 23754100 23754255 . + 2 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "3"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000613537.4"; transcript_type "protein_coding"; transcript_name "ELOA-204"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; exon_number "6"; exon_id "ENSE00000388637.1"; protein_id "ENSP00000484196.1"; ccdsid "CCDS239.2"; +chr1 ENSEMBL exon 23754363 23754460 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "3"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000613537.4"; transcript_type "protein_coding"; transcript_name "ELOA-204"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; exon_number "7"; exon_id "ENSE00000388638.1"; protein_id "ENSP00000484196.1"; ccdsid "CCDS239.2"; +chr1 ENSEMBL CDS 23754363 23754460 . + 2 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "3"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000613537.4"; transcript_type "protein_coding"; transcript_name "ELOA-204"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; exon_number "7"; exon_id "ENSE00000388638.1"; protein_id "ENSP00000484196.1"; ccdsid "CCDS239.2"; +chr1 ENSEMBL exon 23755843 23756023 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "3"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000613537.4"; transcript_type "protein_coding"; transcript_name "ELOA-204"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; exon_number "8"; exon_id "ENSE00000388639.1"; protein_id "ENSP00000484196.1"; ccdsid "CCDS239.2"; +chr1 ENSEMBL CDS 23755843 23756023 . 
+ 0 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "3"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000613537.4"; transcript_type "protein_coding"; transcript_name "ELOA-204"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; exon_number "8"; exon_id "ENSE00000388639.1"; protein_id "ENSP00000484196.1"; ccdsid "CCDS239.2"; +chr1 ENSEMBL exon 23756274 23756385 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "3"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000613537.4"; transcript_type "protein_coding"; transcript_name "ELOA-204"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; exon_number "9"; exon_id "ENSE00000758350.1"; protein_id "ENSP00000484196.1"; ccdsid "CCDS239.2"; +chr1 ENSEMBL CDS 23756274 23756385 . + 2 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "3"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000613537.4"; transcript_type "protein_coding"; transcript_name "ELOA-204"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; exon_number "9"; exon_id "ENSE00000758350.1"; protein_id "ENSP00000484196.1"; ccdsid "CCDS239.2"; +chr1 ENSEMBL exon 23756953 23757125 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "3"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000613537.4"; transcript_type "protein_coding"; transcript_name "ELOA-204"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; exon_number "10"; exon_id "ENSE00000758349.1"; protein_id "ENSP00000484196.1"; ccdsid "CCDS239.2"; +chr1 ENSEMBL CDS 23756953 23757125 . + 1 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "3"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000613537.4"; transcript_type "protein_coding"; transcript_name "ELOA-204"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; exon_number "10"; exon_id "ENSE00000758349.1"; protein_id "ENSP00000484196.1"; ccdsid "CCDS239.2"; +chr1 ENSEMBL exon 23759512 23762040 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "3"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000613537.4"; transcript_type "protein_coding"; transcript_name "ELOA-204"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; exon_number "11"; exon_id "ENSE00003748060.1"; protein_id "ENSP00000484196.1"; ccdsid "CCDS239.2"; +chr1 ENSEMBL CDS 23759512 23759570 . + 2 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "3"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000613537.4"; transcript_type "protein_coding"; transcript_name "ELOA-204"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; exon_number "11"; exon_id "ENSE00003748060.1"; protein_id "ENSP00000484196.1"; ccdsid "CCDS239.2"; +chr1 ENSEMBL stop_codon 23759571 23759573 . + 0 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "3"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000613537.4"; transcript_type "protein_coding"; transcript_name "ELOA-204"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; exon_number "11"; exon_id "ENSE00003748060.1"; protein_id "ENSP00000484196.1"; ccdsid "CCDS239.2"; +chr1 ENSEMBL UTR 23743366 23743425 . + . 
gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "3"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000613537.4"; transcript_type "protein_coding"; transcript_name "ELOA-204"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; exon_number "1"; exon_id "ENSE00003753896.1"; protein_id "ENSP00000484196.1"; ccdsid "CCDS239.2"; +chr1 ENSEMBL UTR 23759571 23762040 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "3"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000613537.4"; transcript_type "protein_coding"; transcript_name "ELOA-204"; transcript_support_level "1"; tag "basic,appris_principal_2,CCDS"; exon_number "11"; exon_id "ENSE00003748060.1"; protein_id "ENSP00000484196.1"; ccdsid "CCDS239.2"; +chr1 HAVANA transcript 23743472 23759893 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000609199.1"; transcript_type "protein_coding"; transcript_name "ELOA-203"; transcript_support_level "1"; tag "CAGE_supported_TSS,basic,appris_alternative_2"; havana_transcript "OTTHUMT00000471733.1"; protein_id "ENSP00000476781.1"; +chr1 HAVANA exon 23743472 23743578 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000609199.1"; transcript_type "protein_coding"; transcript_name "ELOA-203"; transcript_support_level "1"; tag "CAGE_supported_TSS,basic,appris_alternative_2"; havana_transcript "OTTHUMT00000471733.1"; exon_number "1"; exon_id "ENSE00003703844.1"; protein_id "ENSP00000476781.1"; +chr1 HAVANA CDS 23743504 23743578 . + 0 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000609199.1"; transcript_type "protein_coding"; transcript_name "ELOA-203"; transcript_support_level "1"; tag "CAGE_supported_TSS,basic,appris_alternative_2"; havana_transcript "OTTHUMT00000471733.1"; exon_number "1"; exon_id "ENSE00003703844.1"; protein_id "ENSP00000476781.1"; +chr1 HAVANA start_codon 23743504 23743506 . + 0 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000609199.1"; transcript_type "protein_coding"; transcript_name "ELOA-203"; transcript_support_level "1"; tag "CAGE_supported_TSS,basic,appris_alternative_2"; havana_transcript "OTTHUMT00000471733.1"; exon_number "1"; exon_id "ENSE00003703844.1"; protein_id "ENSP00000476781.1"; +chr1 HAVANA exon 23749021 23749077 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000609199.1"; transcript_type "protein_coding"; transcript_name "ELOA-203"; transcript_support_level "1"; tag "CAGE_supported_TSS,basic,appris_alternative_2"; havana_transcript "OTTHUMT00000471733.1"; exon_number "2"; exon_id "ENSE00003559637.1"; protein_id "ENSP00000476781.1"; +chr1 HAVANA CDS 23749021 23749077 . 
+ 0 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000609199.1"; transcript_type "protein_coding"; transcript_name "ELOA-203"; transcript_support_level "1"; tag "CAGE_supported_TSS,basic,appris_alternative_2"; havana_transcript "OTTHUMT00000471733.1"; exon_number "2"; exon_id "ENSE00003559637.1"; protein_id "ENSP00000476781.1"; +chr1 HAVANA exon 23749842 23749948 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000609199.1"; transcript_type "protein_coding"; transcript_name "ELOA-203"; transcript_support_level "1"; tag "CAGE_supported_TSS,basic,appris_alternative_2"; havana_transcript "OTTHUMT00000471733.1"; exon_number "3"; exon_id "ENSE00003565857.1"; protein_id "ENSP00000476781.1"; +chr1 HAVANA CDS 23749842 23749948 . + 0 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000609199.1"; transcript_type "protein_coding"; transcript_name "ELOA-203"; transcript_support_level "1"; tag "CAGE_supported_TSS,basic,appris_alternative_2"; havana_transcript "OTTHUMT00000471733.1"; exon_number "3"; exon_id "ENSE00003565857.1"; protein_id "ENSP00000476781.1"; +chr1 HAVANA exon 23750845 23752030 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000609199.1"; transcript_type "protein_coding"; transcript_name "ELOA-203"; transcript_support_level "1"; tag "CAGE_supported_TSS,basic,appris_alternative_2"; havana_transcript "OTTHUMT00000471733.1"; exon_number "4"; exon_id "ENSE00000560760.1"; protein_id "ENSP00000476781.1"; +chr1 HAVANA CDS 23750845 23752030 . + 1 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000609199.1"; transcript_type "protein_coding"; transcript_name "ELOA-203"; transcript_support_level "1"; tag "CAGE_supported_TSS,basic,appris_alternative_2"; havana_transcript "OTTHUMT00000471733.1"; exon_number "4"; exon_id "ENSE00000560760.1"; protein_id "ENSP00000476781.1"; +chr1 HAVANA exon 23752407 23752518 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000609199.1"; transcript_type "protein_coding"; transcript_name "ELOA-203"; transcript_support_level "1"; tag "CAGE_supported_TSS,basic,appris_alternative_2"; havana_transcript "OTTHUMT00000471733.1"; exon_number "5"; exon_id "ENSE00000758354.1"; protein_id "ENSP00000476781.1"; +chr1 HAVANA CDS 23752407 23752518 . + 0 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000609199.1"; transcript_type "protein_coding"; transcript_name "ELOA-203"; transcript_support_level "1"; tag "CAGE_supported_TSS,basic,appris_alternative_2"; havana_transcript "OTTHUMT00000471733.1"; exon_number "5"; exon_id "ENSE00000758354.1"; protein_id "ENSP00000476781.1"; +chr1 HAVANA exon 23754100 23754255 . + . 
gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000609199.1"; transcript_type "protein_coding"; transcript_name "ELOA-203"; transcript_support_level "1"; tag "CAGE_supported_TSS,basic,appris_alternative_2"; havana_transcript "OTTHUMT00000471733.1"; exon_number "6"; exon_id "ENSE00000388637.1"; protein_id "ENSP00000476781.1"; +chr1 HAVANA CDS 23754100 23754255 . + 2 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000609199.1"; transcript_type "protein_coding"; transcript_name "ELOA-203"; transcript_support_level "1"; tag "CAGE_supported_TSS,basic,appris_alternative_2"; havana_transcript "OTTHUMT00000471733.1"; exon_number "6"; exon_id "ENSE00000388637.1"; protein_id "ENSP00000476781.1"; +chr1 HAVANA exon 23754363 23754460 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000609199.1"; transcript_type "protein_coding"; transcript_name "ELOA-203"; transcript_support_level "1"; tag "CAGE_supported_TSS,basic,appris_alternative_2"; havana_transcript "OTTHUMT00000471733.1"; exon_number "7"; exon_id "ENSE00000388638.1"; protein_id "ENSP00000476781.1"; +chr1 HAVANA CDS 23754363 23754460 . + 2 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000609199.1"; transcript_type "protein_coding"; transcript_name "ELOA-203"; transcript_support_level "1"; tag "CAGE_supported_TSS,basic,appris_alternative_2"; havana_transcript "OTTHUMT00000471733.1"; exon_number "7"; exon_id "ENSE00000388638.1"; protein_id "ENSP00000476781.1"; +chr1 HAVANA exon 23755843 23756023 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000609199.1"; transcript_type "protein_coding"; transcript_name "ELOA-203"; transcript_support_level "1"; tag "CAGE_supported_TSS,basic,appris_alternative_2"; havana_transcript "OTTHUMT00000471733.1"; exon_number "8"; exon_id "ENSE00000388639.1"; protein_id "ENSP00000476781.1"; +chr1 HAVANA CDS 23755843 23756023 . + 0 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000609199.1"; transcript_type "protein_coding"; transcript_name "ELOA-203"; transcript_support_level "1"; tag "CAGE_supported_TSS,basic,appris_alternative_2"; havana_transcript "OTTHUMT00000471733.1"; exon_number "8"; exon_id "ENSE00000388639.1"; protein_id "ENSP00000476781.1"; +chr1 HAVANA exon 23756274 23756385 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000609199.1"; transcript_type "protein_coding"; transcript_name "ELOA-203"; transcript_support_level "1"; tag "CAGE_supported_TSS,basic,appris_alternative_2"; havana_transcript "OTTHUMT00000471733.1"; exon_number "9"; exon_id "ENSE00000758350.1"; protein_id "ENSP00000476781.1"; +chr1 HAVANA CDS 23756274 23756385 . 
+ 2 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000609199.1"; transcript_type "protein_coding"; transcript_name "ELOA-203"; transcript_support_level "1"; tag "CAGE_supported_TSS,basic,appris_alternative_2"; havana_transcript "OTTHUMT00000471733.1"; exon_number "9"; exon_id "ENSE00000758350.1"; protein_id "ENSP00000476781.1"; +chr1 HAVANA exon 23756953 23757125 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000609199.1"; transcript_type "protein_coding"; transcript_name "ELOA-203"; transcript_support_level "1"; tag "CAGE_supported_TSS,basic,appris_alternative_2"; havana_transcript "OTTHUMT00000471733.1"; exon_number "10"; exon_id "ENSE00000758349.1"; protein_id "ENSP00000476781.1"; +chr1 HAVANA CDS 23756953 23757125 . + 1 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000609199.1"; transcript_type "protein_coding"; transcript_name "ELOA-203"; transcript_support_level "1"; tag "CAGE_supported_TSS,basic,appris_alternative_2"; havana_transcript "OTTHUMT00000471733.1"; exon_number "10"; exon_id "ENSE00000758349.1"; protein_id "ENSP00000476781.1"; +chr1 HAVANA exon 23759512 23759893 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000609199.1"; transcript_type "protein_coding"; transcript_name "ELOA-203"; transcript_support_level "1"; tag "CAGE_supported_TSS,basic,appris_alternative_2"; havana_transcript "OTTHUMT00000471733.1"; exon_number "11"; exon_id "ENSE00003710448.1"; protein_id "ENSP00000476781.1"; +chr1 HAVANA CDS 23759512 23759570 . + 2 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000609199.1"; transcript_type "protein_coding"; transcript_name "ELOA-203"; transcript_support_level "1"; tag "CAGE_supported_TSS,basic,appris_alternative_2"; havana_transcript "OTTHUMT00000471733.1"; exon_number "11"; exon_id "ENSE00003710448.1"; protein_id "ENSP00000476781.1"; +chr1 HAVANA stop_codon 23759571 23759573 . + 0 gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000609199.1"; transcript_type "protein_coding"; transcript_name "ELOA-203"; transcript_support_level "1"; tag "CAGE_supported_TSS,basic,appris_alternative_2"; havana_transcript "OTTHUMT00000471733.1"; exon_number "11"; exon_id "ENSE00003710448.1"; protein_id "ENSP00000476781.1"; +chr1 HAVANA UTR 23743472 23743503 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000609199.1"; transcript_type "protein_coding"; transcript_name "ELOA-203"; transcript_support_level "1"; tag "CAGE_supported_TSS,basic,appris_alternative_2"; havana_transcript "OTTHUMT00000471733.1"; exon_number "1"; exon_id "ENSE00003703844.1"; protein_id "ENSP00000476781.1"; +chr1 HAVANA UTR 23759571 23759893 . + . 
gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000609199.1"; transcript_type "protein_coding"; transcript_name "ELOA-203"; transcript_support_level "1"; tag "CAGE_supported_TSS,basic,appris_alternative_2"; havana_transcript "OTTHUMT00000471733.1"; exon_number "11"; exon_id "ENSE00003710448.1"; protein_id "ENSP00000476781.1"; +chr1 HAVANA transcript 23744034 23750944 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000487554.1"; transcript_type "processed_transcript"; transcript_name "ELOA-202"; transcript_support_level "3"; havana_transcript "OTTHUMT00000008231.1"; +chr1 HAVANA exon 23744034 23744270 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000487554.1"; transcript_type "processed_transcript"; transcript_name "ELOA-202"; transcript_support_level "3"; havana_transcript "OTTHUMT00000008231.1"; exon_number "1"; exon_id "ENSE00001849808.1"; +chr1 HAVANA exon 23749021 23749077 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000487554.1"; transcript_type "processed_transcript"; transcript_name "ELOA-202"; transcript_support_level "3"; havana_transcript "OTTHUMT00000008231.1"; exon_number "2"; exon_id "ENSE00003591316.1"; +chr1 HAVANA exon 23749842 23749948 . + . gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000487554.1"; transcript_type "processed_transcript"; transcript_name "ELOA-202"; transcript_support_level "3"; havana_transcript "OTTHUMT00000008231.1"; exon_number "3"; exon_id "ENSE00003523840.1"; +chr1 HAVANA exon 23750845 23750944 . + . 
gene_id "ENSG00000011007.12"; gene_type "protein_coding"; gene_name "ELOA"; level "2"; havana_gene "OTTHUMG00000002957.3"; transcript_id "ENST00000487554.1"; transcript_type "processed_transcript"; transcript_name "ELOA-202"; transcript_support_level "3"; havana_transcript "OTTHUMT00000008231.1"; exon_number "4"; exon_id "ENSE00001846969.1"; From 7d74daccae52c3e02810b5429712a5901fea3d59 Mon Sep 17 00:00:00 2001 From: fairliereese Date: Thu, 21 Sep 2023 15:42:00 -0700 Subject: [PATCH 07/31] added more readthrough tests --- testing_suite/build_test_databases.py | 43 ++- .../test_assignment_readthrough_examples.py | 273 ++++++++++++++++++ .../test_find_gene_match_on_vertex_basis.py | 20 ++ 3 files changed, 333 insertions(+), 3 deletions(-) create mode 100644 testing_suite/test_assignment_readthrough_examples.py diff --git a/testing_suite/build_test_databases.py b/testing_suite/build_test_databases.py index 5210c4e..4baed17 100644 --- a/testing_suite/build_test_databases.py +++ b/testing_suite/build_test_databases.py @@ -112,7 +112,7 @@ "--3p", "300", "--idprefix", "ENCODEH", "--l", "300", - "--g", "hg38", "--o", + "--g", "hg38", "--o", "scratch/multiexon_read_overlapping_monoexon_transcript/talon"]) except Exception as e: @@ -160,14 +160,14 @@ "--o", "scratch/intergenic_GM12878" ]) except Exception as e: print(e) - sys.exit("TALON run failed on chr11_and_Tcf3") + sys.exit("TALON run failed on chr22") # Actually perform the chr11_and_Tcf3 TALON run try: subprocess.check_output( ["talon", "--f", "input_files/chr11_and_Tcf3/config.csv", - "--db", "scratch/chr11_and_Tcf3.db", + "--db", "scratch/chr11_and_Tcf3.db", "--build", "mm10", "--cov", "0", "--identity", "0", @@ -213,3 +213,40 @@ except Exception as e: print(e) sys.exit("Problem creating mock database for filtering tests") + + +# code to get cenps-cort and rpl11-eloa only gtf +# import pyranges as pr +# df = pr.read_gtf('/Users/fairliereese/Documents/programming/mortazavi_lab/ref/gencode.v29/gencode.v29.annotation.gtf', duplicate_attr=True).df +# gnames = ['CENPS', 'CORT', 'CENPS-CORT', 'RPL11', 'ELOA'] +# df = df.loc[df.gene_name.isin(gnames)] +# df = pr.PyRanges(df) +# df.to_gtf('input_files/readthrough/readthrough.gtf') + +try: + subprocess.check_output( + ["talon_initialize_database", + "--f", "input_files/readthrough/readthrough.gtf", + "--a", "gencode_v29", + "--5p", "500", + "--3p", "300", + "--idprefix", "TALON", + "--l", "0", + "--g", "hg38", "--o", "scratch/readthrough"]) +except Exception as e: + print(e) + sys.exit("Database initialization failed on readthrough annotation") + +# Actually perform the readthrough TALON run +try: + subprocess.check_output( + ["talon", + "--f", "input_files/readthrough/config.csv", + "--db", "scratch/readthrough.db", + "--build", "hg38", + "--cov", "0", + "--identity", "0", + "--o", "scratch/readthrough" ]) +except Exception as e: + print(e) + sys.exit("TALON run failed on readthrough") diff --git a/testing_suite/test_assignment_readthrough_examples.py b/testing_suite/test_assignment_readthrough_examples.py new file mode 100644 index 0000000..879d458 --- /dev/null +++ b/testing_suite/test_assignment_readthrough_examples.py @@ -0,0 +1,273 @@ +import pytest +import sqlite3 +from .helper_fns import fetch_correct_ID + +@pytest.mark.integration + +# All data comes from hl60_1_1 from the ENCODE data + +class TestAssignments(object): + """ The objective here is to make sure that each transcript in the + readthrough example set was assigned the expected identity. 
""" + + def test_FSM_of_annot_rt(self): + """ cenps_cort_fsm is a FSM to the annotated readthrough locus of + CENPS-CORT. Comes from ENCODE hl60 data""" + + conn = sqlite3.connect("scratch/readthrough.db") + conn.row_factory = sqlite3.Row + cursor = conn.cursor() + + dataset = "hl60_1_1" + read_ID = "cenps_cort_fsm" + + # Fetch observed entry from table + query = """SELECT * from observed WHERE dataset = ? AND read_name = ?""" + assignment = cursor.execute(query, [dataset, read_ID]).fetchall()[0] + + correct_gene_ID = fetch_correct_ID("CENPS-CORT", "gene", cursor) + assert assignment['gene_ID'] == correct_gene_ID + + annot_dict = make_annot_dict(cursor, assignment['transcript_ID']) + assert annot_dict["transcript_status"] == "KNOWN" + conn.close() + + def test_ISM_of_annot_rt(self): + """ cenps_cort_ism is an ISM of readthrough locus of CENPS-CORT""" + + conn = sqlite3.connect("scratch/readthrough.db") + conn.row_factory = sqlite3.Row + cursor = conn.cursor() + + dataset = "hl60_1_1" + read_ID = "cenps_cort_ism" + + # Fetch observed entry from table + query = """SELECT * from observed WHERE dataset = ? AND read_name = ?""" + assignment = cursor.execute(query, [dataset, read_ID]).fetchall()[0] + + correct_gene_ID = fetch_correct_ID("CENPS-CORT", "gene", cursor) + assert assignment['gene_ID'] == correct_gene_ID + + # Now make sure that the novel transcript was annotated correctly + annot_dict = make_annot_dict(cursor, assignment['transcript_ID']) + assert annot_dict["ISM_transcript"] == "TRUE" + conn.close() + + def test_NNC_of_annot_rt(self): + """ cenps_cort_nnc shares most sjs with CENPS-CORT redthrough locus """ + + conn = sqlite3.connect("scratch/readthrough.db") + conn.row_factory = sqlite3.Row + cursor = conn.cursor() + + dataset = "hl60_1_1" + read_ID = "cenps_cort_nnc" + + # Fetch observed entry from table + query = """SELECT * from observed WHERE dataset = ? AND read_name = ?""" + assignment = cursor.execute(query, [dataset, read_ID]).fetchall()[0] + correct_gene_ID = fetch_correct_ID("CENPS-CORT", "gene", cursor) + assert assignment['gene_ID'] == correct_gene_ID + + # Now make sure that the novel transcript was annotated correctly + annot_dict = make_annot_dict(cursor, assignment['transcript_ID']) + assert annot_dict["NNC_transcript"] == "TRUE" + assert annot_dict["transcript_status"] == "NOVEL" + conn.close() + + def test_NIC_of_annot_rt(self): + """ cenps_cort_nnc shares all ss, but has a novel sj with CENPS-CORT redthrough locus """ + + conn = sqlite3.connect("scratch/readthrough.db") + conn.row_factory = sqlite3.Row + cursor = conn.cursor() + + dataset = "hl60_1_1" + read_ID = "cenps_cort_nic" + + # Fetch observed entry from table + query = """SELECT * from observed WHERE dataset = ? 
+    def test_NNC_of_annot_rt(self):
+        """ cenps_cort_nnc shares most sjs with the CENPS-CORT readthrough locus """
+
+        conn = sqlite3.connect("scratch/readthrough.db")
+        conn.row_factory = sqlite3.Row
+        cursor = conn.cursor()
+
+        dataset = "hl60_1_1"
+        read_ID = "cenps_cort_nnc"
+
+        # Fetch observed entry from table
+        query = """SELECT * from observed WHERE dataset = ? AND read_name = ?"""
+        assignment = cursor.execute(query, [dataset, read_ID]).fetchall()[0]
+        correct_gene_ID = fetch_correct_ID("CENPS-CORT", "gene", cursor)
+        assert assignment['gene_ID'] == correct_gene_ID
+
+        # Now make sure that the novel transcript was annotated correctly
+        annot_dict = make_annot_dict(cursor, assignment['transcript_ID'])
+        assert annot_dict["NNC_transcript"] == "TRUE"
+        assert annot_dict["transcript_status"] == "NOVEL"
+        conn.close()
+
+    def test_NIC_of_annot_rt(self):
+        """ cenps_cort_nic shares all ss, but has a novel sj, with the
+            CENPS-CORT readthrough locus """
+
+        conn = sqlite3.connect("scratch/readthrough.db")
+        conn.row_factory = sqlite3.Row
+        cursor = conn.cursor()
+
+        dataset = "hl60_1_1"
+        read_ID = "cenps_cort_nic"
+
+        # Fetch observed entry from table
+        query = """SELECT * from observed WHERE dataset = ?
+                       AND read_name = ?"""
+        assignment = cursor.execute(query, [dataset, read_ID]).fetchall()[0]
+        correct_gene_ID = fetch_correct_ID("CENPS-CORT", "gene", cursor)
+        assert assignment['gene_ID'] == correct_gene_ID
+
+        # Now make sure that the novel transcript was annotated correctly
+        annot_dict = make_annot_dict(cursor, assignment['transcript_ID'])
+        assert annot_dict["NIC_transcript"] == "TRUE"
+        assert annot_dict["transcript_status"] == "NOVEL"
+        conn.close()
+
+    def test_FSM_of_novel_rt_1(self):
+        """ eloa_rpl11_fsm_1 is an FSM to 2 different genes, ELOA and RPL11"""
+        conn = sqlite3.connect("scratch/readthrough.db")
+        conn.row_factory = sqlite3.Row
+        cursor = conn.cursor()
+
+        dataset = "hl60_1_1"
+        read_ID = "eloa_rpl11_fsm_1"
+
+        # Fetch observed entry from table
+        query = """SELECT * from observed WHERE dataset = ? AND read_name = ?"""
+        assignment = cursor.execute(query, [dataset, read_ID]).fetchall()[0]
+
+        # we had 5 annotated genes (CENPS, CORT, CENPS-CORT, ELOA, and RPL11)
+        # so new gene should be 6
+        correct_gene_ID = 6
+        assert assignment['gene_ID'] == correct_gene_ID
+
+        annot_dict = make_annot_dict(cursor, assignment['transcript_ID'])
+        assert annot_dict['transcript_status'] == 'NOVEL'
+        assert annot_dict["fusion_transcript"] == "TRUE"
+
+        annot_dict = make_annot_dict_gene(cursor, assignment['gene_ID'])
+        assert annot_dict['gene_status'] == 'NOVEL'
+        assert annot_dict['fusion_novel'] == 'TRUE'
+
+        conn.close()
+
+    def test_FSM_of_novel_rt_2(self):
+        """ eloa_rpl11_fsm_2 is an FSM to 2 different genes, ELOA and RPL11"""
+        conn = sqlite3.connect("scratch/readthrough.db")
+        conn.row_factory = sqlite3.Row
+        cursor = conn.cursor()
+
+        dataset = "hl60_1_1"
+        read_ID = "eloa_rpl11_fsm_2"
+
+        # Fetch observed entry from table
+        query = """SELECT * from observed WHERE dataset = ? AND read_name = ?"""
+        assignment = cursor.execute(query, [dataset, read_ID]).fetchall()[0]
+
+        # we had 5 annotated genes (CENPS, CORT, CENPS-CORT, ELOA, and RPL11)
+        # so new gene should be 6
+        correct_gene_ID = 6
+        assert assignment['gene_ID'] == correct_gene_ID
+
+        annot_dict = make_annot_dict(cursor, assignment['transcript_ID'])
+        assert annot_dict['transcript_status'] == 'NOVEL'
+        assert annot_dict["ISM_transcript"] == "TRUE"
+        assert annot_dict['ISM-suffix_transcript'] == 'TRUE'
+
+        annot_dict = make_annot_dict_gene(cursor, assignment['gene_ID'])
+        assert annot_dict['gene_status'] == 'NOVEL'
+        assert annot_dict['fusion_novel'] == 'TRUE'
+
+        conn.close()
+
+    def test_FSM_of_overlapping_single_gene(self):
+        """ rpl11_fsm is an FSM of an annotated gene that is subsumed by the
+            RPL11-ELOA readthrough locus. However, it should just
+            be annotated to RPL11"""
+
+        conn = sqlite3.connect("scratch/readthrough.db")
+        conn.row_factory = sqlite3.Row
+        cursor = conn.cursor()
+
+        dataset = "hl60_1_1"
+        read_ID = "rpl11_fsm"
+
+        # Fetch observed entry from table
+        query = """SELECT * from observed WHERE dataset = ? AND read_name = ?"""
+        assignment = cursor.execute(query, [dataset, read_ID]).fetchall()[0]
+
+        correct_gene_ID = fetch_correct_ID("RPL11", "gene", cursor)
+        assert assignment['gene_ID'] == correct_gene_ID
+
+        annot_dict = make_annot_dict(cursor, assignment['transcript_ID'])
+        assert annot_dict["transcript_status"] == "KNOWN"
+        conn.close()
+
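    # Each test above follows the same lookup pattern; condensed, assuming
    # the scratch/readthrough.db schema built earlier in this series
    # (hypothetical helper, not part of the test class):
    #
    #     def fetch_assignment(cursor, dataset, read_ID):
    #         query = """SELECT * from observed WHERE dataset = ? AND read_name = ?"""
    #         return cursor.execute(query, [dataset, read_ID]).fetchall()[0]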
+    def test_ISM_of_overlapping_single_gene(self):
+        """ rpl11_ism is an ISM of an annotated gene that is subsumed by the
+            RPL11-ELOA readthrough locus. However, it should just
+            be annotated to RPL11"""
+
+        conn = sqlite3.connect("scratch/readthrough.db")
+        conn.row_factory = sqlite3.Row
+        cursor = conn.cursor()
+
+        dataset = "hl60_1_1"
+        read_ID = "rpl11_ism"
+
+        # Fetch observed entry from table
+        query = """SELECT * from observed WHERE dataset = ? AND read_name = ?"""
+        assignment = cursor.execute(query, [dataset, read_ID]).fetchall()[0]
+
+        correct_gene_ID = fetch_correct_ID("RPL11", "gene", cursor)
+        assert assignment['gene_ID'] == correct_gene_ID
+
+        annot_dict = make_annot_dict(cursor, assignment['transcript_ID'])
+        assert annot_dict['transcript_status'] == 'NOVEL'
+        assert annot_dict["ISM_transcript"] == "TRUE"
+        assert annot_dict['ISM-suffix_transcript'] == 'TRUE'
+        conn.close()
+
+    def test_NNC_of_novel_rt(self):
+        """ eloa_rpl11_nnc shares most sjs with the novel ELOA-RPL11 rt locus """
+
+        conn = sqlite3.connect("scratch/readthrough.db")
+        conn.row_factory = sqlite3.Row
+        cursor = conn.cursor()
+
+        dataset = "hl60_1_1"
+        read_ID = "eloa_rpl11_nnc"
+
+        # Fetch observed entry from table
+        query = """SELECT * from observed WHERE dataset = ? AND read_name = ?"""
+        assignment = cursor.execute(query, [dataset, read_ID]).fetchall()[0]
+        correct_gene_ID = 6
+        assert assignment['gene_ID'] == correct_gene_ID
+
+        # Now make sure that the novel transcript was annotated correctly
+        annot_dict = make_annot_dict(cursor, assignment['transcript_ID'])
+        assert annot_dict["NNC_transcript"] == "TRUE"
+        assert annot_dict["transcript_status"] == "NOVEL"
+        conn.close()
+
+    def test_NIC_of_novel_rt(self):
+        """ eloa_rpl11_nic shares all sjs but one new sj w/ the novel ELOA-RPL11 rt locus """
+
+        conn = sqlite3.connect("scratch/readthrough.db")
+        conn.row_factory = sqlite3.Row
+        cursor = conn.cursor()
+
+        dataset = "hl60_1_1"
+        read_ID = "eloa_rpl11_nic"
+
+        # Fetch observed entry from table
+        query = """SELECT * from observed WHERE dataset = ?
+                       AND read_name = ?"""
+        assignment = cursor.execute(query, [dataset, read_ID]).fetchall()[0]
+        correct_gene_ID = 6
+        assert assignment['gene_ID'] == correct_gene_ID
+
+        # Now make sure that the novel transcript was annotated correctly
+        annot_dict = make_annot_dict(cursor, assignment['transcript_ID'])
+        assert annot_dict["NIC_transcript"] == "TRUE"
+        assert annot_dict["transcript_status"] == "NOVEL"
+        conn.close()
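# The module-level helpers below mirror make_annot_dict_gene / make_annot_dict
# in test_assignment_chr11_and_Tcf3_examples.py; each test module carries its
# own copy.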
+
+
+def make_annot_dict_gene(cursor, gene_ID):
+    """ Extracts all gene annotations for the given gene ID and puts
+        them in a dict """
+    query = """SELECT * from gene_annotations WHERE ID = ?"""
+    annotations = cursor.execute(query, [gene_ID]).fetchall()
+    annot_dict = {}
+    for annot in annotations:
+        attribute = annot["attribute"]
+        value = annot["value"]
+        annot_dict[attribute] = value
+    return annot_dict
+
+def make_annot_dict(cursor, transcript_ID):
+    """ Extracts all transcript annotations for the transcript ID and puts
+        them in a dict """
+    query = """SELECT * from transcript_annotations WHERE ID = ?"""
+    annotations = cursor.execute(query, [transcript_ID]).fetchall()
+    annot_dict = {}
+    for annot in annotations:
+        attribute = annot["attribute"]
+        value = annot["value"]
+        annot_dict[attribute] = value
+    return annot_dict
diff --git a/testing_suite/test_find_gene_match_on_vertex_basis.py b/testing_suite/test_find_gene_match_on_vertex_basis.py
index b7c625e..d095fa8 100644
--- a/testing_suite/test_find_gene_match_on_vertex_basis.py
+++ b/testing_suite/test_find_gene_match_on_vertex_basis.py
@@ -26,6 +26,26 @@ def test_perfect_match(self):
         assert fusion == False
         conn.close()
 
+    def test_fusion_match(self):
+        """ Example where the vertices overlap multiple genes.
+        """
+        conn, cursor = get_db_cursor()
+        db = "scratch/toy.db"
+        build = "toy_build"
+        init_refs.make_temp_novel_gene_table(cursor, "toy_build")
+        run_info = talon.init_run_info(db, build)
+        vertex2gene = init_refs.make_vertex_2_gene_dict(cursor)
+
+        vertex_IDs = (1, 2, 3, 4, 5, 9, 10, 11)
+        strand = "+"
+
+        gene_ID, fusion = talon.find_gene_match_on_vertex_basis(vertex_IDs, strand, vertex2gene)
+
+        correct_gene_ID = None
+        assert gene_ID == correct_gene_ID
+        assert fusion == True
+        conn.close()
+
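    # Why (1, 2, 3, 4, 5, 9, 10, 11) yields a fusion call: the vertex set
    # spans two different toy genes, and no single gene accounts for splice
    # sites on both sides of the junction chain. A toy version of the mapping
    # the function consumes (sets of (gene_ID, strand) tuples per vertex;
    # illustrative values only, not the real toy_build contents):
    #
    #     toy_v2g = {v: {(1, "+")} for v in (1, 2, 3, 4, 5)}
    #     toy_v2g.update({v: {(2, "+")} for v in (9, 10, 11)})
    #
    # Each splice site then matches at most one gene while two genes are hit
    # overall, so find_gene_match_on_vertex_basis returns (None, True).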
""" From 91dce6f3a4b549b5e345ebc40667294d24bd6fa4 Mon Sep 17 00:00:00 2001 From: fairliereese Date: Fri, 22 Sep 2023 10:31:00 -0700 Subject: [PATCH 08/31] added more tests for readthrough calls --- src/talon/talon.py | 4 - .../hl60_1_1_subset_remapped_sorted.bam | Bin 6521 -> 7230 bytes ...test_assignment_chr11_and_Tcf3_examples.py | 18 ++--- .../test_assignment_readthrough_examples.py | 73 ++++++++++++++++++ 4 files changed, 82 insertions(+), 13 deletions(-) diff --git a/src/talon/talon.py b/src/talon/talon.py index 8c7809d..8601ba1 100644 --- a/src/talon/talon.py +++ b/src/talon/talon.py @@ -1398,10 +1398,6 @@ def identify_transcript(chrom, positions, strand, cursor, location_dict, edge_di cursor, tmp_gene, fusion) - print(gene_ID) - print(gene_novelty) - print(transcript_novelty) - # Add all novel vertices to vertex_2_gene now that we have the gene ID vertex_IDs = start_end_info["vertex_IDs"] edge_IDs = start_end_info["edge_IDs"] diff --git a/testing_suite/input_files/readthrough/hl60_1_1_subset_remapped_sorted.bam b/testing_suite/input_files/readthrough/hl60_1_1_subset_remapped_sorted.bam index f33a4b84b7f0927fc5be26c7b840261a1f010e43..b8e0088bef55f87cfceacf8c452023c9d597a506 100644 GIT binary patch delta 4142 zcmV+}5Yg}XGQK#l9t?jI5ba&v%jd^cpJ#Xbl3nBWi~|{L(VdgQ4urJ(>}yh^Wab%g ztfT}t2v%Ve*Q5kD+mI*Vg=GpxA_kEd}Gv|EH`J9>W_xX(vn2mq@lQ8}^1DI98xg^LqkU9PTFXQRu-g_37V zohjs+$l;4NZ0~=m)h1&K&T+GN$$@ef9CHC@vX@xG9Mz?qCg;#vCXZIg*(BuKER=&p zqHII(-CCh6LS;1^7am_q>moWifKWYyC{}KGW{H=Ep zerpH1IQ?q@@dDx{OWwNzaQ+~``BMtcUpfrV{{e9RueX5nPjA6__u}r)UA%O51JHkX z;#PnL_?oXDyFEZZeQlEuzxcg(BM)8xx&U;^`$~Vd+2o%b17KY`3|QX=V2J|OH2~Iy z7v8*?O(q0e zozSkr6Yf)!tCHqyMF_ZMy-Z3aQ0Z;rT%XZM>KL=PgI!A@v7>-HNW+jv_OXKjcwEta~YCt5oCF} z#*vvv>j}Z2<``S4HZ)|60jD*d&LeOD7gH)6nUaW)edL6$IAnoyqKiOeABQKTg2PT+ z+*~TPcn&j8p2LSyVg+{BqUfJ;woO<=L;=1cvTy|cW89r`wN+yaMd(6Y(ne}-NN9f% z2dMRsbigt?qd!R$f)URG@kCngG)h_fzK?q=t}I!yozkWDMUezcW zP9RElL`e;KH)}V$yBE#bBSPfojt-GGu9?R{lU}s%a){g%(UGjzIiooI%i;>(bC_TQ zIdbd>fwj~bD+vpZq^3bG!TuN1hAgyO4ol#g!q}f z)$Zf8lB*t`6?bu7j}<~>3$s-A6S;%f3H>_+K?Hv=EA$2dYT}O87sNYa3$hf@H;-Tl zt(Snw0Fy%K21~OZ0Wl!NAq>wl9Aaw1Fp&mWhpI(1pHk}OcIf-tKo1IFDnFgo1Z z4IU*Xjhi?;lXcBG*4ud19khS!GNqYaT@|S%OP0LTvW=NfY0UhK!!h%5jhP=mGG?B; zWL`gh5Hnx-j4pKijr$-zf9&=P9Z&B+WByuKp^sSHxZ^MkmV)TISybpwsZT*%Mz}~g zt5rlG$voJV&_D;lI=#z?Hic`hg6`=mhC@_mMq?iF9cQ680;=b-1<`**cM!h@ZW^c~ zlm!k#AX_mw={|$bY4OcB-(6T7X~}J6TM4>btNq{FIaKX`4et7HJ8;)e!(Ff6_1&A* z{tIV!H`V^%wg(=2dOW>D4_!4M!{$|&J^1r%k9*0Z6an_h; zk@W6oW4;DI|H>zfdH8?B#$5TCyUk00fxr6X!S}}>Kob0EB*FLKPDqOX{@0KkEhOaSHACTiBIm#A} z6Lo>Fz|F`G8H2(QnYB7NDLg45orDwNf+k3yQwX&n{Y9S=x>^)903~BqN3|t#O)EJ*H{`!W$dFUWJ#9)FZA$eFC$~QORGWu?i)AWa znoqW}JI@_UbE!a8Gj$GKw9M+;y#*hK2NYjPqa5{Gp%7=gYDO&KPwwZ051LcStPzo-$ z%1~Rp}8?)8#aYPOcoE9JQT=WH&ITF z7A|4IazZQ?rs~mj(iJ72Q_^rKjc~kLUGd2@VU<+~O$UvTRAPaDh@Tji@--NF0uux= zg5xm46@xNzfxA@3#ZmnOnjdx ztYG@#NR^<3A$8Ne4wnh7D!OERt*H(aQyqU$XNo8YN=wi`@{$I?Xa!B6%TST9#SqjB z3eFE9O=M2noQxHAwY`a4s-Z6(_nB?@UNCzN#@(H54o4@lnhsB3=cUiy<>5OWK$k38 zvSi7UB}j2F{d0TYIM^Nk;B(jae&Uh(*>isv z9(Z8y;_e0Wd-Ol`r_&EzH9wBeKXv>1lkP^+>eI`=bw{dB$$4vKIL}i+QmX@NCEX&^ zz{f&pBUezdP4&bl&{J1O2g3##?Bb>}bW3UAdMmo9SLdvGfRhE#T=26#$b-;>{7roB z5j4xr`GfB!y?g^|{S5tLUC|I$JoMPQY&D_AO6qF2HSLbxf5 z1*N1zHy%ajYzl!|OCh!(pyarR(~3%KA!H98P6;G_IE(|NGi5Mz_HEGM#L$0QXE#!S zj+$-J2GCQlOV+vCfR;hzX8RId!(IzA}!QKvmgAhl!nVO%)JPenbkD+HRg!rJ( zdTc7N&ZC>U`ZK%?8=ZfhUAu(o%1)hkV78swWGWR})6EqQ6JC?e&2|fw_5n>53ybnp zFt`#e73jGFD+Z}`F>HWG%W*{A&!L4uOrzD_>MlUI1JR&7a2h76Z_HOKev(FH*P9Pz zcFQ=3xh_Blp9L+cA{^``2|l$taEX-(4~3Va4IF4X>9cUq3xa>swKN!A}5=yXSw+^KU6Ko!%CizVSBuer{ab 
zl$c(?DK9_#LtA31;(80gu1eFYG_6Y0sx+-i)2cKrS+Zowk|j%)ELpPTwz91|{jt`a z?mt|2`b(`leNyXAfBL_kH#@(nJKeqgRV;g7czWmLL;rG*{%XR<4!xo5`}>;AS9gqg z_=uMr{quj%8uPFC_xX<=yu9oJu3|7yFjDqGqS8RnTvFB#-c_>&r5iEO=Gh19Q&TRK zn^J8nwrrl(sWSCJkYQa784O6~1lPYqkDOIWwYd`N94?IFEJ$_Kr>u&;P7^I!>quWa ztxvLG#QKDeLno9==7G+#=r39E1FSiDCVjxHD^Y(PUhcFBEqVXR_N9fF^(AO;9)1bh z9|09k>90Kb&s*PFc=6I@^AE=k0O79RIRFHBxB24EQGlRtD>NT?dk}c{YPqezc`Lg% zL-7Ed;HVUrLrpjnEs{QL8Hzr^SOGy=u?^@18pZv?NF!B!@8IM(M<0Q-a!cgq40FAZ s=e;5$Tf3e+u+=tuGye@@b|GtocavKg9*g$!WdHyG delta 3427 zcmV-p4V?16IQcTL9t?j@4eeaZk7q>{zC8mdGa=)tR6<3F;S?3EgaFg?WNo960&Zp8bE=KE8KZ-rNyAcKY74 z=eACre(<4(Pu;im!1Ch9m*t7yF3YPQTb9GuEz31~mgUjA*Or5qEXzw<%kl|aW9x=J z%d@mBF$AAWC`3J&SbWZgefF{U;-i-^#B-@3`)tJ(n~KfFnqX75oYRQ0dE!(eS?s9Y zv=(cr(ubgp9oc_LVoFJH%raVfCn59b#l{qePh4vvhhE5B$KE(Qmoe;sUyh)YY<*viv!j*Nfs;*yLcA4Upe289#iG3r#a8Uhg$tFy-h0ka zD0EvACzrk8K%a?_*`;KM%|4-oOiPzpo_$>|SV)fSq;t9l4 zrhIS(;Cvat`DO*@CwGJM-vQ45dUIUI-`LBref5RG>VA;-Uu!0L9)L%lb12?-XH!CZ;6@P)Pm#z1Re>m$9q<*cFdFd@ z2h4~a!!dwcT$HJiiR@Vja2VK1a{wPN*zc-qR`GAjl=I6@#9pBh``X&BhdR>U4TxW06Z>J%Q++?TWMKdM4Uf&!?N6h#AEYwS1gYM27Yzp_Wr#$ArZM1iO7|>5*Xs| zzXqm=^IpQSYw&aWD~Nw|_}qc*<6GA*%kTCr%e7A~%j0fY0(K*;dWc9mu=@7S-C=*F zE{-i29D>duaDpz<8DU1IOYg*8WE3p%k`PHTg@7Q!30085MCOvM@v+v?nQ}{wBXW)& zN8uh4pa+;SvUI?Uu_M@$6*T}QeH2Tjf_2thQgx{|WGuu8Xig1Df*D9Qq&Q?e#<<{O zsSfci#uF%MbbjX_AK+1n(Fe%_dbd4(zuMf?&1Z) z%E9^8knkXTE1HNIAR5IOQtl}M>kOc>cT>6p&>?lhP7rYEifY|J1*Q+6R!n~t;N}`Y zPzI3C!(gKtu7F5R(n3$(G-w+=V=z!u1~(tw74seFSRslblYJ1(lu}K=_2S_NH7R@&)(9BUL)92HRurH!l+MrtVV%Jq z6MR&qR;)K@545%x$tT8y1q6Rt-oOF?DlMA}?@UnrS8tXIz7aA)kHY|ON z1Y`Mbqk{syFRN8fofaWs23mz|;HnN?#|8&=DMnomr4o)8(=Fye173ewgwQqX3Q0L2 z%g*AEu*0dWI(bA2Kd{DC_#6`GfKxKUrFfhiG(MpDuID(U#vL7#uGc|h6_zThIV{-4 z4a_MZp-!%|MXFw_>u_7Fb>ds^(1c3@9>Ka=B9K!x14Bh1CpZzD*@~|Co?DEy54Zus z41OpLc5s}fA^fq(J41h6*G$A#0zv6CKqN5PQQ0R11*xi%53~y2AQ@gP4TJ|qE%2TM zQ-$1uZ4r7&m=@xn86MTFXE;=1WD!Yn#1(-x@N(--3}UL+(883dB1pk26luUy;23}^ z{8K~KZTE#!xja1dY8#9Q0)&GRw)Vw0@GXx{pop41jD;Cg{I)h2+WT_Dg-5TDXZZ%gbdfB=#sEoQynO#SfI`jQ4o|Gp?~Bh z6@XDQy1}92eO)m zNbn$kZL}u0TILrGG)q?DgULsRWIJ3HaFeAcUPY_f3bIKee=5gTZcAY z*}lVd;-NDepSrEwb?C@VH*FkUKeBv}{#}1MaPrjhNo;@m!u2P;y`|NsC%$!2s!q{5 zW29T>hk&FO3)V_{6RU!cG1s07sMxA{;v(p&ExmzZgA9K*u|pZUq1bW0kwkHnF-q#- zWDJ@Me%3lU5PFcmf$bJRv&@LD<~9;IlM=odz44&T!`PO1LGA8V&cPtm8u_DboNew!z0xR=2&97K%WQF{Km@TQU($ z+M5EF3YLH9l(2#juF7Jpl(g%!6~S5)nW@&up|XII;~G{gDz!2v2OdrlB)(hB1Ee!1 zFm&%~r|zSN)|$6HGIUf;4TeEaosUYb4Wnle`N3es1l=UiaiisesNCI}4&_@gVUrk) zRY0npwke|~Ew7MrR_epEGGGc|*G1h+~mbI$Y4H|o>nsoRkXg|K&0*;-j zpn=c6L#d%@F(GN0Diysk8dv2G8R9$I+Z=S7Hd-a*uBCsPrX^Sx9GnRAtp%Rjy)#`9AmjtWOx@4G z4&BD;0=J}v5XXAIc2$A34#U*DKix?;!J5%@NHABj8oV9Ww(65H7icX7k3SjPCRNi6 zRN6W;l?^<~QO4wQFqEO^0<7qy-sRx~zSk6b(EI6H7=#&(j+V1@gac8h)X@xvYAb*D z)rz0Q9=vVUaURWh9E7bi(7`1^PpSwDe~E&vS~^ZKG7zDNQuKiXRgFHfXFY+f-b;hS zA*4Bt=!1drBjRghz5gZ20Z?>!PD`p*6kUq$?oEADQjHP^V#hc75S{==b5V?1CSCp2E)OfaZU5K=V1E z`5e%E4(P-F9MF8mdCHV2Q>ILrGG)q?53KAInV#1oQ#kK)Ki8i3xu0VvHjb_zULJl& zk?Furk?EE9c<$%y=~ap85j1(?mJe--sfcZr+007QtTfF^)2uYjO4F<~O_?%f%9JTn zrc9YK<$|(Pclxo`ovzKLpS{ZOLoV9n8a(ihB{64mbHV!P0kkDdS) z7Qn)B{RrC2yFY^VBcQ^5{cPbsfC{S!-}!9evE$1h_G|;f<=@!`1Vs1p-L>-pf_|-V zx%9n3;Qg!R){NjS-qu&a*#U6aQqDGUiD-!)Nnf_)tZy(DK#*2!8G}HlxIXvTW6|#p z4j)Euok=UVM0N}w^MgDe9NF>fu?D}6yZ!o`@awPZ$NgS?w_hJVacKD$!gYiqNMn;A F93I*WlT82s diff --git a/testing_suite/test_assignment_chr11_and_Tcf3_examples.py b/testing_suite/test_assignment_chr11_and_Tcf3_examples.py index 75e4392..bd8c2bd 100644 --- a/testing_suite/test_assignment_chr11_and_Tcf3_examples.py +++ b/testing_suite/test_assignment_chr11_and_Tcf3_examples.py @@ -4,7 +4,7 @@ 
diff --git a/testing_suite/test_assignment_chr11_and_Tcf3_examples.py b/testing_suite/test_assignment_chr11_and_Tcf3_examples.py
index 75e4392..bd8c2bd 100644
--- a/testing_suite/test_assignment_chr11_and_Tcf3_examples.py
+++ b/testing_suite/test_assignment_chr11_and_Tcf3_examples.py
@@ -4,7 +4,7 @@
 @pytest.mark.integration
 class TestAssignments(object):
-    """ The objective here is to make sure that each transcript in the
+    """ The objective here is to make sure that each transcript in the
         chr11_and_Tcf3 example set was assigned the expected identity. """
 
     def test_ISM_of_Canx(self):
@@ -37,7 +37,7 @@ def test_ISM_of_Canx(self):
         conn.close()
 
     def test_prefix_ISM_of_Canx(self):
-        """ m54284_180814_002203/18677911/ccs is an ISM transcript of Canx at
+        """ m54284_180814_002203/18677911/ccs is an ISM transcript of Canx at
            first glance that has known 5' and 3' ends. Comes from BC017 data.
        """
         conn = sqlite3.connect("scratch/chr11_and_Tcf3.db")
@@ -55,7 +55,7 @@ def test_prefix_ISM_of_Canx(self):
         assert assignment['gene_ID'] == correct_gene_ID
         assert assignment['transcript_ID'] == 8462
         assert assignment['start_delta'] == 30
-        assert assignment['end_delta'] == -290
+        assert assignment['end_delta'] == -290
 
         # Now make sure that the novel transcript was annotated correctly
         annot_dict = make_annot_dict(cursor, assignment['transcript_ID'])
@@ -117,7 +117,7 @@ def test_suffix_ISM_of_Tcf3(self):
         conn.close()
 
     def test_NIC_of_Drg1(self):
-        """ For this example, the same read was planted in two different
+        """ For this example, the same read was planted in two different
            datasets (m54284_180814_002203/49414590/ccs) """
 
         conn = sqlite3.connect("scratch/chr11_and_Tcf3.db")
@@ -129,7 +129,7 @@ def test_NIC_of_Drg1(self):
         read_ID = "m54284_180814_002203/49414590/ccs"
 
         # Fetch observed entry from table
-        query = """SELECT * from observed WHERE dataset IN
+        query = """SELECT * from observed WHERE dataset IN
                        ('PB65_B017', 'PB65_B018') AND read_name = ?"""
         cursor.execute(query, [read_ID])
         correct_gene_ID = fetch_correct_ID("Drg1", "gene", cursor)
@@ -148,14 +148,14 @@ def test_NIC_of_Drg1(self):
 
     def test_FSM_of_Drg1(self):
         """ Read m54284_180814_002203/40042763/ccs is an FSM of the Drg1 gene
            (BC017) """
-
+
         conn = sqlite3.connect("scratch/chr11_and_Tcf3.db")
         conn.row_factory = sqlite3.Row
         cursor = conn.cursor()
 
         dataset = "PB65_B017"
         read_ID = "m54284_180814_002203/40042763/ccs"
-
+
         # Fetch observed entry from table
         query = """SELECT * from observed WHERE dataset = ? AND read_name = ?"""
         assignment = cursor.execute(query, [dataset, read_ID]).fetchall()[0]
@@ -199,8 +199,8 @@ def antisense_to_Grb10(self):
         assert annot_dict["antisense_transcript"] == "TRUE"
         assert annot_dict["transcript_status"] == "NOVEL"
         conn.close()
-
-
+
+
 def make_annot_dict_gene(cursor, gene_ID):
     """ Extracts all gene annotations for the transcript ID and puts
diff --git a/testing_suite/test_assignment_readthrough_examples.py b/testing_suite/test_assignment_readthrough_examples.py
index 879d458..d1fd28c 100644
--- a/testing_suite/test_assignment_readthrough_examples.py
+++ b/testing_suite/test_assignment_readthrough_examples.py
@@ -247,6 +247,73 @@ def test_NIC_of_novel_rt(self):
         assert annot_dict["transcript_status"] == "NOVEL"
         conn.close()
 
+    def test_FSM_of_overlapping_single_gene_cenps(self):
+        """ cenps_fsm is an FSM of an annotated gene that is subsumed by the
+            CENPS-CORT readthrough locus. However, it should just
+            be annotated to CENPS"""
+
+        conn = sqlite3.connect("scratch/readthrough.db")
+        conn.row_factory = sqlite3.Row
+        cursor = conn.cursor()
+
+        dataset = "hl60_1_1"
+        read_ID = "cenps_fsm"
+
+        # Fetch observed entry from table
+        query = """SELECT * from observed WHERE dataset = ?
+                       AND read_name = ?"""
+        assignment = cursor.execute(query, [dataset, read_ID]).fetchall()[0]
+
+        correct_gene_ID = fetch_correct_ID("CENPS", "gene", cursor)
+        assert assignment['gene_ID'] == correct_gene_ID
+
+        annot_dict = make_annot_dict(cursor, assignment['transcript_ID'])
+        assert annot_dict["transcript_status"] == "KNOWN"
+        conn.close()
+
+    def test_NIC_of_annot_single_gene_2(self):
+        """ cenps_nic shares all sss w/ annotated CENPS models, combined in a
+        novel way; validated by looking for the shared sss of the weird exon
+        w/ known models (CENPS-204 and CENPS-205) """
+
+        conn = sqlite3.connect("scratch/readthrough.db")
+        conn.row_factory = sqlite3.Row
+        cursor = conn.cursor()
+
+        dataset = "hl60_1_1"
+        read_ID = "cenps_nic"
+
+        # Fetch observed entry from table
+        query = """SELECT * from observed WHERE dataset = ? AND read_name = ?"""
+        assignment = cursor.execute(query, [dataset, read_ID]).fetchall()[0]
+        correct_gene_ID = fetch_correct_ID("CENPS", "gene", cursor)
+        assert assignment['gene_ID'] == correct_gene_ID
+
+        # Now make sure that the novel transcript was annotated correctly
+        annot_dict = make_annot_dict(cursor, assignment['transcript_ID'])
+        assert annot_dict["NIC_transcript"] == "TRUE"
+        assert annot_dict["transcript_status"] == "NOVEL"
+        conn.close()
+
+    def test_NNC_of_annot_single_gene_2(self):
+        """ cenps_nnc introduces a novel splice site relative to the annotated
+        CENPS models (the read was originally misnamed); validated by looking
+        for the shared sss of the weird exon w/ known models (CENPS-204 and
+        CENPS-205) """
+
+        conn = sqlite3.connect("scratch/readthrough.db")
+        conn.row_factory = sqlite3.Row
+        cursor = conn.cursor()
+
+        dataset = "hl60_1_1"
+        read_ID = "cenps_nnc"
+
+        # Fetch observed entry from table
+        query = """SELECT * from observed WHERE dataset = ? AND read_name = ?"""
+        assignment = cursor.execute(query, [dataset, read_ID]).fetchall()[0]
+        correct_gene_ID = fetch_correct_ID("CENPS", "gene", cursor)
+        assert assignment['gene_ID'] == correct_gene_ID
+
+        # Now make sure that the novel transcript was annotated correctly
+        annot_dict = make_annot_dict(cursor, assignment['transcript_ID'])
+        assert annot_dict["NNC_transcript"] == "TRUE"
+        assert annot_dict["transcript_status"] == "NOVEL"
+        conn.close()
+
+
+
 def make_annot_dict_gene(cursor, gene_ID):
     """ Extracts all gene annotations for the given gene ID and puts

From 364cde4941b4effe83896fa901e8fb287fcd52df Mon Sep 17 00:00:00 2001
From: fairliereese
Date: Mon, 25 Sep 2023 14:58:09 -0700
Subject: [PATCH 09/31] added code to pick which gene for possible fusion
 transcripts (that does not look like it will work)

---
 src/talon/talon.py | 60 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 60 insertions(+)

diff --git a/src/talon/talon.py b/src/talon/talon.py
index 8601ba1..2948b55 100644
--- a/src/talon/talon.py
+++ b/src/talon/talon.py
@@ -996,6 +996,32 @@ def process_NIC(chrom, positions, strand, edge_IDs, vertex_IDs, transcript_dict,
 
     return gene_ID, transcript_ID, novelty, start_end_info, fusion
 
+def get_vertex_2_gene_df(vertex_2_gene):
+    """
+    Get a DataFrame mapping each unique combination of vertex:gene_ID
+
+    Parameters:
+        vertex_2_gene (dict): Dictionary mapping each vertex ID to a list of
+            gene IDs that the vertex is found in
+
+    Returns:
+        df (pandas DataFrame): DataFrame of unique vertex:gene combinations
+    """
+    gids = []
+    vids = []
+    for key, item in vertex_2_gene.items():
+        for item2 in item:
+            gids.append(item2[0])
+            vids.append(key)
+    # df = pd.DataFrame.from_dict(vertex_2_gene, orient='index')
+    df = pd.DataFrame()
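    # (the gids/vids lists built above give one row per (gene ID, vertex ID)
    #  pair, so a vertex shared by several genes contributes several rows)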
df['gid'] = gids + df['vid'] = vids + print(df.head()) + print(len(df.index)) + print(len(df.vid.unique().tolist())) + print(df.loc[df.vid.duplicated(keep=False)].sort_values(by='vid')) + return df def find_gene_match_on_vertex_basis(vertex_IDs, strand, vertex_2_gene): """ Use vertices in a transcript to try to pinpoint the gene it belongs to. @@ -1025,9 +1051,22 @@ def find_gene_match_on_vertex_basis(vertex_IDs, strand, vertex_2_gene): # how many genes have this splice site? n_gene_matches.append(len(matches)) + print('curr_matches)') + print(curr_matches) + + df = get_vertex_2_gene_df(vertex_2_gene) + + print('eeps epps') + print(vertex_IDs) + print(df.head()) # how many splice sites are from each gene gene_tally = dict((x, gene_matches.count(x)) for x in set(gene_matches)) + print(gene_tally) + print(len(gene_tally)) + print(n_gene_matches) + print(' genes') + print(gene_matches) # no shared splice junctions if len(gene_matches) == 0: @@ -1038,8 +1077,29 @@ def find_gene_match_on_vertex_basis(vertex_IDs, strand, vertex_2_gene): # when there are no shared splice sites between gene hits but we did # hit more than one gene elif max(n_gene_matches) <= 1 and len(gene_tally) > 1: + print(' went here') return None, True + # if we hit more than one gene and they have overlapping sjs, + # tiebreak based on % of SJs from each + # gene that we hit. pick gene w/ greatest percentage + elif len(gene_tally) > 1: + temp = df.loc[df.gid.isin(gene_matches)].copy(deep=True) + temp = temp.drop_duplicates() + + # get total # vertices / gene + temp1 = temp.groupby('gid').count().reset_index().rename({'vid': 'n_vert'}, axis=1) + + # get total # detected vertices / gene + temp2 = temp.loc[temp.vid.isin(vertex_IDs)].copy(deep=True) + temp2 = temp2.groupby('gid').count().reset_index().rename({'vid': 'n_vert_in_t'}, axis=1) + + # merge + temp3 = temp1.merge(temp2, on='gid') + print(temp3) + + + # For the main assignment, pick the gene that is observed the most else: gene_ID = max(gene_tally, key=gene_tally.get) From bbf00348babd01afe328d2c671e1897be1204ebd Mon Sep 17 00:00:00 2001 From: fairliereese Date: Mon, 25 Sep 2023 16:28:21 -0700 Subject: [PATCH 10/31] other attempt at tiebreaking based on % ref overlap... also doesnt entirely work --- src/talon/init_refs.py | 12 ++--- src/talon/talon.py | 118 +++++++++++++++++++++++++++-------------- 2 files changed, 83 insertions(+), 47 deletions(-) diff --git a/src/talon/init_refs.py b/src/talon/init_refs.py index b6fd3ce..7c36710 100644 --- a/src/talon/init_refs.py +++ b/src/talon/init_refs.py @@ -1,7 +1,7 @@ # TALON: Techonology-Agnostic Long Read Analysis Pipeline # Author: Dana Wyman # ----------------------------------------------------------------------------- -# Contains functions that query the database to initialize various data +# Contains functions that query the database to initialize various data # structures for the TALON run. 
# --------------------------------------------------------------------- # make_temp_novel_gene_table @@ -13,8 +13,9 @@ # make_gene_start_and_end_dict from string import Template +import pandas as pd -def make_temp_novel_gene_table(cursor, build, chrom = None, start = None, +def make_temp_novel_gene_table(cursor, build, chrom = None, start = None, end = None, tmp_tab = "temp_gene"): """ Attaches a temporary database with a table that has the following fields: - gene_ID @@ -137,7 +138,7 @@ def make_temp_monoexonic_transcript_table(cursor, build, chrom = None, OR (max_pos >= $start AND max_pos <= $end))""") command = command.substitute({'build':build, 'chrom':chrom, - 'start':start, 'end':end, + 'start':start, 'end':end, 'tmp_tab':tmp_tab}) cursor.execute(command) @@ -292,8 +293,8 @@ def make_vertex_2_gene_dict(cursor, build = None, chrom = None, start = None, en return vertex_2_gene def make_gene_start_or_end_dict(cursor, build, mode, chrom = None, start = None, end = None): - """ Select the starts (or ends) of known genes in the database and store - in a dict. + """ Select the starts (or ends) of known genes in the database and store + in a dict. Format of dict: Key: gene ID from database Value: dict mapping positions to start vertices (or end vertices) of @@ -350,4 +351,3 @@ def make_gene_start_or_end_dict(cursor, build, mode, chrom = None, start = None, output_dict[gene_ID][pos] = vertex return output_dict - diff --git a/src/talon/talon.py b/src/talon/talon.py index 2948b55..14be17d 100644 --- a/src/talon/talon.py +++ b/src/talon/talon.py @@ -611,10 +611,12 @@ def search_for_overlap_with_gene(chromosome, start, end, strand, If there is more than one same-strand option, prioritize amount of overlap. Antisense matches may be returned if there is no same strand match. """ - + print('in search for overlap with gene') min_start = min(start, end) max_end = max(start, end) query_interval = [min_start, max_end] + print('query interval') + print(query_interval) query = Template(""" SELECT gene_ID, chromosome, @@ -637,6 +639,9 @@ def search_for_overlap_with_gene(chromosome, start, end, strand, # Among multiple matches, preferentially return the same-strand gene with # the greatest amount of overlap + # print('start+end') + # print(start) + # print(end) same_strand_matches = len([x for x in matches if x["strand"] == strand]) # for m in matches: # print() @@ -663,16 +668,28 @@ def get_best_match(matches, query_interval): """ Given a set of gene matches and a query interval, return the match that has the greatest amount of overlap with the query.""" + print('matching based on overlap') max_overlap = 0 + max_perc_overlap = 0 best_match = None - for match in matches: + print(match['gene_ID']) match_interval = [match['start'], match['end']] - overlap = get_overlap(query_interval, match_interval) - if overlap >= max_overlap: + overlap, perc_overlap = get_overlap(query_interval, match_interval) + print(overlap) + print(perc_overlap) + if overlap > max_overlap: max_overlap = overlap + max_perc_overlap = perc_overlap best_match = match + elif overlap == max_overlap: + if perc_overlap > max_perc_overlap: + max_overlap = overlap + max_perc_overlap = perc_overlap + best_match = match + print('best match') + print(best_match['gene_ID']) return best_match @@ -682,11 +699,15 @@ def get_overlap(a, b): ends of each interval as inclusive, meaning that if a = b = [10, 20], the overlap reported would be 11, not 10. 
Args: - a: First interval, formattted as a list - b: Second interval, formatted as a list + a: First interval, formattted as a list (query) + b: Second interval, formatted as a list (reference) + perc_overlap: Percent overlap from the reference interval that the + query interval consumed """ overlap = max(0, min(a[1], b[1]) - max(a[0], b[0]) + 1) - return overlap + ref_len = abs(b[1]-b[0]) + perc_overlap = (overlap/ref_len)*100 + return overlap, perc_overlap def search_for_transcript(edge_IDs, transcript_dict): @@ -1017,10 +1038,10 @@ def get_vertex_2_gene_df(vertex_2_gene): df = pd.DataFrame() df['gid'] = gids df['vid'] = vids - print(df.head()) - print(len(df.index)) - print(len(df.vid.unique().tolist())) - print(df.loc[df.vid.duplicated(keep=False)].sort_values(by='vid')) + # print(df.head()) + # print(len(df.index)) + # print(len(df.vid.unique().tolist())) + # print(df.loc[df.vid.duplicated(keep=False)].sort_values(by='vid')) return df def find_gene_match_on_vertex_basis(vertex_IDs, strand, vertex_2_gene): @@ -1051,22 +1072,23 @@ def find_gene_match_on_vertex_basis(vertex_IDs, strand, vertex_2_gene): # how many genes have this splice site? n_gene_matches.append(len(matches)) - print('curr_matches)') - print(curr_matches) + # print('curr_matches)') + # print(curr_matches) df = get_vertex_2_gene_df(vertex_2_gene) - print('eeps epps') - print(vertex_IDs) - print(df.head()) + # print('eeps epps') + # print(vertex_IDs) + # print(df.head()) # how many splice sites are from each gene gene_tally = dict((x, gene_matches.count(x)) for x in set(gene_matches)) - print(gene_tally) - print(len(gene_tally)) - print(n_gene_matches) - print(' genes') - print(gene_matches) + # print('tally') + # print(gene_tally) + # print(len(gene_tally)) + # print(n_gene_matches) + # print(' genes') + # print(gene_matches) # no shared splice junctions if len(gene_matches) == 0: @@ -1081,22 +1103,23 @@ def find_gene_match_on_vertex_basis(vertex_IDs, strand, vertex_2_gene): return None, True # if we hit more than one gene and they have overlapping sjs, - # tiebreak based on % of SJs from each - # gene that we hit. pick gene w/ greatest percentage + # tie break based on ????? 
elif len(gene_tally) > 1: - temp = df.loc[df.gid.isin(gene_matches)].copy(deep=True) - temp = temp.drop_duplicates() - - # get total # vertices / gene - temp1 = temp.groupby('gid').count().reset_index().rename({'vid': 'n_vert'}, axis=1) - - # get total # detected vertices / gene - temp2 = temp.loc[temp.vid.isin(vertex_IDs)].copy(deep=True) - temp2 = temp2.groupby('gid').count().reset_index().rename({'vid': 'n_vert_in_t'}, axis=1) - - # merge - temp3 = temp1.merge(temp2, on='gid') - print(temp3) + print('i am here') + return None, False + # temp = df.loc[df.gid.isin(gene_matches)].copy(deep=True) + # temp = temp.drop_duplicates() + # + # # get total # vertices / gene + # temp1 = temp.groupby('gid').count().reset_index().rename({'vid': 'n_vert'}, axis=1) + # + # # get total # detected vertices / gene + # temp2 = temp.loc[temp.vid.isin(vertex_IDs)].copy(deep=True) + # temp2 = temp2.groupby('gid').count().reset_index().rename({'vid': 'n_vert_in_t'}, axis=1) + # + # # merge + # temp3 = temp1.merge(temp2, on='gid') + # print(temp3) @@ -1109,16 +1132,26 @@ def find_gene_match_on_vertex_basis(vertex_IDs, strand, vertex_2_gene): def process_NNC(chrom, positions, strand, edge_IDs, vertex_IDs, transcript_dict, - gene_starts, gene_ends, edge_dict, locations, vertex_2_gene, run_info): + gene_starts, gene_ends, edge_dict, locations, vertex_2_gene, run_info, + cursor, tmp_gene): """ Novel not in catalog case """ novelty = [] start_end_info = {} + # first try to assign gene based on vertex concordance gene_ID, fusion = find_gene_match_on_vertex_basis( vertex_IDs, strand, vertex_2_gene) + + # otherwise look for genomic overlap with existing genes if gene_ID == None: - return None, None, [], None, False + gene_ID, match_strand = search_for_overlap_with_gene(chrom, positions[0], + positions[-1], strand, + cursor, run_info, tmp_gene) + print('geneid from search for overlap with gene') + print(gene_ID) + if gene_ID == None: + return None, None, [], None, False # Get matches for the ends start_vertex, start_exon, start_novelty, known_start, diff_5p = process_5p(chrom, @@ -1232,7 +1265,6 @@ def process_remaining_mult_cases(chrom, positions, strand, edge_IDs, vertex_IDs, gene_novelty = [] transcript_novelty = [] start_end_info = {} - if not run_info.create_novel_spliced_genes and not fusion: gene_ID, match_strand = search_for_overlap_with_gene(chrom, positions[0], positions[-1], strand, @@ -1443,7 +1475,9 @@ def identify_transcript(chrom, positions, strand, cursor, location_dict, edge_di vertex_IDs, transcript_dict, gene_starts, gene_ends, edge_dict, location_dict, - vertex_2_gene, run_info) + vertex_2_gene, run_info, + cursor, tmp_gene) + print(f'geneID from process_nnc: {gene_ID}') # Transcripts that don't match the previous categories end up here if gene_ID == None: print('looking for this other stuff') @@ -1458,6 +1492,8 @@ def identify_transcript(chrom, positions, strand, cursor, location_dict, edge_di cursor, tmp_gene, fusion) + print('this is the gene id it decided on') + print(gene_ID) # Add all novel vertices to vertex_2_gene now that we have the gene ID vertex_IDs = start_end_info["vertex_IDs"] edge_IDs = start_end_info["edge_IDs"] @@ -1885,7 +1921,7 @@ def identify_monoexon_transcript(chrom, positions, strand, cursor, location_dict for match in matches: # get overlap and compare match_interval = [match['start'], match['end']] - overlap = get_overlap([start, end], match_interval) + overlap, perc_overlap = get_overlap([start, end], match_interval) if overlap >= best_overlap: best_overlap = overlap 
best_match = match From b73d6aea380e7e271f39ac07d25c8ee81f30b785 Mon Sep 17 00:00:00 2001 From: fairliereese Date: Mon, 25 Sep 2023 17:04:36 -0700 Subject: [PATCH 11/31] changed gene overlap heuristic to be % overlap --- src/talon/talon.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/talon/talon.py b/src/talon/talon.py index 14be17d..682998b 100644 --- a/src/talon/talon.py +++ b/src/talon/talon.py @@ -678,15 +678,15 @@ def get_best_match(matches, query_interval): overlap, perc_overlap = get_overlap(query_interval, match_interval) print(overlap) print(perc_overlap) - if overlap > max_overlap: + if perc_overlap > max_perc_overlap: max_overlap = overlap max_perc_overlap = perc_overlap best_match = match - elif overlap == max_overlap: - if perc_overlap > max_perc_overlap: - max_overlap = overlap - max_perc_overlap = perc_overlap - best_match = match + # elif overlap == max_overlap: + # if perc_overlap > max_perc_overlap: + # max_overlap = overlap + # max_perc_overlap = perc_overlap + # best_match = match print('best match') print(best_match['gene_ID']) From ac800cee95a15ef1fe88647d22d7c1f1ba04c35a Mon Sep 17 00:00:00 2001 From: fairliereese Date: Tue, 26 Sep 2023 13:48:04 -0700 Subject: [PATCH 12/31] changed tiebreaking b/w genes to sum of 3' + 5' distance of read ends to gene end --- src/talon/talon.py | 111 ++++++++++++++++++++++++++++++++------------- 1 file changed, 79 insertions(+), 32 deletions(-) diff --git a/src/talon/talon.py b/src/talon/talon.py index 682998b..2983695 100644 --- a/src/talon/talon.py +++ b/src/talon/talon.py @@ -604,7 +604,8 @@ def search_for_ISM(edge_IDs, transcript_dict): def search_for_overlap_with_gene(chromosome, start, end, strand, - cursor, run_info, tmp_gene): + cursor, run_info, tmp_gene, + gene_starts, gene_ends): """ Given a start and an end value for an interval, query the database to determine whether the interval overlaps with any genes. If it there is more than one match, prioritize same-strand first and foremost. @@ -615,8 +616,8 @@ def search_for_overlap_with_gene(chromosome, start, end, strand, min_start = min(start, end) max_end = max(start, end) query_interval = [min_start, max_end] - print('query interval') - print(query_interval) + # print('query interval') + # print(query_interval) query = Template(""" SELECT gene_ID, chromosome, @@ -655,44 +656,76 @@ def search_for_overlap_with_gene(chromosome, start, end, strand, strand == "-" and same_strand_matches == 0: matches = [x for x in matches if x["strand"] == "+"] - best_match = get_best_match(matches, query_interval) + # best_match = get_best_match(matches, query_interval) + best_match = get_best_match(matches, start, end, + gene_starts, gene_ends) else: matches = [x for x in matches if x["strand"] == "-"] - best_match = get_best_match(matches, query_interval) + # best_match = get_best_match(matches, query_interval) + best_match = get_best_match(matches, start, end, + gene_starts, gene_ends) return best_match['gene_ID'], best_match['strand'] +def get_best_match(matches, start, end, + gene_starts, gene_ends): + """ + Get the best gene match based on distances of start and end of + read to starts and ends from transcripts of genes. The gene with the + lowest absolute genomic distance between 5' ends and 3' ends will win. 
+ """ + min_dist = sys.maxsize + best_match = None -def get_best_match(matches, query_interval): - """ Given a set of gene matches and a query interval, return the match - that has the greatest amount of overlap with the query.""" + print(f'read start: {start}') + print(f'read end: {end}') - print('matching based on overlap') - max_overlap = 0 - max_perc_overlap = 0 - best_match = None + # TODO - maybe don't need gene_starts + gene_ends? for match in matches: - print(match['gene_ID']) - match_interval = [match['start'], match['end']] - overlap, perc_overlap = get_overlap(query_interval, match_interval) - print(overlap) - print(perc_overlap) - if perc_overlap > max_perc_overlap: - max_overlap = overlap - max_perc_overlap = perc_overlap + print() + print(f"gene: {match['gene_ID']}") + end_dist = abs(match['end']-end) + start_dist = abs(match['start']-start) + + print(f"gene start: {match['start']}") + print(f"gene end: {match['end']}") + dist = end_dist+start_dist + print(f'dist: {dist}') + if dist < min_dist: + min_dist = dist best_match = match - # elif overlap == max_overlap: - # if perc_overlap > max_perc_overlap: - # max_overlap = overlap - # max_perc_overlap = perc_overlap - # best_match = match print('best match') print(best_match['gene_ID']) return best_match + +# def get_best_match(matches, query_interval): +# """ Given a set of gene matches and a query interval, return the match +# that has the greatest amount of overlap with the query.""" +# +# print('matching based on overlap') +# max_overlap = 0 +# max_perc_overlap = 0 +# best_match = None +# for match in matches: +# print(match['gene_ID']) +# match_interval = [match['start'], match['end']] +# overlap, perc_overlap = get_overlap(query_interval, match_interval) +# print(overlap) +# print(perc_overlap) +# if overlap > max_overlap: +# max_overlap = overlap +# max_perc_overlap = perc_overlap +# best_match = match +# +# print('best match') +# print(best_match['gene_ID']) +# return best_match + + def get_overlap(a, b): """ Computes the amount of overlap between two intervals. Returns 0 if there is no overlap. The function treats the start and @@ -965,7 +998,8 @@ def process_ISM(chrom, positions, strand, edge_IDs, vertex_IDs, all_matches, tra def process_NIC(chrom, positions, strand, edge_IDs, vertex_IDs, transcript_dict, - gene_starts, gene_ends, edge_dict, locations, vertex_2_gene, run_info): + gene_starts, gene_ends, edge_dict, locations, vertex_2_gene, run_info, + cursor, tmp_gene): """ For a transcript that has been determined to be novel in catalog, find the proper gene match (documenting fusion event if applicable). 
To do this, look up each vertex in the vertex_2_gene dict, and keep track of all @@ -976,6 +1010,14 @@ def process_NIC(chrom, positions, strand, edge_IDs, vertex_IDs, transcript_dict, gene_ID, fusion = find_gene_match_on_vertex_basis(vertex_IDs, strand, vertex_2_gene) + # otherwise look for closest gene based on end differences + if gene_ID == None: + gene_ID, match_strand = search_for_overlap_with_gene(chrom, positions[0], + positions[-1], strand, + cursor, run_info, tmp_gene, + gene_starts, gene_ends) + print('geneid from search for overlap with gene 9NIC)') + print(gene_ID) if gene_ID == None: return None, None, [], None, fusion @@ -1143,11 +1185,12 @@ def process_NNC(chrom, positions, strand, edge_IDs, vertex_IDs, transcript_dict, gene_ID, fusion = find_gene_match_on_vertex_basis( vertex_IDs, strand, vertex_2_gene) - # otherwise look for genomic overlap with existing genes + # otherwise look for closest gene based on end differences if gene_ID == None: gene_ID, match_strand = search_for_overlap_with_gene(chrom, positions[0], positions[-1], strand, - cursor, run_info, tmp_gene) + cursor, run_info, tmp_gene, + gene_starts, gene_ends) print('geneid from search for overlap with gene') print(gene_ID) if gene_ID == None: @@ -1268,7 +1311,8 @@ def process_remaining_mult_cases(chrom, positions, strand, edge_IDs, vertex_IDs, if not run_info.create_novel_spliced_genes and not fusion: gene_ID, match_strand = search_for_overlap_with_gene(chrom, positions[0], positions[-1], strand, - cursor, run_info, tmp_gene) + cursor, run_info, tmp_gene, + gene_starts, gene_ends) else: gene_ID = None match_strand = None @@ -1437,7 +1481,8 @@ def identify_transcript(chrom, positions, strand, cursor, location_dict, edge_di vertex_IDs, transcript_dict, gene_starts, gene_ends, edge_dict, location_dict, - vertex_2_gene, run_info) + vertex_2_gene, run_info, + cursor, tmp_gene) # Novel in catalog transcripts have known splice donors and acceptors, # but new connections between them. 
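The tie-break this patch threads through the callers reduces to choosing the
candidate gene whose annotated span lies closest to the read's ends. A minimal
sketch of that rule, assuming hypothetical candidate records (the real
get_best_match above operates on sqlite3.Row matches and also considers
strand):

def closest_gene(candidates, read_start, read_end):
    # candidates: list of dicts with 'gene_ID', 'start', and 'end' keys.
    # Picks the gene minimizing |gene_start - read_start| + |gene_end - read_end|,
    # mirroring the min_dist loop in get_best_match.
    return min(
        candidates,
        key=lambda g: abs(g["start"] - read_start) + abs(g["end"] - read_end),
    )["gene_ID"]

# e.g. a read spanning 100-5000 goes to gene B, whose ends sit closer overall
genes = [{"gene_ID": "A", "start": 90, "end": 2000},
         {"gene_ID": "B", "start": 80, "end": 5100}]
assert closest_gene(genes, 100, 5000) == "B"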
@@ -1449,7 +1494,8 @@
                                                        vertex_IDs, transcript_dict,
                                                        gene_starts, gene_ends,
                                                        edge_dict, location_dict,
-                                                       vertex_2_gene, run_info)
+                                                       vertex_2_gene, run_info,
+                                                       cursor, tmp_gene)
 
     # Antisense transcript with splice junctions matching known gene
     if splice_vertices_known and gene_ID == None and not fusion:

From accf79524cdc7e5174b70ce8c05a2588a1747725 Mon Sep 17 00:00:00 2001
From: fairliereese
Date: Tue, 26 Sep 2023 19:07:33 -0700
Subject: [PATCH 13/31] made sure that NIC assignment tiebreaking doesn't run
 when finding readthrough loci

---
 src/talon/talon.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/talon/talon.py b/src/talon/talon.py
index 2983695..a3db0c2 100644
--- a/src/talon/talon.py
+++ b/src/talon/talon.py
@@ -1010,8 +1010,9 @@ def process_NIC(chrom, positions, strand, edge_IDs, vertex_IDs, transcript_dict,
     gene_ID, fusion = find_gene_match_on_vertex_basis(vertex_IDs,
                                                       strand,
                                                       vertex_2_gene)
-    # otherwise look for closest gene based on end differences
-    if gene_ID == None:
+    # otherwise look for closest gene based on end differences,
+    # only if it wasn't previously labeled as fusion
+    if gene_ID == None and fusion == False:
         gene_ID, match_strand = search_for_overlap_with_gene(chrom, positions[0],
                                                              positions[-1], strand,
                                                              cursor, run_info, tmp_gene,

From 49eb8d149a3fc3b52f8a1e44b1704ad99c762c26 Mon Sep 17 00:00:00 2001
From: fairliereese
Date: Wed, 27 Sep 2023 17:04:37 -0700
Subject: [PATCH 14/31] added more edge case fixes for annotating fusion
 transcripts

---
 src/talon/talon.py | 160 ++++++++++++++++++++++++++++++++++-----------
 1 file changed, 121 insertions(+), 39 deletions(-)

diff --git a/src/talon/talon.py b/src/talon/talon.py
index a3db0c2..c409043 100644
--- a/src/talon/talon.py
+++ b/src/talon/talon.py
@@ -456,7 +456,9 @@ def create_transcript(chromosome, start_pos, end_pos, gene_ID, edge_IDs, vertex_
                       transcript_dict):
     """Creates a novel transcript and adds it to the transcript data structure.
     """
+    print('creating new transcript')
     new_ID = transcript_counter.increment()
+    print(f'new tid:{new_ID}')
     if len(edge_IDs) > 1:
         jn_path = ",".join(map(str, edge_IDs[1:-1]))
     else:
@@ -605,13 +607,17 @@ def search_for_ISM(edge_IDs, transcript_dict):
 
 def search_for_overlap_with_gene(chromosome, start, end, strand,
-                                 cursor, run_info, tmp_gene):
+                                 cursor, run_info, tmp_gene,
+                                 gene_starts, gene_ends, gene_IDs=None):
     """ Given a start and an end value for an interval, query the database
         to determine whether the interval overlaps with any genes. If it
         there is more than one match, prioritize same-strand first and foremost.
-        If there is more than one same-strand option, prioritize amount of
-        overlap. Antisense matches may be returned if there is no same strand
-        match. """
+        If there is more than one same-strand option, prioritize distance from 3' / 5'.
+        Antisense matches may be returned if there is no same strand
+        match.
+ + Parameters: + gene_ID (list of str or None): Restrict results to genes in this list + """ print('in search for overlap with gene') min_start = min(start, end) max_end = max(start, end) @@ -638,6 +644,11 @@ def search_for_overlap_with_gene(chromosome, start, end, strand, if len(matches) == 0: return None, None + # restrict to just the genes we care about + if gene_IDs: + print(f'restricting just to {gene_IDs}') + matches = [match for match in matches if match['gene_ID'] in gene_IDs] + # Among multiple matches, preferentially return the same-strand gene with # the greatest amount of overlap # print('start+end') @@ -666,6 +677,8 @@ def search_for_overlap_with_gene(chromosome, start, end, strand, best_match = get_best_match(matches, start, end, gene_starts, gene_ends) + print(f"but right here it says {best_match['gene_ID']}") + return best_match['gene_ID'], best_match['strand'] def get_best_match(matches, start, end, @@ -886,7 +899,8 @@ def process_3p(chrom, positions, strand, vertex_IDs, gene_ID, gene_ends, edge_di def process_ISM(chrom, positions, strand, edge_IDs, vertex_IDs, all_matches, transcript_dict, - gene_starts, gene_ends, edge_dict, locations, run_info): + gene_starts, gene_ends, edge_dict, locations, run_info, + cursor, tmp_gene): """ Given a transcript, try to find an ISM match for it. If the best match is an ISM with known ends, that will be promoted to NIC. """ @@ -899,7 +913,26 @@ def process_ISM(chrom, positions, strand, edge_IDs, vertex_IDs, all_matches, tra ISM = [] suffix = [] prefix = [] - gene_ID = all_matches[0]['gene_ID'] + + # choose gene to assign it to + gene_matches = list(set([match['gene_ID'] for match in all_matches])) + print(gene_matches) + + # tie break based on distance to 5' / 3' ends + if len(gene_matches) > 1: + gene_ID, _ = search_for_overlap_with_gene(chrom, positions[0], + positions[-1], strand, cursor, run_info, tmp_gene, + gene_starts, gene_ends, gene_IDs=gene_matches) + all_matches = [m for m in all_matches if m['gene_ID'] == gene_ID] + else: + gene_ID = all_matches[0]['gene_ID'] + + # print('edge IDs') + # print(edge_IDs) + # for match in all_matches: + # print(f"gene id:{match['gene_ID']}") + # print(match['jn_path']) + # Get matches for the ends if n_exons > 1: start_vertex, start_exon, start_novelty, known_start, diff_5p = process_5p(chrom, @@ -932,7 +965,7 @@ def process_ISM(chrom, positions, strand, edge_IDs, vertex_IDs, all_matches, tra known_start = 0 known_end = 0 - # Iterate over matches to characterize ISMs + # Iterate over all matches from assigned gene to characterize ISMs for match in all_matches: # Add ISM @@ -996,6 +1029,36 @@ def process_ISM(chrom, positions, strand, edge_IDs, vertex_IDs, all_matches, tra return gene_ID, transcript_ID, novelty, start_end_info +def assign_gene(vertex_IDs, strand, vertex_2_gene, + chrom, start, end, cursor, run_info, + tmp_gene, gene_starts, gene_ends): + """ + Assign a gene to a transcript. First do this on the basis of splice site + matching. If this yields more than one gene, then choose the gene with the + closest 5' / 3' ends. If the splice site matching returns multiple matches + between non-overlapping genes, mark as fusion and do not assign a gene. 
+ + Returns: + gene_ID (str or None): Gene ID of assigned gene, None if not fount + fusion (bool): Whether read appears to come from a novel fusion gene + """ + + # first attempt to assign based on matching vertices + gene_ID, fusion = find_gene_match_on_vertex_basis(vertex_IDs, + strand, + vertex_2_gene) + + # if previous function returned more than one gene that we need to tiebreak, + # look for closest gene based on end differences, out of candidate genes + # only if it wasn't previously labeled as fusion + if type(gene_ID) == list and fusion == False: + gene_ID, match_strand = search_for_overlap_with_gene(chrom, start, + end, strand, + cursor, run_info, tmp_gene, + gene_starts, gene_ends, + gene_IDs=gene_ID) + return gene_ID, fusion + def process_NIC(chrom, positions, strand, edge_IDs, vertex_IDs, transcript_dict, gene_starts, gene_ends, edge_dict, locations, vertex_2_gene, run_info, @@ -1006,19 +1069,23 @@ def process_NIC(chrom, positions, strand, edge_IDs, vertex_IDs, transcript_dict, same-strand genes. """ start_end_info = {} + gene_ID, fusion = assign_gene(vertex_IDs, strand, vertex_2_gene, + chrom, positions[0], positions[-1], cursor, run_info, + tmp_gene, gene_starts, gene_ends) + + # gene_ID, fusion = find_gene_match_on_vertex_basis(vertex_IDs, + # strand, + # vertex_2_gene) + # # otherwise look for closest gene based on end differences, + # # only if it wasn't previously labeled as fusion + # if gene_ID == None and fusion == False: + # gene_ID, match_strand = search_for_overlap_with_gene(chrom, positions[0], + # positions[-1], strand, + # cursor, run_info, tmp_gene, + # gene_starts, gene_ends) + # print('geneid from search for overlap with gene 9NIC)') + # print(gene_ID) - gene_ID, fusion = find_gene_match_on_vertex_basis(vertex_IDs, - strand, - vertex_2_gene) - # otherwise look for closest gene based on end differences, - # only if it wasn't previously labeled as fusion - if gene_ID == None and fusion == False: - gene_ID, match_strand = search_for_overlap_with_gene(chrom, positions[0], - positions[-1], strand, - cursor, run_info, tmp_gene, - gene_starts, gene_ends) - print('geneid from search for overlap with gene 9NIC)') - print(gene_ID) if gene_ID == None: return None, None, [], None, fusion @@ -1148,8 +1215,10 @@ def find_gene_match_on_vertex_basis(vertex_IDs, strand, vertex_2_gene): # if we hit more than one gene and they have overlapping sjs, # tie break based on ????? 
elif len(gene_tally) > 1: - print('i am here') - return None, False + print('i found more than one gene') + print(gene_tally) + print(n_gene_matches) + return list(gene_tally.keys()), False # temp = df.loc[df.gid.isin(gene_matches)].copy(deep=True) # temp = temp.drop_duplicates() # @@ -1182,20 +1251,27 @@ def process_NNC(chrom, positions, strand, edge_IDs, vertex_IDs, transcript_dict, novelty = [] start_end_info = {} - # first try to assign gene based on vertex concordance - gene_ID, fusion = find_gene_match_on_vertex_basis( - vertex_IDs, strand, vertex_2_gene) + # # first try to assign gene based on vertex concordance + # gene_ID, fusion = find_gene_match_on_vertex_basis( + # vertex_IDs, strand, vertex_2_gene) + # + # # otherwise look for closest gene based on end differences + # if gene_ID == None and fusion == False: + # gene_ID, match_strand = search_for_overlap_with_gene(chrom, positions[0], + # positions[-1], strand, + # cursor, run_info, tmp_gene, + # gene_starts, gene_ends) + # print('geneid from search for overlap with gene') + # print(gene_ID) + gene_ID, fusion = assign_gene(vertex_IDs, strand, vertex_2_gene, + chrom, positions[0], positions[-1], cursor, run_info, + tmp_gene, gene_starts, gene_ends) + print('gene id process_nnc') + print(gene_ID) + print(fusion) - # otherwise look for closest gene based on end differences if gene_ID == None: - gene_ID, match_strand = search_for_overlap_with_gene(chrom, positions[0], - positions[-1], strand, - cursor, run_info, tmp_gene, - gene_starts, gene_ends) - print('geneid from search for overlap with gene') - print(gene_ID) - if gene_ID == None: - return None, None, [], None, False + return None, None, [], None, fusion # Get matches for the ends start_vertex, start_exon, start_novelty, known_start, diff_5p = process_5p(chrom, @@ -1346,11 +1422,13 @@ def process_remaining_mult_cases(chrom, positions, strand, edge_IDs, vertex_IDs, start_end_info["vertex_IDs"] = vertex_IDs if gene_ID == None: - + print(f'fusion: {fusion}') if fusion: + print('i should be here') t_nov = 'fusion_transcript' g_nov = 'fusion_novel' else: + print('but I think im going here') t_nov = 'intergenic_transcript' g_nov = 'intergenic_novel' @@ -1446,6 +1524,7 @@ def identify_transcript(chrom, positions, strand, cursor, location_dict, edge_di all_exons_known = check_all_exons_known(e_novelty) splice_vertices_known = (sum(v_novelty) == 0) all_exons_novel = (reduce(operator.mul, e_novelty, 1) == 1) + print(f'all exons novel : {all_exons_novel}') fusion = False # Look for FSM or ISM. @@ -1472,7 +1551,9 @@ def identify_transcript(chrom, positions, strand, cursor, location_dict, edge_di transcript_dict, gene_starts, gene_ends, edge_dict, location_dict, - run_info) + run_info, cursor, tmp_gene) + print(f'gene id from process ism {gene_ID}') + # Look for NIC if gene_ID == None: print('looking for nic') @@ -1513,8 +1594,9 @@ def identify_transcript(chrom, positions, strand, cursor, location_dict, edge_di cursor, tmp_gene) # Novel not in catalog transcripts contain new splice donors/acceptors - # and contain at least one splice junction. - elif not(splice_vertices_known) and not fusion: + # and contain at least one splice junction. 
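The novelty flags that gate these branches are plain arithmetic over 0/1 indicators, where 1 marks a novel exon or vertex. A worked illustration with hypothetical flag vectors:

    import operator
    from functools import reduce

    e_novelty = [0, 1, 0]   # one novel exon
    v_novelty = [0, 0, 0]   # every splice vertex is known

    splice_vertices_known = (sum(v_novelty) == 0)                # True: no novel vertices
    all_exons_novel = (reduce(operator.mul, e_novelty, 1) == 1)  # product is 1 only if all flags are 1
    print(splice_vertices_known, all_exons_novel)                # True False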
There should also be at least + # one shared exon from existing transcripts to even try assigning a gene + elif not(splice_vertices_known) and not fusion and not all_exons_novel: print('lookign for NNCs') gene_ID, transcript_ID, transcript_novelty, start_end_info, fusion = process_NNC(chrom, positions, @@ -1524,7 +1606,7 @@ def identify_transcript(chrom, positions, strand, cursor, location_dict, edge_di edge_dict, location_dict, vertex_2_gene, run_info, cursor, tmp_gene) - print(f'geneID from process_nnc: {gene_ID}') + print(f'geneID from process_nnc: {gene_ID}') # Transcripts that don't match the previous categories end up here if gene_ID == None: print('looking for this other stuff') @@ -2006,7 +2088,7 @@ def identify_monoexon_transcript(chrom, positions, strand, cursor, location_dict transcript_dict, gene_starts, gene_ends, edge_dict, location_dict, - run_info) + run_info, cursor, tmp_gene) if gene_ID == None: # Find best gene match using overlap search if the ISM/NIC check didn't work gene_ID, match_strand = search_for_overlap_with_gene(chrom, positions[0], From 200fd72aa9bcf7a3103c3214575b9a851828d5e3 Mon Sep 17 00:00:00 2001 From: fairliereese Date: Wed, 27 Sep 2023 20:23:34 -0700 Subject: [PATCH 15/31] fixed (hopefully) antisense multimatch gene case --- src/talon/talon.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/talon/talon.py b/src/talon/talon.py index c409043..0ca09b1 100644 --- a/src/talon/talon.py +++ b/src/talon/talon.py @@ -1326,6 +1326,12 @@ def process_spliced_antisense(chrom, positions, strand, edge_IDs, vertex_IDs, anti_strand = "+" anti_gene_ID, fusion = find_gene_match_on_vertex_basis(vertex_IDs, anti_strand, vertex_2_gene) + if type(anti_gene_ID) == list and fusion == False: + anti_gene_ID, match_strand = search_for_overlap_with_gene(chrom, positions[0], + positions[-1], strand, + cursor, run_info, tmp_gene, + gene_starts, gene_ends, + gene_IDs=gene_ID) if anti_gene_ID == None: return None, None, gene_novelty, transcript_novelty, start_end_info From 939ee26e9a7115f2104736678871342f8ce062ed Mon Sep 17 00:00:00 2001 From: fairliereese Date: Wed, 27 Sep 2023 21:03:22 -0700 Subject: [PATCH 16/31] fixed typo --- src/talon/talon.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/talon/talon.py b/src/talon/talon.py index 0ca09b1..7fd1d46 100644 --- a/src/talon/talon.py +++ b/src/talon/talon.py @@ -1330,6 +1330,8 @@ def process_spliced_antisense(chrom, positions, strand, edge_IDs, vertex_IDs, anti_gene_ID, match_strand = search_for_overlap_with_gene(chrom, positions[0], positions[-1], strand, cursor, run_info, tmp_gene, + gene_starts, gene_ends, + gene_IDs=anti_gene_ID) gene_starts, gene_ends, gene_IDs=gene_ID) if anti_gene_ID == None: From 00705c7b4561df734d52cb6a7ce42b538d483ef0 Mon Sep 17 00:00:00 2001 From: fairliereese Date: Wed, 27 Sep 2023 21:08:15 -0700 Subject: [PATCH 17/31] fixed typo --- src/talon/talon.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/talon/talon.py b/src/talon/talon.py index 7fd1d46..417bc36 100644 --- a/src/talon/talon.py +++ b/src/talon/talon.py @@ -1332,8 +1332,6 @@ def process_spliced_antisense(chrom, positions, strand, edge_IDs, vertex_IDs, cursor, run_info, tmp_gene, gene_starts, gene_ends, gene_IDs=anti_gene_ID) - gene_starts, gene_ends, - gene_IDs=gene_ID) if anti_gene_ID == None: return None, None, gene_novelty, transcript_novelty, start_end_info From d6f8d3a39526a541c78d9765995deaa92e9deaf6 Mon Sep 17 00:00:00 2001 From: fairliereese Date: Thu, 28 Sep 2023 12:03:17 -0700 Subject: [PATCH 18/31] hopefully 
fixed an edge case --- src/talon/talon.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/talon/talon.py b/src/talon/talon.py index 417bc36..46ae61c 100644 --- a/src/talon/talon.py +++ b/src/talon/talon.py @@ -641,14 +641,15 @@ def search_for_overlap_with_gene(chromosome, start, end, strand, cursor.execute(query) matches = cursor.fetchall() - if len(matches) == 0: - return None, None - # restrict to just the genes we care about if gene_IDs: print(f'restricting just to {gene_IDs}') matches = [match for match in matches if match['gene_ID'] in gene_IDs] + + if len(matches) == 0: + return None, None + # Among multiple matches, preferentially return the same-strand gene with # the greatest amount of overlap # print('start+end') From 025f4632c23cea311385cb45c8554e58f7634d6a Mon Sep 17 00:00:00 2001 From: fairliereese Date: Fri, 29 Sep 2023 10:50:46 -0700 Subject: [PATCH 19/31] changed behavior to only make a novel gene for reads w/ no known splice sites only if it doesn't overlap just one gene (ie if it is a fusion candidate) --- src/talon/talon.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/talon/talon.py b/src/talon/talon.py index 46ae61c..c37a9c7 100644 --- a/src/talon/talon.py +++ b/src/talon/talon.py @@ -648,6 +648,7 @@ def search_for_overlap_with_gene(chromosome, start, end, strand, if len(matches) == 0: + print('uwu here') return None, None # Among multiple matches, preferentially return the same-strand gene with @@ -928,6 +929,10 @@ def process_ISM(chrom, positions, strand, edge_IDs, vertex_IDs, all_matches, tra else: gene_ID = all_matches[0]['gene_ID'] + # if we didn't assign a gene ID + if gene_ID == None: + return None, None, [], None + # print('edge IDs') # print(edge_IDs) # for match in all_matches: @@ -1040,7 +1045,7 @@ def assign_gene(vertex_IDs, strand, vertex_2_gene, between non-overlapping genes, mark as fusion and do not assign a gene. 
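The reordering in the hunk above (moving the emptiness check after the gene_IDs filter) matters because the filter can itself empty the match list; under the old order an empty post-filter list reached get_best_match, whose loop never assigns best_match, so indexing into the returned None raised a TypeError. A condensed illustration with stand-in data:

    # Every overlap match belongs to a gene outside the tie-break candidates
    matches = [{'gene_ID': 'G3', 'strand': '+'}]
    gene_IDs = ['G1', 'G2']

    # Old order: emptiness was tested before the filter ran ...
    if len(matches) == 0:
        raise SystemExit('no overlap')   # not triggered here
    matches = [m for m in matches if m['gene_ID'] in gene_IDs]

    # ... leaving an empty list for the tie-break step downstream
    assert matches == []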
Returns: - gene_ID (str or None): Gene ID of assigned gene, None if not fount + gene_ID (str or None): Gene ID of assigned gene, None if not found fusion (bool): Whether read appears to come from a novel fusion gene """ @@ -1392,7 +1397,8 @@ def process_remaining_mult_cases(chrom, positions, strand, edge_IDs, vertex_IDs, gene_novelty = [] transcript_novelty = [] start_end_info = {} - if not run_info.create_novel_spliced_genes and not fusion: + if not run_info.create_novel_spliced_genes or not fusion: + print('did i get here?') gene_ID, match_strand = search_for_overlap_with_gene(chrom, positions[0], positions[-1], strand, cursor, run_info, tmp_gene, From 4fa3a1f09e34a8a61093ba65706b9f86ad2416f3 Mon Sep 17 00:00:00 2001 From: fairliereese Date: Mon, 2 Oct 2023 15:59:28 -0700 Subject: [PATCH 20/31] changed tie breaking to work based on 3'/5' distance to any transcript from gene, rather than min/max 3'/5' of all transcripts from gene --- src/talon/init_refs.py | 61 +++++++++++++++++++ src/talon/talon.py | 133 ++++++++++++++++++++++++++--------------- 2 files changed, 145 insertions(+), 49 deletions(-) diff --git a/src/talon/init_refs.py b/src/talon/init_refs.py index 7c36710..df7f86f 100644 --- a/src/talon/init_refs.py +++ b/src/talon/init_refs.py @@ -72,6 +72,67 @@ def make_temp_novel_gene_table(cursor, build, chrom = None, start = None, return tmp_tab +def make_temp_transcript_table(cursor, build, chrom = None, + start = None, end = None, + tmp_tab = "temp_transcript"): + """ Attaches a temporary database with a table that has the following fields: + - gene_ID + - transcript_ID + - chromosome + - start (min position) + - end (max position) + - strand + The purpose is to allow location-based matching tiebreaking + transcripts. """ + + if any(val == None for val in [chrom, start, end]): + command = Template(""" CREATE TEMPORARY TABLE IF NOT EXISTS $tmp_tab AS + SELECT t.gene_ID, + t.transcript_ID, + loc1.chromosome, + genes.strand, + MIN(loc1.position, loc2.position) as min_pos, + MAX(loc1.position, loc2.position) as max_pos + FROM transcripts as t + LEFT JOIN location as loc1 + ON loc1.location_ID = t.start_vertex + LEFT JOIN location as loc2 + ON loc2.location_ID = t.end_vertex + LEFT JOIN genes + ON genes.gene_ID = t.gene_ID + WHERE loc1.genome_build = '$build' + AND loc2.genome_build = '$build' """) + else: + command = Template(""" CREATE TEMPORARY TABLE IF NOT EXISTS $tmp_tab AS + SELECT t.gene_ID, + t.transcript_ID, + loc1.chromosome, + genes.strand, + t.start_exon as exon_ID, + MIN(loc1.position, loc2.position) as min_pos, + MAX(loc1.position, loc2.position) as max_pos + FROM transcripts as t + LEFT JOIN location as loc1 + ON loc1.location_ID = t.start_vertex + LEFT JOIN location as loc2 + ON loc2.location_ID = t.end_vertex + LEFT JOIN genes + ON genes.gene_ID = t.gene_ID + WHERE loc1.genome_build = '$build' + AND loc2.genome_build = '$build' + AND loc1.chromosome = '$chrom' + AND ((min_pos <= $start AND max_pos >= $end) + OR (min_pos >= $start AND max_pos <= $end) + OR (min_pos >= $start AND min_pos <= $end) + OR (max_pos >= $start AND max_pos <= $end))""") + + command = command.substitute({'build':build, 'chrom':chrom, + 'start':start, 'end':end, + 'tmp_tab':tmp_tab}) + cursor.execute(command) + + return tmp_tab + def make_temp_monoexonic_transcript_table(cursor, build, chrom = None, start = None, end = None, tmp_tab = "temp_monoexon"): diff --git a/src/talon/talon.py b/src/talon/talon.py index c37a9c7..418ee2e 100644 --- a/src/talon/talon.py +++ b/src/talon/talon.py @@ -606,8 +606,8 
@@ def search_for_ISM(edge_IDs, transcript_dict): def search_for_overlap_with_gene(chromosome, start, end, strand, - cursor, run_info, tmp_gene, - gene_starts, gene_ends, gene_IDs=None): + cursor, run_info, tmp_gene, tmp_t, + gene_IDs=None): """ Given a start and an end value for an interval, query the database to determine whether the interval overlaps with any genes. If it there is more than one match, prioritize same-strand first and foremost. @@ -625,21 +625,46 @@ def search_for_overlap_with_gene(chromosome, start, end, strand, # print('query interval') # print(query_interval) - query = Template(""" SELECT gene_ID, - chromosome, - MIN(start) AS start, - MAX(end) AS end, - strand - FROM $tmp_gene - WHERE (chromosome = '$chrom') AND - ((start <= $min_start AND end >= $max_end) OR - (start >= $min_start AND end <= $max_end) OR - (start >= $min_start AND start <= $max_end) OR - (end >= $min_start AND end <= $max_end)) - GROUP BY gene_ID;""").substitute({'tmp_gene': tmp_gene, 'chrom': chromosome, - 'min_start': min_start, 'max_end': max_end}) + # query = Template(""" SELECT gene_ID, + # chromosome, + # MIN(start) AS start, + # MAX(end) AS end, + # strand + # FROM $tmp_gene + # WHERE (chromosome = '$chrom') AND + # ((start <= $min_start AND end >= $max_end) OR + # (start >= $min_start AND end <= $max_end) OR + # (start >= $min_start AND start <= $max_end) OR + # (end >= $min_start AND end <= $max_end)) + # GROUP BY gene_ID;""").substitute({'tmp_gene': tmp_gene, 'chrom': chromosome, + # 'min_start': min_start, 'max_end': max_end}) + if isinstance(gene_IDs, list): + query = Template("""SELECT gene_ID, + chromosome, + min_pos, + max_pos, + strand + FROM $tmp_t + WHERE gene_ID IN $gene_ids""").substitute({'tmp_t': tmp_t, \ + 'gene_ids': qutils.format_for_IN(gene_IDs)}) + elif not gene_IDs: + query = Template("""SELECT gene_ID, + chromosome, + min_pos, + max_pos, + strand + FROM $tmp_t + WHERE (chromosome = '$chrom') AND + ((start <= $min_start AND end >= $max_end) OR + (start >= $min_start AND end <= $max_end) OR + (start >= $min_start AND start <= $max_end) OR + (end >= $min_start AND end <= $max_end)) + GROUP BY gene_ID;""").substitute({'tmp_t': tmp_t, 'chrom': chromosome, + 'min_start': min_start, 'max_end': max_end}) cursor.execute(query) matches = cursor.fetchall() + print('quwewy:') + print(query) # restrict to just the genes we care about if gene_IDs: @@ -648,7 +673,7 @@ def search_for_overlap_with_gene(chromosome, start, end, strand, if len(matches) == 0: - print('uwu here') + print('herere here') return None, None # Among multiple matches, preferentially return the same-strand gene with @@ -670,21 +695,18 @@ def search_for_overlap_with_gene(chromosome, start, end, strand, matches = [x for x in matches if x["strand"] == "+"] # best_match = get_best_match(matches, query_interval) - best_match = get_best_match(matches, start, end, - gene_starts, gene_ends) + best_match = get_best_match(matches, min_start, max_end) else: matches = [x for x in matches if x["strand"] == "-"] # best_match = get_best_match(matches, query_interval) - best_match = get_best_match(matches, start, end, - gene_starts, gene_ends) + best_match = get_best_match(matches, min_start, max_end) print(f"but right here it says {best_match['gene_ID']}") return best_match['gene_ID'], best_match['strand'] -def get_best_match(matches, start, end, - gene_starts, gene_ends): +def get_best_match(matches, min_end, max_end): """ Get the best gene match based on distances of start and end of read to starts and ends from transcripts of 
genes. The gene with the @@ -693,18 +715,17 @@ def get_best_match(matches, start, end, min_dist = sys.maxsize best_match = None - print(f'read start: {start}') - print(f'read end: {end}') + print(f'read min: {min_end}') + print(f'read end: {max_end}') - # TODO - maybe don't need gene_starts + gene_ends? for match in matches: print() print(f"gene: {match['gene_ID']}") - end_dist = abs(match['end']-end) - start_dist = abs(match['start']-start) + end_dist = abs(match['max_pos']-max_end) + start_dist = abs(match['min_pos']-min_end) - print(f"gene start: {match['start']}") - print(f"gene end: {match['end']}") + print(f"gene start: {match['min_pos']}") + print(f"gene end: {match['max_pos']}") dist = end_dist+start_dist print(f'dist: {dist}') if dist < min_dist: @@ -902,7 +923,7 @@ def process_3p(chrom, positions, strand, vertex_IDs, gene_ID, gene_ends, edge_di def process_ISM(chrom, positions, strand, edge_IDs, vertex_IDs, all_matches, transcript_dict, gene_starts, gene_ends, edge_dict, locations, run_info, - cursor, tmp_gene): + cursor, tmp_gene, tmp_t): """ Given a transcript, try to find an ISM match for it. If the best match is an ISM with known ends, that will be promoted to NIC. """ @@ -924,7 +945,7 @@ def process_ISM(chrom, positions, strand, edge_IDs, vertex_IDs, all_matches, tra if len(gene_matches) > 1: gene_ID, _ = search_for_overlap_with_gene(chrom, positions[0], positions[-1], strand, cursor, run_info, tmp_gene, - gene_starts, gene_ends, gene_IDs=gene_matches) + tmp_t, gene_IDs=gene_matches) all_matches = [m for m in all_matches if m['gene_ID'] == gene_ID] else: gene_ID = all_matches[0]['gene_ID'] @@ -1037,7 +1058,7 @@ def process_ISM(chrom, positions, strand, edge_IDs, vertex_IDs, all_matches, tra def assign_gene(vertex_IDs, strand, vertex_2_gene, chrom, start, end, cursor, run_info, - tmp_gene, gene_starts, gene_ends): + tmp_gene, tmp_t, gene_starts, gene_ends): """ Assign a gene to a transcript. First do this on the basis of splice site matching. If this yields more than one gene, then choose the gene with the @@ -1061,14 +1082,14 @@ def assign_gene(vertex_IDs, strand, vertex_2_gene, gene_ID, match_strand = search_for_overlap_with_gene(chrom, start, end, strand, cursor, run_info, tmp_gene, - gene_starts, gene_ends, + tmp_t, gene_IDs=gene_ID) return gene_ID, fusion def process_NIC(chrom, positions, strand, edge_IDs, vertex_IDs, transcript_dict, gene_starts, gene_ends, edge_dict, locations, vertex_2_gene, run_info, - cursor, tmp_gene): + cursor, tmp_gene, tmp_t): """ For a transcript that has been determined to be novel in catalog, find the proper gene match (documenting fusion event if applicable). 
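Worth noting about the range conditions in these queries: the four OR-ed clauses are an expansion of the standard interval-overlap test and collapse to a single conjunction. A quick exhaustive check in plain Python (coordinates are arbitrary):

    import itertools

    def overlaps_4clause(min_pos, max_pos, min_start, max_end):
        return ((min_pos <= min_start and max_pos >= max_end) or
                (min_pos >= min_start and max_pos <= max_end) or
                (min_start <= min_pos <= max_end) or
                (min_start <= max_pos <= max_end))

    def overlaps_simple(min_pos, max_pos, min_start, max_end):
        return min_pos <= max_end and max_pos >= min_start

    for a, b, c, d in itertools.product(range(5), repeat=4):
        if a <= b and c <= d:  # only well-formed intervals
            assert overlaps_4clause(a, b, c, d) == overlaps_simple(a, b, c, d)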
To do this, look up each vertex in the vertex_2_gene dict, and keep track of all @@ -1077,7 +1098,7 @@ def process_NIC(chrom, positions, strand, edge_IDs, vertex_IDs, transcript_dict, start_end_info = {} gene_ID, fusion = assign_gene(vertex_IDs, strand, vertex_2_gene, chrom, positions[0], positions[-1], cursor, run_info, - tmp_gene, gene_starts, gene_ends) + tmp_gene, tmp_t, gene_starts, gene_ends) # gene_ID, fusion = find_gene_match_on_vertex_basis(vertex_IDs, # strand, @@ -1251,7 +1272,7 @@ def find_gene_match_on_vertex_basis(vertex_IDs, strand, vertex_2_gene): def process_NNC(chrom, positions, strand, edge_IDs, vertex_IDs, transcript_dict, gene_starts, gene_ends, edge_dict, locations, vertex_2_gene, run_info, - cursor, tmp_gene): + cursor, tmp_gene, tmp_t): """ Novel not in catalog case """ novelty = [] @@ -1271,7 +1292,7 @@ def process_NNC(chrom, positions, strand, edge_IDs, vertex_IDs, transcript_dict, # print(gene_ID) gene_ID, fusion = assign_gene(vertex_IDs, strand, vertex_2_gene, chrom, positions[0], positions[-1], cursor, run_info, - tmp_gene, gene_starts, gene_ends) + tmp_gene, tmp_t, gene_starts, gene_ends) print('gene id process_nnc') print(gene_ID) print(fusion) @@ -1319,7 +1340,8 @@ def process_NNC(chrom, positions, strand, edge_IDs, vertex_IDs, transcript_dict, def process_spliced_antisense(chrom, positions, strand, edge_IDs, vertex_IDs, transcript_dict, gene_starts, gene_ends, edge_dict, - locations, vertex_2_gene, run_info, cursor, tmp_gene): + locations, vertex_2_gene, run_info, cursor, tmp_gene, + tmp_t): """ Annotate a transcript as antisense with splice junctions """ gene_novelty = [] @@ -1336,7 +1358,7 @@ def process_spliced_antisense(chrom, positions, strand, edge_IDs, vertex_IDs, anti_gene_ID, match_strand = search_for_overlap_with_gene(chrom, positions[0], positions[-1], strand, cursor, run_info, tmp_gene, - gene_starts, gene_ends, + tmp_t, gene_IDs=anti_gene_ID) if anti_gene_ID == None: return None, None, gene_novelty, transcript_novelty, start_end_info @@ -1390,6 +1412,7 @@ def process_spliced_antisense(chrom, positions, strand, edge_IDs, vertex_IDs, def process_remaining_mult_cases(chrom, positions, strand, edge_IDs, vertex_IDs, transcript_dict, gene_starts, gene_ends, edge_dict, locations, vertex_2_gene, run_info, cursor, tmp_gene, + tmp_t, fusion): """ This function is a catch-all for multiexonic transcripts that were not FSM, ISM, NIC, NNC, or spliced antisense. 
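The per-transcript bounds table added earlier in this patch by make_temp_transcript_table comes down to CREATE TEMPORARY TABLE ... AS SELECT with SQLite's scalar two-argument MIN/MAX. A toy version against a flattened stand-in schema (the real query joins transcripts to a location table):

    import sqlite3

    conn = sqlite3.connect(':memory:')
    conn.executescript("""
        CREATE TABLE transcripts (gene_ID, transcript_ID, start_pos, end_pos);
        INSERT INTO transcripts VALUES ('G1', 'T1', 100, 900), ('G1', 'T2', 1100, 150);
        CREATE TEMPORARY TABLE temp_t AS
            SELECT gene_ID, transcript_ID,
                   MIN(start_pos, end_pos) AS min_pos,
                   MAX(start_pos, end_pos) AS max_pos
            FROM transcripts;
    """)
    print(conn.execute('SELECT * FROM temp_t').fetchall())
    # [('G1', 'T1', 100, 900), ('G1', 'T2', 150, 1100)]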
@@ -1402,7 +1425,7 @@ def process_remaining_mult_cases(chrom, positions, strand, edge_IDs, vertex_IDs, gene_ID, match_strand = search_for_overlap_with_gene(chrom, positions[0], positions[-1], strand, cursor, run_info, tmp_gene, - gene_starts, gene_ends) + tmp_t) else: gene_ID = None match_strand = None @@ -1496,7 +1519,7 @@ def update_vertex_2_gene(gene_ID, vertex_IDs, strand, vertex_2_gene): def identify_transcript(chrom, positions, strand, cursor, location_dict, edge_dict, transcript_dict, vertex_2_gene, gene_starts, gene_ends, - run_info, tmp_gene): + run_info, tmp_gene, tmp_t): """ Inputs: - Information about the query transcript - chromosome @@ -1564,7 +1587,7 @@ def identify_transcript(chrom, positions, strand, cursor, location_dict, edge_di transcript_dict, gene_starts, gene_ends, edge_dict, location_dict, - run_info, cursor, tmp_gene) + run_info, cursor, tmp_gene, tmp_t) print(f'gene id from process ism {gene_ID}') # Look for NIC @@ -1577,7 +1600,8 @@ def identify_transcript(chrom, positions, strand, cursor, location_dict, edge_di gene_starts, gene_ends, edge_dict, location_dict, vertex_2_gene, run_info, - cursor, tmp_gene) + cursor, tmp_gene, + tmp_t) # Novel in catalog transcripts have known splice donors and acceptors, # but new connections between them. @@ -1590,7 +1614,8 @@ def identify_transcript(chrom, positions, strand, cursor, location_dict, edge_di gene_starts, gene_ends, edge_dict, location_dict, vertex_2_gene, run_info, - cursor, tmp_gene) + cursor, tmp_gene, + tmp_t) # Antisense transcript with splice junctions matching known gene if splice_vertices_known and gene_ID == None and not fusion: @@ -1604,7 +1629,7 @@ def identify_transcript(chrom, positions, strand, cursor, location_dict, edge_di gene_ends, edge_dict, location_dict, vertex_2_gene, run_info, - cursor, tmp_gene) + cursor, tmp_gene, tmp_t) # Novel not in catalog transcripts contain new splice donors/acceptors # and contain at least one splice junction. 
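One reason tmp_t is threaded through every call alongside the cursor in these hunks: SQLite temporary tables are visible only to the connection that created them, so the table and the cursor that queries it must belong together. A minimal demonstration (the file name is arbitrary):

    import sqlite3

    conn_a = sqlite3.connect('demo.db')
    conn_b = sqlite3.connect('demo.db')

    conn_a.execute('CREATE TEMPORARY TABLE temp_t (gene_ID TEXT)')
    conn_a.execute("INSERT INTO temp_t VALUES ('G1')")
    print(conn_a.execute('SELECT * FROM temp_t').fetchall())  # [('G1',)]

    try:
        conn_b.execute('SELECT * FROM temp_t')  # same file, different connection
    except sqlite3.OperationalError as err:
        print(err)  # no such table: temp_t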
There should also be at least @@ -1618,7 +1643,8 @@ def identify_transcript(chrom, positions, strand, cursor, location_dict, edge_di gene_starts, gene_ends, edge_dict, location_dict, vertex_2_gene, run_info, - cursor, tmp_gene) + cursor, tmp_gene, + tmp_t) print(f'geneID from process_nnc: {gene_ID}') # Transcripts that don't match the previous categories end up here if gene_ID == None: @@ -1632,6 +1658,7 @@ def identify_transcript(chrom, positions, strand, cursor, location_dict, edge_di edge_dict, location_dict, vertex_2_gene, run_info, cursor, tmp_gene, + tmp_t, fusion) print('this is the gene id it decided on') @@ -1970,6 +1997,11 @@ def prepare_data_structures(cursor, run_info, chrom=None, start=None, start=start, end=end, tmp_tab="temp_monoexon_" + tmp_id) + struct_collection.tmp_t = init_refs.make_temp_transcript_table(cursor, + build, chrom=chrom, + start=start, end=end, + tmp_tab="temp_t_" + tmp_id) + location_dict = init_refs.make_location_dict(build, cursor, chrom=chrom, start=start, end=end) @@ -2027,6 +2059,7 @@ def compute_delta(orig_pos, new_pos, strand): def identify_monoexon_transcript(chrom, positions, strand, cursor, location_dict, edge_dict, transcript_dict, vertex_2_gene, gene_starts, gene_ends, run_info, tmp_gene, + tmp_t, tmp_monoexon): gene_novelty = [] transcript_novelty = [] @@ -2101,13 +2134,13 @@ def identify_monoexon_transcript(chrom, positions, strand, cursor, location_dict transcript_dict, gene_starts, gene_ends, edge_dict, location_dict, - run_info, cursor, tmp_gene) + run_info, cursor, tmp_gene, tmp_t) if gene_ID == None: # Find best gene match using overlap search if the ISM/NIC check didn't work gene_ID, match_strand = search_for_overlap_with_gene(chrom, positions[0], positions[1], strand, cursor, run_info, tmp_gene, - gene_starts, gene_ends) + tmp_t) # Intergenic case if gene_ID == None: gene_ID = create_gene(chrom, positions[0], positions[-1], @@ -2776,7 +2809,8 @@ def annotate_read(sam_record: pysam.AlignedSegment, cursor, run_info, vertex_2_gene, gene_starts, gene_ends, run_info, - struct_collection.tmp_gene) + struct_collection.tmp_gene, + struct_collection.tmp_t) else: annotation_info = identify_monoexon_transcript(chrom, positions, strand, cursor, location_dict, @@ -2784,6 +2818,7 @@ def annotate_read(sam_record: pysam.AlignedSegment, cursor, run_info, vertex_2_gene, gene_starts, gene_ends, run_info, struct_collection.tmp_gene, + struct_collection.tmp_t, struct_collection.tmp_monoexon) annotation_info.read_ID = read_ID From 34491fc65e501d7a0870307ac35fcc5a319ecc54 Mon Sep 17 00:00:00 2001 From: fairliereese Date: Mon, 2 Oct 2023 16:09:50 -0700 Subject: [PATCH 21/31] fixed incorrect sql col names --- src/talon/talon.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/talon/talon.py b/src/talon/talon.py index 418ee2e..0fadc48 100644 --- a/src/talon/talon.py +++ b/src/talon/talon.py @@ -655,10 +655,10 @@ def search_for_overlap_with_gene(chromosome, start, end, strand, strand FROM $tmp_t WHERE (chromosome = '$chrom') AND - ((start <= $min_start AND end >= $max_end) OR - (start >= $min_start AND end <= $max_end) OR - (start >= $min_start AND start <= $max_end) OR - (end >= $min_start AND end <= $max_end)) + ((min_pos <= $min_start AND max_pos >= $max_end) OR + (min_pos >= $min_start AND max_pos <= $max_end) OR + (min_pos >= $min_start AND min_pos <= $max_end) OR + (max_pos >= $min_start AND max_pos <= $max_end)) GROUP BY gene_ID;""").substitute({'tmp_t': tmp_t, 'chrom': chromosome, 'min_start': min_start, 'max_end': max_end}) 
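For reference, the Template-built IN clause used by these queries just substitutes a pre-formatted parenthesized list into the SQL string. A sketch of that mechanism; format_for_IN below is a stand-in for the query_utils helper the code imports, not its exact implementation:

    from string import Template

    def format_for_IN(ids):
        # Stand-in: render a Python list as a SQL IN list
        return '(' + ','.join(f"'{i}'" for i in ids) + ')'

    query = Template("""SELECT gene_ID, min_pos, max_pos, strand
                        FROM $tmp_t
                        WHERE gene_ID IN $gene_ids""")
    print(query.substitute({'tmp_t': 'temp_t_abc123',
                            'gene_ids': format_for_IN(['G1', 'G2'])}))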
cursor.execute(query) From 94b4e954014e97b2221e41ace5fe0fd60dee650e Mon Sep 17 00:00:00 2001 From: fairliereese Date: Tue, 3 Oct 2023 11:06:46 -0700 Subject: [PATCH 22/31] added tmp transcript db update when new transcripts are created --- src/talon/talon.py | 55 +++++++++++++++++++++++++++------------------- 1 file changed, 32 insertions(+), 23 deletions(-) diff --git a/src/talon/talon.py b/src/talon/talon.py index 0fadc48..9a14db0 100644 --- a/src/talon/talon.py +++ b/src/talon/talon.py @@ -452,13 +452,16 @@ def create_gene(chromosome, start, end, strand, memory_cursor, tmp_gene): return new_ID -def create_transcript(chromosome, start_pos, end_pos, gene_ID, edge_IDs, vertex_IDs, - transcript_dict): - """Creates a novel transcript and adds it to the transcript data structure. +def create_transcript(strand, chromosome, start_pos, end_pos, gene_ID, edge_IDs, vertex_IDs, + transcript_dict, tmp_t, memory_cursor): + """Creates a novel transcript, add it to the transcript data structure, + and add to tmp_t """ print('creating new transcript') new_ID = transcript_counter.increment() print(f'new tid:{new_ID}') + + # updating the dict if len(edge_IDs) > 1: jn_path = ",".join(map(str, edge_IDs[1:-1])) else: @@ -479,6 +482,12 @@ def create_transcript(chromosome, start_pos, end_pos, gene_ID, edge_IDs, vertex_ path_key = frozenset(edge_IDs) transcript_dict[path_key] = new_transcript + # updating tmp_t + new_t = (gene_ID, new_ID, chromosome, strand, min(start_pos, end_pos), max(start_pos, end_pos)) + cols = ' ("gene_ID", "transcript_ID", "chromosome", "strand", "min_pos", "max_pos")' + command = 'INSERT INTO ' + tmp_t + cols + ' VALUES ' + '(?,?,?,?,?,?)' + memory_cursor.execute(command, new_t) + return new_transcript @@ -1030,9 +1039,9 @@ def process_ISM(chrom, positions, strand, edge_IDs, vertex_IDs, all_matches, tra gene_ID = match['gene_ID'] suffix.append(str(match['transcript_ID'])) - novel_transcript = create_transcript(chrom, positions[0], positions[-1], + novel_transcript = create_transcript(strand, chrom, positions[0], positions[-1], gene_ID, edge_IDs, vertex_IDs, - transcript_dict) + transcript_dict, tmp_t, cursor) transcript_ID = novel_transcript['transcript_ID'] @@ -1144,9 +1153,9 @@ def process_NIC(chrom, positions, strand, edge_IDs, vertex_IDs, transcript_dict, start_end_info["vertex_IDs"] = vertex_IDs # Create a new transcript of that gene - novel_transcript = create_transcript(chrom, positions[0], positions[-1], + novel_transcript = create_transcript(strand, chrom, positions[0], positions[-1], gene_ID, edge_IDs, vertex_IDs, - transcript_dict) + transcript_dict, tmp_t, cursor) transcript_ID = novel_transcript["transcript_ID"] novelty = [(transcript_ID, run_info.idprefix, "TALON", "NIC_transcript", "TRUE")] @@ -1327,9 +1336,9 @@ def process_NNC(chrom, positions, strand, edge_IDs, vertex_IDs, transcript_dict, start_end_info["edge_IDs"] = edge_IDs start_end_info["vertex_IDs"] = vertex_IDs - transcript_ID = create_transcript(chrom, positions[0], positions[-1], + transcript_ID = create_transcript(strand, chrom, positions[0], positions[-1], gene_ID, edge_IDs, vertex_IDs, - transcript_dict)["transcript_ID"] + transcript_dict, tmp_t, cursor)["transcript_ID"] novelty.append((transcript_ID, run_info.idprefix, "TALON", "NNC_transcript", "TRUE")) @@ -1392,9 +1401,9 @@ def process_spliced_antisense(chrom, positions, strand, edge_IDs, vertex_IDs, gene_ID = create_gene(chrom, positions[0], positions[-1], strand, cursor, tmp_gene) - transcript_ID = create_transcript(chrom, positions[0], positions[-1], + 
transcript_ID = create_transcript(strand, chrom, positions[0], positions[-1], gene_ID, edge_IDs, vertex_IDs, - transcript_dict)["transcript_ID"] + transcript_dict, tmp_t, cursor)["transcript_ID"] # Handle gene annotations gene_novelty.append((gene_ID, run_info.idprefix, "TALON", @@ -1474,9 +1483,9 @@ def process_remaining_mult_cases(chrom, positions, strand, edge_IDs, vertex_IDs, gene_novelty.append((gene_ID, run_info.idprefix, "TALON", g_nov, "TRUE")) - transcript_ID = create_transcript(chrom, positions[0], positions[-1], + transcript_ID = create_transcript(strand, chrom, positions[0], positions[-1], gene_ID, edge_IDs, vertex_IDs, - transcript_dict)["transcript_ID"] + transcript_dict, tmp_t, cursor)["transcript_ID"] transcript_novelty.append((transcript_ID, run_info.idprefix, "TALON", t_nov, "TRUE")) @@ -1484,9 +1493,9 @@ def process_remaining_mult_cases(chrom, positions, strand, edge_IDs, vertex_IDs, anti_gene_ID = gene_ID gene_ID = create_gene(chrom, positions[0], positions[-1], strand, cursor, tmp_gene) - transcript_ID = create_transcript(chrom, positions[0], positions[-1], + transcript_ID = create_transcript(strand, chrom, positions[0], positions[-1], gene_ID, edge_IDs, vertex_IDs, - transcript_dict)["transcript_ID"] + transcript_dict, tmp_t, cursor)["transcript_ID"] gene_novelty.append((gene_ID, run_info.idprefix, "TALON", "antisense_gene", "TRUE")) @@ -1495,9 +1504,9 @@ def process_remaining_mult_cases(chrom, positions, strand, edge_IDs, vertex_IDs, transcript_novelty.append((transcript_ID, run_info.idprefix, "TALON", "antisense_transcript", "TRUE")) else: - transcript_ID = create_transcript(chrom, positions[0], positions[-1], + transcript_ID = create_transcript(strand, chrom, positions[0], positions[-1], gene_ID, edge_IDs, vertex_IDs, - transcript_dict)["transcript_ID"] + transcript_dict, tmp_t, cursor)["transcript_ID"] transcript_novelty.append((transcript_ID, run_info.idprefix, "TALON", "genomic_transcript", "TRUE")) @@ -2148,9 +2157,9 @@ def identify_monoexon_transcript(chrom, positions, strand, cursor, location_dict gene_novelty.append((gene_ID, run_info.idprefix, "TALON", "intergenic_novel", "TRUE")) - transcript_ID = create_transcript(chrom, positions[0], positions[-1], + transcript_ID = create_transcript(strand, chrom, positions[0], positions[-1], gene_ID, edge_IDs, vertex_IDs, - transcript_dict)["transcript_ID"] + transcript_dict, tmp_t, cursor)["transcript_ID"] transcript_novelty.append((transcript_ID, run_info.idprefix, "TALON", "intergenic_transcript", "TRUE")) # Antisense case @@ -2158,9 +2167,9 @@ def identify_monoexon_transcript(chrom, positions, strand, cursor, location_dict anti_gene_ID = gene_ID gene_ID = create_gene(chrom, positions[0], positions[-1], strand, cursor, tmp_gene) - transcript_ID = create_transcript(chrom, positions[0], positions[-1], + transcript_ID = create_transcript(strand, chrom, positions[0], positions[-1], gene_ID, edge_IDs, vertex_IDs, - transcript_dict)["transcript_ID"] + transcript_dict, tmp_t, cursor)["transcript_ID"] gene_novelty.append((gene_ID, run_info.idprefix, "TALON", "antisense_gene", "TRUE")) @@ -2171,9 +2180,9 @@ def identify_monoexon_transcript(chrom, positions, strand, cursor, location_dict # Same strand else: - transcript_ID = create_transcript(chrom, positions[0], positions[-1], + transcript_ID = create_transcript(strand, chrom, positions[0], positions[-1], gene_ID, edge_IDs, vertex_IDs, - transcript_dict)["transcript_ID"] + transcript_dict, tmp_t, cursor)["transcript_ID"] transcript_novelty.append((transcript_ID, 
run_info.idprefix, "TALON", "genomic_transcript", "TRUE")) From 8135a67f4a278034faeba381a27e92b396eac8f3 Mon Sep 17 00:00:00 2001 From: fairliereese Date: Mon, 9 Oct 2023 10:45:48 -0700 Subject: [PATCH 23/31] started to update with verbosity-tuneable logger --- src/talon/logger.py | 17 +++ src/talon/process_sams.py | 32 +++-- src/talon/talon.py | 240 +++++++++++++++++++++----------------- 3 files changed, 170 insertions(+), 119 deletions(-) create mode 100644 src/talon/logger.py diff --git a/src/talon/logger.py b/src/talon/logger.py new file mode 100644 index 0000000..9d485f0 --- /dev/null +++ b/src/talon/logger.py @@ -0,0 +1,17 @@ +import logging + + +def _init_logger(verbosity): + # https://coralogix.com/blog/python-logging-best-practices-tips/ + # https://stackoverflow.com/questions/14097061/easier-way-to-enable-verbose-logging + + levels = [logging.WARNING, logging.INFO, logging.DEBUG] + level = levels[min(verbosity, len(levels) - 1)] # cap to last level index + + # set defaults + msg_fmt = "%(asctime)s : %(levelname)s : [%(filename)s:%(lineno)d] : %(message)s" + date_fmt = "[ %Y-%m-%d %H:%M:%S ]" + + logging.basicConfig(level=level, + format=msg_fmt, + datefmt=date_fmt) diff --git a/src/talon/process_sams.py b/src/talon/process_sams.py index 4960198..84c8071 100644 --- a/src/talon/process_sams.py +++ b/src/talon/process_sams.py @@ -8,6 +8,7 @@ import pysam import os import time +import logging save = pysam.set_verbosity(0) # pysam.set_verbosity(save) @@ -23,8 +24,11 @@ def convert_to_bam(sam, bam, threads): outfile.write(s) except Exception as e: - print(e) - raise RuntimeError("Problem converting sam file '%s' to bam." % (sam)) + logging.error(e) + msg = f'Problem converting SAM file {sam} to BAM' + logging.error(msg) + raise RuntimeError(msg) + # raise RuntimeError("Problem converting sam file '%s' to bam." % (sam)) def preprocess_sam(sam_files, datasets, use_cb_tag, @@ -90,13 +94,18 @@ def preprocess_sam(sam_files, datasets, use_cb_tag, sorted_bam = tmp_dir + "merged_sorted.bam" pysam.sort("-@", str(n_threads), "-o", sorted_bam, merged_bam) pysam.index(sorted_bam) - ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) - print("[ %s ] Merged input SAM/BAM files" % (ts)) + # ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) + # print("[ %s ] Merged input SAM/BAM files" % (ts)) + logging.info('Merged input SAM/BAM files') except: - raise RuntimeError(("Problem merging and indexing SAM/BAM files. " - "Check your file paths and make sure that all " - "files have headers.")) - + # raise RuntimeError(("Problem merging and indexing SAM/BAM files. " + # "Check your file paths and make sure that all " + # "files have headers.")) + msg = "Problem merging and indexing SAM/BAM files. "+\ + "Check your file paths and make sure that all "+\ + "files have headers." + logging.error(msg) + raise RuntimeError(msg) return sorted_bam @@ -117,8 +126,11 @@ def partition_reads(sam_files, datasets, use_cb_tag, try: gr = pr.read_bam(merged_bam) except Exception as e: - print(e) - raise RuntimeError("Problem opening sam file %s" % (merged_bam)) + # print(e) + logging.error(e) + msg = f'Problem opening SAM file {merged_bam}' + logging.error(msg) + raise RuntimeError(msg) gr = gr.merge(slack=100000000, strand=False) diff --git a/src/talon/talon.py b/src/talon/talon.py index 9a14db0..d85dace 100644 --- a/src/talon/talon.py +++ b/src/talon/talon.py @@ -13,11 +13,14 @@ from pathlib import Path import pandas as pd import warnings +import logging + from . import dstruct from . 
import process_sams as procsams from . import transcript_utils as tutils from . import query_utils as qutils from . import init_refs as init_refs +from . import logger as logger from talon.post import get_read_annotations import pysam from string import Template @@ -120,6 +123,8 @@ def get_args(): parser.add_argument("--tmpDir", dest="tmp_dir", help="Path to directory for tmp files. Default = `talon_tmp/`", type=str, default="talon_tmp/") + parser.add_argument("--verbosity", "-v", type=int, default=1, + help="Verbosity of TALON output. Higher numbers = more verbose.") parser.add_argument("--o", dest="outprefix", help="Prefix for output files", type=str) @@ -294,9 +299,13 @@ def permissive_match_with_gene_priority(chromosome, position, strand, sj_pos, """ # Check inputs if pos_type != "start" and pos_type != "end": - raise ValueError("Please set pos_type to either 'start' or 'end'.") + msg = "Please set pos_type to either 'start' or 'end'." + logging.error(msg) + raise ValueError(msg) if strand != "+" and strand != "-": - raise ValueError("Invalid strand specified: %s" % strand) + msg = f'Invalid strand specified: {strand}' + logging.error(msg) + raise ValueError(msg) # Try exact match first if chromosome in locations and position in locations[chromosome]: @@ -365,9 +374,13 @@ def permissive_vertex_search(chromosome, position, strand, sj_pos, pos_type, return match['location_ID'], dist if pos_type != "start" and pos_type != "end": - raise ValueError("Please set pos_type to either 'start' or 'end'.") + msg = "Please set pos_type to either 'start' or 'end'." + logging.error(msg) + raise ValueError(msg) if strand != "+" and strand != "-": - raise ValueError("Invalid strand specified: %s" % strand) + msg = f"Invalid strand specified: {s}" + logging.error(msg) + raise ValueError(msg) # If there is no strict match, look for vertices that are # (1) On the correct chromosome @@ -627,28 +640,14 @@ def search_for_overlap_with_gene(chromosome, start, end, strand, Parameters: gene_ID (list of str or None): Restrict results to genes in this list """ - print('in search for overlap with gene') + logging.debug('Tiebreaking for gene assignment') min_start = min(start, end) max_end = max(start, end) query_interval = [min_start, max_end] - # print('query interval') - # print(query_interval) - - # query = Template(""" SELECT gene_ID, - # chromosome, - # MIN(start) AS start, - # MAX(end) AS end, - # strand - # FROM $tmp_gene - # WHERE (chromosome = '$chrom') AND - # ((start <= $min_start AND end >= $max_end) OR - # (start >= $min_start AND end <= $max_end) OR - # (start >= $min_start AND start <= $max_end) OR - # (end >= $min_start AND end <= $max_end)) - # GROUP BY gene_ID;""").substitute({'tmp_gene': tmp_gene, 'chrom': chromosome, - # 'min_start': min_start, 'max_end': max_end}) + if isinstance(gene_IDs, list): query = Template("""SELECT gene_ID, + transcript_ID, chromosome, min_pos, max_pos, @@ -658,6 +657,7 @@ def search_for_overlap_with_gene(chromosome, start, end, strand, 'gene_ids': qutils.format_for_IN(gene_IDs)}) elif not gene_IDs: query = Template("""SELECT gene_ID, + transcript_ID, chromosome, min_pos, max_pos, @@ -672,47 +672,33 @@ def search_for_overlap_with_gene(chromosome, start, end, strand, 'min_start': min_start, 'max_end': max_end}) cursor.execute(query) matches = cursor.fetchall() - print('quwewy:') - print(query) # restrict to just the genes we care about if gene_IDs: - print(f'restricting just to {gene_IDs}') + # print(f'restricting just to {gene_IDs}') + logging.debug(f'Restricing gene 
tiebreak to {gene_IDs}') matches = [match for match in matches if match['gene_ID'] in gene_IDs] if len(matches) == 0: - print('herere here') + # print('herere here') + logging.debug(f'Unable to tiebreak') return None, None # Among multiple matches, preferentially return the same-strand gene with # the greatest amount of overlap - # print('start+end') - # print(start) - # print(end) same_strand_matches = len([x for x in matches if x["strand"] == strand]) - # for m in matches: - # print() - # print(m['gene_ID']) - # print(m['start']) - # print(m['end']) - # - # print(same_strand_matches) if strand == "+" and same_strand_matches > 0 or \ strand == "-" and same_strand_matches == 0: matches = [x for x in matches if x["strand"] == "+"] - # best_match = get_best_match(matches, query_interval) best_match = get_best_match(matches, min_start, max_end) else: matches = [x for x in matches if x["strand"] == "-"] - # best_match = get_best_match(matches, query_interval) best_match = get_best_match(matches, min_start, max_end) - print(f"but right here it says {best_match['gene_ID']}") - return best_match['gene_ID'], best_match['strand'] def get_best_match(matches, min_end, max_end): @@ -724,25 +710,24 @@ def get_best_match(matches, min_end, max_end): min_dist = sys.maxsize best_match = None - print(f'read min: {min_end}') - print(f'read end: {max_end}') + # print(f'read min: {min_end}') + # print(f'read end: {max_end}') + logging.debug(f'Read start / end: ({min_end}, {min_end})') for match in matches: - print() - print(f"gene: {match['gene_ID']}") + logging.debug(f"Matching with transcripts from gene {match['gene_ID']}, transcript {match['transcript_ID']}") end_dist = abs(match['max_pos']-max_end) start_dist = abs(match['min_pos']-min_end) - print(f"gene start: {match['min_pos']}") - print(f"gene end: {match['max_pos']}") + logging.debug(f"Transcript start / end: ({match['min_pos']}, {match['max_pos']})") dist = end_dist+start_dist - print(f'dist: {dist}') + logging.debug(f'Distance between read and transcript ends: {dist}') if dist < min_dist: min_dist = dist best_match = match - print('best match') - print(best_match['gene_ID']) + logging.debug(f"Best gene match: {best_match['gene_ID']}") + # print(best_match['gene_ID']) return best_match @@ -1747,7 +1732,9 @@ def check_inputs(options): # Make sure that the input database exists! database = options.database if not Path(database).exists(): - raise ValueError("Database file '%s' does not exist!" % database) + msg = f"Database file '{s}' does not exist!" + logging.error(msg) + raise ValueError(msg) # Make sure that the genome build exists in the provided TALON database. with sqlite3.connect(database) as conn: @@ -1756,8 +1743,10 @@ def check_inputs(options): builds = [str(x[0]) for x in cursor.fetchall()] if options.build not in builds: build_names = ", ".join(list(builds)) - raise ValueError("Please specify a genome build that exists in the" + - " database. The choices are: " + build_names) + msg = "Please specify a genome build that exists in the" +\ + " database. The choices are: " + build_names + logging.error(msg) + raise ValueError(msg) # Make sure that each input dataset is not already in the database, and # also make sure that each dataset name is unique @@ -1774,33 +1763,37 @@ def check_inputs(options): line = line.strip().split(',') curr_sam = line[3] if len(line) != 4: - raise ValueError('Incorrect number of comma-separated fields' + - ' in config file. 
There should be four: ' + - '(dataset name, sample description, ' + - 'platform, associated sam/bam file).') + msg = 'Incorrect number of comma-separated fields' +\ + ' in config file. There should be four: ' +\ + '(dataset name, sample description, ' +\ + 'platform, associated sam/bam file).' + logging.error(msg) + raise ValueError(msg) # Make sure that the sam file exists if not Path(curr_sam).exists(): - raise ValueError( - "SAM/BAM file '%s' does not exist!" % curr_sam) + msg = f"SAM/BAM file '{curr_sam}' does not exist!" + logging.error(msg) + raise ValueError(msg) metadata = (line[0], line[1], line[2]) dataname = metadata[0] if dataname in existing_datasets: - warnings.warn("Ignoring dataset with name '" + dataname + + logging.warning("Ignoring dataset with name '" + dataname + "' because it is already in the database.") elif dataname in curr_datasets: - warnings.warn("Skipping duplicated instance of dataset '" + + logging.warning("Skipping duplicated instance of dataset '" + dataname + "'.") elif curr_sam in sam_files: - warnings.warn("Skipping duplicated instance of sam file '" + + logging.warning("Skipping duplicated instance of sam file '" + curr_sam + "'.") else: dataset_metadata.append(metadata) curr_datasets.append(dataname) if not curr_sam.endswith(".sam") and not curr_sam.endswith(".bam"): - raise ValueError( - 'Last field in config file must be a .sam/.bam file') + msg = 'Last field in config file must be a .sam/.bam file' + logging.error(msg) + raise ValueError(msg) sam_files.append(curr_sam) # if we are using the RG tag, check that the config file adheres to the @@ -1825,15 +1818,18 @@ def check_inputs(options): line = line.strip().split(',') curr_sam = line[2] if len(line) != 3: - raise ValueError('Incorrect number of comma-separated fields' + - ' in config file. There should be three: ' + - '(sample description, ' + - 'platform, associated sam/bam file).') + msg = 'Incorrect number of comma-separated fields' +\ + ' in config file. There should be three: ' +\ + '(sample description, ' +\ + 'platform, associated sam/bam file).' + logging.error(msg) + raise ValueError(msg) # Make sure that the sam file exists if not Path(curr_sam).exists(): - raise ValueError( - "SAM/BAM file '%s' does not exist!" % curr_sam) + msg = f"SAM/BAM file '{curr_sam}' does not exist!" + logging.error(msg) + raise ValueError(msg) metadata = ['', line[0], line[1]] # get list of dataset names from the CB tag in the sam file @@ -1854,8 +1850,9 @@ def check_inputs(options): names=['cb_tag'], engine='python') # is the df empty? if df.empty: - raise RuntimeError( - "SAM/BAM file contains no CB tags") + msg = 'SAM/BAM file contains no CB tags' + logging.error(msg) + raise RuntimeError(msg) df['dataset'] = df.cb_tag.str.split( pat='\t', n=1, expand=True)[0] datasets = df.dataset.unique().tolist() @@ -1876,24 +1873,29 @@ def check_inputs(options): metadata[0] = dataname if dataname in existing_datasets: - raise RuntimeError((f"Dataset for read group {f} " + - "already in database.")) + msg = f"Dataset for read group {f} " +\ + "already in database." + logging.error(msg) + raise RuntimeError(msg) # warnings.warn("Ignoring dataset with name '" + dataname + \ # "' because it is already in the database.") elif dataname in curr_datasets: - raise RuntimeError((f"Dataset for read group {f} " + - "already in config file.")) + msg = f"Dataset for read group {f} " +\ + "already in config file." 
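The verbosity handling introduced with the new logger module maps the integer CLI flag onto logging levels and caps the index, so any --verbosity above 2 still selects DEBUG, while the default of 1 gives INFO. A compact illustration of that mapping:

    import logging

    levels = [logging.WARNING, logging.INFO, logging.DEBUG]
    for verbosity in range(5):
        level = levels[min(verbosity, len(levels) - 1)]  # cap at the last index
        print(verbosity, logging.getLevelName(level))
    # 0 WARNING / 1 INFO / 2 DEBUG / 3 DEBUG / 4 DEBUG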
+ logging.error(msg) + raise RuntimeError(msg) # warnings.warn("Skipping duplicated instance of dataset '" + \ # dataname + "'.") else: dataset_metadata.append(tuple(metadata)) curr_datasets.append(dataname) if curr_sam in sam_files: - warnings.warn("Skipping duplicated instance of sam/bam file '" + + logging.warning("Skipping duplicated instance of sam/bam file '" + curr_sam + "'.") if not curr_sam.endswith(".sam") and not curr_sam.endswith(".bam"): - raise ValueError( - 'Last field in config file must be a .sam/.bam file') + msg = 'Last field in config file must be a .sam/.bam file' + logging.error(msg) + raise ValueError(msg) sam_files.append(curr_sam) # else: @@ -1904,8 +1906,10 @@ def check_inputs(options): # sam_files.append(curr_sam) if sam_files == []: - raise RuntimeError(("All of the provided dataset names are already in " - "the database. Please check your config file.")) + msg = "All of the provided dataset names are already in "+\ + "the database. Please check your config file." + logging.error(msg) + raise RuntimeError(msg) return sam_files, dataset_metadata @@ -2062,7 +2066,9 @@ def compute_delta(orig_pos, new_pos, strand): else: return -1*abs_dist else: - raise ValueError("Strand must be either + or -") + msg = 'Strand must be either + or -' + logging.error(msg) + raise ValueError(msg) def identify_monoexon_transcript(chrom, positions, strand, cursor, location_dict, @@ -2445,8 +2451,10 @@ def batch_add_annotations(cursor, annot_file, annot_type, batch_size): """ batch_size = 1 if annot_type not in ["gene", "transcript", "exon"]: - raise ValueError("When running batch annot update, must specify " + - "annot_type as 'gene', 'exon', or 'transcript'.") + msg = "When running batch annot update, must specify " +\ + "annot_type as 'gene', 'exon', or 'transcript'." + logging.error(msg) + raise ValueError(msg) with open(annot_file, 'r') as f: while True: @@ -2566,8 +2574,9 @@ def batch_add_abundance(cursor, entries, batch_size): def check_database_integrity(cursor): """ Perform some checks on the database. Run before committing changes""" - ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) - print("[ %s ] Validating database........" % (ts)) + # ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) + # print("[ %s ] Validating database........" % (ts)) + logging.info('Validating database') # For each category, check that the number of table entries matches the counter counter_query = "SELECT * FROM counters" @@ -2588,15 +2597,17 @@ def check_database_integrity(cursor): if actual_count != curr_counter: fail = 1 - print("Database counter for '" + table_name + + logging.error("Database counter for '" + table_name + "' does not match the number of entries in the table." + " Discarding changes to database and exiting...") - print("table_count: " + str(actual_count)) - print("counter_value: " + str(curr_counter)) + logging.debug("table_count: " + str(actual_count)) + logging.debug("counter_value: " + str(curr_counter)) if fail == 1: - raise RuntimeError("Discrepancy found in database. " + - "Discarding changes to database and exiting...") + msg = "Discrepancy found in database. " +\ + "Discarding changes to database and exiting..." + logging.error(msg) + raise RuntimeError(msg) return @@ -2609,9 +2620,10 @@ def parallel_talon(read_file, interval, database, run_info, queue): added to the database, OR alternately, pickle them and write to file where they can be accessed later. 
""" - ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) - print("[ %s ] Annotating reads in interval %s:%d-%d..." % - (ts, interval[0], interval[1], interval[2])) + # ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) + # print("[ %s ] Annotating reads in interval %s:%d-%d..." % + # (ts, interval[0], interval[1], interval[2])) + logging.info(f'Annotating reads in interval {interval[0]}:{interval[1]}-{interval[2]}...') with sqlite3.connect(database) as conn: conn.row_factory = sqlite3.Row @@ -2887,8 +2899,9 @@ def listener(queue, outfiles, QC_header, timeout=72): msg_fname = msg[0] msg_value = msg[1] if datetime.now() > wait_until or msg_value == 'complete': - ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) - print("[ %s ] Shutting down message queue..." % (ts)) + # ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) + # print("[ %s ] Shutting down message queue..." % (ts)) + logging.info('Shutting down message queue...') for f in open_files.values(): f.close() break @@ -2914,10 +2927,14 @@ def make_QC_header(coverage, identity, length): def main(): """ Runs program """ - ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) - print("[ %s ] Started TALON run" % (ts)) - options = get_args() + logger._init_logger(options.verbosity) + + # ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) + # print("[ %s ] Started TALON run" % (ts)) + logging.info('Started TALON run') + + sam_files, dset_metadata = check_inputs(options) # print(sam_files) # print(dset_metadata[:5]) @@ -2969,8 +2986,8 @@ def main(): read_files = procsams.write_reads_to_file( read_groups, intervals, header_file, tmp_dir=tmp_dir) - ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) - print("[ %s ] Split reads into %d intervals" % (ts, len(read_groups))) + # ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) + logging.info(f'Split reads into {len(read_groups)} intervals') # Set up a queue specifically for writing to outfiles manager = mp.Manager() @@ -2981,8 +2998,9 @@ def main(): for read_file, interval in zip(read_files, intervals): jobs.append((read_file, interval, database, run_info, queue)) - ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) - print("[ %s ] Launching parallel annotation jobs" % (ts)) + # ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) + # print("[ %s ] Launching parallel annotation jobs" % (ts)) + logging.info('Launching parallel annotation jobs') # Start running listener, which will monitor queue for messages QC_header = make_QC_header(run_info.min_coverage, run_info.min_identity, @@ -2998,19 +3016,22 @@ def main(): pool.close() pool.join() - ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) - print("[ %s ] All jobs complete. Starting database update." % (ts)) + # ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) + # print("[ %s ] All jobs complete. Starting database update." % (ts)) + logging.info('All jobs complete. Starting database update') # Update the database batch_size = 10000 update_database(database, batch_size, run_info.outfiles, dataset_db_entries) - ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) - print("[ %s ] Database update complete." % (ts)) + # ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) + # print("[ %s ] Database update complete." % (ts)) + logging.info('Database update complete.') # Write output reads file - ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) - print("[ %s ] Creating read-wise annotation file." 
% (ts)) + # ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) + # print("[ %s ] Creating read-wise annotation file." % (ts)) + logging.info('Creating read-wise annotation file') get_read_annotations.make_read_annot_file(database, build, outprefix, datasets=datasets) @@ -3019,8 +3040,9 @@ def main(): #print("Transcripts: %d" % transcript_counter.value()) #print("Observed: %d" % observed_counter.value()) - ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) - print("[ %s ] DONE" % (ts)) + # ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) + # print("[ %s ] DONE" % (ts)) + logging.info('DONE') if __name__ == '__main__': From da547a1cd3d4c704eede27e384db77958555448c Mon Sep 17 00:00:00 2001 From: fairliereese Date: Mon, 9 Oct 2023 10:50:01 -0700 Subject: [PATCH 24/31] formatting changes --- src/talon/dstruct.py | 7 +- src/talon/edge.py | 125 +- src/talon/gene.py | 136 +- src/talon/init_refs.py | 233 +- src/talon/initialize_talon_database.py | 518 ++-- src/talon/length_utils.py | 34 +- src/talon/logger.py | 4 +- src/talon/post/ab_utils.py | 64 +- src/talon/post/call_longest_ends.py | 293 ++- .../create_GTF_abundance_from_database.py | 111 +- src/talon/post/create_GTF_from_database.py | 326 ++- .../create_abundance_file_from_database.py | 165 +- .../post/create_anndata_from_database.py | 254 +- src/talon/post/filter_talon_transcripts.py | 357 +-- src/talon/post/generate_talon_report.py | 6 +- src/talon/post/get_read_annotations.py | 395 ++- src/talon/post/get_transcript_sjs.py | 712 ++--- .../post/map_antisense_genes_to_sense.py | 50 +- src/talon/post/post_utils.py | 22 +- src/talon/post/summarize_datasets.py | 113 +- src/talon/process_sams.py | 90 +- src/talon/query_utils.py | 304 ++- src/talon/reformat_gtf.py | 300 ++- src/talon/talon.py | 2339 +++++++++-------- src/talon/talon_label_reads.py | 234 +- src/talon/transcript.py | 253 +- src/talon/transcript_utils.py | 179 +- 27 files changed, 4194 insertions(+), 3430 deletions(-) diff --git a/src/talon/dstruct.py b/src/talon/dstruct.py index da32d7e..b922922 100644 --- a/src/talon/dstruct.py +++ b/src/talon/dstruct.py @@ -3,10 +3,11 @@ class Struct(dict): Make a dict behave as a struct. Example: - + test = Struct(a=1, b=2, c=3) """ - def __init__(self,**kw): - dict.__init__(self,kw) + + def __init__(self, **kw): + dict.__init__(self, kw) self.__dict__ = self diff --git a/src/talon/edge.py b/src/talon/edge.py index 0fccecb..8f2ebd9 100644 --- a/src/talon/edge.py +++ b/src/talon/edge.py @@ -1,29 +1,29 @@ # TALON: Techonology-Agnostic Long Read Analysis Pipeline # Author: Dana Wyman -#------------------------------------------------------------------------------ +# ------------------------------------------------------------------------------ + class Edge(object): """Stores information about an edge, including its location - and the gene/transcript(s) it belongs to. - Attributes: - identifier: Accession ID of the edge - gene: Accession ID of the gene that the edge belongs to - transcript_ids: Set of transcript accession IDs that the edge - belongs to - chromosome: Chromosome that the transcript is located on - (format "chr1") - start: The start position of the edge with respect to the - forward strand - end: The end position of the edge with respect to the - forward strand - strand: "+" if the edge is on the forward strand, and "-" if - it is on the reverse strand - - length: The length of the edge + and the gene/transcript(s) it belongs to. 
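The Struct class shown above makes attribute access and key access interchangeable by aliasing the instance's __dict__ to itself. A short runnable example, repeating the class so the snippet stands alone:

    class Struct(dict):
        def __init__(self, **kw):
            dict.__init__(self, kw)
            self.__dict__ = self  # attribute and key access share storage

    test = Struct(a=1, b=2, c=3)
    print(test.a, test['a'])  # 1 1
    test.d = 4                # attribute writes also add dict keys
    print(test['d'])          # 4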
+ Attributes: + identifier: Accession ID of the edge + gene: Accession ID of the gene that the edge belongs to + transcript_ids: Set of transcript accession IDs that the edge + belongs to + chromosome: Chromosome that the transcript is located on + (format "chr1") + start: The start position of the edge with respect to the + forward strand + end: The end position of the edge with respect to the + forward strand + strand: "+" if the edge is on the forward strand, and "-" if + it is on the reverse strand + + length: The length of the edge """ - def __init__(self, identifier, chromosome, start, end, strand, gene_id, - transcript_id, annotations): + def __init__(self, identifier, chromosome, start, end, strand, gene_id, transcript_id, annotations): self.chromosome = str(chromosome) self.gene_id = gene_id self.start = int(start) @@ -40,27 +40,27 @@ def __init__(self, identifier, chromosome, start, end, strand, gene_id, self.v2 = None def print_edge(self): - """ Prints a string representation of the edge""" - print(self.identifier + ": " + self.chromosome + ":" + \ - str(self.start) + "-" + str(self.end)) + """Prints a string representation of the edge""" + print(self.identifier + ": " + self.chromosome + ":" + str(self.start) + "-" + str(self.end)) print(self.transcript_ids) return + def create_edge_from_gtf(edge_info): - """ Creates an edge object using information from a GTF entry - Args: - edge_info: A list containing fields from a GTF file edge entry. - Example: - ['chr1', 'HAVANA', 'exon', '11869', '12227', '.', '+', '.', - 'gene_id "ENSG00000223972.5"; transcript_id "ENST00000456328.2"; - gene_type "transcribed_unprocessed_pseudogene"; - gene_status "KNOWN"; gene_name "DDX11L1"; - transcript_type "processed_transcript"; - transcript_status "KNOWN"; transcript_name "DDX11L1-002"; - edge_number 1; edge_id "ENSE00002234944.1"; level 2; - tag "basic"; transcript_support_level "1"; - havana_gene "OTTHUMG00000000961.2"; - havana_transcript "OTTHUMT00000362751.1";'] + """Creates an edge object using information from a GTF entry + Args: + edge_info: A list containing fields from a GTF file edge entry. 
+ Example: + ['chr1', 'HAVANA', 'exon', '11869', '12227', '.', '+', '.', + 'gene_id "ENSG00000223972.5"; transcript_id "ENST00000456328.2"; + gene_type "transcribed_unprocessed_pseudogene"; + gene_status "KNOWN"; gene_name "DDX11L1"; + transcript_type "processed_transcript"; + transcript_status "KNOWN"; transcript_name "DDX11L1-002"; + edge_number 1; edge_id "ENSE00002234944.1"; level 2; + tag "basic"; transcript_support_level "1"; + havana_gene "OTTHUMG00000000961.2"; + havana_transcript "OTTHUMT00000362751.1";'] """ description = edge_info[-1] start = int(edge_info[3]) @@ -71,38 +71,39 @@ def create_edge_from_gtf(edge_info): annotations = extract_edge_annotations_from_GTF(edge_info) if "exon_id" not in annotations: annotations["exon_id"] = "_".join([chromosome, str(start), str(end), strand]) - gene_id = annotations['gene_id'] - transcript_id = annotations['transcript_id'] + gene_id = annotations["gene_id"] + transcript_id = annotations["transcript_id"] edge_id = "_".join([chromosome, str(start), str(end), strand]) if "gene_id" in description: gene_id = (description.split("gene_id ")[1]).split('"')[1] if "transcript_id" in description: transcript_id = (description.split("transcript_id ")[1]).split('"')[1] - - edge = Edge(edge_id, chromosome, start, end, strand, gene_id, transcript_id, - annotations) + + edge = Edge(edge_id, chromosome, start, end, strand, gene_id, transcript_id, annotations) return edge + def extract_edge_annotations_from_GTF(tab_fields): - """ Extracts key-value annotations from the GTF description field - """ + """Extracts key-value annotations from the GTF description field""" attributes = {} # remove trailing newline and split by semicolon - description = tab_fields[-1].strip('\n') - description = description.split(';') + description = tab_fields[-1].strip("\n") + description = description.split(";") # Parse description for fields in description: - if fields == "" or fields == " ": continue + if fields == "" or fields == " ": + continue fields = fields.split() - if fields[0] == '': fields = fields[1:] + if fields[0] == "": + fields = fields[1:] + + key = fields[0].replace('"', "") + val = " ".join(fields[1:]).replace('"', "") - key = fields[0].replace('"', '') - val = ' '.join(fields[1:]).replace('"', '') - attributes[key] = val # Put in placeholders for important attributes (such as gene_id) if they @@ -116,29 +117,29 @@ def extract_edge_annotations_from_GTF(tab_fields): return attributes + def get_edge_from_db(vertex_info_1, vertex_info_2): - """ Uses information from a database edge entry to create an edge object. 
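# Usage sketch for create_edge_from_gtf() above: it expects the nine
# tab-separated fields of a GTF exon line, pre-split by the caller. The
# entry reuses the docstring's example values; the import path is assumed:
from talon.edge import create_edge_from_gtf

gtf_fields = ("chr1\tHAVANA\texon\t11869\t12227\t.\t+\t.\t"
              'gene_id "ENSG00000223972.5"; transcript_id "ENST00000456328.2";'
              ).split("\t")
edge = create_edge_from_gtf(gtf_fields)
print(edge.identifier, edge.gene_id)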
- """ + """Uses information from a database edge entry to create an edge object.""" if vertex_info_1["edge_id"] != vertex_info_2["edge_id"]: - raise ValueError('Tried to create edge from endpoints with different IDs') + raise ValueError("Tried to create edge from endpoints with different IDs") edge_id = vertex_info_1["edge_id"] - chromosome = vertex_info_1['chromosome'] - start = min(vertex_info_1['position'], vertex_info_2['position']) - end = max(vertex_info_1['position'], vertex_info_2['position']) - strand = vertex_info_1['strand'] - gene_id = vertex_info_1['gene_id'] + chromosome = vertex_info_1["chromosome"] + start = min(vertex_info_1["position"], vertex_info_2["position"]) + end = max(vertex_info_1["position"], vertex_info_2["position"]) + strand = vertex_info_1["strand"] + gene_id = vertex_info_1["gene_id"] edge = Edge(edge_id, chromosome, start, end, strand, gene_id, None, None) edge.v1 = str(vertex_info_1["vertex_ID"]) edge.v2 = str(vertex_info_2["vertex_ID"]) return edge + def create_novel_edge(chromosome, start, end, strand, gene_id, transcript_id, counter): - """ Creates a novel edge with a unique identifier (obtained using - counter). Returns the edge object as well as the updated counter. + """Creates a novel edge with a unique identifier (obtained using + counter). Returns the edge object as well as the updated counter. """ counter["edges"] += 1 curr_novel = counter["edges"] - edge = Edge(curr_novel, chromosome, start, end, strand, gene_id, transcript_id, - None) + edge = Edge(curr_novel, chromosome, start, end, strand, gene_id, transcript_id, None) return edge diff --git a/src/talon/gene.py b/src/talon/gene.py index 52121ea..1f8188f 100644 --- a/src/talon/gene.py +++ b/src/talon/gene.py @@ -1,23 +1,24 @@ # TALON: Techonology-Agnostic Long Read Analysis Pipeline # Author: Dana Wyman -#------------------------------------------------------------------------------ +# ------------------------------------------------------------------------------ + class Gene(object): - """ Contains high-level information about a gene, such as its identifiers, - genomic location, and transcripts. Does not contain exon information. - Attributes: - - identifier: Accession ID of gene, i.e. an Ensembl ID. Required. - - name: Human-readable name of the gene. This attribute can be left - empty if the gene does not have an assigned name. - - chromosome: Chromosome that the gene is located on (format "chr1") - - start: The start position of the gene with respect to the forward - strand (int). Should always be less than or equal to end. - - end: The end position of the gene with respect to the forward strand - (int). Should always be greater than or equal to start. - - strand: "+" if the gene is on the forward strand, "-" if it is on - the reverse strand - - annotations: a dictionary of miscellaneous annotation categories - extracted from a GTF + """Contains high-level information about a gene, such as its identifiers, + genomic location, and transcripts. Does not contain exon information. + Attributes: + - identifier: Accession ID of gene, i.e. an Ensembl ID. Required. + - name: Human-readable name of the gene. This attribute can be left + empty if the gene does not have an assigned name. + - chromosome: Chromosome that the gene is located on (format "chr1") + - start: The start position of the gene with respect to the forward + strand (int). Should always be less than or equal to end. + - end: The end position of the gene with respect to the forward strand + (int). 
Should always be greater than or equal to start. + - strand: "+" if the gene is on the forward strand, "-" if it is on + the reverse strand + - annotations: a dictionary of miscellaneous annotation categories + extracted from a GTF """ @@ -35,51 +36,48 @@ def __init__(self, identifier, chromosome, start, end, strand, annotations): self.annotations = annotations if start > end: - raise ValueError("""Plus strand gene start must be less than or - equal to end.""") + raise ValueError( + """Plus strand gene start must be less than or + equal to end.""" + ) def set_name(self, name): - """ Sets the name attribute of the Gene to the provided value. - """ - self.annotations['name'] = name + """Sets the name attribute of the Gene to the provided value.""" + self.annotations["name"] = name return def add_transcript(self, transcript): - """ Adds a key-value pair (transcript identifier -> Transcript oject) - to the gene's transcript dictionary - Args: - transcript: object of type Transcript. Must overlap with the - location of the gene. + """Adds a key-value pair (transcript identifier -> Transcript oject) + to the gene's transcript dictionary + Args: + transcript: object of type Transcript. Must overlap with the + location of the gene. """ if transcript.start >= self.end or transcript.end <= self.start: - # only throw the error if we have a multi-bp transcript if transcript.start != transcript.end: transcript_id = transcript.identifier gene_id = transcript.gene_id - raise ValueError(f'Transcript ({transcript_id}) must overlap the gene ({gene_id}) it is assigned to') + raise ValueError(f"Transcript ({transcript_id}) must overlap the gene ({gene_id}) it is assigned to") if transcript.gene_id == self.identifier: # In order to belong to a gene, the transcript gene_id must match transcript_id = transcript.identifier self.transcripts[transcript_id] = transcript else: - raise ValueError('Gene ID of transcript must match gene ' + \ - 'in order for assignment to be made.') + raise ValueError("Gene ID of transcript must match gene " + "in order for assignment to be made.") return - def print_gene(self): - """ Print a string representation of the Gene. Good for debugging. """ + """Print a string representation of the Gene. Good for debugging.""" if "name" in self.annotations != "": # Include name in output if there is one - print(self.identifier + " (" + self.annotations['name'] + "):") + print(self.identifier + " (" + self.annotations["name"] + "):") else: print(self.identifier + ":") - print("\tLocation: " + self.chromosome + ":" + str(self.start) + "-" + \ - str(self.end) + "(" + self.strand + ")") + print("\tLocation: " + self.chromosome + ":" + str(self.start) + "-" + str(self.end) + "(" + self.strand + ")") # Print transcripts in shorthand for transcript in self.transcripts: @@ -87,37 +85,38 @@ def print_gene(self): return + def get_gene_from_db(gene_start_row, gene_end_row): - """ Uses information from a database gene entry to create a + """Uses information from a database gene entry to create a Gene object. 
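# A short sketch of the constraints Gene() above enforces (IDs are
# hypothetical; the import path assumes this package layout):
from talon.gene import Gene

gene = Gene("gene1", "chr1", 1000, 5000, "+", {})
gene.set_name("MYGENE")

# The constructor rejects coordinates where start > end:
try:
    Gene("gene2", "chr1", 5000, 1000, "+", {})
except ValueError as err:
    print(err)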
Args: gene_row: Tuple-formatted row from 'genes' table of a TALON database """ - if gene_start_row['gene_id'] != gene_end_row['gene_id']: - raise ValueError("get_gene_from_db: provided start and stop " + \ - "come from different genes") - gene_id = gene_start_row['gene_ID'] - chromosome = gene_start_row['chromosome'] + if gene_start_row["gene_id"] != gene_end_row["gene_id"]: + raise ValueError("get_gene_from_db: provided start and stop " + "come from different genes") + gene_id = gene_start_row["gene_ID"] + chromosome = gene_start_row["chromosome"] start = gene_start_row[2] end = gene_end_row[2] - strand = gene_start_row['strand'] + strand = gene_start_row["strand"] - #transcripts = {} #gene_row['transcript_ids'].split(",") + # transcripts = {} #gene_row['transcript_ids'].split(",") gene = Gene(gene_id, chromosome, start, end, strand, {}) return gene + def get_gene_from_gtf(gene_info): - """ Creates a Gene object from a GTF file entry - Args: - gene_info: A list containing fields from a GTF file gene entry. - Example: - ['chr1', 'HAVANA', 'gene', '11869', '14409', '.', '+', '.', - 'gene_id "ENSG00000223972.5"; - gene_type "transcribed_unprocessed_pseudogene"; - gene_status "KNOWN"; gene_name "DDX11L1"; level 2; - havana_gene "OTTHUMG00000000961.2";'] + """Creates a Gene object from a GTF file entry + Args: + gene_info: A list containing fields from a GTF file gene entry. + Example: + ['chr1', 'HAVANA', 'gene', '11869', '14409', '.', '+', '.', + 'gene_id "ENSG00000223972.5"; + gene_type "transcribed_unprocessed_pseudogene"; + gene_status "KNOWN"; gene_name "DDX11L1"; level 2; + havana_gene "OTTHUMG00000000961.2";'] """ chromosome = gene_info[0] start = int(gene_info[3]) @@ -125,31 +124,34 @@ def get_gene_from_gtf(gene_info): strand = gene_info[6] annotations = extract_gene_annotations_from_GTF(gene_info) if "gene_id" not in gene_info[-1]: - raise ValueError('GTF entry lacks a gene_id field') - gene_id = annotations['gene_id'] + raise ValueError("GTF entry lacks a gene_id field") + gene_id = annotations["gene_id"] gene = Gene(gene_id, chromosome, start, end, strand, annotations) return gene + def extract_gene_annotations_from_GTF(tab_fields): """Parses the description field of a gene GTF in order to organize the - information therein into a dictionary. + information therein into a dictionary. """ attributes = {} # remove trailing newline and split by semicolon - description = tab_fields[-1].strip('\n') - description = description.split(';') + description = tab_fields[-1].strip("\n") + description = description.split(";") # Parse description for fields in description: - if fields == "" or fields == " ": continue + if fields == "" or fields == " ": + continue fields = fields.split() - if fields[0] == '': fields = fields[1:] + if fields[0] == "": + fields = fields[1:] - key = fields[0].replace('"', '') - val = ' '.join(fields[1:]).replace('"', '') + key = fields[0].replace('"', "") + val = " ".join(fields[1:]).replace('"', "") attributes[key] = val @@ -157,10 +159,11 @@ def extract_gene_annotations_from_GTF(tab_fields): return attributes + def get_gene_from_exon(exon, gene_id): - """ In rare cases, GTF exons are listed with gene and transcript IDs that - do not have corresponding entries. In this case, we create a gene - for this exon for bookkeeping purposes.""" + """In rare cases, GTF exons are listed with gene and transcript IDs that + do not have corresponding entries. 
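# Usage sketch for get_gene_from_gtf() above, reusing its docstring's
# example entry; like the edge parser, it takes a GTF line pre-split into
# its nine tab-separated fields (import path assumed):
from talon.gene import get_gene_from_gtf

gene_fields = ("chr1\tHAVANA\tgene\t11869\t14409\t.\t+\t.\t"
               'gene_id "ENSG00000223972.5"; gene_name "DDX11L1";').split("\t")
gene = get_gene_from_gtf(gene_fields)
print(gene.identifier, gene.chromosome, gene.start, gene.end, gene.strand)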
In this case, we create a gene + for this exon for bookkeeping purposes.""" gene_name = gene_id chromosome = exon.chromosome @@ -170,9 +173,10 @@ def get_gene_from_exon(exon, gene_id): gene = Gene(gene_id, gene_name, None, chromosome, start, end, strand) return gene + def create_novel_gene(chromosome, start, end, strand, counter): - """ Creates a novel gene with a unique identifier (obtained using - counter). Returns the gene object as well as the updated counter. + """Creates a novel gene with a unique identifier (obtained using + counter). Returns the gene object as well as the updated counter. """ gene_id = str(counter["genes"] + 1) counter["genes"] += 1 diff --git a/src/talon/init_refs.py b/src/talon/init_refs.py index df7f86f..735a174 100644 --- a/src/talon/init_refs.py +++ b/src/talon/init_refs.py @@ -13,21 +13,23 @@ # make_gene_start_and_end_dict from string import Template + import pandas as pd -def make_temp_novel_gene_table(cursor, build, chrom = None, start = None, - end = None, tmp_tab = "temp_gene"): - """ Attaches a temporary database with a table that has the following fields: - - gene_ID - - chromosome - - start - - end - - strand - The purpose is to track novel genes from this run in order to match - transcripts to them when other forms of gene assignment have failed. + +def make_temp_novel_gene_table(cursor, build, chrom=None, start=None, end=None, tmp_tab="temp_gene"): + """Attaches a temporary database with a table that has the following fields: + - gene_ID + - chromosome + - start + - end + - strand + The purpose is to track novel genes from this run in order to match + transcripts to them when other forms of gene assignment have failed. """ if any(val == None for val in [chrom, start, end]): - command = Template(""" CREATE TEMPORARY TABLE IF NOT EXISTS $tmp_tab AS + command = Template( + """ CREATE TEMPORARY TABLE IF NOT EXISTS $tmp_tab AS SELECT gene_ID, chromosome, start, @@ -42,9 +44,11 @@ def make_temp_novel_gene_table(cursor, build, chrom = None, start = None, LEFT JOIN vertex as v ON g.gene_ID = v.gene_ID LEFT JOIN location as loc ON loc.location_ID = v.vertex_ID WHERE loc.genome_build = '$build' - GROUP BY g.gene_ID); """) + GROUP BY g.gene_ID); """ + ) else: - command = Template(""" CREATE TEMPORARY TABLE IF NOT EXISTS $tmp_tab AS + command = Template( + """ CREATE TEMPORARY TABLE IF NOT EXISTS $tmp_tab AS SELECT gene_ID, chromosome, start, @@ -64,29 +68,29 @@ def make_temp_novel_gene_table(cursor, build, chrom = None, start = None, AND ((start <= $start AND end >= $end) OR (start >= $start AND end <= $end) OR (start >= $start AND start <= $end) - OR (end >= $start AND end <= $end)); """) + OR (end >= $start AND end <= $end)); """ + ) - command = command.substitute({'tmp_tab':tmp_tab, 'build':build, 'chrom':chrom, - 'start':start, 'end':end}) + command = command.substitute({"tmp_tab": tmp_tab, "build": build, "chrom": chrom, "start": start, "end": end}) cursor.execute(command) return tmp_tab -def make_temp_transcript_table(cursor, build, chrom = None, - start = None, end = None, - tmp_tab = "temp_transcript"): - """ Attaches a temporary database with a table that has the following fields: - - gene_ID - - transcript_ID - - chromosome - - start (min position) - - end (max position) - - strand - The purpose is to allow location-based matching tiebreaking - transcripts. 
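# create_novel_gene() above draws sequential novel IDs from a shared mutable
# counter dict; create_novel_edge() in edge.py increments the same dict
# under its "edges" key. A sketch of the calling convention (only the
# counter side effect is shown, since the constructor call is elided above):
counter = {"genes": 0, "edges": 0}
create_novel_gene("chr1", 2000, 3000, "+", counter)
print(counter["genes"])  # -> 1
# Aside: get_gene_from_exon() above passes seven arguments to Gene(), whose
# constructor takes (identifier, chromosome, start, end, strand,
# annotations); that call would raise a TypeError if it were ever reached.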
""" + +def make_temp_transcript_table(cursor, build, chrom=None, start=None, end=None, tmp_tab="temp_transcript"): + """Attaches a temporary database with a table that has the following fields: + - gene_ID + - transcript_ID + - chromosome + - start (min position) + - end (max position) + - strand + The purpose is to allow location-based matching tiebreaking + transcripts.""" if any(val == None for val in [chrom, start, end]): - command = Template(""" CREATE TEMPORARY TABLE IF NOT EXISTS $tmp_tab AS + command = Template( + """ CREATE TEMPORARY TABLE IF NOT EXISTS $tmp_tab AS SELECT t.gene_ID, t.transcript_ID, loc1.chromosome, @@ -101,9 +105,11 @@ def make_temp_transcript_table(cursor, build, chrom = None, LEFT JOIN genes ON genes.gene_ID = t.gene_ID WHERE loc1.genome_build = '$build' - AND loc2.genome_build = '$build' """) + AND loc2.genome_build = '$build' """ + ) else: - command = Template(""" CREATE TEMPORARY TABLE IF NOT EXISTS $tmp_tab AS + command = Template( + """ CREATE TEMPORARY TABLE IF NOT EXISTS $tmp_tab AS SELECT t.gene_ID, t.transcript_ID, loc1.chromosome, @@ -124,30 +130,29 @@ def make_temp_transcript_table(cursor, build, chrom = None, AND ((min_pos <= $start AND max_pos >= $end) OR (min_pos >= $start AND max_pos <= $end) OR (min_pos >= $start AND min_pos <= $end) - OR (max_pos >= $start AND max_pos <= $end))""") + OR (max_pos >= $start AND max_pos <= $end))""" + ) - command = command.substitute({'build':build, 'chrom':chrom, - 'start':start, 'end':end, - 'tmp_tab':tmp_tab}) + command = command.substitute({"build": build, "chrom": chrom, "start": start, "end": end, "tmp_tab": tmp_tab}) cursor.execute(command) return tmp_tab -def make_temp_monoexonic_transcript_table(cursor, build, chrom = None, - start = None, end = None, - tmp_tab = "temp_monoexon"): - """ Attaches a temporary database with a table that has the following fields: - - gene_ID - - transcript_ID - - chromosome - - start (min position) - - end (max position) - - strand - The purpose is to allow location-based matching for monoexonic query - transcripts. 
""" + +def make_temp_monoexonic_transcript_table(cursor, build, chrom=None, start=None, end=None, tmp_tab="temp_monoexon"): + """Attaches a temporary database with a table that has the following fields: + - gene_ID + - transcript_ID + - chromosome + - start (min position) + - end (max position) + - strand + The purpose is to allow location-based matching for monoexonic query + transcripts.""" if any(val == None for val in [chrom, start, end]): - command = Template(""" CREATE TEMPORARY TABLE IF NOT EXISTS $tmp_tab AS + command = Template( + """ CREATE TEMPORARY TABLE IF NOT EXISTS $tmp_tab AS SELECT t.gene_ID, t.transcript_ID, loc1.chromosome, @@ -168,9 +173,11 @@ def make_temp_monoexonic_transcript_table(cursor, build, chrom = None, ON genes.gene_ID = t.gene_ID WHERE n_exons = 1 AND loc1.genome_build = '$build' - AND loc2.genome_build = '$build' """) + AND loc2.genome_build = '$build' """ + ) else: - command = Template(""" CREATE TEMPORARY TABLE IF NOT EXISTS $tmp_tab AS + command = Template( + """ CREATE TEMPORARY TABLE IF NOT EXISTS $tmp_tab AS SELECT t.gene_ID, t.transcript_ID, loc1.chromosome, @@ -196,35 +203,36 @@ def make_temp_monoexonic_transcript_table(cursor, build, chrom = None, AND ((min_pos <= $start AND max_pos >= $end) OR (min_pos >= $start AND max_pos <= $end) OR (min_pos >= $start AND min_pos <= $end) - OR (max_pos >= $start AND max_pos <= $end))""") + OR (max_pos >= $start AND max_pos <= $end))""" + ) - command = command.substitute({'build':build, 'chrom':chrom, - 'start':start, 'end':end, - 'tmp_tab':tmp_tab}) + command = command.substitute({"build": build, "chrom": chrom, "start": start, "end": end, "tmp_tab": tmp_tab}) cursor.execute(command) return tmp_tab -def make_location_dict(genome_build, cursor, chrom = None, start = None, end = None): - """ Format of dict: - chromosome -> dict(position -> SQLite3 row from location table) - old: - Key: chromosome, pos - Value: SQLite3 row from location table +def make_location_dict(genome_build, cursor, chrom=None, start=None, end=None): + """Format of dict: + chromosome -> dict(position -> SQLite3 row from location table) + + old: + Key: chromosome, pos + Value: SQLite3 row from location table """ location_dict = {} - if any(val == None for val in [chrom, start,end]): + if any(val == None for val in [chrom, start, end]): query = Template("""SELECT * FROM location WHERE genome_build = '$build' """) else: - query = Template("""SELECT * FROM location + query = Template( + """SELECT * FROM location WHERE genome_build = '$build' AND chromosome = '$chrom' AND position >= $start - AND position <= $end""") - query = query.substitute({'build':genome_build, 'chrom':chrom, - 'start':start, 'end':end}) + AND position <= $end""" + ) + query = query.substitute({"build": genome_build, "chrom": chrom, "start": start, "end": end}) cursor.execute(query) for location in cursor.fetchall(): chromosome = location["chromosome"] @@ -236,16 +244,18 @@ def make_location_dict(genome_build, cursor, chrom = None, start = None, end = N return location_dict -def make_edge_dict(cursor, build = None, chrom = None, start = None, end = None): - """ Format of dict: - Key: vertex1_vertex2_type - Value: SQLite3 row from edge table + +def make_edge_dict(cursor, build=None, chrom=None, start=None, end=None): + """Format of dict: + Key: vertex1_vertex2_type + Value: SQLite3 row from edge table """ edge_dict = {} if any(val == None for val in [chrom, start, end, build]): query = """SELECT * FROM edge""" else: - query = Template("""SELECT e.* + query = Template( + 
"""SELECT e.* FROM edge AS e LEFT JOIN location as loc1 ON e.v1 = loc1.location_ID LEFT JOIN location as loc2 ON e.v2 = loc2.location_ID @@ -253,9 +263,9 @@ def make_edge_dict(cursor, build = None, chrom = None, start = None, end = None) AND loc1.chromosome = "$chrom" AND (loc1.position >= $start AND loc1.position <= $end) AND (loc2.position >= $start AND loc2.position <= $end); - """) - query = query.substitute({'build':build, 'chrom':chrom, - 'start':start, 'end':end}) + """ + ) + query = query.substitute({"build": build, "chrom": chrom, "start": start, "end": end}) cursor.execute(query) for edge in cursor.fetchall(): vertex_1 = edge["v1"] @@ -266,14 +276,16 @@ def make_edge_dict(cursor, build = None, chrom = None, start = None, end = None) return edge_dict -def make_transcript_dict(cursor, build, chrom = None, start = None, end = None): - """ Format of dict: - Key: tuple consisting of edges in transcript path - Value: SQLite3 row from transcript table + +def make_transcript_dict(cursor, build, chrom=None, start=None, end=None): + """Format of dict: + Key: tuple consisting of edges in transcript path + Value: SQLite3 row from transcript table """ transcript_dict = {} if any(val == None for val in [chrom, start, end]): - query = Template("""SELECT t.*, + query = Template( + """SELECT t.*, loc1.chromosome as chromosome, loc1.position as start_pos, loc2.position as end_pos @@ -281,10 +293,12 @@ def make_transcript_dict(cursor, build, chrom = None, start = None, end = None): LEFT JOIN location as loc1 ON t.start_vertex = loc1.location_ID LEFT JOIN location as loc2 ON t.end_vertex = loc2.location_ID WHERE loc1.genome_build = '$build' AND loc2.genome_build = '$build'; - """) + """ + ) else: - query = Template("""SELECT t.*, + query = Template( + """SELECT t.*, loc1.chromosome as chrom, loc1.position as start_pos, loc2.position as end_pos, @@ -298,26 +312,25 @@ def make_transcript_dict(cursor, build, chrom = None, start = None, end = None): AND ((min_pos <= $start AND max_pos >= $end) OR (min_pos >= $start AND max_pos <= $end) OR (min_pos >= $start AND min_pos <= $end) - OR (max_pos >= $start AND max_pos <= $end))""") + OR (max_pos >= $start AND max_pos <= $end))""" + ) - query = query.substitute({'build':build, 'chrom':chrom, - 'start':start, 'end':end}) + query = query.substitute({"build": build, "chrom": chrom, "start": start, "end": end}) cursor.execute(query) for transcript in cursor.fetchall(): transcript_path = transcript["jn_path"] if transcript_path != None: - transcript_path = transcript_path.split(",") + \ - [transcript["start_exon"], transcript["end_exon"]] - transcript_path = frozenset([ int(x) for x in transcript_path]) + transcript_path = transcript_path.split(",") + [transcript["start_exon"], transcript["end_exon"]] + transcript_path = frozenset([int(x) for x in transcript_path]) else: transcript_path = frozenset([transcript["start_exon"]]) transcript_dict[transcript_path] = transcript return transcript_dict -def make_vertex_2_gene_dict(cursor, build = None, chrom = None, start = None, end = None): - """ Create a dictionary that maps vertices to the genes that they belong to. 
- """ + +def make_vertex_2_gene_dict(cursor, build=None, chrom=None, start=None, end=None): + """Create a dictionary that maps vertices to the genes that they belong to.""" vertex_2_gene = {} if any(val == None for val in [chrom, start, end, build]): query = """SELECT vertex_ID, @@ -326,7 +339,8 @@ def make_vertex_2_gene_dict(cursor, build = None, chrom = None, start = None, en FROM vertex LEFT JOIN genes ON vertex.gene_ID = genes.gene_ID""" else: - query = Template("""SELECT vertex_ID, + query = Template( + """SELECT vertex_ID, vertex.gene_ID, strand FROM vertex @@ -335,9 +349,9 @@ def make_vertex_2_gene_dict(cursor, build = None, chrom = None, start = None, en WHERE loc.genome_build = '$build' AND loc.chromosome = '$chrom' AND (loc.position >= $start AND loc.position <= $end) - """) - query = query.substitute({'build':build, 'chrom':chrom, - 'start':start, 'end':end}) + """ + ) + query = query.substitute({"build": build, "chrom": chrom, "start": start, "end": end}) cursor.execute(query) for vertex_line in cursor.fetchall(): @@ -353,20 +367,20 @@ def make_vertex_2_gene_dict(cursor, build = None, chrom = None, start = None, en return vertex_2_gene -def make_gene_start_or_end_dict(cursor, build, mode, chrom = None, start = None, end = None): - """ Select the starts (or ends) of known genes in the database and store - in a dict. - Format of dict: - Key: gene ID from database - Value: dict mapping positions to start vertices (or end vertices) of - KNOWN transcripts from that gene + +def make_gene_start_or_end_dict(cursor, build, mode, chrom=None, start=None, end=None): + """Select the starts (or ends) of known genes in the database and store + in a dict. + Format of dict: + Key: gene ID from database + Value: dict mapping positions to start vertices (or end vertices) of + KNOWN transcripts from that gene """ if mode not in ["start", "end"]: - raise ValueError(("Incorrect mode supplied to 'make_gene_start_or_end_dict'." - " Expected 'start' or 'end'.")) + raise ValueError(("Incorrect mode supplied to 'make_gene_start_or_end_dict'." 
" Expected 'start' or 'end'.")) output_dict = {} - if any(val == None for val in [chrom, start,end]): + if any(val == None for val in [chrom, start, end]): query = """SELECT gene_ID, %s_vertex as vertex, loc1.position as %s @@ -381,7 +395,8 @@ def make_gene_start_or_end_dict(cursor, build, mode, chrom = None, start = None, cursor.execute(query % (mode, mode, mode, build)) else: - query = Template("""SELECT gene_ID, + query = Template( + """SELECT gene_ID, ${mode}_vertex as vertex, loc1.chromosome as chrom, loc1.position as $mode @@ -395,14 +410,14 @@ def make_gene_start_or_end_dict(cursor, build, mode, chrom = None, start = None, AND ta.value = 'KNOWN' AND loc1.genome_build = '$build' AND chrom = '$chrom' - AND ($mode >= $start AND $mode <= $end)""") - query = query.substitute({'build':build, 'chrom':chrom, - 'start':start, 'end':end, 'mode':mode}) + AND ($mode >= $start AND $mode <= $end)""" + ) + query = query.substitute({"build": build, "chrom": chrom, "start": start, "end": end, "mode": mode}) cursor.execute(query) for entry in cursor.fetchall(): - gene_ID = entry['gene_ID'] - vertex = entry['vertex'] + gene_ID = entry["gene_ID"] + vertex = entry["vertex"] pos = entry[mode] try: diff --git a/src/talon/initialize_talon_database.py b/src/talon/initialize_talon_database.py index d3ced4d..fe18d81 100644 --- a/src/talon/initialize_talon_database.py +++ b/src/talon/initialize_talon_database.py @@ -6,51 +6,65 @@ # This database is used by the TALON pipeline to maintain a registry of # known annotations as well as novel discoveries. +import os import sqlite3 -from sqlite3 import Error +import time from optparse import OptionParser +from sqlite3 import Error + +from . import edge as Edge from . import gene as Gene from . import transcript as Transcript -from . 
import edge as Edge -import os -import time + def getOptions(): parser = OptionParser() - parser.add_option("--f", dest = "gtf", - help = "GTF annotation containing genes, transcripts, and edges.", - metavar = "FILE", type = str) - parser.add_option("--g", dest = "genome_build", - help = "Name of genome build that the GTF file is based on (ie hg38)", - type = str) - parser.add_option("--a", dest = "annot_name", - help = "Name of supplied annotation (will be used to label data)", - type = str) - parser.add_option("--l", dest = "min_length", - help = "Minimum required transcript length (default = 0 bp) ", - type = int, default = 0) - parser.add_option("--idprefix", dest = "idprefix", - help = "Prefix for naming novel discoveries in eventual TALON runs", - type = str, default = "TALON") - parser.add_option("--5p", dest = "cutoff_5p", - help = "Maximum allowable distance (bp) at the 5' end during annotation", - type = int, default = "500") - parser.add_option("--3p", dest = "cutoff_3p", - help = "Maximum allowable distance (bp) at the 3' end during annotation", - type = int, default = "300") - - parser.add_option("--o", dest = "outprefix", - help = "Outprefix for the annotation files", - metavar = "FILE", type = "string") + parser.add_option( + "--f", dest="gtf", help="GTF annotation containing genes, transcripts, and edges.", metavar="FILE", type=str + ) + parser.add_option( + "--g", dest="genome_build", help="Name of genome build that the GTF file is based on (ie hg38)", type=str + ) + parser.add_option( + "--a", dest="annot_name", help="Name of supplied annotation (will be used to label data)", type=str + ) + parser.add_option( + "--l", dest="min_length", help="Minimum required transcript length (default = 0 bp) ", type=int, default=0 + ) + parser.add_option( + "--idprefix", + dest="idprefix", + help="Prefix for naming novel discoveries in eventual TALON runs", + type=str, + default="TALON", + ) + parser.add_option( + "--5p", + dest="cutoff_5p", + help="Maximum allowable distance (bp) at the 5' end during annotation", + type=int, + default="500", + ) + parser.add_option( + "--3p", + dest="cutoff_3p", + help="Maximum allowable distance (bp) at the 3' end during annotation", + type=int, + default="300", + ) + + parser.add_option("--o", dest="outprefix", help="Outprefix for the annotation files", metavar="FILE", type="string") (options, args) = parser.parse_args() return options + ############### Database initialization section ############################# + def create_database(path): - """ Creates an SQLite database with the provided name. If a database - of the name already exists, an error is generated. """ + """Creates an SQLite database with the provided name. If a database + of the name already exists, an error is generated.""" if os.path.isfile(path): raise ValueError("Database with name '" + path + "' already exists!") @@ -65,40 +79,38 @@ def create_database(path): return + def init_run_info(database, idprefix, min_length, cutoff_5p, cutoff_3p): - """ Initializes a table that keeps track of important run information - such as the prefix for novel identifiers and the 5 prime and 3 prime - distance cutoffs. Affects how downstream TALON runs are done""" + """Initializes a table that keeps track of important run information + such as the prefix for novel identifiers and the 5 prime and 3 prime + distance cutoffs. 
Affects how downstream TALON runs are done""" # Connecting to the database file conn = sqlite3.connect(database) c = conn.cursor() # Add table and set primary key column, which will be the gene ID - c.execute("""CREATE TABLE "run_info" ("item" TEXT PRIMARY KEY, - "value" TEXT)""") + c.execute( + """CREATE TABLE "run_info" ("item" TEXT PRIMARY KEY, + "value" TEXT)""" + ) # Add rows cols = " (" + ", ".join([str_wrap_double(x) for x in ["item", "value"]]) + ") " - c.execute('INSERT INTO run_info ' + cols + ' VALUES ' + '(?,?)', - ('schema_version', "v5.0")) - c.execute('INSERT INTO run_info ' + cols + ' VALUES ' + '(?,?)', - ('idprefix', idprefix)) - c.execute('INSERT INTO run_info ' + cols + ' VALUES ' + '(?,?)', - ('cutoff_5p', cutoff_5p)) - c.execute('INSERT INTO run_info ' + cols + ' VALUES ' + '(?,?)', - ('cutoff_3p', cutoff_3p)) - c.execute('INSERT INTO run_info ' + cols + ' VALUES ' + '(?,?)', - ('min_length', min_length)) - c.execute('INSERT INTO run_info ' + cols + ' VALUES ' + '(?,?)', - ('n_places', 9)) + c.execute("INSERT INTO run_info " + cols + " VALUES " + "(?,?)", ("schema_version", "v5.0")) + c.execute("INSERT INTO run_info " + cols + " VALUES " + "(?,?)", ("idprefix", idprefix)) + c.execute("INSERT INTO run_info " + cols + " VALUES " + "(?,?)", ("cutoff_5p", cutoff_5p)) + c.execute("INSERT INTO run_info " + cols + " VALUES " + "(?,?)", ("cutoff_3p", cutoff_3p)) + c.execute("INSERT INTO run_info " + cols + " VALUES " + "(?,?)", ("min_length", min_length)) + c.execute("INSERT INTO run_info " + cols + " VALUES " + "(?,?)", ("n_places", 9)) conn.commit() conn.close() return + def add_gene_table(database): - """ Add a table to the database to track genes. Attributes are: - - Primary Key: Gene ID (interally assigned by database) + """Add a table to the database to track genes. Attributes are: + - Primary Key: Gene ID (interally assigned by database) """ # Connecting to the database file @@ -106,18 +118,21 @@ def add_gene_table(database): c = conn.cursor() # Add table and set primary key column, which will be the gene ID - c.execute("""CREATE TABLE "genes" ("gene_ID" INTEGER PRIMARY KEY, - "strand" TEXT)""") + c.execute( + """CREATE TABLE "genes" ("gene_ID" INTEGER PRIMARY KEY, + "strand" TEXT)""" + ) conn.commit() conn.close() return + def add_transcript_table(database): - """ Add a table to the database to track transcripts. Attributes are: - - Primary Key: Transcript ID (interally assigned by database) - - Gene ID - - Path (Edges) + """Add a table to the database to track transcripts. Attributes are: + - Primary Key: Transcript ID (interally assigned by database) + - Gene ID + - Path (Edges) """ # Connecting to the database file @@ -146,12 +161,13 @@ def add_transcript_table(database): conn.close() return + def add_edge_table(database): - """ Add a table to the database to track edges linking vertices. - Attributes are: - - Primary Key: ID (interally assigned by database) - - Donor ID - - Acceptor ID + """Add a table to the database to track edges linking vertices. + Attributes are: + - Primary Key: ID (interally assigned by database) + - Donor ID + - Acceptor ID """ # Connecting to the database file @@ -180,11 +196,12 @@ def add_edge_table(database): conn.close() return + def add_edgetype_table(database): - """ Add a table to the database to track permitted edge types. We start - with "edge" and "intron" - Attributes are: - - Primary Key: Type + """Add a table to the database to track permitted edge types. 
We start + with "edge" and "intron" + Attributes are: + - Primary Key: Type """ # Connecting to the database file conn = sqlite3.connect(database) @@ -198,27 +215,26 @@ def add_edgetype_table(database): # Add entries for 'exon' and 'intron' for t in ["exon", "intron"]: cols = "(type)" - vals = [ t ] - command = 'INSERT OR IGNORE INTO "edge_type"' + cols + "VALUES " + \ - '(?)' - c.execute(command,vals) + vals = [t] + command = 'INSERT OR IGNORE INTO "edge_type"' + cols + "VALUES " + "(?)" + c.execute(command, vals) conn.commit() conn.close() return + def add_vertex_table(database): - """ Add a table to the database to track vertices. - Attributes are: - - Vertex_ID: ID (interally assigned by database) - - Gene ID + """Add a table to the database to track vertices. + Attributes are: + - Vertex_ID: ID (interally assigned by database) + - Gene ID """ # Connecting to the database file conn = sqlite3.connect(database) c = conn.cursor() - # Add table and set primary key column, which will be the transcript ID # Also include relationship to the gene table command = """ CREATE TABLE IF NOT EXISTS vertex ( @@ -235,9 +251,10 @@ def add_vertex_table(database): conn.close() return + def add_genome_table(database, build): - """ Add a table that tracks the genome builds in use, then add the provided - genome build to it. + """Add a table that tracks the genome builds in use, then add the provided + genome build to it. """ # Connecting to the database file @@ -245,9 +262,11 @@ def add_genome_table(database, build): c = conn.cursor() # Add table and set primary key column, which will be the edge ID - c.execute("""CREATE TABLE genome_build ( + c.execute( + """CREATE TABLE genome_build ( build_ID INTEGER PRIMARY KEY, - name TEXT)""") + name TEXT)""" + ) # Get value of genome_build counter c.execute('SELECT "count" FROM "counters" WHERE "category" = "genome_build"') @@ -257,10 +276,9 @@ def add_genome_table(database, build): # Add entry for current genome build cols = "(build_ID, name)" - vals = [ db_id, build ] - command = 'INSERT OR IGNORE INTO "genome_build"' + cols + "VALUES " + \ - '(?,?)' - c.execute(command,vals) + vals = [db_id, build] + command = 'INSERT OR IGNORE INTO "genome_build"' + cols + "VALUES " + "(?,?)" + c.execute(command, vals) # Update the counter update_counter = 'UPDATE "counters" SET "count" = ? WHERE "category" = ?' @@ -270,36 +288,40 @@ def add_genome_table(database, build): conn.close() return + def add_dataset_table(database): - """ Add a table that tracks the datasets added to the database. - """ + """Add a table that tracks the datasets added to the database.""" # Connecting to the database file conn = sqlite3.connect(database) c = conn.cursor() # Add table and set primary key column - c.execute("""CREATE TABLE dataset ( + c.execute( + """CREATE TABLE dataset ( dataset_ID INTEGER PRIMARY KEY, dataset_name TEXT, sample TEXT, platform TEXT - )""") + )""" + ) conn.commit() conn.close() return + def add_observed_table(database): - """ Add a table that tracks attributes of observed transcripts, including - 5' and 3' end deltas, as well as the read length. 
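# add_genome_table() above shows the counters idiom this module uses
# everywhere: read the running count from the "counters" table, use
# count + 1 as the new primary key, then write the incremented value back
# (cursor c and build name as elsewhere in this file):
c.execute('SELECT "count" FROM "counters" WHERE "category" = "genome_build"')
build_id = int(c.fetchone()[0]) + 1
c.execute('INSERT OR IGNORE INTO "genome_build" (build_ID, name) VALUES (?,?)',
          [build_id, "hg38"])
c.execute('UPDATE "counters" SET "count" = ? WHERE "category" = ?',
          [build_id, "genome_build"])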
""" + """Add a table that tracks attributes of observed transcripts, including + 5' and 3' end deltas, as well as the read length.""" # Connecting to the database file conn = sqlite3.connect(database) c = conn.cursor() # Add table and set primary key column - c.execute("""CREATE TABLE observed ( + c.execute( + """CREATE TABLE observed ( obs_ID INTEGER PRIMARY KEY, gene_ID INTEGER, transcript_ID INTEGER, @@ -325,18 +347,20 @@ def add_observed_table(database): FOREIGN KEY(end_vertex) REFERENCES vertex(vertex_ID), FOREIGN KEY(start_exon) REFERENCES edge(edge_ID), FOREIGN KEY(end_exon) REFERENCES edge(edge_ID) - )""") + )""" + ) conn.commit() conn.close() return + def add_abundance_table(database): - """ Add a table to the database to track transcript abundance over - all datasets. - - Transcript ID - - Dataset - - Count + """Add a table to the database to track transcript abundance over + all datasets. + - Transcript ID + - Dataset + - Count """ # Connecting to the database file @@ -344,7 +368,8 @@ def add_abundance_table(database): c = conn.cursor() # Add table and set primary key column, which will be the edge ID - c.execute("""CREATE TABLE abundance ( + c.execute( + """CREATE TABLE abundance ( transcript_ID INTEGER, dataset INTEGER, count INTEGER, @@ -352,16 +377,18 @@ def add_abundance_table(database): PRIMARY KEY(transcript_ID, dataset), FOREIGN KEY(transcript_ID) REFERENCES transcripts(transcript_ID), FOREIGN KEY(dataset) REFERENCES dataset(dataset_ID) - )""") + )""" + ) conn.commit() conn.close() return + def add_counter_table(database): - """ Add a table to the database to track novel events. Attributes are: - - Category (gene, transcript, edge) - - Count (number of items in that category so far) + """Add a table to the database to track novel events. 
Attributes are: + - Category (gene, transcript, edge) + - Count (number of items in that category so far) """ # Connecting to the database file @@ -374,42 +401,39 @@ def add_counter_table(database): # Add novel column default_val = 0 - c.execute("ALTER TABLE {tn} ADD COLUMN '{cn}' {ct}"\ - .format(tn=table_name, cn="count", ct="INTEGER", df=default_val)) + c.execute("ALTER TABLE {tn} ADD COLUMN '{cn}' {ct}".format(tn=table_name, cn="count", ct="INTEGER", df=default_val)) # Add rows - c.execute("INSERT INTO {tn} ({idf}, {cn}) VALUES ('genes', 0)".\ - format(tn=table_name, idf="category", cn="count")) - c.execute("INSERT INTO {tn} ({idf}, {cn}) VALUES ('transcripts', 0)".\ - format(tn=table_name, idf="category", cn="count")) - c.execute("INSERT INTO {tn} ({idf}, {cn}) VALUES ('vertex', 0)".\ - format(tn=table_name, idf="category", cn="count")) - c.execute("INSERT INTO {tn} ({idf}, {cn}) VALUES ('edge', 0)".\ - format(tn=table_name, idf="category", cn="count")) - c.execute("INSERT INTO {tn} ({idf}, {cn}) VALUES ('genome_build', 0)".\ - format(tn=table_name, idf="category", cn="count")) - c.execute("INSERT INTO {tn} ({idf}, {cn}) VALUES ('dataset', 0)".\ - format(tn=table_name, idf="category", cn="count")) - c.execute("INSERT INTO {tn} ({idf}, {cn}) VALUES ('observed', 0)".\ - format(tn=table_name, idf="category", cn="count")) + c.execute("INSERT INTO {tn} ({idf}, {cn}) VALUES ('genes', 0)".format(tn=table_name, idf="category", cn="count")) + c.execute( + "INSERT INTO {tn} ({idf}, {cn}) VALUES ('transcripts', 0)".format(tn=table_name, idf="category", cn="count") + ) + c.execute("INSERT INTO {tn} ({idf}, {cn}) VALUES ('vertex', 0)".format(tn=table_name, idf="category", cn="count")) + c.execute("INSERT INTO {tn} ({idf}, {cn}) VALUES ('edge', 0)".format(tn=table_name, idf="category", cn="count")) + c.execute( + "INSERT INTO {tn} ({idf}, {cn}) VALUES ('genome_build', 0)".format(tn=table_name, idf="category", cn="count") + ) + c.execute("INSERT INTO {tn} ({idf}, {cn}) VALUES ('dataset', 0)".format(tn=table_name, idf="category", cn="count")) + c.execute("INSERT INTO {tn} ({idf}, {cn}) VALUES ('observed', 0)".format(tn=table_name, idf="category", cn="count")) conn.commit() conn.close() return + def add_annotation_table(database, table_name, key_table, fk_id): - """ Add a table to keep track of annotation attributes for genes, - transcripts, etc. The table will be given the provided table name. A - foreign key will be created to link the ID column of the annotation - table to the fk_id column of the key_table. - - Attributes: - - Item ID - - Annotation name: user-provided name for annotation - - Source (in case of an object from a GTF, this comes from the 2nd col) - - Feature type - - Attribute - - Value + """Add a table to keep track of annotation attributes for genes, + transcripts, etc. The table will be given the provided table name. A + foreign key will be created to link the ID column of the annotation + table to the fk_id column of the key_table. 
+ + Attributes: + - Item ID + - Annotation name: user-provided name for annotation + - Source (in case of an object from a GTF, this comes from the 2nd col) + - Feature type + - Attribute + - Value """ # Connecting to the database file @@ -420,28 +444,34 @@ def add_annotation_table(database, table_name, key_table, fk_id): if key_table == "exon": fk_statement = "" else: - fk_statement = ", FOREIGN KEY (ID) REFERENCES "+ key_table + "(" + fk_id + ")" - command = " CREATE TABLE IF NOT EXISTS " + table_name + \ - """ (ID INTEGER, + fk_statement = ", FOREIGN KEY (ID) REFERENCES " + key_table + "(" + fk_id + ")" + command = ( + " CREATE TABLE IF NOT EXISTS " + + table_name + + """ (ID INTEGER, annot_name text, source text, attribute text, value text, - PRIMARY KEY (ID, source, attribute)""" + fk_statement + """); """ + PRIMARY KEY (ID, source, attribute)""" + + fk_statement + + """); """ + ) c.execute(command) conn.commit() conn.close() return + def add_location_table(database): - """ Add a table to the database to track the locations of objects across - the different genome builds. Attributes are: - - Vertex ID - - Genome build - - Chromosome - - Position (1-based) - - Strand + """Add a table to the database to track the locations of objects across + the different genome builds. Attributes are: + - Vertex ID + - Genome build + - Chromosome + - Position (1-based) + - Strand """ # Connecting to the database file @@ -467,15 +497,16 @@ def add_location_table(database): ####################### GTF parsing section ################################# + def read_gtf_file(gtf_file): - """ Reads gene, transcript, and edge information from a GTF file. - Args: - gtf_file: Path to the GTF file - Returns: - genes: A dictionary mapping gene IDs to corresponding gene objects - transcripts: A dictionary mapping gene IDs to corresponding - transcript objects - exons: A dictionary mapping exon IDs to corresponding edge objects + """Reads gene, transcript, and edge information from a GTF file. 
+ Args: + gtf_file: Path to the GTF file + Returns: + genes: A dictionary mapping gene IDs to corresponding gene objects + transcripts: A dictionary mapping gene IDs to corresponding + transcript objects + exons: A dictionary mapping exon IDs to corresponding edge objects """ genes = {} transcripts = {} @@ -533,10 +564,11 @@ def read_gtf_file(gtf_file): return genes, transcripts, exons + def filter_by_length(genes, transcripts, min_length): - """ Given a minimum transcript length, this function - - Iterates over transcripts and keeps the ones with length >= min_length - - Removes genes not represented in the transcript set + """Given a minimum transcript length, this function + - Iterates over transcripts and keeps the ones with length >= min_length + - Removes genes not represented in the transcript set """ filtered_transcripts = {} filtered_genes = {} @@ -553,8 +585,9 @@ def filter_by_length(genes, transcripts, min_length): return filtered_genes, filtered_transcripts + def organize_by_chromosome(genes, transcripts): - """ Iterate through genes and transcripts and group them by chromosome """ + """Iterate through genes and transcripts and group them by chromosome""" gene_dict = {} transcript_dict = {} @@ -579,11 +612,13 @@ def organize_by_chromosome(genes, transcripts): return gene_dict, transcript_dict + ######################### Populate the database ############################ + def populate_db(database, annot_name, chrom_genes, chrom_transcripts, edges, genome_build): - """ Iterate over GTF-derived gene, transcript, and edge entries in order - to add a record for each in the database. + """Iterate over GTF-derived gene, transcript, and edge entries in order + to add a record for each in the database. """ # Connecting to the database file conn = sqlite3.connect(database) @@ -605,8 +640,8 @@ def populate_db(database, annot_name, chrom_genes, chrom_transcripts, edges, gen return -def add_genes(c, genes, annot_name): +def add_genes(c, genes, annot_name): bulk_genes = [] bulk_annotations = [] gene_id_map = {} @@ -641,14 +676,15 @@ def add_genes(c, genes, annot_name): bulk_update_gene_annotations(c, bulk_annotations) return gene_id_map + def bulk_update_genes(c, genes, gene_counter): """ - Given a list of tuple-formatted gene entries, this function inserts them - into the database at the provided cursor (c). + Given a list of tuple-formatted gene entries, this function inserts them + into the database at the provided cursor (c). """ # Insert entries into database in bulk cols = " (" + ", ".join([str_wrap_double(x) for x in ["gene_id", "strand"]]) + ") " - g_command = 'INSERT INTO "genes"' + cols + "VALUES " + '(?,?)' + g_command = 'INSERT INTO "genes"' + cols + "VALUES " + "(?,?)" c.executemany(g_command, genes) # Update counter @@ -657,22 +693,21 @@ def bulk_update_genes(c, genes, gene_counter): return + def bulk_update_gene_annotations(c, bulk_annotations): """ - Given a list of tuple-formatted gene annotation entries, this function - inserts them into the database at the provided cursor (c). + Given a list of tuple-formatted gene annotation entries, this function + inserts them into the database at the provided cursor (c). 
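# The bulk_update_* helpers all follow the shape of bulk_update_genes()
# above: accumulate tuples in Python, then insert them with one
# executemany() call. A self-contained sketch of the pattern:
import sqlite3

conn = sqlite3.connect(":memory:")
c = conn.cursor()
c.execute('CREATE TABLE "genes" ("gene_ID" INTEGER PRIMARY KEY, "strand" TEXT)')
bulk_genes = [(1, "+"), (2, "-"), (3, "+")]
c.executemany('INSERT INTO "genes" ("gene_ID", "strand") VALUES (?,?)', bulk_genes)
conn.commit()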
""" - cols = " (" + ", ".join([str_wrap_double(x) for x in ["ID","annot_name", - "source", "attribute", "value"]]) + ") " - command = 'INSERT INTO "gene_annotations"' + cols + "VALUES " + \ - '(?,?,?,?,?)' + cols = " (" + ", ".join([str_wrap_double(x) for x in ["ID", "annot_name", "source", "attribute", "value"]]) + ") " + command = 'INSERT INTO "gene_annotations"' + cols + "VALUES " + "(?,?,?,?,?)" c.executemany(command, bulk_annotations) return -def add_transcripts(c, transcripts, annot_name, gene_id_map, genome_build): +def add_transcripts(c, transcripts, annot_name, gene_id_map, genome_build): bulk_transcripts = [] bulk_annotations = [] @@ -683,11 +718,11 @@ def add_transcripts(c, transcripts, annot_name, gene_id_map, genome_build): # Get vertex and edge counters from database c.execute('SELECT "count" FROM "counters" WHERE "category" = "vertex"') v_counter = int(c.fetchone()[0]) - vertices['counter'] = v_counter + vertices["counter"] = v_counter c.execute('SELECT "count" FROM "counters" WHERE "category" = "edge"') e_counter = int(c.fetchone()[0]) - edges['counter'] = e_counter + edges["counter"] = e_counter # Get transcript counter c.execute('SELECT "count" FROM "counters" WHERE "category" = "transcripts"') @@ -710,9 +745,9 @@ def add_transcripts(c, transcripts, annot_name, gene_id_map, genome_build): db_gene_id = "NULL" # Process exons to create vertices and edges - transcript_tuple = process_transcript(c, transcript, db_transcript_id, - db_gene_id, genome_build, - annot_name, vertices, edges) + transcript_tuple = process_transcript( + c, transcript, db_transcript_id, db_gene_id, genome_build, annot_name, vertices, edges + ) bulk_transcripts.append(transcript_tuple) # Create annotation entries @@ -736,40 +771,56 @@ def add_transcripts(c, transcripts, annot_name, gene_id_map, genome_build): return + def bulk_update_transcripts(c, transcripts, counter): """ - Given a list of tuple-formatted transcript entries, this function inserts them - into the database at the provided cursor (c). + Given a list of tuple-formatted transcript entries, this function inserts them + into the database at the provided cursor (c). """ - cols = " (" + ", ".join([str_wrap_double(x) for x in ["transcript_ID", - "gene_ID", "start_exon", "jn_path", "end_exon", "start_vertex", "end_vertex", - "n_exons"]]) + ") " - g_command = 'INSERT INTO "transcripts"' + cols + "VALUES " + \ - '(?,?,?,?,?,?,?,?)' - c.executemany(g_command,transcripts) + cols = ( + " (" + + ", ".join( + [ + str_wrap_double(x) + for x in [ + "transcript_ID", + "gene_ID", + "start_exon", + "jn_path", + "end_exon", + "start_vertex", + "end_vertex", + "n_exons", + ] + ] + ) + + ") " + ) + g_command = 'INSERT INTO "transcripts"' + cols + "VALUES " + "(?,?,?,?,?,?,?,?)" + c.executemany(g_command, transcripts) update_counter = 'UPDATE "counters" SET "count" = ? WHERE "category" = ?' c.execute(update_counter, [counter, "transcripts"]) return + def bulk_update_transcript_annotations(c, bulk_annotations): """ - Given a list of tuple-formatted transcript annotation entries, this - function inserts them into the database at the provided cursor (c). + Given a list of tuple-formatted transcript annotation entries, this + function inserts them into the database at the provided cursor (c). 
""" - cols = " (" + ", ".join([str_wrap_double(x) for x in ["ID","annot_name", - "source", "attribute", "value"]]) + ") " - command = 'INSERT INTO "transcript_annotations"' + cols + "VALUES " + \ - '(?,?,?,?,?)' + cols = " (" + ", ".join([str_wrap_double(x) for x in ["ID", "annot_name", "source", "attribute", "value"]]) + ") " + command = 'INSERT INTO "transcript_annotations"' + cols + "VALUES " + "(?,?,?,?,?)" c.executemany(command, bulk_annotations) return + def bulk_update_vertices(c, vertices): """ - Given a list of tuple-formatted vertex entries, this - function inserts them into the database at the provided cursor (c). + Given a list of tuple-formatted vertex entries, this + function inserts them into the database at the provided cursor (c). """ # Extract the counter counter = vertices.pop("counter") @@ -779,21 +830,20 @@ def bulk_update_vertices(c, vertices): location_list = [] for vertex in list(vertices.values()): gene_IDs = list(vertex[-1]) - vertex_list += [ (vertex[0], x) for x in gene_IDs ] + vertex_list += [(vertex[0], x) for x in gene_IDs] location_list.append(vertex[0:4]) # Bulk entry of vertices - cols = " (" + ", ".join([str_wrap_double(x) for x in ["vertex_ID","gene_id"]]) + ") " - command = 'INSERT INTO "vertex"' + cols + "VALUES " + \ - '(?,?)' + cols = " (" + ", ".join([str_wrap_double(x) for x in ["vertex_ID", "gene_id"]]) + ") " + command = 'INSERT INTO "vertex"' + cols + "VALUES " + "(?,?)" c.executemany(command, vertex_list) # Bulk entry of locations - cols = " (" + ", ".join([str_wrap_double(x) for x in ["location_ID", - "genome_build", "chromosome", "position"]]) + ") " - command = 'INSERT INTO "location"' + cols + "VALUES " + \ - '(?,?,?,?)' + cols = ( + " (" + ", ".join([str_wrap_double(x) for x in ["location_ID", "genome_build", "chromosome", "position"]]) + ") " + ) + command = 'INSERT INTO "location"' + cols + "VALUES " + "(?,?,?,?)" c.executemany(command, location_list) # Counter update @@ -802,18 +852,17 @@ def bulk_update_vertices(c, vertices): return + def bulk_update_edges(c, edges): """ - Given a list of tuple-formatted edge entries, this - function inserts them into the database at the provided cursor (c). + Given a list of tuple-formatted edge entries, this + function inserts them into the database at the provided cursor (c). """ # Extract the counter counter = edges.pop("counter") - cols = " (" + ", ".join([str_wrap_double(x) for x in ["edge_ID","v1", - "v2", "edge_type", "strand"]]) + ") " - command = 'INSERT INTO "edge"' + cols + "VALUES " + \ - '(?,?,?,?,?)' + cols = " (" + ", ".join([str_wrap_double(x) for x in ["edge_ID", "v1", "v2", "edge_type", "strand"]]) + ") " + command = 'INSERT INTO "edge"' + cols + "VALUES " + "(?,?,?,?,?)" c.executemany(command, edges.values()) update_counter = 'UPDATE "counters" SET "count" = ? WHERE "category" = ?' 
@@ -821,9 +870,8 @@ def bulk_update_edges(c, edges): return -def process_transcript(c, transcript, transcript_id, gene_id, genome_build, - annot_name, vertices, edges): +def process_transcript(c, transcript, transcript_id, gene_id, genome_build, annot_name, vertices, edges): exons = transcript.exons strand = transcript.strand transcript_vertices = [] @@ -833,19 +881,17 @@ def process_transcript(c, transcript, transcript_id, gene_id, genome_build, exon = exons[i] left = exon.start right = exon.end - v1, vertices = create_vertex(c, gene_id, genome_build, - exon.chromosome, left, vertices) + v1, vertices = create_vertex(c, gene_id, genome_build, exon.chromosome, left, vertices) transcript_vertices.append(v1) - v2, vertices = create_vertex(c, gene_id, genome_build, - exon.chromosome, right, vertices) + v2, vertices = create_vertex(c, gene_id, genome_build, exon.chromosome, right, vertices) transcript_vertices.append(v2) # Iterate over vertices in order to create edges. If the transcript is on the # minus strand, reverse the vertex and edge lists if strand == "-": - transcript_vertices = transcript_vertices[::-1] - exons = exons[::-1] + transcript_vertices = transcript_vertices[::-1] + exons = exons[::-1] # Keep track of start vertex, end vertex, and n_exons start_vertex = transcript_vertices[0] @@ -854,9 +900,9 @@ def process_transcript(c, transcript, transcript_id, gene_id, genome_build, prev_edge_type = None exon_index = 0 - for i in range(0,len(transcript_vertices) - 1): + for i in range(0, len(transcript_vertices) - 1): vertex_1 = transcript_vertices[i] - vertex_2 = transcript_vertices[i+1] + vertex_2 = transcript_vertices[i + 1] # Try to create an edge between vertex 1 and 2 if prev_edge_type == None or prev_edge_type == "intron": @@ -875,48 +921,57 @@ def process_transcript(c, transcript, transcript_id, gene_id, genome_build, prev_edge_type = edge_type if len(transcript_edges) > 1: - transcript_path = ",".join(map(str,transcript_edges[1:-1])) + transcript_path = ",".join(map(str, transcript_edges[1:-1])) else: transcript_path = None start_exon = transcript_edges[0] end_exon = transcript_edges[-1] - transcript_tuple = (transcript_id, gene_id, start_exon, transcript_path, - end_exon, start_vertex, end_vertex, n_exons) + transcript_tuple = ( + transcript_id, + gene_id, + start_exon, + transcript_path, + end_exon, + start_vertex, + end_vertex, + n_exons, + ) return transcript_tuple def add_exon_annotations_to_db(c, exon, exon_id, annot_name): - """ Adds annotations from edge object to the database""" + """Adds annotations from edge object to the database""" ignore = ["gene_id", "gene_name"] attributes = exon.annotations - source = attributes['source'] + source = attributes["source"] if "exon_status" not in attributes: - attributes["exon_status"] = "KNOWN" + attributes["exon_status"] = "KNOWN" for att in attributes.keys(): if (att in ignore) or ("gene" in att) or ("transcript" in att): continue value = attributes[att] - cols = " (" + ", ".join([str_wrap_double(x) for x in ["ID","annot_name", - "source", "attribute", "value"]]) + ") " + cols = ( + " (" + ", ".join([str_wrap_double(x) for x in ["ID", "annot_name", "source", "attribute", "value"]]) + ") " + ) vals = [exon_id, annot_name, source, att, value] - command = 'INSERT OR IGNORE INTO "exon_annotations"' + cols + "VALUES " + \ - '(?,?,?,?,?)' - c.execute(command,vals) + command = 'INSERT OR IGNORE INTO "exon_annotations"' + cols + "VALUES " + "(?,?,?,?,?)" + c.execute(command, vals) return + def create_edge(vertex_1, vertex_2, edge_type, 
strand, edges): """ - Creates a new edge with the provided information, unless a duplicate - already exists in the 'edges' dict. + Creates a new edge with the provided information, unless a duplicate + already exists in the 'edges' dict. """ # Check if the edge exists, and return the ID if it does - query = ",".join([str(vertex_1), str(vertex_2), edge_type,strand]) + query = ",".join([str(vertex_1), str(vertex_2), edge_type, strand]) if query in edges.keys(): existing_edge_id = edges[query][0] return existing_edge_id, edges @@ -931,10 +986,11 @@ def create_edge(vertex_1, vertex_2, edge_type, strand, edges): return edge_id, edges + def create_vertex(c, gene_id, genome_build, chromosome, pos, vertices): """ - Creates a new vertex with the provided information, unless a duplicate - already exists in the database. + Creates a new vertex with the provided information, unless a duplicate + already exists in the database. """ # Check if the vertex exists. If yes, add current gene ID to it query = ",".join([genome_build, chromosome, str(pos)]) @@ -957,18 +1013,21 @@ def create_vertex(c, gene_id, genome_build, chromosome, pos, vertices): def str_wrap_double(s): - """ Adds double quotes around the input string """ + """Adds double quotes around the input string""" s = str(s) return '"' + s + '"' + def hms_string(sec_elapsed): h = int(sec_elapsed / (60 * 60)) m = int((sec_elapsed % (60 * 60)) / 60) - s = sec_elapsed % 60. + s = sec_elapsed % 60.0 return "{}:{:>02}:{:>05.2f}".format(h, m, s) + ########################### Main ########################################### + def main(): options = getOptions() gtf_file = options.gtf @@ -993,8 +1052,7 @@ def main(): add_genome_table(db_name, genome_build) add_location_table(db_name) add_annotation_table(db_name, "gene_annotations", "genes", "gene_ID") - add_annotation_table(db_name, "transcript_annotations", "transcripts", - "transcript_ID") + add_annotation_table(db_name, "transcript_annotations", "transcripts", "transcript_ID") add_annotation_table(db_name, "exon_annotations", "exon", "ID") add_dataset_table(db_name) add_abundance_table(db_name) @@ -1015,5 +1073,5 @@ def main(): populate_db(db_name, annot_name, chrom_genes, chrom_transcripts, exons, genome_build) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/src/talon/length_utils.py b/src/talon/length_utils.py index ebc8d04..769c813 100644 --- a/src/talon/length_utils.py +++ b/src/talon/length_utils.py @@ -3,11 +3,13 @@ # ----------------------------------------------------------------------------- # Queries for working with exon and transcript lengths + def get_all_exon_lengths(cursor, build): - """ Compute all exon lengths and store in a dict """ + """Compute all exon lengths and store in a dict""" exon_lengths = {} - cursor.execute(""" SELECT edge_ID, + cursor.execute( + """ SELECT edge_ID, loc1.position AS pos1, loc2.position AS pos2, abs(loc1.position - loc2.position) + 1 AS diff @@ -16,35 +18,35 @@ def get_all_exon_lengths(cursor, build): LEFT JOIN location AS loc2 ON edge.v2 = loc2.location_ID WHERE edge_type = 'exon' AND loc1.genome_build = '%s' - AND loc2.genome_build = '%s' """ % (build, build)) + AND loc2.genome_build = '%s' """ + % (build, build) + ) for exon in cursor.fetchall(): - exon_ID = exon['edge_ID'] - length = exon['diff'] + exon_ID = exon["edge_ID"] + length = exon["diff"] exon_lengths[exon_ID] = length return exon_lengths + def get_transcript_length(transcript_row, exon_lengths): - """ Compute the length of the supplied transcript model based on its - 
exons. Expected input format consists of a transcript row from a - TALON database. """ + """Compute the length of the supplied transcript model based on its + exons. Expected input format consists of a transcript row from a + TALON database.""" length = 0 - start_exon = transcript_row['start_exon'] - end_exon = transcript_row['end_exon'] - n_exons = transcript_row['n_exons'] + start_exon = transcript_row["start_exon"] + end_exon = transcript_row["end_exon"] + n_exons = transcript_row["n_exons"] if n_exons == 1: return exon_lengths[start_exon] else: - jn_path = transcript_row['jn_path'].split(",") + jn_path = transcript_row["jn_path"].split(",") all_exons = [start_exon] + [int(x) for x in jn_path[1::2]] + [end_exon] - + for exon in all_exons: length += exon_lengths[exon] return length - - - diff --git a/src/talon/logger.py b/src/talon/logger.py index 9d485f0..31103d7 100644 --- a/src/talon/logger.py +++ b/src/talon/logger.py @@ -12,6 +12,4 @@ def _init_logger(verbosity): msg_fmt = "%(asctime)s : %(levelname)s : [%(filename)s:%(lineno)d] : %(message)s" date_fmt = "[ %Y-%m-%d %H:%M:%S ]" - logging.basicConfig(level=level, - format=msg_fmt, - datefmt=date_fmt) + logging.basicConfig(level=level, format=msg_fmt, datefmt=date_fmt) diff --git a/src/talon/post/ab_utils.py b/src/talon/post/ab_utils.py index 2a2b8e6..1818d23 100644 --- a/src/talon/post/ab_utils.py +++ b/src/talon/post/ab_utils.py @@ -1,20 +1,22 @@ -import sqlite3 import itertools import operator +import sqlite3 from optparse import OptionParser from pathlib import Path -import scanpy + import numpy as np +import scanpy -from . import filter_talon_transcripts as filt from .. import dstruct as dstruct from .. import length_utils as lu -from . import post_utils as putils from .. import query_utils as qutils from .. import talon as talon +from . import filter_talon_transcripts as filt +from . import post_utils as putils + def check_annot_validity(annot, database): - """ Make sure that the user has entered a correct annotation name """ + """Make sure that the user has entered a correct annotation name""" conn = sqlite3.connect(database) cursor = conn.cursor() @@ -27,21 +29,25 @@ def check_annot_validity(annot, database): annotations.remove("TALON") if annot == None: - message = "Please provide a valid annotation name. " + \ - "In this database, your options are: " + \ - ", ".join(annotations) + message = ( + "Please provide a valid annotation name. " + "In this database, your options are: " + ", ".join(annotations) + ) raise ValueError(message) if annot not in annotations: - message = "Annotation name '" + annot + \ - "' not found in this database. Try one of the following: " + \ - ", ".join(annotations) + message = ( + "Annotation name '" + + annot + + "' not found in this database. Try one of the following: " + + ", ".join(annotations) + ) raise ValueError(message) return + def check_build_validity(build, database): - """ Make sure that the user has entered a correct build name """ + """Make sure that the user has entered a correct build name""" conn = sqlite3.connect(database) cursor = conn.cursor() @@ -51,21 +57,22 @@ def check_build_validity(build, database): conn.close() if build == None: - message = "Please provide a valid genome build name. " + \ - "In this database, your options are: " + \ - ", ".join(builds) + message = ( + "Please provide a valid genome build name. 
" + "In this database, your options are: " + ", ".join(builds) + ) raise ValueError(message) if build not in builds: - message = "Build name '" + build + \ - "' not found in this database. Try one of the following: " + \ - ", ".join(builds) + message = ( + "Build name '" + build + "' not found in this database. Try one of the following: " + ", ".join(builds) + ) raise ValueError(message) return + def fetch_naming_prefix(database): - """ Get naming prefix from the database run_info table """ + """Get naming prefix from the database run_info table""" conn = sqlite3.connect(database) conn.row_factory = sqlite3.Row cursor = conn.cursor() @@ -75,8 +82,9 @@ def fetch_naming_prefix(database): conn.close() return prefix + def fetch_n_places(database): - """ Get length of name field from the database run_info table """ + """Get length of name field from the database run_info table""" conn = sqlite3.connect(database) conn.row_factory = sqlite3.Row cursor = conn.cursor() @@ -86,9 +94,10 @@ def fetch_n_places(database): conn.close() return int(n_places) + def get_transcript_lengths(database, build): - """ Read the transcripts from the database. Then compute the lengths. - Store in a dictionary """ + """Read the transcripts from the database. Then compute the lengths. + Store in a dictionary""" transcript_lengths = {} @@ -101,15 +110,16 @@ def get_transcript_lengths(database, build): cursor.execute("SELECT * FROM transcripts") for transcript_row in cursor.fetchall(): - transcript_ID = transcript_row['transcript_ID'] + transcript_ID = transcript_row["transcript_ID"] length = lu.get_transcript_length(transcript_row, exon_lens) transcript_lengths[transcript_ID] = length conn.close() return transcript_lengths + def fetch_dataset_list(dataset_file, database): - """ Gets a list of all datasets in the database """ + """Gets a list of all datasets in the database""" conn = sqlite3.connect(database) cursor = conn.cursor() @@ -117,17 +127,15 @@ def fetch_dataset_list(dataset_file, database): conn.close() if dataset_file == None: - return all_db_datasets else: datasets = [] - with open(dataset_file, 'r') as f: + with open(dataset_file, "r") as f: for line in f: dataset = line.strip() if dataset not in all_db_datasets: - raise ValueError("Dataset name '%s' not found in database" \ - % (dataset)) + raise ValueError("Dataset name '%s' not found in database" % (dataset)) datasets.append(dataset) return datasets diff --git a/src/talon/post/call_longest_ends.py b/src/talon/post/call_longest_ends.py index f570683..7611e10 100644 --- a/src/talon/post/call_longest_ends.py +++ b/src/talon/post/call_longest_ends.py @@ -1,82 +1,95 @@ import pandas as pd -pd.options.mode.chained_assignment = None + +pd.options.mode.chained_assignment = None import argparse -import numpy as np import csv -def get_args(): +import numpy as np - desc = ('Replaces the starts or ends of transcripts in a GTF with the' - ' longest alternatives (similar to the GENCODE model of' - 'calling transcripts)') + +def get_args(): + desc = ( + "Replaces the starts or ends of transcripts in a GTF with the" + " longest alternatives (similar to the GENCODE model of" + "calling transcripts)" + ) parser = argparse.ArgumentParser(description=desc) - parser.add_argument('-gtf', dest='gtf', - help='TALON GTF to serve as the template to modify') - parser.add_argument('-read_annot', dest='annot', - help='Read annot file from TALON to extract raw read ends from') - parser.add_argument("--datasets", "--d", dest = "datasets_file", - help = """A file indicating which 
datasets should be + parser.add_argument("-gtf", dest="gtf", help="TALON GTF to serve as the template to modify") + parser.add_argument("-read_annot", dest="annot", help="Read annot file from TALON to extract raw read ends from") + parser.add_argument( + "--datasets", + "--d", + dest="datasets_file", + help="""A file indicating which datasets should be included (one dataset name per line). Default is to include - all datasets.""", default='all') - parser.add_argument('--mode', dest='mode', default='tss', + all datasets.""", + default="all", + ) + parser.add_argument( + "--mode", + dest="mode", + default="tss", help="Modify TSSs or TESs, 'tss', 'tes', 'both'. Default: 'tss'", - choices={'tss', 'tes', 'both'}) - parser.add_argument('--novelty', dest='novelty', - help="Whether to only modify ends from novel or all models, "+\ - "'all' or 'novel'. Default: 'all'", choices={'all', 'novel'}, default='all') - parser.add_argument('-outprefix', '-o', dest='outprefix', - help='Prefix for output file', default='talon') - parser.add_argument('--verbose', '-v', action='store_true', - default=False, help="Display in progress output") + choices={"tss", "tes", "both"}, + ) + parser.add_argument( + "--novelty", + dest="novelty", + help="Whether to only modify ends from novel or all models, " + "'all' or 'novel'. Default: 'all'", + choices={"all", "novel"}, + default="all", + ) + parser.add_argument("-outprefix", "-o", dest="outprefix", help="Prefix for output file", default="talon") + parser.add_argument("--verbose", "-v", action="store_true", default=False, help="Display in progress output") args = parser.parse_args() return args + # df: TALON read annotation dataframe # how: 'tes' or 'tss' for calling ends or starts respectively # novelty: 'all' or 'novel' based on which transcript models # you want to modify the ends of # datasets: string file path of file with datasets to use when # calling longest ends from reads -def get_longest_ends(df, how='tes', novelty='novel', datasets='all'): +def get_longest_ends(df, how="tes", novelty="novel", datasets="all"): + if novelty == "novel": + df = df.loc[df.transcript_novelty != "Known"] - if novelty == 'novel': - df = df.loc[df.transcript_novelty != 'Known'] - - if datasets != 'all': + if datasets != "all": df = df.loc[df.dataset.isin(datasets)] - fwd = df.loc[df.strand == '+'] - rev = df.loc[df.strand == '-'] + fwd = df.loc[df.strand == "+"] + rev = df.loc[df.strand == "-"] # furthest downstream for tes # if + strand, max coord of read end # if - strand, min coord of read end - if how == 'tes': - fwd = fwd[['transcript_ID', 'read_end']] - fwd = fwd.groupby('transcript_ID').max().reset_index() - rev = rev[['transcript_ID', 'read_end']] - rev = rev.groupby('transcript_ID').min().reset_index() - + if how == "tes": + fwd = fwd[["transcript_ID", "read_end"]] + fwd = fwd.groupby("transcript_ID").max().reset_index() + rev = rev[["transcript_ID", "read_end"]] + rev = rev.groupby("transcript_ID").min().reset_index() # furthest upstream for tss: # if + strand, min coord of read start # if - strand, max coord of read start - elif how == 'tss': - fwd = fwd[['transcript_ID', 'read_start']] - fwd = fwd.groupby('transcript_ID').min().reset_index() - rev = rev[['transcript_ID', 'read_start']] - rev = rev.groupby('transcript_ID').max().reset_index() + elif how == "tss": + fwd = fwd[["transcript_ID", "read_start"]] + fwd = fwd.groupby("transcript_ID").min().reset_index() + rev = rev[["transcript_ID", "read_start"]] + rev = rev.groupby("transcript_ID").max().reset_index() # concat 
fwd and rev df = pd.concat([fwd, rev]) - df = df.sort_values(by='transcript_ID', ascending='True') + df = df.sort_values(by="transcript_ID", ascending="True") return df + # get the longest ends from the read annotation file # annot: TALON read annotation file path # how: 'tss' or 'tes', tss will find start ends and tes will find stop ends @@ -86,136 +99,155 @@ def get_longest_ends(df, how='tes', novelty='novel', datasets='all'): # opref: output file prefix # verbose: display processing progress # test: print out dataframe before and after editing -def replace_gtf_end_coords(gtf_df, ends, how='tes', test=False, verbose=False): - - if how == 'tes': - ends.columns = ['transcript_id', 'tes'] - elif how == 'tss': - ends.columns = ['transcript_id', 'tss'] +def replace_gtf_end_coords(gtf_df, ends, how="tes", test=False, verbose=False): + if how == "tes": + ends.columns = ["transcript_id", "tes"] + elif how == "tss": + ends.columns = ["transcript_id", "tss"] # merge gtf_df with end information -# ends.transcript_id = ends.transcript_id.astype('str') + # ends.transcript_id = ends.transcript_id.astype('str') df = gtf_df.loc[gtf_df.transcript_id.notnull()] - ends.transcript_id = ends.transcript_id.astype('str') - gtf_df.transcript_id = gtf_df.transcript_id.astype('str') - gtf_df = gtf_df.merge(ends, how='left', on='transcript_id') - df.transcript_id = df.transcript_id.astype('str') - df = df.merge(ends, how='inner') + ends.transcript_id = ends.transcript_id.astype("str") + gtf_df.transcript_id = gtf_df.transcript_id.astype("str") + gtf_df = gtf_df.merge(ends, how="left", on="transcript_id") + df.transcript_id = df.transcript_id.astype("str") + df = df.merge(ends, how="inner") if test: - print('Before editing') - print(gtf_df[['transcript_id', 'entry_type', 'strand', 'start', 'stop', how]]) + print("Before editing") + print(gtf_df[["transcript_id", "entry_type", "strand", "start", "stop", how]]) # swap out read starts or ends for the longest ones tids = df.transcript_id.unique() for t, tid in enumerate(tids): if t % 1000 == 0 and verbose: - print('Processing transcript {} of {}'.format(t, len(tids))) + print("Processing transcript {} of {}".format(t, len(tids))) # fwd: swap out transcript "stop" and last exon "stop" # rev: swap out transcript "start" and last exon "start" - if how == 'tes': + if how == "tes": # tes fwd - ind = gtf_df.loc[(gtf_df.strand=='+')&(gtf_df.transcript_id==tid)].index.tolist() + ind = gtf_df.loc[(gtf_df.strand == "+") & (gtf_df.transcript_id == tid)].index.tolist() if ind: # stop of transcript for fwd i = ind[0] - gtf_df.loc[i, 'stop'] = gtf_df.loc[i, 'tes'] + gtf_df.loc[i, "stop"] = gtf_df.loc[i, "tes"] # stop of last exon for fwd i = ind[-1] - gtf_df.loc[i, 'stop'] = gtf_df.loc[i, 'tes'] + gtf_df.loc[i, "stop"] = gtf_df.loc[i, "tes"] # tes rev - ind = gtf_df.loc[(gtf_df.strand=='-')&(gtf_df.transcript_id==tid)].index.tolist() + ind = gtf_df.loc[(gtf_df.strand == "-") & (gtf_df.transcript_id == tid)].index.tolist() if ind: # start of trancscript for rev i = ind[0] - gtf_df.loc[i, 'start'] = gtf_df.loc[i, 'tes'] + gtf_df.loc[i, "start"] = gtf_df.loc[i, "tes"] # start of last exon for rev i = ind[-1] - gtf_df.loc[i, 'start'] = gtf_df.loc[i, 'tes'] + gtf_df.loc[i, "start"] = gtf_df.loc[i, "tes"] # fwd: swap out transcript "start" and first exon "start" # rev: swap out transcript "stop" and first exon "stop" - elif how == 'tss': + elif how == "tss": # tss fwd - ind = gtf_df.loc[(gtf_df.strand=='+')&(gtf_df.transcript_id==tid)].index.tolist() + ind = gtf_df.loc[(gtf_df.strand == 
"+") & (gtf_df.transcript_id == tid)].index.tolist() if ind: # start of transcript for fwd i = ind[0] - gtf_df.loc[i, 'start'] = gtf_df.loc[i, 'tss'] + gtf_df.loc[i, "start"] = gtf_df.loc[i, "tss"] # start of first exon for fwd i = ind[1] - gtf_df.loc[i, 'start'] = gtf_df.loc[i, 'tss'] + gtf_df.loc[i, "start"] = gtf_df.loc[i, "tss"] # tss rev - ind = gtf_df.loc[(gtf_df.strand=='-')&(gtf_df.transcript_id==tid)].index.tolist() + ind = gtf_df.loc[(gtf_df.strand == "-") & (gtf_df.transcript_id == tid)].index.tolist() if ind: # stop of transcript for rev i = ind[0] - gtf_df.loc[i, 'stop'] = gtf_df.loc[i, 'tss'] + gtf_df.loc[i, "stop"] = gtf_df.loc[i, "tss"] # stop of first exon for rev i = ind[1] - gtf_df.loc[i, 'stop'] = gtf_df.loc[i, 'tss'] + gtf_df.loc[i, "stop"] = gtf_df.loc[i, "tss"] # now fix gene coordinates # tes - if how == 'tes': + if how == "tes": # fwd: replace "stop" of the gene with the maximum of the "stops" # gene_ind = gtf_df.loc[(gtf_df.strand == '+')&(gtf_df.entry_type=='gene')&(gtf_df.tes.notnull())].index.tolist() # gene_ind = gtf_df.loc[(gtf_df.strand == '+')&(gtf_df.entry_type=='gene')].index.tolist() - genes = gtf_df.loc[(gtf_df.strand == '+')&(gtf_df.entry_type=='transcript')&(gtf_df.tes.notnull())].gene_id.unique().tolist() - gene_ind = gtf_df.loc[(gtf_df.gene_id.isin(genes)&(gtf_df.entry_type=='gene'))].index.tolist() + genes = ( + gtf_df.loc[(gtf_df.strand == "+") & (gtf_df.entry_type == "transcript") & (gtf_df.tes.notnull())] + .gene_id.unique() + .tolist() + ) + gene_ind = gtf_df.loc[(gtf_df.gene_id.isin(genes) & (gtf_df.entry_type == "gene"))].index.tolist() if gene_ind: - fwd = gtf_df.loc[(gtf_df.strand == '+')&(gtf_df.entry_type=='transcript')] + fwd = gtf_df.loc[(gtf_df.strand == "+") & (gtf_df.entry_type == "transcript")] if test: - print('fwd') + print("fwd") print(gtf_df.loc[gene_ind]) - gtf_df.loc[gene_ind, 'stop'] = gtf_df.loc[gene_ind].apply(lambda x: \ - fwd.loc[fwd.gene_id==x.gene_id, 'stop'].max(), axis=1) + gtf_df.loc[gene_ind, "stop"] = gtf_df.loc[gene_ind].apply( + lambda x: fwd.loc[fwd.gene_id == x.gene_id, "stop"].max(), axis=1 + ) # rev: replace "start" of the gene with the minimum of the "starts" # gene_ind = gtf_df.loc[(gtf_df.strand == '-')&(gtf_df.entry_type=='gene')&(gtf_df.tes.notnull())].index.tolist() - genes = gtf_df.loc[(gtf_df.strand == '-')&(gtf_df.entry_type=='transcript')&(gtf_df.tes.notnull())].gene_id.unique().tolist() - gene_ind = gtf_df.loc[(gtf_df.gene_id.isin(genes)&(gtf_df.entry_type=='gene'))].index.tolist() + genes = ( + gtf_df.loc[(gtf_df.strand == "-") & (gtf_df.entry_type == "transcript") & (gtf_df.tes.notnull())] + .gene_id.unique() + .tolist() + ) + gene_ind = gtf_df.loc[(gtf_df.gene_id.isin(genes) & (gtf_df.entry_type == "gene"))].index.tolist() if gene_ind: - rev = gtf_df.loc[(gtf_df.strand == '-')&(gtf_df.entry_type=='transcript')] + rev = gtf_df.loc[(gtf_df.strand == "-") & (gtf_df.entry_type == "transcript")] if test: - print('rev') + print("rev") print(gtf_df.loc[gene_ind]) - gtf_df.loc[gene_ind, 'start'] = gtf_df.loc[gene_ind].apply(lambda x: \ - rev.loc[rev.gene_id==x.gene_id, 'start'].min(), axis=1) + gtf_df.loc[gene_ind, "start"] = gtf_df.loc[gene_ind].apply( + lambda x: rev.loc[rev.gene_id == x.gene_id, "start"].min(), axis=1 + ) # tss - elif how == 'tss': + elif how == "tss": # fwd: replace "start" of the gene with the minimum of the "starts" # gene_ind = gtf_df.loc[(gtf_df.strand == '+')&(gtf_df.entry_type=='gene')&(gtf_df.tss.notnull())].index.tolist() - genes = gtf_df.loc[(gtf_df.strand == 
'+')&(gtf_df.entry_type=='transcript')&(gtf_df.tss.notnull())].gene_id.unique().tolist() - gene_ind = gtf_df.loc[(gtf_df.gene_id.isin(genes)&(gtf_df.entry_type=='gene'))].index.tolist() + genes = ( + gtf_df.loc[(gtf_df.strand == "+") & (gtf_df.entry_type == "transcript") & (gtf_df.tss.notnull())] + .gene_id.unique() + .tolist() + ) + gene_ind = gtf_df.loc[(gtf_df.gene_id.isin(genes) & (gtf_df.entry_type == "gene"))].index.tolist() if gene_ind: - fwd = gtf_df.loc[(gtf_df.strand == '+')&(gtf_df.entry_type=='transcript')] - gtf_df.loc[gene_ind, 'start'] = gtf_df.loc[gene_ind].apply(lambda x: \ - fwd.loc[fwd.gene_id==x.gene_id, 'start'].min(), axis=1) + fwd = gtf_df.loc[(gtf_df.strand == "+") & (gtf_df.entry_type == "transcript")] + gtf_df.loc[gene_ind, "start"] = gtf_df.loc[gene_ind].apply( + lambda x: fwd.loc[fwd.gene_id == x.gene_id, "start"].min(), axis=1 + ) # rev: replace "stop" of the gene with the maximum of the "stops" # gene_ind = gtf_df.loc[(gtf_df.strand == '-')&(gtf_df.entry_type=='gene')&(gtf_df.tss.notnull())].index.tolist() - genes = gtf_df.loc[(gtf_df.strand == '-')&(gtf_df.entry_type=='transcript')&(gtf_df.tss.notnull())].gene_id.unique().tolist() - gene_ind = gtf_df.loc[(gtf_df.gene_id.isin(genes)&(gtf_df.entry_type=='gene'))].index.tolist() + genes = ( + gtf_df.loc[(gtf_df.strand == "-") & (gtf_df.entry_type == "transcript") & (gtf_df.tss.notnull())] + .gene_id.unique() + .tolist() + ) + gene_ind = gtf_df.loc[(gtf_df.gene_id.isin(genes) & (gtf_df.entry_type == "gene"))].index.tolist() if gene_ind: - rev = gtf_df.loc[(gtf_df.strand == '-')&(gtf_df.entry_type=='transcript')] - gtf_df.loc[gene_ind, 'stop'] = gtf_df.loc[gene_ind].apply(lambda x: \ - rev.loc[rev.gene_id==x.gene_id, 'stop'].max(), axis=1) + rev = gtf_df.loc[(gtf_df.strand == "-") & (gtf_df.entry_type == "transcript")] + gtf_df.loc[gene_ind, "stop"] = gtf_df.loc[gene_ind].apply( + lambda x: rev.loc[rev.gene_id == x.gene_id, "stop"].max(), axis=1 + ) if test: print() - print('After editing') - print(gtf_df[['transcript_id', 'entry_type', 'strand', 'start', 'stop', how]]) + print("After editing") + print(gtf_df[["transcript_id", "entry_type", "strand", "start", "stop", how]]) # cols=['chr', 'source', 'entry_type', \ # 'start', 'stop', 'score', 'strand',\ # 'frame', 'fields'] # gtf_df = gtf_df[cols] - gtf_df['start'] = gtf_df['start'].astype('int') - gtf_df['stop'] = gtf_df['stop'].astype('int') + gtf_df["start"] = gtf_df["start"].astype("int") + gtf_df["stop"] = gtf_df["stop"].astype("int") # if test: # fname = '{}_revised_{}_test.gtf'.format(opref, how) # else: @@ -223,12 +255,13 @@ def replace_gtf_end_coords(gtf_df, ends, how='tes', test=False, verbose=False): # gtf_df.to_csv(fname, sep='\t', header=None, index=False, quoting=csv.QUOTE_NONE) return gtf_df + # return a list of datasets from read_annot file # subset by a list of datasets given from a datasets file -def get_datasets_from_read_annot(df, datasets='all'): - if datasets != 'all': - dataset_df = pd.read_csv(datasets, header=None, names=['dataset']) - dataset_list = dataset_df['dataset'].tolist() +def get_datasets_from_read_annot(df, datasets="all"): + if datasets != "all": + dataset_df = pd.read_csv(datasets, header=None, names=["dataset"]) + dataset_list = dataset_df["dataset"].tolist() for d in dataset_list: if d not in df.dataset.unique().tolist(): raise ValueError("Dataset name {} not found in read_annot".format(d)) @@ -237,7 +270,6 @@ def get_datasets_from_read_annot(df, datasets='all'): def main(): - args = get_args() gtf = args.gtf annot = 
args.annot
@@ -252,9 +284,9 @@ def main():

     # read in read_annot file
     try:
-        df = pd.read_csv(annot, sep='\t')
+        df = pd.read_csv(annot, sep="\t")
     except:
-        raise Error('Problem loading read annot file {}'.format(annot))
+        raise RuntimeError("Problem loading read annot file {}".format(annot))

     # make sure datasets are valid
     # if datasets != 'all':
@@ -263,39 +295,42 @@ def main():
     datasets = get_datasets_from_read_annot(df, datasets)

     # read gtf
-    gtf_df = pd.read_csv(gtf, sep='\t', header=None, \
-                names=['chr', 'source', 'entry_type', \
-                    'start', 'stop', 'score', 'strand',\
-                    'frame', 'fields'], comment='#')
+    gtf_df = pd.read_csv(
+        gtf,
+        sep="\t",
+        header=None,
+        names=["chr", "source", "entry_type", "start", "stop", "score", "strand", "frame", "fields"],
+        comment="#",
+    )

     # get relevant values from fields
-    gtf_df['transcript_id'] = np.nan
-    gtf_df.loc[gtf_df.entry_type!='gene', 'transcript_id'] = gtf_df.loc[gtf_df.entry_type!='gene'].fields.str.split(pat='talon_transcript "', n=1, expand=True)[1]
-    gtf_df.loc[gtf_df.entry_type!='gene', 'transcript_id'] = gtf_df.loc[gtf_df.entry_type!='gene'].transcript_id.str.split(pat='"', n=1, expand=True)[0]
-    gtf_df['gene_id'] = gtf_df.loc[gtf_df.entry_type!='gene'].fields.str.split(pat='talon_gene "', n=1, expand=True)[1]
-    gtf_df['gene_id'] = gtf_df.loc[gtf_df.entry_type!='gene'].gene_id.str.split(pat='"', n=1, expand=True)[0]
+    gtf_df["transcript_id"] = np.nan
+    gtf_df.loc[gtf_df.entry_type != "gene", "transcript_id"] = gtf_df.loc[gtf_df.entry_type != "gene"].fields.str.split(
+        pat='talon_transcript "', n=1, expand=True
+    )[1]
+    gtf_df.loc[gtf_df.entry_type != "gene", "transcript_id"] = gtf_df.loc[
+        gtf_df.entry_type != "gene"
+    ].transcript_id.str.split(pat='"', n=1, expand=True)[0]
+    gtf_df["gene_id"] = gtf_df.loc[gtf_df.entry_type != "gene"].fields.str.split(pat='talon_gene "', n=1, expand=True)[
+        1
+    ]
+    gtf_df["gene_id"] = gtf_df.loc[gtf_df.entry_type != "gene"].gene_id.str.split(pat='"', n=1, expand=True)[0]

     # first, call ends from the read annot file
-    if mode == 'both':
-
+    if mode == "both":
         # tss first
-        ends = get_longest_ends(df, how='tss', novelty=novelty, datasets=datasets)
-        gtf_df = replace_gtf_end_coords(gtf_df, ends,
-            how='tss', verbose=verbose)
+        ends = get_longest_ends(df, how="tss", novelty=novelty, datasets=datasets)
+        gtf_df = replace_gtf_end_coords(gtf_df, ends, how="tss", verbose=verbose)

         # tes
-        ends = get_longest_ends(df, how='tes', novelty=novelty, datasets=datasets)
-        gtf_df = replace_gtf_end_coords(gtf_df, ends,
-            how='tes', verbose=verbose)
+        ends = get_longest_ends(df, how="tes", novelty=novelty, datasets=datasets)
+        gtf_df = replace_gtf_end_coords(gtf_df, ends, how="tes", verbose=verbose)
     else:
         ends = get_longest_ends(df, how=mode, novelty=novelty, datasets=datasets)
-        gtf_df = replace_gtf_end_coords(gtf_df, ends,
-            how=mode, verbose=verbose)
+        gtf_df = replace_gtf_end_coords(gtf_df, ends, how=mode, verbose=verbose)

-    cols=['chr', 'source', 'entry_type', \
-        'start', 'stop', 'score', 'strand',\
-        'frame', 'fields']
+    cols = ["chr", "source", "entry_type", "start", "stop", "score", "strand", "frame", "fields"]
     gtf_df = gtf_df[cols]

-    fname = '{}_revised_{}.gtf'.format(opref, mode)
-    gtf_df.to_csv(fname, sep='\t', header=None, index=False, quoting=csv.QUOTE_NONE)
+    fname = "{}_revised_{}.gtf".format(opref, mode)
+    gtf_df.to_csv(fname, sep="\t", header=None, index=False, quoting=csv.QUOTE_NONE)
diff --git a/src/talon/post/create_GTF_abundance_from_database.py b/src/talon/post/create_GTF_abundance_from_database.py
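[Editor's note] The diff below reformats create_GTF_abundance_from_database.py, a wrapper that drives three other TALON post-processing tools by rebuilding sys.argv and then calling each imported entry point (create_abundance_file_main, filter_transcripts_main, gtf_from_db_main). A minimal sketch of that invocation pattern follows; stand_in_main and the option values are hypothetical placeholders, not TALON code:

    import shlex
    import sys

    def stand_in_main():
        # Hypothetical stand-in for an imported entry point such as
        # create_abundance_file_main; the real TALON mains parse their
        # options from sys.argv after it has been rebuilt below.
        print(sys.argv)

    # Rebuild argv as if the tool had been launched from the command line
    arguments = "--db talon.db --annot gencode_v29 --build hg38 --o my_prefix"
    sys.argv = ["create_abundance_file_from_database.py"] + shlex.split(arguments)
    stand_in_main()

As the script's own TODO notes, passing arguments to a function directly would be cleaner than mutating sys.argv; the sketch only mirrors the script's current approach.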
index 6f25510..3943f1f 100644 --- a/src/talon/post/create_GTF_abundance_from_database.py +++ b/src/talon/post/create_GTF_abundance_from_database.py @@ -2,45 +2,60 @@ # Author: Dana Wyman # ----------------------------------------------------------------------------- # create_GTF_abundance_from_database.py is designed to generate a GTF -# as well as an abundance file with the same filtering options. +# as well as an abundance file with the same filtering options. -from optparse import OptionParser -import subprocess import os -import sys import shlex +import subprocess +import sys +from optparse import OptionParser + from .create_abundance_file_from_database import main as create_abundance_file_main -from .filter_talon_transcripts import main as filter_transcripts_main from .create_GTF_from_database import main as gtf_from_db_main +from .filter_talon_transcripts import main as filter_transcripts_main -parser = OptionParser(description="""A script to generate a GTF and abundance file - with the same filtering options.""") +parser = OptionParser( + description="""A script to generate a GTF and abundance file + with the same filtering options.""" +) -parser.add_option("--db", dest = "database", - help = "TALON database", metavar = "FILE", type = "string") -parser.add_option("--annot", "-a", dest = "annot", - help = """Which annotation version to use. Will determine which +parser.add_option("--db", dest="database", help="TALON database", metavar="FILE", type="string") +parser.add_option( + "--annot", + "-a", + dest="annot", + help="""Which annotation version to use. Will determine which annotation transcripts are considered known or novel relative to. Note: must be in the TALON database.""", - type = "string") -parser.add_option("--build", "-b", dest = "build", - help = "Genome build to use. Note: must be in the TALON database.", - type = "string") -parser.add_option("--filter", dest ="filtering", action='store_true', - help = "If this option is set, the transcripts in the \ + type="string", +) +parser.add_option( + "--build", "-b", dest="build", help="Genome build to use. Note: must be in the TALON database.", type="string" +) +parser.add_option( + "--filter", + dest="filtering", + action="store_true", + help="If this option is set, the transcripts in the \ database will be filtered prior to GTF creation \ - (for more information, see filter_talon_transcripts.py)") -parser.add_option("--pairings", "-p", dest = "pairings_file", - help = """Optional (only relevant if filter = true): A file indicating + (for more information, see filter_talon_transcripts.py)", +) +parser.add_option( + "--pairings", + "-p", + dest="pairings_file", + help="""Optional (only relevant if filter = true): A file indicating which datasets should be considered together when filtering novel transcripts (i.e. biological replicates). Format: Each line of the file constitutes a group, with member datasets separated by commas. 
If no file is provided, then novel transcripts appearing in any two datasets will be accepted.""", - metavar = "FILE", type = "string", default = None) -parser.add_option("--o", dest = "outprefix", help = "Prefix for output file", - metavar = "FILE", type = "string") + metavar="FILE", + type="string", + default=None, +) +parser.add_option("--o", dest="outprefix", help="Prefix for output file", metavar="FILE", type="string") (opt, args) = parser.parse_args() @@ -51,47 +66,43 @@ # make abundance file db = opt.database annot = opt.annot -build = opt.build +build = opt.build o = opt.outprefix filtering = opt.filtering pairings = opt.pairings_file -create_abundance_file_arguments = ( - "--db {} --annot {} --build {} --o {}".format(tpath, db, annot, build, o)) +create_abundance_file_arguments = "--db {} --annot {} --build {} --o {}".format(tpath, db, annot, build, o) if filtering: - create_abundance_file_arguments+= ' --filter' - if pairings != None: - create_abundance_file_arguments+= ' --pairings {}'.format(pairings) + create_abundance_file_arguments += " --filter" + if pairings != None: + create_abundance_file_arguments += " --pairings {}".format(pairings) # TODO: Call a function with argument instead of using argv and calling main -sys.argv = ["create_abundance_file_from_database.py"] + shlex.split( - create_abundance_file_arguments) +sys.argv = ["create_abundance_file_from_database.py"] + shlex.split(create_abundance_file_arguments) create_abundance_file_main() # make whitelist file for GTF if filtering: - outfile = o+'_whitelist' - filter_arguments = "--db {} --annot {} --o {}".format( - tpath, db, annot, outfile) - if pairings != None: - filter_arguments+=' --pairings {}'.format(pairings) - sys.argv = ["filter_talon_transcripts.py"] + shlex.split(filter_arguments) - filter_transcripts_main() + outfile = o + "_whitelist" + filter_arguments = "--db {} --annot {} --o {}".format(tpath, db, annot, outfile) + if pairings != None: + filter_arguments += " --pairings {}".format(pairings) + sys.argv = ["filter_talon_transcripts.py"] + shlex.split(filter_arguments) + filter_transcripts_main() # make GTF -gtf_from_db_arguments = "--db {} --build {} --annot {} --o {}".format( - tpath, db, build, annot, o) +gtf_from_db_arguments = "--db {} --build {} --annot {} --o {}".format(tpath, db, build, annot, o) if filtering: - gtf_from_db_arguments +=' --whitelist {}'.format(outfile) + gtf_from_db_arguments += " --whitelist {}".format(outfile) else: - gtf_from_db_arguments +=' --observed' - # pfile = open(pairings, 'r') - # pairing_str = pfile.read() - # pfile.close() - # pairing_str.replace(',', '\n') - # ofile = o+'_datasets' - # ofile = open(ofile, 'w') - # ofile.write(pairing_str) - # cmd+=' --datasets {}'.format(o+'_datasets') + gtf_from_db_arguments += " --observed" + # pfile = open(pairings, 'r') + # pairing_str = pfile.read() + # pfile.close() + # pairing_str.replace(',', '\n') + # ofile = o+'_datasets' + # ofile = open(ofile, 'w') + # ofile.write(pairing_str) + # cmd+=' --datasets {}'.format(o+'_datasets') sys.argv = ["create_GTF_from_database.py"] + shlex.split(gtf_from_db_arguments) gtf_from_db_main() diff --git a/src/talon/post/create_GTF_from_database.py b/src/talon/post/create_GTF_from_database.py index 922398a..b72af34 100644 --- a/src/talon/post/create_GTF_from_database.py +++ b/src/talon/post/create_GTF_from_database.py @@ -7,76 +7,94 @@ import copy import itertools import operator -from optparse import OptionParser import sqlite3 - -from . import post_utils as putils -from . 
import ab_utils as autils +from optparse import OptionParser from pathlib import Path from .. import query_utils as qutils +from . import ab_utils as autils +from . import post_utils as putils + def getOptions(): parser = OptionParser() - parser.add_option("--db", dest = "database", - help = "TALON database", metavar = "FILE", type = "string") + parser.add_option("--db", dest="database", help="TALON database", metavar="FILE", type="string") - parser.add_option("--build", "-b", dest = "build", - help = "Genome build to use. Note: must be in the TALON database.", - type = "string") + parser.add_option( + "--build", "-b", dest="build", help="Genome build to use. Note: must be in the TALON database.", type="string" + ) - parser.add_option("--annot", "-a", dest = "annot", - help = """Which annotation version to use. Will determine which + parser.add_option( + "--annot", + "-a", + dest="annot", + help="""Which annotation version to use. Will determine which annotation transcripts are considered known or novel relative to. Note: must be in the TALON database.""", - type = "string") + type="string", + ) - parser.add_option("--whitelist", dest = "whitelist", - help = "Whitelist file of transcripts to include in the \ + parser.add_option( + "--whitelist", + dest="whitelist", + help="Whitelist file of transcripts to include in the \ output. First column should be TALON gene ID, \ second column should be TALON transcript ID", - metavar = "FILE", type = "string", default = None) - - parser.add_option("--observed", dest ="observed", action='store_true', - help = "If this option is set, the GTF file will only \ + metavar="FILE", + type="string", + default=None, + ) + + parser.add_option( + "--observed", + dest="observed", + action="store_true", + help="If this option is set, the GTF file will only \ include transcripts that were observed in at least one \ - dataset (redundant if dataset file provided).") - - parser.add_option("--datasets", "-d", dest = "datasets_file", - help = """Optional: A file indicating which datasets should be + dataset (redundant if dataset file provided).", + ) + + parser.add_option( + "--datasets", + "-d", + dest="datasets_file", + help="""Optional: A file indicating which datasets should be included (one dataset name per line). Default is to include all datasets.""", - metavar = "FILE", type = "string", default = None) - - parser.add_option("--o", dest = "outprefix", help = "Prefix for output GTF", - metavar = "FILE", type = "string") + metavar="FILE", + type="string", + default=None, + ) + parser.add_option("--o", dest="outprefix", help="Prefix for output GTF", metavar="FILE", type="string") (options, args) = parser.parse_args() return options + def create_outname(options): - """ Creates filename for the output GTF that reflects the input options that - were used. """ + """Creates filename for the output GTF that reflects the input options that + were used.""" outname = options.outprefix + "_talon" if options.observed == True: - outname = "_".join([ outname, "observedOnly" ]) + outname = "_".join([outname, "observedOnly"]) outname += ".gtf" return outname -def get_annotations(database, feat_type, annot, whitelist = None): - """ Extracts annotations from the gene/transcript/exon annotation table of - the database (depending on choice of feat_type). Limited to rows where - the annot_name column matches the value of annot. 
- Returns: - annotation_dict: dictionary data structure in which the keys are - gene/transcript/exon TALON IDs (depending on - choice of feat_type) and the value is a list of - annotation tuples. +def get_annotations(database, feat_type, annot, whitelist=None): + """Extracts annotations from the gene/transcript/exon annotation table of + the database (depending on choice of feat_type). Limited to rows where + the annot_name column matches the value of annot. + + Returns: + annotation_dict: dictionary data structure in which the keys are + gene/transcript/exon TALON IDs (depending on + choice of feat_type) and the value is a list of + annotation tuples. """ # Fetch the annotations conn = sqlite3.connect(database) @@ -85,12 +103,17 @@ def get_annotations(database, feat_type, annot, whitelist = None): table_name = feat_type + "_annotations" if whitelist == None: - query = "SELECT * FROM " + table_name + " WHERE annot_name = '" + annot + \ - "' OR source = 'TALON'" + query = "SELECT * FROM " + table_name + " WHERE annot_name = '" + annot + "' OR source = 'TALON'" else: - whitelist_string = "(" + ','.join([str(x) for x in whitelist]) + ")" - query = "SELECT * FROM " + table_name + " WHERE (annot_name = '" + annot + \ - "' OR source = 'TALON') AND ID IN " + whitelist_string + whitelist_string = "(" + ",".join([str(x) for x in whitelist]) + ")" + query = ( + "SELECT * FROM " + + table_name + + " WHERE (annot_name = '" + + annot + + "' OR source = 'TALON') AND ID IN " + + whitelist_string + ) cursor.execute(query) annotation_tuples = cursor.fetchall() @@ -100,29 +123,31 @@ def get_annotations(database, feat_type, annot, whitelist = None): # Group by ID and store in a dictionary ID_groups = {} - for key,group in itertools.groupby(sorted_annotations,operator.itemgetter(0)): + for key, group in itertools.groupby(sorted_annotations, operator.itemgetter(0)): ID_groups[key] = list(group) return ID_groups + def get_gene_2_transcripts(database, genome_build, whitelist): - """ Creates a dictionary mapping gene IDs to the transcripts that belong to - them. The columns in each tuple are: - 0: gene ID - 1: transcript ID - 2: chromosome - 3: start position (min of 5' and 3') - 4: end position (max of 5' and 3') - 5: strand - 6: edge path - 7. n_exons - """ + """Creates a dictionary mapping gene IDs to the transcripts that belong to + them. The columns in each tuple are: + 0: gene ID + 1: transcript ID + 2: chromosome + 3: start position (min of 5' and 3') + 4: end position (max of 5' and 3') + 5: strand + 6: edge path + 7. 
n_exons + """ conn = sqlite3.connect(database) conn.row_factory = sqlite3.Row cursor = conn.cursor() - whitelist_string = "(" + ','.join([str(x) for x in whitelist]) + ")" - query = """ + whitelist_string = "(" + ",".join([str(x) for x in whitelist]) + ")" + query = ( + """ SELECT t.gene_ID, t.transcript_ID, @@ -138,9 +163,14 @@ def get_gene_2_transcripts(database, genome_build, whitelist): LEFT JOIN location loc1 ON t.start_vertex = loc1.location_ID LEFT JOIN location loc2 ON t.end_vertex = loc2.location_ID LEFT JOIN genes ON t.gene_ID = genes.gene_ID - WHERE loc1.genome_build = '""" + genome_build + """' AND - loc2.genome_build = '""" + genome_build + \ - """' AND t.transcript_ID IN """ + whitelist_string + WHERE loc1.genome_build = '""" + + genome_build + + """' AND + loc2.genome_build = '""" + + genome_build + + """' AND t.transcript_ID IN """ + + whitelist_string + ) cursor.execute(query) transcript_tuples = cursor.fetchall() @@ -148,21 +178,23 @@ def get_gene_2_transcripts(database, genome_build, whitelist): sorted_transcript_tuples = sorted(transcript_tuples, key=lambda x: x["gene_ID"]) gene_groups = {} - for key,group in itertools.groupby(sorted_transcript_tuples,operator.itemgetter(0)): + for key, group in itertools.groupby(sorted_transcript_tuples, operator.itemgetter(0)): # Sort by transcript start position gene_groups[key] = sorted(list(group), key=lambda x: x["min_pos"]) conn.close() return gene_groups + def fetch_exon_locations(database, genome_build): - """ Queries the database to create a dictionary mapping exon IDs to - the chromosome, start, end, and strand of the exon """ + """Queries the database to create a dictionary mapping exon IDs to + the chromosome, start, end, and strand of the exon""" conn = sqlite3.connect(database) cursor = conn.cursor() - query = """ + query = ( + """ SELECT e.edge_ID, loc1.chromosome, @@ -172,9 +204,13 @@ def fetch_exon_locations(database, genome_build): FROM edge e LEFT JOIN location loc1 ON e.v1 = loc1.location_ID LEFT JOIN location loc2 ON e.v2 = loc2.location_ID - WHERE loc1.genome_build = '""" + genome_build + """' AND - loc2.genome_build = '""" + genome_build + \ - """' AND e.edge_type = 'exon';""" + WHERE loc1.genome_build = '""" + + genome_build + + """' AND + loc2.genome_build = '""" + + genome_build + + """' AND e.edge_type = 'exon';""" + ) cursor.execute(query) exon_location_tuples = cursor.fetchall() @@ -188,34 +224,30 @@ def fetch_exon_locations(database, genome_build): conn.close() return exon_locations -def create_gtf(database, annot, genome_build, whitelist, outfile): +def create_gtf(database, annot, genome_build, whitelist, outfile): # Create separate gene and transcript whitelists gene_whitelist = [] transcript_whitelist = [] - for key,group in itertools.groupby(whitelist,operator.itemgetter(0)): + for key, group in itertools.groupby(whitelist, operator.itemgetter(0)): gene_whitelist.append(key) for id_tuple in list(group): transcript_whitelist.append(id_tuple[1]) # Get gene, transcript, and exon annotations - gene_annotations = get_annotations(database, "gene", annot, - whitelist = gene_whitelist) - transcript_annotations = get_annotations(database, "transcript", annot, - whitelist = transcript_whitelist) + gene_annotations = get_annotations(database, "gene", annot, whitelist=gene_whitelist) + transcript_annotations = get_annotations(database, "transcript", annot, whitelist=transcript_whitelist) exon_annotations = get_annotations(database, "exon", annot) - # Get transcript data from the database - gene_2_transcripts = 
get_gene_2_transcripts(database, genome_build, - transcript_whitelist) + gene_2_transcripts = get_gene_2_transcripts(database, genome_build, transcript_whitelist) # Get exon location info from database exon_ID_2_location = fetch_exon_locations(database, genome_build) # ------------------------------------------------------------- - o = open(outfile, 'w') + o = open(outfile, "w") # Create a GTF entry for every gene for gene_ID, transcript_tuples in gene_2_transcripts.items(): @@ -225,8 +257,7 @@ def create_gtf(database, annot, genome_build, whitelist, outfile): attribute = annot[3] value = annot[4] gene_annotation_dict[attribute] = value - gene_GTF_line = get_gene_GTF_entry(gene_ID, transcript_tuples, - copy.copy(gene_annotation_dict)) + gene_GTF_line = get_gene_GTF_entry(gene_ID, transcript_tuples, copy.copy(gene_annotation_dict)) o.write(gene_GTF_line + "\n") # Create a GTF entry for every transcript of this gene @@ -239,14 +270,16 @@ def create_gtf(database, annot, genome_build, whitelist, outfile): attribute = annot[3] value = annot[4] transcript_annotation_dict[attribute] = value - transcript_GTF_line = get_transcript_GTF_entry(transcript_entry, - copy.copy(gene_annotation_dict), - copy.copy(transcript_annotation_dict)) + transcript_GTF_line = get_transcript_GTF_entry( + transcript_entry, copy.copy(gene_annotation_dict), copy.copy(transcript_annotation_dict) + ) o.write(transcript_GTF_line + "\n") if transcript_entry["n_exons"] != 1: - transcript_edges = [str(transcript_entry["start_exon"])] + \ - str(transcript_entry["jn_path"]).split(",")+ \ - [str(transcript_entry["end_exon"])] + transcript_edges = ( + [str(transcript_entry["start_exon"])] + + str(transcript_entry["jn_path"]).split(",") + + [str(transcript_entry["end_exon"])] + ) else: transcript_edges = [transcript_entry["start_exon"]] @@ -262,30 +295,35 @@ def create_gtf(database, annot, genome_build, whitelist, outfile): value = annot[4] exon_annotation_dict[attribute] = value - - exon_GTF_line = get_exon_GTF_entry(gene_ID, transcript_ID, - exon_ID, exon_num, - exon_ID_2_location, - copy.copy(gene_annotation_dict), - copy.copy(transcript_annotation_dict), - exon_annotation_dict) + exon_GTF_line = get_exon_GTF_entry( + gene_ID, + transcript_ID, + exon_ID, + exon_num, + exon_ID_2_location, + copy.copy(gene_annotation_dict), + copy.copy(transcript_annotation_dict), + exon_annotation_dict, + ) o.write(exon_GTF_line + "\n") exon_num += 1 o.close() return + def make_descriptor_string(attribute, value): - """ Create a key-value string to form part of a GTF entry. - Example: gene_id and ENSG00000117676.13 - becomes - gene_id "ENSG00000117676.13"; + """Create a key-value string to form part of a GTF entry. 
+ Example: gene_id and ENSG00000117676.13 + becomes + gene_id "ENSG00000117676.13"; """ return str(attribute) + ' "' + str(value) + '";' + def format_GTF_tag_values_for_gene(gene_ID, annotation_dict): - """ Parses the annotations for this gene, and supplements them where - necessary for novel transcripts """ + """Parses the annotations for this gene, and supplements them where + necessary for novel transcripts""" attributes = [] @@ -323,15 +361,15 @@ def format_GTF_tag_values_for_gene(gene_ID, annotation_dict): attributes.append(make_descriptor_string("talon_gene", gene_ID)) # Add any remaining annotations - for attribute,value in sorted(annotation_dict.items()): + for attribute, value in sorted(annotation_dict.items()): attributes.append(make_descriptor_string(attribute, value)) return attributes -def format_GTF_tag_values_for_transcript(gene_ID, transcript_ID, gene_annot_dict, - transcript_annot_dict): - """ Parses the annotations for this transcript, and supplements them where - necessary for novel transcripts """ + +def format_GTF_tag_values_for_transcript(gene_ID, transcript_ID, gene_annot_dict, transcript_annot_dict): + """Parses the annotations for this transcript, and supplements them where + necessary for novel transcripts""" attributes = [] @@ -388,16 +426,17 @@ def format_GTF_tag_values_for_transcript(gene_ID, transcript_ID, gene_annot_dict attributes.append(make_descriptor_string("talon_transcript", transcript_ID)) # Add any remaining annotations - for attribute,value in sorted(transcript_annot_dict.items()): + for attribute, value in sorted(transcript_annot_dict.items()): attributes.append(make_descriptor_string(attribute, value)) return attributes -def format_GTF_tag_values_for_exon(gene_ID, transcript_ID, exon_ID, exon_number, - gene_annot_dict, transcript_annot_dict, - exon_annot_dict): - """ Parses the annotations for this exon, and supplements them where - necessary for novel exons """ + +def format_GTF_tag_values_for_exon( + gene_ID, transcript_ID, exon_ID, exon_number, gene_annot_dict, transcript_annot_dict, exon_annot_dict +): + """Parses the annotations for this exon, and supplements them where + necessary for novel exons""" attributes = [] @@ -469,13 +508,14 @@ def format_GTF_tag_values_for_exon(gene_ID, transcript_ID, exon_ID, exon_number, exon_annot_dict.pop("exon_number") # Add any remaining annotations - for attribute,value in sorted(exon_annot_dict.items()): + for attribute, value in sorted(exon_annot_dict.items()): attributes.append(make_descriptor_string(attribute, value)) return attributes + def get_gene_GTF_entry(gene_ID, associated_transcript_tuples, annotation_dict): - """ Creates a GTF annotation entry for the given gene """ + """Creates a GTF annotation entry for the given gene""" if "source" in annotation_dict: source = annotation_dict["source"] @@ -492,13 +532,12 @@ def get_gene_GTF_entry(gene_ID, associated_transcript_tuples, annotation_dict): frame = "." 
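# [Editor's sketch -- illustrative only, not part of the patch] Below, the
# attribute descriptors assembled above are space-joined, and the nine GTF
# columns (chromosome through attributes) are then tab-joined into a single
# gene entry. With assumed example values, the end result looks like:
example_fields = ["chr1", "TALON", "gene", "100", "900", ".", "+", ".",
                  'gene_id "ENSG00000117676.13"; talon_gene "1";']
example_gtf_line = "\t".join(example_fields)  # one syntactically valid GTF gene line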
attributes = " ".join(format_GTF_tag_values_for_gene(gene_ID, annotation_dict)) - GTF = '\t'.join([chromosome, source, feature, start, end, score, strand, - frame, attributes]) + GTF = "\t".join([chromosome, source, feature, start, end, score, strand, frame, attributes]) return GTF def get_transcript_GTF_entry(transcript_entry, curr_gene_annot_dict, curr_transcript_annot_dict): - """ Creates a GTF annotation entry for the given transcript """ + """Creates a GTF annotation entry for the given transcript""" if "source" in curr_transcript_annot_dict: source = curr_transcript_annot_dict["source"] @@ -516,19 +555,25 @@ def get_transcript_GTF_entry(transcript_entry, curr_gene_annot_dict, curr_transc score = "." strand = transcript_entry["strand"] frame = "." - attributes = " ".join(format_GTF_tag_values_for_transcript(gene_ID, - transcript_ID, - curr_gene_annot_dict, - curr_transcript_annot_dict)) + attributes = " ".join( + format_GTF_tag_values_for_transcript(gene_ID, transcript_ID, curr_gene_annot_dict, curr_transcript_annot_dict) + ) - GTF = '\t'.join([chromosome, source, feature, start, end, score, strand, - frame, attributes]) + GTF = "\t".join([chromosome, source, feature, start, end, score, strand, frame, attributes]) return GTF -def get_exon_GTF_entry(gene_ID, transcript_ID, exon_ID, exon_num, exon_ID_2_location, - curr_gene_annot_dict, curr_transcript_annot_dict, - curr_exon_annot_dict): - """ Creates a GTF annotation entry for the given exon """ + +def get_exon_GTF_entry( + gene_ID, + transcript_ID, + exon_ID, + exon_num, + exon_ID_2_location, + curr_gene_annot_dict, + curr_transcript_annot_dict, + curr_exon_annot_dict, +): + """Creates a GTF annotation entry for the given exon""" if "source" in curr_exon_annot_dict: source = curr_exon_annot_dict["source"] @@ -543,17 +588,22 @@ def get_exon_GTF_entry(gene_ID, transcript_ID, exon_ID, exon_num, exon_ID_2_loca score = "." strand = curr_exon_location[3] frame = "." - attributes = " ".join(format_GTF_tag_values_for_exon(gene_ID, - transcript_ID, - exon_ID, exon_num, - curr_gene_annot_dict, - curr_transcript_annot_dict, - curr_exon_annot_dict)) - - GTF = '\t'.join([chromosome, source, feature, start, end, score, strand, - frame, attributes]) + attributes = " ".join( + format_GTF_tag_values_for_exon( + gene_ID, + transcript_ID, + exon_ID, + exon_num, + curr_gene_annot_dict, + curr_transcript_annot_dict, + curr_exon_annot_dict, + ) + ) + + GTF = "\t".join([chromosome, source, feature, start, end, score, strand, frame, attributes]) return GTF + # def check_annot_validity(annot, database): # """ Make sure that the user has entered a correct annotation name """ # @@ -605,6 +655,7 @@ def get_exon_GTF_entry(gene_ID, transcript_ID, exon_ID, exon_num, exon_ID_2_loca # # return + def main(): options = getOptions() database = options.database @@ -622,18 +673,13 @@ def main(): if not Path(database).exists(): raise ValueError("Database file '%s' does not exist!" 
% database) - # Determine which transcripts to include - whitelist = putils.handle_filtering(database, - annot, - observed, - whitelist_file, - dataset_file) + whitelist = putils.handle_filtering(database, annot, observed, whitelist_file, dataset_file) # Sort on gene ID sorted_whitelist = sorted(whitelist, key=lambda x: x[0]) create_gtf(database, annot, build, whitelist, outfile) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/src/talon/post/create_abundance_file_from_database.py b/src/talon/post/create_abundance_file_from_database.py index 73a3b39..1447d20 100644 --- a/src/talon/post/create_abundance_file_from_database.py +++ b/src/talon/post/create_abundance_file_from_database.py @@ -5,67 +5,81 @@ # for each transcript in the TALON database across datasets. Modified by # filtering option. -import sqlite3 import itertools import operator +import sqlite3 from optparse import OptionParser from pathlib import Path -from . import filter_talon_transcripts as filt from .. import dstruct as dstruct from .. import length_utils as lu -from . import post_utils as putils -from . import ab_utils as autils from .. import query_utils as qutils from .. import talon as talon +from . import ab_utils as autils +from . import filter_talon_transcripts as filt +from . import post_utils as putils def getOptions(): parser = OptionParser() - parser.add_option("--db", dest = "database", - help = "TALON database", metavar = "FILE", type = "string") + parser.add_option("--db", dest="database", help="TALON database", metavar="FILE", type="string") - parser.add_option("--annot", "-a", dest = "annot", - help = """Which annotation version to use. Will determine which + parser.add_option( + "--annot", + "-a", + dest="annot", + help="""Which annotation version to use. Will determine which annotation transcripts are considered known or novel relative to. Note: must be in the TALON database.""", - type = "string") + type="string", + ) - parser.add_option("--whitelist", dest = "whitelist", - help = "Whitelist file of transcripts to include in the \ + parser.add_option( + "--whitelist", + dest="whitelist", + help="Whitelist file of transcripts to include in the \ output. First column should be TALON gene ID, \ second column should be TALON transcript ID", - metavar = "FILE", type = "string", default = None) - - parser.add_option("--build", "-b", dest = "build", - help = "Genome build to use. Note: must be in the TALON database.", - type = "string") - - parser.add_option("--datasets", "-d", dest = "datasets_file", - help = """Optional: A file indicating which datasets should be + metavar="FILE", + type="string", + default=None, + ) + + parser.add_option( + "--build", "-b", dest="build", help="Genome build to use. Note: must be in the TALON database.", type="string" + ) + + parser.add_option( + "--datasets", + "-d", + dest="datasets_file", + help="""Optional: A file indicating which datasets should be included (one dataset name per line). Default is to include all datasets.""", - metavar = "FILE", type = "string", default = None) - - parser.add_option("--o", dest = "outprefix", help = "Prefix for output file", - metavar = "FILE", type = "string") + metavar="FILE", + type="string", + default=None, + ) + parser.add_option("--o", dest="outprefix", help="Prefix for output file", metavar="FILE", type="string") (options, args) = parser.parse_args() return options + def create_outname(options): - """ Creates filename for the output abundance that reflects the input options that - were used. 
""" + """Creates filename for the output abundance that reflects the input options that + were used.""" outname = options.outprefix + "_talon_abundance" if options.whitelist != None: - outname = "_".join([ outname, "filtered" ]) + outname = "_".join([outname, "filtered"]) outname += ".tsv" return outname + # def fetch_dataset_list(dataset_file, database): # """ Gets a list of all datasets in the database """ # @@ -90,10 +104,11 @@ def create_outname(options): # # return datasets + def create_abundance_dict(database, datasets): """Process the abundance table by dataset in order to create a dictionary - data structure organized like this: - transcript_ID -> dataset -> abundance in that dataset + data structure organized like this: + transcript_ID -> dataset -> abundance in that dataset """ abundance = {} @@ -102,8 +117,11 @@ def create_abundance_dict(database, datasets): cursor = conn.cursor() for dataset in datasets: - query = """ SELECT transcript_ID, count FROM abundance - WHERE dataset = '%s' """ % dataset + query = ( + """ SELECT transcript_ID, count FROM abundance + WHERE dataset = '%s' """ + % dataset + ) cursor.execute(query) for transcript in cursor.fetchall(): @@ -119,18 +137,19 @@ def create_abundance_dict(database, datasets): conn.close() return abundance + def fetch_abundances(database, datasets, annot, whitelist): """Constructs a query to get the following information for every - whitelisted transcript: - 1) TALON gene ID - 2) TALON transcript ID - 3) Gene ID (from annotation specified in 'annot', None otherwise) - 4) Transcript ID (from annotation specified in 'annot', None otherwise) - 5) Gene name (from annotation specified in 'annot', None otherwise) - 6) Transcript name (from annotation specified in 'annot', None otherwise) - 7) number of exons in transcript - - Returns a list of tuples (one tuple per transcript) + whitelisted transcript: + 1) TALON gene ID + 2) TALON transcript ID + 3) Gene ID (from annotation specified in 'annot', None otherwise) + 4) Transcript ID (from annotation specified in 'annot', None otherwise) + 5) Gene name (from annotation specified in 'annot', None otherwise) + 6) Transcript name (from annotation specified in 'annot', None otherwise) + 7) number of exons in transcript + + Returns a list of tuples (one tuple per transcript) """ # datasets = fetch_dataset_list(database) @@ -149,7 +168,7 @@ def fetch_abundances(database, datasets, annot, whitelist): conn.row_factory = sqlite3.Row cursor = conn.cursor() - whitelist_string = "WHERE t.transcript_ID IN (" + ','.join(whitelist) + ");" + whitelist_string = "WHERE t.transcript_ID IN (" + ",".join(whitelist) + ");" name_status_query = """ FROM transcripts t @@ -165,7 +184,12 @@ def fetch_abundances(database, datasets, annot, whitelist): LEFT JOIN transcript_annotations ta_name ON t.transcript_ID = ta_name.ID AND ta_name.annot_name = '%s' AND ta_name.attribute = 'transcript_name' - """ % (annot, annot, annot, annot) + """ % ( + annot, + annot, + annot, + annot, + ) full_query = "\n".join([col_query, name_status_query, whitelist_string]) @@ -201,11 +225,11 @@ def fetch_abundances(database, datasets, annot, whitelist): return final_abundance, colnames -def write_abundance_file(abundances, col_names, prefix, n_places, datasets, - novelty_types, transcript_lengths, outfile): - """ Writes abundances and metadata to an output file """ - o = open(outfile, 'w') +def write_abundance_file(abundances, col_names, prefix, n_places, datasets, novelty_types, transcript_lengths, outfile): + """Writes abundances and 
metadata to an output file""" + + o = open(outfile, "w") novelty_type_cols = ["gene_novelty", "transcript_novelty", "ISM_subtype"] @@ -224,22 +248,24 @@ def write_abundance_file(abundances, col_names, prefix, n_places, datasets, annot_transcript_ID_index = all_colnames.index("annot_transcript_id") gene_name_index = all_colnames.index("annot_gene_name") transcript_name_index = all_colnames.index("annot_transcript_name") - dataset_indices = [i for i,s in enumerate(all_colnames) if s in set(datasets)] + dataset_indices = [i for i, s in enumerate(all_colnames) if s in set(datasets)] # Iterate over abundances, fixing Nones, and write to file for transcript in abundances: - curr_novelty = get_gene_and_transcript_novelty_types(transcript[gene_ID_index], - transcript[transcript_ID_index], - novelty_types) + curr_novelty = get_gene_and_transcript_novelty_types( + transcript[gene_ID_index], transcript[transcript_ID_index], novelty_types + ) transcript = list(transcript) - transcript = transcript[0:first_dataset_index] + \ - [transcript_lengths[transcript[transcript_ID_index]]] + \ - [ curr_novelty[x] for x in novelty_type_cols] + \ - transcript[first_dataset_index:] + transcript = ( + transcript[0:first_dataset_index] + + [transcript_lengths[transcript[transcript_ID_index]]] + + [curr_novelty[x] for x in novelty_type_cols] + + transcript[first_dataset_index:] + ) - alt_gene_name, alt_transcript_name = talon.construct_names(transcript[gene_ID_index], \ - transcript[transcript_ID_index], \ - prefix, n_places) + alt_gene_name, alt_transcript_name = talon.construct_names( + transcript[gene_ID_index], transcript[transcript_ID_index], prefix, n_places + ) if transcript[annot_gene_ID_index] == None: transcript[annot_gene_ID_index] = alt_gene_name @@ -263,8 +289,8 @@ def write_abundance_file(abundances, col_names, prefix, n_places, datasets, def get_gene_and_transcript_novelty_types(gene_ID, transcript_ID, novelty_type): - """ Look up gene and transcript IDs in data structure to determine which types - of novelty are present """ + """Look up gene and transcript IDs in data structure to determine which types + of novelty are present""" curr_novel = {} @@ -297,8 +323,7 @@ def get_gene_and_transcript_novelty_types(gene_ID, transcript_ID, novelty_type): print("Warning: Could not locate novelty type for transcript %s" % transcript_ID) # Look for ISM subtype - if transcript_ID in novelty_type.ISM_prefix and \ - transcript_ID in novelty_type.ISM_suffix: + if transcript_ID in novelty_type.ISM_prefix and transcript_ID in novelty_type.ISM_suffix: curr_novel["ISM_subtype"] = "Both" elif transcript_ID in novelty_type.ISM_prefix: curr_novel["ISM_subtype"] = "Prefix" @@ -309,6 +334,7 @@ def get_gene_and_transcript_novelty_types(gene_ID, transcript_ID, novelty_type): return curr_novel + # def check_annot_validity(annot, database): # """ Make sure that the user has entered a correct annotation name """ # @@ -360,9 +386,10 @@ def get_gene_and_transcript_novelty_types(gene_ID, transcript_ID, novelty_type): # # return + def make_novelty_type_struct(database, datasets): - """ Create a data structure where it is possible to look up whether a gene - or transcript belongs to a particular category of novelty""" + """Create a data structure where it is possible to look up whether a gene + or transcript belongs to a particular category of novelty""" conn = sqlite3.connect(database) conn.row_factory = sqlite3.Row @@ -385,6 +412,7 @@ def make_novelty_type_struct(database, datasets): conn.close() return novelty_type + # def 
fetch_naming_prefix(database): # """ Get naming prefix from the database run_info table """ # conn = sqlite3.connect(database) @@ -448,15 +476,11 @@ def main(): autils.check_build_validity(build, database) # Determine which transcripts to include - whitelist = putils.handle_filtering(database, - annot, - False, - whitelist_file, - dataset_file) + whitelist = putils.handle_filtering(database, annot, False, whitelist_file, dataset_file) # create transcript whitelist transcript_whitelist = [] - for key,group in itertools.groupby(whitelist,operator.itemgetter(0)): + for key, group in itertools.groupby(whitelist, operator.itemgetter(0)): for id_tuple in list(group): transcript_whitelist.append(str(id_tuple[1])) @@ -471,5 +495,6 @@ def main(): n_places = autils.fetch_n_places(database) write_abundance_file(abundances, colnames, prefix, n_places, datasets, novelty_type, transcript_lengths, outfile) -if __name__ == '__main__': + +if __name__ == "__main__": main() diff --git a/src/talon/post/create_anndata_from_database.py b/src/talon/post/create_anndata_from_database.py index f3ebca3..2c96c76 100644 --- a/src/talon/post/create_anndata_from_database.py +++ b/src/talon/post/create_anndata_from_database.py @@ -4,60 +4,76 @@ # create_anndata_from_database.py is a utility that outputs the abundance # for each transcript in the TALON database across datasets in AnnData format. -import sqlite3 import itertools import operator +import sqlite3 from optparse import OptionParser from pathlib import Path -import scanpy + +import anndata import numpy as np import pandas as pd -import anndata +import scanpy from scipy.sparse import csr_matrix - -from . import filter_talon_transcripts as filt from .. import dstruct as dstruct from .. import length_utils as lu -from . import post_utils as putils -from . import ab_utils as autils from .. import query_utils as qutils from .. import talon as talon +from . import ab_utils as autils +from . import filter_talon_transcripts as filt +from . import post_utils as putils def getOptions(): parser = OptionParser() - parser.add_option("--db", dest = "database", - help = "TALON database", metavar = "FILE", type = "string") - parser.add_option("--annot", "-a", dest = "annot", - help = """Which annotation version to use. Will determine which + parser.add_option("--db", dest="database", help="TALON database", metavar="FILE", type="string") + parser.add_option( + "--annot", + "-a", + dest="annot", + help="""Which annotation version to use. Will determine which annotation transcripts are considered known or novel relative to. Note: must be in the TALON database.""", - type = "string") - parser.add_option("--pass_list", dest = "pass_list", - help = "Pass list file of transcripts to include in the \ + type="string", + ) + parser.add_option( + "--pass_list", + dest="pass_list", + help="Pass list file of transcripts to include in the \ output. First column should be TALON gene ID, \ second column should be TALON transcript ID", - metavar = "FILE", type = "string", default = None) - parser.add_option("--build", "-b", dest = "build", - help = "Genome build to use. 
Note: must be in the TALON database.", - type = "string") - parser.add_option('--gene', dest='gene_level', - help='Output AnnData on the gene level rather than the transcript', - action='store_true') - parser.add_option("--datasets", "-d", dest = "dataset_file", - help = """Optional: A file indicating which datasets should be + metavar="FILE", + type="string", + default=None, + ) + parser.add_option( + "--build", "-b", dest="build", help="Genome build to use. Note: must be in the TALON database.", type="string" + ) + parser.add_option( + "--gene", + dest="gene_level", + help="Output AnnData on the gene level rather than the transcript", + action="store_true", + ) + parser.add_option( + "--datasets", + "-d", + dest="dataset_file", + help="""Optional: A file indicating which datasets should be included (one dataset name per line). Default is to include all datasets.""", - metavar = "FILE", type = "string", default = None) - parser.add_option("--o", dest = "ofile", help = "Output file name", - metavar = "FILE", type = "string") - + metavar="FILE", + type="string", + default=None, + ) + parser.add_option("--o", dest="ofile", help="Output file name", metavar="FILE", type="string") (options, args) = parser.parse_args() return options + def assign_novelties(df, d, order, how): """ Assign novelty types based on a priority order @@ -76,17 +92,17 @@ def assign_novelties(df, d, order, how): df (pandas DataFrame): DataFrame indexed by gene / transcript ID with novelty information """ - if how == 'gene': - nov_col = 'gene_novelty' + if how == "gene": + nov_col = "gene_novelty" cols = [nov_col] - elif how == 'transcript': - nov_col = 'transcript_novelty' - cols = [nov_col, 'ISM_subtype'] + elif how == "transcript": + nov_col = "transcript_novelty" + cols = [nov_col, "ISM_subtype"] # assign gene or transcript novelty - df = df.pivot(index='ID', columns=['attribute'], values=['value']) + df = df.pivot(index="ID", columns=["attribute"], values=["value"]) df = df.droplevel(0, axis=1) - df.columns.name = '' + df.columns.name = "" for key, value in d.items(): df[key] = False @@ -94,20 +110,20 @@ def assign_novelties(df, d, order, how): # in cases where we're filtering out a lot, # not all novelty types will be represented if value[0] in df.columns: - df.loc[df[value[0]]==value[1], key] = True + df.loc[df[value[0]] == value[1], key] = True df.drop(value[0], axis=1, inplace=True) df[nov_col] = np.nan for o in order: - df.loc[(df[nov_col].isnull())&(df[o]==True), nov_col] = o + df.loc[(df[nov_col].isnull()) & (df[o] == True), nov_col] = o # assign ism subtype if needed - if how == 'transcript': - df['ISM_subtype'] = np.nan - df.loc[(df.ISM_subtype.isnull())&(df['ISM-prefix'])&(df['ISM-suffix']), 'ISM_subtype'] = 'Both' - df.loc[(df.ISM_subtype.isnull())&(df['ISM-prefix']), 'ISM_subtype'] = 'Prefix' - df.loc[(df.ISM_subtype.isnull())&(df['ISM-suffix']), 'ISM_subtype'] = 'Suffix' - df.loc[df.ISM_subtype.isnull(), 'ISM_subtype'] = 'None' + if how == "transcript": + df["ISM_subtype"] = np.nan + df.loc[(df.ISM_subtype.isnull()) & (df["ISM-prefix"]) & (df["ISM-suffix"]), "ISM_subtype"] = "Both" + df.loc[(df.ISM_subtype.isnull()) & (df["ISM-prefix"]), "ISM_subtype"] = "Prefix" + df.loc[(df.ISM_subtype.isnull()) & (df["ISM-suffix"]), "ISM_subtype"] = "Suffix" + df.loc[df.ISM_subtype.isnull(), "ISM_subtype"] = "None" # reduce cols df = df[cols] @@ -115,6 +131,7 @@ def assign_novelties(df, d, order, how): return df + def get_transcript_novs(db, tids): """ Get transcript novelties and ISM subtypes from a TALON db @@ 
-129,17 +146,18 @@ def get_transcript_novs(db, tids): """ # attributes to search for - nov_col_dict = {'Known': ('transcript_status', 'KNOWN'), - 'ISM': ('ISM_transcript', 'TRUE'), - 'ISM-prefix': ('ISM-prefix_transcript', 'TRUE'), - 'ISM-suffix': ('ISM-suffix_transcript', 'TRUE'), - 'NIC': ('NIC_transcript', 'TRUE'), - 'NNC': ('NNC_transcript', 'TRUE'), - 'Antisense': ('antisense_transcript', 'TRUE'), - 'Intergenic': ('intergenic_transcript', 'TRUE'), - 'Genomic': ('genomic_transcript', 'TRUE')} - order = ['ISM', 'NIC', 'NNC', 'Antisense', - 'Intergenic', 'Genomic', 'Known'] + nov_col_dict = { + "Known": ("transcript_status", "KNOWN"), + "ISM": ("ISM_transcript", "TRUE"), + "ISM-prefix": ("ISM-prefix_transcript", "TRUE"), + "ISM-suffix": ("ISM-suffix_transcript", "TRUE"), + "NIC": ("NIC_transcript", "TRUE"), + "NNC": ("NNC_transcript", "TRUE"), + "Antisense": ("antisense_transcript", "TRUE"), + "Intergenic": ("intergenic_transcript", "TRUE"), + "Genomic": ("genomic_transcript", "TRUE"), + } + order = ["ISM", "NIC", "NNC", "Antisense", "Intergenic", "Genomic", "Known"] attr_list = [val[0] for key, val in nov_col_dict.items()] attrs = qutils.format_for_IN(attr_list) @@ -154,10 +172,11 @@ def get_transcript_novs(db, tids): """ df = pd.read_sql_query(query, conn) - df = assign_novelties(df, nov_col_dict, order, 'transcript') + df = assign_novelties(df, nov_col_dict, order, "transcript") return df + def get_gene_novs(db, gids): """ Get gene novelties from a TALON db @@ -171,10 +190,12 @@ def get_gene_novs(db, gids): """ # attributes to search for - nov_col_dict = {'Known': ('gene_status', 'KNOWN'), - 'Intergenic': ('intergenic_novel', 'TRUE'), - 'Antisense': ('antisense_gene', 'TRUE')} - order = ['Antisense', 'Intergenic', 'Known'] + nov_col_dict = { + "Known": ("gene_status", "KNOWN"), + "Intergenic": ("intergenic_novel", "TRUE"), + "Antisense": ("antisense_gene", "TRUE"), + } + order = ["Antisense", "Intergenic", "Known"] attr_list = [val[0] for key, val in nov_col_dict.items()] attrs = qutils.format_for_IN(attr_list) @@ -188,10 +209,11 @@ def get_gene_novs(db, gids): AND ID IN {gene_query} """ df = pd.read_sql_query(query, conn) - df = assign_novelties(df, nov_col_dict, order, 'gene') + df = assign_novelties(df, nov_col_dict, order, "gene") return df + def get_g_t_names(db, annot, tids): """ Get names / IDs of genes / transcripts from TALON db @@ -238,6 +260,7 @@ def get_g_t_names(db, annot, tids): return df + def get_var_info(db, annot, build, tids=None, gids=None, gene_level=False): """ Get info about names, IDs, novelty categories, etc. 
for each gene @@ -264,22 +287,21 @@ def get_var_info(db, annot, build, tids=None, gids=None, gene_level=False): # make names for novel genes / transcripts # determine how many missing digits there are # repeat '0' for that many spaces for each gene / transcript ID - df['zero'] = '0' - df['n_gid_zero_to_add'] = n_places-df.gene_ID.astype(str).str.len() - df['temp_gid'] = prefix+'G'+df['zero'].str.repeat(df['n_gid_zero_to_add'])+df['gene_ID'].astype(str) - df['n_tid_zero_to_add'] = n_places-df.transcript_ID.astype(str).str.len() - df['temp_tid'] = prefix+'T'+df['zero'].str.repeat(df['n_tid_zero_to_add'])+df['transcript_ID'].astype(str) - - df['temp'] = df.temp_gid.str.len() - if len(df['temp'].unique().tolist()) != 1: - raise ValueError('Problem naming genes') - df['temp'] = df.temp_tid.str.len() - if len(df['temp'].unique().tolist()) != 1: - raise ValueError('Problem naming transcripts') + df["zero"] = "0" + df["n_gid_zero_to_add"] = n_places - df.gene_ID.astype(str).str.len() + df["temp_gid"] = prefix + "G" + df["zero"].str.repeat(df["n_gid_zero_to_add"]) + df["gene_ID"].astype(str) + df["n_tid_zero_to_add"] = n_places - df.transcript_ID.astype(str).str.len() + df["temp_tid"] = prefix + "T" + df["zero"].str.repeat(df["n_tid_zero_to_add"]) + df["transcript_ID"].astype(str) + + df["temp"] = df.temp_gid.str.len() + if len(df["temp"].unique().tolist()) != 1: + raise ValueError("Problem naming genes") + df["temp"] = df.temp_tid.str.len() + if len(df["temp"].unique().tolist()) != 1: + raise ValueError("Problem naming transcripts") # drop extra stuff - drop_cols = ['zero', 'n_gid_zero_to_add', - 'n_tid_zero_to_add', 'temp'] + drop_cols = ["zero", "n_gid_zero_to_add", "n_tid_zero_to_add", "temp"] df.drop(drop_cols, axis=1, inplace=True) # # add gene / transcript names / ids @@ -291,54 +313,67 @@ def get_var_info(db, annot, build, tids=None, gids=None, gene_level=False): # replace null gene names / ids inds = df.loc[df.annot_gene_id.isnull()].index - df.loc[inds, 'annot_gene_id'] = df.loc[inds, 'temp_gid'] + df.loc[inds, "annot_gene_id"] = df.loc[inds, "temp_gid"] inds = df.loc[df.annot_gene_name.isnull()].index - df.loc[inds, 'annot_gene_name'] = df.loc[inds, 'temp_gid'] + df.loc[inds, "annot_gene_name"] = df.loc[inds, "temp_gid"] # replace null transcript names / ids inds = df.loc[df.annot_transcript_id.isnull()].index - df.loc[inds, 'annot_transcript_id'] = df.loc[inds, 'temp_tid'] + df.loc[inds, "annot_transcript_id"] = df.loc[inds, "temp_tid"] inds = df.loc[df.annot_transcript_name.isnull()].index - df.loc[inds, 'annot_transcript_name'] = df.loc[inds, 'temp_tid'] + df.loc[inds, "annot_transcript_name"] = df.loc[inds, "temp_tid"] # remove temp cols - df.drop(['temp_gid', 'temp_tid'], axis=1, inplace=True) + df.drop(["temp_gid", "temp_tid"], axis=1, inplace=True) # add transcript len - t_lens = pd.DataFrame.from_dict(autils.get_transcript_lengths(db, build), - orient='index', - columns=['length']) - df = df.merge(t_lens, how='left', left_on='transcript_ID', right_index=True) + t_lens = pd.DataFrame.from_dict(autils.get_transcript_lengths(db, build), orient="index", columns=["length"]) + df = df.merge(t_lens, how="left", left_on="transcript_ID", right_index=True) # add gene novelty g_df = get_gene_novs(db, gids) - df = df.merge(g_df, how='left', left_on='gene_ID', right_on='ID') - df.drop('ID', axis=1, inplace=True) + df = df.merge(g_df, how="left", left_on="gene_ID", right_on="ID") + df.drop("ID", axis=1, inplace=True) # add transcript novelty / ism subtype t_df = get_transcript_novs(db, tids) - 
df = df.merge(t_df, how='left', left_on='transcript_ID', right_on='ID') - df.drop('ID', axis=1, inplace=True) + df = df.merge(t_df, how="left", left_on="transcript_ID", right_on="ID") + df.drop("ID", axis=1, inplace=True) # column order - order = ['gene_ID', 'transcript_ID', 'annot_gene_id', - 'annot_transcript_id', 'annot_gene_name', - 'annot_transcript_name', 'n_exons', 'length', - 'gene_novelty', 'transcript_novelty', 'ISM_subtype'] + order = [ + "gene_ID", + "transcript_ID", + "annot_gene_id", + "annot_transcript_id", + "annot_gene_name", + "annot_transcript_name", + "n_exons", + "length", + "gene_novelty", + "transcript_novelty", + "ISM_subtype", + ] df = df[order] # gene level -- drop columns that are only relevant to transcripts # and drop duplicated entries if gene_level: - drop_cols = ['transcript_ID', 'annot_transcript_id', - 'annot_transcript_name', 'length', - 'transcript_novelty', 'ISM_subtype', - 'n_exons'] + drop_cols = [ + "transcript_ID", + "annot_transcript_id", + "annot_transcript_name", + "length", + "transcript_novelty", + "ISM_subtype", + "n_exons", + ] df.drop(drop_cols, axis=1, inplace=True) df.drop_duplicates(inplace=True) return df + def get_obs_info(db, dataset_file): """ Get metadata table for each dataset in TALON @@ -360,9 +395,10 @@ def get_obs_info(db, dataset_file): FROM dataset WHERE dataset_name IN {datasets_query} """ df = pd.read_sql_query(query, conn) - df.rename({'dataset_name': 'dataset'}, axis=1, inplace=True) + df.rename({"dataset_name": "dataset"}, axis=1, inplace=True) return df + def get_X_info(db, obs, var, gene_level=False): """ Get sparse matrix representation of gene or transcript counts @@ -381,7 +417,7 @@ def get_X_info(db, obs, var, gene_level=False): # filter on genes if gene_level: - var_col = 'gene_ID' + var_col = "gene_ID" feat_str = qutils.format_for_IN(var[var_col].unique().tolist()) query = f"""SELECT t.gene_ID, ab.transcript_ID, ab.dataset, ab.count FROM abundance as ab @@ -393,7 +429,7 @@ def get_X_info(db, obs, var, gene_level=False): # filter on transcripts else: - var_col = 'transcript_ID' + var_col = "transcript_ID" feat_str = qutils.format_for_IN(var[var_col].unique().tolist()) query = f"""SELECT transcript_ID, dataset, count FROM abundance WHERE transcript_ID in {feat_str} @@ -407,25 +443,23 @@ def get_X_info(db, obs, var, gene_level=False): # sum over transcripts from the same gene / dataset if gene_level: - df.drop('transcript_ID', axis=1, inplace=True) - df = df.groupby(['gene_ID', 'dataset']).sum().reset_index() + df.drop("transcript_ID", axis=1, inplace=True) + df = df.groupby(["gene_ID", "dataset"]).sum().reset_index() # make categories based on ordering of obs and var tables - obs_col = 'dataset' + obs_col = "dataset" obs_cat = pd.api.types.CategoricalDtype(obs[obs_col], ordered=True) if obs_cat.categories.tolist() != obs[obs_col].tolist(): - raise ValueError('Problem with dataset names') + raise ValueError("Problem with dataset names") var_cat = pd.api.types.CategoricalDtype(var[var_col], ordered=True) if var_cat.categories.tolist() != var[var_col].tolist(): - raise ValueError('Problem with feature IDs') + raise ValueError("Problem with feature IDs") # create sparse matrix representation without # inflating row = df[obs_col].astype(obs_cat).cat.codes col = df[var_col].astype(var_cat).cat.codes - X = csr_matrix((df['count'], (row, col)), \ - shape=(obs_cat.categories.size, - var_cat.categories.size)) + X = csr_matrix((df["count"], (row, col)), shape=(obs_cat.categories.size, var_cat.categories.size)) # # code to 
inflate matrix # dfs = pd.SparseDataFrame(X, \ @@ -435,6 +469,7 @@ def get_X_info(db, obs, var, gene_level=False): return X + def main(): options = getOptions() db = options.database @@ -454,11 +489,7 @@ def main(): autils.check_build_validity(build, db) # determine which transcripts to include - pass_list = putils.handle_filtering(db, - annot, - True, - pass_list_file, - dataset_file) + pass_list = putils.handle_filtering(db, annot, True, pass_list_file, dataset_file) gids = [i[0] for i in list(set(pass_list))] tids = [i[1] for i in list(set(pass_list))] @@ -471,5 +502,6 @@ def main(): adata = anndata.AnnData(X=X, obs=obs, var=var) adata.write(ofile) -if __name__ == '__main__': + +if __name__ == "__main__": main() diff --git a/src/talon/post/filter_talon_transcripts.py b/src/talon/post/filter_talon_transcripts.py index 01f5518..6ed4b8e 100644 --- a/src/talon/post/filter_talon_transcripts.py +++ b/src/talon/post/filter_talon_transcripts.py @@ -6,76 +6,119 @@ # used by downstream analysis tools to determine which transcripts and other # features should be reported (for example in a GTF file). -from optparse import OptionParser +import os import sqlite3 +import warnings +from optparse import OptionParser from pathlib import Path + +import pandas as pd + +from talon.post import get_read_annotations as read_annot + from .. import query_utils as qutils from . import ab_utils as autils -from talon.post import get_read_annotations as read_annot -import pandas as pd -import os -import warnings + def getOptions(): - parser = OptionParser(description = ("talon_filter_transcripts is a " - "utility that filters the transcripts inside " - "a TALON database to produce a transcript pass list. " - "This list can then be used by downstream analysis " - "tools to determine which transcripts and other " - "features should be reported (for example in a GTF file)")) - parser.add_option("--db", dest = "database", - help = "TALON database", metavar = "FILE", type = str) - parser.add_option("--annot", "-a", dest = "annot", - help = """Which annotation version to use. Will determine which + parser = OptionParser( + description=( + "talon_filter_transcripts is a " + "utility that filters the transcripts inside " + "a TALON database to produce a transcript pass list. " + "This list can then be used by downstream analysis " + "tools to determine which transcripts and other " + "features should be reported (for example in a GTF file)" + ) + ) + parser.add_option("--db", dest="database", help="TALON database", metavar="FILE", type=str) + parser.add_option( + "--annot", + "-a", + dest="annot", + help="""Which annotation version to use. Will determine which annotation transcripts are considered known or novel relative to. Note: must be in the TALON database.""", - type = "string") - parser.add_option("--datasets", dest = "datasets", default = None, - help = ("Datasets to include. Can be provided as a " - "comma-delimited list on the command line, " - "or as a file with one dataset per line. " - "If this option is omitted, all datasets will " - "be included.")) - parser.add_option("--includeAnnot", dest = "include_annot", action= "store_true", - help = ("Include all transcripts from the annotation, regardless " - "of if they were observed in the data.")) - parser.add_option("--maxFracA", dest = "max_frac_A", default = 0.5, - help = ("Maximum fraction of As to allow in the window " - "located immediately after any read assigned to " - "a novel transcript (helps to filter out internal " - "priming artifacts). Default = 0.5. 
Use 1 if you prefer"
-                              "to not filter out internal priming events."),
-                      type = float)
-    parser.add_option("--minCount", dest = "min_count", default = 5,
-                      type = int,
-                      help = ("Number of minimum occurrences required for a "
-                              "novel transcript PER dataset. Default = 5"))
-    parser.add_option("--minDatasets", dest = "min_datasets", default = None,
-                      type = int,
-                      help = ("Minimum number of datasets novel transcripts "
-                              "must be found in. Default = all datasets provided"))
-    parser.add_option("--allowGenomic", dest ="allow_genomic", action='store_true',
-                      help = ("If this option is set, transcripts from the Genomic "
-                              "novelty category will be permitted in the output "
-                              "(provided they pass the thresholds). Default "
-                              "behavior is to filter out genomic transcripts "
-                              "since they are unlikely to be real novel isoforms."),
-                      default = False)
-    parser.add_option("--excludeISM", dest = "exclude_ISMs", action='store_true',
-                      help = ("If this option is set, transcripts from the ISM "
-                              "novelty category will be excluded from the output. "
-                              "Default behavior is to include those that pass other "
-                              "filtering thresholds."))
-    parser.add_option("--o", dest = "outfile", help = "Outfile name",
-                      metavar = "FILE", type = "string")
-
+        type="string",
+    )
+    parser.add_option(
+        "--datasets",
+        dest="datasets",
+        default=None,
+        help=(
+            "Datasets to include. Can be provided as a "
+            "comma-delimited list on the command line, "
+            "or as a file with one dataset per line. "
+            "If this option is omitted, all datasets will "
+            "be included."
+        ),
+    )
+    parser.add_option(
+        "--includeAnnot",
+        dest="include_annot",
+        action="store_true",
+        help=("Include all transcripts from the annotation, regardless " "of whether they were observed in the data."),
+    )
+    parser.add_option(
+        "--maxFracA",
+        dest="max_frac_A",
+        default=0.5,
+        help=(
+            "Maximum fraction of As to allow in the window "
+            "located immediately after any read assigned to "
+            "a novel transcript (helps to filter out internal "
+            "priming artifacts). Default = 0.5. Use 1 if you prefer "
+            "to not filter out internal priming events."
+        ),
+        type=float,
+    )
+    parser.add_option(
+        "--minCount",
+        dest="min_count",
+        default=5,
+        type=int,
+        help=("Number of minimum occurrences required for a " "novel transcript PER dataset. Default = 5"),
+    )
+    parser.add_option(
+        "--minDatasets",
+        dest="min_datasets",
+        default=None,
+        type=int,
+        help=("Minimum number of datasets novel transcripts " "must be found in. Default = all datasets provided"),
+    )
+    parser.add_option(
+        "--allowGenomic",
+        dest="allow_genomic",
+        action="store_true",
+        help=(
+            "If this option is set, transcripts from the Genomic "
+            "novelty category will be permitted in the output "
+            "(provided they pass the thresholds). Default "
+            "behavior is to filter out genomic transcripts "
+            "since they are unlikely to be real novel isoforms."
+        ),
+        default=False,
+    )
+    parser.add_option(
+        "--excludeISM",
+        dest="exclude_ISMs",
+        action="store_true",
+        help=(
+            "If this option is set, transcripts from the ISM "
+            "novelty category will be excluded from the output. "
+            "Default behavior is to include those that pass other "
+            "filtering thresholds."
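+            # Note: adjacent string literals in these help texts are joined by
+            # Python's implicit concatenation, so each fragment must carry its
+            # own trailing space (as in "... you prefer " above); a missing
+            # space makes words run together in the --help output.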
+ ), + ) + parser.add_option("--o", dest="outfile", help="Outfile name", metavar="FILE", type="string") (options, args) = parser.parse_args() return options -def get_known_transcripts(database, annot, include_annot, datasets = None): - """ Fetch gene ID and transcript ID of all known transcripts detected in - the specified datasets """ + +def get_known_transcripts(database, annot, include_annot, datasets=None): + """Fetch gene ID and transcript ID of all known transcripts detected in + the specified datasets""" with sqlite3.connect(database) as conn: # pull from observed table @@ -85,7 +128,9 @@ def get_known_transcripts(database, annot, include_annot, datasets = None): ON ta.ID = observed.transcript_ID WHERE (ta.attribute = 'transcript_status' AND ta.value = 'KNOWN' - AND ta.annot_name = '%s')""" % (annot) + AND ta.annot_name = '%s')""" % ( + annot + ) # pull from normal transcripts table elif include_annot: @@ -108,40 +153,47 @@ def get_known_transcripts(database, annot, include_annot, datasets = None): return known + def fetch_reads_in_datasets_fracA_cutoff(database, datasets, max_frac_A): - """ Selects reads from the database that are from the specified datasets - and which pass the following cutoffs: - - fraction_As <= max_frac_A - Reads with fraction_As value of None will not be included. - If datasets == None, then all datasets are permitted""" + """Selects reads from the database that are from the specified datasets + and which pass the following cutoffs: + - fraction_As <= max_frac_A + Reads with fraction_As value of None will not be included. + If datasets == None, then all datasets are permitted""" -# convert non-iterable datasets to an iterable + # convert non-iterable datasets to an iterable if datasets == None: - with sqlite3.connect(database) as conn: - query = """SELECT dataset_name + with sqlite3.connect(database) as conn: + query = """SELECT dataset_name FROM dataset""" - iter_datasets = pd.read_sql_query(query, conn).dataset_name.tolist() + iter_datasets = pd.read_sql_query(query, conn).dataset_name.tolist() else: - iter_datasets = datasets + iter_datasets = datasets # first check if we have non-null fraction_As columns at all # (one dataset at a time) for dataset in iter_datasets: - with sqlite3.connect(database) as conn: - query = """SELECT read_name, gene_ID, transcript_ID, dataset, fraction_As - FROM observed WHERE dataset='{}' LIMIT 0, 10""".format(dataset) + with sqlite3.connect(database) as conn: + query = """SELECT read_name, gene_ID, transcript_ID, dataset, fraction_As + FROM observed WHERE dataset='{}' LIMIT 0, 10""".format( + dataset + ) - data = pd.read_sql_query(query, conn) - nans = all(data.fraction_As.isna().tolist()) + data = pd.read_sql_query(query, conn) + nans = all(data.fraction_As.isna().tolist()) - if nans and max_frac_A != 1: - print("Reads in dataset {} appear to be unlabelled. " - "Only known transcripts will pass the filter.".format(dataset)) + if nans and max_frac_A != 1: + print( + "Reads in dataset {} appear to be unlabelled. 
" + "Only known transcripts will pass the filter.".format(dataset) + ) with sqlite3.connect(database) as conn: query = """SELECT read_name, gene_ID, transcript_ID, dataset, fraction_As FROM observed - WHERE fraction_As <= %f""" % (max_frac_A) + WHERE fraction_As <= %f""" % ( + max_frac_A + ) if datasets != None: datasets = qutils.format_for_IN(datasets) query += " AND dataset IN " + datasets @@ -150,7 +202,7 @@ def fetch_reads_in_datasets_fracA_cutoff(database, datasets, max_frac_A): # warn the user if no novel models passed filtering if len(data.index) == 0: - print('No reads passed maxFracA cutoff. Is this expected?') + print("No reads passed maxFracA cutoff. Is this expected?") return data @@ -182,8 +234,9 @@ def fetch_reads_in_datasets_fracA_cutoff(database, datasets, max_frac_A): # # return + def check_db_version(database): - """ Make sure the user is using a v5 database """ + """Make sure the user is using a v5 database""" conn = sqlite3.connect(database) cursor = conn.cursor() @@ -194,19 +247,19 @@ def check_db_version(database): ver = pd.read_sql_query(query, conn) if ver.empty: - message = "Database version is not compatible with v5.0 filtering." - raise ValueError(message) + message = "Database version is not compatible with v5.0 filtering." + raise ValueError(message) + def parse_datasets(dataset_option, database): - """ Parses dataset names from command line. Valid forms of input: - - None (returns None) - - Comma-delimited list of names - - File of names (One per line) - Also checks to make sure that the datasets are in the database. + """Parses dataset names from command line. Valid forms of input: + - None (returns None) + - Comma-delimited list of names + - File of names (One per line) + Also checks to make sure that the datasets are in the database. """ if dataset_option == None: - print(("No dataset names specified, so filtering process will use all " - "datasets present in the database.")) + print(("No dataset names specified, so filtering process will use all " "datasets present in the database.")) return None elif os.path.isfile(dataset_option): @@ -228,103 +281,106 @@ def parse_datasets(dataset_option, database): if dset not in valid_datasets: invalid_datasets.append(dset) if len(invalid_datasets) > 0: - raise ValueError(("Problem parsing datasets. The following names are " - "not in the database: '%s'. \nValid dataset names: '%s'") - % (", ".join(invalid_datasets), - ", ".join(valid_datasets))) + raise ValueError( + ( + "Problem parsing datasets. The following names are " + "not in the database: '%s'. 
\nValid dataset names: '%s'" + ) + % (", ".join(invalid_datasets), ", ".join(valid_datasets)) + ) else: - print("Parsed the following dataset names successfully: %s" % \ - (", ".join(datasets))) + print("Parsed the following dataset names successfully: %s" % (", ".join(datasets))) return datasets + def get_novelty_df(database): - """ Get the novelty category assignment of each transcript and - store in a data frame """ + """Get the novelty category assignment of each transcript and + store in a data frame""" transcript_novelty_dict = read_annot.get_transcript_novelty(database) - transcript_novelty = pd.DataFrame.from_dict(transcript_novelty_dict, - orient='index') + transcript_novelty = pd.DataFrame.from_dict(transcript_novelty_dict, orient="index") transcript_novelty = transcript_novelty.reset_index() - transcript_novelty.columns = ['transcript_ID', 'transcript_novelty'] + transcript_novelty.columns = ["transcript_ID", "transcript_novelty"] return transcript_novelty + def merge_reads_with_novelty(reads, novelty): - """ Given a data frame of reads and a transcript novelty data frame, - perform a left merge to annotate the reads with their novelty status. + """Given a data frame of reads and a transcript novelty data frame, + perform a left merge to annotate the reads with their novelty status. """ - merged = pd.merge(reads, novelty, on = "transcript_ID", how = "left") + merged = pd.merge(reads, novelty, on="transcript_ID", how="left") return merged + def filter_on_min_count(reads, min_count): - """ Given a reads data frame, compute the number of times that each - transcript ID occurs per dataset. - Keep the rows that meet the min_count threshold and return them. """ + """Given a reads data frame, compute the number of times that each + transcript ID occurs per dataset. + Keep the rows that meet the min_count threshold and return them.""" - cols = ['gene_ID', 'transcript_ID', 'dataset'] + cols = ["gene_ID", "transcript_ID", "dataset"] counts_df = reads[cols].groupby(cols).size() counts_df = counts_df.reset_index() counts_df.columns = cols + ["count"] - filtered = counts_df.loc[counts_df['count'] >= min_count] + filtered = counts_df.loc[counts_df["count"] >= min_count] return filtered + def filter_on_n_datasets(counts_in_datasets, min_datasets): - """ Given a data frame with columns gene_ID, transcript_ID, dataset, - and count (in that dataset), count the number of datasets that each - transcript appears in. Then, filter the data such that only transcripts - found in at least 'min_datasets' remain. """ + """Given a data frame with columns gene_ID, transcript_ID, dataset, + and count (in that dataset), count the number of datasets that each + transcript appears in. Then, filter the data such that only transcripts + found in at least 'min_datasets' remain.""" - cols = ['gene_ID', 'transcript_ID'] + cols = ["gene_ID", "transcript_ID"] dataset_count_df = counts_in_datasets[cols].groupby(cols).size() dataset_count_df = dataset_count_df.reset_index() dataset_count_df.columns = cols + ["n_datasets"] - filtered = dataset_count_df.loc[dataset_count_df['n_datasets'] >= min_datasets] + filtered = dataset_count_df.loc[dataset_count_df["n_datasets"] >= min_datasets] return filtered + def filter_talon_transcripts(database, annot, datasets, options): - """ Filter transcripts belonging to the specified datasets in a TALON - database. The 'annot' parameter specifies which annotation transcripts - are known relative to. 
Can be tuned with the following options:
-        - options.include_annot: Include all annotated transcripts regardless
-          of whether they are expressed
-        - options.max_frac_A: maximum allowable fraction of As recorded for
-          region after the read (0-1)
-        - options.allow_genomic: Removes genomic transcripts if set to False
-        - options.exlude_ISMs: Removes ISM transcripts if set to True
-        - options.min_count: Transcripts must appear at least this many times
-                             to count as present in a dataset
-        - options.min_datasets: After the min_count threshold has been
-                                applied, the transcript must be found in at
-                                least this many datasets to pass the filter.
-                                If this option is set to None, then it will
-                                default to the total number of datasets in the
-                                reads.
-        Please note that known transcripts are allowed through independently
-        of these parameters.
-    """
+    """Filter transcripts belonging to the specified datasets in a TALON
+    database. The 'annot' parameter specifies which annotation transcripts
+    are known relative to. Can be tuned with the following options:
+    - options.include_annot: Include all annotated transcripts regardless
+    of whether they are expressed
+    - options.max_frac_A: maximum allowable fraction of As recorded for
+    region after the read (0-1)
+    - options.allow_genomic: Removes genomic transcripts if set to False
+    - options.exclude_ISMs: Removes ISM transcripts if set to True
+    - options.min_count: Transcripts must appear at least this many times
+    to count as present in a dataset
+    - options.min_datasets: After the min_count threshold has been
+    applied, the transcript must be found in at
+    least this many datasets to pass the filter.
+    If this option is set to None, then it will
+    default to the total number of datasets in the
+    reads.
+    Please note that known transcripts are allowed through independently
+    of these parameters.
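+
+    A minimal usage sketch (the database path and annotation name below are
+    hypothetical, for illustration only):
+
+        options = getOptions()
+        datasets = parse_datasets(options.datasets, "talon.db")
+        passed = filter_talon_transcripts("talon.db", "gencode_v29", datasets, options)
+        # 'passed' is a DataFrame with gene_ID and transcript_ID columns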
+ """ # Known transcripts automatically pass the filter - known = get_known_transcripts(database, annot, - options.include_annot, - datasets = datasets) + known = get_known_transcripts(database, annot, options.include_annot, datasets=datasets) # Get reads that pass fraction A cutoff - reads = fetch_reads_in_datasets_fracA_cutoff(database, datasets, - options.max_frac_A) + reads = fetch_reads_in_datasets_fracA_cutoff(database, datasets, options.max_frac_A) # Fetch novelty information and merge with reads reads = merge_reads_with_novelty(reads, get_novelty_df(database)) # Drop genomic transcripts if desired if options.allow_genomic == False: - reads = reads.loc[reads.transcript_novelty != 'Genomic'] + reads = reads.loc[reads.transcript_novelty != "Genomic"] # Drop ISMs if desired if options.exclude_ISMs == True: - reads = reads.loc[reads.transcript_novelty != 'ISM'] + reads = reads.loc[reads.transcript_novelty != "ISM"] # Perform counts-based filtering filtered_counts = filter_on_min_count(reads, options.min_count) @@ -336,12 +392,15 @@ def filter_talon_transcripts(database, annot, datasets, options): # Join the known transcripts with the filtered ones and return if len(dataset_filtered.index) != 0: - final_filtered = pd.concat([known[["gene_ID", "transcript_ID"]], - dataset_filtered[["gene_ID", "transcript_ID"]]]).drop_duplicates() - else: final_filtered = known + final_filtered = pd.concat( + [known[["gene_ID", "transcript_ID"]], dataset_filtered[["gene_ID", "transcript_ID"]]] + ).drop_duplicates() + else: + final_filtered = known return final_filtered + def main(): options = getOptions() database = options.database @@ -360,16 +419,18 @@ def main(): # Parse datasets datasets = parse_datasets(options.datasets, database) if datasets != None and len(datasets) == 1: - warnings.warn("Only one dataset provided. For best performance, please " - "run TALON with at least 2 biological replicates if possible.") + warnings.warn( + "Only one dataset provided. For best performance, please " + "run TALON with at least 2 biological replicates if possible." + ) # Perform the filtering filtered = filter_talon_transcripts(database, annot, datasets, options) # Write gene and transcript IDs to file print("Writing gene-transcript TALON ID pairs that passed filtering to " + options.outfile + "...") - filtered.to_csv(options.outfile, sep = ",", header = False, index = False) + filtered.to_csv(options.outfile, sep=",", header=False, index=False) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/src/talon/post/generate_talon_report.py b/src/talon/post/generate_talon_report.py index a78254c..f43c685 100644 --- a/src/talon/post/generate_talon_report.py +++ b/src/talon/post/generate_talon_report.py @@ -2,8 +2,7 @@ import sys from pathlib import Path -R_SCRIPT_FILE = Path(__file__).parent / Path("r_scripts") / Path( - "generate_talon_report.R") +R_SCRIPT_FILE = Path(__file__).parent / Path("r_scripts") / Path("generate_talon_report.R") def main(): @@ -11,8 +10,7 @@ def main(): try: subprocess.run(args, stdout=sys.stdout, stderr=sys.stderr) except FileNotFoundError as e: - print("This is a wrapper command for an R script. " - "Do you have GNU R installed?\n" + str(e)) + print("This is a wrapper command for an R script. 
" "Do you have GNU R installed?\n" + str(e)) if __name__ == "__main__": diff --git a/src/talon/post/get_read_annotations.py b/src/talon/post/get_read_annotations.py index c7d93e4..3be8f97 100644 --- a/src/talon/post/get_read_annotations.py +++ b/src/talon/post/get_read_annotations.py @@ -5,39 +5,51 @@ # database in order to get read-specific annotation information. import argparse -import sqlite3 import os +import sqlite3 from pathlib import Path -from .. import query_utils as qutils from string import Template +from .. import query_utils as qutils + + def get_args(): - """ Fetches the arguments for the program """ + """Fetches the arguments for the program""" - program_desc = ("This utility queries a TALON database in order to get " - "read-specific annotation information.") + program_desc = "This utility queries a TALON database in order to get " "read-specific annotation information." parser = argparse.ArgumentParser(description=program_desc) - parser.add_argument('--db', dest = 'database', metavar='FILE,', type = str, - help='TALON database') - parser.add_argument('--build', dest = 'build', metavar='STRING,', type = str, - help='Genome build (i.e. hg38) to use. Must be in the database.') - parser.add_argument('--datasets', dest = 'datasets', metavar='STRING,', type = str, - help=('Optional: Comma-delimited list of datasets to include. Default ' - 'behavior is to include all datasets in the database.'), - default = None) - parser.add_argument("--o", dest = "outprefix", help = "Prefix for output files", - type = str) + parser.add_argument("--db", dest="database", metavar="FILE,", type=str, help="TALON database") + parser.add_argument( + "--build", + dest="build", + metavar="STRING,", + type=str, + help="Genome build (i.e. hg38) to use. Must be in the database.", + ) + parser.add_argument( + "--datasets", + dest="datasets", + metavar="STRING,", + type=str, + help=( + "Optional: Comma-delimited list of datasets to include. Default " + "behavior is to include all datasets in the database." + ), + default=None, + ) + parser.add_argument("--o", dest="outprefix", help="Prefix for output files", type=str) args = parser.parse_args() return args -def fetch_reads(database, build, tmp_file = None, datasets = None): - """ Performs database query to fetch location and gene/transcript assignment - info for each long read in the specified datasets. - If tmp_file is set to None (default), then the function will return - the query results in a list of lists. If an alternate value is provided, - then the results will be written to a tmp file of that name.""" + +def fetch_reads(database, build, tmp_file=None, datasets=None): + """Performs database query to fetch location and gene/transcript assignment + info for each long read in the specified datasets. + If tmp_file is set to None (default), then the function will return + the query results in a list of lists. 
If an alternate value is provided, + then the results will be written to a tmp file of that name.""" if datasets != None: # Format as a string for query @@ -86,7 +98,7 @@ def fetch_reads(database, build, tmp_file = None, datasets = None): raise RuntimeError("Problem with reads database query") if tmp_file != None: - o = open(tmp_file, 'w') + o = open(tmp_file, "w") else: reads = [] @@ -115,32 +127,43 @@ def fetch_reads(database, build, tmp_file = None, datasets = None): raise ValueError("Unrecognized strand value: " + str(strand)) # Create entry for output - out_read = (entry["read_name"], entry["dataset"], - entry["genome_build"], entry["gene_ID"], - entry["transcript_ID"], entry["chrom"], - read_start, read_end, strand, entry["n_exons"], - entry["read_length"], entry["fraction_As"], - entry["custom_label"], entry["allelic_label"], - entry["start_support"], entry["end_support"]) + out_read = ( + entry["read_name"], + entry["dataset"], + entry["genome_build"], + entry["gene_ID"], + entry["transcript_ID"], + entry["chrom"], + read_start, + read_end, + strand, + entry["n_exons"], + entry["read_length"], + entry["fraction_As"], + entry["custom_label"], + entry["allelic_label"], + entry["start_support"], + entry["end_support"], + ) if tmp_file != None: - o.write("\t".join([ str(x) for x in out_read ]) + "\n") + o.write("\t".join([str(x) for x in out_read]) + "\n") else: reads.append(out_read) count += 1 # Return results or close file if count == 0: - raise ValueError(("No reads detected. Make sure your dataset names are " - "correct.")) + raise ValueError(("No reads detected. Make sure your dataset names are " "correct.")) if tmp_file != None: o.close() else: return reads + def get_gene_novelty(database): - """ Given a database, get the novelty status of each gene. """ + """Given a database, get the novelty status of each gene.""" gene_novelty = {} with sqlite3.connect(database) as conn: @@ -148,37 +171,46 @@ def get_gene_novelty(database): cursor = conn.cursor() # Fetch known genes - cursor.execute("""SELECT ID FROM gene_annotations + cursor.execute( + """SELECT ID FROM gene_annotations WHERE attribute = "gene_status" - AND value = "KNOWN";""") + AND value = "KNOWN";""" + ) for entry in cursor: gene_novelty[entry[0]] = "Known" # Fetch antisense genes - cursor.execute("""SELECT ID FROM gene_annotations + cursor.execute( + """SELECT ID FROM gene_annotations WHERE attribute = "antisense_gene" - AND value = "TRUE";""") + AND value = "TRUE";""" + ) for entry in cursor: gene_novelty[entry[0]] = "Antisense" # Fetch fusion genes - cursor.execute("""SELECT ID FROM gene_annotations + cursor.execute( + """SELECT ID FROM gene_annotations WHERE attribute = "fusion_novel" - AND value = "TRUE";""") + AND value = "TRUE";""" + ) for entry in cursor: gene_novelty[entry[0]] = "Fusion" # Fetch intergenic genes - cursor.execute("""SELECT ID FROM gene_annotations + cursor.execute( + """SELECT ID FROM gene_annotations WHERE attribute = "intergenic_novel" - AND value = "TRUE";""") + AND value = "TRUE";""" + ) for entry in cursor: gene_novelty[entry[0]] = "Intergenic" return gene_novelty + def get_transcript_novelty(database): - """ Given a database, get the novelty status of each transcript. 
""" + """Given a database, get the novelty status of each transcript.""" transcript_novelty = {} with sqlite3.connect(database) as conn: @@ -186,65 +218,82 @@ def get_transcript_novelty(database): cursor = conn.cursor() # Fetch known transcripts - cursor.execute("""SELECT ID FROM transcript_annotations + cursor.execute( + """SELECT ID FROM transcript_annotations WHERE attribute = "transcript_status" - AND value = "KNOWN";""") + AND value = "KNOWN";""" + ) for entry in cursor: transcript_novelty[entry[0]] = "Known" # Fetch ISM transcripts - cursor.execute("""SELECT ID FROM transcript_annotations + cursor.execute( + """SELECT ID FROM transcript_annotations WHERE attribute = "ISM_transcript" - AND value = "TRUE";""") + AND value = "TRUE";""" + ) for entry in cursor: transcript_novelty[entry[0]] = "ISM" # Fetch NIC transcripts - cursor.execute("""SELECT ID FROM transcript_annotations + cursor.execute( + """SELECT ID FROM transcript_annotations WHERE attribute = "NIC_transcript" - AND value = "TRUE";""") + AND value = "TRUE";""" + ) for entry in cursor: transcript_novelty[entry[0]] = "NIC" # Fetch NNC transcripts - cursor.execute("""SELECT ID FROM transcript_annotations + cursor.execute( + """SELECT ID FROM transcript_annotations WHERE attribute = "NNC_transcript" - AND value = "TRUE";""") + AND value = "TRUE";""" + ) for entry in cursor: transcript_novelty[entry[0]] = "NNC" # Fetch antisense transcripts - cursor.execute("""SELECT ID FROM transcript_annotations + cursor.execute( + """SELECT ID FROM transcript_annotations WHERE attribute = "antisense_transcript" - AND value = "TRUE";""") + AND value = "TRUE";""" + ) for entry in cursor: transcript_novelty[entry[0]] = "Antisense" # Fetch intergenic transcripts - cursor.execute("""SELECT ID FROM transcript_annotations + cursor.execute( + """SELECT ID FROM transcript_annotations WHERE attribute = "intergenic_transcript" - AND value = "TRUE";""") + AND value = "TRUE";""" + ) for entry in cursor: transcript_novelty[entry[0]] = "Intergenic" # Fetch genomic transcripts - cursor.execute("""SELECT ID FROM transcript_annotations + cursor.execute( + """SELECT ID FROM transcript_annotations WHERE attribute = "genomic_transcript" - AND value = "TRUE";""") + AND value = "TRUE";""" + ) for entry in cursor: transcript_novelty[entry[0]] = "Genomic" # Fetch fusion transcripts - cursor.execute("""SELECT ID FROM transcript_annotations + cursor.execute( + """SELECT ID FROM transcript_annotations WHERE attribute = "fusion_transcript" - AND value = "TRUE";""") + AND value = "TRUE";""" + ) for entry in cursor: transcript_novelty[entry[0]] = "Fusion" return transcript_novelty + def get_ISM_novelty(database): - """ Given a database, get the ISM subtype of each ISM transcript. 
""" + """Given a database, get the ISM subtype of each ISM transcript.""" all_ISMs = set() prefix_ISMs = set() @@ -256,23 +305,29 @@ def get_ISM_novelty(database): cursor = conn.cursor() # Fetch ISM transcripts - cursor.execute("""SELECT ID FROM transcript_annotations + cursor.execute( + """SELECT ID FROM transcript_annotations WHERE attribute = "ISM_transcript" - AND value = "TRUE";""") + AND value = "TRUE";""" + ) for entry in cursor: all_ISMs.add(entry[0]) # Fetch Prefix ISMs - cursor.execute("""SELECT ID FROM transcript_annotations + cursor.execute( + """SELECT ID FROM transcript_annotations WHERE attribute = "ISM-prefix_transcript" - AND value = "TRUE";""") + AND value = "TRUE";""" + ) for entry in cursor: prefix_ISMs.add(entry[0]) # Fetch Suffix ISMs - cursor.execute("""SELECT ID FROM transcript_annotations + cursor.execute( + """SELECT ID FROM transcript_annotations WHERE attribute = "ISM-suffix_transcript" - AND value = "TRUE";""") + AND value = "TRUE";""" + ) for entry in cursor: suffix_ISMs.add(entry[0]) @@ -289,9 +344,10 @@ def get_ISM_novelty(database): return ISM_novelty + def get_gene_annotations(database): - """ Create a dictionary linking each TALON gene ID to its human-readable - name and accession ID """ + """Create a dictionary linking each TALON gene ID to its human-readable + name and accession ID""" gene_name = {} gene_ID = {} @@ -300,21 +356,26 @@ def get_gene_annotations(database): conn.row_factory = sqlite3.Row cursor = conn.cursor() - cursor.execute("""SELECT ID, ga.value FROM gene_annotations as ga - WHERE attribute = "gene_name";""") + cursor.execute( + """SELECT ID, ga.value FROM gene_annotations as ga + WHERE attribute = "gene_name";""" + ) for entry in cursor: gene_name[entry["ID"]] = entry["value"] - cursor.execute("""SELECT ID, ga.value FROM gene_annotations as ga - WHERE attribute = "gene_id";""") + cursor.execute( + """SELECT ID, ga.value FROM gene_annotations as ga + WHERE attribute = "gene_id";""" + ) for entry in cursor: gene_ID[entry["ID"]] = entry["value"] return gene_name, gene_ID + def get_transcript_annotations(database): - """ Create a dictionary linking each TALON transcript ID to its human-readable - name and accession ID """ + """Create a dictionary linking each TALON transcript ID to its human-readable + name and accession ID""" transcript_name = {} transcript_ID = {} @@ -323,50 +384,55 @@ def get_transcript_annotations(database): conn.row_factory = sqlite3.Row cursor = conn.cursor() - cursor.execute("""SELECT ID, ta.value FROM transcript_annotations as ta - WHERE attribute = "transcript_name";""") + cursor.execute( + """SELECT ID, ta.value FROM transcript_annotations as ta + WHERE attribute = "transcript_name";""" + ) for entry in cursor: transcript_name[entry["ID"]] = entry["value"] - cursor.execute("""SELECT ID, ta.value FROM transcript_annotations as ta - WHERE attribute = "transcript_id";""") + cursor.execute( + """SELECT ID, ta.value FROM transcript_annotations as ta + WHERE attribute = "transcript_id";""" + ) for entry in cursor: transcript_ID[entry["ID"]] = entry["value"] return transcript_name, transcript_ID -def make_read_annot_file(database, build, outprefix, datasets = "all"): - """ Creates an output file with the following columns: - 1. read_name - 2. dataset - 3. genome_build - 4. chrom - 5. read_start - 6. read_end - 7. strand - 8. n_exons - 9. read_length - 10. gene_ID (TALON) - 11. transcript_ID (TALON) - 12. annot_gene_id - 13. annot_transcript_id - 14. annot_gene_name - 15. annot_transcript_name - 16. gene_novelty - 17. 
transcript_novelty - 18. ISM_subtype - 19. Fraction As (following the alignment) - 20. Custom label - 21. Allelic label - 22. Start support (external assay) - 23. End support (external assay) - - By default, reads from all datasets in the database are included, but - this can be modified by supplying a list/tuple of dataset names to the - datasets parameter. + +def make_read_annot_file(database, build, outprefix, datasets="all"): + """Creates an output file with the following columns: + 1. read_name + 2. dataset + 3. genome_build + 4. chrom + 5. read_start + 6. read_end + 7. strand + 8. n_exons + 9. read_length + 10. gene_ID (TALON) + 11. transcript_ID (TALON) + 12. annot_gene_id + 13. annot_transcript_id + 14. annot_gene_name + 15. annot_transcript_name + 16. gene_novelty + 17. transcript_novelty + 18. ISM_subtype + 19. Fraction As (following the alignment) + 20. Custom label + 21. Allelic label + 22. Start support (external assay) + 23. End support (external assay) + + By default, reads from all datasets in the database are included, but + this can be modified by supplying a list/tuple of dataset names to the + datasets parameter. """ tmp_read_file = outprefix + "_reads.tmp" - fetch_reads(database, build, tmp_file = tmp_read_file, datasets = datasets) + fetch_reads(database, build, tmp_file=tmp_read_file, datasets=datasets) # Make annotation dicts gene_names, gene_IDs = get_gene_annotations(database) @@ -378,21 +444,54 @@ def make_read_annot_file(database, build, outprefix, datasets = "all"): ISM_novelty = get_ISM_novelty(database) fname = outprefix + "_talon_read_annot.tsv" - o = open(fname, 'w') - colnames = [ "read_name", "dataset", "genome_build", "chrom", - "read_start", "read_end", "strand", "n_exons", "read_length", - "gene_ID", "transcript_ID", "annot_gene_id", "annot_transcript_id", - "annot_gene_name", "annot_transcript_name", "gene_novelty", - "transcript_novelty", "ISM_subtype", "fraction_As", "custom_label", - "allelic_label", "start_support", "end_support"] + o = open(fname, "w") + colnames = [ + "read_name", + "dataset", + "genome_build", + "chrom", + "read_start", + "read_end", + "strand", + "n_exons", + "read_length", + "gene_ID", + "transcript_ID", + "annot_gene_id", + "annot_transcript_id", + "annot_gene_name", + "annot_transcript_name", + "gene_novelty", + "transcript_novelty", + "ISM_subtype", + "fraction_As", + "custom_label", + "allelic_label", + "start_support", + "end_support", + ] o.write("\t".join(colnames) + "\n") - with open(tmp_read_file, 'r') as f: + with open(tmp_read_file, "r") as f: for read_entry in f: - read_name, dataset, genome_build, gene_ID, \ - transcript_ID, chrom, read_start, read_end, \ - strand, n_exons, read_length, fraction_As, custom_label, \ - allelic_label, start_support, end_support = read_entry.strip().split("\t") + ( + read_name, + dataset, + genome_build, + gene_ID, + transcript_ID, + chrom, + read_start, + read_end, + strand, + n_exons, + read_length, + fraction_As, + custom_label, + allelic_label, + start_support, + end_support, + ) = read_entry.strip().split("\t") gene_ID = int(gene_ID) transcript_ID = int(transcript_ID) @@ -432,20 +531,43 @@ def make_read_annot_file(database, build, outprefix, datasets = "all"): gene_ID = str(gene_ID) transcript_ID = str(transcript_ID) - o.write("\t".join([read_name, dataset, genome_build, chrom, - read_start, read_end, strand, n_exons, read_length, - gene_ID, transcript_ID, - annot_gene_id, annot_transcript_id, - annot_gene_name, annot_transcript_name, - curr_gene_novelty, 
curr_transcript_novelty, - curr_ISM_novelty, fraction_As, custom_label, - allelic_label, start_support, end_support]) + "\n") + o.write( + "\t".join( + [ + read_name, + dataset, + genome_build, + chrom, + read_start, + read_end, + strand, + n_exons, + read_length, + gene_ID, + transcript_ID, + annot_gene_id, + annot_transcript_id, + annot_gene_name, + annot_transcript_name, + curr_gene_novelty, + curr_transcript_novelty, + curr_ISM_novelty, + fraction_As, + custom_label, + allelic_label, + start_support, + end_support, + ] + ) + + "\n" + ) o.close() os.system("rm " + tmp_read_file) + def check_build_validity(build, database): - """ Make sure that the user has entered a correct build name """ + """Make sure that the user has entered a correct build name""" conn = sqlite3.connect(database) cursor = conn.cursor() @@ -455,19 +577,20 @@ def check_build_validity(build, database): conn.close() if build == None: - message = "Please provide a valid genome build name. " + \ - "In this database, your options are: " + \ - ", ".join(builds) + message = ( + "Please provide a valid genome build name. " + "In this database, your options are: " + ", ".join(builds) + ) raise ValueError(message) if build not in builds: - message = "Build name '" + build + \ - "' not found in this database. Try one of the following: " + \ - ", ".join(builds) + message = ( + "Build name '" + build + "' not found in this database. Try one of the following: " + ", ".join(builds) + ) raise ValueError(message) return + def main(): options = get_args() database = options.database @@ -483,8 +606,8 @@ def main(): if datasets != None: datasets = datasets.split(",") - make_read_annot_file(database, build, outprefix, datasets = datasets) + make_read_annot_file(database, build, outprefix, datasets=datasets) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/src/talon/post/get_transcript_sjs.py b/src/talon/post/get_transcript_sjs.py index 6b16913..57c51da 100644 --- a/src/talon/post/get_transcript_sjs.py +++ b/src/talon/post/get_transcript_sjs.py @@ -1,56 +1,63 @@ -import os -import pandas as pd import argparse +import os import sqlite3 + import numpy as np +import pandas as pd def get_args(): - - desc = ('Extracts the locations, novelty, and transcript assignments of' - ' exons/introns in a TALON database or GTF file. All positions ' - 'are 1-based.') - parser = argparse.ArgumentParser(description=desc) - - parser.add_argument('--gtf', dest='gtf', default=None, - help = 'TALON GTF file from which to extract exons/introns') - parser.add_argument('--db', dest='db', default=None, - help = 'TALON database from which to extract exons/introns') - parser.add_argument('--ref', dest='ref_gtf', - help = ('GTF reference file (ie GENCODE). Will be used to ' - 'label novelty.')) - parser.add_argument('--mode', dest='mode', - help= ("Choices are 'intron' or 'exon' (default is 'intron'). " - "Determines whether to include introns or exons in the " - "output"), default='intron') - parser.add_argument('--outprefix', dest='outprefix', - help = 'Prefix for output file') + desc = ( + "Extracts the locations, novelty, and transcript assignments of" + " exons/introns in a TALON database or GTF file. All positions " + "are 1-based." 
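+        # --gtf and --db are intended to be mutually exclusive; this script
+        # enforces that manually after parse_args() (see the check below)
+        # rather than with argparse's add_mutually_exclusive_group().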
+    )
+    parser = argparse.ArgumentParser(description=desc)
+
+    parser.add_argument("--gtf", dest="gtf", default=None, help="TALON GTF file from which to extract exons/introns")
+    parser.add_argument("--db", dest="db", default=None, help="TALON database from which to extract exons/introns")
+    parser.add_argument(
+        "--ref", dest="ref_gtf", help=("GTF reference file (e.g. GENCODE). Will be used to " "label novelty.")
+    )
+    parser.add_argument(
+        "--mode",
+        dest="mode",
+        help=(
+            "Choices are 'intron' or 'exon' (default is 'intron'). "
+            "Determines whether to include introns or exons in the "
+            "output"
+        ),
+        default="intron",
+    )
+    parser.add_argument("--outprefix", dest="outprefix", help="Prefix for output file")
+
+    args = parser.parse_args()
+
+    if args.gtf and args.db:
+        raise Exception("Provide either --gtf or --db, not both")
+
+    return args

-    args = parser.parse_args()
-
-    if args.gtf and args.db:
-        raise Exception('only input gtf or db')
-
-    return args
-

 # creates a dictionary of the last field of a gtf
 # adapted from Dana Wyman
 def get_fields(tab_fields):

     attributes = {}

     # remove trailing newline and split by semicolon
-    description = tab_fields[-1].strip('\n')
-    description = description.split(';')
+    description = tab_fields[-1].strip("\n")
+    description = description.split(";")

     # Parse description
     for fields in description:
-        if fields == "" or fields == " ": continue
+        if fields == "" or fields == " ":
+            continue
         fields = fields.split()
-        if fields[0] == '': fields = fields[1:]
+        if fields[0] == "":
+            fields = fields[1:]

-        key = fields[0].replace('"', '')
-        val = ' '.join(fields[1:]).replace('"', '')
+        key = fields[0].replace('"', "")
+        val = " ".join(fields[1:]).replace('"', "")

         attributes[key] = val

@@ -59,55 +66,49 @@
     if "gene_id" not in attributes:
         attributes["gene_id"] = "NULL"

-    return attributes
+    return attributes
+

 # create loc_df (for nodes), edge_df (for edges), and t_df (for paths)
 def create_dfs_db(db):
+    # make sure file exists
+    if not os.path.exists(db):
+        raise Exception("TALON db file not found. Check path.")

-    # make sure file exists
-    if not os.path.exists(db):
-        raise Exception('TALON db file not found. 
Check path.') + # open db connection + conn = sqlite3.connect(db) + c = conn.cursor() - # open db connection - conn = sqlite3.connect(db) - c = conn.cursor() + # loc_df + q = "SELECT loc.* FROM location loc" - # loc_df - q = 'SELECT loc.* FROM location loc' + c.execute(q) + locs = c.fetchall() - c.execute(q) - locs = c.fetchall() + loc_df = pd.DataFrame(locs, columns=["location_ID", "genome_build", "chrom", "position"]) - loc_df = pd.DataFrame(locs, - columns=['location_ID', 'genome_build', - 'chrom', 'position']) + # do some df reformatting, add strand + loc_df.drop("genome_build", axis=1, inplace=True) + loc_df.rename({"location_ID": "vertex_id", "position": "coord"}, inplace=True, axis=1) + loc_df.vertex_id = loc_df.vertex_id.map(int) - # do some df reformatting, add strand - loc_df.drop('genome_build', axis=1, inplace=True) - loc_df.rename({'location_ID': 'vertex_id', - 'position': 'coord'}, - inplace=True, axis=1) - loc_df.vertex_id = loc_df.vertex_id.map(int) + # edge_df + q = """SELECT * FROM edge """ - # edge_df - q = """SELECT * FROM edge """ + c.execute(q) + edges = c.fetchall() - c.execute(q) - edges = c.fetchall() + edge_df = pd.DataFrame(edges, columns=["edge_id", "v1", "v2", "edge_type", "strand"]) + edge_df.v1 = edge_df.v1.map(int) + edge_df.v2 = edge_df.v2.map(int) + edge_df["talon_edge_id"] = edge_df.edge_id + edge_df["edge_id"] = edge_df.apply(lambda x: (int(x.v1), int(x.v2)), axis=1) - edge_df = pd.DataFrame(edges, - columns=['edge_id', 'v1', 'v2', - 'edge_type', 'strand']) - edge_df.v1 = edge_df.v1.map(int) - edge_df.v2 = edge_df.v2.map(int) - edge_df['talon_edge_id'] = edge_df.edge_id - edge_df['edge_id'] = edge_df.apply(lambda x: (int(x.v1), int(x.v2)), axis=1) + # t_df + t_df = pd.DataFrame() - # t_df - t_df = pd.DataFrame() - - # get tid, gid, gname, and paths - q = """SELECT ga.value, ta.value, + # get tid, gid, gname, and paths + q = """SELECT ga.value, ta.value, t.start_exon, t.jn_path, t.end_exon, t.start_vertex, t.end_vertex FROM gene_annotations ga @@ -118,325 +119,334 @@ def create_dfs_db(db): OR ga.attribute='gene_id') """ - c.execute(q) - data = c.fetchall() + c.execute(q) + data = c.fetchall() + + # get fields from each transcript and add to dataframe + gids, tids, paths = zip(*[(i[0], i[1], i[2:]) for i in data[::2]]) + gnames = [i[0] for i in data[1::2]] + paths = get_db_edge_paths(paths) + + t_df["tid"] = np.asarray(tids) + t_df["path"] = np.asarray(paths) - # get fields from each transcript and add to dataframe - gids, tids, paths = zip(*[(i[0], i[1], i[2:]) for i in data[::2]]) - gnames = [i[0] for i in data[1::2]] - paths = get_db_edge_paths(paths) + t_df = create_dupe_index(t_df, "tid") + t_df = set_dupe_index(t_df, "tid") - t_df['tid'] = np.asarray(tids) - t_df['path'] = np.asarray(paths) + # furnish the last bit of info in each df + t_df["path"] = [[int(n) for n in path] for path in get_db_vertex_paths(paths, edge_df)] + loc_df = create_dupe_index(loc_df, "vertex_id") + loc_df = set_dupe_index(loc_df, "vertex_id") - t_df = create_dupe_index(t_df, 'tid') - t_df = set_dupe_index(t_df, 'tid') + edge_df.drop("talon_edge_id", axis=1, inplace=True) + edge_df = create_dupe_index(edge_df, "edge_id") + edge_df = set_dupe_index(edge_df, "edge_id") - # furnish the last bit of info in each df - t_df['path'] = [[int(n) for n in path] - for path in get_db_vertex_paths(paths, edge_df)] - loc_df = create_dupe_index(loc_df, 'vertex_id') - loc_df = set_dupe_index(loc_df, 'vertex_id') + return loc_df, edge_df, t_df - edge_df.drop('talon_edge_id', axis=1, 
inplace=True) - edge_df = create_dupe_index(edge_df, 'edge_id') - edge_df = set_dupe_index(edge_df, 'edge_id') + # create loc_df (nodes), edge_df (edges), and t_df (transcripts) from gtf + # adapted from Dana Wyman and TALON - return loc_df, edge_df, t_df - # create loc_df (nodes), edge_df (edges), and t_df (transcripts) from gtf - # adapted from Dana Wyman and TALON def create_dfs_gtf(gtf_file): + # make sure file exists + if not os.path.exists(gtf_file): + raise Exception("GTF file not found. Check path.") + + # depending on the strand, determine the stard and stop + # coords of an intron or exon + def find_edge_start_stop(v1, v2, strand): + if strand == "-": + start = max([v1, v2]) + stop = min([v1, v2]) + elif strand == "+": + start = min([v1, v2]) + stop = max([v1, v2]) + return start, stop + + # dictionaries to hold unique edges and transcripts + transcripts = {} + exons = {} + + with open(gtf_file) as gtf: + for line in gtf: + # ignore header lines + if line.startswith("#"): + continue + + # split each entry + line = line.strip().split("\t") + + # get some fields from gtf that we care about + chrom = line[0] + entry_type = line[2] + start = int(line[3]) + stop = int(line[4]) + strand = line[6] + fields = line[-1] + + # transcript entry + if entry_type == "transcript": + attributes = get_fields(line) + tid = attributes["transcript_id"] + gid = attributes["gene_id"] + + # add transcript to dictionary + transcript = {tid: {"gid": gid, "tid": tid, "strand": strand, "exons": []}} + transcripts.update(transcript) + + # exon entry + elif entry_type == "exon": + attributes = get_fields(line) + start, stop = find_edge_start_stop(start, stop, strand) + eid = "{}_{}_{}_{}_exon".format(chrom, start, stop, strand) + tid = attributes["transcript_id"] + + # add novel exon to dictionary + if eid not in exons: + edge = {eid: {"eid": eid, "chrom": chrom, "v1": start, "v2": stop, "strand": strand}} + exons.update(edge) + + # add this exon to the transcript's list of exons + if tid in transcripts: + transcripts[tid]["exons"].append(eid) + + # once we have all transcripts, make loc_df + locs = {} + vertex_id = 0 + for edge_id, edge in exons.items(): + chrom = edge["chrom"] + strand = edge["strand"] + + v1 = edge["v1"] + v2 = edge["v2"] + + # exon start + key = (chrom, v1) + if key not in locs: + locs[key] = vertex_id + vertex_id += 1 + # exon end + key = (chrom, v2) + if key not in locs: + locs[key] = vertex_id + vertex_id += 1 + + # add locs-indexed path to transcripts, and populate edges + edges = {} + for _, t in transcripts.items(): + t["path"] = [] + strand = t["strand"] + t_exons = t["exons"] + + for i, exon_id in enumerate(t_exons): + # pull some information from exon dict + exon = exons[exon_id] + chrom = exon["chrom"] + v1 = exon["v1"] + v2 = exon["v2"] + strand = exon["strand"] + + # add current exon and subsequent intron + # (if not the last exon) for each exon to edges + key = (chrom, v1, v2, strand) + v1_key = (chrom, v1) + v2_key = (chrom, v2) + edge_id = (locs[v1_key], locs[v2_key]) + if key not in edges: + edges[key] = {"edge_id": edge_id, "edge_type": "exon"} + + # add exon locs to path + t["path"] += list(edge_id) + + # if this isn't the last exon, we also needa add an intron + # this consists of v2 of the prev exon and v1 of the next exon + if i < len(t_exons) - 1: + next_exon = exons[t_exons[i + 1]] + v1 = next_exon["v1"] + key = (chrom, v2, v1, strand) + v1_key = (chrom, v1) + edge_id = (locs[v2_key], locs[v1_key]) + if key not in edges: + edges[key] = {"edge_id": edge_id, 
"edge_type": "intron"} + + # turn transcripts, edges, and locs into dataframes + locs = [{"chrom": key[0], "coord": key[1], "vertex_id": vertex_id} for key, vertex_id in locs.items()] + loc_df = pd.DataFrame(locs) + + edges = [ + { + "v1": item["edge_id"][0], + "v2": item["edge_id"][1], + "strand": key[3], + "edge_id": item["edge_id"], + "edge_type": item["edge_type"], + } + for key, item in edges.items() + ] + edge_df = pd.DataFrame(edges) + + transcripts = [{"tid": key, "gid": item["gid"], "path": item["path"]} for key, item in transcripts.items()] + t_df = pd.DataFrame(transcripts) + + # final df formatting + loc_df = create_dupe_index(loc_df, "vertex_id") + loc_df = set_dupe_index(loc_df, "vertex_id") + edge_df = create_dupe_index(edge_df, "edge_id") + edge_df = set_dupe_index(edge_df, "edge_id") + t_df = create_dupe_index(t_df, "tid") + t_df = set_dupe_index(t_df, "tid") + + return loc_df, edge_df, t_df - # make sure file exists - if not os.path.exists(gtf_file): - raise Exception('GTF file not found. Check path.') - - # depending on the strand, determine the stard and stop - # coords of an intron or exon - def find_edge_start_stop(v1, v2, strand): - if strand == '-': - start = max([v1, v2]) - stop = min([v1, v2]) - elif strand == '+': - start = min([v1, v2]) - stop = max([v1, v2]) - return start, stop - - # dictionaries to hold unique edges and transcripts - transcripts = {} - exons = {} - - with open(gtf_file) as gtf: - for line in gtf: - - # ignore header lines - if line.startswith('#'): - continue - - # split each entry - line = line.strip().split('\t') - - # get some fields from gtf that we care about - chrom = line[0] - entry_type = line[2] - start = int(line[3]) - stop = int(line[4]) - strand = line[6] - fields = line[-1] - - # transcript entry - if entry_type == "transcript": - attributes = get_fields(line) - tid = attributes['transcript_id'] - gid = attributes['gene_id'] - - # add transcript to dictionary - transcript = {tid: {'gid': gid, - 'tid': tid, - 'strand': strand, - 'exons': []}} - transcripts.update(transcript) - - # exon entry - elif entry_type == "exon": - attributes = get_fields(line) - start, stop = find_edge_start_stop(start, stop, strand) - eid = '{}_{}_{}_{}_exon'.format(chrom, start, stop, strand) - tid = attributes['transcript_id'] - - # add novel exon to dictionary - if eid not in exons: - edge = {eid: {'eid': eid, - 'chrom': chrom, - 'v1': start, - 'v2': stop, - 'strand': strand}} - exons.update(edge) - - # add this exon to the transcript's list of exons - if tid in transcripts: - transcripts[tid]['exons'].append(eid) - - # once we have all transcripts, make loc_df - locs = {} - vertex_id = 0 - for edge_id, edge in exons.items(): - chrom = edge['chrom'] - strand = edge['strand'] - - v1 = edge['v1'] - v2 = edge['v2'] - - # exon start - key = (chrom, v1) - if key not in locs: - locs[key] = vertex_id - vertex_id += 1 - # exon end - key = (chrom, v2) - if key not in locs: - locs[key] = vertex_id - vertex_id += 1 - - # add locs-indexed path to transcripts, and populate edges - edges = {} - for _,t in transcripts.items(): - t['path'] = [] - strand = t['strand'] - t_exons = t['exons'] - - for i, exon_id in enumerate(t_exons): - - # pull some information from exon dict - exon = exons[exon_id] - chrom = exon['chrom'] - v1 = exon['v1'] - v2 = exon['v2'] - strand = exon['strand'] - - # add current exon and subsequent intron - # (if not the last exon) for each exon to edges - key = (chrom, v1, v2, strand) - v1_key = (chrom, v1) - v2_key = (chrom, v2) - edge_id = 
(locs[v1_key], locs[v2_key]) - if key not in edges: - edges[key] = {'edge_id': edge_id, 'edge_type': 'exon'} - - # add exon locs to path - t['path'] += list(edge_id) - - # if this isn't the last exon, we also needa add an intron - # this consists of v2 of the prev exon and v1 of the next exon - if i < len(t_exons)-1: - next_exon = exons[t_exons[i+1]] - v1 = next_exon['v1'] - key = (chrom, v2, v1, strand) - v1_key = (chrom, v1) - edge_id = (locs[v2_key], locs[v1_key]) - if key not in edges: - edges[key] = {'edge_id': edge_id, 'edge_type': 'intron'} - - # turn transcripts, edges, and locs into dataframes - locs = [{'chrom': key[0], - 'coord': key[1], - 'vertex_id': vertex_id} for key, vertex_id in locs.items()] - loc_df = pd.DataFrame(locs) - - edges = [{'v1': item['edge_id'][0], - 'v2': item['edge_id'][1], - 'strand': key[3], - 'edge_id': item['edge_id'], - 'edge_type': item['edge_type']} for key, item in edges.items()] - edge_df = pd.DataFrame(edges) - - transcripts = [{'tid': key, - 'gid': item['gid'], - 'path': item['path']} for key, item in transcripts.items()] - t_df = pd.DataFrame(transcripts) - - # final df formatting - loc_df = create_dupe_index(loc_df, 'vertex_id') - loc_df = set_dupe_index(loc_df, 'vertex_id') - edge_df = create_dupe_index(edge_df, 'edge_id') - edge_df = set_dupe_index(edge_df, 'edge_id') - t_df = create_dupe_index(t_df, 'tid') - t_df = set_dupe_index(t_df, 'tid') - - return loc_df, edge_df, t_df # convert talon query into edge path def get_db_edge_paths(paths): - edge_paths = [] - for p in paths: - if p[1] == None: - edge_paths.append([p[0]]) - else: - edge_paths.append( - [p[0], *[int(i) for i in p[1].split(',')], p[2]]) - return edge_paths + edge_paths = [] + for p in paths: + if p[1] == None: + edge_paths.append([p[0]]) + else: + edge_paths.append([p[0], *[int(i) for i in p[1].split(",")], p[2]]) + return edge_paths + # convert edge path to vertex path def get_db_vertex_paths(paths, edge_df): - vertex_paths = [] - for p in paths: - path = [] - for i, e in enumerate(p): - entry = edge_df.loc[edge_df.talon_edge_id == e] - if i == 0: - path.extend([entry.v1.values[0], entry.v2.values[0]]) - else: path.append(entry.v2.values[0]) - vertex_paths.append(path) - return vertex_paths + vertex_paths = [] + for p in paths: + path = [] + for i, e in enumerate(p): + entry = edge_df.loc[edge_df.talon_edge_id == e] + if i == 0: + path.extend([entry.v1.values[0], entry.v2.values[0]]) + else: + path.append(entry.v2.values[0]) + vertex_paths.append(path) + return vertex_paths # creates the duplicate index def create_dupe_index(df, ind_name): - df[ind_name+'_back'] = df[ind_name] - return df + df[ind_name + "_back"] = df[ind_name] + return df + def add_coord_info(edge_df, loc_df): - edge_df['chrom'] = edge_df.apply(lambda x: loc_df.loc[x.v1, 'chrom'], axis=1) - edge_df['start'] = edge_df.apply(lambda x: loc_df.loc[x.v1, 'coord'], axis=1) - edge_df['stop'] = edge_df.apply(lambda x: loc_df.loc[x.v2, 'coord'], axis=1) + edge_df["chrom"] = edge_df.apply(lambda x: loc_df.loc[x.v1, "chrom"], axis=1) + edge_df["start"] = edge_df.apply(lambda x: loc_df.loc[x.v1, "coord"], axis=1) + edge_df["stop"] = edge_df.apply(lambda x: loc_df.loc[x.v2, "coord"], axis=1) + + return edge_df + - return edge_df +def subset_edges(edge_df, mode="intron"): + sjs = edge_df[edge_df.apply(lambda x: True if x.edge_type == mode else False, axis=1)] + return sjs -def subset_edges(edge_df, mode='intron'): - sjs = edge_df[edge_df.apply( - lambda x: True if x.edge_type == mode else False, axis=1)] - return sjs def 
determine_sj_novelty(ref_edge_df, edge_df): + # Merge known starts from ref_edge_df with the query edges + ref_edge_df["start_known"] = True + edge_df = edge_df.merge( + ref_edge_df[["chrom", "start", "strand", "start_known"]], how="left", on=["chrom", "strand", "start"] + ) + edge_df.fillna(value=False, inplace=True) + + # Merge known ends from ref_edge_df with the query edges + ref_edge_df["stop_known"] = True + edge_df = edge_df.merge( + ref_edge_df[["chrom", "stop", "strand", "stop_known"]], how="left", on=["chrom", "strand", "stop"] + ) + edge_df.fillna(value=False, inplace=True) + + # Now determine whether the edge in whole has been seen before + ref_edge_df["combination_known"] = True + edge_df = edge_df.merge( + ref_edge_df[["chrom", "start", "stop", "strand", "combination_known"]], + how="left", + on=["chrom", "strand", "start", "stop"], + ) + edge_df.fillna(value=False, inplace=True) + + return edge_df - # Merge known starts from ref_edge_df with the query edges - ref_edge_df['start_known'] = True - edge_df = edge_df.merge(ref_edge_df[['chrom', 'start', 'strand', 'start_known']], - how = 'left', - on = ['chrom', 'strand', 'start']) - edge_df.fillna(value=False, inplace=True) - - # Merge known ends from ref_edge_df with the query edges - ref_edge_df['stop_known'] = True - edge_df = edge_df.merge(ref_edge_df[['chrom', 'stop', 'strand', 'stop_known']], - how = 'left', - on = ['chrom', 'strand', 'stop']) - edge_df.fillna(value=False, inplace=True) - - # Now determine whether the edge in whole has been seen before - ref_edge_df['combination_known'] = True - edge_df = edge_df.merge(ref_edge_df[['chrom', 'start', 'stop', 'strand', - 'combination_known']], - how = 'left', on = ['chrom', 'strand', 'start', 'stop']) - edge_df.fillna(value=False, inplace=True) - - return edge_df # renames old index dupe column in df and resets the index def reset_dupe_index(df, ind_name): - df.rename({ind_name: ind_name+'_back'}, inplace=True, axis=1) - df.reset_index(inplace=True) - return(df) + df.rename({ind_name: ind_name + "_back"}, inplace=True, axis=1) + df.reset_index(inplace=True) + return df + # set index, rename dupe index in df def set_dupe_index(df, ind_name): - df.set_index(ind_name, inplace=True) - df.rename({ind_name+'_back': ind_name}, inplace=True, axis=1) - return(df) + df.set_index(ind_name, inplace=True) + df.rename({ind_name + "_back": ind_name}, inplace=True, axis=1) + return df -def format_edge_df(edge_df): - edge_df.reset_index(drop=True, inplace=True) - edge_df.drop(['edge_type', 'v1', 'v2'], axis=1, inplace=True) - return edge_df - -def find_tids_from_sj(edge_df, t_df, mode='intron'): - if mode == 'exon': - t_df['edges'] = t_df.apply( - lambda x: [(x.path[i], x.path[i+1]) for i in range(len(x.path[:-1]))][::2], - axis=1) - elif mode == 'intron': - t_df['edges'] = t_df.apply( - lambda x: [(x.path[i], x.path[i+1]) for i in range(len(x.path[:-1]))][1::2], - axis=1) - edge_df['tids'] = edge_df.apply(lambda x: add_tids_to_sj(x, t_df), axis=1) - edge_df.reset_index(drop=True, inplace=True) - edge_df.drop('edge_id', inplace=True, axis=1) - - return edge_df -def add_tids_to_sj(x, t_df): - return ','.join([tid for tid, edges in zip(t_df.tid, t_df.edges) if x.edge_id in edges]) - - -def main(): - args = get_args() +def format_edge_df(edge_df): + edge_df.reset_index(drop=True, inplace=True) + edge_df.drop(["edge_type", "v1", "v2"], axis=1, inplace=True) + return edge_df - ref_loc_df, ref_edge_df, ref_t_df = create_dfs_gtf(args.ref_gtf) - ref_edge_df = add_coord_info(ref_edge_df, ref_loc_df) 
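For orientation, the novelty labelling in determine_sj_novelty above is three pandas left-merges against the reference junction table, with fillna(False) marking anything the reference never saw. A minimal sketch of one of those merges, on invented toy junctions rather than real TALON output:

import pandas as pd

# toy reference and query junction tables (coordinates invented)
ref = pd.DataFrame({"chrom": ["chr1"], "start": [100], "stop": [200], "strand": ["+"]})
qry = pd.DataFrame({"chrom": ["chr1", "chr1"], "start": [100, 100], "stop": [200, 250], "strand": ["+", "+"]})

# a left merge flags query junctions whose start coordinate is known;
# unmatched rows come back NaN, which fillna turns into False
ref["start_known"] = True
qry = qry.merge(ref[["chrom", "start", "strand", "start_known"]], how="left", on=["chrom", "strand", "start"])
qry.fillna(value=False, inplace=True)

# both toy junctions reuse the known start, so start_known is True for each;
# repeating the merge on stop and on (start, stop) yields stop_known and
# combination_known, as in the function above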
- ref_edge_df = subset_edges(ref_edge_df, mode=args.mode) - ref_edge_df = format_edge_df(ref_edge_df) - if args.db: - loc_df, edge_df, t_df = create_dfs_db(args.db) +def find_tids_from_sj(edge_df, t_df, mode="intron"): + if mode == "exon": + t_df["edges"] = t_df.apply(lambda x: [(x.path[i], x.path[i + 1]) for i in range(len(x.path[:-1]))][::2], axis=1) + elif mode == "intron": + t_df["edges"] = t_df.apply( + lambda x: [(x.path[i], x.path[i + 1]) for i in range(len(x.path[:-1]))][1::2], axis=1 + ) + edge_df["tids"] = edge_df.apply(lambda x: add_tids_to_sj(x, t_df), axis=1) + edge_df.reset_index(drop=True, inplace=True) + edge_df.drop("edge_id", inplace=True, axis=1) - elif args.gtf: - loc_df, edge_df, t_df = create_dfs_gtf(args.gtf) + return edge_df - edge_df = add_coord_info(edge_df, loc_df) - edge_df = subset_edges(edge_df, mode=args.mode) - edge_df = format_edge_df(edge_df) - edge_df = determine_sj_novelty(ref_edge_df, edge_df) - edge_df = find_tids_from_sj(edge_df, t_df, mode=args.mode) - edge_df = edge_df.rename(columns={'tids': 'transcript_ids'}) - edge_df.to_csv('{}_{}s.tsv'.format(args.outprefix, args.mode), - sep='\t', index=False, columns=["chrom","start","stop", - "strand", "start_known", - "stop_known", - "combination_known", - "transcript_ids"]) +def add_tids_to_sj(x, t_df): + return ",".join([tid for tid, edges in zip(t_df.tid, t_df.edges) if x.edge_id in edges]) -if __name__ == '__main__': - main() +def main(): + args = get_args() + + ref_loc_df, ref_edge_df, ref_t_df = create_dfs_gtf(args.ref_gtf) + ref_edge_df = add_coord_info(ref_edge_df, ref_loc_df) + ref_edge_df = subset_edges(ref_edge_df, mode=args.mode) + ref_edge_df = format_edge_df(ref_edge_df) + + if args.db: + loc_df, edge_df, t_df = create_dfs_db(args.db) + + elif args.gtf: + loc_df, edge_df, t_df = create_dfs_gtf(args.gtf) + + edge_df = add_coord_info(edge_df, loc_df) + edge_df = subset_edges(edge_df, mode=args.mode) + edge_df = format_edge_df(edge_df) + edge_df = determine_sj_novelty(ref_edge_df, edge_df) + edge_df = find_tids_from_sj(edge_df, t_df, mode=args.mode) + + edge_df = edge_df.rename(columns={"tids": "transcript_ids"}) + edge_df.to_csv( + "{}_{}s.tsv".format(args.outprefix, args.mode), + sep="\t", + index=False, + columns=[ + "chrom", + "start", + "stop", + "strand", + "start_known", + "stop_known", + "combination_known", + "transcript_ids", + ], + ) + + +if __name__ == "__main__": + main() diff --git a/src/talon/post/map_antisense_genes_to_sense.py b/src/talon/post/map_antisense_genes_to_sense.py index 1a506ec..bdd4276 100644 --- a/src/talon/post/map_antisense_genes_to_sense.py +++ b/src/talon/post/map_antisense_genes_to_sense.py @@ -4,27 +4,32 @@ # map_antisense_genes_to_sense.py is a utility that outputs the ID of the # corresponding sense gene for every antisense gene in the database -from optparse import OptionParser import sqlite3 +from optparse import OptionParser from pathlib import Path from . import ab_utils as autils + def getOptions(): parser = OptionParser() - parser.add_option("--db", dest = "database", - help = "TALON database", metavar = "FILE", type = "string") - parser.add_option("--annot", "-a", dest = "annot", - help = """Which annotation version to use. Will determine which + parser.add_option("--db", dest="database", help="TALON database", metavar="FILE", type="string") + parser.add_option( + "--annot", + "-a", + dest="annot", + help="""Which annotation version to use. Will determine which annotation is used to fetch gene names. 
Note: - Must be in the TALON database.""", type = "string") - parser.add_option("--o", dest = "outprefix", help = "Prefix for output GTF", - metavar = "FILE", type = "string") + Must be in the TALON database.""", + type="string", + ) + parser.add_option("--o", dest="outprefix", help="Prefix for output GTF", metavar="FILE", type="string") (options, args) = parser.parse_args() return options + # def check_annot_validity(annot, database): # """ Make sure that the user has entered a correct annotation name """ # @@ -52,16 +57,19 @@ def getOptions(): # # return + def create_gene_name_dict(cursor, annot): - """ Create a dictionary mapping TALON gene IDs to their names in the - annot annotation""" + """Create a dictionary mapping TALON gene IDs to their names in the + annot annotation""" - cursor.execute("""SELECT ga.ID, + cursor.execute( + """SELECT ga.ID, ga.value AS gene_name FROM gene_annotations AS ga WHERE ga.attribute = 'gene_name' - AND (ga.annot_name = '%s' OR ga.source = 'TALON')""" \ - % (annot)) + AND (ga.annot_name = '%s' OR ga.source = 'TALON')""" + % (annot) + ) gene_names = {} for entry in cursor.fetchall(): @@ -71,6 +79,7 @@ def create_gene_name_dict(cursor, annot): return gene_names + def main(): options = getOptions() database = options.database @@ -89,10 +98,12 @@ def main(): conn = sqlite3.connect(database) conn.row_factory = sqlite3.Row cursor = conn.cursor() - cursor.execute("""SELECT ga.ID As antisense_talon_ID, + cursor.execute( + """SELECT ga.ID As antisense_talon_ID, ga.value AS sense_talon_ID FROM gene_annotations AS ga - WHERE ga.attribute = 'gene_antisense_to_IDs'""") + WHERE ga.attribute = 'gene_antisense_to_IDs'""" + ) antisense_rows = cursor.fetchall() # Create a dict of gene names @@ -100,16 +111,15 @@ def main(): # Write antisense-sense pairs to file. When there is more than one sense match, # create separate lines - o = open(outfile, 'w') + o = open(outfile, "w") o.write(",".join(["antisense_talon_ID", "sense_talon_ID", "sense_gene_name"]) + "\n") for entry in antisense_rows: sense_IDs = entry["sense_talon_ID"].split(",") for sense_ID in sense_IDs: - o.write(",".join([str(entry["antisense_talon_ID"]), - str(sense_ID), - gene_name_dict[int(sense_ID)]]) + "\n") + o.write(",".join([str(entry["antisense_talon_ID"]), str(sense_ID), gene_name_dict[int(sense_ID)]]) + "\n") o.close() -if __name__ == '__main__': + +if __name__ == "__main__": main() diff --git a/src/talon/post/post_utils.py b/src/talon/post/post_utils.py index 0805f65..097d9a0 100644 --- a/src/talon/post/post_utils.py +++ b/src/talon/post/post_utils.py @@ -3,13 +3,14 @@ from .. import query_utils as qutils + def handle_filtering(database, annot, observed, whitelist_file, dataset_file): - """ Determines which transcripts to allow in the analysis. This can be done - in two different ways. If no whitelist is included, then all of the - transcripts in the database are included (modified by 'observed' - option). If a whitelist is provided, then transcripts on that list - will be included (modified by 'observed' option). This can be - tuned further by providing a dataset file, but this is optional. """ + """Determines which transcripts to allow in the analysis. This can be done + in two different ways. If no whitelist is included, then all of the + transcripts in the database are included (modified by 'observed' + option). If a whitelist is provided, then transcripts on that list + will be included (modified by 'observed' option). 
This can be + tuned further by providing a dataset file, but this is optional.""" conn = sqlite3.connect(database) conn.row_factory = sqlite3.Row @@ -31,14 +32,17 @@ def handle_filtering(database, annot, observed, whitelist_file, dataset_file): if datasets != None: # Limit the whitelist to transcripts detected in the datasets - transcripts = [ x[1] for x in whitelist ] + transcripts = [x[1] for x in whitelist] transcript_str = qutils.format_for_IN(transcripts) dataset_str = qutils.format_for_IN(datasets) query = """ SELECT DISTINCT gene_ID, transcript_ID FROM observed WHERE transcript_ID IN %s - AND dataset in %s """ % (transcript_str, dataset_str) + AND dataset in %s """ % ( + transcript_str, + dataset_str, + ) cursor.execute(query) whitelist = cursor.fetchall() @@ -46,6 +50,6 @@ def handle_filtering(database, annot, observed, whitelist_file, dataset_file): # check if the pass list has any transcripts if len(whitelist) == 0: - raise ValueError('No transcripts found with the given filtering settings') + raise ValueError("No transcripts found with the given filtering settings") return whitelist diff --git a/src/talon/post/summarize_datasets.py b/src/talon/post/summarize_datasets.py index 2bdd885..e0bbfac 100644 --- a/src/talon/post/summarize_datasets.py +++ b/src/talon/post/summarize_datasets.py @@ -4,61 +4,76 @@ from .. import query_utils as qutils + def get_args(): - """ Fetches the arguments for the program """ + """Fetches the arguments for the program""" program_desc = """Generates a tab-delimited file of gene and transcript counts for each dataset in the database (broken down by category).""" parser = argparse.ArgumentParser(description=program_desc) - parser.add_argument('--db', dest = 'database', metavar='FILE,', type = str, - help='TALON database') - parser.add_argument('--groups', dest = 'groups', metavar='FILE,', type = str, - help='Optional: file of comma-delimited dataset groups to process together', default = None) - parser.add_argument("--verbose", - help = "Verbose mode: print out the counts in terminal", - action="store_true") - parser.add_argument("--o", dest = "outprefix", - help = "Prefix for output file", type = str) + parser.add_argument("--db", dest="database", metavar="FILE,", type=str, help="TALON database") + parser.add_argument( + "--groups", + dest="groups", + metavar="FILE,", + type=str, + help="Optional: file of comma-delimited dataset groups to process together", + default=None, + ) + parser.add_argument("--verbose", help="Verbose mode: print out the counts in terminal", action="store_true") + parser.add_argument("--o", dest="outprefix", help="Prefix for output file", type=str) args = parser.parse_args() return args -def write_counts_file(cursor, outprefix, datasets, verbose = False): - """ Create a log file with the following columns: - - dataset name - - Number of reads annotated - - Number of known genes detected (total) - - Number of novel genes detected (total) - - Number of known transcripts detected (total) - - Number of novel transcripts detected (total) - Breakdowns by category - - Number of antisense genes detected - - Number of intergenic genes detected - - Number of known transcripts - - Number of FSM transcripts detected (perfect + with novelty) - - Number of total ISM transcripts detected - - Number of suffix ISMs detected - - Number of antisense transcripts detected - - Number of genomic transcripts detected + +def write_counts_file(cursor, outprefix, datasets, verbose=False): + """Create a log file with the following columns: + - dataset name + 
- Number of reads annotated + - Number of known genes detected (total) + - Number of novel genes detected (total) + - Number of known transcripts detected (total) + - Number of novel transcripts detected (total) + Breakdowns by category + - Number of antisense genes detected + - Number of intergenic genes detected + - Number of known transcripts + - Number of FSM transcripts detected (perfect + with novelty) + - Number of total ISM transcripts detected + - Number of suffix ISMs detected + - Number of antisense transcripts detected + - Number of genomic transcripts detected """ - o = open(outprefix + "_talon_summary.tsv", 'w') + o = open(outprefix + "_talon_summary.tsv", "w") d = dict() - columns = [ "dataset", "reads_annotated", "known_genes", "antisense_genes", - "other_novel_genes", "known_transcripts", "novel_transcripts", - "ISMs", "prefix_ISMs", "suffix_ISMs", "NICs", "NNCs", - "antisense_transcripts", "genomic_transcripts" ] + columns = [ + "dataset", + "reads_annotated", + "known_genes", + "antisense_genes", + "other_novel_genes", + "known_transcripts", + "novel_transcripts", + "ISMs", + "prefix_ISMs", + "suffix_ISMs", + "NICs", + "NNCs", + "antisense_transcripts", + "genomic_transcripts", + ] o.write("\t".join(columns) + "\n") # Get dataset names if datasets == None: cursor.execute(""" SELECT dataset_name FROM dataset """) - datasets = [ str(x[0]) for x in cursor.fetchall() ] + datasets = [str(x[0]) for x in cursor.fetchall()] for dataset in datasets: - # Get number of reads in the dataset reads = qutils.count_observed_reads(cursor, dataset) @@ -97,10 +112,22 @@ def write_counts_file(cursor, outprefix, datasets, verbose = False): # Get genomic novel transcripts genomic_transcripts = len(qutils.fetch_genomic_transcripts(cursor, dataset)) - outputs = [ dataset, reads, known_genes, antisense_genes, - intergenic_genes, known_transcripts, novel_transcripts, ISMs, prefix_ISMs, - suffix_ISMs, NICs, NNCs, antisense_transcripts, - genomic_transcripts ] + outputs = [ + dataset, + reads, + known_genes, + antisense_genes, + intergenic_genes, + known_transcripts, + novel_transcripts, + ISMs, + prefix_ISMs, + suffix_ISMs, + NICs, + NNCs, + antisense_transcripts, + genomic_transcripts, + ] if verbose == True: print("---------------%s---------------" % dataset) @@ -123,18 +150,20 @@ def write_counts_file(cursor, outprefix, datasets, verbose = False): o.close() + def process_groups(group_file): - """ Read in a comma-delimited file of dataset groups and format them - as a list of lists """ + """Read in a comma-delimited file of dataset groups and format them + as a list of lists""" datasets = [] - with open(group_file, 'r') as f: + with open(group_file, "r") as f: for line in f: line = line.strip() datasets.append(line.split(",")) return datasets + def main(): options = get_args() @@ -157,5 +186,5 @@ def main(): conn.close() -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/src/talon/process_sams.py b/src/talon/process_sams.py index 84c8071..80720c0 100644 --- a/src/talon/process_sams.py +++ b/src/talon/process_sams.py @@ -4,39 +4,39 @@ # Functions related to processing the input SAM files and partitioning them # for processing in parallel -import pyranges as pr -import pysam +import logging import os import time -import logging + +import pyranges as pr +import pysam save = pysam.set_verbosity(0) # pysam.set_verbosity(save) + def convert_to_bam(sam, bam, threads): - """ Convert provided sam file to bam file (provided name). 
""" + """Convert provided sam file to bam file (provided name).""" try: infile = pysam.AlignmentFile(sam, "r", threads=threads) - outfile = pysam.AlignmentFile( - bam, "wb", template=infile, threads=threads) + outfile = pysam.AlignmentFile(bam, "wb", template=infile, threads=threads) for s in infile: outfile.write(s) except Exception as e: logging.error(e) - msg = f'Problem converting SAM file {sam} to BAM' + msg = f"Problem converting SAM file {sam} to BAM" logging.error(msg) raise RuntimeError(msg) # raise RuntimeError("Problem converting sam file '%s' to bam." % (sam)) -def preprocess_sam(sam_files, datasets, use_cb_tag, - tmp_dir="talon_tmp/", n_threads=0): - """ Copy and rename the provided SAM/BAM file(s), merge them, and index. - This is necessary in order to use following commands on the reads. - The renaming is necessary in order to label the reads according to - their dataset.""" +def preprocess_sam(sam_files, datasets, use_cb_tag, tmp_dir="talon_tmp/", n_threads=0): + """Copy and rename the provided SAM/BAM file(s), merge them, and index. + This is necessary in order to use following commands on the reads. + The renaming is necessary in order to label the reads according to + their dataset.""" # Create the tmp dir os.system("mkdir -p %s " % (tmp_dir)) @@ -57,8 +57,7 @@ def preprocess_sam(sam_files, datasets, use_cb_tag, renamed_sams.append(sorted_bam) merged_bam = tmp_dir + "merged.bam" - merge_args = [merged_bam] + renamed_sams + \ - ["-f", "-r", "-@", str(n_threads)] + merge_args = [merged_bam] + renamed_sams + ["-f", "-r", "-@", str(n_threads)] # index_args = [merged_bam, "-@", str(n_threads)] # # Merge datasets and use -r option to include a read group tag @@ -76,12 +75,12 @@ def preprocess_sam(sam_files, datasets, use_cb_tag, elif use_cb_tag: for i, sam in enumerate(sam_files): fname_split = sam.split(".") - suffix = "."+fname_split[-1] + suffix = "." + fname_split[-1] if suffix == ".sam": - bam_copy = '{}{}_unsorted.bam'.format(tmp_dir, i) + bam_copy = "{}{}_unsorted.bam".format(tmp_dir, i) convert_to_bam(sam, bam_copy, n_threads) sam = bam_copy - sorted_bam = '{}{}.bam'.format(tmp_dir, i) + sorted_bam = "{}{}.bam".format(tmp_dir, i) pysam.sort("-@", str(n_threads), "-o", sorted_bam, sam) renamed_sams.append(sorted_bam) @@ -96,39 +95,39 @@ def preprocess_sam(sam_files, datasets, use_cb_tag, pysam.index(sorted_bam) # ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) # print("[ %s ] Merged input SAM/BAM files" % (ts)) - logging.info('Merged input SAM/BAM files') + logging.info("Merged input SAM/BAM files") except: # raise RuntimeError(("Problem merging and indexing SAM/BAM files. " # "Check your file paths and make sure that all " # "files have headers.")) - msg = "Problem merging and indexing SAM/BAM files. "+\ - "Check your file paths and make sure that all "+\ - "files have headers." + msg = ( + "Problem merging and indexing SAM/BAM files. " + + "Check your file paths and make sure that all " + + "files have headers." + ) logging.error(msg) raise RuntimeError(msg) return sorted_bam -def partition_reads(sam_files, datasets, use_cb_tag, - tmp_dir="talon_tmp/", n_threads=0): - """ Use bedtools merge to create non-overlapping intervals from all of the - transcripts in a series of SAM/BAM files. Then, iterate over the intervals - to extract all reads inside of them from the pysam object. 
+def partition_reads(sam_files, datasets, use_cb_tag, tmp_dir="talon_tmp/", n_threads=0): + """Use bedtools merge to create non-overlapping intervals from all of the + transcripts in a series of SAM/BAM files. Then, iterate over the intervals + to extract all reads inside of them from the pysam object. - Returns: - - List of lists: sublists contain pysam reads from a given interval - - List of tuple intervals - - filename of merged bam file (to keep track of the header) - """ - merged_bam = preprocess_sam(sam_files, datasets, use_cb_tag, - tmp_dir=tmp_dir, n_threads=n_threads) + Returns: + - List of lists: sublists contain pysam reads from a given interval + - List of tuple intervals + - filename of merged bam file (to keep track of the header) + """ + merged_bam = preprocess_sam(sam_files, datasets, use_cb_tag, tmp_dir=tmp_dir, n_threads=n_threads) try: gr = pr.read_bam(merged_bam) except Exception as e: # print(e) logging.error(e) - msg = f'Problem opening SAM file {merged_bam}' + msg = f"Problem opening SAM file {merged_bam}" logging.error(msg) raise RuntimeError(msg) @@ -139,20 +138,17 @@ def partition_reads(sam_files, datasets, use_cb_tag, read_groups = [] with pysam.AlignmentFile(merged_bam) as bam: # type: pysam.AlignmentFile for _, interval in gr.df.iterrows(): - reads = get_reads_in_interval(bam, interval.Chromosome, - interval.Start, interval.End) + reads = get_reads_in_interval(bam, interval.Chromosome, interval.Start, interval.End) read_groups.append(reads) - coords.append((interval.Chromosome, - interval.Start + 1, interval.End)) + coords.append((interval.Chromosome, interval.Start + 1, interval.End)) return read_groups, coords, merged_bam -def write_reads_to_file(read_groups, intervals, - header_template, tmp_dir="talon_tmp/"): - """ For each read group, iterate over the reads and write them to a file - named for the interval they belong to. This step is necessary because - Pysam objects cannot be pickled. """ +def write_reads_to_file(read_groups, intervals, header_template, tmp_dir="talon_tmp/"): + """For each read group, iterate over the reads and write them to a file + named for the interval they belong to. This step is necessary because + Pysam objects cannot be pickled.""" tmp_dir = tmp_dir + "interval_files/" if not os.path.exists(tmp_dir): @@ -171,9 +167,9 @@ def write_reads_to_file(read_groups, intervals, def get_reads_in_interval(sam, chrom, start, end): - """ Given an open pysam.AlignmentFile, return only the reads that overlap - the provided interval. Note that this means there may be reads that - extend beyond the bounds of the interval. """ + """Given an open pysam.AlignmentFile, return only the reads that overlap + the provided interval. 
Note that this means there may be reads that + extend beyond the bounds of the interval.""" iterator = sam.fetch(chrom, start, end) reads = [x for x in iterator] return reads diff --git a/src/talon/query_utils.py b/src/talon/query_utils.py index 992bd2c..c9946c8 100644 --- a/src/talon/query_utils.py +++ b/src/talon/query_utils.py @@ -5,12 +5,14 @@ import sqlite3 + def fetch_reproducible_intergenic(cursor, datasets): - """ Return the gene and transcript ID of any intergenic transcripts that were - found in at least two of the supplied datasets """ + """Return the gene and transcript ID of any intergenic transcripts that were + found in at least two of the supplied datasets""" datasets = format_for_IN(datasets) - query = """SELECT gene_ID, + query = ( + """SELECT gene_ID, a.transcript_ID FROM abundance as a LEFT JOIN transcript_annotations as ta @@ -18,20 +20,24 @@ def fetch_reproducible_intergenic(cursor, datasets): LEFT JOIN transcripts ON transcripts.transcript_ID = a.transcript_ID WHERE ta.attribute = 'intergenic_transcript' - AND a.dataset IN """ + datasets + \ - """ GROUP BY a.transcript_ID + AND a.dataset IN """ + + datasets + + """ GROUP BY a.transcript_ID HAVING count(*) > 1;""" + ) cursor.execute(query) intergenic = [(x[0], x[1], "intergenic_transcript") for x in cursor.fetchall()] return intergenic + def fetch_reproducible_antisense(cursor, datasets): - """ Return the gene and transcript ID of any antisense transcripts that were - found in at least two of the supplied datasets """ + """Return the gene and transcript ID of any antisense transcripts that were + found in at least two of the supplied datasets""" datasets = format_for_IN(datasets) - query = """SELECT gene_ID, + query = ( + """SELECT gene_ID, a.transcript_ID FROM abundance as a LEFT JOIN transcript_annotations as ta @@ -39,20 +45,24 @@ def fetch_reproducible_antisense(cursor, datasets): LEFT JOIN transcripts ON transcripts.transcript_ID = a.transcript_ID WHERE ta.attribute = 'antisense_transcript' - AND a.dataset IN """ + datasets + \ - """ GROUP BY a.transcript_ID + AND a.dataset IN """ + + datasets + + """ GROUP BY a.transcript_ID HAVING count(*) > 1;""" + ) cursor.execute(query) antisense = [(x[0], x[1], "antisense_transcript") for x in cursor.fetchall()] return antisense + def fetch_reproducible_NNCs(cursor, datasets): - """ Return the gene and transcript ID of any NNC transcripts that were - found in at least two of the supplied datasets """ + """Return the gene and transcript ID of any NNC transcripts that were + found in at least two of the supplied datasets""" datasets = format_for_IN(datasets) - query = """SELECT gene_ID, + query = ( + """SELECT gene_ID, a.transcript_ID FROM abundance as a LEFT JOIN transcript_annotations as ta @@ -60,20 +70,24 @@ def fetch_reproducible_NNCs(cursor, datasets): LEFT JOIN transcripts ON transcripts.transcript_ID = a.transcript_ID WHERE ta.attribute = 'NNC_transcript' - AND a.dataset IN """ + datasets + \ - """ GROUP BY a.transcript_ID + AND a.dataset IN """ + + datasets + + """ GROUP BY a.transcript_ID HAVING count(*) > 1;""" + ) cursor.execute(query) NNC = [(x[0], x[1], "NNC_transcript") for x in cursor.fetchall()] return NNC + def fetch_reproducible_NICs(cursor, datasets): - """ Return the gene and transcript ID of any NIC transcripts that were - found in at least two of the supplied datasets """ + """Return the gene and transcript ID of any NIC transcripts that were + found in at least two of the supplied datasets""" datasets = format_for_IN(datasets) - query = """SELECT 
gene_ID, + query = ( + """SELECT gene_ID, a.transcript_ID FROM abundance as a LEFT JOIN transcript_annotations as ta @@ -81,24 +95,28 @@ def fetch_reproducible_NICs(cursor, datasets): LEFT JOIN transcripts ON transcripts.transcript_ID = a.transcript_ID WHERE ta.attribute = 'NIC_transcript' - AND a.dataset IN """ + datasets + \ - """ GROUP BY a.transcript_ID + AND a.dataset IN """ + + datasets + + """ GROUP BY a.transcript_ID HAVING count(*) > 1;""" + ) cursor.execute(query) NIC = [(x[0], x[1], "NIC_transcript") for x in cursor.fetchall()] return NIC + def fetch_reproducible_ISMs(cursor, datasets): - """ Return the gene and transcript ID of any ISM transcripts that were - found in at least two of the supplied datasets """ + """Return the gene and transcript ID of any ISM transcripts that were + found in at least two of the supplied datasets""" datasets = format_for_IN(datasets) transcripts_seen = {} # To label novelty, perform queries separately for suffix, prefix, and # regular ISMs - query = """SELECT gene_ID, + query = ( + """SELECT gene_ID, a.transcript_ID FROM abundance as a LEFT JOIN transcript_annotations as ta @@ -106,9 +124,11 @@ def fetch_reproducible_ISMs(cursor, datasets): LEFT JOIN transcripts ON transcripts.transcript_ID = a.transcript_ID WHERE ta.attribute = 'ISM-prefix_transcript' - AND a.dataset IN """ + datasets + \ - """ GROUP BY a.transcript_ID + AND a.dataset IN """ + + datasets + + """ GROUP BY a.transcript_ID HAVING count(*) > 1;""" + ) cursor.execute(query) ISMs = [(x[0], x[1], "ISM-prefix_transcript") for x in cursor.fetchall()] @@ -116,7 +136,8 @@ def fetch_reproducible_ISMs(cursor, datasets): for entry in ISMs: transcripts_seen[entry[1]] = 1 - query = """SELECT gene_ID, + query = ( + """SELECT gene_ID, a.transcript_ID FROM abundance as a LEFT JOIN transcript_annotations as ta @@ -124,9 +145,11 @@ def fetch_reproducible_ISMs(cursor, datasets): LEFT JOIN transcripts ON transcripts.transcript_ID = a.transcript_ID WHERE ta.attribute = 'ISM-suffix_transcript' - AND a.dataset IN """ + datasets + \ - """ GROUP BY a.transcript_ID + AND a.dataset IN """ + + datasets + + """ GROUP BY a.transcript_ID HAVING count(*) > 1;""" + ) cursor.execute(query) suffix_ISMs = [(x[0], x[1], "ISM-suffix_transcript") for x in cursor.fetchall()] @@ -136,7 +159,8 @@ def fetch_reproducible_ISMs(cursor, datasets): ISMs.append(entry) transcripts_seen[entry[1]] = 1 - query = """SELECT gene_ID, + query = ( + """SELECT gene_ID, a.transcript_ID FROM abundance as a LEFT JOIN transcript_annotations as ta @@ -144,9 +168,11 @@ def fetch_reproducible_ISMs(cursor, datasets): LEFT JOIN transcripts ON transcripts.transcript_ID = a.transcript_ID WHERE ta.attribute = 'ISM_transcript' - AND a.dataset IN """ + datasets + \ - """ GROUP BY a.transcript_ID + AND a.dataset IN """ + + datasets + + """ GROUP BY a.transcript_ID HAVING count(*) > 1;""" + ) cursor.execute(query) all_ISMs = [(x[0], x[1], "other_ISM_transcript") for x in cursor.fetchall()] @@ -158,32 +184,41 @@ def fetch_reproducible_ISMs(cursor, datasets): return ISMs + def fetch_known_transcripts_with_gene_label(cursor, datasets): - """ Fetch known transcripts along with the gene they belong to """ + """Fetch known transcripts along with the gene they belong to""" datasets = format_for_IN(datasets) - query = """SELECT DISTINCT gene_ID,transcript_ID FROM observed + query = ( + """SELECT DISTINCT gene_ID,transcript_ID FROM observed LEFT JOIN transcript_annotations AS ta ON ta.ID = observed.transcript_ID WHERE (ta.attribute = 'transcript_status' AND 
ta.value = 'KNOWN') - AND observed.dataset IN """ + datasets + AND observed.dataset IN """ + + datasets + ) cursor.execute(query) known_transcripts = [(x[0], x[1], "FSM_transcript") for x in cursor.fetchall()] return known_transcripts + def fetch_NIC_transcripts_with_gene_label(cursor, datasets): - """ Fetch NIC transcripts along with the gene they belong to """ + """Fetch NIC transcripts along with the gene they belong to""" datasets = format_for_IN(datasets) - query = """SELECT DISTINCT gene_ID,transcript_ID FROM observed + query = ( + """SELECT DISTINCT gene_ID,transcript_ID FROM observed LEFT JOIN transcript_annotations AS ta ON ta.ID = observed.transcript_ID WHERE (ta.attribute = 'NIC_transcript') - AND observed.dataset IN """ + datasets + AND observed.dataset IN """ + + datasets + ) cursor.execute(query) known_transcripts = [(x[0], x[1], "NIC_transcript") for x in cursor.fetchall()] return known_transcripts + def count_observed_reads(cursor, datasets): - """ Count the number of observed reads for the provided datasets """ + """Count the number of observed reads for the provided datasets""" datasets = format_for_IN(datasets) query = "SELECT COUNT(obs_ID) FROM observed WHERE dataset IN " + datasets @@ -191,61 +226,76 @@ def count_observed_reads(cursor, datasets): reads = cursor.fetchone()[0] return reads + def fetch_all_known_genes_detected(cursor, datasets): - """ Get the IDs of all known genes found in a particular dataset (no - filter with respect to the type of transcript detected). """ + """Get the IDs of all known genes found in a particular dataset (no + filter with respect to the type of transcript detected).""" datasets = format_for_IN(datasets) - query = """SELECT DISTINCT(gene_ID) FROM observed + query = ( + """SELECT DISTINCT(gene_ID) FROM observed LEFT JOIN gene_annotations AS ga ON ga.ID = observed.gene_ID WHERE (ga.attribute = 'gene_status' AND ga.value = 'KNOWN') - AND observed.dataset IN """ + datasets + AND observed.dataset IN """ + + datasets + ) cursor.execute(query) known_genes = [x[0] for x in cursor.fetchall()] return known_genes + def count_known_genes_detected(cursor, dataset): - """ Count the number of known genes detected in the dataset (no filter - with respect to the type of transcript detected). """ + """Count the number of known genes detected in the dataset (no filter + with respect to the type of transcript detected).""" known_genes = fetch_all_known_genes_detected(cursor, dataset) return len(known_genes) + def count_novel_genes_detected(cursor, dataset): - """ Count the number of novel genes detected in the dataset (no filter - with respect to the type of transcript detected). """ + """Count the number of novel genes detected in the dataset (no filter + with respect to the type of transcript detected).""" novel_genes = fetch_all_novel_genes_detected(cursor, dataset) return len(novel_genes) + def fetch_all_novel_genes_detected(cursor, datasets): - """ Get the IDs of all novel genes found in a particular dataset (no - filter with respect to the type of transcript detected). 
""" + """Get the IDs of all novel genes found in a particular dataset (no + filter with respect to the type of transcript detected).""" datasets = format_for_IN(datasets) - query = """SELECT DISTINCT(gene_ID) FROM observed + query = ( + """SELECT DISTINCT(gene_ID) FROM observed LEFT JOIN gene_annotations AS ga ON ga.ID = observed.gene_ID WHERE (ga.attribute = 'gene_status' AND ga.value = 'NOVEL') - AND observed.dataset IN """ + datasets + AND observed.dataset IN """ + + datasets + ) cursor.execute(query) novel_genes = [x[0] for x in cursor.fetchall()] return novel_genes + def fetch_all_known_transcripts_detected(cursor, datasets): - """ Get the IDs of all transcripts annotated as known. Does not include - novel FSMs """ - + """Get the IDs of all transcripts annotated as known. Does not include + novel FSMs""" + datasets = format_for_IN(datasets) - query = """SELECT DISTINCT(transcript_ID) FROM observed + query = ( + """SELECT DISTINCT(transcript_ID) FROM observed LEFT JOIN transcript_annotations AS ta ON ta.ID = observed.transcript_ID WHERE (ta.attribute = 'transcript_status' AND ta.value = 'KNOWN') - AND observed.dataset IN """ + datasets + AND observed.dataset IN """ + + datasets + ) cursor.execute(query) known_transcripts = [x[0] for x in cursor.fetchall()] return known_transcripts + def fetch_FSM_novel_transcripts(cursor, dataset): - """ Fetch IDs of novel FSMs observed in the current dataset """ + """Fetch IDs of novel FSMs observed in the current dataset""" query = """SELECT DISTINCT(transcript_ID) FROM observed LEFT JOIN transcript_annotations AS ta ON ta.ID = observed.transcript_ID @@ -255,167 +305,213 @@ def fetch_FSM_novel_transcripts(cursor, dataset): FSM_transcripts = [x[0] for x in cursor.fetchall()] return FSM_transcripts + def fetch_novel_transcripts(cursor, datasets): - """ Fetch IDs of novel transcripts observed in the current dataset """ + """Fetch IDs of novel transcripts observed in the current dataset""" datasets = format_for_IN(datasets) - query = """SELECT DISTINCT(transcript_ID) FROM observed + query = ( + """SELECT DISTINCT(transcript_ID) FROM observed LEFT JOIN transcript_annotations AS ta ON ta.ID = observed.transcript_ID WHERE (ta.attribute = 'transcript_status' AND ta.value = 'NOVEL') - AND observed.dataset IN """ + datasets + AND observed.dataset IN """ + + datasets + ) cursor.execute(query) transcripts = [x[0] for x in cursor.fetchall()] return transcripts + def fetch_antisense_genes(cursor, datasets): - """ Fetch IDs of antisense genes observed in the dataset(s) """ + """Fetch IDs of antisense genes observed in the dataset(s)""" datasets = format_for_IN(datasets) - query = """SELECT DISTINCT(gene_ID) FROM observed + query = ( + """SELECT DISTINCT(gene_ID) FROM observed LEFT JOIN gene_annotations AS ga ON ga.ID = observed.gene_ID WHERE (ga.attribute = 'antisense_gene') - AND observed.dataset IN """ + datasets + AND observed.dataset IN """ + + datasets + ) cursor.execute(query) genes = [x[0] for x in cursor.fetchall()] return genes + def fetch_intergenic_novel_genes(cursor, datasets): - """ Fetch IDs of novel genes denoted as intergenic """ + """Fetch IDs of novel genes denoted as intergenic""" datasets = format_for_IN(datasets) - query = """SELECT DISTINCT(gene_ID) FROM observed + query = ( + """SELECT DISTINCT(gene_ID) FROM observed LEFT JOIN gene_annotations AS ga ON ga.ID = observed.gene_ID WHERE (ga.attribute = 'intergenic_novel') - AND observed.dataset IN """ + datasets + AND observed.dataset IN """ + + datasets + ) cursor.execute(query) genes = [x[0] 
for x in cursor.fetchall()] return genes + def fetch_all_ISM_transcripts(cursor, datasets): - """ Fetch IDs of all ISM transcripts """ - + """Fetch IDs of all ISM transcripts""" + datasets = format_for_IN(datasets) - query = """SELECT DISTINCT(transcript_ID) FROM observed + query = ( + """SELECT DISTINCT(transcript_ID) FROM observed LEFT JOIN transcript_annotations AS ta ON ta.ID = observed.transcript_ID WHERE (ta.attribute = 'ISM_transcript') - AND observed.dataset IN """ + datasets + AND observed.dataset IN """ + + datasets + ) cursor.execute(query) transcripts = [x[0] for x in cursor.fetchall()] return transcripts + def fetch_prefix_ISM_transcripts(cursor, datasets): - """ Fetch IDs of all ISM prefix transcripts """ + """Fetch IDs of all ISM prefix transcripts""" datasets = format_for_IN(datasets) - query = """SELECT DISTINCT(transcript_ID) FROM observed + query = ( + """SELECT DISTINCT(transcript_ID) FROM observed LEFT JOIN transcript_annotations AS ta ON ta.ID = observed.transcript_ID WHERE (ta.attribute = 'ISM-prefix_transcript') - AND observed.dataset IN """ + datasets + AND observed.dataset IN """ + + datasets + ) cursor.execute(query) transcripts = [x[0] for x in cursor.fetchall()] return transcripts + def fetch_suffix_ISM_transcripts(cursor, datasets): - """ Fetch IDs of all ISM suffix transcripts """ + """Fetch IDs of all ISM suffix transcripts""" datasets = format_for_IN(datasets) - query = """SELECT DISTINCT(transcript_ID) FROM observed + query = ( + """SELECT DISTINCT(transcript_ID) FROM observed LEFT JOIN transcript_annotations AS ta ON ta.ID = observed.transcript_ID WHERE (ta.attribute = 'ISM-suffix_transcript') - AND observed.dataset IN """ + datasets + AND observed.dataset IN """ + + datasets + ) cursor.execute(query) transcripts = [x[0] for x in cursor.fetchall()] return transcripts + def fetch_NIC_transcripts(cursor, datasets): - """ Fetch IDs of all NIC transcripts """ + """Fetch IDs of all NIC transcripts""" datasets = format_for_IN(datasets) - query = """SELECT DISTINCT(transcript_ID) FROM observed + query = ( + """SELECT DISTINCT(transcript_ID) FROM observed LEFT JOIN transcript_annotations AS ta ON ta.ID = observed.transcript_ID WHERE (ta.attribute = 'NIC_transcript') - AND observed.dataset IN """ + datasets + AND observed.dataset IN """ + + datasets + ) cursor.execute(query) transcripts = [x[0] for x in cursor.fetchall()] return transcripts + def fetch_NNC_transcripts(cursor, datasets): - """ Fetch IDs of all NNC transcripts """ + """Fetch IDs of all NNC transcripts""" datasets = format_for_IN(datasets) - query = """SELECT DISTINCT(transcript_ID) FROM observed + query = ( + """SELECT DISTINCT(transcript_ID) FROM observed LEFT JOIN transcript_annotations AS ta ON ta.ID = observed.transcript_ID WHERE (ta.attribute = 'NNC_transcript') - AND observed.dataset IN """ + datasets + AND observed.dataset IN """ + + datasets + ) cursor.execute(query) transcripts = [x[0] for x in cursor.fetchall()] return transcripts + def fetch_antisense_transcripts(cursor, datasets): - """ Fetch IDs of all antisense transcripts """ + """Fetch IDs of all antisense transcripts""" datasets = format_for_IN(datasets) - query = """SELECT DISTINCT(transcript_ID) FROM observed + query = ( + """SELECT DISTINCT(transcript_ID) FROM observed LEFT JOIN transcript_annotations AS ta ON ta.ID = observed.transcript_ID WHERE (ta.attribute = 'antisense_transcript') - AND observed.dataset IN """ + datasets + AND observed.dataset IN """ + + datasets + ) cursor.execute(query) transcripts = [x[0] for x in 
cursor.fetchall()]
     return transcripts
 
+
 def fetch_intergenic_transcripts(cursor, datasets):
-    """ Fetch IDs of all intergenic transcripts """
+    """Fetch IDs of all intergenic transcripts"""
 
     datasets = format_for_IN(datasets)
 
-    query = """SELECT DISTINCT(transcript_ID) FROM observed
+    query = (
+        """SELECT DISTINCT(transcript_ID) FROM observed
                LEFT JOIN transcript_annotations AS ta ON ta.ID = observed.transcript_ID
                WHERE (ta.attribute = 'intergenic_transcript')
-               AND observed.dataset IN """ + datasets
+               AND observed.dataset IN """
+        + datasets
+    )
     cursor.execute(query)
 
     transcripts = [x[0] for x in cursor.fetchall()]
     return transcripts
 
 
 def fetch_genomic_transcripts(cursor, datasets):
-    """ Fetch IDs of all genomic transcripts """
+    """Fetch IDs of all genomic transcripts"""
 
     datasets = format_for_IN(datasets)
 
-    query = """SELECT DISTINCT(transcript_ID) FROM observed
+    query = (
+        """SELECT DISTINCT(transcript_ID) FROM observed
                LEFT JOIN transcript_annotations AS ta ON ta.ID = observed.transcript_ID
                WHERE (ta.attribute = 'genomic_transcript')
-               AND observed.dataset IN """ + datasets
+               AND observed.dataset IN """
+        + datasets
+    )
     cursor.execute(query)
 
     transcripts = [x[0] for x in cursor.fetchall()]
     return transcripts
 
+
 def fetch_all_transcript_gene_pairs(cursor):
-    """ Return gene_ID - transcript_ID tuples from database """
+    """Return gene_ID - transcript_ID tuples from database"""
 
     query = """ SELECT gene_ID, transcript_ID FROM transcripts """
     cursor.execute(query)
-
+
     pairs = cursor.fetchall()
     return pairs
-
+
+
 def fetch_all_datasets(cursor):
-    """ Return a list of all datasets in database """
+    """Return a list of all datasets in database"""
 
     cursor.execute("SELECT dataset_name FROM dataset")
     datasets = [str(x[0]) for x in cursor.fetchall()]
     return datasets
 
+
 def parse_whitelist(whitelist_file):
-    """ From the whitelist file, obtain a list of acccepted gene and
-        transcript IDs tuples"""
+    """From the whitelist file, obtain a list of accepted gene and
+    transcript ID tuples"""
 
     whitelist = set()
-    with open(whitelist_file, 'r') as f:
+    with open(whitelist_file, "r") as f:
         for line in f:
             line = line.strip()
             fields = line.split(",")
@@ -427,13 +523,14 @@ def parse_whitelist(whitelist_file):
                 raise ValueError("Gene/Transcript IDs in whitelist must be integer TALON IDs")
     return whitelist
 
+
 def parse_datasets(dataset_file, cursor):
-    """ From the dataset file, obtain a list of acccepted dataset names"""
+    """From the dataset file, obtain a list of accepted dataset names"""
 
     # Get datasets in this database
     db_datasets = fetch_all_datasets(cursor)
 
     dataset_list = set()
-    with open(dataset_file, 'r') as f:
+    with open(dataset_file, "r") as f:
         for line in f:
             line = line.strip()
             fields = line.split()
@@ -443,13 +540,14 @@ def parse_datasets(dataset_file, cursor):
                 dataset_list.add(dataset)
     return dataset_list
 
-#-------------------------------------------------------------------------------
+
+# -------------------------------------------------------------------------------
def format_for_IN(l):
-    """ Converts input to string that can be used for IN database query """
-
+    """Converts input to string that can be used for IN database query"""
+
     if type(l) is tuple:
         l = list(l)
     if type(l) is str:
         l = [l]
 
-    return "(" + ','.join(['"' + str(x) + '"' for x in l]) + ")"
+    return "(" + ",".join(['"' + str(x) + '"' for x in l]) + ")"
diff --git a/src/talon/reformat_gtf.py b/src/talon/reformat_gtf.py
index 5e2912e..3188492 100644
--- a/src/talon/reformat_gtf.py
+++ b/src/talon/reformat_gtf.py
@@ -1,218 +1,212 @@
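+# (Editor's note, illustrative sketch only, not part of the original patch.)
+# This script synthesizes the missing gene/transcript rows for a GTF that
+# contains only exon entries. For a hypothetical input such as
+#
+#     chr1  HAVANA  exon  100  200  .  +  .  gene_id "G1"; transcript_id "T1";
+#     chr1  HAVANA  exon  400  500  .  +  .  gene_id "G1"; transcript_id "T1";
+#
+# it writes <input>_reformatted.gtf with gene and transcript rows spanning the
+# min/max exon coordinates (here 100-500), built by construct_new_entry below.
+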
import argparse + import pandas as pd + def get_args(): + desc = "Fixes a GTF with no genes" + parser = argparse.ArgumentParser(description=desc) - desc = 'Fixes a GTF with no genes' - parser = argparse.ArgumentParser(description=desc) + parser.add_argument("-gtf", "-g", dest="gtf", help="gtf to fix") + args = parser.parse_args() - parser.add_argument('-gtf', '-g', dest='gtf', - help='gtf to fix') - args = parser.parse_args() + return args - return args # check what entries are missing in the gtf def is_bad_gtf(gtffile): + missing_gene = False + missing_trans = False - missing_gene = False - missing_trans = False + # how many lines are useless lines + with open(gtffile, "r") as infile: + for i, line in enumerate(infile): + if "##" not in line: + break + skiprows = [j for j in range(0, i)] - # how many lines are useless lines - with open(gtffile, 'r') as infile: - for i, line in enumerate(infile): - if '##' not in line: - break - skiprows = [j for j in range(0, i)] + df = pd.read_csv(gtffile, sep="\t", usecols=[2], skiprows=skiprows) + categories = df.iloc[:, 0].unique() - df = pd.read_csv(gtffile, sep='\t', usecols=[2], skiprows=skiprows) - categories = df.iloc[:,0].unique() + # print(categories) - # print(categories) + # what are we missing? + if "gene" not in categories: + missing_gene = True + if "transcript" not in categories: + missing_trans = True - # what are we missing? - if 'gene' not in categories: - missing_gene = True - if 'transcript' not in categories: - missing_trans = True + return (missing_gene, missing_trans) - return (missing_gene, missing_trans) # get value associated with keyword in the 9th column of gtf def get_field_value(key, fields): if key not in fields: return None else: - return fields.split(key+' "')[1].split()[0].replace('";','') + return fields.split(key + ' "')[1].split()[0].replace('";', "") + def construct_new_entry(prev_line, coords, entry_type): + # print('Constructing new {} entry'.format(entry_type)) - # print('Constructing new {} entry'.format(entry_type)) - - # add gene or transcript type, coords, and len - prev_line[2] = entry_type - prev_line[3] = min(coords) - prev_line[4] = max(coords) - prev_line[7] = '.' - - # change the fields to reflect what type we are now - new_fields = '' - fields = prev_line[-1] - gid = get_field_value('gene_id', fields) - new_fields += 'gene_id "{}";'.format(gid) - - # if there's a gene name add it too - gname = get_field_value('gene_name', fields) - if gname: - new_fields += 'gene_name "{}";'.format(gname) - - if entry_type == 'transcript': - tid = get_field_value('transcript_id', fields) - new_fields += ' transcript_id "{}";'.format(tid) - - prev_line[-1] = new_fields - prev_line = format_to_write(prev_line) - - return prev_line + # add gene or transcript type, coords, and len + prev_line[2] = entry_type + prev_line[3] = min(coords) + prev_line[4] = max(coords) + prev_line[7] = "." 
-def make_ofile_name(matfile, prefix=None): - fname = matfile.split('.gtf')[0] - if prefix: - fname += '_' - fname += prefix - fname += '_reformatted.gtf' - return fname + # change the fields to reflect what type we are now + new_fields = "" + fields = prev_line[-1] + gid = get_field_value("gene_id", fields) + new_fields += 'gene_id "{}";'.format(gid) -def format_to_write(line): - return ''.join('\t'.join([str(i) for i in line])+'\n') + # if there's a gene name add it too + gname = get_field_value("gene_name", fields) + if gname: + new_fields += 'gene_name "{}";'.format(gname) -def main(): + if entry_type == "transcript": + tid = get_field_value("transcript_id", fields) + new_fields += ' transcript_id "{}";'.format(tid) - args = get_args() - gtffile = args.gtf + prev_line[-1] = new_fields + prev_line = format_to_write(prev_line) - (missing_gene, missing_transcript) = is_bad_gtf(gtffile) + return prev_line - print('Missing transcript : {}'.format(missing_transcript)) - # if nothing is missing, you good! - if not missing_gene and not missing_transcript: - print('GTF has both gene and transcript entries. Nothing to add.') - return +def make_ofile_name(matfile, prefix=None): + fname = matfile.split(".gtf")[0] + if prefix: + fname += "_" + fname += prefix + fname += "_reformatted.gtf" + return fname + + +def format_to_write(line): + return "".join("\t".join([str(i) for i in line]) + "\n") - # loop through this thing - infile = open(gtffile, 'r') - outfile = open(make_ofile_name(gtffile), 'w') - curr_gid = '' - curr_gid_coords = [] +def main(): + args = get_args() + gtffile = args.gtf + + (missing_gene, missing_transcript) = is_bad_gtf(gtffile) - curr_tid = '' - curr_tid_coords = [] + print("Missing transcript : {}".format(missing_transcript)) - first_transcript = True - first_exon = True + # if nothing is missing, you good! + if not missing_gene and not missing_transcript: + print("GTF has both gene and transcript entries. 
Nothing to add.") + return - gene_list = [] - transcript_list = [] + # loop through this thing + infile = open(gtffile, "r") + outfile = open(make_ofile_name(gtffile), "w") - prev_line = '' + curr_gid = "" + curr_gid_coords = [] - # relevant entries - entries = ['exon'] + curr_tid = "" + curr_tid_coords = [] - if missing_gene: - entries.append('transcript') + first_transcript = True + first_exon = True - if missing_gene or missing_transcript: + gene_list = [] + transcript_list = [] - for line in infile: + prev_line = "" - # skip the dumb header lines - if line.startswith('#'): - continue + # relevant entries + entries = ["exon"] - line = line.strip().split('\t') - fields = line[-1] + if missing_gene: + entries.append("transcript") - gid = get_field_value('gene_id', fields) - tid = get_field_value('transcript_id', fields) + if missing_gene or missing_transcript: + for line in infile: + # skip the dumb header lines + if line.startswith("#"): + continue - if line[2] in entries: + line = line.strip().split("\t") + fields = line[-1] - # set variables if first entry - if first_exon: - curr_gid = gid - curr_tid = tid + gid = get_field_value("gene_id", fields) + tid = get_field_value("transcript_id", fields) - curr_gid_coords = [int(line[3]), int(line[4])] - curr_tid_coords = [int(line[3]), int(line[4])] + if line[2] in entries: + # set variables if first entry + if first_exon: + curr_gid = gid + curr_tid = tid - first_exon = False + curr_gid_coords = [int(line[3]), int(line[4])] + curr_tid_coords = [int(line[3]), int(line[4])] - prev_line = line + first_exon = False - # found a new transcript - elif missing_transcript and tid != curr_tid: + prev_line = line - # create transcript entry and dump to current gene list - new_entry = construct_new_entry( - prev_line, curr_tid_coords, 'transcript') - transcript_list = new_entry+''.join(transcript_list) + # found a new transcript + elif missing_transcript and tid != curr_tid: + # create transcript entry and dump to current gene list + new_entry = construct_new_entry(prev_line, curr_tid_coords, "transcript") + transcript_list = new_entry + "".join(transcript_list) - gene_list += transcript_list - transcript_list = '' - curr_tid_coords = [] + gene_list += transcript_list + transcript_list = "" + curr_tid_coords = [] - if missing_gene and gid != curr_gid: - - # create gene entry and write current gene list - new_entry = construct_new_entry( - prev_line, curr_gid_coords, 'gene') - gene_list = new_entry+''.join(gene_list) + if missing_gene and gid != curr_gid: + # create gene entry and write current gene list + new_entry = construct_new_entry(prev_line, curr_gid_coords, "gene") + gene_list = new_entry + "".join(gene_list) - gene_list += ''.join(transcript_list) - transcript_list = '' - curr_tid_coords = [] + gene_list += "".join(transcript_list) + transcript_list = "" + curr_tid_coords = [] - outfile.write(gene_list) - gene_list = '' - curr_gid_coords = [] + outfile.write(gene_list) + gene_list = "" + curr_gid_coords = [] - # update loop vars - curr_gid = gid - curr_tid = tid - curr_gid_coords.append(int(line[3])) - curr_gid_coords.append(int(line[4])) - curr_tid_coords.append(int(line[3])) - curr_tid_coords.append(int(line[4])) + # update loop vars + curr_gid = gid + curr_tid = tid + curr_gid_coords.append(int(line[3])) + curr_gid_coords.append(int(line[4])) + curr_tid_coords.append(int(line[3])) + curr_tid_coords.append(int(line[4])) - prev_line = line + prev_line = line - # regardless, append to list of entries to write - transcript_list += 
format_to_write(line) + # regardless, append to list of entries to write + transcript_list += format_to_write(line) - # if we've reached the end of the file - # create transcript entry and dump to current gene list - if missing_transcript: - new_entry = construct_new_entry( - prev_line, curr_tid_coords, 'transcript') - transcript_list = new_entry+''.join(transcript_list) + # if we've reached the end of the file + # create transcript entry and dump to current gene list + if missing_transcript: + new_entry = construct_new_entry(prev_line, curr_tid_coords, "transcript") + transcript_list = new_entry + "".join(transcript_list) - gene_list += transcript_list - transcript_list = '' + gene_list += transcript_list + transcript_list = "" - # create gene entry and write current gene list - if missing_gene: - new_entry = construct_new_entry( - prev_line, curr_gid_coords, 'gene') - gene_list = new_entry+''.join(gene_list) - outfile.write(gene_list) - gene_list = '' + # create gene entry and write current gene list + if missing_gene: + new_entry = construct_new_entry(prev_line, curr_gid_coords, "gene") + gene_list = new_entry + "".join(gene_list) + outfile.write(gene_list) + gene_list = "" + infile.close() + outfile.close() - infile.close() - outfile.close() -if __name__ == '__main__': - main() +if __name__ == "__main__": + main() diff --git a/src/talon/talon.py b/src/talon/talon.py index d85dace..79daa18 100644 --- a/src/talon/talon.py +++ b/src/talon/talon.py @@ -5,39 +5,41 @@ # assigns them transcript and gene identifiers based on a GTF annotation. # Novel transcripts are assigned new identifiers. import argparse -from functools import reduce -import sqlite3 -import sys +import logging +import multiprocessing as mp import operator import os +import queue +import sqlite3 +import sys +import time +import warnings +from datetime import datetime, timedelta +from functools import reduce +from itertools import islice, repeat from pathlib import Path +from string import Template + import pandas as pd -import warnings -import logging +import pysam + +from talon.post import get_read_annotations from . import dstruct -from . import process_sams as procsams -from . import transcript_utils as tutils -from . import query_utils as qutils from . import init_refs as init_refs from . import logger as logger -from talon.post import get_read_annotations -import pysam -from string import Template -import multiprocessing as mp -import queue -from datetime import datetime, timedelta -import time -from itertools import repeat, islice - +from . import process_sams as procsams +from . import query_utils as qutils +from . 
import transcript_utils as tutils # set verbosity for pysam save = pysam.set_verbosity(0) # pysam.set_verbosity(save) + class Counter(object): def __init__(self, initval=0): - self.val = mp.Value('i', initval) + self.val = mp.Value("i", initval) self.lock = mp.Lock() def increment(self): @@ -51,8 +53,8 @@ def value(self): def get_counters(database): - """ Fetch counter values from the database and create counter objects - that will be accessible to all of the threads during the parallel run + """Fetch counter values from the database and create counter objects + that will be accessible to all of the threads during the parallel run """ with sqlite3.connect(database) as conn: @@ -62,34 +64,33 @@ def get_counters(database): # Fetch counter values cursor.execute("SELECT * FROM counters WHERE category == 'genes'") global gene_counter - gene_counter = Counter(initval=cursor.fetchone()['count']) + gene_counter = Counter(initval=cursor.fetchone()["count"]) - cursor.execute( - "SELECT * FROM counters WHERE category == 'transcripts'") + cursor.execute("SELECT * FROM counters WHERE category == 'transcripts'") global transcript_counter - transcript_counter = Counter(initval=cursor.fetchone()['count']) + transcript_counter = Counter(initval=cursor.fetchone()["count"]) cursor.execute("SELECT * FROM counters WHERE category == 'vertex'") global vertex_counter - vertex_counter = Counter(initval=cursor.fetchone()['count']) + vertex_counter = Counter(initval=cursor.fetchone()["count"]) cursor.execute("SELECT * FROM counters WHERE category == 'edge'") global edge_counter - edge_counter = Counter(initval=cursor.fetchone()['count']) + edge_counter = Counter(initval=cursor.fetchone()["count"]) cursor.execute("SELECT * FROM counters WHERE category == 'observed'") global observed_counter - observed_counter = Counter(initval=cursor.fetchone()['count']) + observed_counter = Counter(initval=cursor.fetchone()["count"]) cursor.execute("SELECT * FROM counters WHERE category == 'dataset'") global dataset_counter - dataset_counter = Counter(initval=cursor.fetchone()['count']) + dataset_counter = Counter(initval=cursor.fetchone()["count"]) return def get_args(): - """ Fetches the arguments for the program """ + """Fetches the arguments for the program""" program_desc = """TALON takes transcripts from one or more long read datasets (SAM format) and assigns them transcript and gene @@ -97,51 +98,88 @@ def get_args(): Novel events are assigned new identifiers.""" parser = argparse.ArgumentParser(description=program_desc) - parser.add_argument("--f", dest="config_file", - help="Dataset config file: dataset name, sample description, " + - "platform, sam file (comma-delimited)", type=str) - parser.add_argument("--cb", dest='use_cb_tag', action='store_true', - help="Use CB tag in input SAM file instead of including " + - "a dataset name in your config file", default=False) - parser.add_argument('--db', dest='database', metavar='FILE,', type=str, - help='TALON database. Created using talon_initialize_database') - parser.add_argument('--build', dest='build', metavar='STRING,', type=str, - help='Genome build (i.e. hg38) to use. Must be in the database.') - parser.add_argument("--threads", "-t", dest="threads", - help="Number of threads to run program with.", - type=int, default=2) - parser.add_argument("--cov", "-c", dest="min_coverage", - help="Minimum alignment coverage in order to use a SAM entry. 
Default = 0.9", - type=float, default=0.9) - parser.add_argument("--identity", "-i", dest="min_identity", - help="Minimum alignment identity in order to use a SAM entry. Default = 0.8", - type=float, default=0.8) - parser.add_argument("--nsg", "--create_novel_spliced_genes", dest='create_novel_spliced_genes', action='store_true', - help="Make novel genes with the intergenic novelty label " + - "for transcripts that don't share " + - "splice junctions with any other models", default=False) - parser.add_argument("--tmpDir", dest="tmp_dir", - help="Path to directory for tmp files. Default = `talon_tmp/`", - type=str, default="talon_tmp/") - parser.add_argument("--verbosity", "-v", type=int, default=1, - help="Verbosity of TALON output. Higher numbers = more verbose.") - parser.add_argument("--o", dest="outprefix", - help="Prefix for output files", type=str) + parser.add_argument( + "--f", + dest="config_file", + help="Dataset config file: dataset name, sample description, " + "platform, sam file (comma-delimited)", + type=str, + ) + parser.add_argument( + "--cb", + dest="use_cb_tag", + action="store_true", + help="Use CB tag in input SAM file instead of including " + "a dataset name in your config file", + default=False, + ) + parser.add_argument( + "--db", + dest="database", + metavar="FILE,", + type=str, + help="TALON database. Created using talon_initialize_database", + ) + parser.add_argument( + "--build", + dest="build", + metavar="STRING,", + type=str, + help="Genome build (i.e. hg38) to use. Must be in the database.", + ) + parser.add_argument( + "--threads", "-t", dest="threads", help="Number of threads to run program with.", type=int, default=2 + ) + parser.add_argument( + "--cov", + "-c", + dest="min_coverage", + help="Minimum alignment coverage in order to use a SAM entry. Default = 0.9", + type=float, + default=0.9, + ) + parser.add_argument( + "--identity", + "-i", + dest="min_identity", + help="Minimum alignment identity in order to use a SAM entry. Default = 0.8", + type=float, + default=0.8, + ) + parser.add_argument( + "--nsg", + "--create_novel_spliced_genes", + dest="create_novel_spliced_genes", + action="store_true", + help="Make novel genes with the intergenic novelty label " + + "for transcripts that don't share " + + "splice junctions with any other models", + default=False, + ) + parser.add_argument( + "--tmpDir", + dest="tmp_dir", + help="Path to directory for tmp files. Default = `talon_tmp/`", + type=str, + default="talon_tmp/", + ) + parser.add_argument( + "--verbosity", "-v", type=int, default=1, help="Verbosity of TALON output. Higher numbers = more verbose." + ) + parser.add_argument("--o", dest="outprefix", help="Prefix for output files", type=str) args = parser.parse_args() return args def str_wrap_double(s): - """ Adds double quotes around the input string """ + """Adds double quotes around the input string""" s = str(s) return '"' + s + '"' def search_for_vertex_at_pos(chromosome, position, location_dict): - """ Given a chromosome and a position (1-based), this function queries the - location dict to determine whether a vertex - fitting those criteria exists. Returns the row if yes, and __ if no. + """Given a chromosome and a position (1-based), this function queries the + location dict to determine whether a vertex + fitting those criteria exists. Returns the row if yes, and __ if no. 
""" try: return location_dict[chromosome][position] @@ -150,7 +188,7 @@ def search_for_vertex_at_pos(chromosome, position, location_dict): def search_for_edge(vertex_1, vertex_2, edge_type, edge_dict): - """ Search the edge dict for an edge linking vertex_1 and vertex_2""" + """Search the edge dict for an edge linking vertex_1 and vertex_2""" query_key = (vertex_1, vertex_2, edge_type) try: return edge_dict[query_key] @@ -158,12 +196,11 @@ def search_for_edge(vertex_1, vertex_2, edge_type, edge_dict): return None -def match_monoexon_vertices(chromosome, positions, strand, location_dict, - run_info): - """ Given the start and end of a single-exon transcript, this function looks - for a matching vertex for each position. Also returns a list where each - index indicates whether that vertex is novel to the data structure - (0 for known, 1 for novel) """ +def match_monoexon_vertices(chromosome, positions, strand, location_dict, run_info): + """Given the start and end of a single-exon transcript, this function looks + for a matching vertex for each position. Also returns a list where each + index indicates whether that vertex is novel to the data structure + (0 for known, 1 for novel)""" # Returned by function vertex_matches = [] @@ -183,20 +220,19 @@ def match_monoexon_vertices(chromosome, positions, strand, location_dict, if curr_index == start: sj_pos = positions[curr_index + 1] pos_type = "start" - vertex_match, diff_5p = permissive_vertex_search(chromosome, position, - strand, sj_pos, pos_type, - location_dict, run_info) + vertex_match, diff_5p = permissive_vertex_search( + chromosome, position, strand, sj_pos, pos_type, location_dict, run_info + ) elif curr_index == end: sj_pos = positions[curr_index - 1] pos_type = "end" - vertex_match, diff_3p = permissive_vertex_search(chromosome, position, - strand, sj_pos, pos_type, - location_dict, run_info) + vertex_match, diff_3p = permissive_vertex_search( + chromosome, position, strand, sj_pos, pos_type, location_dict, run_info + ) if vertex_match == None: # If no vertex matches the position, one is created. - vertex_match = create_vertex(chromosome, position, location_dict, run_info)[ - "location_ID"] + vertex_match = create_vertex(chromosome, position, location_dict, run_info)["location_ID"] novelty.append(1) else: novelty.append(0) @@ -208,42 +244,39 @@ def match_monoexon_vertices(chromosome, positions, strand, location_dict, def match_splice_vertices(chromosome, positions, strand, location_dict, run_info): - """ Given a chromosome and a list of positions from the transcript in 5' to - 3' end order, this function looks for a matching vertex for each splice - junction position (so it ignores the ends). Also returns a list where - each index indicates whether that vertex is novel to the data structure - (0 for known, 1 for novel) """ + """Given a chromosome and a list of positions from the transcript in 5' to + 3' end order, this function looks for a matching vertex for each splice + junction position (so it ignores the ends). 
Also returns a list where + each index indicates whether that vertex is novel to the data structure + (0 for known, 1 for novel)""" # Returned by function vertex_matches = [] novelty = [] # Iterate over positions - for curr_index in range(1, len(positions)-1): + for curr_index in range(1, len(positions) - 1): position = positions[curr_index] - vertex_match = search_for_vertex_at_pos( - chromosome, position, location_dict) + vertex_match = search_for_vertex_at_pos(chromosome, position, location_dict) if vertex_match == None: # If no vertex matches the position, one is created. - vertex_match = create_vertex( - chromosome, position, location_dict, run_info) + vertex_match = create_vertex(chromosome, position, location_dict, run_info) novelty.append(1) else: novelty.append(0) # Add to running list of matches - vertex_matches.append(vertex_match['location_ID']) + vertex_matches.append(vertex_match["location_ID"]) return vertex_matches, novelty -def match_all_transcript_vertices(chromosome, positions, strand, location_dict, - run_info): - """ Given a chromosome and a list of positions from the transcript in 5' to - 3' end order, this function looks for a matching vertex for each - position. Also returns a list where each index indicates whether that - vertex is novel to the data structure (0 for known, 1 for novel) """ +def match_all_transcript_vertices(chromosome, positions, strand, location_dict, run_info): + """Given a chromosome and a list of positions from the transcript in 5' to + 3' end order, this function looks for a matching vertex for each + position. Also returns a list where each index indicates whether that + vertex is novel to the data structure (0 for known, 1 for novel)""" # Returned by function vertex_matches = [] @@ -263,39 +296,38 @@ def match_all_transcript_vertices(chromosome, positions, strand, location_dict, if curr_index == start: sj_pos = positions[curr_index + 1] pos_type = "start" - vertex_match, diff_5p = permissive_vertex_search(chromosome, position, - strand, sj_pos, pos_type, - location_dict, run_info) + vertex_match, diff_5p = permissive_vertex_search( + chromosome, position, strand, sj_pos, pos_type, location_dict, run_info + ) elif curr_index == end: sj_pos = positions[curr_index - 1] pos_type = "end" - vertex_match, diff_3p = permissive_vertex_search(chromosome, position, - strand, sj_pos, pos_type, - location_dict, run_info) + vertex_match, diff_3p = permissive_vertex_search( + chromosome, position, strand, sj_pos, pos_type, location_dict, run_info + ) # Remaining mid-transcript positions go through strict matching process else: - vertex_match = search_for_vertex_at_pos( - chromosome, position, location_dict) + vertex_match = search_for_vertex_at_pos(chromosome, position, location_dict) if vertex_match == None: # If no vertex matches the position, one is created. - vertex_match = create_vertex( - chromosome, position, location_dict, run_info) + vertex_match = create_vertex(chromosome, position, location_dict, run_info) novelty.append(1) else: novelty.append(0) # Add to running list of matches - vertex_matches.append(vertex_match['location_ID']) + vertex_matches.append(vertex_match["location_ID"]) return tuple(vertex_matches), tuple(novelty), diff_5p, diff_3p -def permissive_match_with_gene_priority(chromosome, position, strand, sj_pos, - pos_type, gene_ID, gene_locs, locations, run_info): - """ Tries to match a position to a known start/end vertex from the same - gene. If none is found, the normal permissive match procedure is - invoked. 
+def permissive_match_with_gene_priority( + chromosome, position, strand, sj_pos, pos_type, gene_ID, gene_locs, locations, run_info +): + """Tries to match a position to a known start/end vertex from the same + gene. If none is found, the normal permissive match procedure is + invoked. """ # Check inputs if pos_type != "start" and pos_type != "end": @@ -303,7 +335,7 @@ def permissive_match_with_gene_priority(chromosome, position, strand, sj_pos, logging.error(msg) raise ValueError(msg) if strand != "+" and strand != "-": - msg = f'Invalid strand specified: {strand}' + msg = f"Invalid strand specified: {strand}" logging.error(msg) raise ValueError(msg) @@ -313,23 +345,21 @@ def permissive_match_with_gene_priority(chromosome, position, strand, sj_pos, dist = 0 if gene_ID in gene_locs: if position in gene_locs[gene_ID]: - return match['location_ID'], dist, 1 + return match["location_ID"], dist, 1 else: - return match['location_ID'], dist, 0 + return match["location_ID"], dist, 0 else: - return match['location_ID'], dist, 0 + return match["location_ID"], dist, 0 # This approach only works when there are known starts/ends for this gene if gene_ID in gene_locs: - # Get cutoff distance if pos_type == "start": max_dist = run_info.cutoff_5p else: max_dist = run_info.cutoff_3p - if (strand == "+" and pos_type == "start") or \ - (strand == "-" and pos_type == "end"): + if (strand == "+" and pos_type == "start") or (strand == "-" and pos_type == "end"): search_window_start = position - max_dist search_window_end = sj_pos else: @@ -355,23 +385,20 @@ def permissive_match_with_gene_priority(chromosome, position, strand, sj_pos, return closest_vertex, best_dist, 1 # Otherwise, revert to permissive match approach. - match, dist = permissive_vertex_search(chromosome, position, strand, - sj_pos, pos_type, - locations, run_info) + match, dist = permissive_vertex_search(chromosome, position, strand, sj_pos, pos_type, locations, run_info) return match, dist, 0 -def permissive_vertex_search(chromosome, position, strand, sj_pos, pos_type, - locations, run_info): - """ Given a position, this function tries to find a vertex match within the - cutoff distance that also comes before the splice junction begins. - If no vertex is found, the function returns None. """ +def permissive_vertex_search(chromosome, position, strand, sj_pos, pos_type, locations, run_info): + """Given a position, this function tries to find a vertex match within the + cutoff distance that also comes before the splice junction begins. + If no vertex is found, the function returns None.""" # Try a strict match first if chromosome in locations and position in locations[chromosome]: match = locations[chromosome][position] dist = 0 - return match['location_ID'], dist + return match["location_ID"], dist if pos_type != "start" and pos_type != "end": msg = "Please set pos_type to either 'start' or 'end'." @@ -396,8 +423,7 @@ def permissive_vertex_search(chromosome, position, strand, sj_pos, pos_type, # position first (since degradtion is more biologically likely). 
# For the + strand, this would be a negative delta, and for the - strand, # it would be a positive delta - if (strand == "+" and pos_type == "start") or \ - (strand == "-" and pos_type == "end"): + if (strand == "+" and pos_type == "start") or (strand == "-" and pos_type == "end"): direction_priority = -1 search_window_start = position - max_dist search_window_end = sj_pos @@ -407,30 +433,27 @@ def permissive_vertex_search(chromosome, position, strand, sj_pos, pos_type, search_window_end = position + max_dist for dist in range(1, max_dist): - curr_pos = position + dist*direction_priority + curr_pos = position + dist * direction_priority if curr_pos > search_window_start and curr_pos < search_window_end: match = search_for_vertex_at_pos(chromosome, curr_pos, locations) if match != None: dist = compute_delta(curr_pos, position, strand) - return match['location_ID'], dist + return match["location_ID"], dist - curr_pos = position - dist*direction_priority + curr_pos = position - dist * direction_priority if curr_pos > search_window_start and curr_pos < search_window_end: match = search_for_vertex_at_pos(chromosome, curr_pos, locations) if match != None: dist = compute_delta(curr_pos, position, strand) - return match['location_ID'], dist + return match["location_ID"], dist return None, None def create_vertex(chromosome, position, location_dict, run_info): - """ Creates a novel vertex and adds it to the location data structure. """ + """Creates a novel vertex and adds it to the location data structure.""" new_ID = vertex_counter.increment() - new_vertex = {'location_ID': new_ID, - 'genome_build': run_info.build, - 'chromosome': chromosome, - 'position': position} + new_vertex = {"location_ID": new_ID, "genome_build": run_info.build, "chromosome": chromosome, "position": position} try: location_dict[chromosome][position] = new_vertex @@ -441,38 +464,34 @@ def create_vertex(chromosome, position, location_dict, run_info): def create_edge(vertex_1, vertex_2, edge_type, strand, edge_dict): - """ Creates a novel edge and adds it to the edge data structure. """ + """Creates a novel edge and adds it to the edge data structure.""" new_ID = edge_counter.increment() - new_edge = {'edge_ID': new_ID, - 'v1': vertex_1, - 'v2': vertex_2, - 'edge_type': edge_type, - 'strand': strand} + new_edge = {"edge_ID": new_ID, "v1": vertex_1, "v2": vertex_2, "edge_type": edge_type, "strand": strand} edge_dict[(vertex_1, vertex_2, edge_type)] = new_edge return new_edge def create_gene(chromosome, start, end, strand, memory_cursor, tmp_gene): - """ Create a novel gene and add it to the temporary table. 
- """ + """Create a novel gene and add it to the temporary table.""" new_ID = gene_counter.increment() new_gene = (new_ID, chromosome, min(start, end), max(start, end), strand) cols = ' ("gene_ID", "chromosome", "start", "end", "strand")' - command = 'INSERT INTO ' + tmp_gene + cols + ' VALUES ' + '(?,?,?,?,?)' + command = "INSERT INTO " + tmp_gene + cols + " VALUES " + "(?,?,?,?,?)" memory_cursor.execute(command, new_gene) return new_ID -def create_transcript(strand, chromosome, start_pos, end_pos, gene_ID, edge_IDs, vertex_IDs, - transcript_dict, tmp_t, memory_cursor): +def create_transcript( + strand, chromosome, start_pos, end_pos, gene_ID, edge_IDs, vertex_IDs, transcript_dict, tmp_t, memory_cursor +): """Creates a novel transcript, add it to the transcript data structure, - and add to tmp_t + and add to tmp_t """ - print('creating new transcript') + print("creating new transcript") new_ID = transcript_counter.increment() - print(f'new tid:{new_ID}') + print(f"new tid:{new_ID}") # updating the dict if len(edge_IDs) > 1: @@ -480,17 +499,19 @@ def create_transcript(strand, chromosome, start_pos, end_pos, gene_ID, edge_IDs, else: jn_path = None - new_transcript = {'transcript_ID': new_ID, - 'gene_ID': gene_ID, - 'jn_path': jn_path, - 'start_exon': edge_IDs[0], - 'end_exon': edge_IDs[-1], - 'start_vertex': vertex_IDs[0], - 'end_vertex': vertex_IDs[-1], - 'n_exons': int((len(edge_IDs) + 1)/2), - 'chromosome': chromosome, - 'start_pos': start_pos, - 'end_pos': end_pos} + new_transcript = { + "transcript_ID": new_ID, + "gene_ID": gene_ID, + "jn_path": jn_path, + "start_exon": edge_IDs[0], + "end_exon": edge_IDs[-1], + "start_vertex": vertex_IDs[0], + "end_vertex": vertex_IDs[-1], + "n_exons": int((len(edge_IDs) + 1) / 2), + "chromosome": chromosome, + "start_pos": start_pos, + "end_pos": end_pos, + } path_key = frozenset(edge_IDs) transcript_dict[path_key] = new_transcript @@ -498,17 +519,17 @@ def create_transcript(strand, chromosome, start_pos, end_pos, gene_ID, edge_IDs, # updating tmp_t new_t = (gene_ID, new_ID, chromosome, strand, min(start_pos, end_pos), max(start_pos, end_pos)) cols = ' ("gene_ID", "transcript_ID", "chromosome", "strand", "min_pos", "max_pos")' - command = 'INSERT INTO ' + tmp_t + cols + ' VALUES ' + '(?,?,?,?,?,?)' + command = "INSERT INTO " + tmp_t + cols + " VALUES " + "(?,?,?,?,?,?)" memory_cursor.execute(command, new_t) return new_transcript def check_all_exons_known(novelty): - """ Given a list in which each element represents the novelty (1) or - known-ness of a transcript edge (0), determine whether all of the - exons are known or not. Return True if all are known, and False - otherwise. Input should not include first or last exon. """ + """Given a list in which each element represents the novelty (1) or + known-ness of a transcript edge (0), determine whether all of the + exons are known or not. Return True if all are known, and False + otherwise. Input should not include first or last exon.""" if len(novelty) == 1: return novelty[0] == 0 @@ -522,11 +543,11 @@ def check_all_exons_known(novelty): def check_all_SJs_known(novelty): - """ Given a list in which each element represents the novelty (1) or - known-ness of a transcript edge (0), determine whether all of the - introns are known or not. Return True if all are known, and False - otherwise. Input should not include first or last exon. 
If there is - only one entry, then that means there is one splice junction (two exons)""" + """Given a list in which each element represents the novelty (1) or + known-ness of a transcript edge (0), determine whether all of the + introns are known or not. Return True if all are known, and False + otherwise. Input should not include first or last exon. If there is + only one entry, then that means there is one splice junction (two exons)""" if len(novelty) == 1: return novelty[0] == 0 @@ -540,9 +561,9 @@ def check_all_SJs_known(novelty): def match_all_splice_edges(vertices, strand, edge_dict, run_info): - """ Given a list of splice junction-only vertex IDs from the transcript in 5' to - 3' end order, this function looks for a matching edge ID for each - position. If none exists, it creates one. """ + """Given a list of splice junction-only vertex IDs from the transcript in 5' to + 3' end order, this function looks for a matching edge ID for each + position. If none exists, it creates one.""" edge_matches = [] novelty = [] @@ -559,9 +580,7 @@ def match_all_splice_edges(vertices, strand, edge_dict, run_info): vertex_1 = vertices[index_1] vertex_2 = vertices[index_2] - edge_match, curr_novelty = match_or_create_edge(vertex_1, vertex_2, - edge_type, strand, - edge_dict) + edge_match, curr_novelty = match_or_create_edge(vertex_1, vertex_2, edge_type, strand, edge_dict) edge_matches.append(edge_match) novelty.append(curr_novelty) @@ -569,23 +588,22 @@ def match_all_splice_edges(vertices, strand, edge_dict, run_info): def match_or_create_edge(vertex_1, vertex_2, edge_type, strand, edge_dict): - """ Searches for edge match to provided set of vertices. If none found, - creates a new edge. """ + """Searches for edge match to provided set of vertices. If none found, + creates a new edge.""" novelty = 0 edge_match = search_for_edge(vertex_1, vertex_2, edge_type, edge_dict) if edge_match == None: # If no edge matches the position, one is created. - edge_match = create_edge(vertex_1, vertex_2, edge_type, strand, - edge_dict) + edge_match = create_edge(vertex_1, vertex_2, edge_type, strand, edge_dict) novelty = 1 return edge_match["edge_ID"], novelty def match_all_transcript_edges(vertices, strand, edge_dict, run_info): - """ Given a list of vertex IDs from the transcript in 5' to - 3' end order, this function looks for a matching edge ID for each - position. If none exists, it creates one. Only used for monoexon case""" + """Given a list of vertex IDs from the transcript in 5' to + 3' end order, this function looks for a matching edge ID for each + position. If none exists, it creates one. Only used for monoexon case""" edge_matches = [] novelty = [] @@ -602,9 +620,7 @@ def match_all_transcript_edges(vertices, strand, edge_dict, run_info): vertex_1 = vertices[index_1] vertex_2 = vertices[index_2] - edge_match, curr_novelty = match_or_create_edge(vertex_1, vertex_2, - edge_type, strand, - edge_dict) + edge_match, curr_novelty = match_or_create_edge(vertex_1, vertex_2, edge_type, strand, edge_dict) edge_matches.append(edge_match) novelty.append(curr_novelty) @@ -612,14 +628,13 @@ def match_all_transcript_edges(vertices, strand, edge_dict, run_info): def search_for_ISM(edge_IDs, transcript_dict): - """ Given a list of edges in a query transcript, determine whether it is an - incomplete splice match (ISM) of any transcript in the dict. 
Will also
-        return FSM matches if they're there"""
+    """Given a list of edges in a query transcript, determine whether it is an
+    incomplete splice match (ISM) of any transcript in the dict. Will also
+    return FSM matches if they're there"""
 
     edges = frozenset(edge_IDs)
 
-    ISM_matches = [transcript_dict[x]
-                   for x in transcript_dict if edges.issubset(x)]
+    ISM_matches = [transcript_dict[x] for x in transcript_dict if edges.issubset(x)]
 
     if len(ISM_matches) > 0:
         return ISM_matches
@@ -627,36 +642,36 @@ def search_for_ISM(edge_IDs, transcript_dict):
         return None
 
 
-def search_for_overlap_with_gene(chromosome, start, end, strand,
-                                 cursor, run_info, tmp_gene, tmp_t,
-                                 gene_IDs=None):
-    """ Given a start and an end value for an interval, query the database to
-        determine whether the interval overlaps with any genes. If it there is
-        more than one match, prioritize same-strand first and foremost.
-        If there is more than one same-strand option, prioritize distance from 3' / 5'.
-        Antisense matches may be returned if there is no same strand
-        match.
+def search_for_overlap_with_gene(chromosome, start, end, strand, cursor, run_info, tmp_gene, tmp_t, gene_IDs=None):
+    """Given a start and an end value for an interval, query the database to
+    determine whether the interval overlaps with any genes. If there is
+    more than one match, prioritize same-strand first and foremost.
+    If there is more than one same-strand option, prioritize distance from 3' / 5'.
+    Antisense matches may be returned if there is no same strand
+    match.
 
-        Parameters:
-            gene_ID (list of str or None): Restrict results to genes in this list
+    Parameters:
+        gene_ID (list of str or None): Restrict results to genes in this list
     """
-    logging.debug('Tiebreaking for gene assignment')
+    logging.debug("Tiebreaking for gene assignment")
     min_start = min(start, end)
     max_end = max(start, end)
     query_interval = [min_start, max_end]
 
     if isinstance(gene_IDs, list):
-        query = Template("""SELECT gene_ID,
+        query = Template(
+            """SELECT gene_ID,
                transcript_ID,
                chromosome,
                min_pos,
                max_pos,
                strand
            FROM $tmp_t
-           WHERE gene_ID IN $gene_ids""").substitute({'tmp_t': tmp_t, \
-                                                      'gene_ids': qutils.format_for_IN(gene_IDs)})
+           WHERE gene_ID IN $gene_ids"""
+        ).substitute({"tmp_t": tmp_t, "gene_ids": qutils.format_for_IN(gene_IDs)})
     elif not gene_IDs:
-        query = Template("""SELECT gene_ID,
+        query = Template(
+            """SELECT gene_ID,
                transcript_ID,
                chromosome,
                min_pos,
                max_pos,
                strand
            FROM $tmp_t
            WHERE (chromosome = '$chrom') AND
                  ((min_pos <= $min_start AND max_pos >= $max_end) OR
                  (min_pos >= $min_start AND max_pos <= $max_end) OR
                  (min_pos >= $min_start AND min_pos <= $max_end) OR
                  (max_pos >= $min_start AND max_pos <= $max_end))
-           GROUP BY gene_ID;""").substitute({'tmp_t': tmp_t, 'chrom': chromosome,
-                                             'min_start': min_start, 'max_end': max_end})
+           GROUP BY gene_ID;"""
+        ).substitute({"tmp_t": tmp_t, "chrom": chromosome, "min_start": min_start, "max_end": max_end})
 
     cursor.execute(query)
     matches = cursor.fetchall()
 
     # restrict to just the genes we care about
     if gene_IDs:
         # print(f'restricting just to {gene_IDs}')
-        logging.debug(f'Restricing gene tiebreak to {gene_IDs}')
-        matches = [match for match in matches if match['gene_ID'] in gene_IDs]
-
+        logging.debug(f"Restricting gene tiebreak to {gene_IDs}")
+        matches = [match for match in matches if match["gene_ID"] in gene_IDs]
 
     if len(matches) == 0:
         # print('herere here')
-        logging.debug(f'Unable to tiebreak')
+        logging.debug("Unable to tiebreak")
         return None, None
 
     # Among multiple matches, preferentially return the same-strand gene with
     # the greatest amount of overlap
     same_strand_matches = len([x for x in matches if x["strand"] == strand])
 
-    if strand == "+" and same_strand_matches > 0 or \
-       strand == "-" and same_strand_matches == 0:
-
+    if strand == "+" and same_strand_matches > 0 or strand == "-" and same_strand_matches == 0:
         matches = [x for x in matches if x["strand"] == "+"]
         best_match = get_best_match(matches, min_start, max_end)
 
@@ -699,7 +711,8 @@
     else:
         matches = [x for x in matches if x["strand"] == "-"]
         best_match = get_best_match(matches, min_start, max_end)
 
-    return best_match['gene_ID'], best_match['strand']
+    return best_match["gene_ID"], best_match["strand"]
+
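+# (Editor's note, illustrative sketch only, not part of the original patch.)
+# The tiebreak below keeps the candidate whose annotated span lies closest to
+# the read, scored as |min_pos - read_start| + |max_pos - read_end|; a
+# doctest-style sketch with made-up matches:
+#
+#   >>> matches = [{"gene_ID": 1, "min_pos": 100, "max_pos": 900},
+#   ...            {"gene_ID": 2, "min_pos": 140, "max_pos": 1050}]
+#   >>> read_start, read_end = 150, 1000
+#   >>> min(matches, key=lambda m: abs(m["min_pos"] - read_start)
+#   ...                            + abs(m["max_pos"] - read_end))["gene_ID"]
+#   2
+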
 def get_best_match(matches, min_end, max_end):
     """
@@ -712,16 +725,16 @@ def get_best_match(matches, min_end, max_end):
     # print(f'read min: {min_end}')
     # print(f'read end: {max_end}')
 
-    logging.debug(f'Read start / end: ({min_end}, {min_end})')
+    logging.debug(f"Read start / end: ({min_end}, {max_end})")
 
     for match in matches:
         logging.debug(f"Matching with transcripts from gene {match['gene_ID']}, transcript {match['transcript_ID']}")
-        end_dist = abs(match['max_pos']-max_end)
-        start_dist = abs(match['min_pos']-min_end)
+        end_dist = abs(match["max_pos"] - max_end)
+        start_dist = abs(match["min_pos"] - min_end)
         logging.debug(f"Transcript start / end: ({match['min_pos']}, {match['max_pos']})")
-        dist = end_dist+start_dist
-        logging.debug(f'Distance between read and transcript ends: {dist}')
+        dist = end_dist + start_dist
+        logging.debug(f"Distance between read and transcript ends: {dist}")
         if dist < min_dist:
             min_dist = dist
             best_match = match
@@ -731,7 +744,6 @@ def get_best_match(matches, min_end, max_end):
     return best_match
 
-
 # def get_best_match(matches, query_interval):
 #     """ Given a set of gene matches and a query interval, return the match
 #            that has the greatest amount of overlap with the query."""
@@ -757,26 +769,26 @@
 def get_overlap(a, b):
-    """ Computes the amount of overlap between two intervals.
-        Returns 0 if there is no overlap. The function treats the start and
-        ends of each interval as inclusive, meaning that if a = b = [10, 20],
-        the overlap reported would be 11, not 10.
-        Args:
-            a: First interval, formattted as a list (query)
-            b: Second interval, formatted as a list (reference)
-            perc_overlap: Percent overlap from the reference interval that the
-                query interval consumed
+    """Computes the amount of overlap between two intervals.
+    Returns 0 if there is no overlap. The function treats the start and
+    ends of each interval as inclusive, meaning that if a = b = [10, 20],
+    the overlap reported would be 11, not 10.
+    Args:
+        a: First interval, formatted as a list (query)
+        b: Second interval, formatted as a list (reference)
+    Returns:
+        overlap: Amount of overlap between a and b
+        perc_overlap: Percent overlap from the reference interval that the
+            query interval consumed
     """
     overlap = max(0, min(a[1], b[1]) - max(a[0], b[0]) + 1)
-    ref_len = abs(b[1]-b[0])
-    perc_overlap = (overlap/ref_len)*100
+    ref_len = abs(b[1] - b[0])
+    perc_overlap = (overlap / ref_len) * 100
     return overlap, perc_overlap
 
 
 def search_for_transcript(edge_IDs, transcript_dict):
-    """ Given the edge IDs (in set data structure) that make up a query
-        transcript, look for a match in the transcript dict.
-        Return gene ID and transcript ID if found, and None if not. """
+    """Given the edge IDs (in set data structure) that make up a query
+    transcript, look for a match in the transcript dict.
+ Return gene ID and transcript ID if found, and None if not.""" try: transcript = transcript_dict[edge_IDs] @@ -787,9 +799,10 @@ def search_for_transcript(edge_IDs, transcript_dict): return None, None -def process_FSM(chrom, positions, strand, edge_IDs, vertex_IDs, all_matches, gene_starts, gene_ends, - edge_dict, locations, run_info): - """ Given a transcript, try to find an FSM match for it """ +def process_FSM( + chrom, positions, strand, edge_IDs, vertex_IDs, all_matches, gene_starts, gene_ends, edge_dict, locations, run_info +): + """Given a transcript, try to find an FSM match for it""" gene_ID = None transcript_ID = None novelty = [] @@ -803,150 +816,146 @@ def process_FSM(chrom, positions, strand, edge_IDs, vertex_IDs, all_matches, gen # Check if any of the matches have the same number of exons as the query. # Such a match should be prioritized because it's an FSM - n_exons = int(len(positions)/2) - FSM_matches = [x for x in all_matches if x['n_exons'] == n_exons] + n_exons = int(len(positions) / 2) + FSM_matches = [x for x in all_matches if x["n_exons"] == n_exons] if len(FSM_matches) == 0: return None, None, [], None else: transcript_match = FSM_matches[0] - gene_ID = transcript_match['gene_ID'] - transcript_ID = transcript_match['transcript_ID'] + gene_ID = transcript_match["gene_ID"] + transcript_ID = transcript_match["transcript_ID"] # Check whether the query's 5' and 3' ends are within range of those of # the match. If not, perform a permissive match assignment - curr_5p_diff = compute_delta(transcript_match['start_pos'], positions[0], - strand) - curr_3p_diff = compute_delta(transcript_match['end_pos'], positions[-1], - strand) + curr_5p_diff = compute_delta(transcript_match["start_pos"], positions[0], strand) + curr_3p_diff = compute_delta(transcript_match["end_pos"], positions[-1], strand) # ---------------- 5' end --------------------------------- if abs(curr_5p_diff) <= run_info.cutoff_5p: - start_vertex = transcript_match['start_vertex'] - start_exon = transcript_match['start_exon'] + start_vertex = transcript_match["start_vertex"] + start_exon = transcript_match["start_exon"] diff_5p = curr_5p_diff start_novelty = 0 else: # First get a permissively matched start vertex - start_vertex, start_exon, start_novelty, known_start, diff_5p = process_5p(chrom, - positions, strand, - vertex_IDs, - gene_ID, gene_starts, - edge_dict, - locations, run_info) + start_vertex, start_exon, start_novelty, known_start, diff_5p = process_5p( + chrom, positions, strand, vertex_IDs, gene_ID, gene_starts, edge_dict, locations, run_info + ) # ---------------- 3' end --------------------------------- if abs(curr_3p_diff) <= run_info.cutoff_3p: - end_vertex = transcript_match['end_vertex'] - end_exon = transcript_match['end_exon'] + end_vertex = transcript_match["end_vertex"] + end_exon = transcript_match["end_exon"] diff_3p = curr_3p_diff end_novelty = 0 else: # First get a permissively matched end vertex - end_vertex, end_exon, end_novelty, known_end, diff_3p = process_3p(chrom, - positions, strand, - vertex_IDs, - gene_ID, gene_ends, - edge_dict, - locations, run_info) + end_vertex, end_exon, end_novelty, known_end, diff_3p = process_3p( + chrom, positions, strand, vertex_IDs, gene_ID, gene_ends, edge_dict, locations, run_info + ) edge_IDs = [start_exon] + edge_IDs + [end_exon] vertex_IDs = [start_vertex] + vertex_IDs + [end_vertex] # Package information for output - start_end_info = {"start_vertex": start_vertex, - "end_vertex": end_vertex, - "start_exon": start_exon, - "end_exon": 
end_exon, - "diff_5p": diff_5p, - "diff_3p": diff_3p, - "start_novelty": start_novelty, - "end_novelty": end_novelty, - "vertex_IDs": vertex_IDs, - "edge_IDs": edge_IDs} + start_end_info = { + "start_vertex": start_vertex, + "end_vertex": end_vertex, + "start_exon": start_exon, + "end_exon": end_exon, + "diff_5p": diff_5p, + "diff_3p": diff_3p, + "start_novelty": start_novelty, + "end_novelty": end_novelty, + "vertex_IDs": vertex_IDs, + "edge_IDs": edge_IDs, + } return gene_ID, transcript_ID, novelty, start_end_info -def process_5p(chrom, positions, strand, vertex_IDs, gene_ID, gene_starts, edge_dict, - locations, run_info): - """ Conduct permissive match for 5' end and return assigned vertex, - edge, and distance """ +def process_5p(chrom, positions, strand, vertex_IDs, gene_ID, gene_starts, edge_dict, locations, run_info): + """Conduct permissive match for 5' end and return assigned vertex, + edge, and distance""" # First get a permissively matched start vertex - start_vertex, diff_5p, known_start = permissive_match_with_gene_priority(chrom, - positions[0], strand, positions[1], - "start", gene_ID, gene_starts, - locations, run_info) + start_vertex, diff_5p, known_start = permissive_match_with_gene_priority( + chrom, positions[0], strand, positions[1], "start", gene_ID, gene_starts, locations, run_info + ) if start_vertex == None: - start_vertex = create_vertex(chrom, positions[0], locations, run_info)[ - 'location_ID'] + start_vertex = create_vertex(chrom, positions[0], locations, run_info)["location_ID"] # Then get the start exon - start_exon, start_novelty = match_or_create_edge(start_vertex, - vertex_IDs[0], - "exon", strand, - edge_dict) + start_exon, start_novelty = match_or_create_edge(start_vertex, vertex_IDs[0], "exon", strand, edge_dict) # If known_start == 1, the start vertex is a known startpoint of this gene. # start novelty refers to the novelty of the first exon (1 if yes, 0 if no) return start_vertex, start_exon, start_novelty, known_start, diff_5p -def process_3p(chrom, positions, strand, vertex_IDs, gene_ID, gene_ends, edge_dict, - locations, run_info): - """ Conduct permissive match for 3' end and return assigned vertex, - edge, and distance """ +def process_3p(chrom, positions, strand, vertex_IDs, gene_ID, gene_ends, edge_dict, locations, run_info): + """Conduct permissive match for 3' end and return assigned vertex, + edge, and distance""" # First get a permissively matched end vertex - end_vertex, diff_3p, known_end = permissive_match_with_gene_priority(chrom, - positions[-1], strand, positions[-2], - "end", gene_ID, gene_ends, - locations, run_info) + end_vertex, diff_3p, known_end = permissive_match_with_gene_priority( + chrom, positions[-1], strand, positions[-2], "end", gene_ID, gene_ends, locations, run_info + ) if end_vertex == None: - end_vertex = create_vertex( - chrom, positions[-1], locations, run_info)['location_ID'] + end_vertex = create_vertex(chrom, positions[-1], locations, run_info)["location_ID"] # Then get the end exon - end_exon, end_novelty = match_or_create_edge(vertex_IDs[-1], - end_vertex, - "exon", strand, - edge_dict) + end_exon, end_novelty = match_or_create_edge(vertex_IDs[-1], end_vertex, "exon", strand, edge_dict) # If known_end == 1, the end vertex is a known endpoint of this gene. 
# end novelty refers to the novelty of the final exon (1 if yes, 0 if no) return end_vertex, end_exon, end_novelty, known_end, diff_3p -def process_ISM(chrom, positions, strand, edge_IDs, vertex_IDs, all_matches, transcript_dict, - gene_starts, gene_ends, edge_dict, locations, run_info, - cursor, tmp_gene, tmp_t): - """ Given a transcript, try to find an ISM match for it. If the - best match is an ISM with known ends, that will be promoted to NIC. """ +def process_ISM( + chrom, + positions, + strand, + edge_IDs, + vertex_IDs, + all_matches, + transcript_dict, + gene_starts, + gene_ends, + edge_dict, + locations, + run_info, + cursor, + tmp_gene, + tmp_t, +): + """Given a transcript, try to find an ISM match for it. If the + best match is an ISM with known ends, that will be promoted to NIC.""" gene_ID = None transcript_ID = None novelty = [] start_end_info = {} - n_exons = int(len(positions)/2) + n_exons = int(len(positions) / 2) ISM = [] suffix = [] prefix = [] # choose gene to assign it to - gene_matches = list(set([match['gene_ID'] for match in all_matches])) + gene_matches = list(set([match["gene_ID"] for match in all_matches])) print(gene_matches) # tie break based on distance to 5' / 3' ends if len(gene_matches) > 1: - gene_ID, _ = search_for_overlap_with_gene(chrom, positions[0], - positions[-1], strand, cursor, run_info, tmp_gene, - tmp_t, gene_IDs=gene_matches) - all_matches = [m for m in all_matches if m['gene_ID'] == gene_ID] + gene_ID, _ = search_for_overlap_with_gene( + chrom, positions[0], positions[-1], strand, cursor, run_info, tmp_gene, tmp_t, gene_IDs=gene_matches + ) + all_matches = [m for m in all_matches if m["gene_ID"] == gene_ID] else: - gene_ID = all_matches[0]['gene_ID'] + gene_ID = all_matches[0]["gene_ID"] # if we didn't assign a gene ID if gene_ID == None: - return None, None, [], None + return None, None, [], None # print('edge IDs') # print(edge_IDs) @@ -956,18 +965,12 @@ def process_ISM(chrom, positions, strand, edge_IDs, vertex_IDs, all_matches, tra # Get matches for the ends if n_exons > 1: - start_vertex, start_exon, start_novelty, known_start, diff_5p = process_5p(chrom, - positions, strand, - vertex_IDs, - gene_ID, gene_starts, - edge_dict, - locations, run_info) - end_vertex, end_exon, end_novelty, known_end, diff_3p = process_3p(chrom, - positions, strand, - vertex_IDs, - gene_ID, gene_ends, - edge_dict, - locations, run_info) + start_vertex, start_exon, start_novelty, known_start, diff_5p = process_5p( + chrom, positions, strand, vertex_IDs, gene_ID, gene_starts, edge_dict, locations, run_info + ) + end_vertex, end_exon, end_novelty, known_end, diff_3p = process_3p( + chrom, positions, strand, vertex_IDs, gene_ID, gene_ends, edge_dict, locations, run_info + ) # Update info edge_IDs = [start_exon] + edge_IDs + [end_exon] vertex_IDs = [start_vertex] + vertex_IDs + [end_vertex] @@ -988,9 +991,8 @@ def process_ISM(chrom, positions, strand, edge_IDs, vertex_IDs, all_matches, tra # Iterate over all matches from assigned gene to characterize ISMs for match in all_matches: - # Add ISM - ISM.append(str(match['transcript_ID'])) + ISM.append(str(match["transcript_ID"])) # Single-exon case if n_exons == 1: @@ -1001,58 +1003,53 @@ def process_ISM(chrom, positions, strand, edge_IDs, vertex_IDs, all_matches, tra novelty = [] return gene_ID, transcript_ID, novelty, start_end_info - match_path = match['jn_path'] + match_path = match["jn_path"] exon = str(edge_IDs[0]) # Look for prefix if match_path.startswith(exon): - prefix.append(str(match['transcript_ID'])) + 
prefix.append(str(match["transcript_ID"])) # Look for suffix if match_path.endswith(exon): - suffix.append(str(match['transcript_ID'])) - gene_ID = match['gene_ID'] + suffix.append(str(match["transcript_ID"])) + gene_ID = match["gene_ID"] continue # Multi-exon case edge_str = ",".join([str(x) for x in edge_IDs[1:-1]]) # Look for prefix - if match['jn_path'].startswith(edge_str): - prefix.append(str(match['transcript_ID'])) + if match["jn_path"].startswith(edge_str): + prefix.append(str(match["transcript_ID"])) # Look for suffix - if match['jn_path'].endswith(edge_str): - gene_ID = match['gene_ID'] - suffix.append(str(match['transcript_ID'])) + if match["jn_path"].endswith(edge_str): + gene_ID = match["gene_ID"] + suffix.append(str(match["transcript_ID"])) - novel_transcript = create_transcript(strand, chrom, positions[0], positions[-1], - gene_ID, edge_IDs, vertex_IDs, - transcript_dict, tmp_t, cursor) + novel_transcript = create_transcript( + strand, chrom, positions[0], positions[-1], gene_ID, edge_IDs, vertex_IDs, transcript_dict, tmp_t, cursor + ) - transcript_ID = novel_transcript['transcript_ID'] + transcript_ID = novel_transcript["transcript_ID"] ISM_str = ",".join(ISM) - novelty.append((transcript_ID, run_info.idprefix, "TALON", - "ISM_transcript", "TRUE")) - novelty.append((transcript_ID, run_info.idprefix, "TALON", - "ISM_to_IDs", ISM_str)) + novelty.append((transcript_ID, run_info.idprefix, "TALON", "ISM_transcript", "TRUE")) + novelty.append((transcript_ID, run_info.idprefix, "TALON", "ISM_to_IDs", ISM_str)) if prefix != []: prefix_str = ",".join(prefix) - novelty.append((transcript_ID, run_info.idprefix, "TALON", - "ISM-prefix_transcript", "TRUE")) - novelty.append((transcript_ID, run_info.idprefix, "TALON", - "ISM-prefix_to_IDs", prefix_str)) + novelty.append((transcript_ID, run_info.idprefix, "TALON", "ISM-prefix_transcript", "TRUE")) + novelty.append((transcript_ID, run_info.idprefix, "TALON", "ISM-prefix_to_IDs", prefix_str)) if suffix != []: suffix_str = ",".join(suffix) - novelty.append((transcript_ID, run_info.idprefix, "TALON", - "ISM-suffix_transcript", "TRUE")) - novelty.append((transcript_ID, run_info.idprefix, "TALON", - "ISM-suffix_to_IDs", suffix_str)) + novelty.append((transcript_ID, run_info.idprefix, "TALON", "ISM-suffix_transcript", "TRUE")) + novelty.append((transcript_ID, run_info.idprefix, "TALON", "ISM-suffix_to_IDs", suffix_str)) return gene_ID, transcript_ID, novelty, start_end_info -def assign_gene(vertex_IDs, strand, vertex_2_gene, - chrom, start, end, cursor, run_info, - tmp_gene, tmp_t, gene_starts, gene_ends): + +def assign_gene( + vertex_IDs, strand, vertex_2_gene, chrom, start, end, cursor, run_info, tmp_gene, tmp_t, gene_starts, gene_ends +): """ Assign a gene to a transcript. First do this on the basis of splice site matching. 
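Splice-site matching tallies, via the vertex_2_gene lookup, how many of the
transcript's vertices each candidate gene shares.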
If this yields more than one gene, then choose the gene with the @@ -1065,34 +1062,55 @@ def assign_gene(vertex_IDs, strand, vertex_2_gene, """ # first attempt to assign based on matching vertices - gene_ID, fusion = find_gene_match_on_vertex_basis(vertex_IDs, - strand, - vertex_2_gene) + gene_ID, fusion = find_gene_match_on_vertex_basis(vertex_IDs, strand, vertex_2_gene) # if previous function returned more than one gene that we need to tiebreak, # look for closest gene based on end differences, out of candidate genes # only if it wasn't previously labeled as fusion if type(gene_ID) == list and fusion == False: - gene_ID, match_strand = search_for_overlap_with_gene(chrom, start, - end, strand, - cursor, run_info, tmp_gene, - tmp_t, - gene_IDs=gene_ID) + gene_ID, match_strand = search_for_overlap_with_gene( + chrom, start, end, strand, cursor, run_info, tmp_gene, tmp_t, gene_IDs=gene_ID + ) return gene_ID, fusion -def process_NIC(chrom, positions, strand, edge_IDs, vertex_IDs, transcript_dict, - gene_starts, gene_ends, edge_dict, locations, vertex_2_gene, run_info, - cursor, tmp_gene, tmp_t): - """ For a transcript that has been determined to be novel in catalog, find - the proper gene match (documenting fusion event if applicable). To do - this, look up each vertex in the vertex_2_gene dict, and keep track of all - same-strand genes. """ +def process_NIC( + chrom, + positions, + strand, + edge_IDs, + vertex_IDs, + transcript_dict, + gene_starts, + gene_ends, + edge_dict, + locations, + vertex_2_gene, + run_info, + cursor, + tmp_gene, + tmp_t, +): + """For a transcript that has been determined to be novel in catalog, find + the proper gene match (documenting fusion event if applicable). To do + this, look up each vertex in the vertex_2_gene dict, and keep track of all + same-strand genes.""" start_end_info = {} - gene_ID, fusion = assign_gene(vertex_IDs, strand, vertex_2_gene, - chrom, positions[0], positions[-1], cursor, run_info, - tmp_gene, tmp_t, gene_starts, gene_ends) + gene_ID, fusion = assign_gene( + vertex_IDs, + strand, + vertex_2_gene, + chrom, + positions[0], + positions[-1], + cursor, + run_info, + tmp_gene, + tmp_t, + gene_starts, + gene_ends, + ) # gene_ID, fusion = find_gene_match_on_vertex_basis(vertex_IDs, # strand, @@ -1108,21 +1126,15 @@ def process_NIC(chrom, positions, strand, edge_IDs, vertex_IDs, transcript_dict, # print(gene_ID) if gene_ID == None: - return None, None, [], None, fusion + return None, None, [], None, fusion # Get matches for the ends - start_vertex, start_exon, start_novelty, known_start, diff_5p = process_5p(chrom, - positions, strand, - vertex_IDs, - gene_ID, gene_starts, - edge_dict, - locations, run_info) - end_vertex, end_exon, end_novelty, known_end, diff_3p = process_3p(chrom, - positions, strand, - vertex_IDs, - gene_ID, gene_ends, - edge_dict, - locations, run_info) + start_vertex, start_exon, start_novelty, known_start, diff_5p = process_5p( + chrom, positions, strand, vertex_IDs, gene_ID, gene_starts, edge_dict, locations, run_info + ) + end_vertex, end_exon, end_novelty, known_end, diff_3p = process_3p( + chrom, positions, strand, vertex_IDs, gene_ID, gene_ends, edge_dict, locations, run_info + ) # Update info edge_IDs = [start_exon] + edge_IDs + [end_exon] vertex_IDs = [start_vertex] + vertex_IDs + [end_vertex] @@ -1138,16 +1150,16 @@ def process_NIC(chrom, positions, strand, edge_IDs, vertex_IDs, transcript_dict, start_end_info["vertex_IDs"] = vertex_IDs # Create a new transcript of that gene - novel_transcript = 
create_transcript(strand, chrom, positions[0], positions[-1], - gene_ID, edge_IDs, vertex_IDs, - transcript_dict, tmp_t, cursor) + novel_transcript = create_transcript( + strand, chrom, positions[0], positions[-1], gene_ID, edge_IDs, vertex_IDs, transcript_dict, tmp_t, cursor + ) transcript_ID = novel_transcript["transcript_ID"] - novelty = [(transcript_ID, run_info.idprefix, "TALON", - "NIC_transcript", "TRUE")] + novelty = [(transcript_ID, run_info.idprefix, "TALON", "NIC_transcript", "TRUE")] fusion = False return gene_ID, transcript_ID, novelty, start_end_info, fusion + def get_vertex_2_gene_df(vertex_2_gene): """ Get a DataFrame mapping each unique combination of vertex:gene_ID @@ -1167,16 +1179,17 @@ def get_vertex_2_gene_df(vertex_2_gene): vids.append(key) # df = pd.DataFrame.from_dict(vertex_2_gene, orient='index') df = pd.DataFrame() - df['gid'] = gids - df['vid'] = vids + df["gid"] = gids + df["vid"] = vids # print(df.head()) # print(len(df.index)) # print(len(df.vid.unique().tolist())) # print(df.loc[df.vid.duplicated(keep=False)].sort_values(by='vid')) return df + def find_gene_match_on_vertex_basis(vertex_IDs, strand, vertex_2_gene): - """ Use vertices in a transcript to try to pinpoint the gene it belongs to. + """Use vertices in a transcript to try to pinpoint the gene it belongs to. Parameters: vertex_IDs (list of int): List of vertices in the read @@ -1230,13 +1243,13 @@ def find_gene_match_on_vertex_basis(vertex_IDs, strand, vertex_2_gene): # when there are no shared splice sites between gene hits but we did # hit more than one gene elif max(n_gene_matches) <= 1 and len(gene_tally) > 1: - print(' went here') + print(" went here") return None, True # if we hit more than one gene and they have overlapping sjs, # tie break based on ????? 
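# (the full candidate list is returned below; the caller, e.g. assign_gene,
# breaks the tie downstream via search_for_overlap_with_gene, which favors the
# candidate whose ends lie closest to the read's)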
elif len(gene_tally) > 1: - print('i found more than one gene') + print("i found more than one gene") print(gene_tally) print(n_gene_matches) return list(gene_tally.keys()), False @@ -1254,8 +1267,6 @@ def find_gene_match_on_vertex_basis(vertex_IDs, strand, vertex_2_gene): # temp3 = temp1.merge(temp2, on='gid') # print(temp3) - - # For the main assignment, pick the gene that is observed the most else: gene_ID = max(gene_tally, key=gene_tally.get) @@ -1264,10 +1275,24 @@ def find_gene_match_on_vertex_basis(vertex_IDs, strand, vertex_2_gene): return gene_ID, fusion -def process_NNC(chrom, positions, strand, edge_IDs, vertex_IDs, transcript_dict, - gene_starts, gene_ends, edge_dict, locations, vertex_2_gene, run_info, - cursor, tmp_gene, tmp_t): - """ Novel not in catalog case """ +def process_NNC( + chrom, + positions, + strand, + edge_IDs, + vertex_IDs, + transcript_dict, + gene_starts, + gene_ends, + edge_dict, + locations, + vertex_2_gene, + run_info, + cursor, + tmp_gene, + tmp_t, +): + """Novel not in catalog case""" novelty = [] start_end_info = {} @@ -1284,10 +1309,21 @@ def process_NNC(chrom, positions, strand, edge_IDs, vertex_IDs, transcript_dict, # gene_starts, gene_ends) # print('geneid from search for overlap with gene') # print(gene_ID) - gene_ID, fusion = assign_gene(vertex_IDs, strand, vertex_2_gene, - chrom, positions[0], positions[-1], cursor, run_info, - tmp_gene, tmp_t, gene_starts, gene_ends) - print('gene id process_nnc') + gene_ID, fusion = assign_gene( + vertex_IDs, + strand, + vertex_2_gene, + chrom, + positions[0], + positions[-1], + cursor, + run_info, + tmp_gene, + tmp_t, + gene_starts, + gene_ends, + ) + print("gene id process_nnc") print(gene_ID) print(fusion) @@ -1295,18 +1331,12 @@ def process_NNC(chrom, positions, strand, edge_IDs, vertex_IDs, transcript_dict, return None, None, [], None, fusion # Get matches for the ends - start_vertex, start_exon, start_novelty, known_start, diff_5p = process_5p(chrom, - positions, strand, - vertex_IDs, - gene_ID, gene_starts, - edge_dict, - locations, run_info) - end_vertex, end_exon, end_novelty, known_end, diff_3p = process_3p(chrom, - positions, strand, - vertex_IDs, - gene_ID, gene_ends, - edge_dict, - locations, run_info) + start_vertex, start_exon, start_novelty, known_start, diff_5p = process_5p( + chrom, positions, strand, vertex_IDs, gene_ID, gene_starts, edge_dict, locations, run_info + ) + end_vertex, end_exon, end_novelty, known_end, diff_3p = process_3p( + chrom, positions, strand, vertex_IDs, gene_ID, gene_ends, edge_dict, locations, run_info + ) # Update info edge_IDs = [start_exon] + edge_IDs + [end_exon] vertex_IDs = [start_vertex] + vertex_IDs + [end_vertex] @@ -1321,22 +1351,34 @@ def process_NNC(chrom, positions, strand, edge_IDs, vertex_IDs, transcript_dict, start_end_info["edge_IDs"] = edge_IDs start_end_info["vertex_IDs"] = vertex_IDs - transcript_ID = create_transcript(strand, chrom, positions[0], positions[-1], - gene_ID, edge_IDs, vertex_IDs, - transcript_dict, tmp_t, cursor)["transcript_ID"] + transcript_ID = create_transcript( + strand, chrom, positions[0], positions[-1], gene_ID, edge_IDs, vertex_IDs, transcript_dict, tmp_t, cursor + )["transcript_ID"] - novelty.append((transcript_ID, run_info.idprefix, "TALON", - "NNC_transcript", "TRUE")) + novelty.append((transcript_ID, run_info.idprefix, "TALON", "NNC_transcript", "TRUE")) fusion = False return gene_ID, transcript_ID, novelty, start_end_info, fusion -def process_spliced_antisense(chrom, positions, strand, edge_IDs, vertex_IDs, - 
transcript_dict, gene_starts, gene_ends, edge_dict, - locations, vertex_2_gene, run_info, cursor, tmp_gene, - tmp_t): - """ Annotate a transcript as antisense with splice junctions """ +def process_spliced_antisense( + chrom, + positions, + strand, + edge_IDs, + vertex_IDs, + transcript_dict, + gene_starts, + gene_ends, + edge_dict, + locations, + vertex_2_gene, + run_info, + cursor, + tmp_gene, + tmp_t, +): + """Annotate a transcript as antisense with splice junctions""" gene_novelty = [] transcript_novelty = [] @@ -1346,30 +1388,21 @@ def process_spliced_antisense(chrom, positions, strand, edge_IDs, vertex_IDs, anti_strand = "-" else: anti_strand = "+" - anti_gene_ID, fusion = find_gene_match_on_vertex_basis(vertex_IDs, anti_strand, - vertex_2_gene) + anti_gene_ID, fusion = find_gene_match_on_vertex_basis(vertex_IDs, anti_strand, vertex_2_gene) if type(anti_gene_ID) == list and fusion == False: - anti_gene_ID, match_strand = search_for_overlap_with_gene(chrom, positions[0], - positions[-1], strand, - cursor, run_info, tmp_gene, - tmp_t, - gene_IDs=anti_gene_ID) + anti_gene_ID, match_strand = search_for_overlap_with_gene( + chrom, positions[0], positions[-1], strand, cursor, run_info, tmp_gene, tmp_t, gene_IDs=anti_gene_ID + ) if anti_gene_ID == None: return None, None, gene_novelty, transcript_novelty, start_end_info # Take care of ends - start_vertex, start_exon, start_novelty, known_start, diff_5p = process_5p(chrom, - positions, strand, - vertex_IDs, - anti_gene_ID, gene_ends, - edge_dict, - locations, run_info) - end_vertex, end_exon, end_novelty, known_end, diff_3p = process_3p(chrom, - positions, strand, - vertex_IDs, - anti_gene_ID, gene_starts, - edge_dict, - locations, run_info) + start_vertex, start_exon, start_novelty, known_start, diff_5p = process_5p( + chrom, positions, strand, vertex_IDs, anti_gene_ID, gene_ends, edge_dict, locations, run_info + ) + end_vertex, end_exon, end_novelty, known_end, diff_3p = process_3p( + chrom, positions, strand, vertex_IDs, anti_gene_ID, gene_starts, edge_dict, locations, run_info + ) # Update info edge_IDs = [start_exon] + edge_IDs + [end_exon] vertex_IDs = [start_vertex] + vertex_IDs + [end_vertex] @@ -1384,59 +1417,61 @@ def process_spliced_antisense(chrom, positions, strand, edge_IDs, vertex_IDs, start_end_info["edge_IDs"] = edge_IDs start_end_info["vertex_IDs"] = vertex_IDs - gene_ID = create_gene(chrom, positions[0], positions[-1], - strand, cursor, tmp_gene) - transcript_ID = create_transcript(strand, chrom, positions[0], positions[-1], - gene_ID, edge_IDs, vertex_IDs, - transcript_dict, tmp_t, cursor)["transcript_ID"] + gene_ID = create_gene(chrom, positions[0], positions[-1], strand, cursor, tmp_gene) + transcript_ID = create_transcript( + strand, chrom, positions[0], positions[-1], gene_ID, edge_IDs, vertex_IDs, transcript_dict, tmp_t, cursor + )["transcript_ID"] # Handle gene annotations - gene_novelty.append((gene_ID, run_info.idprefix, "TALON", - "antisense_gene", "TRUE")) - gene_novelty.append((gene_ID, run_info.idprefix, "TALON", - "gene_antisense_to_IDs", anti_gene_ID)) + gene_novelty.append((gene_ID, run_info.idprefix, "TALON", "antisense_gene", "TRUE")) + gene_novelty.append((gene_ID, run_info.idprefix, "TALON", "gene_antisense_to_IDs", anti_gene_ID)) # Handle transcript annotations - transcript_novelty.append((transcript_ID, run_info.idprefix, "TALON", - "antisense_transcript", "TRUE")) + transcript_novelty.append((transcript_ID, run_info.idprefix, "TALON", "antisense_transcript", "TRUE")) return gene_ID, transcript_ID, 
gene_novelty, transcript_novelty, start_end_info -def process_remaining_mult_cases(chrom, positions, strand, edge_IDs, vertex_IDs, - transcript_dict, gene_starts, gene_ends, edge_dict, - locations, vertex_2_gene, run_info, cursor, tmp_gene, - tmp_t, - fusion): - """ This function is a catch-all for multiexonic transcripts that were not - FSM, ISM, NIC, NNC, or spliced antisense. +def process_remaining_mult_cases( + chrom, + positions, + strand, + edge_IDs, + vertex_IDs, + transcript_dict, + gene_starts, + gene_ends, + edge_dict, + locations, + vertex_2_gene, + run_info, + cursor, + tmp_gene, + tmp_t, + fusion, +): + """This function is a catch-all for multiexonic transcripts that were not + FSM, ISM, NIC, NNC, or spliced antisense. """ gene_novelty = [] transcript_novelty = [] start_end_info = {} if not run_info.create_novel_spliced_genes or not fusion: - print('did i get here?') - gene_ID, match_strand = search_for_overlap_with_gene(chrom, positions[0], - positions[-1], strand, - cursor, run_info, tmp_gene, - tmp_t) + print("did i get here?") + gene_ID, match_strand = search_for_overlap_with_gene( + chrom, positions[0], positions[-1], strand, cursor, run_info, tmp_gene, tmp_t + ) else: gene_ID = None match_strand = None # We don't care about the gene when making these assignments - start_vertex, start_exon, start_novelty, known_start, diff_5p = process_5p(chrom, - positions, strand, - vertex_IDs, - gene_ID, gene_starts, - edge_dict, - locations, run_info) - end_vertex, end_exon, end_novelty, known_end, diff_3p = process_3p(chrom, - positions, strand, - vertex_IDs, - gene_ID, gene_ends, - edge_dict, - locations, run_info) + start_vertex, start_exon, start_novelty, known_start, diff_5p = process_5p( + chrom, positions, strand, vertex_IDs, gene_ID, gene_starts, edge_dict, locations, run_info + ) + end_vertex, end_exon, end_novelty, known_end, diff_3p = process_3p( + chrom, positions, strand, vertex_IDs, gene_ID, gene_ends, edge_dict, locations, run_info + ) # Update info edge_IDs = [start_exon] + edge_IDs + [end_exon] vertex_IDs = [start_vertex] + vertex_IDs + [end_vertex] @@ -1452,54 +1487,46 @@ def process_remaining_mult_cases(chrom, positions, strand, edge_IDs, vertex_IDs, start_end_info["vertex_IDs"] = vertex_IDs if gene_ID == None: - print(f'fusion: {fusion}') + print(f"fusion: {fusion}") if fusion: - print('i should be here') - t_nov = 'fusion_transcript' - g_nov = 'fusion_novel' + print("i should be here") + t_nov = "fusion_transcript" + g_nov = "fusion_novel" else: - print('but I think im going here') - t_nov = 'intergenic_transcript' - g_nov = 'intergenic_novel' + print("but I think im going here") + t_nov = "intergenic_transcript" + g_nov = "intergenic_novel" - gene_ID = create_gene(chrom, positions[0], positions[-1], - strand, cursor, tmp_gene) + gene_ID = create_gene(chrom, positions[0], positions[-1], strand, cursor, tmp_gene) - gene_novelty.append((gene_ID, run_info.idprefix, "TALON", - g_nov, "TRUE")) + gene_novelty.append((gene_ID, run_info.idprefix, "TALON", g_nov, "TRUE")) - transcript_ID = create_transcript(strand, chrom, positions[0], positions[-1], - gene_ID, edge_IDs, vertex_IDs, - transcript_dict, tmp_t, cursor)["transcript_ID"] - transcript_novelty.append((transcript_ID, run_info.idprefix, "TALON", - t_nov, "TRUE")) + transcript_ID = create_transcript( + strand, chrom, positions[0], positions[-1], gene_ID, edge_IDs, vertex_IDs, transcript_dict, tmp_t, cursor + )["transcript_ID"] + transcript_novelty.append((transcript_ID, run_info.idprefix, "TALON", t_nov, "TRUE")) 
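# An overlap on the opposite strand becomes a novel antisense gene instead.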
elif match_strand != strand: anti_gene_ID = gene_ID - gene_ID = create_gene(chrom, positions[0], positions[-1], strand, - cursor, tmp_gene) - transcript_ID = create_transcript(strand, chrom, positions[0], positions[-1], - gene_ID, edge_IDs, vertex_IDs, - transcript_dict, tmp_t, cursor)["transcript_ID"] - - gene_novelty.append((gene_ID, run_info.idprefix, "TALON", - "antisense_gene", "TRUE")) - gene_novelty.append((gene_ID, run_info.idprefix, "TALON", - "gene_antisense_to_IDs", anti_gene_ID)) - transcript_novelty.append((transcript_ID, run_info.idprefix, "TALON", - "antisense_transcript", "TRUE")) + gene_ID = create_gene(chrom, positions[0], positions[-1], strand, cursor, tmp_gene) + transcript_ID = create_transcript( + strand, chrom, positions[0], positions[-1], gene_ID, edge_IDs, vertex_IDs, transcript_dict, tmp_t, cursor + )["transcript_ID"] + + gene_novelty.append((gene_ID, run_info.idprefix, "TALON", "antisense_gene", "TRUE")) + gene_novelty.append((gene_ID, run_info.idprefix, "TALON", "gene_antisense_to_IDs", anti_gene_ID)) + transcript_novelty.append((transcript_ID, run_info.idprefix, "TALON", "antisense_transcript", "TRUE")) else: - transcript_ID = create_transcript(strand, chrom, positions[0], positions[-1], - gene_ID, edge_IDs, vertex_IDs, - transcript_dict, tmp_t, cursor)["transcript_ID"] - transcript_novelty.append((transcript_ID, run_info.idprefix, "TALON", - "genomic_transcript", "TRUE")) + transcript_ID = create_transcript( + strand, chrom, positions[0], positions[-1], gene_ID, edge_IDs, vertex_IDs, transcript_dict, tmp_t, cursor + )["transcript_ID"] + transcript_novelty.append((transcript_ID, run_info.idprefix, "TALON", "genomic_transcript", "TRUE")) return gene_ID, transcript_ID, gene_novelty, transcript_novelty, start_end_info def update_vertex_2_gene(gene_ID, vertex_IDs, strand, vertex_2_gene): - """ Add all vertices with gene pairings to vertex_2_gene dict """ + """Add all vertices with gene pairings to vertex_2_gene dict""" for vertex in vertex_IDs: if vertex in vertex_2_gene: @@ -1511,50 +1538,60 @@ def update_vertex_2_gene(gene_ID, vertex_IDs, strand, vertex_2_gene): return -def identify_transcript(chrom, positions, strand, cursor, location_dict, edge_dict, - transcript_dict, vertex_2_gene, gene_starts, gene_ends, - run_info, tmp_gene, tmp_t): - """ Inputs: - - Information about the query transcript - - chromosome - - list of positions - - strand - - Data structures - - location_dict (position --> vertex) - - edge_dict (v1_v2_edgetype --> edge) - - transcript_dict - - vertex_2_gene (maps vertices to the gene(s) they are part of) - - gene_starts (maps gene IDs to known start vertices) - - gene_ends (maps gene IDs to known end vertices) - - run_info - - Outputs: - - Assigned gene ID - - Assigned transcript ID - - gene and transcript novelty entries (to be added to database) - - IDs of start and end vertices - - 5' and 3' deltas from assigned start/end vertices +def identify_transcript( + chrom, + positions, + strand, + cursor, + location_dict, + edge_dict, + transcript_dict, + vertex_2_gene, + gene_starts, + gene_ends, + run_info, + tmp_gene, + tmp_t, +): + """Inputs: + - Information about the query transcript + - chromosome + - list of positions + - strand + - Data structures + - location_dict (position --> vertex) + - edge_dict (v1_v2_edgetype --> edge) + - transcript_dict + - vertex_2_gene (maps vertices to the gene(s) they are part of) + - gene_starts (maps gene IDs to known start vertices) + - gene_ends (maps gene IDs to known end vertices) + - run_info + + 
Outputs: + - Assigned gene ID + - Assigned transcript ID + - gene and transcript novelty entries (to be added to database) + - IDs of start and end vertices + - 5' and 3' deltas from assigned start/end vertices """ gene_novelty = [] transcript_novelty = [] - n_exons = int(len(positions)/2.0) + n_exons = int(len(positions) / 2.0) gene_ID = None # Get vertex matches for the transcript positions - vertex_IDs, v_novelty = match_splice_vertices(chrom, positions, strand, - location_dict, run_info) + vertex_IDs, v_novelty = match_splice_vertices(chrom, positions, strand, location_dict, run_info) # Get edge matches for transcript exons and introns based on the vertices - edge_IDs, e_novelty = match_all_splice_edges( - vertex_IDs, strand, edge_dict, run_info) + edge_IDs, e_novelty = match_all_splice_edges(vertex_IDs, strand, edge_dict, run_info) # Check novelty of exons and splice jns. This will help us categorize # what type of novelty the transcript has all_SJs_known = check_all_SJs_known(e_novelty) all_exons_known = check_all_exons_known(e_novelty) - splice_vertices_known = (sum(v_novelty) == 0) - all_exons_novel = (reduce(operator.mul, e_novelty, 1) == 1) - print(f'all exons novel : {all_exons_novel}') + splice_vertices_known = sum(v_novelty) == 0 + all_exons_novel = reduce(operator.mul, e_novelty, 1) == 1 + print(f"all exons novel : {all_exons_novel}") fusion = False # Look for FSM or ISM. @@ -1563,127 +1600,172 @@ def identify_transcript(chrom, positions, strand, cursor, location_dict, edge_di all_matches = search_for_ISM(edge_IDs, transcript_dict) if all_matches != None: # Look for FSM first - print('looking for fsm') - gene_ID, transcript_ID, transcript_novelty, start_end_info = process_FSM(chrom, - positions, strand, edge_IDs, - vertex_IDs, all_matches, - gene_starts, gene_ends, - edge_dict, - location_dict, run_info) + print("looking for fsm") + gene_ID, transcript_ID, transcript_novelty, start_end_info = process_FSM( + chrom, + positions, + strand, + edge_IDs, + vertex_IDs, + all_matches, + gene_starts, + gene_ends, + edge_dict, + location_dict, + run_info, + ) if gene_ID == None: # Now look for ISM - print('looking for ism') - gene_ID, transcript_ID, transcript_novelty, start_end_info = process_ISM(chrom, - positions, - strand, edge_IDs, - vertex_IDs, - all_matches, - transcript_dict, - gene_starts, gene_ends, - edge_dict, location_dict, - run_info, cursor, tmp_gene, tmp_t) - print(f'gene id from process ism {gene_ID}') + print("looking for ism") + gene_ID, transcript_ID, transcript_novelty, start_end_info = process_ISM( + chrom, + positions, + strand, + edge_IDs, + vertex_IDs, + all_matches, + transcript_dict, + gene_starts, + gene_ends, + edge_dict, + location_dict, + run_info, + cursor, + tmp_gene, + tmp_t, + ) + print(f"gene id from process ism {gene_ID}") # Look for NIC if gene_ID == None: - print('looking for nic') - gene_ID, transcript_ID, transcript_novelty, start_end_info, fusion = process_NIC(chrom, - positions, - strand, edge_IDs, - vertex_IDs, transcript_dict, - gene_starts, gene_ends, - edge_dict, location_dict, - vertex_2_gene, run_info, - cursor, tmp_gene, - tmp_t) + print("looking for nic") + gene_ID, transcript_ID, transcript_novelty, start_end_info, fusion = process_NIC( + chrom, + positions, + strand, + edge_IDs, + vertex_IDs, + transcript_dict, + gene_starts, + gene_ends, + edge_dict, + location_dict, + vertex_2_gene, + run_info, + cursor, + tmp_gene, + tmp_t, + ) # Novel in catalog transcripts have known splice donors and acceptors, # but new connections between 
them.
    elif splice_vertices_known and gene_ID == None:
-        print('looking for nic (again?)')
-        gene_ID, transcript_ID, transcript_novelty, start_end_info, fusion = process_NIC(chrom,
-                                                                                         positions,
-                                                                                         strand, edge_IDs,
-                                                                                         vertex_IDs, transcript_dict,
-                                                                                         gene_starts, gene_ends,
-                                                                                         edge_dict, location_dict,
-                                                                                         vertex_2_gene, run_info,
-                                                                                         cursor, tmp_gene,
-                                                                                         tmp_t)
+        print("looking for nic (again?)")
+        gene_ID, transcript_ID, transcript_novelty, start_end_info, fusion = process_NIC(
+            chrom,
+            positions,
+            strand,
+            edge_IDs,
+            vertex_IDs,
+            transcript_dict,
+            gene_starts,
+            gene_ends,
+            edge_dict,
+            location_dict,
+            vertex_2_gene,
+            run_info,
+            cursor,
+            tmp_gene,
+            tmp_t,
+        )

    # Antisense transcript with splice junctions matching known gene
    if splice_vertices_known and gene_ID == None and not fusion:
-        print('looking for spliced antisese')
-        gene_ID, transcript_ID, gene_novelty, transcript_novelty, start_end_info = \
-            process_spliced_antisense(chrom, positions,
-                                      strand, edge_IDs,
-                                      vertex_IDs,
-                                      transcript_dict,
-                                      gene_starts,
-                                      gene_ends,
-                                      edge_dict, location_dict,
-                                      vertex_2_gene, run_info,
-                                      cursor, tmp_gene, tmp_t)
+        print("looking for spliced antisense")
+        gene_ID, transcript_ID, gene_novelty, transcript_novelty, start_end_info = process_spliced_antisense(
+            chrom,
+            positions,
+            strand,
+            edge_IDs,
+            vertex_IDs,
+            transcript_dict,
+            gene_starts,
+            gene_ends,
+            edge_dict,
+            location_dict,
+            vertex_2_gene,
+            run_info,
+            cursor,
+            tmp_gene,
+            tmp_t,
+        )

    # Novel not in catalog transcripts contain new splice donors/acceptors
    # and contain at least one splice junction. There should also be at least
    # one shared exon from existing transcripts to even try assigning a gene
-    elif not(splice_vertices_known) and not fusion and not all_exons_novel:
-        print('lookign for NNCs')
-        gene_ID, transcript_ID, transcript_novelty, start_end_info, fusion = process_NNC(chrom,
-                                                                                         positions,
-                                                                                         strand, edge_IDs,
-                                                                                         vertex_IDs, transcript_dict,
-                                                                                         gene_starts, gene_ends,
-                                                                                         edge_dict, location_dict,
-                                                                                         vertex_2_gene, run_info,
-                                                                                         cursor, tmp_gene,
-                                                                                         tmp_t)
-        print(f'geneID from process_nnc: {gene_ID}')
+    elif not (splice_vertices_known) and not fusion and not all_exons_novel:
+        print("looking for NNCs")
+        gene_ID, transcript_ID, transcript_novelty, start_end_info, fusion = process_NNC(
+            chrom,
+            positions,
+            strand,
+            edge_IDs,
+            vertex_IDs,
+            transcript_dict,
+            gene_starts,
+            gene_ends,
+            edge_dict,
+            location_dict,
+            vertex_2_gene,
+            run_info,
+            cursor,
+            tmp_gene,
+            tmp_t,
+        )
+        print(f"geneID from process_nnc: {gene_ID}")

    # Transcripts that don't match the previous categories end up here
    if gene_ID == None:
-        print('looking for this other stuff')
-        gene_ID, transcript_ID, gene_novelty, transcript_novelty, start_end_info = \
-            process_remaining_mult_cases(chrom, positions,
-                                         strand, edge_IDs,
-                                         vertex_IDs,
-                                         transcript_dict,
-                                         gene_starts, gene_ends,
-                                         edge_dict, location_dict,
-                                         vertex_2_gene, run_info,
-                                         cursor, tmp_gene,
-                                         tmp_t,
-                                         fusion)
-
-    print('this is the gene id it decided on')
+        print("looking for this other stuff")
+        gene_ID, transcript_ID, gene_novelty, transcript_novelty, start_end_info = process_remaining_mult_cases(
+            chrom,
+            positions,
+            strand,
+            edge_IDs,
+            vertex_IDs,
+            transcript_dict,
+            gene_starts,
+            gene_ends,
+            edge_dict,
+            location_dict,
+            vertex_2_gene,
+            run_info,
+            cursor,
+            tmp_gene,
+            tmp_t,
+            fusion,
+        )
+
+    print("this is the gene id it decided on")
    print(gene_ID)
    # Add all novel vertices to vertex_2_gene now that we have the gene ID
    vertex_IDs = start_end_info["vertex_IDs"]
    edge_IDs =
start_end_info["edge_IDs"] - e_novelty = [start_end_info["start_novelty"]] + e_novelty + \ - [start_end_info["end_novelty"]] + e_novelty = [start_end_info["start_novelty"]] + e_novelty + [start_end_info["end_novelty"]] update_vertex_2_gene(gene_ID, vertex_IDs, strand, vertex_2_gene) # For novel genes and transcripts, add names to novelty entries - talon_gene_name, talon_transcript_name = construct_names(gene_ID, - transcript_ID, - run_info.idprefix, - run_info.n_places) + talon_gene_name, talon_transcript_name = construct_names( + gene_ID, transcript_ID, run_info.idprefix, run_info.n_places + ) if len(gene_novelty) > 0: - gene_novelty.append((gene_ID, run_info.idprefix, "TALON", - "gene_status", "NOVEL")) - gene_novelty.append((gene_ID, run_info.idprefix, "TALON", - "gene_name", talon_gene_name)) - gene_novelty.append((gene_ID, run_info.idprefix, "TALON", - "gene_id", talon_gene_name)) + gene_novelty.append((gene_ID, run_info.idprefix, "TALON", "gene_status", "NOVEL")) + gene_novelty.append((gene_ID, run_info.idprefix, "TALON", "gene_name", talon_gene_name)) + gene_novelty.append((gene_ID, run_info.idprefix, "TALON", "gene_id", talon_gene_name)) if len(transcript_novelty) > 0: - transcript_novelty.append((transcript_ID, run_info.idprefix, "TALON", - "transcript_status", "NOVEL")) - transcript_novelty.append((transcript_ID, run_info.idprefix, "TALON", - "transcript_name", talon_transcript_name)) - transcript_novelty.append((transcript_ID, run_info.idprefix, "TALON", - "transcript_id", talon_transcript_name)) + transcript_novelty.append((transcript_ID, run_info.idprefix, "TALON", "transcript_status", "NOVEL")) + transcript_novelty.append((transcript_ID, run_info.idprefix, "TALON", "transcript_name", talon_transcript_name)) + transcript_novelty.append((transcript_ID, run_info.idprefix, "TALON", "transcript_id", talon_transcript_name)) # Add annotation entries for any novel exons exon_novelty = [] exons = edge_IDs[::2] @@ -1692,8 +1774,7 @@ def identify_transcript(chrom, positions, strand, cursor, location_dict, edge_di if sum(e_novelty) > 0: for exon, is_novel in zip(exons, e_novelty): if is_novel: - exon_novelty.append((exon, run_info.idprefix, "TALON", - "exon_status", "NOVEL")) + exon_novelty.append((exon, run_info.idprefix, "TALON", "exon_status", "NOVEL")) # Package up information for output annotations = dstruct.Struct() @@ -1712,9 +1793,9 @@ def identify_transcript(chrom, positions, strand, cursor, location_dict, edge_di def construct_names(gene_ID, transcript_ID, prefix, n_places): - """ Create a gene and transcript name using the TALON IDs. - The n_places variable indicates how many characters long the numeric - part of the name should be. """ + """Create a gene and transcript name using the TALON IDs. + The n_places variable indicates how many characters long the numeric + part of the name should be.""" gene_ID_str = str(gene_ID).zfill(n_places) gene_name = prefix + "G" + gene_ID_str @@ -1726,8 +1807,8 @@ def construct_names(gene_ID, transcript_ID, prefix, n_places): def check_inputs(options): - """ Checks the input options provided by the user and makes sure that - they are valid. Throw an error with descriptive help message if not.""" + """Checks the input options provided by the user and makes sure that + they are valid. Throw an error with descriptive help message if not.""" # Make sure that the input database exists! 
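# (after this existence check, check_inputs validates the genome build against
# the database and then parses the config file, whose comma-separated fields are
# dataset name, sample description, platform, and the associated sam/bam file)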
database = options.database @@ -1743,8 +1824,7 @@ def check_inputs(options): builds = [str(x[0]) for x in cursor.fetchall()] if options.build not in builds: build_names = ", ".join(list(builds)) - msg = "Please specify a genome build that exists in the" +\ - " database. The choices are: " + build_names + msg = "Please specify a genome build that exists in the" + " database. The choices are: " + build_names logging.error(msg) raise ValueError(msg) @@ -1758,15 +1838,17 @@ def check_inputs(options): cursor.execute(""" SELECT dataset_name FROM dataset """) existing_datasets = [str(x[0]) for x in cursor.fetchall()] - with open(options.config_file, 'r') as f: + with open(options.config_file, "r") as f: for line in f: - line = line.strip().split(',') + line = line.strip().split(",") curr_sam = line[3] if len(line) != 4: - msg = 'Incorrect number of comma-separated fields' +\ - ' in config file. There should be four: ' +\ - '(dataset name, sample description, ' +\ - 'platform, associated sam/bam file).' + msg = ( + "Incorrect number of comma-separated fields" + + " in config file. There should be four: " + + "(dataset name, sample description, " + + "platform, associated sam/bam file)." + ) logging.error(msg) raise ValueError(msg) @@ -1779,19 +1861,18 @@ def check_inputs(options): metadata = (line[0], line[1], line[2]) dataname = metadata[0] if dataname in existing_datasets: - logging.warning("Ignoring dataset with name '" + dataname + - "' because it is already in the database.") + logging.warning( + "Ignoring dataset with name '" + dataname + "' because it is already in the database." + ) elif dataname in curr_datasets: - logging.warning("Skipping duplicated instance of dataset '" + - dataname + "'.") + logging.warning("Skipping duplicated instance of dataset '" + dataname + "'.") elif curr_sam in sam_files: - logging.warning("Skipping duplicated instance of sam file '" + - curr_sam + "'.") + logging.warning("Skipping duplicated instance of sam file '" + curr_sam + "'.") else: dataset_metadata.append(metadata) curr_datasets.append(dataname) if not curr_sam.endswith(".sam") and not curr_sam.endswith(".bam"): - msg = 'Last field in config file must be a .sam/.bam file' + msg = "Last field in config file must be a .sam/.bam file" logging.error(msg) raise ValueError(msg) sam_files.append(curr_sam) @@ -1805,7 +1886,7 @@ def check_inputs(options): cursor.execute(""" SELECT dataset_name FROM dataset """) existing_datasets = [str(x[0]) for x in cursor.fetchall()] - with open(options.config_file, 'r') as f: + with open(options.config_file, "r") as f: n_lines = 0 for line in f: n_lines += 1 @@ -1815,13 +1896,15 @@ def check_inputs(options): # raise ValueError('Using more than one sam file with RG '+\ # 'tag is currently unsupported') - line = line.strip().split(',') + line = line.strip().split(",") curr_sam = line[2] if len(line) != 3: - msg = 'Incorrect number of comma-separated fields' +\ - ' in config file. There should be three: ' +\ - '(sample description, ' +\ - 'platform, associated sam/bam file).' + msg = ( + "Incorrect number of comma-separated fields" + + " in config file. There should be three: " + + "(sample description, " + + "platform, associated sam/bam file)." + ) logging.error(msg) raise ValueError(msg) @@ -1830,41 +1913,44 @@ def check_inputs(options): msg = f"SAM/BAM file '{curr_sam}' does not exist!" 
logging.error(msg) raise ValueError(msg) - metadata = ['', line[0], line[1]] + metadata = ["", line[0], line[1]] # get list of dataset names from the CB tag in the sam file - if curr_sam.endswith('.sam'): - + if curr_sam.endswith(".sam"): # which rows are comment rows? - with open(curr_sam, 'r') as infile: + with open(curr_sam, "r") as infile: skip_rows = [] for i, line in enumerate(infile): - if line.startswith('@'): + if line.startswith("@"): skip_rows.append(i) else: break # read just the cb tags - df = pd.read_csv(curr_sam, sep='\tCB:Z:', - skiprows=skip_rows, - usecols=[1], header=None, - names=['cb_tag'], engine='python') + df = pd.read_csv( + curr_sam, + sep="\tCB:Z:", + skiprows=skip_rows, + usecols=[1], + header=None, + names=["cb_tag"], + engine="python", + ) # is the df empty? if df.empty: - msg = 'SAM/BAM file contains no CB tags' + msg = "SAM/BAM file contains no CB tags" logging.error(msg) raise RuntimeError(msg) - df['dataset'] = df.cb_tag.str.split( - pat='\t', n=1, expand=True)[0] + df["dataset"] = df.cb_tag.str.split(pat="\t", n=1, expand=True)[0] datasets = df.dataset.unique().tolist() - elif curr_sam.endswith('.bam'): + elif curr_sam.endswith(".bam"): datasets = [] # make an index so we can use the pysam fetch function pysam.index(curr_sam) - infile = pysam.AlignmentFile(curr_sam, 'rb') + infile = pysam.AlignmentFile(curr_sam, "rb") for read in infile.fetch(): - tag = read.get_tag('CB') + tag = read.get_tag("CB") datasets.append(tag) # only unique dataset datasets = list(set(datasets)) @@ -1873,15 +1959,13 @@ def check_inputs(options): metadata[0] = dataname if dataname in existing_datasets: - msg = f"Dataset for read group {f} " +\ - "already in database." + msg = f"Dataset for read group {f} " + "already in database." logging.error(msg) raise RuntimeError(msg) # warnings.warn("Ignoring dataset with name '" + dataname + \ # "' because it is already in the database.") elif dataname in curr_datasets: - msg = f"Dataset for read group {f} " +\ - "already in config file." + msg = f"Dataset for read group {f} " + "already in config file." logging.error(msg) raise RuntimeError(msg) # warnings.warn("Skipping duplicated instance of dataset '" + \ @@ -1890,10 +1974,9 @@ def check_inputs(options): dataset_metadata.append(tuple(metadata)) curr_datasets.append(dataname) if curr_sam in sam_files: - logging.warning("Skipping duplicated instance of sam/bam file '" + - curr_sam + "'.") + logging.warning("Skipping duplicated instance of sam/bam file '" + curr_sam + "'.") if not curr_sam.endswith(".sam") and not curr_sam.endswith(".bam"): - msg = 'Last field in config file must be a .sam/.bam file' + msg = "Last field in config file must be a .sam/.bam file" logging.error(msg) raise ValueError(msg) sam_files.append(curr_sam) @@ -1906,19 +1989,25 @@ def check_inputs(options): # sam_files.append(curr_sam) if sam_files == []: - msg = "All of the provided dataset names are already in "+\ - "the database. Please check your config file." + msg = "All of the provided dataset names are already in " + "the database. Please check your config file." logging.error(msg) raise RuntimeError(msg) return sam_files, dataset_metadata -def init_run_info(database, genome_build, min_coverage=0.9, min_identity=0, - use_cb_tag=False, create_novel_spliced_genes=False, tmp_dir="talon_tmp/"): - """ Initializes a dictionary that keeps track of important run information - such as the desired genome build, the prefix for novel identifiers, - and the novel counters for the run. 
""" +def init_run_info( + database, + genome_build, + min_coverage=0.9, + min_identity=0, + use_cb_tag=False, + create_novel_spliced_genes=False, + tmp_dir="talon_tmp/", +): + """Initializes a dictionary that keeps track of important run information + such as the desired genome build, the prefix for novel identifiers, + and the novel counters for the run.""" with sqlite3.connect(database) as conn: conn.row_factory = sqlite3.Row @@ -1936,8 +2025,8 @@ def init_run_info(database, genome_build, min_coverage=0.9, min_identity=0, # Fetch information from run_info table cursor.execute("""SELECT * FROM run_info""") for info in cursor.fetchall(): - info_name = info['item'] - value = info['value'] + info_name = info["item"] + value = info["value"] if info_name not in ["idprefix", "schema_version"]: value = int(value) run_info[info_name] = value @@ -1945,14 +2034,14 @@ def init_run_info(database, genome_build, min_coverage=0.9, min_identity=0, # Fetch dataset counter query = "SELECT * FROM counters WHERE category == 'dataset'" cursor.execute(query) - run_info.dataset = cursor.fetchone()['count'] + run_info.dataset = cursor.fetchone()["count"] return run_info def init_outfiles(outprefix, tmp_dir="talon_tmp/"): - """ Initialize output files for the run that all processes will be able to - write to via the queue. """ + """Initialize output files for the run that all processes will be able to + write to via the queue.""" # If there is a tmp dir there already, remove it if os.path.exists(tmp_dir): @@ -1985,59 +2074,42 @@ def init_outfiles(outprefix, tmp_dir="talon_tmp/"): for fname in outfiles: # Replace with handle to open file - open(outfiles[fname], 'w').close() + open(outfiles[fname], "w").close() return outfiles -def prepare_data_structures(cursor, run_info, chrom=None, start=None, - end=None, tmp_id="1"): - """ Initializes data structures needed for the run and organizes them - in a dictionary for more ease of use when passing them between functions +def prepare_data_structures(cursor, run_info, chrom=None, start=None, end=None, tmp_id="1"): + """Initializes data structures needed for the run and organizes them + in a dictionary for more ease of use when passing them between functions """ build = run_info.build min_coverage = run_info.min_coverage min_identity = run_info.min_identity struct_collection = dstruct.Struct() - struct_collection.tmp_gene = init_refs.make_temp_novel_gene_table(cursor, - build, chrom=chrom, - start=start, end=end, - tmp_tab="temp_gene_" + tmp_id) - - struct_collection.tmp_monoexon = init_refs.make_temp_monoexonic_transcript_table(cursor, - build, chrom=chrom, - start=start, end=end, - tmp_tab="temp_monoexon_" + tmp_id) - - struct_collection.tmp_t = init_refs.make_temp_transcript_table(cursor, - build, chrom=chrom, - start=start, end=end, - tmp_tab="temp_t_" + tmp_id) - - location_dict = init_refs.make_location_dict(build, cursor, chrom=chrom, - start=start, end=end) - - edge_dict = init_refs.make_edge_dict(cursor, build=build, chrom=chrom, - start=start, end=end) - - transcript_dict = init_refs.make_transcript_dict(cursor, build, chrom=chrom, - start=start, end=end) - - vertex_2_gene = init_refs.make_vertex_2_gene_dict(cursor, build=build, - chrom=chrom, - start=start, end=end) - - gene_starts = init_refs.make_gene_start_or_end_dict(cursor, - build, "start", - chrom=chrom, - start=start, - end=end) - gene_ends = init_refs.make_gene_start_or_end_dict(cursor, - build, "end", - chrom=chrom, - start=start, - end=end) + struct_collection.tmp_gene = 
init_refs.make_temp_novel_gene_table( + cursor, build, chrom=chrom, start=start, end=end, tmp_tab="temp_gene_" + tmp_id + ) + + struct_collection.tmp_monoexon = init_refs.make_temp_monoexonic_transcript_table( + cursor, build, chrom=chrom, start=start, end=end, tmp_tab="temp_monoexon_" + tmp_id + ) + + struct_collection.tmp_t = init_refs.make_temp_transcript_table( + cursor, build, chrom=chrom, start=start, end=end, tmp_tab="temp_t_" + tmp_id + ) + + location_dict = init_refs.make_location_dict(build, cursor, chrom=chrom, start=start, end=end) + + edge_dict = init_refs.make_edge_dict(cursor, build=build, chrom=chrom, start=start, end=end) + + transcript_dict = init_refs.make_transcript_dict(cursor, build, chrom=chrom, start=start, end=end) + + vertex_2_gene = init_refs.make_vertex_2_gene_dict(cursor, build=build, chrom=chrom, start=start, end=end) + + gene_starts = init_refs.make_gene_start_or_end_dict(cursor, build, "start", chrom=chrom, start=start, end=end) + gene_ends = init_refs.make_gene_start_or_end_dict(cursor, build, "end", chrom=chrom, start=start, end=end) struct_collection.location_dict = location_dict struct_collection.edge_dict = edge_dict @@ -2050,32 +2122,43 @@ def prepare_data_structures(cursor, run_info, chrom=None, start=None, def compute_delta(orig_pos, new_pos, strand): - """ Given a starting position and a new position, compute the distance - between them. The sign indicates whether the second point is - upstream or downstream of the original with respect to strand. """ + """Given a starting position and a new position, compute the distance + between them. The sign indicates whether the second point is + upstream or downstream of the original with respect to strand.""" abs_dist = abs(orig_pos - new_pos) if strand == "+": if new_pos < orig_pos: - return -1*abs_dist + return -1 * abs_dist else: return abs_dist elif strand == "-": if new_pos < orig_pos: return abs_dist else: - return -1*abs_dist + return -1 * abs_dist else: - msg = 'Strand must be either + or -' + msg = "Strand must be either + or -" logging.error(msg) raise ValueError(msg) -def identify_monoexon_transcript(chrom, positions, strand, cursor, location_dict, - edge_dict, transcript_dict, vertex_2_gene, - gene_starts, gene_ends, run_info, tmp_gene, - tmp_t, - tmp_monoexon): +def identify_monoexon_transcript( + chrom, + positions, + strand, + cursor, + location_dict, + edge_dict, + transcript_dict, + vertex_2_gene, + gene_starts, + gene_ends, + run_info, + tmp_gene, + tmp_t, + tmp_monoexon, +): gene_novelty = [] transcript_novelty = [] exon_novelty = [] @@ -2087,7 +2170,8 @@ def identify_monoexon_transcript(chrom, positions, strand, cursor, location_dict end = positions[-1] # First, look for a monoexonic transcript match that overlaps the current # transcript - query = Template(""" SELECT * + query = Template( + """ SELECT * FROM $tmp_monoexon AS tm WHERE tm.chromosome = '$chrom' AND tm.strand = '$strand' @@ -2095,10 +2179,16 @@ def identify_monoexon_transcript(chrom, positions, strand, cursor, location_dict OR (min_pos >= $start AND max_pos <= $end) OR (min_pos >= $start AND min_pos <= $end) OR (max_pos >= $start AND max_pos <= $end)) - """).substitute({"tmp_monoexon": tmp_monoexon, - "chrom": chrom, "strand": strand, - "start": min(start, end), - "end": max(start, end)}) + """ + ).substitute( + { + "tmp_monoexon": tmp_monoexon, + "chrom": chrom, + "strand": strand, + "start": min(start, end), + "end": max(start, end), + } + ) cursor.execute(query) matches = cursor.fetchall() @@ -2110,32 +2200,28 @@ def 
identify_monoexon_transcript(chrom, positions, strand, cursor, location_dict best_match = None for match in matches: # get overlap and compare - match_interval = [match['start'], match['end']] + match_interval = [match["start"], match["end"]] overlap, perc_overlap = get_overlap([start, end], match_interval) if overlap >= best_overlap: best_overlap = overlap best_match = match - gene_ID = best_match['gene_ID'] - transcript_ID = best_match['transcript_ID'] - vertex_IDs = (best_match['start_vertex'], best_match['end_vertex']) - edge_IDs = [best_match['exon_ID']] - diff_5p = compute_delta(best_match['start'], start, strand) - diff_3p = compute_delta(best_match['end'], end, strand) + gene_ID = best_match["gene_ID"] + transcript_ID = best_match["transcript_ID"] + vertex_IDs = (best_match["start_vertex"], best_match["end_vertex"]) + edge_IDs = [best_match["exon_ID"]] + diff_5p = compute_delta(best_match["start"], start, strand) + diff_3p = compute_delta(best_match["end"], end, strand) # If there is no match, proceed to genomic/antisense style matching. else: # Start by performing vertex match vertex_IDs, v_novelty, diff_5p, diff_3p = match_monoexon_vertices( - chrom, - positions, - strand, - location_dict, - run_info) + chrom, positions, strand, location_dict, run_info + ) # Get edge match (or create new edge) - edge_IDs, e_novelty = match_all_transcript_edges(vertex_IDs, strand, - edge_dict, run_info) + edge_IDs, e_novelty = match_all_transcript_edges(vertex_IDs, strand, edge_dict, run_info) # If the exon is known, then this transcript must be ISM or NIC gene_ID = None @@ -2143,94 +2229,125 @@ def identify_monoexon_transcript(chrom, positions, strand, cursor, location_dict all_matches = search_for_ISM(edge_IDs, transcript_dict) if all_matches != None: - gene_ID, transcript_ID, transcript_novelty, info = process_ISM(chrom, positions, - strand, edge_IDs, - vertex_IDs, all_matches, - transcript_dict, - gene_starts, gene_ends, - edge_dict, location_dict, - run_info, cursor, tmp_gene, tmp_t) + gene_ID, transcript_ID, transcript_novelty, info = process_ISM( + chrom, + positions, + strand, + edge_IDs, + vertex_IDs, + all_matches, + transcript_dict, + gene_starts, + gene_ends, + edge_dict, + location_dict, + run_info, + cursor, + tmp_gene, + tmp_t, + ) if gene_ID == None: # Find best gene match using overlap search if the ISM/NIC check didn't work - gene_ID, match_strand = search_for_overlap_with_gene(chrom, positions[0], - positions[1], strand, - cursor, run_info, tmp_gene, - tmp_t) + gene_ID, match_strand = search_for_overlap_with_gene( + chrom, positions[0], positions[1], strand, cursor, run_info, tmp_gene, tmp_t + ) # Intergenic case if gene_ID == None: - gene_ID = create_gene(chrom, positions[0], positions[-1], - strand, cursor, tmp_gene) - - gene_novelty.append((gene_ID, run_info.idprefix, "TALON", - "intergenic_novel", "TRUE")) - transcript_ID = create_transcript(strand, chrom, positions[0], positions[-1], - gene_ID, edge_IDs, vertex_IDs, - transcript_dict, tmp_t, cursor)["transcript_ID"] - transcript_novelty.append((transcript_ID, run_info.idprefix, "TALON", - "intergenic_transcript", "TRUE")) + gene_ID = create_gene(chrom, positions[0], positions[-1], strand, cursor, tmp_gene) + + gene_novelty.append((gene_ID, run_info.idprefix, "TALON", "intergenic_novel", "TRUE")) + transcript_ID = create_transcript( + strand, + chrom, + positions[0], + positions[-1], + gene_ID, + edge_IDs, + vertex_IDs, + transcript_dict, + tmp_t, + cursor, + )["transcript_ID"] + transcript_novelty.append((transcript_ID, 
run_info.idprefix, "TALON", "intergenic_transcript", "TRUE")) # Antisense case elif match_strand != strand: anti_gene_ID = gene_ID - gene_ID = create_gene(chrom, positions[0], positions[-1], - strand, cursor, tmp_gene) - transcript_ID = create_transcript(strand, chrom, positions[0], positions[-1], - gene_ID, edge_IDs, vertex_IDs, - transcript_dict, tmp_t, cursor)["transcript_ID"] - - gene_novelty.append((gene_ID, run_info.idprefix, "TALON", - "antisense_gene", "TRUE")) - gene_novelty.append((gene_ID, run_info.idprefix, "TALON", - "gene_antisense_to_IDs", anti_gene_ID)) - transcript_novelty.append((transcript_ID, run_info.idprefix, "TALON", - "antisense_transcript", "TRUE")) + gene_ID = create_gene(chrom, positions[0], positions[-1], strand, cursor, tmp_gene) + transcript_ID = create_transcript( + strand, + chrom, + positions[0], + positions[-1], + gene_ID, + edge_IDs, + vertex_IDs, + transcript_dict, + tmp_t, + cursor, + )["transcript_ID"] + + gene_novelty.append((gene_ID, run_info.idprefix, "TALON", "antisense_gene", "TRUE")) + gene_novelty.append((gene_ID, run_info.idprefix, "TALON", "gene_antisense_to_IDs", anti_gene_ID)) + transcript_novelty.append((transcript_ID, run_info.idprefix, "TALON", "antisense_transcript", "TRUE")) # Same strand else: - transcript_ID = create_transcript(strand, chrom, positions[0], positions[-1], - gene_ID, edge_IDs, vertex_IDs, - transcript_dict, tmp_t, cursor)["transcript_ID"] - transcript_novelty.append((transcript_ID, run_info.idprefix, "TALON", - "genomic_transcript", "TRUE")) + transcript_ID = create_transcript( + strand, + chrom, + positions[0], + positions[-1], + gene_ID, + edge_IDs, + vertex_IDs, + transcript_dict, + tmp_t, + cursor, + )["transcript_ID"] + transcript_novelty.append((transcript_ID, run_info.idprefix, "TALON", "genomic_transcript", "TRUE")) # Add all novel vertices to vertex_2_gene now that we have the gene ID update_vertex_2_gene(gene_ID, vertex_IDs, strand, vertex_2_gene) - talon_gene_name, talon_transcript_name = construct_names(gene_ID, - transcript_ID, - run_info.idprefix, - run_info.n_places) + talon_gene_name, talon_transcript_name = construct_names( + gene_ID, transcript_ID, run_info.idprefix, run_info.n_places + ) # Add novel gene annotation attributes if len(gene_novelty) > 0: - gene_novelty.append((gene_ID, run_info.idprefix, "TALON", - "gene_status", "NOVEL")) - gene_novelty.append((gene_ID, run_info.idprefix, "TALON", - "gene_name", talon_gene_name)) - gene_novelty.append((gene_ID, run_info.idprefix, "TALON", - "gene_id", talon_gene_name)) + gene_novelty.append((gene_ID, run_info.idprefix, "TALON", "gene_status", "NOVEL")) + gene_novelty.append((gene_ID, run_info.idprefix, "TALON", "gene_name", talon_gene_name)) + gene_novelty.append((gene_ID, run_info.idprefix, "TALON", "gene_id", talon_gene_name)) # Add novel transcript annotation attributes - transcript_novelty.append((transcript_ID, run_info.idprefix, "TALON", - "transcript_status", "NOVEL")) - transcript_novelty.append((transcript_ID, run_info.idprefix, "TALON", - "transcript_name", talon_transcript_name)) - transcript_novelty.append((transcript_ID, run_info.idprefix, "TALON", - "transcript_id", talon_transcript_name)) + transcript_novelty.append((transcript_ID, run_info.idprefix, "TALON", "transcript_status", "NOVEL")) + transcript_novelty.append((transcript_ID, run_info.idprefix, "TALON", "transcript_name", talon_transcript_name)) + transcript_novelty.append((transcript_ID, run_info.idprefix, "TALON", "transcript_id", talon_transcript_name)) # Add annotation entries 
for any novel exons if e_novelty[0] == 1: - exon_novelty.append((edge_IDs[0], run_info.idprefix, "TALON", - "exon_status", "NOVEL")) + exon_novelty.append((edge_IDs[0], run_info.idprefix, "TALON", "exon_status", "NOVEL")) # Add the novel transcript to the temporary monoexon table - new_mono = (gene_ID, transcript_ID, chrom, start, end, strand, - vertex_IDs[0], vertex_IDs[-1], edge_IDs[0], - min(start, end), max(start, end)) - cols = '("gene_ID", "transcript_ID", "chromosome", "start", "end",' + \ - '"strand", "start_vertex", "end_vertex", "exon_ID", "min_pos",' + \ - '"max_pos")' - command = 'INSERT INTO ' + tmp_monoexon + ' ' + cols + ' VALUES ' + \ - '(?,?,?,?,?,?,?,?,?,?,?)' + new_mono = ( + gene_ID, + transcript_ID, + chrom, + start, + end, + strand, + vertex_IDs[0], + vertex_IDs[-1], + edge_IDs[0], + min(start, end), + max(start, end), + ) + cols = ( + '("gene_ID", "transcript_ID", "chromosome", "start", "end",' + + '"strand", "start_vertex", "end_vertex", "exon_ID", "min_pos",' + + '"max_pos")' + ) + command = "INSERT INTO " + tmp_monoexon + " " + cols + " VALUES " + "(?,?,?,?,?,?,?,?,?,?,?)" cursor.execute(command, new_mono) # Package annotation information @@ -2251,7 +2368,7 @@ def identify_monoexon_transcript(chrom, positions, strand, cursor, location_dict def update_database(database, batch_size, outfiles, datasets): - """ Adds new entries to the database. """ + """Adds new entries to the database.""" conn = sqlite3.connect(database) conn.row_factory = sqlite3.Row @@ -2266,8 +2383,7 @@ def update_database(database, batch_size, outfiles, datasets): batch_add_observed(cursor, outfiles.observed, batch_size) update_counter(cursor) batch_add_annotations(cursor, outfiles.gene_annot, "gene", batch_size) - batch_add_annotations(cursor, outfiles.transcript_annot, "transcript", - batch_size) + batch_add_annotations(cursor, outfiles.transcript_annot, "transcript", batch_size) batch_add_annotations(cursor, outfiles.exon_annot, "exon", batch_size) check_database_integrity(cursor) @@ -2278,7 +2394,7 @@ def update_database(database, batch_size, outfiles, datasets): def update_counter(cursor): # , n_datasets): - """ Update the database counter using the global counter variables """ + """Update the database counter using the global counter variables""" update_g = 'UPDATE "counters" SET "count" = ? 
WHERE "category" = "genes"' cursor.execute(update_g, [gene_counter.value()]) @@ -2303,21 +2419,18 @@ def update_counter(cursor): # , n_datasets): def batch_add_vertex2gene(cursor, v2g_file, batch_size): - """ Add new vertex-gene relationships to the vertex table """ + """Add new vertex-gene relationships to the vertex table""" - with open(v2g_file, 'r') as f: + with open(v2g_file, "r") as f: while True: - batch = [tuple(x.strip().split("\t")) - for x in islice(f, batch_size)] + batch = [tuple(x.strip().split("\t")) for x in islice(f, batch_size)] if batch == []: break try: - cols = " (" + ", ".join([str_wrap_double(x) for x in - ["vertex_ID", "gene_ID"]]) + ") " - command = 'INSERT OR IGNORE INTO "vertex"' + cols + "VALUES " + \ - '(?,?)' + cols = " (" + ", ".join([str_wrap_double(x) for x in ["vertex_ID", "gene_ID"]]) + ") " + command = 'INSERT OR IGNORE INTO "vertex"' + cols + "VALUES " + "(?,?)" cursor.executemany(command, batch) except Exception as e: @@ -2327,21 +2440,22 @@ def batch_add_vertex2gene(cursor, v2g_file, batch_size): def batch_add_locations(cursor, location_file, batch_size): - """ Add new locations to database """ + """Add new locations to database""" - with open(location_file, 'r') as f: + with open(location_file, "r") as f: while True: - batch = [tuple(x.strip().split("\t")) - for x in islice(f, batch_size)] + batch = [tuple(x.strip().split("\t")) for x in islice(f, batch_size)] if batch == []: break try: - cols = " (" + ", ".join([str_wrap_double(x) for x in - ["location_ID", "genome_build", "chromosome", "position"]]) + ") " - command = 'INSERT INTO "location"' + cols + "VALUES " + \ - '(?,?,?,?)' + cols = ( + " (" + + ", ".join([str_wrap_double(x) for x in ["location_ID", "genome_build", "chromosome", "position"]]) + + ") " + ) + command = 'INSERT INTO "location"' + cols + "VALUES " + "(?,?,?,?)" cursor.executemany(command, batch) except Exception as e: @@ -2351,21 +2465,22 @@ def batch_add_locations(cursor, location_file, batch_size): def batch_add_edges(cursor, edge_file, batch_size): - """ Add new edges to database """ + """Add new edges to database""" - with open(edge_file, 'r') as f: + with open(edge_file, "r") as f: while True: - batch = [tuple(x.strip().split("\t")) - for x in islice(f, batch_size)] + batch = [tuple(x.strip().split("\t")) for x in islice(f, batch_size)] if batch == []: break try: - cols = " (" + ", ".join([str_wrap_double(x) for x in - ["edge_ID", "v1", "v2", "edge_type", "strand"]]) + ") " - command = 'INSERT INTO "edge"' + \ - cols + "VALUES " + '(?,?,?,?,?)' + cols = ( + " (" + + ", ".join([str_wrap_double(x) for x in ["edge_ID", "v1", "v2", "edge_type", "strand"]]) + + ") " + ) + command = 'INSERT INTO "edge"' + cols + "VALUES " + "(?,?,?,?,?)" cursor.executemany(command, batch) except Exception as e: @@ -2376,15 +2491,15 @@ def batch_add_edges(cursor, edge_file, batch_size): def batch_add_transcripts(cursor, transcript_file, batch_size): - """ Add new transcripts to database """ + """Add new transcripts to database""" - with open(transcript_file, 'r') as f: + with open(transcript_file, "r") as f: while True: batch_lines = islice(f, batch_size) batch = [] for line in batch_lines: transcript = line.strip().split("\t") - if transcript[3] == 'None': + if transcript[3] == "None": transcript[3] = None batch.append(transcript) @@ -2392,11 +2507,26 @@ def batch_add_transcripts(cursor, transcript_file, batch_size): break try: - cols = " (" + ", ".join([str_wrap_double(x) for x in - ["transcript_id", "gene_id", "start_exon", "jn_path", - 
"end_exon", "start_vertex", "end_vertex", "n_exons"]]) + ") " - command = 'INSERT INTO "transcripts"' + \ - cols + "VALUES " + '(?,?,?,?,?,?,?,?)' + cols = ( + " (" + + ", ".join( + [ + str_wrap_double(x) + for x in [ + "transcript_id", + "gene_id", + "start_exon", + "jn_path", + "end_exon", + "start_vertex", + "end_vertex", + "n_exons", + ] + ] + ) + + ") " + ) + command = 'INSERT INTO "transcripts"' + cols + "VALUES " + "(?,?,?,?,?,?,?,?)" cursor.executemany(command, batch) except Exception as e: @@ -2407,21 +2537,18 @@ def batch_add_transcripts(cursor, transcript_file, batch_size): def batch_add_genes(cursor, gene_file, batch_size): - """ Add genes to the database gene table """ + """Add genes to the database gene table""" - with open(gene_file, 'r') as f: + with open(gene_file, "r") as f: while True: - batch = [tuple(x.strip().split("\t")) - for x in islice(f, batch_size)] + batch = [tuple(x.strip().split("\t")) for x in islice(f, batch_size)] if batch == []: break try: - cols = " (" + ", ".join([str_wrap_double(x) for x in - ["gene_ID", "strand"]]) + ") " - command = 'INSERT OR IGNORE INTO genes' + \ - cols + "VALUES " + '(?,?)' + cols = " (" + ", ".join([str_wrap_double(x) for x in ["gene_ID", "strand"]]) + ") " + command = "INSERT OR IGNORE INTO genes" + cols + "VALUES " + "(?,?)" cursor.executemany(command, batch) except Exception as e: @@ -2431,13 +2558,13 @@ def batch_add_genes(cursor, gene_file, batch_size): def add_datasets(cursor, datasets): - """ Add dataset records to database """ + """Add dataset records to database""" try: - cols = " (" + ", ".join([str_wrap_double(x) for x in - ["dataset_ID", "dataset_name", "sample", "platform"]]) + ") " - command = 'INSERT INTO "dataset"' + cols + \ - "VALUES " + '(?,?,?,?)' + cols = ( + " (" + ", ".join([str_wrap_double(x) for x in ["dataset_ID", "dataset_name", "sample", "platform"]]) + ") " + ) + command = 'INSERT INTO "dataset"' + cols + "VALUES " + "(?,?,?,?)" cursor.executemany(command, datasets) except Exception as e: @@ -2447,28 +2574,27 @@ def add_datasets(cursor, datasets): def batch_add_annotations(cursor, annot_file, annot_type, batch_size): - """ Add gene/transcript/exon annotations to the appropriate annotation table - """ + """Add gene/transcript/exon annotations to the appropriate annotation table""" batch_size = 1 if annot_type not in ["gene", "transcript", "exon"]: - msg = "When running batch annot update, must specify " +\ - "annot_type as 'gene', 'exon', or 'transcript'." + msg = "When running batch annot update, must specify " + "annot_type as 'gene', 'exon', or 'transcript'." 
logging.error(msg) raise ValueError(msg) - with open(annot_file, 'r') as f: + with open(annot_file, "r") as f: while True: - batch = [tuple(x.strip().split("\t")) - for x in islice(f, batch_size)] + batch = [tuple(x.strip().split("\t")) for x in islice(f, batch_size)] if batch == []: break try: - cols = " (" + ", ".join([str_wrap_double(x) for x in - ["ID", "annot_name", "source", "attribute", "value"]]) + ") " - command = 'INSERT OR IGNORE INTO "' + annot_type + \ - '_annotations" ' + cols + "VALUES " + '(?,?,?,?,?)' + cols = ( + " (" + + ", ".join([str_wrap_double(x) for x in ["ID", "annot_name", "source", "attribute", "value"]]) + + ") " + ) + command = 'INSERT OR IGNORE INTO "' + annot_type + '_annotations" ' + cols + "VALUES " + "(?,?,?,?,?)" cursor.executemany(command, batch) except Exception as e: @@ -2478,12 +2604,12 @@ def batch_add_annotations(cursor, annot_file, annot_type, batch_size): def batch_add_observed(cursor, observed_file, batch_size): - """ Adds observed tuples (obs_ID, gene_ID, transcript_ID, read_name, - dataset, start_vertex_ID, end_vertex_ID, start_exon, end_exon, - start_delta, end_delta, read_length) to observed table of database. """ + """Adds observed tuples (obs_ID, gene_ID, transcript_ID, read_name, + dataset, start_vertex_ID, end_vertex_ID, start_exon, end_exon, + start_delta, end_delta, read_length) to observed table of database.""" abundance = {} - with open(observed_file, 'r') as f: + with open(observed_file, "r") as f: while True: batch = [] for observed in islice(f, batch_size): @@ -2524,14 +2650,35 @@ def batch_add_observed(cursor, observed_file, batch_size): # Add to database try: - cols = " (" + ", ".join([str_wrap_double(x) for x in - ["obs_ID", "gene_ID", "transcript_ID", "read_name", - "dataset", "start_vertex", "end_vertex", - "start_exon", "end_exon", "start_delta", "end_delta", - "read_length", "fraction_As", "custom_label", - "allelic_label", "start_support", "end_support"]]) + ") " - command = 'INSERT INTO "observed"' + cols + \ - "VALUES " + '(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)' + cols = ( + " (" + + ", ".join( + [ + str_wrap_double(x) + for x in [ + "obs_ID", + "gene_ID", + "transcript_ID", + "read_name", + "dataset", + "start_vertex", + "end_vertex", + "start_exon", + "end_exon", + "start_delta", + "end_delta", + "read_length", + "fraction_As", + "custom_label", + "allelic_label", + "start_support", + "end_support", + ] + ] + ) + + ") " + ) + command = 'INSERT INTO "observed"' + cols + "VALUES " + "(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)" cursor.executemany(command, batch) except Exception as e: @@ -2549,21 +2696,20 @@ def batch_add_observed(cursor, observed_file, batch_size): def batch_add_abundance(cursor, entries, batch_size): - """ Reads abundance tuples (transcript_ID, dataset, count) and - adds to the abundance table of the database """ + """Reads abundance tuples (transcript_ID, dataset, count) and + adds to the abundance table of the database""" index = 0 while index < len(entries): try: - batch = entries[index:index + batch_size] + batch = entries[index : index + batch_size] except: batch = entries[index:] index += batch_size try: - cols = " (" + ", ".join([str_wrap_double(x) for x in - ["transcript_id", "dataset", "count"]]) + ") " - command = 'INSERT INTO "abundance"' + cols + "VALUES " + '(?,?,?)' + cols = " (" + ", ".join([str_wrap_double(x) for x in ["transcript_id", "dataset", "count"]]) + ") " + command = 'INSERT INTO "abundance"' + cols + "VALUES " + "(?,?,?)" cursor.executemany(command, batch) except Exception as e: print(e) 
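All of the batch_add_* helpers above share one loading pattern: stream a tab-separated tmp file with itertools.islice and flush each slice to SQLite with a single executemany() call. The sketch below is an editor's illustration of that pattern, not part of the patch; the database path, tmp-file name, and abundance table used here are hypothetical stand-ins.

import sqlite3
from itertools import islice

def batch_insert_tsv(cursor, tsv_path, batch_size=10000):
    # Assumes an existing table: abundance(transcript_id, dataset, count)
    with open(tsv_path) as f:
        while True:
            # Pull up to batch_size lines; islice yields fewer (or none) at EOF
            batch = [tuple(line.rstrip("\n").split("\t")) for line in islice(f, batch_size)]
            if not batch:
                break
            # One multi-row flush per slice instead of one INSERT per line
            cursor.executemany(
                'INSERT INTO "abundance" ("transcript_id", "dataset", "count") VALUES (?,?,?)', batch
            )

conn = sqlite3.connect("talon.db")  # hypothetical database
batch_insert_tsv(conn.cursor(), "abundance_tuples.tsv")  # hypothetical tmp file
conn.commit()

Slicing the file rather than reading it whole keeps memory flat no matter how many reads were annotated, which is why the batch_size of 10000 recurs throughout update_database().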
@@ -2572,11 +2718,11 @@ def batch_add_abundance(cursor, entries, batch_size): def check_database_integrity(cursor): - """ Perform some checks on the database. Run before committing changes""" + """Perform some checks on the database. Run before committing changes""" # ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) # print("[ %s ] Validating database........" % (ts)) - logging.info('Validating database') + logging.info("Validating database") # For each category, check that the number of table entries matches the counter counter_query = "SELECT * FROM counters" @@ -2597,15 +2743,17 @@ def check_database_integrity(cursor): if actual_count != curr_counter: fail = 1 - logging.error("Database counter for '" + table_name + - "' does not match the number of entries in the table." + - " Discarding changes to database and exiting...") + logging.error( + "Database counter for '" + + table_name + + "' does not match the number of entries in the table." + + " Discarding changes to database and exiting..." + ) logging.debug("table_count: " + str(actual_count)) logging.debug("counter_value: " + str(curr_counter)) if fail == 1: - msg = "Discrepancy found in database. " +\ - "Discarding changes to database and exiting..." + msg = "Discrepancy found in database. " + "Discarding changes to database and exiting..." logging.error(msg) raise RuntimeError(msg) @@ -2613,28 +2761,26 @@ def check_database_integrity(cursor): def parallel_talon(read_file, interval, database, run_info, queue): - """ Manage TALON processing of a single chunk of the input. Initialize - reference data structures covering only the provided interval region, - then send the read file to the annotation step. Once annotation is - complete, return the data tuples generated so that they can be - added to the database, OR alternately, pickle them and write to file - where they can be accessed later. """ + """Manage TALON processing of a single chunk of the input. Initialize + reference data structures covering only the provided interval region, + then send the read file to the annotation step. Once annotation is + complete, return the data tuples generated so that they can be + added to the database, OR alternately, pickle them and write to file + where they can be accessed later.""" # ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) # print("[ %s ] Annotating reads in interval %s:%d-%d..." 
% # (ts, interval[0], interval[1], interval[2])) - logging.info(f'Annotating reads in interval {interval[0]}:{interval[1]}-{interval[2]}...') + logging.info(f"Annotating reads in interval {interval[0]}:{interval[1]}-{interval[2]}...") with sqlite3.connect(database) as conn: conn.row_factory = sqlite3.Row cursor = conn.cursor() tmp_id = str(os.getpid()) - struct_collection = prepare_data_structures(cursor, run_info, - chrom=interval[0], - start=interval[1], - end=interval[2], - tmp_id=tmp_id) + struct_collection = prepare_data_structures( + cursor, run_info, chrom=interval[0], start=interval[1], end=interval[2], tmp_id=tmp_id + ) interval_id = "%s_%d_%d" % interval @@ -2644,37 +2790,29 @@ def parallel_talon(read_file, interval, database, run_info, queue): qc_metrics = tutils.check_read_quality(record, run_info) passed_qc = qc_metrics[2] - qc_msg = (run_info.outfiles.qc, "\t".join( - [str(x) for x in qc_metrics])) + qc_msg = (run_info.outfiles.qc, "\t".join([str(x) for x in qc_metrics])) queue.put(qc_msg) if passed_qc: - annotation_info = annotate_read(record, cursor, run_info, - struct_collection) - unpack_observed(annotation_info, queue, - run_info.outfiles.observed) + annotation_info = annotate_read(record, cursor, run_info, struct_collection) + unpack_observed(annotation_info, queue, run_info.outfiles.observed) # Update annotation records # TODO: there is no need for entry to be a list/tuple for entry in annotation_info.gene_novelty: - msg = (run_info.outfiles.gene_annot, - "\t".join([str(x) for x in entry])) + msg = (run_info.outfiles.gene_annot, "\t".join([str(x) for x in entry])) queue.put(msg) for entry in annotation_info.transcript_novelty: - msg = (run_info.outfiles.transcript_annot, - "\t".join([str(x) for x in entry])) + msg = (run_info.outfiles.transcript_annot, "\t".join([str(x) for x in entry])) queue.put(msg) for entry in annotation_info.exon_novelty: - msg = (run_info.outfiles.exon_annot, - "\t".join([str(x) for x in entry])) + msg = (run_info.outfiles.exon_annot, "\t".join([str(x) for x in entry])) queue.put(msg) # Write the temp_gene table to file - cursor.execute("SELECT gene_ID, strand FROM " + - struct_collection.tmp_gene) + cursor.execute("SELECT gene_ID, strand FROM " + struct_collection.tmp_gene) for row in cursor.fetchall(): - msg = ((run_info.outfiles.genes, str( - row['gene_ID'])+"\t" + row['strand'])) + msg = (run_info.outfiles.genes, str(row["gene_ID"]) + "\t" + row["strand"]) queue.put(msg) # Pass messages to output files @@ -2684,23 +2822,30 @@ def parallel_talon(read_file, interval, database, run_info, queue): for transcript in list(transcripts.values()): # Only write novel transcripts to file if type(transcript) is dict: - entry = "\t".join([str(x) for x in (transcript['transcript_ID'], - transcript['gene_ID'], - transcript['start_exon'], - transcript['jn_path'], - transcript['end_exon'], - transcript['start_vertex'], - transcript['end_vertex'], - transcript['n_exons'])]) + entry = "\t".join( + [ + str(x) + for x in ( + transcript["transcript_ID"], + transcript["gene_ID"], + transcript["start_exon"], + transcript["jn_path"], + transcript["end_exon"], + transcript["start_vertex"], + transcript["end_vertex"], + transcript["n_exons"], + ) + ] + ) queue.put((run_info.outfiles.transcripts, entry)) # Write new edges to file edges = struct_collection.edge_dict for edge in list(edges.values()): if type(edge) is dict: - entry = "\t".join([str(x) for x in [edge['edge_ID'], edge['v1'], - edge['v2'], edge['edge_type'], - edge['strand']]]) + entry = "\t".join( + [str(x) 
for x in [edge["edge_ID"], edge["v1"], edge["v2"], edge["edge_type"], edge["strand"]]]
+            )
             queue.put((run_info.outfiles.edges, entry))
 
     # Write locations to file
@@ -2708,18 +2853,18 @@ def parallel_talon(read_file, interval, database, run_info, queue):
     for chrom_dict in location_dict.values():
         for loc in list(chrom_dict.values()):
             if type(loc) is dict:
-                msg = (run_info.outfiles.location,
-                       "\t".join([str(x) for x in (loc['location_ID'],
-                                                   loc['genome_build'],
-                                                   loc['chromosome'],
-                                                   loc['position'])]))
+                msg = (
+                    run_info.outfiles.location,
+                    "\t".join(
+                        [str(x) for x in (loc["location_ID"], loc["genome_build"], loc["chromosome"], loc["position"])]
+                    ),
+                )
                 queue.put(msg)
 
     # Write new vertex-gene combos to file
     for vertex_ID, gene_set in struct_collection.vertex_2_gene.items():
         for gene in gene_set:
-            msg = (run_info.outfiles.v2g,
-                   "\t".join([str(x) for x in (vertex_ID, gene[0])]))
+            msg = (run_info.outfiles.v2g, "\t".join([str(x) for x in (vertex_ID, gene[0])]))
             queue.put(msg)
 
     struct_collection = None
@@ -2728,13 +2873,13 @@ def parallel_talon(read_file, interval, database, run_info, queue):
 
 
 def parse_custom_SAM_tags(sam_record: pysam.AlignedSegment):
-    """ Looks for the following tags in the read. Will be set to None if no tag
-    is found
-    fA: fraction As in the 10-bp interval following the alignment end
-    lC: custom label (type = string)
-    lA: custom allele label (type = string)
-    tS: flag indicating start site support (type = string)
-    tE: flag indicating end site support (typ = string)
+    """Looks for the following tags in the read. Will be set to None if no tag
+    is found
+    fA: fraction As in the 10-bp interval following the alignment end
+    lC: custom label (type = string)
+    lA: custom allele label (type = string)
+    tS: flag indicating start site support (type = string)
+    tE: flag indicating end site support (type = string)
     """
     try:
         fraction_As = sam_record.get_tag("fA")
@@ -2760,28 +2905,27 @@ def parse_custom_SAM_tags(sam_record: pysam.AlignedSegment):
     return fraction_As, custom_label, allelic_label, start_support, end_support
 
 
-def annotate_read(sam_record: pysam.AlignedSegment, cursor, run_info,
-                  struct_collection, mode=1):
-    """ Accepts a pysam-formatted read as input, and compares it to the
-        annotations in struct_collection to assign it a gene and transcript
-        identity. Returns annotation_info, which is a dict that has the
-        following attributes:
-            gene_ID
-            transcript_ID
-            gene_novelty
-            transcript_novelty
-            exon_novelty
-            start_vertex
-            end_vertex
-            start_exon
-            end_exon
-            start_delta
-            end_delta
-            fraction_As (following the end of the alignment)
-            custom_label
-            allelic_label
-            start_support
-            end_support
+def annotate_read(sam_record: pysam.AlignedSegment, cursor, run_info, struct_collection, mode=1):
+    """Accepts a pysam-formatted read as input, and compares it to the
+    annotations in struct_collection to assign it a gene and transcript
+    identity.
Returns annotation_info, which is a dict that has the + following attributes: + gene_ID + transcript_ID + gene_novelty + transcript_novelty + exon_novelty + start_vertex + end_vertex + start_exon + end_exon + start_delta + end_delta + fraction_As (following the end of the alignment) + custom_label + allelic_label + start_support + end_support """ # Parse attributes to determine the chromosome, positions, and strand of the transcript read_ID = sam_record.query_name @@ -2800,14 +2944,12 @@ def annotate_read(sam_record: pysam.AlignedSegment, cursor, run_info, cigar = sam_record.cigarstring # Parse custom TALON tags - fraction_As, custom_label, allelic_label, start_support, \ - end_support = parse_custom_SAM_tags(sam_record) + fraction_As, custom_label, allelic_label, start_support, end_support = parse_custom_SAM_tags(sam_record) intron_list = tutils.get_introns(sam_record, sam_start, cigar) # Adjust intron positions by 1 to get splice sites in exon terms - splice_sites = [x + 1 if i % 2 == 1 else x - 1 for i, x in - enumerate(intron_list)] + splice_sites = [x + 1 if i % 2 == 1 else x - 1 for i, x in enumerate(intron_list)] positions = [sam_start] + splice_sites + [sam_end] # Flip the positions' order if the read is on the minus strand @@ -2822,25 +2964,40 @@ def annotate_read(sam_record: pysam.AlignedSegment, cursor, run_info, gene_starts = struct_collection.gene_starts gene_ends = struct_collection.gene_ends - n_exons = int(len(positions)/2) + n_exons = int(len(positions) / 2) if n_exons > 1: - annotation_info = identify_transcript(chrom, positions, strand, - cursor, location_dict, - edge_dict, transcript_dict, - vertex_2_gene, - gene_starts, gene_ends, - run_info, - struct_collection.tmp_gene, - struct_collection.tmp_t) + annotation_info = identify_transcript( + chrom, + positions, + strand, + cursor, + location_dict, + edge_dict, + transcript_dict, + vertex_2_gene, + gene_starts, + gene_ends, + run_info, + struct_collection.tmp_gene, + struct_collection.tmp_t, + ) else: - annotation_info = identify_monoexon_transcript(chrom, positions, strand, - cursor, location_dict, - edge_dict, transcript_dict, - vertex_2_gene, - gene_starts, gene_ends, - run_info, struct_collection.tmp_gene, - struct_collection.tmp_t, - struct_collection.tmp_monoexon) + annotation_info = identify_monoexon_transcript( + chrom, + positions, + strand, + cursor, + location_dict, + edge_dict, + transcript_dict, + vertex_2_gene, + gene_starts, + gene_ends, + run_info, + struct_collection.tmp_gene, + struct_collection.tmp_t, + struct_collection.tmp_monoexon, + ) annotation_info.read_ID = read_ID annotation_info.dataset = dataset @@ -2858,19 +3015,30 @@ def annotate_read(sam_record: pysam.AlignedSegment, cursor, run_info, def unpack_observed(annotation_info, queue, obs_file): - """ Now that transcript has been annotated, unpack values and - create an observed entry. Send the observed entry to the queue - for output to obs_file.""" + """Now that transcript has been annotated, unpack values and + create an observed entry. 
Send the observed entry to the queue + for output to obs_file.""" obs_ID = observed_counter.increment() - observed = (obs_ID, annotation_info.gene_ID, annotation_info.transcript_ID, - annotation_info.read_ID, annotation_info.dataset, - annotation_info.start_vertex, annotation_info.end_vertex, - annotation_info.start_exon, annotation_info.end_exon, - annotation_info.start_delta, annotation_info.end_delta, - annotation_info.read_length, annotation_info.fraction_As, - annotation_info.custom_label, annotation_info.allelic_label, - annotation_info.start_support, annotation_info.end_support) + observed = ( + obs_ID, + annotation_info.gene_ID, + annotation_info.transcript_ID, + annotation_info.read_ID, + annotation_info.dataset, + annotation_info.start_vertex, + annotation_info.end_vertex, + annotation_info.start_exon, + annotation_info.end_exon, + annotation_info.start_delta, + annotation_info.end_delta, + annotation_info.read_length, + annotation_info.fraction_As, + annotation_info.custom_label, + annotation_info.allelic_label, + annotation_info.start_support, + annotation_info.end_support, + ) msg = (obs_file, "\t".join([str(x) for x in observed])) queue.put(msg) @@ -2878,14 +3046,14 @@ def unpack_observed(annotation_info, queue, obs_file): def listener(queue, outfiles, QC_header, timeout=72): - """ During the run, this function listens for messages on the provided - queue. When a message is received (consisting of a filename and a - string), it writes the string to that file. Timeout unit is in hours""" + """During the run, this function listens for messages on the provided + queue. When a message is received (consisting of a filename and a + string), it writes the string to that file. Timeout unit is in hours""" # Open all of the outfiles open_files = {} for fpath in outfiles.values(): - open_files[fpath] = open(fpath, 'w') + open_files[fpath] = open(fpath, "w") # Add a header to the QC file QC_file = open_files[outfiles.qc] @@ -2898,10 +3066,10 @@ def listener(queue, outfiles, QC_header, timeout=72): msg = queue.get() msg_fname = msg[0] msg_value = msg[1] - if datetime.now() > wait_until or msg_value == 'complete': + if datetime.now() > wait_until or msg_value == "complete": # ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) # print("[ %s ] Shutting down message queue..." 
% (ts)) - logging.info('Shutting down message queue...') + logging.info("Shutting down message queue...") for f in open_files.values(): f.close() break @@ -2911,29 +3079,33 @@ def listener(queue, outfiles, QC_header, timeout=72): def make_QC_header(coverage, identity, length): - """ Create a header for the read QC file """ - - cols = "\t".join(["dataset", "read_ID", "passed_QC", "primary_mapped", - "read_length", "fraction_aligned", "identity"]) - header = "\n".join(["# TALON run filtering settings:", - "# Min fraction read aligned: %f " % coverage, - "# Min read identity to reference: %f" % identity, - "# Min transcript length: %d" % length, - "# -------------------------------------------", - cols]) + """Create a header for the read QC file""" + + cols = "\t".join( + ["dataset", "read_ID", "passed_QC", "primary_mapped", "read_length", "fraction_aligned", "identity"] + ) + header = "\n".join( + [ + "# TALON run filtering settings:", + "# Min fraction read aligned: %f " % coverage, + "# Min read identity to reference: %f" % identity, + "# Min transcript length: %d" % length, + "# -------------------------------------------", + cols, + ] + ) return header def main(): - """ Runs program """ + """Runs program""" options = get_args() logger._init_logger(options.verbosity) # ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) # print("[ %s ] Started TALON run" % (ts)) - logging.info('Started TALON run') - + logging.info("Started TALON run") sam_files, dset_metadata = check_inputs(options) # print(sam_files) @@ -2955,19 +3127,18 @@ def main(): create_novel_spliced_genes = bool(options.create_novel_spliced_genes) # format tmp_dir if missing fwd slash - if not tmp_dir.endswith('/'): - tmp_dir += '/' + if not tmp_dir.endswith("/"): + tmp_dir += "/" # Set globally accessible counters get_counters(database) # Initialize worker pool with mp.Pool(processes=threads) as pool: - run_info = init_run_info(database, build, min_coverage, min_identity, - use_cb_tag, create_novel_spliced_genes, - tmp_dir=tmp_dir) - run_info.outfiles = init_outfiles(options.outprefix, - tmp_dir=tmp_dir) + run_info = init_run_info( + database, build, min_coverage, min_identity, use_cb_tag, create_novel_spliced_genes, tmp_dir=tmp_dir + ) + run_info.outfiles = init_outfiles(options.outprefix, tmp_dir=tmp_dir) # Create annotation entry for each dataset datasets = [] @@ -2978,16 +3149,13 @@ def main(): dataset_db_entries.append((d_id, d_name, description, platform)) # Partition the reads - read_groups, intervals, header_file = procsams.partition_reads(sam_files, - datasets, - use_cb_tag, - tmp_dir=tmp_dir, - n_threads=threads) - - read_files = procsams.write_reads_to_file( - read_groups, intervals, header_file, tmp_dir=tmp_dir) + read_groups, intervals, header_file = procsams.partition_reads( + sam_files, datasets, use_cb_tag, tmp_dir=tmp_dir, n_threads=threads + ) + + read_files = procsams.write_reads_to_file(read_groups, intervals, header_file, tmp_dir=tmp_dir) # ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) - logging.info(f'Split reads into {len(read_groups)} intervals') + logging.info(f"Split reads into {len(read_groups)} intervals") # Set up a queue specifically for writing to outfiles manager = mp.Manager() @@ -3000,50 +3168,47 @@ def main(): # ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) # print("[ %s ] Launching parallel annotation jobs" % (ts)) - logging.info('Launching parallel annotation jobs') + logging.info("Launching parallel annotation jobs") # Start running listener, which will monitor queue for 
messages - QC_header = make_QC_header(run_info.min_coverage, run_info.min_identity, - run_info.min_length) + QC_header = make_QC_header(run_info.min_coverage, run_info.min_identity, run_info.min_length) pool.apply_async(listener, (queue, run_info.outfiles, QC_header)) # Now launch the parallel TALON jobs pool.starmap(parallel_talon, jobs) # Now we are done, kill the listener - msg_done = (None, 'complete') + msg_done = (None, "complete") queue.put(msg_done) pool.close() pool.join() # ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) # print("[ %s ] All jobs complete. Starting database update." % (ts)) - logging.info('All jobs complete. Starting database update') + logging.info("All jobs complete. Starting database update") # Update the database batch_size = 10000 - update_database(database, batch_size, - run_info.outfiles, dataset_db_entries) + update_database(database, batch_size, run_info.outfiles, dataset_db_entries) # ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) # print("[ %s ] Database update complete." % (ts)) - logging.info('Database update complete.') + logging.info("Database update complete.") # Write output reads file # ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) # print("[ %s ] Creating read-wise annotation file." % (ts)) - logging.info('Creating read-wise annotation file') - get_read_annotations.make_read_annot_file(database, build, - outprefix, datasets=datasets) + logging.info("Creating read-wise annotation file") + get_read_annotations.make_read_annot_file(database, build, outprefix, datasets=datasets) # For debugging - #print("Genes: %d" % gene_counter.value()) - #print("Transcripts: %d" % transcript_counter.value()) - #print("Observed: %d" % observed_counter.value()) + # print("Genes: %d" % gene_counter.value()) + # print("Transcripts: %d" % transcript_counter.value()) + # print("Observed: %d" % observed_counter.value()) # ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) # print("[ %s ] DONE" % (ts)) - logging.info('DONE') + logging.info("DONE") -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/src/talon/talon_label_reads.py b/src/talon/talon_label_reads.py index 0915e69..8a565db 100644 --- a/src/talon/talon_label_reads.py +++ b/src/talon/talon_label_reads.py @@ -1,59 +1,72 @@ # TALON: Techonology-Agnostic Long Read Analysis Pipeline # Author: Dana Wyman # ----------------------------------------------------------------------------- -# This program reads in SAM-formatted long read alignments and adds a custom +# This program reads in SAM-formatted long read alignments and adds a custom # tag to reflect the fraction of As in the sequence immediately following the -# alignment. This can help indicate the likelihood of an internal priming -# artifact. +# alignment. This can help indicate the likelihood of an internal priming +# artifact. -import pyfaidx -import pysam +import glob import multiprocessing as mp -from datetime import datetime, timedelta -import time import os -import glob +import time +from datetime import datetime, timedelta from optparse import OptionParser +import pyfaidx +import pysam + + def get_options(): - """ Read input args """ - - parser = OptionParser(description=("This program reads in SAM-formatted " - "long read alignments and adds a custom tag to " - "reflect the fraction of As in the sequence " - "immediately following the alignment. 
This can help " - "indicate the likelihood of an internal priming " - "artifact.")) - parser.add_option("--f", dest = "sam_file", - help = "SAM file of transcripts") - parser.add_option("--g", dest = "genome_file", - help = "Reference genome fasta file") - parser.add_option("--t", dest = "threads", type = int, - help = "Number of threads to run", default = 1) - parser.add_option("--ar", dest = "fracA_range_size", type = int, - help = ("Size of post-transcript interval to compute " - "fraction As on. Default = 20"), default = 20) - parser.add_option("--tmpDir", dest = "tmp_dir", - help = ("Path to directory for tmp files. " - "Default = tmp_label_reads"), - default = "tmp_label_reads") - parser.add_option("--deleteTmp", dest = "delete_tmp", - action='store_true', - help = ("If this option is set, the temporary directory " - "generated by the program will be " - "removed at the end of the run.")) - parser.add_option("--o", dest = "outprefix", default = "talon_prelabels", - help = "Prefix for outfiles") + """Read input args""" + + parser = OptionParser( + description=( + "This program reads in SAM-formatted " + "long read alignments and adds a custom tag to " + "reflect the fraction of As in the sequence " + "immediately following the alignment. This can help " + "indicate the likelihood of an internal priming " + "artifact." + ) + ) + parser.add_option("--f", dest="sam_file", help="SAM file of transcripts") + parser.add_option("--g", dest="genome_file", help="Reference genome fasta file") + parser.add_option("--t", dest="threads", type=int, help="Number of threads to run", default=1) + parser.add_option( + "--ar", + dest="fracA_range_size", + type=int, + help=("Size of post-transcript interval to compute " "fraction As on. Default = 20"), + default=20, + ) + parser.add_option( + "--tmpDir", + dest="tmp_dir", + help=("Path to directory for tmp files. " "Default = tmp_label_reads"), + default="tmp_label_reads", + ) + parser.add_option( + "--deleteTmp", + dest="delete_tmp", + action="store_true", + help=( + "If this option is set, the temporary directory " + "generated by the program will be " + "removed at the end of the run." + ), + ) + parser.add_option("--o", dest="outprefix", default="talon_prelabels", help="Prefix for outfiles") (opts, args) = parser.parse_args() return opts -def fetch_seq(chrom: str, start: int, stop: int, strand: str, genome: pyfaidx.Fasta, - indexing=0): - """ Given a genomic interval, return the sequence with respect to the - strand supplied. - If 1-based indexing is specified, then 1 will be subtracted from the - position to convert to the Python indexing. """ + +def fetch_seq(chrom: str, start: int, stop: int, strand: str, genome: pyfaidx.Fasta, indexing=0): + """Given a genomic interval, return the sequence with respect to the + strand supplied. 
+ If 1-based indexing is specified, then 1 will be subtracted from the + position to convert to the Python indexing.""" if start > stop: raise ValueError("Start must be less than or equal to stop") @@ -71,32 +84,34 @@ def fetch_seq(chrom: str, start: int, stop: int, strand: str, genome: pyfaidx.Fa return str(seq) + def compute_frac_As(seq: str): - """ Compute fraction of sequence made up of As """ + """Compute fraction of sequence made up of As""" - a = seq.count('A') + a = seq.count("A") n = len(seq) if n == 0: return 0 else: - return float(a)/n + return float(a) / n + def fetch_range_after_transcript(transcript_end: int, strand: str, length: int): - """ Given the 1-based stop position of a transcript and its strand, - return a 1-based genomic range of the specified length that starts with - the base just after the end position. The smaller position is always - reported first. - Example: - fetch_range_after_transcript(4, '+', 2) would yield (5, 6) - fetch_range_after_transcript(4, '-', 2) would yield (2, 3) + """Given the 1-based stop position of a transcript and its strand, + return a 1-based genomic range of the specified length that starts with + the base just after the end position. The smaller position is always + reported first. + Example: + fetch_range_after_transcript(4, '+', 2) would yield (5, 6) + fetch_range_after_transcript(4, '-', 2) would yield (2, 3) """ if length < 1: raise ValueError("Length must be greater than or equal to 1") - if strand == '+': + if strand == "+": range_start = transcript_end + 1 range_end = range_start + length - 1 - elif strand == '-': + elif strand == "-": range_start = transcript_end - 1 range_end = range_start - length + 1 else: @@ -104,49 +119,49 @@ def fetch_range_after_transcript(transcript_end: int, strand: str, length: int): return (min(range_start, range_end), max(range_start, range_end)) + def compute_transcript_end(transcript: pysam.AlignedSegment): - """ Compute the position of the final transcript base relative to the genome, - taking strand into account. Position is 1-based. """ + """Compute the position of the final transcript base relative to the genome, + taking strand into account. 
Position is 1-based.""" strand = "-" if transcript.is_reverse else "+" - if strand == '+': + if strand == "+": return transcript.reference_end - if strand == '-': - return transcript.reference_start + 1 # (make 1-based) + if strand == "-": + return transcript.reference_start + 1 # (make 1-based) + -def compute_frac_as_after_transcript(chrom: str, transcript_end: int, strand: str, - range_size: int, genome: pyfaidx.Fasta): - """ Given a transcript end, strand, range size, and genome object, - compute the fraction of sequence in the range immediately after - the transcript end that is made up of As.""" +def compute_frac_as_after_transcript( + chrom: str, transcript_end: int, strand: str, range_size: int, genome: pyfaidx.Fasta +): + """Given a transcript end, strand, range size, and genome object, + compute the fraction of sequence in the range immediately after + the transcript end that is made up of As.""" # Get sequence of range immediately after transcript - range_start, range_end = fetch_range_after_transcript(transcript_end, - strand, range_size) - range_seq = fetch_seq(chrom, range_start, range_end, strand, genome, - indexing = 1) + range_start, range_end = fetch_range_after_transcript(transcript_end, strand, range_size) + range_seq = fetch_seq(chrom, range_start, range_end, strand, genome, indexing=1) # Get fraction As in sequence return compute_frac_As(range_seq) - -def split_reads_by_chrom(sam_file, tmp_dir = "tmp_label_reads", n_threads = 1): - """ Reads a SAM/BAM file and splits the reads into one file per chromosome. - Returns a list of the resulting filenames.""" + +def split_reads_by_chrom(sam_file, tmp_dir="tmp_label_reads", n_threads=1): + """Reads a SAM/BAM file and splits the reads into one file per chromosome. + Returns a list of the resulting filenames.""" ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) print("[ %s ] Splitting SAM by chromosome..." % (ts)) tmp_dir = tmp_dir + "/raw" - os.system("mkdir -p %s" %(tmp_dir)) + os.system("mkdir -p %s" % (tmp_dir)) if sam_file.endswith(".sam"): # Convert to bam ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) print("[ %s ] -----Converting to bam...." % (ts)) bam_file = tmp_dir + "/all_reads.bam" - pysam.view("-b", "-S", "-@", str(n_threads), "-o", bam_file, sam_file, - catch_stdout=False) + pysam.view("-b", "-S", "-@", str(n_threads), "-o", bam_file, sam_file, catch_stdout=False) elif sam_file.endswith(".bam"): bam_file = sam_file else: @@ -160,24 +175,23 @@ def split_reads_by_chrom(sam_file, tmp_dir = "tmp_label_reads", n_threads = 1): pysam.sort("-@", str(n_threads), "-o", sorted_bam, bam_file) bam_file = sorted_bam pysam.index(bam_file) - + # Open bam file tmp_dir += "/chroms" - os.system("mkdir -p %s" %(tmp_dir)) + os.system("mkdir -p %s" % (tmp_dir)) read_files = [] ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) print("[ %s ] -----Writing chrom files..." 
% (ts)) with pysam.AlignmentFile(bam_file, "rb") as bam: # Iterate over chromosomes and write a reads file for each - chromosomes = [ x.contig for x in bam.get_index_statistics() \ - if x.mapped > 0 ] + chromosomes = [x.contig for x in bam.get_index_statistics() if x.mapped > 0] for chrom in chromosomes: - records = bam.fetch(chrom) - fname = tmp_dir + "/" + chrom + ".sam" - with pysam.AlignmentFile(fname, "w", template = bam) as o: - for record in records: - o.write(record) - read_files.append(fname) + records = bam.fetch(chrom) + fname = tmp_dir + "/" + chrom + ".sam" + with pysam.AlignmentFile(fname, "w", template=bam) as o: + for record in records: + o.write(record) + read_files.append(fname) return read_files @@ -185,16 +199,15 @@ def split_reads_by_chrom(sam_file, tmp_dir = "tmp_label_reads", n_threads = 1): def run_chrom_thread(sam_file, options): """ """ outname = sam_file.split("/")[-1].split(".sam")[0] - genome = pyfaidx.Fasta(options.genome_file, sequence_always_upper=True, - one_based_attributes=False) + genome = pyfaidx.Fasta(options.genome_file, sequence_always_upper=True, one_based_attributes=False) - os.system("mkdir -p %s" % (options.tmp_dir + "/labeled")) + os.system("mkdir -p %s" % (options.tmp_dir + "/labeled")) out_log_fname = options.tmp_dir + "/labeled/" + outname + "_read_labels.tsv" out_sam_fname = options.tmp_dir + "/labeled/" + outname + ".sam" # Iterate over reads - out_log = open(out_log_fname, 'w') - pos_seen_fracA = {} # Store fraction As for previously seen positions + out_log = open(out_log_fname, "w") + pos_seen_fracA = {} # Store fraction As for previously seen positions with pysam.AlignmentFile(sam_file) as sam: out_sam = pysam.AlignmentFile(out_sam_fname, "w", template=sam) @@ -211,50 +224,51 @@ def run_chrom_thread(sam_file, options): if location_str in pos_seen_fracA: frac_As = pos_seen_fracA[location_str] else: - frac_As = compute_frac_as_after_transcript(chrom, transcript_end, - strand, - options.fracA_range_size, - genome) + frac_As = compute_frac_as_after_transcript( + chrom, transcript_end, strand, options.fracA_range_size, genome + ) pos_seen_fracA[location_str] = frac_As - record.tags += [('fA', round(frac_As,3))] + record.tags += [("fA", round(frac_As, 3))] # TODO: Add other labels to the read, i.e. CAGE, canonical polyA # Write to output files out_sam.write(record) - out_log.write("\t".join([read_id, str(frac_As)]) + '\n') + out_log.write("\t".join([read_id, str(frac_As)]) + "\n") out_sam.close() out_log.close() return + def pool_outputs(indir, outprefix): - """ Given an input directory containing SAM files and log files, - concatenate them to form the final output. 
""" + """Given an input directory containing SAM files and log files, + concatenate them to form the final output.""" sam_fname = outprefix + "_labeled.sam" - log_fname = outprefix + "_read_labels.tsv" - + log_fname = outprefix + "_read_labels.tsv" + # Get list of files to combine sam_files = glob.glob(indir + "/*.sam") log_files = glob.glob(indir + "/*_read_labels.tsv") # Add headers - with open(log_fname, 'w') as f: - f.write("\t".join(["read_name", "fraction_As"]) + '\n') + with open(log_fname, "w") as f: + f.write("\t".join(["read_name", "fraction_As"]) + "\n") - os.system('cp %s %s' % (sam_files[0], sam_fname)) + os.system("cp %s %s" % (sam_files[0], sam_fname)) - # Concatenate + # Concatenate for sam in sam_files[1:]: os.system('grep -v "^@" %s >> %s' % (sam, sam_fname)) for logfile in log_files: - os.system('cat %s >> %s' % (logfile, log_fname)) + os.system("cat %s >> %s" % (logfile, log_fname)) return + def main(options=None): if options == None: options = get_options() @@ -270,8 +284,7 @@ def main(options=None): os.system("rm -r %s" % (options.tmp_dir)) # Partition reads by chromosome - read_files = split_reads_by_chrom(options.sam_file, tmp_dir = options.tmp_dir, - n_threads = options.threads) + read_files = split_reads_by_chrom(options.sam_file, tmp_dir=options.tmp_dir, n_threads=options.threads) # Now launch the parallel TALON read label jobs ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) @@ -290,10 +303,11 @@ def main(options=None): # Delete tmp_dir if desired if options.delete_tmp: os.system("rm -r %s" % (options.tmp_dir)) - + ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) print("[ %s ] Run complete" % (ts)) -if __name__ == '__main__': + +if __name__ == "__main__": options = get_options() - main(options) + main(options) diff --git a/src/talon/transcript.py b/src/talon/transcript.py index b1c32d7..b64cb32 100644 --- a/src/talon/transcript.py +++ b/src/talon/transcript.py @@ -1,31 +1,30 @@ # TALON: Techonology-Agnostic Long Read Analysis Pipeline # Author: Dana Wyman -#------------------------------------------------------------------------------ +# ------------------------------------------------------------------------------ + class Transcript(object): """Stores information about a gene transcript, including its location - and constitutive exons. - Attributes: - identifier: Accession ID of transcript, i.e. an Ensembl ID. Must - be unique. - name: Human-readable name of the transcript. Does not have to be - unique - chromosome: Chromosome that the transcript is located on - (format "chr1") - start: The start position of the transcript with respect to the - forward strand - end: The end position of the transcript with respect to the - forward strand - strand: "+" if the transcript is on the forward strand, and "-" if - it is on the reverse strand - gene_id: unique ID of the gene that this transcript belongs to - exons: List of exon objects belonging to this transcript, in sorted - order. + and constitutive exons. + Attributes: + identifier: Accession ID of transcript, i.e. an Ensembl ID. Must + be unique. + name: Human-readable name of the transcript. 
Does not have to be
+        unique
+        chromosome: Chromosome that the transcript is located on
+        (format "chr1")
+        start: The start position of the transcript with respect to the
+        forward strand
+        end: The end position of the transcript with respect to the
+        forward strand
+        strand: "+" if the transcript is on the forward strand, and "-" if
+        it is on the reverse strand
+        gene_id: unique ID of the gene that this transcript belongs to
+        exons: List of exon objects belonging to this transcript, in sorted
+        order.
     """
 
-    def __init__(self, identifier, chromosome, start, end, strand, gene_id,
-                 annotations):
-
+    def __init__(self, identifier, chromosome, start, end, strand, gene_id, annotations):
         self.identifier = str(identifier)
         self.gene_id = str(gene_id)
 
@@ -39,7 +38,7 @@ def __init__(self, identifier, chromosome, start, end, strand, gene_id,
         self.annotations = annotations
 
     def get_5prime_vertex(self):
-        """ Returns ID of 5' end vertex """
+        """Returns ID of 5' end vertex"""
 
         if self.strand == "+":
             return self.exons[0].v1
@@ -47,7 +46,7 @@ def get_5prime_vertex(self):
         return self.exons[-1].v2
 
     def get_3prime_vertex(self):
-        """ Returns ID of 5' end vertex """
+        """Returns ID of 3' end vertex"""
 
         if self.strand == "+":
             return self.exons[-1].v2
@@ -58,7 +57,7 @@ def get_edge_path(self):
         edges = self.get_all_edges()
         if len(edges) == 0:
             return None
-        path = [ x.identifier for x in edges]
+        path = [x.identifier for x in edges]
 
         # Must reverse the path if the transcript is on the '-' strand
         if self.strand == "-":
@@ -67,30 +66,29 @@ def get_all_edges(self):
         all_edges = []
-        for i in range(0,self.n_exons):
+        for i in range(0, self.n_exons):
             all_edges.append(self.exons[i])
             try:
                 all_edges.append(self.introns[i])
             except:
                 pass
-
+
         return all_edges
 
     def get_length(self):
-        """ Computes the length of the transcript by summing the lengths of
-            its exons """
+        """Computes the length of the transcript by summing the lengths of
+        its exons"""
 
         if len(self.exons) == 0:
-            raise ValueError('Cannot compute length: Transcript does not ' + \
-                             'have any exons')
-
+            raise ValueError("Cannot compute length: Transcript does not " + "have any exons")
+
         transcript_length = 0
         for exon in self.exons:
             transcript_length += exon.length
         return transcript_length
 
     def get_exon_coords(self):
-        """ Returns a list of the exon coordinates in order """
+        """Returns a list of the exon coordinates in order"""
         exon_coords = []
         for exon in self.exons:
             exon_coords.append(int(exon.start))
@@ -101,11 +99,17 @@ def add_exon(self, exon):
         """Adds an exon object to the transcript."""
 
         if exon.start > exon.end:
-            raise ValueError('Exon start (' + str(exon.start) + ') ' + \
-                'is supposed to be before the exon end (' + str(exon.end) + ')')
+            raise ValueError(
+                "Exon start ("
+                + str(exon.start)
+                + ") "
+                + "is supposed to be before the exon end ("
+                + str(exon.end)
+                + ")"
+            )
 
         # Check where in the list the exon should be added
-        for i in range(0,len(self.exons)):
+        for i in range(0, len(self.exons)):
             existing_exon = self.exons[i]
             if exon.end < existing_exon.start:
                 self.exons = self.exons[0:i] + [exon] + self.exons[i:]
@@ -121,44 +125,53 @@ def add_intron(self, intron):
         """Adds an edge object to the transcript."""
 
         if intron.start > intron.end:
-            raise ValueError('Intron start (' + str(intron.start) + ')' + \
-                'is supposed to be before the intron end (' + str(intron.end) + ')')
+            raise ValueError(
+                "Intron start ("
+                + str(intron.start)
+                + ") "
+                + "is supposed to be before the intron end ("
+                + str(intron.end)
+                + ")"
+            )
 
         # Check
where in the list the intron should be added - for i in range(0,len(self.introns)): + for i in range(0, len(self.introns)): existing_intron = self.introns[i] if intron.end < existing_intron.start: self.introns = self.introns[0:i] + [intron] + self.introns[i:] return self.introns.append(intron) return - + def check_exon_validity(self): - """ The transcript's exons are valid if: - 1) Exons are in sorted order (ascending) - 2) Exon bounds do not exceed transcript start and end - 3) Exons are all on the appropriate chromosome - If these conditions are violated, this function raises an error. + """The transcript's exons are valid if: + 1) Exons are in sorted order (ascending) + 2) Exon bounds do not exceed transcript start and end + 3) Exons are all on the appropriate chromosome + If these conditions are violated, this function raises an error. """ prev = 0 for exon in self.exons: if exon.chromosome != self.chromosome: - raise ValueError('Invalid exon in transcript ' + \ - self.identifier + ': wrong chromosome') + raise ValueError("Invalid exon in transcript " + self.identifier + ": wrong chromosome") if exon.start < self.start or exon.end > self.end: print("self.start: " + str(self.start)) print("self.end: " + str(self.end)) print("exon.start: " + str(exon.start)) print("exon.end: " + str(exon.end)) - raise ValueError('Invalid exon in transcript ' + \ - self.identifier + ': (' + str(exon.start) + "-" + \ - str(exon.end) + \ - ') is located beyond start or end of transcript') + raise ValueError( + "Invalid exon in transcript " + + self.identifier + + ": (" + + str(exon.start) + + "-" + + str(exon.end) + + ") is located beyond start or end of transcript" + ) if exon.start <= prev: # This error would indicate a TALON bug rather than user error, - # so we shouldn't see it. - raise ValueError('Exons of transcript ' + \ - self.identifier + ' are not stored in ascending order.') + # so we shouldn't see it. + raise ValueError("Exons of transcript " + self.identifier + " are not stored in ascending order.") prev = exon.end return @@ -170,7 +183,7 @@ def get_introns(self): intron_list = [] i = 1 - while (i < len(exon_coords) - 1): + while i < len(exon_coords) - 1: j = i + 1 intron_list.append(exon_coords[i] + 1) @@ -179,32 +192,30 @@ def get_introns(self): return intron_list - def print_transcript(self): - """ Print a string representation of the Transcript. Good for debugging - """ + """Print a string representation of the Transcript. Good for debugging""" transcript_id = self.identifier if transcript_id == None: transcript_id = "Transcript" - print("\tLocation: " + self.chromosome + ":" + str(self.start) + "-" + \ - str(self.end) + "(" + self.strand + ")") + print("\tLocation: " + self.chromosome + ":" + str(self.start) + "-" + str(self.end) + "(" + self.strand + ")") # Print exons print("\tExons: " + "\n".join([str(x.start) + "-" + str(x.end) for x in self.exons])) - return + return + def get_transcript_from_db(transcript_row, exon_tree, intron_tree): - """ Uses information from a database transcript entry to create a + """Uses information from a database transcript entry to create a Transcript object. 
Args: - transcript_row: Tuple-formatted row from transcripts table of a + transcript_row: Tuple-formatted row from transcripts table of a TALON database """ - transcript_id = str(transcript_row['transcript_id']) - gene_id = str(transcript_row['gene_id']) + transcript_id = str(transcript_row["transcript_id"]) + gene_id = str(transcript_row["gene_id"]) - edges = transcript_row['path'].split(",") + edges = transcript_row["path"].split(",") # Check strand sample_edge = str(edges[0]) @@ -212,7 +223,7 @@ def get_transcript_from_db(transcript_row, exon_tree, intron_tree): # Reverse the edge list if the transcript is on the - strand if strand == "-": - edges = edges[::-1] + edges = edges[::-1] # Get start and end of transcript if edges[0] in exon_tree.edges and edges[-1] in exon_tree.edges: @@ -220,30 +231,40 @@ def get_transcript_from_db(transcript_row, exon_tree, intron_tree): start = (exon_tree.edges[edges[0]]).start end = (exon_tree.edges[edges[-1]]).end else: - raise RuntimeError("Ignoring transcript with ID " + transcript_id +\ - " because first or last exon not found in exon tree.") - - transcript = Transcript(transcript_id, chromosome, start, end, strand, - gene_id,{}) + raise RuntimeError( + "Ignoring transcript with ID " + transcript_id + " because first or last exon not found in exon tree." + ) - # Make sure that all of the exons and introns in this transcript have a + transcript = Transcript(transcript_id, chromosome, start, end, strand, gene_id, {}) + + # Make sure that all of the exons and introns in this transcript have a # non-zero length. Otherwise, return None - for i in range(0,len(edges)): + for i in range(0, len(edges)): # Even indices are exons if i % 2 == 0: curr_exon_id = str(edges[i]) if curr_exon_id not in exon_tree.edges: - raise RuntimeError("Ignoring transcript with ID " + transcript_id +\ - " because exon " + curr_exon_id + " not found in exon tree.") - + raise RuntimeError( + "Ignoring transcript with ID " + + transcript_id + + " because exon " + + curr_exon_id + + " not found in exon tree." + ) + else: curr_intron_id = str(edges[i]) if curr_intron_id not in intron_tree.edges: - print("Warning: Ignoring transcript with ID " + transcript_id +\ - " because intron " + curr_intron_id + " not found in intron tree.") + print( + "Warning: Ignoring transcript with ID " + + transcript_id + + " because intron " + + curr_intron_id + + " not found in intron tree." + ) return None - for i in range(0,len(edges)): + for i in range(0, len(edges)): # Even indices are exons if i % 2 == 0: curr_exon_id = str(edges[i]) @@ -257,23 +278,23 @@ def get_transcript_from_db(transcript_row, exon_tree, intron_tree): (curr_intron.transcript_ids).add(transcript_id) return transcript - + def get_transcript_from_gtf(transcript_info): - """ Uses information from a GTF-formatted transcript entry to create a + """Uses information from a GTF-formatted transcript entry to create a Transcript object. Args: - transcript_info: A list containing fields from a GTF file gene + transcript_info: A list containing fields from a GTF file gene entry. Example: - + chr1 HAVANA transcript 12010 13670 . + - . gene_id "ENSG00000223972.5"; transcript_id "ENST00000450305.2"; - gene_type "transcribed_unprocessed_pseudogene"; - gene_status "KNOWN"; gene_name "DDX11L1"; - transcript_type "transcribed_unprocessed_pseudogene"; - transcript_status "KNOWN"; transcript_name "DDX11L1-001"; - level 2; ont "PGO:0000005"; ont "PGO:0000019"; tag "basic"; - transcript_support_level "NA"; havana_gene "OTTHUMG00000000961.2"; + . 
gene_id "ENSG00000223972.5"; transcript_id "ENST00000450305.2"; + gene_type "transcribed_unprocessed_pseudogene"; + gene_status "KNOWN"; gene_name "DDX11L1"; + transcript_type "transcribed_unprocessed_pseudogene"; + transcript_status "KNOWN"; transcript_name "DDX11L1-001"; + level 2; ont "PGO:0000005"; ont "PGO:0000019"; tag "basic"; + transcript_support_level "NA"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000002844.2"; """ chromosome = transcript_info[0] @@ -282,36 +303,36 @@ def get_transcript_from_gtf(transcript_info): strand = transcript_info[6] if "transcript_id" not in transcript_info[-1]: - raise ValueError('GTF entry lacks a transcript_id field') + raise ValueError("GTF entry lacks a transcript_id field") annotations = extract_transcript_annotations_from_GTF(transcript_info) + gene_id = annotations["gene_id"] + transcript_id = annotations["transcript_id"] - gene_id = annotations['gene_id'] - transcript_id = annotations['transcript_id'] - - transcript = Transcript(transcript_id, chromosome, start, end, strand, - gene_id, annotations) + transcript = Transcript(transcript_id, chromosome, start, end, strand, gene_id, annotations) return transcript + def extract_transcript_annotations_from_GTF(tab_fields): - """ Extracts key-value annotations from the GTF description field - """ + """Extracts key-value annotations from the GTF description field""" attributes = {} # remove trailing newline and split by semicolon - description = tab_fields[-1].strip('\n') - description = description.split(';') + description = tab_fields[-1].strip("\n") + description = description.split(";") # Parse description for fields in description: - if fields == "" or fields == " ": continue + if fields == "" or fields == " ": + continue fields = fields.split() - if fields[0] == '': fields = fields[1:] + if fields[0] == "": + fields = fields[1:] - key = fields[0].replace('"', '') - val = ' '.join(fields[1:]).replace('"', '') + key = fields[0].replace('"', "") + val = " ".join(fields[1:]).replace('"', "") attributes[key] = val @@ -322,33 +343,31 @@ def extract_transcript_annotations_from_GTF(tab_fields): attributes["source"] = tab_fields[1] - return attributes + return attributes def get_transcript_from_exon(exon, gene_id, transcript_id): - """ In rare cases, GTF exons are listed with gene and transcript IDs that - do not have corresponding entries. In this case, we create a transcript - for this exon for bookkeeping purposes.""" + """In rare cases, GTF exons are listed with gene and transcript IDs that + do not have corresponding entries. In this case, we create a transcript + for this exon for bookkeeping purposes.""" name = transcript_id chromosome = exon.chromosome start = exon.start end = exon.end strand = exon.strand - transcript = Transcript(transcript_id, name, None, chromosome, start, end, - strand, gene_id) + transcript = Transcript(transcript_id, name, None, chromosome, start, end, strand, gene_id) return transcript -def create_novel_transcript(chromosome, start, end, strand, gene_id, counter, - exons, introns): - """ Creates a novel transcript with a unique identifier (obtained using - counter). Returns the transcript object as well as the updated counter. + +def create_novel_transcript(chromosome, start, end, strand, gene_id, counter, exons, introns): + """Creates a novel transcript with a unique identifier (obtained using + counter). Returns the transcript object as well as the updated counter. 
""" counter["transcripts"] += 1 transcript_id = str(counter["transcripts"]) - - transcript = Transcript(transcript_id, chromosome, start, end, strand, - gene_id, None) + + transcript = Transcript(transcript_id, chromosome, start, end, strand, gene_id, None) for exon in exons: transcript.add_exon(exon) diff --git a/src/talon/transcript_utils.py b/src/talon/transcript_utils.py index 3a0ec1f..ce37048 100644 --- a/src/talon/transcript_utils.py +++ b/src/talon/transcript_utils.py @@ -5,10 +5,12 @@ import itertools import re + import pysam + def check_read_quality(sam_record: pysam.AlignedSegment, run_info): - """ Process an individual sam read and return quality attributes. """ + """Process an individual sam read and return quality attributes.""" read_ID = sam_record.query_name flag = sam_record.flag cigar = sam_record.cigarstring @@ -16,9 +18,9 @@ def check_read_quality(sam_record: pysam.AlignedSegment, run_info): read_length = sam_record.query_length if not run_info.use_cb_tag: - dataset = sam_record.get_tag('RG') + dataset = sam_record.get_tag("RG") elif run_info.use_cb_tag: - dataset = sam_record.get_tag('CB') + dataset = sam_record.get_tag("CB") # Only use uniquely mapped transcripts if flag not in [0, 16]: @@ -30,7 +32,7 @@ def check_read_quality(sam_record: pysam.AlignedSegment, run_info): # Locate the MD field of the sam transcript try: - md_tag = sam_record.get_tag('MD') + md_tag = sam_record.get_tag("MD") except KeyError: raise ValueError("SAM transcript %s lacks an MD tag" % read_ID) @@ -39,120 +41,125 @@ def check_read_quality(sam_record: pysam.AlignedSegment, run_info): coverage = compute_alignment_coverage(cigar) identity = compute_alignment_identity(md_tag, seq) - if coverage < run_info.min_coverage or \ - identity < run_info.min_identity: + if coverage < run_info.min_coverage or identity < run_info.min_identity: return [dataset, read_ID, 0, 1, read_length, coverage, identity] # At this point, the read has passed the quality control return [dataset, read_ID, 1, 1, read_length, coverage, identity] + def compute_alignment_coverage(CIGAR): - """ This function computes what fraction of the read is actually aligned to - the genome by excluding hard or soft-clipped bases.""" + """This function computes what fraction of the read is actually aligned to + the genome by excluding hard or soft-clipped bases.""" total_bases = 0.0 unaligned_bases = 0.0 ops, counts = split_cigar(CIGAR) - for op,ct in zip(ops, counts): + for op, ct in zip(ops, counts): if op == "N": continue if op == "H" or op == "S": unaligned_bases += ct total_bases += ct - return (total_bases - unaligned_bases)/total_bases + return (total_bases - unaligned_bases) / total_bases + def compute_alignment_identity(MD_tag, SEQ): - """ This function computes what fraction of the read matches the reference - genome.""" + """This function computes what fraction of the read matches the reference + genome.""" total_bases = len(SEQ) matches = 0.0 ops, counts = splitMD(MD_tag) - for op,ct in zip(ops, counts): + for op, ct in zip(ops, counts): if op == "M": matches += ct if op == "D": total_bases += ct - return matches/total_bases + return matches / total_bases + def splitMD(MD): - """ Takes MD tag and splits into two lists: - one with capital letters (match operators), and one with - the number of bases that each operation applies to. """ - - operations = [] - - # Split MD string where type changes. - # Digits are separated from base changes. - # Deletions (with ^) are captured together. 
- counts = ["".join(x) for _, x in itertools.groupby(MD, key=str.isdigit)] - - # Get operations - for i in range(0,len(counts)): - curr = counts[i] - try: - counts[i] = int(curr) - operations.append("M") - except ValueError: - # Handle deletion - if curr.startswith("^"): - operations.append("D") - counts[i] = len(counts[i]) - 1 - else: - operations.append("X") - counts[i] = len(counts[i]) - - return operations, counts + """Takes MD tag and splits into two lists: + one with capital letters (match operators), and one with + the number of bases that each operation applies to.""" + + operations = [] + + # Split MD string where type changes. + # Digits are separated from base changes. + # Deletions (with ^) are captured together. + counts = ["".join(x) for _, x in itertools.groupby(MD, key=str.isdigit)] + + # Get operations + for i in range(0, len(counts)): + curr = counts[i] + try: + counts[i] = int(curr) + operations.append("M") + except ValueError: + # Handle deletion + if curr.startswith("^"): + operations.append("D") + counts[i] = len(counts[i]) - 1 + else: + operations.append("X") + counts[i] = len(counts[i]) + + return operations, counts + def split_cigar(cigar): - """ Takes CIGAR string from SAM and splits it into two lists: - one with capital letters (match operators), and one with - the number of bases that each operation applies to. """ + """Takes CIGAR string from SAM and splits it into two lists: + one with capital letters (match operators), and one with + the number of bases that each operation applies to.""" - alignTypes = re.sub('[0-9]', " ", cigar).split() - counts = re.sub('[=A-Z]', " ", cigar).split() + alignTypes = re.sub("[0-9]", " ", cigar).split() + counts = re.sub("[=A-Z]", " ", cigar).split() counts = [int(i) for i in counts] return alignTypes, counts + def compute_transcript_end(start, cigar): - """ Given the start position and CIGAR string of a mapped SAM transcript, - compute the end position in the reference genome. - Args: - start: The start position of the transcript with respect to the - forward strand + """Given the start position and CIGAR string of a mapped SAM transcript, + compute the end position in the reference genome. + Args: + start: The start position of the transcript with respect to the + forward strand - cigar: SAM CIGAR string describing match operations to the reference - genome + cigar: SAM CIGAR string describing match operations to the reference + genome - Returns: - end position of the transcript. + Returns: + end position of the transcript. """ end = start ops, counts = split_cigar(cigar) - for op,ct in zip(ops, counts): + for op, ct in zip(ops, counts): if op in ["=", "H", "M", "N", "D"]: end += ct return end - 1 -def compute_jI(start, cigar): - """ If the input sam file doesn't have the custom STARlong-derived jI tag, - we need to compute it. This is done by stepping through the CIGAR - string, where introns are represented by the N operation. - - start: The start position of the transcript with respect to the - forward strand - cigar: SAM CIGAR string describing match operations to the reference - genome - Returns: jI string representation of intron start and end positions. - Example jI strings: - no introns: jI:B:i,-1 - two introns: jI:B:i,167936516,167951806,167951862,167966628 +def compute_jI(start, cigar): + """If the input sam file doesn't have the custom STARlong-derived jI tag, + we need to compute it. This is done by stepping through the CIGAR + string, where introns are represented by the N operation. 
+ + start: The start position of the transcript with respect to the + forward strand + cigar: SAM CIGAR string describing match operations to the reference + genome + Returns: jI string representation of intron start and end positions. + + Example jI strings: + no introns: jI:B:i,-1 + two introns: jI:B:i,167936516,167951806,167951862,167966628 """ operations, counts = split_cigar(cigar) @@ -160,7 +167,7 @@ def compute_jI(start, cigar): genomePos = start # Iterate over cigar operations - for op,ct in zip(operations, counts): + for op, ct in zip(operations, counts): if op == "N": # This is an intron intronStart = genomePos @@ -181,22 +188,22 @@ def compute_jI(start, cigar): def get_introns(sam_record: pysam.AlignedSegment, start, cigar): - """ Locates the jI field in a list of SAM fields or computes - it from the CIGAR string and start position if it isn't found. - Note that positions refer to start and endpoints of introns, not exons, - so adjustments are needed to avoid an off-by-one error if you want exons. - - Example jI strings: - no introns: jI:B:i,-1 - two introns: jI:B:i,167936516,167951806,167951862,167966628 - Args: - sam_record: a pysam AlignedSegment - start: The start position of the transcript with respect to the - forward strand - cigar: SAM CIGAR string describing match operations to the reference - genome - Returns: - intron_list: intron starts and ends in a list (sorted order) + """Locates the jI field in a list of SAM fields or computes + it from the CIGAR string and start position if it isn't found. + Note that positions refer to start and endpoints of introns, not exons, + so adjustments are needed to avoid an off-by-one error if you want exons. + + Example jI strings: + no introns: jI:B:i,-1 + two introns: jI:B:i,167936516,167951806,167951862,167966628 + Args: + sam_record: a pysam AlignedSegment + start: The start position of the transcript with respect to the + forward strand + cigar: SAM CIGAR string describing match operations to the reference + genome + Returns: + intron_list: intron starts and ends in a list (sorted order) """ try: intron_list = sam_record.get_tag("jI").tolist() From e8f4ca55b9af7fe3d99d08b0c5af251399ee62a7 Mon Sep 17 00:00:00 2001 From: fairliereese Date: Mon, 9 Oct 2023 11:15:15 -0700 Subject: [PATCH 25/31] added more read assignment info to the logger --- src/talon/logger.py | 2 +- src/talon/talon.py | 91 ++++++++++++++++++++++++++------------------- 2 files changed, 54 insertions(+), 39 deletions(-) diff --git a/src/talon/logger.py b/src/talon/logger.py index 31103d7..0d2ede7 100644 --- a/src/talon/logger.py +++ b/src/talon/logger.py @@ -9,7 +9,7 @@ def _init_logger(verbosity): level = levels[min(verbosity, len(levels) - 1)] # cap to last level index # set defaults - msg_fmt = "%(asctime)s : %(levelname)s : [%(filename)s:%(lineno)d] : %(message)s" + msg_fmt = "%(asctime)s : %(levelname)s : [%(filename)s:%(lineno)d:%(funcName)s] : %(message)s" date_fmt = "[ %Y-%m-%d %H:%M:%S ]" logging.basicConfig(level=level, format=msg_fmt, datefmt=date_fmt) diff --git a/src/talon/talon.py b/src/talon/talon.py index 79daa18..da96c93 100644 --- a/src/talon/talon.py +++ b/src/talon/talon.py @@ -475,6 +475,8 @@ def create_edge(vertex_1, vertex_2, edge_type, strand, edge_dict): def create_gene(chromosome, start, end, strand, memory_cursor, tmp_gene): """Create a novel gene and add it to the temporary table.""" new_ID = gene_counter.increment() + logging.debug(f'Creating new gene with id {new_ID}') + new_gene = (new_ID, chromosome, min(start, end), max(start, 
end), strand) cols = ' ("gene_ID", "chromosome", "start", "end", "strand")' @@ -489,9 +491,10 @@ def create_transcript( """Creates a novel transcript, add it to the transcript data structure, and add to tmp_t """ - print("creating new transcript") + # print("creating new transcript") new_ID = transcript_counter.increment() - print(f"new tid:{new_ID}") + # print(f"new tid:{new_ID}") + logging.debug(f'Creating new transcript with id {new_ID}') # updating the dict if len(edge_IDs) > 1: @@ -942,7 +945,8 @@ def process_ISM( # choose gene to assign it to gene_matches = list(set([match["gene_ID"] for match in all_matches])) - print(gene_matches) + logging.debug(f'Genes with matching vertices: {gene_matches}') + # print(gene_matches) # tie break based on distance to 5' / 3' ends if len(gene_matches) > 1: @@ -1202,6 +1206,8 @@ def find_gene_match_on_vertex_basis(vertex_IDs, strand, vertex_2_gene): needs to be created fusion (bool): Whether gene read is from might be fusion / read through """ + logging.debug('Attempting to assign gene based on vertex concordance') + gene_matches = [] n_gene_matches = [] @@ -1243,15 +1249,14 @@ def find_gene_match_on_vertex_basis(vertex_IDs, strand, vertex_2_gene): # when there are no shared splice sites between gene hits but we did # hit more than one gene elif max(n_gene_matches) <= 1 and len(gene_tally) > 1: - print(" went here") + logging.debug('Found a potential fusion transcript') return None, True # if we hit more than one gene and they have overlapping sjs, - # tie break based on ????? elif len(gene_tally) > 1: - print("i found more than one gene") - print(gene_tally) - print(n_gene_matches) + logging.debug('Found more than one gene w/ overlapping vertices') + # print(gene_tally) + # print(n_gene_matches) return list(gene_tally.keys()), False # temp = df.loc[df.gid.isin(gene_matches)].copy(deep=True) # temp = temp.drop_duplicates() @@ -1271,6 +1276,8 @@ def find_gene_match_on_vertex_basis(vertex_IDs, strand, vertex_2_gene): else: gene_ID = max(gene_tally, key=gene_tally.get) fusion = False + logging.debug(f'Assigning this read to gene {gene_ID}') + return gene_ID, fusion @@ -1323,13 +1330,15 @@ def process_NNC( gene_starts, gene_ends, ) - print("gene id process_nnc") - print(gene_ID) - print(fusion) + # print("gene id process_nnc") + # print(gene_ID) + # print(fusion) if gene_ID == None: return None, None, [], None, fusion + logging.debug(f'Assigning this read to gene {gene_ID}') + # Get matches for the ends start_vertex, start_exon, start_novelty, known_start, diff_5p = process_5p( chrom, positions, strand, vertex_IDs, gene_ID, gene_starts, edge_dict, locations, run_info @@ -1457,7 +1466,7 @@ def process_remaining_mult_cases( transcript_novelty = [] start_end_info = {} if not run_info.create_novel_spliced_genes or not fusion: - print("did i get here?") + # print("did i get here?") gene_ID, match_strand = search_for_overlap_with_gene( chrom, positions[0], positions[-1], strand, cursor, run_info, tmp_gene, tmp_t ) @@ -1487,13 +1496,13 @@ def process_remaining_mult_cases( start_end_info["vertex_IDs"] = vertex_IDs if gene_ID == None: - print(f"fusion: {fusion}") + # print(f"fusion: {fusion}") if fusion: - print("i should be here") + # print("i should be here") t_nov = "fusion_transcript" g_nov = "fusion_novel" else: - print("but I think im going here") + # print("but I think im going here") t_nov = "intergenic_transcript" g_nov = "intergenic_novel" @@ -1591,7 +1600,8 @@ def identify_transcript( all_exons_known = check_all_exons_known(e_novelty) 
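    # (Editor's note, an illustrative aside: by the convention of the
    #  check_all_* helpers, exons occupy the odd indices of this novelty list
    #  (novelty[1::2]) and splice junctions the even ones, with 1 meaning
    #  novel. A read with e_novelty = [0, 1, 0], known junctions flanking one
    #  novel internal exon, therefore gets all_exons_known == False while
    #  all_SJs_known stays True.)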
splice_vertices_known = sum(v_novelty) == 0 all_exons_novel = reduce(operator.mul, e_novelty, 1) == 1 - print(f"all exons novel : {all_exons_novel}") + # print(f"all exons novel : {all_exons_novel}") + logging.debug(f'All exons novel?: {all_exons_novel}') fusion = False # Look for FSM or ISM. @@ -1600,7 +1610,8 @@ def identify_transcript( all_matches = search_for_ISM(edge_IDs, transcript_dict) if all_matches != None: # Look for FSM first - print("looking for fsm") + # print("looking for fsm") + logging.debug('Looking for FSMs') gene_ID, transcript_ID, transcript_novelty, start_end_info = process_FSM( chrom, positions, @@ -1616,7 +1627,8 @@ def identify_transcript( ) if gene_ID == None: # Now look for ISM - print("looking for ism") + # print("looking for ism") + logging.debug('Looking for ISM') gene_ID, transcript_ID, transcript_novelty, start_end_info = process_ISM( chrom, positions, @@ -1634,11 +1646,11 @@ def identify_transcript( tmp_gene, tmp_t, ) - print(f"gene id from process ism {gene_ID}") # Look for NIC if gene_ID == None: - print("looking for nic") + # print("looking for nic") + logging.debug('Looking for NIC') gene_ID, transcript_ID, transcript_novelty, start_end_info, fusion = process_NIC( chrom, positions, @@ -1660,7 +1672,8 @@ def identify_transcript( # Novel in catalog transcripts have known splice donors and acceptors, # but new connections between them. elif splice_vertices_known and gene_ID == None: - print("looking for nic (again?)") + # print("looking for nic (again?)") + logging.info('Looking for NIC (2)') gene_ID, transcript_ID, transcript_novelty, start_end_info, fusion = process_NIC( chrom, positions, @@ -1681,7 +1694,8 @@ def identify_transcript( # Antisense transcript with splice junctions matching known gene if splice_vertices_known and gene_ID == None and not fusion: - print("looking for spliced antisese") + # print("looking for spliced antisese") + logging.debug('Looking for splice antisense') gene_ID, transcript_ID, gene_novelty, transcript_novelty, start_end_info = process_spliced_antisense( chrom, positions, @@ -1704,7 +1718,8 @@ def identify_transcript( # and contain at least one splice junction. 
There should also be at least # one shared exon from existing transcripts to even try assigning a gene elif not (splice_vertices_known) and not fusion and not all_exons_novel: - print("lookign for NNCs") + # print("lookign for NNCs") + logging.debug('Looking for NNC') gene_ID, transcript_ID, transcript_novelty, start_end_info, fusion = process_NNC( chrom, positions, @@ -1722,10 +1737,11 @@ def identify_transcript( tmp_gene, tmp_t, ) - print(f"geneID from process_nnc: {gene_ID}") + # print(f"geneID from process_nnc: {gene_ID}") # Transcripts that don't match the previous categories end up here if gene_ID == None: - print("looking for this other stuff") + # print("looking for this other stuff") + logging.debug('Looking for everything else') gene_ID, transcript_ID, gene_novelty, transcript_novelty, start_end_info = process_remaining_mult_cases( chrom, positions, @@ -1745,8 +1761,8 @@ def identify_transcript( fusion, ) - print("this is the gene id it decided on") - print(gene_ID) + logging.debug(f'Gene ID for this read: {gene_ID}') + # Add all novel vertices to vertex_2_gene now that we have the gene ID vertex_IDs = start_end_info["vertex_IDs"] edge_IDs = start_end_info["edge_IDs"] @@ -2434,7 +2450,7 @@ def batch_add_vertex2gene(cursor, v2g_file, batch_size): cursor.executemany(command, batch) except Exception as e: - print(e) + logging.error(e) sys.exit(1) return @@ -2459,7 +2475,7 @@ def batch_add_locations(cursor, location_file, batch_size): cursor.executemany(command, batch) except Exception as e: - print(e) + logging.error(e) sys.exit(1) return @@ -2484,7 +2500,7 @@ def batch_add_edges(cursor, edge_file, batch_size): cursor.executemany(command, batch) except Exception as e: - print(e) + logging.error(e) sys.exit(1) return @@ -2530,7 +2546,7 @@ def batch_add_transcripts(cursor, transcript_file, batch_size): cursor.executemany(command, batch) except Exception as e: - print(e) + logging.error(e) sys.exit(1) return @@ -2552,7 +2568,7 @@ def batch_add_genes(cursor, gene_file, batch_size): cursor.executemany(command, batch) except Exception as e: - print(e) + logging.error(e) sys.exit(1) return @@ -2568,7 +2584,7 @@ def add_datasets(cursor, datasets): cursor.executemany(command, datasets) except Exception as e: - print(e) + logging.error(e) sys.exit(1) return @@ -2598,7 +2614,7 @@ def batch_add_annotations(cursor, annot_file, annot_type, batch_size): cursor.executemany(command, batch) except Exception as e: - print(e) + logging.error(e) sys.exit(1) return @@ -2682,7 +2698,7 @@ def batch_add_observed(cursor, observed_file, batch_size): cursor.executemany(command, batch) except Exception as e: - print(e) + logging.error(e) sys.exit(1) # Now create abundance tuples and add to DB @@ -2712,7 +2728,7 @@ def batch_add_abundance(cursor, entries, batch_size): command = 'INSERT INTO "abundance"' + cols + "VALUES " + "(?,?,?)" cursor.executemany(command, batch) except Exception as e: - print(e) + logging.error(e) sys.exit(1) return @@ -2929,8 +2945,7 @@ def annotate_read(sam_record: pysam.AlignedSegment, cursor, run_info, struct_col """ # Parse attributes to determine the chromosome, positions, and strand of the transcript read_ID = sam_record.query_name - print() - print(read_ID) + logging.debug(read_ID) if not run_info.use_cb_tag: dataset = sam_record.get_tag("RG") else: From 129a8ff3f91592d21962053bbbf1beea37e63475 Mon Sep 17 00:00:00 2001 From: fairliereese Date: Mon, 9 Oct 2023 14:54:31 -0700 Subject: [PATCH 26/31] added adjustments to tests to accomodate new function signatures --- 
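Editor's note: the recurring pattern these adjustments introduce is condensed
below. This is a sketch assembled from the diffs that follow, not runnable
outside the testing suite (it assumes the scratch toy database and the
get_db_cursor helper from helper_fns): every identification test now builds a
temp transcript table next to the temp gene table and passes both names
through the call.

    from talon import talon, init_refs
    from .helper_fns import get_db_cursor

    conn, cursor = get_db_cursor()
    build = "toy_build"
    talon.get_counters("scratch/toy.db")
    run_info = talon.init_run_info("scratch/toy.db", build)

    # Reference structures, built exactly as in the tests below
    location_dict = init_refs.make_location_dict(build, cursor)
    edge_dict = init_refs.make_edge_dict(cursor)
    transcript_dict = init_refs.make_transcript_dict(cursor, build)
    vertex_2_gene = init_refs.make_vertex_2_gene_dict(cursor)
    gene_starts = init_refs.make_gene_start_or_end_dict(cursor, build, "start")
    gene_ends = init_refs.make_gene_start_or_end_dict(cursor, build, "end")

    # Both temp tables must now exist before identification...
    init_refs.make_temp_novel_gene_table(cursor, build)
    init_refs.make_temp_transcript_table(cursor, build)

    # ...and their names travel with the call as two trailing arguments.
    chrom, strand, positions = "chr1", "+", (1, 100, 900, 1000)
    annotation = talon.identify_transcript(chrom, positions, strand, cursor,
                                           location_dict, edge_dict,
                                           transcript_dict, vertex_2_gene,
                                           gene_starts, gene_ends, run_info,
                                           "temp_gene", "temp_transcript")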
src/talon/talon.py | 52 ++++++++---- testing_suite/build_test_databases.py | 5 +- testing_suite/test_ISM_identification.py | 35 ++++---- testing_suite/test_NIC_identification.py | 15 +++- testing_suite/test_NNC_identification.py | 6 +- testing_suite/test_all_exons_known.py | 12 ++- testing_suite/test_all_exons_novel.py | 43 ++++++++++ testing_suite/test_database_updates.py | 25 +++--- testing_suite/test_get_overlap.py | 14 ++-- testing_suite/test_identification_flow.py | 81 +++++++++++++------ testing_suite/test_monoexonic.py | 17 ++-- ...on_read_overlapping_monoexon_transcript.py | 17 ++-- .../test_process_remaining_mult_cases.py | 22 ++++- .../test_search_for_overlap_with_gene.py | 53 +++++++----- 14 files changed, 275 insertions(+), 122 deletions(-) create mode 100644 testing_suite/test_all_exons_novel.py diff --git a/src/talon/talon.py b/src/talon/talon.py index da96c93..b94626e 100644 --- a/src/talon/talon.py +++ b/src/talon/talon.py @@ -534,6 +534,7 @@ def check_all_exons_known(novelty): exons are known or not. Return True if all are known, and False otherwise. Input should not include first or last exon.""" + if len(novelty) == 1: return novelty[0] == 0 @@ -544,6 +545,22 @@ def check_all_exons_known(novelty): else: return True +def check_all_exons_novel(novelty): + """Given a list in which each element represents the novelty (1) or + known-ness of a transcript edge (0), determine whether all of the + exons are novel or not. Return True if all are novel, and False + otherwise. Input should not include first or last exon.""" + + if len(novelty) == 1: + return 0 # we have no exons to analyze + + exons = novelty[1::2] + + if sum(exons) != len(exons): + return False + else: + return True + def check_all_SJs_known(novelty): """Given a list in which each element represents the novelty (1) or @@ -694,7 +711,7 @@ def search_for_overlap_with_gene(chromosome, start, end, strand, cursor, run_inf # restrict to just the genes we care about if gene_IDs: # print(f'restricting just to {gene_IDs}') - logging.debug(f"Restricing gene tiebreak to {gene_IDs}") + logging.debug(f"Restricting gene tiebreak to {gene_IDs}") matches = [match for match in matches if match["gene_ID"] in gene_IDs] if len(matches) == 0: @@ -728,9 +745,10 @@ def get_best_match(matches, min_end, max_end): # print(f'read min: {min_end}') # print(f'read end: {max_end}') - logging.debug(f"Read start / end: ({min_end}, {min_end})") + logging.debug(f"Read start / end: ({min_end}, {max_end})") for match in matches: + logging.debug('') logging.debug(f"Matching with transcripts from gene {match['gene_ID']}, transcript {match['transcript_ID']}") end_dist = abs(match["max_pos"] - max_end) start_dist = abs(match["min_pos"] - min_end) @@ -1052,8 +1070,7 @@ def process_ISM( def assign_gene( - vertex_IDs, strand, vertex_2_gene, chrom, start, end, cursor, run_info, tmp_gene, tmp_t, gene_starts, gene_ends -): + vertex_IDs, strand, vertex_2_gene, chrom, start, end, cursor, run_info, tmp_gene, tmp_t): """ Assign a gene to a transcript. First do this on the basis of splice site matching. 
If this yields more than one gene, then choose the gene with the @@ -1111,9 +1128,7 @@ def process_NIC( cursor, run_info, tmp_gene, - tmp_t, - gene_starts, - gene_ends, + tmp_t ) # gene_ID, fusion = find_gene_match_on_vertex_basis(vertex_IDs, @@ -1326,9 +1341,7 @@ def process_NNC( cursor, run_info, tmp_gene, - tmp_t, - gene_starts, - gene_ends, + tmp_t ) # print("gene id process_nnc") # print(gene_ID) @@ -1466,7 +1479,6 @@ def process_remaining_mult_cases( transcript_novelty = [] start_end_info = {} if not run_info.create_novel_spliced_genes or not fusion: - # print("did i get here?") gene_ID, match_strand = search_for_overlap_with_gene( chrom, positions[0], positions[-1], strand, cursor, run_info, tmp_gene, tmp_t ) @@ -1496,7 +1508,7 @@ def process_remaining_mult_cases( start_end_info["vertex_IDs"] = vertex_IDs if gene_ID == None: - # print(f"fusion: {fusion}") + logging.debug(f"Fusion: {fusion}") if fusion: # print("i should be here") t_nov = "fusion_transcript" @@ -1590,18 +1602,24 @@ def identify_transcript( # Get vertex matches for the transcript positions vertex_IDs, v_novelty = match_splice_vertices(chrom, positions, strand, location_dict, run_info) + logging.debug(f'Vertex IDs: {vertex_IDs}') + logging.debug(f'Vertex novelties: {v_novelty}') + # Get edge matches for transcript exons and introns based on the vertices edge_IDs, e_novelty = match_all_splice_edges(vertex_IDs, strand, edge_dict, run_info) + logging.debug(f'Edge IDs: {edge_IDs}') + logging.debug(f'Exon novelty: {e_novelty}') # Check novelty of exons and splice jns. This will help us categorize # what type of novelty the transcript has all_SJs_known = check_all_SJs_known(e_novelty) all_exons_known = check_all_exons_known(e_novelty) - splice_vertices_known = sum(v_novelty) == 0 - all_exons_novel = reduce(operator.mul, e_novelty, 1) == 1 + all_exons_novel = check_all_exons_novel(e_novelty) + splice_vertices_known = (sum(v_novelty) == 0) + # all_exons_novel = reduce(operator.mul, e_novelty, 1) == 1 # print(f"all exons novel : {all_exons_novel}") - logging.debug(f'All exons novel?: {all_exons_novel}') + logging.debug(f'All internal exons novel?: {all_exons_novel}') fusion = False # Look for FSM or ISM. @@ -1673,7 +1691,7 @@ def identify_transcript( # but new connections between them. 
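    # (Editor's note, a concrete case: the toy-genome read at positions
    #  (1, 100, 900, 1000) skips the middle exon of TG1, so every splice
    #  vertex is known but the skipping intron edge is not. all_SJs_known
    #  comes out False while splice_vertices_known stays True, which is
    #  exactly the combination this branch catches and labels NIC.)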
elif splice_vertices_known and gene_ID == None: # print("looking for nic (again?)") - logging.info('Looking for NIC (2)') + logging.debug('Looking for NIC (2)') gene_ID, transcript_ID, transcript_novelty, start_end_info, fusion = process_NIC( chrom, positions, @@ -2945,6 +2963,8 @@ def annotate_read(sam_record: pysam.AlignedSegment, cursor, run_info, struct_col """ # Parse attributes to determine the chromosome, positions, and strand of the transcript read_ID = sam_record.query_name + logging.debug('') + logging.debug('') logging.debug(read_ID) if not run_info.use_cb_tag: dataset = sam_record.get_tag("RG") diff --git a/testing_suite/build_test_databases.py b/testing_suite/build_test_databases.py index 4baed17..f5dcff4 100644 --- a/testing_suite/build_test_databases.py +++ b/testing_suite/build_test_databases.py @@ -232,7 +232,8 @@ "--3p", "300", "--idprefix", "TALON", "--l", "0", - "--g", "hg38", "--o", "scratch/readthrough"]) + "--g", "hg38", + "--o", "scratch/readthrough"]) except Exception as e: print(e) sys.exit("Database initialization failed on readthrough annotation") @@ -246,6 +247,8 @@ "--build", "hg38", "--cov", "0", "--identity", "0", + "--create_novel_spliced_genes", + '-v', '2', "--o", "scratch/readthrough" ]) except Exception as e: print(e) diff --git a/testing_suite/test_ISM_identification.py b/testing_suite/test_ISM_identification.py index e920215..7685d58 100644 --- a/testing_suite/test_ISM_identification.py +++ b/testing_suite/test_ISM_identification.py @@ -13,7 +13,8 @@ def test_ISM_suffix(self): database = "scratch/toy.db" run_info = talon.init_run_info(database, build) talon.get_counters(database) - + init_refs.make_temp_novel_gene_table(cursor, build) + init_refs.make_temp_transcript_table(cursor, build) edge_dict = init_refs.make_edge_dict(cursor) location_dict = init_refs.make_location_dict(build, cursor) transcript_dict = init_refs.make_transcript_dict(cursor, build) @@ -28,17 +29,20 @@ def test_ISM_suffix(self): v_novelty = [0, 0] all_matches = talon.search_for_ISM(edge_IDs, transcript_dict) - gene_ID, transcript_ID, novelty, start_end_info = talon.process_ISM(chrom, - positions, + gene_ID, transcript_ID, novelty, start_end_info = talon.process_ISM(chrom, + positions, strand, edge_IDs, - vertex_IDs, - all_matches, + vertex_IDs, + all_matches, transcript_dict, - gene_starts, gene_ends, - edge_dict, location_dict, - run_info) + gene_starts, gene_ends, + edge_dict, location_dict, + run_info, + cursor, + "temp_gene", + "temp_transcript") - correct_gene_ID = fetch_correct_ID("TG1", "gene", cursor) + correct_gene_ID = fetch_correct_ID("TG1", "gene", cursor) assert gene_ID == correct_gene_ID assert start_end_info["vertex_IDs"] == [3, 4, 5, 6] @@ -56,7 +60,8 @@ def test_ISM_prefix(self): database = "scratch/toy.db" run_info = talon.init_run_info(database, build) talon.get_counters(database) - + init_refs.make_temp_novel_gene_table(cursor, build) + init_refs.make_temp_transcript_table(cursor, build) edge_dict = init_refs.make_edge_dict(cursor) location_dict = init_refs.make_location_dict(build, cursor) transcript_dict = init_refs.make_transcript_dict(cursor, build) @@ -79,12 +84,15 @@ def test_ISM_prefix(self): transcript_dict, gene_starts, gene_ends, edge_dict, location_dict, - run_info) + run_info, + cursor, + "temp_gene", + "temp_transcript") correct_gene_ID = fetch_correct_ID("TG1", "gene", cursor) assert gene_ID == correct_gene_ID assert start_end_info["vertex_IDs"] == [1, 2, 3, 4] - assert start_end_info["edge_IDs"] == [1, 2, 3] + assert start_end_info["edge_IDs"] == 
[1, 2, 3] conn.close() @@ -112,5 +120,4 @@ def test_no_match(self): all_matches = talon.search_for_ISM(edge_IDs, transcript_dict) assert all_matches == None - conn.close() - + conn.close() diff --git a/testing_suite/test_NIC_identification.py b/testing_suite/test_NIC_identification.py index 92e10fe..1137eb9 100644 --- a/testing_suite/test_NIC_identification.py +++ b/testing_suite/test_NIC_identification.py @@ -16,6 +16,8 @@ def test_NIC_match(self): edge_dict = init_refs.make_edge_dict(cursor) location_dict = init_refs.make_location_dict(build, cursor) run_info = talon.init_run_info(database, build) + init_refs.make_temp_novel_gene_table(cursor, build) + init_refs.make_temp_transcript_table(cursor, build) transcript_dict = init_refs.make_transcript_dict(cursor, build) vertex_2_gene = init_refs.make_vertex_2_gene_dict(cursor) gene_starts = init_refs.make_gene_start_or_end_dict(cursor, build, "start") @@ -32,9 +34,12 @@ def test_NIC_match(self): positions, strand, edge_IDs, vertex_IDs, transcript_dict, - gene_starts, gene_ends, - edge_dict, location_dict, - vertex_2_gene, run_info) + gene_starts, gene_ends, edge_dict, location_dict, + vertex_2_gene, + run_info, + cursor, + "temp_gene", + "temp_transcript") correct_gene_ID = fetch_correct_ID("TG1", "gene", cursor) assert gene_ID == correct_gene_ID @@ -54,6 +59,8 @@ def test_antisense(self): edge_dict = init_refs.make_edge_dict(cursor) locations = init_refs.make_location_dict(build, cursor) run_info = talon.init_run_info(database, build) + init_refs.make_temp_novel_gene_table(cursor, build) + init_refs.make_temp_transcript_table(cursor, build) transcript_dict = init_refs.make_transcript_dict(cursor, build) vertex_2_gene = init_refs.make_vertex_2_gene_dict(cursor) gene_starts = init_refs.make_gene_start_or_end_dict(cursor, build, "start") @@ -82,7 +89,7 @@ def test_antisense(self): gene_ends, edge_dict, locations, vertex_2_gene, run_info, - cursor, "temp_gene") + cursor, "temp_gene", "temp_transcript") #anti_gene_ID = talon.find_gene_match_on_vertex_basis(vertex_IDs, # anti_strand, # vertex_2_gene) diff --git a/testing_suite/test_NNC_identification.py b/testing_suite/test_NNC_identification.py index 63f1cd6..7b9e852 100644 --- a/testing_suite/test_NNC_identification.py +++ b/testing_suite/test_NNC_identification.py @@ -16,6 +16,8 @@ def test_NNC_match(self): edge_dict = init_refs.make_edge_dict(cursor) location_dict = init_refs.make_location_dict(build, cursor) run_info = talon.init_run_info(database, build) + init_refs.make_temp_novel_gene_table(cursor, build) + init_refs.make_temp_transcript_table(cursor, build) transcript_dict = init_refs.make_transcript_dict(cursor, build) vertex_2_gene = init_refs.make_vertex_2_gene_dict(cursor) gene_starts = init_refs.make_gene_start_or_end_dict(cursor, build, "start") @@ -34,7 +36,9 @@ def test_NNC_match(self): vertex_IDs, transcript_dict, gene_starts, gene_ends, edge_dict, location_dict, - vertex_2_gene, run_info) + vertex_2_gene, run_info, + cursor, "temp_gene", + "temp_transcript") correct_gene_ID = fetch_correct_ID("TG1", "gene", cursor) assert gene_ID == correct_gene_ID diff --git a/testing_suite/test_all_exons_known.py b/testing_suite/test_all_exons_known.py index 831e433..52f358f 100644 --- a/testing_suite/test_all_exons_known.py +++ b/testing_suite/test_all_exons_known.py @@ -8,7 +8,7 @@ def test_find_true(self): """ Example where all of the exons are known. 
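        (Editor's note: throughout this file the novelty list excludes the
        first and last exon, so entries run intron, exon, intron, ... and
        the exon checks read the odd indices, novelty[1::2].)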
""" # Remember that first pos is first intron, last is last intron - novelty = [0, 0, 0, 0, 0 ] + novelty = [0, 0, 0, 0, 0 ] # Make sure that no match got returned assert talon.check_all_exons_known(novelty) == True @@ -21,17 +21,23 @@ def test_find_true_with_novel_exons(self): # Make sure that no match got returned assert talon.check_all_exons_known(novelty) == True - + def test_find_false(self): """ Example with novel exons """ novelty = [0, 1, 0, 1, 0] # Make sure that no match got returned - assert talon.check_all_exons_known(novelty) == False + assert talon.check_all_exons_known(novelty) == False def test_monoexonic(self): """ Monoexonic known exon """ novelty = [0] assert talon.check_all_exons_known(novelty) == True + + def test_no_internal_exons(self): + """ No internal exons """ + novelty = [0, 0, 0] + + assert talon.check_all_exons_known(novelty) == True diff --git a/testing_suite/test_all_exons_novel.py b/testing_suite/test_all_exons_novel.py new file mode 100644 index 0000000..004f165 --- /dev/null +++ b/testing_suite/test_all_exons_novel.py @@ -0,0 +1,43 @@ +import pytest +from talon import talon +@pytest.mark.dbunit + +class TestAllExonsNovel(object): + + def test_find_true(self): + """ Example where all of the exons are known. + """ + # Remember that first pos is first intron, last is last intron + novelty = [0, 0, 0, 0, 0 ] + + # Make sure that no match got returned + assert talon.check_all_exons_novel(novelty) == False + + def test_find_true_with_novel_exons(self): + """ Example where all of the exons are known, but the introns are not. + Note: This is not necessarily realistic biologically. + """ + novelty = [1, 0, 1, 0, 1] + + # Make sure that no match got returned + assert talon.check_all_exons_novel(novelty) == False + + def test_find_false(self): + """ Example with novel exons + """ + novelty = [0, 1, 0, 1, 0] + + # Make sure that no match got returned + assert talon.check_all_exons_novel(novelty) == True + + def test_monoexonic(self): + """ Monoexonic known exon """ + novelty = [0] + + assert talon.check_all_exons_novel(novelty) == False + + def test_no_internal_exons(self): + """ No internal exons """ + novelty = [0, 0, 0] + + assert talon.check_all_exons_novel(novelty) == False diff --git a/testing_suite/test_database_updates.py b/testing_suite/test_database_updates.py index d4927c2..36cffbf 100644 --- a/testing_suite/test_database_updates.py +++ b/testing_suite/test_database_updates.py @@ -55,7 +55,7 @@ def test_observed(self): with open("scratch/db_updates/observed.tsv", 'w') as f: for obs in observed: f.write("\t".join([str(x) for x in obs]) + "\n") - + batch_size = 1 talon.batch_add_observed(cursor, "scratch/db_updates/observed.tsv", batch_size) @@ -100,7 +100,7 @@ def test_gene_annot(self): f.write("\t".join([str(x) for x in entry]) + "\n") batch_size = 1 - talon.batch_add_annotations(cursor, "scratch/db_updates/gene_annot.tsv", + talon.batch_add_annotations(cursor, "scratch/db_updates/gene_annot.tsv", "gene", batch_size) # Test if items are there @@ -125,7 +125,7 @@ def test_transcript_annot(self): f.write("\t".join([str(x) for x in entry]) + "\n") batch_size = 2 - talon.batch_add_annotations(cursor, "scratch/db_updates/transcript_annot.tsv", + talon.batch_add_annotations(cursor, "scratch/db_updates/transcript_annot.tsv", "transcript", batch_size) # Test if items are there @@ -150,7 +150,7 @@ def test_exon_annot(self): f.write("\t".join([str(x) for x in entry]) + "\n") batch_size = 3 - talon.batch_add_annotations(cursor, "scratch/db_updates/exon_annot.tsv", + 
talon.batch_add_annotations(cursor, "scratch/db_updates/exon_annot.tsv", "exon", batch_size) # Test if items are there @@ -180,7 +180,7 @@ def test_gene_update(self): talon.batch_add_genes(cursor, "scratch/db_updates/genes.tsv", 10) - # Test if gene with ID 6 is there, but make sure we didn't add + # Test if gene with ID 6 is there, but make sure we didn't add # duplicates of the other genes query = "SELECT * FROM genes" gene_IDs = [ x['gene_ID'] for x in cursor.execute(query)] @@ -189,15 +189,18 @@ def test_gene_update(self): conn.close() def test_transcript_update(self): - """ Try to add novel transcript entries to database while ignoring + """ Try to add novel transcript entries to database while ignoring duplicates """ conn, cursor = get_db_cursor() build = "toy_build" transcript_dict = init_refs.make_transcript_dict(cursor, build) + init_refs.make_temp_transcript_table(cursor, build) + database = "scratch/toy.db" talon.get_counters(database) - talon.create_transcript("chr1", 1, 1000, 1, (1,), (1,2), transcript_dict) + talon.create_transcript('+', "chr1", 1, 1000, 1, (1,), (1,2), transcript_dict, + "temp_transcript", cursor) # Write to file os.system("mkdir -p scratch/db_updates/") @@ -257,7 +260,7 @@ def test_edge_update(self): batch_size = 10 talon.batch_add_edges(cursor, "scratch/db_updates/edges.tsv", batch_size) - + # Test if the edge table has the correct number of edges now query = "SELECT * FROM edge" cursor.execute(query) @@ -277,7 +280,7 @@ def test_location_update(self): orig_n_pos = talon.vertex_counter.value() talon.create_vertex("chr4", 2000, location_dict, run_info) - + # Write to file os.system("mkdir -p scratch/db_updates/") with open("scratch/db_updates/loc.tsv", 'w') as f: @@ -307,12 +310,12 @@ def test_vertex2gene_update(self): build = "toy_build" vertex_2_gene = init_refs.make_vertex_2_gene_dict(cursor) - talon.update_vertex_2_gene(2, (1,2), "-", vertex_2_gene) + talon.update_vertex_2_gene(2, (1,2), "-", vertex_2_gene) talon.update_vertex_2_gene(1, (1,2,3,4,5,6), "+", vertex_2_gene) # Write to file os.system("mkdir -p scratch/db_updates/") - with open("scratch/db_updates/v2g.tsv", 'w') as f: + with open("scratch/db_updates/v2g.tsv", 'w') as f: for vertex_ID, gene_set in vertex_2_gene.items(): for gene in gene_set: entry = "\t".join([ str(x) for x in (vertex_ID, gene[0])]) diff --git a/testing_suite/test_get_overlap.py b/testing_suite/test_get_overlap.py index f647892..1a52868 100644 --- a/testing_suite/test_get_overlap.py +++ b/testing_suite/test_get_overlap.py @@ -5,11 +5,11 @@ class TestGetOverlap(object): def test_1(self): """ Example where intervals of size 11 match exactly. So the answer - should be 11. + should be 11. """ - a = [ 10, 20 ] + a = [ 10, 20 ] b = [ 10, 20 ] - assert talon.get_overlap(a, b) == 11 + assert talon.get_overlap(a, b)[0] == 11 def test_2(self): """ Example where interval a is contained within interval b. The answer @@ -17,18 +17,18 @@ def test_2(self): """ a = [ 12, 18 ] b = [ 10, 20 ] - assert talon.get_overlap(a, b) == 7 + assert talon.get_overlap(a, b)[0] == 7 def test_3(self): """ Example where interval a starts and ends earlier than b. """ a = [ 10, 20 ] b = [ 15, 25 ] - assert talon.get_overlap(a, b) == 6 + assert talon.get_overlap(a, b)[0] == 6 def test_4(self): - """ Example with no overlap. + """ Example with no overlap. 
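        (Editor's note: the [0] indexing added throughout this file tracks
        the patched get_overlap, which now returns a tuple whose first
        element is the overlap length.)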
""" a = [ 10, 20 ] b = [ 30, 40 ] - assert talon.get_overlap(a, b) == 0 + assert talon.get_overlap(a, b)[0] == 0 diff --git a/testing_suite/test_identification_flow.py b/testing_suite/test_identification_flow.py index a9fb5fd..bf5358a 100644 --- a/testing_suite/test_identification_flow.py +++ b/testing_suite/test_identification_flow.py @@ -14,9 +14,12 @@ def test_FSM_perfect(self): database = "scratch/toy.db" talon.get_counters(database) init_refs.make_temp_novel_gene_table(cursor, build) + init_refs.make_temp_transcript_table(cursor, build) + + init_refs.make_temp_transcript_table(cursor, build) edge_dict = init_refs.make_edge_dict(cursor) location_dict = init_refs.make_location_dict(build, cursor) - run_info = talon.init_run_info(database, build) + run_info = talon.init_run_info(database, build) transcript_dict = init_refs.make_transcript_dict(cursor, build) vertex_2_gene = init_refs.make_vertex_2_gene_dict(cursor) gene_starts = init_refs.make_gene_start_or_end_dict(cursor, build, "start") @@ -27,13 +30,13 @@ def test_FSM_perfect(self): positions = [ 1, 100, 500, 600, 900, 1000 ] - annotation = talon.identify_transcript(chrom, positions, strand, cursor, - location_dict, edge_dict, - transcript_dict, vertex_2_gene, + annotation = talon.identify_transcript(chrom, positions, strand, cursor, + location_dict, edge_dict, + transcript_dict, vertex_2_gene, gene_starts, gene_ends, run_info, - "temp_gene") + "temp_gene", "temp_transcript") - correct_gene_ID = fetch_correct_ID("TG1", "gene", cursor) + correct_gene_ID = fetch_correct_ID("TG1", "gene", cursor) correct_transcript_ID = fetch_correct_ID("TG1-001", "transcript", cursor) assert annotation['gene_ID'] == correct_gene_ID assert annotation['transcript_ID'] == correct_transcript_ID @@ -50,6 +53,8 @@ def test_FSM_end_diff(self): talon.get_counters(database) init_refs.make_temp_novel_gene_table(cursor, build) + init_refs.make_temp_transcript_table(cursor, build) + edge_dict = init_refs.make_edge_dict(cursor) location_dict = init_refs.make_location_dict(build, cursor) run_info = talon.init_run_info(database, build) @@ -67,7 +72,7 @@ def test_FSM_end_diff(self): location_dict, edge_dict, transcript_dict, vertex_2_gene, gene_starts, gene_ends, run_info, - "temp_gene") + "temp_gene", "temp_transcript") correct_gene_ID = fetch_correct_ID("TG2", "gene", cursor) novelty_types = [ x[-2] for x in annotation['transcript_novelty']] @@ -76,7 +81,7 @@ def test_FSM_end_diff(self): conn.close() def test_NIC_instead_of_ISM(self): - """ Test case where the transcript looks like an ISM, but has known + """ Test case where the transcript looks like an ISM, but has known starts and ends. 
In past TALON versions, this was considered NIC, but expected behavior is now ISM """ conn, cursor = get_db_cursor() @@ -85,6 +90,8 @@ def test_NIC_instead_of_ISM(self): talon.get_counters(database) init_refs.make_temp_novel_gene_table(cursor, build) + init_refs.make_temp_transcript_table(cursor, build) + edge_dict = init_refs.make_edge_dict(cursor) location_dict = init_refs.make_location_dict(build, cursor) run_info = talon.init_run_info(database, build) @@ -101,7 +108,7 @@ def test_NIC_instead_of_ISM(self): location_dict, edge_dict, transcript_dict, vertex_2_gene, gene_starts, gene_ends, run_info, - "tmp_gene") + "tmp_gene", "temp_transcript") correct_gene_ID = fetch_correct_ID("TG5", "gene", cursor) novelty_types = [ x[-2] for x in annotation['transcript_novelty']] @@ -118,6 +125,8 @@ def test_ISM_suffix(self): talon.get_counters(database) init_refs.make_temp_novel_gene_table(cursor, build) + init_refs.make_temp_transcript_table(cursor, build) + edge_dict = init_refs.make_edge_dict(cursor) location_dict = init_refs.make_location_dict(build, cursor) run_info = talon.init_run_info(database, build) @@ -134,12 +143,12 @@ def test_ISM_suffix(self): location_dict, edge_dict, transcript_dict, vertex_2_gene, gene_starts, gene_ends, run_info, - "temp_gene") + "temp_gene", "temp_transcript") correct_gene_ID = fetch_correct_ID("TG1", "gene", cursor) novelty_types = [ x[-2] for x in annotation['transcript_novelty']] assert annotation['gene_ID'] == correct_gene_ID - assert "ISM_transcript" in novelty_types + assert "ISM_transcript" in novelty_types assert "ISM-suffix_transcript" in novelty_types assert annotation['start_delta'] == 50 conn.close() @@ -153,6 +162,8 @@ def test_ISM_prefix(self): talon.get_counters(database) init_refs.make_temp_novel_gene_table(cursor, build) + init_refs.make_temp_transcript_table(cursor, build) + edge_dict = init_refs.make_edge_dict(cursor) location_dict = init_refs.make_location_dict(build, cursor) run_info = talon.init_run_info(database, build) @@ -168,7 +179,8 @@ def test_ISM_prefix(self): annotation = talon.identify_transcript(chrom, positions, strand, cursor, location_dict, edge_dict, transcript_dict, vertex_2_gene, - gene_starts, gene_ends, run_info, "temp_gene") + gene_starts, gene_ends, run_info, + "temp_gene", "temp_transcript") correct_gene_ID = fetch_correct_ID("TG1", "gene", cursor) novelty_types = [ x[-2] for x in annotation['transcript_novelty']] @@ -187,6 +199,8 @@ def test_ISM_internal(self): talon.get_counters(database) init_refs.make_temp_novel_gene_table(cursor, build) + init_refs.make_temp_transcript_table(cursor, build) + init_refs.make_temp_monoexonic_transcript_table(cursor, build) edge_dict = init_refs.make_edge_dict(cursor) location_dict = init_refs.make_location_dict(build, cursor) @@ -203,12 +217,13 @@ def test_ISM_internal(self): annotation = talon.identify_monoexon_transcript(chrom, positions, strand, cursor, location_dict, edge_dict, transcript_dict, vertex_2_gene, - gene_starts, gene_ends, run_info, "temp_gene", "temp_monoexon") + gene_starts, gene_ends, run_info, + "temp_gene", "temp_transcript", "temp_monoexon") correct_gene_ID = fetch_correct_ID("TG1", "gene", cursor) novelty_types = [ x[-2] for x in annotation['transcript_novelty']] assert annotation['gene_ID'] == correct_gene_ID - assert "ISM_transcript" in novelty_types + assert "ISM_transcript" in novelty_types assert annotation['start_delta'] == annotation['end_delta'] == 0 conn.close() @@ -221,6 +236,8 @@ def test_NIC(self): talon.get_counters(database) 
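        # (Editor's note: talon.get_counters appears in each test's setup
        #  because novel gene, transcript, and edge IDs are drawn from these
        #  counters, e.g. gene_counter.increment() in create_gene.)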
init_refs.make_temp_novel_gene_table(cursor, build) + init_refs.make_temp_transcript_table(cursor, build) + edge_dict = init_refs.make_edge_dict(cursor) location_dict = init_refs.make_location_dict(build, cursor) run_info = talon.init_run_info(database, build) @@ -231,12 +248,13 @@ def test_NIC(self): chrom = "chr1" strand = "+" - positions = ( 1, 100, 900, 1000 ) + positions = ( 1, 100, 900, 1000 ) annotation = talon.identify_transcript(chrom, positions, strand, cursor, location_dict, edge_dict, transcript_dict, vertex_2_gene, - gene_starts, gene_ends, run_info, "temp_gene") + gene_starts, gene_ends, run_info, + "temp_gene", "temp_transcript") correct_gene_ID = fetch_correct_ID("TG1", "gene", cursor) novelty_types = [ x[-2] for x in annotation['transcript_novelty']] @@ -247,14 +265,16 @@ def test_NIC(self): def test_NNC(self): """ Example where the transcript skips an exon and has a novel splice - donor + donor at the first exon """ conn, cursor = get_db_cursor() build = "toy_build" database = "scratch/toy.db" - talon.get_counters(database) + talon.get_counters(database) init_refs.make_temp_novel_gene_table(cursor, build) + init_refs.make_temp_transcript_table(cursor, build) + edge_dict = init_refs.make_edge_dict(cursor) location_dict = init_refs.make_location_dict(build, cursor) run_info = talon.init_run_info(database, build) @@ -270,7 +290,8 @@ def test_NNC(self): annotation = talon.identify_transcript(chrom, positions, strand, cursor, location_dict, edge_dict, transcript_dict, vertex_2_gene, - gene_starts, gene_ends, run_info, "temp_gene") + gene_starts, gene_ends, run_info, + "temp_gene", "temp_transcript") correct_gene_ID = fetch_correct_ID("TG1", "gene", cursor) novelty_types = [ x[-2] for x in annotation['transcript_novelty']] @@ -288,6 +309,8 @@ def test_spliced_antisense(self): talon.get_counters(database) init_refs.make_temp_novel_gene_table(cursor, build) + init_refs.make_temp_transcript_table(cursor, build) + edge_dict = init_refs.make_edge_dict(cursor) location_dict = init_refs.make_location_dict(build, cursor) run_info = talon.init_run_info(database, build) @@ -303,7 +326,8 @@ def test_spliced_antisense(self): annotation = talon.identify_transcript(chrom, positions, strand, cursor, location_dict, edge_dict, transcript_dict, vertex_2_gene, - gene_starts, gene_ends, run_info, "temp_gene") + gene_starts, gene_ends, run_info, + "temp_gene", "temp_transcript") anti_gene_ID = fetch_correct_ID("TG2", "gene", cursor) gene_novelty_types = [ x[-2] for x in annotation['gene_novelty']] @@ -322,6 +346,8 @@ def test_genomic_unspliced(self): talon.get_counters(database) init_refs.make_temp_novel_gene_table(cursor, build) + init_refs.make_temp_transcript_table(cursor, build) + init_refs.make_temp_monoexonic_transcript_table(cursor, build) edge_dict = init_refs.make_edge_dict(cursor) location_dict = init_refs.make_location_dict(build, cursor) @@ -338,14 +364,15 @@ def test_genomic_unspliced(self): annotation = talon.identify_monoexon_transcript(chrom, positions, strand, cursor, location_dict, edge_dict, transcript_dict, vertex_2_gene, - gene_starts, gene_ends, run_info, "temp_gene", "temp_monoexon") + gene_starts, gene_ends, run_info, + "temp_gene", "temp_transcript", "temp_monoexon") correct_gene_ID = fetch_correct_ID("TG1", "gene", cursor) novelty_types = [ x[-2] for x in annotation['transcript_novelty']] assert annotation['gene_ID'] == correct_gene_ID assert "genomic_transcript" in novelty_types assert annotation['end_delta'] == -10 - conn.close() + conn.close() def 
test_NIC_with_all_known_edges(self): """ Test case derived from a real mouse Map2k4 read. All of edges are @@ -355,9 +382,11 @@ def test_NIC_with_all_known_edges(self): talon.get_counters(database) conn = sqlite3.connect(database) conn.row_factory = sqlite3.Row - cursor = conn.cursor() + cursor = conn.cursor() build = "mm10" init_refs.make_temp_novel_gene_table(cursor, build) + init_refs.make_temp_transcript_table(cursor, build) + edge_dict = init_refs.make_edge_dict(cursor) location_dict = init_refs.make_location_dict(build, cursor) run_info = talon.init_run_info(database, build) @@ -369,11 +398,12 @@ def test_NIC_with_all_known_edges(self): chrom = "chr11" strand = "-" positions = [65788254, 65788136, 65775765, 65775733, 65756371, 65756269, 65735366, 65735192, 65719603, 65719484, 65712297, 65712178, 65709983, 65709932, 65707111, 65706984, 65696365, 65696288, 65693570, 65693422, 65691773, 65691728, 65690804, 65689322] - + annotation = talon.identify_transcript(chrom, positions, strand, cursor, location_dict, edge_dict, transcript_dict, vertex_2_gene, - gene_starts, gene_ends, run_info, "temp_gene") + gene_starts, gene_ends, run_info, + "temp_gene", "temp_transcript") assert annotation['gene_ID'] == 1 assert annotation['transcript_ID'] == 8 @@ -381,4 +411,3 @@ def test_NIC_with_all_known_edges(self): assert "NIC_transcript" in novelty_types conn.close() - diff --git a/testing_suite/test_monoexonic.py b/testing_suite/test_monoexonic.py index d0ebf64..92ad9c6 100644 --- a/testing_suite/test_monoexonic.py +++ b/testing_suite/test_monoexonic.py @@ -14,6 +14,7 @@ def test_match(self): talon.get_counters(database) init_refs.make_temp_novel_gene_table(cursor, build) init_refs.make_temp_monoexonic_transcript_table(cursor, build) + init_refs.make_temp_transcript_table(cursor, build) edge_dict = init_refs.make_edge_dict(cursor) location_dict = init_refs.make_location_dict(build, cursor) run_info = talon.init_run_info(database, build) @@ -26,12 +27,12 @@ def test_match(self): strand = "-" positions = ( 3900, 1100 ) - annotation = talon.identify_monoexon_transcript(chrom, positions, + annotation = talon.identify_monoexon_transcript(chrom, positions, strand, cursor, location_dict, edge_dict, transcript_dict, vertex_2_gene, gene_starts, gene_ends, run_info, - 'temp_gene', 'temp_monoexon') + 'temp_gene', 'temp_transcript', 'temp_monoexon') correct_gene_ID = fetch_correct_ID("TG6", "gene", cursor) correct_transcript_ID = fetch_correct_ID("TG6-001", "transcript", cursor) @@ -43,7 +44,7 @@ def test_match(self): def test_partial_match(self): """ Example where the transcript overlaps a single-exon transcript, - but is shorter. In the past, the start would be assigned to the + but is shorter. In the past, the start would be assigned to the annotated start, and the end would be novel. This is no longer the case- at this time, the transcript will be assigned to the annotated match. 
""" @@ -53,6 +54,7 @@ def test_partial_match(self): database = "scratch/toy.db" talon.get_counters(database) init_refs.make_temp_novel_gene_table(cursor, build) + init_refs.make_temp_transcript_table(cursor, build) init_refs.make_temp_monoexonic_transcript_table(cursor, build) edge_dict = init_refs.make_edge_dict(cursor) location_dict = init_refs.make_location_dict(build, cursor) @@ -71,7 +73,7 @@ def test_partial_match(self): location_dict, edge_dict, transcript_dict, vertex_2_gene, gene_starts, gene_ends, run_info, - 'temp_gene', 'temp_monoexon') + 'temp_gene', 'temp_transcript', 'temp_monoexon') correct_gene_ID = fetch_correct_ID("TG6", "gene", cursor) correct_transcript_ID = fetch_correct_ID("TG6-001", "transcript", cursor) @@ -82,7 +84,7 @@ def test_partial_match(self): conn.close() -# Commenting out these tests for now because they are redundant. But saving in +# Commenting out these tests for now because they are redundant. But saving in # case they might be useful down the line. # def test_partial_match_3prime(self): @@ -176,6 +178,7 @@ def test_antisense(self): database = "scratch/toy.db" talon.get_counters(database) init_refs.make_temp_novel_gene_table(cursor, build) + init_refs.make_temp_transcript_table(cursor, build) init_refs.make_temp_monoexonic_transcript_table(cursor, build) edge_dict = init_refs.make_edge_dict(cursor) location_dict = init_refs.make_location_dict(build, cursor) @@ -194,7 +197,7 @@ def test_antisense(self): location_dict, edge_dict, transcript_dict, vertex_2_gene, gene_starts, gene_ends, run_info, - 'temp_gene', 'temp_monoexon') + 'temp_gene', 'temp_transcript', 'temp_monoexon') anti_gene_ID = fetch_correct_ID("TG6", "gene", cursor) gene_novelty_types = [ x[-2] for x in annotation['gene_novelty']] @@ -203,4 +206,4 @@ def test_antisense(self): assert "antisense_gene" in gene_novelty_types assert "antisense_transcript" in t_novelty_types - conn.close() + conn.close() diff --git a/testing_suite/test_multiexon_read_overlapping_monoexon_transcript.py b/testing_suite/test_multiexon_read_overlapping_monoexon_transcript.py index 7ff6a21..e419548 100644 --- a/testing_suite/test_multiexon_read_overlapping_monoexon_transcript.py +++ b/testing_suite/test_multiexon_read_overlapping_monoexon_transcript.py @@ -14,7 +14,7 @@ def test_transcript_assigned_intergenic(self): when it was actually supposed to be genomic """ # Set up references - database = "scratch/multiexon_read_overlapping_monoexon_transcript/talon.db" + database = "scratch/multiexon_read_overlapping_monoexon_transcript/talon.db" conn = sqlite3.connect(database) conn.row_factory = sqlite3.Row cursor = conn.cursor() @@ -23,6 +23,7 @@ def test_transcript_assigned_intergenic(self): talon.get_counters(database) run_info = talon.init_run_info(database, build) struct_collection = talon.prepare_data_structures(cursor, run_info) + init_refs.make_temp_transcript_table(cursor, "toy_build") # Use pysam to get the read from the SAM file sam_file = "input_files/multiexon_read_overlapping_monoexon_transcript/read.sam" @@ -34,21 +35,21 @@ def test_transcript_assigned_intergenic(self): # Get read attributes chrom = sam_record.reference_name strand = "-" if sam_record.is_reverse else "+" - sam_start = sam_record.reference_start + sam_start = sam_record.reference_start sam_end = sam_record.reference_end # Do we get any overlap with the reference gene? 
best_gene, match_strand = talon.search_for_overlap_with_gene(chrom, min(sam_start, sam_end), - max(sam_start, sam_end), strand, - cursor, run_info, - struct_collection.tmp_gene) + max(sam_start, sam_end), strand, + cursor, run_info, + struct_collection.tmp_gene, + struct_collection.tmp_t) assert best_gene == 1 assert match_strand == "-" - annotation_info = talon.annotate_read(sam_record, cursor, run_info, + annotation_info = talon.annotate_read(sam_record, cursor, run_info, struct_collection, mode = 0) - + assert annotation_info['gene_ID'] == 1 assert annotation_info['transcript_ID'] == 2 assert 'genomic_transcript' in annotation_info['transcript_novelty'][0] - diff --git a/testing_suite/test_process_remaining_mult_cases.py b/testing_suite/test_process_remaining_mult_cases.py index 36966bb..dbb01fa 100644 --- a/testing_suite/test_process_remaining_mult_cases.py +++ b/testing_suite/test_process_remaining_mult_cases.py @@ -1,6 +1,10 @@ import pytest from talon import talon, init_refs from .helper_fns import fetch_correct_ID, get_db_cursor +import logging + +logging.basicConfig(level=logging.DEBUG) + @pytest.mark.integration class TestIdentifyRemaining(object): @@ -15,7 +19,9 @@ def test_fusion(self): talon.get_counters(db) edge_dict = init_refs.make_edge_dict(cursor) location_dict = init_refs.make_location_dict(build, cursor) - run_info = talon.init_run_info(db, build) + run_info = talon.init_run_info(db, build, create_novel_spliced_genes=True) + init_refs.make_temp_novel_gene_table(cursor, "toy_build") + init_refs.make_temp_transcript_table(cursor, "toy_build") transcript_dict = init_refs.make_transcript_dict(cursor, build) vertex_2_gene = init_refs.make_vertex_2_gene_dict(cursor) gene_starts = init_refs.make_gene_start_or_end_dict(cursor, build, "start") @@ -23,10 +29,10 @@ def test_fusion(self): correct_gene_ID = talon.gene_counter.value() + 1 chrom = "chr1" - positions = [1, 100, 500, 600, 900, 1010, 5000, 5550, 6000] + positions = [100, 500, 600, 900, 1010, 5000, 5550, 6000] strand = "+" edge_IDs = [2, 3, 4]+[ talon.edge_counter.value() + 1, talon.edge_counter.value() + 2 ] - vertex_IDs = [2, 3, 4, 5, 9, 10] + vertex_IDs = [2, 3, 4, 5, 9, 10, 11] v_novelty = [0, 0, 0, 0, 0, 0] # Construct temp novel gene db @@ -42,6 +48,7 @@ def test_fusion(self): edge_dict, location_dict, vertex_2_gene, run_info, cursor, "temp_gene", + "temp_transcript", fusion) assert gene_ID == correct_gene_ID @@ -62,6 +69,8 @@ def test_intergenic(self): edge_dict = init_refs.make_edge_dict(cursor) location_dict = init_refs.make_location_dict(build, cursor) run_info = talon.init_run_info(database, build) + init_refs.make_temp_novel_gene_table(cursor, "toy_build") + init_refs.make_temp_transcript_table(cursor, "toy_build") transcript_dict = init_refs.make_transcript_dict(cursor, build) vertex_2_gene = init_refs.make_vertex_2_gene_dict(cursor) gene_starts = init_refs.make_gene_start_or_end_dict(cursor, build, "start") @@ -87,6 +96,7 @@ def test_intergenic(self): edge_dict, location_dict, vertex_2_gene, run_info, cursor, "temp_gene", + "temp_transcript", fusion) assert gene_ID == correct_gene_ID @@ -106,6 +116,8 @@ def test_antisense(self): edge_dict = init_refs.make_edge_dict(cursor) location_dict = init_refs.make_location_dict(build, cursor) run_info = talon.init_run_info(database, build) + init_refs.make_temp_novel_gene_table(cursor, "toy_build") + init_refs.make_temp_transcript_table(cursor, "toy_build") transcript_dict = init_refs.make_transcript_dict(cursor, build) vertex_2_gene = 
init_refs.make_vertex_2_gene_dict(cursor) gene_starts = init_refs.make_gene_start_or_end_dict(cursor, build, "start") @@ -131,6 +143,7 @@ def test_antisense(self): edge_dict, location_dict, vertex_2_gene, run_info, cursor, "temp_gene", + "temp_transcript", fusion) assert gene_ID == correct_gene_ID assert transcript_dict[frozenset(start_end_info["edge_IDs"])] != None @@ -148,6 +161,8 @@ def test_genomic(self): edge_dict = init_refs.make_edge_dict(cursor) location_dict = init_refs.make_location_dict(build, cursor) run_info = talon.init_run_info(database, build) + init_refs.make_temp_novel_gene_table(cursor, "toy_build") + init_refs.make_temp_transcript_table(cursor, "toy_build") transcript_dict = init_refs.make_transcript_dict(cursor, build) vertex_2_gene = init_refs.make_vertex_2_gene_dict(cursor) gene_starts = init_refs.make_gene_start_or_end_dict(cursor, build, "start") @@ -172,6 +187,7 @@ def test_genomic(self): edge_dict, location_dict, vertex_2_gene, run_info, cursor, "temp_gene", + "temp_transcript", fusion) correct_gene_ID = fetch_correct_ID("TG3", "gene", cursor) assert gene_ID == correct_gene_ID diff --git a/testing_suite/test_search_for_overlap_with_gene.py b/testing_suite/test_search_for_overlap_with_gene.py index 2d3ff48..44f78bd 100644 --- a/testing_suite/test_search_for_overlap_with_gene.py +++ b/testing_suite/test_search_for_overlap_with_gene.py @@ -14,6 +14,7 @@ def test_no_match(self): database = "scratch/toy.db" run_info = talon.init_run_info(database, build, tmp_dir = "scratch/tmp/") init_refs.make_temp_novel_gene_table(cursor, "toy_build") + init_refs.make_temp_transcript_table(cursor, "toy_build") location_dict = init_refs.make_location_dict(build, cursor) run_info = talon.init_run_info(database, build, tmp_dir = "scratch/tmp/") @@ -23,16 +24,18 @@ def test_no_match(self): gene_ID, match_strand = talon.search_for_overlap_with_gene(chrom, pos[0], pos[1], strand, cursor, - run_info, - "temp_gene") + run_info, + "temp_gene", + "temp_transcript") assert gene_ID == None # Should get same results for flipped interval gene_ID, match_strand = talon.search_for_overlap_with_gene(chrom, pos[0], pos[1], strand, cursor, - run_info, - "temp_gene") + run_info, + "temp_gene", + "temp_transcript") assert gene_ID == None conn.close() @@ -42,6 +45,7 @@ def test_single_match(self): database = "scratch/toy.db" build = "toy_build" init_refs.make_temp_novel_gene_table(cursor, "toy_build") + init_refs.make_temp_transcript_table(cursor, "toy_build") location_dict = init_refs.make_location_dict(build, cursor) run_info = talon.init_run_info(database, build, tmp_dir = "scratch/tmp/") @@ -53,7 +57,8 @@ def test_single_match(self): pos[1], strand, cursor, run_info, - "temp_gene") + "temp_gene", + "temp_transcript") assert gene_ID == fetch_correct_ID("TG1", "gene", cursor) @@ -61,13 +66,14 @@ def test_single_match(self): conn.close() def test_same_strand_match_with_two_genes(self): - """ Example where interval overlaps two genes, one of which is on the + """ Example where interval overlaps two genes, one of which is on the same strand. 
""" - + database = "scratch/toy.db" conn, cursor = get_db_cursor() build = "toy_build" init_refs.make_temp_novel_gene_table(cursor, "toy_build") + init_refs.make_temp_transcript_table(cursor, "toy_build") location_dict = init_refs.make_location_dict(build, cursor) run_info = talon.init_run_info(database, build) @@ -75,24 +81,26 @@ def test_same_strand_match_with_two_genes(self): pos = [1500, 910] strand = "-" - gene_ID, match_strand = talon.search_for_overlap_with_gene(chrom, pos[0], + gene_ID, match_strand = talon.search_for_overlap_with_gene(chrom, pos[0], pos[1], - strand, cursor, + strand, cursor, run_info, - "temp_gene") + "temp_gene", + "temp_transcript") assert gene_ID == fetch_correct_ID("TG3", "gene", cursor) assert match_strand == strand conn.close() def test_same_strand_match_left_overlap(self): - """ Example where the overlap is on the same strand. Query start is to + """ Example where the overlap is on the same strand. Query start is to the left of the gene, and query end is before the end of the gene. """ database = "scratch/toy.db" conn, cursor = get_db_cursor() build = "toy_build" init_refs.make_temp_novel_gene_table(cursor, "toy_build") + init_refs.make_temp_transcript_table(cursor, "toy_build") location_dict = init_refs.make_location_dict(build, cursor) run_info = talon.init_run_info(database, build) @@ -104,7 +112,8 @@ def test_same_strand_match_left_overlap(self): pos[1], strand, cursor, run_info, - "temp_gene") + "temp_gene", + "temp_transcript") assert gene_ID == fetch_correct_ID("TG3", "gene", cursor) assert match_strand == strand @@ -118,31 +127,34 @@ def test_antisense_match(self): conn, cursor = get_db_cursor() build = "toy_build" init_refs.make_temp_novel_gene_table(cursor, "toy_build") + init_refs.make_temp_transcript_table(cursor, "toy_build") location_dict = init_refs.make_location_dict(build, cursor) run_info = talon.init_run_info(database, build) chrom = "chr1" pos = [1400, 2100] strand = "+" - + gene_ID, match_strand = talon.search_for_overlap_with_gene(chrom, pos[0], pos[1], strand, cursor, run_info, - "temp_gene") + "temp_gene", + "temp_transcript") assert gene_ID == fetch_correct_ID("TG3", "gene", cursor) assert match_strand == "-" conn.close() def test_2_genes_same_strand(self): - """ Example where query overlaps two genes. Must choose the one with + """ Example where query overlaps two genes. 
Must choose the one with more overlap """ - + database = "scratch/toy.db" conn, cursor = get_db_cursor() - build = "toy_build" + build = "toy_build" init_refs.make_temp_novel_gene_table(cursor, "toy_build") + init_refs.make_temp_transcript_table(cursor, "toy_build") location_dict = init_refs.make_location_dict(build, cursor) run_info = talon.init_run_info(database, build) @@ -154,10 +166,9 @@ def test_2_genes_same_strand(self): pos[1], strand, cursor, run_info, - "temp_gene") + "temp_gene", + "temp_transcript") assert gene_ID == fetch_correct_ID("TG1", "gene", cursor) assert match_strand == "+" - conn.close() - - + conn.close() From a5fb07bcac13b5392fb1caaf65e22f8db001074b Mon Sep 17 00:00:00 2001 From: fairliereese Date: Mon, 9 Oct 2023 15:59:44 -0700 Subject: [PATCH 27/31] added sam version of test input file --- .../hl60_1_1_subset_remapped_sorted.sam | 213 ++++++++++++++++++ 1 file changed, 213 insertions(+) create mode 100644 testing_suite/input_files/readthrough/hl60_1_1_subset_remapped_sorted.sam diff --git a/testing_suite/input_files/readthrough/hl60_1_1_subset_remapped_sorted.sam b/testing_suite/input_files/readthrough/hl60_1_1_subset_remapped_sorted.sam new file mode 100644 index 0000000..47feae6 --- /dev/null +++ b/testing_suite/input_files/readthrough/hl60_1_1_subset_remapped_sorted.sam @@ -0,0 +1,213 @@ +@HD VN:1.6 SO:coordinate +@SQ SN:chr1 LN:248956422 +@SQ SN:chr2 LN:242193529 +@SQ SN:chr3 LN:198295559 +@SQ SN:chr4 LN:190214555 +@SQ SN:chr5 LN:181538259 +@SQ SN:chr6 LN:170805979 +@SQ SN:chr7 LN:159345973 +@SQ SN:chr8 LN:145138636 +@SQ SN:chr9 LN:138394717 +@SQ SN:chr10 LN:133797422 +@SQ SN:chr11 LN:135086622 +@SQ SN:chr12 LN:133275309 +@SQ SN:chr13 LN:114364328 +@SQ SN:chr14 LN:107043718 +@SQ SN:chr15 LN:101991189 +@SQ SN:chr16 LN:90338345 +@SQ SN:chr17 LN:83257441 +@SQ SN:chr18 LN:80373285 +@SQ SN:chr19 LN:58617616 +@SQ SN:chr20 LN:64444167 +@SQ SN:chr21 LN:46709983 +@SQ SN:chr22 LN:50818468 +@SQ SN:chrX LN:156040895 +@SQ SN:chrY LN:57227415 +@SQ SN:chrM LN:16569 +@SQ SN:chr1_KI270706v1_random LN:175055 +@SQ SN:chr1_KI270707v1_random LN:32032 +@SQ SN:chr1_KI270708v1_random LN:127682 +@SQ SN:chr1_KI270709v1_random LN:66860 +@SQ SN:chr1_KI270710v1_random LN:40176 +@SQ SN:chr1_KI270711v1_random LN:42210 +@SQ SN:chr1_KI270712v1_random LN:176043 +@SQ SN:chr1_KI270713v1_random LN:40745 +@SQ SN:chr1_KI270714v1_random LN:41717 +@SQ SN:chr2_KI270715v1_random LN:161471 +@SQ SN:chr2_KI270716v1_random LN:153799 +@SQ SN:chr3_GL000221v1_random LN:155397 +@SQ SN:chr4_GL000008v2_random LN:209709 +@SQ SN:chr5_GL000208v1_random LN:92689 +@SQ SN:chr9_KI270717v1_random LN:40062 +@SQ SN:chr9_KI270718v1_random LN:38054 +@SQ SN:chr9_KI270719v1_random LN:176845 +@SQ SN:chr9_KI270720v1_random LN:39050 +@SQ SN:chr11_KI270721v1_random LN:100316 +@SQ SN:chr14_GL000009v2_random LN:201709 +@SQ SN:chr14_GL000225v1_random LN:211173 +@SQ SN:chr14_KI270722v1_random LN:194050 +@SQ SN:chr14_GL000194v1_random LN:191469 +@SQ SN:chr14_KI270723v1_random LN:38115 +@SQ SN:chr14_KI270724v1_random LN:39555 +@SQ SN:chr14_KI270725v1_random LN:172810 +@SQ SN:chr14_KI270726v1_random LN:43739 +@SQ SN:chr15_KI270727v1_random LN:448248 +@SQ SN:chr16_KI270728v1_random LN:1872759 +@SQ SN:chr17_GL000205v2_random LN:185591 +@SQ SN:chr17_KI270729v1_random LN:280839 +@SQ SN:chr17_KI270730v1_random LN:112551 +@SQ SN:chr22_KI270731v1_random LN:150754 +@SQ SN:chr22_KI270732v1_random LN:41543 +@SQ SN:chr22_KI270733v1_random LN:179772 +@SQ SN:chr22_KI270734v1_random LN:165050 +@SQ SN:chr22_KI270735v1_random LN:42811 +@SQ 
SN:chr22_KI270736v1_random LN:181920 +@SQ SN:chr22_KI270737v1_random LN:103838 +@SQ SN:chr22_KI270738v1_random LN:99375 +@SQ SN:chr22_KI270739v1_random LN:73985 +@SQ SN:chrY_KI270740v1_random LN:37240 +@SQ SN:chrUn_KI270302v1 LN:2274 +@SQ SN:chrUn_KI270304v1 LN:2165 +@SQ SN:chrUn_KI270303v1 LN:1942 +@SQ SN:chrUn_KI270305v1 LN:1472 +@SQ SN:chrUn_KI270322v1 LN:21476 +@SQ SN:chrUn_KI270320v1 LN:4416 +@SQ SN:chrUn_KI270310v1 LN:1201 +@SQ SN:chrUn_KI270316v1 LN:1444 +@SQ SN:chrUn_KI270315v1 LN:2276 +@SQ SN:chrUn_KI270312v1 LN:998 +@SQ SN:chrUn_KI270311v1 LN:12399 +@SQ SN:chrUn_KI270317v1 LN:37690 +@SQ SN:chrUn_KI270412v1 LN:1179 +@SQ SN:chrUn_KI270411v1 LN:2646 +@SQ SN:chrUn_KI270414v1 LN:2489 +@SQ SN:chrUn_KI270419v1 LN:1029 +@SQ SN:chrUn_KI270418v1 LN:2145 +@SQ SN:chrUn_KI270420v1 LN:2321 +@SQ SN:chrUn_KI270424v1 LN:2140 +@SQ SN:chrUn_KI270417v1 LN:2043 +@SQ SN:chrUn_KI270422v1 LN:1445 +@SQ SN:chrUn_KI270423v1 LN:981 +@SQ SN:chrUn_KI270425v1 LN:1884 +@SQ SN:chrUn_KI270429v1 LN:1361 +@SQ SN:chrUn_KI270442v1 LN:392061 +@SQ SN:chrUn_KI270466v1 LN:1233 +@SQ SN:chrUn_KI270465v1 LN:1774 +@SQ SN:chrUn_KI270467v1 LN:3920 +@SQ SN:chrUn_KI270435v1 LN:92983 +@SQ SN:chrUn_KI270438v1 LN:112505 +@SQ SN:chrUn_KI270468v1 LN:4055 +@SQ SN:chrUn_KI270510v1 LN:2415 +@SQ SN:chrUn_KI270509v1 LN:2318 +@SQ SN:chrUn_KI270518v1 LN:2186 +@SQ SN:chrUn_KI270508v1 LN:1951 +@SQ SN:chrUn_KI270516v1 LN:1300 +@SQ SN:chrUn_KI270512v1 LN:22689 +@SQ SN:chrUn_KI270519v1 LN:138126 +@SQ SN:chrUn_KI270522v1 LN:5674 +@SQ SN:chrUn_KI270511v1 LN:8127 +@SQ SN:chrUn_KI270515v1 LN:6361 +@SQ SN:chrUn_KI270507v1 LN:5353 +@SQ SN:chrUn_KI270517v1 LN:3253 +@SQ SN:chrUn_KI270529v1 LN:1899 +@SQ SN:chrUn_KI270528v1 LN:2983 +@SQ SN:chrUn_KI270530v1 LN:2168 +@SQ SN:chrUn_KI270539v1 LN:993 +@SQ SN:chrUn_KI270538v1 LN:91309 +@SQ SN:chrUn_KI270544v1 LN:1202 +@SQ SN:chrUn_KI270548v1 LN:1599 +@SQ SN:chrUn_KI270583v1 LN:1400 +@SQ SN:chrUn_KI270587v1 LN:2969 +@SQ SN:chrUn_KI270580v1 LN:1553 +@SQ SN:chrUn_KI270581v1 LN:7046 +@SQ SN:chrUn_KI270579v1 LN:31033 +@SQ SN:chrUn_KI270589v1 LN:44474 +@SQ SN:chrUn_KI270590v1 LN:4685 +@SQ SN:chrUn_KI270584v1 LN:4513 +@SQ SN:chrUn_KI270582v1 LN:6504 +@SQ SN:chrUn_KI270588v1 LN:6158 +@SQ SN:chrUn_KI270593v1 LN:3041 +@SQ SN:chrUn_KI270591v1 LN:5796 +@SQ SN:chrUn_KI270330v1 LN:1652 +@SQ SN:chrUn_KI270329v1 LN:1040 +@SQ SN:chrUn_KI270334v1 LN:1368 +@SQ SN:chrUn_KI270333v1 LN:2699 +@SQ SN:chrUn_KI270335v1 LN:1048 +@SQ SN:chrUn_KI270338v1 LN:1428 +@SQ SN:chrUn_KI270340v1 LN:1428 +@SQ SN:chrUn_KI270336v1 LN:1026 +@SQ SN:chrUn_KI270337v1 LN:1121 +@SQ SN:chrUn_KI270363v1 LN:1803 +@SQ SN:chrUn_KI270364v1 LN:2855 +@SQ SN:chrUn_KI270362v1 LN:3530 +@SQ SN:chrUn_KI270366v1 LN:8320 +@SQ SN:chrUn_KI270378v1 LN:1048 +@SQ SN:chrUn_KI270379v1 LN:1045 +@SQ SN:chrUn_KI270389v1 LN:1298 +@SQ SN:chrUn_KI270390v1 LN:2387 +@SQ SN:chrUn_KI270387v1 LN:1537 +@SQ SN:chrUn_KI270395v1 LN:1143 +@SQ SN:chrUn_KI270396v1 LN:1880 +@SQ SN:chrUn_KI270388v1 LN:1216 +@SQ SN:chrUn_KI270394v1 LN:970 +@SQ SN:chrUn_KI270386v1 LN:1788 +@SQ SN:chrUn_KI270391v1 LN:1484 +@SQ SN:chrUn_KI270383v1 LN:1750 +@SQ SN:chrUn_KI270393v1 LN:1308 +@SQ SN:chrUn_KI270384v1 LN:1658 +@SQ SN:chrUn_KI270392v1 LN:971 +@SQ SN:chrUn_KI270381v1 LN:1930 +@SQ SN:chrUn_KI270385v1 LN:990 +@SQ SN:chrUn_KI270382v1 LN:4215 +@SQ SN:chrUn_KI270376v1 LN:1136 +@SQ SN:chrUn_KI270374v1 LN:2656 +@SQ SN:chrUn_KI270372v1 LN:1650 +@SQ SN:chrUn_KI270373v1 LN:1451 +@SQ SN:chrUn_KI270375v1 LN:2378 +@SQ SN:chrUn_KI270371v1 LN:2805 +@SQ SN:chrUn_KI270448v1 LN:7992 +@SQ SN:chrUn_KI270521v1 LN:7642 +@SQ 
SN:chrUn_GL000195v1 LN:182896 +@SQ SN:chrUn_GL000219v1 LN:179198 +@SQ SN:chrUn_GL000220v1 LN:161802 +@SQ SN:chrUn_GL000224v1 LN:179693 +@SQ SN:chrUn_KI270741v1 LN:157432 +@SQ SN:chrUn_GL000226v1 LN:15008 +@SQ SN:chrUn_GL000213v1 LN:164239 +@SQ SN:chrUn_KI270743v1 LN:210658 +@SQ SN:chrUn_KI270744v1 LN:168472 +@SQ SN:chrUn_KI270745v1 LN:41891 +@SQ SN:chrUn_KI270746v1 LN:66486 +@SQ SN:chrUn_KI270747v1 LN:198735 +@SQ SN:chrUn_KI270748v1 LN:93321 +@SQ SN:chrUn_KI270749v1 LN:158759 +@SQ SN:chrUn_KI270750v1 LN:148850 +@SQ SN:chrUn_KI270751v1 LN:150742 +@SQ SN:chrUn_KI270752v1 LN:27745 +@SQ SN:chrUn_KI270753v1 LN:62944 +@SQ SN:chrUn_KI270754v1 LN:40191 +@SQ SN:chrUn_KI270755v1 LN:36723 +@SQ SN:chrUn_KI270756v1 LN:79590 +@SQ SN:chrUn_KI270757v1 LN:71251 +@SQ SN:chrUn_GL000214v1 LN:137718 +@SQ SN:chrUn_KI270742v1 LN:186739 +@SQ SN:chrUn_GL000216v2 LN:176608 +@SQ SN:chrUn_GL000218v1 LN:161147 +@SQ SN:chrEBV LN:171823 +@PG ID:minimap2 PN:minimap2 VN:2.24-r1122 CL:minimap2 -t 1 -ax splice -uf --secondary=no -C5 /data/homezvol1/freese/mortazavi_lab/ref/hg38/hg38.fa hl60_1_1_subset.fastq +@PG ID:samtools PN:samtools PP:minimap2 VN:1.15.1 CL:samtools calmd -b hl60_1_1_subset_remapped.sam /data/homezvol1/freese/mortazavi_lab/ref/hg38/hg38.fa +@PG ID:samtools.1 PN:samtools PP:samtools VN:1.15.1 CL:samtools sort hl60_1_1_subset_remapped.bam +@PG ID:samtools.2 PN:samtools PP:samtools.1 VN:1.15.1 CL:samtools view -h readthrough/hl60_1_1_subset_remapped_sorted.bam +cenps_cort_fsm 0 chr1 10430430 60 139M3273N124M691N34M5656N67M1851N177M8935N282M * 0 0 CTCGCGCCGCGGCGGGAAAATCCGACCTGGCCGCGCACCACCGCCCCTTCTCGGCCCTCCTGCGTTTGCCCAGGGTCGGCCCGCAGTGATGGAGGAGGAGGCGGAGACCGAGGAGCAGCAGCGATTCTCTTACCAACAGAGGCTAAAGGCAGCAGTTCACTATACTGTGGGTTGTCTTTGCGAGGAAGTTGCATTGGACAAAGAGATGCAGTTCAGCAAACAGACCATTGCGGCCATTTCGGAGCTGACTTTCCGACAGTGTGAAAATTTTGCCAAAGACCTTGAAATGTTTGCAAGACATGCGAAAAGAACCACAATTAACACTGAAGATGTGAAGCTCTTAGCCAGGAGGAGTAATTCACTGCTAAAATACATCACAGACAAAAGTGAAGAGATTGCTCAGATTAACCTAGAACGAAAAGCACAGAAGAAAAAGAAGTCAGAGGATGGAAGCAAAAATTCAAGGCAGCCAGCAGAGGCTGGAGTGGTGGAAAGTGAGAATTAAAGTCCCTCGCCGCTTGGAAAGTGCAGCCTTCTACAGCATATGCAGGAAGCGGCAGGAATAAGGAAAAGCAGCCTCCTGACTTTCCTCGCTTGGTGGTTTGAGTGGACCTCCCAGGCCAGTGCCGGGCCCCTCATAGGAGAGGAAGCCCGGGAGGTGGCCAGGCGGCAGGAAGGCGCACCCCCCCAGCAATCCGCGCGCCGGGACAGAATGCCCTGCAGGAACTTCTTCTGGAAGACCTTCTCCTCCTGCAAATAAAACCTCACCCATGAATGCTCACGCAAGTGTAATGACAGACCTGAATAAAATGTATTAAGCAGC 5555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555 NM:i:1 ms:i:820 AS:i:658 nn:i:0 ts:A:+ tp:A:P cm:i:249 s1:i:759 s2:i:159 de:f:0.0012 rl:i:0 MD:Z:696T126 +cenps_cort_nnc 0 chr1 10430430 60 149M2I5M2I1M3257N124M691N34M5656N67M1851N177M8935N282M * 0 0 
CTCGCGCCGCGGCGGGAAAATCCGACCTGGCCGCGCACCACCGCCCCTTCTCGGCCCTCCTGCGTTTGCCCAGGGTCGGCCCGCAGTGATGGAGGAGGAGGCGGAGACCGAGGAGCAGCAGCGATTCTCTTACCAACAGGTACAGGAAAGTACAGGAAGAGGCTAAAGGCAGCAGTTCACTATACTGTGGGTTGTCTTTGCGAGGAAGTTGCATTGGACAAAGAGATGCAGTTCAGCAAACAGACCATTGCGGCCATTTCGGAGCTGACTTTCCGACAGTGTGAAAATTTTGCCAAAGACCTTGAAATGTTTGCAAGACATGCGAAAAGAACCACAATTAACACTGAAGATGTGAAGCTCTTAGCCAGGAGGAGTAATTCACTGCTAAAATACATCACAGACAAAAGTGAAGAGATTGCTCAGATTAACCTAGAACGAAAAGCACAGAAGAAAAAGAAGTCAGAGGATGGAAGCAAAAATTCAAGGCAGCCAGCAGAGGCTGGAGTGGTGGAAAGTGAGAATTAAAGTCCCTCGCCGCTTGGAAAGTGCAGCCTTCTACAGCATATGCAGGAAGCGGCAGGAATAAGGAAAAGCAGCCTCCTGACTTTCCTCGCTTGGTGGTTTGAGTGGACCTCCCAGGCCAGTGCCGGGCCCCTCATAGGAGAGGAAGCCCGGGAGGTGGCCAGGCGGCAGGAAGGCGCACCCCCCCAGCAATCCGCGCGCCGGGACAGAATGCCCTGCAGGAACTTCTTCTGGAAGACCTTCTCCTCCTGCAAATAAAACCTCACCCATGAATGCTCACGCAAGTGTAATGACAGACCTGAATAAAATGTATTAAGCAGC 555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555 NM:i:6 ms:i:826 AS:i:661 nn:i:0 ts:A:+ tp:A:P cm:i:250 s1:i:766 s2:i:159 de:f:0.0048 rl:i:0 MD:Z:151G560T126 +cenps_cort_nic 0 chr1 10430430 60 139M4088N34M5656N67M1851N177M8935N282M * 0 0 CTCGCGCCGCGGCGGGAAAATCCGACCTGGCCGCGCACCACCGCCCCTTCTCGGCCCTCCTGCGTTTGCCCAGGGTCGGCCCGCAGTGATGGAGGAGGAGGCGGAGACCGAGGAGCAGCAGCGATTCTCTTACCAACAGAAAATTTTGCCAAAGACCTTGAAATGTTTGCAAGACATGCGAAAAGAACCACAATTAACACTGAAGATGTGAAGCTCTTAGCCAGGAGGAGTAATTCACTGCTAAAATACATCACAGACAAAAGTGAAGAGATTGCTCAGATTAACCTAGAACGAAAAGCACAGAAGAAAAAGAAGTCAGAGGATGGAAGCAAAAATTCAAGGCAGCCAGCAGAGGCTGGAGTGGTGGAAAGTGAGAATTAAAGTCCCTCGCCGCTTGGAAAGTGCAGCCTTCTACAGCATATGCAGGAAGCGGCAGGAATAAGGAAAAGCAGCCTCCTGACTTTCCTCGCTTGGTGGTTTGAGTGGACCTCCCAGGCCAGTGCCGGGCCCCTCATAGGAGAGGAAGCCCGGGAGGTGGCCAGGCGGCAGGAAGGCGCACCCCCCCAGCAATCCGCGCGCCGGGACAGAATGCCCTGCAGGAACTTCTTCTGGAAGACCTTCTCCTCCTGCAAATAAAACCTCACCCATGAATGCTCACGCAAGTGTAATGACAGACCTGAATAAAATGTATTAAGCAGC 555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555 NM:i:1 ms:i:696 AS:i:566 nn:i:0 ts:A:+ tp:A:P cm:i:209 s1:i:644 s2:i:99 de:f:0.0014 rl:i:0 MD:Z:572T126 +cenps_fsm 0 chr1 10430430 60 139M3273N124M691N34M5656N67M1851N546M * 0 0 
CTCGCGCCGCGGCGGGAAAATCCGACCTGGCCGCGCACCACCGCCCCTTCTCGGCCCTCCTGCGTTTGCCCAGGGTCGGCCCGCAGTGATGGAGGAGGAGGCGGAGACCGAGGAGCAGCAGCGATTCTCTTACCAACAGAGGCTAAAGGCAGCAGTTCACTATACTGTGGGTTGTCTTTGCGAGGAAGTTGCATTGGACAAAGAGATGCAGTTCAGCAAACAGACCATTGCGGCCATTTCGGAGCTGACTTTCCGACAGTGTGAAAATTTTGCCAAAGACCTTGAAATGTTTGCAAGACATGCGAAAAGAACCACAATTAACACTGAAGATGTGAAGCTCTTAGCCAGGAGGAGTAATTCACTGCTAAAATACATCACAGACAAAAGTGAAGAGATTGCTCAGATTAACCTAGAACGAAAAGCACAGAAGAAAAAGAAGTCAGAGGATGGAAGCAAAAATTCAAGGCAGCCAGCAGAGGCTGGAGTGGTGGAAAGTGAGAATTAAAGTCCCTCGCCGCTTGGAAAGTGCAGCCTTCTACAGGTAGAGCCACCTAGAAATGCATATGGCTGCAAAGGAAACTTTGAAGGGTTAAATAGAGATTTAAAAAAATAAAATAAAAAGGCTGGGCTAGGGTGCTTTTTGTGCTGAATTCTCCACATTGTTAACTGCCAAAGCTAGTTTTAGAGAATGAGAAAGTCTTAAGCAAAATACTCCCAGGTCTCACTCCAGAACATAAAAATGGTGTGTGATCGAATGGTATATATTAGAAATTACATCTGTTGTAATTAAAATTGTGTGAGCAATTAAACATGGTTGACTTTTTCAAGCAAAAATCAGTTCATCTTTTGATGTAATTTTCTAGGCTAAATGGCAATCTCTGAAAGATGAATAAAGCTATATTTATTTAGC 5555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555 NM:i:1 ms:i:907 AS:i:779 nn:i:0 ts:A:+ tp:A:P cm:i:280 s1:i:857 s2:i:174 de:f:0.0011 rl:i:20 MD:Z:752A157 +cenps_nic 0 chr1 10430430 60 139M307N526M2440N124M691N34M5656N67M1851N225M1S * 0 0 CTCGCGCCGCGGCGGGAAAATCCGACCTGGCCGCGCACCACCGCCCCTTCTCGGCCCTCCTGCGTTTGCCCAGGGTCGGCCCGCAGTGATGGAGGAGGAGGCGGAGACCGAGGAGCAGCAGCGATTCTCTTACCAACAGATGGGGTTTTCGTGAGGGTACAACGTCGGCATTAGACATTCCAGGTGACGCCCGTACGCGGTGGGCGGTTCGGGCCGGAGCTCTGGAACGCTGGCCCTGGAGGCGTCGACCCCTCGTTACTGATGCAGGGACGCGGTGCGGACCAGTCAGGCCCAGAGCTCGTCCTTAGATGTGGGTTCGAATCTCTGCCCCGCCAACTTGTGATCGTATCGACTCGGCCCAGACGCAATTTTCTTCTCTGCAAAATCGTCATAAGAATAATCACTTGTCAGGGTAGCTGCGGGCATCCCATTCGTTCCTTTCATCAGCGCCGGGCATATGGGGCGTCAGAGGCTGAGAACGTTGCCGTGAAGAGGCTTAAAAGCAAGACCCGGAGTGGCGACCTTAAAGAGGACGGACTGAAGAAACGCGGGAATGAGCTCCAGACGCGGGAGTTTCCTCTCTACAAAGTTACACTGCAGCAGCTGTCTACCCTGCCCCTTGTCTTTTGAGAAGTTCAAACCTTCAGAAAAGTTGCAAGAACACGAGGCTAAAGGCAGCAGTTCACTATACTGTGGGTTGTCTTTGCGAGGAAGTTGCATTGGACAAAGAGATGCAGTTCAGCAAACAGACCATTGCGGCCATTTCGGAGCTGACTTTCCGACAGTGTGAAAATTTTGCCAAAGACCTTGAAATGTTTGCAAGACATGCGAAAAGAACCACAATTAACACTGAAGATGTGAAGCTCTTAGCCAGGAGGAGTAATTCACTGCTAAAATACATCACAGACAAAAGTGAAGAGATTGCTCAGATTAACCTAGAACGAAAAGCACAGAAGAAAAAGAAGTCAGAGGATGGAAGCAAAAATTCAAGGCAGCCAGCAGAGGCTGGAGTGGTGGAAAGTGAGAATTAAAGTCCCTCGCCGCTTGGAAAGTGCAGCCTTCTACAGGTAGAGCCACCTAGAAATGCATATGGCTGCAAAGGAAACTTTGAAGGGC 
555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555 NM:i:0 ms:i:1115 AS:i:955 nn:i:0 ts:A:+ tp:A:P cm:i:351 s1:i:1057 s2:i:129 de:f:0 rl:i:0 MD:Z:1115 +cenps_nnc 0 chr1 10430430 60 150M296N526M2440N124M691N34M5656N67M1851N225M1S * 0 0 CTCGCGCCGCGGCGGGAAAATCCGACCTGGCCGCGCACCACCGCCCCTTCTCGGCCCTCCTGCGTTTGCCCAGGGTCGGCCCGCAGTGATGGAGGAGGAGGCGGAGACCGAGGAGCAGCAGCGATTCTCTTACCAACAGGTACAGGAAAAATGGGGTTTTCGTGAGGGTACAACGTCGGCATTAGACATTCCAGGTGACGCCCGTACGCGGTGGGCGGTTCGGGCCGGAGCTCTGGAACGCTGGCCCTGGAGGCGTCGACCCCTCGTTACTGATGCAGGGACGCGGTGCGGACCAGTCAGGCCCAGAGCTCGTCCTTAGATGTGGGTTCGAATCTCTGCCCCGCCAACTTGTGATCGTATCGACTCGGCCCAGACGCAATTTTCTTCTCTGCAAAATCGTCATAAGAATAATCACTTGTCAGGGTAGCTGCGGGCATCCCATTCGTTCCTTTCATCAGCGCCGGGCATATGGGGCGTCAGAGGCTGAGAACGTTGCCGTGAAGAGGCTTAAAAGCAAGACCCGGAGTGGCGACCTTAAAGAGGACGGACTGAAGAAACGCGGGAATGAGCTCCAGACGCGGGAGTTTCCTCTCTACAAAGTTACACTGCAGCAGCTGTCTACCCTGCCCCTTGTCTTTTGAGAAGTTCAAACCTTCAGAAAAGTTGCAAGAACACGAGGCTAAAGGCAGCAGTTCACTATACTGTGGGTTGTCTTTGCGAGGAAGTTGCATTGGACAAAGAGATGCAGTTCAGCAAACAGACCATTGCGGCCATTTCGGAGCTGACTTTCCGACAGTGTGAAAATTTTGCCAAAGACCTTGAAATGTTTGCAAGACATGCGAAAAGAACCACAATTAACACTGAAGATGTGAAGCTCTTAGCCAGGAGGAGTAATTCACTGCTAAAATACATCACAGACAAAAGTGAAGAGATTGCTCAGATTAACCTAGAACGAAAAGCACAGAAGAAAAAGAAGTCAGAGGATGGAAGCAAAAATTCAAGGCAGCCAGCAGAGGCTGGAGTGGTGGAAAGTGAGAATTAAAGTCCCTCGCCGCTTGGAAAGTGCAGCCTTCTACAGGTAGAGCCACCTAGAAATGCATATGGCTGCAAAGGAAACTTTGAAGGGC 
55555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555 NM:i:0 ms:i:1126 AS:i:961 nn:i:0 ts:A:+ tp:A:P cm:i:352 s1:i:1063 s2:i:129 de:f:0 rl:i:0 MD:Z:1126 +cenps_cort_ism 0 chr1 10440345 60 12S69M1851N177M8935N282M * 0 0 GAAATGTTTGCAAGACATGCGAAAAGAACCACAATTAACACTGAAGATGTGAAGCTCTTAGCCAGGAGGAGTAATTCACTGCTAAAATACATCACAGACAAAAGTGAAGAGATTGCTCAGATTAACCTAGAACGAAAAGCACAGAAGAAAAAGAAGTCAGAGGATGGAAGCAAAAATTCAAGGCAGCCAGCAGAGGCTGGAGTGGTGGAAAGTGAGAATTAAAGTCCCTCGCCGCTTGGAAAGTGCAGCCTTCTACAGCATATGCAGGAAGCGGCAGGAATAAGGAAAAGCAGCCTCCTGACTTTCCTCGCTTGGTGGTTTGAGTGGACCTCCCAGGCCAGTGCCGGGCCCCTCATAGGAGAGGAAGCCCGGGAGGTGGCCAGGCGGCAGGAAGGCGCACCCCCCCAGCAATCCGCGCGCCGGGACAGAATGCCCTGCAGGAACTTCTTCTGGAAGACCTTCTCCTCCTGCAAATAAAACCTCACCCATGAATGCTCACGCAAGTGTAATGACAGACCTGAATAAAATGTATTAAGCAGC 555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555 NM:i:1 ms:i:525 AS:i:459 nn:i:0 ts:A:+ tp:A:P cm:i:160 s1:i:498 s2:i:74 de:f:0.0019 rl:i:0 MD:Z:401T126 +eloa_rpl11_fsm_1 0 chr1 23691792 60 38M779N151M1047N107M746N132M54229N57M764N107M896N1186M376N112M1581N156M107N98M1382N181M250N112M567N173M2386N393M2S * 0 0 
GGAAGCTCCGCTTTCTCTTCCTGCTCTCCATCATGGCGCAGGATCAAGGTGAAAAGGAGAACCCCATGCGGGAACTTCGCATCCGCAAACTCTGTCTCAACATCTGTGTTGGGGAGAGTGGAGACAGACTGACGCGAGCAGCCAAGGTGTTGGAGCAGCTCACAGGGCAGACCCCTGTGTTTTCCAAAGCTAGATACACTGTCAGATCCTTTGGCATCCGGAGAAATGAAAAGATTGCTGTCCACTGCACAGTTCGAGGGGCCAAGGCAGAAGAAATCTTGGAGAAGGGTCTAAAGGTGCGGGAGTATGAGTTAAGAAAAAACAACTTCTCAGATACTGGAAACTTTGGTTTTGGGATCCAGGAACACATCGATCTGGGTATCAAATATGACCCAAGCATTGGTATCTACGGCCTGGACTTCTATGTGCTATTGAAATATTTGAAGAAACTCTCCACCCTGCCTATTACAGTAGACATTCTTGCGGAGACTGGGGTTGGGAAAACAGTAAATAGCTTGCGAAAACACGAGCATGTTGGAAGCTTTGCCAGGGACCTAGTGGCCCAGTGGAAGAAGCTGGTTCCTGTGGAACGAAATGCTGAGCCTGATGAACAGGACTTTGAGAAGAGCAATTCCCGAAAGCGCCCTCGGGATGCCCTGCAGAAGGAGGAGGAGATGGAGGGGGACTACCAAGAAACCTGGAAAGCCACGGGGAGCCGATCCTATAGCCCTGACCACAGGCAGAAGAAACATAGGAAACTCTCGGAGCTCGAGAGACCTCACAAAGTGTCTCACGGTCATGAGAGGAGAGATGAGAGAAAGAGGTGTCACAGAATGTCACCAACTTACTCTTCAGACCCTGAGTCTTCTGATTATGGCCATGTTCAATCCCCTCCATCTTGTACCAGTCCTCATCAGATGTACGTCGACCACTACAGATCCCTGGAGGAGGACCAGGAGCCCATTGTTTCACACCAGAAGCCTGGGAAAGGCCACAGCAATGCCTTTCAGGACAGACTCGGGGCCAGCCAAGAACGACACCTGGGTGAACCCCATGGGAAAGGGGTTGTGAGTCAAAACAAGGAGCACAAATCTTCCCACAAGGACAAACGCCCCGTGGATGCCAAGAGTGATGAGAAGGCCTCTGTGGTGAGCAGAGAGAAATCACACAAGGCCCTCTCCAAAGAGGAGAACCGAAGGCCACCCTCAGGGGACAATGCAAGGGAGAAACCGCCCTCTAGTGGCGTAAAGAAAGAGAAGGACAGAGAGGGCAGCAGCCTGAAGAAGAAGTGTTTGCCTCCCTCAGAGGCCGCTTCAGACAACCACCTGAAAAAGCCAAAGCACAGAGACCCAGAGAAAGCCAAATTGGACAAAAGCAAGCAAGGTCTGGACAGCTTTGACACAGGAAAAGGAGCAGGAGACCTGTTGCCCAAGGTAAAAGAGAAGGGTTCTAACAACCTAAAGACTCCAGAAGGGAAAGTCAAAACTAATTTGGATAGAAAGTCACTGGGCTCCCTCCCTAAAGTTGAGGAGACAGATATGGAGGATGAATTCGAGCAGCCAACCATGTCTTTTGAATCCTACCTCAGCTATGACCAGCCCCGGAAGAAAAAGAAAAAGATTGTGAAAACTTCAGCCACGGCACTTGGAGATAAAGGACTTAAAAAAAATGACTCTAAAAGCACTGGTAAAAACTTGGACTCAGTTCAGAAATTACCCAAGGTGAACAAAACCAAGTCAGAGAAGCCGGCTGGAGCTGATTTAGCCAAGCTGAGAAAGGTGCCTGATGTGTTGCCAGTGTTGCCAGACCTCCCGTTACCCGCGATACAGGCCAATTACCGTCCACTGCCTTCCCTCGAGCTGATATCCTCCTTCCAGCCAAAGCGAAAAGCGTTCTCTTCACCCCAGGAAGAAGAAGAAGCTGGATTTACTGGGCGCAGAATGAATTCCAAGATGCAGGTGTATTCTGGTTCCAAGTGTGCCTATCTCCCTAAAATGATGACCTTGCACCAGCAATGCATCCGAGTACTTAAAAACAACATCGATTCAATCTTTGAAGTGGGAGGAGTCCCATACTCTGTTCTTGAACCCGTTTTGGAGAGGTGTACACCTGATCAGCTGTATCGCATAGAGGAATACAATCATGTATTAATTGAAGAAACAGATCAATTATGGAAAGTTCATTGTCACCGAGACTTTAAGGAAGAAAGACCCGAAGAGTATGAGTCGTGGCGAGAGATGTACCTGCGGCTTCAGGACGCCCGAGAGCAGCGGCTACGAGTACTAACAAAGAATATCCAGTTCGCACATGCCAATAAGCCCAAAGGCCGACAAGCAAAGATGGCCTTTGTCAACTCTGTGGCCAAGCCACCTCGTGACGTCCGGAGGAGGCAGGAAAAGTTTGGAACGGGAGGAGCAGCTGTCCCTGAGAAAATCAAGATCAAGCCAGCCCCGTACCCCATGGGAAGCAGCCATGCTTCCGCCAGTAGTATCAGCTTTAACCCCAGCCCTGAGGAGCCGGCCTATGATGGCCCAAGCACCAGCAGTGCCCACTTGGCACCAGTGGTCAGCAGCACTGTTTCCTATGATCCTAGGAAACCCACTGTGAAGAAAATTGCCCCAATGATGGCCAAGACAATTAAAGCTTTCAAGAACAGATTCTCCCGACGATAAACTGAGGACTTGCCTTGGAAATGGAATCTGGGGAGGCAGGAATACAAGGACAGTGGGGGTTGGGGAATGGAATTCTACAGGAGACTGGAGTCTTGCTTTGTGGATCCTTTTGGTCTCCGAGTCCTGCAGTCTGCAGGTGCTGCCCCTGGGAACCTGCGTGCCACAGCCCCGCCTCCCTGCCTGGAGCACACTTTAGAATTCTGAAGATGTGAAGCCTCTGTCTCACTGAGGATTTTAAAGGTCAATTATACTTTTGTTGTTCATTAGCATCTTTGTAAACTATAAGACGTAGTTTTAATTAATAAATATTGCCCCCAGATTGTATTTATATGG 
55555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555 NM:i:1 ms:i:3000 AS:i:2584 nn:i:0 ts:A:+ tp:A:P cm:i:959 s1:i:2859 s2:i:188 de:f:0.0003 rl:i:30 MD:Z:2488C514 +eloa_rpl11_nic 0 chr1 23691792 60 38M1977N107M746N132M54229N57M764N107M896N1186M376N112M1581N156M107N98M1382N181M250N112M567N173M2386N393M2S * 0 0 
GGAAGCTCCGCTTTCTCTTCCTGCTCTCCATCATGGCGCTAGATACACTGTCAGATCCTTTGGCATCCGGAGAAATGAAAAGATTGCTGTCCACTGCACAGTTCGAGGGGCCAAGGCAGAAGAAATCTTGGAGAAGGGTCTAAAGGTGCGGGAGTATGAGTTAAGAAAAAACAACTTCTCAGATACTGGAAACTTTGGTTTTGGGATCCAGGAACACATCGATCTGGGTATCAAATATGACCCAAGCATTGGTATCTACGGCCTGGACTTCTATGTGCTATTGAAATATTTGAAGAAACTCTCCACCCTGCCTATTACAGTAGACATTCTTGCGGAGACTGGGGTTGGGAAAACAGTAAATAGCTTGCGAAAACACGAGCATGTTGGAAGCTTTGCCAGGGACCTAGTGGCCCAGTGGAAGAAGCTGGTTCCTGTGGAACGAAATGCTGAGCCTGATGAACAGGACTTTGAGAAGAGCAATTCCCGAAAGCGCCCTCGGGATGCCCTGCAGAAGGAGGAGGAGATGGAGGGGGACTACCAAGAAACCTGGAAAGCCACGGGGAGCCGATCCTATAGCCCTGACCACAGGCAGAAGAAACATAGGAAACTCTCGGAGCTCGAGAGACCTCACAAAGTGTCTCACGGTCATGAGAGGAGAGATGAGAGAAAGAGGTGTCACAGAATGTCACCAACTTACTCTTCAGACCCTGAGTCTTCTGATTATGGCCATGTTCAATCCCCTCCATCTTGTACCAGTCCTCATCAGATGTACGTCGACCACTACAGATCCCTGGAGGAGGACCAGGAGCCCATTGTTTCACACCAGAAGCCTGGGAAAGGCCACAGCAATGCCTTTCAGGACAGACTCGGGGCCAGCCAAGAACGACACCTGGGTGAACCCCATGGGAAAGGGGTTGTGAGTCAAAACAAGGAGCACAAATCTTCCCACAAGGACAAACGCCCCGTGGATGCCAAGAGTGATGAGAAGGCCTCTGTGGTGAGCAGAGAGAAATCACACAAGGCCCTCTCCAAAGAGGAGAACCGAAGGCCACCCTCAGGGGACAATGCAAGGGAGAAACCGCCCTCTAGTGGCGTAAAGAAAGAGAAGGACAGAGAGGGCAGCAGCCTGAAGAAGAAGTGTTTGCCTCCCTCAGAGGCCGCTTCAGACAACCACCTGAAAAAGCCAAAGCACAGAGACCCAGAGAAAGCCAAATTGGACAAAAGCAAGCAAGGTCTGGACAGCTTTGACACAGGAAAAGGAGCAGGAGACCTGTTGCCCAAGGTAAAAGAGAAGGGTTCTAACAACCTAAAGACTCCAGAAGGGAAAGTCAAAACTAATTTGGATAGAAAGTCACTGGGCTCCCTCCCTAAAGTTGAGGAGACAGATATGGAGGATGAATTCGAGCAGCCAACCATGTCTTTTGAATCCTACCTCAGCTATGACCAGCCCCGGAAGAAAAAGAAAAAGATTGTGAAAACTTCAGCCACGGCACTTGGAGATAAAGGACTTAAAAAAAATGACTCTAAAAGCACTGGTAAAAACTTGGACTCAGTTCAGAAATTACCCAAGGTGAACAAAACCAAGTCAGAGAAGCCGGCTGGAGCTGATTTAGCCAAGCTGAGAAAGGTGCCTGATGTGTTGCCAGTGTTGCCAGACCTCCCGTTACCCGCGATACAGGCCAATTACCGTCCACTGCCTTCCCTCGAGCTGATATCCTCCTTCCAGCCAAAGCGAAAAGCGTTCTCTTCACCCCAGGAAGAAGAAGAAGCTGGATTTACTGGGCGCAGAATGAATTCCAAGATGCAGGTGTATTCTGGTTCCAAGTGTGCCTATCTCCCTAAAATGATGACCTTGCACCAGCAATGCATCCGAGTACTTAAAAACAACATCGATTCAATCTTTGAAGTGGGAGGAGTCCCATACTCTGTTCTTGAACCCGTTTTGGAGAGGTGTACACCTGATCAGCTGTATCGCATAGAGGAATACAATCATGTATTAATTGAAGAAACAGATCAATTATGGAAAGTTCATTGTCACCGAGACTTTAAGGAAGAAAGACCCGAAGAGTATGAGTCGTGGCGAGAGATGTACCTGCGGCTTCAGGACGCCCGAGAGCAGCGGCTACGAGTACTAACAAAGAATATCCAGTTCGCACATGCCAATAAGCCCAAAGGCCGACAAGCAAAGATGGCCTTTGTCAACTCTGTGGCCAAGCCACCTCGTGACGTCCGGAGGAGGCAGGAAAAGTTTGGAACGGGAGGAGCAGCTGTCCCTGAGAAAATCAAGATCAAGCCAGCCCCGTACCCCATGGGAAGCAGCCATGCTTCCGCCAGTAGTATCAGCTTTAACCCCAGCCCTGAGGAGCCGGCCTATGATGGCCCAAGCACCAGCAGTGCCCACTTGGCACCAGTGGTCAGCAGCACTGTTTCCTATGATCCTAGGAAACCCACTGTGAAGAAAATTGCCCCAATGATGGCCAAGACAATTAAAGCTTTCAAGAACAGATTCTCCCGACGATAAACTGAGGACTTGCCTTGGAAATGGAATCTGGGGAGGCAGGAATACAAGGACAGTGGGGGTTGGGGAATGGAATTCTACAGGAGACTGGAGTCTTGCTTTGTGGATCCTTTTGGTCTCCGAGTCCTGCAGTCTGCAGGTGCTGCCCCTGGGAACCTGCGTGCCACAGCCCCGCCTCCCTGCCTGGAGCACACTTTAGAATTCTGAAGATGTGAAGCCTCTGTCTCACTGAGGATTTTAAAGGTCAATTATACTTTTGTTGTTCATTAGCATCTTTGTAAACTATAAGACGTAGTTTTAATTAATAAATATTGCCCCCAGATTGTATTTATATGG 
5555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555 NM:i:1 ms:i:2849 AS:i:2465 nn:i:0 ts:A:+ tp:A:P cm:i:912 s1:i:2717 s2:i:129 de:f:0.0004 rl:i:30 MD:Z:2337C514 +eloa_rpl11_fsm_2 0 chr1 23692590 60 5S170M1047N107M746N132M54229N57M764N107M896N1186M376N112M1581N156M107N98M1382N181M250N112M567N173M2386N392M2S * 0 0 
CTCTTCTGCTCTTCCCTGTTGCAGCAGGATCAAGGTGAAAAGGAGAACCCCATGCGGGAACTTCGCATCCGCAAACTCTGTCTCAACATCTGTGTTGGGGAGAGTGGAGACAGACTGACGCGAGCAGCCAAGGTGTTGGAGCAGCTCACAGGGCAGACCCCTGTGTTTTCCAAAGCTAGATACACTGTCAGATCCTTTGGCATCCGGAGAAATGAAAAGATTGCTGTCCACTGCACAGTTCGAGGGGCCAAGGCAGAAGAAATCTTGGAGAAGGGTCTAAAGGTGCGGGAGTATGAGTTAAGAAAAAACAACTTCTCAGATACTGGAAACTTTGGTTTTGGGATCCAGGAACACATCGATCTGGGTATCAAATATGACCCAAGCATTGGTATCTACGGCCTGGACTTCTATGTGCTATTGAAATATTTGAAGAAACTCTCCACCCTGCCTATTACAGTAGACATTCTTGCGGAGACTGGGGTTGGGAAAACAGTAAATAGCTTGCGAAAACACGAGCATGTTGGAAGCTTTGCCAGGGACCTAGTGGCCCAGTGGAAGAAGCTGGTTCCTGTGGAACGAAATGCTGAGCCTGATGAACAGGACTTTGAGAAGAGCAATTCCCGAAAGCGCCCTCGGGATGCCCTGCAGAAGGAGGAGGAGATGGAGGGGGACTACCAAGAAACCTGGAAAGCCACGGGGAGCCGATCCTATAGCCCTGACCACAGGCAGAAGAAACATAGGAAACTCTCGGAGCTCGAGAGACCTCACAAAGTGTCTCACGGTCATGAGAGGAGAGATGAGAGAAAGAGGTGTCACAGAATGTCACCAACTTACTCTTCAGACCCTGAGTCTTCTGATTATGGCCATGTTCAATCCCCTCCATCTTGTACCAGTCCTCATCAGATGTACGTCGACCACTACAGATCCCTGGAGGAGGACCAGGAGCCCATTGTTTCACACCAGAAGCCTGGGAAAGGCCACAGCAATGCCTTTCAGGACAGACTCGGGGCCAGCCAAGAACGACACCTGGGTGAACCCCATGGGAAAGGGGTTGTGAGTCAAAACAAGGAGCACAAATCTTCCCACAAGGACAAACGCCCCGTGGATGCCAAGAGTGATGAGAAGGCCTCTGTGGTGAGCAGAGAGAAATCACACAAGGCCCTCTCCAAAGAGGAGAACCGAAGGCCACCCTCAGGGGACAATGCAAGGGAGAAACCGCCCTCTAGTGGCGTAAAGAAAGAGAAGGACAGAGAGGGCAGCAGCCTGAAGAAGAAGTGTTTGCCTCCCTCAGAGGCCGCTTCAGACAACCACCTGAAAAAGCCAAAGCACAGAGACCCAGAGAAAGCCAAATTGGACAAAAGCAAGCAAGGTCTGGACAGCTTTGACACAGGAAAAGGAGCAGGAGACCTGTTGCCCAAGGTAAAAGAGAAGGGTTCTAACAACCTAAAGACTCCAGAAGGGAAAGTCAAAACTAATTTGGATAGAAAGTCACTGGGCTCCCTCCCTAAAGTTGAGGAGACAGATATGGAGGATGAATTCGAGCAGCCAACCATGTCTTTTGAATCCTACCTCAGCTATGACCAGCCCCGGAAGAAAAAGAAAAAGATTGTGAAAACTTCAGCCACGGCACTTGGAGATAAAGGACTTAAAAAAAATGACTCTAAAAGCACTGGTAAAAACTTGGACTCAGTTCAGAAATTACCCAAGGTGAACAAAACCAAGTCAGAGAAGCCGGCTGGAGCTGATTTAGCCAAGCTGAGAAAGGTGCCTGATGTGTTGCCAGTGTTGCCAGACCTCCCGTTACCCGCGATACAGGCCAATTACCGTCCACTGCCTTCCCTCGAGCTGATATCCTCCTTCCAGCCAAAGCGAAAAGCGTTCTCTTCACCCCAGGAAGAAGAAGAAGCTGGATTTACTGGGCGCAGAATGAATTCCAAGATGCAGGTGTATTCTGGTTCCAAGTGTGCCTATCTCCCTAAAATGATGACCTTGCACCAGCAATGCATCCGAGTACTTAAAAACAACATCGATTCAATCTTTGAAGTGGGAGGAGTCCCATACTCTGTTCTTGAACCCGTTTTGGAGAGGTGTACACCTGATCAGCTGTATCGCATAGAGGAATACAATCATGTATTAATTGAAGAAACAGATCAATTATGGAAAGTTCATTGTCACCGAGACTTTAAGGAAGAAAGACCCGAAGAGTATGAGTCGTGGCGAGAGATGTACCTGCGGCTTCAGGACGCCCGAGAGCAGCGGCTACGAGTACTAACAAAGAATATCCAGTTCGCACATGCCAATAAGCCCAAAGGCCGACAAGCAAAGATGGCCTTTGTCAACTCTGTGGCCAAGCCACCTCGTGACGTCCGGAGGAGGCAGGAAAAGTTTGGAACGGGAGGAGCAGCTGTCCCTGAGAAAATCAAGATCAAGCCAGCCCCGTACCCCATGGGAAGCAGCCATGCTTCCGCCAGTAGTATCAGCTTTAACCCCAGCCCTGAGGAGCCGGCCTATGATGGCCCAAGCACCAGCAGTGCCCACTTGGCACCAGTGGTCAGCAGCACTGTTTCCTATGATCCTAGGAAACCCACTGTGAAGAAAATTGCCCCAATGATGGCCAAGACAATTAAAGCTTTCAAGAACAGATTCTCCCGACGATAAACTGAGGACTTGCCTTGGAAATGGAATCTGGGGAGGCAGGAATACAAGGACAGTGGGGGTTGGGGAATGGAATTCTACAGGAGACTGGAGTCTTGCTTTGTGGATCCTTTTGGTCTCCGAGTCCTGCAGTCTGCAGGTGCTGCCCCTGGGAACCTGCGTGCCACAGCCCCGCCTCCCTGCCTGGAGCACACTTTAGAATTCTGAAGATGTGAAGCCTCTGTCTCACTGAGGATTTTAAAGGTCAATTATACTTTTGTTGTTCATTAGCATCTTTGTAAACTATAAGACGTAGTTTTAATTAATAAATATTGCCCCCAGATTGTATTTATAGC 
55555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555 NM:i:1 ms:i:2980 AS:i:2596 nn:i:0 ts:A:+ tp:A:P cm:i:959 s1:i:2851 s2:i:188 de:f:0.0003 rl:i:30 MD:Z:2469C513 +eloa_rpl11_nnc 0 chr1 23692590 60 5S184M1033N107M746N132M54229N57M764N107M896N1186M376N112M1581N156M107N98M1382N181M250N112M567N173M2386N392M2S * 0 0 
CTCTTCTGCTCTTCCCTGTTGCAGCAGGATCAAGGTGAAAAGGAGAACCCCATGCGGGAACTTCGCATCCGCAAACTCTGTCTCAACATCTGTGTTGGGGAGAGTGGAGACAGACTGACGCGAGCAGCCAAGGTGTTGGAGCAGCTCACAGGGCAGACCCCTGTGTTTTCCAAAGGTGAGTAGTCACAACTAGATACACTGTCAGATCCTTTGGCATCCGGAGAAATGAAAAGATTGCTGTCCACTGCACAGTTCGAGGGGCCAAGGCAGAAGAAATCTTGGAGAAGGGTCTAAAGGTGCGGGAGTATGAGTTAAGAAAAAACAACTTCTCAGATACTGGAAACTTTGGTTTTGGGATCCAGGAACACATCGATCTGGGTATCAAATATGACCCAAGCATTGGTATCTACGGCCTGGACTTCTATGTGCTATTGAAATATTTGAAGAAACTCTCCACCCTGCCTATTACAGTAGACATTCTTGCGGAGACTGGGGTTGGGAAAACAGTAAATAGCTTGCGAAAACACGAGCATGTTGGAAGCTTTGCCAGGGACCTAGTGGCCCAGTGGAAGAAGCTGGTTCCTGTGGAACGAAATGCTGAGCCTGATGAACAGGACTTTGAGAAGAGCAATTCCCGAAAGCGCCCTCGGGATGCCCTGCAGAAGGAGGAGGAGATGGAGGGGGACTACCAAGAAACCTGGAAAGCCACGGGGAGCCGATCCTATAGCCCTGACCACAGGCAGAAGAAACATAGGAAACTCTCGGAGCTCGAGAGACCTCACAAAGTGTCTCACGGTCATGAGAGGAGAGATGAGAGAAAGAGGTGTCACAGAATGTCACCAACTTACTCTTCAGACCCTGAGTCTTCTGATTATGGCCATGTTCAATCCCCTCCATCTTGTACCAGTCCTCATCAGATGTACGTCGACCACTACAGATCCCTGGAGGAGGACCAGGAGCCCATTGTTTCACACCAGAAGCCTGGGAAAGGCCACAGCAATGCCTTTCAGGACAGACTCGGGGCCAGCCAAGAACGACACCTGGGTGAACCCCATGGGAAAGGGGTTGTGAGTCAAAACAAGGAGCACAAATCTTCCCACAAGGACAAACGCCCCGTGGATGCCAAGAGTGATGAGAAGGCCTCTGTGGTGAGCAGAGAGAAATCACACAAGGCCCTCTCCAAAGAGGAGAACCGAAGGCCACCCTCAGGGGACAATGCAAGGGAGAAACCGCCCTCTAGTGGCGTAAAGAAAGAGAAGGACAGAGAGGGCAGCAGCCTGAAGAAGAAGTGTTTGCCTCCCTCAGAGGCCGCTTCAGACAACCACCTGAAAAAGCCAAAGCACAGAGACCCAGAGAAAGCCAAATTGGACAAAAGCAAGCAAGGTCTGGACAGCTTTGACACAGGAAAAGGAGCAGGAGACCTGTTGCCCAAGGTAAAAGAGAAGGGTTCTAACAACCTAAAGACTCCAGAAGGGAAAGTCAAAACTAATTTGGATAGAAAGTCACTGGGCTCCCTCCCTAAAGTTGAGGAGACAGATATGGAGGATGAATTCGAGCAGCCAACCATGTCTTTTGAATCCTACCTCAGCTATGACCAGCCCCGGAAGAAAAAGAAAAAGATTGTGAAAACTTCAGCCACGGCACTTGGAGATAAAGGACTTAAAAAAAATGACTCTAAAAGCACTGGTAAAAACTTGGACTCAGTTCAGAAATTACCCAAGGTGAACAAAACCAAGTCAGAGAAGCCGGCTGGAGCTGATTTAGCCAAGCTGAGAAAGGTGCCTGATGTGTTGCCAGTGTTGCCAGACCTCCCGTTACCCGCGATACAGGCCAATTACCGTCCACTGCCTTCCCTCGAGCTGATATCCTCCTTCCAGCCAAAGCGAAAAGCGTTCTCTTCACCCCAGGAAGAAGAAGAAGCTGGATTTACTGGGCGCAGAATGAATTCCAAGATGCAGGTGTATTCTGGTTCCAAGTGTGCCTATCTCCCTAAAATGATGACCTTGCACCAGCAATGCATCCGAGTACTTAAAAACAACATCGATTCAATCTTTGAAGTGGGAGGAGTCCCATACTCTGTTCTTGAACCCGTTTTGGAGAGGTGTACACCTGATCAGCTGTATCGCATAGAGGAATACAATCATGTATTAATTGAAGAAACAGATCAATTATGGAAAGTTCATTGTCACCGAGACTTTAAGGAAGAAAGACCCGAAGAGTATGAGTCGTGGCGAGAGATGTACCTGCGGCTTCAGGACGCCCGAGAGCAGCGGCTACGAGTACTAACAAAGAATATCCAGTTCGCACATGCCAATAAGCCCAAAGGCCGACAAGCAAAGATGGCCTTTGTCAACTCTGTGGCCAAGCCACCTCGTGACGTCCGGAGGAGGCAGGAAAAGTTTGGAACGGGAGGAGCAGCTGTCCCTGAGAAAATCAAGATCAAGCCAGCCCCGTACCCCATGGGAAGCAGCCATGCTTCCGCCAGTAGTATCAGCTTTAACCCCAGCCCTGAGGAGCCGGCCTATGATGGCCCAAGCACCAGCAGTGCCCACTTGGCACCAGTGGTCAGCAGCACTGTTTCCTATGATCCTAGGAAACCCACTGTGAAGAAAATTGCCCCAATGATGGCCAAGACAATTAAAGCTTTCAAGAACAGATTCTCCCGACGATAAACTGAGGACTTGCCTTGGAAATGGAATCTGGGGAGGCAGGAATACAAGGACAGTGGGGGTTGGGGAATGGAATTCTACAGGAGACTGGAGTCTTGCTTTGTGGATCCTTTTGGTCTCCGAGTCCTGCAGTCTGCAGGTGCTGCCCCTGGGAACCTGCGTGCCACAGCCCCGCCTCCCTGCCTGGAGCACACTTTAGAATTCTGAAGATGTGAAGCCTCTGTCTCACTGAGGATTTTAAAGGTCAATTATACTTTTGTTGTTCATTAGCATCTTTGTAAACTATAAGACGTAGTTTTAATTAATAAATATTGCCCCCAGATTGTATTTATAGC 
5555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555 NM:i:1 ms:i:2994 AS:i:2605 nn:i:0 ts:A:+ tp:A:P cm:i:962 s1:i:2863 s2:i:186 de:f:0.0003 rl:i:30 MD:Z:2483C513 +rpl11_fsm 0 chr1 23692612 60 26S148M1047N107M746N132M1006N111M435N69M3S * 0 0 
CCTTCTCTCCTCCTCCATCATGCGCAGATCAAGGTGAAAAGGAGAACCCCATGCGGGAACTTCGCATCCGCAAACTCTGTCTCAACATCTGTGTTGGGGAGAGTGGAGACAGACTGACGCGAGCAGCCAAGGTGTTGGAGCAGCTCACAGGGCAGACCCCTGTGTTTTCCAAAGCTAGATACACTGTCAGATCCTTTGGCATCCGGAGAAATGAAAAGATTGCTGTCCACTGCACAGTTCGAGGGGCCAAGGCAGAAGAAATCTTGGAGAAGGGTCTAAAGGTGCGGGAGTATGAGTTAAGAAAAAACAACTTCTCAGATACTGGAAACTTTGGTTTTGGGATCCAGGAACACATCGATCTGGGTATCAAATATGACCCAAGCATTGGTATCTACGGCCTGGACTTCTATGTGGTGCTGGGTAGGCCAGGTTTCAGCATCGCAGACAAGAAGCGCAGGACAGGCTGCATTGGGGCCAAACACAGAATCAGCAAAGAGGAGGCCATGCGCTGGTTCCAGCAGAAGTATGATGGGATCATCCTTCCTGGCAAATAAATTCCCGTTTCTATCCAAAAGAGCAATAAAAAGTTTTCATTG 55555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555 NM:i:0 ms:i:567 AS:i:439 nn:i:0 ts:A:+ tp:A:P cm:i:173 s1:i:523 s2:i:251 de:f:0 rl:i:0 MD:Z:567 +rpl11_ism 0 chr1 23693834 60 80M746N132M1006N111M435N69M3S * 0 0 TCCGGAGAAATGAAAAGATTGCTGTCCACTGCACAGTTCGAGGGGCCAAGGCAGAAGAAATCTTGGAGAAGGGTCTAAAGGTGCGGGAGTATGAGTTAAGAAAAAACAACTTCTCAGATACTGGAAACTTTGGTTTTGGGATCCAGGAACACATCGATCTGGGTATCAAATATGACCCAAGCATTGGTATCTACGGCCTGGACTTCTATGTGGTGCTGGGTAGGCCAGGTTTCAGCATCGCAGACAAGAAGCGCAGGACAGGCTGCATTGGGGCCAAACACAGAATCAGCAAAGAGGAGGCCATGCGCTGGTTCCAGCAGAAGTATGATGGGATCATCCTTCCTGGCAAATAAATTCCCGTTTCTATCCAAAAGAGCAATAAAAAGTTTTCATTG 55555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555 NM:i:0 ms:i:392 AS:i:296 nn:i:0 ts:A:+ tp:A:P cm:i:119 s1:i:359 s2:i:192 de:f:0 rl:i:0 MD:Z:392 From 1e77c98e35c35414a93f9abc743b7312501130c2 Mon Sep 17 00:00:00 2001 From: fairliereese Date: Mon, 9 Oct 2023 16:02:15 -0700 Subject: [PATCH 28/31] added index for test input bam file --- .../hl60_1_1_subset_remapped_sorted.bam.bai | Bin 0 -> 13296 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 testing_suite/input_files/readthrough/hl60_1_1_subset_remapped_sorted.bam.bai diff --git a/testing_suite/input_files/readthrough/hl60_1_1_subset_remapped_sorted.bam.bai b/testing_suite/input_files/readthrough/hl60_1_1_subset_remapped_sorted.bam.bai new file mode 100644 index 0000000000000000000000000000000000000000..9e1c7e0235c254aaf8fdd6b1ea24313bd3134fee GIT binary patch literal 13296 zcmeI$u?@m75CzZ+f(~eDsA!l2ISFbfqDBV-H)Vg-CyCqze!^diMh9NWd$`4u@H zPQxQ2=Iy@CB8JvSte4qN``$k{ef)mu?-9Fgejl$9ZArVn$5-#MPjhaMxo_=P`%H_R zQ)~5^hCI}PI&cm+zyS_$fCC)h00%h00S<701048g2kIZ7zSprF Date: Thu, 12 Oct 2023 10:18:35 -0700 Subject: [PATCH 29/31] added check for reference lengths == 0 in get_overlap --- src/talon/talon.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/talon/talon.py b/src/talon/talon.py index b94626e..1f688fc 100644 --- a/src/talon/talon.py +++ b/src/talon/talon.py @@ -802,7 +802,10 @@ def get_overlap(a, b): """ overlap = max(0, min(a[1], b[1]) - 
                                 max(a[0], b[0]) + 1)
     ref_len = abs(b[1] - b[0])
-    perc_overlap = (overlap / ref_len) * 100
+    if ref_len != 0:
+        perc_overlap = (overlap / ref_len) * 100
+    else:
+        perc_overlap = None

     return overlap, perc_overlap


From e9ca9675cc38dc453fa1e2398100c157c79f52b5 Mon Sep 17 00:00:00 2001
From: fairliereese
Date: Mon, 16 Oct 2023 12:29:02 -0700
Subject: [PATCH 30/31] removed pybedtools as a dependency

---
 .travis.yml      | 1 -
 requirements.txt | 1 -
 setup.py         | 1 -
 tox.ini          | 1 -
 4 files changed, 4 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index ef002ad..0b88f89 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -7,7 +7,6 @@ python:

 install:
   - sudo apt update && sudo apt install bedtools
-  - pip install pybedtools
   - pip install tox

 script:
diff --git a/requirements.txt b/requirements.txt
index 404beb3..c2f1349 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,3 @@
 pandas
 pyfaidx
 pysam==0.15.4
-pybedtools
diff --git a/setup.py b/setup.py
index 943debb..94fcce9 100644
--- a/setup.py
+++ b/setup.py
@@ -52,7 +52,6 @@
         "pyranges",
         "bamread>=0.0.11",
         "pysam>=0.15.4",
-        "pybedtools",
         "pyfaidx",
         "scanpy"
     ],
diff --git a/tox.ini b/tox.ini
index 4bb6066..67c93d2 100644
--- a/tox.ini
+++ b/tox.ini
@@ -4,7 +4,6 @@ envlist=py3

 [testenv]
 deps=pytest
-     pybedtools
 whitelist_externals = make
 commands =
     make -C testing_suite test

From f6164972c4cfcecded81f59d283e89c5323643e9 Mon Sep 17 00:00:00 2001
From: fairliereese
Date: Mon, 16 Oct 2023 12:44:02 -0700
Subject: [PATCH 31/31] added python <3.8 requirement, as higher versions
 change how multiprocessing works

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 94fcce9..5de86d0 100644
--- a/setup.py
+++ b/setup.py
@@ -46,7 +46,7 @@
         "License :: OSI Approved :: MIT License",
         "Topic :: Scientific/Engineering :: Bio-Informatics"
     ],
-    python_requires=">=3.6",
+    python_requires=">=3.6,<3.8",
     install_requires=[
         "pandas",
         "pyranges",
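
For reference, the get_overlap() change in PATCH 29/31 guards the percent-overlap calculation against zero-length reference intervals, which previously raised a ZeroDivisionError. The sketch below reproduces the patched function in isolation so its behavior can be checked directly; the docstring is paraphrased and the example coordinates are made up for illustration only:

def get_overlap(a, b):
    """Compute the absolute overlap between closed intervals a and b,
    along with the overlap as a percentage of the reference interval b.
    After PATCH 29, perc_overlap is None when b spans zero length."""
    overlap = max(0, min(a[1], b[1]) -
                     max(a[0], b[0]) + 1)
    ref_len = abs(b[1] - b[0])
    if ref_len != 0:
        perc_overlap = (overlap / ref_len) * 100
    else:
        perc_overlap = None
    return overlap, perc_overlap

# Ordinary case: 6 bp of overlap covering 40% of the reference interval
print(get_overlap((10, 20), (15, 30)))  # (6, 40.0)

# Zero-length reference: previously a ZeroDivisionError, now (1, None)
print(get_overlap((10, 20), (15, 15)))

One consequence for callers: perc_overlap can now be None as well as a float, so any downstream code that compares it numerically needs to tolerate the None case.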