Skip to content

Commit

Permalink
Merge pull request #139 from mortazavilab/230921_update
Browse files Browse the repository at this point in the history
Readthrough transcription support and improved gene assignment
  • Loading branch information
fairliereese authored Oct 16, 2023
2 parents 67aebb3 + f616497 commit 74dbc10
Show file tree
Hide file tree
Showing 54 changed files with 5,942 additions and 3,593 deletions.
1 change: 0 additions & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ python:

install:
- sudo apt update && sudo apt install bedtools
- pip install pybedtools
- pip install tox

script:
Expand Down
1 change: 0 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
pandas
pyfaidx
pysam==0.15.4
pybedtools
3 changes: 1 addition & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,13 +46,12 @@
"License :: OSI Approved :: MIT License",
"Topic :: Scientific/Engineering :: Bio-Informatics"
],
python_requires=">=3.6",
python_requires=">=3.6,<3.8",
install_requires=[
"pandas",
"pyranges",
"bamread>=0.0.11",
"pysam>=0.15.4",
"pybedtools",
"pyfaidx",
"scanpy"
],
Expand Down
7 changes: 4 additions & 3 deletions src/talon/dstruct.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,11 @@ class Struct(dict):
Make a dict behave as a struct.
Example:
test = Struct(a=1, b=2, c=3)
"""
def __init__(self,**kw):
dict.__init__(self,kw)

def __init__(self, **kw):
dict.__init__(self, kw)
self.__dict__ = self
125 changes: 63 additions & 62 deletions src/talon/edge.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,29 @@
# TALON: Technology-Agnostic Long Read Analysis Pipeline
# Author: Dana Wyman
#------------------------------------------------------------------------------
# ------------------------------------------------------------------------------


class Edge(object):
"""Stores information about an edge, including its location
and the gene/transcript(s) it belongs to.
Attributes:
identifier: Accession ID of the edge
gene: Accession ID of the gene that the edge belongs to
transcript_ids: Set of transcript accession IDs that the edge
belongs to
chromosome: Chromosome that the transcript is located on
(format "chr1")
start: The start position of the edge with respect to the
forward strand
end: The end position of the edge with respect to the
forward strand
strand: "+" if the edge is on the forward strand, and "-" if
it is on the reverse strand
length: The length of the edge
and the gene/transcript(s) it belongs to.
Attributes:
identifier: Accession ID of the edge
gene: Accession ID of the gene that the edge belongs to
transcript_ids: Set of transcript accession IDs that the edge
belongs to
chromosome: Chromosome that the transcript is located on
(format "chr1")
start: The start position of the edge with respect to the
forward strand
end: The end position of the edge with respect to the
forward strand
strand: "+" if the edge is on the forward strand, and "-" if
it is on the reverse strand
length: The length of the edge
"""

def __init__(self, identifier, chromosome, start, end, strand, gene_id,
transcript_id, annotations):
def __init__(self, identifier, chromosome, start, end, strand, gene_id, transcript_id, annotations):
self.chromosome = str(chromosome)
self.gene_id = gene_id
self.start = int(start)
Expand All @@ -40,27 +40,27 @@ def __init__(self, identifier, chromosome, start, end, strand, gene_id,
self.v2 = None

def print_edge(self):
""" Prints a string representation of the edge"""
print(self.identifier + ": " + self.chromosome + ":" + \
str(self.start) + "-" + str(self.end))
"""Prints a string representation of the edge"""
print(self.identifier + ": " + self.chromosome + ":" + str(self.start) + "-" + str(self.end))
print(self.transcript_ids)
return


def create_edge_from_gtf(edge_info):
""" Creates an edge object using information from a GTF entry
Args:
edge_info: A list containing fields from a GTF file edge entry.
Example:
['chr1', 'HAVANA', 'exon', '11869', '12227', '.', '+', '.',
'gene_id "ENSG00000223972.5"; transcript_id "ENST00000456328.2";
gene_type "transcribed_unprocessed_pseudogene";
gene_status "KNOWN"; gene_name "DDX11L1";
transcript_type "processed_transcript";
transcript_status "KNOWN"; transcript_name "DDX11L1-002";
edge_number 1; edge_id "ENSE00002234944.1"; level 2;
tag "basic"; transcript_support_level "1";
havana_gene "OTTHUMG00000000961.2";
havana_transcript "OTTHUMT00000362751.1";']
"""Creates an edge object using information from a GTF entry
Args:
edge_info: A list containing fields from a GTF file edge entry.
Example:
['chr1', 'HAVANA', 'exon', '11869', '12227', '.', '+', '.',
'gene_id "ENSG00000223972.5"; transcript_id "ENST00000456328.2";
gene_type "transcribed_unprocessed_pseudogene";
gene_status "KNOWN"; gene_name "DDX11L1";
transcript_type "processed_transcript";
transcript_status "KNOWN"; transcript_name "DDX11L1-002";
edge_number 1; edge_id "ENSE00002234944.1"; level 2;
tag "basic"; transcript_support_level "1";
havana_gene "OTTHUMG00000000961.2";
havana_transcript "OTTHUMT00000362751.1";']
"""
description = edge_info[-1]
start = int(edge_info[3])
Expand All @@ -71,38 +71,39 @@ def create_edge_from_gtf(edge_info):
annotations = extract_edge_annotations_from_GTF(edge_info)
if "exon_id" not in annotations:
annotations["exon_id"] = "_".join([chromosome, str(start), str(end), strand])
gene_id = annotations['gene_id']
transcript_id = annotations['transcript_id']
gene_id = annotations["gene_id"]
transcript_id = annotations["transcript_id"]
edge_id = "_".join([chromosome, str(start), str(end), strand])

if "gene_id" in description:
gene_id = (description.split("gene_id ")[1]).split('"')[1]
if "transcript_id" in description:
transcript_id = (description.split("transcript_id ")[1]).split('"')[1]

edge = Edge(edge_id, chromosome, start, end, strand, gene_id, transcript_id,
annotations)

edge = Edge(edge_id, chromosome, start, end, strand, gene_id, transcript_id, annotations)
return edge


def extract_edge_annotations_from_GTF(tab_fields):
""" Extracts key-value annotations from the GTF description field
"""
"""Extracts key-value annotations from the GTF description field"""

attributes = {}

# remove trailing newline and split by semicolon
description = tab_fields[-1].strip('\n')
description = description.split(';')
description = tab_fields[-1].strip("\n")
description = description.split(";")

# Parse description
for fields in description:
if fields == "" or fields == " ": continue
if fields == "" or fields == " ":
continue
fields = fields.split()
if fields[0] == '': fields = fields[1:]
if fields[0] == "":
fields = fields[1:]

key = fields[0].replace('"', "")
val = " ".join(fields[1:]).replace('"', "")

key = fields[0].replace('"', '')
val = ' '.join(fields[1:]).replace('"', '')

attributes[key] = val

# Put in placeholders for important attributes (such as gene_id) if they
Expand All @@ -116,29 +117,29 @@ def extract_edge_annotations_from_GTF(tab_fields):

return attributes


def get_edge_from_db(vertex_info_1, vertex_info_2):
""" Uses information from a database edge entry to create an edge object.
"""
"""Uses information from a database edge entry to create an edge object."""
if vertex_info_1["edge_id"] != vertex_info_2["edge_id"]:
raise ValueError('Tried to create edge from endpoints with different IDs')
raise ValueError("Tried to create edge from endpoints with different IDs")
edge_id = vertex_info_1["edge_id"]
chromosome = vertex_info_1['chromosome']
start = min(vertex_info_1['position'], vertex_info_2['position'])
end = max(vertex_info_1['position'], vertex_info_2['position'])
strand = vertex_info_1['strand']
gene_id = vertex_info_1['gene_id']
chromosome = vertex_info_1["chromosome"]
start = min(vertex_info_1["position"], vertex_info_2["position"])
end = max(vertex_info_1["position"], vertex_info_2["position"])
strand = vertex_info_1["strand"]
gene_id = vertex_info_1["gene_id"]

edge = Edge(edge_id, chromosome, start, end, strand, gene_id, None, None)
edge.v1 = str(vertex_info_1["vertex_ID"])
edge.v2 = str(vertex_info_2["vertex_ID"])
return edge


def create_novel_edge(chromosome, start, end, strand, gene_id, transcript_id, counter):
""" Creates a novel edge with a unique identifier (obtained using
counter). Returns the edge object; the counter is updated in place.
"""Creates a novel edge with a unique identifier (obtained using
counter). Returns the edge object; the counter is updated in place.
"""
counter["edges"] += 1
curr_novel = counter["edges"]
edge = Edge(curr_novel, chromosome, start, end, strand, gene_id, transcript_id,
None)
edge = Edge(curr_novel, chromosome, start, end, strand, gene_id, transcript_id, None)
return edge
Loading

0 comments on commit 74dbc10

Please sign in to comment.