Skip to content

Commit

Permalink
keep source and other features from the original annotation
Browse files Browse the repository at this point in the history
  • Loading branch information
andrewprzh committed Apr 25, 2024
1 parent 55f1b56 commit a664d3d
Show file tree
Hide file tree
Showing 3 changed files with 75 additions and 21 deletions.
60 changes: 55 additions & 5 deletions src/gene_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,16 +33,34 @@ class TranscriptModelType(Enum):

# simple class for storing all information needed for GFF
class TranscriptModel:
def __init__(self, chr_id, strand, transcript_id, gene_id, exon_blocks, transcript_type):
def __init__(self, chr_id, strand, transcript_id, gene_id, exon_blocks, transcript_type,
other_features=None, source="IsoQuant"):
self.chr_id = chr_id
self.strand = strand
self.transcript_id = transcript_id
self.gene_id = gene_id
self.exon_blocks = exon_blocks
self.transcript_type = transcript_type
self.source = source
self.other_features = [] if not other_features else other_features
self.additional_info = OrderedDict()
self.intron_path = ()

@classmethod
def from_reference_transcript(cls, gene_info, isoform_id):
transcript_model = cls.__new__(cls)
transcript_model.chr_id = gene_info.chr_id
transcript_model.strand = gene_info.isoform_strands[isoform_id]
transcript_model.transcript_id = isoform_id
transcript_model.gene_id = gene_info.gene_id_map[isoform_id]
transcript_model.exon_blocks = gene_info.all_isoforms_exons[isoform_id]
transcript_model.transcript_type = TranscriptModelType.known
transcript_model.source = gene_info.sources[isoform_id]
transcript_model.other_features = gene_info.other_features[isoform_id]
transcript_model.additional_info = OrderedDict()
transcript_model.intron_path = ()
return transcript_model

def get_start(self):
return self.exon_blocks[0][0]

Expand Down Expand Up @@ -124,6 +142,7 @@ def to_str(self):
# All gene(s) information
class GeneInfo:
EXTRA_BASES_FOR_SEQ = 20
OTHER_FEATURES = {'CDS', 'start_codon', 'stop_codon', 'UTR'}

def __init__(self, gene_db_list, db, delta=0, prepare_profiles=True):
if db is None:
Expand Down Expand Up @@ -158,6 +177,10 @@ def __init__(self, gene_db_list, db, delta=0, prepare_profiles=True):
self.isoform_strands = {}
self.gene_strands = {}
self.set_isoform_strands()
self.other_features = {}
self.set_other_features()
self.sources = {}
self.set_sources()
self.gene_id_map = {}
self.set_gene_ids()
self.gene_attributes = {}
Expand All @@ -182,6 +205,8 @@ def from_models(cls, transcript_model_storage, delta=0):
gene_info.all_isoforms_exons = {}
gene_info.all_isoforms_introns = {}
gene_info.isoform_strands = {}
gene_info.sources = {}
gene_info.other_features = {}
gene_info.gene_id_map = {}
gene_info.gene_attributes = {}
introns = set()
Expand All @@ -196,6 +221,9 @@ def from_models(cls, transcript_model_storage, delta=0):
exons.update(gene_info.all_isoforms_exons[t_id])
introns.update(gene_info.all_isoforms_introns[t_id])
gene_info.isoform_strands[transcript_model.transcript_id] = transcript_model.strand
gene_info.sources[transcript_model.transcript_id] = transcript_model.source
gene_info.sources[transcript_model.gene_id] = transcript_model.source
gene_info.other_features[transcript_model.transcript_id] = transcript_model.other_features
gene_info.gene_id_map[transcript_model.transcript_id] = transcript_model.gene_id

# profiles for all known isoforoms
Expand Down Expand Up @@ -261,10 +289,11 @@ def from_model(cls, transcript_model, delta=0):
gene_info.exon_profiles.set_profiles(t_id, exons, transcript_region, partial(equal_ranges, delta=0))
gene_info.split_exon_profiles.set_profiles(t_id, exons, transcript_region, contains)

gene_info.isoform_strands = {}
gene_info.isoform_strands[transcript_model.transcript_id] = transcript_model.strand
gene_info.gene_id_map = {}
gene_info.gene_id_map[transcript_model.transcript_id] = transcript_model.gene_id
gene_info.isoform_strands = {transcript_model.transcript_id: transcript_model.strand}
gene_info.sources = {transcript_model.transcript_id: transcript_model.source,
transcript_model.gene_id: transcript_model.source}
gene_info.other_features = {transcript_model.transcript_id: transcript_model.other_features}
gene_info.gene_id_map = {transcript_model.transcript_id: transcript_model.gene_id}
gene_info.gene_attributes = {}

gene_info.regions_for_bam_fetch = [(gene_info.start, gene_info.end)]
Expand Down Expand Up @@ -300,6 +329,8 @@ def from_region(cls, chr_id, start, end, delta=0, chr_record=None):
gene_info.all_isoforms_exons = {}
gene_info.all_isoforms_introns = {}
gene_info.isoform_strands = {}
gene_info.other_features = {}
gene_info.sources = {}
gene_info.gene_id_map = {}
gene_info.gene_attributes = {}
gene_info.regions_for_bam_fetch = [(start, end)]
Expand Down Expand Up @@ -353,6 +384,10 @@ def deserialize(cls, infile, genedb):
gene_info.isoform_strands = {}
gene_info.gene_strands = {}
gene_info.set_isoform_strands()
gene_info.other_features = {}
gene_info.set_other_features()
gene_info.sources = {}
gene_info.set_sources()
gene_info.gene_id_map = {}
gene_info.set_gene_ids()
gene_info.gene_attributes = {}
Expand Down Expand Up @@ -417,6 +452,21 @@ def set_isoform_strands(self):
for gene_db in self.gene_db_list:
self.gene_strands[gene_db.id] = gene_db.strand

def set_other_features(self):
self.other_features = defaultdict(list)
for gene in self.gene_db_list:
for t in self.db.children(gene, featuretype=('transcript', 'mRNA')):
for e in self.db.children(t):
if e.featuretype in GeneInfo.OTHER_FEATURES:
self.other_features[t.id].append((e.start, e.end, e.featuretype))

def set_sources(self):
self.sources = {}
for gene in self.gene_db_list:
self.sources[gene.id] = gene.source
for t in self.db.children(gene, featuretype=('transcript', 'mRNA')):
self.sources[t.id] = t.source

# set isoform_id -> gene_id map
def set_gene_ids(self):
self.gene_id_map = {}
Expand Down
5 changes: 1 addition & 4 deletions src/graph_based_model_construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -704,10 +704,7 @@ def construct_nonfl_isoforms(self, spliced_isoform_reads, spliced_isoform_left_s

# create transcript model object from reference isoforms
def transcript_from_reference(self, isoform_id):
gene_id = self.gene_info.gene_id_map[isoform_id]
return TranscriptModel(self.gene_info.chr_id, self.gene_info.isoform_strands[isoform_id],
isoform_id, gene_id, self.gene_info.all_isoforms_exons[isoform_id],
TranscriptModelType.known)
return TranscriptModel.from_reference_transcript(self.gene_info, isoform_id)

# assign reads back to constructed isoforms
def assign_reads_to_models(self, read_assignments):
Expand Down
31 changes: 19 additions & 12 deletions src/transcript_printer.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,9 +100,12 @@ def dump(self, gene_info, transcript_model_storage):
gene_additiional_info = ""
if gene_info and gene_id in gene_info.gene_attributes:
gene_additiional_info = gene_info.gene_attributes[gene_id]
gene_line = '%s\tIsoQuant\tgene\t%d\t%d\t.\t%s\t.\tgene_id "%s"; transcripts "%d"; %s\n' % \
(gene_info_dict[gene_id].chr_id, coords[0], coords[1], gene_info_dict[gene_id].strand,
gene_id, len(gene_to_model_dict[gene_id]), gene_additiional_info)
source = "IsoQuant"
if gene_info and gene_id in gene_info.sources:
source = gene_info.sources[gene_id]
gene_line = '%s\t%s\tgene\t%d\t%d\t.\t%s\t.\tgene_id "%s"; transcripts "%d"; %s\n' % \
(gene_info_dict[gene_id].chr_id, source, coords[0], coords[1], gene_info_dict[gene_id].strand,
gene_id, len(gene_to_model_dict[gene_id]), gene_additiional_info)
self.out_gff.write(gene_line)
self.printed_gene_ids.add(gene_id)

Expand All @@ -113,15 +116,20 @@ def dump(self, gene_info, transcript_model_storage):
if not model.check_additional("exons"):
model.add_additional_attribute("exons", str(len(model.exon_blocks)))

transcript_line = '%s\tIsoQuant\ttranscript\t%d\t%d\t.\t%s\t.\tgene_id "%s"; transcript_id "%s"; %s\n' \
% (model.chr_id, model.exon_blocks[0][0], model.exon_blocks[-1][1], model.strand,
model.gene_id, model.transcript_id, model.additional_attributes_str())
transcript_line = '%s\t%s\ttranscript\t%d\t%d\t.\t%s\t.\tgene_id "%s"; transcript_id "%s"; %s\n' \
% (model.chr_id, model.source, model.exon_blocks[0][0], model.exon_blocks[-1][1],
model.strand, model.gene_id, model.transcript_id,
model.additional_attributes_str())
self.out_gff.write(transcript_line)

prefix_columns = "%s\tIsoQuant\texon\t" % model.chr_id
prefix_columns = "%s\t%s\t" % (model.chr_id, model.source)
suffix_columns = '.\t%s\t.\tgene_id "%s"; transcript_id "%s";' % \
(model.strand, model.gene_id, model.transcript_id)
exons_to_print = sorted(model.exon_blocks, reverse=True) if model.strand == '-' else model.exon_blocks

exons_to_print = model.other_features
for e in model.exon_blocks:
exons_to_print.append((e[0], e[1], 'exon'))
exons_to_print = sorted(exons_to_print, reverse=True) if model.strand == '-' else sorted(exons_to_print)
for i, e in enumerate(exons_to_print):
exon_tuple = (model.chr_id, e[0], e[1], model.strand)
if exon_tuple not in GFFPrinter.transcript_id_dict:
Expand All @@ -130,7 +138,8 @@ def dump(self, gene_info, transcript_model_storage):
else:
exon_id = GFFPrinter.transcript_id_dict[exon_tuple]
exon_str_id = model.chr_id + ".%d" % exon_id
self.out_gff.write(prefix_columns + "%d\t%d\t" % (e[0], e[1]) + suffix_columns +
feature_type = e[2]
self.out_gff.write(prefix_columns + "%s\t%d\t%d\t" % (feature_type, e[0], e[1]) + suffix_columns +
' exon "%d"; exon_id "%s";\n' % ((i + 1), exon_str_id))
self.out_gff.flush()

Expand All @@ -156,9 +165,7 @@ def create_extended_storage(genedb, chr_id, chr_record, novel_model_storage):
gene_info = GeneInfo(gene_list, genedb, prepare_profiles=False)
gene_info.set_reference_sequence(1, len(chr_record), chr_record)
for isoform_id in gene_info.all_isoforms_exons.keys():
all_models.append(TranscriptModel(gene_info.chr_id, gene_info.isoform_strands[isoform_id],
isoform_id, gene_info.gene_id_map[isoform_id],
gene_info.all_isoforms_exons[isoform_id], TranscriptModelType.known))
all_models.append(TranscriptModel.from_reference_transcript(gene_info, isoform_id))
for m in novel_model_storage:
all_models.append(m)

Expand Down

0 comments on commit a664d3d

Please sign in to comment.