Skip to content

Commit

Permalink
preserve exon ids from existing annotation
Browse files: browse the repository at this point in the history
  • Loading branch information
andrewprzh committed Aug 24, 2024
1 parent b48605c commit ede4443
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 5 deletions.
2 changes: 1 addition & 1 deletion src/dataset_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,7 @@ def construct_models_in_parallel(sample, chr_id, dump_filename, args, read_group
transcript_stat_counter = EnumStats()
io_support = IOSupport(args)
transcript_id_distributor = ExcludingIdDistributor(gffutils_db, chr_id)
exon_id_storage = FeatureIdStorage(SimpleIDDistributor())
exon_id_storage = FeatureIdStorage(SimpleIDDistributor(), gffutils_db, chr_id, "exon")

if construct_models:
tmp_gff_printer = GFFPrinter(sample.out_dir, sample.prefix, exon_id_storage,
Expand Down
16 changes: 14 additions & 2 deletions src/id_policy.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,15 +53,27 @@ def increment(self):


class FeatureIdStorage:
def __init__(self, id_distributor):
def __init__(self, id_distributor, genedb = None, chr_id = None, feature = "exon"):
self.id_distributor = id_distributor
self.id_dict = {}
self.feature_name = feature
if not genedb or not chr_id:
return

id_attribute = feature + "_id"
for f in genedb.region(seqid=chr_id, start=1, featuretype=feature):
if id_attribute in f.attributes:
feature_tuple = (chr_id, f.start, f.end, f.strand)
try:
self.id_dict[feature_tuple] = f.attributes[id_attribute][0]
except IndexError:
pass

def get_id(self, chr_id, feature, strand):
feature_tuple = (chr_id, feature[0], feature[1], strand)
if feature_tuple not in self.id_dict:
feature_id = self.id_distributor.increment()
self.id_dict[feature_tuple] = feature_id
self.id_dict[feature_tuple] = chr_id + ".%d" % feature_id
else:
feature_id = self.id_dict[feature_tuple]

Expand Down
3 changes: 1 addition & 2 deletions src/transcript_printer.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,8 +136,7 @@ def dump(self, gene_info, transcript_model_storage):
exons_to_print.append((e[0], e[1], 'exon'))
exons_to_print = sorted(exons_to_print, reverse=True) if model.strand == '-' else sorted(exons_to_print)
for i, e in enumerate(exons_to_print):
exon_id = self.exon_id_storage.get_id(model.chr_id, e, model.strand)
exon_str_id = model.chr_id + ".%d" % exon_id
exon_str_id = self.exon_id_storage.get_id(model.chr_id, e, model.strand)
feature_type = e[2]
self.out_gff.write(prefix_columns + "%s\t%d\t%d\t" % (feature_type, e[0], e[1]) + suffix_columns +
' exon "%d"; exon_id "%s";\n' % ((i + 1), exon_str_id))
Expand Down

0 comments on commit ede4443

Please sign in to comment.