-
Notifications
You must be signed in to change notification settings - Fork 1
/
load_genome.py
97 lines (77 loc) · 3.77 KB
/
load_genome.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
"""
Build a genome dictionary from several embl-format files.
The genome contains a dictionary in which the key is the systematic ID,
and the value is a dictionary of features with the feature_type as key.
{
"3'UTR": SeqFeature,
"CDS": SeqFeature,
"peptide": string containing the peptide sequence (only for CDS features, generated by translating the CDS with BioPython),
"5'UTR": SeqFeature,
"contig": SeqRecord of the sequence where this gene is found (generated from one of the files passed as arguments)
}
The dictionary is stored in a pickle file, specified by the argument --output.
"""
import pickle
from Bio import SeqIO
from Bio.SeqFeature import SeqFeature
import argparse
import json
from tqdm import tqdm
genome: dict[str, dict[str, SeqFeature]] = dict()
class Formatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter):
pass
parser = argparse.ArgumentParser(description=__doc__, formatter_class=Formatter)
parser.add_argument('files', metavar='N', type=str, nargs='+',
help='files to be read')
parser.add_argument('--format', default='embl', help='format of the files to be read (for Biopython)')
parser.add_argument('--output', default='data/genome.pickle', help='output file (using pickle)')
parser.add_argument('--config', default='config.json', help='configuration file')
args = parser.parse_args()
with open(args.config) as ins:
config = json.load(ins)
locus_tag_equivalent = 'locus_tag'
if 'locus_tag_equivalent' in config:
locus_tag_equivalent = config['locus_tag_equivalent']
filename2chromosome_dict = {v: k for k, v in config['chromosome2file'].items()}
for f in args.files:
print('\033[0;32m reading: ' + f + '\033[0m')
iterator = SeqIO.parse(f, args.format)
contig = next(iterator)
if next(iterator, None) is not None:
raise ValueError(f'multiple sequences in file {f}')
for feature in tqdm(contig.features, desc="Reading sequence features", unit="feature"):
feature: SeqFeature
if locus_tag_equivalent not in feature.qualifiers:
continue
gene_id = feature.qualifiers[locus_tag_equivalent][0]
feature_type = feature.type
if feature_type in ['intron', 'misc_feature', 'exon']:
continue
if gene_id not in genome:
genome[gene_id] = dict()
# assigned only once
file_name = f.split('/')[-1].split('.')[0]
# We set the id to the value in the filename2chromosome_dict
contig.id = filename2chromosome_dict[file_name]
genome[gene_id]['contig'] = contig
if (feature_type in genome[gene_id]) and feature_type != 'mRNA':
raise ValueError(f'several features of {feature_type} for {gene_id}')
genome[gene_id][feature_type] = feature
# if feature_type == 'CDS' and not any([('pseudogene' in prod or 'dubious' in prod) for prod in feature.qualifiers['product']]):
if feature_type == 'CDS':
cds_seq = feature.extract(contig).seq
if gene_id.startswith(config['mitochondrial_prefix']):
genome[gene_id]['peptide'] = cds_seq.translate(table=config['mitochondrial_table'])
else:
genome[gene_id]['peptide'] = cds_seq.translate()
errors = list()
if len(cds_seq) % 3 != 0:
errors.append('CDS length not multiple of 3')
if genome[gene_id]['peptide'][-1] != '*':
errors.append('does not end with STOP codon')
if genome[gene_id]['peptide'].count('*') > 1:
errors.append('multiple stop codons')
if len(errors):
print(gene_id, 'CDS errors:', ','.join(errors), sep='\t')
with open(args.output, 'wb') as out:
pickle.dump(genome, out, pickle.HIGHEST_PROTOCOL)