-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgene_database.py
executable file
·44 lines (41 loc) · 1.8 KB
/
gene_database.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
#!/usr/bin/env python3
"""
Generate predicted gene database for blast to do an all vs all alignment
See genes.sh
"""
from Bio import SeqIO, SeqRecord
import glob
from collections import defaultdict
genes = defaultdict(int)
with open('gene_database.fna', 'w') as out: #, open('gene_order.txt', 'w') as order:
for fn in glob.glob('sequences/*.gb'):
#pairs = []
print(fn)
with open(fn) as f:
r = next(SeqIO.parse(f, 'genbank'))
last = None
for feat in r.features:
if feat.type == 'CDS':
gene_id = feat.qualifiers.get('gene')
if not gene_id:
gene_id = feat.qualifiers.get('note')
if gene_id:
gene_id = gene_id[0].split(';')
if len(gene_id) > 1 and ' ' not in gene_id[0]:
gene_id = gene_id[0]
else:
gene_id = None
if not gene_id:
gene_id = feat.qualifiers.get('locus_tag')
if not gene_id:
gene_id = feat.qualifiers.get('protein_id')
if isinstance(gene_id, list): gene_id = gene_id[0]
gene_id = gene_id.replace(' ', '_')
#if last:
# pairs.append(f"{last}_{gene_id}" if last < gene_id else f"{gene_id}_{last}")
#last = gene_id
#genes[gene_id] += 1
#if genes[gene_id] > 1:
# gene_id += f"_{genes[gene_id]}"
SeqIO.write(SeqRecord.SeqRecord(seq=feat.extract(r.seq), id=gene_id, description=fn), out, 'fasta')
#print(r.id, ' '.join(pairs), file=order)