-
Notifications
You must be signed in to change notification settings - Fork 0
/
gff2FA.py
56 lines (54 loc) · 2.35 KB
/
gff2FA.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import pandas as pd
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
def gff2FA(annotation, sequence, windows, output):
"""Extract fasta files from annotations
"""
df_gff = pd.read_csv(annotation, index_col=False, sep='\t', header=None, comment="#")
df_gff.columns = ['seqname', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'attribute']
fasta_seq = SeqIO.parse(sequence, 'fasta')
buffer_seqs = []
cont = 0
for record in fasta_seq:
print(record.id)
dff_extract = df_gff[df_gff.seqname == record.id]
for key,val in dff_extract.iterrows():
clean_seq = ''.join(str(record.seq).splitlines())
if int(val.start) - windows < 0:
start = 0
else:
start = int(val.start) - windows
if int(val.end) + windows > len(clean_seq):
end = len(clean_seq)
else:
end = int(val.end) + windows
new_seq = clean_seq[start:end]
att = val.attribute
id = record.id + '_' + str(start) + '_' + str(end)
desc = "seq_id:" + str(record.id)
desc += " feature_start:" + str(val.start)
desc += " feature_end:" + str(val.end)
desc += " genome_start:" + str(start)
desc += " genome_end:" + str(end)
desc += " feature:" + str(val.feature)
desc += " attributes:" + val.attribute
seq = SeqRecord(Seq(new_seq), id=id, description=desc)
buffer_seqs.append(seq)
cont += 1
if output:
print('Saving...')
SeqIO.write(buffer_seqs, output, "fasta")
else:
return buffer_seqs
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser()#pylint: disable=invalid-name
parser.add_argument("-a", "--annotation", help="Annotation file (.gff3 format)", required=True)
parser.add_argument("-s", "--sequence", help="Sequence file (.fasta)", required=True)
parser.add_argument("-w", "--windows", help="+- nt to cut from and to", type=int, default=0)
parser.add_argument("-o", "--output", help="Output file name (.fasta format)")
args = parser.parse_args()#pylint: disable=invalid-name
gff2FA(args.annotation, args.sequence, args.windows,args.output)