-
Notifications
You must be signed in to change notification settings - Fork 11
/
sort_rename_fasta.py
executable file
·58 lines (49 loc) · 1.73 KB
/
sort_rename_fasta.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
#!/usr/bin/env python
#This script sorts fasta file and then renames headers
#written by Jon Palmer palmer.jona at gmail dot com
import sys
import os
from Bio import SeqIO
import argparse
parser=argparse.ArgumentParser(
description='''Script that sorts fasta be length and renames headers ''',
epilog="""Jon Palmer (2015) palmer.jona@gmail.com""")
parser.add_argument('fasta', help='Fasta file (Required)')
parser.add_argument('-o','--out', default="sort.renamed.fasta", help='Fasta output')
parser.add_argument('-b','--base', default="Supercontig", help='Base name for fasta headers')
parser.add_argument('-l','--length', default="1", help='min length to keep seq')
args=parser.parse_args()
if len(sys.argv) < 2:
parser.print_usage()
sys.exit(1)
fasta_in = args.fasta
base_name = args.base
length = int(args.length)
#get rid of sequences shorter than min length
seq_pass = []
for record in SeqIO.parse(fasta_in, "fasta"):
if len(record.seq) > length :
seq_pass.append(record)
print "Found %i sequences" % len(seq_pass)
tempout = open("fasta.tmp", "w")
SeqIO.write(seq_pass, tempout, "fasta")
#sort records and write temp file
records = list(SeqIO.parse("fasta.tmp", "fasta"))
records.sort(cmp=lambda x,y: cmp(len(y),len(x)))
SeqIO.write(records, "fasta.tmp2", "fasta")
#load temp fasta back in and change names, prob unnecessary but couldn't figure out how to do with SeqIO
fasta_tmp = open("fasta.tmp2")
fasta_out = open(args.out, 'wb')
count = 1
for line in fasta_tmp.readlines():
if line.startswith('>'):
contig_id = '>' + base_name + '_' + str(count) + '\n'
fasta_out.write(contig_id)
count += 1
else:
fasta_out.write(line)
tempout.close()
fasta_out.close()
fasta_tmp.close()
os.remove("fasta.tmp")
os.remove("fasta.tmp2")