-
Notifications
You must be signed in to change notification settings - Fork 0
/
convert-sequence.py
executable file
·86 lines (80 loc) · 3.39 KB
/
convert-sequence.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
#!/usr/bin/env python3
import os
import sys
import argparse
import textwrap
from Bio import SeqIO
# Maps each supported format string to the set of file extensions from which
# that format is inferred. Formats with an empty set are never inferred and
# can only be selected by naming the format explicitly.
format_exts = {
    "clustal": {".clustal", ".aln"},
    "embl": {".embl"},
    "fasta": {".fa", ".faa", ".fna", ".fasta"},
    "fasta-2line": set(),
    "genbank": {".gb", ".genbank"},
    "imgt": {".imgt"},
    "maf": {".maf"},
    "mauve": {".alignment", ".xmfa"},
    "nexus": {".nexus"},
    "phylip": {".phy", ".ph", ".phylip"},
    "phylip-sequential": set(),
    "phylip-relaxed": set(),
    "pir": {".pir", ".nbrf"},
    "seqxml": {".xml"},
    "stockholm": {".sto", ".stk"},
    "tab": {".tab", ".tsv"},
    "xdna": {".xdna"},
}
def infer_format(filename):
    """Return the format string associated with *filename*'s extension.

    The extension is whatever ``os.path.splitext`` reports as the final
    suffix. It is lower-cased before the lookup because every extension in
    ``format_exts`` is lower-case, so ``reads.FASTA`` is recognized just
    like ``reads.fasta``.

    Args:
        filename: Name or path of the file whose format should be inferred.

    Returns:
        The matching key of ``format_exts``, or ``None`` when the extension
        (possibly empty) is not listed for any format.
    """
    extension = os.path.splitext(filename)[1].lower()
    for fmt, exts in format_exts.items():
        if extension in exts:
            return fmt
    return None
def main(args):
    """Convert ``args.input`` into ``args.output``, inferring formats if needed.

    Args:
        args: Parsed command-line namespace with the attributes ``input``
            (input file name), ``output`` (output handle; ``sys.stdout``
            when no destination was given), and ``infmt`` / ``outfmt``
            (format strings, or ``None`` to infer them from the file
            extensions). Inferred formats are stored back on the namespace.

    Raises:
        SystemExit: If a format must be inferred but the file extension is
            not recognized, or if neither an output file nor an output
            format was supplied.
    """
    try:
        if args.infmt is None:
            fmt = infer_format(args.input)
            if fmt is None:
                sys.exit("Input file format not recognized: {}".format(args.input))
            args.infmt = fmt
        if args.outfmt is None:
            # Identity check: stdout carries no extension to infer a format from.
            if args.output is sys.stdout:
                sys.exit("You must specify either an output file destination or an output format.")
            fmt = infer_format(args.output.name)
            if fmt is None:
                sys.exit("Output file format not recognized: {}".format(args.output.name))
            args.outfmt = fmt
        SeqIO.convert(args.input, args.infmt, args.output, args.outfmt)
    finally:
        # Release the output handle on success and on every error path so the
        # data is flushed and the descriptor is not leaked. stdout is left open.
        if args.output is not sys.stdout:
            close = getattr(args.output, "close", None)
            if close is not None:
                close()
if __name__ == "__main__":
    # Command-line entry point: build the argument parser, then run main().
    desc = textwrap.dedent("""\
        This script converts between different sequence file formats. It will infer the
        input and desired output file formats from the extensions of the provided file names.
        Users can also specify file formats, in which case the file extensions are ignored.
        """)
    # Render each extension set as a sorted, comma-separated list so the help
    # text is stable between runs and free of Python set syntax.
    ext_lists = {fmt: ", ".join(sorted(exts)) for fmt, exts in format_exts.items()}
    epil = textwrap.dedent("""\
        Supported format strings and inferred file extensions
        -----------------------------------------------------------
        clustal {clustal}
        embl {embl}
        fasta {fasta}
        fasta-2line (Note: exactly 2 lines per record, no wrapping)
        genbank {genbank}
        imgt {imgt}
        maf {maf}
        mauve {mauve}
        nexus {nexus}
        phylip {phylip}
        phylip-sequential
        phylip-relaxed (Note: interleaved, but allows longer sequence names)
        pir {pir}
        seqxml {seqxml}
        stockholm {stockholm}
        tab {tab}
        xdna {xdna}
        """.format(**ext_lists))
    # RawDescriptionHelpFormatter keeps the line breaks of desc and epil as written.
    parser = argparse.ArgumentParser(
        description=desc,
        epilog=epil,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        '--infmt', metavar="format-string", choices=format_exts.keys(),
        help="Format of the input file (default: inferred from the input file extension).")
    parser.add_argument(
        '--outfmt', metavar="format-string", choices=format_exts.keys(),
        help="Format of the output file (default: inferred from the output file extension).")
    parser.add_argument('input', help="A file containing one or more sequences or sequence alignments.")
    parser.add_argument('output', nargs='?', type=argparse.FileType('w'), default=sys.stdout,
                        help="A reformatted file containing the same sequence(s).")
    args = parser.parse_args()
    main(args)