-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathget_sra.py
executable file
·127 lines (108 loc) · 5.68 KB
/
get_sra.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
#!/usr/bin/env python
"""
#amnonscript.py
Download the fasta (or fastq) files of all the runs listed in an SRA runinfo file.
Input:
a tab or comma delimited file (the runinfo table - SraRunTable.txt - from the study runs links - e.g. http://trace.ncbi.nlm.nih.gov/Traces/sra/?study=SRP056779 -> http://trace.ncbi.nlm.nih.gov/Traces/study/?acc=SRP056779)
"""
import argparse
import sys
import os.path
import csv
import subprocess
# Version of this script; main() appends it to the command-line description.
__version__ = "1.2"
def _get_run_accession(row):
    '''Return the run accession of a runinfo table row.

    The accession column is named differently in different versions of the
    run table, so 'Run_s', 'Run' and 'acc' are tried in that order.

    Raises
    ------
    ValueError
        if the row has none of the known accession columns
    '''
    for column in ('Run_s', 'Run', 'acc'):
        if column in row:
            return row[column]
    raise ValueError('no Run_s/Run/acc column found in the run table (columns: %s)' % ', '.join(str(c) for c in row))


def _looks_like_shotgun(row):
    '''Return True if the row metadata hints that the sample is not a 16S amplicon run.'''
    if 'LibrarySelection' in row and row['LibrarySelection'] != 'PCR':
        return True
    if 'Assay_Type' in row and row['Assay_Type'] != 'AMPLICON':
        return True
    return False


def _is_large_run(row):
    '''Return True if the row reports more than 500 Mbases (MBases or Bases column).

    Raises ValueError/TypeError if the size column cannot be parsed as an integer.
    '''
    if 'MBases' in row and int(row['MBases']) > 500:
        return True
    if 'Bases' in row and int(row['Bases']) > 500000000:
        return True
    return False


def _output_exists(outdir, csamp, fastq, split_files):
    '''Return True if an output file for run csamp is already present in outdir.'''
    extension = '.fastq' if fastq else '.fasta'
    candidates = [csamp + extension]
    if split_files:
        # fastq-dump --split-files writes <run>_1.<ext> (and <run>_2.<ext> for paired reads)
        candidates.append(csamp + '_1' + extension)
    return any(os.path.isfile(os.path.join(outdir, name)) for name in candidates)


def GetSRA(inputname, path, skipifthere=False, fastq=False, delimiter=None, outdir='fasta', skip_16s_check=True, split_files=False):
    '''Get all the samples from the SRA. Using the Metadata (runinfo metadata) table SraRunTable.txt from the run browser

    Each run listed in the table is downloaded by calling the fastq-dump
    binary found in the path directory.

    Parameters
    ----------
    inputname: str
        the SraRunInfo.txt file. A table containing a column with Run_s/Run/acc column that contains the SRR accession numbers.
    path: str
        path to the SraToolKit binary directory
    skipifthere: bool, optional
        if true, do not download files that already exist in outdir
    fastq: bool, optional
        if true, download fastq instead of fasta
    delimiter: str or None, optional
        delimiter for the table. If none, autodetect (comma or tab) from the header line
    outdir: str, optional
        name of the output directory for the downloads
    skip_16s_check: bool, optional
        if True (default), download every sample without checking its type.
        if False, try to identify which samples are WGS and not 16s (not PCR/AMPLICON and >500 Mbases) and ignore them
    split_files: bool, optional
        if True, split the samples into forward and reverse reads

    Returns
    -------
    num_files: int
        number of files downloaded (skipped or failed samples are not counted)

    Raises
    ------
    ValueError
        if the table does not contain a Run_s/Run/acc column
    '''
    if delimiter is None:
        with open(inputname) as csvfile:
            header = csvfile.readline()
        try:
            delimiter = csv.Sniffer().sniff(header, delimiters=',\t').delimiter
            print('Detected delimiter %s' % delimiter)
        except csv.Error:
            # a single-column (or empty) table has no delimiter to detect - any choice parses it
            delimiter = ','
            print('Could not detect delimiter, using %s' % delimiter)

    num_files = 0
    num_skipped = 0
    # keep the table open only while iterating, so the handle is always closed
    with open(inputname, 'r') as tablefile:
        for cline in csv.DictReader(tablefile, delimiter=delimiter):
            csamp = _get_run_accession(cline)
            if not csamp:
                print("skipping row without a run accession")
                num_skipped += 1
                continue

            # test if the sample is 16s or shotgun
            # look for some clues and also only if it is big (>500Mb)
            if not skip_16s_check and _looks_like_shotgun(cline):
                try:
                    if _is_large_run(cline):
                        print("skipping sample %s since it seems not 16S" % csamp)
                        num_skipped += 1
                        continue
                except (ValueError, TypeError):
                    # best effort: if the size is unreadable, download the sample anyway
                    print("error parsing reads count for sample %s" % csamp)

            if skipifthere and _output_exists(outdir, csamp, fastq, split_files):
                print("skipping sample %s. file exists" % csamp)
                num_skipped += 1
                continue

            print("getting file %s" % csamp)
            params = [os.path.join(path, 'fastq-dump'), '--disable-multithreading']
            params += ['--outdir', outdir]
            if split_files:
                params += ['--split-files']
            if not fastq:
                params += ['--fasta', '0']
            params += [csamp]
            print(params)
            retcode = subprocess.call(params)
            if retcode != 0:
                print("fastq-dump failed for sample %s (exit code %d)" % (csamp, retcode))
                continue
            print("got file %s" % csamp)
            num_files += 1
    print('got %d files.' % num_files)
    print('skipped %d samples.' % num_skipped)
    return num_files
def main(argv):
    '''Parse the command line and download the samples listed in the runinfo table.

    Parameters
    ----------
    argv: list of str
        the command line arguments, without the program name
    '''
    parser = argparse.ArgumentParser(description='Get all samples of a study from the SRA version ' + __version__, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # the input table is mandatory - without it there is nothing to download
    parser.add_argument('-i', '--input', help='sra runinfo table for the study', required=True)
    parser.add_argument('-o', '--outdir', help='directory to place the fasta/fastq files', default='fasta')
    parser.add_argument('-p', '--path', help='path to the sratoolkit binary', default='/home/amam7564/bin/sratoolkit.3.0.0-centos_linux64/bin/')
    parser.add_argument('-s', '--skipifhere', help='if set, dont reload files already in the dir', action='store_true')
    parser.add_argument('-q', '--fastq', help='if set, output fastq instead of fasta', action='store_true')
    parser.add_argument('-r', '--split-files', help='if set, split forward and reverse reads', action='store_true')
    args = parser.parse_args(argv)

    # pass outdir through, otherwise the -o option is silently ignored
    GetSRA(args.input, args.path, args.skipifhere, fastq=args.fastq, outdir=args.outdir, split_files=args.split_files)
# Script entry point: hand the command line (without the program name) to main().
if __name__ == "__main__":
    cli_args = sys.argv[1:]
    main(cli_args)