qiime2_fastq_lengths.py (executable, forked from LangilleLab/microbiome_helper)
#!/usr/bin/env python

from __future__ import print_function, division
from tempfile import TemporaryDirectory
from multiprocessing import Pool
import argparse
import pandas as pd
import subprocess
import os

__author__ = "Gavin Douglas"
parser = argparse.ArgumentParser(
    description="Creates a table of the number of reads per "
                "sample from QIIME2 QZA files. Column "
                "names will be the QZA filenames with the "
                "extension removed.",
    formatter_class=argparse.RawDescriptionHelpFormatter)

parser.add_argument('QZA_files', metavar='QZA', type=str, nargs='+',
                    help='QIIME2 artifacts containing gzipped FASTQ files.')

parser.add_argument("-p", "--proc", required=False, type=int, default=1,
                    help="Number of processes to run in parallel.")

parser.add_argument("-o", "--output", help="Name of output read count table.",
                    required=True, type=str)
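
# With these options, a typical invocation looks something like the following
# (hypothetical file names, shown for illustration only):
#
#   qiime2_fastq_lengths.py -p 4 -o read_counts.tsv demux.qza trimmed.qza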

def fastq_read_num(fastq):
    '''Runs system calls to determine the number of reads in a gzipped
    FASTQ.'''
    # Pipe zcat into wc -l to count lines without loading the file into
    # Python; each FASTQ record spans exactly 4 lines.
    zcat_process = subprocess.Popen(["zcat", fastq], stdout=subprocess.PIPE)

    wc_process = subprocess.Popen(["wc", "-l"], stdin=zcat_process.stdout,
                                  stdout=subprocess.PIPE)

    # Close our copy of zcat's stdout so zcat receives SIGPIPE if wc exits
    # early.
    zcat_process.stdout.close()

    wc_output, wc_err = wc_process.communicate()

    return int(wc_output.decode('ascii')) // 4
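
# A pure-Python alternative for systems without zcat on the PATH (a minimal
# sketch kept for reference only; this script does not call it):
def fastq_read_num_gzip(fastq):
    '''Count reads in a gzipped FASTQ using the gzip module instead of zcat.'''
    import gzip
    with gzip.open(fastq, 'rb') as fastq_fh:
        num_lines = sum(1 for _ in fastq_fh)
    return num_lines // 4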

def process_fastq(fastq):
    '''For an individual FASTQ file determine the sample name, whether it's R1
    or R2, and the number of reads in the file. Returns a list with the sample
    and orientation joined by "_" as the first element and the number of reads
    as the second element.'''
    fastq_basename = os.path.basename(fastq)

    # The sample name is the portion of the filename before the first
    # underscore.
    sample = fastq_basename.split("_")[0]

    if "_R1_" in fastq_basename:
        read_id = "R1"
    elif "_R2_" in fastq_basename:
        read_id = "R2"
    else:
        raise ValueError("Neither _R1_ nor _R2_ in filename " + fastq_basename)

    return ["_".join([sample, read_id]),
            fastq_read_num(fastq)]
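
# For example, given a hypothetical Illumina/Casava-style filename:
#
#   process_fastq("sampleA_S1_L001_R1_001.fastq.gz")
#   -> ["sampleA_R1", <read count reported by fastq_read_num>]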

def reads_per_qza_gzipped_fastq(qza, num_proc):
    '''Decompress QZA of FASTQs and return dictionary of read counts per
    sample for both R1 and R2 FASTQs (R2 only if applicable).'''
    sample_reads = {}

    # In a temporary directory, decompress the QZA and output all FASTQs.
    with TemporaryDirectory() as tmp:
        # Strip the .qza extension so column names match the description in
        # the help text.
        qza_name = os.path.splitext(os.path.basename(qza))[0]
        sample_reads[qza_name] = {}

        export_cmd = ["qiime", "tools", "export", "--input-path", qza,
                      "--output-path", tmp]
        subprocess.call(export_cmd)

        # Get list of all FASTQs in the tmp folder.
        tmp_fastqs = [os.path.join(tmp, f) for f in os.listdir(tmp)
                      if os.path.isfile(os.path.join(tmp, f))
                      and ".fastq.gz" in f]

        # Count reads in each FASTQ in parallel; the context manager ensures
        # the worker pool is cleaned up.
        with Pool(num_proc) as pool_set:
            sample_reads_raw = pool_set.map(process_fastq, tmp_fastqs)

        # Add these output values to a dictionary to be returned.
        for raw_out in sample_reads_raw:
            sample_reads[qza_name][raw_out[0]] = raw_out[1]

    return sample_reads
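
# The returned structure is nested by QZA name, then sample/orientation, e.g.
# (hypothetical values): {"demux": {"sampleA_R1": 1000, "sampleA_R2": 1000}}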

def main():
    args = parser.parse_args()

    # Initialize list containing dictionaries for each QZA.
    sample_reads_dicts = []

    # Loop over all input QZA files.
    for qza in args.QZA_files:
        sample_reads_dicts += [reads_per_qza_gzipped_fastq(qza, args.proc)]

    # Combine all dictionaries into one.
    sample_reads_combined_dict = {}
    for tab_dict in sample_reads_dicts:
        sample_reads_combined_dict.update(tab_dict)

    # Make pandas dataframe from this combined dictionary (one column per
    # QZA, one row per sample/orientation).
    sample_reads = pd.DataFrame.from_dict(sample_reads_combined_dict,
                                          orient='columns')

    # Re-order indices and columns alphabetically.
    sample_reads.sort_index(axis=0, inplace=True)
    sample_reads.sort_index(axis=1, inplace=True)

    sample_reads.to_csv(path_or_buf=args.output, sep="\t",
                        index_label="dataset", na_rep="NaN")

if __name__ == "__main__":
    main()
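
# With the invocation shown above, the output file is a tab-delimited table
# along these lines (hypothetical counts, for illustration only):
#
#   dataset       demux   trimmed
#   sampleA_R1    1000    950
#   sampleA_R2    1000    950
#   sampleB_R1    1200    1100
#   sampleB_R2    1200    1100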