-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpipeline_single.py
392 lines (342 loc) · 15.3 KB
/
pipeline_single.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
"""
This script runs BWA-MEM2, Samtools view&fastq, Kraken2 and kraken-biom on files from a samplesheet.
It also creates a CSV file with read counts from the SAM files and Kraken reports.
Single end data!
Make sure you are in an active conda environment with bwa-mem2, samtools, kraken2, and kraken-biom installed.
Usage:
python pipeline.py --samplesheet <path_to_samplesheet> --out_dir <output_directory>
Arguments:
--samplesheet: Path to the CSV file containing sample information.
--out_dir: Path to the output directory (not ending with /).
The script performs the following steps for each sample in the samplesheet:
1. Runs BWA-MEM2 to align the single-end FASTQ files to a reference genome. (path to database hardcoded in script)
2. Runs Samtools view to filter and extract unaligned reads from the BWA-MEM2 output.
3. Runs Samtools fastq to convert the filtered SAM file to FASTQ files.
4. Runs Kraken2 to classify the unaligned FASTQ files and generate a report. (path to database hardcoded in script)
5. Extracts the number of Homo sapiens and bacterial classifications from the Kraken2 report.
6. Runs Samtools flagstat to calculate read statistics from the BWA-MEM2 output.
7. Writes the extracted read counts from BWA-MEM2 and Kraken2 to a summary CSV file.
8. Runs Kraken-biom to convert the Kraken2 reports to a BIOM file.
The script also creates subdirectories for BWA-MEM2 and Kraken2 output if they do not exist.
The output directory should not end with a forward slash (/).
Note: Modify the paths to the BWA-MEM2 index and Kraken2 database if needed.
Example usage:
python pipeline.py --samplesheet /path/to/samplesheet.csv --out_dir /path/to/output_directory
"""
# Import modules
import csv
import subprocess
import os
import argparse
import re
# Path to bwa-mem2 index of the reference genome
# Modify if needed
bwamem2_db = "/home/birgit/data/db_index/bwa-mem2/human"
# Path to kraken2 database
# Modify if neeeded
kraken2_db = "/home/birgit/data/db_index/kraken2/standard"
def run_bwa_mem2(sample, fastq_1, output_dir):
"""
Run BWA-MEM2 for a given sample using the provided FASTQ files.
Creates sam file with BWA-MEM2 output.
Print status message to terminal when finished.
Args:
sample (str): The name of the sample.
fastq_1 (str): The path to the first paired FASTQ file.
output_dir (str): The directory where the output sam file will be saved.
Returns:
None
"""
# Construct the command to run BWA-MEM2
command = [
"bwa-mem2",
"mem",
"-t",
"32",
bwamem2_db,
fastq_1
]
# Create file for output
output_file = f"{output_dir}/bwamem2/{sample}_out.sam"
# Open output file
with open(output_file, "w") as f_out:
# Run the BWA-MEM2 command and write output to file
subprocess.run(command, stdout=f_out)
# Print status message
print(f"\nFinished BWA-MEM2 for sample {sample}\n", flush=True)
def run_samtools(sample, output_dir):
"""
Runs samtools view -f12 and samtools fastq commands on BWA-MEM2 output on given sample.
Creates sam file with unaligned reads and fastq files with unaligned reads.
Print status message to terminal when finished with each command.
Args:
sample (str): The name of the sample, from samplesheet.
output_dir (str): The output directory where the sam and fastq files will be saved.
Returns:
None
"""
# Construct the command to run samtools view
# Use -f12 to only extract unaligned reads
command_view = [
"samtools",
"view",
"-h",
"-f4",
f"{output_dir}/bwamem2/{sample}_out.sam"
]
# Create file for output
output_file = f"{output_dir}/bwamem2/{sample}_out_unconc.sam"
# Open output file
with open(output_file, "w") as f_out:
# Run the samtools view command and write output to file
subprocess.run(command_view, stdout=f_out)
# Print status message
print(f"\nFinished samtools view for sample {sample}\n", flush=True)
# Construct the command to run samtools fastq
command_fastq = [
"samtools",
"fastq",
f"{output_dir}/bwamem2/{sample}_out_unconc.sam"
]
# Create file for output
output_file = f"{output_dir}/bwamem2/{sample}_out_unconc.fastq"
# Open output file
with open(output_file, "w") as fastq_out:
# Run the samtools fastq command and write output to file
subprocess.run(command_fastq, stdout=fastq_out)
# Print status message
print(f"\nFinished samtools fastq for sample {sample}\n", flush=True)
def check_fastq(sample, output_dir):
"""
Checks if the fastq file contains data, exits with error if it is empty.
Args:
sample (str): The name of the sample.
output_dir (str): The output directory.
Returns:
None
Raises:
ValueError: If the fastq file is empty
"""
# Path to fastq file
fastq_file = f"{output_dir}/bwamem2/{sample}_out_unconc.fastq"
# Check if fastqfile is empty
if os.path.getsize(fastq_file) == 0:
raise ValueError(f"Error: Fastq file {fastq_file} is empty for sample {sample}.\n")
def run_kraken2(sample, output_dir):
"""
Runs Kraken2 with 32 threads for a given sample.
Creates kraken2 output file and report in reports directory
Print status message to terminal when finished.
Args:
sample (str): The name of the sample.
output_dir (str): The output directory.
Returns:
None
"""
# Command to run kraken2
command_kraken2 = [
"kraken2",
"--threads",
"32",
"--db",
kraken2_db,
f"{output_dir}/bwamem2/{sample}_out_unconc.fastq",
"--output",
f"{output_dir}/kraken2/{sample}_kraken2.out",
"--report",
f"{output_dir}/kraken2/reports/{sample}_kraken2_report.txt"
]
# Run the kraken2 command
subprocess.run(command_kraken2)
# Print status message
print(f"\nFinished kraken2 for sample {sample}\n", flush=True)
def flagstat(sample, output_dir):
"""
Runs samtools flagstat command on a given sample, extracts number of input reads, concordantly and unconcordantly aligned reads.
Prints status message to terminal with the results.
Args:
sample (str): The name of the sample.
output_dir (str): The output directory.
Returns:
tuple: A tuple containing the total number of input reads,
the number of concordantly aligned reads,
and the number of unconcordantly aligned reads.
"""
# Construct command
command = [
"samtools",
"flagstat",
"-@",
"32",
f"{output_dir}/bwamem2/{sample}_out.sam"
]
# Run samtools flagstat command
result = subprocess.run(command, capture_output=True, text=True)
# Split the output into lines
lines = result.stdout.split('\n')
# Select the second line (amount of input reads)
second_line = lines[1]
# Extract the amount of input reads from 2nd line
total_input = int(second_line.split(' ')[0])
# Select the 8th line (primary mapped)
eight_line = lines[7]
# Extract the number from the 12th line
properly_paired = int(eight_line.split(' ')[0])
# Calculate amount of disconcordantly paired reads
unconcordantly_aligned = total_input - properly_paired
# Print the results
print(f"Number of BWA-MEM2 input reads for sample {sample}: {total_input}", flush=True)
print(f"Number of concordantly aligned reads for sample {sample}: {properly_paired}", flush=True)
print(f"Number of unaligned reads for sample {sample}: {unconcordantly_aligned}", flush=True)
# Return amounts
return total_input, properly_paired, unconcordantly_aligned
# function to extract amount of homo sapiens and bacteria kraken2 classifications from kreport
def kraken2_stats(sample, output_dir):
"""
Calculate the number of Homo sapiens and Bacterial classified reads for a given sample based on a Kraken2 report.
Prints status message to terminal with the results, warns when no classification count is zero.
Args:
sample (str): The name of the sample.
output_dir (str): The output directory where the Kraken2 report is located.
Returns:
tuple: A tuple containing the number of Homo sapiens reads and the number of Bacterial reads.
Raises:
FileNotFoundError: If the Kraken2 report file is not found.
"""
# Construct filename for report of sample
filename = f"{output_dir}/kraken2/reports/{sample}_kraken2_report.txt"
# Intitalize counters (outside of for loop, so stays 0 when nothing is encountered and no errors)
homo_number = 0
bac_number = 0
# Open report, loop through lines
with open(filename, 'r') as file:
for line in file:
# Split columns based on tab
columns = line.split('\t')
# If homo sapiens is encountered, take value and print
if 'Homo sapiens' in columns[5]:
homo_number = int(columns[1])
print(f"Number of Homo sapiens reads for sample {sample}: {homo_number}", flush=True)
# If Bacteria is encountered (based on tax id, some species contain 'Bacteria' in name), take value and print
if re.fullmatch('^2$', columns[4]):
bac_number = int(columns[1])
print(f"Number of Bacterial reads for sample {sample}: {bac_number}", flush=True)
# Warn user if no Homo sapiens classifications
if homo_number == 0:
print(f"Warning: No Homo sapiens classifications found in kraken2 report of sample {sample}.\n", flush=True)
# Warn user if no bacteria classifications found
if bac_number == 0:
print(f"Warning: No Bacteria classifications found in kraken2 report of sample {sample}.\n", flush=True)
# Return values
return homo_number, bac_number
# Function that creates csv based on outputs of flagstat and extract homo sapiens reads functions
def read_count_extractor(sample, output_dir):
"""
Extracts read counts for a given sample and writes them to a CSV file.
Calls kraken2_stats and flagstat functions.
Writes information to summary CSV file ("read_counts.csv").
Args:
sample (str): The path to the input sample file.
output_dir (str): The directory where the output CSV file will be written.
Returns:
None
"""
output_file = f"{output_dir}/read_counts.csv"
# Run flagstat for the current sample
total_input, properly_paired, unconcordantly_aligned = flagstat(sample, output_dir)
# extract homo sapiens reads
homo_number, bac_number = kraken2_stats(sample, output_dir)
# Write values from each sample to csv
with open(output_file, 'a', newline='') as outfile:
out_writer = csv.writer(outfile)
out_writer.writerow([sample, total_input, properly_paired, unconcordantly_aligned, homo_number, bac_number])
# Function to run kraken-biom
def run_kraken_biom(output_dir):
"""
Runs kraken-biom command to generate a classified.biom file with taxonomic information of all samples.
Prints status message to terminal when finished.
Parameters:
output_dir (str): The output directory where the classified.biom file will be saved.
Returns:
None
"""
# Command to run kraken-biom
command_kr_biom = [
"kraken-biom",
"-k",
f"{output_dir}/kraken2/reports/",
"-o",
f"{output_dir}/classified.biom"
]
# Run kraken-biom cmd
subprocess.run(command_kr_biom)
# Print status message
print(f"\nCreated {os.path.join(output_dir,'classified.biom')} with taxonomic information of all samples :)\n", flush=True)
# Main function that calls other functions based on samplesheet and passed arguments,
def main():
"""
Run pipeline with BWA-MEM2 host depletion, samtools, kraken2, biom conversion and read count summarizer.
Does this by calling run_bwa_mem2, run_samtools, run_kraken2, read_count_extractor functions
Args:
--samplesheet (str): Path to CSV file containing sample information.
--out_dir (str): Path to output directory (not ending with /).
Returns:
None
"""
# Create parser
parser = argparse.ArgumentParser(description='Run pipeline with BWA-MEM2 host depletion, samtools, kraken2, biom conversion and read count summarizer.')
# Add the arguments
parser.add_argument('--samplesheet', type=str, required=True, help='Path to CSV file containing sample information')
parser.add_argument('--out_dir', type=str, required=True, help='Path to output directory (not ending with /)')
# Parse!
args = parser.parse_args()
# Assign arguments to variables
samplesheet = args.samplesheet
output_dir = args.out_dir
# Create output dir if it does not exist
if not os.path.exists(output_dir):
os.makedirs(output_dir)
# Create subdirectories for bwa-mem2 and kraken2 output if they do not exist
if not os.path.exists(f"{output_dir}/bwamem2"):
os.makedirs(f"{output_dir}/bwamem2")
if not os.path.exists(f"{output_dir}/kraken2"):
os.makedirs(f"{output_dir}/kraken2")
if not os.path.exists(f"{output_dir}/kraken2/reports"):
os.makedirs(f"{output_dir}/kraken2/reports")
# Create file for summary of extracted readcounts
output_file = f"{output_dir}/read_counts.csv"
# Create output file if it does not exist
with open(output_file, 'w', newline='') as outfile:
# Create headers
headers = ['sample', 'total_input_reads', 'human_bwa', 'not_human_bwa', 'human_kraken', 'bacterial_kraken']
# Write headers to the output file
out_writer = csv.writer(outfile)
out_writer.writerow(headers)
# Initialize a counter for the number of samples processed
num_samples_processed = 0
# Read sample information from the CSV file and call functions for each sample
with open(samplesheet, "r") as ssheet:
ssheet_reader = csv.DictReader(ssheet)
for row in ssheet_reader:
sample = row["sample"]
fastq_1 = row["fastq_1"]
# Run BWA-MEM2 for the current sample
run_bwa_mem2(sample, fastq_1, output_dir)
# Run samtools for the current sample
run_samtools(sample, output_dir)
# Check if fastq file contains reads
check_fastq(sample, output_dir)
# Run kraken2 for the current sample
run_kraken2(sample, output_dir)
# Extract read counts and put in summary csv
read_count_extractor(sample, output_dir)
# Up the counter
num_samples_processed += 1
# Run kraken-biom
run_kraken_biom(output_dir)
# Status messages
print(f"\nProcessed all {num_samples_processed} samples :)\n", flush=True)
print(f"\nCreated {output_file} with summarized read counts of all {num_samples_processed} samples :)\n", flush=True)
# If this script is run directly call the main function
if __name__ == '__main__':
main()