-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathkr_runner.py
131 lines (109 loc) · 4.5 KB
/
kr_runner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
"""
This script runs Kraken2 on files from a samplesheet.
It detects if the input data is single or paired end based on the samplesheet columns.
Make sure you are in an active conda environment with Kraken2 installed.
Usage:
python kr_runner.py --samplesheet <path_to_samplesheet> --out_dir <output_directory>
Arguments:
--samplesheet: Path to the CSV file containing sample information.
--out_dir: Path to the output directory (not ending with /).
The script performs the following steps for each sample in the samplesheet:
1. Runs Kraken2 to classify the unaligned FASTQ files and generate a report. (path to database hardcoded in script)
The script also creates subdirectories for Kraken2 output if they do not exist.
The output directory should not end with a forward slash (/).
Note: Modify the paths to the Kraken2 database if needed.
Example usage:
python pipeline.py --samplesheet /path/to/samplesheet.csv --out_dir /path/to/output_directory
"""
# Import modules
import csv
import subprocess
import os
import argparse
# Path to kraken2 database
# Modify if neeeded
kraken2_db = "/mnt/FS2/data_2/Users/Birgit/db_index/kraken2/standard/"
def run_kraken2(sample, fastq_1, fastq_2, output_dir):
"""
Runs Kraken2 with 64 threads for a given sample.
Creates Kraken2 output file and report in reports directory
Print status message to terminal when finished.
Args:
sample (str): The name of the sample.
output_dir (str): The output directory.
Returns:
None
"""
# Command to run kraken2 on paired end data
command_kraken2_paired = [
"kraken2",
"--threads",
"64",
"--db",
kraken2_db,
"--paired",
fastq_1,
fastq_2,
"--output",
f"{output_dir}/kraken2/{sample}_kraken2.out",
"--report",
f"{output_dir}/kraken2/reports/{sample}_kraken2_report.txt"
]
subprocess.run(command_kraken2_paired)
# # Check if paired or single end data
# if os.path.exists(f"{output_dir}/bwamem2/{sample}_out_unconc_1.fastq"):
# # Run the kraken2 command for paired end data
# subprocess.run(command_kraken2_paired)
# else:
# # Run the kraken2 command for single end data
# subprocess.run(command_kraken2_single)
# Print status message
print(f"\nFinished kraken2 for sample {sample}\n", flush=True)
# Main function that calls other functions based on samplesheet and passed arguments,
def main():
"""
Main function that calls other functions based on samplesheet and passed arguments.
"""
# Create parser
parser = argparse.ArgumentParser(
description=(
'Runs Kraken2 on all provided samples.'
)
)
# Add the arguments
parser.add_argument('--samplesheet', type=str, required=True,
help='Path to CSV file containing sample information')
parser.add_argument('--out_dir', type=str, required=True,
help='Path to Kraken2 output directory (not ending with /)')
# Parse!
args = parser.parse_args()
# Assign arguments to variables
samplesheet = args.samplesheet
output_dir = args.out_dir
# Create output dir if it does not exist
if not os.path.exists(output_dir):
os.makedirs(output_dir)
# Create subdirectories for Kraken2 output
if not os.path.exists(f"{output_dir}/kraken2"):
os.makedirs(f"{output_dir}/kraken2")
if not os.path.exists(f"{output_dir}/kraken2/reports"):
os.makedirs(f"{output_dir}/kraken2/reports")
# Initialize a counter for the number of samples processed
num_samples_processed = 0
# Read sample information from the CSV file and call functions for each sample
with open(samplesheet, "r") as ssheet:
ssheet_reader = csv.DictReader(ssheet)
# Determine if data is paired or single end
paired = "fastq_2" in ssheet_reader.fieldnames
for row in ssheet_reader:
sample = row["sample"]
fastq_1 = row["fastq_1"]
# Set fastq_2 to None if single end data
fastq_2 = row["fastq_2"] if paired else None
run_kraken2(sample, fastq_1, fastq_2, output_dir)
num_samples_processed += 1
# Status messages
print(f"\nProcessed all {num_samples_processed} samples :)\n", flush=True)
# If this script is run directly call the main function
if __name__ == '__main__':
main()