Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

29 add pod5 support #51

Merged
merged 7 commits into from
May 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion conda/meta.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{% set version = "1.3.1" %}
{% set revision = "725c239b6c1f4d6daf5788871cd47f8781358e3d" %}
{% set revision = "e23078e169eae9e1915a75236e92bafb8d00c56f" %}

package:
name: longreadsum
Expand All @@ -9,6 +9,13 @@ source:
git_url: https://github.com/WGLab/LongReadSum.git
git_rev: {{ revision }}

channels:
- bioconda
- anaconda
- conda-forge
- jannessp # for pod5
- defaults

build:
number: 0
skip: true # [py2k]
Expand All @@ -30,6 +37,8 @@ requirements:
- hdf5
- htslib
- plotly
- pod5
- pyarrow

test:
commands:
Expand Down
3 changes: 3 additions & 0 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ channels:
- bioconda
- anaconda
- conda-forge
- jannessp # for pod5
- defaults
dependencies:
- python
Expand All @@ -13,6 +14,8 @@ dependencies:
- swig
- plotly
- pytest
- pod5
- pyarrow

# conda env create --file=environment.yml
# conda activate longreadsum
77 changes: 70 additions & 7 deletions src/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,13 @@
from lib import lrst # For debugging
from src import generate_html
from src.plot_utils import *
from src.pod5_module import generate_pod5_qc
else:
# logging.debug("Running from installed package.")
import lrst
import generate_html
from plot_utils import *
from pod5_module import generate_pod5_qc

prg_name = "LongReadSum"

Expand Down Expand Up @@ -48,7 +50,7 @@ def get_common_param(margs):
parsing_error_msg = ""

if (margs.input == None or margs.input == "") and (margs.inputs == None or margs.inputs == "") and (
margs.inputPattern == None or margs.inputPattern == ""):
margs.pattern == None or margs.pattern == ""):
parsing_error_msg += "No input file(s) are provided. \n"
else:
# Group parameters into an array
Expand All @@ -59,14 +61,15 @@ def get_common_param(margs):
if not (margs.inputs == None or margs.inputs == ""):
file_list = [file_str.strip() for file_str in margs.inputs.split(',')]
param_dict["input_files"].extend(file_list)
if not (margs.inputPattern == None or margs.inputPattern == ""):
pat_split = margs.inputPattern.split("*")
if not (margs.pattern == None or margs.pattern == ""):
pat_split = margs.pattern.split("*")
param_dict["input_files"].extend(
glob(os.path.join("*".join(pat_split[:-1]), "*" + pat_split[-1])))

# Number of reads to sample
read_count = margs.readCount
read_count = int(margs.read_count[0])
param_dict["read_count"] = read_count
logging.info("Number of reads to sample: %d", read_count)

if len(param_dict["input_files"]) == 0:
parsing_error_msg += "No input file(s) can be found. \n"
Expand Down Expand Up @@ -433,6 +436,53 @@ def fast5_signal_module(margs):
else:
logging.error("QC did not generate.")


def pod5_module(margs):
    """Entry point for the POD5 subcommand: run signal QC and build the HTML report.

    Args:
        margs: parsed argparse namespace for the 'pod5' subparser.

    Side effects: writes the QC output and HTML report into the configured
    output folder; exits with the subcommand help text if parameter
    validation fails.
    """
    # Shared CLI parameters (input files, output folder, threads, ...).
    param_dict = get_common_param(margs)
    if param_dict == {}:
        # Parameter validation failed: show the subcommand help and stop.
        parser.parse_args(['pod5', '--help'])
        sys.exit(0)

    logging.info('Input file(s) are:\n%s', '\n'.join(param_dict["input_files"]))
    param_dict["out_prefix"] += "POD5"

    # Flatten the common parameters into the structure generate_pod5_qc() expects.
    qc_params = {
        'threads': param_dict["threads"],
        'rdm_seed': param_dict["random_seed"],
        'downsample_percentage': param_dict["downsample_percentage"],
        'output_folder': str(param_dict["output_folder"]),
        'out_prefix': str(param_dict["out_prefix"]),
        'other_flags': 0,  # 0 for normal QC, 1 for signal statistics output
        'input_files': [str(input_file) for input_file in param_dict["input_files"]],
        # Optional comma-separated read ID filter; empty string selects all reads.
        'read_ids': margs.read_ids if margs.read_ids else "",
    }

    read_signal_dict = generate_pod5_qc(qc_params)
    if read_signal_dict is None:
        logging.error("QC did not generate.")
        return

    logging.info("QC generated.")
    logging.info("Generating HTML report...")
    plot_filepaths = plot_pod5(read_signal_dict, param_dict)
    webpage_title = "POD5 QC"
    plot_names = ["basic_st", "read_length_bar", "read_length_hist", "base_counts",
                  "basic_info", "base_quality", "read_avg_base_quality", "ont_signal"]
    html_generator = generate_html.ST_HTML_Generator(
        [plot_names, webpage_title, param_dict], plot_filepaths, static=False)
    html_generator.generate_st_html(signal_plots=True)
    logging.info("Done. Output files are in %s", param_dict["output_folder"])


# Set up the argument parser
parser = argparse.ArgumentParser(description="QC tools for long-read sequencing data",
epilog="Example with single inputs:\n"
Expand Down Expand Up @@ -461,7 +511,7 @@ def fast5_signal_module(margs):
help="Multiple comma-separated input filepaths", )

input_files_group.add_argument(
"-P", "--inputPattern", type=str, default=None,
"-P", "--pattern", type=str, default=None,
help="Use pattern matching (*) to specify multiple input files. Enclose the pattern in double quotes.")

# Plot style parameters
Expand All @@ -473,8 +523,8 @@ def fast5_signal_module(margs):

# Number of reads to sample
common_grp_param.add_argument(
"-R", "--readCount", nargs="+", type=int, default=8,
help="Set the number of reads to randomly sample from the file. Default: 8.")
"-R", "--read-count", nargs="+", type=int, default=8,
help="Set the number of reads to randomly sample from the file. Default: 3.")

# Misc. parameters
input_files_group.add_argument("-p", "--downsample_percentage", type=float, default=1.0,
Expand Down Expand Up @@ -538,6 +588,19 @@ def fast5_signal_module(margs):
fast5_signal_parser.add_argument("-r", "--read_ids", type=str, default=None,
help="A comma-separated list of read IDs to extract from the file.")

# POD5 input file
pod5_parser = subparsers.add_parser('pod5',
parents=[parent_parser],
help="POD5 file input",
description="For example:\n"
"python %(prog)s -i input.pod5 -o /output_directory/",
formatter_class=RawTextHelpFormatter)
pod5_parser.set_defaults(func=pod5_module)

# Add an argument for specifying the read names to extract
pod5_parser.add_argument("-r", "--read_ids", type=str, default=None,
help="A comma-separated list of read IDs to extract from the file.")

# Sequencing summary text file input
seqtxt_parser = subparsers.add_parser('seqtxt',
parents=[parent_parser],
Expand Down
4 changes: 0 additions & 4 deletions src/fast5_module.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -592,10 +592,6 @@ int generateQCForFAST5(Input_Para &_input_data, Output_FAST5 &output_data)
}

if (signal_mode == true) {
// // Generate the signal data QC output
// std::string signal_raw_csv(_input_data.output_folder + "/FAST5_signal_raw.csv");
// std::string signal_qc_csv(_input_data.output_folder + "/FAST5_signal_QC.csv");

// Loop through each input file and get the QC data across files
size_t file_count = _input_data.num_input_files;
for (size_t i = 0; i < file_count; i++)
Expand Down
9 changes: 9 additions & 0 deletions src/lrst.i
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,15 @@ lrst.i: SWIG module defining the Python wrapper for our C++ modules
$result = list;
}

// Map std::string arrays to Python lists
%typemap(out) std::string[ANY] {
PyObject *list = PyList_New($1_dim0);
for (int i = 0; i < $1_dim0; i++) {
PyList_SetItem(list, i, PyUnicode_FromString($1[i].c_str()));
}
$result = list;
}

%include <std_string.i>
%include <stdint.i>
%include <std_vector.i>
Expand Down
126 changes: 122 additions & 4 deletions src/plot_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,11 @@
import plotly.graph_objs as go
from plotly.subplots import make_subplots

# Select the lrst (SWIG wrapper) import path depending on how this module was
# loaded: when imported as part of the 'src' package (running from the source
# checkout) the wrapper lives under lib/; otherwise assume the installed
# package layout where lrst is importable directly.
if __name__ == 'src.plot_utils':
    from lib import lrst # For debugging
else:
    import lrst

# Set up logging
import logging
logging.basicConfig(level=logging.INFO)
Expand Down Expand Up @@ -376,24 +381,117 @@ def plot(output_data, para_dict, file_type):

return plot_filepaths

def plot_pod5(output_dict, para_dict):
    """Plot the ONT POD5 signal data for a random sample of reads.

    Args:
        output_dict: dict mapping read name -> per-read dict with keys
            'signal' (sequence of raw signal values), 'mean', 'std',
            'median', 'skewness', 'kurtosis'.
        para_dict: common parameter dict; uses 'output_folder',
            'markersize', and 'read_count' (max reads to plot).

    Returns:
        The plot filepath structure from getDefaultPlotFilenames(), with the
        summary table filled in ('basic_st') and the per-read dynamic signal
        plots stored under ['ont_signal']['dynamic'].

    Side effects: writes one <read_name>_QC.csv file per plotted read into
    the output folder.
    """
    out_path = para_dict["output_folder"]
    plot_filepaths = getDefaultPlotFilenames()

    # Create the summary table
    create_pod5_table(output_dict, plot_filepaths)

    # Generate the signal plots
    marker_size = para_dict["markersize"]
    read_count_max = para_dict["read_count"]

    # Hoist the read-name list: indexing output_dict.keys() inside the loop
    # would rebuild the list per read (O(n^2) overall).
    read_names = list(output_dict.keys())
    read_count = len(read_names)
    logging.info("Plotting signal data for {} reads".format(read_count))

    # Randomly sample a small set of reads if it is a large dataset.
    # Fix: sample indices from the FULL read set (range(read_count)), not from
    # range(read_sample_size) -- the original always picked the first k reads.
    read_sample_size = min(read_count_max, read_count)
    read_indices = sample(range(read_count), read_sample_size)

    if read_sample_size < read_count:
        logging.info("Randomly sampling {} reads from the total of {} reads".format(read_sample_size, read_count))
    else:
        logging.info("Plotting signal data for all {} reads".format(read_count))

    # Plot the reads
    output_html_plots = {}
    for read_index in read_indices:
        # Create the figure
        fig = go.Figure()

        # Get the read data
        nth_read_name = read_names[read_index]
        read_data = output_dict[nth_read_name]
        nth_read_data = read_data['signal']
        signal_length = len(nth_read_data)
        logging.info("Signal data count for read {}: {}".format(nth_read_name, signal_length))
        nth_read_mean = read_data['mean']
        nth_read_std = read_data['std']
        nth_read_median = read_data['median']
        nth_read_skewness = read_data['skewness']
        nth_read_kurtosis = read_data['kurtosis']

        # Write the per-read QC statistics to CSV. The context manager
        # guarantees the file is closed even if plotting raises; newline=''
        # is the csv-module convention to avoid blank lines on Windows.
        csv_qc_filepath = os.path.join(out_path, nth_read_name + '_QC.csv')
        with open(csv_qc_filepath, 'w', newline='') as qc_file:
            qc_writer = csv.writer(qc_file)
            qc_writer.writerow(["Raw_Signal", "Length", "Mean", "Median", "StdDev", "PearsonSkewnessCoeff", "Kurtosis"])
            qc_writer.writerow([nth_read_data, signal_length, nth_read_mean, nth_read_median,
                                nth_read_std, nth_read_skewness, nth_read_kurtosis])

        # Scatter of sample index vs. raw signal value
        x = np.arange(signal_length)
        fig.add_trace(go.Scatter(
            x=x, y=nth_read_data,
            mode='markers',
            marker=dict(color='LightSkyBlue',
                        size=5,
                        line=dict(color='MediumPurple', width=2)),
            opacity=0.5))

        # Update the plot style
        fig.update_layout(
            title=nth_read_name,
            yaxis_title="Signal",
            showlegend=False,
            font=dict(size=PLOT_FONT_SIZE)
        )
        fig.update_traces(marker={'size': marker_size})
        fig.update_xaxes(title="Index")

        # Append the dynamic HTML object to the output structure
        output_html_plots[nth_read_name] = fig.to_html(full_html=False)

    # Update the plot filepaths
    plot_filepaths['ont_signal']['dynamic'] = output_html_plots

    return plot_filepaths


# Plot the ONT FAST5 signal data
def plot_signal(output_data, para_dict):
"""Plot the ONT FAST5 signal data for a random sample of reads."""

# Get input parameters
output_dir = para_dict["output_folder"]
font_size = para_dict["fontsize"]
# font_size = para_dict["fontsize"]
marker_size = para_dict["markersize"]
read_count_max = para_dict["read_count"]

# Get read and base counts
read_count = output_data.getReadCount()
logging.info("Plotting signal data for {} reads".format(read_count))

# Randomly sample a small set of reads if it is a large dataset
read_sample_size = min(read_count_max, read_count)
unsampled_indices = list(range(0, read_sample_size))
read_indices = sample(unsampled_indices, read_sample_size)

if read_sample_size < read_count:
logging.info("Randomly sampling {} reads from the total of {} reads".format(read_sample_size, read_count))
else:
logging.info("Plotting signal data for all {} reads".format(read_count))

# Plot the reads
output_html_plots = {}
for read_index in read_indices:
Expand Down Expand Up @@ -489,7 +587,7 @@ def plot_signal(output_data, para_dict):

return output_html_plots

# Create a summary table for the basic statistics
# Create a summary table for the basic statistics from the C++ output data
def create_summary_table(output_data, plot_filepaths, file_type):
plot_filepaths["basic_st"] = {}
plot_filepaths["basic_st"]['file'] = ""
Expand All @@ -500,7 +598,7 @@ def create_summary_table(output_data, plot_filepaths, file_type):
if file_type == 'FAST5s':
file_type_label = 'FAST5'

plot_filepaths["basic_st"]['description'] = "{} Basic statistics".format(file_type_label)
plot_filepaths["basic_st"]['description'] = "{} Basic Statistics".format(file_type_label)

if file_type == 'BAM':
table_str = "<table>\n<thead>\n<tr><th>Measurement</th><th>Mapped</th><th>Unmapped</th><th>All</th></tr>\n" \
Expand Down Expand Up @@ -603,6 +701,26 @@ def create_summary_table(output_data, plot_filepaths, file_type):
table_str += "\n</tbody>\n</table>"
plot_filepaths["basic_st"]['detail'] = table_str

def create_pod5_table(output_dict, plot_filepaths):
    """Create a summary table for the ONT POD5 signal data."""
    file_type_label = "POD5"

    # Build the 'basic_st' entry in one literal rather than field-by-field.
    summary = {
        'file': "",
        'title': "Summary Table",
        'description': f"{file_type_label} Basic Statistics",
    }

    # Only statistic currently reported: total number of reads with signal data.
    total_reads = len(output_dict)

    # Two-column HTML table: measurement name, right-aligned integer value.
    row_template = "<tr><td>{}</td><td style=\"text-align:right\">{:,d}</td></tr>"
    parts = [
        "<table>\n<thead>\n<tr><th>Measurement</th><th>Statistics</th></tr>\n</thead>",
        "\n<tbody>",
        row_template.format("#Total Reads", total_reads),
        "\n</tbody>\n</table>",
    ]
    summary['detail'] = "".join(parts)

    plot_filepaths["basic_st"] = summary


def plot_alignment_numbers(data):
category = ['Primary Alignments', 'Supplementary Alignments', 'Secondary Alignments',
Expand Down
Loading
Loading