Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

29 add pod5 support #51

Merged
merged 7 commits into from
May 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion conda/meta.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{% set version = "1.3.1" %}
{% set revision = "725c239b6c1f4d6daf5788871cd47f8781358e3d" %}
{% set revision = "e23078e169eae9e1915a75236e92bafb8d00c56f" %}

package:
name: longreadsum
Expand All @@ -9,6 +9,13 @@ source:
git_url: https://github.com/WGLab/LongReadSum.git
git_rev: {{ revision }}

channels:
- bioconda
- anaconda
- conda-forge
- jannessp # for pod5
- defaults

build:
number: 0
skip: true # [py2k]
Expand All @@ -30,6 +37,8 @@ requirements:
- hdf5
- htslib
- plotly
- pod5
- pyarrow

test:
commands:
Expand Down
3 changes: 3 additions & 0 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ channels:
- bioconda
- anaconda
- conda-forge
- jannessp # for pod5
- defaults
dependencies:
- python
Expand All @@ -13,6 +14,8 @@ dependencies:
- swig
- plotly
- pytest
- pod5
- pyarrow

# conda env create --file=environment.yml
# conda activate longreadsum
77 changes: 70 additions & 7 deletions src/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,13 @@
from lib import lrst # For debugging
from src import generate_html
from src.plot_utils import *
from src.pod5_module import generate_pod5_qc
else:
# logging.debug("Running from installed package.")
import lrst
import generate_html
from plot_utils import *
from pod5_module import generate_pod5_qc

prg_name = "LongReadSum"

Expand Down Expand Up @@ -48,7 +50,7 @@ def get_common_param(margs):
parsing_error_msg = ""

if (margs.input == None or margs.input == "") and (margs.inputs == None or margs.inputs == "") and (
margs.inputPattern == None or margs.inputPattern == ""):
margs.pattern == None or margs.pattern == ""):
parsing_error_msg += "No input file(s) are provided. \n"
else:
# Group parameters into an array
Expand All @@ -59,14 +61,15 @@ def get_common_param(margs):
if not (margs.inputs == None or margs.inputs == ""):
file_list = [file_str.strip() for file_str in margs.inputs.split(',')]
param_dict["input_files"].extend(file_list)
if not (margs.inputPattern == None or margs.inputPattern == ""):
pat_split = margs.inputPattern.split("*")
if not (margs.pattern == None or margs.pattern == ""):
pat_split = margs.pattern.split("*")
param_dict["input_files"].extend(
glob(os.path.join("*".join(pat_split[:-1]), "*" + pat_split[-1])))

# Number of reads to sample
read_count = margs.readCount
read_count = int(margs.read_count[0])
param_dict["read_count"] = read_count
logging.info("Number of reads to sample: %d", read_count)

if len(param_dict["input_files"]) == 0:
parsing_error_msg += "No input file(s) can be found. \n"
Expand Down Expand Up @@ -433,6 +436,53 @@ def fast5_signal_module(margs):
else:
logging.error("QC did not generate.")


def pod5_module(margs):
    """Entry point for the POD5 subcommand: run signal QC and build the HTML report.

    Args:
        margs: parsed argparse namespace for the 'pod5' subparser.

    Side effects: writes the QC output and HTML report into the configured
    output folder; exits with the subcommand help text if parameter
    validation fails.
    """
    # Shared CLI parameters (input files, output folder, threads, ...).
    param_dict = get_common_param(margs)
    if param_dict == {}:
        # Parameter validation failed: show the subcommand help and stop.
        parser.parse_args(['pod5', '--help'])
        sys.exit(0)

    logging.info('Input file(s) are:\n%s', '\n'.join(param_dict["input_files"]))
    param_dict["out_prefix"] += "POD5"

    # Flatten the common parameters into the structure generate_pod5_qc() expects.
    qc_params = {
        'threads': param_dict["threads"],
        'rdm_seed': param_dict["random_seed"],
        'downsample_percentage': param_dict["downsample_percentage"],
        'output_folder': str(param_dict["output_folder"]),
        'out_prefix': str(param_dict["out_prefix"]),
        'other_flags': 0,  # 0 for normal QC, 1 for signal statistics output
        'input_files': [str(input_file) for input_file in param_dict["input_files"]],
        # Optional comma-separated read ID filter; empty string selects all reads.
        'read_ids': margs.read_ids if margs.read_ids else "",
    }

    read_signal_dict = generate_pod5_qc(qc_params)
    if read_signal_dict is None:
        logging.error("QC did not generate.")
        return

    logging.info("QC generated.")
    logging.info("Generating HTML report...")
    plot_filepaths = plot_pod5(read_signal_dict, param_dict)
    webpage_title = "POD5 QC"
    plot_names = ["basic_st", "read_length_bar", "read_length_hist", "base_counts",
                  "basic_info", "base_quality", "read_avg_base_quality", "ont_signal"]
    html_generator = generate_html.ST_HTML_Generator(
        [plot_names, webpage_title, param_dict], plot_filepaths, static=False)
    html_generator.generate_st_html(signal_plots=True)
    logging.info("Done. Output files are in %s", param_dict["output_folder"])


# Set up the argument parser
parser = argparse.ArgumentParser(description="QC tools for long-read sequencing data",
epilog="Example with single inputs:\n"
Expand Down Expand Up @@ -461,7 +511,7 @@ def fast5_signal_module(margs):
help="Multiple comma-separated input filepaths", )

input_files_group.add_argument(
"-P", "--inputPattern", type=str, default=None,
"-P", "--pattern", type=str, default=None,
help="Use pattern matching (*) to specify multiple input files. Enclose the pattern in double quotes.")

# Plot style parameters
Expand All @@ -473,8 +523,8 @@ def fast5_signal_module(margs):

# Number of reads to sample
common_grp_param.add_argument(
"-R", "--readCount", nargs="+", type=int, default=8,
help="Set the number of reads to randomly sample from the file. Default: 8.")
"-R", "--read-count", nargs="+", type=int, default=8,
help="Set the number of reads to randomly sample from the file. Default: 3.")

# Misc. parameters
input_files_group.add_argument("-p", "--downsample_percentage", type=float, default=1.0,
Expand Down Expand Up @@ -538,6 +588,19 @@ def fast5_signal_module(margs):
fast5_signal_parser.add_argument("-r", "--read_ids", type=str, default=None,
help="A comma-separated list of read IDs to extract from the file.")

# POD5 input file
pod5_parser = subparsers.add_parser('pod5',
parents=[parent_parser],
help="POD5 file input",
description="For example:\n"
"python %(prog)s -i input.pod5 -o /output_directory/",
formatter_class=RawTextHelpFormatter)
pod5_parser.set_defaults(func=pod5_module)

# Add an argument for specifying the read names to extract
pod5_parser.add_argument("-r", "--read_ids", type=str, default=None,
help="A comma-separated list of read IDs to extract from the file.")

# Sequencing summary text file input
seqtxt_parser = subparsers.add_parser('seqtxt',
parents=[parent_parser],
Expand Down
4 changes: 0 additions & 4 deletions src/fast5_module.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -592,10 +592,6 @@ int generateQCForFAST5(Input_Para &_input_data, Output_FAST5 &output_data)
}

if (signal_mode == true) {
// // Generate the signal data QC output
// std::string signal_raw_csv(_input_data.output_folder + "/FAST5_signal_raw.csv");
// std::string signal_qc_csv(_input_data.output_folder + "/FAST5_signal_QC.csv");

// Loop through each input file and get the QC data across files
size_t file_count = _input_data.num_input_files;
for (size_t i = 0; i < file_count; i++)
Expand Down
9 changes: 9 additions & 0 deletions src/lrst.i
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,15 @@ lrst.i: SWIG module defining the Python wrapper for our C++ modules
$result = list;
}

// Map std::string arrays to Python lists
%typemap(out) std::string[ANY] {
PyObject *list = PyList_New($1_dim0);
for (int i = 0; i < $1_dim0; i++) {
PyList_SetItem(list, i, PyUnicode_FromString($1[i].c_str()));
}
$result = list;
}

%include <std_string.i>
%include <stdint.i>
%include <std_vector.i>
Expand Down
126 changes: 122 additions & 4 deletions src/plot_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,11 @@
import plotly.graph_objs as go
from plotly.subplots import make_subplots

# Select the lrst (SWIG wrapper) import path depending on how this module was
# loaded: when imported as part of the 'src' package (running from the source
# checkout) the wrapper lives under lib/; otherwise assume the installed
# package layout where lrst is importable directly.
if __name__ == 'src.plot_utils':
    from lib import lrst # For debugging
else:
    import lrst

# Set up logging
import logging
logging.basicConfig(level=logging.INFO)
Expand Down Expand Up @@ -376,24 +381,117 @@ def plot(output_data, para_dict, file_type):

return plot_filepaths

def plot_pod5(output_dict, para_dict):
    """Plot the ONT POD5 signal data for a random sample of reads.

    Args:
        output_dict: dict mapping read name -> per-read dict with keys
            'signal' (sequence of raw signal values), 'mean', 'std',
            'median', 'skewness', 'kurtosis'.
        para_dict: common parameter dict; uses 'output_folder',
            'markersize', and 'read_count' (max reads to plot).

    Returns:
        The plot filepath structure from getDefaultPlotFilenames(), with the
        summary table filled in ('basic_st') and the per-read dynamic signal
        plots stored under ['ont_signal']['dynamic'].

    Side effects: writes one <read_name>_QC.csv file per plotted read into
    the output folder.
    """
    out_path = para_dict["output_folder"]
    plot_filepaths = getDefaultPlotFilenames()

    # Create the summary table
    create_pod5_table(output_dict, plot_filepaths)

    # Generate the signal plots
    marker_size = para_dict["markersize"]
    read_count_max = para_dict["read_count"]

    # Hoist the read-name list: indexing output_dict.keys() inside the loop
    # would rebuild the list per read (O(n^2) overall).
    read_names = list(output_dict.keys())
    read_count = len(read_names)
    logging.info("Plotting signal data for {} reads".format(read_count))

    # Randomly sample a small set of reads if it is a large dataset.
    # Fix: sample indices from the FULL read set (range(read_count)), not from
    # range(read_sample_size) -- the original always picked the first k reads.
    read_sample_size = min(read_count_max, read_count)
    read_indices = sample(range(read_count), read_sample_size)

    if read_sample_size < read_count:
        logging.info("Randomly sampling {} reads from the total of {} reads".format(read_sample_size, read_count))
    else:
        logging.info("Plotting signal data for all {} reads".format(read_count))

    # Plot the reads
    output_html_plots = {}
    for read_index in read_indices:
        # Create the figure
        fig = go.Figure()

        # Get the read data
        nth_read_name = read_names[read_index]
        read_data = output_dict[nth_read_name]
        nth_read_data = read_data['signal']
        signal_length = len(nth_read_data)
        logging.info("Signal data count for read {}: {}".format(nth_read_name, signal_length))
        nth_read_mean = read_data['mean']
        nth_read_std = read_data['std']
        nth_read_median = read_data['median']
        nth_read_skewness = read_data['skewness']
        nth_read_kurtosis = read_data['kurtosis']

        # Write the per-read QC statistics to CSV. The context manager
        # guarantees the file is closed even if plotting raises; newline=''
        # is the csv-module convention to avoid blank lines on Windows.
        csv_qc_filepath = os.path.join(out_path, nth_read_name + '_QC.csv')
        with open(csv_qc_filepath, 'w', newline='') as qc_file:
            qc_writer = csv.writer(qc_file)
            qc_writer.writerow(["Raw_Signal", "Length", "Mean", "Median", "StdDev", "PearsonSkewnessCoeff", "Kurtosis"])
            qc_writer.writerow([nth_read_data, signal_length, nth_read_mean, nth_read_median,
                                nth_read_std, nth_read_skewness, nth_read_kurtosis])

        # Scatter of sample index vs. raw signal value
        x = np.arange(signal_length)
        fig.add_trace(go.Scatter(
            x=x, y=nth_read_data,
            mode='markers',
            marker=dict(color='LightSkyBlue',
                        size=5,
                        line=dict(color='MediumPurple', width=2)),
            opacity=0.5))

        # Update the plot style
        fig.update_layout(
            title=nth_read_name,
            yaxis_title="Signal",
            showlegend=False,
            font=dict(size=PLOT_FONT_SIZE)
        )
        fig.update_traces(marker={'size': marker_size})
        fig.update_xaxes(title="Index")

        # Append the dynamic HTML object to the output structure
        output_html_plots[nth_read_name] = fig.to_html(full_html=False)

    # Update the plot filepaths
    plot_filepaths['ont_signal']['dynamic'] = output_html_plots

    return plot_filepaths


# Plot the ONT FAST5 signal data
def plot_signal(output_data, para_dict):
"""Plot the ONT FAST5 signal data for a random sample of reads."""

# Get input parameters
output_dir = para_dict["output_folder"]
font_size = para_dict["fontsize"]
# font_size = para_dict["fontsize"]
marker_size = para_dict["markersize"]
read_count_max = para_dict["read_count"]

# Get read and base counts
read_count = output_data.getReadCount()
logging.info("Plotting signal data for {} reads".format(read_count))

# Randomly sample a small set of reads if it is a large dataset
read_sample_size = min(read_count_max, read_count)
unsampled_indices = list(range(0, read_sample_size))
read_indices = sample(unsampled_indices, read_sample_size)

if read_sample_size < read_count:
logging.info("Randomly sampling {} reads from the total of {} reads".format(read_sample_size, read_count))
else:
logging.info("Plotting signal data for all {} reads".format(read_count))

# Plot the reads
output_html_plots = {}
for read_index in read_indices:
Expand Down Expand Up @@ -489,7 +587,7 @@ def plot_signal(output_data, para_dict):

return output_html_plots

# Create a summary table for the basic statistics
# Create a summary table for the basic statistics from the C++ output data
def create_summary_table(output_data, plot_filepaths, file_type):
plot_filepaths["basic_st"] = {}
plot_filepaths["basic_st"]['file'] = ""
Expand All @@ -500,7 +598,7 @@ def create_summary_table(output_data, plot_filepaths, file_type):
if file_type == 'FAST5s':
file_type_label = 'FAST5'

plot_filepaths["basic_st"]['description'] = "{} Basic statistics".format(file_type_label)
plot_filepaths["basic_st"]['description'] = "{} Basic Statistics".format(file_type_label)

if file_type == 'BAM':
table_str = "<table>\n<thead>\n<tr><th>Measurement</th><th>Mapped</th><th>Unmapped</th><th>All</th></tr>\n" \
Expand Down Expand Up @@ -603,6 +701,26 @@ def create_summary_table(output_data, plot_filepaths, file_type):
table_str += "\n</tbody>\n</table>"
plot_filepaths["basic_st"]['detail'] = table_str

def create_pod5_table(output_dict, plot_filepaths):
    """Create a summary table for the ONT POD5 signal data."""
    file_type_label = "POD5"

    # Build the 'basic_st' entry in one literal rather than field-by-field.
    summary = {
        'file': "",
        'title': "Summary Table",
        'description': f"{file_type_label} Basic Statistics",
    }

    # Only statistic currently reported: total number of reads with signal data.
    total_reads = len(output_dict)

    # Two-column HTML table: measurement name, right-aligned integer value.
    row_template = "<tr><td>{}</td><td style=\"text-align:right\">{:,d}</td></tr>"
    parts = [
        "<table>\n<thead>\n<tr><th>Measurement</th><th>Statistics</th></tr>\n</thead>",
        "\n<tbody>",
        row_template.format("#Total Reads", total_reads),
        "\n</tbody>\n</table>",
    ]
    summary['detail'] = "".join(parts)

    plot_filepaths["basic_st"] = summary


def plot_alignment_numbers(data):
category = ['Primary Alignments', 'Supplementary Alignments', 'Secondary Alignments',
Expand Down
Loading
Loading