Skip to content

improve summary #41

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jun 23, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/qiita-plugin-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ jobs:
shell: bash -l {0}
run: |
conda config --add channels bioconda
conda create -q --yes -n qtp-sequencing python=3.9 pip pigz quast
conda create -q --yes -n qtp-sequencing python=3.9 pip pigz quast fqtools
conda activate qtp-sequencing

export QIITA_SERVER_CERT=`pwd`/qiita-dev/qiita_core/support_files/server.crt
Expand Down
125 changes: 79 additions & 46 deletions qtp_sequencing/summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,23 +7,56 @@
# -----------------------------------------------------------------------------

from hashlib import md5
from gzip import open as gopen
from os.path import basename, join
from os.path import basename, join, dirname, exists
from base64 import b64encode
from io import BytesIO
from html import escape

from qiita_files.demux import stats as demux_stats
from qiita_client.util import system_call

import matplotlib
import pandas as pd

matplotlib.use('Agg')

import matplotlib.pyplot as plt # noqa

FILEPATH_TYPE_TO_NOT_SHOW_HEAD = ['SFF']
LINES_TO_READ_FOR_HEAD = 10
FILEPATH_TYPE_NO_FQTOOLS = ['SFF', 'FASTA_preprocessed']


def _generate_html_summary(artifact_type, filepaths, out_dir):
    """Helper method to generate html_summary

    Parameters
    ----------
    artifact_type : str
        The artifact_type to summarize
    filepaths : [(str, str)]
        A list of string pairs where the first element is the filepath and the
        second is the filepath type
    out_dir : str
        The output folder

    Returns
    -------
    str
        The str representing the artifact html summary

    Raises
    ------
    ValueError
        If the artifact is Demultiplexed but no demux file could be found
    """
    # we have 2 main cases: Demultiplexed and everything else,
    # splitting on those
    if artifact_type == 'Demultiplexed':
        # BUG FIX: validate for None *before* joining; the original joined
        # first, so a missing demux file raised TypeError inside
        # '\n'.join(None) instead of the intended ValueError.
        summary = _summary_demultiplexed(artifact_type, filepaths)
        if summary is None:
            raise ValueError("We couldn't find a demux file in your artifact")
        artifact_information = '\n'.join(summary)
    elif artifact_type == 'FASTA_preprocessed':
        artifact_information = '\n'.join(_summary_FASTA_preprocessed(
            artifact_type, filepaths, out_dir))
    else:
        # _summary_not_demultiplexed already returns a single HTML string,
        # so no join is needed here.
        artifact_information = _summary_not_demultiplexed(
            artifact_type, filepaths)

    return artifact_information


def generate_html_summary(qclient, job_id, parameters, out_dir):
Expand Down Expand Up @@ -63,23 +96,12 @@ def generate_html_summary(qclient, job_id, parameters, out_dir):
# 1.b get the artifact type_info
artifact_type = artifact_info['type']

# we have 2 main cases: Demultiplexed and everything else,
# splitting on those
if artifact_type == 'Demultiplexed':
artifact_information = _summary_demultiplexed(
artifact_type, filepaths)
if artifact_information is None:
raise ValueError("We couldn't find a demux file in your artifact")
elif artifact_type == 'FASTA_preprocessed':
artifact_information = _summary_FASTA_preprocessed(
artifact_type, filepaths, out_dir)
else:
artifact_information = _summary_not_demultiplexed(
artifact_type, filepaths)
artifact_information = _generate_html_summary(
artifact_type, filepaths, out_dir)

of_fp = join(out_dir, "artifact_%d.html" % artifact_id)
with open(of_fp, 'w') as of:
of.write('\n'.join(artifact_information))
of.write(artifact_information)

# Step 3: add the new file to the artifact using REST api
success = True
Expand Down Expand Up @@ -111,40 +133,51 @@ def _summary_not_demultiplexed(artifact_type, filepaths):
"""
# loop over each of the fps/fps_type pairs
artifact_information = []
errors = []
df = None
for fps_type, fps in sorted(filepaths.items()):
# Step 2: generate HTML summary
# md5, from http://stackoverflow.com/a/3431838
for fp in fps:
for i, fp in enumerate(fps):
fn = basename(fp)
with open(fp, "rb") as f:
hash_md5 = md5()
for chunk in iter(lambda: f.read(4096), b""):
hash_md5.update(chunk)

# getting head of the files
header = []
if artifact_type not in FILEPATH_TYPE_TO_NOT_SHOW_HEAD:
# we need to encapsulate the full for loop because gzip will
# not raise an error until you try to read
try:
with gopen(fp, 'r') as fin:
header = [escape(line.decode()) for line, _ in zip(
fin, range(LINES_TO_READ_FOR_HEAD))]
except IOError:
with open(fp, 'r') as fin:
header = [escape(line) for line, _ in zip(
fin, range(LINES_TO_READ_FOR_HEAD))]
filename = basename(fp)
artifact_information.append(
"<h3>%s (%s)</h3>" % (filename, fps_type))
artifact_information.append("<b>MD5:</b>: %s</br>" %
hash_md5.hexdigest())
if header:
artifact_information.append(
"<p style=\"font-family:'Courier New', Courier, monospace;"
"font-size:10;\">%s</p><hr/>" % (
"<br/>".join(header)))

return artifact_information
data = {'filename': fn, 'md5': hash_md5.hexdigest(),
'file_type': fps_type}

if artifact_type not in FILEPATH_TYPE_NO_FQTOOLS:
# check if the validate summary is present
if i == 0:
fdata = f'{dirname(fp)}/qtp-sequencing-validate-data.csv'
if exists(fdata):
df = pd.read_csv(fdata, index_col=None)

if df is None:
cmd = f'fqtools count {fp}'
std_out, std_err, return_value = system_call(cmd)
if std_err or return_value != 0:
errors.append(f'{fn}: {std_err}')
else:
reads = int(std_out)
else:
reads = df[(df.filename == fn) &
(df.file_type == fps_type)]
# [0] there is only one value
reads = reads.reads.values[0]
data['reads'] = reads

artifact_information.append(data)

if errors:
raise ValueError('Found errors: \n %s' % ''.join(errors))

df = pd.DataFrame(artifact_information)
order = ['file_type', 'reads'] if 'reads' in df.columns else ['file_type']
df.sort_values(order, inplace=True)

return df.to_html(index=False)


def _summary_demultiplexed(artifact_type, filepaths):
Expand Down
Loading