Skip to content

improve summary #41

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jun 23, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/qiita-plugin-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ jobs:
shell: bash -l {0}
run: |
conda config --add channels bioconda
conda create -q --yes -n qtp-sequencing python=3.9 pip pigz quast
conda create -q --yes -n qtp-sequencing python=3.9 pip pigz quast fqtools
conda activate qtp-sequencing

export QIITA_SERVER_CERT=`pwd`/qiita-dev/qiita_core/support_files/server.crt
Expand Down
125 changes: 79 additions & 46 deletions qtp_sequencing/summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,23 +7,56 @@
# -----------------------------------------------------------------------------

from hashlib import md5
from gzip import open as gopen
from os.path import basename, join
from os.path import basename, join, dirname, exists
from base64 import b64encode
from io import BytesIO
from html import escape

from qiita_files.demux import stats as demux_stats
from qiita_client.util import system_call

import matplotlib
import pandas as pd

matplotlib.use('Agg')

import matplotlib.pyplot as plt # noqa

FILEPATH_TYPE_TO_NOT_SHOW_HEAD = ['SFF']
LINES_TO_READ_FOR_HEAD = 10
FILEPATH_TYPE_NO_FQTOOLS = ['SFF', 'FASTA_preprocessed']


def _generate_html_summary(artifact_type, filepaths, out_dir):
    """Helper method to generate html_summary

    Parameters
    ----------
    artifact_type : str
        The artifact_type to summarize
    filepaths : [(str, str)]
        A list of string pairs where the first element is the filepath and the
        second is the filepath type
    out_dir : str
        The output folder

    Returns
    -------
    str
        The str representing the artifact html summary

    Raises
    ------
    ValueError
        If the artifact is Demultiplexed but no demux file could be found
    """
    # we have 2 main cases: Demultiplexed and everything else,
    # splitting on those
    if artifact_type == 'Demultiplexed':
        # BUG FIX: validate for None *before* joining; the original joined
        # first, so a missing demux file raised TypeError inside
        # '\n'.join(None) instead of the intended ValueError.
        summary = _summary_demultiplexed(artifact_type, filepaths)
        if summary is None:
            raise ValueError("We couldn't find a demux file in your artifact")
        artifact_information = '\n'.join(summary)
    elif artifact_type == 'FASTA_preprocessed':
        artifact_information = '\n'.join(_summary_FASTA_preprocessed(
            artifact_type, filepaths, out_dir))
    else:
        # _summary_not_demultiplexed already returns a single HTML string,
        # so no join is needed here.
        artifact_information = _summary_not_demultiplexed(
            artifact_type, filepaths)

    return artifact_information


def generate_html_summary(qclient, job_id, parameters, out_dir):
Expand Down Expand Up @@ -63,23 +96,12 @@ def generate_html_summary(qclient, job_id, parameters, out_dir):
# 1.b get the artifact type_info
artifact_type = artifact_info['type']

# we have 2 main cases: Demultiplexed and everything else,
# splitting on those
if artifact_type == 'Demultiplexed':
artifact_information = _summary_demultiplexed(
artifact_type, filepaths)
if artifact_information is None:
raise ValueError("We couldn't find a demux file in your artifact")
elif artifact_type == 'FASTA_preprocessed':
artifact_information = _summary_FASTA_preprocessed(
artifact_type, filepaths, out_dir)
else:
artifact_information = _summary_not_demultiplexed(
artifact_type, filepaths)
artifact_information = _generate_html_summary(
artifact_type, filepaths, out_dir)

of_fp = join(out_dir, "artifact_%d.html" % artifact_id)
with open(of_fp, 'w') as of:
of.write('\n'.join(artifact_information))
of.write(artifact_information)

# Step 3: add the new file to the artifact using REST api
success = True
Expand Down Expand Up @@ -111,40 +133,51 @@ def _summary_not_demultiplexed(artifact_type, filepaths):
"""
# loop over each of the fps/fps_type pairs
artifact_information = []
errors = []
df = None
for fps_type, fps in sorted(filepaths.items()):
# Step 2: generate HTML summary
# md5, from http://stackoverflow.com/a/3431838
for fp in fps:
for i, fp in enumerate(fps):
fn = basename(fp)
with open(fp, "rb") as f:
hash_md5 = md5()
for chunk in iter(lambda: f.read(4096), b""):
hash_md5.update(chunk)

# getting head of the files
header = []
if artifact_type not in FILEPATH_TYPE_TO_NOT_SHOW_HEAD:
# we need to encapsulate the full for loop because gzip will
# not raise an error until you try to read
try:
with gopen(fp, 'r') as fin:
header = [escape(line.decode()) for line, _ in zip(
fin, range(LINES_TO_READ_FOR_HEAD))]
except IOError:
with open(fp, 'r') as fin:
header = [escape(line) for line, _ in zip(
fin, range(LINES_TO_READ_FOR_HEAD))]
filename = basename(fp)
artifact_information.append(
"<h3>%s (%s)</h3>" % (filename, fps_type))
artifact_information.append("<b>MD5:</b>: %s</br>" %
hash_md5.hexdigest())
if header:
artifact_information.append(
"<p style=\"font-family:'Courier New', Courier, monospace;"
"font-size:10;\">%s</p><hr/>" % (
"<br/>".join(header)))

return artifact_information
data = {'filename': fn, 'md5': hash_md5.hexdigest(),
'file_type': fps_type}

if artifact_type not in FILEPATH_TYPE_NO_FQTOOLS:
# check if the validate summary is present
if i == 0:
fdata = f'{dirname(fp)}/qtp-sequencing-validate-data.csv'
if exists(fdata):
df = pd.read_csv(fdata, index_col=None)

if df is None:
cmd = f'fqtools count {fp}'
std_out, std_err, return_value = system_call(cmd)
if std_err or return_value != 0:
errors.append(f'{fn}: {std_err}')
else:
reads = int(std_out)
else:
reads = df[(df.filename == fn) &
(df.file_type == fps_type)]
# [0] there is only one value
reads = reads.reads.values[0]
data['reads'] = reads

artifact_information.append(data)

if errors:
raise ValueError('Found errors: \n %s' % ''.join(errors))

df = pd.DataFrame(artifact_information)
order = ['file_type', 'reads'] if 'reads' in df.columns else ['file_type']
df.sort_values(order, inplace=True)

return df.to_html(index=False)


def _summary_demultiplexed(artifact_type, filepaths):
Expand Down
Loading