Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add single_sample_qc report resources to core #81

Merged
merged 7 commits into from
Jan 2, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion src/core/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -29,4 +29,6 @@ ARG OUT_DIR=/tmp/ugbio
COPY --from=build $OUT_DIR $OUT_DIR

RUN WHEEL_FILE=$(find ${OUT_DIR} -name "*.whl") && \
pip install "${WHEEL_FILE}[vcfbed]"
pip install "${WHEEL_FILE}[vcfbed,reports]"

COPY --from=build ./src/core/ugbio_core/reports ./src/core/ugbio_core/reports
7 changes: 7 additions & 0 deletions src/core/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,13 @@ vcfbed = [
"bgzip>=0.5.0",
]

reports = [
"papermill>=2.6.0",
"jupyter>=1.1.1",
"nbconvert>=7.16.4",
"mistune>=2.0.3,<3.1",# 3.1.0 breaks nbconvert. Can be removed when jupyter/nbconvert#2198 is fixed
]

[project.license]
text = "Apache-2.0"

Expand Down
3 changes: 3 additions & 0 deletions src/core/tests/resources/input_for_html_report.h5
Git LFS file not shown
34 changes: 34 additions & 0 deletions src/core/tests/unit/test_single_sample_qc_html_report.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import subprocess
from os.path import join as pjoin
from pathlib import Path

import pytest

# Directory three levels above this test module.
BASE_PATH = Path(__file__).parent.parent.parent
# Directory that holds the report notebook and the files shipped with it.
REPORT_BASE_PATH = BASE_PATH.joinpath("ugbio_core", "reports")
# Notebook that is executed through papermill by the test in this module.
REPORT_NOTEBOOK = REPORT_BASE_PATH.joinpath("single_sample_qc_create_html_report.ipynb")


@pytest.fixture
def resources_dir():
    """Return the ``resources`` directory located two levels above this file."""
    tests_root = Path(__file__).parent.parent
    return tests_root / "resources"


def test_single_sample_qc_create_html_report(tmpdir, resources_dir):
    """Execute the QC report notebook with papermill and render it to HTML.

    The notebook is run against the ``input_for_html_report.h5`` resource and
    the executed copy is converted with ``jupyter nbconvert``. Both commands
    run with ``tmpdir`` as working directory.

    Args:
        tmpdir: pytest temporary directory that receives all outputs.
        resources_dir: directory containing the test input files.

    Raises:
        subprocess.CalledProcessError: if papermill or nbconvert exits with a
            non-zero status.
    """
    papermill_out = pjoin(tmpdir, "single_sample_qc_create_html_report.papermill.ipynb")
    input_h5_file = pjoin(resources_dir, "input_for_html_report.h5")
    base_file_name = "test"

    # Pass argument lists instead of splitting a formatted string, so that
    # paths containing whitespace reach the tools as single arguments.
    papermill_cmd = [
        "papermill",
        str(REPORT_NOTEBOOK),
        str(papermill_out),
        "-p",
        "top_metrics_file",
        f"{REPORT_BASE_PATH}/top_metrics_for_tbl.csv",
        "-p",
        "input_h5_file",
        str(input_h5_file),
        "-p",
        "input_base_file_name",
        base_file_name,
    ]
    # check_call raises on a non-zero exit status, so no return-code assert is needed.
    subprocess.check_call(papermill_cmd, cwd=tmpdir)

    nbconvert_cmd = [
        "jupyter",
        "nbconvert",
        "--to",
        "html",
        str(papermill_out),
        "--template",
        "classic",
        "--no-input",
        "--output",
        f"{base_file_name}.html",
    ]
    subprocess.check_call(nbconvert_cmd, cwd=tmpdir)

    # A zero exit status alone does not prove the report was written.
    assert Path(tmpdir, f"{base_file_name}.html").is_file()
275 changes: 275 additions & 0 deletions src/core/ugbio_core/reports/single_sample_qc_create_html_report.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,275 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"pycharm": {
"is_executing": true,
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"import os\n",
"\n",
"import h5py\n",
"import pandas as pd\n",
"from IPython.display import HTML, display"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"pycharm": {
"is_executing": true,
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"pd.set_option(\"display.max_rows\", None)\n",
"pd.set_option(\"display.max_colwidth\", 0)\n",
"pd.options.display.float_format = \"{:,.2f}\".format"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"from IPython.core.interactiveshell import InteractiveShell\n",
"\n",
"InteractiveShell.ast_node_interactivity = \"all\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"pycharm": {
"name": "#%%\n"
},
"tags": [
"parameters"
]
},
"outputs": [],
"source": [
"top_metrics_file = \"top_metrics_for_tbl.csv\"\n",
"input_h5_file = \"\"\n",
"input_base_file_name = \"\""
]
},
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
  "pycharm": {
   "name": "#%%\n"
  }
 },
 "outputs": [],
 "source": [
  "# Fail early, with a clear message, when either input file is missing\n",
  "for input_path in (top_metrics_file, input_h5_file):\n",
  "    if not os.path.isfile(input_path):\n",
  "        raise ValueError(f\"Input {input_path} does not exist\")"
 ]
},
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "# Parse the CSV once and derive the first-column -> second-column lookup from the same frame\n",
  "df_features = pd.read_csv(top_metrics_file)\n",
  "dict_features = dict(zip(df_features.iloc[:, 0], df_features.iloc[:, 1]))\n",
  "list_metrics = list(set(df_features[\"metric\"]))"
 ]
},
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
  "pycharm": {
   "name": "#%%\n"
  }
 },
 "outputs": [],
 "source": [
  "# get Keys within the H5 file\n",
  "# the context manager closes the handle before pandas reopens the same file\n",
  "with h5py.File(input_h5_file, \"r\") as f:\n",
  "    list_keys = list(f.keys())\n",
  "hist_list_keys = [i for i in list_keys if i.startswith(\"histogram_\")]\n",
  "tbl_list_keys = pd.DataFrame(list(set(list_keys) - set(hist_list_keys)))\n",
  "tbl_list_keys.columns = [\"metric\"]\n",
  "del list_keys\n",
  "\n",
  "# create table merging top required metrics to display and input provided\n",
  "# ....................\n",
  "tbl_top_values = df_features.merge(tbl_list_keys, on=\"metric\", how=\"inner\")"
 ]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"HTML(\"<b>\" + \"\" + \"</b>\")\n",
"HTML(\"<hr/>\")\n",
"HTML('<h2 style=\"font-size:20px;\">' + \"Input parameters\" + \"</h2>\")\n",
"HTML(\"<hr/>\")\n",
"HTML(\"<b>\" + \"\" + \"</b>\")"
]
},
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "def wrap_df_text(df):\n",
  "    \"\"\"Display the HTML rendering of df with each backslash-n sequence replaced by <br>.\"\"\"\n",
  "    html_table = df.to_html()\n",
  "    return display(HTML(html_table.replace(\"\\\\n\", \"<br>\")))\n",
  "\n",
  "\n",
  "# Two-row table: the sample name and the path of the H5 input\n",
  "sample_info_values = [input_base_file_name, str(input_h5_file)]\n",
  "to_print_sample_info = pd.DataFrame(data={\"value\": sample_info_values}, index=[\"Sample name\", \"h5 file\"])\n",
  "to_print_sample_info[\"value\"] = to_print_sample_info[\"value\"].str.wrap(100)\n",
  "\n",
  "left_aligned = to_print_sample_info.style.set_properties(**{\"text-align\": \"left\"})\n",
  "wrap_df_text(left_aligned)"
 ]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"HTML(\"<b>\" + \"\" + \"</b>\")\n",
"HTML(\"<hr/>\")\n",
"HTML('<h2 style=\"font-size:20px;\">' + \"Summary View: Main Metrics\" + \"</h2>\")\n",
"HTML(\"<hr/>\")\n",
"HTML(\"<b>\" + \"\" + \"</b>\")"
]
},
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
  "pycharm": {
   "name": "#%%\n"
  }
 },
 "outputs": [],
 "source": [
  "separator = \"___\"\n",
  "to_print = pd.DataFrame()\n",
  "\n",
  "tbl_top = pd.DataFrame()\n",
  "for temp_metric in tbl_top_values[\"metric\"].unique():\n",
  "    sub_top_tbl = tbl_top_values[tbl_top_values[\"metric\"] == temp_metric]\n",
  "    df_h5_tbl = pd.read_hdf(input_h5_file, temp_metric).T\n",
  "    df_h5_tbl = df_h5_tbl.reset_index()\n",
  "\n",
  "    if temp_metric.startswith(\"stats\"):\n",
  "        # stats_coverage is a multiindex dataframe: join both index levels into one key\n",
  "        df_h5_tbl[\"metric\"] = df_h5_tbl[\"level_0\"] + separator + df_h5_tbl[\"level_1\"]\n",
  "        df_h5_tbl = df_h5_tbl.drop(columns=[\"level_0\", \"level_1\"]).copy()\n",
  "        df_h5_tbl.columns = [\"value\", \"key\"]\n",
  "        df_h5_tbl = df_h5_tbl[[\"key\", \"value\"]]\n",
  "    else:\n",
  "        df_h5_tbl.columns = [\"key\", \"value\"]\n",
  "\n",
  "    # keep only the keys requested for this metric in the top-metrics table\n",
  "    list_top_tbl = df_h5_tbl.merge(sub_top_tbl, on=\"key\", how=\"inner\")\n",
  "    to_print = pd.concat((to_print, list_top_tbl))\n",
  "\n",
  "to_print.index = to_print[\"key\"]\n",
  "to_print = to_print.rename({c: c.replace(\"PCT_\", \"% \") for c in to_print.index})\n",
  "to_print = to_print.rename({c: c.replace(\"PERCENT_\", \"% \") for c in to_print.index})\n",
  "to_print.index.name = None\n",
  "to_print = to_print.rename(columns={\"value\": \"\"})\n",
  "display(to_print[\"\"].to_frame())"
 ]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"HTML(\"<b>\" + \"\" + \"</b>\")\n",
"HTML(\"<hr/>\")\n",
"HTML('<h2 style=\"font-size:20px;\">' + \"Detailed View: All Metrics\" + \"</h2>\")\n",
"HTML(\"<hr/>\")\n",
"HTML(\"<b>\" + \"\" + \"</b>\")"
]
},
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
  "pycharm": {
   "name": "#%%\n"
  }
 },
 "outputs": [],
 "source": [
  "to_print = pd.DataFrame()\n",
  "sorted_keys = tbl_list_keys[\"metric\"].sort_values()\n",
  "\n",
  "for tbl_key in sorted_keys:\n",
  "    # Only top-level expressions are auto-displayed; inside a loop body the\n",
  "    # header has to be passed to display() explicitly or it is never rendered\n",
  "    display(HTML(\"<br>\" + \"<br>\" + \"<b>\" + \"Metric type: \" + tbl_key + \"</b>\" + \"<br>\"))\n",
  "    to_print = pd.read_hdf(input_h5_file, tbl_key).T\n",
  "    to_print = to_print.rename(columns={0: \"\"})\n",
  "    # Checking the index type (not its first element) also works for an empty table\n",
  "    if isinstance(to_print.index, pd.MultiIndex):\n",
  "        to_print.index = to_print.index.set_levels(to_print.index.levels[1].str.replace(\"percent_\", \"% \"), level=1)\n",
  "    else:\n",
  "        to_print = to_print.rename({c: c.replace(\"PCT_\", \"% \") for c in to_print.index})\n",
  "        to_print = to_print.rename({c: c.replace(\"PERCENT_\", \"% \") for c in to_print.index})\n",
  "    display(to_print)"
 ]
}
],
"metadata": {
"celltoolbar": "Tags",
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.17"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
20 changes: 20 additions & 0 deletions src/core/ugbio_core/reports/top_metrics_for_tbl.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
key,metric
TOTAL_READS,QualityYieldMetricsFlow
PCT_PF_READS,AlignmentSummaryMetrics
PCT_PF_READS_ALIGNED,AlignmentSummaryMetrics
PF_BASES,QualityYieldMetricsFlow
PF_Q30_BASES,QualityYieldMetricsFlow
MEAN_READ_LENGTH,AlignmentSummaryMetrics
MEAN_ALIGNED_READ_LENGTH,AlignmentSummaryMetrics
GC_NC_0_19,GcBiasSummaryMetrics
GC_NC_80_100,GcBiasSummaryMetrics
MEAN_COVERAGE,RawWgsMetrics
FOLD_90_BASE_PENALTY,RawWgsMetrics
Genome___percentile_5,stats_coverage
Exome (WG)___percentile_5,stats_coverage
Simple repeat___percentile_5,stats_coverage
PCT_20X,RawWgsMetrics
PERCENT_DUPLICATION,DuplicationMetrics
PF_INDEL_RATE,AlignmentSummaryMetrics
PF_MISMATCH_RATE,AlignmentSummaryMetrics
contamination,contamination
5 changes: 1 addition & 4 deletions src/ppmseq/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,8 @@ name = "ugbio_ppmseq"
version = "1.4.4-0dev-82"
requires-python = ">=3.11"
dependencies = [
"ugbio_core[vcfbed]",
"ugbio_core[vcfbed,reports]",
"seaborn>=0.13.2",
"papermill>=2.6.0",
"jupyter>=1.1.1",
"mistune>=2.0.3,<3.1",# Can be removed when jupyter/nbconvert#2198 is fixed
"pyarrow>=17.0.0",
"fastparquet>=2024.5.0",
]
Expand Down
Loading
Loading