Ultimagen · avigailmo · Jan 2, 2025 · Dec 30, 2024 · Dec 30, 2024 · Dec 31, 2024
diff --git a/src/core/Dockerfile b/src/core/Dockerfile
@@ -29,4 +29,6 @@ ARG OUT_DIR=/tmp/ugbio
 COPY --from=build $OUT_DIR $OUT_DIR
 
 RUN WHEEL_FILE=$(find ${OUT_DIR} -name "*.whl") && \
-    pip install "${WHEEL_FILE}[vcfbed]"
+    pip install "${WHEEL_FILE}[vcfbed,reports]"
+
+COPY --from=build ./src/core/ugbio_core/reports ./src/core/ugbio_core/reports
diff --git a/src/core/pyproject.toml b/src/core/pyproject.toml
@@ -25,6 +25,13 @@ vcfbed = [
     "bgzip>=0.5.0",
 ]
 
+reports = [
+    "papermill>=2.6.0",
+    "jupyter>=1.1.1",
+    "nbconvert>=7.16.4",
+    "mistune>=2.0.3,<3.1",  # mistune 3.1.0 breaks nbconvert; this pin can be removed once jupyter/nbconvert#2198 is fixed
+]
+
 [project.license]
 text = "Apache-2.0"
 

diff --git a/src/core/tests/resources/input_for_html_report.h5 b/src/core/tests/resources/input_for_html_report.h5
diff --git a/src/core/tests/unit/test_single_sample_qc_html_report.py b/src/core/tests/unit/test_single_sample_qc_html_report.py
@@ -0,0 +1,34 @@
+import subprocess
+from os.path import join as pjoin
+from pathlib import Path
+
+import pytest
+
+BASE_PATH = Path(__file__).parent.parent.parent
+REPORT_BASE_PATH = BASE_PATH / "ugbio_core" / "reports"
+REPORT_NOTEBOOK = REPORT_BASE_PATH / "single_sample_qc_create_html_report.ipynb"
+
+
+@pytest.fixture
+def resources_dir():
+    return Path(__file__).parent.parent / "resources"
+
+
+def test_single_sample_qc_create_html_report(tmpdir, resources_dir):
+    papermill_out = pjoin(tmpdir, "single_sample_qc_create_html_report.papermill.ipynb")
+    input_h5_file = pjoin(resources_dir, "input_for_html_report.h5")
+    base_file_name = "test"
+
+    cmd = (
+        f"papermill {REPORT_NOTEBOOK} {papermill_out} "
+        f"-p top_metrics_file {REPORT_BASE_PATH}/top_metrics_for_tbl.csv "
+        f"-p input_h5_file {input_h5_file} "
+        f"-p input_base_file_name {base_file_name}"
+    )
+
+    assert subprocess.check_call(cmd.split(), cwd=tmpdir) == 0
+
+    jupyter_convert_cmd = (
+        f"jupyter nbconvert --to html {papermill_out} --template classic --no-input --output {base_file_name}.html"
+    )
+    assert subprocess.check_call(jupyter_convert_cmd.split(), cwd=tmpdir) == 0
diff --git a/src/core/ugbio_core/reports/single_sample_qc_create_html_report.ipynb b/src/core/ugbio_core/reports/single_sample_qc_create_html_report.ipynb
@@ -0,0 +1,275 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "pycharm": {
+     "is_executing": true,
+     "name": "#%%\n"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "import h5py\n",
+    "import pandas as pd\n",
+    "from IPython.display import HTML, display"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "pycharm": {
+     "is_executing": true,
+     "name": "#%%\n"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "pd.set_option(\"display.max_rows\", None)\n",
+    "pd.set_option(\"display.max_colwidth\", 0)\n",
+    "pd.options.display.float_format = \"{:,.2f}\".format"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "from IPython.core.interactiveshell import InteractiveShell\n",
+    "\n",
+    "InteractiveShell.ast_node_interactivity = \"all\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "pycharm": {
+     "name": "#%%\n"
+    },
+    "tags": [
+     "parameters"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "top_metrics_file = \"top_metrics_for_tbl.csv\"\n",
+    "input_h5_file = \"\"\n",
+    "input_base_file_name = \"\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "if not os.path.isfile(top_metrics_file):\n",
+    "    raise ValueError(f\"Input {top_metrics_file} does not exist\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# read the top-metrics table once; the first-column -> second-column lookup is derived from the loaded frame\n",
+    "df_features = pd.read_csv(top_metrics_file)\n",
+    "dict_features = dict(zip(df_features.iloc[:, 0], df_features.iloc[:, 1]))\n",
+    "list_metrics = list(set(df_features[\"metric\"]))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# get Keys within the H5 file; the handle is closed as soon as the key names are read\n",
+    "with h5py.File(input_h5_file, \"r\") as f:\n",
+    "    list_keys = list(f.keys())\n",
+    "hist_list_keys = [i for i in list_keys if i.startswith(\"histogram_\")]\n",
+    "# name the column at construction so a file holding only histogram keys yields an empty table instead of an error\n",
+    "tbl_list_keys = pd.DataFrame([i for i in list_keys if not i.startswith(\"histogram_\")], columns=[\"metric\"])\n",
+    "del list_keys\n",
+    "\n",
+    "# create table merging top required metrics to display and input provided\n",
+    "# ....................\n",
+    "tbl_top_values = df_features.merge(tbl_list_keys, on=\"metric\", how=\"inner\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "HTML(\"<b>\" + \"\" + \"</b>\")\n",
+    "HTML(\"<hr/>\")\n",
+    "HTML('<h2 style=\"font-size:20px;\">' + \"Input parameters\" + \"</h2>\")\n",
+    "HTML(\"<hr/>\")\n",
+    "HTML(\"<b>\" + \"\" + \"</b>\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "to_print_sample_info = pd.DataFrame(\n",
+    "    data={\"value\": [input_base_file_name, str(input_h5_file)]}, index=[\"Sample name\", \"h5 file\"]\n",
+    ")\n",
+    "\n",
+    "to_print_sample_info[\"value\"] = to_print_sample_info[\"value\"].str.wrap(100)\n",
+    "\n",
+    "\n",
+    "def wrap_df_text(df):\n",
+    "    return display(HTML(df.to_html().replace(\"\\\\n\", \"<br>\")))\n",
+    "\n",
+    "\n",
+    "wrap_df_text(to_print_sample_info.style.set_properties(**{\"text-align\": \"left\"}))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "HTML(\"<b>\" + \"\" + \"</b>\")\n",
+    "HTML(\"<hr/>\")\n",
+    "HTML('<h2 style=\"font-size:20px;\">' + \"Summary View: Main Metrics\" + \"</h2>\")\n",
+    "HTML(\"<hr/>\")\n",
+    "HTML(\"<b>\" + \"\" + \"</b>\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "separator = \"___\"\n",
+    "# collect one merged frame per metric and concatenate once after the loop\n",
+    "top_frames = []\n",
+    "\n",
+    "for temp_metric in tbl_top_values[\"metric\"].unique():\n",
+    "    sub_top_tbl = tbl_top_values[tbl_top_values[\"metric\"] == temp_metric]\n",
+    "    df_h5_tbl = pd.read_hdf(input_h5_file, temp_metric).T.reset_index()\n",
+    "\n",
+    "    # stats_coverage is a multiindex dataframe\n",
+    "    if temp_metric.startswith(\"stats\"):\n",
+    "        # join both index levels with the separator so they match the key column of the top-metrics CSV\n",
+    "        df_h5_tbl[\"metric\"] = df_h5_tbl[\"level_0\"] + separator + df_h5_tbl[\"level_1\"]\n",
+    "        df_h5_tbl = df_h5_tbl.drop(columns=[\"level_0\", \"level_1\"])\n",
+    "        df_h5_tbl.columns = [\"value\", \"key\"]\n",
+    "        df_h5_tbl = df_h5_tbl[[\"key\", \"value\"]]\n",
+    "    else:\n",
+    "        df_h5_tbl.columns = [\"key\", \"value\"]\n",
+    "\n",
+    "    top_frames.append(df_h5_tbl.merge(sub_top_tbl, on=\"key\", how=\"inner\"))\n",
+    "\n",
+    "# pd.concat raises on an empty list, so fall back to an empty table when no top metric is present in the h5 file\n",
+    "to_print = pd.concat(top_frames) if top_frames else pd.DataFrame(columns=[\"key\", \"value\", \"metric\"])\n",
+    "to_print.index = to_print[\"key\"]\n",
+    "to_print = to_print.rename({c: c.replace(\"PCT_\", \"% \") for c in to_print.index})\n",
+    "to_print = to_print.rename({c: c.replace(\"PERCENT_\", \"% \") for c in to_print.index})\n",
+    "to_print.index.name = None\n",
+    "to_print = to_print.rename(columns={\"value\": \"\"})\n",
+    "display(to_print[\"\"].to_frame())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "HTML(\"<b>\" + \"\" + \"</b>\")\n",
+    "HTML(\"<hr/>\")\n",
+    "HTML('<h2 style=\"font-size:20px;\">' + \"Detailed View: All Metrics\" + \"</h2>\")\n",
+    "HTML(\"<hr/>\")\n",
+    "HTML(\"<b>\" + \"\" + \"</b>\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "to_print = pd.DataFrame()\n",
+    "sorted_keys = tbl_list_keys[\"metric\"].sort_values()\n",
+    "\n",
+    "for tbl_key in sorted_keys:\n",
+    "    HTML(\"<br>\" + \"<br>\" + \"<b>\" + \"Metric type: \" + tbl_key + \"</b>\" + \"<br>\")\n",
+    "    to_print = pd.read_hdf(input_h5_file, tbl_key).T\n",
+    "    to_print = to_print.rename(columns={0: \"\"})\n",
+    "    if not isinstance(to_print.index[0], tuple):\n",
+    "        to_print = to_print.rename({c: c.replace(\"PCT_\", \"% \") for c in to_print.index})\n",
+    "        to_print = to_print.rename({c: c.replace(\"PERCENT_\", \"% \") for c in to_print.index})\n",
+    "    else:\n",
+    "        to_print.index = to_print.index.set_levels(to_print.index.levels[1].str.replace(\"percent_\", \"% \"), level=1)\n",
+    "    display(to_print)"
+   ]
+  }
+ ],
+ "metadata": {
+  "celltoolbar": "Tags",
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.17"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/src/core/ugbio_core/reports/top_metrics_for_tbl.csv b/src/core/ugbio_core/reports/top_metrics_for_tbl.csv
@@ -0,0 +1,20 @@
+key,metric
+TOTAL_READS,QualityYieldMetricsFlow
+PCT_PF_READS,AlignmentSummaryMetrics
+PCT_PF_READS_ALIGNED,AlignmentSummaryMetrics
+PF_BASES,QualityYieldMetricsFlow
+PF_Q30_BASES,QualityYieldMetricsFlow
+MEAN_READ_LENGTH,AlignmentSummaryMetrics
+MEAN_ALIGNED_READ_LENGTH,AlignmentSummaryMetrics
+GC_NC_0_19,GcBiasSummaryMetrics
+GC_NC_80_100,GcBiasSummaryMetrics
+MEAN_COVERAGE,RawWgsMetrics
+FOLD_90_BASE_PENALTY,RawWgsMetrics
+Genome___percentile_5,stats_coverage
+Exome (WG)___percentile_5,stats_coverage
+Simple repeat___percentile_5,stats_coverage
+PCT_20X,RawWgsMetrics
+PERCENT_DUPLICATION,DuplicationMetrics
+PF_INDEL_RATE,AlignmentSummaryMetrics
+PF_MISMATCH_RATE,AlignmentSummaryMetrics
+contamination,contamination
diff --git a/src/ppmseq/pyproject.toml b/src/ppmseq/pyproject.toml
@@ -3,11 +3,8 @@ name = "ugbio_ppmseq"
 version = "1.4.4-0dev-82"
 requires-python = ">=3.11"
 dependencies = [
-    "ugbio_core[vcfbed]",
+    "ugbio_core[vcfbed,reports]",
     "seaborn>=0.13.2",
-    "papermill>=2.6.0",
-    "jupyter>=1.1.1",
-    "mistune>=2.0.3,<3.1",# Can be removed when jupyter/nbconvert#2198 is fixed
     "pyarrow>=17.0.0",
     "fastparquet>=2024.5.0",
 ]