From 434c1b5eb142dbb5de4319ec57a7a95cb70c177f Mon Sep 17 00:00:00 2001 From: RemyLau Date: Mon, 10 Jul 2023 20:03:09 -0400 Subject: [PATCH 1/4] add progress bar option to fit_and_eval --- src/obnb/model_trainer/base.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/obnb/model_trainer/base.py b/src/obnb/model_trainer/base.py index ff6d7620..bbe78af2 100644 --- a/src/obnb/model_trainer/base.py +++ b/src/obnb/model_trainer/base.py @@ -2,6 +2,7 @@ from copy import deepcopy import numpy as np +from tqdm.auto import tqdm import obnb.metric from obnb.typing import Any, Callable, Dict, LogLevel, Optional @@ -125,6 +126,7 @@ def fit_and_eval( split_idx: int = 0, consider_negative: bool = False, reduce: str = "none", + progress: bool = True, ) -> Dict[str, float]: """Fit model and evaluate. @@ -137,7 +139,8 @@ def fit_and_eval( x = None if dataset.feature is None else dataset.feature.mat _, _, get_predictions, compute_results = self._setup(dataset, split_idx) - for i, label_id in enumerate(dataset.label.label_ids): + pbar = tqdm(enumerate(dataset.label.label_ids), disable=not progress) + for i, label_id in pbar: y, masks = dataset.label.split( splitter=dataset.splitter, target_ids=tuple(dataset.idmap.lst), From eae45280fd72c8b45fe408894d558a9cf804b149 Mon Sep 17 00:00:00 2001 From: Remy Liu <36778645+RemyLau@users.noreply.github.com> Date: Mon, 10 Jul 2023 20:08:53 -0400 Subject: [PATCH 2/4] Created using Colaboratory --- tutorials/basic_tutorial.ipynb | 1669 ++++++++++++++++++++++++++++++++ 1 file changed, 1669 insertions(+) create mode 100644 tutorials/basic_tutorial.ipynb diff --git a/tutorials/basic_tutorial.ipynb b/tutorials/basic_tutorial.ipynb new file mode 100644 index 00000000..2e623df3 --- /dev/null +++ b/tutorials/basic_tutorial.ipynb @@ -0,0 +1,1669 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "authorship_tag": "ABX9TyO107Ua39a7xOmb/P+xC24j", + "include_colab_link": true + }, + 
"kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "fypP0bMZ-Wsu", + "outputId": "f4bf8c41-0632-42dc-b2bc-7035757928ac" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", + " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", + " Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m61.1/61.1 kB\u001b[0m \u001b[31m2.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m59.0/59.0 kB\u001b[0m \u001b[31m5.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.3/4.3 MB\u001b[0m \u001b[31m42.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m112.7/112.7 kB\u001b[0m \u001b[31m7.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m54.5/54.5 kB\u001b[0m \u001b[31m4.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + " Building wheel for obnb (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", + " Building wheel for littleutils (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n" + ] + } + ], + "source": [ + "!pip install -q git+https://github.com/krishnanlab/obnb" + ] + }, + { + "cell_type": "code", + "source": [ + "import obnb\n", + "print(f\"Installed obnb {obnb.__version__}\")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "XJbywsXU-6zH", + "outputId": "3e561b06-0f81-4545-887f-ec71f7165cc8" + }, + "execution_count": 2, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Installed obnb 0.1.1-dev\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "import obnb.data\n", + "import yaml\n", + "\n", + "root = \"datasets\"\n", + "data_version = \"obnbdata-0.1.0\"\n", + "lsc = obnb.data.DisGeNET(root, version=data_version)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "s1apiAau_GDY", + "outputId": "95ac9e0b-52ab-42db-fc1b-dba9f7691827" + }, + "execution_count": 3, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "[INFO][2023-07-10 23:25:35,227][base][download_archive] Loading DisGeNET (version='obnbdata-0.1.0')...\n", + "[INFO][2023-07-10 23:25:35,229][download][get_data_url] Download URL: https://zenodo.org/record/8045270/files/DisGeNET.zip\n", + "[INFO][2023-07-10 23:25:35,232][download][download_unzip] Downloading zip archive from https://zenodo.org/record/8045270/files/DisGeNET.zip\n", + "100%|██████████| 219k/219k [00:00<00:00, 687kB/s]\n", + "[INFO][2023-07-10 23:25:36,852][download][download_unzip] Download completed, start unpacking...\n", + "[INFO][2023-07-10 23:25:36,864][download][download_unzip] Done extracting\n", + "[INFO][2023-07-10 23:25:36,869][download][get_data_url] Download URL: https://zenodo.org/record/8045270/files/.cache.zip\n", + "[INFO][2023-07-10 23:25:36,871][download][download_unzip] Downloading zip archive from https://zenodo.org/record/8045270/files/.cache.zip\n", + "100%|██████████| 24.7M/24.7M [00:01<00:00, 13.0MB/s]\n", + 
"[INFO][2023-07-10 23:25:39,817][download][download_unzip] Download completed, start unpacking...\n", + "[INFO][2023-07-10 23:25:41,874][download][download_unzip] Done extracting\n", + "[INFO][2023-07-10 23:25:41,902][base][load_processed_data] Load processed file datasets/DisGeNET/processed/data.gmt\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "dHZSPRK0_xYz" + }, + "execution_count": 3, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "print(yaml.dump(lsc.to_config()))" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "UEZQ-LJ5_vJO", + "outputId": "a88f919b-7eab-4f94-cbfb-ba91a283ac5e" + }, + "execution_count": 4, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "data_module: DisGeNET\n", + "data_module_params:\n", + " gene_id_converter: HumanEntrez\n", + " pre_transform:\n", + " - LabelsetRangeFilterSize:\n", + " max_val: '600'\n", + " min_val: None\n", + " - LabelsetNonRedFilter:\n", + " thresholds: (0.5, 0.7)\n", + " - LabelsetRangeFilterSize:\n", + " max_val: None\n", + " min_val: '10'\n", + " version: obnbdata-0.1.0\n", + "package_version: 0.1.1-dev\n", + "processed_time: '2023-07-10 23:15:57'\n", + "\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "lsc.to_df()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 748 + }, + "id": "bxYbfhya_GAj", + "outputId": "c3f222d2-c853-4023-901c-9726ac62bed4" + }, + "execution_count": 5, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " Name Info Size 0 \\\n", + "0 MONDO:0000004 adrenocortical insufficiency 44 23530 \n", + "1 MONDO:0021034 genetic alopecia 76 340419 \n", + "2 MONDO:0000009 inherited bleeding disorder, platelet-type 37 5328 \n", + "3 MONDO:0002243 hemorrhagic disease 135 7450 \n", + "4 MONDO:0002245 blood platelet disease 329 5698 \n", + "... ... ... ... ... 
\n", + "1035 MONDO:0044976 obsolete disease of catalytic activity 11 2936 \n", + "1036 MONDO:0100130 adult acute respiratory distress syndrome 55 6347 \n", + "1037 MONDO:0100431 migraine without aura 19 796 \n", + "1038 MONDO:0100459 azoospermia 95 5889 \n", + "1039 MONDO:0100471 vitamin D deficiency 13 6197 \n", + "\n", + " 1 2 3 4 5 6 ... 584 585 \\\n", + "0 2737 55699 3284 1585 1589 50940 ... None None \n", + "1 5894 6635 92344 10913 4289 22808 ... None None \n", + "2 2533 342618 6916 2531 6915 80739 ... None None \n", + "3 342618 4618 6916 2531 421 196527 ... None None \n", + "4 7706 55135 2475 342618 79053 374569 ... None None \n", + "... ... ... ... ... ... ... ... ... ... \n", + "1035 2539 2729 2937 226 2023 3098 ... None None \n", + "1036 1906 407055 442911 5685 406953 210 ... None None \n", + "1037 79783 1909 4209 101929660 79054 1740 ... None None \n", + "1038 3077 4952 9085 2488 84464 6660 ... None None \n", + "1039 3508 10939 9772 4036 84617 7421 ... None None \n", + "\n", + " 586 587 588 589 590 591 592 593 \n", + "0 None None None None None None None None \n", + "1 None None None None None None None None \n", + "2 None None None None None None None None \n", + "3 None None None None None None None None \n", + "4 None None None None None None None None \n", + "... ... ... ... ... ... ... ... ... \n", + "1035 None None None None None None None None \n", + "1036 None None None None None None None None \n", + "1037 None None None None None None None None \n", + "1038 None None None None None None None None \n", + "1039 None None None None None None None None \n", + "\n", + "[1040 rows x 597 columns]" + ], + "text/html": [ + "\n", + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NameInfoSize0123456...584585586587588589590591592593
0MONDO:0000004adrenocortical insufficiency442353027375569932841585158950940...NoneNoneNoneNoneNoneNoneNoneNoneNoneNone
1MONDO:0021034genetic alopecia76340419589466359234410913428922808...NoneNoneNoneNoneNoneNoneNoneNoneNoneNone
2MONDO:0000009inherited bleeding disorder, platelet-type375328253334261869162531691580739...NoneNoneNoneNoneNoneNoneNoneNoneNoneNone
3MONDO:0002243hemorrhagic disease1357450342618461869162531421196527...NoneNoneNoneNoneNoneNoneNoneNoneNoneNone
4MONDO:0002245blood platelet disease3295698770655135247534261879053374569...NoneNoneNoneNoneNoneNoneNoneNoneNoneNone
..................................................................
1035MONDO:0044976obsolete disease of catalytic activity11293625392729293722620233098...NoneNoneNoneNoneNoneNoneNoneNoneNoneNone
1036MONDO:0100130adult acute respiratory distress syndrome55634719064070554429115685406953210...NoneNoneNoneNoneNoneNoneNoneNoneNoneNone
1037MONDO:0100431migraine without aura197967978319094209101929660790541740...NoneNoneNoneNoneNoneNoneNoneNoneNoneNone
1038MONDO:0100459azoospermia9558893077495290852488844646660...NoneNoneNoneNoneNoneNoneNoneNoneNoneNone
1039MONDO:0100471vitamin D deficiency13619735081093997724036846177421...NoneNoneNoneNoneNoneNoneNoneNoneNoneNone
\n", + "

1040 rows × 597 columns

\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + "
\n", + " \n", + "
\n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 5 + } + ] + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "nnH4lXpm_F9g" + }, + "execution_count": 10, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "g = obnb.data.BioGRID(root, version=data_version)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ui07Xv9tBh3E", + "outputId": "9f425f89-bc48-4215-962c-bb849eee1100" + }, + "execution_count": 6, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "[INFO][2023-07-10 23:16:03,767][base][download_archive] Loading BioGRID (version='obnbdata-0.1.0')...\n", + "[INFO][2023-07-10 23:16:03,770][download][get_data_url] Download URL: https://zenodo.org/record/8045270/files/BioGRID.zip\n", + "[INFO][2023-07-10 23:16:03,773][download][download_unzip] Downloading zip archive from https://zenodo.org/record/8045270/files/BioGRID.zip\n", + "100%|██████████| 39.3M/39.3M [00:02<00:00, 14.3MB/s]\n", + "[INFO][2023-07-10 23:16:07,863][download][download_unzip] Download completed, start unpacking...\n", + "[INFO][2023-07-10 23:16:10,818][download][download_unzip] Done extracting\n", + "[INFO][2023-07-10 23:16:10,825][base][load_processed_data] Load processed file datasets/BioGRID/processed/data.npz\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "g = obnb.data.BioGRID(root, version=data_version)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "lLqZ9f_-BoOt", + "outputId": "8aa93790-55c3-419f-d001-71a157b4caaa" + }, + "execution_count": 7, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "[INFO][2023-07-10 23:16:12,911][base][load_processed_data] Load processed file datasets/BioGRID/processed/data.npz\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "print(yaml.dump(g.to_config()))" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": 
"1-vLqR-H_F6h", + "outputId": "ba4aee5c-9a99-48b3-b519-7e37c6fd211a" + }, + "execution_count": 8, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "data_module: BioGRID\n", + "data_module_params:\n", + " cx_kwargs:\n", + " interaction_types:\n", + " - interacts-with\n", + " node_id_entry: r\n", + " node_id_prefix: ncbigene\n", + " cx_uuid: 36f7d8fd-23dc-11e8-b939-0ac135e8bacf\n", + " directed: false\n", + " gene_id_converter: HumanEntrez\n", + " largest_comp: true\n", + " version: obnbdata-0.1.0\n", + " weighted: false\n", + "package_version: 0.1.1-dev\n", + "processed_time: '2023-07-10 23:16:15'\n", + "\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "adj = g.to_adjmat()\n", + "print(adj)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "-QtlyfJQ-6wT", + "outputId": "ee33b889-5206-4e21-9a7d-44f33ceb25be" + }, + "execution_count": 9, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "[[0. 1. 0. ... 0. 0. 0.]\n", + " [1. 0. 0. ... 0. 0. 0.]\n", + " [0. 0. 0. ... 0. 0. 0.]\n", + " ...\n", + " [0. 0. 0. ... 0. 0. 0.]\n", + " [0. 0. 0. ... 0. 0. 0.]\n", + " [0. 0. 0. ... 0. 0. 
0.]]\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "cmvRjsv6-6pj" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "from obnb import OpenBiomedNetBench\n", + "\n", + "dataset = OpenBiomedNetBench(root=root, graph_name=\"BioGRID\", label_name=\"DisGeNET\",\n", + " version=data_version, graph_as_feature=True, use_dense_graph=True)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "F6Di1ZJ--6md", + "outputId": "ff9a0237-7b48-478f-a8c3-885a12af6c6d" + }, + "execution_count": 4, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "[INFO][2023-07-10 23:25:45,356][base][download_archive] Loading BioGRID (version='obnbdata-0.1.0')...\n", + "[INFO][2023-07-10 23:25:45,363][download][get_data_url] Download URL: https://zenodo.org/record/8045270/files/BioGRID.zip\n", + "[INFO][2023-07-10 23:25:45,368][download][download_unzip] Downloading zip archive from https://zenodo.org/record/8045270/files/BioGRID.zip\n", + "100%|██████████| 39.3M/39.3M [00:02<00:00, 14.9MB/s]\n", + "[INFO][2023-07-10 23:25:49,102][download][download_unzip] Download completed, start unpacking...\n", + "[INFO][2023-07-10 23:25:52,276][download][download_unzip] Done extracting\n", + "[INFO][2023-07-10 23:25:52,283][base][load_processed_data] Load processed file datasets/BioGRID/processed/data.npz\n", + "[INFO][obnb.GenePropertyConverter][_load_cache] Loaded gene conversion cache datasets/.cache/geneprop_convert-PubMedCount.json\n", + "[INFO][2023-07-10 23:26:03,783][base][load_processed_data] Load processed file datasets/DisGeNET/processed/data.gmt\n", + "[INFO][2023-07-10 23:26:04,977][base][_apply_transform] Before transformation:\n", + "Number of labelsets: 1040\n", + "max: 594\n", + "min: 10\n", + "med: 36.00\n", + "avg: 85.61\n", + "std: 120.19\n", + "\n", + "[INFO][2023-07-10 23:26:04,980][base][_apply_transform] Applying transformation:\n", + 
"Composition of filters:\n", + "\t- EntityExistenceFilter(remove_specified=False)\n", + "\t- LabelsetRangeFilterSize(min_val=50, max_val=None)\n", + "\t- LabelsetRangeFilterSplit(splitter=RatioPartition(property_converter=GenePropertyConverter(name='PubMedCount'), ascending=False, ratios=(0.6, 0.2, 0.2)), min_val=5, count_negatives=True)\n", + "\t- NegativeGeneratorHypergeom(p_thresh=0.05)\n", + "EntityExistenceFilter(remove_specified=False): 100%|██████████| 9427/9427 [00:07<00:00, 1226.87it/s]\n", + "[INFO][obnb.Compose][__call__] Number of labelsets: 1040\n", + "max: 571\n", + "min: 4\n", + "med: 35.00\n", + "avg: 81.62\n", + "std: 114.00\n", + "\n", + "LabelsetRangeFilterSize(min_val=50, max_val=None): 100%|██████████| 1040/1040 [00:00<00:00, 3544.88it/s]\n", + "[INFO][obnb.Compose][__call__] Number of labelsets: 406\n", + "max: 571\n", + "min: 50\n", + "med: 118.00\n", + "avg: 174.50\n", + "std: 137.68\n", + "\n", + "LabelsetRangeFilterSplit(splitter=RatioPartition(property_converter=GenePropertyConverter(name='PubMedCount'), ascending=False, ratios=(0.6, 0.2, 0.2)), min_val=5, count_negatives=True): 100%|██████████| 406/406 [00:44<00:00, 9.20it/s]\n", + "[INFO][obnb.Compose][__call__] Number of labelsets: 305\n", + "max: 571\n", + "min: 50\n", + "med: 159.00\n", + "avg: 208.26\n", + "std: 143.10\n", + "\n", + "Computing hypergeometric p-value matrix: 100%|██████████| 46360/46360 [00:54<00:00, 857.21it/s] \n", + "NegativeGeneratorHypergeom(p_thresh=0.05): 100%|██████████| 305/305 [00:02<00:00, 113.19it/s]\n", + "[INFO][obnb.Compose][__call__] Number of labelsets: 305\n", + "max: 571\n", + "min: 50\n", + "med: 159.00\n", + "avg: 208.26\n", + "std: 143.10\n", + "\n", + "[INFO][2023-07-10 23:27:53,980][base][_apply_transform] After transformation:\n", + "Number of labelsets: 305\n", + "max: 571\n", + "min: 50\n", + "med: 159.00\n", + "avg: 208.26\n", + "std: 143.10\n", + "\n", + "[INFO][2023-07-10 23:27:53,998][base][_apply_transform] Saved cache transformation 
to datasets/DisGeNET/processed/.cache/3f21b172b9a43412952d179a1e7d9f3f/data.gmt\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "dataset = OpenBiomedNetBench(root=root, graph_name=\"BioGRID\", label_name=\"DisGeNET\",\n", + " version=data_version, graph_as_feature=True, use_dense_graph=True)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "_I775-DC-6jL", + "outputId": "daaa0cbc-8f3b-4927-f954-3963c9730dd9" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "[INFO][2023-07-10 23:19:55,923][base][load_processed_data] Load processed file datasets/BioGRID/processed/data.npz\n", + "[INFO][obnb.GenePropertyConverter][_load_cache] Loaded gene conversion cache datasets/.cache/geneprop_convert-PubMedCount.json\n", + "[INFO][2023-07-10 23:20:09,055][base][load_processed_data] Load processed file datasets/DisGeNET/processed/data.gmt\n", + "[INFO][2023-07-10 23:20:12,024][base][_apply_transform] Loading cached transformed data from datasets/DisGeNET/processed/.cache/3f21b172b9a43412952d179a1e7d9f3f/data.gmt\n", + "[INFO][2023-07-10 23:20:12,028][base][load_processed_data] Load processed file datasets/DisGeNET/processed/.cache/3f21b172b9a43412952d179a1e7d9f3f/data.gmt\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "from obnb.model_trainer import LabelPropagationTrainer\n", + "from obnb.model.label_propagation import OneHopPropagation\n", + "\n", + "mdl = OneHopPropagation()\n", + "trainer = LabelPropagationTrainer()\n", + "\n", + "results = trainer.fit_and_eval(mdl, dataset)" + ], + "metadata": { + "id": "QlCizk9x-6gD" + }, + "execution_count": 5, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "import pandas as pd\n", + "\n", + "df = pd.DataFrame(results, index=dataset.label.label_ids)\n", + "df" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 424 + }, + "id": "NBtgy76xIoc_", + "outputId": 
"2c3e0472-c679-41d4-d293-c46b2c1d8059" + }, + "execution_count": 10, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " train_apop val_apop test_apop train_auroc val_auroc \\\n", + "MONDO:0021034 1.843529 2.163573 0.033863 0.675152 0.483754 \n", + "MONDO:0002243 1.813295 0.303094 1.002558 0.701548 0.516716 \n", + "MONDO:0002245 0.809497 0.620750 0.206753 0.637431 0.557898 \n", + "MONDO:0001703 0.778886 1.511866 4.148121 0.548935 0.588175 \n", + "MONDO:0013099 1.914333 2.262900 0.493989 0.646922 0.647186 \n", + "... ... ... ... ... ... \n", + "MONDO:0100284 0.603496 0.189524 0.000000 0.573916 0.527694 \n", + "MONDO:0020019 1.583002 0.591055 0.879580 0.681023 0.518388 \n", + "MONDO:0021002 1.211121 1.055366 1.127546 0.628159 0.585140 \n", + "MONDO:0021017 2.228560 1.146228 0.000000 0.520709 0.528579 \n", + "MONDO:0100459 3.250616 3.966312 0.060178 0.692115 0.708832 \n", + "\n", + " test_auroc \n", + "MONDO:0021034 0.472385 \n", + "MONDO:0002243 0.595784 \n", + "MONDO:0002245 0.560433 \n", + "MONDO:0001703 0.497549 \n", + "MONDO:0013099 0.532789 \n", + "... ... \n", + "MONDO:0100284 0.413082 \n", + "MONDO:0020019 0.598177 \n", + "MONDO:0021002 0.629074 \n", + "MONDO:0021017 0.462455 \n", + "MONDO:0100459 0.517378 \n", + "\n", + "[305 rows x 6 columns]" + ], + "text/html": [ + "\n", + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
train_apopval_apoptest_apoptrain_aurocval_auroctest_auroc
MONDO:00210341.8435292.1635730.0338630.6751520.4837540.472385
MONDO:00022431.8132950.3030941.0025580.7015480.5167160.595784
MONDO:00022450.8094970.6207500.2067530.6374310.5578980.560433
MONDO:00017030.7788861.5118664.1481210.5489350.5881750.497549
MONDO:00130991.9143332.2629000.4939890.6469220.6471860.532789
.....................
MONDO:01002840.6034960.1895240.0000000.5739160.5276940.413082
MONDO:00200191.5830020.5910550.8795800.6810230.5183880.598177
MONDO:00210021.2111211.0553661.1275460.6281590.5851400.629074
MONDO:00210172.2285601.1462280.0000000.5207090.5285790.462455
MONDO:01004593.2506163.9663120.0601780.6921150.7088320.517378
\n", + "

305 rows × 6 columns

\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + "
\n", + " \n", + "
\n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 10 + } + ] + }, + { + "cell_type": "code", + "source": [ + "df.describe()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 300 + }, + "id": "np234taeKVjQ", + "outputId": "19e843c3-b927-4c82-ca9e-b42aea000202" + }, + "execution_count": 11, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " train_apop val_apop test_apop train_auroc val_auroc test_auroc\n", + "count 305.000000 305.000000 305.000000 305.000000 305.000000 305.000000\n", + "mean 1.152127 1.003746 0.819258 0.623729 0.561479 0.521614\n", + "std 0.753213 1.065416 1.116516 0.065599 0.082058 0.063061\n", + "min 0.001297 -0.320246 -0.213516 0.485241 0.351099 0.375165\n", + "25% 0.646890 0.255761 0.053465 0.582988 0.506204 0.473407\n", + "50% 0.993217 0.623817 0.392785 0.620002 0.551109 0.521458\n", + "75% 1.507760 1.465971 1.137018 0.659771 0.596552 0.560724\n", + "max 5.851295 6.370345 6.111766 0.965775 0.951942 0.794405" + ], + "text/html": [ + "\n", + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
train_apopval_apoptest_apoptrain_aurocval_auroctest_auroc
count305.000000305.000000305.000000305.000000305.000000305.000000
mean1.1521271.0037460.8192580.6237290.5614790.521614
std0.7532131.0654161.1165160.0655990.0820580.063061
min0.001297-0.320246-0.2135160.4852410.3510990.375165
25%0.6468900.2557610.0534650.5829880.5062040.473407
50%0.9932170.6238170.3927850.6200020.5511090.521458
75%1.5077601.4659711.1370180.6597710.5965520.560724
max5.8512956.3703456.1117660.9657750.9519420.794405
\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + "
\n", + " \n", + "
\n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 11 + } + ] + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "3I3Z0WJyKVQP" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "from sklearn.linear_model import LogisticRegression\n", + "from obnb.model_trainer import SupervisedLearningTrainer\n", + "\n", + "mdl = LogisticRegression(penalty=\"l2\", solver=\"lbfgs\")\n", + "trainer = SupervisedLearningTrainer()\n", + "\n", + "results2 = trainer.fit_and_eval(mdl, dataset)" + ], + "metadata": { + "id": "GdeNbEDz-6cx" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "-NJKfTpx-6Z4" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "3uHHEcsx-6Wy" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "-14ui8Jt-6Tf" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "poGro_Qo-6Qz" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "Hn4bHRIg-6Nz" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "v3FTLk__-5zs" + }, + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file From e813d2ef342406cfb1d55efa9e42cd6ff7f532eb Mon Sep 17 00:00:00 2001 From: Remy Liu <36778645+RemyLau@users.noreply.github.com> Date: Tue, 11 Jul 2023 11:54:22 -0400 Subject: [PATCH 3/4] Created using Colaboratory --- tutorials/basic_tutorial.ipynb | 1932 ++++++++------------------------ 1 file changed, 481 insertions(+), 1451 deletions(-) diff --git a/tutorials/basic_tutorial.ipynb b/tutorials/basic_tutorial.ipynb index 2e623df3..025fc4c8 100644 --- a/tutorials/basic_tutorial.ipynb +++ 
b/tutorials/basic_tutorial.ipynb @@ -4,7 +4,8 @@ "metadata": { "colab": { "provenance": [], - "authorship_tag": "ABX9TyO107Ua39a7xOmb/P+xC24j", + "toc_visible": true, + "authorship_tag": "ABX9TyM/72QVmPpoW9JPrZYT0/P3", "include_colab_link": true }, "kernelspec": { @@ -26,39 +27,57 @@ "\"Open" ] }, + { + "cell_type": "markdown", + "source": [ + "# Basic Tutorial for the Open Biomedical Network Benchmark package" + ], + "metadata": { + "id": "Ba_AaNS7Stg8" + } + }, + { + "cell_type": "markdown", + "source": [ + "## 1. Installation\n", + "\n", + "Installation can be easily done via `pip`.\n", + "\n", + "via PyPI (released or pre-release versions)\n", + "```bash\n", + "pip install obnb\n", + "```\n", + "\n", + "or via GitHub (latest dev version)\n", + "```bash\n", + "pip install git+https://github.com/krishnanlab/obnb\n", + "```" + ], + "metadata": { + "id": "pv7SYyrlTKl4" + } + }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "fypP0bMZ-Wsu", - "outputId": "f4bf8c41-0632-42dc-b2bc-7035757928ac" + "id": "fypP0bMZ-Wsu" }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", - " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", - " Preparing metadata (pyproject.toml) ... 
\u001b[?25l\u001b[?25hdone\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m61.1/61.1 kB\u001b[0m \u001b[31m2.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m59.0/59.0 kB\u001b[0m \u001b[31m5.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.3/4.3 MB\u001b[0m \u001b[31m42.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m112.7/112.7 kB\u001b[0m \u001b[31m7.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m54.5/54.5 kB\u001b[0m \u001b[31m4.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - " Building wheel for obnb (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", - " Building wheel for littleutils (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n" - ] - } - ], + "outputs": [], "source": [ + "# Install latest dev version of OBNB\n", "!pip install -q git+https://github.com/krishnanlab/obnb" ] }, + { + "cell_type": "markdown", + "source": [ + "Check if the package is installed successfully" + ], + "metadata": { + "id": "WefaXPkqUS6e" + } + }, { "cell_type": "code", "source": [ @@ -66,22 +85,47 @@ "print(f\"Installed obnb {obnb.__version__}\")" ], "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "XJbywsXU-6zH", - "outputId": "3e561b06-0f81-4545-887f-ec71f7165cc8" + "id": "XJbywsXU-6zH" }, - "execution_count": 2, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Installed obnb 0.1.1-dev\n" - ] - } - ] + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "import obnb.ext.pecanpy\n", + "print(f\"Extension for PecanPy installed: {obnb.ext.pecanpy}\")" + ], + "metadata": { + "id": "_ZYMxfgfUZFe" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## 2. Data downloading and processing" + ], + "metadata": { + "id": "oZsfNaHqVaQu" + } + }, + { + "cell_type": "markdown", + "source": [ + "First, load the `obnb.data` module that contains \"recipes\" for processing\n", + "different selections of biological networks and gene annotation data.\n", + "\n", + "We also need to specify (1) the path to which the data will be saved, and more\n", + "importantly, (2) the **version** of the data we want to retrieve. The version\n", + "option allows for flexible data retrieval (either retrieve data from source, or\n", + "retrieve from processed data archive) and also enables reproduction of the\n", + "downstream analysis." 
+ ], + "metadata": { + "id": "bDx-hDiTVsM4" + } }, { "cell_type": "code", @@ -89,1507 +133,465 @@ "import obnb.data\n", "import yaml\n", "\n", + "# Where do we want to save the data and related files to\n", "root = \"datasets\"\n", + "\n", + "# What version of the pre-processed data to download\n", "data_version = \"obnbdata-0.1.0\"\n", - "lsc = obnb.data.DisGeNET(root, version=data_version)" + "# data_version = \"latest\" # download data from source and process from scratch\n", + "# data_version = \"current\" # use the latest archived data version" ], "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "s1apiAau_GDY", - "outputId": "95ac9e0b-52ab-42db-fc1b-dba9f7691827" + "id": "s1apiAau_GDY" }, - "execution_count": 3, - "outputs": [ - { - "output_type": "stream", - "name": "stderr", - "text": [ - "[INFO][2023-07-10 23:25:35,227][base][download_archive] Loading DisGeNET (version='obnbdata-0.1.0')...\n", - "[INFO][2023-07-10 23:25:35,229][download][get_data_url] Download URL: https://zenodo.org/record/8045270/files/DisGeNET.zip\n", - "[INFO][2023-07-10 23:25:35,232][download][download_unzip] Downloading zip archive from https://zenodo.org/record/8045270/files/DisGeNET.zip\n", - "100%|██████████| 219k/219k [00:00<00:00, 687kB/s]\n", - "[INFO][2023-07-10 23:25:36,852][download][download_unzip] Download completed, start unpacking...\n", - "[INFO][2023-07-10 23:25:36,864][download][download_unzip] Done extracting\n", - "[INFO][2023-07-10 23:25:36,869][download][get_data_url] Download URL: https://zenodo.org/record/8045270/files/.cache.zip\n", - "[INFO][2023-07-10 23:25:36,871][download][download_unzip] Downloading zip archive from https://zenodo.org/record/8045270/files/.cache.zip\n", - "100%|██████████| 24.7M/24.7M [00:01<00:00, 13.0MB/s]\n", - "[INFO][2023-07-10 23:25:39,817][download][download_unzip] Download completed, start unpacking...\n", - "[INFO][2023-07-10 23:25:41,874][download][download_unzip] Done extracting\n", - 
"[INFO][2023-07-10 23:25:41,902][base][load_processed_data] Load processed file datasets/DisGeNET/processed/data.gmt\n" - ] - } - ] + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "### 2.1. Biological networks" + ], + "metadata": { + "id": "8YF_zoqBWOzV" + } + }, + { + "cell_type": "markdown", + "source": [ + "Let's start with an example of obtaining the `BioPlex` network, which is a\n", + "protein-protein interaction (PPI) network that is constructed via AP-MS on\n", + "human cell-lines ([HEK293T](https://www.synthego.com/hek293) and\n", + "[HCT116](https://imanislife.com/collections/cell-lines/hct116-cells/)).\n", + "Check out other available options for processed biomedical networks on the OBNB\n", + "benchmark\n", + "[README](https://github.com/krishnanlab/obnbench#data-stats-obnbdata-010-) page.\n", + "\n", + "[1] Huttlin, Edward L., et al. \"The BioPlex network: a systematic exploration of the human interactome.\" Cell 162.2 (2015): 425-440.\n", + "\n", + "[2] Huttlin, Edward L., et al. \"Dual proteome-scale networks reveal cell-specific remodeling of the human interactome.\" Cell 184.11 (2021): 3022-3040." 
+ ], + "metadata": { + "id": "7idT6WBxXR29" + } }, { "cell_type": "code", - "source": [], + "source": [ + "# Download network from archive\n", + "g = obnb.data.BioPlex(root, version=data_version)" + ], "metadata": { - "id": "dHZSPRK0_xYz" + "id": "-Wsdv0VmWVfr" }, - "execution_count": 3, + "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ - "print(yaml.dump(lsc.to_config()))" + "# Once downloaded, it can be used in future acess without redownloading\n", + "g = obnb.data.BioPlex(root, version=data_version)" ], "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "UEZQ-LJ5_vJO", - "outputId": "a88f919b-7eab-4f94-cbfb-ba91a283ac5e" + "id": "ovT8pvzbWVdR" }, - "execution_count": 4, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "data_module: DisGeNET\n", - "data_module_params:\n", - " gene_id_converter: HumanEntrez\n", - " pre_transform:\n", - " - LabelsetRangeFilterSize:\n", - " max_val: '600'\n", - " min_val: None\n", - " - LabelsetNonRedFilter:\n", - " thresholds: (0.5, 0.7)\n", - " - LabelsetRangeFilterSize:\n", - " max_val: None\n", - " min_val: '10'\n", - " version: obnbdata-0.1.0\n", - "package_version: 0.1.1-dev\n", - "processed_time: '2023-07-10 23:15:57'\n", - "\n" - ] - } - ] + "execution_count": null, + "outputs": [] }, { "cell_type": "code", "source": [ - "lsc.to_df()" + "# You can also force redownloading the data by specifying redownload=True\n", + "g = obnb.data.BioPlex(root, version=data_version, redownload=True)" ], "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 748 - }, - "id": "bxYbfhya_GAj", - "outputId": "c3f222d2-c853-4023-901c-9726ac62bed4" + "id": "KDFC5JnyWVOb" }, - "execution_count": 5, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " Name Info Size 0 \\\n", - "0 MONDO:0000004 adrenocortical insufficiency 44 23530 \n", - "1 MONDO:0021034 genetic alopecia 76 340419 \n", - "2 
MONDO:0000009 inherited bleeding disorder, platelet-type 37 5328 \n", - "3 MONDO:0002243 hemorrhagic disease 135 7450 \n", - "4 MONDO:0002245 blood platelet disease 329 5698 \n", - "... ... ... ... ... \n", - "1035 MONDO:0044976 obsolete disease of catalytic activity 11 2936 \n", - "1036 MONDO:0100130 adult acute respiratory distress syndrome 55 6347 \n", - "1037 MONDO:0100431 migraine without aura 19 796 \n", - "1038 MONDO:0100459 azoospermia 95 5889 \n", - "1039 MONDO:0100471 vitamin D deficiency 13 6197 \n", - "\n", - " 1 2 3 4 5 6 ... 584 585 \\\n", - "0 2737 55699 3284 1585 1589 50940 ... None None \n", - "1 5894 6635 92344 10913 4289 22808 ... None None \n", - "2 2533 342618 6916 2531 6915 80739 ... None None \n", - "3 342618 4618 6916 2531 421 196527 ... None None \n", - "4 7706 55135 2475 342618 79053 374569 ... None None \n", - "... ... ... ... ... ... ... ... ... ... \n", - "1035 2539 2729 2937 226 2023 3098 ... None None \n", - "1036 1906 407055 442911 5685 406953 210 ... None None \n", - "1037 79783 1909 4209 101929660 79054 1740 ... None None \n", - "1038 3077 4952 9085 2488 84464 6660 ... None None \n", - "1039 3508 10939 9772 4036 84617 7421 ... None None \n", - "\n", - " 586 587 588 589 590 591 592 593 \n", - "0 None None None None None None None None \n", - "1 None None None None None None None None \n", - "2 None None None None None None None None \n", - "3 None None None None None None None None \n", - "4 None None None None None None None None \n", - "... ... ... ... ... ... ... ... ... \n", - "1035 None None None None None None None None \n", - "1036 None None None None None None None None \n", - "1037 None None None None None None None None \n", - "1038 None None None None None None None None \n", - "1039 None None None None None None None None \n", - "\n", - "[1040 rows x 597 columns]" - ], - "text/html": [ - "\n", - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameInfoSize0123456...584585586587588589590591592593
0MONDO:0000004adrenocortical insufficiency442353027375569932841585158950940...NoneNoneNoneNoneNoneNoneNoneNoneNoneNone
1MONDO:0021034genetic alopecia76340419589466359234410913428922808...NoneNoneNoneNoneNoneNoneNoneNoneNoneNone
2MONDO:0000009inherited bleeding disorder, platelet-type375328253334261869162531691580739...NoneNoneNoneNoneNoneNoneNoneNoneNoneNone
3MONDO:0002243hemorrhagic disease1357450342618461869162531421196527...NoneNoneNoneNoneNoneNoneNoneNoneNoneNone
4MONDO:0002245blood platelet disease3295698770655135247534261879053374569...NoneNoneNoneNoneNoneNoneNoneNoneNoneNone
..................................................................
1035MONDO:0044976obsolete disease of catalytic activity11293625392729293722620233098...NoneNoneNoneNoneNoneNoneNoneNoneNoneNone
1036MONDO:0100130adult acute respiratory distress syndrome55634719064070554429115685406953210...NoneNoneNoneNoneNoneNoneNoneNoneNoneNone
1037MONDO:0100431migraine without aura197967978319094209101929660790541740...NoneNoneNoneNoneNoneNoneNoneNoneNoneNone
1038MONDO:0100459azoospermia9558893077495290852488844646660...NoneNoneNoneNoneNoneNoneNoneNoneNoneNone
1039MONDO:0100471vitamin D deficiency13619735081093997724036846177421...NoneNoneNoneNoneNoneNoneNoneNoneNoneNone
\n", - "

1040 rows × 597 columns

\n", - "
\n", - " \n", - "\n", - "\n", - "\n", - "
\n", - " \n", - "
\n", - "\n", - "\n", - "\n", - " \n", - "\n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n" - ] - }, - "metadata": {}, - "execution_count": 5 - } - ] + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "You can also checkout more information about the processing done for this\n", + "network by looking into the config." + ], + "metadata": { + "id": "z8qGrmgTbRzW" + } }, { "cell_type": "code", - "source": [], + "source": [ + "print(yaml.dump(g.to_config()))" + ], "metadata": { - "id": "nnH4lXpm_F9g" + "id": "4B7NKw55bQ-k" }, - "execution_count": 10, + "execution_count": null, "outputs": [] }, + { + "cell_type": "markdown", + "source": [ + "The gene IDs in the network can be accessed via the `node_ids` attribute, which\n", + "are [Entrez](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1761442/) gene ID by\n", + "default." + ], + "metadata": { + "id": "5G5tGtPSb_Ob" + } + }, { "cell_type": "code", "source": [ - "g = obnb.data.BioGRID(root, version=data_version)" + "print(f\"The first gene in the network is {g.node_ids[0]!r}\")\n", + "print(f\"The second gene in the network is {g.node_ids[1]!r}\")\n", + "print(f\"The third gene in the network is {g.node_ids[2]!r}\")" ], "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "ui07Xv9tBh3E", - "outputId": "9f425f89-bc48-4215-962c-bb849eee1100" + "id": "gQrZ6besb5E_" }, - "execution_count": 6, - "outputs": [ - { - "output_type": "stream", - "name": "stderr", - "text": [ - "[INFO][2023-07-10 23:16:03,767][base][download_archive] Loading BioGRID (version='obnbdata-0.1.0')...\n", - "[INFO][2023-07-10 23:16:03,770][download][get_data_url] Download URL: https://zenodo.org/record/8045270/files/BioGRID.zip\n", - "[INFO][2023-07-10 23:16:03,773][download][download_unzip] Downloading zip archive from https://zenodo.org/record/8045270/files/BioGRID.zip\n", - "100%|██████████| 39.3M/39.3M [00:02<00:00, 14.3MB/s]\n", - "[INFO][2023-07-10 23:16:07,863][download][download_unzip] Download completed, start unpacking...\n", - "[INFO][2023-07-10 
23:16:10,818][download][download_unzip] Done extracting\n", - "[INFO][2023-07-10 23:16:10,825][base][load_processed_data] Load processed file datasets/BioGRID/processed/data.npz\n" - ] - } - ] + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "The graph `g` object is an instance of the `obnb.graph.SparseGraph` object.\n", + "But it could be easily converted into a dense adjacency matrix via `to_adjmat`" + ], + "metadata": { + "id": "yRbbhwpTbgwz" + } }, { "cell_type": "code", "source": [ - "g = obnb.data.BioGRID(root, version=data_version)" + "adj = g.to_adjmat()\n", + "adj" ], "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "lLqZ9f_-BoOt", - "outputId": "8aa93790-55c3-419f-d001-71a157b4caaa" + "id": "BjX_K65nbfvV" }, - "execution_count": 7, - "outputs": [ - { - "output_type": "stream", - "name": "stderr", - "text": [ - "[INFO][2023-07-10 23:16:12,911][base][load_processed_data] Load processed file datasets/BioGRID/processed/data.npz\n" - ] - } - ] + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "### 2.2. Gene annotations" + ], + "metadata": { + "id": "IUeQvMf4WWM7" + } + }, + { + "cell_type": "markdown", + "source": [ + "Setting up gene annotation tasks is a tedious process that includes\n", + "\n", + "1. Obtain annotations for gene-term associations and convert gene identifier to\n", + " the desired option.\n", + "1. Obtain and construct ontology graph that represents the relationships among\n", + " different terms.\n", + "1. Propagate the gene-term annotations upward along the ontology graph.\n", + "1. 
Extract non-redundant representative gene sets (terms) from the propagated\n", + " annotations.\n", + "\n", + "\n", + "Here, we use the [DisGeNET](https://www.disgenet.org/) disease gene annotations\n", + "with [MONDO](https://mondo.monarchinitiative.org/) disease ontology as an\n", + "example to set up the DisGeNET gene set collection.\n", + "\n", + "[3] Piñero, Janet, et al. \"DisGeNET: a comprehensive platform integrating information on human disease-associated genes and variants.\" Nucleic acids research (2016): gkw943.\n", + "\n", + "[4] Vasilevsky, Nicole A., et al. \"Mondo: Unifying diseases for the world, by the world.\" medRxiv (2022): 2022-04." + ], + "metadata": { + "id": "RXFunr8jfgA-" + } }, { "cell_type": "code", "source": [ - "print(yaml.dump(g.to_config()))" + "# Download annotations and ontology from archive\n", + "gsc = obnb.data.DisGeNET(root, version=data_version)" ], "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "1-vLqR-H_F6h", - "outputId": "ba4aee5c-9a99-48b3-b519-7e37c6fd211a" + "id": "HGtLoOl8WNfh" }, - "execution_count": 8, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "data_module: BioGRID\n", - "data_module_params:\n", - " cx_kwargs:\n", - " interaction_types:\n", - " - interacts-with\n", - " node_id_entry: r\n", - " node_id_prefix: ncbigene\n", - " cx_uuid: 36f7d8fd-23dc-11e8-b939-0ac135e8bacf\n", - " directed: false\n", - " gene_id_converter: HumanEntrez\n", - " largest_comp: true\n", - " version: obnbdata-0.1.0\n", - " weighted: false\n", - "package_version: 0.1.1-dev\n", - "processed_time: '2023-07-10 23:16:15'\n", - "\n" - ] - } - ] + "execution_count": null, + "outputs": [] }, { "cell_type": "code", "source": [ - "adj = g.to_adjmat()\n", - "print(adj)" + "# Again, once downloaded and processed, it can be used in the future\n", + "gsc = obnb.data.DisGeNET(root, version=data_version)" ], "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": 
"-QtlyfJQ-6wT", - "outputId": "ee33b889-5206-4e21-9a7d-44f33ceb25be" + "id": "PN2yNtAgjGSm" }, - "execution_count": 9, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "[[0. 1. 0. ... 0. 0. 0.]\n", - " [1. 0. 0. ... 0. 0. 0.]\n", - " [0. 0. 0. ... 0. 0. 0.]\n", - " ...\n", - " [0. 0. 0. ... 0. 0. 0.]\n", - " [0. 0. 0. ... 0. 0. 0.]\n", - " [0. 0. 0. ... 0. 0. 0.]]\n" - ] - } - ] + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Processing config can be inspected in a similar fashion as before" + ], + "metadata": { + "id": "aHvUg8NOjU8N" + } + }, + { + "cell_type": "code", + "source": [ + "print(yaml.dump(gsc.to_config()))" + ], + "metadata": { + "id": "UEZQ-LJ5_vJO" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "obnb.label.LabelsetCollection" + ], + "metadata": { + "id": "TvKsu8rejken" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "The `gsc` object is an instance of the `obnb.label.LabelsetCollection` object.\n", + "You can also convert it to a\n", + "[GMT](https://software.broadinstitute.org/cancer/software/gsea/wiki/index.php/Data_formats#GMT:_Gene_Matrix_Transposed_file_format_.28.2A.gmt.29)-like\n", + "dataframe by calling the `to_df` method.\n", + "\n", + "The resulting dataframe is a table where the first three columns correspond to\n", + "the term ID, term info, and the number of genes associated with this term after\n", + "the processing. The rest of the columns are gene IDs that are associated with a\n", + "particular term, padded with `None`s." + ], + "metadata": { + "id": "jHAIQT6ujeKX" + } }, { "cell_type": "code", - "source": [], + "source": [ + "gsc.to_df()" + ], "metadata": { - "id": "cmvRjsv6-6pj" + "id": "bxYbfhya_GAj" }, "execution_count": null, "outputs": [] }, + { + "cell_type": "markdown", + "source": [ + "## 3. 
Constructing dataset" + ], + "metadata": { + "id": "t7aARNyJlEBk" + } + }, + { + "cell_type": "markdown", + "source": [ + "### 3.1. The hard way: consolidate the network with gene set collection and combine into a dataset\n", + "\n", + "- Pros: Flexible filtering and dataset construction to help investigate specific\n", + " biological questions.\n", + "- Cons: Many steps involved to filter and pre-process." + ], + "metadata": { + "id": "nVOZQA6OlcLu" + } + }, { "cell_type": "code", "source": [ - "from obnb import OpenBiomedNetBench\n", + "from obnb.label import filters\n", + "from obnb.label.split import RatioPartition\n", + "from obnb.util.converter import GenePropertyConverter\n", "\n", - "dataset = OpenBiomedNetBench(root=root, graph_name=\"BioGRID\", label_name=\"DisGeNET\",\n", - " version=data_version, graph_as_feature=True, use_dense_graph=True)" + "\n", + "# Load PubMed count gene property converter\n", + "pubmedcnt_converter = GenePropertyConverter(root, name=\"PubMedCount\")\n", + "\n", + "# 6/2/2 study-bias holdout split for genes\n", + "splitter = RatioPartition(0.6, 0.2, 0.2, ascending=False,\n", + " property_converter=pubmedcnt_converter)\n", + "\n", + "# Apply filters to the gene set collection\n", + "gsc_filtered = gsc.apply(\n", + " filters.Compose(\n", + " # Only use genes that are present in the network\n", + " filters.EntityExistenceFilter(list(g.node_ids), log_level=\"INFO\",),\n", + " # Remove any labelsets with fewer than 50 network genes\n", + " filters.LabelsetRangeFilterSize(min_val=50, log_level=\"INFO\",),\n", + " # Make sure each split has at least 10 positive examples\n", + " filters.LabelsetRangeFilterSplit(min_val=10, splitter=splitter, log_level=\"INFO\",),\n", + " log_level=\"INFO\",\n", + " ),\n", + ")" ], "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "F6Di1ZJ--6md", - "outputId": "ff9a0237-7b48-478f-a8c3-885a12af6c6d" + "id": "NOVBeVkPlOIX" }, - "execution_count": 4, - "outputs": [ - { - 
"output_type": "stream", - "name": "stderr", - "text": [ - "[INFO][2023-07-10 23:25:45,356][base][download_archive] Loading BioGRID (version='obnbdata-0.1.0')...\n", - "[INFO][2023-07-10 23:25:45,363][download][get_data_url] Download URL: https://zenodo.org/record/8045270/files/BioGRID.zip\n", - "[INFO][2023-07-10 23:25:45,368][download][download_unzip] Downloading zip archive from https://zenodo.org/record/8045270/files/BioGRID.zip\n", - "100%|██████████| 39.3M/39.3M [00:02<00:00, 14.9MB/s]\n", - "[INFO][2023-07-10 23:25:49,102][download][download_unzip] Download completed, start unpacking...\n", - "[INFO][2023-07-10 23:25:52,276][download][download_unzip] Done extracting\n", - "[INFO][2023-07-10 23:25:52,283][base][load_processed_data] Load processed file datasets/BioGRID/processed/data.npz\n", - "[INFO][obnb.GenePropertyConverter][_load_cache] Loaded gene conversion cache datasets/.cache/geneprop_convert-PubMedCount.json\n", - "[INFO][2023-07-10 23:26:03,783][base][load_processed_data] Load processed file datasets/DisGeNET/processed/data.gmt\n", - "[INFO][2023-07-10 23:26:04,977][base][_apply_transform] Before transformation:\n", - "Number of labelsets: 1040\n", - "max: 594\n", - "min: 10\n", - "med: 36.00\n", - "avg: 85.61\n", - "std: 120.19\n", - "\n", - "[INFO][2023-07-10 23:26:04,980][base][_apply_transform] Applying transformation:\n", - "Composition of filters:\n", - "\t- EntityExistenceFilter(remove_specified=False)\n", - "\t- LabelsetRangeFilterSize(min_val=50, max_val=None)\n", - "\t- LabelsetRangeFilterSplit(splitter=RatioPartition(property_converter=GenePropertyConverter(name='PubMedCount'), ascending=False, ratios=(0.6, 0.2, 0.2)), min_val=5, count_negatives=True)\n", - "\t- NegativeGeneratorHypergeom(p_thresh=0.05)\n", - "EntityExistenceFilter(remove_specified=False): 100%|██████████| 9427/9427 [00:07<00:00, 1226.87it/s]\n", - "[INFO][obnb.Compose][__call__] Number of labelsets: 1040\n", - "max: 571\n", - "min: 4\n", - "med: 35.00\n", - "avg: 
81.62\n", - "std: 114.00\n", - "\n", - "LabelsetRangeFilterSize(min_val=50, max_val=None): 100%|██████████| 1040/1040 [00:00<00:00, 3544.88it/s]\n", - "[INFO][obnb.Compose][__call__] Number of labelsets: 406\n", - "max: 571\n", - "min: 50\n", - "med: 118.00\n", - "avg: 174.50\n", - "std: 137.68\n", - "\n", - "LabelsetRangeFilterSplit(splitter=RatioPartition(property_converter=GenePropertyConverter(name='PubMedCount'), ascending=False, ratios=(0.6, 0.2, 0.2)), min_val=5, count_negatives=True): 100%|██████████| 406/406 [00:44<00:00, 9.20it/s]\n", - "[INFO][obnb.Compose][__call__] Number of labelsets: 305\n", - "max: 571\n", - "min: 50\n", - "med: 159.00\n", - "avg: 208.26\n", - "std: 143.10\n", - "\n", - "Computing hypergeometric p-value matrix: 100%|██████████| 46360/46360 [00:54<00:00, 857.21it/s] \n", - "NegativeGeneratorHypergeom(p_thresh=0.05): 100%|██████████| 305/305 [00:02<00:00, 113.19it/s]\n", - "[INFO][obnb.Compose][__call__] Number of labelsets: 305\n", - "max: 571\n", - "min: 50\n", - "med: 159.00\n", - "avg: 208.26\n", - "std: 143.10\n", - "\n", - "[INFO][2023-07-10 23:27:53,980][base][_apply_transform] After transformation:\n", - "Number of labelsets: 305\n", - "max: 571\n", - "min: 50\n", - "med: 159.00\n", - "avg: 208.26\n", - "std: 143.10\n", - "\n", - "[INFO][2023-07-10 23:27:53,998][base][_apply_transform] Saved cache transformation to datasets/DisGeNET/processed/.cache/3f21b172b9a43412952d179a1e7d9f3f/data.gmt\n" - ] - } - ] + "execution_count": null, + "outputs": [] }, { "cell_type": "code", "source": [ - "dataset = OpenBiomedNetBench(root=root, graph_name=\"BioGRID\", label_name=\"DisGeNET\",\n", - " version=data_version, graph_as_feature=True, use_dense_graph=True)" + "# Combine into a OBNB dataset object\n", + "dataset = obnb.Dataset(\n", + " graph=g,\n", + " feature=g.to_dense_graph().to_feature(),\n", + " label=gsc_filtered,\n", + " splitter=splitter,\n", + ")" ], "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - 
"id": "_I775-DC-6jL", - "outputId": "daaa0cbc-8f3b-4927-f954-3963c9730dd9" + "id": "SU4bL7WGlOFG" }, "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stderr", - "text": [ - "[INFO][2023-07-10 23:19:55,923][base][load_processed_data] Load processed file datasets/BioGRID/processed/data.npz\n", - "[INFO][obnb.GenePropertyConverter][_load_cache] Loaded gene conversion cache datasets/.cache/geneprop_convert-PubMedCount.json\n", - "[INFO][2023-07-10 23:20:09,055][base][load_processed_data] Load processed file datasets/DisGeNET/processed/data.gmt\n", - "[INFO][2023-07-10 23:20:12,024][base][_apply_transform] Loading cached transformed data from datasets/DisGeNET/processed/.cache/3f21b172b9a43412952d179a1e7d9f3f/data.gmt\n", - "[INFO][2023-07-10 23:20:12,028][base][load_processed_data] Load processed file datasets/DisGeNET/processed/.cache/3f21b172b9a43412952d179a1e7d9f3f/data.gmt\n" - ] - } - ] + "outputs": [] }, { "cell_type": "code", "source": [ - "from obnb.model_trainer import LabelPropagationTrainer\n", - "from obnb.model.label_propagation import OneHopPropagation\n", - "\n", - "mdl = OneHopPropagation()\n", - "trainer = LabelPropagationTrainer()\n", + "dataset.graph" + ], + "metadata": { + "id": "HYDWpfnNlOBb" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "dataset.label" + ], + "metadata": { + "id": "UhKG5PFalN6_" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "### 3.2. The easy way: OBNB default dataset construction\n", "\n", - "results = trainer.fit_and_eval(mdl, dataset)" + "- Pros: Easy to construct the dataset as it masked out a lot of common steps.\n", + "- Cons: Less flexible and hard to construct specialized datasets." 
], "metadata": { - "id": "QlCizk9x-6gD" + "id": "sk3HPD3JlXJe" + } + }, + { + "cell_type": "code", + "source": [ + "dataset = obnb.OpenBiomedNetBench(\n", + " root=root,\n", + " graph_name=\"BioPlex\",\n", + " label_name=\"DisGeNET\",\n", + " version=data_version,\n", + " graph_as_feature=True,\n", + " use_dense_graph=True,\n", + ")" + ], + "metadata": { + "id": "F6Di1ZJ--6md" }, - "execution_count": 5, + "execution_count": null, "outputs": [] }, + { + "cell_type": "code", + "source": [ + "# Similar to all previously shown cases, dataset have builtin cache utility\n", + "# to help spead up dataloading after the first instantiation.\n", + "dataset = obnb.OpenBiomedNetBench(\n", + " root=root,\n", + " graph_name=\"BioPlex\",\n", + " label_name=\"DisGeNET\",\n", + " version=data_version,\n", + " graph_as_feature=True,\n", + " use_dense_graph=True,\n", + ")" + ], + "metadata": { + "id": "_I775-DC-6jL" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## 4. Simple model evaluation using the dataset and the builtin trianer" + ], + "metadata": { + "id": "DgV8pJf9otkk" + } + }, + { + "cell_type": "markdown", + "source": [ + "### 4.1. 
Label propagation" + ], + "metadata": { + "id": "3RDHlp18pP0B" + } + }, { "cell_type": "code", "source": [ "import pandas as pd\n", "\n", - "df = pd.DataFrame(results, index=dataset.label.label_ids)\n", - "df" + "from obnb.model_trainer import LabelPropagationTrainer\n", + "from obnb.model.label_propagation import OneHopPropagation" ], "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 424 - }, - "id": "NBtgy76xIoc_", - "outputId": "2c3e0472-c679-41d4-d293-c46b2c1d8059" + "id": "QlCizk9x-6gD" }, - "execution_count": 10, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " train_apop val_apop test_apop train_auroc val_auroc \\\n", - "MONDO:0021034 1.843529 2.163573 0.033863 0.675152 0.483754 \n", - "MONDO:0002243 1.813295 0.303094 1.002558 0.701548 0.516716 \n", - "MONDO:0002245 0.809497 0.620750 0.206753 0.637431 0.557898 \n", - "MONDO:0001703 0.778886 1.511866 4.148121 0.548935 0.588175 \n", - "MONDO:0013099 1.914333 2.262900 0.493989 0.646922 0.647186 \n", - "... ... ... ... ... ... \n", - "MONDO:0100284 0.603496 0.189524 0.000000 0.573916 0.527694 \n", - "MONDO:0020019 1.583002 0.591055 0.879580 0.681023 0.518388 \n", - "MONDO:0021002 1.211121 1.055366 1.127546 0.628159 0.585140 \n", - "MONDO:0021017 2.228560 1.146228 0.000000 0.520709 0.528579 \n", - "MONDO:0100459 3.250616 3.966312 0.060178 0.692115 0.708832 \n", - "\n", - " test_auroc \n", - "MONDO:0021034 0.472385 \n", - "MONDO:0002243 0.595784 \n", - "MONDO:0002245 0.560433 \n", - "MONDO:0001703 0.497549 \n", - "MONDO:0013099 0.532789 \n", - "... ... \n", - "MONDO:0100284 0.413082 \n", - "MONDO:0020019 0.598177 \n", - "MONDO:0021002 0.629074 \n", - "MONDO:0021017 0.462455 \n", - "MONDO:0100459 0.517378 \n", - "\n", - "[305 rows x 6 columns]" - ], - "text/html": [ - "\n", - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
train_apopval_apoptest_apoptrain_aurocval_auroctest_auroc
MONDO:00210341.8435292.1635730.0338630.6751520.4837540.472385
MONDO:00022431.8132950.3030941.0025580.7015480.5167160.595784
MONDO:00022450.8094970.6207500.2067530.6374310.5578980.560433
MONDO:00017030.7788861.5118664.1481210.5489350.5881750.497549
MONDO:00130991.9143332.2629000.4939890.6469220.6471860.532789
.....................
MONDO:01002840.6034960.1895240.0000000.5739160.5276940.413082
MONDO:00200191.5830020.5910550.8795800.6810230.5183880.598177
MONDO:00210021.2111211.0553661.1275460.6281590.5851400.629074
MONDO:00210172.2285601.1462280.0000000.5207090.5285790.462455
MONDO:01004593.2506163.9663120.0601780.6921150.7088320.517378
\n", - "

305 rows × 6 columns

\n", - "
\n", - " \n", - "\n", - "\n", - "\n", - "
\n", - " \n", - "
\n", - "\n", - "\n", - "\n", - " \n", - "\n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n" - ] - }, - "metadata": {}, - "execution_count": 10 - } - ] + "execution_count": null, + "outputs": [] }, { "cell_type": "code", "source": [ - "df.describe()" + "lp_mdl = OneHopPropagation()\n", + "lp_trainer = LabelPropagationTrainer()" ], "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 300 - }, - "id": "np234taeKVjQ", - "outputId": "19e843c3-b927-4c82-ca9e-b42aea000202" + "id": "JyrNWeZ4pDA5" }, - "execution_count": 11, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " train_apop val_apop test_apop train_auroc val_auroc test_auroc\n", - "count 305.000000 305.000000 305.000000 305.000000 305.000000 305.000000\n", - "mean 1.152127 1.003746 0.819258 0.623729 0.561479 0.521614\n", - "std 0.753213 1.065416 1.116516 0.065599 0.082058 0.063061\n", - "min 0.001297 -0.320246 -0.213516 0.485241 0.351099 0.375165\n", - "25% 0.646890 0.255761 0.053465 0.582988 0.506204 0.473407\n", - "50% 0.993217 0.623817 0.392785 0.620002 0.551109 0.521458\n", - "75% 1.507760 1.465971 1.137018 0.659771 0.596552 0.560724\n", - "max 5.851295 6.370345 6.111766 0.965775 0.951942 0.794405" - ], - "text/html": [ - "\n", - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
train_apopval_apoptest_apoptrain_aurocval_auroctest_auroc
count305.000000305.000000305.000000305.000000305.000000305.000000
mean1.1521271.0037460.8192580.6237290.5614790.521614
std0.7532131.0654161.1165160.0655990.0820580.063061
min0.001297-0.320246-0.2135160.4852410.3510990.375165
25%0.6468900.2557610.0534650.5829880.5062040.473407
50%0.9932170.6238170.3927850.6200020.5511090.521458
75%1.5077601.4659711.1370180.6597710.5965520.560724
max5.8512956.3703456.1117660.9657750.9519420.794405
\n", - "
\n", - " \n", - "\n", - "\n", - "\n", - "
\n", - " \n", - "
\n", - "\n", - "\n", - "\n", - " \n", - "\n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n" - ] - }, - "metadata": {}, - "execution_count": 11 - } - ] + "execution_count": null, + "outputs": [] }, { "cell_type": "code", - "source": [], + "source": [ + "lp_results = lp_trainer.fit_and_eval(lp_mdl, dataset)" + ], "metadata": { - "id": "3I3Z0WJyKVQP" + "id": "5sWjyXv1pFuP" }, "execution_count": null, "outputs": [] @@ -1597,73 +599,101 @@ { "cell_type": "code", "source": [ - "from sklearn.linear_model import LogisticRegression\n", - "from obnb.model_trainer import SupervisedLearningTrainer\n", - "\n", - "mdl = LogisticRegression(penalty=\"l2\", solver=\"lbfgs\")\n", - "trainer = SupervisedLearningTrainer()\n", - "\n", - "results2 = trainer.fit_and_eval(mdl, dataset)" + "lp_df = pd.DataFrame(lp_results, index=dataset.label.label_ids)\n", + "lp_df" ], "metadata": { - "id": "GdeNbEDz-6cx" + "id": "NBtgy76xIoc_" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", - "source": [], + "source": [ + "lp_df.describe()" + ], "metadata": { - "id": "-NJKfTpx-6Z4" + "id": "np234taeKVjQ" }, "execution_count": null, "outputs": [] }, + { + "cell_type": "markdown", + "source": [ + "### 4.2. 
Supervised learning" + ], + "metadata": { + "id": "hkVYJQE8pR9F" + } + }, { "cell_type": "code", - "source": [], + "source": [ + "from sklearn.linear_model import LogisticRegression\n", + "from obnb.model_trainer import SupervisedLearningTrainer" + ], "metadata": { - "id": "3uHHEcsx-6Wy" + "id": "GdeNbEDz-6cx" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", - "source": [], + "source": [ + "sl_mdl = LogisticRegression(penalty=\"l2\", solver=\"lbfgs\")\n", + "sl_trainer = SupervisedLearningTrainer()" + ], "metadata": { - "id": "-14ui8Jt-6Tf" + "id": "JXWOi3hGpfIG" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", - "source": [], + "source": [ + "sl_results = sl_trainer.fit_and_eval(sl_mdl, dataset)" + ], "metadata": { - "id": "poGro_Qo-6Qz" + "id": "FTj2l-9ipj-4" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", - "source": [], + "source": [ + "sl_df = pd.DataFrame(sl_results, index=dataset.label.label_ids)\n", + "sl_df" + ], "metadata": { - "id": "Hn4bHRIg-6Nz" + "id": "A4OZoPsipaiw" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", - "source": [], + "source": [ + "sl_df.describe()" + ], "metadata": { - "id": "v3FTLk__-5zs" + "id": "lxHqDjPupcT0" }, "execution_count": null, "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "### 4.3. 
GNN (coming soon)" + ], + "metadata": { + "id": "Qf6Z7iBfpVfZ" + } } ] } \ No newline at end of file From 07188334d3787beeccc428c500ce5efc70e287b6 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 11 Jul 2023 15:54:51 +0000 Subject: [PATCH 4/4] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tutorials/basic_tutorial.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorials/basic_tutorial.ipynb b/tutorials/basic_tutorial.ipynb index 025fc4c8..adf71abf 100644 --- a/tutorials/basic_tutorial.ipynb +++ b/tutorials/basic_tutorial.ipynb @@ -696,4 +696,4 @@ } } ] -} \ No newline at end of file +}