From c60fab669e031bba62ef83765fa3e52d051aea4b Mon Sep 17 00:00:00 2001 From: Julia Curd <79439229+jaceybronte@users.noreply.github.com> Date: Mon, 9 Sep 2024 14:42:17 -0600 Subject: [PATCH] Data download for model comparison (#54) * Data download for model comparison * Fixes --- .../1.download-validation-data.ipynb | 1108 ++++++++++++++++ .../2.download-target-data.ipynb | 1158 +++++++++++++++++ .../scripts/1.download-validation-data.py | 208 +++ .../scripts/2.download-target-data.py | 274 ++++ environment.yml | 8 + 5 files changed, 2756 insertions(+) create mode 100644 4.gene_expression_signatures/1.download-validation-data.ipynb create mode 100644 4.gene_expression_signatures/2.download-target-data.ipynb create mode 100644 4.gene_expression_signatures/scripts/1.download-validation-data.py create mode 100644 4.gene_expression_signatures/scripts/2.download-target-data.py diff --git a/4.gene_expression_signatures/1.download-validation-data.ipynb b/4.gene_expression_signatures/1.download-validation-data.ipynb new file mode 100644 index 0000000..1821e71 --- /dev/null +++ b/4.gene_expression_signatures/1.download-validation-data.ipynb @@ -0,0 +1,1108 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Download and Process Neuroblastoma RNAseq Data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Code from: https://github.com/greenelab/BioBombe/blob/master/10.gene-expression-signatures/0.download-validation-data.ipynb\n", + "\n", + "We are downloading the dataset associated with Harenza et al. 2017. The data profiles RNAseq data from 39 commonly used neuroblastoma (NBL) cell lines.\n", + "\n", + "We are interested in the MYCN amplification status of these cell lines. We will test if the MYCN amplification score learned through the BioBombe signature approach applied to TARGET data generalizes to this cell line dataset.\n", + "\n", + "MYCN Amplification refers to the number of copies of the MYCN gene. MYCN amplification is used as a biomarker for poor prognosis in neuroblastoma patients (Huang and Weiss 2013)." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import requests\n", + "import pathlib\n", + "import pandas as pd\n", + "from urllib.request import urlretrieve\n", + "\n", + "from sklearn import preprocessing" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "url = \"https://ndownloader.figshare.com/files/14138792\"\n", + "name = \"2019-01-22-CellLineSTAR-fpkm-2pass_matrix.txt\"\n", + "data_dir = pathlib.Path(\"data\").resolve()\n", + "path = data_dir / name" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "data_dir.mkdir(parents=True, exist_ok=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(PosixPath('data/2019-01-22-CellLineSTAR-fpkm-2pass_matrix.txt'),\n", + " )" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "urlretrieve(url, path)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "55ea0255d1aa7708eba2ebd0113eeb3f data/2019-01-22-CellLineSTAR-fpkm-2pass_matrix.txt\n" + ] + } + ], + "source": [ + "! md5sum \"data/2019-01-22-CellLineSTAR-fpkm-2pass_matrix.txt\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Download Phenotype Data" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "url = \"https://www.nature.com/articles/sdata201733/tables/4\"\n", + "name = \"nbl_cellline_phenotype.txt\"\n", + "path = data_dir / name" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Cell LineMYCN status1p36 del3p26 del11q23 del17q21-qter unbal gainALK mutationp53 mutation
0CHP134AmplifiedLOH p32.3-pter; Gain p34.3-p36.22; Loss p36.22...Gain/AI p26.3NoneGain q12-qterWTWT
1CHP212AmplifiedLoss p13.2-pterGain/AI p26.3cnLOH 23.3Gain q12-qterWTWT
2COGN415AmplifiedUnknownUnknownUnknownUnknownF1174LWT
3COGN440AmplifiedUnknownUnknownUnknownUnknownWTWT
4COGN453AmplifiedUnknownUnknownUnknownUnknownF1174LWT
\n", + "
" + ], + "text/plain": [ + " Cell Line MYCN status 1p36 del \\\n", + "0 CHP134 Amplified LOH p32.3-pter; Gain p34.3-p36.22; Loss p36.22... \n", + "1 CHP212 Amplified Loss p13.2-pter \n", + "2 COGN415 Amplified Unknown \n", + "3 COGN440 Amplified Unknown \n", + "4 COGN453 Amplified Unknown \n", + "\n", + " 3p26 del 11q23 del 17q21-qter unbal gain ALK mutation p53 mutation \n", + "0 Gain/AI p26.3 None Gain q12-qter WT WT \n", + "1 Gain/AI p26.3 cnLOH 23.3 Gain q12-qter WT WT \n", + "2 Unknown Unknown Unknown F1174L WT \n", + "3 Unknown Unknown Unknown WT WT \n", + "4 Unknown Unknown Unknown F1174L WT " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "if not path.is_file():\n", + " html = requests.get(url).content\n", + "\n", + " pheno_df = pd.read_html(html)[0]\n", + " pheno_df['Cell Line'] = pheno_df['Cell Line'].str.replace(\"-\", \"\")\n", + "\n", + " pheno_df.to_csv(path, sep='\\t', index=False)\n", + "\n", + "else:\n", + " pheno_df = pd.read_parquet(path)\n", + "\n", + "pheno_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "c3b0fdeb448296445567e59995f2342c data/nbl_cellline_phenotype.txt\n" + ] + } + ], + "source": [ + "! md5sum \"data/nbl_cellline_phenotype.txt\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Process RNAseq Data" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
GeneIDCHP134CHP212COGN415COGN440COGN453COGN471COGN496COGN519COGN534...RPE1SHSY5YSKNASSKNBE2SKNBE2CSKNDZSKNFISKNSHSMSKANSMSSAN
EAF1EAF18.7832106.8634595.46293110.1043896.6042414.8665769.3298094.40107311.083115...4.3638058.5555196.2609624.8423204.0288258.86167619.3907709.9367716.8718377.696442
SARNPSARNP12.75243923.71782422.16240727.40176019.2567369.15738023.00017325.25054923.622492...25.26471031.70508922.24665424.11554228.65224932.97660038.72504115.29027318.65911131.636539
CXCR5CXCR50.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0047270.000000...0.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.000000
KRTAP4-2KRTAP4-20.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.000000...0.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.000000
MAPK7MAPK72.7789972.7794512.7184531.6773803.7095340.9889771.5755971.4552812.519137...1.9712205.1749541.1153601.6799800.9714653.3346102.3540822.7888885.0155113.019704
\n", + "

5 rows × 41 columns

\n", + "
" + ], + "text/plain": [ + " GeneID CHP134 CHP212 COGN415 COGN440 COGN453 \\\n", + "EAF1 EAF1 8.783210 6.863459 5.462931 10.104389 6.604241 \n", + "SARNP SARNP 12.752439 23.717824 22.162407 27.401760 19.256736 \n", + "CXCR5 CXCR5 0.000000 0.000000 0.000000 0.000000 0.000000 \n", + "KRTAP4-2 KRTAP4-2 0.000000 0.000000 0.000000 0.000000 0.000000 \n", + "MAPK7 MAPK7 2.778997 2.779451 2.718453 1.677380 3.709534 \n", + "\n", + " COGN471 COGN496 COGN519 COGN534 ... RPE1 \\\n", + "EAF1 4.866576 9.329809 4.401073 11.083115 ... 4.363805 \n", + "SARNP 9.157380 23.000173 25.250549 23.622492 ... 25.264710 \n", + "CXCR5 0.000000 0.000000 0.004727 0.000000 ... 0.000000 \n", + "KRTAP4-2 0.000000 0.000000 0.000000 0.000000 ... 0.000000 \n", + "MAPK7 0.988977 1.575597 1.455281 2.519137 ... 1.971220 \n", + "\n", + " SHSY5Y SKNAS SKNBE2 SKNBE2C SKNDZ SKNFI \\\n", + "EAF1 8.555519 6.260962 4.842320 4.028825 8.861676 19.390770 \n", + "SARNP 31.705089 22.246654 24.115542 28.652249 32.976600 38.725041 \n", + "CXCR5 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \n", + "KRTAP4-2 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \n", + "MAPK7 5.174954 1.115360 1.679980 0.971465 3.334610 2.354082 \n", + "\n", + " SKNSH SMSKAN SMSSAN \n", + "EAF1 9.936771 6.871837 7.696442 \n", + "SARNP 15.290273 18.659111 31.636539 \n", + "CXCR5 0.000000 0.000000 0.000000 \n", + "KRTAP4-2 0.000000 0.000000 0.000000 \n", + "MAPK7 2.788888 5.015511 3.019704 \n", + "\n", + "[5 rows x 41 columns]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "raw_file = data_dir / \"2019-01-22-CellLineSTAR-fpkm-2pass_matrix.txt\"\n", + "raw_df = pd.read_table(raw_file, sep='\\t')\n", + "raw_df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Update Gene Names" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "# Load curated gene names from versioned resource \n", + "commit = '721204091a96e55de6dcad165d6d8265e67e2a48'\n", + "url = 'https://raw.githubusercontent.com/cognoma/genes/{}/data/genes.tsv'.format(commit)\n", + "gene_df = pd.read_table(url)\n", + "\n", + "# Only consider protein-coding genes\n", + "gene_df = (\n", + " gene_df.query(\"gene_type == 'protein-coding'\")\n", + ")\n", + "\n", + "symbol_to_entrez = dict(zip(gene_df.symbol,\n", + " gene_df.entrez_gene_id))" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "# Add alternative symbols to entrez mapping dictionary\n", + "gene_df = gene_df.dropna(axis='rows', subset=['synonyms'])\n", + "gene_df.synonyms = gene_df.synonyms.str.split('|')\n", + "\n", + "all_syn = (\n", + " gene_df.apply(lambda x: pd.Series(x.synonyms), axis=1)\n", + " .stack()\n", + " .reset_index(level=1, drop=True)\n", + ")\n", + "\n", + "# Name the synonym series and join with rest of genes\n", + "all_syn.name = 'all_synonyms'\n", + "gene_with_syn_df = gene_df.join(all_syn)\n", + "\n", + "# Remove rows that have redundant symbols in all_synonyms\n", + "gene_with_syn_df = (\n", + " gene_with_syn_df\n", + " \n", + " # Drop synonyms that are duplicated - can't be sure of mapping\n", + " .drop_duplicates(['all_synonyms'], keep=False)\n", + "\n", + " # Drop rows in which the symbol appears in the list of synonyms\n", + " .query('symbol not in all_synonyms')\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "# Create a synonym to entrez mapping and add to dictionary\n", + "synonym_to_entrez = dict(zip(gene_with_syn_df.all_synonyms,\n", + " gene_with_syn_df.entrez_gene_id))\n", + "\n", + "symbol_to_entrez.update(synonym_to_entrez)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "# Load gene updater\n", + "url = 'https://raw.githubusercontent.com/cognoma/genes/{}/data/updater.tsv'.format(commit)\n", + "updater_df = pd.read_table(url)\n", + "old_to_new_entrez = dict(zip(updater_df.old_entrez_gene_id,\n", + " updater_df.new_entrez_gene_id))" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "gene_map = raw_df.GeneID.replace(symbol_to_entrez)\n", + "gene_map = gene_map.replace(old_to_new_entrez)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(19287, 40)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CHP134CHP212COGN415COGN440COGN453COGN471COGN496COGN519COGN534COGN549...RPE1SHSY5YSKNASSKNBE2SKNBE2CSKNDZSKNFISKNSHSMSKANSMSSAN
entrez_gene_id
108478.7832106.8634595.46293110.1043896.6042414.8665769.3298094.40107311.0831155.182270...4.3638058.5555196.2609624.8423204.0288258.86167619.3907709.9367716.8718377.696442
8432412.75243923.71782422.16240727.40176019.2567369.15738023.00017325.25054923.62249212.517533...25.26471031.70508922.24665424.11554228.65224932.97660038.72504115.29027318.65911131.636539
6430.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0047270.0000000.000000...0.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.000000
852910.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.000000...0.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.000000
55982.7789972.7794512.7184531.6773803.7095340.9889771.5755971.4552812.5191371.208765...1.9712205.1749541.1153601.6799800.9714653.3346102.3540822.7888885.0155113.019704
\n", + "

5 rows × 40 columns

\n", + "
" + ], + "text/plain": [ + " CHP134 CHP212 COGN415 COGN440 COGN453 \\\n", + "entrez_gene_id \n", + "10847 8.783210 6.863459 5.462931 10.104389 6.604241 \n", + "84324 12.752439 23.717824 22.162407 27.401760 19.256736 \n", + "643 0.000000 0.000000 0.000000 0.000000 0.000000 \n", + "85291 0.000000 0.000000 0.000000 0.000000 0.000000 \n", + "5598 2.778997 2.779451 2.718453 1.677380 3.709534 \n", + "\n", + " COGN471 COGN496 COGN519 COGN534 COGN549 ... \\\n", + "entrez_gene_id ... \n", + "10847 4.866576 9.329809 4.401073 11.083115 5.182270 ... \n", + "84324 9.157380 23.000173 25.250549 23.622492 12.517533 ... \n", + "643 0.000000 0.000000 0.004727 0.000000 0.000000 ... \n", + "85291 0.000000 0.000000 0.000000 0.000000 0.000000 ... \n", + "5598 0.988977 1.575597 1.455281 2.519137 1.208765 ... \n", + "\n", + " RPE1 SHSY5Y SKNAS SKNBE2 SKNBE2C \\\n", + "entrez_gene_id \n", + "10847 4.363805 8.555519 6.260962 4.842320 4.028825 \n", + "84324 25.264710 31.705089 22.246654 24.115542 28.652249 \n", + "643 0.000000 0.000000 0.000000 0.000000 0.000000 \n", + "85291 0.000000 0.000000 0.000000 0.000000 0.000000 \n", + "5598 1.971220 5.174954 1.115360 1.679980 0.971465 \n", + "\n", + " SKNDZ SKNFI SKNSH SMSKAN SMSSAN \n", + "entrez_gene_id \n", + "10847 8.861676 19.390770 9.936771 6.871837 7.696442 \n", + "84324 32.976600 38.725041 15.290273 18.659111 31.636539 \n", + "643 0.000000 0.000000 0.000000 0.000000 0.000000 \n", + "85291 0.000000 0.000000 0.000000 0.000000 0.000000 \n", + "5598 3.334610 2.354082 2.788888 5.015511 3.019704 \n", + "\n", + "[5 rows x 40 columns]" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "raw_df.index = gene_map\n", + "raw_df.index.name = 'entrez_gene_id'\n", + "raw_df = raw_df.drop(['GeneID'], axis='columns')\n", + "raw_df = raw_df.loc[raw_df.index.isin(symbol_to_entrez.values()), :]\n", + "\n", + "print(raw_df.shape)\n", + "raw_df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Scale Data and Output" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
entrez_gene_id12912131415161819...105375787106707243106821730106865373107080638107080644107181291107984155107984923107987479
CHP1340.4961420.0000000.0319140.0000000.0000000.3357541.0000000.5271040.1230200.101682...0.00.2020010.3502500.00.3595380.00.00.00.3985150.0
CHP2120.1302690.0776070.3845430.0000000.0476310.4833610.0111380.6805740.0021620.136316...0.00.0303100.6734240.00.0864740.00.00.00.5564930.0
COGN4150.3568760.0000000.1685760.0000000.0000000.3039120.0680800.1538000.1005310.033393...0.00.4177380.1998540.00.2111830.00.00.00.7260230.0
COGN4400.5124930.0005931.0000000.0019410.0000000.3732080.1023370.2495160.2174590.004284...0.00.2305590.5385640.00.2281880.00.00.00.6074770.0
COGN4530.3834740.0000000.0862840.0000000.0000000.4648900.1136650.2619630.1255260.001720...0.00.3015250.4719390.00.2723350.00.00.00.3750540.0
\n", + "

5 rows × 18628 columns

\n", + "
" + ], + "text/plain": [ + "entrez_gene_id 1 2 9 12 13 14 \\\n", + "CHP134 0.496142 0.000000 0.031914 0.000000 0.000000 0.335754 \n", + "CHP212 0.130269 0.077607 0.384543 0.000000 0.047631 0.483361 \n", + "COGN415 0.356876 0.000000 0.168576 0.000000 0.000000 0.303912 \n", + "COGN440 0.512493 0.000593 1.000000 0.001941 0.000000 0.373208 \n", + "COGN453 0.383474 0.000000 0.086284 0.000000 0.000000 0.464890 \n", + "\n", + "entrez_gene_id 15 16 18 19 ... 105375787 \\\n", + "CHP134 1.000000 0.527104 0.123020 0.101682 ... 0.0 \n", + "CHP212 0.011138 0.680574 0.002162 0.136316 ... 0.0 \n", + "COGN415 0.068080 0.153800 0.100531 0.033393 ... 0.0 \n", + "COGN440 0.102337 0.249516 0.217459 0.004284 ... 0.0 \n", + "COGN453 0.113665 0.261963 0.125526 0.001720 ... 0.0 \n", + "\n", + "entrez_gene_id 106707243 106821730 106865373 107080638 107080644 \\\n", + "CHP134 0.202001 0.350250 0.0 0.359538 0.0 \n", + "CHP212 0.030310 0.673424 0.0 0.086474 0.0 \n", + "COGN415 0.417738 0.199854 0.0 0.211183 0.0 \n", + "COGN440 0.230559 0.538564 0.0 0.228188 0.0 \n", + "COGN453 0.301525 0.471939 0.0 0.272335 0.0 \n", + "\n", + "entrez_gene_id 107181291 107984155 107984923 107987479 \n", + "CHP134 0.0 0.0 0.398515 0.0 \n", + "CHP212 0.0 0.0 0.556493 0.0 \n", + "COGN415 0.0 0.0 0.726023 0.0 \n", + "COGN440 0.0 0.0 0.607477 0.0 \n", + "COGN453 0.0 0.0 0.375054 0.0 \n", + "\n", + "[5 rows x 18628 columns]" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#MinMax consistent with BetaVAE scaling\n", + "raw_scaled_df = preprocessing.MinMaxScaler().fit_transform(raw_df.transpose())\n", + "raw_scaled_df = (\n", + " pd.DataFrame(raw_scaled_df,\n", + " columns=raw_df.index,\n", + " index=raw_df.columns)\n", + " .sort_index(axis='columns')\n", + " .sort_index(axis='rows')\n", + ")\n", + "raw_scaled_df.columns = raw_scaled_df.columns.astype(str)\n", + "raw_scaled_df = raw_scaled_df.loc[:, ~raw_scaled_df.columns.duplicated(keep='first')]\n", + "\n", + "raw_scaled_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "# In[17]\n", + "processed_file = data_dir / 'nbl_celllines_processed_matrix.parquet'\n", + "raw_scaled_df.to_parquet(processed_file)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "gene_dependency_representations", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/4.gene_expression_signatures/2.download-target-data.ipynb b/4.gene_expression_signatures/2.download-target-data.ipynb new file mode 100644 index 0000000..0b1dfa9 --- /dev/null +++ b/4.gene_expression_signatures/2.download-target-data.ipynb @@ -0,0 +1,1158 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Download TARGET Pan Cancer Data for Compression\n", + "The notebook downloads gene expression and clinical data from the TARGET project. The data is downloaded from UCSC Xena.\n", + "The data is in log2(FPKM) RSEM transformed" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pathlib\n", + "from pathlib import Path\n", + "import hashlib\n", + "from urllib.request import urlretrieve\n", + "import random\n", + "import pandas as pd\n", + "from sklearn.model_selection import train_test_split" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "def download_and_checksum(url_base, file_info_dict, file_url_paths):\n", + " \"\"\"\n", + " Downloads files from the specified base URL and computes the SHA-256 checksum for each file.\n", + "\n", + " Parameters:\n", + " ----------\n", + " url_base : str\n", + " The base URL from which the files will be downloaded.\n", + "\n", + " file_info_dict : dict\n", + " A dictionary where the keys are file names and the values are the local file paths where \n", + " the files will be saved.\n", + "\n", + " file_url_paths : dict\n", + " A dictionary mapping each file name to the corresponding sub-path (if any) that should \n", + " be appended to the base URL before downloading the file.\n", + " \"\"\"\n", + " for name, path in file_info_dict.items():\n", + " # Determine the correct URL with or without sub-path\n", + " file_url = f\"{url_base}{file_url_paths[name]}{name}\"\n", + " \n", + " # Download the file\n", + " urlretrieve(file_url, path)\n", + "\n", + " # Compute checksum\n", + " md5_hash = hashlib.md5()\n", + " with open(path, \"rb\") as f:\n", + " for byte_block in iter(lambda: f.read(4096), b\"\"):\n", + " md5_hash.update(byte_block)\n", + "\n", + " print(f\"{name} checksum: {md5_hash.hexdigest()}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# Base URL\n", + "url_base = 'https://toil-xena-hub.s3.us-east-1.amazonaws.com/download/'\n", + "data_dir = pathlib.Path('data').resolve()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# Dictionary for file names and their paths\n", + "file_info_dict = {\n", + " 'target_RSEM_gene_fpkm.gz': Path('data/target_RSEM_gene_fpkm.gz'),\n", + " 'TARGET_phenotype.gz': Path('data/TARGET_phenotype.gz'),\n", + " 'gencode.v23.annotation.gene.probemap': Path('data/gencode.v23.annotation.gene.probemap')\n", + "}\n", + "\n", + "file_url_paths = {\n", + " \"target_RSEM_gene_fpkm.gz\": \"\",\n", + " \"gencode.v23.annotation.gene.probemap\": \"probeMap/\",\n", + " \"TARGET_phenotype.gz\": \"\"\n", + "}\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# Create the 'data' directory\n", + "Path('data').mkdir(exist_ok=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "target_RSEM_gene_fpkm.gz checksum: 4584cca6e5befebaead5f1865f23f517321dfc52cf9e752d87219f0bc33a14fc\n", + "TARGET_phenotype.gz checksum: b52ec780daaa940dbfdd76f5f764cde2567ba6e6acbbab7e889030ef9474a5cc\n", + "gencode.v23.annotation.gene.probemap checksum: 6783ea58791ae876efb697889a042cc7be8e32e40fc01191c622ef25d9416931\n" + ] + } + ], + "source": [ + "# Call the function\n", + "download_and_checksum(url_base, file_info_dict, file_url_paths)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Process TARGET PanCancer Data\n", + "Retrive the downloaded expression data, update gene identifiers to entrez, and curate sample IDs. The script will also identify a balanced hold-out test set to compare projection performance into learned latent spaces across algorithms. " + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "random.seed(0)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Read Phenotype Information" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(5958, 7)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sample_idprimary_disease_code_primary_diseasesample_type_code_sample_type_PATIENT_cohort
0TARGET-00-NAAEMA-20Non cancerous tissueNon cancerous tissueCELLCControl AnalyteNAAEMATARGET
1TARGET-00-NAAEMB-20Non cancerous tissueNon cancerous tissueCELLCControl AnalyteNAAEMBTARGET
2TARGET-00-NAAEMC-20Non cancerous tissueNon cancerous tissueCELLCControl AnalyteNAAEMCTARGET
\n", + "
" + ], + "text/plain": [ + " sample_id primary_disease_code _primary_disease \\\n", + "0 TARGET-00-NAAEMA-20 Non cancerous tissue Non cancerous tissue \n", + "1 TARGET-00-NAAEMB-20 Non cancerous tissue Non cancerous tissue \n", + "2 TARGET-00-NAAEMC-20 Non cancerous tissue Non cancerous tissue \n", + "\n", + " sample_type_code _sample_type _PATIENT _cohort \n", + "0 CELLC Control Analyte NAAEMA TARGET \n", + "1 CELLC Control Analyte NAAEMB TARGET \n", + "2 CELLC Control Analyte NAAEMC TARGET " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pheno_file = data_dir / 'TARGET_phenotype.parquet'\n", + "if not pheno_file.is_file():\n", + " pheno_df = pd.read_table(data_dir / 'TARGET_phenotype.gz')\n", + " pheno_df.to_parquet(pheno_file, index=False)\n", + "else:\n", + " pheno_df = pd.read_parquet(pheno_file)\n", + "\n", + "print(pheno_df.shape)\n", + "pheno_df.head(3)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Read Entrez ID Curation Information\n", + "Load curated gene names from versioned resource. See https://github.com/cognoma/genes for more details" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# Commit from https://github.com/cognoma/genes\n", + "genes_commit = 'ad9631bb4e77e2cdc5413b0d77cb8f7e93fc5bee'" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(20395, 7)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
entrez_gene_idsymboldescriptionchromosomegene_typesynonymsaliases
01A1BGalpha-1-B glycoprotein19protein-codingA1B|ABG|GAB|HYST2477alpha-1B-glycoprotein|HEL-S-163pA|epididymis s...
12A2Malpha-2-macroglobulin12protein-codingA2MD|CPAMD5|FWP007|S863-7alpha-2-macroglobulin|C3 and PZP-like alpha-2-...
\n", + "
" + ], + "text/plain": [ + " entrez_gene_id symbol description chromosome gene_type \\\n", + "0 1 A1BG alpha-1-B glycoprotein 19 protein-coding \n", + "1 2 A2M alpha-2-macroglobulin 12 protein-coding \n", + "\n", + " synonyms \\\n", + "0 A1B|ABG|GAB|HYST2477 \n", + "1 A2MD|CPAMD5|FWP007|S863-7 \n", + "\n", + " aliases \n", + "0 alpha-1B-glycoprotein|HEL-S-163pA|epididymis s... \n", + "1 alpha-2-macroglobulin|C3 and PZP-like alpha-2-... " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "url = 'https://raw.githubusercontent.com/cognoma/genes/{}/data/genes.tsv'.format(genes_commit)\n", + "gene_df = pd.read_table(url)\n", + "\n", + "# Only consider protein-coding genes\n", + "gene_df = (\n", + " gene_df.query(\"gene_type == 'protein-coding'\")\n", + ")\n", + "\n", + "\n", + "print(gene_df.shape)\n", + "gene_df.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "# Load gene updater - old to new Entrez gene identifiers\n", + "url = 'https://raw.githubusercontent.com/cognoma/genes/{}/data/updater.tsv'.format(genes_commit)\n", + "updater_df = pd.read_table(url)\n", + "old_to_new_entrez = dict(zip(updater_df.old_entrez_gene_id,\n", + " updater_df.new_entrez_gene_id))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Read Probe Mapping Info" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(18855, 13)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idgenechromchromStartchromEndstrandentrez_gene_idsymboldescriptionchromosomegene_typesynonymsaliases
0ENSG00000186092.4OR4F5chr16909170008+79501OR4F5olfactory receptor family 4 subfamily F member 51protein-codingNaNolfactory receptor 4F5
1ENSG00000278566.1OR4F29chr1450740451678-729759OR4F29olfactory receptor family 4 subfamily F member 291protein-codingOR7-21olfactory receptor 4F3/4F16/4F29|olfactory rec...
2ENSG00000273547.1OR4F16chr1685716686654-81399OR4F16olfactory receptor family 4 subfamily F member 161protein-codingOR1-1|OR7-21olfactory receptor 4F3/4F16/4F29|olfactory rec...
\n", + "
" + ], + "text/plain": [ + " id gene chrom chromStart chromEnd strand \\\n", + "0 ENSG00000186092.4 OR4F5 chr1 69091 70008 + \n", + "1 ENSG00000278566.1 OR4F29 chr1 450740 451678 - \n", + "2 ENSG00000273547.1 OR4F16 chr1 685716 686654 - \n", + "\n", + " entrez_gene_id symbol description \\\n", + "0 79501 OR4F5 olfactory receptor family 4 subfamily F member 5 \n", + "1 729759 OR4F29 olfactory receptor family 4 subfamily F member 29 \n", + "2 81399 OR4F16 olfactory receptor family 4 subfamily F member 16 \n", + "\n", + " chromosome gene_type synonyms \\\n", + "0 1 protein-coding NaN \n", + "1 1 protein-coding OR7-21 \n", + "2 1 protein-coding OR1-1|OR7-21 \n", + "\n", + " aliases \n", + "0 olfactory receptor 4F5 \n", + "1 olfactory receptor 4F3/4F16/4F29|olfactory rec... \n", + "2 olfactory receptor 4F3/4F16/4F29|olfactory rec... " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "probe_map_file = data_dir / 'gencode.v23.annotation.gene.probemap.parquet'\n", + "if not probe_map_file.is_file():\n", + " probe_map_df = pd.read_table(data_dir / 'gencode.v23.annotation.gene.probemap')\n", + " probe_map_df.to_parquet(probe_map_file, index=False)\n", + "else:\n", + " probe_map_df = pd.read_parquet(probe_map_file)\n", + "\n", + "# Inner merge gene df to get ensembl to entrez mapping\n", + "probe_map_df = probe_map_df.merge(gene_df, how='inner', left_on='gene', right_on='symbol')\n", + "ensembl_to_entrez = dict(zip(probe_map_df.id, probe_map_df.entrez_gene_id))\n", + "\n", + "print(probe_map_df.shape)\n", + "probe_map_df.head(3)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Read Gene Expression Data" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(60498, 734)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TARGET-30-PASWYR-01TARGET-20-PARUBT-09TARGET-30-PASNZU-01TARGET-52-PASDLA-11TARGET-50-PAKNRX-01TARGET-30-PASWFB-01TARGET-30-PALUYS-01TARGET-30-PAUDDK-01TARGET-10-PAPZNK-09TARGET-50-PAJLNJ-01...TARGET-20-PANLIZ-04TARGET-21-PASSLT-41TARGET-20-PASTTW-09TARGET-50-PAKYLT-01TARGET-20-PATJHJ-09TARGET-21-PATKKJ-41TARGET-10-PAPEJN-04TARGET-20-PABLDZ-09TARGET-10-PANSBR-09TARGET-10-PARFLV-04
sample
ENSG00000242268.2-3.0469-9.9658-3.0469-4.6082-5.5735-9.9658-9.9658-9.9658-9.9658-2.2447...-9.9658-9.9658-4.0350-9.9658-9.9658-9.9658-9.9658-9.9658-4.2934-9.9658
ENSG00000259041.1-9.9658-9.9658-9.9658-9.9658-9.9658-9.9658-9.9658-9.9658-9.9658-9.9658...-9.9658-9.9658-9.9658-9.9658-9.9658-9.9658-9.9658-9.9658-9.9658-9.9658
\n", + "

2 rows × 734 columns

\n", + "
" + ], + "text/plain": [ + " TARGET-30-PASWYR-01 TARGET-20-PARUBT-09 \\\n", + "sample \n", + "ENSG00000242268.2 -3.0469 -9.9658 \n", + "ENSG00000259041.1 -9.9658 -9.9658 \n", + "\n", + " TARGET-30-PASNZU-01 TARGET-52-PASDLA-11 \\\n", + "sample \n", + "ENSG00000242268.2 -3.0469 -4.6082 \n", + "ENSG00000259041.1 -9.9658 -9.9658 \n", + "\n", + " TARGET-50-PAKNRX-01 TARGET-30-PASWFB-01 \\\n", + "sample \n", + "ENSG00000242268.2 -5.5735 -9.9658 \n", + "ENSG00000259041.1 -9.9658 -9.9658 \n", + "\n", + " TARGET-30-PALUYS-01 TARGET-30-PAUDDK-01 \\\n", + "sample \n", + "ENSG00000242268.2 -9.9658 -9.9658 \n", + "ENSG00000259041.1 -9.9658 -9.9658 \n", + "\n", + " TARGET-10-PAPZNK-09 TARGET-50-PAJLNJ-01 ... \\\n", + "sample ... \n", + "ENSG00000242268.2 -9.9658 -2.2447 ... \n", + "ENSG00000259041.1 -9.9658 -9.9658 ... \n", + "\n", + " TARGET-20-PANLIZ-04 TARGET-21-PASSLT-41 \\\n", + "sample \n", + "ENSG00000242268.2 -9.9658 -9.9658 \n", + "ENSG00000259041.1 -9.9658 -9.9658 \n", + "\n", + " TARGET-20-PASTTW-09 TARGET-50-PAKYLT-01 \\\n", + "sample \n", + "ENSG00000242268.2 -4.0350 -9.9658 \n", + "ENSG00000259041.1 -9.9658 -9.9658 \n", + "\n", + " TARGET-20-PATJHJ-09 TARGET-21-PATKKJ-41 \\\n", + "sample \n", + "ENSG00000242268.2 -9.9658 -9.9658 \n", + "ENSG00000259041.1 -9.9658 -9.9658 \n", + "\n", + " TARGET-10-PAPEJN-04 TARGET-20-PABLDZ-09 \\\n", + "sample \n", + "ENSG00000242268.2 -9.9658 -9.9658 \n", + "ENSG00000259041.1 -9.9658 -9.9658 \n", + "\n", + " TARGET-10-PANSBR-09 TARGET-10-PARFLV-04 \n", + "sample \n", + "ENSG00000242268.2 -4.2934 -9.9658 \n", + "ENSG00000259041.1 -9.9658 -9.9658 \n", + "\n", + "[2 rows x 734 columns]" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "expr_file = data_dir / 'target_RSEM_gene_fpkm.parquet'\n", + "if not expr_file.is_file():\n", + " expr_df = pd.read_table(data_dir / 'target_RSEM_gene_fpkm.gz', index_col=0)\n", + " expr_df.to_parquet(expr_file)\n", + "else:\n", + " expr_df = pd.read_parquet(expr_file)\n", + "\n", + "print(expr_df.shape)\n", + "expr_df.head(2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Process gene expression matrix " + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(734, 18753)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
id12910121314151618...102724231102724398102724473102724536102724631102724862102724928105375355105378803105378952
sample_id
TARGET-10-PAKSWW-035.3754-1.1488-1.4305-9.9658-7.76965-9.96584.3786-1.31832.02890.7321...-2.6349-9.9658-4.60820.1648-9.9658-5.0116-9.9658-0.8863-9.9658-9.9658
TARGET-10-PAMXHJ-094.9388-1.28280.2881-9.9658-9.96580-9.96583.86601.40113.07912.6232...-2.2447-4.2934-4.60820.4016-9.9658-4.2934-9.9658-2.0529-9.9658-9.9658
\n", + "

2 rows × 18753 columns

\n", + "
" + ], + "text/plain": [ + "id 1 2 9 10 12 \\\n", + "sample_id \n", + "TARGET-10-PAKSWW-03 5.3754 -1.1488 -1.4305 -9.9658 -7.76965 \n", + "TARGET-10-PAMXHJ-09 4.9388 -1.2828 0.2881 -9.9658 -9.96580 \n", + "\n", + "id 13 14 15 16 18 \\\n", + "sample_id \n", + "TARGET-10-PAKSWW-03 -9.9658 4.3786 -1.3183 2.0289 0.7321 \n", + "TARGET-10-PAMXHJ-09 -9.9658 3.8660 1.4011 3.0791 2.6232 \n", + "\n", + "id ... 102724231 102724398 102724473 102724536 \\\n", + "sample_id ... \n", + "TARGET-10-PAKSWW-03 ... -2.6349 -9.9658 -4.6082 0.1648 \n", + "TARGET-10-PAMXHJ-09 ... -2.2447 -4.2934 -4.6082 0.4016 \n", + "\n", + "id 102724631 102724862 102724928 105375355 105378803 \\\n", + "sample_id \n", + "TARGET-10-PAKSWW-03 -9.9658 -5.0116 -9.9658 -0.8863 -9.9658 \n", + "TARGET-10-PAMXHJ-09 -9.9658 -4.2934 -9.9658 -2.0529 -9.9658 \n", + "\n", + "id 105378952 \n", + "sample_id \n", + "TARGET-10-PAKSWW-03 -9.9658 \n", + "TARGET-10-PAMXHJ-09 -9.9658 \n", + "\n", + "[2 rows x 18753 columns]" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "expr_df = (expr_df\n", + " .dropna(axis='rows')\n", + " .reindex(probe_map_df.id)\n", + " .rename(index=ensembl_to_entrez)\n", + " .rename(index=old_to_new_entrez)\n", + " .groupby(level=0).mean()\n", + " .transpose()\n", + " .sort_index(axis='rows')\n", + " .sort_index(axis='columns')\n", + ")\n", + "\n", + "\n", + "expr_df.index.rename('sample_id', inplace=True)\n", + "\n", + "\n", + "print(expr_df.shape)\n", + "expr_df.head(2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Stratify Balanced Training and Testing Sets in TARGET Gene Expression\n", + "Output training and testing gene expression datasets " + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "strat = pheno_df.set_index('sample_id').reindex(expr_df.index).primary_disease_code" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
n =count
0AML196
1ALL194
2NBL162
3WT132
4AML-IF32
5CCSK13
6RT5
\n", + "
" + ], + "text/plain": [ + " n = count\n", + "0 AML 196\n", + "1 ALL 194\n", + "2 NBL 162\n", + "3 WT 132\n", + "4 AML-IF 32\n", + "5 CCSK 13\n", + "6 RT 5" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cancertype_count_df = pd.DataFrame(strat.value_counts()).reset_index().rename({'index': 'cancertype', 'primary_disease_code': 'n ='}, axis='columns')\n", + "sample_counts_file = data_dir / 'target_sample_counts.parquet'\n", + "cancertype_count_df.to_parquet(sample_counts_file)\n", + "\n", + "cancertype_count_df" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "train_df, test_df = train_test_split(expr_df,\n", + " test_size=0.2,\n", + " random_state=0,\n", + " stratify=strat)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(587, 18753)\n" + ] + }, + { + "data": { + "text/plain": [ + "(147, 18753)" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "print(train_df.shape)\n", + "test_df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "train_file = data_dir / 'train_target_expression_matrix_processed.parquet'\n", + "train_df.to_parquet(train_file)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "test_file = data_dir / 'test_target_expression_matrix_processed.parquet'\n", + "test_df.to_parquet(test_file)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Sort genes based on median absolute deviation and output to file" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "mad_genes_df = pd.DataFrame(train_df.mad(axis=0).sort_values(ascending=False)).reset_index()\n", + "mad_genes_df.columns = ['gene_id', 'median_absolute_deviation']\n", + "mad_genes_file = data_dir / 'target_mad_genes.parquet'\n", + "mad_genes_df.to_parquet(mad_genes_file)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "gene_dependency_representations", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/4.gene_expression_signatures/scripts/1.download-validation-data.py b/4.gene_expression_signatures/scripts/1.download-validation-data.py new file mode 100644 index 0000000..bbf4d99 --- /dev/null +++ b/4.gene_expression_signatures/scripts/1.download-validation-data.py @@ -0,0 +1,208 @@ +#!/usr/bin/env python +# coding: utf-8 + +# # Download and Process Neuroblastoma RNAseq Data + +# Code from: https://github.com/greenelab/BioBombe/blob/master/10.gene-expression-signatures/0.download-validation-data.ipynb +# +# We are downloading the dataset associated with Harenza et al. 2017. The data profiles RNAseq data from 39 commonly used neuroblastoma (NBL) cell lines. +# +# We are interested in the MYCN amplification status of these cell lines. We will test if the MYCN amplification score learned through the BioBombe signature approach applied to TARGET data generalizes to this cell line dataset. +# +# MYCN Amplification refers to the number of copies of the MYCN gene. MYCN amplification is used as a biomarker for poor prognosis in neuroblastoma patients (Huang and Weiss 2013). + +# In[1]: + + +import requests +import pathlib +import pandas as pd +from urllib.request import urlretrieve + +from sklearn import preprocessing + + +# In[2]: + + +url = "https://ndownloader.figshare.com/files/14138792" +name = "2019-01-22-CellLineSTAR-fpkm-2pass_matrix.txt" +data_dir = pathlib.Path("data").resolve() +path = data_dir / name + + +# In[3]: + + +data_dir.mkdir(parents=True, exist_ok=True) + + +# In[4]: + + +urlretrieve(url, path) + + +# In[5]: + + +get_ipython().system(' md5sum "data/2019-01-22-CellLineSTAR-fpkm-2pass_matrix.txt"') + + +# # Download Phenotype Data + +# In[6]: + + +url = "https://www.nature.com/articles/sdata201733/tables/4" +name = "nbl_cellline_phenotype.txt" +path = data_dir / name + + +# In[7]: + + +if not path.is_file(): + html = requests.get(url).content + + pheno_df = pd.read_html(html)[0] + pheno_df['Cell Line'] = pheno_df['Cell Line'].str.replace("-", "") + + pheno_df.to_csv(path, sep='\t', index=False) + +else: + pheno_df = pd.read_parquet(path) + +pheno_df.head() + + +# In[8]: + + +get_ipython().system(' md5sum "data/nbl_cellline_phenotype.txt"') + + +# # Process RNAseq Data + +# In[9]: + + +raw_file = data_dir / "2019-01-22-CellLineSTAR-fpkm-2pass_matrix.txt" +raw_df = pd.read_table(raw_file, sep='\t') +raw_df.head() + + +# # Update Gene Names + +# In[10]: + + +# Load curated gene names from versioned resource +commit = '721204091a96e55de6dcad165d6d8265e67e2a48' +url = 'https://raw.githubusercontent.com/cognoma/genes/{}/data/genes.tsv'.format(commit) +gene_df = pd.read_table(url) + +# Only consider protein-coding genes +gene_df = ( + gene_df.query("gene_type == 'protein-coding'") +) + +symbol_to_entrez = dict(zip(gene_df.symbol, + gene_df.entrez_gene_id)) + + +# In[11]: + + +# Add alternative symbols to entrez mapping dictionary +gene_df = gene_df.dropna(axis='rows', subset=['synonyms']) +gene_df.synonyms = gene_df.synonyms.str.split('|') + +all_syn = ( + gene_df.apply(lambda x: pd.Series(x.synonyms), axis=1) + .stack() + .reset_index(level=1, drop=True) +) + +# Name the synonym series and join with rest of genes +all_syn.name = 'all_synonyms' +gene_with_syn_df = gene_df.join(all_syn) + +# Remove rows that have redundant symbols in all_synonyms +gene_with_syn_df = ( + gene_with_syn_df + + # Drop synonyms that are duplicated - can't be sure of mapping + .drop_duplicates(['all_synonyms'], keep=False) + + # Drop rows in which the symbol appears in the list of synonyms + .query('symbol not in all_synonyms') +) + + +# In[12]: + + +# Create a synonym to entrez mapping and add to dictionary +synonym_to_entrez = dict(zip(gene_with_syn_df.all_synonyms, + gene_with_syn_df.entrez_gene_id)) + +symbol_to_entrez.update(synonym_to_entrez) + + +# In[13]: + + +# Load gene updater +url = 'https://raw.githubusercontent.com/cognoma/genes/{}/data/updater.tsv'.format(commit) +updater_df = pd.read_table(url) +old_to_new_entrez = dict(zip(updater_df.old_entrez_gene_id, + updater_df.new_entrez_gene_id)) + + +# In[14]: + + +gene_map = raw_df.GeneID.replace(symbol_to_entrez) +gene_map = gene_map.replace(old_to_new_entrez) + + +# In[15]: + + +raw_df.index = gene_map +raw_df.index.name = 'entrez_gene_id' +raw_df = raw_df.drop(['GeneID'], axis='columns') +raw_df = raw_df.loc[raw_df.index.isin(symbol_to_entrez.values()), :] + +print(raw_df.shape) +raw_df.head() + + +# # Scale Data and Output + +# In[16]: + + +#MinMax consistent with BetaVAE scaling +raw_scaled_df = preprocessing.MinMaxScaler().fit_transform(raw_df.transpose()) +raw_scaled_df = ( + pd.DataFrame(raw_scaled_df, + columns=raw_df.index, + index=raw_df.columns) + .sort_index(axis='columns') + .sort_index(axis='rows') +) +raw_scaled_df.columns = raw_scaled_df.columns.astype(str) +raw_scaled_df = raw_scaled_df.loc[:, ~raw_scaled_df.columns.duplicated(keep='first')] + +raw_scaled_df.head() + + +# In[17]: + + +# In[17] +processed_file = data_dir / 'nbl_celllines_processed_matrix.parquet' +raw_scaled_df.to_parquet(processed_file) + diff --git a/4.gene_expression_signatures/scripts/2.download-target-data.py b/4.gene_expression_signatures/scripts/2.download-target-data.py new file mode 100644 index 0000000..e9a4ab2 --- /dev/null +++ b/4.gene_expression_signatures/scripts/2.download-target-data.py @@ -0,0 +1,274 @@ +#!/usr/bin/env python +# coding: utf-8 + +# # Download TARGET Pan Cancer Data for Compression +# The notebook downloads gene expression and clinical data from the TARGET project. The data is downloaded from UCSC Xena. +# The data is in log2(FPKM) RSEM transformed + +# In[1]: + + +import pathlib +from pathlib import Path +import hashlib +from urllib.request import urlretrieve +import random +import pandas as pd +from sklearn.model_selection import train_test_split + + +# In[2]: + + +def download_and_checksum(url_base, file_info_dict, file_url_paths): + """ + Downloads files from the specified base URL and computes the SHA-256 checksum for each file. + + Parameters: + ---------- + url_base : str + The base URL from which the files will be downloaded. + + file_info_dict : dict + A dictionary where the keys are file names and the values are the local file paths where + the files will be saved. + + file_url_paths : dict + A dictionary mapping each file name to the corresponding sub-path (if any) that should + be appended to the base URL before downloading the file. + """ + for name, path in file_info_dict.items(): + # Determine the correct URL with or without sub-path + file_url = f"{url_base}{file_url_paths[name]}{name}" + + # Download the file + urlretrieve(file_url, path) + + # Compute checksum + md5_hash = hashlib.md5() + with open(path, "rb") as f: + for byte_block in iter(lambda: f.read(4096), b""): + md5_hash.update(byte_block) + + print(f"{name} checksum: {md5_hash.hexdigest()}") + + +# In[3]: + + +# Base URL +url_base = 'https://toil-xena-hub.s3.us-east-1.amazonaws.com/download/' +data_dir = pathlib.Path('data').resolve() + + +# In[4]: + + +# Dictionary for file names and their paths +file_info_dict = { + 'target_RSEM_gene_fpkm.gz': Path('data/target_RSEM_gene_fpkm.gz'), + 'TARGET_phenotype.gz': Path('data/TARGET_phenotype.gz'), + 'gencode.v23.annotation.gene.probemap': Path('data/gencode.v23.annotation.gene.probemap') +} + +file_url_paths = { + "target_RSEM_gene_fpkm.gz": "", + "gencode.v23.annotation.gene.probemap": "probeMap/", + "TARGET_phenotype.gz": "" +} + + +# In[5]: + + +# Create the 'data' directory +Path('data').mkdir(exist_ok=True) + + +# In[6]: + + +# Call the function +download_and_checksum(url_base, file_info_dict, file_url_paths) + + +# # Process TARGET PanCancer Data +# Retrive the downloaded expression data, update gene identifiers to entrez, and curate sample IDs. The script will also identify a balanced hold-out test set to compare projection performance into learned latent spaces across algorithms. + +# In[7]: + + +random.seed(0) + + +# # Read Phenotype Information + +# In[8]: + + +pheno_file = data_dir / 'TARGET_phenotype.parquet' +if not pheno_file.is_file(): + pheno_df = pd.read_table(data_dir / 'TARGET_phenotype.gz') + pheno_df.to_parquet(pheno_file, index=False) +else: + pheno_df = pd.read_parquet(pheno_file) + +print(pheno_df.shape) +pheno_df.head(3) + + +# # Read Entrez ID Curation Information +# Load curated gene names from versioned resource. See https://github.com/cognoma/genes for more details + +# In[9]: + + +# Commit from https://github.com/cognoma/genes +genes_commit = 'ad9631bb4e77e2cdc5413b0d77cb8f7e93fc5bee' + + +# In[10]: + + +url = 'https://raw.githubusercontent.com/cognoma/genes/{}/data/genes.tsv'.format(genes_commit) +gene_df = pd.read_table(url) + +# Only consider protein-coding genes +gene_df = ( + gene_df.query("gene_type == 'protein-coding'") +) + + +print(gene_df.shape) +gene_df.head(2) + + +# In[11]: + + +# Load gene updater - old to new Entrez gene identifiers +url = 'https://raw.githubusercontent.com/cognoma/genes/{}/data/updater.tsv'.format(genes_commit) +updater_df = pd.read_table(url) +old_to_new_entrez = dict(zip(updater_df.old_entrez_gene_id, + updater_df.new_entrez_gene_id)) + + +# # Read Probe Mapping Info + +# In[12]: + + +probe_map_file = data_dir / 'gencode.v23.annotation.gene.probemap.parquet' +if not probe_map_file.is_file(): + probe_map_df = pd.read_table(data_dir / 'gencode.v23.annotation.gene.probemap') + probe_map_df.to_parquet(probe_map_file, index=False) +else: + probe_map_df = pd.read_parquet(probe_map_file) + +# Inner merge gene df to get ensembl to entrez mapping +probe_map_df = probe_map_df.merge(gene_df, how='inner', left_on='gene', right_on='symbol') +ensembl_to_entrez = dict(zip(probe_map_df.id, probe_map_df.entrez_gene_id)) + +print(probe_map_df.shape) +probe_map_df.head(3) + + +# # Read Gene Expression Data + +# In[13]: + + +expr_file = data_dir / 'target_RSEM_gene_fpkm.parquet' +if not expr_file.is_file(): + expr_df = pd.read_table(data_dir / 'target_RSEM_gene_fpkm.gz', index_col=0) + expr_df.to_parquet(expr_file) +else: + expr_df = pd.read_parquet(expr_file) + +print(expr_df.shape) +expr_df.head(2) + + +# # Process gene expression matrix + +# In[14]: + + +expr_df = (expr_df + .dropna(axis='rows') + .reindex(probe_map_df.id) + .rename(index=ensembl_to_entrez) + .rename(index=old_to_new_entrez) + .groupby(level=0).mean() + .transpose() + .sort_index(axis='rows') + .sort_index(axis='columns') +) + + +expr_df.index.rename('sample_id', inplace=True) + + +print(expr_df.shape) +expr_df.head(2) + + +# # Stratify Balanced Training and Testing Sets in TARGET Gene Expression +# Output training and testing gene expression datasets + +# In[15]: + + +strat = pheno_df.set_index('sample_id').reindex(expr_df.index).primary_disease_code + + +# In[16]: + + +cancertype_count_df = pd.DataFrame(strat.value_counts()).reset_index().rename({'index': 'cancertype', 'primary_disease_code': 'n ='}, axis='columns') +sample_counts_file = data_dir / 'target_sample_counts.parquet' +cancertype_count_df.to_parquet(sample_counts_file) + +cancertype_count_df + + +# In[17]: + + +train_df, test_df = train_test_split(expr_df, + test_size=0.2, + random_state=0, + stratify=strat) + + +# In[18]: + + +print(train_df.shape) +test_df.shape + + +# In[19]: + + +train_file = data_dir / 'train_target_expression_matrix_processed.parquet' +train_df.to_parquet(train_file) + + +# In[20]: + + +test_file = data_dir / 'test_target_expression_matrix_processed.parquet' +test_df.to_parquet(test_file) + + +# # Sort genes based on median absolute deviation and output to file + +# In[ ]: + + +mad_genes_df = pd.DataFrame(train_df.mad(axis=0).sort_values(ascending=False)).reset_index() +mad_genes_df.columns = ['gene_id', 'median_absolute_deviation'] +mad_genes_file = data_dir / 'target_mad_genes.parquet' +mad_genes_df.to_parquet(mad_genes_file) + diff --git a/environment.yml b/environment.yml index 5e20fa7..311180e 100644 --- a/environment.yml +++ b/environment.yml @@ -22,7 +22,15 @@ dependencies: - conda-forge::matplotlib=>3.6.0 - conda-forge::keras-tuner=>1.1.3 - conda-forge::hiplot=>0.1.33 + - conda-forge::optuna + - conda-forge::pytorch=>2.4 + - conda-forge::torchvision + - conda-forge::plotly + - conda-forge::pyarrow + - conda-forge::lxml + - pip: - blitzgsea==1.3.33 +