From c60fab669e031bba62ef83765fa3e52d051aea4b Mon Sep 17 00:00:00 2001
From: Julia Curd <79439229+jaceybronte@users.noreply.github.com>
Date: Mon, 9 Sep 2024 14:42:17 -0600
Subject: [PATCH] Data download for model comparison (#54)

* Data download for model comparison

* Fixes
---
 .../1.download-validation-data.ipynb          | 1108 ++++++++++++++++
 .../2.download-target-data.ipynb              | 1158 +++++++++++++++++
 .../scripts/1.download-validation-data.py     |  208 +++
 .../scripts/2.download-target-data.py         |  274 ++++
 environment.yml                               |    8 +
 5 files changed, 2756 insertions(+)
 create mode 100644 4.gene_expression_signatures/1.download-validation-data.ipynb
 create mode 100644 4.gene_expression_signatures/2.download-target-data.ipynb
 create mode 100644 4.gene_expression_signatures/scripts/1.download-validation-data.py
 create mode 100644 4.gene_expression_signatures/scripts/2.download-target-data.py

diff --git a/4.gene_expression_signatures/1.download-validation-data.ipynb b/4.gene_expression_signatures/1.download-validation-data.ipynb
new file mode 100644
index 0000000..1821e71
--- /dev/null
+++ b/4.gene_expression_signatures/1.download-validation-data.ipynb
@@ -0,0 +1,1108 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Download and Process Neuroblastoma RNAseq Data"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Code from: https://github.com/greenelab/BioBombe/blob/master/10.gene-expression-signatures/0.download-validation-data.ipynb\n",
+    "\n",
+    "We are downloading the dataset associated with Harenza et al. 2017. The data profiles RNAseq data from 39 commonly used neuroblastoma (NBL) cell lines.\n",
+    "\n",
+    "We are interested in the MYCN amplification status of these cell lines. We will test if the MYCN amplification score learned through the BioBombe signature approach applied to TARGET data generalizes to this cell line dataset.\n",
+    "\n",
+    "MYCN Amplification refers to the number of copies of the MYCN gene. MYCN amplification is used as a biomarker for poor prognosis in neuroblastoma patients (Huang and Weiss 2013)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import requests\n",
+    "import pathlib\n",
+    "import pandas as pd\n",
+    "from urllib.request import urlretrieve\n",
+    "\n",
+    "from sklearn import preprocessing"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "url = \"https://ndownloader.figshare.com/files/14138792\"\n",
+    "name = \"2019-01-22-CellLineSTAR-fpkm-2pass_matrix.txt\"\n",
+    "data_dir = pathlib.Path(\"data\").resolve()\n",
+    "path = data_dir / name"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data_dir.mkdir(parents=True, exist_ok=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(PosixPath('data/2019-01-22-CellLineSTAR-fpkm-2pass_matrix.txt'),\n",
+       " <http.client.HTTPMessage at 0x7ca24d7b86b0>)"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "urlretrieve(url, path)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "55ea0255d1aa7708eba2ebd0113eeb3f  data/2019-01-22-CellLineSTAR-fpkm-2pass_matrix.txt\n"
+     ]
+    }
+   ],
+   "source": [
+    "! md5sum \"data/2019-01-22-CellLineSTAR-fpkm-2pass_matrix.txt\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Download Phenotype Data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "url = \"https://www.nature.com/articles/sdata201733/tables/4\"\n",
+    "name = \"nbl_cellline_phenotype.txt\"\n",
+    "path = data_dir / name"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Cell Line</th>\n",
+       "      <th>MYCN status</th>\n",
+       "      <th>1p36 del</th>\n",
+       "      <th>3p26 del</th>\n",
+       "      <th>11q23 del</th>\n",
+       "      <th>17q21-qter unbal gain</th>\n",
+       "      <th>ALK mutation</th>\n",
+       "      <th>p53 mutation</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>CHP134</td>\n",
+       "      <td>Amplified</td>\n",
+       "      <td>LOH p32.3-pter; Gain p34.3-p36.22; Loss p36.22...</td>\n",
+       "      <td>Gain/AI p26.3</td>\n",
+       "      <td>None</td>\n",
+       "      <td>Gain q12-qter</td>\n",
+       "      <td>WT</td>\n",
+       "      <td>WT</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>CHP212</td>\n",
+       "      <td>Amplified</td>\n",
+       "      <td>Loss p13.2-pter</td>\n",
+       "      <td>Gain/AI p26.3</td>\n",
+       "      <td>cnLOH 23.3</td>\n",
+       "      <td>Gain q12-qter</td>\n",
+       "      <td>WT</td>\n",
+       "      <td>WT</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>COGN415</td>\n",
+       "      <td>Amplified</td>\n",
+       "      <td>Unknown</td>\n",
+       "      <td>Unknown</td>\n",
+       "      <td>Unknown</td>\n",
+       "      <td>Unknown</td>\n",
+       "      <td>F1174L</td>\n",
+       "      <td>WT</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>COGN440</td>\n",
+       "      <td>Amplified</td>\n",
+       "      <td>Unknown</td>\n",
+       "      <td>Unknown</td>\n",
+       "      <td>Unknown</td>\n",
+       "      <td>Unknown</td>\n",
+       "      <td>WT</td>\n",
+       "      <td>WT</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>COGN453</td>\n",
+       "      <td>Amplified</td>\n",
+       "      <td>Unknown</td>\n",
+       "      <td>Unknown</td>\n",
+       "      <td>Unknown</td>\n",
+       "      <td>Unknown</td>\n",
+       "      <td>F1174L</td>\n",
+       "      <td>WT</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "  Cell Line MYCN status                                           1p36 del  \\\n",
+       "0    CHP134   Amplified  LOH p32.3-pter; Gain p34.3-p36.22; Loss p36.22...   \n",
+       "1    CHP212   Amplified                                    Loss p13.2-pter   \n",
+       "2   COGN415   Amplified                                            Unknown   \n",
+       "3   COGN440   Amplified                                            Unknown   \n",
+       "4   COGN453   Amplified                                            Unknown   \n",
+       "\n",
+       "        3p26 del   11q23 del 17q21-qter unbal gain ALK mutation p53 mutation  \n",
+       "0  Gain/AI p26.3        None         Gain q12-qter           WT           WT  \n",
+       "1  Gain/AI p26.3  cnLOH 23.3         Gain q12-qter           WT           WT  \n",
+       "2        Unknown     Unknown               Unknown       F1174L           WT  \n",
+       "3        Unknown     Unknown               Unknown           WT           WT  \n",
+       "4        Unknown     Unknown               Unknown       F1174L           WT  "
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "if not path.is_file():\n",
+    "    html = requests.get(url).content\n",
+    "\n",
+    "    pheno_df = pd.read_html(html)[0]\n",
+    "    pheno_df['Cell Line'] = pheno_df['Cell Line'].str.replace(\"-\", \"\")\n",
+    "\n",
+    "    pheno_df.to_csv(path, sep='\\t', index=False)\n",
+    "\n",
+    "else:\n",
+    "    pheno_df = pd.read_parquet(path)\n",
+    "\n",
+    "pheno_df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "c3b0fdeb448296445567e59995f2342c  data/nbl_cellline_phenotype.txt\n"
+     ]
+    }
+   ],
+   "source": [
+    "! md5sum \"data/nbl_cellline_phenotype.txt\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Process RNAseq Data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>GeneID</th>\n",
+       "      <th>CHP134</th>\n",
+       "      <th>CHP212</th>\n",
+       "      <th>COGN415</th>\n",
+       "      <th>COGN440</th>\n",
+       "      <th>COGN453</th>\n",
+       "      <th>COGN471</th>\n",
+       "      <th>COGN496</th>\n",
+       "      <th>COGN519</th>\n",
+       "      <th>COGN534</th>\n",
+       "      <th>...</th>\n",
+       "      <th>RPE1</th>\n",
+       "      <th>SHSY5Y</th>\n",
+       "      <th>SKNAS</th>\n",
+       "      <th>SKNBE2</th>\n",
+       "      <th>SKNBE2C</th>\n",
+       "      <th>SKNDZ</th>\n",
+       "      <th>SKNFI</th>\n",
+       "      <th>SKNSH</th>\n",
+       "      <th>SMSKAN</th>\n",
+       "      <th>SMSSAN</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>EAF1</th>\n",
+       "      <td>EAF1</td>\n",
+       "      <td>8.783210</td>\n",
+       "      <td>6.863459</td>\n",
+       "      <td>5.462931</td>\n",
+       "      <td>10.104389</td>\n",
+       "      <td>6.604241</td>\n",
+       "      <td>4.866576</td>\n",
+       "      <td>9.329809</td>\n",
+       "      <td>4.401073</td>\n",
+       "      <td>11.083115</td>\n",
+       "      <td>...</td>\n",
+       "      <td>4.363805</td>\n",
+       "      <td>8.555519</td>\n",
+       "      <td>6.260962</td>\n",
+       "      <td>4.842320</td>\n",
+       "      <td>4.028825</td>\n",
+       "      <td>8.861676</td>\n",
+       "      <td>19.390770</td>\n",
+       "      <td>9.936771</td>\n",
+       "      <td>6.871837</td>\n",
+       "      <td>7.696442</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>SARNP</th>\n",
+       "      <td>SARNP</td>\n",
+       "      <td>12.752439</td>\n",
+       "      <td>23.717824</td>\n",
+       "      <td>22.162407</td>\n",
+       "      <td>27.401760</td>\n",
+       "      <td>19.256736</td>\n",
+       "      <td>9.157380</td>\n",
+       "      <td>23.000173</td>\n",
+       "      <td>25.250549</td>\n",
+       "      <td>23.622492</td>\n",
+       "      <td>...</td>\n",
+       "      <td>25.264710</td>\n",
+       "      <td>31.705089</td>\n",
+       "      <td>22.246654</td>\n",
+       "      <td>24.115542</td>\n",
+       "      <td>28.652249</td>\n",
+       "      <td>32.976600</td>\n",
+       "      <td>38.725041</td>\n",
+       "      <td>15.290273</td>\n",
+       "      <td>18.659111</td>\n",
+       "      <td>31.636539</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>CXCR5</th>\n",
+       "      <td>CXCR5</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.004727</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>KRTAP4-2</th>\n",
+       "      <td>KRTAP4-2</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>MAPK7</th>\n",
+       "      <td>MAPK7</td>\n",
+       "      <td>2.778997</td>\n",
+       "      <td>2.779451</td>\n",
+       "      <td>2.718453</td>\n",
+       "      <td>1.677380</td>\n",
+       "      <td>3.709534</td>\n",
+       "      <td>0.988977</td>\n",
+       "      <td>1.575597</td>\n",
+       "      <td>1.455281</td>\n",
+       "      <td>2.519137</td>\n",
+       "      <td>...</td>\n",
+       "      <td>1.971220</td>\n",
+       "      <td>5.174954</td>\n",
+       "      <td>1.115360</td>\n",
+       "      <td>1.679980</td>\n",
+       "      <td>0.971465</td>\n",
+       "      <td>3.334610</td>\n",
+       "      <td>2.354082</td>\n",
+       "      <td>2.788888</td>\n",
+       "      <td>5.015511</td>\n",
+       "      <td>3.019704</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>5 rows × 41 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "            GeneID     CHP134     CHP212    COGN415    COGN440    COGN453  \\\n",
+       "EAF1          EAF1   8.783210   6.863459   5.462931  10.104389   6.604241   \n",
+       "SARNP        SARNP  12.752439  23.717824  22.162407  27.401760  19.256736   \n",
+       "CXCR5        CXCR5   0.000000   0.000000   0.000000   0.000000   0.000000   \n",
+       "KRTAP4-2  KRTAP4-2   0.000000   0.000000   0.000000   0.000000   0.000000   \n",
+       "MAPK7        MAPK7   2.778997   2.779451   2.718453   1.677380   3.709534   \n",
+       "\n",
+       "           COGN471    COGN496    COGN519    COGN534  ...       RPE1  \\\n",
+       "EAF1      4.866576   9.329809   4.401073  11.083115  ...   4.363805   \n",
+       "SARNP     9.157380  23.000173  25.250549  23.622492  ...  25.264710   \n",
+       "CXCR5     0.000000   0.000000   0.004727   0.000000  ...   0.000000   \n",
+       "KRTAP4-2  0.000000   0.000000   0.000000   0.000000  ...   0.000000   \n",
+       "MAPK7     0.988977   1.575597   1.455281   2.519137  ...   1.971220   \n",
+       "\n",
+       "             SHSY5Y      SKNAS     SKNBE2    SKNBE2C      SKNDZ      SKNFI  \\\n",
+       "EAF1       8.555519   6.260962   4.842320   4.028825   8.861676  19.390770   \n",
+       "SARNP     31.705089  22.246654  24.115542  28.652249  32.976600  38.725041   \n",
+       "CXCR5      0.000000   0.000000   0.000000   0.000000   0.000000   0.000000   \n",
+       "KRTAP4-2   0.000000   0.000000   0.000000   0.000000   0.000000   0.000000   \n",
+       "MAPK7      5.174954   1.115360   1.679980   0.971465   3.334610   2.354082   \n",
+       "\n",
+       "              SKNSH     SMSKAN     SMSSAN  \n",
+       "EAF1       9.936771   6.871837   7.696442  \n",
+       "SARNP     15.290273  18.659111  31.636539  \n",
+       "CXCR5      0.000000   0.000000   0.000000  \n",
+       "KRTAP4-2   0.000000   0.000000   0.000000  \n",
+       "MAPK7      2.788888   5.015511   3.019704  \n",
+       "\n",
+       "[5 rows x 41 columns]"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "raw_file = data_dir / \"2019-01-22-CellLineSTAR-fpkm-2pass_matrix.txt\"\n",
+    "raw_df = pd.read_table(raw_file, sep='\\t')\n",
+    "raw_df.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Update Gene Names"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load curated gene names from versioned resource \n",
+    "commit = '721204091a96e55de6dcad165d6d8265e67e2a48'\n",
+    "url = 'https://raw.githubusercontent.com/cognoma/genes/{}/data/genes.tsv'.format(commit)\n",
+    "gene_df = pd.read_table(url)\n",
+    "\n",
+    "# Only consider protein-coding genes\n",
+    "gene_df = (\n",
+    "    gene_df.query(\"gene_type == 'protein-coding'\")\n",
+    ")\n",
+    "\n",
+    "symbol_to_entrez = dict(zip(gene_df.symbol,\n",
+    "                            gene_df.entrez_gene_id))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Add alternative symbols to entrez mapping dictionary\n",
+    "gene_df = gene_df.dropna(axis='rows', subset=['synonyms'])\n",
+    "gene_df.synonyms = gene_df.synonyms.str.split('|')\n",
+    "\n",
+    "all_syn = (\n",
+    "    gene_df.apply(lambda x: pd.Series(x.synonyms), axis=1)\n",
+    "    .stack()\n",
+    "    .reset_index(level=1, drop=True)\n",
+    ")\n",
+    "\n",
+    "# Name the synonym series and join with rest of genes\n",
+    "all_syn.name = 'all_synonyms'\n",
+    "gene_with_syn_df = gene_df.join(all_syn)\n",
+    "\n",
+    "# Remove rows that have redundant symbols in all_synonyms\n",
+    "gene_with_syn_df = (\n",
+    "    gene_with_syn_df\n",
+    "    \n",
+    "    # Drop synonyms that are duplicated - can't be sure of mapping\n",
+    "    .drop_duplicates(['all_synonyms'], keep=False)\n",
+    "\n",
+    "    # Drop rows in which the symbol appears in the list of synonyms\n",
+    "    .query('symbol not in all_synonyms')\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create a synonym to entrez mapping and add to dictionary\n",
+    "synonym_to_entrez = dict(zip(gene_with_syn_df.all_synonyms,\n",
+    "                             gene_with_syn_df.entrez_gene_id))\n",
+    "\n",
+    "symbol_to_entrez.update(synonym_to_entrez)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load gene updater\n",
+    "url = 'https://raw.githubusercontent.com/cognoma/genes/{}/data/updater.tsv'.format(commit)\n",
+    "updater_df = pd.read_table(url)\n",
+    "old_to_new_entrez = dict(zip(updater_df.old_entrez_gene_id,\n",
+    "                             updater_df.new_entrez_gene_id))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "gene_map = raw_df.GeneID.replace(symbol_to_entrez)\n",
+    "gene_map = gene_map.replace(old_to_new_entrez)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(19287, 40)\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>CHP134</th>\n",
+       "      <th>CHP212</th>\n",
+       "      <th>COGN415</th>\n",
+       "      <th>COGN440</th>\n",
+       "      <th>COGN453</th>\n",
+       "      <th>COGN471</th>\n",
+       "      <th>COGN496</th>\n",
+       "      <th>COGN519</th>\n",
+       "      <th>COGN534</th>\n",
+       "      <th>COGN549</th>\n",
+       "      <th>...</th>\n",
+       "      <th>RPE1</th>\n",
+       "      <th>SHSY5Y</th>\n",
+       "      <th>SKNAS</th>\n",
+       "      <th>SKNBE2</th>\n",
+       "      <th>SKNBE2C</th>\n",
+       "      <th>SKNDZ</th>\n",
+       "      <th>SKNFI</th>\n",
+       "      <th>SKNSH</th>\n",
+       "      <th>SMSKAN</th>\n",
+       "      <th>SMSSAN</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>entrez_gene_id</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>10847</th>\n",
+       "      <td>8.783210</td>\n",
+       "      <td>6.863459</td>\n",
+       "      <td>5.462931</td>\n",
+       "      <td>10.104389</td>\n",
+       "      <td>6.604241</td>\n",
+       "      <td>4.866576</td>\n",
+       "      <td>9.329809</td>\n",
+       "      <td>4.401073</td>\n",
+       "      <td>11.083115</td>\n",
+       "      <td>5.182270</td>\n",
+       "      <td>...</td>\n",
+       "      <td>4.363805</td>\n",
+       "      <td>8.555519</td>\n",
+       "      <td>6.260962</td>\n",
+       "      <td>4.842320</td>\n",
+       "      <td>4.028825</td>\n",
+       "      <td>8.861676</td>\n",
+       "      <td>19.390770</td>\n",
+       "      <td>9.936771</td>\n",
+       "      <td>6.871837</td>\n",
+       "      <td>7.696442</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>84324</th>\n",
+       "      <td>12.752439</td>\n",
+       "      <td>23.717824</td>\n",
+       "      <td>22.162407</td>\n",
+       "      <td>27.401760</td>\n",
+       "      <td>19.256736</td>\n",
+       "      <td>9.157380</td>\n",
+       "      <td>23.000173</td>\n",
+       "      <td>25.250549</td>\n",
+       "      <td>23.622492</td>\n",
+       "      <td>12.517533</td>\n",
+       "      <td>...</td>\n",
+       "      <td>25.264710</td>\n",
+       "      <td>31.705089</td>\n",
+       "      <td>22.246654</td>\n",
+       "      <td>24.115542</td>\n",
+       "      <td>28.652249</td>\n",
+       "      <td>32.976600</td>\n",
+       "      <td>38.725041</td>\n",
+       "      <td>15.290273</td>\n",
+       "      <td>18.659111</td>\n",
+       "      <td>31.636539</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>643</th>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.004727</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>85291</th>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5598</th>\n",
+       "      <td>2.778997</td>\n",
+       "      <td>2.779451</td>\n",
+       "      <td>2.718453</td>\n",
+       "      <td>1.677380</td>\n",
+       "      <td>3.709534</td>\n",
+       "      <td>0.988977</td>\n",
+       "      <td>1.575597</td>\n",
+       "      <td>1.455281</td>\n",
+       "      <td>2.519137</td>\n",
+       "      <td>1.208765</td>\n",
+       "      <td>...</td>\n",
+       "      <td>1.971220</td>\n",
+       "      <td>5.174954</td>\n",
+       "      <td>1.115360</td>\n",
+       "      <td>1.679980</td>\n",
+       "      <td>0.971465</td>\n",
+       "      <td>3.334610</td>\n",
+       "      <td>2.354082</td>\n",
+       "      <td>2.788888</td>\n",
+       "      <td>5.015511</td>\n",
+       "      <td>3.019704</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>5 rows × 40 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                   CHP134     CHP212    COGN415    COGN440    COGN453  \\\n",
+       "entrez_gene_id                                                          \n",
+       "10847            8.783210   6.863459   5.462931  10.104389   6.604241   \n",
+       "84324           12.752439  23.717824  22.162407  27.401760  19.256736   \n",
+       "643              0.000000   0.000000   0.000000   0.000000   0.000000   \n",
+       "85291            0.000000   0.000000   0.000000   0.000000   0.000000   \n",
+       "5598             2.778997   2.779451   2.718453   1.677380   3.709534   \n",
+       "\n",
+       "                 COGN471    COGN496    COGN519    COGN534    COGN549  ...  \\\n",
+       "entrez_gene_id                                                        ...   \n",
+       "10847           4.866576   9.329809   4.401073  11.083115   5.182270  ...   \n",
+       "84324           9.157380  23.000173  25.250549  23.622492  12.517533  ...   \n",
+       "643             0.000000   0.000000   0.004727   0.000000   0.000000  ...   \n",
+       "85291           0.000000   0.000000   0.000000   0.000000   0.000000  ...   \n",
+       "5598            0.988977   1.575597   1.455281   2.519137   1.208765  ...   \n",
+       "\n",
+       "                     RPE1     SHSY5Y      SKNAS     SKNBE2    SKNBE2C  \\\n",
+       "entrez_gene_id                                                          \n",
+       "10847            4.363805   8.555519   6.260962   4.842320   4.028825   \n",
+       "84324           25.264710  31.705089  22.246654  24.115542  28.652249   \n",
+       "643              0.000000   0.000000   0.000000   0.000000   0.000000   \n",
+       "85291            0.000000   0.000000   0.000000   0.000000   0.000000   \n",
+       "5598             1.971220   5.174954   1.115360   1.679980   0.971465   \n",
+       "\n",
+       "                    SKNDZ      SKNFI      SKNSH     SMSKAN     SMSSAN  \n",
+       "entrez_gene_id                                                         \n",
+       "10847            8.861676  19.390770   9.936771   6.871837   7.696442  \n",
+       "84324           32.976600  38.725041  15.290273  18.659111  31.636539  \n",
+       "643              0.000000   0.000000   0.000000   0.000000   0.000000  \n",
+       "85291            0.000000   0.000000   0.000000   0.000000   0.000000  \n",
+       "5598             3.334610   2.354082   2.788888   5.015511   3.019704  \n",
+       "\n",
+       "[5 rows x 40 columns]"
+      ]
+     },
+     "execution_count": 15,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "raw_df.index = gene_map\n",
+    "raw_df.index.name = 'entrez_gene_id'\n",
+    "raw_df = raw_df.drop(['GeneID'], axis='columns')\n",
+    "raw_df = raw_df.loc[raw_df.index.isin(symbol_to_entrez.values()), :]\n",
+    "\n",
+    "print(raw_df.shape)\n",
+    "raw_df.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Scale Data and Output"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th>entrez_gene_id</th>\n",
+       "      <th>1</th>\n",
+       "      <th>2</th>\n",
+       "      <th>9</th>\n",
+       "      <th>12</th>\n",
+       "      <th>13</th>\n",
+       "      <th>14</th>\n",
+       "      <th>15</th>\n",
+       "      <th>16</th>\n",
+       "      <th>18</th>\n",
+       "      <th>19</th>\n",
+       "      <th>...</th>\n",
+       "      <th>105375787</th>\n",
+       "      <th>106707243</th>\n",
+       "      <th>106821730</th>\n",
+       "      <th>106865373</th>\n",
+       "      <th>107080638</th>\n",
+       "      <th>107080644</th>\n",
+       "      <th>107181291</th>\n",
+       "      <th>107984155</th>\n",
+       "      <th>107984923</th>\n",
+       "      <th>107987479</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>CHP134</th>\n",
+       "      <td>0.496142</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.031914</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.335754</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>0.527104</td>\n",
+       "      <td>0.123020</td>\n",
+       "      <td>0.101682</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.202001</td>\n",
+       "      <td>0.350250</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.359538</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.398515</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>CHP212</th>\n",
+       "      <td>0.130269</td>\n",
+       "      <td>0.077607</td>\n",
+       "      <td>0.384543</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.047631</td>\n",
+       "      <td>0.483361</td>\n",
+       "      <td>0.011138</td>\n",
+       "      <td>0.680574</td>\n",
+       "      <td>0.002162</td>\n",
+       "      <td>0.136316</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.030310</td>\n",
+       "      <td>0.673424</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.086474</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.556493</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>COGN415</th>\n",
+       "      <td>0.356876</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.168576</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.303912</td>\n",
+       "      <td>0.068080</td>\n",
+       "      <td>0.153800</td>\n",
+       "      <td>0.100531</td>\n",
+       "      <td>0.033393</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.417738</td>\n",
+       "      <td>0.199854</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.211183</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.726023</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>COGN440</th>\n",
+       "      <td>0.512493</td>\n",
+       "      <td>0.000593</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>0.001941</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.373208</td>\n",
+       "      <td>0.102337</td>\n",
+       "      <td>0.249516</td>\n",
+       "      <td>0.217459</td>\n",
+       "      <td>0.004284</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.230559</td>\n",
+       "      <td>0.538564</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.228188</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.607477</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>COGN453</th>\n",
+       "      <td>0.383474</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.086284</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.464890</td>\n",
+       "      <td>0.113665</td>\n",
+       "      <td>0.261963</td>\n",
+       "      <td>0.125526</td>\n",
+       "      <td>0.001720</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.301525</td>\n",
+       "      <td>0.471939</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.272335</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.375054</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>5 rows × 18628 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "entrez_gene_id         1         2         9        12        13        14  \\\n",
+       "CHP134          0.496142  0.000000  0.031914  0.000000  0.000000  0.335754   \n",
+       "CHP212          0.130269  0.077607  0.384543  0.000000  0.047631  0.483361   \n",
+       "COGN415         0.356876  0.000000  0.168576  0.000000  0.000000  0.303912   \n",
+       "COGN440         0.512493  0.000593  1.000000  0.001941  0.000000  0.373208   \n",
+       "COGN453         0.383474  0.000000  0.086284  0.000000  0.000000  0.464890   \n",
+       "\n",
+       "entrez_gene_id        15        16        18        19  ...  105375787  \\\n",
+       "CHP134          1.000000  0.527104  0.123020  0.101682  ...        0.0   \n",
+       "CHP212          0.011138  0.680574  0.002162  0.136316  ...        0.0   \n",
+       "COGN415         0.068080  0.153800  0.100531  0.033393  ...        0.0   \n",
+       "COGN440         0.102337  0.249516  0.217459  0.004284  ...        0.0   \n",
+       "COGN453         0.113665  0.261963  0.125526  0.001720  ...        0.0   \n",
+       "\n",
+       "entrez_gene_id  106707243  106821730  106865373  107080638  107080644  \\\n",
+       "CHP134           0.202001   0.350250        0.0   0.359538        0.0   \n",
+       "CHP212           0.030310   0.673424        0.0   0.086474        0.0   \n",
+       "COGN415          0.417738   0.199854        0.0   0.211183        0.0   \n",
+       "COGN440          0.230559   0.538564        0.0   0.228188        0.0   \n",
+       "COGN453          0.301525   0.471939        0.0   0.272335        0.0   \n",
+       "\n",
+       "entrez_gene_id  107181291  107984155  107984923  107987479  \n",
+       "CHP134                0.0        0.0   0.398515        0.0  \n",
+       "CHP212                0.0        0.0   0.556493        0.0  \n",
+       "COGN415               0.0        0.0   0.726023        0.0  \n",
+       "COGN440               0.0        0.0   0.607477        0.0  \n",
+       "COGN453               0.0        0.0   0.375054        0.0  \n",
+       "\n",
+       "[5 rows x 18628 columns]"
+      ]
+     },
+     "execution_count": 16,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "#MinMax consistent with BetaVAE scaling\n",
+    "raw_scaled_df = preprocessing.MinMaxScaler().fit_transform(raw_df.transpose())\n",
+    "raw_scaled_df = (\n",
+    "    pd.DataFrame(raw_scaled_df,\n",
+    "                 columns=raw_df.index,\n",
+    "                 index=raw_df.columns)\n",
+    "    .sort_index(axis='columns')\n",
+    "    .sort_index(axis='rows')\n",
+    ")\n",
+    "raw_scaled_df.columns = raw_scaled_df.columns.astype(str)\n",
+    "raw_scaled_df = raw_scaled_df.loc[:, ~raw_scaled_df.columns.duplicated(keep='first')]\n",
+    "\n",
+    "raw_scaled_df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# In[17]\n",
+    "processed_file = data_dir / 'nbl_celllines_processed_matrix.parquet'\n",
+    "raw_scaled_df.to_parquet(processed_file)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "gene_dependency_representations",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/4.gene_expression_signatures/2.download-target-data.ipynb b/4.gene_expression_signatures/2.download-target-data.ipynb
new file mode 100644
index 0000000..0b1dfa9
--- /dev/null
+++ b/4.gene_expression_signatures/2.download-target-data.ipynb
@@ -0,0 +1,1158 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Download TARGET Pan Cancer Data for Compression\n",
+    "The notebook downloads gene expression and clinical data from the TARGET project. The data is downloaded from UCSC Xena.\n",
+    "The data is in log2(FPKM) RSEM transformed"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pathlib\n",
+    "from pathlib import Path\n",
+    "import hashlib\n",
+    "from urllib.request import urlretrieve\n",
+    "import random\n",
+    "import pandas as pd\n",
+    "from sklearn.model_selection import train_test_split"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def download_and_checksum(url_base, file_info_dict, file_url_paths):\n",
+    "    \"\"\"\n",
+    "    Downloads files from the specified base URL and computes the SHA-256 checksum for each file.\n",
+    "\n",
+    "    Parameters:\n",
+    "    ----------\n",
+    "    url_base : str\n",
+    "        The base URL from which the files will be downloaded.\n",
+    "\n",
+    "    file_info_dict : dict\n",
+    "        A dictionary where the keys are file names and the values are the local file paths where \n",
+    "        the files will be saved.\n",
+    "\n",
+    "    file_url_paths : dict\n",
+    "        A dictionary mapping each file name to the corresponding sub-path (if any) that should \n",
+    "        be appended to the base URL before downloading the file.\n",
+    "    \"\"\"\n",
+    "    for name, path in file_info_dict.items():\n",
+    "        # Determine the correct URL with or without sub-path\n",
+    "        file_url = f\"{url_base}{file_url_paths[name]}{name}\"\n",
+    "        \n",
+    "        # Download the file\n",
+    "        urlretrieve(file_url, path)\n",
+    "\n",
+    "        # Compute checksum\n",
+    "        md5_hash = hashlib.md5()\n",
+    "        with open(path, \"rb\") as f:\n",
+    "            for byte_block in iter(lambda: f.read(4096), b\"\"):\n",
+    "                md5_hash.update(byte_block)\n",
+    "\n",
+    "        print(f\"{name} checksum: {md5_hash.hexdigest()}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Base URL\n",
+    "url_base = 'https://toil-xena-hub.s3.us-east-1.amazonaws.com/download/'\n",
+    "data_dir = pathlib.Path('data').resolve()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Dictionary for file names and their paths\n",
+    "file_info_dict = {\n",
+    "    'target_RSEM_gene_fpkm.gz': Path('data/target_RSEM_gene_fpkm.gz'),\n",
+    "    'TARGET_phenotype.gz': Path('data/TARGET_phenotype.gz'),\n",
+    "    'gencode.v23.annotation.gene.probemap': Path('data/gencode.v23.annotation.gene.probemap')\n",
+    "}\n",
+    "\n",
+    "file_url_paths = {\n",
+    "    \"target_RSEM_gene_fpkm.gz\": \"\",\n",
+    "    \"gencode.v23.annotation.gene.probemap\": \"probeMap/\",\n",
+    "    \"TARGET_phenotype.gz\": \"\"\n",
+    "}\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create the 'data' directory\n",
+    "Path('data').mkdir(exist_ok=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "target_RSEM_gene_fpkm.gz checksum: 4584cca6e5befebaead5f1865f23f517321dfc52cf9e752d87219f0bc33a14fc\n",
+      "TARGET_phenotype.gz checksum: b52ec780daaa940dbfdd76f5f764cde2567ba6e6acbbab7e889030ef9474a5cc\n",
+      "gencode.v23.annotation.gene.probemap checksum: 6783ea58791ae876efb697889a042cc7be8e32e40fc01191c622ef25d9416931\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Call the function\n",
+    "download_and_checksum(url_base, file_info_dict, file_url_paths)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Process TARGET PanCancer Data\n",
+    "Retrive the downloaded expression data, update gene identifiers to entrez, and curate sample IDs. The script will also identify a balanced hold-out test set to compare projection performance into learned latent spaces across algorithms. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "random.seed(0)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Read Phenotype Information"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(5958, 7)\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>sample_id</th>\n",
+       "      <th>primary_disease_code</th>\n",
+       "      <th>_primary_disease</th>\n",
+       "      <th>sample_type_code</th>\n",
+       "      <th>_sample_type</th>\n",
+       "      <th>_PATIENT</th>\n",
+       "      <th>_cohort</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>TARGET-00-NAAEMA-20</td>\n",
+       "      <td>Non cancerous tissue</td>\n",
+       "      <td>Non cancerous tissue</td>\n",
+       "      <td>CELLC</td>\n",
+       "      <td>Control Analyte</td>\n",
+       "      <td>NAAEMA</td>\n",
+       "      <td>TARGET</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>TARGET-00-NAAEMB-20</td>\n",
+       "      <td>Non cancerous tissue</td>\n",
+       "      <td>Non cancerous tissue</td>\n",
+       "      <td>CELLC</td>\n",
+       "      <td>Control Analyte</td>\n",
+       "      <td>NAAEMB</td>\n",
+       "      <td>TARGET</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>TARGET-00-NAAEMC-20</td>\n",
+       "      <td>Non cancerous tissue</td>\n",
+       "      <td>Non cancerous tissue</td>\n",
+       "      <td>CELLC</td>\n",
+       "      <td>Control Analyte</td>\n",
+       "      <td>NAAEMC</td>\n",
+       "      <td>TARGET</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "             sample_id  primary_disease_code      _primary_disease  \\\n",
+       "0  TARGET-00-NAAEMA-20  Non cancerous tissue  Non cancerous tissue   \n",
+       "1  TARGET-00-NAAEMB-20  Non cancerous tissue  Non cancerous tissue   \n",
+       "2  TARGET-00-NAAEMC-20  Non cancerous tissue  Non cancerous tissue   \n",
+       "\n",
+       "  sample_type_code     _sample_type _PATIENT _cohort  \n",
+       "0            CELLC  Control Analyte   NAAEMA  TARGET  \n",
+       "1            CELLC  Control Analyte   NAAEMB  TARGET  \n",
+       "2            CELLC  Control Analyte   NAAEMC  TARGET  "
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "pheno_file = data_dir / 'TARGET_phenotype.parquet'\n",
+    "if not pheno_file.is_file():\n",
+    "    pheno_df = pd.read_table(data_dir / 'TARGET_phenotype.gz')\n",
+    "    pheno_df.to_parquet(pheno_file, index=False)\n",
+    "else:\n",
+    "    pheno_df = pd.read_parquet(pheno_file)\n",
+    "\n",
+    "print(pheno_df.shape)\n",
+    "pheno_df.head(3)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Read Entrez ID Curation Information\n",
+    "Load curated gene names from versioned resource. See https://github.com/cognoma/genes for more details"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Commit from https://github.com/cognoma/genes\n",
+    "genes_commit = 'ad9631bb4e77e2cdc5413b0d77cb8f7e93fc5bee'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(20395, 7)\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>entrez_gene_id</th>\n",
+       "      <th>symbol</th>\n",
+       "      <th>description</th>\n",
+       "      <th>chromosome</th>\n",
+       "      <th>gene_type</th>\n",
+       "      <th>synonyms</th>\n",
+       "      <th>aliases</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>1</td>\n",
+       "      <td>A1BG</td>\n",
+       "      <td>alpha-1-B glycoprotein</td>\n",
+       "      <td>19</td>\n",
+       "      <td>protein-coding</td>\n",
+       "      <td>A1B|ABG|GAB|HYST2477</td>\n",
+       "      <td>alpha-1B-glycoprotein|HEL-S-163pA|epididymis s...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>2</td>\n",
+       "      <td>A2M</td>\n",
+       "      <td>alpha-2-macroglobulin</td>\n",
+       "      <td>12</td>\n",
+       "      <td>protein-coding</td>\n",
+       "      <td>A2MD|CPAMD5|FWP007|S863-7</td>\n",
+       "      <td>alpha-2-macroglobulin|C3 and PZP-like alpha-2-...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   entrez_gene_id symbol             description chromosome       gene_type  \\\n",
+       "0               1   A1BG  alpha-1-B glycoprotein         19  protein-coding   \n",
+       "1               2    A2M   alpha-2-macroglobulin         12  protein-coding   \n",
+       "\n",
+       "                    synonyms  \\\n",
+       "0       A1B|ABG|GAB|HYST2477   \n",
+       "1  A2MD|CPAMD5|FWP007|S863-7   \n",
+       "\n",
+       "                                             aliases  \n",
+       "0  alpha-1B-glycoprotein|HEL-S-163pA|epididymis s...  \n",
+       "1  alpha-2-macroglobulin|C3 and PZP-like alpha-2-...  "
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "url = 'https://raw.githubusercontent.com/cognoma/genes/{}/data/genes.tsv'.format(genes_commit)\n",
+    "gene_df = pd.read_table(url)\n",
+    "\n",
+    "# Only consider protein-coding genes\n",
+    "gene_df = (\n",
+    "   gene_df.query(\"gene_type == 'protein-coding'\")\n",
+    ")\n",
+    "\n",
+    "\n",
+    "print(gene_df.shape)\n",
+    "gene_df.head(2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load gene updater - old to new Entrez gene identifiers\n",
+    "url = 'https://raw.githubusercontent.com/cognoma/genes/{}/data/updater.tsv'.format(genes_commit)\n",
+    "updater_df = pd.read_table(url)\n",
+    "old_to_new_entrez = dict(zip(updater_df.old_entrez_gene_id,\n",
+    "                            updater_df.new_entrez_gene_id))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Read Probe Mapping Info"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(18855, 13)\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>id</th>\n",
+       "      <th>gene</th>\n",
+       "      <th>chrom</th>\n",
+       "      <th>chromStart</th>\n",
+       "      <th>chromEnd</th>\n",
+       "      <th>strand</th>\n",
+       "      <th>entrez_gene_id</th>\n",
+       "      <th>symbol</th>\n",
+       "      <th>description</th>\n",
+       "      <th>chromosome</th>\n",
+       "      <th>gene_type</th>\n",
+       "      <th>synonyms</th>\n",
+       "      <th>aliases</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>ENSG00000186092.4</td>\n",
+       "      <td>OR4F5</td>\n",
+       "      <td>chr1</td>\n",
+       "      <td>69091</td>\n",
+       "      <td>70008</td>\n",
+       "      <td>+</td>\n",
+       "      <td>79501</td>\n",
+       "      <td>OR4F5</td>\n",
+       "      <td>olfactory receptor family 4 subfamily F member 5</td>\n",
+       "      <td>1</td>\n",
+       "      <td>protein-coding</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>olfactory receptor 4F5</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>ENSG00000278566.1</td>\n",
+       "      <td>OR4F29</td>\n",
+       "      <td>chr1</td>\n",
+       "      <td>450740</td>\n",
+       "      <td>451678</td>\n",
+       "      <td>-</td>\n",
+       "      <td>729759</td>\n",
+       "      <td>OR4F29</td>\n",
+       "      <td>olfactory receptor family 4 subfamily F member 29</td>\n",
+       "      <td>1</td>\n",
+       "      <td>protein-coding</td>\n",
+       "      <td>OR7-21</td>\n",
+       "      <td>olfactory receptor 4F3/4F16/4F29|olfactory rec...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>ENSG00000273547.1</td>\n",
+       "      <td>OR4F16</td>\n",
+       "      <td>chr1</td>\n",
+       "      <td>685716</td>\n",
+       "      <td>686654</td>\n",
+       "      <td>-</td>\n",
+       "      <td>81399</td>\n",
+       "      <td>OR4F16</td>\n",
+       "      <td>olfactory receptor family 4 subfamily F member 16</td>\n",
+       "      <td>1</td>\n",
+       "      <td>protein-coding</td>\n",
+       "      <td>OR1-1|OR7-21</td>\n",
+       "      <td>olfactory receptor 4F3/4F16/4F29|olfactory rec...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                  id    gene chrom  chromStart  chromEnd strand  \\\n",
+       "0  ENSG00000186092.4   OR4F5  chr1       69091     70008      +   \n",
+       "1  ENSG00000278566.1  OR4F29  chr1      450740    451678      -   \n",
+       "2  ENSG00000273547.1  OR4F16  chr1      685716    686654      -   \n",
+       "\n",
+       "   entrez_gene_id  symbol                                        description  \\\n",
+       "0           79501   OR4F5   olfactory receptor family 4 subfamily F member 5   \n",
+       "1          729759  OR4F29  olfactory receptor family 4 subfamily F member 29   \n",
+       "2           81399  OR4F16  olfactory receptor family 4 subfamily F member 16   \n",
+       "\n",
+       "  chromosome       gene_type      synonyms  \\\n",
+       "0          1  protein-coding           NaN   \n",
+       "1          1  protein-coding        OR7-21   \n",
+       "2          1  protein-coding  OR1-1|OR7-21   \n",
+       "\n",
+       "                                             aliases  \n",
+       "0                             olfactory receptor 4F5  \n",
+       "1  olfactory receptor 4F3/4F16/4F29|olfactory rec...  \n",
+       "2  olfactory receptor 4F3/4F16/4F29|olfactory rec...  "
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "probe_map_file = data_dir / 'gencode.v23.annotation.gene.probemap.parquet'\n",
+    "if not probe_map_file.is_file():\n",
+    "    probe_map_df = pd.read_table(data_dir / 'gencode.v23.annotation.gene.probemap')\n",
+    "    probe_map_df.to_parquet(probe_map_file, index=False)\n",
+    "else:\n",
+    "    probe_map_df = pd.read_parquet(probe_map_file)\n",
+    "\n",
+    "# Inner merge gene df to get ensembl to entrez mapping\n",
+    "probe_map_df = probe_map_df.merge(gene_df, how='inner', left_on='gene', right_on='symbol')\n",
+    "ensembl_to_entrez = dict(zip(probe_map_df.id, probe_map_df.entrez_gene_id))\n",
+    "\n",
+    "print(probe_map_df.shape)\n",
+    "probe_map_df.head(3)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Read Gene Expression Data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(60498, 734)\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>TARGET-30-PASWYR-01</th>\n",
+       "      <th>TARGET-20-PARUBT-09</th>\n",
+       "      <th>TARGET-30-PASNZU-01</th>\n",
+       "      <th>TARGET-52-PASDLA-11</th>\n",
+       "      <th>TARGET-50-PAKNRX-01</th>\n",
+       "      <th>TARGET-30-PASWFB-01</th>\n",
+       "      <th>TARGET-30-PALUYS-01</th>\n",
+       "      <th>TARGET-30-PAUDDK-01</th>\n",
+       "      <th>TARGET-10-PAPZNK-09</th>\n",
+       "      <th>TARGET-50-PAJLNJ-01</th>\n",
+       "      <th>...</th>\n",
+       "      <th>TARGET-20-PANLIZ-04</th>\n",
+       "      <th>TARGET-21-PASSLT-41</th>\n",
+       "      <th>TARGET-20-PASTTW-09</th>\n",
+       "      <th>TARGET-50-PAKYLT-01</th>\n",
+       "      <th>TARGET-20-PATJHJ-09</th>\n",
+       "      <th>TARGET-21-PATKKJ-41</th>\n",
+       "      <th>TARGET-10-PAPEJN-04</th>\n",
+       "      <th>TARGET-20-PABLDZ-09</th>\n",
+       "      <th>TARGET-10-PANSBR-09</th>\n",
+       "      <th>TARGET-10-PARFLV-04</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>sample</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>ENSG00000242268.2</th>\n",
+       "      <td>-3.0469</td>\n",
+       "      <td>-9.9658</td>\n",
+       "      <td>-3.0469</td>\n",
+       "      <td>-4.6082</td>\n",
+       "      <td>-5.5735</td>\n",
+       "      <td>-9.9658</td>\n",
+       "      <td>-9.9658</td>\n",
+       "      <td>-9.9658</td>\n",
+       "      <td>-9.9658</td>\n",
+       "      <td>-2.2447</td>\n",
+       "      <td>...</td>\n",
+       "      <td>-9.9658</td>\n",
+       "      <td>-9.9658</td>\n",
+       "      <td>-4.0350</td>\n",
+       "      <td>-9.9658</td>\n",
+       "      <td>-9.9658</td>\n",
+       "      <td>-9.9658</td>\n",
+       "      <td>-9.9658</td>\n",
+       "      <td>-9.9658</td>\n",
+       "      <td>-4.2934</td>\n",
+       "      <td>-9.9658</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>ENSG00000259041.1</th>\n",
+       "      <td>-9.9658</td>\n",
+       "      <td>-9.9658</td>\n",
+       "      <td>-9.9658</td>\n",
+       "      <td>-9.9658</td>\n",
+       "      <td>-9.9658</td>\n",
+       "      <td>-9.9658</td>\n",
+       "      <td>-9.9658</td>\n",
+       "      <td>-9.9658</td>\n",
+       "      <td>-9.9658</td>\n",
+       "      <td>-9.9658</td>\n",
+       "      <td>...</td>\n",
+       "      <td>-9.9658</td>\n",
+       "      <td>-9.9658</td>\n",
+       "      <td>-9.9658</td>\n",
+       "      <td>-9.9658</td>\n",
+       "      <td>-9.9658</td>\n",
+       "      <td>-9.9658</td>\n",
+       "      <td>-9.9658</td>\n",
+       "      <td>-9.9658</td>\n",
+       "      <td>-9.9658</td>\n",
+       "      <td>-9.9658</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>2 rows × 734 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                   TARGET-30-PASWYR-01  TARGET-20-PARUBT-09  \\\n",
+       "sample                                                        \n",
+       "ENSG00000242268.2              -3.0469              -9.9658   \n",
+       "ENSG00000259041.1              -9.9658              -9.9658   \n",
+       "\n",
+       "                   TARGET-30-PASNZU-01  TARGET-52-PASDLA-11  \\\n",
+       "sample                                                        \n",
+       "ENSG00000242268.2              -3.0469              -4.6082   \n",
+       "ENSG00000259041.1              -9.9658              -9.9658   \n",
+       "\n",
+       "                   TARGET-50-PAKNRX-01  TARGET-30-PASWFB-01  \\\n",
+       "sample                                                        \n",
+       "ENSG00000242268.2              -5.5735              -9.9658   \n",
+       "ENSG00000259041.1              -9.9658              -9.9658   \n",
+       "\n",
+       "                   TARGET-30-PALUYS-01  TARGET-30-PAUDDK-01  \\\n",
+       "sample                                                        \n",
+       "ENSG00000242268.2              -9.9658              -9.9658   \n",
+       "ENSG00000259041.1              -9.9658              -9.9658   \n",
+       "\n",
+       "                   TARGET-10-PAPZNK-09  TARGET-50-PAJLNJ-01  ...  \\\n",
+       "sample                                                       ...   \n",
+       "ENSG00000242268.2              -9.9658              -2.2447  ...   \n",
+       "ENSG00000259041.1              -9.9658              -9.9658  ...   \n",
+       "\n",
+       "                   TARGET-20-PANLIZ-04  TARGET-21-PASSLT-41  \\\n",
+       "sample                                                        \n",
+       "ENSG00000242268.2              -9.9658              -9.9658   \n",
+       "ENSG00000259041.1              -9.9658              -9.9658   \n",
+       "\n",
+       "                   TARGET-20-PASTTW-09  TARGET-50-PAKYLT-01  \\\n",
+       "sample                                                        \n",
+       "ENSG00000242268.2              -4.0350              -9.9658   \n",
+       "ENSG00000259041.1              -9.9658              -9.9658   \n",
+       "\n",
+       "                   TARGET-20-PATJHJ-09  TARGET-21-PATKKJ-41  \\\n",
+       "sample                                                        \n",
+       "ENSG00000242268.2              -9.9658              -9.9658   \n",
+       "ENSG00000259041.1              -9.9658              -9.9658   \n",
+       "\n",
+       "                   TARGET-10-PAPEJN-04  TARGET-20-PABLDZ-09  \\\n",
+       "sample                                                        \n",
+       "ENSG00000242268.2              -9.9658              -9.9658   \n",
+       "ENSG00000259041.1              -9.9658              -9.9658   \n",
+       "\n",
+       "                   TARGET-10-PANSBR-09  TARGET-10-PARFLV-04  \n",
+       "sample                                                       \n",
+       "ENSG00000242268.2              -4.2934              -9.9658  \n",
+       "ENSG00000259041.1              -9.9658              -9.9658  \n",
+       "\n",
+       "[2 rows x 734 columns]"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "expr_file = data_dir / 'target_RSEM_gene_fpkm.parquet'\n",
+    "if not expr_file.is_file():\n",
+    "    expr_df = pd.read_table(data_dir / 'target_RSEM_gene_fpkm.gz', index_col=0)\n",
+    "    expr_df.to_parquet(expr_file)\n",
+    "else:\n",
+    "    expr_df = pd.read_parquet(expr_file)\n",
+    "\n",
+    "print(expr_df.shape)\n",
+    "expr_df.head(2)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Process gene expression matrix "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(734, 18753)\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th>id</th>\n",
+       "      <th>1</th>\n",
+       "      <th>2</th>\n",
+       "      <th>9</th>\n",
+       "      <th>10</th>\n",
+       "      <th>12</th>\n",
+       "      <th>13</th>\n",
+       "      <th>14</th>\n",
+       "      <th>15</th>\n",
+       "      <th>16</th>\n",
+       "      <th>18</th>\n",
+       "      <th>...</th>\n",
+       "      <th>102724231</th>\n",
+       "      <th>102724398</th>\n",
+       "      <th>102724473</th>\n",
+       "      <th>102724536</th>\n",
+       "      <th>102724631</th>\n",
+       "      <th>102724862</th>\n",
+       "      <th>102724928</th>\n",
+       "      <th>105375355</th>\n",
+       "      <th>105378803</th>\n",
+       "      <th>105378952</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>sample_id</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>TARGET-10-PAKSWW-03</th>\n",
+       "      <td>5.3754</td>\n",
+       "      <td>-1.1488</td>\n",
+       "      <td>-1.4305</td>\n",
+       "      <td>-9.9658</td>\n",
+       "      <td>-7.76965</td>\n",
+       "      <td>-9.9658</td>\n",
+       "      <td>4.3786</td>\n",
+       "      <td>-1.3183</td>\n",
+       "      <td>2.0289</td>\n",
+       "      <td>0.7321</td>\n",
+       "      <td>...</td>\n",
+       "      <td>-2.6349</td>\n",
+       "      <td>-9.9658</td>\n",
+       "      <td>-4.6082</td>\n",
+       "      <td>0.1648</td>\n",
+       "      <td>-9.9658</td>\n",
+       "      <td>-5.0116</td>\n",
+       "      <td>-9.9658</td>\n",
+       "      <td>-0.8863</td>\n",
+       "      <td>-9.9658</td>\n",
+       "      <td>-9.9658</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>TARGET-10-PAMXHJ-09</th>\n",
+       "      <td>4.9388</td>\n",
+       "      <td>-1.2828</td>\n",
+       "      <td>0.2881</td>\n",
+       "      <td>-9.9658</td>\n",
+       "      <td>-9.96580</td>\n",
+       "      <td>-9.9658</td>\n",
+       "      <td>3.8660</td>\n",
+       "      <td>1.4011</td>\n",
+       "      <td>3.0791</td>\n",
+       "      <td>2.6232</td>\n",
+       "      <td>...</td>\n",
+       "      <td>-2.2447</td>\n",
+       "      <td>-4.2934</td>\n",
+       "      <td>-4.6082</td>\n",
+       "      <td>0.4016</td>\n",
+       "      <td>-9.9658</td>\n",
+       "      <td>-4.2934</td>\n",
+       "      <td>-9.9658</td>\n",
+       "      <td>-2.0529</td>\n",
+       "      <td>-9.9658</td>\n",
+       "      <td>-9.9658</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>2 rows × 18753 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "id                   1          2          9          10         12         \\\n",
+       "sample_id                                                                    \n",
+       "TARGET-10-PAKSWW-03     5.3754    -1.1488    -1.4305    -9.9658   -7.76965   \n",
+       "TARGET-10-PAMXHJ-09     4.9388    -1.2828     0.2881    -9.9658   -9.96580   \n",
+       "\n",
+       "id                   13         14         15         16         18         \\\n",
+       "sample_id                                                                    \n",
+       "TARGET-10-PAKSWW-03    -9.9658     4.3786    -1.3183     2.0289     0.7321   \n",
+       "TARGET-10-PAMXHJ-09    -9.9658     3.8660     1.4011     3.0791     2.6232   \n",
+       "\n",
+       "id                   ...  102724231  102724398  102724473  102724536  \\\n",
+       "sample_id            ...                                               \n",
+       "TARGET-10-PAKSWW-03  ...    -2.6349    -9.9658    -4.6082     0.1648   \n",
+       "TARGET-10-PAMXHJ-09  ...    -2.2447    -4.2934    -4.6082     0.4016   \n",
+       "\n",
+       "id                   102724631  102724862  102724928  105375355  105378803  \\\n",
+       "sample_id                                                                    \n",
+       "TARGET-10-PAKSWW-03    -9.9658    -5.0116    -9.9658    -0.8863    -9.9658   \n",
+       "TARGET-10-PAMXHJ-09    -9.9658    -4.2934    -9.9658    -2.0529    -9.9658   \n",
+       "\n",
+       "id                   105378952  \n",
+       "sample_id                       \n",
+       "TARGET-10-PAKSWW-03    -9.9658  \n",
+       "TARGET-10-PAMXHJ-09    -9.9658  \n",
+       "\n",
+       "[2 rows x 18753 columns]"
+      ]
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "expr_df = (expr_df\n",
+    "   .dropna(axis='rows')\n",
+    "   .reindex(probe_map_df.id)\n",
+    "   .rename(index=ensembl_to_entrez)\n",
+    "   .rename(index=old_to_new_entrez)\n",
+    "   .groupby(level=0).mean()\n",
+    "   .transpose()\n",
+    "   .sort_index(axis='rows')\n",
+    "   .sort_index(axis='columns')\n",
+    ")\n",
+    "\n",
+    "\n",
+    "expr_df.index.rename('sample_id', inplace=True)\n",
+    "\n",
+    "\n",
+    "print(expr_df.shape)\n",
+    "expr_df.head(2)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Stratify Balanced Training and Testing Sets in TARGET Gene Expression\n",
+    "Output training and testing gene expression datasets "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "strat = pheno_df.set_index('sample_id').reindex(expr_df.index).primary_disease_code"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>n =</th>\n",
+       "      <th>count</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>AML</td>\n",
+       "      <td>196</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>ALL</td>\n",
+       "      <td>194</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>NBL</td>\n",
+       "      <td>162</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>WT</td>\n",
+       "      <td>132</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>AML-IF</td>\n",
+       "      <td>32</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>CCSK</td>\n",
+       "      <td>13</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>RT</td>\n",
+       "      <td>5</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "      n =  count\n",
+       "0     AML    196\n",
+       "1     ALL    194\n",
+       "2     NBL    162\n",
+       "3      WT    132\n",
+       "4  AML-IF     32\n",
+       "5    CCSK     13\n",
+       "6      RT      5"
+      ]
+     },
+     "execution_count": 16,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "cancertype_count_df = pd.DataFrame(strat.value_counts()).reset_index().rename({'index': 'cancertype', 'primary_disease_code': 'n ='}, axis='columns')\n",
+    "sample_counts_file = data_dir / 'target_sample_counts.parquet'\n",
+    "cancertype_count_df.to_parquet(sample_counts_file)\n",
+    "\n",
+    "cancertype_count_df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_df, test_df = train_test_split(expr_df,\n",
+    "                                    test_size=0.2,\n",
+    "                                    random_state=0,\n",
+    "                                    stratify=strat)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(587, 18753)\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "(147, 18753)"
+      ]
+     },
+     "execution_count": 18,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "print(train_df.shape)\n",
+    "test_df.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_file = data_dir / 'train_target_expression_matrix_processed.parquet'\n",
+    "train_df.to_parquet(train_file)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "test_file = data_dir / 'test_target_expression_matrix_processed.parquet'\n",
+    "test_df.to_parquet(test_file)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Sort genes based on median absolute deviation and output to file"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "mad_genes_df = pd.DataFrame(train_df.mad(axis=0).sort_values(ascending=False)).reset_index()\n",
+    "mad_genes_df.columns = ['gene_id', 'median_absolute_deviation']\n",
+    "mad_genes_file = data_dir / 'target_mad_genes.parquet'\n",
+    "mad_genes_df.to_parquet(mad_genes_file)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "gene_dependency_representations",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/4.gene_expression_signatures/scripts/1.download-validation-data.py b/4.gene_expression_signatures/scripts/1.download-validation-data.py
new file mode 100644
index 0000000..bbf4d99
--- /dev/null
+++ b/4.gene_expression_signatures/scripts/1.download-validation-data.py
@@ -0,0 +1,208 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+# # Download and Process Neuroblastoma RNAseq Data
+
+# Code from: https://github.com/greenelab/BioBombe/blob/master/10.gene-expression-signatures/0.download-validation-data.ipynb
+# 
+# We are downloading the dataset associated with Harenza et al. 2017. The data profiles RNAseq data from 39 commonly used neuroblastoma (NBL) cell lines.
+# 
+# We are interested in the MYCN amplification status of these cell lines. We will test if the MYCN amplification score learned through the BioBombe signature approach applied to TARGET data generalizes to this cell line dataset.
+# 
+# MYCN Amplification refers to the number of copies of the MYCN gene. MYCN amplification is used as a biomarker for poor prognosis in neuroblastoma patients (Huang and Weiss 2013).
+
+# In[1]:
+
+
+import requests
+import pathlib
+import pandas as pd
+from urllib.request import urlretrieve
+
+from sklearn import preprocessing
+
+
+# In[2]:
+
+
+url = "https://ndownloader.figshare.com/files/14138792"
+name = "2019-01-22-CellLineSTAR-fpkm-2pass_matrix.txt"
+data_dir = pathlib.Path("data").resolve()
+path = data_dir / name
+
+
+# In[3]:
+
+
+data_dir.mkdir(parents=True, exist_ok=True)
+
+
+# In[4]:
+
+
+urlretrieve(url, path)
+
+
+# In[5]:
+
+
+get_ipython().system(' md5sum "data/2019-01-22-CellLineSTAR-fpkm-2pass_matrix.txt"')
+
+
+# # Download Phenotype Data
+
+# In[6]:
+
+
+url = "https://www.nature.com/articles/sdata201733/tables/4"
+name = "nbl_cellline_phenotype.txt"
+path = data_dir / name
+
+
+# In[7]:
+
+
+if not path.is_file():
+    html = requests.get(url).content
+
+    pheno_df = pd.read_html(html)[0]
+    pheno_df['Cell Line'] = pheno_df['Cell Line'].str.replace("-", "")
+
+    pheno_df.to_csv(path, sep='\t', index=False)
+
+else:
+    pheno_df = pd.read_parquet(path)
+
+pheno_df.head()
+
+
+# In[8]:
+
+
+get_ipython().system(' md5sum "data/nbl_cellline_phenotype.txt"')
+
+
+# # Process RNAseq Data
+
+# In[9]:
+
+
+raw_file = data_dir / "2019-01-22-CellLineSTAR-fpkm-2pass_matrix.txt"
+raw_df = pd.read_table(raw_file, sep='\t')
+raw_df.head()
+
+
+# # Update Gene Names
+
+# In[10]:
+
+
+# Load curated gene names from versioned resource 
+commit = '721204091a96e55de6dcad165d6d8265e67e2a48'
+url = 'https://raw.githubusercontent.com/cognoma/genes/{}/data/genes.tsv'.format(commit)
+gene_df = pd.read_table(url)
+
+# Only consider protein-coding genes
+gene_df = (
+    gene_df.query("gene_type == 'protein-coding'")
+)
+
+symbol_to_entrez = dict(zip(gene_df.symbol,
+                            gene_df.entrez_gene_id))
+
+
+# In[11]:
+
+
+# Add alternative symbols to entrez mapping dictionary
+gene_df = gene_df.dropna(axis='rows', subset=['synonyms'])
+gene_df.synonyms = gene_df.synonyms.str.split('|')
+
+all_syn = (
+    gene_df.apply(lambda x: pd.Series(x.synonyms), axis=1)
+    .stack()
+    .reset_index(level=1, drop=True)
+)
+
+# Name the synonym series and join with rest of genes
+all_syn.name = 'all_synonyms'
+gene_with_syn_df = gene_df.join(all_syn)
+
+# Remove rows that have redundant symbols in all_synonyms
+gene_with_syn_df = (
+    gene_with_syn_df
+    
+    # Drop synonyms that are duplicated - can't be sure of mapping
+    .drop_duplicates(['all_synonyms'], keep=False)
+
+    # Drop rows in which the symbol appears in the list of synonyms
+    .query('symbol not in all_synonyms')
+)
+
+
+# In[12]:
+
+
+# Create a synonym to entrez mapping and add to dictionary
+synonym_to_entrez = dict(zip(gene_with_syn_df.all_synonyms,
+                             gene_with_syn_df.entrez_gene_id))
+
+symbol_to_entrez.update(synonym_to_entrez)
+
+
+# In[13]:
+
+
+# Load gene updater
+url = 'https://raw.githubusercontent.com/cognoma/genes/{}/data/updater.tsv'.format(commit)
+updater_df = pd.read_table(url)
+old_to_new_entrez = dict(zip(updater_df.old_entrez_gene_id,
+                             updater_df.new_entrez_gene_id))
+
+
+# In[14]:
+
+
+gene_map = raw_df.GeneID.replace(symbol_to_entrez)
+gene_map = gene_map.replace(old_to_new_entrez)
+
+
+# In[15]:
+
+
+raw_df.index = gene_map
+raw_df.index.name = 'entrez_gene_id'
+raw_df = raw_df.drop(['GeneID'], axis='columns')
+raw_df = raw_df.loc[raw_df.index.isin(symbol_to_entrez.values()), :]
+
+print(raw_df.shape)
+raw_df.head()
+
+
+# # Scale Data and Output
+
+# In[16]:
+
+
+#MinMax consistent with BetaVAE scaling
+raw_scaled_df = preprocessing.MinMaxScaler().fit_transform(raw_df.transpose())
+raw_scaled_df = (
+    pd.DataFrame(raw_scaled_df,
+                 columns=raw_df.index,
+                 index=raw_df.columns)
+    .sort_index(axis='columns')
+    .sort_index(axis='rows')
+)
+raw_scaled_df.columns = raw_scaled_df.columns.astype(str)
+raw_scaled_df = raw_scaled_df.loc[:, ~raw_scaled_df.columns.duplicated(keep='first')]
+
+raw_scaled_df.head()
+
+
+# In[17]:
+
+
+# In[17]
+processed_file = data_dir / 'nbl_celllines_processed_matrix.parquet'
+raw_scaled_df.to_parquet(processed_file)
+
diff --git a/4.gene_expression_signatures/scripts/2.download-target-data.py b/4.gene_expression_signatures/scripts/2.download-target-data.py
new file mode 100644
index 0000000..e9a4ab2
--- /dev/null
+++ b/4.gene_expression_signatures/scripts/2.download-target-data.py
@@ -0,0 +1,274 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+# # Download TARGET Pan Cancer Data for Compression
+# The notebook downloads gene expression and clinical data from the TARGET project. The data is downloaded from UCSC Xena.
+# The data is in log2(FPKM) RSEM transformed
+
+# In[1]:
+
+
+import pathlib
+from pathlib import Path
+import hashlib
+from urllib.request import urlretrieve
+import random
+import pandas as pd
+from sklearn.model_selection import train_test_split
+
+
+# In[2]:
+
+
+def download_and_checksum(url_base, file_info_dict, file_url_paths):
+    """
+    Downloads files from the specified base URL and computes the SHA-256 checksum for each file.
+
+    Parameters:
+    ----------
+    url_base : str
+        The base URL from which the files will be downloaded.
+
+    file_info_dict : dict
+        A dictionary where the keys are file names and the values are the local file paths where 
+        the files will be saved.
+
+    file_url_paths : dict
+        A dictionary mapping each file name to the corresponding sub-path (if any) that should 
+        be appended to the base URL before downloading the file.
+    """
+    for name, path in file_info_dict.items():
+        # Determine the correct URL with or without sub-path
+        file_url = f"{url_base}{file_url_paths[name]}{name}"
+        
+        # Download the file
+        urlretrieve(file_url, path)
+
+        # Compute checksum
+        md5_hash = hashlib.md5()
+        with open(path, "rb") as f:
+            for byte_block in iter(lambda: f.read(4096), b""):
+                md5_hash.update(byte_block)
+
+        print(f"{name} checksum: {md5_hash.hexdigest()}")
+
+
+# In[3]:
+
+
+# Base URL
+url_base = 'https://toil-xena-hub.s3.us-east-1.amazonaws.com/download/'
+data_dir = pathlib.Path('data').resolve()
+
+
+# In[4]:
+
+
+# Dictionary for file names and their paths
+file_info_dict = {
+    'target_RSEM_gene_fpkm.gz': Path('data/target_RSEM_gene_fpkm.gz'),
+    'TARGET_phenotype.gz': Path('data/TARGET_phenotype.gz'),
+    'gencode.v23.annotation.gene.probemap': Path('data/gencode.v23.annotation.gene.probemap')
+}
+
+file_url_paths = {
+    "target_RSEM_gene_fpkm.gz": "",
+    "gencode.v23.annotation.gene.probemap": "probeMap/",
+    "TARGET_phenotype.gz": ""
+}
+
+
+# In[5]:
+
+
+# Create the 'data' directory
+Path('data').mkdir(exist_ok=True)
+
+
+# In[6]:
+
+
+# Call the function
+download_and_checksum(url_base, file_info_dict, file_url_paths)
+
+
+# # Process TARGET PanCancer Data
+# Retrive the downloaded expression data, update gene identifiers to entrez, and curate sample IDs. The script will also identify a balanced hold-out test set to compare projection performance into learned latent spaces across algorithms. 
+
+# In[7]:
+
+
+random.seed(0)
+
+
+# # Read Phenotype Information
+
+# In[8]:
+
+
+pheno_file = data_dir / 'TARGET_phenotype.parquet'
+if not pheno_file.is_file():
+    pheno_df = pd.read_table(data_dir / 'TARGET_phenotype.gz')
+    pheno_df.to_parquet(pheno_file, index=False)
+else:
+    pheno_df = pd.read_parquet(pheno_file)
+
+print(pheno_df.shape)
+pheno_df.head(3)
+
+
+# # Read Entrez ID Curation Information
+# Load curated gene names from versioned resource. See https://github.com/cognoma/genes for more details
+
+# In[9]:
+
+
+# Commit from https://github.com/cognoma/genes
+genes_commit = 'ad9631bb4e77e2cdc5413b0d77cb8f7e93fc5bee'
+
+
+# In[10]:
+
+
+url = 'https://raw.githubusercontent.com/cognoma/genes/{}/data/genes.tsv'.format(genes_commit)
+gene_df = pd.read_table(url)
+
+# Only consider protein-coding genes
+gene_df = (
+   gene_df.query("gene_type == 'protein-coding'")
+)
+
+
+print(gene_df.shape)
+gene_df.head(2)
+
+
+# In[11]:
+
+
+# Load gene updater - old to new Entrez gene identifiers
+url = 'https://raw.githubusercontent.com/cognoma/genes/{}/data/updater.tsv'.format(genes_commit)
+updater_df = pd.read_table(url)
+old_to_new_entrez = dict(zip(updater_df.old_entrez_gene_id,
+                            updater_df.new_entrez_gene_id))
+
+
+# # Read Probe Mapping Info
+
+# In[12]:
+
+
+probe_map_file = data_dir / 'gencode.v23.annotation.gene.probemap.parquet'
+if not probe_map_file.is_file():
+    probe_map_df = pd.read_table(data_dir / 'gencode.v23.annotation.gene.probemap')
+    probe_map_df.to_parquet(probe_map_file, index=False)
+else:
+    probe_map_df = pd.read_parquet(probe_map_file)
+
+# Inner merge gene df to get ensembl to entrez mapping
+probe_map_df = probe_map_df.merge(gene_df, how='inner', left_on='gene', right_on='symbol')
+ensembl_to_entrez = dict(zip(probe_map_df.id, probe_map_df.entrez_gene_id))
+
+print(probe_map_df.shape)
+probe_map_df.head(3)
+
+
+# # Read Gene Expression Data
+
+# In[13]:
+
+
+expr_file = data_dir / 'target_RSEM_gene_fpkm.parquet'
+if not expr_file.is_file():
+    expr_df = pd.read_table(data_dir / 'target_RSEM_gene_fpkm.gz', index_col=0)
+    expr_df.to_parquet(expr_file)
+else:
+    expr_df = pd.read_parquet(expr_file)
+
+print(expr_df.shape)
+expr_df.head(2)
+
+
+# # Process gene expression matrix 
+
+# In[14]:
+
+
+expr_df = (expr_df
+   .dropna(axis='rows')
+   .reindex(probe_map_df.id)
+   .rename(index=ensembl_to_entrez)
+   .rename(index=old_to_new_entrez)
+   .groupby(level=0).mean()
+   .transpose()
+   .sort_index(axis='rows')
+   .sort_index(axis='columns')
+)
+
+
+expr_df.index.rename('sample_id', inplace=True)
+
+
+print(expr_df.shape)
+expr_df.head(2)
+
+
+# # Stratify Balanced Training and Testing Sets in TARGET Gene Expression
+# Output training and testing gene expression datasets 
+
+# In[15]:
+
+
+strat = pheno_df.set_index('sample_id').reindex(expr_df.index).primary_disease_code
+
+
+# In[16]:
+
+
+cancertype_count_df = pd.DataFrame(strat.value_counts()).reset_index().rename({'index': 'cancertype', 'primary_disease_code': 'n ='}, axis='columns')
+sample_counts_file = data_dir / 'target_sample_counts.parquet'
+cancertype_count_df.to_parquet(sample_counts_file)
+
+cancertype_count_df
+
+
+# In[17]:
+
+
+train_df, test_df = train_test_split(expr_df,
+                                    test_size=0.2,
+                                    random_state=0,
+                                    stratify=strat)
+
+
+# In[18]:
+
+
+print(train_df.shape)
+test_df.shape
+
+
+# In[19]:
+
+
+train_file = data_dir / 'train_target_expression_matrix_processed.parquet'
+train_df.to_parquet(train_file)
+
+
+# In[20]:
+
+
+test_file = data_dir / 'test_target_expression_matrix_processed.parquet'
+test_df.to_parquet(test_file)
+
+
+# # Sort genes based on median absolute deviation and output to file
+
+# In[ ]:
+
+
+mad_genes_df = pd.DataFrame(train_df.mad(axis=0).sort_values(ascending=False)).reset_index()
+mad_genes_df.columns = ['gene_id', 'median_absolute_deviation']
+mad_genes_file = data_dir / 'target_mad_genes.parquet'
+mad_genes_df.to_parquet(mad_genes_file)
+
diff --git a/environment.yml b/environment.yml
index 5e20fa7..311180e 100644
--- a/environment.yml
+++ b/environment.yml
@@ -22,7 +22,15 @@ dependencies:
   - conda-forge::matplotlib=>3.6.0
   - conda-forge::keras-tuner=>1.1.3
   - conda-forge::hiplot=>0.1.33
+  - conda-forge::optuna
+  - conda-forge::pytorch=>2.4
+  - conda-forge::torchvision
+  - conda-forge::plotly
+  - conda-forge::pyarrow
+  - conda-forge::lxml
+
   - pip:
       - blitzgsea==1.3.33
+