methods.html

<!DOCTYPE html>
<html lang="en" xml:lang="en">
<head>

  <meta charset="utf-8" />
  <meta http-equiv="X-UA-Compatible" content="IE=edge" />
  <title>2 Methods and Materials | Network analysis approach using morphological profiling of chemical perturbation</title>
  <meta name="description" content="Exploring the intersection of graph representation learning and cell profiling" />
  <meta name="generator" content="bookdown 0.33 and GitBook 2.6.7" />

  <meta property="og:title" content="2 Methods and Materials | Network analysis approach using morphological profiling of chemical perturbation" />
  <meta property="og:type" content="book" />
  
  <meta property="og:description" content="Exploring the intersection of graph representation learning and cell profiling" />
  

  <meta name="twitter:card" content="summary" />
  <meta name="twitter:title" content="2 Methods and Materials | Network analysis approach using morphological profiling of chemical perturbation" />
  
  <meta name="twitter:description" content="Exploring the intersection of graph representation learning and cell profiling" />
  

<meta name="author" content="Nima Chamyani" />


<meta name="date" content="2023-07-04" />

  <meta name="viewport" content="width=device-width, initial-scale=1" />
  <meta name="apple-mobile-web-app-capable" content="yes" />
  <meta name="apple-mobile-web-app-status-bar-style" content="black" />
  
  
<link rel="prev" href="intro.html"/>
<link rel="next" href="results.html"/>
<script src="libs/jquery-3.6.0/jquery-3.6.0.min.js"></script>
<script src="https://cdn.jsdelivr.net/npm/fuse.js@6.4.6/dist/fuse.min.js"></script>
<link href="libs/gitbook-2.6.7/css/style.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-table.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-bookdown.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-highlight.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-search.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-fontsettings.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-clipboard.css" rel="stylesheet" />


<link href="libs/anchor-sections-1.1.0/anchor-sections.css" rel="stylesheet" />
<link href="libs/anchor-sections-1.1.0/anchor-sections-hash.css" rel="stylesheet" />
<script src="libs/anchor-sections-1.1.0/anchor-sections.js"></script>


<style type="text/css">
pre > code.sourceCode { white-space: pre; position: relative; }
pre > code.sourceCode > span { display: inline-block; line-height: 1.25; }
pre > code.sourceCode > span:empty { height: 1.2em; }
.sourceCode { overflow: visible; }
code.sourceCode > span { color: inherit; text-decoration: inherit; }
pre.sourceCode { margin: 0; }
@media screen {
div.sourceCode { overflow: auto; }
}
@media print {
pre > code.sourceCode { white-space: pre-wrap; }
pre > code.sourceCode > span { text-indent: -5em; padding-left: 5em; }
}
pre.numberSource code
  { counter-reset: source-line 0; }
pre.numberSource code > span
  { position: relative; left: -4em; counter-increment: source-line; }
pre.numberSource code > span > a:first-child::before
  { content: counter(source-line);
    position: relative; left: -1em; text-align: right; vertical-align: baseline;
    border: none; display: inline-block;
    -webkit-touch-callout: none; -webkit-user-select: none;
    -khtml-user-select: none; -moz-user-select: none;
    -ms-user-select: none; user-select: none;
    padding: 0 4px; width: 4em;
    color: #aaaaaa;
  }
pre.numberSource { margin-left: 3em; border-left: 1px solid #aaaaaa;  padding-left: 4px; }
div.sourceCode
  {   }
@media screen {
pre > code.sourceCode > span > a:first-child::before { text-decoration: underline; }
}
code span.al { color: #ff0000; font-weight: bold; } /* Alert */
code span.an { color: #60a0b0; font-weight: bold; font-style: italic; } /* Annotation */
code span.at { color: #7d9029; } /* Attribute */
code span.bn { color: #40a070; } /* BaseN */
code span.bu { color: #008000; } /* BuiltIn */
code span.cf { color: #007020; font-weight: bold; } /* ControlFlow */
code span.ch { color: #4070a0; } /* Char */
code span.cn { color: #880000; } /* Constant */
code span.co { color: #60a0b0; font-style: italic; } /* Comment */
code span.cv { color: #60a0b0; font-weight: bold; font-style: italic; } /* CommentVar */
code span.do { color: #ba2121; font-style: italic; } /* Documentation */
code span.dt { color: #902000; } /* DataType */
code span.dv { color: #40a070; } /* DecVal */
code span.er { color: #ff0000; font-weight: bold; } /* Error */
code span.ex { } /* Extension */
code span.fl { color: #40a070; } /* Float */
code span.fu { color: #06287e; } /* Function */
code span.im { color: #008000; font-weight: bold; } /* Import */
code span.in { color: #60a0b0; font-weight: bold; font-style: italic; } /* Information */
code span.kw { color: #007020; font-weight: bold; } /* Keyword */
code span.op { color: #666666; } /* Operator */
code span.ot { color: #007020; } /* Other */
code span.pp { color: #bc7a00; } /* Preprocessor */
code span.sc { color: #4070a0; } /* SpecialChar */
code span.ss { color: #bb6688; } /* SpecialString */
code span.st { color: #4070a0; } /* String */
code span.va { color: #19177c; } /* Variable */
code span.vs { color: #4070a0; } /* VerbatimString */
code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warning */
</style>

<style type="text/css">
/* Used with Pandoc 2.11+ new --citeproc when CSL is used */
div.csl-bib-body { }
div.csl-entry {
  clear: both;
}
.hanging div.csl-entry {
  margin-left:2em;
  text-indent:-2em;
}
div.csl-left-margin {
  min-width:2em;
  float:left;
}
div.csl-right-inline {
  margin-left:2em;
  padding-left:1em;
}
div.csl-indent {
  margin-left: 2em;
}
</style>

<link rel="stylesheet" href="style.css" type="text/css" />
</head>

<body>


  <div class="book without-animation with-summary font-size-2 font-family-1" data-basepath=".">

    <div class="book-summary">
      <nav role="navigation">

<ul class="summary">
<li class="chapter" data-level="" data-path="index.html"><a href="index.html"><i class="fa fa-check"></i>Intersecting Graph Representation Learning and Cell Profiling: A Novel Approach to Analyzing Complex Biomedical Data</a>
<ul>
<li class="chapter" data-level="" data-path="index.html"><a href="index.html#aim"><i class="fa fa-check"></i>Aim</a></li>
<li class="chapter" data-level="" data-path="index.html"><a href="index.html#what-can-be-found-in-this-document"><i class="fa fa-check"></i>What can be found in this document?</a></li>
</ul></li>
<li class="chapter" data-level="1" data-path="intro.html"><a href="intro.html"><i class="fa fa-check"></i><b>1</b> Introduction</a>
<ul>
<li class="chapter" data-level="1.1" data-path="intro.html"><a href="intro.html#graphs"><i class="fa fa-check"></i><b>1.1</b> Graphs</a></li>
<li class="chapter" data-level="1.2" data-path="intro.html"><a href="intro.html#graph-representation-learning"><i class="fa fa-check"></i><b>1.2</b> Graph representation learning</a></li>
<li class="chapter" data-level="1.3" data-path="intro.html"><a href="intro.html#cell-profiling"><i class="fa fa-check"></i><b>1.3</b> Cell profiling</a></li>
</ul></li>
<li class="chapter" data-level="2" data-path="methods.html"><a href="methods.html"><i class="fa fa-check"></i><b>2</b> Methods and Materials</a>
<ul>
<li class="chapter" data-level="2.1" data-path="methods.html"><a href="methods.html#data-preprocessing"><i class="fa fa-check"></i><b>2.1</b> Data Preprocessing</a>
<ul>
<li class="chapter" data-level="2.1.1" data-path="methods.html"><a href="methods.html#covid-19-cell-profilling-data"><i class="fa fa-check"></i><b>2.1.1</b> COVID-19 Cell profiling Data</a>
<ul>
<li class="chapter" data-level="2.1.1.1" data-path="methods.html"><a href="methods.html#normalization"><i class="fa fa-check"></i><b>2.1.1.1</b> Normalization</a></li>
<li class="chapter" data-level="2.1.1.2" data-path="methods.html"><a href="methods.html#dimensionality-reduction"><i class="fa fa-check"></i><b>2.1.1.2</b> Dimensionality Reduction</a></li>
<li class="chapter" data-level="2.1.1.3" data-path="methods.html"><a href="methods.html#development-of-a-binary-classification-of-data"><i class="fa fa-check"></i><b>2.1.1.3</b> Development of a binary classification of data</a></li>
</ul></li>
<li class="chapter" data-level="2.1.2" data-path="methods.html"><a href="methods.html#compound-protein-and-pathway-data-aggregation"><i class="fa fa-check"></i><b>2.1.2</b> Compound, Protein and Pathway Data Aggregation</a></li>
<li class="chapter" data-level="2.1.3" data-path="methods.html"><a href="methods.html#featurizing-the-biomedical-entities"><i class="fa fa-check"></i><b>2.1.3</b> Featurizing the Biomedical Entities</a>
<ul>
<li class="chapter" data-level="2.1.3.1" data-path="methods.html"><a href="methods.html#featurizing-compounds"><i class="fa fa-check"></i><b>2.1.3.1</b> Featurizing Compounds</a></li>
<li class="chapter" data-level="2.1.3.2" data-path="methods.html"><a href="methods.html#featurizing-proteins"><i class="fa fa-check"></i><b>2.1.3.2</b> Featurizing Proteins</a></li>
<li class="chapter" data-level="2.1.3.3" data-path="methods.html"><a href="methods.html#featurizing-pathways"><i class="fa fa-check"></i><b>2.1.3.3</b> Featurizing Pathways</a></li>
</ul></li>
<li class="chapter" data-level="2.1.4" data-path="methods.html"><a href="methods.html#covid-19-bio-graph"><i class="fa fa-check"></i><b>2.1.4</b> COVID-19 Bio-Graph</a></li>
<li class="chapter" data-level="2.1.5" data-path="methods.html"><a href="methods.html#representing-chemical-molecules-as-graph"><i class="fa fa-check"></i><b>2.1.5</b> Representing Chemical Molecules as Graph</a></li>
</ul></li>
<li class="chapter" data-level="2.2" data-path="methods.html"><a href="methods.html#models"><i class="fa fa-check"></i><b>2.2</b> Models</a>
<ul>
<li class="chapter" data-level="2.2.1" data-path="methods.html"><a href="methods.html#graph-level-molecular-predictor-glmp"><i class="fa fa-check"></i><b>2.2.1</b> Graph-Level Molecular Predictor (GLMP)</a></li>
<li class="chapter" data-level="2.2.2" data-path="methods.html"><a href="methods.html#bio-graph-integrative-classifierregressor-biogicbiogir"><i class="fa fa-check"></i><b>2.2.2</b> Bio-Graph Integrative Classifier/Regressor (BioGIC/BioGIR)</a>
<ul>
<li class="chapter" data-level="2.2.2.1" data-path="methods.html"><a href="methods.html#classificationregression"><i class="fa fa-check"></i><b>2.2.2.1</b> Classification/Regression</a></li>
<li class="chapter" data-level="2.2.2.2" data-path="methods.html"><a href="methods.html#predicting-joint-effect-of-nodes-chemical-combination"><i class="fa fa-check"></i><b>2.2.2.2</b> Predicting joint effect of nodes (Chemical Combination)</a></li>
</ul></li>
<li class="chapter" data-level="2.2.3" data-path="methods.html"><a href="methods.html#optimized-molecular-graph-generator-omg"><i class="fa fa-check"></i><b>2.2.3</b> Optimized Molecular Graph Generator (OMG)</a></li>
</ul></li>
<li class="chapter" data-level="2.3" data-path="methods.html"><a href="methods.html#model-validation-and-optimization"><i class="fa fa-check"></i><b>2.3</b> Model Validation and Optimization</a></li>
<li class="chapter" data-level="2.4" data-path="methods.html"><a href="methods.html#model-enhancement"><i class="fa fa-check"></i><b>2.4</b> Model Enhancement</a></li>
<li class="chapter" data-level="2.5" data-path="methods.html"><a href="methods.html#data-acquisition-software-and-libraries"><i class="fa fa-check"></i><b>2.5</b> Data acquisition, software and libraries</a></li>
</ul></li>
<li class="chapter" data-level="3" data-path="results.html"><a href="results.html"><i class="fa fa-check"></i><b>3</b> Result and Discussion</a>
<ul>
<li class="chapter" data-level="3.1" data-path="results.html"><a href="results.html#regressionclassification-performance"><i class="fa fa-check"></i><b>3.1</b> Regression/Classification Performance</a></li>
<li class="chapter" data-level="3.2" data-path="results.html"><a href="results.html#covid-19-biograph-topology"><i class="fa fa-check"></i><b>3.2</b> COVID-19 BioGraph Topology</a></li>
<li class="chapter" data-level="3.3" data-path="results.html"><a href="results.html#combination-prediction"><i class="fa fa-check"></i><b>3.3</b> Combination Prediction</a></li>
<li class="chapter" data-level="3.4" data-path="results.html"><a href="results.html#molecule-generation"><i class="fa fa-check"></i><b>3.4</b> Molecule Generation</a></li>
</ul></li>
<li class="chapter" data-level="" data-path="references.html"><a href="references.html"><i class="fa fa-check"></i>References</a></li>
</ul>

      </nav>
    </div>

    <div class="book-body">
      <div class="body-inner">
        <div class="book-header" role="navigation">
          <h1>
            <i class="fa fa-circle-o-notch fa-spin"></i><a href="./">Network analysis approach using morphological profiling of chemical perturbation</a>
          </h1>
        </div>

        <div class="page-wrapper" tabindex="-1" role="main">
          <div class="page-inner">

            <section class="normal" id="section-">
<div id="methods" class="section level1 hasAnchor" number="2">
<h1><span class="header-section-number">2</span> Methods and Materials<a href="methods.html#methods" class="anchor-section" aria-label="Anchor link to header"></a></h1>
<p>A diverse set of computational tools and methodologies were employed in this study to analyze and interpret complex biomedical data.</p>
<div id="data-preprocessing" class="section level2 hasAnchor" number="2.1">
<h2><span class="header-section-number">2.1</span> Data Preprocessing<a href="methods.html#data-preprocessing" class="anchor-section" aria-label="Anchor link to header"></a></h2>
<div id="covid-19-cell-profilling-data" class="section level3 hasAnchor" number="2.1.1">
<h3><span class="header-section-number">2.1.1</span> COVID-19 Cell profiling Data<a href="methods.html#covid-19-cell-profilling-data" class="anchor-section" aria-label="Anchor link to header"></a></h3>
<p>In this study, the data preprocessing stage consisted of the preparation and normalization of a COVID-19 dataset. This dataset contained phenotype features and metadata extracted from multiple images of Vero-E6 cells (African green monkey) infected with human coronavirus SARS-CoV-2 and treated with 5300 drugs from the Specs Repurposing Library. Each compound was represented in two plate replicates within the set of 32 plates, each containing 384 wells. Fluorescent images were captured using an ImageXpress Micro XLS (Molecular Devices) microscope with a 20× objective using laser-based autofocus. Five labels were used to stain the cells, characterizing seven cellular components, including DNA, Golgi apparatus, plasma membrane, F-actin, nucleoli and cytoplasmic RNA, the endoplasmic reticulum, and the SARS-CoV-2 spike protein. The image files were then stored in grayscale TIFF format.</p>
<p>The open-source image analysis software CellProfiler version 4.0.6 was utilized to extract a total of 2009 morphological features, including size, shape, pixel intensities, and texture, from these images. The initial dataset cleaning involved the removal of empty features and of features with constant values or missing data. Numeric columns in the dataset were subsequently isolated into ‘phenotype features’ and ‘metadata’. These features were then averaged on an image level. Features with extreme (outlier) standard deviation values (SD &lt; 0.001 or SD &gt; 10000) were also eliminated.</p>
<p>This dataset represents an extensive collection of phenotype features extracted from images, along with associated metadata. Each row in the dataset corresponds to a single image, with each image associated with a specific site within a well. There are 9 sites (numbered 1-9) within each well, and approximately 350-360 wells within each plate, with a maximum of 384 wells per plate. The dataset encompasses 24 plates.</p>
<table style="width:100%;">
<colgroup>
<col width="6%" />
<col width="6%" />
<col width="6%" />
<col width="6%" />
<col width="6%" />
<col width="6%" />
<col width="6%" />
<col width="6%" />
<col width="6%" />
<col width="6%" />
<col width="6%" />
<col width="6%" />
<col width="6%" />
<col width="6%" />
<col width="6%" />
</colgroup>
<thead>
<tr class="header">
<th align="left">ImageID</th>
<th align="left">~ 2000 phenotype features</th>
<th>PlateID</th>
<th>Well</th>
<th>Site</th>
<th>Plate</th>
<th>Plate_Well</th>
<th>batch_id</th>
<th>pertType</th>
<th>cmpd_conc</th>
<th>Flag</th>
<th>Count_nuclei</th>
<th>Batch nr</th>
<th>Compound ID</th>
<th>selected_mechanism</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td align="left">P03-L2_B03_1</td>
<td align="left">……….. values ……….</td>
<td>P03-L2</td>
<td>B03</td>
<td>1</td>
<td>03-L2</td>
<td>specs935-plate03-L2_B03</td>
<td>BJ1894547</td>
<td>trt</td>
<td>10.0</td>
<td>0</td>
<td>109.0</td>
<td>BJ1894547</td>
<td>CBK042132</td>
<td>estrogen receptor alpha modulator</td>
</tr>
<tr class="even">
<td align="left">P03-L2_B03_2</td>
<td align="left">……….. values ……….</td>
<td>P03-L2</td>
<td>B03</td>
<td>2</td>
<td>03-L2</td>
<td>specs935-plate03-L2_B03</td>
<td>BJ1894547</td>
<td>trt</td>
<td>10.0</td>
<td>0</td>
<td>121.0</td>
<td>BJ1894547</td>
<td>CBK042132</td>
<td>estrogen receptor alpha modulator</td>
</tr>
</tbody>
</table>
<p>… (54366 rows × 2140 columns in total)</p>
<p>During the preparation phase, the first step involved dropping empty features, i.e., columns with no values or with a standard deviation (SD) of 0. The numeric columns of the dataset were then selected and further divided into ‘phenotype features’ and ‘metadata’. The ‘phenotype features’ are the numeric columns whose names do not contain any of the strings ‘Metadata’, ‘Number’, ‘Outlier’, ‘ImageQuality’, ‘cmpd_conc’, ‘Total’, ‘Flag’ or ‘Site’. The remaining numeric columns are the ‘metadata’ columns, so their number is the difference between the number of numeric columns and the number of phenotype features.</p>
<div class="sourceCode" id="cb1"><pre class="sourceCode python"><code class="sourceCode python"><span id="cb1-1"><a href="methods.html#cb1-1" tabindex="-1"></a>numeric_columns <span class="op">=</span> <span class="bu">list</span>()</span>
<span id="cb1-2"><a href="methods.html#cb1-2" tabindex="-1"></a><span class="cf">for</span> a <span class="kw">in</span> df.columns:</span>
<span id="cb1-3"><a href="methods.html#cb1-3" tabindex="-1"></a>    <span class="cf">if</span> (df.dtypes[a] <span class="op">==</span> <span class="st">&#39;float64&#39;</span>) <span class="op">|</span> (df.dtypes[a] <span class="op">==</span> <span class="st">&#39;int64&#39;</span>) :</span>
<span id="cb1-4"><a href="methods.html#cb1-4" tabindex="-1"></a>        numeric_columns.append(a)</span>
<span id="cb1-5"><a href="methods.html#cb1-5" tabindex="-1"></a>        </span>
<span id="cb1-6"><a href="methods.html#cb1-6" tabindex="-1"></a>feature_columns <span class="op">=</span> [fc <span class="cf">for</span> fc <span class="kw">in</span> numeric_columns <span class="cf">if</span> (<span class="st">&#39;Metadata&#39;</span> <span class="kw">not</span> <span class="kw">in</span> fc) <span class="op">&amp;</span> (<span class="st">&#39;Number&#39;</span> <span class="kw">not</span> <span class="kw">in</span> fc) <span class="op">&amp;</span></span>
<span id="cb1-7"><a href="methods.html#cb1-7" tabindex="-1"></a>                   (<span class="st">&#39;Outlier&#39;</span> <span class="kw">not</span> <span class="kw">in</span> fc)  <span class="op">&amp;</span> (<span class="st">&#39;ImageQuality&#39;</span> <span class="kw">not</span> <span class="kw">in</span> fc)  <span class="op">&amp;</span> (<span class="st">&#39;cmpd_conc&#39;</span> <span class="kw">not</span> <span class="kw">in</span> fc)  <span class="op">&amp;</span></span>
<span id="cb1-8"><a href="methods.html#cb1-8" tabindex="-1"></a>                   (<span class="st">&#39;Total&#39;</span> <span class="kw">not</span> <span class="kw">in</span> fc) <span class="op">&amp;</span> (<span class="st">&#39;Flag&#39;</span> <span class="kw">not</span> <span class="kw">in</span> fc)  <span class="op">&amp;</span> (<span class="st">&#39;Site&#39;</span> <span class="kw">not</span> <span class="kw">in</span> fc) ]</span></code></pre></div>
<p>The preparation phase also involved removing any features with missing values and those with an SD less than 0.0001.</p>
<div class="sourceCode" id="cb2"><pre class="sourceCode python"><code class="sourceCode python"><span id="cb2-1"><a href="methods.html#cb2-1" tabindex="-1"></a>X <span class="op">=</span> df.loc[:, feature_columns]</span>
<span id="cb2-2"><a href="methods.html#cb2-2" tabindex="-1"></a>X.dropna(axis<span class="op">=</span><span class="dv">1</span>, inplace<span class="op">=</span><span class="va">True</span>)</span>
<span id="cb2-3"><a href="methods.html#cb2-3" tabindex="-1"></a>X <span class="op">=</span> X.loc[:, (X.std() <span class="op">&gt;</span> <span class="fl">0.0001</span>) ]</span></code></pre></div>
<div id="normalization" class="section level4 hasAnchor" number="2.1.1.1">
<h4><span class="header-section-number">2.1.1.1</span> Normalization<a href="methods.html#normalization" class="anchor-section" aria-label="Anchor link to header"></a></h4>
<p>Two methods of normalization were employed in this study: an overall approach and a plate-separated strategy. Both strategies utilized Median and Median Absolute Deviation (MMAD) normalization<span class="citation"><a href="#ref-kappal2019data">[35]</a></span>. The normalization was accomplished using the formula:</p>
<p><span class="math display">\[MMAD = \frac{X - DMSO_{median}}{|X_{dmso} - DMSO_{median}|_{median}}\]</span></p>
<p>where <span class="math inline">\(X\)</span> denotes the observed feature value, <span class="math inline">\(DMSO_{median}\)</span> signifies the median value of DMSO, and <span class="math inline">\(X_{dmso}\)</span> represents the observed feature value of DMSO. The overall strategy applied this formula to the entire dataset at once, whereas the plate-separated strategy applied it independently to each plate, using the local median values of DMSO for normalization within that plate.</p>
<div class="sourceCode" id="cb3"><pre class="sourceCode python"><code class="sourceCode python"><span id="cb3-1"><a href="methods.html#cb3-1" tabindex="-1"></a>dfDMSO <span class="op">=</span> df[df[<span class="st">&#39;batch_id&#39;</span>] <span class="op">==</span> <span class="st">&#39;[dmso]&#39;</span>]</span>
<span id="cb3-2"><a href="methods.html#cb3-2" tabindex="-1"></a>dfDMSO_Medians <span class="op">=</span> dfDMSO[phenotype_features].median()</span>
<span id="cb3-3"><a href="methods.html#cb3-3" tabindex="-1"></a>dfDMSO_MADs <span class="op">=</span> (dfDMSO[phenotype_features] <span class="op">-</span> dfDMSO[phenotype_features].median()).<span class="bu">abs</span>().median()</span>
<span id="cb3-4"><a href="methods.html#cb3-4" tabindex="-1"></a>df_MMAD <span class="op">=</span> df[phenotype_features].copy()</span>
<span id="cb3-5"><a href="methods.html#cb3-5" tabindex="-1"></a>df_MMAD <span class="op">=</span> (df[phenotype_features] <span class="op">-</span> dfDMSO_Medians[phenotype_features])<span class="op">/</span>dfDMSO_MADs[phenotype_features]</span>
<span id="cb3-6"><a href="methods.html#cb3-6" tabindex="-1"></a>df_MMAD.clip(lower<span class="op">=-</span><span class="dv">10</span>, upper<span class="op">=</span><span class="dv">10</span>,  inplace<span class="op">=</span><span class="va">True</span>)</span></code></pre></div>
<p>In the plate-separated approach, the same process was applied, except that the DMSO median and median absolute deviation were first computed locally for each plate, and the measurements of that plate were then normalized against their respective DMSO values.</p>
<div class="sourceCode" id="cb4"><pre class="sourceCode python"><code class="sourceCode python"><span id="cb4-1"><a href="methods.html#cb4-1" tabindex="-1"></a>df_MMAD_by_plate <span class="op">=</span> pd.DataFrame()</span>
<span id="cb4-2"><a href="methods.html#cb4-2" tabindex="-1"></a></span>
<span id="cb4-3"><a href="methods.html#cb4-3" tabindex="-1"></a><span class="cf">for</span> plate <span class="kw">in</span> plates:</span>
<span id="cb4-4"><a href="methods.html#cb4-4" tabindex="-1"></a>    plate_data <span class="op">=</span> df[df[<span class="st">&#39;Plate&#39;</span>] <span class="op">==</span> plate]</span>
<span id="cb4-5"><a href="methods.html#cb4-5" tabindex="-1"></a>    df_DMSO <span class="op">=</span> plate_data[plate_data[<span class="st">&#39;batch_id&#39;</span>] <span class="op">==</span> <span class="st">&#39;[dmso]&#39;</span>]</span>
<span id="cb4-6"><a href="methods.html#cb4-6" tabindex="-1"></a>    df_DMSO_medians <span class="op">=</span> df_DMSO[phenotype_features].median()</span>
<span id="cb4-7"><a href="methods.html#cb4-7" tabindex="-1"></a>    df_DMSO_MADs <span class="op">=</span> (df_DMSO[phenotype_features] <span class="op">-</span> df_DMSO[phenotype_features].median()).<span class="bu">abs</span>().median()</span>
<span id="cb4-8"><a href="methods.html#cb4-8" tabindex="-1"></a>    MMAD <span class="op">=</span> (df[df[<span class="st">&#39;Plate&#39;</span>] <span class="op">==</span> plate][phenotype_features] <span class="op">-</span> df_DMSO_medians[phenotype_features])<span class="op">/</span>df_DMSO_MADs[phenotype_features]</span>
<span id="cb4-9"><a href="methods.html#cb4-9" tabindex="-1"></a>    df_MMAD_by_plate <span class="op">=</span> pd.concat([df_MMAD_by_plate, MMAD])</span>
<span id="cb4-10"><a href="methods.html#cb4-10" tabindex="-1"></a></span>
<span id="cb4-11"><a href="methods.html#cb4-11" tabindex="-1"></a>df_MMAD_by_plate</span></code></pre></div>
<p>The site-level features were normalized at the plate level using the median and median absolute deviation of the DMSO sites in the plate. MMAD normalization was chosen due to its robustness to outliers, implying that it functions well even with data containing extreme values. In contrast, other methods, such as Z-score normalization, could be significantly affected by these outliers. Furthermore, MMAD normalization does not require the data to follow a specific distribution, making it a versatile choice for various datasets. The application of two different strategies, one that treated the dataset as a whole and another that treated each plate independently, was done to account for potential variations within and between different plates.</p>
</div>
<div id="dimensionality-reduction" class="section level4 hasAnchor" number="2.1.1.2">
<h4><span class="header-section-number">2.1.1.2</span> Dimensionality Reduction<a href="methods.html#dimensionality-reduction" class="anchor-section" aria-label="Anchor link to header"></a></h4>
<p>In this part, several key steps were undertaken to reduce the dimensionality of the data, select the most informative features, and visualize the structure and variability of the data using Principal Component Analysis (PCA). PCA was applied to the dataset to identify the key direction or “a component” that describes most of the data variability. This process helps to transform the original dataset into an updated one where each data point is represented in terms of this component. The PCA algorithm also provides the loadings, or the contribution of each original feature to each principal component.</p>
<div class="sourceCode" id="cb5"><pre class="sourceCode python"><code class="sourceCode python"><span id="cb5-1"><a href="methods.html#cb5-1" tabindex="-1"></a><span class="im">from</span> pca <span class="im">import</span> pca</span>
<span id="cb5-2"><a href="methods.html#cb5-2" tabindex="-1"></a><span class="im">import</span> matplotlib.pyplot <span class="im">as</span> plt</span>
<span id="cb5-3"><a href="methods.html#cb5-3" tabindex="-1"></a></span>
<span id="cb5-4"><a href="methods.html#cb5-4" tabindex="-1"></a>X <span class="op">=</span> covid_df.loc[(covid_df[<span class="st">&#39;label&#39;</span>] <span class="op">==</span> <span class="st">&#39;DMSO&#39;</span>) <span class="op">|</span> (covid_df[<span class="st">&#39;label&#39;</span>] <span class="op">==</span> <span class="st">&#39;Uninfected&#39;</span>) <span class="op">|</span> (covid_df[<span class="st">&#39;label&#39;</span>] <span class="op">==</span> <span class="st">&#39;Remdesivir&#39;</span>)][features].values</span>
<span id="cb5-5"><a href="methods.html#cb5-5" tabindex="-1"></a>row_label <span class="op">=</span> covid_df.loc[(covid_df[<span class="st">&#39;label&#39;</span>] <span class="op">==</span> <span class="st">&#39;DMSO&#39;</span>) <span class="op">|</span> (covid_df[<span class="st">&#39;label&#39;</span>] <span class="op">==</span> <span class="st">&#39;Uninfected&#39;</span>) <span class="op">|</span> (covid_df[<span class="st">&#39;label&#39;</span>] <span class="op">==</span> <span class="st">&#39;Remdesivir&#39;</span>)][<span class="st">&#39;label&#39;</span>]</span>
<span id="cb5-6"><a href="methods.html#cb5-6" tabindex="-1"></a></span>
<span id="cb5-7"><a href="methods.html#cb5-7" tabindex="-1"></a>PCA_model <span class="op">=</span> pca(n_components<span class="op">=</span><span class="dv">10</span>, detect_outliers<span class="op">=</span>[<span class="st">&#39;ht2&#39;</span>, <span class="st">&#39;spe&#39;</span>])</span>
<span id="cb5-8"><a href="methods.html#cb5-8" tabindex="-1"></a>results <span class="op">=</span> PCA_model.fit_transform(X, col_labels<span class="op">=</span>features, row_labels<span class="op">=</span>row_label)</span>
<span id="cb5-9"><a href="methods.html#cb5-9" tabindex="-1"></a>PCA_model.plot(figsize<span class="op">=</span>(<span class="dv">12</span>, <span class="dv">6</span>))</span></code></pre></div>
<div style="text-align: center;">
<figure>
<img src="assets/pca_1.png" alt="Explained variance of the principal components from the PCA model" id="graph-basics" style="width: 90%; height: auto;"/>
</figure>
</div>
<p>Using two outlier detection methods, the Hotelling T2 test and the squared prediction error (SPE/DmodX), only one outlier was found in the data, and it was decided to keep it in the dataset.</p>
<div class="sourceCode" id="cb6"><pre class="sourceCode python"><code class="sourceCode python"><span id="cb6-1"><a href="methods.html#cb6-1" tabindex="-1"></a>fig, axes <span class="op">=</span> plt.subplots(ncols<span class="op">=</span><span class="dv">2</span>, figsize<span class="op">=</span>(<span class="dv">17</span>,<span class="dv">8</span>))</span>
<span id="cb6-2"><a href="methods.html#cb6-2" tabindex="-1"></a>ax1 <span class="op">=</span> PCA_model.scatter(SPE<span class="op">=</span><span class="va">True</span>, hotellingt2<span class="op">=</span><span class="va">True</span>, cmap<span class="op">=</span><span class="st">&#39;tab10&#39;</span>, ax<span class="op">=</span>axes[<span class="dv">0</span>])</span>
<span id="cb6-3"><a href="methods.html#cb6-3" tabindex="-1"></a>ax2 <span class="op">=</span> PCA_model.biplot(SPE<span class="op">=</span><span class="va">True</span>, hotellingt2<span class="op">=</span><span class="va">True</span>, fontdict<span class="op">=</span>{<span class="st">&#39;size&#39;</span>: <span class="dv">8</span>}, cmap<span class="op">=</span><span class="st">&#39;tab10&#39;</span>, PC<span class="op">=</span>[<span class="dv">0</span>,<span class="dv">1</span>,<span class="dv">2</span>], ax<span class="op">=</span>axes[<span class="dv">1</span>])</span>
<span id="cb6-4"><a href="methods.html#cb6-4" tabindex="-1"></a>plt.show()</span></code></pre></div>
<div style="text-align: center;">
<figure>
<img src="assets/pca_biplot.png" alt="PCA scatter plot and biplot with Hotelling T2 and SPE outlier detection" style="width: 90%; height: auto;"/>
</figure>
</div>
<p>The loading values obtained from PCA were subsequently utilized as input for a k-means clustering algorithm, enabling the clustering of features according to their loadings. The idea is to find the features that provide the same information and cluster them together. The process begins with the execution of PCA, which is then followed by the deployment of k-means clustering on the PCA loadings. This arrangement allows features to be clustered based on their loadings, which can be construed as their significance or contribution to the data variance. A new feature is then calculated for every cluster; this feature represents the average of all features contained within that specific cluster.</p>
<div class="sourceCode" id="cb7"><pre class="sourceCode python"><code class="sourceCode python"><span id="cb7-1"><a href="methods.html#cb7-1" tabindex="-1"></a><span class="kw">def</span> feature_reducer(df, feature_list, loading_dim<span class="op">=</span><span class="dv">32</span>, feat_output_dim<span class="op">=</span><span class="dv">32</span>):</span>
<span id="cb7-2"><a href="methods.html#cb7-2" tabindex="-1"></a>    <span class="im">import</span> pandas <span class="im">as</span> pd</span>
<span id="cb7-3"><a href="methods.html#cb7-3" tabindex="-1"></a>    <span class="im">from</span> sklearn <span class="im">import</span> preprocessing</span>
<span id="cb7-4"><a href="methods.html#cb7-4" tabindex="-1"></a>    <span class="im">from</span> sklearn.decomposition <span class="im">import</span> PCA</span>
<span id="cb7-5"><a href="methods.html#cb7-5" tabindex="-1"></a>    <span class="im">from</span> sklearn.cluster <span class="im">import</span> KMeans</span>
<span id="cb7-6"><a href="methods.html#cb7-6" tabindex="-1"></a></span>
<span id="cb7-7"><a href="methods.html#cb7-7" tabindex="-1"></a>    df_dsmo_uninfected_remi <span class="op">=</span> df.loc[(df[<span class="st">&#39;label&#39;</span>] <span class="op">==</span> <span class="st">&#39;DMSO&#39;</span>) <span class="op">|</span> (df[<span class="st">&#39;label&#39;</span>] <span class="op">==</span> <span class="st">&#39;Uninfected&#39;</span>) <span class="op">|</span> (df[<span class="st">&#39;label&#39;</span>] <span class="op">==</span> <span class="st">&#39;Remdesivir&#39;</span>)][feature_list]</span>
<span id="cb7-8"><a href="methods.html#cb7-8" tabindex="-1"></a>    X <span class="op">=</span> df_dsmo_uninfected_remi.values</span>
<span id="cb7-9"><a href="methods.html#cb7-9" tabindex="-1"></a></span>
<span id="cb7-10"><a href="methods.html#cb7-10" tabindex="-1"></a>    pca <span class="op">=</span> PCA()</span>
<span id="cb7-11"><a href="methods.html#cb7-11" tabindex="-1"></a>    pca.fit(X)</span>
<span id="cb7-12"><a href="methods.html#cb7-12" tabindex="-1"></a>    loadings <span class="op">=</span> pca.components_</span>
<span id="cb7-13"><a href="methods.html#cb7-13" tabindex="-1"></a>    loading_data <span class="op">=</span> pd.DataFrame(loadings[:loading_dim]).T.values</span>
<span id="cb7-14"><a href="methods.html#cb7-14" tabindex="-1"></a>    <span class="co"># Perform k-means clustering</span></span>
<span id="cb7-15"><a href="methods.html#cb7-15" tabindex="-1"></a>    kmeans <span class="op">=</span> KMeans(n_clusters<span class="op">=</span>feat_output_dim, random_state<span class="op">=</span><span class="dv">42</span>, n_init<span class="op">=</span><span class="dv">100</span>).fit(loading_data)</span>
<span id="cb7-16"><a href="methods.html#cb7-16" tabindex="-1"></a></span>
<span id="cb7-17"><a href="methods.html#cb7-17" tabindex="-1"></a>    <span class="co"># Get cluster assignments for each point</span></span>
<span id="cb7-18"><a href="methods.html#cb7-18" tabindex="-1"></a>    labels <span class="op">=</span> kmeans.labels_</span>
<span id="cb7-19"><a href="methods.html#cb7-19" tabindex="-1"></a>    <span class="cf">for</span> i <span class="kw">in</span> <span class="bu">range</span>(feat_output_dim):</span>
<span id="cb7-20"><a href="methods.html#cb7-20" tabindex="-1"></a>        <span class="bu">exec</span>(<span class="ss">f&#39;f_</span><span class="sc">{</span>i<span class="op">+</span><span class="dv">1</span><span class="sc">}</span><span class="ss">&#39;</span> <span class="op">+</span> <span class="ss">f&#39;= loading_data[labels == </span><span class="sc">{</span>i<span class="sc">}</span><span class="ss">]&#39;</span>)</span>
<span id="cb7-21"><a href="methods.html#cb7-21" tabindex="-1"></a></span>
<span id="cb7-22"><a href="methods.html#cb7-22" tabindex="-1"></a>    f <span class="op">=</span> pd.DataFrame({<span class="st">&#39;features&#39;</span> : df_dsmo_uninfected_remi.columns.values, <span class="st">&#39;cluster&#39;</span>: labels}).groupby(<span class="st">&quot;cluster&quot;</span>).agg(<span class="bu">list</span>)</span>
<span id="cb7-23"><a href="methods.html#cb7-23" tabindex="-1"></a>    column_list <span class="op">=</span> <span class="bu">list</span>(df.columns)</span>
<span id="cb7-24"><a href="methods.html#cb7-24" tabindex="-1"></a>    <span class="cf">for</span> feat <span class="kw">in</span> feature_list:</span>
<span id="cb7-25"><a href="methods.html#cb7-25" tabindex="-1"></a>        column_list.remove(feat)</span>
<span id="cb7-26"><a href="methods.html#cb7-26" tabindex="-1"></a>    new_df <span class="op">=</span> df.loc[:,column_list].copy()</span>
<span id="cb7-27"><a href="methods.html#cb7-27" tabindex="-1"></a>    <span class="co"># new_df = df.loc[:,list(set(df.columns) - set(feature_list))].copy()</span></span>
<span id="cb7-28"><a href="methods.html#cb7-28" tabindex="-1"></a>    <span class="cf">for</span> i, f_list <span class="kw">in</span> <span class="bu">enumerate</span>(f[<span class="st">&#39;features&#39;</span>]):</span>
<span id="cb7-29"><a href="methods.html#cb7-29" tabindex="-1"></a>        new_df[<span class="ss">f&#39;f</span><span class="sc">{</span>i<span class="op">+</span><span class="dv">1</span><span class="sc">}</span><span class="ss">&#39;</span>] <span class="op">=</span> df[f_list].<span class="bu">apply</span>(<span class="kw">lambda</span> x:  x.mean() , axis<span class="op">=</span><span class="dv">1</span>)</span>
<span id="cb7-30"><a href="methods.html#cb7-30" tabindex="-1"></a></span>
<span id="cb7-31"><a href="methods.html#cb7-31" tabindex="-1"></a>    <span class="cf">return</span> new_df, f</span></code></pre></div>
<p>In another approach to dimensionality reduction, each feature’s ability to differentiate between the classes was evaluated. All data points were plotted in a two-dimensional space spanned by that specific feature and one highly related parameter (for instance, feature vs. number of nuclei), and the area of the triangle formed by the class centroids in this space was calculated. For this, the centroids of the three distinct categories (‘Compound’, ‘Uninfected’, ‘Remdesivir’) were calculated for each feature vs. number of nuclei plot. The area of the triangle formed by these centroids and the pairwise distances between them were then computed. This quantifies the separation between the three categories for each feature and helps to find the most descriptive features. Features resulting in larger triangle areas were considered more informative.</p>
<div class="sourceCode" id="cb8"><pre class="sourceCode python"><code class="sourceCode python"><span id="cb8-1"><a href="methods.html#cb8-1" tabindex="-1"></a><span class="im">import</span> math</span>
<span id="cb8-2"><a href="methods.html#cb8-2" tabindex="-1"></a><span class="im">import</span> numpy <span class="im">as</span> np</span>
<span id="cb8-3"><a href="methods.html#cb8-3" tabindex="-1"></a></span>
<span id="cb8-4"><a href="methods.html#cb8-4" tabindex="-1"></a><span class="kw">def</span> area_of_triangle(p1, p2, p3):</span>
<span id="cb8-5"><a href="methods.html#cb8-5" tabindex="-1"></a>    <span class="co"># Calculate the length of each side of the triangle</span></span>
<span id="cb8-6"><a href="methods.html#cb8-6" tabindex="-1"></a>    a <span class="op">=</span> math.sqrt((p2[<span class="dv">0</span>] <span class="op">-</span> p1[<span class="dv">0</span>])<span class="op">**</span><span class="dv">2</span> <span class="op">+</span> (p2[<span class="dv">1</span>] <span class="op">-</span> p1[<span class="dv">1</span>])<span class="op">**</span><span class="dv">2</span>)</span>
<span id="cb8-7"><a href="methods.html#cb8-7" tabindex="-1"></a>    b <span class="op">=</span> math.sqrt((p3[<span class="dv">0</span>] <span class="op">-</span> p2[<span class="dv">0</span>])<span class="op">**</span><span class="dv">2</span> <span class="op">+</span> (p3[<span class="dv">1</span>] <span class="op">-</span> p2[<span class="dv">1</span>])<span class="op">**</span><span class="dv">2</span>)</span>
<span id="cb8-8"><a href="methods.html#cb8-8" tabindex="-1"></a>    c <span class="op">=</span> math.sqrt((p3[<span class="dv">0</span>] <span class="op">-</span> p1[<span class="dv">0</span>])<span class="op">**</span><span class="dv">2</span> <span class="op">+</span> (p3[<span class="dv">1</span>] <span class="op">-</span> p1[<span class="dv">1</span>])<span class="op">**</span><span class="dv">2</span>)</span>
<span id="cb8-9"><a href="methods.html#cb8-9" tabindex="-1"></a>    </span>
<span id="cb8-10"><a href="methods.html#cb8-10" tabindex="-1"></a>    <span class="co"># Calculate the semiperimeter of the triangle</span></span>
<span id="cb8-11"><a href="methods.html#cb8-11" tabindex="-1"></a>    s <span class="op">=</span> (a <span class="op">+</span> b <span class="op">+</span> c) <span class="op">/</span> <span class="dv">2</span></span>
<span id="cb8-12"><a href="methods.html#cb8-12" tabindex="-1"></a>    </span>
<span id="cb8-13"><a href="methods.html#cb8-13" tabindex="-1"></a>    <span class="co"># Calculate the area using Heron&#39;s formula</span></span>
<span id="cb8-14"><a href="methods.html#cb8-14" tabindex="-1"></a>    area <span class="op">=</span> math.sqrt(s <span class="op">*</span> (s <span class="op">-</span> a) <span class="op">*</span> (s <span class="op">-</span> b) <span class="op">*</span> (s <span class="op">-</span> c))</span>
<span id="cb8-15"><a href="methods.html#cb8-15" tabindex="-1"></a>    </span>
<span id="cb8-16"><a href="methods.html#cb8-16" tabindex="-1"></a>    <span class="cf">return</span> area</span>
<span id="cb8-17"><a href="methods.html#cb8-17" tabindex="-1"></a></span>
<span id="cb8-18"><a href="methods.html#cb8-18" tabindex="-1"></a><span class="kw">def</span> distance_between_centroids(centroid1, centroid2):</span>
<span id="cb8-19"><a href="methods.html#cb8-19" tabindex="-1"></a>    <span class="co"># Calculate the distance using the distance formula</span></span>
<span id="cb8-20"><a href="methods.html#cb8-20" tabindex="-1"></a>    distance <span class="op">=</span> np.sqrt((centroid2[<span class="dv">0</span>] <span class="op">-</span> centroid1[<span class="dv">0</span>])<span class="op">**</span><span class="dv">2</span> <span class="op">+</span> (centroid2[<span class="dv">1</span>] <span class="op">-</span> centroid1[<span class="dv">1</span>])<span class="op">**</span><span class="dv">2</span>)</span>
<span id="cb8-21"><a href="methods.html#cb8-21" tabindex="-1"></a>    </span>
<span id="cb8-22"><a href="methods.html#cb8-22" tabindex="-1"></a>    <span class="cf">return</span> distance</span>
<span id="cb8-23"><a href="methods.html#cb8-23" tabindex="-1"></a></span>
<span id="cb8-24"><a href="methods.html#cb8-24" tabindex="-1"></a></span>
<span id="cb8-25"><a href="methods.html#cb8-25" tabindex="-1"></a>scaler <span class="op">=</span> preprocessing.StandardScaler(with_mean<span class="op">=</span><span class="va">True</span>, with_std<span class="op">=</span><span class="va">True</span>)</span>
<span id="cb8-26"><a href="methods.html#cb8-26" tabindex="-1"></a>scaled_df <span class="op">=</span> covid_df.copy()</span>
<span id="cb8-27"><a href="methods.html#cb8-27" tabindex="-1"></a>scaled_df.loc[:, features] <span class="op">=</span> scaler.fit_transform(scaled_df.loc[:, features])</span>
<span id="cb8-28"><a href="methods.html#cb8-28" tabindex="-1"></a></span>
<span id="cb8-29"><a href="methods.html#cb8-29" tabindex="-1"></a>scores <span class="op">=</span> []</span>
<span id="cb8-30"><a href="methods.html#cb8-30" tabindex="-1"></a>comp_remi <span class="op">=</span> []</span>
<span id="cb8-31"><a href="methods.html#cb8-31" tabindex="-1"></a>comp_uni <span class="op">=</span> []</span>
<span id="cb8-32"><a href="methods.html#cb8-32" tabindex="-1"></a>remi_uni <span class="op">=</span> []</span>
<span id="cb8-33"><a href="methods.html#cb8-33" tabindex="-1"></a><span class="cf">for</span> feat <span class="kw">in</span> features[<span class="dv">1</span>:]:</span>
<span id="cb8-34"><a href="methods.html#cb8-34" tabindex="-1"></a>    compound_coords <span class="op">=</span> scaled_df.loc[scaled_df[<span class="st">&#39;label&#39;</span>] <span class="op">==</span> <span class="st">&#39;compound&#39;</span>,[<span class="st">&#39;Count_nuclei&#39;</span>,feat]].values</span>
<span id="cb8-35"><a href="methods.html#cb8-35" tabindex="-1"></a>    uninfected_coords <span class="op">=</span> scaled_df.loc[scaled_df[<span class="st">&#39;label&#39;</span>] <span class="op">==</span> <span class="st">&#39;Uninfected&#39;</span>,[<span class="st">&#39;Count_nuclei&#39;</span>,feat]].values</span>
<span id="cb8-36"><a href="methods.html#cb8-36" tabindex="-1"></a>    remidesivir_coords <span class="op">=</span> scaled_df.loc[scaled_df[<span class="st">&#39;label&#39;</span>] <span class="op">==</span> <span class="st">&#39;Remdesivir&#39;</span>,[<span class="st">&#39;Count_nuclei&#39;</span>,feat]].values</span>
<span id="cb8-37"><a href="methods.html#cb8-37" tabindex="-1"></a></span>
<span id="cb8-38"><a href="methods.html#cb8-38" tabindex="-1"></a>    compound_centroid <span class="op">=</span> np.mean(compound_coords, axis<span class="op">=</span><span class="dv">0</span>)</span>
<span id="cb8-39"><a href="methods.html#cb8-39" tabindex="-1"></a>    uninfected_centroid <span class="op">=</span> np.mean(uninfected_coords, axis<span class="op">=</span><span class="dv">0</span>)</span>
<span id="cb8-40"><a href="methods.html#cb8-40" tabindex="-1"></a>    remidesivir_centroid <span class="op">=</span> np.mean(remidesivir_coords, axis<span class="op">=</span><span class="dv">0</span>)</span>
<span id="cb8-41"><a href="methods.html#cb8-41" tabindex="-1"></a></span>
<span id="cb8-42"><a href="methods.html#cb8-42" tabindex="-1"></a>    area <span class="op">=</span> area_of_triangle(compound_centroid, uninfected_centroid, remidesivir_centroid)</span>
<span id="cb8-43"><a href="methods.html#cb8-43" tabindex="-1"></a>    comp_remi_dist <span class="op">=</span>  distance_between_centroids(compound_centroid, remidesivir_centroid)</span>
<span id="cb8-44"><a href="methods.html#cb8-44" tabindex="-1"></a>    comp_uni_dist <span class="op">=</span>  distance_between_centroids(compound_centroid, uninfected_centroid)</span>
<span id="cb8-45"><a href="methods.html#cb8-45" tabindex="-1"></a>    remi_uni_dist <span class="op">=</span>  distance_between_centroids(remidesivir_centroid, uninfected_centroid)</span>
<span id="cb8-46"><a href="methods.html#cb8-46" tabindex="-1"></a>    scores.append(area)</span>
<span id="cb8-47"><a href="methods.html#cb8-47" tabindex="-1"></a>    comp_remi.append(comp_remi_dist)</span>
<span id="cb8-48"><a href="methods.html#cb8-48" tabindex="-1"></a>    comp_uni.append(comp_uni_dist)</span>
<span id="cb8-49"><a href="methods.html#cb8-49" tabindex="-1"></a>    remi_uni.append(remi_uni_dist)</span>
<span id="cb8-50"><a href="methods.html#cb8-50" tabindex="-1"></a></span>
<span id="cb8-51"><a href="methods.html#cb8-51" tabindex="-1"></a>feat_score <span class="op">=</span> pd.DataFrame({<span class="st">&#39;feat&#39;</span>: features[<span class="dv">1</span>:],</span>
<span id="cb8-52"><a href="methods.html#cb8-52" tabindex="-1"></a>                           <span class="st">&#39;score&#39;</span>: scores,</span>
<span id="cb8-53"><a href="methods.html#cb8-53" tabindex="-1"></a>                           <span class="st">&#39;comp_remi&#39;</span>:comp_remi,</span>
<span id="cb8-54"><a href="methods.html#cb8-54" tabindex="-1"></a>                           <span class="st">&#39;comp_uni&#39;</span>:comp_uni,</span>
<span id="cb8-55"><a href="methods.html#cb8-55" tabindex="-1"></a>                           <span class="st">&#39;remi_uni&#39;</span>: remi_uni})</span></code></pre></div>
<p>The features were ranked by the calculated area, which serves as a measure of separation between the categories, and the top 50 features were selected. Among these top 50 features, all those annotated with MITO were removed because they bias our model. As a result, 16 features remain as our selected features.</p>
<div class="sourceCode" id="cb9"><pre class="sourceCode python"><code class="sourceCode python"><span id="cb9-1"><a href="methods.html#cb9-1" tabindex="-1"></a><span class="bu">list</span>(feat_score.sort_values(by<span class="op">=</span>[<span class="st">&quot;score&quot;</span>], ascending<span class="op">=</span><span class="va">False</span>).head(<span class="dv">50</span>)[<span class="st">&#39;feat&#39;</span>])</span></code></pre></div>
<p>Finally, PCA has been performed again, this time exclusively on the selected features.</p>
<div style="text-align: center;">
<figure>
<img src="assets/pca_16.png" alt="PCA of the 16 selected features" style="width: 100%; height: auto;"/>
</figure>
</div>
<p style="text-align: center;"><a href="https://pharmbio.github.io/nw-cp/assets/pca_16.html">View the interactive PCA plot</a></p>
<p>The PCA results demonstrate that 10 principal components can now explain a very high percentage (99.91%) of the variance. The share of variance described by the first principal component improved from 75.1 percent to 93.0 percent. This indicates that the selected features capture most of the data variability. The value of PC1 was used as the target value for the regression models.</p>
<div style="text-align: center;">
<figure>
<img src="assets/pca_2.png" alt="PCA results for the selected features" style="width: 90%; height: auto;"/>
</figure>
</div>
</div>
<div id="development-of-a-binary-classification-of-data" class="section level4 hasAnchor" number="2.1.1.3">
<h4><span class="header-section-number">2.1.1.3</span> Development of a binary classification of data<a href="methods.html#development-of-a-binary-classification-of-data" class="anchor-section" aria-label="Anchor link to header"></a></h4>
<p>In this part, Kernel Density Estimation (KDE) and Empirical Confidence Regions (2D confidence intervals based on a binned kernel density estimate) were utilized to inform the development of a binary classification model for identifying active and inactive compounds based on their PCA1 values.</p>
<p>Firstly, a two-dimensional KDE was performed on the compounds’ PCA1 and PCA2 values. This KDE plot provided a comprehensive understanding of the distribution of data points in the PCA space. It highlighted two distinct clusters corresponding to active and inactive compounds.</p>
<p>The KDE plot was further enhanced by overlaying empirical confidence regions on it. These regions were derived from the mean and covariance of the PCA1 and PCA2 values for each cluster. Ellipses with semi-axes of 2.146 standard deviations (roughly two standard deviations) along each principal axis were used, approximating 90% confidence regions for the location of the true mean of each cluster in the PCA space.</p>
<div class="sourceCode" id="cb10"><pre class="sourceCode python"><code class="sourceCode python"><span id="cb10-1"><a href="methods.html#cb10-1" tabindex="-1"></a><span class="im">import</span> numpy <span class="im">as</span> np</span>
<span id="cb10-2"><a href="methods.html#cb10-2" tabindex="-1"></a><span class="im">import</span> seaborn <span class="im">as</span> sns</span>
<span id="cb10-3"><a href="methods.html#cb10-3" tabindex="-1"></a><span class="im">import</span> matplotlib.pyplot <span class="im">as</span> plt</span>
<span id="cb10-4"><a href="methods.html#cb10-4" tabindex="-1"></a><span class="im">from</span> matplotlib.patches <span class="im">import</span> Ellipse</span>
<span id="cb10-5"><a href="methods.html#cb10-5" tabindex="-1"></a></span>
<span id="cb10-6"><a href="methods.html#cb10-6" tabindex="-1"></a>mean1 <span class="op">=</span> df_cluster1[[<span class="st">&#39;PC1&#39;</span>, <span class="st">&#39;PC2&#39;</span>]].mean()</span>
<span id="cb10-7"><a href="methods.html#cb10-7" tabindex="-1"></a>cov1 <span class="op">=</span> df_cluster1[[<span class="st">&#39;PC1&#39;</span>, <span class="st">&#39;PC2&#39;</span>]].cov()</span>
<span id="cb10-8"><a href="methods.html#cb10-8" tabindex="-1"></a></span>
<span id="cb10-9"><a href="methods.html#cb10-9" tabindex="-1"></a>mean2 <span class="op">=</span> df_cluster2[[<span class="st">&#39;PC1&#39;</span>, <span class="st">&#39;PC2&#39;</span>]].mean()</span>
<span id="cb10-10"><a href="methods.html#cb10-10" tabindex="-1"></a>cov2 <span class="op">=</span> df_cluster2[[<span class="st">&#39;PC1&#39;</span>, <span class="st">&#39;PC2&#39;</span>]].cov()</span>
<span id="cb10-11"><a href="methods.html#cb10-11" tabindex="-1"></a></span>
<span id="cb10-12"><a href="methods.html#cb10-12" tabindex="-1"></a>fig, ax <span class="op">=</span> plt.subplots(figsize<span class="op">=</span>(<span class="dv">7</span>, <span class="dv">4</span>))</span>
<span id="cb10-13"><a href="methods.html#cb10-13" tabindex="-1"></a><span class="co"># Draw the KDE plot</span></span>
<span id="cb10-14"><a href="methods.html#cb10-14" tabindex="-1"></a>sns.kdeplot(data<span class="op">=</span>df_clust, x<span class="op">=</span><span class="st">&#39;PC1&#39;</span>, y<span class="op">=</span><span class="st">&#39;PC2&#39;</span>, fill<span class="op">=</span><span class="va">True</span>, ax<span class="op">=</span>ax)</span>
<span id="cb10-15"><a href="methods.html#cb10-15" tabindex="-1"></a></span>
<span id="cb10-16"><a href="methods.html#cb10-16" tabindex="-1"></a><span class="co"># Draw the confidence ellipses</span></span>
<span id="cb10-17"><a href="methods.html#cb10-17" tabindex="-1"></a><span class="cf">for</span> mean, cov <span class="kw">in</span> [(mean1, cov1), (mean2, cov2)]:</span>
<span id="cb10-18"><a href="methods.html#cb10-18" tabindex="-1"></a>    eigenvalues, eigenvectors <span class="op">=</span> np.linalg.eigh(cov)</span>
<span id="cb10-19"><a href="methods.html#cb10-19" tabindex="-1"></a>    order <span class="op">=</span> eigenvalues.argsort()[::<span class="op">-</span><span class="dv">1</span>]</span>
<span id="cb10-20"><a href="methods.html#cb10-20" tabindex="-1"></a>    eigenvalues, eigenvectors <span class="op">=</span> eigenvalues[order], eigenvectors[:, order]</span>
<span id="cb10-21"><a href="methods.html#cb10-21" tabindex="-1"></a>    vx, vy <span class="op">=</span> eigenvectors[:,<span class="dv">0</span>]</span>
<span id="cb10-22"><a href="methods.html#cb10-22" tabindex="-1"></a>    theta <span class="op">=</span> np.arctan2(vy, vx)</span>
<span id="cb10-23"><a href="methods.html#cb10-23" tabindex="-1"></a></span>
<span id="cb10-24"><a href="methods.html#cb10-24" tabindex="-1"></a>    <span class="co"># Draw a 2*2.146 ellipse for 90 % CI</span></span>
<span id="cb10-25"><a href="methods.html#cb10-25" tabindex="-1"></a>    ellipse <span class="op">=</span> Ellipse(xy<span class="op">=</span>mean, width<span class="op">=</span><span class="dv">2</span><span class="op">*</span><span class="fl">2.146</span><span class="op">*</span>np.sqrt(eigenvalues[<span class="dv">0</span>]), </span>
<span id="cb10-26"><a href="methods.html#cb10-26" tabindex="-1"></a>                      height<span class="op">=</span><span class="dv">2</span><span class="op">*</span><span class="fl">2.146</span><span class="op">*</span>np.sqrt(eigenvalues[<span class="dv">1</span>]), </span>
<span id="cb10-27"><a href="methods.html#cb10-27" tabindex="-1"></a>                      angle<span class="op">=</span>np.degrees(theta), edgecolor<span class="op">=</span><span class="st">&#39;red&#39;</span>, </span>
<span id="cb10-28"><a href="methods.html#cb10-28" tabindex="-1"></a>                      facecolor<span class="op">=</span><span class="st">&#39;none&#39;</span>)</span>
<span id="cb10-29"><a href="methods.html#cb10-29" tabindex="-1"></a>    ax.add_patch(ellipse)</span>
<span id="cb10-30"><a href="methods.html#cb10-30" tabindex="-1"></a></span>
<span id="cb10-31"><a href="methods.html#cb10-31" tabindex="-1"></a>plt.show()</span></code></pre></div>
<div style="text-align: center;">
<figure>
<img src="assets/KDE_ECR.png" alt="Two-dimensional KDE plot of PC1 versus PC2 with confidence ellipses for the two clusters" style="width: 70%; height: auto;"/>
</figure>
</div>
<p>The intersection of these confidence regions was then examined. The PCA1 value of 5 at this intersection was hypothesized to be an effective threshold for binary classification of compounds. Any compound with a PCA1 value greater than this threshold was classified as ‘active’, and any compound with a PCA1 value less than this threshold was classified as ‘inactive’.</p>
<p>Importantly, this approach allowed the study not only to estimate a suitable classification threshold but also to visualize the uncertainty around this threshold and the potential overlap between the two classes. The method provided a data-driven way to set the classification threshold and offered insights into the inherent complexity of the data.</p>
<div style="text-align: center;">
<figure>
<img src="assets/ECIRegion_R.png" alt="Empirical confidence regions in the PCA space" style="width: 80%; height: auto;"/>
</figure>
</div>
<p>It should be noted that the assumptions of the Gaussian distribution and independent identically distributed data inherent to this method may not hold in all cases. Therefore, the results should be interpreted with caution. Further, the use of PCA1 values alone for classification may oversimplify the problem if the active and inactive compounds differ along other principal components as well. Therefore, additional analyses are recommended to validate and refine this binary classification model.</p>
</div>
</div>
<div id="compound-protein-and-pathway-data-aggregation" class="section level3 hasAnchor" number="2.1.2">
<h3><span class="header-section-number">2.1.2</span> Compound, Protein and Pathway Data Aggregation<a href="methods.html#compound-protein-and-pathway-data-aggregation" class="anchor-section" aria-label="Anchor link to header"></a></h3>
<p>In the cell profiling data, the Simplified Molecular Input Line Entry System (SMILES) was used to represent the chemical structures. The COVID-19 cell painting experiments that were carried out involved cell perturbations with over 5000 compounds. To uncover potential information regarding the protein binding capabilities of these compounds and the pathways and assays in which they are active, a widely recognized cross-reference annotation was needed.</p>
<p>The PubChem Chemical ID (CID) serves as an exhaustive cross-reference annotation for chemicals. The initial step involved the determination of the CIDs for all the chemical compounds. These identifiers were subsequently utilized for the aggregation of additional protein and pathway data. This approach facilitated finding the activity of the compounds within the biological systems.</p>
<div class="sourceCode" id="cb11"><pre class="sourceCode python"><code class="sourceCode python"><span id="cb11-1"><a href="methods.html#cb11-1" tabindex="-1"></a><span class="im">import</span> pubchempy <span class="im">as</span> pcp</span>
<span id="cb11-2"><a href="methods.html#cb11-2" tabindex="-1"></a></span>
<span id="cb11-3"><a href="methods.html#cb11-3" tabindex="-1"></a>chemical_smiles <span class="op">=</span> <span class="bu">list</span>(df[<span class="st">&#39;smiles&#39;</span>].values)</span>
<span id="cb11-4"><a href="methods.html#cb11-4" tabindex="-1"></a></span>
<span id="cb11-5"><a href="methods.html#cb11-5" tabindex="-1"></a>cids <span class="op">=</span> []</span>
<span id="cb11-6"><a href="methods.html#cb11-6" tabindex="-1"></a><span class="cf">for</span> smiles <span class="kw">in</span> chemical_smiles:</span>
<span id="cb11-7"><a href="methods.html#cb11-7" tabindex="-1"></a>    <span class="cf">try</span>:</span>
<span id="cb11-8"><a href="methods.html#cb11-8" tabindex="-1"></a>        c <span class="op">=</span> pcp.get_compounds(smiles, <span class="st">&#39;smiles&#39;</span>)</span>
<span id="cb11-9"><a href="methods.html#cb11-9" tabindex="-1"></a>        <span class="cf">if</span> c:</span>
<span id="cb11-10"><a href="methods.html#cb11-10" tabindex="-1"></a>           cids.append(c[<span class="dv">0</span>].cid)</span>
<span id="cb11-11"><a href="methods.html#cb11-11" tabindex="-1"></a>        <span class="cf">else</span>:</span>
<span id="cb11-12"><a href="methods.html#cb11-12" tabindex="-1"></a>            <span class="bu">print</span>(<span class="ss">f&#39;No compound found for SMILES: </span><span class="sc">{</span>smiles<span class="sc">}</span><span class="ss">&#39;</span>)</span>
<span id="cb11-13"><a href="methods.html#cb11-13" tabindex="-1"></a>    <span class="cf">except</span> <span class="pp">Exception</span> <span class="im">as</span> e:</span>
<span id="cb11-14"><a href="methods.html#cb11-14" tabindex="-1"></a>        <span class="bu">print</span>(<span class="ss">f&#39;Error occurred: </span><span class="sc">{</span>e<span class="sc">}</span><span class="ss">&#39;</span>)</span></code></pre></div>
<p>The COVID-19 dataset consisted of compounds screened for potential activity against SARS-CoV-2. To analyze the associations between these compounds and proteins, an auxiliary dataset, sourced from the STITCH database, was utilized. The STITCH database provides information about interactions between chemicals and proteins. These connections were then indexed and collated to create a list of compounds and their associations with proteins.</p>
<p>Further, the connection between chemical compounds and their corresponding biological pathways was established through the following steps:</p>
<p>Initially, assay summaries for a selection of compounds were obtained from the PubChem database. These summaries provided information about the various biological assays performed on the compounds, with particular emphasis on the target gene IDs that interacted with the compounds during these assays. Only the assays reporting an ‘Active’ outcome and a non-empty target gene ID were retained for further analysis.</p>
<div class="sourceCode" id="cb12"><pre class="sourceCode python"><code class="sourceCode python"><span id="cb12-1"><a href="methods.html#cb12-1" tabindex="-1"></a><span class="im">from</span> io <span class="im">import</span> StringIO</span>
<span id="cb12-2"><a href="methods.html#cb12-2" tabindex="-1"></a><span class="im">import</span> polars <span class="im">as</span> pl</span>
<span id="cb12-3"><a href="methods.html#cb12-3" tabindex="-1"></a><span class="im">import</span> pubchempy <span class="im">as</span> pcp</span>
<span id="cb12-4"><a href="methods.html#cb12-4" tabindex="-1"></a></span>
<span id="cb12-5"><a href="methods.html#cb12-5" tabindex="-1"></a>comp_gid <span class="op">=</span> pl.read_csv(<span class="st">&#39;data/comp_gid.tsv&#39;</span>, separator<span class="op">=</span><span class="st">&#39;</span><span class="ch">\t</span><span class="st">&#39;</span>)</span>
<span id="cb12-6"><a href="methods.html#cb12-6" tabindex="-1"></a>cids <span class="op">=</span> comp_gid.select([<span class="st">&#39;pubchem_cid&#39;</span>]).to_series().to_list()</span>
<span id="cb12-7"><a href="methods.html#cb12-7" tabindex="-1"></a></span>
<span id="cb12-8"><a href="methods.html#cb12-8" tabindex="-1"></a>cid_genid_df <span class="op">=</span> pl.DataFrame(</span>
<span id="cb12-9"><a href="methods.html#cb12-9" tabindex="-1"></a>  schema<span class="op">=</span>{<span class="st">&#39;CID&#39;</span>: pl.Int64, <span class="st">&#39;AID&#39;</span>: pl.Int64, <span class="st">&#39;Target GeneID&#39;</span>: pl.Utf8, <span class="st">&#39;Activity Value [uM]&#39;</span>: pl.Float64, <span class="st">&#39;Assay Name&#39;</span>: pl.Utf8}</span>
<span id="cb12-10"><a href="methods.html#cb12-10" tabindex="-1"></a>  )</span>
<span id="cb12-11"><a href="methods.html#cb12-11" tabindex="-1"></a>  </span>
<span id="cb12-12"><a href="methods.html#cb12-12" tabindex="-1"></a><span class="cf">for</span> cid <span class="kw">in</span> cids[:<span class="dv">10</span>]:</span>
<span id="cb12-13"><a href="methods.html#cb12-13" tabindex="-1"></a>    <span class="cf">try</span>:</span>
<span id="cb12-14"><a href="methods.html#cb12-14" tabindex="-1"></a>        csvStringIO <span class="op">=</span> StringIO(pcp.get(cid, operation<span class="op">=</span><span class="st">&#39;assaysummary&#39;</span>, output<span class="op">=</span><span class="st">&#39;CSV&#39;</span>).decode(<span class="st">&quot;utf-8&quot;</span>))</span>
<span id="cb12-15"><a href="methods.html#cb12-15" tabindex="-1"></a>        dictdf <span class="op">=</span> pl.read_csv(csvStringIO, dtypes<span class="op">=</span>{<span class="st">&#39;Activity Value [uM]&#39;</span>: pl.Float64})</span>
<span id="cb12-16"><a href="methods.html#cb12-16" tabindex="-1"></a>        ciddf <span class="op">=</span> dictdf.<span class="bu">filter</span>(</span>
<span id="cb12-17"><a href="methods.html#cb12-17" tabindex="-1"></a>          (pl.col(<span class="st">&quot;Activity Outcome&quot;</span>) <span class="op">==</span> <span class="st">&quot;Active&quot;</span>) <span class="op">&amp;</span> (pl.col(<span class="st">&quot;Target GeneID&quot;</span>) <span class="op">!=</span> <span class="st">&quot;&quot;</span>)</span>
<span id="cb12-18"><a href="methods.html#cb12-18" tabindex="-1"></a>          ).unique(</span>
<span id="cb12-19"><a href="methods.html#cb12-19" tabindex="-1"></a>            subset<span class="op">=</span><span class="st">&#39;Target GeneID&#39;</span></span>
<span id="cb12-20"><a href="methods.html#cb12-20" tabindex="-1"></a>            ).select(</span>
<span id="cb12-21"><a href="methods.html#cb12-21" tabindex="-1"></a>              [<span class="st">&#39;CID&#39;</span>, <span class="st">&#39;AID&#39;</span>, <span class="st">&#39;Target GeneID&#39;</span>, <span class="st">&#39;Activity Value [uM]&#39;</span>,  <span class="st">&#39;Assay Name&#39;</span>]</span>
<span id="cb12-22"><a href="methods.html#cb12-22" tabindex="-1"></a>              )</span>
<span id="cb12-23"><a href="methods.html#cb12-23" tabindex="-1"></a>    <span class="cf">except</span>:</span>
<span id="cb12-24"><a href="methods.html#cb12-24" tabindex="-1"></a>        ciddf <span class="op">=</span> pl.DataFrame(</span>
<span id="cb12-25"><a href="methods.html#cb12-25" tabindex="-1"></a>          schema<span class="op">=</span>{<span class="st">&#39;CID&#39;</span>: pl.Int64, <span class="st">&#39;AID&#39;</span>: pl.Int64, <span class="st">&#39;Target GeneID&#39;</span>: pl.Utf8, <span class="st">&#39;Activity Value [uM]&#39;</span>: pl.Utf8, <span class="st">&#39;Assay Name&#39;</span>: pl.Utf8}</span>
<span id="cb12-26"><a href="methods.html#cb12-26" tabindex="-1"></a>          )</span>
<span id="cb12-27"><a href="methods.html#cb12-27" tabindex="-1"></a>          </span>
<span id="cb12-28"><a href="methods.html#cb12-28" tabindex="-1"></a>    cid_genid_df <span class="op">=</span> cid_genid_df.vstack(ciddf)</span>
<span id="cb12-29"><a href="methods.html#cb12-29" tabindex="-1"></a></span>
<span id="cb12-30"><a href="methods.html#cb12-30" tabindex="-1"></a>cid_genid_df.write_csv(<span class="st">&#39;comp_geneid.tsv&#39;</span>, separator<span class="op">=</span><span class="st">&#39;</span><span class="ch">\t</span><span class="st">&#39;</span>)</span></code></pre></div>
<p>Subsequently, the list of unique target gene IDs was used to retrieve the associated biological pathways from the PubChem database. These pathway annotations, sourced from the WikiPathways database, link each gene ID to one or more biological pathways.</p>
<div class="sourceCode" id="cb13"><pre class="sourceCode python"><code class="sourceCode python"><span id="cb13-1"><a href="methods.html#cb13-1" tabindex="-1"></a><span class="im">import</span> polars <span class="im">as</span> pl</span>
<span id="cb13-2"><a href="methods.html#cb13-2" tabindex="-1"></a><span class="im">import</span> pubchempy <span class="im">as</span> pcp</span>
<span id="cb13-3"><a href="methods.html#cb13-3" tabindex="-1"></a></span>
<span id="cb13-4"><a href="methods.html#cb13-4" tabindex="-1"></a>gene_id_list <span class="op">=</span> cid_genid_df.select(pl.col(<span class="st">&#39;Target GeneID&#39;</span>).cast(pl.Int64, strict<span class="op">=</span><span class="va">True</span>)).unique(subset<span class="op">=</span><span class="st">&#39;Target GeneID&#39;</span>, maintain_order<span class="op">=</span><span class="va">True</span>).to_series().to_list()</span>
<span id="cb13-5"><a href="methods.html#cb13-5" tabindex="-1"></a>genid_wpw_df <span class="op">=</span> pl.DataFrame(schema<span class="op">=</span>{<span class="st">&#39;Target GeneID&#39;</span>: pl.Object, <span class="st">&#39;Wiki Pathway&#39;</span>: pl.Utf8})</span>
<span id="cb13-6"><a href="methods.html#cb13-6" tabindex="-1"></a></span>
<span id="cb13-7"><a href="methods.html#cb13-7" tabindex="-1"></a><span class="cf">for</span> gene_id <span class="kw">in</span> gene_id_list:</span>
<span id="cb13-8"><a href="methods.html#cb13-8" tabindex="-1"></a>    <span class="cf">try</span>:</span>
<span id="cb13-9"><a href="methods.html#cb13-9" tabindex="-1"></a>        ptw_list <span class="op">=</span> pcp.get_json(gene_id, namespace<span class="op">=</span><span class="st">&#39;geneid&#39;</span>, domain<span class="op">=</span><span class="st">&#39;gene&#39;</span>, operation<span class="op">=</span><span class="st">&#39;pwaccs&#39;</span>)[<span class="st">&#39;InformationList&#39;</span>][<span class="st">&#39;Information&#39;</span>][<span class="dv">0</span>][<span class="st">&#39;PathwayAccession&#39;</span>]</span>
<span id="cb13-10"><a href="methods.html#cb13-10" tabindex="-1"></a>        wp <span class="op">=</span> [i[<span class="dv">13</span>:] <span class="cf">for</span> i <span class="kw">in</span> <span class="bu">list</span>(<span class="bu">filter</span>(<span class="kw">lambda</span> x: <span class="st">&#39;WikiPathways&#39;</span> <span class="kw">in</span> x, ptw_list))]</span>
<span id="cb13-11"><a href="methods.html#cb13-11" tabindex="-1"></a>        temp_df <span class="op">=</span> pl.DataFrame({<span class="st">&#39;Target GeneID&#39;</span>:gene_id, <span class="st">&#39;Wiki Pathway&#39;</span>: wp})</span>
<span id="cb13-12"><a href="methods.html#cb13-12" tabindex="-1"></a>    <span class="cf">except</span>:</span>
<span id="cb13-13"><a href="methods.html#cb13-13" tabindex="-1"></a>        temp_df <span class="op">=</span> pl.DataFrame({<span class="st">&#39;Target GeneID&#39;</span>:gene_id, <span class="st">&#39;Wiki Pathway&#39;</span>: <span class="st">&#39;&#39;</span>})</span>
<span id="cb13-14"><a href="methods.html#cb13-14" tabindex="-1"></a>    genid_wpw_df <span class="op">=</span> genid_wpw_df.vstack(temp_df)</span>
<span id="cb13-15"><a href="methods.html#cb13-15" tabindex="-1"></a>    </span>
<span id="cb13-16"><a href="methods.html#cb13-16" tabindex="-1"></a></span>
<span id="cb13-17"><a href="methods.html#cb13-17" tabindex="-1"></a>genid_wpw_df.write_csv(<span class="st">&#39;geneid_wpw.tsv&#39;</span>, separator<span class="op">=</span><span class="st">&#39;</span><span class="ch">\t</span><span class="st">&#39;</span>)</span></code></pre></div>
<p>The gene IDs fetched from these databases can be converted to STITCH database annotations (Ensembl protein IDs) using the g:Profiler conversion API hosted by BIIT:</p>
<div class="sourceCode" id="cb14"><pre class="sourceCode python"><code class="sourceCode python"><span id="cb14-1"><a href="methods.html#cb14-1" tabindex="-1"></a><span class="im">import</span> requests</span>
<span id="cb14-2"><a href="methods.html#cb14-2" tabindex="-1"></a></span>
<span id="cb14-3"><a href="methods.html#cb14-3" tabindex="-1"></a>r <span class="op">=</span> requests.post(</span>
<span id="cb14-4"><a href="methods.html#cb14-4" tabindex="-1"></a>    url<span class="op">=</span><span class="st">&#39;https://biit.cs.ut.ee/gprofiler/api/convert/convert/&#39;</span>,</span>
<span id="cb14-5"><a href="methods.html#cb14-5" tabindex="-1"></a>    json<span class="op">=</span>{</span>
<span id="cb14-6"><a href="methods.html#cb14-6" tabindex="-1"></a>        <span class="st">&#39;organism&#39;</span>:<span class="st">&#39;hsapiens&#39;</span>,</span>
<span id="cb14-7"><a href="methods.html#cb14-7" tabindex="-1"></a>        <span class="st">&#39;target&#39;</span>:<span class="st">&#39;ENSP&#39;</span>,</span>
<span id="cb14-8"><a href="methods.html#cb14-8" tabindex="-1"></a>        <span class="st">&#39;query&#39;</span>:gene_id_list,</span>
<span id="cb14-9"><a href="methods.html#cb14-9" tabindex="-1"></a>        <span class="st">&#39;numeric_namespace&#39;</span>: <span class="st">&#39;ENTREZGENE_ACC&#39;</span></span>
<span id="cb14-10"><a href="methods.html#cb14-10" tabindex="-1"></a>    }</span>
<span id="cb14-11"><a href="methods.html#cb14-11" tabindex="-1"></a>    )</span>
<span id="cb14-12"><a href="methods.html#cb14-12" tabindex="-1"></a>pl.DataFrame(r.json()[<span class="st">&#39;result&#39;</span>]).select([<span class="st">&#39;incoming&#39;</span>, <span class="st">&#39;converted&#39;</span>, <span class="st">&#39;name&#39;</span>, <span class="st">&#39;description&#39;</span>])</span></code></pre></div>
<p>Hence, a linkage from chemical compounds to biological pathways was constructed by associating compounds with target gene IDs from assays, and then connecting these gene IDs to biological pathways. This method of data preparation facilitated the exploration of potential mechanisms of action of compounds. It also provided an understanding of the biological processes potentially influenced by these compounds. It should be noted, however, that this is a simplified representation of the actual biological interactions, which are inherently more complex.</p>
<p>These curated lists of compounds, proteins, and pathways served as the foundation for constructing a multimodal graph. In this graph, the nodes represent compounds, proteins, and pathways while the edges depict the connections between them.</p>
</div>
<div id="featurizing-the-biomedical-entities" class="section level3 hasAnchor" number="2.1.3">
<h3><span class="header-section-number">2.1.3</span> Featurizing the Biomedical Entities<a href="methods.html#featurizing-the-biomedical-entities" class="anchor-section" aria-label="Anchor link to header"></a></h3>
<p>In machine learning, featurization converts biomedical entities such as chemicals and proteins, which are often three-dimensional objects, into a format that can be understood and processed by algorithms. Essentially, it involves encoding their structure and properties as numerical vectors.</p>
<p>This step is necessary because machine learning algorithms work with numerical data rather than understanding biological structures and properties directly. Featurization is accomplished through a variety of algorithms and approaches, which are discussed in the following sections.</p>
<div id="featurizing-compounds" class="section level4 hasAnchor" number="2.1.3.1">
<h4><span class="header-section-number">2.1.3.1</span> Featurizing Compounds<a href="methods.html#featurizing-compounds" class="anchor-section" aria-label="Anchor link to header"></a></h4>
<p>Starting with MACCS fingerprints, these are binary representations of a molecule based on the presence or absence of 167 predefined structural fragments. MACCS fingerprints are popular due to their simplicity, interpretability, and effectiveness at capturing structural information.</p>
<div class="sourceCode" id="cb15"><pre class="sourceCode python"><code class="sourceCode python"><span id="cb15-1"><a href="methods.html#cb15-1" tabindex="-1"></a><span class="im">import</span> deepchem <span class="im">as</span> dc</span>
<span id="cb15-2"><a href="methods.html#cb15-2" tabindex="-1"></a></span>
<span id="cb15-3"><a href="methods.html#cb15-3" tabindex="-1"></a>feat <span class="op">=</span> dc.feat.MACCSKeysFingerprint()</span>
<span id="cb15-4"><a href="methods.html#cb15-4" tabindex="-1"></a>maccs_fp <span class="op">=</span> feat.featurize(smiles)</span></code></pre></div>
<p>Morgan fingerprints, also known as circular fingerprints, are another type of molecular descriptor. They are generated by iteratively hashing the environments of atoms in a molecule. These fingerprints are characterized by their flexibility, as their radius and length can be adjusted. This allows for various levels of specificity in the representation of molecular structures.</p>
<div class="sourceCode" id="cb16"><pre class="sourceCode python"><code class="sourceCode python"><span id="cb16-1"><a href="methods.html#cb16-1" tabindex="-1"></a><span class="im">import</span> deepchem <span class="im">as</span> dc</span>
<span id="cb16-2"><a href="methods.html#cb16-2" tabindex="-1"></a></span>
<span id="cb16-3"><a href="methods.html#cb16-3" tabindex="-1"></a>feat <span class="op">=</span> dc.feat.CircularFingerprint(size<span class="op">=</span><span class="dv">2048</span>, radius<span class="op">=</span><span class="dv">1</span>)</span>
<span id="cb16-4"><a href="methods.html#cb16-4" tabindex="-1"></a>morgan_fp <span class="op">=</span> feat.featurize(smiles)</span></code></pre></div>
<p>PubChem fingerprints are binary fingerprints consisting of 881 bits, each representing a particular chemical substructure or pattern. They were specifically designed for use with the PubChem database and provide detailed chemical structure encoding.</p>
<div class="sourceCode" id="cb17"><pre class="sourceCode python"><code class="sourceCode python"><span id="cb17-1"><a href="methods.html#cb17-1" tabindex="-1"></a><span class="im">import</span> pubchempy <span class="im">as</span> pcp</span>
<span id="cb17-2"><a href="methods.html#cb17-2" tabindex="-1"></a></span>
<span id="cb17-3"><a href="methods.html#cb17-3" tabindex="-1"></a>cids <span class="op">=</span> <span class="bu">list</span>(df.pubchem_cid[df[<span class="st">&#39;label&#39;</span>]<span class="op">==</span><span class="st">&#39;compound&#39;</span>])</span>
<span id="cb17-4"><a href="methods.html#cb17-4" tabindex="-1"></a></span>
<span id="cb17-5"><a href="methods.html#cb17-5" tabindex="-1"></a>bit_list <span class="op">=</span> []</span>
<span id="cb17-6"><a href="methods.html#cb17-6" tabindex="-1"></a><span class="cf">for</span> cid <span class="kw">in</span> tqdm(cids):</span>
<span id="cb17-7"><a href="methods.html#cb17-7" tabindex="-1"></a>    <span class="cf">try</span>:</span>
<span id="cb17-8"><a href="methods.html#cb17-8" tabindex="-1"></a>        pubchem_compound <span class="op">=</span>pcp.get_compounds(cid)[<span class="dv">0</span>]</span>
<span id="cb17-9"><a href="methods.html#cb17-9" tabindex="-1"></a>        pubchem_fp <span class="op">=</span> [<span class="bu">int</span>(bit) <span class="cf">for</span> bit <span class="kw">in</span> pubchem_compound.cactvs_fingerprint]</span>
<span id="cb17-10"><a href="methods.html#cb17-10" tabindex="-1"></a>        bit_list.append(pubchem_fp)</span>
<span id="cb17-11"><a href="methods.html#cb17-11" tabindex="-1"></a>    <span class="cf">except</span>:</span>
<span id="cb17-12"><a href="methods.html#cb17-12" tabindex="-1"></a>        <span class="bu">print</span>(<span class="ss">f&#39;No PubChem FP found for </span><span class="sc">{</span>cid<span class="sc">}</span><span class="ss">&#39;</span>)</span>
<span id="cb17-13"><a href="methods.html#cb17-13" tabindex="-1"></a>        bit_list.append([])</span>
<span id="cb17-14"><a href="methods.html#cb17-14" tabindex="-1"></a>        </span>
<span id="cb17-15"><a href="methods.html#cb17-15" tabindex="-1"></a>pc_fp <span class="op">=</span> np.asarray(bit_list)</span></code></pre></div>
<p>The Mol2Vec fingerprint is inspired by the Word2Vec algorithm in Natural Language Processing. It treats molecules as sentences and their substructures (identified with the Morgan algorithm) as words, thus converting molecules into continuous vectors. This technique captures not only the presence of particular substructures but also their context within the molecule, providing a more nuanced representation.</p>
<div class="sourceCode" id="cb18"><pre class="sourceCode python"><code class="sourceCode python"><span id="cb18-1"><a href="methods.html#cb18-1" tabindex="-1"></a><span class="im">import</span> deepchem <span class="im">as</span> dc</span>
<span id="cb18-2"><a href="methods.html#cb18-2" tabindex="-1"></a></span>
<span id="cb18-3"><a href="methods.html#cb18-3" tabindex="-1"></a>feat <span class="op">=</span> dc.feat.Mol2VecFingerprint()</span>
<span id="cb18-4"><a href="methods.html#cb18-4" tabindex="-1"></a>m2v_fp <span class="op">=</span> feat.featurize(smiles)</span></code></pre></div>
<p>In computational chemistry and drug discovery, the pre-treatment of molecules plays a crucial role in preparing them for machine learning applications. Molecules undergo optimization, in which their 3D structure is refined and their possible conformations are explored. This step is vital because the 3D structure heavily influences various properties, such as reactivity and binding affinity. For Mordred and RDKit to compute all descriptors, the molecules must be optimized and their most energetically favorable conformations identified.</p>
<div class="sourceCode" id="cb19"><pre class="sourceCode python"><code class="sourceCode python"><span id="cb19-1"><a href="methods.html#cb19-1" tabindex="-1"></a><span class="im">from</span> rdkit <span class="im">import</span> Chem</span>
<span id="cb19-2"><a href="methods.html#cb19-2" tabindex="-1"></a><span class="im">from</span> rdkit.Chem <span class="im">import</span> AllChem</span>
<span id="cb19-3"><a href="methods.html#cb19-3" tabindex="-1"></a><span class="im">from</span> threading <span class="im">import</span> active_count</span>
<span id="cb19-4"><a href="methods.html#cb19-4" tabindex="-1"></a></span>
<span id="cb19-5"><a href="methods.html#cb19-5" tabindex="-1"></a>num_thread <span class="op">=</span> active_count()</span>
<span id="cb19-6"><a href="methods.html#cb19-6" tabindex="-1"></a><span class="kw">def</span> optimize_3d(mol, method):</span>
<span id="cb19-7"><a href="methods.html#cb19-7" tabindex="-1"></a></span>
<span id="cb19-8"><a href="methods.html#cb19-8" tabindex="-1"></a>    mol <span class="op">=</span> Chem.AddHs(mol)</span>
<span id="cb19-9"><a href="methods.html#cb19-9" tabindex="-1"></a>    <span class="co"># Generate initial 3D coordinates</span></span>
<span id="cb19-10"><a href="methods.html#cb19-10" tabindex="-1"></a>    params <span class="op">=</span> AllChem.ETKDG()</span>
<span id="cb19-11"><a href="methods.html#cb19-11" tabindex="-1"></a>    params.useRandomCoords<span class="op">=</span><span class="va">True</span></span>
<span id="cb19-12"><a href="methods.html#cb19-12" tabindex="-1"></a>    params.maxAttempts<span class="op">=</span><span class="dv">5000</span></span>
<span id="cb19-13"><a href="methods.html#cb19-13" tabindex="-1"></a>    AllChem.EmbedMolecule(mol, params)</span>
<span id="cb19-14"><a href="methods.html#cb19-14" tabindex="-1"></a>    <span class="cf">try</span>:</span>
<span id="cb19-15"><a href="methods.html#cb19-15" tabindex="-1"></a>        <span class="cf">if</span> method <span class="op">==</span> <span class="st">&#39;MMFF&#39;</span>:</span>
<span id="cb19-16"><a href="methods.html#cb19-16" tabindex="-1"></a>            <span class="co"># optimize the 3D structure using the MMFF method (suitable for optimizing small to medium-sized molecules)</span></span>
<span id="cb19-17"><a href="methods.html#cb19-17" tabindex="-1"></a>            AllChem.MMFFOptimizeMolecule(mol, maxIters<span class="op">=</span><span class="dv">200</span>, mmffVariant<span class="op">=</span><span class="st">&#39;MMFF94s&#39;</span>)</span>
<span id="cb19-18"><a href="methods.html#cb19-18" tabindex="-1"></a>        <span class="cf">elif</span> method <span class="op">==</span> <span class="st">&#39;LOPT&#39;</span>:</span>
<span id="cb19-19"><a href="methods.html#cb19-19" tabindex="-1"></a>            <span class="co"># optimize the 3D structure using a combination of UFF and MMFF methods (can be used for optimizing larger and more complex molecules)</span></span>
<span id="cb19-20"><a href="methods.html#cb19-20" tabindex="-1"></a>            <span class="co"># create a PyForceField object and set its parameters</span></span>
<span id="cb19-21"><a href="methods.html#cb19-21" tabindex="-1"></a>            ff <span class="op">=</span> AllChem.UFFGetMoleculeForceField(mol)</span>
<span id="cb19-22"><a href="methods.html#cb19-22" tabindex="-1"></a>            ff.Initialize()</span>
<span id="cb19-23"><a href="methods.html#cb19-23" tabindex="-1"></a>            ff.Minimize()</span>
<span id="cb19-24"><a href="methods.html#cb19-24" tabindex="-1"></a>            AllChem.OptimizeMolecule(ff, maxIters<span class="op">=</span><span class="dv">500</span>)</span>
<span id="cb19-25"><a href="methods.html#cb19-25" tabindex="-1"></a>        <span class="cf">elif</span> method <span class="op">==</span> <span class="st">&#39;CONFOPT&#39;</span>:</span>
<span id="cb19-26"><a href="methods.html#cb19-26" tabindex="-1"></a>            <span class="co"># optimize each conformation using the PyForceField object (to explore the conformational space of a molecule and identify the most energetically favorable conformations)</span></span>
<span id="cb19-27"><a href="methods.html#cb19-27" tabindex="-1"></a>            <span class="co"># generate 10 conformations using UFF</span></span>
<span id="cb19-28"><a href="methods.html#cb19-28" tabindex="-1"></a>            <span class="co"># create a PyForceField object and set its parameters</span></span>
<span id="cb19-29"><a href="methods.html#cb19-29" tabindex="-1"></a>            AllChem.EmbedMultipleConfs(mol, numConfs<span class="op">=</span><span class="dv">10</span>)</span>
<span id="cb19-30"><a href="methods.html#cb19-30" tabindex="-1"></a>            ff <span class="op">=</span> AllChem.UFFGetMoleculeForceField(mol)</span>
<span id="cb19-31"><a href="methods.html#cb19-31" tabindex="-1"></a>            ff.Initialize()</span>
<span id="cb19-32"><a href="methods.html#cb19-32" tabindex="-1"></a>            ff.Minimize()</span>
<span id="cb19-33"><a href="methods.html#cb19-33" tabindex="-1"></a>            AllChem.OptimizeMoleculeConfs(mol, ff, maxIters<span class="op">=</span><span class="dv">300</span>, numThreads<span class="op">=</span>num_thread)</span>
<span id="cb19-34"><a href="methods.html#cb19-34" tabindex="-1"></a>        <span class="cf">else</span>:</span>
<span id="cb19-35"><a href="methods.html#cb19-35" tabindex="-1"></a>            <span class="bu">print</span>(<span class="ss">f&quot;method should be from </span><span class="sc">{</span>[<span class="st">&#39;MMFF&#39;</span>, <span class="st">&#39;LOPT&#39;</span>, <span class="st">&#39;CONFOPT&#39;</span>]<span class="sc">}</span><span class="ss">&quot;</span>)</span>
<span id="cb19-36"><a href="methods.html#cb19-36" tabindex="-1"></a>    <span class="cf">except</span>:</span>
<span id="cb19-37"><a href="methods.html#cb19-37" tabindex="-1"></a>        <span class="cf">pass</span></span>
<span id="cb19-38"><a href="methods.html#cb19-38" tabindex="-1"></a>    <span class="cf">return</span> mol</span></code></pre></div>
<p>RDKit descriptors are a broad set of descriptors calculated directly from the molecule’s structure. They encompass a wide range of molecular properties, including size, shape, polarity, and topological characteristics, and are widely used in QSAR modeling and virtual screening applications due to their comprehensive nature.</p>
<div class="sourceCode" id="cb20"><pre class="sourceCode python"><code class="sourceCode python"><span id="cb20-1"><a href="methods.html#cb20-1" tabindex="-1"></a><span class="im">from</span> rdkit <span class="im">import</span> Chem</span>
<span id="cb20-2"><a href="methods.html#cb20-2" tabindex="-1"></a><span class="im">from</span> rdkit.Chem <span class="im">import</span> Descriptors</span>
<span id="cb20-3"><a href="methods.html#cb20-3" tabindex="-1"></a></span>
<span id="cb20-4"><a href="methods.html#cb20-4" tabindex="-1"></a><span class="co"># Define a function to calculate all the possible descriptors</span></span>
<span id="cb20-5"><a href="methods.html#cb20-5" tabindex="-1"></a><span class="kw">def</span> calculate_descriptors(smiles, idx):</span>
<span id="cb20-6"><a href="methods.html#cb20-6" tabindex="-1"></a>    mol <span class="op">=</span> Chem.MolFromSmiles(smiles)</span>
<span id="cb20-7"><a href="methods.html#cb20-7" tabindex="-1"></a>    <span class="cf">try</span>:</span>
<span id="cb20-8"><a href="methods.html#cb20-8" tabindex="-1"></a>        mol <span class="op">=</span> optimize_3d(mol, <span class="st">&#39;LOPT&#39;</span>)</span>
<span id="cb20-9"><a href="methods.html#cb20-9" tabindex="-1"></a>    <span class="cf">except</span>:</span>
<span id="cb20-10"><a href="methods.html#cb20-10" tabindex="-1"></a>        <span class="cf">pass</span></span>
<span id="cb20-11"><a href="methods.html#cb20-11" tabindex="-1"></a>    desc_lst <span class="op">=</span> []</span>
<span id="cb20-12"><a href="methods.html#cb20-12" tabindex="-1"></a>    <span class="cf">for</span> descriptor_name, descriptor_function <span class="kw">in</span> Descriptors.descList:</span>
<span id="cb20-13"><a href="methods.html#cb20-13" tabindex="-1"></a>        <span class="cf">try</span>:</span>
<span id="cb20-14"><a href="methods.html#cb20-14" tabindex="-1"></a>            descriptor_value <span class="op">=</span> descriptor_function(mol)</span>
<span id="cb20-15"><a href="methods.html#cb20-15" tabindex="-1"></a>            <span class="cf">if</span> descriptor_value <span class="op">==</span> pd.notnull:</span>
<span id="cb20-16"><a href="methods.html#cb20-16" tabindex="-1"></a>                <span class="bu">print</span>(<span class="ss">f&#39;No value for </span><span class="sc">{</span>descriptor_name<span class="sc">}</span><span class="ss">, output: </span><span class="sc">{</span>descriptor_value<span class="sc">}</span><span class="ss"> for </span><span class="sc">{</span>idx<span class="sc">}</span><span class="ss">:</span><span class="sc">{</span>smiles<span class="sc">}</span><span class="ss">&#39;</span>)</span>
<span id="cb20-17"><a href="methods.html#cb20-17" tabindex="-1"></a>                desc_lst.append(np.nan)</span>
<span id="cb20-18"><a href="methods.html#cb20-18" tabindex="-1"></a>            <span class="cf">else</span>:</span>
<span id="cb20-19"><a href="methods.html#cb20-19" tabindex="-1"></a>                desc_lst.append(descriptor_value)</span>
<span id="cb20-20"><a href="methods.html#cb20-20" tabindex="-1"></a>        <span class="cf">except</span>:</span>
<span id="cb20-21"><a href="methods.html#cb20-21" tabindex="-1"></a>            <span class="cf">pass</span></span>
<span id="cb20-22"><a href="methods.html#cb20-22" tabindex="-1"></a>    <span class="cf">return</span> desc_lst</span>
<span id="cb20-23"><a href="methods.html#cb20-23" tabindex="-1"></a></span>
<span id="cb20-24"><a href="methods.html#cb20-24" tabindex="-1"></a>descriptor_list <span class="op">=</span> []</span>
<span id="cb20-25"><a href="methods.html#cb20-25" tabindex="-1"></a><span class="cf">for</span> idx, sml <span class="kw">in</span> <span class="bu">enumerate</span>(tqdm(smiles)):</span>
<span id="cb20-26"><a href="methods.html#cb20-26" tabindex="-1"></a>    compound_desc <span class="op">=</span> calculate_descriptors(sml, idx)</span>
<span id="cb20-27"><a href="methods.html#cb20-27" tabindex="-1"></a>    descriptor_list.append(compound_desc)</span>
<span id="cb20-28"><a href="methods.html#cb20-28" tabindex="-1"></a>    </span>
<span id="cb20-29"><a href="methods.html#cb20-29" tabindex="-1"></a>rdkit_desc_df <span class="op">=</span> pd.DataFrame(descriptor_list, columns<span class="op">=</span>[name <span class="cf">for</span> name, _ <span class="kw">in</span> Descriptors.descList]).astype(np.float64)</span>
<span id="cb20-30"><a href="methods.html#cb20-30" tabindex="-1"></a>rdkit_desc_df <span class="op">=</span> rdkit_desc_df.dropna(axis<span class="op">=</span><span class="dv">1</span>)</span>
<span id="cb20-31"><a href="methods.html#cb20-31" tabindex="-1"></a>rdkit_desc <span class="op">=</span> rdkit_desc_df.values.astype(np.float64)</span></code></pre></div>
<p>Finally, Mordred descriptors provide a vast array of over 1600 three-dimensional, two-dimensional, and one-dimensional descriptors. These descriptors represent a wide variety of chemical information, ranging from simple atom counts and molecular weight to more complex descriptors such as electrotopological state indices and autocorrelation descriptors. The rich information provided by Mordred descriptors makes them an excellent choice for modeling complex molecular behaviors.</p>
<div class="sourceCode" id="cb21"><pre class="sourceCode python"><code class="sourceCode python"><span id="cb21-1"><a href="methods.html#cb21-1" tabindex="-1"></a><span class="im">from</span> mordred <span class="im">import</span> Calculator, descriptors</span>
<span id="cb21-2"><a href="methods.html#cb21-2" tabindex="-1"></a></span>
<span id="cb21-3"><a href="methods.html#cb21-3" tabindex="-1"></a>molecules <span class="op">=</span> [Chem.MolFromSmiles(sml) <span class="cf">for</span> sml <span class="kw">in</span> smiles]</span>
<span id="cb21-4"><a href="methods.html#cb21-4" tabindex="-1"></a></span>
<span id="cb21-5"><a href="methods.html#cb21-5" tabindex="-1"></a><span class="co"># Define a function to calculate all the possible descriptors</span></span>
<span id="cb21-6"><a href="methods.html#cb21-6" tabindex="-1"></a><span class="kw">def</span> calculate_mordred_descriptors(mol, optimization<span class="op">=</span> <span class="st">&#39;LOPT&#39;</span>):</span>
<span id="cb21-7"><a href="methods.html#cb21-7" tabindex="-1"></a>    mol <span class="op">=</span> optimize_3d(mol, optimization)</span>
<span id="cb21-8"><a href="methods.html#cb21-8" tabindex="-1"></a>    <span class="co"># Create a Mordred calculator object</span></span>
<span id="cb21-9"><a href="methods.html#cb21-9" tabindex="-1"></a>    calculator <span class="op">=</span> Calculator(descriptors)</span>
<span id="cb21-10"><a href="methods.html#cb21-10" tabindex="-1"></a>    <span class="cf">return</span> calculator(mol)</span>
<span id="cb21-11"><a href="methods.html#cb21-11" tabindex="-1"></a></span>
<span id="cb21-12"><a href="methods.html#cb21-12" tabindex="-1"></a>mdrd_descriptor_list <span class="op">=</span> []</span>
<span id="cb21-13"><a href="methods.html#cb21-13" tabindex="-1"></a><span class="cf">for</span> idx, mol <span class="kw">in</span> <span class="bu">enumerate</span>(tqdm(molecules)):</span>
<span id="cb21-14"><a href="methods.html#cb21-14" tabindex="-1"></a>    mol_desc <span class="op">=</span> calculate_mordred_descriptors(mol)</span>
<span id="cb21-15"><a href="methods.html#cb21-15" tabindex="-1"></a>    desc <span class="op">=</span> <span class="bu">list</span>(mol_desc.asdict().values())</span>
<span id="cb21-16"><a href="methods.html#cb21-16" tabindex="-1"></a>    mdrd_descriptor_list.append(desc)</span>
<span id="cb21-17"><a href="methods.html#cb21-17" tabindex="-1"></a>    </span>
<span id="cb21-18"><a href="methods.html#cb21-18" tabindex="-1"></a>mordred_desc_df <span class="op">=</span> pd.DataFrame(mdrd_descriptor_list, columns<span class="op">=</span><span class="bu">list</span>(mol_desc.asdict().keys())).astype(np.float64)</span>
<span id="cb21-19"><a href="methods.html#cb21-19" tabindex="-1"></a>mordred_desc_df <span class="op">=</span> mordred_desc_df.dropna(axis<span class="op">=</span><span class="dv">1</span>)</span>
<span id="cb21-20"><a href="methods.html#cb21-20" tabindex="-1"></a>mordred_desc <span class="op">=</span> mordred_desc_df.values.astype(np.float64)</span></code></pre></div>
<p>To summarize, the featurization process leverages multiple molecular descriptors to capture the complexity and diversity of molecular structures. Each descriptor contributes unique information about the molecule, resulting in a comprehensive and informative representation.</p>
<table style="width:100%;">
<colgroup>
<col width="16%" />
<col width="16%" />
<col width="16%" />
<col width="16%" />
<col width="16%" />
<col width="16%" />
</colgroup>
<thead>
<tr class="header">
<th>Featurizing Technique</th>
<th>Description</th>
<th>Size</th>
<th>Binary</th>
<th>3D Information</th>
<th>Adjustability</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td>MACCS fingerprints</td>
<td>Predefined structural fragments</td>
<td>167</td>
<td>Yes</td>
<td>No</td>
<td>No</td>
</tr>
<tr class="even">
<td>Morgan fingerprints (Circular fingerprints)</td>
<td>Hashing the environments of atoms in a molecule</td>
<td>Adjustable (2048 in the example)</td>
<td>Yes</td>
<td>No</td>
<td>Yes (Radius and length can be adjusted)</td>
</tr>
<tr class="odd">
<td>PubChem fingerprints</td>
<td>Chemical substructure or pattern</td>
<td>881</td>
<td>Yes</td>
<td>No</td>
<td>No</td>
</tr>
<tr class="even">
<td>Mol2Vec fingerprint</td>
<td>SMILES as words</td>
<td>Variable</td>
<td>No</td>
<td>No</td>
<td>No</td>
</tr>
<tr class="odd">
<td>RDKit descriptors</td>
<td>Physico-chemical descriptors</td>
<td>&gt;200</td>
<td>No</td>
<td>Yes</td>
<td>No</td>
</tr>
<tr class="even">
<td>Mordred descriptors</td>
<td>Physico-chemical descriptors</td>
<td>&gt;1600</td>
<td>No</td>
<td>Yes</td>
<td>No</td>
</tr>
</tbody>
</table>
</div>
<div id="featurizing-proteins" class="section level4 hasAnchor" number="2.1.3.2">
<h4><span class="header-section-number">2.1.3.2</span> Featurizing Proteins<a href="methods.html#featurizing-proteins" class="anchor-section" aria-label="Anchor link to header"></a></h4>
<p>The transformation of protein sequences into embedding vectors using models pretrained on millions of proteins is highly beneficial. These models are trained to understand protein sequence patterns, structures, and dependencies. Thus, the resulting embeddings capture a wealth of information about protein sequences, including their evolutionary context, structural features, and biological functions.</p>
<p>BioTransformers is a Python package that provides a unified API to use and evaluate several pre-trained models for protein sequences. These models transform protein sequences into meaningful numerical representations, also known as embeddings. The extracted embeddings can then be used in downstream machine learning tasks such as protein classification, clustering, or prediction of protein properties.</p>
<p>The two models used in this work, <code>protbert</code> and <code>esm1_t34_670M_UR100</code>, are different pre-trained models available in BioTransformers.</p>
<p><code>ProtBert</code> is a transformer-based model trained on a large corpus of protein sequences using a masked language modeling objective, similar to BERT models in natural language processing. The model’s architecture enables it to capture complex patterns and dependencies in the sequence data.</p>
<p>On the other hand, <code>esm1_t34_670M_UR100</code> is part of the ESM (Evolutionary Scale Modeling) series of models, specifically trained on a large evolutionary scale of protein sequences. This model is designed to capture evolutionary patterns and sequence conservation information, which can be highly beneficial for protein-related tasks.</p>
<div class="sourceCode" id="cb22"><pre class="sourceCode python"><code class="sourceCode python"><span id="cb22-1"><a href="methods.html#cb22-1" tabindex="-1"></a><span class="im">from</span> biotransformers <span class="im">import</span> BioTransformers</span>
<span id="cb22-2"><a href="methods.html#cb22-2" tabindex="-1"></a><span class="im">from</span> tqdm.notebook <span class="im">import</span> tqdm</span>
<span id="cb22-3"><a href="methods.html#cb22-3" tabindex="-1"></a><span class="im">import</span> torch</span>
<span id="cb22-4"><a href="methods.html#cb22-4" tabindex="-1"></a><span class="im">import</span> numpy <span class="im">as</span> np</span>
<span id="cb22-5"><a href="methods.html#cb22-5" tabindex="-1"></a><span class="im">import</span> pandas <span class="im">as</span> pd</span>
<span id="cb22-6"><a href="methods.html#cb22-6" tabindex="-1"></a><span class="im">import</span> pickle</span>
<span id="cb22-7"><a href="methods.html#cb22-7" tabindex="-1"></a></span>
<span id="cb22-8"><a href="methods.html#cb22-8" tabindex="-1"></a><span class="kw">def</span> compute_embeddings(bio_trans, sequences, batch_size<span class="op">=</span><span class="dv">10</span>):</span>
<span id="cb22-9"><a href="methods.html#cb22-9" tabindex="-1"></a>    embeddings <span class="op">=</span> np.empty((<span class="dv">0</span>,<span class="dv">1024</span>), <span class="bu">float</span>)</span>
<span id="cb22-10"><a href="methods.html#cb22-10" tabindex="-1"></a>    <span class="cf">for</span> idx <span class="kw">in</span> tqdm(<span class="bu">range</span>(<span class="dv">0</span>, <span class="bu">len</span>(sequences), batch_size)):</span>
<span id="cb22-11"><a href="methods.html#cb22-11" tabindex="-1"></a>        batch <span class="op">=</span> sequences[idx:idx<span class="op">+</span>batch_size]</span>
<span id="cb22-12"><a href="methods.html#cb22-12" tabindex="-1"></a>        embd <span class="op">=</span> bio_trans.compute_embeddings(batch, pool_mode<span class="op">=</span><span class="st">&#39;mean&#39;</span>, batch_size<span class="op">=</span>batch_size, silent<span class="op">=</span><span class="va">True</span>)[<span class="st">&#39;mean&#39;</span>]</span>
<span id="cb22-13"><a href="methods.html#cb22-13" tabindex="-1"></a>        embeddings <span class="op">=</span> np.vstack((embeddings, embd))</span>
<span id="cb22-14"><a href="methods.html#cb22-14" tabindex="-1"></a>    <span class="cf">return</span> embeddings</span>
<span id="cb22-15"><a href="methods.html#cb22-15" tabindex="-1"></a></span>
<span id="cb22-16"><a href="methods.html#cb22-16" tabindex="-1"></a><span class="kw">def</span> save_embeddings(embeddings, filename):</span>
<span id="cb22-17"><a href="methods.html#cb22-17" tabindex="-1"></a>    <span class="cf">with</span> <span class="bu">open</span>(filename, <span class="st">&quot;wb&quot;</span>) <span class="im">as</span> f:</span>
<span id="cb22-18"><a href="methods.html#cb22-18" tabindex="-1"></a>        pickle.dump(embeddings, f)</span>
<span id="cb22-19"><a href="methods.html#cb22-19" tabindex="-1"></a></span>
<span id="cb22-20"><a href="methods.html#cb22-20" tabindex="-1"></a><span class="co"># Load sequences</span></span>
<span id="cb22-21"><a href="methods.html#cb22-21" tabindex="-1"></a>seq_df <span class="op">=</span> pd.read_csv(<span class="st">&#39;sequence.tsv&#39;</span>, sep<span class="op">=</span><span class="st">&#39;</span><span class="ch">\t</span><span class="st">&#39;</span>)</span>
<span id="cb22-22"><a href="methods.html#cb22-22" tabindex="-1"></a>sequences <span class="op">=</span> <span class="bu">list</span>(seq_df[<span class="st">&#39;sequence&#39;</span>])</span>
<span id="cb22-23"><a href="methods.html#cb22-23" tabindex="-1"></a></span>
<span id="cb22-24"><a href="methods.html#cb22-24" tabindex="-1"></a><span class="co"># Backends</span></span>
<span id="cb22-25"><a href="methods.html#cb22-25" tabindex="-1"></a>backends <span class="op">=</span> [<span class="st">&quot;protbert&quot;</span>, <span class="st">&quot;esm1_t34_670M_UR100&quot;</span>]</span>
<span id="cb22-26"><a href="methods.html#cb22-26" tabindex="-1"></a></span>
<span id="cb22-27"><a href="methods.html#cb22-27" tabindex="-1"></a><span class="cf">for</span> backend <span class="kw">in</span> backends:</span>
<span id="cb22-28"><a href="methods.html#cb22-28" tabindex="-1"></a>      <span class="co"># Clear GPU memory</span></span>
<span id="cb22-29"><a href="methods.html#cb22-29" tabindex="-1"></a>    torch.cuda.empty_cache()</span>
<span id="cb22-30"><a href="methods.html#cb22-30" tabindex="-1"></a>    <span class="bu">print</span>(<span class="ss">f&quot;Processing with backend: </span><span class="sc">{</span>backend<span class="sc">}</span><span class="ss">&quot;</span>)</span>
<span id="cb22-31"><a href="methods.html#cb22-31" tabindex="-1"></a>    bio_trans <span class="op">=</span> BioTransformers(backend, num_gpus<span class="op">=</span><span class="dv">1</span>)</span>
<span id="cb22-32"><a href="methods.html#cb22-32" tabindex="-1"></a>    embeddings <span class="op">=</span> compute_embeddings(bio_trans, sequences)</span>
<span id="cb22-33"><a href="methods.html#cb22-33" tabindex="-1"></a>    save_embeddings(embeddings, <span class="ss">f&quot;</span><span class="sc">{</span>backend<span class="sc">}</span><span class="ss">_embeddings.pkl&quot;</span>)</span></code></pre></div>
<p>This rich representation can be leveraged in downstream tasks, improving the performance of various bioinformatics applications such as protein function prediction, protein-protein interaction prediction, and many others. By using pre-trained embeddings, one can also significantly reduce the computational cost and complexity associated with training deep learning models from scratch on large protein datasets.</p>
</div>
<div id="featurizing-pathways" class="section level4 hasAnchor" number="2.1.3.3">
<h4><span class="header-section-number">2.1.3.3</span> Featurizing Pathways<a href="methods.html#featurizing-pathways" class="anchor-section" aria-label="Anchor link to header"></a></h4>
<p>The pre-trained BERT tokenizer from the Hugging Face Transformers library was chosen to vectorize biological pathways because of its ability to perform natural language processing tasks such as tokenization. This process involved splitting the text into individual tokens and encoding them as numerical IDs that could be understood by the BERT model. Padding and truncation techniques were applied to ensure consistent sequence lengths during tokenization. This step was critical as pathway descriptions often varied in length. By padding shorter sequences and truncating longer ones, uniformity was achieved.</p>
<p>This simply allows the model to distinguish the different pathways from one another.</p>
<div class="sourceCode" id="cb23"><pre class="sourceCode python"><code class="sourceCode python"><span id="cb23-1"><a href="methods.html#cb23-1" tabindex="-1"></a><span class="im">from</span> transformers <span class="im">import</span> BertTokenizer</span>
<span id="cb23-2"><a href="methods.html#cb23-2" tabindex="-1"></a></span>
<span id="cb23-3"><a href="methods.html#cb23-3" tabindex="-1"></a>tokenizer <span class="op">=</span> BertTokenizer.from_pretrained(<span class="st">&#39;bert-base-uncased&#39;</span>)</span>
<span id="cb23-4"><a href="methods.html#cb23-4" tabindex="-1"></a>string_list <span class="op">=</span> pathway_df.select(pl.col(<span class="st">&#39;Wiki Pathway&#39;</span>)).to_series().to_list()</span>
<span id="cb23-5"><a href="methods.html#cb23-5" tabindex="-1"></a></span>
<span id="cb23-6"><a href="methods.html#cb23-6" tabindex="-1"></a><span class="co"># Tokenize the pathway descriptions</span></span>
<span id="cb23-7"><a href="methods.html#cb23-7" tabindex="-1"></a>tokenized_strings <span class="op">=</span> tokenizer(string_list, padding<span class="op">=</span><span class="va">True</span>, truncation<span class="op">=</span><span class="va">True</span>, return_tensors<span class="op">=</span><span class="st">&#39;pt&#39;</span>)</span>
<span id="cb23-8"><a href="methods.html#cb23-8" tabindex="-1"></a></span>
<span id="cb23-9"><a href="methods.html#cb23-9" tabindex="-1"></a><span class="co"># Retrieve the tokenized input IDs</span></span>
<span id="cb23-10"><a href="methods.html#cb23-10" tabindex="-1"></a>pw_vector <span class="op">=</span> tokenized_strings[<span class="st">&#39;input_ids&#39;</span>]</span></code></pre></div>
</div>
</div>
<div id="covid-19-bio-graph" class="section level3 hasAnchor" number="2.1.4">
<h3><span class="header-section-number">2.1.4</span> COVID-19 Bio-Graph<a href="methods.html#covid-19-bio-graph" class="anchor-section" aria-label="Anchor link to header"></a></h3>
<p>With all the data ready, a multimodal graph was built that represents a complex network of interconnected biological entities: chemicals, proteins, and pathways. The graph contains 4,293 unique chemicals, each distinguished by a high-dimensional feature vector (4,272 features) that includes biochemical properties, SMILES strings, and phenotype features. The chemicals’ high-dimensional space has been condensed into a single feature, PCA1, using dimensionality reduction techniques.</p>
<div class="sourceCode" id="cb24"><pre class="sourceCode python"><code class="sourceCode python"><span id="cb24-1"><a href="methods.html#cb24-1" tabindex="-1"></a><span class="im">from</span> torch_geometric.data <span class="im">import</span> HeteroData</span>
<span id="cb24-2"><a href="methods.html#cb24-2" tabindex="-1"></a></span>
<span id="cb24-3"><a href="methods.html#cb24-3" tabindex="-1"></a>data <span class="op">=</span> HeteroData()</span>
<span id="cb24-4"><a href="methods.html#cb24-4" tabindex="-1"></a></span>
<span id="cb24-5"><a href="methods.html#cb24-5" tabindex="-1"></a>data[<span class="st">&#39;chemical&#39;</span>].x <span class="op">=</span> chemical_features.to(torch.<span class="bu">float</span>)                      <span class="co"># [num_chemicals, num_features_chemical]</span></span>
<span id="cb24-6"><a href="methods.html#cb24-6" tabindex="-1"></a>data[<span class="st">&#39;chemical&#39;</span>].smiles <span class="op">=</span> chemical_smiles                                   <span class="co"># [num_chemicals]</span></span>
<span id="cb24-7"><a href="methods.html#cb24-7" tabindex="-1"></a>data[<span class="st">&#39;chemical&#39;</span>].y <span class="op">=</span> chemical_y.<span class="bu">long</span>()                                      <span class="co"># [num_chemicals]</span></span>
<span id="cb24-8"><a href="methods.html#cb24-8" tabindex="-1"></a>data[<span class="st">&#39;chemical&#39;</span>].pca1 <span class="op">=</span> chemical_pca1.to(torch.<span class="bu">float</span>)                       <span class="co"># [num_chemicals]</span></span>
<span id="cb24-9"><a href="methods.html#cb24-9" tabindex="-1"></a>data[<span class="st">&#39;chemical&#39;</span>].phenotype_feat <span class="op">=</span> chemical_phenotype_feat.to(torch.<span class="bu">float</span>)   <span class="co"># [num_chemicals, 16]</span></span>
<span id="cb24-10"><a href="methods.html#cb24-10" tabindex="-1"></a></span>
<span id="cb24-11"><a href="methods.html#cb24-11" tabindex="-1"></a><span class="cf">for</span> f, v <span class="kw">in</span> [(<span class="st">&#39;train&#39;</span>, <span class="st">&#39;train&#39;</span>), (<span class="st">&#39;valid&#39;</span>, <span class="st">&#39;val&#39;</span>), (<span class="st">&#39;test&#39;</span>, <span class="st">&#39;test&#39;</span>)]:</span>
<span id="cb24-12"><a href="methods.html#cb24-12" tabindex="-1"></a>    idx <span class="op">=</span> mask_df.select(</span>
<span id="cb24-13"><a href="methods.html#cb24-13" tabindex="-1"></a>                [<span class="st">&#39;connected_compound_gid&#39;</span>, <span class="st">&#39;mask&#39;</span>]</span>
<span id="cb24-14"><a href="methods.html#cb24-14" tabindex="-1"></a>                ).<span class="bu">filter</span>(</span>
<span id="cb24-15"><a href="methods.html#cb24-15" tabindex="-1"></a>                pl.col(<span class="st">&#39;mask&#39;</span>) <span class="op">==</span> f</span>
<span id="cb24-16"><a href="methods.html#cb24-16" tabindex="-1"></a>                ).select(<span class="st">&#39;connected_compound_gid&#39;</span>).to_numpy().flatten()</span>
<span id="cb24-17"><a href="methods.html#cb24-17" tabindex="-1"></a>    idx <span class="op">=</span> torch.from_numpy(idx)</span>
<span id="cb24-18"><a href="methods.html#cb24-18" tabindex="-1"></a>    maskit <span class="op">=</span> torch.zeros(data[<span class="st">&#39;chemical&#39;</span>].num_nodes, dtype<span class="op">=</span>torch.<span class="bu">bool</span>)</span>
<span id="cb24-19"><a href="methods.html#cb24-19" tabindex="-1"></a>    maskit[idx] <span class="op">=</span> <span class="va">True</span></span>
<span id="cb24-20"><a href="methods.html#cb24-20" tabindex="-1"></a>    data[<span class="st">&#39;chemical&#39;</span>][<span class="ss">f&#39;</span><span class="sc">{</span>v<span class="sc">}</span><span class="ss">_mask&#39;</span>] <span class="op">=</span> maskit</span>
<span id="cb24-21"><a href="methods.html#cb24-21" tabindex="-1"></a></span>
<span id="cb24-22"><a href="methods.html#cb24-22" tabindex="-1"></a>data[<span class="st">&#39;protein&#39;</span>].x <span class="op">=</span> protein_esm_embeddings.to(torch.<span class="bu">float</span>)  <span class="co"># [num_proteins, num_features_protein]</span></span>
<span id="cb24-23"><a href="methods.html#cb24-23" tabindex="-1"></a>data[<span class="st">&#39;protein&#39;</span>].name <span class="op">=</span> protein_names                        <span class="co"># [num_proteins]</span></span>
<span id="cb24-24"><a href="methods.html#cb24-24" tabindex="-1"></a>data[<span class="st">&#39;protein&#39;</span>].seq <span class="op">=</span> protein_sequences                     <span class="co"># [num_proteins]</span></span>
<span id="cb24-25"><a href="methods.html#cb24-25" tabindex="-1"></a></span>
<span id="cb24-26"><a href="methods.html#cb24-26" tabindex="-1"></a>data[<span class="st">&#39;pathway&#39;</span>].x <span class="op">=</span> pathway_features.to(torch.<span class="bu">float</span>)        <span class="co"># [num_pathways, num_features_pathway]</span></span>
<span id="cb24-27"><a href="methods.html#cb24-27" tabindex="-1"></a>data[<span class="st">&#39;pathway&#39;</span>].name <span class="op">=</span> pathway_names                        <span class="co"># [num_pathways]</span></span>
<span id="cb24-28"><a href="methods.html#cb24-28" tabindex="-1"></a></span>
<span id="cb24-29"><a href="methods.html#cb24-29" tabindex="-1"></a>data[<span class="st">&#39;chemical&#39;</span>, <span class="st">&#39;bind_to&#39;</span>, <span class="st">&#39;protein&#39;</span>].edge_index <span class="op">=</span> torch.from_numpy(compound_protein_deges).t().contiguous()     <span class="co"># [2, num_edges_bind]</span></span>
<span id="cb24-30"><a href="methods.html#cb24-30" tabindex="-1"></a>data[<span class="st">&#39;pathway&#39;</span>, <span class="st">&#39;activate_by&#39;</span>, <span class="st">&#39;chemical&#39;</span>].edge_index <span class="op">=</span> torch.from_numpy(pathway_compound_edges).t().contiguous() <span class="co"># [2, num_edges_activate]</span></span>
<span id="cb24-31"><a href="methods.html#cb24-31" tabindex="-1"></a>data[<span class="st">&#39;protein&#39;</span>, <span class="st">&#39;governs&#39;</span>, <span class="st">&#39;pathway&#39;</span>].edge_index <span class="op">=</span> torch.from_numpy(protein_pathway_edges).t().contiguous()       <span class="co"># [2, num_edges_govern]</span></span></code></pre></div>
<p>The final output of this process was a <code>HeteroData</code> object from the PyTorch Geometric library, which represents a heterogeneous graph with various types of nodes and edges. This graph-based representation of the data encapsulates the interconnected nature of the compounds, proteins, and pathways, thereby providing a comprehensive overview of the interactions and associations within the COVID-19 cell profiling data.</p>
<p>Our multimodal graph, a complex mesh of interconnected biological entities, represents a wealth of relationships between chemicals, proteins, and pathways. It illustrates the intricate dynamics prevalent in molecular biology, serving as a robust framework for advanced modeling tasks.</p>
<p>Among the 4,293 chemicals, only 3,711 have known connections to at least one protein, whereas 1,376 chemicals are isolated, meaning they lack any known protein interactions. On the other hand, from the total of 16,733 proteins, 16,727 are known to interact with chemicals, leaving 2,839 proteins without any known chemical connections.</p>
<p>The graph also models 1,117 unique biological pathways. Only 282 proteins are known to govern these pathways. Interestingly, 3,220 chemicals are linked to all pathways, underscoring chemicals’ pervasive influence in biological processes. A distinct subset of 582 chemicals connect exclusively to pathways, without any protein connections, and 6 proteins have pathway connections but no compound connections.</p>
<p>Overall, the graph includes 4,293 chemicals and 16,733 proteins that have at least one known connection, either to proteins, pathways, or both. These connections, represented by the different types of edges, symbolize various biological interactions and regulatory mechanisms. The graph’s structure, supplemented by the additional attributes of each node type, provides a comprehensive data platform for downstream tasks.</p>
<p>The graph was further masked with training, validation, and testing subsets using a stratified approach to maintain a uniform distribution of classes across all subsets. This enabled the creation of robust machine learning models capable of effectively learning from training data and generalizing to unseen data.</p>
</div>
<div id="representing-chemical-molecules-as-graph" class="section level3 hasAnchor" number="2.1.5">
<h3><span class="header-section-number">2.1.5</span> Representing Chemical Molecules as Graph<a href="methods.html#representing-chemical-molecules-as-graph" class="anchor-section" aria-label="Anchor link to header"></a></h3>
<p>In conventional drug discovery processes, chemical structures are often encoded as fixed-length feature vectors, which were explained in detail in the previous section (Featurizing). These vectors, while effective for some tasks, lack the nuanced structural information and context of the atoms and bonds within the molecule.</p>
<p>Recently, graph-based representations of molecules have gained popularity in computational chemistry and cheminformatics. In this approach, each molecule is represented as a graph, where atoms are considered as nodes and bonds as edges. This representation retains the context of the molecule, allowing for more sophisticated analysis and understanding of the molecular structure. Each atom (node) and bond (edge) can be associated with features such as atom type, bond type, atom hybridization, whether the bond is in a ring, etc. Graph convolutional networks (GCNs) can then be used to learn complex patterns from these graph-structured data.</p>
<p>The graph-based representation and the conventional vector-based representation each have their unique strengths. The graph representation can capture local structural information and long-range interactions in the molecule, while the vector-based representation can efficiently capture specific substructures or holistic properties like the physico-chemical characteristics of the molecule.</p>
<p>The initial step was to convert molecular structures into graph-based representations. This conversion was achieved using the <code>MolGraphConvFeaturizer</code> from the DeepChem library. This generated node and edge features considering additional aspects such as chirality and partial charge.</p>
<p>Two dataset classes were designed for the classification task: <code>CovidMolGraph_imbalance_classification</code> and <code>CovidMolGraph_balanced_classification</code>. The imbalanced dataset is used with class weights applied to the binary classes during training. The balanced classification approach involved resampling the minority class in the training set to balance the class distribution, improving the model’s performance. These two classes handle the imbalanced and balanced classifications of the dataset, respectively.</p>
<div class="sourceCode" id="cb25"><pre class="sourceCode python"><code class="sourceCode python"><span id="cb25-1"><a href="methods.html#cb25-1" tabindex="-1"></a><span class="im">import</span> os</span>
<span id="cb25-2"><a href="methods.html#cb25-2" tabindex="-1"></a><span class="im">import</span> pickle</span>
<span id="cb25-3"><a href="methods.html#cb25-3" tabindex="-1"></a><span class="im">import</span> torch</span>
<span id="cb25-4"><a href="methods.html#cb25-4" tabindex="-1"></a><span class="im">from</span> typing <span class="im">import</span> Callable, List, Optional</span>
<span id="cb25-5"><a href="methods.html#cb25-5" tabindex="-1"></a><span class="im">from</span> sklearn <span class="im">import</span> preprocessing</span>
<span id="cb25-6"><a href="methods.html#cb25-6" tabindex="-1"></a><span class="im">from</span> tqdm <span class="im">import</span> tqdm</span>
<span id="cb25-7"><a href="methods.html#cb25-7" tabindex="-1"></a><span class="im">import</span> deepchem <span class="im">as</span> dc</span>
<span id="cb25-8"><a href="methods.html#cb25-8" tabindex="-1"></a><span class="im">import</span> polars <span class="im">as</span> pl</span>
<span id="cb25-9"><a href="methods.html#cb25-9" tabindex="-1"></a><span class="im">from</span> rdkit <span class="im">import</span> Chem</span>
<span id="cb25-10"><a href="methods.html#cb25-10" tabindex="-1"></a><span class="im">import</span> numpy <span class="im">as</span> np </span>
<span id="cb25-11"><a href="methods.html#cb25-11" tabindex="-1"></a><span class="im">from</span> sklearn.model_selection <span class="im">import</span> train_test_split</span>
<span id="cb25-12"><a href="methods.html#cb25-12" tabindex="-1"></a></span>
<span id="cb25-13"><a href="methods.html#cb25-13" tabindex="-1"></a><span class="im">from</span> torch_geometric.data <span class="im">import</span> (</span>
<span id="cb25-14"><a href="methods.html#cb25-14" tabindex="-1"></a>    Data,</span>
<span id="cb25-15"><a href="methods.html#cb25-15" tabindex="-1"></a>    Dataset,</span>
<span id="cb25-16"><a href="methods.html#cb25-16" tabindex="-1"></a>    InMemoryDataset</span>
<span id="cb25-17"><a href="methods.html#cb25-17" tabindex="-1"></a>)</span>
<span id="cb25-18"><a href="methods.html#cb25-18" tabindex="-1"></a></span>
<span id="cb25-19"><a href="methods.html#cb25-19" tabindex="-1"></a><span class="kw">class</span> CovidMolGraph_imbalance_classification(InMemoryDataset):</span>
<span id="cb25-20"><a href="methods.html#cb25-20" tabindex="-1"></a>    </span>
<span id="cb25-21"><a href="methods.html#cb25-21" tabindex="-1"></a>    <span class="kw">def</span> <span class="fu">__init__</span>(<span class="va">self</span>, root: <span class="bu">str</span>, transform: Optional[Callable] <span class="op">=</span> <span class="va">None</span>,</span>
<span id="cb25-22"><a href="methods.html#cb25-22" tabindex="-1"></a>                 pre_transform: Optional[Callable] <span class="op">=</span> <span class="va">None</span>,</span>
<span id="cb25-23"><a href="methods.html#cb25-23" tabindex="-1"></a>                 pre_filter: Optional[Callable] <span class="op">=</span> <span class="va">None</span>):</span>
<span id="cb25-24"><a href="methods.html#cb25-24" tabindex="-1"></a>        <span class="bu">super</span>().<span class="fu">__init__</span>(root, transform, pre_transform, pre_filter)</span>
<span id="cb25-25"><a href="methods.html#cb25-25" tabindex="-1"></a>        <span class="va">self</span>.data, <span class="va">self</span>.slices <span class="op">=</span> torch.load(<span class="va">self</span>.processed_paths[<span class="dv">0</span>])</span>
<span id="cb25-26"><a href="methods.html#cb25-26" tabindex="-1"></a></span>
<span id="cb25-27"><a href="methods.html#cb25-27" tabindex="-1"></a>    <span class="at">@property</span></span>
<span id="cb25-28"><a href="methods.html#cb25-28" tabindex="-1"></a>    <span class="kw">def</span> processed_file_names(<span class="va">self</span>) <span class="op">-&gt;</span> <span class="bu">str</span>:</span>
<span id="cb25-29"><a href="methods.html#cb25-29" tabindex="-1"></a>        <span class="cf">return</span> <span class="st">&#39;covid_data_processed.pt&#39;</span></span>
<span id="cb25-30"><a href="methods.html#cb25-30" tabindex="-1"></a></span>
<span id="cb25-31"><a href="methods.html#cb25-31" tabindex="-1"></a>    <span class="kw">def</span> process(<span class="va">self</span>):</span>
<span id="cb25-32"><a href="methods.html#cb25-32" tabindex="-1"></a>        df <span class="op">=</span> pl.read_csv(<span class="st">&#39;covid_20230504.tsv&#39;</span>, separator<span class="op">=</span><span class="st">&#39;</span><span class="ch">\t</span><span class="st">&#39;</span>).<span class="bu">filter</span>(</span>
<span id="cb25-33"><a href="methods.html#cb25-33" tabindex="-1"></a>            pl.col(<span class="st">&#39;label&#39;</span>) <span class="op">==</span> <span class="st">&#39;compound&#39;</span></span>
<span id="cb25-34"><a href="methods.html#cb25-34" tabindex="-1"></a>        ).with_columns(</span>
<span id="cb25-35"><a href="methods.html#cb25-35" tabindex="-1"></a>            pl.col(<span class="st">&#39;pca1&#39;</span>).<span class="bu">apply</span>(<span class="kw">lambda</span> x: <span class="dv">1</span> <span class="cf">if</span> x <span class="op">&gt;=</span> <span class="dv">5</span> <span class="cf">else</span> <span class="dv">0</span>).alias(<span class="st">&#39;activity&#39;</span>)</span>
<span id="cb25-36"><a href="methods.html#cb25-36" tabindex="-1"></a>        ).with_columns(</span>
<span id="cb25-37"><a href="methods.html#cb25-37" tabindex="-1"></a>            pl.lit([i <span class="cf">for</span> i <span class="kw">in</span> <span class="bu">range</span>(<span class="dv">5087</span>)]).alias(<span class="st">&#39;xid&#39;</span>)</span>
<span id="cb25-38"><a href="methods.html#cb25-38" tabindex="-1"></a>        )</span>
<span id="cb25-39"><a href="methods.html#cb25-39" tabindex="-1"></a></span>
<span id="cb25-40"><a href="methods.html#cb25-40" tabindex="-1"></a>        <span class="cf">with</span> <span class="bu">open</span>(<span class="st">&quot;all_feat.pkl&quot;</span>, <span class="st">&quot;rb&quot;</span>) <span class="im">as</span> f:</span>
<span id="cb25-41"><a href="methods.html#cb25-41" tabindex="-1"></a>            chemical_features <span class="op">=</span> pickle.load(f)</span>
<span id="cb25-42"><a href="methods.html#cb25-42" tabindex="-1"></a></span>
<span id="cb25-43"><a href="methods.html#cb25-43" tabindex="-1"></a>        phenotype_features <span class="op">=</span> df.columns[<span class="dv">5</span>:<span class="op">-</span><span class="dv">6</span>]</span>
<span id="cb25-44"><a href="methods.html#cb25-44" tabindex="-1"></a>        smiles <span class="op">=</span> df.select([<span class="st">&#39;pubchem_smiles&#39;</span>]).to_numpy().flatten()</span>
<span id="cb25-45"><a href="methods.html#cb25-45" tabindex="-1"></a>        y <span class="op">=</span> df.select([<span class="st">&#39;activity&#39;</span>]).to_numpy().flatten()</span>
<span id="cb25-46"><a href="methods.html#cb25-46" tabindex="-1"></a>        pca1 <span class="op">=</span> df.select([<span class="st">&#39;pca1&#39;</span>]).to_numpy().flatten()</span>
<span id="cb25-47"><a href="methods.html#cb25-47" tabindex="-1"></a></span>
<span id="cb25-48"><a href="methods.html#cb25-48" tabindex="-1"></a>        <span class="co"># Convert the smiles into numerical features using a featurizer from deepchem</span></span>
<span id="cb25-49"><a href="methods.html#cb25-49" tabindex="-1"></a>        <span class="co"># Using MolGraphConvFeaturizer</span></span>
<span id="cb25-50"><a href="methods.html#cb25-50" tabindex="-1"></a>        featurizer <span class="op">=</span> dc.feat.MolGraphConvFeaturizer(use_edges<span class="op">=</span><span class="va">True</span>, use_chirality<span class="op">=</span> <span class="va">True</span>, use_partial_charge<span class="op">=</span><span class="va">True</span>)</span>
<span id="cb25-51"><a href="methods.html#cb25-51" tabindex="-1"></a>        graphs <span class="op">=</span> featurizer.featurize(smiles, y<span class="op">=</span>y)</span>
<span id="cb25-52"><a href="methods.html#cb25-52" tabindex="-1"></a>                </span>
<span id="cb25-53"><a href="methods.html#cb25-53" tabindex="-1"></a>        data_list <span class="op">=</span> []</span>
<span id="cb25-54"><a href="methods.html#cb25-54" tabindex="-1"></a>        <span class="cf">for</span> idx, graph <span class="kw">in</span> tqdm(<span class="bu">enumerate</span>(graphs)):</span>
<span id="cb25-55"><a href="methods.html#cb25-55" tabindex="-1"></a>            edge_features <span class="op">=</span> torch.from_numpy(graph.edge_features).<span class="bu">float</span>()</span>
<span id="cb25-56"><a href="methods.html#cb25-56" tabindex="-1"></a>            g <span class="op">=</span> Data(x<span class="op">=</span>torch.from_numpy(graph.node_features).<span class="bu">float</span>(),</span>
<span id="cb25-57"><a href="methods.html#cb25-57" tabindex="-1"></a>                            edge_index<span class="op">=</span>torch.from_numpy(graph.edge_index).<span class="bu">long</span>(),</span>
<span id="cb25-58"><a href="methods.html#cb25-58" tabindex="-1"></a>                            edge_attr<span class="op">=</span>edge_features,</span>
<span id="cb25-59"><a href="methods.html#cb25-59" tabindex="-1"></a>                            y<span class="op">=</span>torch.tensor(y[idx]).<span class="bu">long</span>().unsqueeze(<span class="dv">0</span>),</span>
<span id="cb25-60"><a href="methods.html#cb25-60" tabindex="-1"></a>                            chem_features<span class="op">=</span>torch.from_numpy(chemical_features[idx]).<span class="bu">float</span>().unsqueeze(<span class="dv">0</span>),</span>
<span id="cb25-61"><a href="methods.html#cb25-61" tabindex="-1"></a>                            smiles<span class="op">=</span>smiles[idx])</span>
<span id="cb25-62"><a href="methods.html#cb25-62" tabindex="-1"></a>            data_list.append(g)</span>
<span id="cb25-63"><a href="methods.html#cb25-63" tabindex="-1"></a></span>
<span id="cb25-64"><a href="methods.html#cb25-64" tabindex="-1"></a>            <span class="cf">if</span> <span class="va">self</span>.pre_filter <span class="kw">is</span> <span class="kw">not</span> <span class="va">None</span>:</span>
<span id="cb25-65"><a href="methods.html#cb25-65" tabindex="-1"></a>                data_list <span class="op">=</span> [data <span class="cf">for</span> data <span class="kw">in</span> data_list <span class="cf">if</span> <span class="va">self</span>.pre_filter(data)]</span>
<span id="cb25-66"><a href="methods.html#cb25-66" tabindex="-1"></a></span>
<span id="cb25-67"><a href="methods.html#cb25-67" tabindex="-1"></a>            <span class="cf">if</span> <span class="va">self</span>.pre_transform <span class="kw">is</span> <span class="kw">not</span> <span class="va">None</span>:</span>
<span id="cb25-68"><a href="methods.html#cb25-68" tabindex="-1"></a>                data_list <span class="op">=</span> [<span class="va">self</span>.pre_transform(data) <span class="cf">for</span> data <span class="kw">in</span> data_list]</span>
<span id="cb25-69"><a href="methods.html#cb25-69" tabindex="-1"></a></span>
<span id="cb25-70"><a href="methods.html#cb25-70" tabindex="-1"></a>        data, slices <span class="op">=</span> <span class="va">self</span>.collate(data_list)</span>
<span id="cb25-71"><a href="methods.html#cb25-71" tabindex="-1"></a>        torch.save((data, slices), <span class="va">self</span>.processed_paths[<span class="dv">0</span>])</span>
<span id="cb25-72"><a href="methods.html#cb25-72" tabindex="-1"></a>        </span>
<span id="cb25-73"><a href="methods.html#cb25-73" tabindex="-1"></a><span class="kw">class</span> CovidMolGraph_balanced_classification(InMemoryDataset):</span>
<span id="cb25-74"><a href="methods.html#cb25-74" tabindex="-1"></a>    </span>
<span id="cb25-75"><a href="methods.html#cb25-75" tabindex="-1"></a>    <span class="kw">def</span> <span class="fu">__init__</span>(<span class="va">self</span>, root: <span class="bu">str</span>, transform: Optional[Callable] <span class="op">=</span> <span class="va">None</span>,</span>
<span id="cb25-76"><a href="methods.html#cb25-76" tabindex="-1"></a>                 pre_transform: Optional[Callable] <span class="op">=</span> <span class="va">None</span>,</span>
<span id="cb25-77"><a href="methods.html#cb25-77" tabindex="-1"></a>                 pre_filter: Optional[Callable] <span class="op">=</span> <span class="va">None</span>):</span>
<span id="cb25-78"><a href="methods.html#cb25-78" tabindex="-1"></a>        <span class="bu">super</span>().<span class="fu">__init__</span>(root, transform, pre_transform, pre_filter)</span>
<span id="cb25-79"><a href="methods.html#cb25-79" tabindex="-1"></a>        <span class="va">self</span>.data, <span class="va">self</span>.slices <span class="op">=</span> torch.load(<span class="va">self</span>.processed_paths[<span class="dv">0</span>])</span>
<span id="cb25-80"><a href="methods.html#cb25-80" tabindex="-1"></a></span>
<span id="cb25-81"><a href="methods.html#cb25-81" tabindex="-1"></a>    <span class="at">@property</span></span>
<span id="cb25-82"><a href="methods.html#cb25-82" tabindex="-1"></a>    <span class="kw">def</span> processed_file_names(<span class="va">self</span>) <span class="op">-&gt;</span> <span class="bu">str</span>:</span>
<span id="cb25-83"><a href="methods.html#cb25-83" tabindex="-1"></a>        <span class="cf">return</span> <span class="st">&#39;covid_data_processed.pt&#39;</span></span>
<span id="cb25-84"><a href="methods.html#cb25-84" tabindex="-1"></a></span>
<span id="cb25-85"><a href="methods.html#cb25-85" tabindex="-1"></a>    <span class="kw">def</span> process(<span class="va">self</span>):</span>
<span id="cb25-86"><a href="methods.html#cb25-86" tabindex="-1"></a>        df <span class="op">=</span> pl.read_csv(<span class="st">&#39;covid_20230504.tsv&#39;</span>, separator<span class="op">=</span><span class="st">&#39;</span><span class="ch">\t</span><span class="st">&#39;</span>).<span class="bu">filter</span>(</span>
<span id="cb25-87"><a href="methods.html#cb25-87" tabindex="-1"></a>            pl.col(<span class="st">&#39;label&#39;</span>) <span class="op">==</span> <span class="st">&#39;compound&#39;</span></span>
<span id="cb25-88"><a href="methods.html#cb25-88" tabindex="-1"></a>        ).with_columns(</span>
<span id="cb25-89"><a href="methods.html#cb25-89" tabindex="-1"></a>            pl.col(<span class="st">&#39;pca1&#39;</span>).<span class="bu">apply</span>(<span class="kw">lambda</span> x: <span class="dv">1</span> <span class="cf">if</span> x <span class="op">&gt;=</span> <span class="dv">5</span> <span class="cf">else</span> <span class="dv">0</span>).alias(<span class="st">&#39;activity&#39;</span>)</span>
<span id="cb25-90"><a href="methods.html#cb25-90" tabindex="-1"></a>        ).with_columns(</span>
<span id="cb25-91"><a href="methods.html#cb25-91" tabindex="-1"></a>            pl.lit([i <span class="cf">for</span> i <span class="kw">in</span> <span class="bu">range</span>(<span class="dv">5087</span>)]).alias(<span class="st">&#39;xid&#39;</span>)</span>
<span id="cb25-92"><a href="methods.html#cb25-92" tabindex="-1"></a>        )</span>
<span id="cb25-93"><a href="methods.html#cb25-93" tabindex="-1"></a></span>
<span id="cb25-94"><a href="methods.html#cb25-94" tabindex="-1"></a>        <span class="cf">with</span> <span class="bu">open</span>(<span class="st">&quot;all_feat.pkl&quot;</span>, <span class="st">&quot;rb&quot;</span>) <span class="im">as</span> f:</span>
<span id="cb25-95"><a href="methods.html#cb25-95" tabindex="-1"></a>            chemical_features <span class="op">=</span> pickle.load(f)</span>
<span id="cb25-96"><a href="methods.html#cb25-96" tabindex="-1"></a></span>
<span id="cb25-97"><a href="methods.html#cb25-97" tabindex="-1"></a>        phenotype_features <span class="op">=</span> df.columns[<span class="dv">5</span>:<span class="op">-</span><span class="dv">6</span>]</span>
<span id="cb25-98"><a href="methods.html#cb25-98" tabindex="-1"></a>        smiles <span class="op">=</span> df.select([<span class="st">&#39;pubchem_smiles&#39;</span>]).to_numpy().flatten()</span>
<span id="cb25-99"><a href="methods.html#cb25-99" tabindex="-1"></a>        y <span class="op">=</span> df.select([<span class="st">&#39;activity&#39;</span>]).to_numpy().flatten()</span>
<span id="cb25-100"><a href="methods.html#cb25-100" tabindex="-1"></a>        pca1 <span class="op">=</span> df.select([<span class="st">&#39;pca1&#39;</span>]).to_numpy().flatten()</span>
<span id="cb25-101"><a href="methods.html#cb25-101" tabindex="-1"></a>        </span>
<span id="cb25-102"><a href="methods.html#cb25-102" tabindex="-1"></a>        <span class="co"># pca1 is the value being split; the binary activity label is used for stratification</span></span>
<span id="cb25-103"><a href="methods.html#cb25-103" tabindex="-1"></a>        data <span class="op">=</span> pca1</span>
<span id="cb25-104"><a href="methods.html#cb25-104" tabindex="-1"></a>        labels <span class="op">=</span> y</span>
<span id="cb25-105"><a href="methods.html#cb25-105" tabindex="-1"></a></span>
<span id="cb25-106"><a href="methods.html#cb25-106" tabindex="-1"></a>        <span class="co"># Split the data into train and validation sets (1000 validation samples, stratified by label)</span></span>
<span id="cb25-107"><a href="methods.html#cb25-107" tabindex="-1"></a>        train_data, val_data, train_labels, val_labels <span class="op">=</span> train_test_split(data, labels, test_size<span class="op">=</span><span class="dv">1000</span>, random_state<span class="op">=</span><span class="dv">42</span>,</span>
<span id="cb25-108"><a href="methods.html#cb25-108" tabindex="-1"></a>                                                                          stratify<span class="op">=</span>labels)</span>
<span id="cb25-109"><a href="methods.html#cb25-109" tabindex="-1"></a>        train_list <span class="op">=</span> [<span class="st">&#39;train&#39;</span> <span class="cf">for</span> _ <span class="kw">in</span> <span class="bu">range</span>(<span class="dv">4087</span>)]</span>
<span id="cb25-110"><a href="methods.html#cb25-110" tabindex="-1"></a>        val_list <span class="op">=</span> [<span class="st">&#39;valid&#39;</span> <span class="cf">for</span> _ <span class="kw">in</span> <span class="bu">range</span>(<span class="dv">1000</span>)]</span>
<span id="cb25-111"><a href="methods.html#cb25-111" tabindex="-1"></a>        mask_list <span class="op">=</span> train_list <span class="op">+</span> val_list</span>
<span id="cb25-112"><a href="methods.html#cb25-112" tabindex="-1"></a>        </span>
<span id="cb25-113"><a href="methods.html#cb25-113" tabindex="-1"></a>        mask <span class="op">=</span> pl.DataFrame(</span>
<span id="cb25-114"><a href="methods.html#cb25-114" tabindex="-1"></a>            {</span>
<span id="cb25-115"><a href="methods.html#cb25-115" tabindex="-1"></a>                <span class="st">&#39;pca1&#39;</span>: np.hstack((train_data, val_data)),</span>
<span id="cb25-116"><a href="methods.html#cb25-116" tabindex="-1"></a>                <span class="st">&#39;mask&#39;</span>: mask_list</span>
<span id="cb25-117"><a href="methods.html#cb25-117" tabindex="-1"></a>            }</span>
<span id="cb25-118"><a href="methods.html#cb25-118" tabindex="-1"></a>        ).sort(<span class="st">&#39;pca1&#39;</span>, descending<span class="op">=</span><span class="va">True</span>)</span>
<span id="cb25-119"><a href="methods.html#cb25-119" tabindex="-1"></a>        df <span class="op">=</span> df.join(mask, on<span class="op">=</span><span class="st">&#39;pca1&#39;</span>, how<span class="op">=</span><span class="st">&#39;left&#39;</span>)</span>
<span id="cb25-120"><a href="methods.html#cb25-120" tabindex="-1"></a>        </span>
<span id="cb25-121"><a href="methods.html#cb25-121" tabindex="-1"></a>        neg_class <span class="op">=</span> df[<span class="st">&quot;activity&quot;</span>].value_counts()[<span class="dv">0</span>][<span class="st">&#39;counts&#39;</span>].item()</span>
<span id="cb25-122"><a href="methods.html#cb25-122" tabindex="-1"></a>        pos_class <span class="op">=</span> df[<span class="st">&quot;activity&quot;</span>].value_counts()[<span class="dv">1</span>][<span class="st">&#39;counts&#39;</span>].item()</span>
<span id="cb25-123"><a href="methods.html#cb25-123" tabindex="-1"></a>        multiplier <span class="op">=</span> <span class="bu">int</span>(neg_class<span class="op">/</span>pos_class) <span class="op">-</span> <span class="dv">1</span></span>
<span id="cb25-124"><a href="methods.html#cb25-124" tabindex="-1"></a></span>
<span id="cb25-125"><a href="methods.html#cb25-125" tabindex="-1"></a>        df <span class="op">=</span> df.with_columns(</span>
<span id="cb25-126"><a href="methods.html#cb25-126" tabindex="-1"></a>            pl.col(<span class="st">&#39;activity&#39;</span>).<span class="bu">apply</span>(<span class="kw">lambda</span> x: x<span class="op">*</span>multiplier <span class="cf">if</span> x <span class="op">==</span> <span class="dv">1</span> <span class="cf">else</span> <span class="dv">1</span>).alias(<span class="st">&#39;to_replicate&#39;</span>)</span>
<span id="cb25-127"><a href="methods.html#cb25-127" tabindex="-1"></a>        )</span>
<span id="cb25-128"><a href="methods.html#cb25-128" tabindex="-1"></a>        </span>
<span id="cb25-129"><a href="methods.html#cb25-129" tabindex="-1"></a>        train_df <span class="op">=</span> df.<span class="bu">filter</span>(pl.col(<span class="st">&#39;mask&#39;</span>) <span class="op">==</span> <span class="st">&#39;train&#39;</span>)</span>
<span id="cb25-130"><a href="methods.html#cb25-130" tabindex="-1"></a>        valid_df <span class="op">=</span> df.<span class="bu">filter</span>(pl.col(<span class="st">&#39;mask&#39;</span>) <span class="op">==</span> <span class="st">&#39;valid&#39;</span>)</span>
<span id="cb25-131"><a href="methods.html#cb25-131" tabindex="-1"></a>        </span>
<span id="cb25-132"><a href="methods.html#cb25-132" tabindex="-1"></a>        balanced_train_df <span class="op">=</span> train_df.select(</span>
<span id="cb25-133"><a href="methods.html#cb25-133" tabindex="-1"></a>            pl.exclude(<span class="st">&#39;to_replicate&#39;</span>).repeat_by(<span class="st">&#39;to_replicate&#39;</span>).explode()</span>
<span id="cb25-134"><a href="methods.html#cb25-134" tabindex="-1"></a>        )</span>
<span id="cb25-135"><a href="methods.html#cb25-135" tabindex="-1"></a>        balanced_valid_df <span class="op">=</span> valid_df.select(</span>
<span id="cb25-136"><a href="methods.html#cb25-136" tabindex="-1"></a>            pl.exclude(<span class="st">&#39;to_replicate&#39;</span>).repeat_by(<span class="st">&#39;to_replicate&#39;</span>).explode()</span>
<span id="cb25-137"><a href="methods.html#cb25-137" tabindex="-1"></a>        )</span>
<span id="cb25-138"><a href="methods.html#cb25-138" tabindex="-1"></a>        balanced_df <span class="op">=</span> balanced_train_df.vstack(balanced_valid_df).sort(<span class="st">&quot;xid&quot;</span>, descending<span class="op">=</span><span class="va">False</span>)</span>
<span id="cb25-139"><a href="methods.html#cb25-139" tabindex="-1"></a>        index_to_replicate <span class="op">=</span> balanced_df.groupby(<span class="st">&quot;xid&quot;</span>, maintain_order<span class="op">=</span><span class="va">True</span>).count()[<span class="st">&#39;count&#39;</span>].to_numpy()</span>
<span id="cb25-140"><a href="methods.html#cb25-140" tabindex="-1"></a>        </span>
<span id="cb25-141"><a href="methods.html#cb25-141" tabindex="-1"></a>        balanced_chemical_features <span class="op">=</span> np.repeat(chemical_features, index_to_replicate, axis<span class="op">=</span><span class="dv">0</span>)</span>
<span id="cb25-142"><a href="methods.html#cb25-142" tabindex="-1"></a>        balanced_smiles <span class="op">=</span> balanced_df.select([<span class="st">&#39;pubchem_smiles&#39;</span>]).to_series().to_list()</span>
<span id="cb25-143"><a href="methods.html#cb25-143" tabindex="-1"></a>        balanced_y <span class="op">=</span> balanced_df.select([<span class="st">&#39;activity&#39;</span>]).to_numpy().flatten()</span>
<span id="cb25-144"><a href="methods.html#cb25-144" tabindex="-1"></a>        balanced_pca1 <span class="op">=</span> balanced_df.select([<span class="st">&#39;pca1&#39;</span>]).to_numpy().flatten()</span>
<span id="cb25-145"><a href="methods.html#cb25-145" tabindex="-1"></a>        </span>
<span id="cb25-146"><a href="methods.html#cb25-146" tabindex="-1"></a>        <span class="co"># Convert the smiles into numerical features using a featurizer from deepchem</span></span>
<span id="cb25-147"><a href="methods.html#cb25-147" tabindex="-1"></a>        <span class="co"># Using MolGraphConvFeaturizer</span></span>
<span id="cb25-148"><a href="methods.html#cb25-148" tabindex="-1"></a>        featurizer <span class="op">=</span> dc.feat.MolGraphConvFeaturizer(use_edges<span class="op">=</span><span class="va">True</span>, use_chirality<span class="op">=</span> <span class="va">True</span>, use_partial_charge<span class="op">=</span><span class="va">True</span>)</span>
<span id="cb25-149"><a href="methods.html#cb25-149" tabindex="-1"></a>        graphs <span class="op">=</span> featurizer.featurize(balanced_smiles, y<span class="op">=</span>balanced_y)</span>
<span id="cb25-150"><a href="methods.html#cb25-150" tabindex="-1"></a>                </span>
<span id="cb25-151"><a href="methods.html#cb25-151" tabindex="-1"></a>        data_list <span class="op">=</span> []</span>
<span id="cb25-152"><a href="methods.html#cb25-152" tabindex="-1"></a>        <span class="cf">for</span> idx, graph <span class="kw">in</span> tqdm(<span class="bu">enumerate</span>(graphs)):</span>
<span id="cb25-153"><a href="methods.html#cb25-153" tabindex="-1"></a>            edge_features <span class="op">=</span> torch.from_numpy(graph.edge_features).<span class="bu">float</span>()</span>
<span id="cb25-154"><a href="methods.html#cb25-154" tabindex="-1"></a>            g <span class="op">=</span> Data(x<span class="op">=</span>torch.from_numpy(graph.node_features).<span class="bu">float</span>(),</span>
<span id="cb25-155"><a href="methods.html#cb25-155" tabindex="-1"></a>                            edge_index<span class="op">=</span>torch.from_numpy(graph.edge_index).<span class="bu">long</span>(),</span>
<span id="cb25-156"><a href="methods.html#cb25-156" tabindex="-1"></a>                            edge_attr<span class="op">=</span>edge_features,</span>
<span id="cb25-157"><a href="methods.html#cb25-157" tabindex="-1"></a>                            y<span class="op">=</span>torch.tensor(balanced_y[idx]).<span class="bu">long</span>().unsqueeze(<span class="dv">0</span>),</span>
<span id="cb25-158"><a href="methods.html#cb25-158" tabindex="-1"></a>                            chem_features<span class="op">=</span>torch.from_numpy(balanced_chemical_features[idx]).<span class="bu">float</span>().unsqueeze(<span class="dv">0</span>),</span>
<span id="cb25-159"><a href="methods.html#cb25-159" tabindex="-1"></a>                            smiles<span class="op">=</span>balanced_smiles[idx])</span>
<span id="cb25-160"><a href="methods.html#cb25-160" tabindex="-1"></a>            data_list.append(g)</span>
<span id="cb25-161"><a href="methods.html#cb25-161" tabindex="-1"></a></span>
<span id="cb25-162"><a href="methods.html#cb25-162" tabindex="-1"></a>            <span class="cf">if</span> <span class="va">self</span>.pre_filter <span class="kw">is</span> <span class="kw">not</span> <span class="va">None</span>:</span>
<span id="cb25-163"><a href="methods.html#cb25-163" tabindex="-1"></a>                data_list <span class="op">=</span> [data <span class="cf">for</span> data <span class="kw">in</span> data_list <span class="cf">if</span> <span class="va">self</span>.pre_filter(data)]</span>
<span id="cb25-164"><a href="methods.html#cb25-164" tabindex="-1"></a></span>
<span id="cb25-165"><a href="methods.html#cb25-165" tabindex="-1"></a>            <span class="cf">if</span> <span class="va">self</span>.pre_transform <span class="kw">is</span> <span class="kw">not</span> <span class="va">None</span>:</span>
<span id="cb25-166"><a href="methods.html#cb25-166" tabindex="-1"></a>                data_list <span class="op">=</span> [<span class="va">self</span>.pre_transform(data) <span class="cf">for</span> data <span class="kw">in</span> data_list]</span>
<span id="cb25-167"><a href="methods.html#cb25-167" tabindex="-1"></a></span>
<span id="cb25-168"><a href="methods.html#cb25-168" tabindex="-1"></a>        data, slices <span class="op">=</span> <span class="va">self</span>.collate(data_list)</span>
<span id="cb25-169"><a href="methods.html#cb25-169" tabindex="-1"></a>        torch.save((data, slices), <span class="va">self</span>.processed_paths[<span class="dv">0</span>])</span></code></pre></div>
<p>Two further datasets were created for the regression task. They differ from the classification datasets only in the model target <code>y</code>: instead of the binary active/inactive label of a compound, the target is the <code>PCA1</code> value of the molecule.</p>
<div class="sourceCode" id="cb26"><pre class="sourceCode python"><code class="sourceCode python"><span id="cb26-1"><a href="methods.html#cb26-1" tabindex="-1"></a><span class="kw">class</span> CovidMolGraph_imbalance_regression(InMemoryDataset):</span>
<span id="cb26-2"><a href="methods.html#cb26-2" tabindex="-1"></a>    </span>
<span id="cb26-3"><a href="methods.html#cb26-3" tabindex="-1"></a>    <span class="kw">def</span> <span class="fu">__init__</span>(<span class="va">self</span>, root: <span class="bu">str</span>, transform: Optional[Callable] <span class="op">=</span> <span class="va">None</span>,</span>
<span id="cb26-4"><a href="methods.html#cb26-4" tabindex="-1"></a>                 pre_transform: Optional[Callable] <span class="op">=</span> <span class="va">None</span>,</span>
<span id="cb26-5"><a href="methods.html#cb26-5" tabindex="-1"></a>                 pre_filter: Optional[Callable] <span class="op">=</span> <span class="va">None</span>):</span>
<span id="cb26-6"><a href="methods.html#cb26-6" tabindex="-1"></a>        <span class="bu">super</span>().<span class="fu">__init__</span>(root, transform, pre_transform, pre_filter)</span>
<span id="cb26-7"><a href="methods.html#cb26-7" tabindex="-1"></a>        <span class="va">self</span>.data, <span class="va">self</span>.slices <span class="op">=</span> torch.load(<span class="va">self</span>.processed_paths[<span class="dv">0</span>])</span>
<span id="cb26-8"><a href="methods.html#cb26-8" tabindex="-1"></a></span>
<span id="cb26-9"><a href="methods.html#cb26-9" tabindex="-1"></a>    <span class="at">@property</span></span>
<span id="cb26-10"><a href="methods.html#cb26-10" tabindex="-1"></a>    <span class="kw">def</span> processed_file_names(<span class="va">self</span>) <span class="op">-&gt;</span> <span class="bu">str</span>:</span>
<span id="cb26-11"><a href="methods.html#cb26-11" tabindex="-1"></a>        <span class="cf">return</span> <span class="st">&#39;covid_data_processed.pt&#39;</span></span>
<span id="cb26-12"><a href="methods.html#cb26-12" tabindex="-1"></a></span>
<span id="cb26-13"><a href="methods.html#cb26-13" tabindex="-1"></a>    <span class="kw">def</span> process(<span class="va">self</span>):</span>
<span id="cb26-14"><a href="methods.html#cb26-14" tabindex="-1"></a>        df <span class="op">=</span> pl.read_csv(<span class="st">&#39;covid_20230504.tsv&#39;</span>, separator<span class="op">=</span><span class="st">&#39;</span><span class="ch">\t</span><span class="st">&#39;</span>).<span class="bu">filter</span>(</span>
<span id="cb26-15"><a href="methods.html#cb26-15" tabindex="-1"></a>            pl.col(<span class="st">&#39;label&#39;</span>) <span class="op">==</span> <span class="st">&#39;compound&#39;</span></span>
<span id="cb26-16"><a href="methods.html#cb26-16" tabindex="-1"></a>        ).with_columns(</span>
<span id="cb26-17"><a href="methods.html#cb26-17" tabindex="-1"></a>            pl.col(<span class="st">&#39;pca1&#39;</span>).<span class="bu">apply</span>(<span class="kw">lambda</span> x: <span class="dv">1</span> <span class="cf">if</span> x <span class="op">&gt;=</span> <span class="dv">5</span> <span class="cf">else</span> <span class="dv">0</span>).alias(<span class="st">&#39;activity&#39;</span>)</span>
<span id="cb26-18"><a href="methods.html#cb26-18" tabindex="-1"></a>        ).with_columns(</span>
<span id="cb26-19"><a href="methods.html#cb26-19" tabindex="-1"></a>            pl.lit([i <span class="cf">for</span> i <span class="kw">in</span> <span class="bu">range</span>(<span class="dv">5087</span>)]).alias(<span class="st">&#39;xid&#39;</span>)</span>
<span id="cb26-20"><a href="methods.html#cb26-20" tabindex="-1"></a>        )</span>
<span id="cb26-21"><a href="methods.html#cb26-21" tabindex="-1"></a></span>
<span id="cb26-22"><a href="methods.html#cb26-22" tabindex="-1"></a>        <span class="cf">with</span> <span class="bu">open</span>(<span class="st">&quot;all_feat.pkl&quot;</span>, <span class="st">&quot;rb&quot;</span>) <span class="im">as</span> f:</span>
<span id="cb26-23"><a href="methods.html#cb26-23" tabindex="-1"></a>            chemical_features <span class="op">=</span> pickle.load(f)</span>
<span id="cb26-24"><a href="methods.html#cb26-24" tabindex="-1"></a></span>
<span id="cb26-25"><a href="methods.html#cb26-25" tabindex="-1"></a>        phenotype_features <span class="op">=</span> df.columns[<span class="dv">5</span>:<span class="op">-</span><span class="dv">6</span>]</span>
<span id="cb26-26"><a href="methods.html#cb26-26" tabindex="-1"></a>        smiles <span class="op">=</span> df.select([<span class="st">&#39;pubchem_smiles&#39;</span>]).to_numpy().flatten()</span>
<span id="cb26-27"><a href="methods.html#cb26-27" tabindex="-1"></a>        y <span class="op">=</span> df.select([<span class="st">&#39;activity&#39;</span>]).to_numpy().flatten()</span>
<span id="cb26-28"><a href="methods.html#cb26-28" tabindex="-1"></a>        pca1 <span class="op">=</span> df.select([<span class="st">&#39;pca1&#39;</span>]).to_numpy().flatten()</span>
<span id="cb26-29"><a href="methods.html#cb26-29" tabindex="-1"></a></span>
<span id="cb26-30"><a href="methods.html#cb26-30" tabindex="-1"></a>        <span class="co"># Convert the smiles into numerical features using a featurizer from deepchem</span></span>
<span id="cb26-31"><a href="methods.html#cb26-31" tabindex="-1"></a>        <span class="co"># Using MolGraphConvFeaturizer</span></span>
<span id="cb26-32"><a href="methods.html#cb26-32" tabindex="-1"></a>        featurizer <span class="op">=</span> dc.feat.MolGraphConvFeaturizer(use_edges<span class="op">=</span><span class="va">True</span>, use_chirality<span class="op">=</span> <span class="va">True</span>, use_partial_charge<span class="op">=</span><span class="va">True</span>)</span>
<span id="cb26-33"><a href="methods.html#cb26-33" tabindex="-1"></a>        graphs <span class="op">=</span> featurizer.featurize(smiles, y<span class="op">=</span>pca1)</span>
<span id="cb26-34"><a href="methods.html#cb26-34" tabindex="-1"></a>                </span>
<span id="cb26-35"><a href="methods.html#cb26-35" tabindex="-1"></a>        data_list <span class="op">=</span> []</span>
<span id="cb26-36"><a href="methods.html#cb26-36" tabindex="-1"></a>        <span class="cf">for</span> idx, graph <span class="kw">in</span> tqdm(<span class="bu">enumerate</span>(graphs)):</span>
<span id="cb26-37"><a href="methods.html#cb26-37" tabindex="-1"></a>            edge_features <span class="op">=</span> torch.from_numpy(graph.edge_features).<span class="bu">float</span>()</span>
<span id="cb26-38"><a href="methods.html#cb26-38" tabindex="-1"></a>            g <span class="op">=</span> Data(x<span class="op">=</span>torch.from_numpy(graph.node_features).<span class="bu">float</span>(),</span>
<span id="cb26-39"><a href="methods.html#cb26-39" tabindex="-1"></a>                            edge_index<span class="op">=</span>torch.from_numpy(graph.edge_index).<span class="bu">long</span>(),</span>
<span id="cb26-40"><a href="methods.html#cb26-40" tabindex="-1"></a>                            edge_attr<span class="op">=</span>edge_features,</span>
<span id="cb26-41"><a href="methods.html#cb26-41" tabindex="-1"></a>                            y<span class="op">=</span>torch.tensor(pca1[idx]).<span class="bu">long</span>().unsqueeze(<span class="dv">0</span>),</span>
<span id="cb26-42"><a href="methods.html#cb26-42" tabindex="-1"></a>                            chem_features<span class="op">=</span>torch.from_numpy(chemical_features[idx]).<span class="bu">float</span>().unsqueeze(<span class="dv">0</span>),</span>
<span id="cb26-43"><a href="methods.html#cb26-43" tabindex="-1"></a>                            smiles<span class="op">=</span>smiles[idx])</span>
<span id="cb26-44"><a href="methods.html#cb26-44" tabindex="-1"></a>            data_list.append(g)</span>
<span id="cb26-45"><a href="methods.html#cb26-45" tabindex="-1"></a></span>
<span id="cb26-46"><a href="methods.html#cb26-46" tabindex="-1"></a>            <span class="cf">if</span> <span class="va">self</span>.pre_filter <span class="kw">is</span> <span class="kw">not</span> <span class="va">None</span>:</span>
<span id="cb26-47"><a href="methods.html#cb26-47" tabindex="-1"></a>                data_list <span class="op">=</span> [data <span class="cf">for</span> data <span class="kw">in</span> data_list <span class="cf">if</span> <span class="va">self</span>.pre_filter(data)]</span>
<span id="cb26-48"><a href="methods.html#cb26-48" tabindex="-1"></a></span>
<span id="cb26-49"><a href="methods.html#cb26-49" tabindex="-1"></a>            <span class="cf">if</span> <span class="va">self</span>.pre_transform <span class="kw">is</span> <span class="kw">not</span> <span class="va">None</span>:</span>
<span id="cb26-50"><a href="methods.html#cb26-50" tabindex="-1"></a>                data_list <span class="op">=</span> [<span class="va">self</span>.pre_transform(data) <span class="cf">for</span> data <span class="kw">in</span> data_list]</span>
<span id="cb26-51"><a href="methods.html#cb26-51" tabindex="-1"></a></span>
<span id="cb26-52"><a href="methods.html#cb26-52" tabindex="-1"></a>        data, slices <span class="op">=</span> <span class="va">self</span>.collate(data_list)</span>
<span id="cb26-53"><a href="methods.html#cb26-53" tabindex="-1"></a>        torch.save((data, slices), <span class="va">self</span>.processed_paths[<span class="dv">0</span>])</span>
<span id="cb26-54"><a href="methods.html#cb26-54" tabindex="-1"></a>        </span>
<span id="cb26-55"><a href="methods.html#cb26-55" tabindex="-1"></a><span class="kw">class</span> CovidMolGraph_balanced_regression(InMemoryDataset):</span>
<span id="cb26-56"><a href="methods.html#cb26-56" tabindex="-1"></a>    </span>
<span id="cb26-57"><a href="methods.html#cb26-57" tabindex="-1"></a>    <span class="kw">def</span> <span class="fu">__init__</span>(<span class="va">self</span>, root: <span class="bu">str</span>, transform: Optional[Callable] <span class="op">=</span> <span class="va">None</span>,</span>
<span id="cb26-58"><a href="methods.html#cb26-58" tabindex="-1"></a>                 pre_transform: Optional[Callable] <span class="op">=</span> <span class="va">None</span>,</span>
<span id="cb26-59"><a href="methods.html#cb26-59" tabindex="-1"></a>                 pre_filter: Optional[Callable] <span class="op">=</span> <span class="va">None</span>):</span>
<span id="cb26-60"><a href="methods.html#cb26-60" tabindex="-1"></a>        <span class="bu">super</span>().<span class="fu">__init__</span>(root, transform, pre_transform, pre_filter)</span>
<span id="cb26-61"><a href="methods.html#cb26-61" tabindex="-1"></a>        <span class="va">self</span>.data, <span class="va">self</span>.slices <span class="op">=</span> torch.load(<span class="va">self</span>.processed_paths[<span class="dv">0</span>])</span>
<span id="cb26-62"><a href="methods.html#cb26-62" tabindex="-1"></a></span>
<span id="cb26-63"><a href="methods.html#cb26-63" tabindex="-1"></a>    <span class="at">@property</span></span>
<span id="cb26-64"><a href="methods.html#cb26-64" tabindex="-1"></a>    <span class="kw">def</span> processed_file_names(<span class="va">self</span>) <span class="op">-&gt;</span> <span class="bu">str</span>:</span>
<span id="cb26-65"><a href="methods.html#cb26-65" tabindex="-1"></a>        <span class="cf">return</span> <span class="st">&#39;covid_data_processed.pt&#39;</span></span>
<span id="cb26-66"><a href="methods.html#cb26-66" tabindex="-1"></a></span>
<span id="cb26-67"><a href="methods.html#cb26-67" tabindex="-1"></a>    <span class="kw">def</span> process(<span class="va">self</span>):</span>
<span id="cb26-68"><a href="methods.html#cb26-68" tabindex="-1"></a>        df <span class="op">=</span> pl.read_csv(<span class="st">&#39;covid_20230504.tsv&#39;</span>, separator<span class="op">=</span><span class="st">&#39;</span><span class="ch">\t</span><span class="st">&#39;</span>).<span class="bu">filter</span>(</span>
<span id="cb26-69"><a href="methods.html#cb26-69" tabindex="-1"></a>            pl.col(<span class="st">&#39;label&#39;</span>) <span class="op">==</span> <span class="st">&#39;compound&#39;</span></span>
<span id="cb26-70"><a href="methods.html#cb26-70" tabindex="-1"></a>        ).with_columns(</span>
<span id="cb26-71"><a href="methods.html#cb26-71" tabindex="-1"></a>            pl.col(<span class="st">&#39;pca1&#39;</span>).<span class="bu">apply</span>(<span class="kw">lambda</span> x: <span class="dv">1</span> <span class="cf">if</span> x <span class="op">&gt;=</span> <span class="dv">5</span> <span class="cf">else</span> <span class="dv">0</span>).alias(<span class="st">&#39;activity&#39;</span>)</span>
<span id="cb26-72"><a href="methods.html#cb26-72" tabindex="-1"></a>        ).with_columns(</span>
<span id="cb26-73"><a href="methods.html#cb26-73" tabindex="-1"></a>            pl.lit([i <span class="cf">for</span> i <span class="kw">in</span> <span class="bu">range</span>(<span class="dv">5087</span>)]).alias(<span class="st">&#39;xid&#39;</span>)</span>
<span id="cb26-74"><a href="methods.html#cb26-74" tabindex="-1"></a>        )</span>
<span id="cb26-75"><a href="methods.html#cb26-75" tabindex="-1"></a></span>
<span id="cb26-76"><a href="methods.html#cb26-76" tabindex="-1"></a>        <span class="cf">with</span> <span class="bu">open</span>(<span class="st">&quot;all_feat.pkl&quot;</span>, <span class="st">&quot;rb&quot;</span>) <span class="im">as</span> f:</span>
<span id="cb26-77"><a href="methods.html#cb26-77" tabindex="-1"></a>            chemical_features <span class="op">=</span> pickle.load(f)</span>
<span id="cb26-78"><a href="methods.html#cb26-78" tabindex="-1"></a></span>
<span id="cb26-79"><a href="methods.html#cb26-79" tabindex="-1"></a>        phenotype_features <span class="op">=</span> df.columns[<span class="dv">5</span>:<span class="op">-</span><span class="dv">6</span>]</span>
<span id="cb26-80"><a href="methods.html#cb26-80" tabindex="-1"></a>        smiles <span class="op">=</span> df.select([<span class="st">&#39;pubchem_smiles&#39;</span>]).to_numpy().flatten()</span>
<span id="cb26-81"><a href="methods.html#cb26-81" tabindex="-1"></a>        y <span class="op">=</span> df.select([<span class="st">&#39;activity&#39;</span>]).to_numpy().flatten()</span>
<span id="cb26-82"><a href="methods.html#cb26-82" tabindex="-1"></a>        pca1 <span class="op">=</span> df.select([<span class="st">&#39;pca1&#39;</span>]).to_numpy().flatten()</span>
<span id="cb26-83"><a href="methods.html#cb26-83" tabindex="-1"></a>        </span>
<span id="cb26-84"><a href="methods.html#cb26-84" tabindex="-1"></a>        <span class="co"># Regression targets (pca1) and binary activity labels used only for stratification</span></span>
<span id="cb26-85"><a href="methods.html#cb26-85" tabindex="-1"></a>        data <span class="op">=</span> pca1</span>
<span id="cb26-86"><a href="methods.html#cb26-86" tabindex="-1"></a>        labels <span class="op">=</span> y</span>
<span id="cb26-87"><a href="methods.html#cb26-87" tabindex="-1"></a></span>
<span id="cb26-88"><a href="methods.html#cb26-88" tabindex="-1"></a>        <span class="co"># Split the data into train and validation sets (1000 validation samples)</span></span>
<span id="cb26-89"><a href="methods.html#cb26-89" tabindex="-1"></a>        train_data, val_data, train_labels, val_labels <span class="op">=</span> train_test_split(data, labels, test_size<span class="op">=</span><span class="dv">1000</span>, random_state<span class="op">=</span><span class="dv">42</span>,</span>
<span id="cb26-90"><a href="methods.html#cb26-90" tabindex="-1"></a>                                                                          stratify<span class="op">=</span>labels)</span>
<span id="cb26-91"><a href="methods.html#cb26-91" tabindex="-1"></a>        train_list <span class="op">=</span> [<span class="st">&#39;train&#39;</span> <span class="cf">for</span> _ <span class="kw">in</span> <span class="bu">range</span>(<span class="dv">4087</span>)]</span>
<span id="cb26-92"><a href="methods.html#cb26-92" tabindex="-1"></a>        val_list <span class="op">=</span> [<span class="st">&#39;valid&#39;</span> <span class="cf">for</span> _ <span class="kw">in</span> <span class="bu">range</span>(<span class="dv">1000</span>)]</span>
<span id="cb26-93"><a href="methods.html#cb26-93" tabindex="-1"></a>        mask_list <span class="op">=</span> train_list <span class="op">+</span> val_list</span>
<span id="cb26-94"><a href="methods.html#cb26-94" tabindex="-1"></a>        </span>
<span id="cb26-95"><a href="methods.html#cb26-95" tabindex="-1"></a>        mask <span class="op">=</span> pl.DataFrame(</span>
<span id="cb26-96"><a href="methods.html#cb26-96" tabindex="-1"></a>            {</span>
<span id="cb26-97"><a href="methods.html#cb26-97" tabindex="-1"></a>                <span class="st">&#39;pca1&#39;</span>: np.hstack((train_data, val_data)),</span>
<span id="cb26-98"><a href="methods.html#cb26-98" tabindex="-1"></a>                <span class="st">&#39;mask&#39;</span>: mask_list</span>
<span id="cb26-99"><a href="methods.html#cb26-99" tabindex="-1"></a>            }</span>
<span id="cb26-100"><a href="methods.html#cb26-100" tabindex="-1"></a>        ).sort(<span class="st">&#39;pca1&#39;</span>, descending<span class="op">=</span><span class="va">True</span>)</span>
<span id="cb26-101"><a href="methods.html#cb26-101" tabindex="-1"></a>        df <span class="op">=</span> df.join(mask, on<span class="op">=</span><span class="st">&#39;pca1&#39;</span>, how<span class="op">=</span><span class="st">&#39;left&#39;</span>)</span>
<span id="cb26-102"><a href="methods.html#cb26-102" tabindex="-1"></a>        </span>
<span id="cb26-103"><a href="methods.html#cb26-103" tabindex="-1"></a>        neg_class <span class="op">=</span> df[<span class="st">&quot;activity&quot;</span>].value_counts()[<span class="dv">0</span>][<span class="st">&#39;counts&#39;</span>].item()</span>
<span id="cb26-104"><a href="methods.html#cb26-104" tabindex="-1"></a>        pos_class <span class="op">=</span> df[<span class="st">&quot;activity&quot;</span>].value_counts()[<span class="dv">1</span>][<span class="st">&#39;counts&#39;</span>].item()</span>
<span id="cb26-105"><a href="methods.html#cb26-105" tabindex="-1"></a>        multiplier <span class="op">=</span> <span class="bu">int</span>(neg_class<span class="op">/</span>pos_class) <span class="op">-</span> <span class="dv">1</span></span>
<span id="cb26-106"><a href="methods.html#cb26-106" tabindex="-1"></a></span>
<span id="cb26-107"><a href="methods.html#cb26-107" tabindex="-1"></a>        df <span class="op">=</span> df.with_columns(</span>
<span id="cb26-108"><a href="methods.html#cb26-108" tabindex="-1"></a>            pl.col(<span class="st">&#39;activity&#39;</span>).<span class="bu">apply</span>(<span class="kw">lambda</span> x: x<span class="op">*</span>multiplier <span class="cf">if</span> x <span class="op">==</span> <span class="dv">1</span> <span class="cf">else</span> <span class="dv">1</span>).alias(<span class="st">&#39;to_replicate&#39;</span>)</span>
<span id="cb26-109"><a href="methods.html#cb26-109" tabindex="-1"></a>        )</span>
<span id="cb26-110"><a href="methods.html#cb26-110" tabindex="-1"></a>        </span>
<span id="cb26-111"><a href="methods.html#cb26-111" tabindex="-1"></a>        train_df <span class="op">=</span> df.<span class="bu">filter</span>(pl.col(<span class="st">&#39;mask&#39;</span>) <span class="op">==</span> <span class="st">&#39;train&#39;</span>)</span>
<span id="cb26-112"><a href="methods.html#cb26-112" tabindex="-1"></a>        valid_df <span class="op">=</span> df.<span class="bu">filter</span>(pl.col(<span class="st">&#39;mask&#39;</span>) <span class="op">==</span> <span class="st">&#39;valid&#39;</span>)</span>
<span id="cb26-113"><a href="methods.html#cb26-113" tabindex="-1"></a>        </span>
<span id="cb26-114"><a href="methods.html#cb26-114" tabindex="-1"></a>        balanced_train_df <span class="op">=</span> train_df.select(</span>
<span id="cb26-115"><a href="methods.html#cb26-115" tabindex="-1"></a>            pl.exclude(<span class="st">&#39;to_replicate&#39;</span>).repeat_by(<span class="st">&#39;to_replicate&#39;</span>).explode()</span>
<span id="cb26-116"><a href="methods.html#cb26-116" tabindex="-1"></a>        )</span>
<span id="cb26-117"><a href="methods.html#cb26-117" tabindex="-1"></a>        balanced_valid_df <span class="op">=</span> valid_df.select(</span>
<span id="cb26-118"><a href="methods.html#cb26-118" tabindex="-1"></a>            pl.exclude(<span class="st">&#39;to_replicate&#39;</span>).repeat_by(<span class="st">&#39;to_replicate&#39;</span>).explode()</span>
<span id="cb26-119"><a href="methods.html#cb26-119" tabindex="-1"></a>        )</span>
<span id="cb26-120"><a href="methods.html#cb26-120" tabindex="-1"></a>        balanced_df <span class="op">=</span> balanced_train_df.vstack(balanced_valid_df).sort(<span class="st">&quot;xid&quot;</span>, descending<span class="op">=</span><span class="va">False</span>)</span>
<span id="cb26-121"><a href="methods.html#cb26-121" tabindex="-1"></a>        index_to_replicate <span class="op">=</span> balanced_df.groupby(<span class="st">&quot;xid&quot;</span>, maintain_order<span class="op">=</span><span class="va">True</span>).count()[<span class="st">&#39;count&#39;</span>].to_numpy()</span>
<span id="cb26-122"><a href="methods.html#cb26-122" tabindex="-1"></a>        </span>
<span id="cb26-123"><a href="methods.html#cb26-123" tabindex="-1"></a>        balanced_chemical_features <span class="op">=</span> np.repeat(chemical_features, index_to_replicate, axis<span class="op">=</span><span class="dv">0</span>)</span>
<span id="cb26-124"><a href="methods.html#cb26-124" tabindex="-1"></a>        balanced_smiles <span class="op">=</span> balanced_df.select([<span class="st">&#39;pubchem_smiles&#39;</span>]).to_series().to_list()</span>
<span id="cb26-125"><a href="methods.html#cb26-125" tabindex="-1"></a>        balanced_y <span class="op">=</span> balanced_df.select([<span class="st">&#39;activity&#39;</span>]).to_numpy().flatten()</span>
<span id="cb26-126"><a href="methods.html#cb26-126" tabindex="-1"></a>        balanced_pca1 <span class="op">=</span> balanced_df.select([<span class="st">&#39;pca1&#39;</span>]).to_numpy().flatten()</span>
<span id="cb26-127"><a href="methods.html#cb26-127" tabindex="-1"></a>        </span>
<span id="cb26-128"><a href="methods.html#cb26-128" tabindex="-1"></a>        <span class="co"># Convert the smiles into numerical features using a featurizer from deepchem</span></span>
<span id="cb26-129"><a href="methods.html#cb26-129" tabindex="-1"></a>        <span class="co"># Using MolGraphConvFeaturizer</span></span>
<span id="cb26-130"><a href="methods.html#cb26-130" tabindex="-1"></a>        featurizer <span class="op">=</span> dc.feat.MolGraphConvFeaturizer(use_edges<span class="op">=</span><span class="va">True</span>, use_chirality<span class="op">=</span> <span class="va">True</span>, use_partial_charge<span class="op">=</span><span class="va">True</span>)</span>
<span id="cb26-131"><a href="methods.html#cb26-131" tabindex="-1"></a>        graphs <span class="op">=</span> featurizer.featurize(balanced_smiles, y<span class="op">=</span>balanced_pca1)</span>
<span id="cb26-132"><a href="methods.html#cb26-132" tabindex="-1"></a>                </span>
<span id="cb26-133"><a href="methods.html#cb26-133" tabindex="-1"></a>        data_list <span class="op">=</span> []</span>
<span id="cb26-134"><a href="methods.html#cb26-134" tabindex="-1"></a>        <span class="cf">for</span> idx, graph <span class="kw">in</span> tqdm(<span class="bu">enumerate</span>(graphs)):</span>
<span id="cb26-135"><a href="methods.html#cb26-135" tabindex="-1"></a>            edge_features <span class="op">=</span> torch.from_numpy(graph.edge_features).<span class="bu">float</span>()</span>
<span id="cb26-136"><a href="methods.html#cb26-136" tabindex="-1"></a>            g <span class="op">=</span> Data(x<span class="op">=</span>torch.from_numpy(graph.node_features).<span class="bu">float</span>(),</span>
<span id="cb26-137"><a href="methods.html#cb26-137" tabindex="-1"></a>                            edge_index<span class="op">=</span>torch.from_numpy(graph.edge_index).<span class="bu">long</span>(),</span>
<span id="cb26-138"><a href="methods.html#cb26-138" tabindex="-1"></a>                            edge_attr<span class="op">=</span>edge_features,</span>
<span id="cb26-139"><a href="methods.html#cb26-139" tabindex="-1"></a>                            y<span class="op">=</span>torch.tensor(balanced_pca1[idx]).<span class="bu">long</span>().unsqueeze(<span class="dv">0</span>),</span>
<span id="cb26-140"><a href="methods.html#cb26-140" tabindex="-1"></a>                            chem_features<span class="op">=</span>torch.from_numpy(balanced_chemical_features[idx]).<span class="bu">float</span>().unsqueeze(<span class="dv">0</span>),</span>
<span id="cb26-141"><a href="methods.html#cb26-141" tabindex="-1"></a>                            smiles<span class="op">=</span>balanced_smiles[idx])</span>
<span id="cb26-142"><a href="methods.html#cb26-142" tabindex="-1"></a>            data_list.append(g)</span>
<span id="cb26-143"><a href="methods.html#cb26-143" tabindex="-1"></a></span>
<span id="cb26-144"><a href="methods.html#cb26-144" tabindex="-1"></a>            <span class="cf">if</span> <span class="va">self</span>.pre_filter <span class="kw">is</span> <span class="kw">not</span> <span class="va">None</span>:</span>
<span id="cb26-145"><a href="methods.html#cb26-145" tabindex="-1"></a>                data_list <span class="op">=</span> [data <span class="cf">for</span> data <span class="kw">in</span> data_list <span class="cf">if</span> <span class="va">self</span>.pre_filter(data)]</span>
<span id="cb26-146"><a href="methods.html#cb26-146" tabindex="-1"></a></span>
<span id="cb26-147"><a href="methods.html#cb26-147" tabindex="-1"></a>            <span class="cf">if</span> <span class="va">self</span>.pre_transform <span class="kw">is</span> <span class="kw">not</span> <span class="va">None</span>:</span>
<span id="cb26-148"><a href="methods.html#cb26-148" tabindex="-1"></a>                data_list <span class="op">=</span> [<span class="va">self</span>.pre_transform(data) <span class="cf">for</span> data <span class="kw">in</span> data_list]</span>
<span id="cb26-149"><a href="methods.html#cb26-149" tabindex="-1"></a></span>
<span id="cb26-150"><a href="methods.html#cb26-150" tabindex="-1"></a>        data, slices <span class="op">=</span> <span class="va">self</span>.collate(data_list)</span>
<span id="cb26-151"><a href="methods.html#cb26-151" tabindex="-1"></a>        torch.save((data, slices), <span class="va">self</span>.processed_paths[<span class="dv">0</span>])   </span></code></pre></div>
</div>
</div>
<div id="models" class="section level2 hasAnchor" number="2.2">
<h2><span class="header-section-number">2.2</span> Models<a href="methods.html#models" class="anchor-section" aria-label="Anchor link to header"></a></h2>
<div id="graph-level-molecular-predictor-glmp" class="section level3 hasAnchor" number="2.2.1">
<h3><span class="header-section-number">2.2.1</span> Graph-Level Molecular Predictor (GLMP)<a href="methods.html#graph-level-molecular-predictor-glmp" class="anchor-section" aria-label="Anchor link to header"></a></h3>
<p>This model operates at the level of the graph, predicting properties based on the structural features of molecules. In machine learning applications in chemistry, compound representation plays a crucial role. The traditional approach involves converting chemical compounds into numerical vectors using various algorithms. However, an increasingly popular alternative is the molecular graph representation, which treats the compound as a graph with atoms as nodes and bonds as edges. This method captures atom connectivity and spatial arrangement within the molecule, providing more detailed information.</p>
<p>Conversely, the conventional vector conversion method transforms chemical compounds into fixed-length vectors by encoding molecular descriptors or fingerprints. Molecular descriptors encompass essential chemical properties, while fingerprints encode the presence or absence of specific substructures within the compound. Although vector representations are more concise and compatible with traditional machine learning algorithms, molecule graphs offer enhanced versatility and applicability to various chemistry tasks. Graph representations’ ability to leverage connectivity patterns and atom-level information leads to improved predictive performance and a deeper understanding of chemical phenomena.</p>
<p>When comparing the two approaches, molecule graphs excel at explicitly capturing structural information and atom relationships, making them advantageous for tasks reliant on spatial arrangement or connectivity patterns. On the other hand, conventional vector representations are more compact and suitable for traditional machine learning algorithms, making them preferable for larger datasets or tasks that do not require explicit structural information. Molecule graphs can handle inputs of variable sizes, accommodating molecules with different numbers of atoms, whereas vector representations typically require fixed-size inputs. However, vector representations may sacrifice some fine-grained structural details or substructure information that molecule graphs can capture.</p>
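<p>The GLMP (Graph-Level Molecular Predictor) model is designed to leverage both molecular graph representations and conventional vector representations in its architecture. The model combines the strengths of both approaches to enhance predictive performance and capture detailed structural information. Concretely, the pooled graph-level representation produced by the graph layers is concatenated with the precomputed chemical feature vector of each compound before being passed to the dense output layers.</p>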
<div class="sourceCode" id="cb27"><pre class="sourceCode python"><code class="sourceCode python"><span id="cb27-1"><a href="methods.html#cb27-1" tabindex="-1"></a><span class="im">import</span> torch</span>
<span id="cb27-2"><a href="methods.html#cb27-2" tabindex="-1"></a><span class="im">import</span> torch.nn.functional <span class="im">as</span> F </span>
<span id="cb27-3"><a href="methods.html#cb27-3" tabindex="-1"></a><span class="im">from</span> torch.nn <span class="im">import</span> Linear, BatchNorm1d, ModuleList</span>
<span id="cb27-4"><a href="methods.html#cb27-4" tabindex="-1"></a><span class="im">from</span> torch_geometric.nn <span class="im">import</span> TransformerConv, TopKPooling </span>
<span id="cb27-5"><a href="methods.html#cb27-5" tabindex="-1"></a><span class="im">from</span> torch_geometric.nn <span class="im">import</span> global_mean_pool <span class="im">as</span> gap, global_max_pool <span class="im">as</span> gmp</span>
<span id="cb27-6"><a href="methods.html#cb27-6" tabindex="-1"></a></span>
<span id="cb27-7"><a href="methods.html#cb27-7" tabindex="-1"></a>torch.manual_seed(<span class="dv">42</span>)</span>
<span id="cb27-8"><a href="methods.html#cb27-8" tabindex="-1"></a>device <span class="op">=</span> torch.device(<span class="st">&#39;cuda&#39;</span> <span class="cf">if</span> torch.cuda.is_available() <span class="cf">else</span> <span class="st">&#39;cpu&#39;</span>)</span>
<span id="cb27-9"><a href="methods.html#cb27-9" tabindex="-1"></a></span>
<span id="cb27-10"><a href="methods.html#cb27-10" tabindex="-1"></a><span class="kw">class</span> GLMP(torch.nn.Module):</span>
<span id="cb27-11"><a href="methods.html#cb27-11" tabindex="-1"></a>    <span class="kw">def</span> <span class="fu">__init__</span>(<span class="va">self</span>, feature_size, model_params):</span>
<span id="cb27-12"><a href="methods.html#cb27-12" tabindex="-1"></a>        <span class="bu">super</span>(GLMP, <span class="va">self</span>).<span class="fu">__init__</span>()</span>
<span id="cb27-13"><a href="methods.html#cb27-13" tabindex="-1"></a>        embedding_size <span class="op">=</span> model_params[<span class="st">&quot;model_embedding_size&quot;</span>]</span>
<span id="cb27-14"><a href="methods.html#cb27-14" tabindex="-1"></a>        n_heads <span class="op">=</span> model_params[<span class="st">&quot;model_attention_heads&quot;</span>]</span>
<span id="cb27-15"><a href="methods.html#cb27-15" tabindex="-1"></a>        <span class="va">self</span>.n_layers <span class="op">=</span> model_params[<span class="st">&quot;model_layers&quot;</span>]</span>
<span id="cb27-16"><a href="methods.html#cb27-16" tabindex="-1"></a>        dropout_rate <span class="op">=</span> model_params[<span class="st">&quot;model_dropout_rate&quot;</span>]</span>
<span id="cb27-17"><a href="methods.html#cb27-17" tabindex="-1"></a>        top_k_ratio <span class="op">=</span> model_params[<span class="st">&quot;model_top_k_ratio&quot;</span>]</span>
<span id="cb27-18"><a href="methods.html#cb27-18" tabindex="-1"></a>        <span class="va">self</span>.top_k_every_n <span class="op">=</span> model_params[<span class="st">&quot;model_top_k_every_n&quot;</span>]</span>
<span id="cb27-19"><a href="methods.html#cb27-19" tabindex="-1"></a>        dense_neurons <span class="op">=</span> model_params[<span class="st">&quot;model_dense_neurons&quot;</span>]</span>
<span id="cb27-20"><a href="methods.html#cb27-20" tabindex="-1"></a>        edge_dim <span class="op">=</span> <span class="dv">11</span></span>
<span id="cb27-21"><a href="methods.html#cb27-21" tabindex="-1"></a></span>
<span id="cb27-22"><a href="methods.html#cb27-22" tabindex="-1"></a>        <span class="va">self</span>.conv_layers <span class="op">=</span> ModuleList([])</span>
<span id="cb27-23"><a href="methods.html#cb27-23" tabindex="-1"></a>        <span class="va">self</span>.transf_layers <span class="op">=</span> ModuleList([])</span>
<span id="cb27-24"><a href="methods.html#cb27-24" tabindex="-1"></a>        <span class="va">self</span>.pooling_layers <span class="op">=</span> ModuleList([])</span>
<span id="cb27-25"><a href="methods.html#cb27-25" tabindex="-1"></a>        <span class="va">self</span>.bn_layers <span class="op">=</span> ModuleList([])</span>
<span id="cb27-26"><a href="methods.html#cb27-26" tabindex="-1"></a></span>
<span id="cb27-27"><a href="methods.html#cb27-27" tabindex="-1"></a>        <span class="co"># Transformation layer</span></span>
<span id="cb27-28"><a href="methods.html#cb27-28" tabindex="-1"></a>        <span class="va">self</span>.conv1 <span class="op">=</span> TransformerConv(feature_size, </span>
<span id="cb27-29"><a href="methods.html#cb27-29" tabindex="-1"></a>                                    embedding_size, </span>
<span id="cb27-30"><a href="methods.html#cb27-30" tabindex="-1"></a>                                    heads<span class="op">=</span>n_heads, </span>
<span id="cb27-31"><a href="methods.html#cb27-31" tabindex="-1"></a>                                    dropout<span class="op">=</span>dropout_rate,</span>
<span id="cb27-32"><a href="methods.html#cb27-32" tabindex="-1"></a>                                    edge_dim<span class="op">=</span>edge_dim,</span>
<span id="cb27-33"><a href="methods.html#cb27-33" tabindex="-1"></a>                                    beta<span class="op">=</span><span class="va">True</span>) </span>
<span id="cb27-34"><a href="methods.html#cb27-34" tabindex="-1"></a></span>
<span id="cb27-35"><a href="methods.html#cb27-35" tabindex="-1"></a>        <span class="va">self</span>.transf1 <span class="op">=</span> Linear(embedding_size<span class="op">*</span>n_heads, embedding_size)</span>
<span id="cb27-36"><a href="methods.html#cb27-36" tabindex="-1"></a>        <span class="va">self</span>.bn1 <span class="op">=</span> BatchNorm1d(embedding_size)</span>
<span id="cb27-37"><a href="methods.html#cb27-37" tabindex="-1"></a>        <span class="va">self</span>.bn2 <span class="op">=</span> BatchNorm1d(<span class="dv">8192</span>)</span>
<span id="cb27-38"><a href="methods.html#cb27-38" tabindex="-1"></a>        <span class="va">self</span>.bn3 <span class="op">=</span> BatchNorm1d(<span class="dv">4096</span>)</span>
<span id="cb27-39"><a href="methods.html#cb27-39" tabindex="-1"></a>        <span class="va">self</span>.bn4 <span class="op">=</span> BatchNorm1d(<span class="dv">2048</span>)</span>
<span id="cb27-40"><a href="methods.html#cb27-40" tabindex="-1"></a>        </span>
<span id="cb27-41"><a href="methods.html#cb27-41" tabindex="-1"></a></span>
<span id="cb27-42"><a href="methods.html#cb27-42" tabindex="-1"></a>        <span class="co"># Other layers</span></span>
<span id="cb27-43"><a href="methods.html#cb27-43" tabindex="-1"></a>        <span class="cf">for</span> i <span class="kw">in</span> <span class="bu">range</span>(<span class="va">self</span>.n_layers):</span>
<span id="cb27-44"><a href="methods.html#cb27-44" tabindex="-1"></a>            <span class="va">self</span>.conv_layers.append(TransformerConv(embedding_size, </span>
<span id="cb27-45"><a href="methods.html#cb27-45" tabindex="-1"></a>                                                    embedding_size, </span>
<span id="cb27-46"><a href="methods.html#cb27-46" tabindex="-1"></a>                                                    heads<span class="op">=</span>n_heads, </span>
<span id="cb27-47"><a href="methods.html#cb27-47" tabindex="-1"></a>                                                    dropout<span class="op">=</span>dropout_rate,</span>
<span id="cb27-48"><a href="methods.html#cb27-48" tabindex="-1"></a>                                                    edge_dim<span class="op">=</span>edge_dim,</span>
<span id="cb27-49"><a href="methods.html#cb27-49" tabindex="-1"></a>                                                    beta<span class="op">=</span><span class="va">True</span>))</span>
<span id="cb27-50"><a href="methods.html#cb27-50" tabindex="-1"></a></span>
<span id="cb27-51"><a href="methods.html#cb27-51" tabindex="-1"></a>            <span class="va">self</span>.transf_layers.append(Linear(embedding_size<span class="op">*</span>n_heads, embedding_size))</span>
<span id="cb27-52"><a href="methods.html#cb27-52" tabindex="-1"></a>            <span class="va">self</span>.bn_layers.append(BatchNorm1d(embedding_size))</span>
<span id="cb27-53"><a href="methods.html#cb27-53" tabindex="-1"></a>            <span class="cf">if</span> i <span class="op">%</span> <span class="va">self</span>.top_k_every_n <span class="op">==</span> <span class="dv">0</span>:</span>
<span id="cb27-54"><a href="methods.html#cb27-54" tabindex="-1"></a>                <span class="va">self</span>.pooling_layers.append(TopKPooling(embedding_size, ratio<span class="op">=</span>top_k_ratio))</span>
<span id="cb27-55"><a href="methods.html#cb27-55" tabindex="-1"></a>            </span>
<span id="cb27-56"><a href="methods.html#cb27-56" tabindex="-1"></a></span>
<span id="cb27-57"><a href="methods.html#cb27-57" tabindex="-1"></a>        <span class="co"># Linear layers</span></span>
<span id="cb27-58"><a href="methods.html#cb27-58" tabindex="-1"></a>        <span class="va">self</span>.linear1 <span class="op">=</span> Linear(<span class="dv">4784</span>, <span class="dv">8192</span>)</span>
<span id="cb27-59"><a href="methods.html#cb27-59" tabindex="-1"></a>        <span class="va">self</span>.linear2 <span class="op">=</span> Linear(<span class="dv">8192</span>, <span class="dv">4096</span>)  </span>
<span id="cb27-60"><a href="methods.html#cb27-60" tabindex="-1"></a>        <span class="va">self</span>.linear3 <span class="op">=</span> Linear(<span class="dv">4096</span>, <span class="dv">2048</span>)</span>
<span id="cb27-61"><a href="methods.html#cb27-61" tabindex="-1"></a>        <span class="va">self</span>.linear4 <span class="op">=</span> Linear(<span class="dv">2048</span>, <span class="dv">1</span>)  </span>
<span id="cb27-62"><a href="methods.html#cb27-62" tabindex="-1"></a>        </span>
<span id="cb27-63"><a href="methods.html#cb27-63" tabindex="-1"></a></span>
<span id="cb27-64"><a href="methods.html#cb27-64" tabindex="-1"></a>    <span class="kw">def</span> forward(<span class="va">self</span>, data):</span>
<span id="cb27-65"><a href="methods.html#cb27-65" tabindex="-1"></a>        x, edge_attr, edge_index, batch_index, chem_features <span class="op">=</span> data.x, data.edge_attr, data.edge_index, data.batch, data.chem_features</span>
<span id="cb27-66"><a href="methods.html#cb27-66" tabindex="-1"></a>        <span class="co"># Initial transformation</span></span>
<span id="cb27-67"><a href="methods.html#cb27-67" tabindex="-1"></a>        x <span class="op">=</span> <span class="va">self</span>.conv1(x, edge_index, edge_attr)</span>
<span id="cb27-68"><a href="methods.html#cb27-68" tabindex="-1"></a>        x <span class="op">=</span> torch.relu(<span class="va">self</span>.transf1(x))</span>
<span id="cb27-69"><a href="methods.html#cb27-69" tabindex="-1"></a>        x <span class="op">=</span> <span class="va">self</span>.bn1(x)</span>
<span id="cb27-70"><a href="methods.html#cb27-70" tabindex="-1"></a></span>
<span id="cb27-71"><a href="methods.html#cb27-71" tabindex="-1"></a>        <span class="co"># Holds the intermediate graph representations</span></span>
<span id="cb27-72"><a href="methods.html#cb27-72" tabindex="-1"></a>        global_representation <span class="op">=</span> []</span>
<span id="cb27-73"><a href="methods.html#cb27-73" tabindex="-1"></a></span>
<span id="cb27-74"><a href="methods.html#cb27-74" tabindex="-1"></a>        <span class="cf">for</span> i <span class="kw">in</span> <span class="bu">range</span>(<span class="va">self</span>.n_layers):</span>
<span id="cb27-75"><a href="methods.html#cb27-75" tabindex="-1"></a>            x <span class="op">=</span> <span class="va">self</span>.conv_layers[i](x, edge_index, edge_attr)</span>
<span id="cb27-76"><a href="methods.html#cb27-76" tabindex="-1"></a>            x <span class="op">=</span> torch.relu(<span class="va">self</span>.transf_layers[i](x))</span>
<span id="cb27-77"><a href="methods.html#cb27-77" tabindex="-1"></a>            x <span class="op">=</span> <span class="va">self</span>.bn_layers[i](x)</span>
<span id="cb27-78"><a href="methods.html#cb27-78" tabindex="-1"></a>            <span class="co"># Aggregate every top_k_every_n-th layer (NOTE: i never equals self.n_layers inside this loop, so the last layer is not forced)</span></span>
<span id="cb27-79"><a href="methods.html#cb27-79" tabindex="-1"></a>            <span class="cf">if</span> i <span class="op">%</span> <span class="va">self</span>.top_k_every_n <span class="op">==</span> <span class="dv">0</span> <span class="kw">or</span> i <span class="op">==</span> <span class="va">self</span>.n_layers:</span>
<span id="cb27-80"><a href="methods.html#cb27-80" tabindex="-1"></a>                x , edge_index, edge_attr, batch_index, _, _ <span class="op">=</span> <span class="va">self</span>.pooling_layers[<span class="bu">int</span>(i<span class="op">/</span><span class="va">self</span>.top_k_every_n)](</span>
<span id="cb27-81"><a href="methods.html#cb27-81" tabindex="-1"></a>                    x, edge_index, edge_attr, batch_index</span>
<span id="cb27-82"><a href="methods.html#cb27-82" tabindex="-1"></a>                    )</span>
<span id="cb27-83"><a href="methods.html#cb27-83" tabindex="-1"></a>                <span class="co"># Add current representation</span></span>
<span id="cb27-84"><a href="methods.html#cb27-84" tabindex="-1"></a>                global_representation.append(torch.cat([gmp(x, batch_index), gap(x, batch_index)], dim<span class="op">=</span><span class="dv">1</span>))</span>
<span id="cb27-85"><a href="methods.html#cb27-85" tabindex="-1"></a>    </span>
<span id="cb27-86"><a href="methods.html#cb27-86" tabindex="-1"></a>        x <span class="op">=</span> <span class="bu">sum</span>(global_representation)</span>
<span id="cb27-87"><a href="methods.html#cb27-87" tabindex="-1"></a>        <span class="co"># chem_features concatenated with graph-level representations</span></span>
<span id="cb27-88"><a href="methods.html#cb27-88" tabindex="-1"></a>        x <span class="op">=</span> torch.cat([x, chem_features], dim<span class="op">=</span><span class="dv">1</span>)</span>
<span id="cb27-89"><a href="methods.html#cb27-89" tabindex="-1"></a>        </span>
<span id="cb27-90"><a href="methods.html#cb27-90" tabindex="-1"></a>        <span class="co"># Output block</span></span>
<span id="cb27-91"><a href="methods.html#cb27-91" tabindex="-1"></a>        x <span class="op">=</span> torch.relu(<span class="va">self</span>.linear1(x))</span>
<span id="cb27-92"><a href="methods.html#cb27-92" tabindex="-1"></a>        x <span class="op">=</span> F.dropout(x, p<span class="op">=</span><span class="fl">0.7</span>, training<span class="op">=</span><span class="va">self</span>.training)</span>
<span id="cb27-93"><a href="methods.html#cb27-93" tabindex="-1"></a>        x <span class="op">=</span> <span class="va">self</span>.bn2(x)  </span>
<span id="cb27-94"><a href="methods.html#cb27-94" tabindex="-1"></a>        x <span class="op">=</span> torch.relu(<span class="va">self</span>.linear2(x))</span>
<span id="cb27-95"><a href="methods.html#cb27-95" tabindex="-1"></a>        x <span class="op">=</span> F.dropout(x, p<span class="op">=</span><span class="fl">0.7</span>, training<span class="op">=</span><span class="va">self</span>.training)</span>
<span id="cb27-96"><a href="methods.html#cb27-96" tabindex="-1"></a>        x <span class="op">=</span> <span class="va">self</span>.bn3(x) </span>
<span id="cb27-97"><a href="methods.html#cb27-97" tabindex="-1"></a>        x <span class="op">=</span> <span class="va">self</span>.linear3(x)</span>
<span id="cb27-98"><a href="methods.html#cb27-98" tabindex="-1"></a>        x <span class="op">=</span> F.dropout(x, p<span class="op">=</span><span class="fl">0.7</span>, training<span class="op">=</span><span class="va">self</span>.training) </span>
<span id="cb27-99"><a href="methods.html#cb27-99" tabindex="-1"></a>        x <span class="op">=</span> <span class="va">self</span>.bn4(x)  </span>
<span id="cb27-100"><a href="methods.html#cb27-100" tabindex="-1"></a>        x <span class="op">=</span> <span class="va">self</span>.linear4(x)</span>
<span id="cb27-101"><a href="methods.html#cb27-101" tabindex="-1"></a></span>
<span id="cb27-102"><a href="methods.html#cb27-102" tabindex="-1"></a>        <span class="cf">return</span> x</span></code></pre></div>
<p>The core of the GLMP model is a graph neural network (GNN) implemented using PyTorch. The GNN takes molecular graphs as input and applies a series of graph convolutional layers to capture atom connectivity and spatial arrangement within the molecules. The graph convolutional layers are implemented using a variant of the Transformer architecture, called TransformerConv, which incorporates attention mechanisms to capture global dependencies.</p>
<p>In addition to the graph convolutional layers, the GLMP model also includes traditional linear layers for further processing and prediction. These linear layers operate on the representations obtained from the graph convolutional layers and other features, such as chemical descriptors or fingerprints encoded as fixed-length vectors.</p>
<div style="text-align: center;">
<figure>
<img src="assets/GLMP.png" alt="GLMP" style="width: 90%; height: auto;"/>
</figure>
<p style="text-align: justify; text-align-last: left; font-size: 12px;">
Graph-Level Molecular Predictor (GLMP) model architecture. GLMP implements a graph neural network, treating molecules as graph structures for detailed connectivity and spatial arrangement analysis. Combined with conventional numerical chemical vectors, GLMP performs graph-level molecular property prediction.
</p>
</div>
<p>The GLMP model consists of the following components:</p>
<ol style="list-style-type: decimal">
<li><p>Graph Convolutional Layers: The model uses multiple graph convolutional layers, implemented as instances of the TransformerConv class, to capture structural information from the molecule graphs. These layers perform graph convolutions, incorporating attention mechanisms and edge features to enhance the model’s ability to capture atom relationships.</p></li>
<li><p>Linear and Batch Normalization Layers: After each graph convolutional layer, the GLMP model applies linear transformations followed by batch normalization to further process the representations obtained. These layers help refine the representations and make them suitable for downstream tasks.</p></li>
<li><p>Pooling Layers: The GLMP model includes pooling layers, specifically the TopKPooling class, which aggregates the node representations at certain intervals. The pooling layers help to condense the graph-level information and capture important features.</p></li>
<li><p>Chemical Features: The GLMP model incorporates additional chemical features, such as molecular descriptors or fingerprints, encoded as fixed-length vectors. These features are concatenated with graph-level representations at a later stage of the model.</p></li>
<li><p>Final Linear Layers: The GLMP model concludes with a series of linear layers, which further process the combined representations of the graph-level information and the additional chemical features. These layers progressively reduce the dimensionality of the representations and eventually output a single value for prediction.</p></li>
</ol>
<p>By combining graph convolutional layers, linear layers, pooling layers, and chemical features, the GLMP model can effectively capture the detailed structural information present in molecular graphs. This allows the model to handle inputs of variable sizes, accommodate different atom numbers, and make predictions based on both spatial arrangement and essential chemical properties.</p>
</div>
<div id="bio-graph-integrative-classifierregressor-biogicbiogir" class="section level3 hasAnchor" number="2.2.2">
<h3><span class="header-section-number">2.2.2</span> Bio-Graph Integrative Classifier/Regressor (BioGIC/BioGIR)<a href="methods.html#bio-graph-integrative-classifierregressor-biogicbiogir" class="anchor-section" aria-label="Anchor link to header"></a></h3>
<div id="classificationregression" class="section level4 hasAnchor" number="2.2.2.1">
<h4><span class="header-section-number">2.2.2.1</span> Classification/Regression<a href="methods.html#classificationregression" class="anchor-section" aria-label="Anchor link to header"></a></h4>
<p>The Bio-Graph Integrative Classifier/Regressor (BioGIC/BioGIR) is envisioned as a model that assimilates information from the COVID-19 Bio-Graph, a complex network encapsulating different biological entities, such as chemical compounds, proteins, and pathways. This model performs classification or regression tasks based on the intricate relationships and interactions among these biological entities alongside the biological entities’ feature vectors.</p>
<p>The COVID-19 Bio-Graph allows for a more comprehensive representation of interdependencies. The proposed BioGIC/BioGIR model aims to leverage this wealth of information combined with feature vectors that represent the relevant information about the nodes they represent. This is to predict the properties or behaviors of certain entities, such as identifying active chemical compounds.</p>
<p>The BioGIC/BioGIR model architecture could be based upon a combination of different Graph Convolution Networks (GCNs). A Graph Attention Network (GAT) convolution layer is typically used to capture the local neighborhood relationships of the nodes. The GAT layer provides an attention mechanism that allows the model to weigh neighbor nodes differently based on their importance. Also, a sequence of Relational Graph Convolutional Network (RGCN) layers could be particularly beneficial for heterogeneous graphs, which are graphs with different types of nodes and edges. RGCN layers can learn separate weights for different types of edges, thereby modeling different types of relationships more accurately. GraphSAGE is another GCN; it is a suitable general-purpose graph convolution operation that works reasonably well in many scenarios. It is capable of generating embeddings for unseen data, which is an advantage when working with active compounds that were not in the training data. However, it treats all neighbor nodes equally when aggregating their features, which might not be ideal in a heterogeneous graph where different types of nodes could have different importance.</p>
<div style="text-align: center;">
<figure>
<img src="assets/BioGIP_dup.png" alt="BioGIP with message functions duplicated per edge type for heterogeneous graphs" style="width: 80%; height: auto;"/>
</figure>
<p style="text-align: justify; text-align-last: left; font-size: 12px;">
For BioGIP to operate on heterogeneous graphs, the model’s message functions have to be duplicated to cater to each unique edge type. As a result, the updated model expects dictionaries of all node and edge types instead of the single tensors that homogeneous graphs use. This adjustment enables message passing in multi-partite graphs by passing a set of inputs to the different convolutional layers. For simplicity, only the BioGIP for sequential message passing is depicted here, but the idea is the same for any complex architecture.
</p>
</div>
<p>The model would also include appropriate optimization techniques and loss functions suitable for the task at hand. For a classification task, a cross-entropy loss function would be used, while for regression tasks, mean squared error or another suitable loss function could be employed. The training process would involve backpropagation and optimization of the weights to minimize the loss function.</p>
<p>The BioGIC/BioGIR model, by virtue of its design, allows for the integration of the complex relationships among various biological entities present in the COVID-19 Bio-Graph.</p>
<p>The decision to concatenate embeddings from each layer via a pooling mechanism, to pass messages serially through the layers, or to add skip connections between layers depends on the specific task and data. Each approach has its strengths and weaknesses, and they capture different types of information.</p>
<div style="text-align: center;">
<figure>
<img src="assets/BioGIP.png" alt="BioGIP" style="width: 80%; height: auto;"/>
</figure>
<p style="text-align: justify; text-align-last: left; font-size: 12px;">
The BioGIP model design can consider sequential layer information passing for abstract representation tasks, layer embedding concatenation to preserve multi-scale graph information, or the use of skip or residual connections to maintain information flow and address the vanishing gradient issue in deeper models.
</p>
</div>
<ol style="list-style-type: decimal">
<li><strong>Passing Messages in Series</strong>: This approach sequentially passes information through layers, with each layer potentially transforming and aggregating the information from the previous layer. This approach might be better suited if the task requires more abstract representations, as the information is aggregated and transformed across layers. However, one potential drawback is that information from the initial layers could be lost or diluted in this process.</li>
</ol>
<div class="sourceCode" id="cb28"><pre class="sourceCode python"><code class="sourceCode python"><span id="cb28-1"><a href="methods.html#cb28-1" tabindex="-1"></a><span class="im">from</span> torch_geometric.nn <span class="im">import</span> GATConv, RGCNConv, SAGEConv</span>
<span id="cb28-2"><a href="methods.html#cb28-2" tabindex="-1"></a></span>
<span id="cb28-3"><a href="methods.html#cb28-3" tabindex="-1"></a><span class="kw">class</span> BioGI(torch.nn.Module):</span>
<span id="cb28-4"><a href="methods.html#cb28-4" tabindex="-1"></a>    <span class="kw">def</span> <span class="fu">__init__</span>(<span class="va">self</span>, hidden_channels, out_channels):</span>
<span id="cb28-5"><a href="methods.html#cb28-5" tabindex="-1"></a>        <span class="bu">super</span>().<span class="fu">__init__</span>()</span>
<span id="cb28-6"><a href="methods.html#cb28-6" tabindex="-1"></a>        <span class="va">self</span>.conv1 <span class="op">=</span> GATConv((<span class="op">-</span><span class="dv">1</span>, <span class="op">-</span><span class="dv">1</span>), hidden_channels)</span>
<span id="cb28-7"><a href="methods.html#cb28-7" tabindex="-1"></a>        <span class="va">self</span>.conv2 <span class="op">=</span> SAGEConv(hidden_channels, hidden_channels)</span>
<span id="cb28-8"><a href="methods.html#cb28-8" tabindex="-1"></a>        <span class="va">self</span>.conv3 <span class="op">=</span> RGCNConv(hidden_channels, hidden_channels, num_relations<span class="op">=</span>dataset.num_relations, num_bases<span class="op">=</span><span class="dv">10</span>)</span>
<span id="cb28-9"><a href="methods.html#cb28-9" tabindex="-1"></a>        <span class="va">self</span>.conv4 <span class="op">=</span> RGCNConv(hidden_channels, out_channels, num_relations<span class="op">=</span>dataset.num_relations, num_bases<span class="op">=</span><span class="dv">10</span>)</span>
<span id="cb28-10"><a href="methods.html#cb28-10" tabindex="-1"></a></span>
<span id="cb28-11"><a href="methods.html#cb28-11" tabindex="-1"></a>    <span class="kw">def</span> forward(<span class="va">self</span>, x, edge_index, edge_type):</span>
<span id="cb28-12"><a href="methods.html#cb28-12" tabindex="-1"></a>        x <span class="op">=</span> <span class="va">self</span>.conv1(x, edge_index).relu()</span>
<span id="cb28-13"><a href="methods.html#cb28-13" tabindex="-1"></a>        x <span class="op">=</span> <span class="va">self</span>.conv2(x, edge_index).relu()</span>
<span id="cb28-14"><a href="methods.html#cb28-14" tabindex="-1"></a>        x <span class="op">=</span> <span class="va">self</span>.conv3(x, edge_index, edge_type).relu()</span>
<span id="cb28-15"><a href="methods.html#cb28-15" tabindex="-1"></a>        x <span class="op">=</span> <span class="va">self</span>.conv4(x, edge_index, edge_type)</span>
<span id="cb28-16"><a href="methods.html#cb28-16" tabindex="-1"></a>        <span class="cf">return</span> F.log_softmax(x, dim<span class="op">=</span><span class="dv">1</span>)</span>
<span id="cb28-17"><a href="methods.html#cb28-17" tabindex="-1"></a></span>
<span id="cb28-18"><a href="methods.html#cb28-18" tabindex="-1"></a>model <span class="op">=</span> BioGI(hidden_channels<span class="op">=</span><span class="dv">32</span>, out_channels<span class="op">=</span>dataset.num_classes)</span>
<span id="cb28-19"><a href="methods.html#cb28-19" tabindex="-1"></a>model <span class="op">=</span> to_hetero(model, data.metadata(), aggr<span class="op">=</span><span class="st">&#39;sum&#39;</span>)</span></code></pre></div>
<ol start="2" style="list-style-type: decimal">
<li><strong>Concatenating Layer Embeddings</strong>: This approach can be beneficial as it allows the model to preserve and learn from information at different levels of abstraction. Each layer in a Graph Neural Network captures different types of information - initial layers capture local information, while deeper layers aggregate information from a larger neighborhood. Concatenating these embeddings can help the model leverage all this information simultaneously. Pooling mechanisms can be used to reduce dimensionality if needed. This approach might work well if the relevant information for the task is spread across different scales of the graph.</li>
</ol>
<div class="sourceCode" id="cb29"><pre class="sourceCode python"><code class="sourceCode python"><span id="cb29-1"><a href="methods.html#cb29-1" tabindex="-1"></a><span class="im">from</span> torch_geometric.nn <span class="im">import</span> GATConv, RGCNConv, SAGEConv</span>
<span id="cb29-2"><a href="methods.html#cb29-2" tabindex="-1"></a><span class="im">from</span> torch.nn <span class="im">import</span> Linear</span>
<span id="cb29-3"><a href="methods.html#cb29-3" tabindex="-1"></a></span>
<span id="cb29-4"><a href="methods.html#cb29-4" tabindex="-1"></a><span class="kw">class</span> BioGI(torch.nn.Module):</span>
<span id="cb29-5"><a href="methods.html#cb29-5" tabindex="-1"></a>    <span class="kw">def</span> <span class="fu">__init__</span>(<span class="va">self</span>, hidden_channels, out_channels):</span>
<span id="cb29-6"><a href="methods.html#cb29-6" tabindex="-1"></a>        <span class="bu">super</span>().<span class="fu">__init__</span>()</span>
<span id="cb29-7"><a href="methods.html#cb29-7" tabindex="-1"></a>        <span class="va">self</span>.conv1 <span class="op">=</span> GATConv((<span class="op">-</span><span class="dv">1</span>, <span class="op">-</span><span class="dv">1</span>), hidden_channels)</span>
<span id="cb29-8"><a href="methods.html#cb29-8" tabindex="-1"></a>        <span class="va">self</span>.conv2 <span class="op">=</span> SAGEConv(hidden_channels, hidden_channels)</span>
<span id="cb29-9"><a href="methods.html#cb29-9" tabindex="-1"></a>        <span class="va">self</span>.conv3 <span class="op">=</span> RGCNConv(hidden_channels, hidden_channels, num_relations<span class="op">=</span>dataset.num_relations, num_bases<span class="op">=</span><span class="dv">10</span>)</span>
<span id="cb29-10"><a href="methods.html#cb29-10" tabindex="-1"></a>        <span class="va">self</span>.conv4 <span class="op">=</span> RGCNConv(hidden_channels, out_channels, num_relations<span class="op">=</span>dataset.num_relations, num_bases<span class="op">=</span><span class="dv">10</span>)</span>
<span id="cb29-11"><a href="methods.html#cb29-11" tabindex="-1"></a>        <span class="va">self</span>.fc <span class="op">=</span> Linear(hidden_channels<span class="op">*</span><span class="dv">3</span> <span class="op">+</span> out_channels, out_channels) <span class="co"># Fully connected layer (x1..x3 are hidden-sized, x4 is out-sized)</span></span>
<span id="cb29-12"><a href="methods.html#cb29-12" tabindex="-1"></a></span>
<span id="cb29-13"><a href="methods.html#cb29-13" tabindex="-1"></a>    <span class="kw">def</span> forward(<span class="va">self</span>, x, edge_index, edge_type):</span>
<span id="cb29-14"><a href="methods.html#cb29-14" tabindex="-1"></a>        x1 <span class="op">=</span> <span class="va">self</span>.conv1(x, edge_index).relu()</span>
<span id="cb29-15"><a href="methods.html#cb29-15" tabindex="-1"></a>        x2 <span class="op">=</span> <span class="va">self</span>.conv2(x1, edge_index).relu()</span>
<span id="cb29-16"><a href="methods.html#cb29-16" tabindex="-1"></a>        x3 <span class="op">=</span> <span class="va">self</span>.conv3(x2, edge_index, edge_type).relu()</span>
<span id="cb29-17"><a href="methods.html#cb29-17" tabindex="-1"></a>        x4 <span class="op">=</span> <span class="va">self</span>.conv4(x3, edge_index, edge_type).relu()</span>
<span id="cb29-18"><a href="methods.html#cb29-18" tabindex="-1"></a></span>
<span id="cb29-19"><a href="methods.html#cb29-19" tabindex="-1"></a>        x <span class="op">=</span> torch.cat([x1, x2, x3, x4], dim<span class="op">=-</span><span class="dv">1</span>)  <span class="co"># Concatenate along the last dimension</span></span>
<span id="cb29-20"><a href="methods.html#cb29-20" tabindex="-1"></a>        x <span class="op">=</span> <span class="va">self</span>.fc(x)  <span class="co"># Pass through the fully connected layer</span></span>
<span id="cb29-21"><a href="methods.html#cb29-21" tabindex="-1"></a>        <span class="cf">return</span> F.log_softmax(x, dim<span class="op">=</span><span class="dv">1</span>)</span>
<span id="cb29-22"><a href="methods.html#cb29-22" tabindex="-1"></a></span>
<span id="cb29-23"><a href="methods.html#cb29-23" tabindex="-1"></a>model <span class="op">=</span> BioGI(hidden_channels<span class="op">=</span><span class="dv">32</span>, out_channels<span class="op">=</span>dataset.num_classes)</span>
<span id="cb29-24"><a href="methods.html#cb29-24" tabindex="-1"></a>model <span class="op">=</span> to_hetero(model, data.metadata(), aggr<span class="op">=</span><span class="st">&#39;sum&#39;</span>)</span></code></pre></div>
<ol start="3" style="list-style-type: decimal">
<li><strong>Skip connections or residual connections</strong>: Skip connections, also known as residual connections, are a technique used in deep neural networks to combat the vanishing gradient problem and to ease the training of deeper models. Skip connections work by adding the input to a layer to its output rather than directly feeding its output into the next layer. Through this approach, it is possible to maintain the flow of information and gradients through the network, thus making it easier to train deeper models. This is because during backpropagation, the gradients have a direct path through the skip connections, helping to mitigate the vanishing gradient problem where gradients can become very small and training can become difficult.</li>
</ol>
<div class="sourceCode" id="cb30"><pre class="sourceCode python"><code class="sourceCode python"><span id="cb30-1"><a href="methods.html#cb30-1" tabindex="-1"></a><span class="im">from</span> torch_geometric.nn <span class="im">import</span> GATConv, RGCNConv, SAGEConv</span>
<span id="cb30-2"><a href="methods.html#cb30-2" tabindex="-1"></a><span class="im">from</span> torch.nn <span class="im">import</span> Linear</span>
<span id="cb30-3"><a href="methods.html#cb30-3" tabindex="-1"></a></span>
<span id="cb30-4"><a href="methods.html#cb30-4" tabindex="-1"></a><span class="kw">class</span> BioGI(torch.nn.Module):</span>
<span id="cb30-5"><a href="methods.html#cb30-5" tabindex="-1"></a>    <span class="kw">def</span> <span class="fu">__init__</span>(<span class="va">self</span>, hidden_channels, out_channels):</span>
<span id="cb30-6"><a href="methods.html#cb30-6" tabindex="-1"></a>        <span class="bu">super</span>().<span class="fu">__init__</span>()</span>
<span id="cb30-7"><a href="methods.html#cb30-7" tabindex="-1"></a>        <span class="va">self</span>.conv1 <span class="op">=</span> GATConv((<span class="op">-</span><span class="dv">1</span>, <span class="op">-</span><span class="dv">1</span>), hidden_channels)</span>
<span id="cb30-8"><a href="methods.html#cb30-8" tabindex="-1"></a>        <span class="va">self</span>.conv2 <span class="op">=</span> SAGEConv(hidden_channels, hidden_channels)</span>
<span id="cb30-9"><a href="methods.html#cb30-9" tabindex="-1"></a>        <span class="va">self</span>.conv3 <span class="op">=</span> RGCNConv(hidden_channels, hidden_channels, num_relations<span class="op">=</span>dataset.num_relations, num_bases<span class="op">=</span><span class="dv">10</span>)</span>
<span id="cb30-10"><a href="methods.html#cb30-10" tabindex="-1"></a>        <span class="va">self</span>.conv4 <span class="op">=</span> RGCNConv(hidden_channels, out_channels, num_relations<span class="op">=</span>dataset.num_relations, num_bases<span class="op">=</span><span class="dv">10</span>)</span>
<span id="cb30-11"><a href="methods.html#cb30-11" tabindex="-1"></a>        <span class="va">self</span>.skip <span class="op">=</span> Linear(hidden_channels, out_channels)  <span class="co"># Skip connection</span></span>
<span id="cb30-12"><a href="methods.html#cb30-12" tabindex="-1"></a></span>
<span id="cb30-13"><a href="methods.html#cb30-13" tabindex="-1"></a>    <span class="kw">def</span> forward(<span class="va">self</span>, x, edge_index, edge_type):</span>
<span id="cb30-14"><a href="methods.html#cb30-14" tabindex="-1"></a>        x1 <span class="op">=</span> <span class="va">self</span>.conv1(x, edge_index).relu()</span>
<span id="cb30-15"><a href="methods.html#cb30-15" tabindex="-1"></a>        x2 <span class="op">=</span> <span class="va">self</span>.conv2(x1, edge_index).relu() <span class="op">+</span> x1</span>
<span id="cb30-16"><a href="methods.html#cb30-16" tabindex="-1"></a>        x3 <span class="op">=</span> <span class="va">self</span>.conv3(x2, edge_index, edge_type).relu() <span class="op">+</span> x2</span>
<span id="cb30-17"><a href="methods.html#cb30-17" tabindex="-1"></a>        x4 <span class="op">=</span> <span class="va">self</span>.conv4(x3, edge_index, edge_type).relu() <span class="op">+</span> <span class="va">self</span>.skip(x3)</span>
<span id="cb30-18"><a href="methods.html#cb30-18" tabindex="-1"></a>        <span class="cf">return</span> F.log_softmax(x4, dim<span class="op">=</span><span class="dv">1</span>)</span>
<span id="cb30-19"><a href="methods.html#cb30-19" tabindex="-1"></a></span>
<span id="cb30-20"><a href="methods.html#cb30-20" tabindex="-1"></a>model <span class="op">=</span> BioGI(hidden_channels<span class="op">=</span><span class="dv">32</span>, out_channels<span class="op">=</span>dataset.num_classes)</span>
<span id="cb30-21"><a href="methods.html#cb30-21" tabindex="-1"></a>model <span class="op">=</span> to_hetero(model, data.metadata(), aggr<span class="op">=</span><span class="st">&#39;sum&#39;</span>)</span></code></pre></div>
<p>It’s difficult to say definitively which of these approaches is most effective in this case without empirical testing. Each of them could work well, and their performance may vary depending on the specifics of the data and task. It could be beneficial to implement all of them and conduct experiments to determine which works best for the specific use case.</p>
</div>
<div id="predicting-joint-effect-of-nodes-chemical-combination" class="section level4 hasAnchor" number="2.2.2.2">
<h4><span class="header-section-number">2.2.2.2</span> Predicting joint effect of nodes (Chemical Combination)<a href="methods.html#predicting-joint-effect-of-nodes-chemical-combination" class="anchor-section" aria-label="Anchor link to header"></a></h4>
<p>The application of computational models to evaluate combination effects in chemical compounds represents a pioneering development in the field. An innovative approach entails the creation of ‘hypothetical combination nodes’, a concept rooted in manipulating the original dataset. Such nodes are formed by combining pairs of chemical compounds, specifically those with a PCA1 value ranging from 2.9 up to (but not including) 5. This PCA1 range is strategically chosen, considering that the compounds in this range are proximal to active compounds (compounds with PCA1 greater than 5); thus, they are more likely to exhibit biologically meaningful combinations. In addition, restricting the selection to the comparatively few nodes in this range reduces the computational burden by narrowing the search space for potential compound combinations from 25 million to 10 thousand.</p>
<p>The augmented dataset effectively embeds an assumption about the combined effect of two compounds by constructing these nodes as a pairwise combination of chemical compounds and their features as the element-wise maximum of the two original node features. It assumes that, for each descriptor, the effect of the combination is at least as potent as that of the stronger of the two compounds in isolation. Also, the resulting combined fingerprint represents the presence of a particular substructure or property if it exists in either of the two original compounds. While not universally applicable, this assumption gives a pragmatic starting point for exploring synergistic effects.</p>
<p>Moreover, the edges connected to these combination nodes represent the combined influence of the original chemical compounds. These are constructed as the union of edges connected to the original nodes, thereby encapsulating the joint relationships of the two compounds with proteins and pathways. This allows the model to capture more complex interaction patterns that may emerge from the compound combination, which single compound nodes may not capture. However, it is important to note that this approach may oversimplify the relationships, as it does not account for possible antagonistic effects, where the combination is less effective than one of the compounds alone. Despite this, the methodology provides a feasible mechanism for studying combination effects.</p>
<p>Such a refined approach paves the way for a deeper understanding of compound effectiveness, revealing complex interplays that might be overlooked when considering compounds individually. Consequently, these insights could be leveraged to enhance our capabilities in drug repurposing and the design of combination therapies, thus expanding the horizons of computational pharmacology. Although these assumptions might not always hold and may oversimplify the relationships between these combination nodes, they provide a starting point, and the model can be further refined based on the results obtained.</p>
<div class="sourceCode" id="cb31"><pre class="sourceCode python"><code class="sourceCode python"><span id="cb31-1"><a href="methods.html#cb31-1" tabindex="-1"></a><span class="im">import</span> itertools</span>
<span id="cb31-2"><a href="methods.html#cb31-2" tabindex="-1"></a><span class="im">from</span> torch_geometric.data <span class="im">import</span> HeteroData</span>
<span id="cb31-3"><a href="methods.html#cb31-3" tabindex="-1"></a><span class="im">from</span> tqdm.notebook  <span class="im">import</span> tqdm</span>
<span id="cb31-4"><a href="methods.html#cb31-4" tabindex="-1"></a></span>
<span id="cb31-5"><a href="methods.html#cb31-5" tabindex="-1"></a><span class="co"># Define nodes to combine based on PCA1 value</span></span>
<span id="cb31-6"><a href="methods.html#cb31-6" tabindex="-1"></a>nodes_to_combine <span class="op">=</span> np.where((data[<span class="st">&#39;chemical&#39;</span>].pca1.numpy() <span class="op">&gt;=</span> <span class="fl">2.9</span>) <span class="op">&amp;</span> (data[<span class="st">&#39;chemical&#39;</span>].pca1.numpy() <span class="op">&lt;</span> <span class="dv">5</span>))[<span class="dv">0</span>].tolist()</span>
<span id="cb31-7"><a href="methods.html#cb31-7" tabindex="-1"></a></span>
<span id="cb31-8"><a href="methods.html#cb31-8" tabindex="-1"></a><span class="co"># Generate all possible pairs of nodes to combine</span></span>
<span id="cb31-9"><a href="methods.html#cb31-9" tabindex="-1"></a>chemical_pairs <span class="op">=</span> itertools.combinations(nodes_to_combine, <span class="dv">2</span>)</span>
<span id="cb31-10"><a href="methods.html#cb31-10" tabindex="-1"></a></span>
<span id="cb31-11"><a href="methods.html#cb31-11" tabindex="-1"></a><span class="co"># Create an empty list to store new features and names of combined nodes</span></span>
<span id="cb31-12"><a href="methods.html#cb31-12" tabindex="-1"></a>new_xs <span class="op">=</span> []</span>
<span id="cb31-13"><a href="methods.html#cb31-13" tabindex="-1"></a>combo_name <span class="op">=</span> []</span>
<span id="cb31-14"><a href="methods.html#cb31-14" tabindex="-1"></a></span>
<span id="cb31-15"><a href="methods.html#cb31-15" tabindex="-1"></a><span class="co"># For each pair of nodes, compute the element-wise maximum of their features and store the result</span></span>
<span id="cb31-16"><a href="methods.html#cb31-16" tabindex="-1"></a><span class="co"># Also generate and store a name for the combined node</span></span>
<span id="cb31-17"><a href="methods.html#cb31-17" tabindex="-1"></a><span class="cf">for</span> id_1, id_2 <span class="kw">in</span> tqdm(chemical_pairs):</span>
<span id="cb31-18"><a href="methods.html#cb31-18" tabindex="-1"></a>    new_x <span class="op">=</span> np.maximum(feat[connected_compounds_idx][id_1], feat[connected_compounds_idx][id_2])</span>
<span id="cb31-19"><a href="methods.html#cb31-19" tabindex="-1"></a>    new_xs.append(new_x)</span>
<span id="cb31-20"><a href="methods.html#cb31-20" tabindex="-1"></a>    combo_name.append(<span class="ss">f&#39;</span><span class="sc">{</span>id_1<span class="sc">}</span><span class="ss">_</span><span class="sc">{</span>id_2<span class="sc">}</span><span class="ss">&#39;</span>)</span>
<span id="cb31-21"><a href="methods.html#cb31-21" tabindex="-1"></a></span>
<span id="cb31-22"><a href="methods.html#cb31-22" tabindex="-1"></a><span class="co"># Concatenate the names of the original and combined nodes</span></span>
<span id="cb31-23"><a href="methods.html#cb31-23" tabindex="-1"></a>combo_smiles <span class="op">=</span> chemical_smiles <span class="op">+</span> combo_name</span>
<span id="cb31-24"><a href="methods.html#cb31-24" tabindex="-1"></a></span>
<span id="cb31-25"><a href="methods.html#cb31-25" tabindex="-1"></a><span class="co"># Concatenate the features of the original and combined nodes</span></span>
<span id="cb31-26"><a href="methods.html#cb31-26" tabindex="-1"></a>combo_chemical_features <span class="op">=</span> np.vstack((feat[connected_compounds_idx], np.vstack(new_xs)))</span>
<span id="cb31-27"><a href="methods.html#cb31-27" tabindex="-1"></a></span>
<span id="cb31-28"><a href="methods.html#cb31-28" tabindex="-1"></a><span class="co"># Map each node to its connected nodes in the compound-protein and pathway-compound graphs</span></span>
<span id="cb31-29"><a href="methods.html#cb31-29" tabindex="-1"></a>comp_prot_dict <span class="op">=</span> {i: compound_protein_deges[compound_protein_deges[:, <span class="dv">0</span>] <span class="op">==</span> i, <span class="dv">1</span>] <span class="cf">for</span> i <span class="kw">in</span> np.unique(compound_protein_deges[:, <span class="dv">0</span>])}</span>
<span id="cb31-30"><a href="methods.html#cb31-30" tabindex="-1"></a>comp_pathway_dict <span class="op">=</span> {i: pathway_compound_edges[:, ::<span class="op">-</span><span class="dv">1</span>][pathway_compound_edges[:, ::<span class="op">-</span><span class="dv">1</span>][:, <span class="dv">0</span>] <span class="op">==</span> i, <span class="dv">1</span>] <span class="cf">for</span> i <span class="kw">in</span> np.unique(pathway_compound_edges[:, ::<span class="op">-</span><span class="dv">1</span>][:, <span class="dv">0</span>])}</span>
<span id="cb31-31"><a href="methods.html#cb31-31" tabindex="-1"></a></span>
<span id="cb31-32"><a href="methods.html#cb31-32" tabindex="-1"></a><span class="co"># Initial index for combined nodes</span></span>
<span id="cb31-33"><a href="methods.html#cb31-33" tabindex="-1"></a>idx <span class="op">=</span> <span class="dv">4292</span></span>
<span id="cb31-34"><a href="methods.html#cb31-34" tabindex="-1"></a>p_l <span class="op">=</span> []</span>
<span id="cb31-35"><a href="methods.html#cb31-35" tabindex="-1"></a>w_l <span class="op">=</span> []</span>
<span id="cb31-36"><a href="methods.html#cb31-36" tabindex="-1"></a></span>
<span id="cb31-37"><a href="methods.html#cb31-37" tabindex="-1"></a><span class="co"># Reinitialize chemical_pairs as it was exhausted in previous loop</span></span>
<span id="cb31-38"><a href="methods.html#cb31-38" tabindex="-1"></a>chemical_pairs <span class="op">=</span> itertools.combinations(nodes_to_combine, <span class="dv">2</span>)</span>
<span id="cb31-39"><a href="methods.html#cb31-39" tabindex="-1"></a></span>
<span id="cb31-40"><a href="methods.html#cb31-40" tabindex="-1"></a><span class="co"># For each pair of nodes, identify the proteins and pathways they are connected to</span></span>
<span id="cb31-41"><a href="methods.html#cb31-41" tabindex="-1"></a><span class="co"># Store the connections of the combined node to proteins and pathways</span></span>
<span id="cb31-42"><a href="methods.html#cb31-42" tabindex="-1"></a><span class="cf">for</span> id_1, id_2 <span class="kw">in</span> tqdm(chemical_pairs):</span>
<span id="cb31-43"><a href="methods.html#cb31-43" tabindex="-1"></a>    idx <span class="op">+=</span> <span class="dv">1</span></span>
<span id="cb31-44"><a href="methods.html#cb31-44" tabindex="-1"></a>    <span class="cf">if</span> id_1 <span class="kw">in</span> comp_prot_dict <span class="kw">and</span> id_2 <span class="kw">in</span> comp_prot_dict:</span>
<span id="cb31-45"><a href="methods.html#cb31-45" tabindex="-1"></a>        p <span class="op">=</span> np.union1d(comp_prot_dict[id_1], comp_prot_dict[id_2])</span>
<span id="cb31-46"><a href="methods.html#cb31-46" tabindex="-1"></a>    <span class="cf">elif</span> id_1 <span class="kw">in</span> comp_prot_dict:</span>
<span id="cb31-47"><a href="methods.html#cb31-47" tabindex="-1"></a>        p <span class="op">=</span> comp_prot_dict[id_1]</span>
<span id="cb31-48"><a href="methods.html#cb31-48" tabindex="-1"></a>    <span class="cf">elif</span> id_2 <span class="kw">in</span> comp_prot_dict:</span>
<span id="cb31-49"><a href="methods.html#cb31-49" tabindex="-1"></a>        p <span class="op">=</span> comp_prot_dict[id_2]</span>
<span id="cb31-50"><a href="methods.html#cb31-50" tabindex="-1"></a>    <span class="cf">else</span>:</span>
<span id="cb31-51"><a href="methods.html#cb31-51" tabindex="-1"></a>        p <span class="op">=</span> <span class="st">&#39;None&#39;</span></span>
<span id="cb31-52"><a href="methods.html#cb31-52" tabindex="-1"></a></span>
<span id="cb31-53"><a href="methods.html#cb31-53" tabindex="-1"></a>    <span class="cf">if</span> id_1 <span class="kw">in</span> comp_pathway_dict <span class="kw">and</span> id_2 <span class="kw">in</span> comp_pathway_dict:</span>
<span id="cb31-54"><a href="methods.html#cb31-54" tabindex="-1"></a>        w <span class="op">=</span> np.union1d(comp_pathway_dict[id_1], comp_pathway_dict[id_2])</span>
<span id="cb31-55"><a href="methods.html#cb31-55" tabindex="-1"></a>    <span class="cf">elif</span> id_1 <span class="kw">in</span> comp_pathway_dict:</span>
<span id="cb31-56"><a href="methods.html#cb31-56" tabindex="-1"></a>        w <span class="op">=</span> comp_pathway_dict[id_1]</span>
<span id="cb31-57"><a href="methods.html#cb31-57" tabindex="-1"></a>    <span class="cf">elif</span> id_2 <span class="kw">in</span> comp_pathway_dict:</span>
<span id="cb31-58"><a href="methods.html#cb31-58" tabindex="-1"></a>        w <span class="op">=</span> comp_pathway_dict[id_2]</span>
<span id="cb31-59"><a href="methods.html#cb31-59" tabindex="-1"></a>    <span class="cf">else</span>:</span>
<span id="cb31-60"><a href="methods.html#cb31-60" tabindex="-1"></a>        w <span class="op">=</span> <span class="st">&#39;None&#39;</span></span>
<span id="cb31-61"><a href="methods.html#cb31-61" tabindex="-1"></a></span>
<span id="cb31-62"><a href="methods.html#cb31-62" tabindex="-1"></a>    <span class="cf">if</span> p <span class="op">!=</span> <span class="st">&#39;None&#39;</span>:</span>
<span id="cb31-63"><a href="methods.html#cb31-63" tabindex="-1"></a>        p_l.append(np.array([[idx, v] <span class="cf">for</span> v <span class="kw">in</span> p]))</span>
<span id="cb31-64"><a href="methods.html#cb31-64" tabindex="-1"></a>    <span class="cf">if</span> w <span class="op">!=</span> <span class="st">&#39;None&#39;</span>:</span>
<span id="cb31-65"><a href="methods.html#cb31-65" tabindex="-1"></a>        w_l.append(np.array([[idx, v] <span class="cf">for</span> v <span class="kw">in</span> w]))</span>
<span id="cb31-66"><a href="methods.html#cb31-66" tabindex="-1"></a></span>
<span id="cb31-67"><a href="methods.html#cb31-67" tabindex="-1"></a><span class="co"># Add the connections of the combined nodes to the original graphs</span></span>
<span id="cb31-68"><a href="methods.html#cb31-68" tabindex="-1"></a>combo_pathway_compound_edges <span class="op">=</span> np.vstack((pathway_compound_edges, np.vstack(w_l)[:, ::<span class="op">-</span><span class="dv">1</span>]))</span>
<span id="cb31-69"><a href="methods.html#cb31-69" tabindex="-1"></a>combo_compound_protein_deges <span class="op">=</span> np.vstack((compound_protein_deges, np.vstack(p_l)))</span>
<span id="cb31-70"><a href="methods.html#cb31-70" tabindex="-1"></a></span>
<span id="cb31-71"><a href="methods.html#cb31-71" tabindex="-1"></a>combo_data <span class="op">=</span> HeteroData()</span>
<span id="cb31-72"><a href="methods.html#cb31-72" tabindex="-1"></a></span>
<span id="cb31-73"><a href="methods.html#cb31-73" tabindex="-1"></a>combo_data[<span class="st">&#39;chemical&#39;</span>].x <span class="op">=</span> torch.tensor(combo_chemical_features, dtype<span class="op">=</span>torch.<span class="bu">float</span>)   <span class="co"># [num_chemicals, num_features_chemical]</span></span>
<span id="cb31-74"><a href="methods.html#cb31-74" tabindex="-1"></a>combo_data[<span class="st">&#39;chemical&#39;</span>].smiles <span class="op">=</span> combo_smiles                                          <span class="co"># [num_chemicals]</span></span>
<span id="cb31-75"><a href="methods.html#cb31-75" tabindex="-1"></a>combo_data[<span class="st">&#39;chemical&#39;</span>].y <span class="op">=</span> chemical_y.<span class="bu">long</span>()                                          <span class="co"># [num_chemicals]</span></span>
<span id="cb31-76"><a href="methods.html#cb31-76" tabindex="-1"></a>combo_data[<span class="st">&#39;chemical&#39;</span>].pca1 <span class="op">=</span> chemical_pca1.to(torch.<span class="bu">float</span>)                           <span class="co"># [num_chemicals]</span></span>
<span id="cb31-77"><a href="methods.html#cb31-77" tabindex="-1"></a>combo_data[<span class="st">&#39;chemical&#39;</span>].phenotype_feat <span class="op">=</span> chemical_phenotype_feat.to(torch.<span class="bu">float</span>)       <span class="co"># [num_chemicals, 16]</span></span>
<span id="cb31-78"><a href="methods.html#cb31-78" tabindex="-1"></a></span>
<span id="cb31-79"><a href="methods.html#cb31-79" tabindex="-1"></a><span class="cf">for</span> f, v <span class="kw">in</span> [(<span class="st">&#39;train&#39;</span>, <span class="st">&#39;train&#39;</span>), (<span class="st">&#39;valid&#39;</span>, <span class="st">&#39;val&#39;</span>), (<span class="st">&#39;test&#39;</span>, <span class="st">&#39;test&#39;</span>)]:</span>
<span id="cb31-80"><a href="methods.html#cb31-80" tabindex="-1"></a>    idx <span class="op">=</span> mask_df.select(</span>
<span id="cb31-81"><a href="methods.html#cb31-81" tabindex="-1"></a>                [<span class="st">&#39;connected_compound_gid&#39;</span>, <span class="st">&#39;mask&#39;</span>]</span>
<span id="cb31-82"><a href="methods.html#cb31-82" tabindex="-1"></a>                ).<span class="bu">filter</span>(</span>
<span id="cb31-83"><a href="methods.html#cb31-83" tabindex="-1"></a>                pl.col(<span class="st">&#39;mask&#39;</span>) <span class="op">==</span> f</span>
<span id="cb31-84"><a href="methods.html#cb31-84" tabindex="-1"></a>                ).select(<span class="st">&#39;connected_compound_gid&#39;</span>).to_numpy().flatten()</span>
<span id="cb31-85"><a href="methods.html#cb31-85" tabindex="-1"></a>    idx <span class="op">=</span> torch.from_numpy(idx)</span>
<span id="cb31-86"><a href="methods.html#cb31-86" tabindex="-1"></a>    maskit <span class="op">=</span> torch.zeros(combo_data[<span class="st">&#39;chemical&#39;</span>].num_nodes, dtype<span class="op">=</span>torch.<span class="bu">bool</span>)</span>
<span id="cb31-87"><a href="methods.html#cb31-87" tabindex="-1"></a>    maskit[idx] <span class="op">=</span> <span class="va">True</span></span>
<span id="cb31-88"><a href="methods.html#cb31-88" tabindex="-1"></a>    combo_data[<span class="st">&#39;chemical&#39;</span>][<span class="ss">f&#39;</span><span class="sc">{</span>v<span class="sc">}</span><span class="ss">_mask&#39;</span>] <span class="op">=</span> maskit</span>
<span id="cb31-89"><a href="methods.html#cb31-89" tabindex="-1"></a>    </span>
<span id="cb31-90"><a href="methods.html#cb31-90" tabindex="-1"></a>combo_mask <span class="op">=</span> torch.zeros(combo_data[<span class="st">&#39;chemical&#39;</span>].num_nodes, dtype<span class="op">=</span>torch.<span class="bu">bool</span>)</span>
<span id="cb31-91"><a href="methods.html#cb31-91" tabindex="-1"></a>combo_mask[torch.from_numpy(np.arange(<span class="dv">4293</span>, <span class="dv">14163</span>, <span class="dv">1</span>))] <span class="op">=</span> <span class="va">True</span></span>
<span id="cb31-92"><a href="methods.html#cb31-92" tabindex="-1"></a>combo_data[<span class="st">&#39;chemical&#39;</span>][<span class="ss">f&#39;combo_mask&#39;</span>] <span class="op">=</span> combo_mask</span>
<span id="cb31-93"><a href="methods.html#cb31-93" tabindex="-1"></a></span>
<span id="cb31-94"><a href="methods.html#cb31-94" tabindex="-1"></a>combo_data[<span class="st">&#39;protein&#39;</span>].x <span class="op">=</span> protein_esm_embeddings.to(torch.<span class="bu">float</span>)  <span class="co"># [num_proteins, num_features_protein]</span></span>
<span id="cb31-95"><a href="methods.html#cb31-95" tabindex="-1"></a>combo_data[<span class="st">&#39;protein&#39;</span>].name <span class="op">=</span> protein_names                        <span class="co"># [num_proteins]</span></span>
<span id="cb31-96"><a href="methods.html#cb31-96" tabindex="-1"></a>combo_data[<span class="st">&#39;protein&#39;</span>].seq <span class="op">=</span> protein_sequences                     <span class="co"># [num_proteins]</span></span>
<span id="cb31-97"><a href="methods.html#cb31-97" tabindex="-1"></a></span>
<span id="cb31-98"><a href="methods.html#cb31-98" tabindex="-1"></a>combo_data[<span class="st">&#39;pathway&#39;</span>].x <span class="op">=</span> pathway_features.to(torch.<span class="bu">float</span>)        <span class="co"># [num_pathways, num_features_pathway]</span></span>
<span id="cb31-99"><a href="methods.html#cb31-99" tabindex="-1"></a>combo_data[<span class="st">&#39;pathway&#39;</span>].name <span class="op">=</span> pathway_names                        <span class="co"># [num_pathways]</span></span>
<span id="cb31-100"><a href="methods.html#cb31-100" tabindex="-1"></a></span>
<span id="cb31-101"><a href="methods.html#cb31-101" tabindex="-1"></a>combo_data[<span class="st">&#39;chemical&#39;</span>, <span class="st">&#39;bind_to&#39;</span>, <span class="st">&#39;protein&#39;</span>].edge_index <span class="op">=</span> torch.from_numpy(combo_compound_protein_deges).t().contiguous() <span class="co"># [2, num_edges_bind]</span></span>
<span id="cb31-102"><a href="methods.html#cb31-102" tabindex="-1"></a>combo_data[<span class="st">&#39;pathway&#39;</span>, <span class="st">&#39;activate_by&#39;</span>, <span class="st">&#39;chemical&#39;</span>].edge_index <span class="op">=</span> torch.from_numpy(combo_pathway_compound_edges).t().contiguous() <span class="co"># [2, num_edges_activate]</span></span>
<span id="cb31-103"><a href="methods.html#cb31-103" tabindex="-1"></a>combo_data[<span class="st">&#39;protein&#39;</span>, <span class="st">&#39;governs&#39;</span>, <span class="st">&#39;pathway&#39;</span>].edge_index <span class="op">=</span> torch.from_numpy(protein_pathway_edges).t().contiguous() <span class="co"># [2, num_edges_govern]</span></span></code></pre></div>
<p>Additionally, the Louvain community detection algorithm, a method based on modularity optimization, offers a powerful alternative to the PCA1-based selection of compounds. Instead of naively selecting compounds based on PCA1 values, the Louvain algorithm can be used to detect communities or clusters within the complex network of compounds.</p>
<p>The Louvain algorithm operates by grouping nodes into communities to maximize the number of within-community edges while minimizing the number of between-community edges. Through iterative optimization, the algorithm maximizes the network’s modularity, which quantifies the density of edges within communities relative to edges between communities. As a result, the algorithm identifies clusters of compounds that are more densely connected to one another than to the rest of the network.</p>
<div class="sourceCode" id="cb32"><pre class="sourceCode python"><code class="sourceCode python"><span id="cb32-1"><a href="methods.html#cb32-1" tabindex="-1"></a><span class="im">import</span> networkx <span class="im">as</span> nx</span>
<span id="cb32-2"><a href="methods.html#cb32-2" tabindex="-1"></a><span class="im">import</span> community <span class="im">as</span> community_louvain</span>
<span id="cb32-3"><a href="methods.html#cb32-3" tabindex="-1"></a></span>
<span id="cb32-4"><a href="methods.html#cb32-4" tabindex="-1"></a><span class="co"># Create an empty graph for the multimodal graph</span></span>
<span id="cb32-5"><a href="methods.html#cb32-5" tabindex="-1"></a>G <span class="op">=</span> nx.Graph()</span>
<span id="cb32-6"><a href="methods.html#cb32-6" tabindex="-1"></a></span>
<span id="cb32-7"><a href="methods.html#cb32-7" tabindex="-1"></a>chemical_cid <span class="op">=</span> covid_df[connected_compounds_idx, :].select(<span class="st">&#39;pubchem_cid&#39;</span>).to_series().to_list()</span>
<span id="cb32-8"><a href="methods.html#cb32-8" tabindex="-1"></a><span class="co"># Add nodes to the graph with their respective attributes and node types</span></span>
<span id="cb32-9"><a href="methods.html#cb32-9" tabindex="-1"></a><span class="cf">for</span> i, attr <span class="kw">in</span> <span class="bu">enumerate</span>(<span class="bu">zip</span>(combo_data[<span class="st">&#39;chemical&#39;</span>].smiles, combo_data[<span class="st">&#39;chemical&#39;</span>].y, combo_data[<span class="st">&#39;chemical&#39;</span>].pca1, chemical_cid)):</span>
<span id="cb32-10"><a href="methods.html#cb32-10" tabindex="-1"></a>    G.add_node(<span class="ss">f&#39;c_</span><span class="sc">{</span>i<span class="sc">}</span><span class="ss">&#39;</span>, <span class="bu">type</span><span class="op">=</span><span class="st">&#39;chemical&#39;</span>, name<span class="op">=</span>attr[<span class="dv">3</span>] , smiles<span class="op">=</span>attr[<span class="dv">0</span>], y<span class="op">=</span>attr[<span class="dv">1</span>].item(), pca1<span class="op">=</span>attr[<span class="dv">2</span>].item())</span>
<span id="cb32-11"><a href="methods.html#cb32-11" tabindex="-1"></a></span>
<span id="cb32-12"><a href="methods.html#cb32-12" tabindex="-1"></a><span class="cf">for</span> i, attr <span class="kw">in</span> <span class="bu">enumerate</span>(<span class="bu">zip</span>(combo_data[<span class="st">&#39;protein&#39;</span>].name, combo_data[<span class="st">&#39;protein&#39;</span>].seq)):</span>
<span id="cb32-13"><a href="methods.html#cb32-13" tabindex="-1"></a>    G.add_node(<span class="ss">f&#39;p_</span><span class="sc">{</span>i<span class="sc">}</span><span class="ss">&#39;</span>, <span class="bu">type</span><span class="op">=</span><span class="st">&#39;protein&#39;</span>, name<span class="op">=</span>attr[<span class="dv">0</span>], seq<span class="op">=</span>attr[<span class="dv">1</span>])</span>
<span id="cb32-14"><a href="methods.html#cb32-14" tabindex="-1"></a></span>
<span id="cb32-15"><a href="methods.html#cb32-15" tabindex="-1"></a><span class="cf">for</span> i, attr <span class="kw">in</span> <span class="bu">enumerate</span>(<span class="bu">zip</span>(combo_data[<span class="st">&#39;pathway&#39;</span>].name)):</span>
<span id="cb32-16"><a href="methods.html#cb32-16" tabindex="-1"></a>    G.add_node(<span class="ss">f&#39;w_</span><span class="sc">{</span>i<span class="sc">}</span><span class="ss">&#39;</span>, <span class="bu">type</span><span class="op">=</span><span class="st">&#39;pathway&#39;</span>, name<span class="op">=</span>attr[<span class="dv">0</span>])</span>
<span id="cb32-17"><a href="methods.html#cb32-17" tabindex="-1"></a></span>
<span id="cb32-18"><a href="methods.html#cb32-18" tabindex="-1"></a><span class="co"># Add edges between the nodes for each of the relationships</span></span>
<span id="cb32-19"><a href="methods.html#cb32-19" tabindex="-1"></a><span class="cf">for</span> src, dst <span class="kw">in</span> compound_protein_deges:</span>
<span id="cb32-20"><a href="methods.html#cb32-20" tabindex="-1"></a>    G.add_edge(<span class="ss">f&#39;c_</span><span class="sc">{</span>src<span class="sc">}</span><span class="ss">&#39;</span>, <span class="ss">f&#39;p_</span><span class="sc">{</span>dst<span class="sc">}</span><span class="ss">&#39;</span>, interaction<span class="op">=</span><span class="st">&#39;bind&#39;</span>, name<span class="op">=</span><span class="st">&#39;c_p&#39;</span>)</span>
<span id="cb32-21"><a href="methods.html#cb32-21" tabindex="-1"></a></span>
<span id="cb32-22"><a href="methods.html#cb32-22" tabindex="-1"></a><span class="cf">for</span> src, dst <span class="kw">in</span> pathway_compound_edges:</span>
<span id="cb32-23"><a href="methods.html#cb32-23" tabindex="-1"></a>    G.add_edge(<span class="ss">f&#39;w_</span><span class="sc">{</span>src<span class="sc">}</span><span class="ss">&#39;</span>, <span class="ss">f&#39;c_</span><span class="sc">{</span>dst<span class="sc">}</span><span class="ss">&#39;</span>, interaction<span class="op">=</span><span class="st">&#39;active&#39;</span>, name<span class="op">=</span><span class="st">&#39;w_c&#39;</span>)</span>
<span id="cb32-24"><a href="methods.html#cb32-24" tabindex="-1"></a></span>
<span id="cb32-25"><a href="methods.html#cb32-25" tabindex="-1"></a><span class="cf">for</span> src, dst <span class="kw">in</span> protein_pathway_edges:</span>
<span id="cb32-26"><a href="methods.html#cb32-26" tabindex="-1"></a>    G.add_edge(<span class="ss">f&#39;p_</span><span class="sc">{</span>src<span class="sc">}</span><span class="ss">&#39;</span>, <span class="ss">f&#39;w_</span><span class="sc">{</span>dst<span class="sc">}</span><span class="ss">&#39;</span>, interaction<span class="op">=</span><span class="st">&#39;govern&#39;</span>, name<span class="op">=</span><span class="st">&#39;p_w&#39;</span>)</span>
<span id="cb32-27"><a href="methods.html#cb32-27" tabindex="-1"></a></span>
<span id="cb32-28"><a href="methods.html#cb32-28" tabindex="-1"></a><span class="co"># Creating a subgraph with only &#39;chemical&#39; nodes</span></span>
<span id="cb32-29"><a href="methods.html#cb32-29" tabindex="-1"></a>chem_nodes <span class="op">=</span> [n <span class="cf">for</span> n, attr <span class="kw">in</span> G.nodes(data<span class="op">=</span><span class="va">True</span>) <span class="cf">if</span> attr[<span class="st">&#39;type&#39;</span>] <span class="op">==</span> <span class="st">&#39;chemical&#39;</span>]</span>
<span id="cb32-30"><a href="methods.html#cb32-30" tabindex="-1"></a>chem_subgraph <span class="op">=</span> G.subgraph(chem_nodes)</span>
<span id="cb32-31"><a href="methods.html#cb32-31" tabindex="-1"></a></span>
<span id="cb32-32"><a href="methods.html#cb32-32" tabindex="-1"></a><span class="co"># Next, we perform the Louvain community detection</span></span>
<span id="cb32-33"><a href="methods.html#cb32-33" tabindex="-1"></a>partition <span class="op">=</span> community_louvain.best_partition(chem_subgraph)</span>
<span id="cb32-34"><a href="methods.html#cb32-34" tabindex="-1"></a></span>
<span id="cb32-35"><a href="methods.html#cb32-35" tabindex="-1"></a><span class="co"># &#39;partition&#39; is a dictionary with nodes as keys and the community they belong to as values</span></span>
<span id="cb32-36"><a href="methods.html#cb32-36" tabindex="-1"></a><span class="co"># We can add these community assignments back as attributes in the original graph</span></span>
<span id="cb32-37"><a href="methods.html#cb32-37" tabindex="-1"></a>nx.set_node_attributes(G, partition, <span class="st">&#39;community&#39;</span>)</span>
<span id="cb32-38"><a href="methods.html#cb32-38" tabindex="-1"></a></span>
<span id="cb32-39"><a href="methods.html#cb32-39" tabindex="-1"></a><span class="co"># Print out the communities</span></span>
<span id="cb32-40"><a href="methods.html#cb32-40" tabindex="-1"></a><span class="cf">for</span> i, comm <span class="kw">in</span> <span class="bu">enumerate</span>(<span class="bu">set</span>(partition.values())):</span>
<span id="cb32-41"><a href="methods.html#cb32-41" tabindex="-1"></a>    <span class="bu">print</span>(<span class="ss">f&quot;Community </span><span class="sc">{</span>i<span class="sc">}</span><span class="ss">:&quot;</span>)</span>
<span id="cb32-42"><a href="methods.html#cb32-42" tabindex="-1"></a>    <span class="bu">print</span>([nodes <span class="cf">for</span> nodes <span class="kw">in</span> partition.keys() <span class="cf">if</span> partition[nodes] <span class="op">==</span> comm])</span>
<span id="cb32-43"><a href="methods.html#cb32-43" tabindex="-1"></a></span>
<span id="cb32-44"><a href="methods.html#cb32-44" tabindex="-1"></a><span class="co"># visualize the communities in the graph</span></span>
<span id="cb32-45"><a href="methods.html#cb32-45" tabindex="-1"></a>pos <span class="op">=</span> nx.spring_layout(G)</span>
<span id="cb32-46"><a href="methods.html#cb32-46" tabindex="-1"></a>cmap <span class="op">=</span> cm.get_cmap(<span class="st">&#39;viridis&#39;</span>, <span class="bu">max</span>(partition.values()) <span class="op">+</span> <span class="dv">1</span>)</span>
<span id="cb32-47"><a href="methods.html#cb32-47" tabindex="-1"></a>nx.draw_networkx_nodes(G, pos, partition.keys(), node_size<span class="op">=</span><span class="dv">40</span>, cmap<span class="op">=</span>cmap, node_color<span class="op">=</span><span class="bu">list</span>(partition.values()))</span>
<span id="cb32-48"><a href="methods.html#cb32-48" tabindex="-1"></a>nx.draw_networkx_edges(G, pos, alpha<span class="op">=</span><span class="fl">0.5</span>)</span>
<span id="cb32-49"><a href="methods.html#cb32-49" tabindex="-1"></a>plt.show()</span></code></pre></div>
<p>Applying the Louvain algorithm to the compound network offers several advantages. Firstly, it is a data-driven method without arbitrary thresholds, such as the PCA1 range of 2.9 to 5. Instead, it identifies natural clusters in the data, which may reveal hidden patterns that are not apparent when compounds are considered individually. Secondly, by grouping similar compounds, the algorithm reduces the complexity of the network, making it more manageable to analyze and interpret. Finally, the communities detected by the Louvain algorithm may correspond to groups of compounds with similar properties or effects, offering novel insights into the collective behaviour of compound combinations.</p>
<p>In conclusion, the Louvain community detection algorithm provides a more sophisticated, data-driven approach to selecting compound combinations and thus to finding the best possible compounds for the combination study. Uncovering the compound network’s inherent structure can yield richer and more meaningful insights into compounds’ combined effects.</p>
</div>
</div>
<div id="optimized-molecular-graph-generator-omg" class="section level3 hasAnchor" number="2.2.3">
<h3><span class="header-section-number">2.2.3</span> Optimized Molecular Graph Generator (OMG)<a href="methods.html#optimized-molecular-graph-generator-omg" class="anchor-section" aria-label="Anchor link to header"></a></h3>
<p>Several approaches to generating novel molecules have been implemented in the field. For instance, SMILES-based generative models use SMILES notation as a textual representation of molecules, both to train the model and to generate new molecules. Several generative models, such as Recurrent Neural Networks (RNNs), Variational Autoencoders (VAEs) or Transformer models, have been developed in this way to generate SMILES strings that correspond to novel molecules. VAEs, for example, can be used for molecule generation by encoding a molecule, usually represented as a SMILES string or a graph, into a latent space. New points in this latent space can then be decoded into new molecules. The VAE can be trained to ensure that similar points in the latent space correspond to molecules with similar properties.</p>
<p>Other models, like Generative Adversarial Networks (GANs), have also been used for molecule generation. The generator network in the GAN learns to generate new molecules, and the discriminator network learns to distinguish between actual molecules and molecules generated by the generator. By playing this adversarial game, the generator learns to generate more realistic molecules.</p>
<p>The Optimized Molecular Graph Generator (OMG) model is designed to generate graphs (molecules) while optimizing a particular property, which in this study is the PCA1 value. The <strong>Graph Convolutional Policy Network (GCPN)</strong> and a <strong>flow-based autoregressive model for graph generation (GraphAF)</strong>, both of which are graph-based generative models, were explored to accomplish this task. Although they address the same problem, their mechanisms are distinct: GCPN, employing a reinforcement learning paradigm, makes decisions based on a reward function that evaluates the generated molecule’s quality based on its chemical properties. In contrast, GraphAF, an autoregressive model, generates each new atom and bond based on the atoms and bonds previously generated. Moreover, GCPN generates molecules incrementally, choosing new atoms to add and deciding their connections to the existing molecule according to the current policy, which is learned through reinforcement learning. Conversely, GraphAF generates molecules sequentially, atom by atom and bond by bond, with the generation process being deterministic and based on the current state.</p>
<div style="text-align: center;">
<figure>
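<img src="assets/omg.png" alt="Overview of the Optimized Molecular Graph Generator (OMG): GCPN and GraphAF with RGCN as the base model" style="width: 60%; height: auto;"/>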
</figure>
<p style="text-align: justify; text-align-last: left; font-size: 12px;">
Graph Convolutional Policy Network (GCPN) and Graph Autoregressive Flow (GraphAF). In both models, RGCN is used as the base model for feature extraction. Before generating a new graph, both models need to understand the input graph’s features, such as node types and edge types, as well as any patterns or structures in the graph. Both models then learn the “rules” of graph structure from their training data and apply these rules, using different techniques, when generating new graphs.
</p>
</div>
<p>GCPN uses a reinforcement learning (RL) paradigm to fabricate molecules with refined chemical properties. This method aims to create molecules with particular properties by setting specific goals. The central strategy uses Graph Convolutional Policy Network (GCPN) and Proximal Policy Optimization (PPO).</p>
<ol style="list-style-type: decimal">
<li><p>Graph Convolutional Policy Network (GCPN): The GCPN, a deep learning model developed for graph structures, is particularly well suited to molecular data. Molecules are naturally represented as graphs, with atoms and bonds serving as nodes and edges, respectively. GCPN, a graph-based generative model, applies policy gradient methods for molecule generation. The generation process is sequential, selecting new atoms for addition and determining their connections to the existing molecule.</p></li>
<li><p>Proximal Policy Optimization (PPO): PPO, a policy optimization method in reinforcement learning, aims to improve the policy while keeping each update close to the preceding policy. This approach helps maintain stability and ensures reliable learning progress.</p></li>
<li><p>Fine-tuning with RL: The GCPN model, once pretrained, undergoes fine-tuning using reinforcement learning. The RL model sharpens its policy for generating molecules under the guidance of a reward function that assesses the chemical properties of the molecules. The generation of molecules with improved properties, such as enhanced drug-likeness, solubility, or synthetic accessibility, is the chief objective.</p></li>
</ol>
<p>In the pretraining phase, a model is trained on a large dataset. The purpose of this step is to learn general features and patterns from the data. Here, the Relational Graph Convolutional Network (RGCN) model is trained on the ZINC250k dataset. The model is trained for 500 epochs with a batch size of 96. After the pretraining phase, the model is saved for later use.</p>
<div class="sourceCode" id="cb33"><pre class="sourceCode python"><code class="sourceCode python"><span id="cb33-1"><a href="methods.html#cb33-1" tabindex="-1"></a><span class="im">import</span> torch</span>
<span id="cb33-2"><a href="methods.html#cb33-2" tabindex="-1"></a><span class="im">from</span> CovidMolGraph_TD <span class="im">import</span> CovidMolGraphTD_imbalance</span>
<span id="cb33-3"><a href="methods.html#cb33-3" tabindex="-1"></a><span class="im">from</span> torchdrug <span class="im">import</span> datasets, data, utils, core, models, tasks</span>
<span id="cb33-4"><a href="methods.html#cb33-4" tabindex="-1"></a><span class="im">from</span> torch <span class="im">import</span> nn, optim</span>
<span id="cb33-5"><a href="methods.html#cb33-5" tabindex="-1"></a></span>
<span id="cb33-6"><a href="methods.html#cb33-6" tabindex="-1"></a>torch.manual_seed(<span class="dv">42</span>)</span>
<span id="cb33-7"><a href="methods.html#cb33-7" tabindex="-1"></a></span>
<span id="cb33-8"><a href="methods.html#cb33-8" tabindex="-1"></a>dataset_zinc250k <span class="op">=</span> datasets.ZINC250k(<span class="st">&quot;./data/molecule-datasets/&quot;</span>, kekulize<span class="op">=</span><span class="va">True</span>, atom_feature<span class="op">=</span><span class="st">&quot;symbol&quot;</span>)</span>
<span id="cb33-9"><a href="methods.html#cb33-9" tabindex="-1"></a></span>
<span id="cb33-10"><a href="methods.html#cb33-10" tabindex="-1"></a>model <span class="op">=</span> models.RGCN(input_dim<span class="op">=</span>dataset_zinc250k.node_feature_dim,</span>
<span id="cb33-11"><a href="methods.html#cb33-11" tabindex="-1"></a>                    num_relation<span class="op">=</span>dataset_zinc250k.num_bond_type,</span>
<span id="cb33-12"><a href="methods.html#cb33-12" tabindex="-1"></a>                    hidden_dims<span class="op">=</span>[<span class="dv">256</span>, <span class="dv">256</span>, <span class="dv">256</span>, <span class="dv">256</span>], batch_norm<span class="op">=</span><span class="va">False</span>)</span>
<span id="cb33-13"><a href="methods.html#cb33-13" tabindex="-1"></a></span>
<span id="cb33-14"><a href="methods.html#cb33-14" tabindex="-1"></a>task <span class="op">=</span> tasks.GCPNGeneration(model, dataset_zinc250k.atom_types, max_edge_unroll<span class="op">=</span><span class="dv">12</span>,</span>
<span id="cb33-15"><a href="methods.html#cb33-15" tabindex="-1"></a>                            max_node<span class="op">=</span><span class="dv">38</span>, criterion<span class="op">=</span><span class="st">&quot;nll&quot;</span>)</span>
<span id="cb33-16"><a href="methods.html#cb33-16" tabindex="-1"></a></span>
<span id="cb33-17"><a href="methods.html#cb33-17" tabindex="-1"></a>optimizer <span class="op">=</span> optim.Adam(task.parameters(), lr <span class="op">=</span> <span class="fl">1e-3</span>)</span>
<span id="cb33-18"><a href="methods.html#cb33-18" tabindex="-1"></a></span>
<span id="cb33-19"><a href="methods.html#cb33-19" tabindex="-1"></a>solver <span class="op">=</span> core.Engine(task, dataset_zinc250k, <span class="va">None</span>, <span class="va">None</span>, optimizer,</span>
<span id="cb33-20"><a href="methods.html#cb33-20" tabindex="-1"></a>                     gpus<span class="op">=</span>[<span class="dv">0</span>, <span class="dv">1</span>, <span class="dv">2</span>, <span class="dv">3</span>, <span class="dv">4</span>, <span class="dv">5</span>], batch_size<span class="op">=</span><span class="dv">96</span>, log_interval<span class="op">=</span><span class="dv">100</span>)</span>
<span id="cb33-21"><a href="methods.html#cb33-21" tabindex="-1"></a></span>
<span id="cb33-22"><a href="methods.html#cb33-22" tabindex="-1"></a>solver.train(num_epoch<span class="op">=</span><span class="dv">500</span>)</span>
<span id="cb33-23"><a href="methods.html#cb33-23" tabindex="-1"></a>solver.save(<span class="st">&quot;./data/gcpn_dataset_zinc250k_500epoch.pkl&quot;</span>)</span>
<span id="cb33-24"><a href="methods.html#cb33-24" tabindex="-1"></a></span>
<span id="cb33-25"><a href="methods.html#cb33-25" tabindex="-1"></a>dataset_covid <span class="op">=</span> CovidMolGraphTD_imbalance(<span class="st">&quot;./data/CovidMolGraphTD_imbalance&quot;</span>)</span>
<span id="cb33-26"><a href="methods.html#cb33-26" tabindex="-1"></a></span>
<span id="cb33-27"><a href="methods.html#cb33-27" tabindex="-1"></a>model <span class="op">=</span> models.RGCN(input_dim<span class="op">=</span>dataset_covid.node_feature_dim,</span>
<span id="cb33-28"><a href="methods.html#cb33-28" tabindex="-1"></a>                    num_relation<span class="op">=</span>dataset_covid.num_bond_type,</span>
<span id="cb33-29"><a href="methods.html#cb33-29" tabindex="-1"></a>                    hidden_dims<span class="op">=</span>[<span class="dv">1024</span>, <span class="dv">1024</span>, <span class="dv">1024</span>, <span class="dv">1024</span>], batch_norm<span class="op">=</span><span class="va">True</span>)</span>
<span id="cb33-30"><a href="methods.html#cb33-30" tabindex="-1"></a></span>
<span id="cb33-31"><a href="methods.html#cb33-31" tabindex="-1"></a>task <span class="op">=</span> tasks.GCPNGeneration(model, dataset_covid.atom_types, max_edge_unroll<span class="op">=</span><span class="dv">12</span>,</span>
<span id="cb33-32"><a href="methods.html#cb33-32" tabindex="-1"></a>                            max_node<span class="op">=</span><span class="dv">38</span>, criterion<span class="op">=</span><span class="st">&quot;nll&quot;</span>)</span>
<span id="cb33-33"><a href="methods.html#cb33-33" tabindex="-1"></a></span>
<span id="cb33-34"><a href="methods.html#cb33-34" tabindex="-1"></a>optimizer <span class="op">=</span> optim.Adam(task.parameters(), lr <span class="op">=</span> <span class="fl">1e-3</span>)</span>
<span id="cb33-35"><a href="methods.html#cb33-35" tabindex="-1"></a></span>
<span id="cb33-36"><a href="methods.html#cb33-36" tabindex="-1"></a>solver <span class="op">=</span> core.Engine(task, dataset_covid, <span class="va">None</span>, <span class="va">None</span>, optimizer,</span>
<span id="cb33-37"><a href="methods.html#cb33-37" tabindex="-1"></a>                     gpus<span class="op">=</span>[<span class="dv">0</span>, <span class="dv">1</span>, <span class="dv">2</span>, <span class="dv">3</span>, <span class="dv">4</span>, <span class="dv">5</span>], batch_size<span class="op">=</span><span class="dv">128</span>, log_interval<span class="op">=</span><span class="dv">1000</span>)</span>
<span id="cb33-38"><a href="methods.html#cb33-38" tabindex="-1"></a></span>
<span id="cb33-39"><a href="methods.html#cb33-39" tabindex="-1"></a>solver.train(num_epoch<span class="op">=</span><span class="dv">1000</span>)</span>
<span id="cb33-40"><a href="methods.html#cb33-40" tabindex="-1"></a>solver.save(<span class="st">&quot;./data/gcpn_dataset_ncovid_1000epoch_batchnormalization_1024.pkl&quot;</span>)</span></code></pre></div>
<p>The next part of the experiment involves fine-tuning the pretrained model on a more specific task. This is where the model learns more task-specific patterns from a different dataset. In this case, the model is fine-tuned on a COVID-19 specific dataset using the Proximal Policy Optimization (PPO) algorithm, a reinforcement learning method. The learning rate is significantly lower than in the pretraining phase, indicating a more careful, incremental learning process. The model is trained for 100 epochs with a batch size of 16.</p>
<p>Loading the pretrained model for fine-tuning is performed with the optimizer state not being loaded. This implies that while the model parameters are loaded from the pretrained model, the state of the optimizer (which could include momentum, adaptive learning rates, etc.) is reinitialized.</p>
<p>Finally, the fine-tuned model is saved for further use or evaluation. The fine-tuned model is expected to perform better on the COVID-19 specific task than a model trained from scratch, thanks to the transfer of knowledge from the pretraining phase. This approach often reduces the amount of data required for the task-specific model and also shortens the training time.</p>
<div class="sourceCode" id="cb34"><pre class="sourceCode python"><code class="sourceCode python"><span id="cb34-1"><a href="methods.html#cb34-1" tabindex="-1"></a><span class="im">import</span> torch</span>
<span id="cb34-2"><a href="methods.html#cb34-2" tabindex="-1"></a><span class="im">from</span> torchdrug <span class="im">import</span> core, datasets, models, tasks</span>
<span id="cb34-3"><a href="methods.html#cb34-3" tabindex="-1"></a><span class="im">from</span> torch <span class="im">import</span> nn, optim</span>
<span id="cb34-4"><a href="methods.html#cb34-4" tabindex="-1"></a><span class="im">from</span> collections <span class="im">import</span> defaultdict</span>
<span id="cb34-5"><a href="methods.html#cb34-5" tabindex="-1"></a></span>
<span id="cb34-6"><a href="methods.html#cb34-6" tabindex="-1"></a>model <span class="op">=</span> models.RGCN(input_dim<span class="op">=</span>dataset_covid.node_feature_dim,</span>
<span id="cb34-7"><a href="methods.html#cb34-7" tabindex="-1"></a>                    num_relation<span class="op">=</span>dataset_covid.num_bond_type,</span>
<span id="cb34-8"><a href="methods.html#cb34-8" tabindex="-1"></a>                    hidden_dims<span class="op">=</span>[<span class="dv">256</span>, <span class="dv">256</span>, <span class="dv">256</span>, <span class="dv">256</span>], batch_norm<span class="op">=</span><span class="va">True</span>)</span>
<span id="cb34-9"><a href="methods.html#cb34-9" tabindex="-1"></a>task <span class="op">=</span> tasks.GCPNGeneration(model, dataset_covid.atom_types,</span>
<span id="cb34-10"><a href="methods.html#cb34-10" tabindex="-1"></a>                            max_edge_unroll<span class="op">=</span><span class="dv">12</span>, max_node<span class="op">=</span><span class="dv">38</span>,</span>
<span id="cb34-11"><a href="methods.html#cb34-11" tabindex="-1"></a>                            task<span class="op">=</span>[<span class="st">&#39;pca1&#39;</span>], criterion<span class="op">=</span><span class="st">&quot;ppo&quot;</span>,</span>
<span id="cb34-12"><a href="methods.html#cb34-12" tabindex="-1"></a>                            reward_temperature<span class="op">=</span><span class="dv">1</span>,</span>
<span id="cb34-13"><a href="methods.html#cb34-13" tabindex="-1"></a>                            agent_update_interval<span class="op">=</span><span class="dv">3</span>, gamma<span class="op">=</span><span class="fl">0.9</span>)</span>
<span id="cb34-14"><a href="methods.html#cb34-14" tabindex="-1"></a></span>
<span id="cb34-15"><a href="methods.html#cb34-15" tabindex="-1"></a></span>
<span id="cb34-16"><a href="methods.html#cb34-16" tabindex="-1"></a>optimizer <span class="op">=</span> optim.Adam(task.parameters(), lr<span class="op">=</span><span class="fl">1e-5</span>)</span>
<span id="cb34-17"><a href="methods.html#cb34-17" tabindex="-1"></a>solver <span class="op">=</span> core.Engine(task, dataset_covid, <span class="va">None</span>, <span class="va">None</span>, optimizer,</span>
<span id="cb34-18"><a href="methods.html#cb34-18" tabindex="-1"></a>                     gpus<span class="op">=</span>(<span class="dv">0</span>,), batch_size<span class="op">=</span><span class="dv">16</span>, log_interval<span class="op">=</span><span class="dv">10</span>)</span>
<span id="cb34-19"><a href="methods.html#cb34-19" tabindex="-1"></a></span>
<span id="cb34-20"><a href="methods.html#cb34-20" tabindex="-1"></a>solver.load(<span class="st">&quot;./data/gcpn_dataset_zinc250k_500epoch.pkl&quot;</span>,load_optimizer<span class="op">=</span><span class="va">False</span>)</span>
<span id="cb34-21"><a href="methods.html#cb34-21" tabindex="-1"></a></span>
<span id="cb34-22"><a href="methods.html#cb34-22" tabindex="-1"></a><span class="co"># RL finetuning</span></span>
<span id="cb34-23"><a href="methods.html#cb34-23" tabindex="-1"></a>solver.train(num_epoch<span class="op">=</span><span class="dv">100</span>)</span>
<span id="cb34-24"><a href="methods.html#cb34-24" tabindex="-1"></a>solver.save(<span class="st">&quot;./data/gcpn_zinc250k_500epoch_finetune_covid_100epoch.pkl&quot;</span>)</span></code></pre></div>
<p>GraphAF, however, sequentially generates molecules atom by atom and bond by bond. This approach is rooted in the concept of normalizing flows in deep learning. It employs an invertible mapping between the types of nodes (atoms) and edges (bonds) in the molecular graph and a noise distribution. The main components of GraphAF include:</p>
<ol style="list-style-type: decimal">
<li><p>Relational Graph Convolutional Networks (RGCN): The RGCN serves as the graph representation model in GraphAF. It is a variant of GCN designed to handle graphs with varied relations or edges. In molecular terms, diverse types of edges can represent different chemical bonds. By utilizing an RGCN, GraphAF can learn a more expressive representation of the molecular graph that considers the types of bonds.</p></li>
<li><p>Autoregressive Generation: This is the training task for GraphAF, in which the model learns to generate the nodes and edges of the molecular graph sequentially. The generation process is autoregressive, meaning that the generation of each new node or edge depends on the nodes and edges previously generated.</p></li>
<li><p>Node Flow Model and Edge Flow Model: These components, integral to the autoregressive generation task, define an invertible mapping between the types of nodes and edges in the molecular graph and a noise distribution. By implementing a node flow model and an edge flow model, GraphAF can generate a diverse set of molecules.</p></li>
</ol>
</div>
</div>
<div id="model-validation-and-optimization" class="section level2 hasAnchor" number="2.3">
<h2><span class="header-section-number">2.3</span> Model Validation and Optimization<a href="methods.html#model-validation-and-optimization" class="anchor-section" aria-label="Anchor link to header"></a></h2>
<p>In the development of models, a strict validation, testing, and hyperparameter tuning procedure was employed, aimed at yielding reliable and robust performance.</p>
<p>The available dataset was partitioned into training, validation, and test subsets, thereby facilitating the impartial evaluation of the model. The training procedure of the model was intricately designed to generalize model performance to unseen data, the effectiveness of which was ascertained via this three-way dataset partitioning. Initial hyperparameters, including aspects like learning rate, weight decay, model embedding size, the number of attention heads, and dropout rate, among others, were pre-determined.</p>
<p>Throughout the training process, multiple epochs were executed. Upon the completion of each epoch, an evaluation of the model’s performance was carried out on the validation set. This evaluation involved the calculation of critical metrics, including balanced accuracy, recall, precision, and the F1-score for classification and R², MSE and RMSE for the regression task.</p>
<p>To prevent overfitting, a common pitfall in deep learning models, strategies like early stopping and model checkpointing, guided by validation metrics, were primarily utilized. Early stopping implies halting the training procedure once the model’s performance on the validation set no longer shows improvement. This method helps the model avoid becoming overly fitted to the training data, thus enhancing its ability to generalize.</p>
<p>Model checkpointing was adopted to preserve the model’s state when it demonstrated superior performance, as indicated by a higher average of recall, precision, and F1-score on the validation set compared to any previous epoch in classification and higher R² for regression. This method ensures that the model, which showed the highest generalization capacity during training, is retained for future predictive tasks.</p>
<p>The model was evaluated on the test set after finalizing the training and validation phase. This additional step was critical to provide a final confirmation of the model’s ability to generalize to completely unseen data. The model’s performance on the validation set also drove the refinement of hyperparameters. This approach to hyperparameter optimization allowed the model to be fine-tuned for the task at hand. These collective measures served to fortify the model’s ability to generalize and to inhibit overfitting, ensuring the model’s reliability and robustness.</p>
</div>
<div id="model-enhancement" class="section level2 hasAnchor" number="2.4">
<h2><span class="header-section-number">2.4</span> Model Enhancement<a href="methods.html#model-enhancement" class="anchor-section" aria-label="Anchor link to header"></a></h2>
<p>Various strategies were employed to enhance the performance of the models used in this study. Two such approaches were the integration of models and the use of self-supervised learning.</p>
<p>In the discipline of machine learning, the fusion of distinct models, known as an ensemble approach, is often employed to optimize performance and accuracy. In the presented study, such an approach was adopted, integrating the BioGIP and GLMP models. Rather than using raw chemical feature vectors for node representation, latent embeddings sourced from the GLMP model were utilized. These embeddings, encompassing the molecules’ concentrated structural and physicochemical properties, were believed to provide a more in-depth and relevant portrayal of molecular properties. The anticipation was that this approach could improve the BioGIP model’s ability to classify or regress biological entities accurately.</p>
<p>Additionally, self-supervised learning was another key strategy used for model enhancement. In this method, models generate their labels from the input data, which has shown its effectiveness in predicting drug properties and molecule generation. Graph contrastive learning techniques such as InfoGraph<span class="citation"><a href="#ref-sun2019infograph">[36]</a></span> and Attribute Masking<span class="citation"><a href="#ref-you2020graph">[37]</a></span>, which operate on graph-structured data and maximize the mutual information between node-level and graph-level representations, were used to enhance the GLMP model. Once trained, these models can generate meaningful representations of new graphs or nodes, useful in tasks like node classification, link prediction, or graph classification. This method was employed in the GLMP model to improve its ability to predict molecular properties.</p>
<p>In the generative model, OMG, self-supervised learning was incorporated in training the Relational Graph Convolutional Network (RGCN) model on the ZINC250k dataset. Therefore, this technique played a critical role in improving the accuracy of molecular property prediction (GLMP) and refining the molecule generation process (OMG). In the context of the OMG model, the role of the teacher model was essential in guiding the molecule generation process. The GCPN variant of the OMG model used a reinforcement learning (RL) framework, which characterized and evaluated the generated molecules based on their PCA1 value, a crucial descriptor of molecular properties as our primary objective. This evaluation was performed using an ordinal regressor model, thus enabling a more goal-oriented generation of molecules. The model, therefore, aimed to create molecules that optimize the targeted PCA1 value.</p>
<p>This study modified the OMG model’s source code to accommodate a custom task – the prediction of the PCA1 value. The novelty in this approach was the use of an ordinal regression instead of regression due to the poor performance of the latter. Hence, improvements to the predictor component of the OMG directly influenced the model’s effectiveness. The present study does not purport to pioneer new algorithms or models for graph generation. Rather, it focuses on the astute adaptation and implementation of well-known algorithms tailored explicitly for generating particular molecules. The study’s primary contribution is underscored by the intricate modifications enacted on existing models, ensuring their effective adaptation to bespoke tasks within the confines of this research.</p>
</div>
<div id="data-acquisition-software-and-libraries" class="section level2 hasAnchor" number="2.5">
<h2><span class="header-section-number">2.5</span> Data acquisition, software and libraries<a href="methods.html#data-acquisition-software-and-libraries" class="anchor-section" aria-label="Anchor link to header"></a></h2>
<p>The Pharmaceutical Bioinformatics Research Group at Uppsala University provided cell profiling data for this study, which was then preprocessed and normalized using the Python packages Polars and Pandas. The primary analysis, graph representation learning, was executed in Python using the deep learning framework PyTorch, with PyTorch Geometric and the Deep Graph Library (DGL) for the graph neural network implementation. Preprocessing and featurization of chemical compounds were done using DeepChem, with the RDKit and Mordred libraries used for physicochemical featurization. The TorchDrug toolkit was utilized to create network architectures for drug generation tasks.</p>
<p>Chemoinformatics operations were performed with RDKit, while Scikit-learn was used for machine learning tasks. Results were visualized using Plotly, and statistical analyses were performed in Python and R. This integrated approach revealed new interactions between chemical compounds, cellular phenotypes, and biological entities, identifying new potential drug targets.</p>
<p>This study capitalized on the computational prowess of Berzelius, an AI/ML-focused compute cluster in Sweden using NVIDIA A100 GPUs, and in-house NVIDIA RTX 3090 GPUs, hosted at Uppsala University’s Pharmaceutical Bioinformatics Research Group.</p>
<table>
<caption>Tools and Packages Utilized in the Study</caption>
<colgroup>
<col width="25%" />
<col width="25%" />
<col width="25%" />
<col width="25%" />
</colgroup>
<thead>
<tr class="header">
<th>Tool/Package</th>
<th>Description</th>
<th>Version</th>
<th>Additional Info</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td>Python</td>
<td>Programming language</td>
<td>3.8.10</td>
<td>-</td>
</tr>
<tr class="even">
<td>Polars</td>
<td>Data manipulation</td>
<td>0.17.13</td>
<td>-</td>
</tr>
<tr class="odd">
<td>Pandas</td>
<td>Data manipulation</td>
<td>1.5.3</td>
<td>-</td>
</tr>
<tr class="even">
<td>PyTorch</td>
<td>Deep learning framework</td>
<td>1.13.1+cu116</td>
<td>-</td>
</tr>
<tr class="odd">
<td>PyTorch Geometric</td>
<td>Graph representation learning</td>
<td>2.2.0</td>
<td>Extension of PyTorch</td>
</tr>
<tr class="even">
<td>Deep Graph Library (DGL)</td>
<td>Graph representation learning</td>
<td>1.0.1+cu116</td>
<td>Extension of PyTorch</td>
</tr>
<tr class="odd">
<td>DeepChem</td>
<td>Chemoinformatics</td>
<td>2.7.1</td>
<td>Used for pre-processing and featurization</td>
</tr>
<tr class="even">
<td>RDKit</td>
<td>Chemoinformatics</td>
<td>2022.09.5</td>
<td>Used for physico-chemical featurization</td>
</tr>
<tr class="odd">
<td>Mordred</td>
<td>Chemoinformatics</td>
<td>Latest</td>
<td>Used for physico-chemical featurization</td>
</tr>
<tr class="even">
<td>TorchDrug</td>
<td>Drug discovery toolkit</td>
<td>0.2.0.post1</td>
<td>Used for network architectures</td>
</tr>
<tr class="odd">
<td>Scikit-learn</td>
<td>Machine Learning</td>
<td>1.2.1</td>
<td>Used for model evaluation and comparing models</td>
</tr>
<tr class="even">
<td>BioTransformers</td>
<td>ESM/Protbert models</td>
<td>0.1.17</td>
<td>Protein featurization</td>
</tr>
<tr class="odd">
<td>Plotly</td>
<td>Data visualization</td>
<td>5.14.1</td>
<td>-</td>
</tr>
<tr class="even">
<td>R</td>
<td>Statistical analysis</td>
<td>4.2.2</td>
<td>Used for specific statistical analyses</td>
</tr>
</tbody>
</table>

</div>
</div>
<h3>References<a href="references.html#references" class="anchor-section" aria-label="Anchor link to header"></a></h3>
<div id="refs" class="references csl-bib-body">
<div id="ref-kappal2019data" class="csl-entry">
<div class="csl-left-margin">[35] </div><div class="csl-right-inline">S. Kappal <em>et al.</em>, <span>“Data normalization using median median absolute deviation MMAD based z-score for robust predictions vs. Min–max normalization,”</span> <em>London Journal of Research in Science: Natural and Formal</em>, 2019. </div>
</div>
<div id="ref-sun2019infograph" class="csl-entry">
<div class="csl-left-margin">[36] </div><div class="csl-right-inline">F.-Y. Sun, J. Hoffmann, V. Verma, and J. Tang, <span>“Infograph: Unsupervised and semi-supervised graph-level representation learning via mutual information maximization,”</span> <em>arXiv preprint arXiv:1908.01000</em>, 2019. </div>
</div>
<div id="ref-you2020graph" class="csl-entry">
<div class="csl-left-margin">[37] </div><div class="csl-right-inline">Y. You, T. Chen, Y. Sui, T. Chen, Z. Wang, and Y. Shen, <span>“Graph contrastive learning with augmentations,”</span> <em>Advances in neural information processing systems</em>, vol. 33, pp. 5812–5823, 2020. </div>
</div>
</div>
            </section>

          </div>
        </div>
      </div>
<a href="intro.html" class="navigation navigation-prev " aria-label="Previous page"><i class="fa fa-angle-left"></i></a>
<a href="results.html" class="navigation navigation-next " aria-label="Next page"><i class="fa fa-angle-right"></i></a>
    </div>
  </div>
<script src="libs/gitbook-2.6.7/js/app.min.js"></script>
<script src="libs/gitbook-2.6.7/js/clipboard.min.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-search.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-sharing.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-fontsettings.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-bookdown.js"></script>
<script src="libs/gitbook-2.6.7/js/jquery.highlight.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-clipboard.js"></script>
<script>
// Start the GitBook reader once the "gitbook" module is available, passing
// this page's plugin configuration to gitbook.start().
gitbook.require(["gitbook"], function (gitbook) {
  var readerConfig = {
    // Sharing plugin: one boolean per network, plus the "all" list.
    sharing: {
      github: false,
      facebook: true,
      twitter: true,
      linkedin: false,
      weibo: false,
      instapaper: false,
      vk: false,
      whatsapp: false,
      all: ["facebook", "twitter", "linkedin", "weibo", "instapaper"]
    },
    // Font settings plugin.
    fontsettings: {
      theme: "white",
      family: "sans",
      size: 2
    },
    // No edit / history / view links are configured for this book.
    edit: null,
    history: {
      link: null,
      text: null
    },
    view: {
      link: null,
      text: null
    },
    // Alternative formats listed for download.
    download: ["_main.pdf", "_main.epub", "_main.mobi"],
    search: {
      engine: "fuse",
      options: null
    },
    toc: {
      collapse: "subsection"
    },
    toolbar: {
      position: "fixed"
    }
  };

  gitbook.start(readerConfig);
});
</script>

<!-- dynamically load mathjax for compatibility with self-contained -->
<script>
  (function () {
    // Inject the MathJax <script> at runtime instead of using a static tag.
    //
    // `src` holds the configured MathJax location; an empty string or the
    // literal "true" selects the default CDN build below.
    // NOTE(review): the "true" value looks like it is substituted by the page
    // generator -- confirm before hard-coding the URL.
    var src = "true";
    if (src === "" || src === "true") {
      src = "https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.9/latest.js?config=TeX-MML-AM_CHTML";
    }

    // Keep the explicit scheme. The previous version stripped "https:" on
    // every non-file: page, yielding a protocol-relative URL, so a page served
    // over http: fetched this executable script over plain HTTP as well.
    // An https: URL loads correctly from http:, https: and file: pages alike.
    var script = document.createElement("script");
    script.src = src;
    document.head.appendChild(script);
  })();
</script>
</body>

</html>