Pull request #7: Feature/split model instances

Steijaert, Marvin [JRDBE Non-J&J] · stsouko · commit a30f278fe97e · 2025-09-24T04:17:01.000-04:00
Merge in ASX-JFUG/chytorch from feature/split_model_instances to dev

* commit 'fa2e1a664500072a8759726bb57a6c7a3f9f0fcb':
  minor improvements
  add unittest and make sagemaker-ready
  create copy of encoder for MC inference
diff --git a/chytorch/zoo/autodl/inference_v1/gtpp.py b/chytorch/zoo/autodl/inference_v1/gtpp.py
@@ -27,14 +27,16 @@
 from functools import cached_property
 from gtpp_client import ComputedPropertyDescription, Table, TableColumn, TableRow, ComputeExJob
 from gtpp_client.util import ContextStopwatch
-from kserve_gtpp.model_base import ModelBase, L as BASE_LOGGER, BadDataException
 from os import getenv
 from os.path import abspath
 from torch import set_num_threads, no_grad
 from torch.utils.data import DataLoader
 from typing import List
 from ..model import ChIMP, RotaryChIMP
-
+if getenv("GTPP_DEPLOYMENT_FLAVOR", "kserve").lower() == "sagemaker":
+    from sagemaker_gtpp.model_base import ModelBase, L as BASE_LOGGER, BadDataException
+else :
+    from kserve_gtpp.model_base import ModelBase, L as BASE_LOGGER, BadDataException
 
 # NOTE(review): presumably the token-id threshold above which atom tokens count as prompt tokens — confirm against the model code
 prompt_shift = 120
 # set torch's thread count from the CPU_COUNT environment variable (falls back to 6 when unset)
 set_num_threads(int(getenv('CPU_COUNT', 6)))
diff --git a/chytorch/zoo/autodl/inference_v1/test_gtpp.py b/chytorch/zoo/autodl/inference_v1/test_gtpp.py
@@ -0,0 +1,104 @@
+from concurrent.futures import ThreadPoolExecutor
+from copy import deepcopy
+import functools
+import gtpp_client.util as cu
+from gtpp_client.models.compute_ex_job import ComputeExJob
+import numpy as np
+from os import getenv
+import time
+from typing import List
+import unittest
+from warnings import warn
+from .gtpp import ChIMPServe
+if getenv("GTPP_DEPLOYMENT_FLAVOR", "kserve").lower() == "sagemaker":
+    from sagemaker_gtpp.model_base import ModelBase
+    import sagemaker_gtpp.test_base as test_base
+    from sagemaker_gtpp.test_base import MicroserviceException
+    import sagemaker_gtpp.test_util as tu
+else :
+    from kserve_gtpp.model_base import ModelBase
+    import kserve_gtpp.test_base as test_base
+    from kserve_gtpp.test_base import MicroserviceException
+    import kserve_gtpp.test_util as tu
+
+class MyTestCase(test_base.TestBase):
+    """Test suite for ``ChIMPServe`` built on the deployment flavor's ``TestBase``.
+
+    The output table is treated as consecutive column triplets
+    ``(main, lcl, ucl)``. The stability check is replaced by a variant that
+    compares only every third column (the first of each triplet), and an extra
+    test verifies ``lcl <= main <= ucl`` for every triplet.
+    """
+
+    def get_model(self) -> ModelBase:
+        """
+        This method MUST be implemented. It should return an instance of the model class.
+        You may place additional model-specific code here (for example, to initialize
+        the model in a certain non-standard way). get_model() is executed once for each test,
+        that is, each test from the test suite uses a new instance of the model class.
+        :return: an instance of the model class that derives from ModelBase.
+        """
+        model = ChIMPServe()
+        # ignore uq columns for non-deterministic uncertainty quantification results
+        self._test_stability = self._test_stability_without_uq_columns
+        return model
+
+    def _test_stability_without_uq_columns(self, codes: List[str]):
+        """Modified variant of ``TestBase._test_stability()`` that ignores the uncertainty columns.
+
+        Repeatedly splits ``STABILITY_TEST_SMILES`` into random chunks, computes the
+        chunks concurrently and asserts that the merged result (reduced to every
+        third column) equals the result of the first iteration.
+
+        :param codes: property codes passed to ``run_model_compute``.
+        """
+        test_smiles = self.STABILITY_TEST_SMILES
+        result_table = None
+        print(f'Running n_inner_loop_repeats={self.STABILITY_NUM_INNER_LOOP_REPEATS}')
+        for i in range(self.STABILITY_NUM_INNER_LOOP_REPEATS):
+            smiles_chunks = list(tu.gen_random_chunks(test_smiles))
+            with cu.ContextStopwatch(f'num. chunks={len(smiles_chunks)}'):
+                # normally, the number of concurrently executing threads
+                # will be equal to the number of chunks test_smiles list is split into
+                options = self.STABILITY_COMPUTE_OPTIONS.copy()
+                with ThreadPoolExecutor(max_workers=self.STABILITY_MAX_THREADS) as executor:
+                    ret = executor.map(functools.partial(self.run_model_compute,
+                                                         property_codes=codes, options=options), smiles_chunks)
+                    complete_table_original = cu.TableMerger.append_multiple_tables(ret)
+                    complete_table = self._only_keep_every_third_column(complete_table_original)
+                if result_table is None:
+                    result_table = complete_table
+                    # Note: in all currently deployed models, there should be 1-to-1 mapping between
+                    # the input chemical structures and output table rows.
+                    # This may change in the future (the gtpp API itself has no such restriction).
+                    self.assertEqual(len(complete_table_original.rows), len(test_smiles), 'Num. output rows is not the same as '
+                                                                               'the size of the input list')
+                    print('First iteration done, got sample result table for comparison.')
+                else:
+                    try:
+                        tu.assert_tables_equal(self, result_table, complete_table)
+                        print(f'Passed with num. chunks={len(smiles_chunks)}')
+                    except Exception:
+                        # report which chunking failed, then let the original error propagate
+                        print(f'FAILED with num. chunks={len(smiles_chunks)}')
+                        raise
+                print(f'Iteration #{i + 1}: success.')
+
+        print('_test_stability passed')
+
+    def _only_keep_every_third_column(self, table):
+        """Return a deep copy of ``table`` reduced to the columns at index 0, 3, 6, ...
+
+        :param table: table with ``columns`` and ``rows`` (each row having ``values``).
+        :return: a new table; the input table is left unmodified.
+        """
+        table_out = deepcopy(table)
+        # filter the copy's own columns so the result shares no objects with the input
+        table_out.columns = [c for i, c in enumerate(table_out.columns) if i % 3 == 0]
+        for row in table_out.rows:
+            row.values = [v for i, v in enumerate(row.values) if i % 3 == 0]
+        return table_out
+
+    def test_uncertainty_columns(self):
+        """Assert that lcl <= main <= ucl for all properties and all test compounds.
+
+        Rows containing ``None`` (e.g. for the empty SMILES in the input) are
+        excluded from the comparison.
+        """
+        test_smiles = ['C[C@H](N)C(=O)O', 'CC(O)C(=O)O', 'OCC(O)CO', 'Oc1ccccc1',
+                       'Nc1ccncc1', 'C#CC(C)(O)CC', '', 'ClCCCl', 'O=C1CCC(=O)N1', 'O=CCCCC=O',
+                       'N[C@@H]1CONC1=O', 'S1C=CSC1=C'] + self.get_10_valid_smiles()
+        all_prop_codes = [pd.code for pd in self.get_properties()]
+        res = self.run_model_compute(smiles_list=test_smiles,
+                                     property_codes=all_prop_codes)
+
+        n_columns = len(res.columns)
+        # a trailing partial triplet would otherwise be skipped silently
+        self.assertEqual(n_columns % 3, 0,
+                         'Expected output columns in (main, lcl, ucl) triplets')
+
+        # filter once; without any complete row the checks below would pass vacuously
+        valid_rows = [row.values for row in res.rows if None not in row.values]
+        self.assertTrue(valid_rows, 'No output row without missing values to check')
+
+        def column(index):
+            return np.array([values[index] for values in valid_rows])
+
+        for main_idx in range(0, n_columns, 3):
+            pred_main = column(main_idx)
+            pred_lcl = column(main_idx + 1)
+            pred_ucl = column(main_idx + 2)
+            self.assertTrue((pred_lcl <= pred_main).all(),
+                            f'lcl > main in column triplet starting at index {main_idx}')
+            self.assertTrue((pred_main <= pred_ucl).all(),
+                            f'main > ucl in column triplet starting at index {main_idx}')
+
+# run the test suite when this file is executed as a script
+if __name__ == '__main__':
+    unittest.main()
diff --git a/chytorch/zoo/autodl/model.py b/chytorch/zoo/autodl/model.py
@@ -22,10 +22,10 @@
 #
 from chytorch.nn import (MoleculeRotaryEncoder, MoleculeEncoder, ConditionedMaskedSlicer,
                          CensoredLoss, MaskedNaNLoss, MultiTaskLoss, LossDispatcher)
+from copy import deepcopy
 from functools import reduce
 from lightning.pytorch import LightningModule
 from operator import add
-from threading import Lock
 from torch import bfloat16, zeros, zeros_like, stack, empty, minimum, maximum, where, ones, tensor
 from torch.nn import Linear, BCEWithLogitsLoss, SmoothL1Loss
 from torch.nn.functional import embedding
@@ -89,23 +89,29 @@ def __init__(self, d_model: int = 256, max_tokens: int = 10_000,
         self.lr_mode = lr_mode
         self.betas = betas
         self.weight_decay = weight_decay
+        self.__copied_encoder = None
 
+    @property
+    def _mc_encoder(self):
+        """Lazily created deep copy of ``self.encoder`` whose layers are in train mode.
+
+        The copy is built on first access and reused afterwards, so the original
+        encoder never has to be switched between eval and train mode.
+
+        NOTE(review): if this class is a torch ``Module``, assigning the copy as an
+        attribute presumably registers it as a submodule, so a later ``self.eval()``
+        would put the copy back into eval mode — confirm.
+
+        :return: the cached encoder copy.
+        """
+        # explicit None check: do not rely on the truthiness of the encoder object
+        if self.__copied_encoder is None:
+            encoder = deepcopy(self.encoder)
+            for layer in encoder.layers:
+                layer.train()
+            # publish only after every layer is in train mode, so a concurrent
+            # caller never receives a half-configured copy
+            self.__copied_encoder = encoder
+        return self.__copied_encoder
+    
     def predict(self, batch):
         # each mol in minibatch must be with the same prompt size.
         # I'm not going to check it here
         prompt_size = (batch.atoms[0] > self.prompt_shift).sum()
         prompt_batch = (batch.atoms[:, :prompt_size], batch.neighbors[:, :prompt_size], batch.distances[:, :prompt_size])
 
         cache = self.build_cache(batch)
-        with Lock():  # make sure MC runs are not going in parallel
-            mid = self.head(self.encoder(batch, cache=cache)[:, :prompt_size]).flatten()
-            for x in self.encoder.layers:
-                x.train()
-            mc = stack([
-                self.head(self.encoder(prompt_batch, cache=cache, cache_direction='left')).flatten()
-                for _ in range(self.monte_carlo_runs)
-            ])
-            self.eval()
+        mid = self.head(self.encoder(batch, cache=cache)[:, :prompt_size]).flatten()
+        mc = stack([
+            self.head(self._mc_encoder(prompt_batch, cache=cache, cache_direction='left')).flatten()
+            for _ in range(self.monte_carlo_runs)
+        ])
 
         # get quantiles
         low = mc.quantile(self.monte_carlo_quantile, dim=0)