DRAFT: Preprocess the IMPROVE way for our Uno model #1

Draft · wants to merge 168 commits into base: develop

Changes from all commits (168 commits)
31912fa
o Change to new link
rajeeja Oct 19, 2023
b3a2cc8
o Add a new folder for this work, this preserves the old working code…
rajeeja Oct 19, 2023
7b604b3
test branch push permissions
Oct 20, 2023
51065af
cleanup
Oct 20, 2023
1b2b397
o Boiler plate for improve
rajeeja Nov 8, 2023
b81f65b
o Add more boiler plate to preprocess
rajeeja Nov 8, 2023
6b1ef97
o Add more readme
rajeeja Nov 8, 2023
255aa88
o Uno train structure
rajeeja Nov 8, 2023
27d8149
Draft uno_preprocess2.py
RylieWeaver Nov 12, 2023
6753b56
Delete Pilot1/uno_preprocess2.py
RylieWeaver Nov 13, 2023
973c28d
Create uno_preprocess_draft.py
RylieWeaver Nov 13, 2023
1130561
load_aggregated_single_response IMPROVE standardize
Nov 21, 2023
7ba0cc2
fix accidental line delete
Nov 21, 2023
cfb417c
o incremental changes and comments
rajeeja Nov 28, 2023
1b249b7
Create To-Do.md
RylieWeaver Nov 29, 2023
07c3b4b
o more preprocess
rajeeja Nov 29, 2023
fa7eb17
o Add an IMPROVE compliant model defs file
rajeeja Nov 29, 2023
166b849
IMPROVE data preprocessing and uno improve model
Nov 29, 2023
3ea0523
change to parquet for speed, include CANDLE_DATA_DIR, and download li…
Dec 3, 2023
abdd547
don't need scalings because using IMPROVE functions now
Dec 3, 2023
4e51e73
o Try to use single drug and not July2020 data
rajeeja Dec 4, 2023
711c210
o preprocess getting final touches
rajeeja Dec 5, 2023
ead83e5
Adapt uno_improve and uno_preprocess
Dec 5, 2023
4fbc438
scale drug data and make network smaller for testing
Dec 5, 2023
4513f2f
soft code train using params file and use preprocess.sh
Dec 10, 2023
c2e9858
IMPROVE compliance, sh files, and start singularity file
Dec 12, 2023
539cae9
Delete Pilot1/Uno_IMPROVE/uno_baseline_keras2.py
RylieWeaver Dec 13, 2023
d45e197
Delete Pilot1/Uno_IMPROVE/uno.py
RylieWeaver Dec 13, 2023
523e638
Delete Pilot1/Uno_IMPROVE/uno_data.py
RylieWeaver Dec 13, 2023
620c12d
Delete Pilot1/Uno_IMPROVE/uno_preprocess_draft.py
RylieWeaver Dec 13, 2023
9f25dfc
Delete Pilot1/Uno_IMPROVE/uno_train.py
RylieWeaver Dec 13, 2023
6ead6b8
Rename uno_preprocess_improve.sh to preprocess.sh
RylieWeaver Dec 13, 2023
92b1dc1
Delete Pilot1/Uno_IMPROVE/uno_preprocess_new.py
RylieWeaver Dec 13, 2023
ff4598f
Rename uno_infer.py to uno_infer_improve.py
RylieWeaver Dec 13, 2023
360ca25
preprocess and train sh files standardized for cuda devices and etc...
Dec 19, 2023
9b968bf
IMPROVE_DATA_DIR instead of CANDLE_DATA_DIR
Dec 19, 2023
4d43e06
Update uno_preprocess_improve.py
wilke Dec 20, 2023
62a7e67
Merge pull request #4 from JDACS4C-IMPROVE/wilke-patch-1
wilke Dec 20, 2023
cc7cea5
Updated preprocess, check for config file, removed CUDA
wilke Dec 20, 2023
99f3877
Merge branch 'preprocess_improve' of github.com:JDACS4C-IMPROVE/Bench…
wilke Dec 20, 2023
bc86ccd
Check for config file, removed CUDA
wilke Dec 20, 2023
38d7ca7
Check for config file, removed CUDA
wilke Dec 20, 2023
89ff908
Debugging and simplifywq
wilke Dec 20, 2023
d8bc2f0
Added missing option
wilke Dec 20, 2023
53a2a93
Reverse any changes to original Uno
Dec 20, 2023
8883fc1
Revert "Reverse any changes to original Uno"
Dec 20, 2023
c241f53
Had to revert VS Code formatting... making the original uno have no c…
Dec 20, 2023
f571c4f
Create conda_env.sh
RylieWeaver Jan 10, 2024
3e4dba0
Update conda_env.sh
RylieWeaver Jan 11, 2024
3f4b6f5
Update conda_env.sh
RylieWeaver Jan 11, 2024
1a86ba2
Update conda_env.sh
RylieWeaver Jan 11, 2024
3156dfa
CANDLE_DATA_DIR is deprecated. Use IMPROVE_DATA_DIR
Jan 12, 2024
b11ef74
Rough preprocess call in train.sh and preprocess python file formatting
Jan 12, 2024
8791ec3
Fix pathing looking for processed data inside IMPROVE_DATA_DIR
Jan 12, 2024
0305423
tweak parameter file
Jan 12, 2024
f236788
Path fix for preprocess call
Jan 12, 2024
af95a33
add test inferral
Jan 16, 2024
0899ef5
unnecessary IMPROVE_DATA_DIR check
Jan 19, 2024
2cbbd27
adding hyperparameter functionality with number of layers
Jan 19, 2024
0069aef
warmup type hyperparameter functionality
Jan 19, 2024
efd656f
better naming and updating txt file for new hyperparameters
Jan 19, 2024
193ea56
txt file for num layers hyperparameter functionality
Jan 19, 2024
9b7bd3f
parquet reading and smaller dataset debug setting
Jan 20, 2024
3d83c7e
Better debug setting
Jan 20, 2024
a3172c6
parquet reading in infer
Jan 20, 2024
f6a1ccb
Revise debug setting to work (can't load only n rows)
Jan 20, 2024
f63ca20
Update README.md
RylieWeaver Jan 22, 2024
6dd8676
update train_params definitions
Jan 22, 2024
f5087a6
fix test split naming
Jan 22, 2024
26e6553
update train_params definitions and polish preprocessing / add debug …
Jan 23, 2024
06da9c9
polish preprocessing / add debug option
Jan 23, 2024
e9cc679
refine compose_data_arrays and make prints cleaner
Jan 23, 2024
75da962
make prints cleaner
Jan 23, 2024
61c892f
update default model and fix train debug param name
Jan 22, 2024
730df21
times
Jan 23, 2024
9346094
times
Jan 23, 2024
7c5e5d9
subset data functionality
Jan 23, 2024
ca81cf4
save and load one data file
Jan 24, 2024
4b46c6a
slight hyperparameter change
Jan 24, 2024
b121312
slight hyperparameter change
Jan 24, 2024
679bcdb
update defaults and remove old prints
Jan 24, 2024
cbb703d
fix test set read
Jan 24, 2024
675ebbb
merge differenlt and create common samples function
Jan 25, 2024
25e8641
improve subsetting
Jan 25, 2024
ed89959
preprocess debug false by default
Jan 25, 2024
6f4e8b9
gene symbol to solve multilevel dataframe issue
Jan 25, 2024
8342cbf
debug accidentally set default true
Jan 25, 2024
e17b6be
train data batching
Jan 25, 2024
e953ffa
update infer and polish others to use auc param
Jan 25, 2024
41d2fe7
flatten to accomodate store predictions
Jan 25, 2024
8d3e0e1
better ntime keeping
Jan 26, 2024
41d00f6
import timekeeping instead of multiple functions
Jan 26, 2024
8a56735
deal with memory issues. data loaders, r2 callback adjustment, and gp…
Jan 26, 2024
13835b9
update params for batching
Jan 26, 2024
af11a05
more for dealing with large datasets
Jan 26, 2024
119d2c5
optimizer for HPO add
Jan 27, 2024
14f8933
Update infer to work with large datasets and put common functions in …
Jan 28, 2024
2c1eee9
GPU check
Jan 28, 2024
c209479
Update README.md
RylieWeaver Jan 28, 2024
64821d0
update def file and instructions
Jan 28, 2024
bad448a
take preprocess out of train.sh
Jan 28, 2024
bc9f726
default params and preprocess in train.sh
Jan 28, 2024
acb4668
protobuf version fix
Jan 28, 2024
877926a
preprocess.sh
Jan 28, 2024
ef467cc
clone Uno_IMPROVE not original
Jan 28, 2024
59d504c
update def file
Jan 28, 2024
850fd6f
cross study add
Jan 28, 2024
432104b
singularity testing
Jan 28, 2024
854dcb9
optimizer and missing interaction activation defaults
Jan 28, 2024
335744a
move src location
Jan 28, 2024
0f230e3
not in use for now
Jan 28, 2024
9b304d2
restore defaults
Jan 28, 2024
dd00259
can't have singularity in here when trying to build with cloning this…
Jan 28, 2024
fe4557d
make robust preprocess in train.sh
Jan 28, 2024
cdea77b
Update train.sh
RylieWeaver Jan 28, 2024
b259cc2
directory to execute preprocess in container
Jan 28, 2024
b658935
fix variable
Jan 28, 2024
a998a79
fix index error with data generation
Jan 29, 2024
743638f
better early stopping default and no warmup option
Jan 29, 2024
9fe64be
data generator factory for r2 callback. additional definitions fix fo…
Jan 30, 2024
482b858
fix definitions list error
Jan 30, 2024
c4425c1
verbosity to tell indices
Jan 30, 2024
87a392e
soft debugging and subsetting. training batch size and validating bat…
Jan 31, 2024
f4c50a4
use generator batch size
Jan 31, 2024
2f52db7
load preprocess_param into train
Jan 31, 2024
8fb6905
make train debug and subset CL passable
Jan 31, 2024
27904de
missing comma
Jan 31, 2024
b1f773e
update sh scripts to say IMPROVE and preprocess list allowed datasets
Feb 1, 2024
eb54186
stick with CANDLE_MODEL because models are candleized, datasets are I…
Feb 1, 2024
89bbbf7
slight cchanges for using generator
Feb 1, 2024
70d0eba
update README
Feb 1, 2024
3db34ed
punctuation
Feb 1, 2024
718a701
unintentional added line
Feb 1, 2024
43d236e
make preprocess have train params
Feb 2, 2024
7db1c1c
move definitions for hpo compatibility
Feb 2, 2024
e6e3cda
move parameters
Feb 2, 2024
909c434
Update README.md data downloading
RylieWeaver Feb 6, 2024
686ad38
Data Shuffling and regression hyperparameter
Feb 7, 2024
92c19c5
Merge branch 'preprocess_improve' of github.com:JDACS4C-IMPROVE/Bench…
Feb 7, 2024
868889a
simple comment fix
Feb 7, 2024
f61fd35
better early stopping parameter
Feb 7, 2024
63974d7
activation function note
Feb 8, 2024
14f3dd9
regression activation
Feb 9, 2024
66a6fc2
comment uneccessary gpu memory growth
Feb 12, 2024
9c3999f
Take out lr_log_10_range hyperparameter and improve compliance with t…
Feb 18, 2024
2adf861
standardize train.sh
Feb 18, 2024
4311ab4
debug power_yj scaling
Feb 21, 2024
3848544
exploring different hp-spaces functionality
Feb 21, 2024
11867ef
update to new params
Feb 22, 2024
8166a78
fix typo in params type
Feb 22, 2024
751a5cd
Update README.md
RylieWeaver Feb 22, 2024
d407873
Update README.md
RylieWeaver Feb 22, 2024
79eceb9
better pathing defaults
Feb 22, 2024
07ef2de
Merge branch 'preprocess_improve' of github.com:JDACS4C-IMPROVE/Bench…
Feb 22, 2024
30467b4
cleaning up
Feb 23, 2024
cff4afb
clean
Feb 23, 2024
7810fcc
comment test prediction
Feb 25, 2024
5f635ed
clean up gpu detection
Feb 25, 2024
9ef0ebe
take test prediction out of train
Feb 25, 2024
8799c15
update parameters
Feb 28, 2024
80a3429
different scheme for larger datasets (merging during batch yielding)
Mar 19, 2024
21d88db
cleanup given addition of new model
Mar 27, 2024
3b52f32
combine utils
Mar 30, 2024
b3b33c5
put more functions in utils
Mar 31, 2024
1c430ea
have batch merging as default
Mar 31, 2024
2fd7635
better naming
Mar 31, 2024
c101791
update infer
Mar 31, 2024
a371f1f
revise back to original
Mar 31, 2024
27 changes: 27 additions & 0 deletions Pilot1/Uno_IMPROVE/README.md
@@ -0,0 +1,27 @@
# Files in the repo
- Modified for IMPROVE (single-drug configuration):
  - uno_default_model.txt
  - uno_preprocess_improve.py
  - uno_train_improve.py
  - uno_infer_improve.py

# Conda Run (Miniconda version: 23.11.0)
- conda create --name Uno_IMPROVE python=3.7.16
- conda activate Uno_IMPROVE
- conda config --add channels conda-forge
- conda install tensorflow-gpu=2.10.0
- pip install git+https://github.com/ECP-CANDLE/candle_lib@develop
- pip install protobuf==3.20.0
- git clone https://github.com/JDACS4C-IMPROVE/IMPROVE.git
- export PYTHONPATH=<IMPROVE_LIBRARY_PATH>/:$PYTHONPATH
- pip install pyarrow==12.0.1 scikit-learn==1.0.2 joblib==1.3.2
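
A quick environment sanity check before running anything (illustrative; the expected version should match the installs above):

```
python -c "import tensorflow as tf; print(tf.__version__)"  # expect 2.10.0
python -c "import improve; import candle"                   # both should import cleanly
```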


To run the modified version of the code, run the following commands:
```
export IMPROVE_DATA_DIR=<DESIRED_DATA_DIR>
wget --cut-dirs=9 -P ~/$IMPROVE_DATA_DIR -np -nH -m https://web.cels.anl.gov/projects/IMPROVE_FTP/candle/public/improve/benchmarks/single_drug_drp/benchmark-data-pilot1/csa_data/
export PYTHONPATH=<IMPROVE_LIBRARY>/:$PYTHONPATH
python uno_preprocess_improve.py
python uno_train_improve.py
```
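
Inference is not shown above, but the repo also includes `uno_infer_improve.py` (see the rename commits in this PR). Assuming it follows the same calling convention as the preprocess and train scripts, a run would look like:

```
python uno_infer_improve.py
```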
3 changes: 3 additions & 0 deletions Pilot1/Uno_IMPROVE/To-Do.md
@@ -0,0 +1,3 @@
1: Change the datasets to the IMPROVE library
2: Use the splits provided by the IMPROVE library
3: Use the IMPROVE functions for data_preprocess
266 changes: 266 additions & 0 deletions Pilot1/Uno_IMPROVE/csa_wf_v3.py
@@ -0,0 +1,266 @@
""" Python implementation of cross-study analysis workflow """

import os
import subprocess
import warnings
from time import time
from pathlib import Path

import pandas as pd

# IMPROVE imports
from improve import framework as frm

# LightGBM imports
# TODO: change this for your model
import uno_preprocess_improve
import uno_train_improve
import uno_infer_improve

# from ap_utils.classlogger import Logger
# from ap_utils.utils import get_print_func, Timer


class Timer:
""" Measure time. """
def __init__(self):
self.start = time()

def timer_end(self):
self.end = time()
return self.end - self.start

def display_timer(self, print_fn=print):
time_diff = self.timer_end()
if (time_diff) // 3600 > 0:
print_fn("Runtime: {:.1f} hrs".format( (time_diff)/3600) )
else:
print_fn("Runtime: {:.1f} mins".format( (time_diff)/60) )


fdir = Path(__file__).resolve().parent

maindir = Path(f"./cross_study_HPO1")
MAIN_ML_DATA_DIR = Path(f"./{maindir}/ml_data")
MAIN_MODEL_DIR = Path(f"./{maindir}/models")
MAIN_INFER_OUTDIR = Path(f"./{maindir}/infer")

# Check that environment variable "IMPROVE_DATA_DIR" has been specified
if os.getenv("IMPROVE_DATA_DIR") is None:
    raise Exception("ERROR ! Required system variable not specified. "
                    "You must define IMPROVE_DATA_DIR ... Exiting.\n")
os.environ["CANDLE_DATA_DIR"] = os.environ["IMPROVE_DATA_DIR"]

params = frm.initialize_parameters(
    fdir,
    default_model="csa_workflow_params.txt",
)
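# Note: with the accompanying csa_workflow_params.txt, `params` provides (at least)
# model_name, raw_data_dir, x_data_dir, y_data_dir, and splits_dir, which are
# consumed below to build the raw-data paths.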

main_datadir = Path(os.environ["IMPROVE_DATA_DIR"])
raw_datadir = main_datadir / params["raw_data_dir"]
x_datadir = raw_datadir / params["x_data_dir"]
y_datadir = raw_datadir / params["y_data_dir"]
splits_dir = raw_datadir / params["splits_dir"]

# lg = Logger(main_datadir/"csa.log")
print_fn = print
# print_fn = get_print_func(lg.logger)
print_fn(f"File path: {fdir}")

### Source and target data sources
## Set 1 - full analysis
source_datasets = ["gCSI", "CTRPv2", "GDSCv1", "GDSCv2", "CCLE"]
target_datasets = ["gCSI", "CTRPv2", "GDSCv1", "GDSCv2", "CCLE"]
## Set 2 - smaller datasets
# source_datasets = ["CCLE", "gCSI"]
# target_datasets = ["CCLE", "gCSI"]
# source_datasets = ["CCLE", "gCSI", "GDSCv1", "GDSCv2"]
# target_datasets = ["CCLE", "gCSI", "GDSCv1", "GDSCv2"]
# source_datasets = ["CCLE", "GDSCv1"]
# target_datasets = ["CCLE", "gCSI", "GDSCv1", "GDSCv2"]
## Set 3 - full analysis for a single source
# source_datasets = ["CCLE"]
# source_datasets = ["CTRPv2"]
# target_datasets = ["CCLE", "CTRPv2", "gCSI", "GDSCv1", "GDSCv2"]
# target_datasets = ["CCLE", "gCSI", "GDSCv1", "GDSCv2"]
# target_datasets = ["CCLE", "gCSI", "GDSCv2"]
## Set 4 - same source and target
# source_datasets = ["CCLE"]
# target_datasets = ["CCLE"]
## Set 5 - single source and target
# source_datasets = ["GDSCv1"]
# target_datasets = ["CCLE"]

only_cross_study = False
# only_cross_study = True

y_col_name = "auc"
# y_col_name = "auc1"

## Splits
# split_nums = [] # all splits
split_nums = [0]
# split_nums = [4, 7]
# split_nums = [1, 4, 7]
# split_nums = [1, 3, 5, 7, 9]

def build_split_fname(source: str, split: int, phase: str):
    """Build a split file name. If the file does not exist, the caller skips it."""
    return f"{source}_split_{split}_{phase}.txt"
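
# e.g. build_split_fname("CCLE", 0, "train") -> "CCLE_split_0_train.txt"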

# ===============================================================
### Generate CSA results (within- and cross-study)
# ===============================================================

timer = Timer()
# Iterate over source datasets
# Note! The "source_data_name" iterations are independent of each other
print_fn(f"\nsource_datasets: {source_datasets}")
print_fn(f"target_datasets: {target_datasets}")
print_fn(f"split_nums: {split_nums}")
# import pdb; pdb.set_trace()
for source_data_name in source_datasets:

    # Get the split file paths
    # This parsing assumes splits file names are: SOURCE_split_NUM_[train/val/test].txt
    if len(split_nums) == 0:
        # Get all splits
        split_files = list(splits_dir.glob(f"{source_data_name}_split_*.txt"))
        split_nums = [str(s).split("split_")[1].split("_")[0] for s in split_files]
        split_nums = sorted(set(split_nums))
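        # e.g. str(s) = ".../CCLE_split_3_train.txt" -> "3"; sorted(set(...))
        # dedupes split ids across the train/val/test phase files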
        # num_splits = 1
    else:
        # Use the specified splits
        split_files = []
        for s in split_nums:
            split_files.extend(list(splits_dir.glob(f"{source_data_name}_split_{s}_*.txt")))

    files_joined = [str(s) for s in split_files]

    # --------------------
    # Preprocess and Train
    # --------------------
    for split in split_nums:
        print_fn(f"Split id {split} out of {len(split_nums)} splits.")
        # Check that train, val, and test are available. Otherwise, continue to the next split.
        # split = 11
        # files_joined = [str(s) for s in split_files]
        # TODO: check this! (this `continue` skips the phase, not the whole split)
        for phase in ["train", "val", "test"]:
            fname = build_split_fname(source_data_name, split, phase)
            # print(f"{phase}: {fname}")
            if fname not in "\t".join(files_joined):
                warnings.warn(f"\nThe {phase} split file {fname} is missing (continue to next split)")
                continue

        for target_data_name in target_datasets:
            if only_cross_study and (source_data_name == target_data_name):
                continue  # only cross-study
            print_fn(f"\nSource data: {source_data_name}")
            print_fn(f"Target data: {target_data_name}")

            ml_data_outdir = MAIN_ML_DATA_DIR / f"{source_data_name}-{target_data_name}" / f"split_{split}"

            if source_data_name == target_data_name:
                # If source and target are the same, then infer on the test split
                test_split_file = f"{source_data_name}_split_{split}_test.txt"
            else:
                # If source and target are different, then infer on the entire target dataset
                test_split_file = f"{target_data_name}_all.txt"

            # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            # p1 (none): Preprocess train data
            # train_split_files = list((ig.splits_dir).glob(f"{source_data_name}_split_0_train*.txt"))  # TODO: placeholder for lc analysis
            timer_preprocess = Timer()
            # ml_data_path = graphdrp_preprocess_improve.main([
            #     "--train_split_file", f"{source_data_name}_split_{split}_train.txt",
            #     "--val_split_file", f"{source_data_name}_split_{split}_val.txt",
            #     "--test_split_file", str(test_split_file_name),
            #     "--ml_data_outdir", str(ml_data_outdir),
            #     "--y_col_name", y_col_name
            # ])
            print_fn("\nPreprocessing")
            train_split_file = f"{source_data_name}_split_{split}_train.txt"
            val_split_file = f"{source_data_name}_split_{split}_val.txt"
            print_fn(f"train_split_file: {train_split_file}")
            print_fn(f"val_split_file: {val_split_file}")
            print_fn(f"test_split_file: {test_split_file}")
            print_fn(f"ml_data_outdir: {ml_data_outdir}")
            preprocess_run = ["python",
                              "uno_preprocess_improve.py",
                              "--train_split_file", str(train_split_file),
                              "--val_split_file", str(val_split_file),
                              "--test_split_file", str(test_split_file),
                              "--ml_data_outdir", str(ml_data_outdir),
                              "--y_col_name", str(y_col_name)
                              ]
            result = subprocess.run(preprocess_run, capture_output=True,
                                    text=True, check=True)
            # print(result.stdout)
            # print(result.stderr)
            timer_preprocess.display_timer(print_fn)

            # p2 (p1): Train model
            # Train a single model for a given [source, split] pair
            # Train using train samples and early stop using val samples
            model_outdir = MAIN_MODEL_DIR / f"{source_data_name}" / f"split_{split}"
            if model_outdir.exists() is False:
                train_ml_data_dir = ml_data_outdir
                val_ml_data_dir = ml_data_outdir
                timer_train = Timer()
                # graphdrp_train_improve.main([
                #     "--train_ml_data_dir", str(train_ml_data_dir),
                #     "--val_ml_data_dir", str(val_ml_data_dir),
                #     "--model_outdir", str(model_outdir),
                #     "--epochs", str(epochs),  # available in config_file
                #     # "--ckpt_directory", str(MODEL_OUTDIR),  # TODO: we'll use candle known param ckpt_directory instead of model_outdir
                #     # "--cuda_name", "cuda:5"
                # ])
                print_fn("\nTrain")
                print_fn(f"train_ml_data_dir: {train_ml_data_dir}")
                print_fn(f"val_ml_data_dir: {val_ml_data_dir}")
                print_fn(f"model_outdir: {model_outdir}")
                # import pdb; pdb.set_trace()
                train_run = ["python",
                             "uno_train_improve.py",
                             "--train_ml_data_dir", str(train_ml_data_dir),
                             "--val_ml_data_dir", str(val_ml_data_dir),
                             "--model_outdir", str(model_outdir),
                             "--y_col_name", y_col_name
                             ]
                result = subprocess.run(train_run, capture_output=True,
                                        text=True, check=True)
                # print(result.stdout)
                # print(result.stderr)
                timer_train.display_timer(print_fn)

            # p3 (p1, p2): Inference
            test_ml_data_dir = ml_data_outdir
            model_dir = model_outdir
            infer_outdir = MAIN_INFER_OUTDIR / f"{source_data_name}-{target_data_name}" / f"split_{split}"
            timer_infer = Timer()
            # graphdrp_infer_improve.main([
            #     "--test_ml_data_dir", str(test_ml_data_dir),
            #     "--model_dir", str(model_dir),
            #     "--infer_outdir", str(infer_outdir),
            #     # "--cuda_name", "cuda:5"
            # ])
            print_fn("\nInfer")
            print_fn(f"test_ml_data_dir: {test_ml_data_dir}")
            print_fn(f"infer_outdir: {infer_outdir}")
            infer_run = ["python",
                         "uno_infer_improve.py",
                         "--test_ml_data_dir", str(test_ml_data_dir),
                         "--model_dir", str(model_dir),
                         "--infer_outdir", str(infer_outdir),
                         "--y_col_name", y_col_name
                         ]
            result = subprocess.run(infer_run, capture_output=True,
                                    text=True, check=True)
            timer_infer.display_timer(print_fn)

            # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

timer.display_timer(print_fn)
print_fn("Finished a full cross-study run.")
8 changes: 8 additions & 0 deletions Pilot1/Uno_IMPROVE/csa_workflow_params.txt
@@ -0,0 +1,8 @@
[Global_Params]
model_name = "DRP_model"

[CSA_Workflow]
raw_data_dir = "raw_data"
x_data_dir = "x_data"
y_data_dir = "y_data"
splits_dir = "splits"