-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #81 from mmcdermott/dev
added autogluon support, more models, more preprocessing strategies
- Loading branch information
Showing
68 changed files
with
2,677 additions
and
1,126 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
# MIMIC-IV Example | ||
|
||
This is an example of how to extract a MEDS dataset from MIMIC-IV. All scripts in this README are assumed to | ||
be run **not** from this directory but from the root directory of this entire repository (i.e., one directory | ||
up from this one). | ||
|
||
## Extract MIMIC-IV MEDS Data | ||
|
||
### Download pre-extracted data from gcp | ||
|
||
Install the [gcloud client](https://cloud.google.com/sdk/docs/install) and then run the following command to download the MEDS data from the gcp bucket: | ||
|
||
```console | ||
export MIMICIV_MEDS_DIR=??? # set to the directory in which you want to store the raw MIMIC-IV data | ||
export OUTPUT_TABULARIZATION_DIR=??? # set to the output directory for the tabularized data | ||
export OUTPUT_MODEL_DIR=${OUTPUT_TABULARIZATION_DIR}/results/ # set to the base results directory | ||
|
||
cd $MIMICIV_MEDS_DIR | ||
gcloud storage cp gs://ehr_standardization_schema/MEDS_Extract_v0.0.7_test.zip meds_extract_0.0.7_data.zip | ||
unzip meds_extract_0.0.7_data.zip | ||
rm meds_extract_0.0.7_data.zip | ||
``` | ||
|
||
```console | ||
conda create -n meds_tab python=3.12 | ||
conda activate meds_tab | ||
pip install "meds-tab==0.0.5" | ||
``` | ||
|
||
Next we need to get some labels for our tasks. We will use the `long_los` and `icu_mortality` tasks as examples. | ||
|
||
### Download pre-extracted labels from gcp: | ||
|
||
```console | ||
TASKS=("long_los" "icu_mortality") | ||
TASKS_DIR="$MIMICIV_MEDS_DIR/tasks/" # set to the directory in which you want to store all tasks | ||
|
||
mkdir -p "${TASKS_DIR}" # create a directory for the task | ||
|
||
for TASK_NAME in "${TASKS[@]}" | ||
do | ||
gcloud storage cp "gs://ehr_standardization_schema/benchmark_v1/data/labels/${TASK_NAME}.parquet" "${TASKS_DIR}/${TASK_NAME}/0.parquet" | ||
done | ||
``` | ||
|
||
## Run Tabularization and XGBoost Baseline | ||
|
||
```console | ||
export N_PARALLEL_WORKERS=48 # Set number of workers | ||
export RESHARD_DIR=??? # set to directory to output resharded meds data | ||
bash MIMICIV_TUTORIAL/tabularize_meds.sh "${MIMICIV_MEDS_DIR}" "$RESHARD_DIR" $OUTPUT_TABULARIZATION_DIR \ | ||
"long_los,icu_mortality" $TASKS_DIR $OUTPUT_MODEL_DIR $N_PARALLEL_WORKERS \ | ||
"tabularization.aggs=[static/present,code/count,value/count,value/sum,value/sum_sqd,value/min,value/max]" \ | ||
"tabularization.window_sizes=[2h,12h,1d,7d,30d,365d,full]" | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,106 @@ | ||
#!/usr/bin/env bash | ||
# Runs the tabularization pipeline in this order: reshard the input data, describe codes, | ||
# tabularize static and time-series data, then for each task cache the task data and run xgboost. | ||
# Invoke with --help (or -h) to print the list of required positional arguments. | ||
|
||
# Exit as soon as any command returns a non-zero status. | ||
set -e | ||
|
||
# Print the usage synopsis and a one-line description of each positional argument to stdout. | ||
# Takes no arguments; "$0" in the usage line expands to the name the script was invoked as. | ||
# NOTE(review): the descriptions below say "medications data", but "MEDS" in the variable names | ||
# may instead refer to the MEDS dataset format -- confirm the wording of the help text. | ||
print_help() { | ||
echo "Usage: $0 <MIMICIV_MEDS_DIR> <MIMICIV_MEDS_RESHARD_DIR> <OUTPUT_TABULARIZATION_DIR> <TASKS> <TASKS_DIR> <OUTPUT_MODEL_DIR> <N_PARALLEL_WORKERS> [additional arguments]" | ||
echo | ||
echo "Arguments:" | ||
echo " MIMICIV_MEDS_DIR Directory containing MIMIC-IV medications data" | ||
echo " MIMICIV_MEDS_RESHARD_DIR Directory for resharded MIMIC-IV medications data" | ||
echo " OUTPUT_TABULARIZATION_DIR Output directory for tabularized data" | ||
echo " TASKS Comma-separated list of tasks to run (e.g., 'long_los,icu_mortality')" | ||
echo " TASKS_DIR Directory containing task-specific data" | ||
echo " OUTPUT_MODEL_DIR Output directory for models" | ||
echo " N_PARALLEL_WORKERS Number of parallel workers to use" | ||
echo | ||
echo "Additional arguments will be passed to the underlying commands." | ||
} | ||
|
||
# Help flag: only the first argument is inspected; print the usage text and exit successfully. | ||
if [[ "$1" == "--help" || "$1" == "-h" ]]; then | ||
print_help | ||
exit 0 | ||
fi | ||
|
||
# Require at least the 7 positional arguments listed in print_help; anything beyond | ||
# those is treated as additional arguments. On failure the error message and the usage | ||
# text are both written to stdout and the script exits with status 1. | ||
if [ "$#" -lt 7 ]; then | ||
echo "Error: Not enough arguments provided." | ||
print_help | ||
exit 1 | ||
fi | ||
|
||
# Bind the 7 required positional arguments to named variables, in the order shown | ||
# in the usage line of print_help. | ||
MIMICIV_MEDS_DIR="$1" | ||
MIMICIV_MEDS_RESHARD_DIR="$2" | ||
OUTPUT_TABULARIZATION_DIR="$3" | ||
TASKS="$4" | ||
TASKS_DIR="$5" | ||
OUTPUT_MODEL_DIR="$6" | ||
N_PARALLEL_WORKERS="$7" | ||
|
||
# Drop the 7 required arguments so that "$@" holds only the additional arguments. | ||
shift 7 | ||
|
||
# Split the comma-separated TASKS string into TASK_ARRAY. The IFS=',' prefix applies | ||
# only to this read command, so the shell's IFS is unchanged afterwards. | ||
IFS=',' read -ra TASK_ARRAY <<< "$TASKS" | ||
|
||
# Echo the resolved configuration to stdout before any work starts. TASKS is shown | ||
# after splitting (space-separated array elements), and "$@" at this point contains | ||
# only the additional arguments that remain after the shift above. | ||
echo "Input arguments:" | ||
echo "MIMICIV_MEDS_DIR: $MIMICIV_MEDS_DIR" | ||
echo "MIMICIV_MEDS_RESHARD_DIR: $MIMICIV_MEDS_RESHARD_DIR" | ||
echo "OUTPUT_TABULARIZATION_DIR: $OUTPUT_TABULARIZATION_DIR" | ||
echo "TASKS:" "${TASK_ARRAY[@]}" | ||
echo "TASKS_DIR: $TASKS_DIR" | ||
echo "OUTPUT_MODEL_DIR: $OUTPUT_MODEL_DIR" | ||
echo "N_PARALLEL_WORKERS: $N_PARALLEL_WORKERS" | ||
echo "Additional arguments:" "$@" | ||
echo | ||
|
||
# Reshard the data from MIMICIV_MEDS_DIR into MIMICIV_MEDS_RESHARD_DIR. | ||
# The worker sweep is fixed at range(0,6) here rather than derived from | ||
# N_PARALLEL_WORKERS, and the additional arguments ("$@") are not forwarded to this step. | ||
echo "Resharding data" | ||
MEDS_transform-reshard_to_split \ | ||
--multirun \ | ||
worker="range(0,6)" \ | ||
hydra/launcher=joblib \ | ||
input_dir="$MIMICIV_MEDS_DIR" \ | ||
cohort_dir="$MIMICIV_MEDS_RESHARD_DIR" \ | ||
'stages=["reshard_to_split"]' \ | ||
stage="reshard_to_split" \ | ||
stage_configs.reshard_to_split.n_subjects_per_shard=2500 \ | ||
"polling_time=5" | ||
|
||
# Describe codes. This and every later step read their input from the "data" | ||
# subdirectory of the reshard directory. "$@" is not forwarded to this step either. | ||
echo "Describing codes" | ||
meds-tab-describe \ | ||
"input_dir=${MIMICIV_MEDS_RESHARD_DIR}/data" "output_dir=$OUTPUT_TABULARIZATION_DIR" | ||
|
||
# Static tabularization: the first step that receives the additional arguments ("$@"). | ||
echo "Tabularizing static data" | ||
meds-tab-tabularize-static \ | ||
"input_dir=${MIMICIV_MEDS_RESHARD_DIR}/data" "output_dir=$OUTPUT_TABULARIZATION_DIR" \ | ||
do_overwrite=False "$@" | ||
|
||
# Time-series tabularization, swept over N_PARALLEL_WORKERS worker indices with the | ||
# joblib launcher. Unlike the other steps, no progress message is echoed before it. | ||
meds-tab-tabularize-time-series \ | ||
--multirun \ | ||
worker="range(0,$N_PARALLEL_WORKERS)" \ | ||
hydra/launcher=joblib \ | ||
"input_dir=${MIMICIV_MEDS_RESHARD_DIR}/data" "output_dir=$OUTPUT_TABULARIZATION_DIR" \ | ||
do_overwrite=False "$@" | ||
|
||
# For each requested task: cache the task-specific data, then run the xgboost sweep. | ||
# Because of `set -e`, a failure in any task stops the script before later tasks run. | ||
for TASK in "${TASK_ARRAY[@]}" | ||
do | ||
# Labels for the task are read from the ${TASKS_DIR}/${TASK}/ directory. | ||
# NOTE(review): hydra/launcher=joblib is passed here without --multirun -- confirm it is needed. | ||
echo "Running task_specific_caching.py for task: $TASK" | ||
meds-tab-cache-task \ | ||
hydra/launcher=joblib \ | ||
"input_dir=${MIMICIV_MEDS_RESHARD_DIR}/data" "output_dir=$OUTPUT_TABULARIZATION_DIR" \ | ||
"input_label_dir=${TASKS_DIR}/${TASK}/" "task_name=${TASK}" do_overwrite=False "$@" | ||
|
||
# Models are written under ${OUTPUT_MODEL_DIR}/${TASK}/. The sweeper is set to 1000 trials | ||
# with N_PARALLEL_WORKERS jobs; "$@" comes last on the command line. | ||
echo "Running xgboost for task: $TASK" | ||
meds-tab-xgboost \ | ||
--multirun \ | ||
worker="range(0,$N_PARALLEL_WORKERS)" \ | ||
"input_dir=${MIMICIV_MEDS_RESHARD_DIR}/data" "output_dir=$OUTPUT_TABULARIZATION_DIR" \ | ||
"output_model_dir=${OUTPUT_MODEL_DIR}/${TASK}/" "task_name=$TASK" do_overwrite=False \ | ||
"hydra.sweeper.n_trials=1000" "hydra.sweeper.n_jobs=${N_PARALLEL_WORKERS}" \ | ||
"$@" | ||
done | ||
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.