Commit

Use Docker image for training (#4)
* use Docker image for training

* create servable
ImScientist authored Feb 17, 2024
1 parent 7b4818c commit 018e878
Showing 14 changed files with 211 additions and 327 deletions.
14 changes: 14 additions & 0 deletions Dockerfile
@@ -0,0 +1,14 @@
FROM tensorflow/tensorflow:2.13.0-gpu-jupyter

RUN apt-get update && apt-get install graphviz -y

WORKDIR /tf

COPY requirements.txt .

RUN pip install --no-cache-dir -r requirements.txt

COPY src ./src

ENV TF_CPP_MIN_LOG_LEVEL=2
ENV PYTHONPATH=/tf:/tf/src
147 changes: 83 additions & 64 deletions README.md
@@ -18,17 +18,44 @@ distribution instead of a single value we modify the neural network by:
We apply both models to the NYC taxi trip data that can be found
on the [nyc.gov](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page) website.

## Docker image and environment variables

- Build the image:
```shell
docker build -t travel_time -f Dockerfile .
```

- The environment variables used by the model are defined in `src/settings.py`. Override them as needed:
```shell
export DATA_DIR="$(pwd)"/data
export TFBOARD_DIR="$(pwd)"/tfboard
export ARTIFACTS_DIR="$(pwd)"/artifacts
# maximum memory in GB that TensorFlow is allowed to allocate
export GPU_MEMORY_LIMIT=16
```
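
- `GPU_MEMORY_LIMIT` is presumably consumed in `src/settings.py` via TensorFlow's virtual-device API. A minimal sketch of how such a cap is typically enforced (only the variable name comes from this README; the code itself is an assumption, not the repo's actual implementation):
```python
import os
import tensorflow as tf

# Hypothetical sketch: cap TensorFlow's GPU memory allocation.
# The limit is read in GB (as in this README) and converted to MB,
# the unit expected by LogicalDeviceConfiguration.
limit_gb = int(os.environ.get('GPU_MEMORY_LIMIT', '16'))

gpus = tf.config.list_physical_devices('GPU')
if gpus:
    tf.config.set_logical_device_configuration(
        gpus[0],
        [tf.config.LogicalDeviceConfiguration(memory_limit=limit_gb * 1024)])
```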

## Collect & preprocess data

- We can collect the NYC taxi trip data (trip records and taxi zones) for the whole of 2016
from the [nyc.gov](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page) website and store it in `DATA_DIR`:
```shell
-PYTHONPATH=$(pwd) python src/main.py collect-data --year=2016
docker run -it --rm --runtime=nvidia --gpus all --name=experiment \
-v $DATA_DIR:/tf/data \
-v $TFBOARD_DIR:/tf/tfboard \
-v $ARTIFACTS_DIR:/tf/artifacts \
--env GPU_MEMORY_LIMIT=$GPU_MEMORY_LIMIT \
travel_time:latest python src/main.py collect-data --year=2016
```

- To generate features from the data and split it into training, validation and test datasets, execute:
```shell
-PYTHONPATH=$(pwd) python src/main.py preprocess-data \
docker run -it --rm --runtime=nvidia --gpus all --name=experiment \
-v $DATA_DIR:/tf/data \
-v $TFBOARD_DIR:/tf/tfboard \
-v $ARTIFACTS_DIR:/tf/artifacts \
--env GPU_MEMORY_LIMIT=$GPU_MEMORY_LIMIT \
travel_time:latest python src/main.py preprocess-data \
--tr=0.8 --va=0.1 --te=0.1
```
It splits the data from every month into training, validation and test datasets and stores each split in a separate folder,
@@ -46,20 +73,19 @@

## Train model

-- Change the following variables in `src/settings.py`:
-  - `DATA_DIR`: raw and preprocessed data location
-  - `TFBOARD_DIR`: location of the logs visualized in TensorBoard
-  - `ARTIFACTS_DIR`: location where the model and dataset wrappers will be stored
-- Train the model. The json strings that you can provide overwrite the default arguments used by the model:
- The JSON strings you provide override the model's default arguments:
```shell
-PYTHONPATH=$(pwd) python src/main.py train \
docker run -it --rm --runtime=nvidia --gpus all --name=experiment \
-v $DATA_DIR:/tf/data \
-v $TFBOARD_DIR:/tf/tfboard \
-v $ARTIFACTS_DIR:/tf/artifacts \
--env GPU_MEMORY_LIMIT=$GPU_MEMORY_LIMIT \
travel_time:latest python src/main.py train \
--model_wrapper=ModelPDF \
--model_args='{"l2": 0.0001, "batch_normalization": false, "layer_sizes": [64, [64, 64], [64, 64], 32, 8]}' \
--ds_args='{"max_files": 2}' \
--callbacks_args='{"period": 10, "profile_batch": 0}' \
-  --training_args='{"epochs": 40}'
--training_args='{"epochs": 100}'
```
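
- For orientation, a `ModelPDF`-style network typically ends in a TensorFlow Probability head that turns the last hidden units into the parameters of a distribution. A minimal sketch under that assumption (the feature count, layer sizes and the `Normal` choice are illustrative, not the repo's exact architecture):
```python
import tensorflow as tf
import tensorflow_probability as tfp

tfd = tfp.distributions

# Hypothetical sketch of a probabilistic regression head, not the actual ModelPDF.
inputs = tf.keras.Input(shape=(12,))  # 12 features, as in the serving example below
x = tf.keras.layers.Dense(64, activation='relu')(inputs)
x = tf.keras.layers.Dense(2)(x)       # raw location and scale parameters
outputs = tfp.layers.DistributionLambda(
    lambda t: tfd.Normal(loc=t[..., :1],
                         scale=1e-3 + tf.math.softplus(t[..., 1:])))(x)
model = tf.keras.Model(inputs, outputs)

# Trained by maximizing the likelihood of the observed travel times:
model.compile(optimizer='adam', loss=lambda y, dist: -dist.log_prob(y))
```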

- After training the model, the following directory structure will be generated (the experiment id is generated
@@ -77,6 +103,16 @@
└── validation/
```

- Check the training logs with TensorBoard:
```shell
docker run --rm --name=tfboard \
-p 6006:6006 \
-v $TFBOARD_DIR:/tf/tfboard \
travel_time:latest tensorboard --logdir /tf/tfboard --host 0.0.0.0
# visit http://localhost:6006/
```

## Evaluate model

- We can evaluate the accuracy and the uncertainty estimates provided by the predicted probability distribution functions
@@ -97,56 +133,39 @@
<img src="figs/pdf-model_pct_plot.png" alt="isolated" height="250"/>
<img src="figs/pdf-model_mean-to-std_histogram.png" alt="isolated" height="250"/>
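
- One common way to check such uncertainty estimates, sketched here as an assumption about what the evaluation can look like (not the repo's actual evaluation code): if the model predicts `Normal(mu, sigma)` per trip, the values `F(y_true)` should be close to uniform on `[0, 1]` when the uncertainty is well calibrated.
```python
import numpy as np
import scipy.stats as st

def pit_values(y_true: np.ndarray, mu: np.ndarray, sigma: np.ndarray) -> np.ndarray:
    """Probability integral transform: CDF of each prediction at the true value."""
    return st.norm(loc=mu, scale=sigma).cdf(y_true)

# A calibrated model puts roughly 10% of the PIT values in each decile:
pit = pit_values(np.array([600.0, 900.0]), mu=np.array([580.0, 950.0]),
                 sigma=np.array([60.0, 90.0]))
print(np.histogram(pit, bins=10, range=(0.0, 1.0))[0])
```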

-## Serve model (WIP)
-
-- Variables to set:
-  ```shell
-  ARTIFACTS_DIR= < settings.ARTIFACTS_DIR >
-  EXPERIMENT=ex_010
-  MODEL_DIR=${ARTIFACTS_DIR}/${EXPERIMENT}
-  ```
-
-- Model that predicts parameters of a probability distribution function:
-  - Since the output of the model is a random variable we make it deterministic by:
-    - cutting the last layer
-    - optionally adding a deterministic layer that calculates the mean and standard deviation of the distribution
-
-  To create this modified model we run:
-  ```shell
-  PYTHONPATH=$(pwd) python src/main.py prepare-servable \
-    --load_dir=${MODEL_DIR}
-  ```
-
-- Start the TensorFlow Serving container and make a sample request:
-  ```shell
-  docker run -t --rm -p 8501:8501 \
-    --name=serving \
-    -v "$MODEL_DIR/model_mean_std:/models/model_mean_std/1" \
-    -e MODEL_NAME=model_mean_std \
-    tensorflow/serving
-  # Does not work for the model with Lognormal dist
-  curl -X POST http://localhost:8501/v1/models/model_mean_std:predict \
-    -H 'Content-type: application/json' \
-    -d '{"signature_name": "serving_default", "instances": [{"dropoff_area": [7.9e-05], "dropoff_lat": [40.723752], "dropoff_lon": [-73.976968], "month": [1], "passenger_count": [1], "pickup_area": [0.000422], "pickup_lat": [40.744235], "pickup_lon": [-73.906306], "time": [800], "trip_distance": [2.3], "vendor_id": [0], "weekday": [3]}]}'
-  docker container stop serving
-  ```
-
-- Quantile model (not tested)
-
-- Useful commands to check the model signature:
-  ```shell
-  # output 1,2: `serve`
-  saved_model_cli show --dir ${ARTIFACTS_DIR}/${EXPERIMENT}/model
-  saved_model_cli show --dir ${ARTIFACTS_DIR}/${EXPERIMENT}/model_mean_std
-  # output 1: `__saved_model_init_op`
-  # output 2: `__saved_model_init_op`, `serving_default`
-  saved_model_cli show --dir ${ARTIFACTS_DIR}/${EXPERIMENT}/model --tag_set serve
-  saved_model_cli show --dir ${ARTIFACTS_DIR}/${EXPERIMENT}/model_mean_std --tag_set serve
-  saved_model_cli show --dir ${ARTIFACTS_DIR}/${EXPERIMENT}/model_mean_std \
-    --tag_set serve --signature_def serving_default
-  ```
## Serve model

- We serve a `ModelPDF` that can output the mean and standard deviation of the predicted travel time distribution.

- First, we create a servable from the trained model. For example, we will use the model from experiment `ex_000`
that is stored in `$ARTIFACTS_DIR/ex_000`. The code below generates a new servable
in `$ARTIFACTS_DIR/ex_000/model_mean_std`:
```shell
docker run -it --rm --runtime=nvidia --gpus all --name=experiment \
-v $ARTIFACTS_DIR:/tf/artifacts \
travel_time:latest python src/main.py prepare-servable \
--load_dir=/tf/artifacts/ex_000
```
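
Conceptually, the servable replaces the random model output with deterministic tensors. A rough sketch of what such a step can look like (the signature names `mean_value` and `std` are taken from the requests below; the helper itself is an assumption, not the repo's code):
```python
import tensorflow as tf

FEATURES = ['time', 'trip_distance', 'pickup_lon', 'pickup_lat', 'pickup_area',
            'dropoff_lon', 'dropoff_lat', 'dropoff_area', 'passenger_count',
            'vendor_id', 'weekday', 'month']

def export_mean_std(model: tf.keras.Model, export_dir: str) -> None:
    """Hypothetical helper: wrap a model whose output is a TFP distribution
    with two deterministic serving signatures, `mean_value` and `std`."""
    # Dtypes are assumed float32 for all features for simplicity.
    spec = {f: tf.TensorSpec([None, 1], tf.float32, name=f) for f in FEATURES}

    @tf.function(input_signature=[spec])
    def mean_value(features):
        return {'mean': model(features).mean()}

    @tf.function(input_signature=[spec])
    def std(features):
        return {'std': model(features).stddev()}

    tf.saved_model.save(model, export_dir,
                        signatures={'mean_value': mean_value, 'std': std})
```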
- Next, we start the TensorFlow Serving container and mount the newly created servable into it:
```shell
MODEL_DIR=$ARTIFACTS_DIR/ex_000/model_mean_std
docker run -t --rm -p 8501:8501 \
--name=serving \
-v "$MODEL_DIR:/models/model_mean_std/1" \
-e MODEL_NAME=model_mean_std \
tensorflow/serving:2.13.0
```

- Test the exported model by requesting the predicted mean of the travel time:
```shell
curl -X POST http://localhost:8501/v1/models/model_mean_std/versions/1:predict \
-H 'Content-type: application/json' \
-d '{"signature_name": "mean_value", "instances": [{"time": [571.0], "trip_distance": [1.1], "pickup_lon": [-73.991791], "pickup_lat": [40.736072], "pickup_area": [1e-5], "dropoff_lon": [-73.991142], "dropoff_lat": [40.734538], "dropoff_area": [2e-5], "passenger_count": [1], "vendor_id": [1], "weekday": [1], "month": [1]}]}'
```
To get the predicted standard deviation, change the value of `signature_name` from `mean_value` to `std`:
```shell
curl -X POST http://localhost:8501/v1/models/model_mean_std/versions/1:predict \
-H 'Content-type: application/json' \
-d '{"signature_name": "std", "instances": [{"time": [571.0], "trip_distance": [1.1], "pickup_lon": [-73.991791], "pickup_lat": [40.736072], "pickup_area": [1e-5], "dropoff_lon": [-73.991142], "dropoff_lat": [40.734538], "dropoff_area": [2e-5], "passenger_count": [1], "vendor_id": [1], "weekday": [1], "month": [1]}]}'
```
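
The same requests can be issued from Python; a small client sketch against the standard TensorFlow Serving REST API (the payload mirrors the curl calls above; the `requests` package is assumed to be installed):
```python
import requests

BASE = 'http://localhost:8501/v1/models/model_mean_std'

instance = {'time': [571.0], 'trip_distance': [1.1],
            'pickup_lon': [-73.991791], 'pickup_lat': [40.736072],
            'pickup_area': [1e-5],
            'dropoff_lon': [-73.991142], 'dropoff_lat': [40.734538],
            'dropoff_area': [2e-5],
            'passenger_count': [1], 'vendor_id': [1],
            'weekday': [1], 'month': [1]}

print(requests.get(BASE).json())  # model status endpoint

for signature in ('mean_value', 'std'):
    resp = requests.post(f'{BASE}/versions/1:predict',
                         json={'signature_name': signature,
                               'instances': [instance]})
    print(signature, resp.json())
```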
132 changes: 12 additions & 120 deletions requirements.txt
@@ -1,120 +1,12 @@
-absl-py==1.0.0
-aiohttp==3.8.1
-aiosignal==1.2.0
-asttokens==2.0.5
-astunparse==1.6.3
-async-timeout==4.0.2
-attrs==21.4.0
-backcall==0.2.0
-cachetools==5.0.0
-certifi==2020.6.20
-charset-normalizer==2.0.12
-click==8.1.3
-cloudpickle==2.1.0
-cycler==0.11.0
-db-dtypes==1.0.2
-debugpy==1.6.0
-decorator==5.1.1
-dill==0.3.4
-dm-tree==0.1.7
-entrypoints==0.4
-environs==9.5.0
-executing==0.8.3
-flatbuffers==1.12
-fonttools==4.33.3
-frozenlist==1.3.0
-fsspec==2022.5.0
-gast==0.4.0
-gcsfs==2022.5.0
-google-api-core==2.8.2
-google-auth==2.6.6
-google-auth-oauthlib==0.4.6
-google-cloud-bigquery==3.2.0
-google-cloud-bigquery-storage==2.13.2
-google-cloud-core==2.3.1
-google-cloud-storage==2.4.0
-google-crc32c==1.3.0
-google-pasta==0.2.0
-google-resumable-media==2.3.3
-googleapis-common-protos==1.56.3
-graphviz==0.20
-grpcio==1.47.0
-grpcio-status==1.47.0
-gviz-api==1.10.0
-h5py==3.6.0
-idna==3.3
-ipykernel==6.13.0
-ipython==8.3.0
-jedi==0.18.1
-joblib==1.1.0
-jupyter-client==7.3.0
-jupyter-core==4.10.0
-keras==2.9.0
-Keras-Preprocessing==1.1.2
-kiwisolver==1.4.2
-libclang==14.0.1
-Markdown==3.3.6
-marshmallow==3.16.0
-matplotlib==3.5.2
-matplotlib-inline==0.1.3
-multidict==6.0.2
-nest-asyncio==1.5.5
-numpy==1.22.3
-oauthlib==3.2.0
-opt-einsum==3.3.0
-packaging==21.3
-pandas==1.4.2
-parso==0.8.3
-pexpect==4.8.0
-pickleshare==0.7.5
-Pillow==9.1.0
-promise==2.3
-prompt-toolkit==3.0.29
-proto-plus==1.20.6
-protobuf==3.19.4
-psutil==5.9.0
-ptyprocess==0.7.0
-pure-eval==0.2.2
-pyarrow==8.0.0
-pyasn1==0.4.8
-pyasn1-modules==0.2.8
-pydot==1.4.2
-Pygments==2.12.0
-pyparsing==3.0.8
-python-dateutil==2.8.2
-python-dotenv==0.20.0
-pytz==2022.1
-pyzmq==22.3.0
-requests==2.27.1
-requests-oauthlib==1.3.1
-rsa==4.8
-scikit-learn==1.1.1
-scipy==1.8.0
-seaborn==0.11.2
-six==1.16.0
-stack-data==0.2.0
-tensorboard==2.9.1
-tensorboard-data-server==0.6.1
-tensorboard-plugin-profile==2.8.0
-tensorboard-plugin-wit==1.8.1
-tensorflow==2.9.1
-tensorflow-addons==0.17.1
-tensorflow-datasets==4.5.2
-tensorflow-estimator==2.9.0
-tensorflow-hub==0.12.0
-tensorflow-io-gcs-filesystem==0.25.0
-tensorflow-metadata==1.7.0
-tensorflow-probability==0.17.0
-termcolor==1.1.0
-tf-estimator-nightly==2.8.0.dev2021122109
-threadpoolctl==3.1.0
-tornado==6.1
-tqdm==4.64.0
-traitlets==5.1.1
-typeguard==2.13.3
-typing_extensions==4.2.0
-urllib3==1.26.9
-wcwidth==0.2.5
-Werkzeug==2.1.2
-wrapt==1.14.0
-yarl==1.7.2
tensorflow_probability==0.21.0
tensorflow_addons==0.21.0
tensorflow_io==0.34.0
click==8.1.7
environs==10.3.0
pandas==2.0.3
geopandas==0.13.2
matplotlib==3.7.2
graphviz==0.20.1
pydot==2.0.0
scikit-learn==1.3.2
pyarrow==15.0.0
2 changes: 1 addition & 1 deletion src/data/data_collection.py
@@ -6,7 +6,7 @@
logger = logging.getLogger(__name__)


-def get_data(save_dir: str, year: int):
def collect_data(save_dir: str, year: int):
""" Get NYC taxi data and taxi-zones data for a particular year """

os.makedirs(save_dir, exist_ok=True)
12 changes: 9 additions & 3 deletions src/data/preprocessing.py → src/data/data_preprocessing.py
@@ -1,5 +1,6 @@
from __future__ import annotations

import os
import glob
import logging

import numpy as np
@@ -84,7 +85,7 @@ def generate_features(df: pd.DataFrame, taxi_zones: pd.DataFrame) -> pd.DataFrame:

cond = lambda x: (
(x['dropoff_datetime'] - x['pickup_datetime'])
-        .dt.total_seconds().between(1, 6000))
.dt.total_seconds().between(1, 6_000))

df = (
df
@@ -137,9 +138,14 @@ def preprocess_pq_files(
taxi_zones_path = os.path.join(source_dir, 'taxi_zones.zip')
taxi_zones = taxi_zones_summary(taxi_zones_path)

-    files = glob.glob('*.parquet', root_dir=source_dir)
    # glob.glob's root_dir argument requires Python >= 3.10; list and filter instead
    files = os.listdir(source_dir)
    files = [f for f in files if f.endswith('.parquet')]
    files = sorted(files)

logger.info(f'files {files}')

for file in files:
path_load = os.path.join(source_dir, file)

5 changes: 3 additions & 2 deletions src/data/dataset.py
@@ -48,8 +48,9 @@ def pq_to_dataset(
'month': tf.TensorSpec(tf.TensorShape([]), tf.int32),
'target': tf.TensorSpec(tf.TensorShape([]))}

-    files = sorted(glob.glob('*.parquet', root_dir=data_dir))
-    files = [os.path.join(data_dir, x) for x in files]
    # glob.glob's root_dir argument requires Python >= 3.10; match on the full path instead
    files = sorted(glob.glob(f'{data_dir}/*.parquet'))
files = files[:max_files]

ds = (
2 changes: 1 addition & 1 deletion src/default_args.py
@@ -25,7 +25,7 @@
histogram_freq=0,
reduce_lr_patience=100,
profile_batch=(10, 15),
-    verbose=0,
verbose=1,
early_stopping_patience=250,
period=10)
