Add support for server-side batching for the TensorFlow Predictor #1193

Merged · 58 commits · Aug 12, 2020

Commits
50c201e
Add batch-related keys to spec & userconfig
RobertLucian Jun 18, 2020
b5c29bd
Don't support batch-related fields for Py & ONNX
RobertLucian Jun 19, 2020
294dbfc
Pass in the right arguments/envs when TF batching
RobertLucian Jun 19, 2020
788e068
Add runnable entries for TF serving containers
RobertLucian Jun 19, 2020
48a86db
Add TF batching support for local provider
RobertLucian Jun 19, 2020
73779b2
Add missing commands in TFS Dockerfiles
RobertLucian Jun 19, 2020
f68e2f6
Fix various issues pertaining to batching feature
RobertLucian Jun 19, 2020
ff550d4
Add autoscaling fix & reorder things
RobertLucian Jun 19, 2020
adf24af
Avoid having to install bc CLI utility in TFS
RobertLucian Jun 19, 2020
42db608
Fix bug when using server-side batching
RobertLucian Jun 22, 2020
81f7585
Merge branch 'master' into feature/server-side-batching
RobertLucian Jun 25, 2020
9a1d20c
Fix merge bugs
RobertLucian Jun 25, 2020
3857171
Reorder text output for cortex get
RobertLucian Jun 25, 2020
0048f69
Add wireframe for docs
RobertLucian Jun 25, 2020
ddbd018
Disallow a concurrency level that's smaller than the BS
RobertLucian Jun 25, 2020
e895777
Move validation fields & limit upper-end vals
RobertLucian Jun 25, 2020
7f98e13
Set default batch size/timeout
RobertLucian Jun 26, 2020
cd1a780
Batching configuration stuff
RobertLucian Jun 26, 2020
110db10
Add default batch timeout to docs
RobertLucian Jun 26, 2020
7fc14e7
Properly format cortex get
RobertLucian Jun 26, 2020
6bc42b2
Rm batch from throughput tester & universalize it
RobertLucian Jun 26, 2020
0f11290
Merge branch 'master' into feature/server-side-batching
RobertLucian Jun 26, 2020
b30762d
Move & refactor the throughput tester
RobertLucian Jun 26, 2020
83e6915
Modify image classifier resnet50 for TFS
RobertLucian Jun 26, 2020
1547ba8
Revert back to passing the value when tensoring
RobertLucian Jun 27, 2020
ca4239a
Fix bug with throughput tester
RobertLucian Jun 27, 2020
795e799
Add batch-sized API config for ResNet50 model
RobertLucian Jun 27, 2020
d0fda85
Merge branch 'master' into feature/server-side-batching
RobertLucian Jun 30, 2020
07b0c53
Batch-sized examples
RobertLucian Jun 30, 2020
32dc0ba
Bunch of modifications brought to the examples
RobertLucian Jun 30, 2020
b4dec59
Discard text-generator example for batch size feat
RobertLucian Jun 30, 2020
55ca90a
Fix throughput tester
RobertLucian Jun 30, 2020
faa2cfc
Remove the docs for inception example
RobertLucian Jun 30, 2020
b8f926e
Revert to only using batches if only specified
RobertLucian Jul 1, 2020
3899f74
Docs, k8s spec fix & retouching example
RobertLucian Jul 1, 2020
3fc76fe
Inferentia image fix + other small things
RobertLucian Jul 1, 2020
9f96296
Add troubleshooter for batching with TF Predictor
RobertLucian Jul 1, 2020
dfac0dd
Minor touches
RobertLucian Jul 1, 2020
0e3ab6c
Merge branch 'master' into feature/server-side-batching
RobertLucian Jul 1, 2020
4f2f75b
Docs/example modifications
RobertLucian Jul 1, 2020
1513712
Address review requests
RobertLucian Jul 25, 2020
d7fcefc
Merge branch 'master' into feature/server-side-batching
RobertLucian Jul 25, 2020
56e49e2
Change TF_BATCH_SIZE to TF_MAX_BATCH_SIZE
RobertLucian Jul 25, 2020
b43ff67
Limit max batch size to num of threads with inf
RobertLucian Jul 25, 2020
f5da1aa
Limit num batched threads when infs are used
RobertLucian Jul 25, 2020
ed80692
Merge branch 'master' into feature/server-side-batching (no testing)
RobertLucian Jul 30, 2020
7d781db
Remove SecondStr()
deliahu Aug 5, 2020
8a95f8c
Small changes
deliahu Aug 6, 2020
64647f2
Add server-side batching for ResNet50 on Inf
RobertLucian Aug 7, 2020
02e0f4a
Small correction
RobertLucian Aug 7, 2020
0cd1937
Add small comment for batched models on Inf
RobertLucian Aug 7, 2020
3ca7f8b
Merge branch 'master' into feature/server-side-batching
RobertLucian Aug 7, 2020
e7e4dff
Fix invalid literal for int() with base 10 when jpg images are used
RobertLucian Aug 7, 2020
e47733c
Change model_path path & modify imageio version
RobertLucian Aug 8, 2020
76920b4
Merge branch 'master' into feature/server-side-batching
RobertLucian Aug 12, 2020
e3da402
Clarify field description for batch_interval
RobertLucian Aug 12, 2020
9a4b47e
Send jpg images as octet-stream instead of JSON
RobertLucian Aug 12, 2020
64070de
Merge branch 'master' into feature/server-side-batching
RobertLucian Aug 12, 2020
25 changes: 20 additions & 5 deletions cli/local/docker_spec.go
@@ -35,7 +35,6 @@ import (
"github.com/docker/docker/api/types/container"
"github.com/docker/docker/api/types/filters"
"github.com/docker/docker/api/types/mount"
"github.com/docker/docker/api/types/strslice"
"github.com/docker/go-connections/nat"
)

@@ -45,6 +44,7 @@ const (
_defaultPortStr = "8888"
_tfServingPortStr = "9000"
_tfServingEmptyModelConfig = "/etc/tfs/model_config_server.conf"
_tfServingBatchConfig = "/etc/tfs/batch_config.conf"
_projectDir = "/mnt/project"
_cacheDir = "/mnt/cache"
_modelDir = "/mnt/model"
@@ -301,13 +301,28 @@ func deployTensorFlowContainers(api *spec.API, awsClient *aws.Client) error {
Mounts: mounts,
}

envVars := []string{}
cmdArgs := []string{
"--port=" + _tfServingPortStr,
"--model_config_file=" + _tfServingEmptyModelConfig,
}
if api.Predictor.ServerSideBatching != nil {
envVars = append(envVars,
"TF_MAX_BATCH_SIZE="+s.Int32(api.Predictor.ServerSideBatching.MaxBatchSize),
"TF_BATCH_TIMEOUT_MICROS="+s.Int64(api.Predictor.ServerSideBatching.BatchInterval.Microseconds()),
"TF_NUM_BATCHED_THREADS="+s.Int32(api.Predictor.ProcessesPerReplica),
)
cmdArgs = append(cmdArgs,
"--enable_batching=true",
"--batching_parameters_file="+_tfServingBatchConfig,
)
}

serveContainerConfig := &container.Config{
Image: api.Predictor.TensorFlowServingImage,
Tty: true,
Cmd: strslice.StrSlice{
"--port=" + _tfServingPortStr,
"--model_config_file=" + _tfServingEmptyModelConfig,
},
Env: envVars,
Cmd: cmdArgs,
ExposedPorts: nat.PortSet{
_tfServingPortStr + "/tcp": struct{}{},
},
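
The `--batching_parameters_file` flag points TensorFlow Serving at `/etc/tfs/batch_config.conf`; the `TF_*` environment variables set above are presumably rendered into that file by the serving container's entrypoint. A minimal sketch of what that rendering could look like, assuming TF Serving's standard text-protobuf batching parameters (`max_batch_size`, `batch_timeout_micros`, `num_batch_threads`):

```python
# Illustrative sketch only: render the TF_* environment variables set above into
# TensorFlow Serving's batching parameters file (text-protobuf format).
import os

batch_config = (
    "max_batch_size { value: %s }\n"
    "batch_timeout_micros { value: %s }\n"
    "num_batch_threads { value: %s }\n"
) % (
    os.environ["TF_MAX_BATCH_SIZE"],
    os.environ["TF_BATCH_TIMEOUT_MICROS"],
    os.environ["TF_NUM_BATCHED_THREADS"],
)

with open("/etc/tfs/batch_config.conf", "w") as f:  # _tfServingBatchConfig
    f.write(batch_config)
```
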
3 changes: 3 additions & 0 deletions docs/deployments/api-configuration.md
@@ -67,6 +67,9 @@ See additional documentation for [parallelism](parallelism.md), [autoscaling](au
model_path: <string> # S3 path to an exported model (e.g. s3://my-bucket/exported_model) (required)
signature_key: <string> # name of the signature def to use for prediction (required if your model has more than one signature def)
...
server_side_batching: # (optional)
max_batch_size: <int> # the maximum number of requests to aggregate before running inference
batch_interval: <duration> # the maximum amount of time to spend waiting for additional requests before running inference on the batch of requests
processes_per_replica: <int> # the number of parallel serving processes to run on each replica (default: 1)
threads_per_process: <int> # the number of threads per process (default: 1)
config: <string: value> # arbitrary dictionary passed to the constructor of the Predictor (optional)
43 changes: 42 additions & 1 deletion docs/deployments/parallelism.md
@@ -2,10 +2,51 @@

_WARNING: you are on the master branch, please refer to the docs on the branch that matches your `cortex version`_

## Concurrency

Replica parallelism can be configured with the following fields in the `predictor` configuration:

* `processes_per_replica` (default: 1): Each replica runs a web server with `processes_per_replica` processes. For APIs running with multiple CPUs per replica, using 1-3 processes per unit of CPU generally leads to optimal throughput. For example, if `cpu` is 2, a value between 2 and 6 `processes_per_replica` is reasonable. The optimal number will vary based on the workload's characteristics and the CPU compute request for the API.

* `threads_per_process` (default: 1): Each process uses a thread pool of size `threads_per_process` to process requests. For applications that are not CPU intensive such as high I/O (e.g. downloading files), GPU-based inference, or Inferentia-based inference, increasing the number of threads per process can increase throughput. For CPU-bound applications such as running your model inference on a CPU, using 1 thread per process is recommended to avoid unnecessary context switching. Some applications are not thread-safe, and therefore must be run with 1 thread per process.

`processes_per_replica` * `threads_per_process` represents the total number of requests that your replica can work on concurrently. For example, if `processes_per_replica` is 2 and `threads_per_process` is 2, and the replica was hit with 5 concurrent requests, 4 would immediately begin to be processed and 1 would be waiting for a thread to become available. If the replica was hit with 3 concurrent requests, all three would begin processing immediately.
`processes_per_replica` * `threads_per_process` represents the total number of requests that your replica can work on concurrently. For example, if `processes_per_replica` is 2 and `threads_per_process` is 2, and the replica was hit with 5 concurrent requests, 4 would immediately begin to be processed, and 1 would be waiting for a thread to become available. If the replica were hit with 3 concurrent requests, all three would begin processing immediately.
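
The capacity arithmetic from the paragraph above can be written out directly (a trivial sketch using the example's numbers):

```python
# Sketch of the concurrency model described above: a replica can work on
# processes_per_replica * threads_per_process requests at once; the rest wait.
processes_per_replica = 2   # values from the example above
threads_per_process = 2
concurrent_requests = 5

capacity = processes_per_replica * threads_per_process  # 4
in_flight = min(concurrent_requests, capacity)          # 4 begin processing immediately
queued = concurrent_requests - in_flight                # 1 waits for a thread to free up
print(f"{in_flight} processed immediately, {queued} waiting")
```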

## Server-side batching

Server-side batching is the process of aggregating multiple real-time requests into a single batch inference, which increases throughput at the expense of latency. Inference is triggered when either a maximum number of requests has been received or a certain amount of time has passed since the first request arrived, whichever comes first. Once a threshold is reached, inference runs on the aggregated requests and the responses are returned individually to the clients. This process is transparent to the clients.

The [TensorFlow Predictor](predictors.md#tensorflow-predictor) supports the following two fields in the `server_side_batching` configuration:

* `max_batch_size`: The maximum number of requests to aggregate before running inference. This is an instrument for controlling throughput. The maximum size can be achieved if `batch_interval` is long enough to collect `max_batch_size` requests.

* `batch_interval`: The maximum amount of time to spend waiting for additional requests before running inference on the batch of requests. If fewer than `max_batch_size` requests are received after waiting the full `batch_interval`, then inference will run on the requests that have been received. This is an instrument for controlling latency.
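
Cortex delegates the actual batching to TensorFlow Serving, but the triggering rule that these two fields describe can be sketched in a few lines of Python (an illustration only, not TensorFlow Serving's implementation):

```python
import queue
import time

def collect_batch(request_queue, max_batch_size, batch_interval):
    """Aggregate requests until the batch is full or the interval elapses."""
    batch = [request_queue.get()]                 # block until the first request arrives
    deadline = time.monotonic() + batch_interval  # the interval starts with the first request
    while len(batch) < max_batch_size:
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break                                  # batch_interval reached: run inference now
        try:
            batch.append(request_queue.get(timeout=remaining))
        except queue.Empty:
            break                                  # no more requests arrived in time
    return batch                                   # run inference on this batch, then reply per request
```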

In order to use server-side batching, the model's graph must be built such that batches can be accepted as input/output. The following is an example of how the input `x` and the output `y` of the graph could be shaped to be compatible with server-side batching:

```python
batch_size = None
sample_shape = [340, 240, 3] # i.e. RGB image
output_shape = [1000] # i.e. image labels

with graph.as_default():
# ...
x = tf.placeholder(tf.float32, shape=[batch_size] + sample_shape, name="input")
y = tf.placeholder(tf.float32, shape=[batch_size] + output_shape, name="output")
# ...
```

### Optimization

When optimizing for both throughput and latency, you will likely want to keep `max_batch_size` relatively small. Even though a higher `max_batch_size` with a low `batch_interval` (when there are many incoming requests) can offer significantly higher throughput, the overall latency can become quite large: a request only receives its response once the entire batch has been processed, so the batch's inference time can dwarf the latency added by `batch_interval`. For instance, assume that a single prediction takes 50ms and that with a batch size of 128 the processing time for a batch is 1280ms (i.e. 10ms per sample). While the throughput is now 5 times higher, it takes 1280ms + `batch_interval` to get back a response (instead of 50ms). This is the trade-off with server-side batching.

When optimizing for maximum throughput, a good rule of thumb is to follow these steps:

1. Determine the maximum throughput of one API replica when `server_side_batching` is not enabled (same as if `max_batch_size` were set to 1). This can be done with a load test (make sure to set `max_replicas` to 1 to disable autoscaling).
1. Determine the highest `batch_interval` that is still acceptable for your application. Keep in mind that the batch interval is not the only component of the overall latency - the inference on the batch and the pre/post processing also take time.
1. Multiply the maximum throughput from step 1 by the `batch_interval` from step 2. The result is a starting value for `max_batch_size` (see the worked example after this list).
1. Run the load test again. If inference fails with that batch size (e.g. due to running out of GPU memory or RAM), reduce `max_batch_size` to a level that works (and reduce `batch_interval` by the same factor).
1. Use the load test to determine the peak throughput of the API replica. Multiply the observed throughput by the `batch_interval` to calculate the average batch size. If the average batch size coincides with `max_batch_size`, then it might mean that the throughput could still be further increased by increasing `max_batch_size`. If it's lower, then it means that `batch_interval` is triggering the inference before `max_batch_size` requests have been aggregated. If modifying both `max_batch_size` and `batch_interval` doesn't improve the throughput, then the service may be bottlenecked by something else (e.g. CPU, network IO, `processes_per_replica`, `threads_per_process`, etc).
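
A worked example of the arithmetic in steps 3 and 5, with made-up numbers:

```python
# All numbers below are hypothetical, purely to illustrate steps 3 and 5.
unbatched_throughput = 100.0  # inferences/sec measured in step 1
batch_interval = 0.1          # seconds, chosen in step 2

# step 3: starting point for max_batch_size
max_batch_size = round(unbatched_throughput * batch_interval)  # -> 10

# step 5: estimate the average batch size from the second load test
observed_throughput = 80.0                                     # inferences/sec with batching enabled
average_batch_size = observed_throughput * batch_interval      # -> ~8.0

if average_batch_size >= max_batch_size:
    print("batches are filling up; consider increasing max_batch_size")
else:
    print("batch_interval is triggering inference before max_batch_size is reached")
```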

<!-- CORTEX_VERSION_MINOR x1 -->
A benchmarked example of server-side batching with the TensorFlow Predictor can be found in [ResNet50 in TensorFlow](https://github.com/cortexlabs/cortex/tree/master/examples/tensorflow/image-classifier-resnet50#throughput-test).
1 change: 1 addition & 0 deletions docs/summary.md
@@ -49,6 +49,7 @@
* [404/503 API responses](troubleshooting/api-request-errors.md)
* [NVIDIA runtime not found](troubleshooting/nvidia-container-runtime-not-found.md)
* [TF session in predict()](troubleshooting/tf-session-in-predict.md)
* [Server-side batching errors](troubleshooting/server-side-batching-errors.md)

## Guides

33 changes: 33 additions & 0 deletions docs/troubleshooting/server-side-batching-errors.md
@@ -0,0 +1,33 @@
# Batching errors when max_batch_size/batch_interval are set

_WARNING: you are on the master branch, please refer to the docs on the branch that matches your `cortex version`_

When `max_batch_size` and `batch_interval` fields are set for the [TensorFlow Predictor](../deployments/predictors.md#tensorflow-predictor), errors can be encountered if the associated model hasn't been built for batching.

The following error is an example of what happens when the input shape doesn't accommodate batching - e.g. when its shape is `[height, width, 3]` instead of `[batch_size, height, width, 3]`:

```text
Batching session Run() input tensors must have at least one dimension.
```

Here is another example of setting the output shape inappropriately for batching - e.g. when its shape is `[labels]` instead of `[batch_size, labels]`:

```text
Batched output tensor has 0 dimensions.
```

The solution to these errors is to add another dimension (a placeholder for the batch size) as the first dimension of both the model's input and output.

The following is an example of how the input `x` and the output `y` of the graph could be shaped to be compatible with server-side batching:

```python
batch_size = None
sample_shape = [340, 240, 3] # i.e. RGB image
output_shape = [1000] # i.e. image labels

with graph.as_default():
# ...
x = tf.placeholder(tf.float32, shape=[batch_size] + sample_shape, name="input")
y = tf.placeholder(tf.float32, shape=[batch_size] + output_shape, name="output")
# ...
```
4 changes: 2 additions & 2 deletions examples/pytorch/image-classifier-resnet50/README.md
@@ -6,8 +6,8 @@ This example implements an image recognition system using ResNet50, which allows

There are 3 Cortex APIs available in this example:

1. [cortex.yaml](cortex.yaml) - to be used with any instances.
1. [cortex_inf.yaml](cortex_inf.yaml) - to be used with `inf1` instances.
1. [cortex_cpu.yaml](cortex_cpu.yaml) - to be used with any instances that have CPUs.
1. [cortex_gpu.yaml](cortex_gpu.yaml) - to be used with GPU instances.

To deploy an API, run:
@@ -19,7 +19,7 @@ cortex deploy <cortex-deployment-yaml>
E.g.

```bash
cortex deploy cortex_cpu.yaml
cortex deploy cortex_gpu.yaml
```

## Verifying your API
@@ -0,0 +1,17 @@
# WARNING: you are on the master branch, please refer to the examples on the branch that matches your `cortex version`

- name: image-classifier-inception
kind: SyncAPI
predictor:
type: tensorflow
path: predictor.py
model_path: s3://cortex-examples/tensorflow/image-classifier/inception
server_side_batching:
max_batch_size: 2
batch_interval: 0.2s
threads_per_process: 2
monitoring:
model_type: classification
compute:
cpu: 1
gpu: 1
46 changes: 12 additions & 34 deletions examples/tensorflow/image-classifier-resnet50/README.md
@@ -4,11 +4,12 @@ This example implements an image recognition system using ResNet50, which allows

## Deploying

There are 3 Cortex APIs available in this example:
There are 4 Cortex APIs available in this example:

1. [cortex.yaml](cortex.yaml) - to be used with any instances.
1. [cortex_inf.yaml](cortex_inf.yaml) - to be used with `inf1` instances.
1. [cortex_cpu.yaml](cortex_cpu.yaml) - to be used with any instances that have CPUs.
1. [cortex_gpu.yaml](cortex_gpu.yaml) - to be used with GPU instances.
1. [cortex_gpu_server_side_batching.yaml](cortex_gpu_server_side_batching.yaml) - to be used with GPU instances. Deployed with `max_batch_size` > 1. The exported model and the TensorFlow Predictor do not need to be modified to support server-side batching.

To deploy an API, run:

@@ -19,7 +20,7 @@ cortex deploy <cortex-deployment-yaml>
E.g.

```bash
cortex deploy cortex_cpu.yaml
cortex deploy cortex_inf.yaml
```

## Verifying your API
@@ -29,7 +30,7 @@ Check that your API is live by running `cortex get image-classifier-resnet50`, a
```bash
$ curl <API endpoint> -X POST -H "Content-Type: application/json" -d @sample.json

[["tabby", "Egyptian_cat", "tiger_cat", "tiger", "plastic_bag"]]
["tabby", "Egyptian_cat", "tiger_cat", "tiger", "plastic_bag"]
```

The following image is embedded in [sample.json](sample.json):
@@ -38,42 +39,19 @@ The following image is embedded in [sample.json](sample.json):

## Throughput test

[throughput_test.py](throughput_test.py) is a Python CLI that can be used to test the throughput of your deployed API. The throughput will vary depending on your API's configuration (specified in the `cortex_*.yaml` file), your local machine's resources (mostly CPU, since it has to spawn many concurrent requests), and the internet connection on your local machine.

```bash
Usage: throughput_test.py [OPTIONS] IMG_URL ENDPOINT

Program for testing the throughput of a Resnet50 model on instances equipped
with CPU, GPU or Inferentia devices.

Options:
-w, --processes INTEGER Number of processes for prediction requests. [default: 1]
-t, --threads INTEGER Number of threads per process for prediction requests. [default: 1]
-s, --samples INTEGER Number of samples to run per thread. [default: 10]
-i, --time-based FLOAT How long the thread making predictions will run for in seconds.
If set, -s option will be ignored.
-b, --batch-size INTEGER Number of images sent for inference in one request. [default: 1]
--help Show this message and exit.
```

The Python CLI has been tested with Python 3.6.9. To install the CLI's dependencies, run the following:

```bash
pip install requests click opencv-contrib-python numpy
```

Before [throughput_test.py](throughput_test.py) is run, 2 environment variables have to be exported:
Before [throughput_test.py](../../utils/throughput_test.py) is run, 2 environment variables have to be exported:

```bash
export ENDPOINT=<API endpoint> # you can find this with `cortex get image-classifier-resnet50`
export IMG_URL=https://i.imgur.com/213xcvs.jpg # this is the cat image shown in the previous step
export PAYLOAD=https://i.imgur.com/213xcvs.jpg # this is the cat image shown in the previous step
```

Then, deploy each API one at a time and check the results:

1. Running `python throughput_test.py -i 30 -p 4 -t 48` with the [cortex_inf.yaml](cortex_inf.yaml) API running on an `inf1.2xlarge` instance will get **~510 inferences/sec** with an average latency of **80 ms**.
1. Running `python throughput_test.py -i 30 -p 4 -t 2` with the [cortex_cpu.yaml](cortex_cpu.yaml) API running on an `c5.xlarge` instance will get **~16.2 inferences/sec** with an average latency of **200 ms**.
1. Running `python throughput_test.py -i 30 -p 4 -t 24` with the [cortex_gpu.yaml](cortex_gpu.yaml) API running on an `g4dn.xlarge` instance will get **~125 inferences/sec** with an average latency of **85 ms**. Optimizing the model with TensorRT to use FP16 on TF-serving only seems to achieve a 10% performance improvement - one thing to consider is that the TensorRT engines hadn't been built beforehand, so this might have affected the results negatively.
1. Running `python ../../utils/throughput_test.py -i 30 -p 4 -t 2` with the [cortex.yaml](cortex.yaml) API running on a `c5.xlarge` instance will get **~16.2 inferences/sec** with an average latency of **200 ms**.
1. Running `python ../../utils/throughput_test.py -i 30 -p 4 -t 48` with the [cortex_inf.yaml](cortex_inf.yaml) API running on an `inf1.2xlarge` instance will get **~510 inferences/sec** with an average latency of **80 ms**.
1. Running `python ../../utils/throughput_test.py -i 30 -p 4 -t 24` with the [cortex_gpu.yaml](cortex_gpu.yaml) API running on a `g4dn.xlarge` instance will get **~125 inferences/sec** with an average latency of **85 ms**. Optimizing the model with TensorRT to use FP16 on TF-serving only seems to achieve a 10% performance improvement - one thing to consider is that the TensorRT engines hadn't been built beforehand, so this might have affected the results negatively.
1. Running `python ../../utils/throughput_test.py -i 30 -p 4 -t 60` with the [cortex_gpu_server_side_batching.yaml](cortex_gpu_server_side_batching.yaml) API running on a `g4dn.xlarge` instance will get **~186 inferences/sec** with an average latency of **500 ms**. This achieves a 49% higher throughput than the [cortex_gpu.yaml](cortex_gpu.yaml) API, at the expense of increased latency.

*Note: `inf1.xlarge` isn't used because the major bottleneck with `inf` instances for this example is the CPU, and `inf1.2xlarge` has twice as many CPU cores for the same number of Inferentia ASICs (which is 1), which translates to almost double the throughput.*

@@ -89,7 +67,7 @@ pip install --extra-index-url=https://pip.repos.neuron.amazonaws.com \
tensorflow-neuron==1.15.0.1.0.1333.0
```

The [generate_resnet50_models.ipynb](generate_resnet50_models.ipynb) notebook will generate 2 SavedModels. One will be saved in the `resnet50` directory which can be run on GPU or on CPU and another in the `resnet50_neuron` directory which can only be run on `inf1` instances.
The [generate_resnet50_models.ipynb](generate_resnet50_models.ipynb) notebook will generate 2 SavedModels: one saved in the `resnet50` directory, which can be run on GPU or CPU, and another in the `resnet50_neuron` directory, which can only be run on `inf1` instances. For server-side batching on `inf1` instances, a different compilation of the model is required. To compile the ResNet50 model for a batch size of 5, run `run_all` from [this directory](https://github.com/aws/aws-neuron-sdk/tree/master/src/examples/tensorflow/keras_resnet50).

If you'd also like to build the TensorRT version of the GPU model, run the following command in a new Python environment to install the pip dependencies required for the [generate_gpu_resnet50_model.ipynb](generate_gpu_resnet50_model.ipynb) notebook:

Expand Up @@ -12,6 +12,7 @@
classes: https://s3.amazonaws.com/deep-learning-models/image-models/imagenet_class_index.json
input_shape: [224, 224]
input_key: input
output_key: output
compute:
cpu: 3
mem: 4G
@@ -12,6 +12,7 @@
classes: https://s3.amazonaws.com/deep-learning-models/image-models/imagenet_class_index.json
input_shape: [224, 224]
input_key: input
output_key: output
compute:
gpu: 1
cpu: 3