diff --git a/cli/local/docker_spec.go b/cli/local/docker_spec.go index 7c13d11a0d..6595ce7a66 100644 --- a/cli/local/docker_spec.go +++ b/cli/local/docker_spec.go @@ -19,7 +19,6 @@ package local import ( "context" "fmt" - "math" "path/filepath" "strings" @@ -92,9 +91,7 @@ func getAPIEnv(api *spec.API, awsClient *aws.Client) []string { "CORTEX_PROJECT_DIR="+_projectDir, "CORTEX_PROCESSES_PER_REPLICA="+s.Int32(api.Predictor.ProcessesPerReplica), "CORTEX_THREADS_PER_PROCESS="+s.Int32(api.Predictor.ThreadsPerProcess), - // add 1 because it was required to achieve the target concurrency for 1 process, 1 thread - "CORTEX_MAX_PROCESS_CONCURRENCY="+s.Int64(1+int64(math.Round(float64(consts.DefaultMaxReplicaConcurrency)/float64(api.Predictor.ProcessesPerReplica)))), - "CORTEX_SO_MAX_CONN="+s.Int64(consts.DefaultMaxReplicaConcurrency+100), // add a buffer to be safe + "CORTEX_MAX_REPLICA_CONCURRENCY="+s.Int32(api.Predictor.ProcessesPerReplica*api.Predictor.ThreadsPerProcess+1024), // allow a queue of 1024 "AWS_REGION="+awsClient.Region, ) diff --git a/dev/versions.md b/dev/versions.md index 36bff81f5d..302c207a9f 100644 --- a/dev/versions.md +++ b/dev/versions.md @@ -172,6 +172,18 @@ Note: it's ok if example training notebooks aren't upgraded, as long as the expo * be careful not to update any of the versions for Inferentia that are not latest in `images/python-predictor-inf/Dockerfile` 1. Rerun all examples and check their logs +## S6-overlay supervisor + +1. Locate the `s6-overlay` installation in `images/python-predictor-*/Dockerfile`, `images/tensorflow-predictor/Dockerfile` and `images/onnx-predictor-*/Dockerfile` +1. Update the version in each serving image to the latest release at https://github.com/just-containers/s6-overlay + +## Nginx + +1. Run an Ubuntu base image that matches the version tag used for the serving images. The command to run is `docker run -it --rm ` +1. Run `apt update && apt-cache policy nginx` and note the latest minor version of nginx (e.g. `1.14`) +1. Locate the `nginx` package in `images/python-predictor-*/Dockerfile`, `images/tensorflow-predictor/Dockerfile` and `images/onnx-predictor-*/Dockerfile` +1. Update every `nginx` occurrence to the minor version from step 2, with a trailing asterisk to allow any patch version (e.g. `1.14.*`) + ## Istio 1.
Find the latest [release](https://istio.io/latest/news/releases) and check the release notes (here are the [latest IstioOperator Options](https://istio.io/latest/docs/reference/config/istio.operator.v1alpha1/)) diff --git a/examples/sklearn/iris-classifier/cortex.yaml b/examples/sklearn/iris-classifier/cortex.yaml index f86400768f..b562fa1fdf 100644 --- a/examples/sklearn/iris-classifier/cortex.yaml +++ b/examples/sklearn/iris-classifier/cortex.yaml @@ -12,4 +12,4 @@ model_type: classification compute: cpu: 0.2 - mem: 100M + mem: 200M diff --git a/examples/sklearn/iris-classifier/requirements.txt b/examples/sklearn/iris-classifier/requirements.txt index 30ddf823b8..bbc213cf3e 100644 --- a/examples/sklearn/iris-classifier/requirements.txt +++ b/examples/sklearn/iris-classifier/requirements.txt @@ -1 +1,2 @@ boto3 +scikit-learn==0.21.3 diff --git a/images/neuron-rtd/Dockerfile b/images/neuron-rtd/Dockerfile index 8d2fa7e96b..08887ec0e9 100644 --- a/images/neuron-rtd/Dockerfile +++ b/images/neuron-rtd/Dockerfile @@ -13,7 +13,8 @@ RUN yum install -y \ aws-neuron-runtime-1.0.9592.0 \ procps-ng-3.3.10-26.amzn2.x86_64 \ gzip \ - tar + tar \ + curl ENV PATH="/opt/aws/neuron/bin:${PATH}" diff --git a/images/onnx-predictor-cpu/Dockerfile b/images/onnx-predictor-cpu/Dockerfile index 117417d45c..8afd33f853 100644 --- a/images/onnx-predictor-cpu/Dockerfile +++ b/images/onnx-predictor-cpu/Dockerfile @@ -9,8 +9,14 @@ RUN apt-get update -qq && apt-get install -y -q \ unzip \ zlib1g-dev \ locales \ + nginx=1.14.* \ && apt-get clean -qq && rm -rf /var/lib/apt/lists/* +RUN cd /tmp/ && \ + curl -L --output s6-overlay-amd64-installer "https://github.com/just-containers/s6-overlay/releases/download/v2.1.0.2/s6-overlay-amd64-installer" && \ + cd - && \ + chmod +x /tmp/s6-overlay-amd64-installer && /tmp/s6-overlay-amd64-installer / && rm /tmp/s6-overlay-amd64-installer + RUN locale-gen en_US.UTF-8 ENV LANG=en_US.UTF-8 LANGUAGE=en_US.en LC_ALL=en_US.UTF-8 @@ -68,4 +74,6 @@ COPY pkg/workloads/cortex/consts.py /src/cortex COPY pkg/workloads/cortex/lib /src/cortex/lib COPY pkg/workloads/cortex/serve /src/cortex/serve -ENTRYPOINT ["/src/cortex/serve/run.sh"] +RUN mv /src/cortex/serve/init/bootloader.sh /etc/cont-init.d/bootloader.sh + +ENTRYPOINT ["/init"] diff --git a/images/onnx-predictor-gpu/Dockerfile b/images/onnx-predictor-gpu/Dockerfile index 804c8c8f01..13a512ac11 100644 --- a/images/onnx-predictor-gpu/Dockerfile +++ b/images/onnx-predictor-gpu/Dockerfile @@ -9,8 +9,14 @@ RUN apt-get update -qq && apt-get install -y -q \ unzip \ zlib1g-dev \ locales \ + nginx=1.14.* \ && apt-get clean -qq && rm -rf /var/lib/apt/lists/* +RUN cd /tmp/ && \ + curl -L --output s6-overlay-amd64-installer "https://github.com/just-containers/s6-overlay/releases/download/v2.1.0.2/s6-overlay-amd64-installer" && \ + cd - && \ + chmod +x /tmp/s6-overlay-amd64-installer && /tmp/s6-overlay-amd64-installer / && rm /tmp/s6-overlay-amd64-installer + RUN locale-gen en_US.UTF-8 ENV LANG=en_US.UTF-8 LANGUAGE=en_US.en LC_ALL=en_US.UTF-8 @@ -68,4 +74,6 @@ COPY pkg/workloads/cortex/consts.py /src/cortex COPY pkg/workloads/cortex/lib /src/cortex/lib COPY pkg/workloads/cortex/serve /src/cortex/serve -ENTRYPOINT ["/src/cortex/serve/run.sh"] +RUN mv /src/cortex/serve/init/bootloader.sh /etc/cont-init.d/bootloader.sh + +ENTRYPOINT ["/init"] diff --git a/images/python-predictor-cpu/Dockerfile b/images/python-predictor-cpu/Dockerfile index fa7922a387..599ea622e0 100644 --- a/images/python-predictor-cpu/Dockerfile +++ 
b/images/python-predictor-cpu/Dockerfile @@ -9,8 +9,14 @@ RUN apt-get update -qq && apt-get install -y -q \ unzip \ zlib1g-dev \ locales \ + nginx=1.14.* \ && apt-get clean -qq && rm -rf /var/lib/apt/lists/* +RUN cd /tmp/ && \ + curl -L --output s6-overlay-amd64-installer "https://github.com/just-containers/s6-overlay/releases/download/v2.1.0.2/s6-overlay-amd64-installer" && \ + cd - && \ + chmod +x /tmp/s6-overlay-amd64-installer && /tmp/s6-overlay-amd64-installer / && rm /tmp/s6-overlay-amd64-installer + RUN locale-gen en_US.UTF-8 ENV LANG=en_US.UTF-8 LANGUAGE=en_US.en LC_ALL=en_US.UTF-8 @@ -88,4 +94,6 @@ COPY pkg/workloads/cortex/consts.py /src/cortex COPY pkg/workloads/cortex/lib /src/cortex/lib COPY pkg/workloads/cortex/serve /src/cortex/serve -ENTRYPOINT ["/src/cortex/serve/run.sh"] +RUN mv /src/cortex/serve/init/bootloader.sh /etc/cont-init.d/bootloader.sh + +ENTRYPOINT ["/init"] diff --git a/images/python-predictor-gpu/Dockerfile b/images/python-predictor-gpu/Dockerfile index 6ced46d169..bd04e83c1a 100644 --- a/images/python-predictor-gpu/Dockerfile +++ b/images/python-predictor-gpu/Dockerfile @@ -11,8 +11,14 @@ RUN apt-get update -qq && apt-get install -y -q \ unzip \ zlib1g-dev \ locales \ + nginx=1.14.* \ && apt-get clean -qq && rm -rf /var/lib/apt/lists/* +RUN cd /tmp/ && \ + curl -L --output s6-overlay-amd64-installer "https://github.com/just-containers/s6-overlay/releases/download/v2.1.0.2/s6-overlay-amd64-installer" && \ + cd - && \ + chmod +x /tmp/s6-overlay-amd64-installer && /tmp/s6-overlay-amd64-installer / && rm /tmp/s6-overlay-amd64-installer + RUN locale-gen en_US.UTF-8 ENV LANG=en_US.UTF-8 LANGUAGE=en_US.en LC_ALL=en_US.UTF-8 @@ -93,4 +99,6 @@ COPY pkg/workloads/cortex/consts.py /src/cortex COPY pkg/workloads/cortex/lib /src/cortex/lib COPY pkg/workloads/cortex/serve /src/cortex/serve -ENTRYPOINT ["/src/cortex/serve/run.sh"] +RUN mv /src/cortex/serve/init/bootloader.sh /etc/cont-init.d/bootloader.sh + +ENTRYPOINT ["/init"] diff --git a/images/python-predictor-inf/Dockerfile b/images/python-predictor-inf/Dockerfile index 63fb538dde..755bbeb6b8 100644 --- a/images/python-predictor-inf/Dockerfile +++ b/images/python-predictor-inf/Dockerfile @@ -10,6 +10,9 @@ RUN apt-get update -qq && apt-get install -y -q \ aws-neuron-runtime=1.1.1402.0 && \ apt-get clean -qq && rm -rf /var/lib/apt/lists/* +RUN wget -P /tmp/ https://github.com/just-containers/s6-overlay/releases/download/v2.1.0.2/s6-overlay-amd64-installer && \ + chmod +x /tmp/s6-overlay-amd64-installer && /tmp/s6-overlay-amd64-installer / && rm /tmp/s6-overlay-amd64-installer + ENV PATH=/opt/aws/neuron/bin/:$PATH RUN apt-get update -qq && apt-get install -y -q \ @@ -21,6 +24,7 @@ RUN apt-get update -qq && apt-get install -y -q \ unzip \ zlib1g-dev \ locales \ + nginx=1.14.* \ && apt-get clean -qq && rm -rf /var/lib/apt/lists/* RUN locale-gen en_US.UTF-8 @@ -100,4 +104,6 @@ COPY pkg/workloads/cortex/consts.py /src/cortex COPY pkg/workloads/cortex/lib /src/cortex/lib COPY pkg/workloads/cortex/serve /src/cortex/serve -ENTRYPOINT ["/src/cortex/serve/run.sh"] +RUN mv /src/cortex/serve/init/bootloader.sh /etc/cont-init.d/bootloader.sh + +ENTRYPOINT ["/init"] diff --git a/images/tensorflow-predictor/Dockerfile b/images/tensorflow-predictor/Dockerfile index 30f0736505..d1d69be72a 100644 --- a/images/tensorflow-predictor/Dockerfile +++ b/images/tensorflow-predictor/Dockerfile @@ -9,8 +9,14 @@ RUN apt-get update -qq && apt-get install -y -q \ unzip \ zlib1g-dev \ locales \ + nginx=1.14.* \ && apt-get clean -qq && rm -rf 
/var/lib/apt/lists/* +RUN cd /tmp/ && \ + curl -L --output s6-overlay-amd64-installer "https://github.com/just-containers/s6-overlay/releases/download/v2.1.0.2/s6-overlay-amd64-installer" && \ + cd - && \ + chmod +x /tmp/s6-overlay-amd64-installer && /tmp/s6-overlay-amd64-installer / && rm /tmp/s6-overlay-amd64-installer + RUN locale-gen en_US.UTF-8 ENV LANG=en_US.UTF-8 LANGUAGE=en_US.en LC_ALL=en_US.UTF-8 @@ -68,8 +74,10 @@ RUN test "${SLIM}" = "true" || ( \ tensorflow-hub==0.9.0 \ ) -COPY pkg/workloads/cortex/consts.py /src/cortex/ +COPY pkg/workloads/cortex/consts.py /src/cortex COPY pkg/workloads/cortex/lib /src/cortex/lib COPY pkg/workloads/cortex/serve /src/cortex/serve -ENTRYPOINT ["/src/cortex/serve/run.sh"] +RUN mv /src/cortex/serve/init/bootloader.sh /etc/cont-init.d/bootloader.sh + +ENTRYPOINT ["/init"] diff --git a/images/tensorflow-serving-cpu/Dockerfile b/images/tensorflow-serving-cpu/Dockerfile index 0197d9ad80..396bfec923 100644 --- a/images/tensorflow-serving-cpu/Dockerfile +++ b/images/tensorflow-serving-cpu/Dockerfile @@ -1,5 +1,9 @@ FROM tensorflow/serving:2.3.0 +RUN apt-get update -qq && apt-get install -y -q \ + curl \ + && apt-get clean -qq && rm -rf /var/lib/apt/lists/* + COPY images/tensorflow-serving-cpu/run.sh /src/ RUN chmod +x /src/run.sh diff --git a/images/tensorflow-serving-gpu/Dockerfile b/images/tensorflow-serving-gpu/Dockerfile index 8f7fb285b8..745e29a56a 100644 --- a/images/tensorflow-serving-gpu/Dockerfile +++ b/images/tensorflow-serving-gpu/Dockerfile @@ -3,6 +3,7 @@ FROM tensorflow/serving:2.3.0-gpu RUN apt-get update -qq && apt-get install -y --no-install-recommends -q \ libnvinfer6=6.0.1-1+cuda10.1 \ libnvinfer-plugin6=6.0.1-1+cuda10.1 \ + curl \ && apt-get clean -qq && rm -rf /var/lib/apt/lists/* COPY images/tensorflow-serving-gpu/run.sh /src/ diff --git a/images/tensorflow-serving-inf/Dockerfile b/images/tensorflow-serving-inf/Dockerfile index cccb83e189..efebb680cc 100644 --- a/images/tensorflow-serving-inf/Dockerfile +++ b/images/tensorflow-serving-inf/Dockerfile @@ -4,6 +4,7 @@ FROM ubuntu:18.04 RUN apt-get update -qq && apt-get install -y -q \ gettext-base \ supervisor \ + curl \ wget \ netcat \ gnupg && \ diff --git a/pkg/operator/operator/k8s.go b/pkg/operator/operator/k8s.go index bddeea6d08..f79ea2beb6 100644 --- a/pkg/operator/operator/k8s.go +++ b/pkg/operator/operator/k8s.go @@ -20,7 +20,6 @@ import ( "encoding/base64" "encoding/json" "fmt" - "math" "path" "strings" @@ -179,6 +178,7 @@ func PythonPredictorContainers(api *spec.API) ([]kcore.Container, []kcore.Volume VolumeMounts: apiPodVolumeMounts, ReadinessProbe: FileExistsProbe(_apiReadinessFile), LivenessProbe: _apiLivenessProbe, + Lifecycle: nginxGracefulStopper(api.Kind), Resources: kcore.ResourceRequirements{ Requests: apiPodResourceList, Limits: apiPodResourceLimitsList, @@ -267,6 +267,7 @@ func TensorFlowPredictorContainers(api *spec.API) ([]kcore.Container, []kcore.Vo VolumeMounts: volumeMounts, ReadinessProbe: FileExistsProbe(_apiReadinessFile), LivenessProbe: _apiLivenessProbe, + Lifecycle: nginxGracefulStopper(api.Kind), Resources: kcore.ResourceRequirements{ Requests: apiResourceList, }, @@ -320,6 +321,7 @@ func ONNXPredictorContainers(api *spec.API) []kcore.Container { VolumeMounts: DefaultVolumeMounts, ReadinessProbe: FileExistsProbe(_apiReadinessFile), LivenessProbe: _apiLivenessProbe, + Lifecycle: nginxGracefulStopper(api.Kind), Resources: kcore.ResourceRequirements{ Requests: resourceList, Limits: resourceLimitsList, @@ -409,15 +411,6 @@ func getEnvVars(api *spec.API, 
container string) []kcore.EnvVar { Name: "CORTEX_MAX_REPLICA_CONCURRENCY", Value: s.Int64(api.Autoscaling.MaxReplicaConcurrency), }, - kcore.EnvVar{ - Name: "CORTEX_MAX_PROCESS_CONCURRENCY", - // add 1 because it was required to achieve the target concurrency for 1 process, 1 thread - Value: s.Int64(1 + int64(math.Round(float64(api.Autoscaling.MaxReplicaConcurrency)/float64(api.Predictor.ProcessesPerReplica)))), - }, - kcore.EnvVar{ - Name: "CORTEX_SO_MAX_CONN", - Value: s.Int64(api.Autoscaling.MaxReplicaConcurrency + 100), // add a buffer to be safe - }, ) } @@ -699,6 +692,7 @@ func tensorflowServingContainer(api *spec.API, volumeMounts []kcore.VolumeMount, FailureThreshold: 2, Handler: probeHandler, }, + Lifecycle: waitAPIContainerToStop(api.Kind), Resources: resources, Ports: ports, } @@ -720,6 +714,7 @@ func neuronRuntimeDaemonContainer(api *spec.API, volumeMounts []kcore.VolumeMoun }, VolumeMounts: volumeMounts, ReadinessProbe: socketExistsProbe(_neuronRTDSocket), + Lifecycle: waitAPIContainerToStop(api.Kind), Resources: kcore.ResourceRequirements{ Requests: kcore.ResourceList{ "hugepages-2Mi": *kresource.NewQuantity(totalHugePages, kresource.BinarySI), @@ -794,6 +789,34 @@ func socketExistsProbe(socketName string) *kcore.Probe { } } +func nginxGracefulStopper(apiKind userconfig.Kind) *kcore.Lifecycle { + if apiKind == userconfig.RealtimeAPIKind { + return &kcore.Lifecycle{ + PreStop: &kcore.Handler{ + Exec: &kcore.ExecAction{ + // the sleep is required to wait for any k8s-related race conditions + // as described in https://medium.com/codecademy-engineering/kubernetes-nginx-and-zero-downtime-in-production-2c910c6a5ed8 + Command: []string{"/bin/sh", "-c", "sleep 5; /usr/sbin/nginx -s quit; while pgrep -x nginx; do sleep 1; done"}, + }, + }, + } + } + return nil +} + +func waitAPIContainerToStop(apiKind userconfig.Kind) *kcore.Lifecycle { + if apiKind == userconfig.RealtimeAPIKind { + return &kcore.Lifecycle{ + PreStop: &kcore.Handler{ + Exec: &kcore.ExecAction{ + Command: []string{"/bin/sh", "-c", fmt.Sprintf("while curl localhost:%s/nginx_status; do sleep 1; done", DefaultPortStr)}, + }, + }, + } + } + return nil +} + var BaseEnvVars = []kcore.EnvFromSource{ { ConfigMapRef: &kcore.ConfigMapEnvSource{ diff --git a/pkg/operator/resources/realtimeapi/k8s_specs.go b/pkg/operator/resources/realtimeapi/k8s_specs.go index 945168f1af..eec2003505 100644 --- a/pkg/operator/resources/realtimeapi/k8s_specs.go +++ b/pkg/operator/resources/realtimeapi/k8s_specs.go @@ -27,6 +27,8 @@ import ( kcore "k8s.io/api/core/v1" ) +var _terminationGracePeriodSeconds int64 = 60 // seconds + func deploymentSpec(api *spec.API, prevDeployment *kapps.Deployment) *kapps.Deployment { switch api.Predictor.Type { case userconfig.TensorFlowPredictorType: @@ -74,7 +76,8 @@ func tensorflowAPISpec(api *spec.API, prevDeployment *kapps.Deployment) *kapps.D "traffic.sidecar.istio.io/excludeOutboundIPRanges": "0.0.0.0/0", }, K8sPodSpec: kcore.PodSpec{ - RestartPolicy: "Always", + RestartPolicy: "Always", + TerminationGracePeriodSeconds: pointer.Int64(_terminationGracePeriodSeconds), InitContainers: []kcore.Container{ operator.InitContainer(api), }, @@ -123,7 +126,8 @@ func pythonAPISpec(api *spec.API, prevDeployment *kapps.Deployment) *kapps.Deplo "traffic.sidecar.istio.io/excludeOutboundIPRanges": "0.0.0.0/0", }, K8sPodSpec: kcore.PodSpec{ - RestartPolicy: "Always", + RestartPolicy: "Always", + TerminationGracePeriodSeconds: pointer.Int64(_terminationGracePeriodSeconds), InitContainers: []kcore.Container{ 
operator.InitContainer(api), }, @@ -175,7 +179,8 @@ func onnxAPISpec(api *spec.API, prevDeployment *kapps.Deployment) *kapps.Deploym InitContainers: []kcore.Container{ operator.InitContainer(api), }, - Containers: containers, + TerminationGracePeriodSeconds: pointer.Int64(_terminationGracePeriodSeconds), + Containers: containers, NodeSelector: map[string]string{ "workload": "true", }, diff --git a/pkg/types/spec/validations.go b/pkg/types/spec/validations.go index e907e1b441..44a117d11a 100644 --- a/pkg/types/spec/validations.go +++ b/pkg/types/spec/validations.go @@ -399,9 +399,11 @@ func autoscalingValidation(provider types.ProviderType) *cr.StructFieldValidatio { StructField: "MaxReplicaConcurrency", Int64Validation: &cr.Int64Validation{ - Default: consts.DefaultMaxReplicaConcurrency, - GreaterThan: pointer.Int64(0), - LessThanOrEqualTo: pointer.Int64(math.MaxUint16), + Default: consts.DefaultMaxReplicaConcurrency, + GreaterThan: pointer.Int64(0), + // our configured nginx can theoretically accept up to 32768 connections, but during testing, + // it has been observed that the number is just slightly lower, so it has been offset by 2768 + LessThanOrEqualTo: pointer.Int64(30000), }, }, { diff --git a/pkg/workloads/cortex/lib/util.py b/pkg/workloads/cortex/lib/util.py index 1b76ced3e6..43adad5713 100644 --- a/pkg/workloads/cortex/lib/util.py +++ b/pkg/workloads/cortex/lib/util.py @@ -21,6 +21,7 @@ import inspect from inspect import Parameter from copy import deepcopy +from typing import Any def has_method(object, method: str): @@ -229,3 +230,17 @@ def is_float_or_int_list(var): if not is_float_or_int(item): return False return True + + +def render_jinja_template(jinja_template_file: str, context: dict) -> str: + from jinja2 import Environment, FileSystemLoader + + template_path = pathlib.Path(jinja_template_file) + + env = Environment(loader=FileSystemLoader(str(template_path.parent))) + env.trim_blocks = True + env.lstrip_blocks = True + env.rstrip_blocks = True + + template = env.get_template(str(template_path.name)) + return template.render(**context) diff --git a/pkg/workloads/cortex/serve/run.sh b/pkg/workloads/cortex/serve/init/bootloader.sh similarity index 61% rename from pkg/workloads/cortex/serve/run.sh rename to pkg/workloads/cortex/serve/init/bootloader.sh index 4de3fc693f..1054f5a46e 100755 --- a/pkg/workloads/cortex/serve/run.sh +++ b/pkg/workloads/cortex/serve/init/bootloader.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/with-contenv bash # Copyright 2020 Cortex Labs, Inc.
# @@ -35,25 +35,17 @@ cd /mnt/project # if the container restarted, ensure that it is not perceived as ready rm -rf /mnt/workspace/api_readiness.txt +rm -rf /mnt/workspace/proc-*-ready.txt # allow for the liveness check to pass until the API is running echo "9999999999" > /mnt/workspace/api_liveness.txt -# export environment variables -if [ -f "/mnt/project/.env" ]; then - set -a - source /mnt/project/.env - set +a -fi - -export PYTHONPATH=$PYTHONPATH:$CORTEX_PYTHON_PATH - -# ensure predictor print() statements are always flushed -export PYTHONUNBUFFERED=TRUE +# to export user-specified environment files +source_env_file_cmd="if [ -f \"/mnt/project/.env\" ]; then set -a; source /mnt/project/.env; set +a; fi" if [ "$CORTEX_PROVIDER" != "local" ]; then if [ "$CORTEX_KIND" == "RealtimeAPI" ]; then - sysctl -w net.core.somaxconn=$CORTEX_SO_MAX_CONN >/dev/null + sysctl -w net.core.somaxconn="65535" >/dev/null sysctl -w net.ipv4.ip_local_port_range="15000 64000" >/dev/null sysctl -w net.ipv4.tcp_fin_timeout=30 >/dev/null fi @@ -61,7 +53,12 @@ fi # execute script if present in project's directory if [ -f "/mnt/project/dependencies.sh" ]; then + eval $source_env_file_cmd bash -e /mnt/project/dependencies.sh + status=$? + if [ $status -ne 0 ]; then + exit $status + fi fi # install from conda-packages.txt @@ -86,4 +83,48 @@ if [ -f "/mnt/project/requirements.txt" ]; then pip --no-cache-dir install -r /mnt/project/requirements.txt fi -/opt/conda/envs/env/bin/python /src/cortex/serve/start.py +create_s6_service() { + service_name=$1 + cmd=$2 + + dest_dir="/etc/services.d/$service_name" + mkdir $dest_dir + + dest_script="$dest_dir/run" + echo "#!/usr/bin/with-contenv bash" > $dest_script + echo $cmd >> $dest_script + chmod +x $dest_script + + dest_script="$dest_dir/finish" + echo "#!/usr/bin/execlineb -S0" > $dest_script + echo "s6-svscanctl -t /var/run/s6/services" >> $dest_script + chmod +x $dest_script +} + +# prepare webserver +if [ "$CORTEX_KIND" = "RealtimeAPI" ]; then + + # prepare uvicorn workers + mkdir /run/uvicorn + for i in $(seq 1 $CORTEX_PROCESSES_PER_REPLICA); do + create_s6_service "uvicorn-$((i-1))" "$source_env_file_cmd && exec env PYTHONUNBUFFERED=TRUE env PYTHONPATH=$PYTHONPATH:$CORTEX_PYTHON_PATH /opt/conda/envs/env/bin/python /src/cortex/serve/start/server.py /run/uvicorn/proc-$((i-1)).sock" + done + + create_s6_service "nginx" "exec nginx -c /run/nginx.conf" + + # prepare api readiness checker + dest_dir="/etc/services.d/api_readiness" + mkdir $dest_dir + cp /src/cortex/serve/poll/readiness.sh $dest_dir/run + chmod +x $dest_dir/run + + # generate nginx conf + /opt/conda/envs/env/bin/python -c 'from cortex.lib import util; import os; generated = util.render_jinja_template("/src/cortex/serve/nginx.conf.j2", os.environ); print(generated);' > /run/nginx.conf + +# prepare batch otherwise +else + create_s6_service "batch" "$source_env_file_cmd && exec env PYTHONUNBUFFERED=TRUE env PYTHONPATH=$PYTHONPATH:$CORTEX_PYTHON_PATH /opt/conda/envs/env/bin/python /src/cortex/serve/start/batch.py" +fi + +# run the python initialization script +/opt/conda/envs/env/bin/python /src/cortex/serve/init/script.py diff --git a/pkg/workloads/cortex/serve/start.py b/pkg/workloads/cortex/serve/init/script.py similarity index 78% rename from pkg/workloads/cortex/serve/start.py rename to pkg/workloads/cortex/serve/init/script.py index f86a808abd..e72cd8290e 100644 --- a/pkg/workloads/cortex/serve/start.py +++ b/pkg/workloads/cortex/serve/init/script.py @@ -12,8 +12,6 @@ # See the License for the specific 
language governing permissions and # limitations under the License. -import uvicorn -import yaml import os import json @@ -48,9 +46,6 @@ def load_tensorflow_serving_models(): def main(): - with open("/src/cortex/serve/log_config.yaml", "r") as f: - log_config = yaml.load(f, yaml.FullLoader) - # wait until neuron-rtd sidecar is ready uses_inferentia = os.getenv("CORTEX_ACTIVE_NEURON") if uses_inferentia: @@ -81,25 +76,6 @@ def main(): if raw_api_spec["predictor"]["type"] == "tensorflow": load_tensorflow_serving_models() - if raw_api_spec["kind"] == "RealtimeAPI": - # https://github.com/encode/uvicorn/blob/master/uvicorn/config.py - uvicorn.run( - "cortex.serve.wsgi:app", - host="0.0.0.0", - port=int(os.environ["CORTEX_SERVING_PORT"]), - workers=int(os.environ["CORTEX_PROCESSES_PER_REPLICA"]), - limit_concurrency=int( - os.environ["CORTEX_MAX_PROCESS_CONCURRENCY"] - ), # this is a per process limit - backlog=int(os.environ["CORTEX_SO_MAX_CONN"]), - log_config=log_config, - log_level="info", - ) - else: - from cortex.serve import batch - - batch.start() - if __name__ == "__main__": main() diff --git a/pkg/workloads/cortex/serve/nginx.conf.j2 b/pkg/workloads/cortex/serve/nginx.conf.j2 new file mode 100644 index 0000000000..18aac9d16e --- /dev/null +++ b/pkg/workloads/cortex/serve/nginx.conf.j2 @@ -0,0 +1,101 @@ +# Copyright 2020 Cortex Labs, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# good articles to read +# https://hub.packtpub.com/fine-tune-nginx-configufine-tune-nginx-configurationfine-tune-nginx-configurationratio/ +# https://www.nginx.com/blog/tuning-nginx/ +# https://www.digitalocean.com/community/tutorials/understanding-nginx-http-proxying-load-balancing-buffering-and-caching +# https://serverfault.com/a/788703 +# https://stackoverflow.com/questions/59846238/guide-on-how-to-use-regex-in-nginx-location-block-section + +daemon off; +# maximum number of open files per worker +worker_rlimit_nofile 65535; +worker_processes 1; + +thread_pool pool threads={{ CORTEX_PROCESSES_PER_REPLICA | int }}; + +events { + # max num requests = (worker_processes * worker_connections ) / 2 for reverse proxy + # max num requests is also limited by the number of socket connections available on the system (~64k) + worker_connections 65535; + + # The multi_accept flag enables an NGINX worker to accept as many connections as possible when it + # gets the notification of a new connection. The purpose of this flag is to accept all connections + # in the listen queue at once. If the directive is disabled, a worker process will accept connections one by one. + multi_accept off; + + # An efficient method of processing connections available on Linux 2.6+. The method is similar to the FreeBSD kqueue. 
+ use epoll; +} + +http { + # send headers in one piece, it is better than sending them one by one + tcp_nopush on; + + # don't buffer data sent, good for small data bursts in real time + tcp_nodelay on; + + # to limit concurrent requests + limit_conn_zone 1 zone=inflights:1m; + + # to distribute load + aio threads=pool; + + # how much time an inference can take + proxy_read_timeout 3600s; + + upstream uvicorn { + # load balancing policy + least_conn; + + {% for i in range(CORTEX_PROCESSES_PER_REPLICA | int) %} + server unix:/run/uvicorn/proc-{{ i }}.sock; + {% endfor %} + } + + server { + listen {{ CORTEX_SERVING_PORT | int }}; + underscores_in_headers on; + + location /nginx_status { + stub_status on; + allow 127.0.0.1; + deny all; + } + + location / { + deny all; + } + + location ~ ^/(predict/?|)$ { + limit_conn inflights {{ CORTEX_MAX_REPLICA_CONCURRENCY | int }}; + + add_header Access-Control-Allow-Origin "*"; + add_header Access-Control-Allow-Headers "*"; + add_header Access-Control-Allow-Methods "GET, POST"; + add_header Access-Control-Allow-Credentials "true"; + + proxy_set_header HOST $host; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-Proto $scheme; + + proxy_redirect off; + proxy_buffering off; + + proxy_pass http://uvicorn; + } + } +} diff --git a/pkg/workloads/cortex/serve/poll/readiness.sh b/pkg/workloads/cortex/serve/poll/readiness.sh new file mode 100644 index 0000000000..ee5fc347e9 --- /dev/null +++ b/pkg/workloads/cortex/serve/poll/readiness.sh @@ -0,0 +1,24 @@ +#!/usr/bin/with-contenv bash + +# Copyright 2020 Cortex Labs, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +while true; do + procs_ready="$(ls /mnt/workspace/proc-*-ready.txt 2>/dev/null | wc -l)" + if [ "$CORTEX_PROCESSES_PER_REPLICA" = "$procs_ready" ]; then + touch /mnt/workspace/api_readiness.txt + break + fi + sleep 1 +done diff --git a/pkg/workloads/cortex/serve/requirements.txt b/pkg/workloads/cortex/serve/requirements.txt index 3d165c92a3..d4e38eabd6 100644 --- a/pkg/workloads/cortex/serve/requirements.txt +++ b/pkg/workloads/cortex/serve/requirements.txt @@ -8,3 +8,4 @@ python-multipart==0.0.5 pyyaml==5.3.1 requests==2.24.0 uvicorn==0.11.8 +jinja2==2.11.2 diff --git a/pkg/workloads/cortex/serve/serve.py b/pkg/workloads/cortex/serve/serve.py index 716ed7f9cd..c584156616 100644 --- a/pkg/workloads/cortex/serve/serve.py +++ b/pkg/workloads/cortex/serve/serve.py @@ -26,7 +26,6 @@ from fastapi import Body, FastAPI from fastapi.exceptions import RequestValidationError -from fastapi.middleware.cors import CORSMiddleware from starlette.requests import Request from starlette.responses import Response, PlainTextResponse, JSONResponse from starlette.background import BackgroundTasks @@ -51,14 +50,6 @@ app = FastAPI() -app.add_middleware( - CORSMiddleware, - allow_origins=["*"], - allow_credentials=True, - allow_methods=["*"], - allow_headers=["*"], -) - local_cache = { "api": None, "provider": None, @@ -77,7 +68,7 @@ def update_api_liveness(): @app.on_event("startup") def startup(): - open("/mnt/workspace/api_readiness.txt", "a").close() + open(f"/mnt/workspace/proc-{os.getpid()}-ready.txt", "a").close() update_api_liveness() @@ -88,6 +79,11 @@ def shutdown(): except: pass + try: + os.remove(f"/mnt/workspace/proc-{os.getpid()}-ready.txt") + except: + pass + try: os.remove("/mnt/workspace/api_liveness.txt") except: diff --git a/pkg/workloads/cortex/serve/batch.py b/pkg/workloads/cortex/serve/start/batch.py similarity index 100% rename from pkg/workloads/cortex/serve/batch.py rename to pkg/workloads/cortex/serve/start/batch.py diff --git a/pkg/workloads/cortex/serve/start/server.py b/pkg/workloads/cortex/serve/start/server.py new file mode 100644 index 0000000000..564b0bb7c9 --- /dev/null +++ b/pkg/workloads/cortex/serve/start/server.py @@ -0,0 +1,38 @@ +# Copyright 2020 Cortex Labs, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys + +import uvicorn +import yaml + + +def main(): + uds = sys.argv[1] + + with open("/src/cortex/serve/log_config.yaml", "r") as f: + log_config = yaml.load(f, yaml.FullLoader) + + uvicorn.run( + "cortex.serve.wsgi:app", + uds=uds, + forwarded_allow_ips="*", + proxy_headers=True, + log_config=log_config, + log_level="info", + ) + + +if __name__ == "__main__": + main()
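For illustration only (not part of the patch): the new bootloader.sh renders /run/nginx.conf by calling the util.render_jinja_template helper added above on /src/cortex/serve/nginx.conf.j2, which emits one unix-socket upstream entry per uvicorn process. A minimal sketch of that call, with hypothetical CORTEX_* values standing in for the real environment:

# Sketch of the nginx config rendering performed by bootloader.sh.
# The context values below are hypothetical placeholders; at runtime the
# bootloader passes os.environ, populated from the API spec.
from cortex.lib import util

context = {
    "CORTEX_PROCESSES_PER_REPLICA": "2",       # number of uvicorn processes / unix sockets
    "CORTEX_SERVING_PORT": "8888",             # hypothetical serving port
    "CORTEX_MAX_REPLICA_CONCURRENCY": "1024",  # used by nginx's limit_conn
}

# The rendered upstream block lists one socket per process, e.g.:
#   server unix:/run/uvicorn/proc-0.sock;
#   server unix:/run/uvicorn/proc-1.sock;
print(util.render_jinja_template("/src/cortex/serve/nginx.conf.j2", context))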