diff --git a/cli/local/docker_spec.go b/cli/local/docker_spec.go index 7c13d11a0d..6595ce7a66 100644 --- a/cli/local/docker_spec.go +++ b/cli/local/docker_spec.go @@ -19,7 +19,6 @@ package local import ( "context" "fmt" - "math" "path/filepath" "strings" @@ -92,9 +91,7 @@ func getAPIEnv(api *spec.API, awsClient *aws.Client) []string { "CORTEX_PROJECT_DIR="+_projectDir, "CORTEX_PROCESSES_PER_REPLICA="+s.Int32(api.Predictor.ProcessesPerReplica), "CORTEX_THREADS_PER_PROCESS="+s.Int32(api.Predictor.ThreadsPerProcess), - // add 1 because it was required to achieve the target concurrency for 1 process, 1 thread - "CORTEX_MAX_PROCESS_CONCURRENCY="+s.Int64(1+int64(math.Round(float64(consts.DefaultMaxReplicaConcurrency)/float64(api.Predictor.ProcessesPerReplica)))), - "CORTEX_SO_MAX_CONN="+s.Int64(consts.DefaultMaxReplicaConcurrency+100), // add a buffer to be safe + "CORTEX_MAX_REPLICA_CONCURRENCY="+s.Int32(api.Predictor.ProcessesPerReplica*api.Predictor.ThreadsPerProcess+1024), // allow a queue of 1024 "AWS_REGION="+awsClient.Region, ) diff --git a/dev/versions.md b/dev/versions.md index 36bff81f5d..302c207a9f 100644 --- a/dev/versions.md +++ b/dev/versions.md @@ -172,6 +172,18 @@ Note: it's ok if example training notebooks aren't upgraded, as long as the expo * be careful not to update any of the versions for Inferentia that are not latest in `images/python-predictor-inf/Dockerfile` 1. Rerun all examples and check their logs +## S6-overlay supervisor + +1. Locate the `s6-overlay` installation in `images/python-predictor-*/Dockerfile`, `images/tensorflow-predictor/Dockerfile` and `images/onnx-predictor-*/Dockerfile` +1. Update the version in each serving image to the latest release at https://github.com/just-containers/s6-overlay + +## Nginx + +1. Run an Ubuntu base image that matches the version tag used for the serving images. The command to run is `docker run -it --rm ` +1. Run `apt update && apt-cache policy nginx` and note the latest minor version of nginx (e.g. `1.14`) +1. Locate the `nginx` package in `images/python-predictor-*/Dockerfile`, `images/tensorflow-predictor/Dockerfile` and `images/onnx-predictor-*/Dockerfile` +1. Update every `nginx` occurrence to the minor version from step 2, with a trailing asterisk to allow any patch version (e.g. `1.14.*`) + ## Istio 1.
Find the latest [release](https://istio.io/latest/news/releases) and check the release notes (here are the [latest IstioOperator Options](https://istio.io/latest/docs/reference/config/istio.operator.v1alpha1/)) diff --git a/examples/sklearn/iris-classifier/cortex.yaml b/examples/sklearn/iris-classifier/cortex.yaml index f86400768f..b562fa1fdf 100644 --- a/examples/sklearn/iris-classifier/cortex.yaml +++ b/examples/sklearn/iris-classifier/cortex.yaml @@ -12,4 +12,4 @@ model_type: classification compute: cpu: 0.2 - mem: 100M + mem: 200M diff --git a/examples/sklearn/iris-classifier/requirements.txt b/examples/sklearn/iris-classifier/requirements.txt index 30ddf823b8..bbc213cf3e 100644 --- a/examples/sklearn/iris-classifier/requirements.txt +++ b/examples/sklearn/iris-classifier/requirements.txt @@ -1 +1,2 @@ boto3 +scikit-learn==0.21.3 diff --git a/images/neuron-rtd/Dockerfile b/images/neuron-rtd/Dockerfile index 8d2fa7e96b..08887ec0e9 100644 --- a/images/neuron-rtd/Dockerfile +++ b/images/neuron-rtd/Dockerfile @@ -13,7 +13,8 @@ RUN yum install -y \ aws-neuron-runtime-1.0.9592.0 \ procps-ng-3.3.10-26.amzn2.x86_64 \ gzip \ - tar + tar \ + curl ENV PATH="/opt/aws/neuron/bin:${PATH}" diff --git a/images/onnx-predictor-cpu/Dockerfile b/images/onnx-predictor-cpu/Dockerfile index 117417d45c..8afd33f853 100644 --- a/images/onnx-predictor-cpu/Dockerfile +++ b/images/onnx-predictor-cpu/Dockerfile @@ -9,8 +9,14 @@ RUN apt-get update -qq && apt-get install -y -q \ unzip \ zlib1g-dev \ locales \ + nginx=1.14.* \ && apt-get clean -qq && rm -rf /var/lib/apt/lists/* +RUN cd /tmp/ && \ + curl -L --output s6-overlay-amd64-installer "https://github.com/just-containers/s6-overlay/releases/download/v2.1.0.2/s6-overlay-amd64-installer" && \ + cd - && \ + chmod +x /tmp/s6-overlay-amd64-installer && /tmp/s6-overlay-amd64-installer / && rm /tmp/s6-overlay-amd64-installer + RUN locale-gen en_US.UTF-8 ENV LANG=en_US.UTF-8 LANGUAGE=en_US.en LC_ALL=en_US.UTF-8 @@ -68,4 +74,6 @@ COPY pkg/workloads/cortex/consts.py /src/cortex COPY pkg/workloads/cortex/lib /src/cortex/lib COPY pkg/workloads/cortex/serve /src/cortex/serve -ENTRYPOINT ["/src/cortex/serve/run.sh"] +RUN mv /src/cortex/serve/init/bootloader.sh /etc/cont-init.d/bootloader.sh + +ENTRYPOINT ["/init"] diff --git a/images/onnx-predictor-gpu/Dockerfile b/images/onnx-predictor-gpu/Dockerfile index 804c8c8f01..13a512ac11 100644 --- a/images/onnx-predictor-gpu/Dockerfile +++ b/images/onnx-predictor-gpu/Dockerfile @@ -9,8 +9,14 @@ RUN apt-get update -qq && apt-get install -y -q \ unzip \ zlib1g-dev \ locales \ + nginx=1.14.* \ && apt-get clean -qq && rm -rf /var/lib/apt/lists/* +RUN cd /tmp/ && \ + curl -L --output s6-overlay-amd64-installer "https://github.com/just-containers/s6-overlay/releases/download/v2.1.0.2/s6-overlay-amd64-installer" && \ + cd - && \ + chmod +x /tmp/s6-overlay-amd64-installer && /tmp/s6-overlay-amd64-installer / && rm /tmp/s6-overlay-amd64-installer + RUN locale-gen en_US.UTF-8 ENV LANG=en_US.UTF-8 LANGUAGE=en_US.en LC_ALL=en_US.UTF-8 @@ -68,4 +74,6 @@ COPY pkg/workloads/cortex/consts.py /src/cortex COPY pkg/workloads/cortex/lib /src/cortex/lib COPY pkg/workloads/cortex/serve /src/cortex/serve -ENTRYPOINT ["/src/cortex/serve/run.sh"] +RUN mv /src/cortex/serve/init/bootloader.sh /etc/cont-init.d/bootloader.sh + +ENTRYPOINT ["/init"] diff --git a/images/python-predictor-cpu/Dockerfile b/images/python-predictor-cpu/Dockerfile index fa7922a387..599ea622e0 100644 --- a/images/python-predictor-cpu/Dockerfile +++ 
b/images/python-predictor-cpu/Dockerfile @@ -9,8 +9,14 @@ RUN apt-get update -qq && apt-get install -y -q \ unzip \ zlib1g-dev \ locales \ + nginx=1.14.* \ && apt-get clean -qq && rm -rf /var/lib/apt/lists/* +RUN cd /tmp/ && \ + curl -L --output s6-overlay-amd64-installer "https://github.com/just-containers/s6-overlay/releases/download/v2.1.0.2/s6-overlay-amd64-installer" && \ + cd - && \ + chmod +x /tmp/s6-overlay-amd64-installer && /tmp/s6-overlay-amd64-installer / && rm /tmp/s6-overlay-amd64-installer + RUN locale-gen en_US.UTF-8 ENV LANG=en_US.UTF-8 LANGUAGE=en_US.en LC_ALL=en_US.UTF-8 @@ -88,4 +94,6 @@ COPY pkg/workloads/cortex/consts.py /src/cortex COPY pkg/workloads/cortex/lib /src/cortex/lib COPY pkg/workloads/cortex/serve /src/cortex/serve -ENTRYPOINT ["/src/cortex/serve/run.sh"] +RUN mv /src/cortex/serve/init/bootloader.sh /etc/cont-init.d/bootloader.sh + +ENTRYPOINT ["/init"] diff --git a/images/python-predictor-gpu/Dockerfile b/images/python-predictor-gpu/Dockerfile index 6ced46d169..bd04e83c1a 100644 --- a/images/python-predictor-gpu/Dockerfile +++ b/images/python-predictor-gpu/Dockerfile @@ -11,8 +11,14 @@ RUN apt-get update -qq && apt-get install -y -q \ unzip \ zlib1g-dev \ locales \ + nginx=1.14.* \ && apt-get clean -qq && rm -rf /var/lib/apt/lists/* +RUN cd /tmp/ && \ + curl -L --output s6-overlay-amd64-installer "https://github.com/just-containers/s6-overlay/releases/download/v2.1.0.2/s6-overlay-amd64-installer" && \ + cd - && \ + chmod +x /tmp/s6-overlay-amd64-installer && /tmp/s6-overlay-amd64-installer / && rm /tmp/s6-overlay-amd64-installer + RUN locale-gen en_US.UTF-8 ENV LANG=en_US.UTF-8 LANGUAGE=en_US.en LC_ALL=en_US.UTF-8 @@ -93,4 +99,6 @@ COPY pkg/workloads/cortex/consts.py /src/cortex COPY pkg/workloads/cortex/lib /src/cortex/lib COPY pkg/workloads/cortex/serve /src/cortex/serve -ENTRYPOINT ["/src/cortex/serve/run.sh"] +RUN mv /src/cortex/serve/init/bootloader.sh /etc/cont-init.d/bootloader.sh + +ENTRYPOINT ["/init"] diff --git a/images/python-predictor-inf/Dockerfile b/images/python-predictor-inf/Dockerfile index 63fb538dde..755bbeb6b8 100644 --- a/images/python-predictor-inf/Dockerfile +++ b/images/python-predictor-inf/Dockerfile @@ -10,6 +10,9 @@ RUN apt-get update -qq && apt-get install -y -q \ aws-neuron-runtime=1.1.1402.0 && \ apt-get clean -qq && rm -rf /var/lib/apt/lists/* +RUN wget -P /tmp/ https://github.com/just-containers/s6-overlay/releases/download/v2.1.0.2/s6-overlay-amd64-installer && \ + chmod +x /tmp/s6-overlay-amd64-installer && /tmp/s6-overlay-amd64-installer / && rm /tmp/s6-overlay-amd64-installer + ENV PATH=/opt/aws/neuron/bin/:$PATH RUN apt-get update -qq && apt-get install -y -q \ @@ -21,6 +24,7 @@ RUN apt-get update -qq && apt-get install -y -q \ unzip \ zlib1g-dev \ locales \ + nginx=1.14.* \ && apt-get clean -qq && rm -rf /var/lib/apt/lists/* RUN locale-gen en_US.UTF-8 @@ -100,4 +104,6 @@ COPY pkg/workloads/cortex/consts.py /src/cortex COPY pkg/workloads/cortex/lib /src/cortex/lib COPY pkg/workloads/cortex/serve /src/cortex/serve -ENTRYPOINT ["/src/cortex/serve/run.sh"] +RUN mv /src/cortex/serve/init/bootloader.sh /etc/cont-init.d/bootloader.sh + +ENTRYPOINT ["/init"] diff --git a/images/tensorflow-predictor/Dockerfile b/images/tensorflow-predictor/Dockerfile index 30f0736505..d1d69be72a 100644 --- a/images/tensorflow-predictor/Dockerfile +++ b/images/tensorflow-predictor/Dockerfile @@ -9,8 +9,14 @@ RUN apt-get update -qq && apt-get install -y -q \ unzip \ zlib1g-dev \ locales \ + nginx=1.14.* \ && apt-get clean -qq && rm -rf 
/var/lib/apt/lists/* +RUN cd /tmp/ && \ + curl -L --output s6-overlay-amd64-installer "https://github.com/just-containers/s6-overlay/releases/download/v2.1.0.2/s6-overlay-amd64-installer" && \ + cd - && \ + chmod +x /tmp/s6-overlay-amd64-installer && /tmp/s6-overlay-amd64-installer / && rm /tmp/s6-overlay-amd64-installer + RUN locale-gen en_US.UTF-8 ENV LANG=en_US.UTF-8 LANGUAGE=en_US.en LC_ALL=en_US.UTF-8 @@ -68,8 +74,10 @@ RUN test "${SLIM}" = "true" || ( \ tensorflow-hub==0.9.0 \ ) -COPY pkg/workloads/cortex/consts.py /src/cortex/ +COPY pkg/workloads/cortex/consts.py /src/cortex COPY pkg/workloads/cortex/lib /src/cortex/lib COPY pkg/workloads/cortex/serve /src/cortex/serve -ENTRYPOINT ["/src/cortex/serve/run.sh"] +RUN mv /src/cortex/serve/init/bootloader.sh /etc/cont-init.d/bootloader.sh + +ENTRYPOINT ["/init"] diff --git a/images/tensorflow-serving-cpu/Dockerfile b/images/tensorflow-serving-cpu/Dockerfile index 0197d9ad80..396bfec923 100644 --- a/images/tensorflow-serving-cpu/Dockerfile +++ b/images/tensorflow-serving-cpu/Dockerfile @@ -1,5 +1,9 @@ FROM tensorflow/serving:2.3.0 +RUN apt-get update -qq && apt-get install -y -q \ + curl \ + && apt-get clean -qq && rm -rf /var/lib/apt/lists/* + COPY images/tensorflow-serving-cpu/run.sh /src/ RUN chmod +x /src/run.sh diff --git a/images/tensorflow-serving-gpu/Dockerfile b/images/tensorflow-serving-gpu/Dockerfile index 8f7fb285b8..745e29a56a 100644 --- a/images/tensorflow-serving-gpu/Dockerfile +++ b/images/tensorflow-serving-gpu/Dockerfile @@ -3,6 +3,7 @@ FROM tensorflow/serving:2.3.0-gpu RUN apt-get update -qq && apt-get install -y --no-install-recommends -q \ libnvinfer6=6.0.1-1+cuda10.1 \ libnvinfer-plugin6=6.0.1-1+cuda10.1 \ + curl \ && apt-get clean -qq && rm -rf /var/lib/apt/lists/* COPY images/tensorflow-serving-gpu/run.sh /src/ diff --git a/images/tensorflow-serving-inf/Dockerfile b/images/tensorflow-serving-inf/Dockerfile index cccb83e189..efebb680cc 100644 --- a/images/tensorflow-serving-inf/Dockerfile +++ b/images/tensorflow-serving-inf/Dockerfile @@ -4,6 +4,7 @@ FROM ubuntu:18.04 RUN apt-get update -qq && apt-get install -y -q \ gettext-base \ supervisor \ + curl \ wget \ netcat \ gnupg && \ diff --git a/pkg/operator/operator/k8s.go b/pkg/operator/operator/k8s.go index bddeea6d08..f79ea2beb6 100644 --- a/pkg/operator/operator/k8s.go +++ b/pkg/operator/operator/k8s.go @@ -20,7 +20,6 @@ import ( "encoding/base64" "encoding/json" "fmt" - "math" "path" "strings" @@ -179,6 +178,7 @@ func PythonPredictorContainers(api *spec.API) ([]kcore.Container, []kcore.Volume VolumeMounts: apiPodVolumeMounts, ReadinessProbe: FileExistsProbe(_apiReadinessFile), LivenessProbe: _apiLivenessProbe, + Lifecycle: nginxGracefulStopper(api.Kind), Resources: kcore.ResourceRequirements{ Requests: apiPodResourceList, Limits: apiPodResourceLimitsList, @@ -267,6 +267,7 @@ func TensorFlowPredictorContainers(api *spec.API) ([]kcore.Container, []kcore.Vo VolumeMounts: volumeMounts, ReadinessProbe: FileExistsProbe(_apiReadinessFile), LivenessProbe: _apiLivenessProbe, + Lifecycle: nginxGracefulStopper(api.Kind), Resources: kcore.ResourceRequirements{ Requests: apiResourceList, }, @@ -320,6 +321,7 @@ func ONNXPredictorContainers(api *spec.API) []kcore.Container { VolumeMounts: DefaultVolumeMounts, ReadinessProbe: FileExistsProbe(_apiReadinessFile), LivenessProbe: _apiLivenessProbe, + Lifecycle: nginxGracefulStopper(api.Kind), Resources: kcore.ResourceRequirements{ Requests: resourceList, Limits: resourceLimitsList, @@ -409,15 +411,6 @@ func getEnvVars(api *spec.API, 
container string) []kcore.EnvVar { Name: "CORTEX_MAX_REPLICA_CONCURRENCY", Value: s.Int64(api.Autoscaling.MaxReplicaConcurrency), }, - kcore.EnvVar{ - Name: "CORTEX_MAX_PROCESS_CONCURRENCY", - // add 1 because it was required to achieve the target concurrency for 1 process, 1 thread - Value: s.Int64(1 + int64(math.Round(float64(api.Autoscaling.MaxReplicaConcurrency)/float64(api.Predictor.ProcessesPerReplica)))), - }, - kcore.EnvVar{ - Name: "CORTEX_SO_MAX_CONN", - Value: s.Int64(api.Autoscaling.MaxReplicaConcurrency + 100), // add a buffer to be safe - }, ) } @@ -699,6 +692,7 @@ func tensorflowServingContainer(api *spec.API, volumeMounts []kcore.VolumeMount, FailureThreshold: 2, Handler: probeHandler, }, + Lifecycle: waitAPIContainerToStop(api.Kind), Resources: resources, Ports: ports, } @@ -720,6 +714,7 @@ func neuronRuntimeDaemonContainer(api *spec.API, volumeMounts []kcore.VolumeMoun }, VolumeMounts: volumeMounts, ReadinessProbe: socketExistsProbe(_neuronRTDSocket), + Lifecycle: waitAPIContainerToStop(api.Kind), Resources: kcore.ResourceRequirements{ Requests: kcore.ResourceList{ "hugepages-2Mi": *kresource.NewQuantity(totalHugePages, kresource.BinarySI), @@ -794,6 +789,34 @@ func socketExistsProbe(socketName string) *kcore.Probe { } } +func nginxGracefulStopper(apiKind userconfig.Kind) *kcore.Lifecycle { + if apiKind == userconfig.RealtimeAPIKind { + return &kcore.Lifecycle{ + PreStop: &kcore.Handler{ + Exec: &kcore.ExecAction{ + // the sleep is required to wait for any k8s-related race conditions + // as described in https://medium.com/codecademy-engineering/kubernetes-nginx-and-zero-downtime-in-production-2c910c6a5ed8 + Command: []string{"/bin/sh", "-c", "sleep 5; /usr/sbin/nginx -s quit; while pgrep -x nginx; do sleep 1; done"}, + }, + }, + } + } + return nil +} + +func waitAPIContainerToStop(apiKind userconfig.Kind) *kcore.Lifecycle { + if apiKind == userconfig.RealtimeAPIKind { + return &kcore.Lifecycle{ + PreStop: &kcore.Handler{ + Exec: &kcore.ExecAction{ + Command: []string{"/bin/sh", "-c", fmt.Sprintf("while curl localhost:%s/nginx_status; do sleep 1; done", DefaultPortStr)}, + }, + }, + } + } + return nil +} + var BaseEnvVars = []kcore.EnvFromSource{ { ConfigMapRef: &kcore.ConfigMapEnvSource{ diff --git a/pkg/operator/resources/realtimeapi/k8s_specs.go b/pkg/operator/resources/realtimeapi/k8s_specs.go index 945168f1af..eec2003505 100644 --- a/pkg/operator/resources/realtimeapi/k8s_specs.go +++ b/pkg/operator/resources/realtimeapi/k8s_specs.go @@ -27,6 +27,8 @@ import ( kcore "k8s.io/api/core/v1" ) +var _terminationGracePeriodSeconds int64 = 60 // seconds + func deploymentSpec(api *spec.API, prevDeployment *kapps.Deployment) *kapps.Deployment { switch api.Predictor.Type { case userconfig.TensorFlowPredictorType: @@ -74,7 +76,8 @@ func tensorflowAPISpec(api *spec.API, prevDeployment *kapps.Deployment) *kapps.D "traffic.sidecar.istio.io/excludeOutboundIPRanges": "0.0.0.0/0", }, K8sPodSpec: kcore.PodSpec{ - RestartPolicy: "Always", + RestartPolicy: "Always", + TerminationGracePeriodSeconds: pointer.Int64(_terminationGracePeriodSeconds), InitContainers: []kcore.Container{ operator.InitContainer(api), }, @@ -123,7 +126,8 @@ func pythonAPISpec(api *spec.API, prevDeployment *kapps.Deployment) *kapps.Deplo "traffic.sidecar.istio.io/excludeOutboundIPRanges": "0.0.0.0/0", }, K8sPodSpec: kcore.PodSpec{ - RestartPolicy: "Always", + RestartPolicy: "Always", + TerminationGracePeriodSeconds: pointer.Int64(_terminationGracePeriodSeconds), InitContainers: []kcore.Container{ 
operator.InitContainer(api), }, @@ -175,7 +179,8 @@ func onnxAPISpec(api *spec.API, prevDeployment *kapps.Deployment) *kapps.Deploym InitContainers: []kcore.Container{ operator.InitContainer(api), }, - Containers: containers, + TerminationGracePeriodSeconds: pointer.Int64(_terminationGracePeriodSeconds), + Containers: containers, NodeSelector: map[string]string{ "workload": "true", }, diff --git a/pkg/types/spec/validations.go b/pkg/types/spec/validations.go index e907e1b441..44a117d11a 100644 --- a/pkg/types/spec/validations.go +++ b/pkg/types/spec/validations.go @@ -399,9 +399,11 @@ func autoscalingValidation(provider types.ProviderType) *cr.StructFieldValidatio { StructField: "MaxReplicaConcurrency", Int64Validation: &cr.Int64Validation{ - Default: consts.DefaultMaxReplicaConcurrency, - GreaterThan: pointer.Int64(0), - LessThanOrEqualTo: pointer.Int64(math.MaxUint16), + Default: consts.DefaultMaxReplicaConcurrency, + GreaterThan: pointer.Int64(0), + // our configured nginx can theoretically accept up to 32768 connections, but during testing, + // it has been observed that the number is just slightly lower, so it has been offset by 2768 + LessThanOrEqualTo: pointer.Int64(30000), }, }, { diff --git a/pkg/workloads/cortex/lib/util.py b/pkg/workloads/cortex/lib/util.py index 1b76ced3e6..43adad5713 100644 --- a/pkg/workloads/cortex/lib/util.py +++ b/pkg/workloads/cortex/lib/util.py @@ -21,6 +21,7 @@ import inspect from inspect import Parameter from copy import deepcopy +from typing import Any def has_method(object, method: str): @@ -229,3 +230,17 @@ def is_float_or_int_list(var): if not is_float_or_int(item): return False return True + + +def render_jinja_template(jinja_template_file: str, context: dict) -> str: + from jinja2 import Environment, FileSystemLoader + + template_path = pathlib.Path(jinja_template_file) + + env = Environment(loader=FileSystemLoader(str(template_path.parent))) + env.trim_blocks = True + env.lstrip_blocks = True + env.rstrip_blocks = True + + template = env.get_template(str(template_path.name)) + return template.render(**context) diff --git a/pkg/workloads/cortex/serve/run.sh b/pkg/workloads/cortex/serve/init/bootloader.sh similarity index 61% rename from pkg/workloads/cortex/serve/run.sh rename to pkg/workloads/cortex/serve/init/bootloader.sh index 4de3fc693f..1054f5a46e 100755 --- a/pkg/workloads/cortex/serve/run.sh +++ b/pkg/workloads/cortex/serve/init/bootloader.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/with-contenv bash # Copyright 2020 Cortex Labs, Inc.
# @@ -35,25 +35,17 @@ cd /mnt/project # if the container restarted, ensure that it is not perceived as ready rm -rf /mnt/workspace/api_readiness.txt +rm -rf /mnt/workspace/proc-*-ready.txt # allow for the liveness check to pass until the API is running echo "9999999999" > /mnt/workspace/api_liveness.txt -# export environment variables -if [ -f "/mnt/project/.env" ]; then - set -a - source /mnt/project/.env - set +a -fi - -export PYTHONPATH=$PYTHONPATH:$CORTEX_PYTHON_PATH - -# ensure predictor print() statements are always flushed -export PYTHONUNBUFFERED=TRUE +# to export user-specified environment files +source_env_file_cmd="if [ -f \"/mnt/project/.env\" ]; then set -a; source /mnt/project/.env; set +a; fi" if [ "$CORTEX_PROVIDER" != "local" ]; then if [ "$CORTEX_KIND" == "RealtimeAPI" ]; then - sysctl -w net.core.somaxconn=$CORTEX_SO_MAX_CONN >/dev/null + sysctl -w net.core.somaxconn="65535" >/dev/null sysctl -w net.ipv4.ip_local_port_range="15000 64000" >/dev/null sysctl -w net.ipv4.tcp_fin_timeout=30 >/dev/null fi @@ -61,7 +53,12 @@ fi # execute script if present in project's directory if [ -f "/mnt/project/dependencies.sh" ]; then + eval $source_env_file_cmd bash -e /mnt/project/dependencies.sh + status=$? + if [ $status -ne 0 ]; then + exit $status + fi fi # install from conda-packages.txt @@ -86,4 +83,48 @@ if [ -f "/mnt/project/requirements.txt" ]; then pip --no-cache-dir install -r /mnt/project/requirements.txt fi -/opt/conda/envs/env/bin/python /src/cortex/serve/start.py +create_s6_service() { + service_name=$1 + cmd=$2 + + dest_dir="/etc/services.d/$service_name" + mkdir $dest_dir + + dest_script="$dest_dir/run" + echo "#!/usr/bin/with-contenv bash" > $dest_script + echo $cmd >> $dest_script + chmod +x $dest_script + + dest_script="$dest_dir/finish" + echo "#!/usr/bin/execlineb -S0" > $dest_script + echo "s6-svscanctl -t /var/run/s6/services" >> $dest_script + chmod +x $dest_script +} + +# prepare webserver +if [ "$CORTEX_KIND" = "RealtimeAPI" ]; then + + # prepare uvicorn workers + mkdir /run/uvicorn + for i in $(seq 1 $CORTEX_PROCESSES_PER_REPLICA); do + create_s6_service "uvicorn-$((i-1))" "$source_env_file_cmd && exec env PYTHONUNBUFFERED=TRUE env PYTHONPATH=$PYTHONPATH:$CORTEX_PYTHON_PATH /opt/conda/envs/env/bin/python /src/cortex/serve/start/server.py /run/uvicorn/proc-$((i-1)).sock" + done + + create_s6_service "nginx" "exec nginx -c /run/nginx.conf" + + # prepare api readiness checker + dest_dir="/etc/services.d/api_readiness" + mkdir $dest_dir + cp /src/cortex/serve/poll/readiness.sh $dest_dir/run + chmod +x $dest_dir/run + + # generate nginx conf + /opt/conda/envs/env/bin/python -c 'from cortex.lib import util; import os; generated = util.render_jinja_template("/src/cortex/serve/nginx.conf.j2", os.environ); print(generated);' > /run/nginx.conf + +# prepare batch otherwise +else + create_s6_service "batch" "$source_env_file_cmd && exec env PYTHONUNBUFFERED=TRUE env PYTHONPATH=$PYTHONPATH:$CORTEX_PYTHON_PATH /opt/conda/envs/env/bin/python /src/cortex/serve/start/batch.py" +fi + +# run the python initialization script +/opt/conda/envs/env/bin/python /src/cortex/serve/init/script.py diff --git a/pkg/workloads/cortex/serve/start.py b/pkg/workloads/cortex/serve/init/script.py similarity index 78% rename from pkg/workloads/cortex/serve/start.py rename to pkg/workloads/cortex/serve/init/script.py index f86a808abd..e72cd8290e 100644 --- a/pkg/workloads/cortex/serve/start.py +++ b/pkg/workloads/cortex/serve/init/script.py @@ -12,8 +12,6 @@ # See the License for the specific 
language governing permissions and # limitations under the License. -import uvicorn -import yaml import os import json @@ -48,9 +46,6 @@ def load_tensorflow_serving_models(): def main(): - with open("/src/cortex/serve/log_config.yaml", "r") as f: - log_config = yaml.load(f, yaml.FullLoader) - # wait until neuron-rtd sidecar is ready uses_inferentia = os.getenv("CORTEX_ACTIVE_NEURON") if uses_inferentia: @@ -81,25 +76,6 @@ def main(): if raw_api_spec["predictor"]["type"] == "tensorflow": load_tensorflow_serving_models() - if raw_api_spec["kind"] == "RealtimeAPI": - # https://github.com/encode/uvicorn/blob/master/uvicorn/config.py - uvicorn.run( - "cortex.serve.wsgi:app", - host="0.0.0.0", - port=int(os.environ["CORTEX_SERVING_PORT"]), - workers=int(os.environ["CORTEX_PROCESSES_PER_REPLICA"]), - limit_concurrency=int( - os.environ["CORTEX_MAX_PROCESS_CONCURRENCY"] - ), # this is a per process limit - backlog=int(os.environ["CORTEX_SO_MAX_CONN"]), - log_config=log_config, - log_level="info", - ) - else: - from cortex.serve import batch - - batch.start() - if __name__ == "__main__": main() diff --git a/pkg/workloads/cortex/serve/nginx.conf.j2 b/pkg/workloads/cortex/serve/nginx.conf.j2 new file mode 100644 index 0000000000..18aac9d16e --- /dev/null +++ b/pkg/workloads/cortex/serve/nginx.conf.j2 @@ -0,0 +1,101 @@ +# Copyright 2020 Cortex Labs, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# good articles to read +# https://hub.packtpub.com/fine-tune-nginx-configufine-tune-nginx-configurationfine-tune-nginx-configurationratio/ +# https://www.nginx.com/blog/tuning-nginx/ +# https://www.digitalocean.com/community/tutorials/understanding-nginx-http-proxying-load-balancing-buffering-and-caching +# https://serverfault.com/a/788703 +# https://stackoverflow.com/questions/59846238/guide-on-how-to-use-regex-in-nginx-location-block-section + +daemon off; +# maximum number of open files per worker +worker_rlimit_nofile 65535; +worker_processes 1; + +thread_pool pool threads={{ CORTEX_PROCESSES_PER_REPLICA | int }}; + +events { + # max num requests = (worker_processes * worker_connections ) / 2 for reverse proxy + # max num requests is also limited by the number of socket connections available on the system (~64k) + worker_connections 65535; + + # The multi_accept flag enables an NGINX worker to accept as many connections as possible when it + # gets the notification of a new connection. The purpose of this flag is to accept all connections + # in the listen queue at once. If the directive is disabled, a worker process will accept connections one by one. + multi_accept off; + + # An efficient method of processing connections available on Linux 2.6+. The method is similar to the FreeBSD kqueue. 
+ use epoll; +} + +http { + # send headers in one piece, it is better than sending them one by one + tcp_nopush on; + + # don't buffer data sent, good for small data bursts in real time + tcp_nodelay on; + + # to limit concurrent requests + limit_conn_zone 1 zone=inflights:1m; + + # to distribute load + aio threads=pool; + + # how much time an inference can take + proxy_read_timeout 3600s; + + upstream uvicorn { + # load balancing policy + least_conn; + + {% for i in range(CORTEX_PROCESSES_PER_REPLICA | int) %} + server unix:/run/uvicorn/proc-{{ i }}.sock; + {% endfor %} + } + + server { + listen {{ CORTEX_SERVING_PORT | int }}; + underscores_in_headers on; + + location /nginx_status { + stub_status on; + allow 127.0.0.1; + deny all; + } + + location / { + deny all; + } + + location ~ ^/(predict/?|)$ { + limit_conn inflights {{ CORTEX_MAX_REPLICA_CONCURRENCY | int }}; + + add_header Access-Control-Allow-Origin "*"; + add_header Access-Control-Allow-Headers "*"; + add_header Access-Control-Allow-Methods "GET, POST"; + add_header Access-Control-Allow-Credentials "true"; + + proxy_set_header HOST $host; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-Proto $scheme; + + proxy_redirect off; + proxy_buffering off; + + proxy_pass http://uvicorn; + } + } +} diff --git a/pkg/workloads/cortex/serve/poll/readiness.sh b/pkg/workloads/cortex/serve/poll/readiness.sh new file mode 100644 index 0000000000..ee5fc347e9 --- /dev/null +++ b/pkg/workloads/cortex/serve/poll/readiness.sh @@ -0,0 +1,24 @@ +#!/usr/bin/with-contenv bash + +# Copyright 2020 Cortex Labs, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +while true; do + procs_ready="$(ls /mnt/workspace/proc-*-ready.txt 2>/dev/null | wc -l)" + if [ "$CORTEX_PROCESSES_PER_REPLICA" = "$procs_ready" ]; then + touch /mnt/workspace/api_readiness.txt + break + fi + sleep 1 +done diff --git a/pkg/workloads/cortex/serve/requirements.txt b/pkg/workloads/cortex/serve/requirements.txt index 3d165c92a3..d4e38eabd6 100644 --- a/pkg/workloads/cortex/serve/requirements.txt +++ b/pkg/workloads/cortex/serve/requirements.txt @@ -8,3 +8,4 @@ python-multipart==0.0.5 pyyaml==5.3.1 requests==2.24.0 uvicorn==0.11.8 +jinja2==2.11.2 diff --git a/pkg/workloads/cortex/serve/serve.py b/pkg/workloads/cortex/serve/serve.py index 716ed7f9cd..c584156616 100644 --- a/pkg/workloads/cortex/serve/serve.py +++ b/pkg/workloads/cortex/serve/serve.py @@ -26,7 +26,6 @@ from fastapi import Body, FastAPI from fastapi.exceptions import RequestValidationError -from fastapi.middleware.cors import CORSMiddleware from starlette.requests import Request from starlette.responses import Response, PlainTextResponse, JSONResponse from starlette.background import BackgroundTasks @@ -51,14 +50,6 @@ app = FastAPI() -app.add_middleware( - CORSMiddleware, - allow_origins=["*"], - allow_credentials=True, - allow_methods=["*"], - allow_headers=["*"], -) - local_cache = { "api": None, "provider": None, @@ -77,7 +68,7 @@ def update_api_liveness(): @app.on_event("startup") def startup(): - open("/mnt/workspace/api_readiness.txt", "a").close() + open(f"/mnt/workspace/proc-{os.getpid()}-ready.txt", "a").close() update_api_liveness() @@ -88,6 +79,11 @@ def shutdown(): except: pass + try: + os.remove(f"/mnt/workspace/proc-{os.getpid()}-ready.txt") + except: + pass + try: os.remove("/mnt/workspace/api_liveness.txt") except: diff --git a/pkg/workloads/cortex/serve/batch.py b/pkg/workloads/cortex/serve/start/batch.py similarity index 100% rename from pkg/workloads/cortex/serve/batch.py rename to pkg/workloads/cortex/serve/start/batch.py diff --git a/pkg/workloads/cortex/serve/start/server.py b/pkg/workloads/cortex/serve/start/server.py new file mode 100644 index 0000000000..564b0bb7c9 --- /dev/null +++ b/pkg/workloads/cortex/serve/start/server.py @@ -0,0 +1,38 @@ +# Copyright 2020 Cortex Labs, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys + +import uvicorn +import yaml + + +def main(): + uds = sys.argv[1] + + with open("/src/cortex/serve/log_config.yaml", "r") as f: + log_config = yaml.load(f, yaml.FullLoader) + + uvicorn.run( + "cortex.serve.wsgi:app", + uds=uds, + forwarded_allow_ips="*", + proxy_headers=True, + log_config=log_config, + log_level="info", + ) + + +if __name__ == "__main__": + main()
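For illustration only (not part of the patch): the new bootloader.sh renders /run/nginx.conf by calling the util.render_jinja_template helper added above on /src/cortex/serve/nginx.conf.j2, which emits one unix-socket upstream entry per uvicorn process. A minimal sketch of that call, with hypothetical CORTEX_* values standing in for the real environment:

# Sketch of the nginx config rendering performed by bootloader.sh.
# The context values below are hypothetical placeholders; at runtime the
# bootloader passes os.environ, populated from the API spec.
from cortex.lib import util

context = {
    "CORTEX_PROCESSES_PER_REPLICA": "2",       # number of uvicorn processes / unix sockets
    "CORTEX_SERVING_PORT": "8888",             # hypothetical serving port
    "CORTEX_MAX_REPLICA_CONCURRENCY": "1024",  # used by nginx's limit_conn
}

# The rendered upstream block lists one socket per process, e.g.:
#   server unix:/run/uvicorn/proc-0.sock;
#   server unix:/run/uvicorn/proc-1.sock;
print(util.render_jinja_template("/src/cortex/serve/nginx.conf.j2", context))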