diff --git a/cli/cmd/const.go b/cli/cmd/const.go new file mode 100644 index 0000000000..4596751e9b --- /dev/null +++ b/cli/cmd/const.go @@ -0,0 +1,21 @@ +/* +Copyright 2020 Cortex Labs, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package cmd + +const ( + _timeFormat = "02 Jan 06 15:04:05 MST" +) diff --git a/cli/cmd/lib_batch_apis.go b/cli/cmd/lib_batch_apis.go index 30b8a9af2e..64150aea85 100644 --- a/cli/cmd/lib_batch_apis.go +++ b/cli/cmd/lib_batch_apis.go @@ -38,7 +38,6 @@ const ( _titleBatchAPI = "batch api" _titleJobCount = "running jobs" _titleLatestJobID = "latest job id" - _timeFormat = "02 Jan 2006 15:04:05 MST" ) func batchAPIsTable(batchAPIs []schema.APIResponse, envNames []string) table.Table { diff --git a/cli/cmd/lib_realtime_apis.go b/cli/cmd/lib_realtime_apis.go index 987ddc4b53..9492749be4 100644 --- a/cli/cmd/lib_realtime_apis.go +++ b/cli/cmd/lib_realtime_apis.go @@ -22,12 +22,12 @@ import ( "io/ioutil" "net/http" "sort" + "strconv" "strings" "time" "github.com/cortexlabs/cortex/cli/types/cliconfig" "github.com/cortexlabs/cortex/pkg/consts" - "github.com/cortexlabs/cortex/pkg/lib/cast" "github.com/cortexlabs/cortex/pkg/lib/console" "github.com/cortexlabs/cortex/pkg/lib/errors" "github.com/cortexlabs/cortex/pkg/lib/json" @@ -70,8 +70,8 @@ func realtimeAPITable(realtimeAPI schema.APIResponse, env cliconfig.Environment) out += fmt.Sprintf("\n%s curl %s -X POST -H \"Content-Type: application/json\" -d @sample.json\n", console.Bold("example curl:"), realtimeAPI.Endpoint) - if realtimeAPI.Spec.Predictor.Type == userconfig.TensorFlowPredictorType || realtimeAPI.Spec.Predictor.Type == userconfig.ONNXPredictorType { - out += "\n" + describeModelInput(realtimeAPI.Status, realtimeAPI.Endpoint) + if !(realtimeAPI.Spec.Predictor.Type == userconfig.PythonPredictorType && realtimeAPI.Spec.Predictor.ModelPath == nil && realtimeAPI.Spec.Predictor.Models == nil) { + out += "\n" + describeModelInput(realtimeAPI.Status, realtimeAPI.Spec.Predictor, realtimeAPI.Endpoint) } out += titleStr("configuration") + strings.TrimSpace(realtimeAPI.Spec.UserStr(env.Provider)) @@ -232,67 +232,40 @@ func classificationMetricsStr(metrics *metrics.Metrics) string { return out } -func describeModelInput(status *status.Status, apiEndpoint string) string { +func describeModelInput(status *status.Status, predictor *userconfig.Predictor, apiEndpoint string) string { if status.Updated.Ready+status.Stale.Ready == 0 { - return "the model's input schema will be available when the api is live\n" + return "the models' metadata schema will be available when the api is live\n" } - apiSummary, err := getAPISummary(apiEndpoint) - if err != nil { - return "error retrieving the model's input schema: " + errors.Message(err) + "\n" - } - - numRows := 0 - for _, inputSignatures := range apiSummary.ModelSignatures { - numRows += len(inputSignatures) - } - - usesDefaultModel := false - rows := make([][]interface{}, numRows) - rowNum := 0 - for modelName, inputSignatures := range apiSummary.ModelSignatures { - for inputName, 
inputSignature := range inputSignatures { - shapeStr := make([]string, len(inputSignature.Shape)) - for idx, dim := range inputSignature.Shape { - shapeStr[idx] = s.ObjFlatNoQuotes(dim) - } - - shapeRowEntry := "" - if len(shapeStr) == 1 && shapeStr[0] == "scalar" { - shapeRowEntry = "scalar" - } else if len(shapeStr) == 1 && shapeStr[0] == "unknown" { - shapeRowEntry = "unknown" - } else { - shapeRowEntry = "(" + strings.Join(shapeStr, ", ") + ")" - } - rows[rowNum] = []interface{}{ - modelName, - inputName, - inputSignature.Type, - shapeRowEntry, - } - rowNum++ + cachingEnabled := predictor.Models != nil && predictor.Models.CacheSize != nil && predictor.Models.DiskCacheSize != nil + if predictor.Type == userconfig.TensorFlowPredictorType && !cachingEnabled { + apiTFLiveReloadingSummary, err := getAPITFLiveReloadingSummary(apiEndpoint) + if err != nil { + return "error retrieving the models' metadata schema: " + errors.Message(err) + "\n" } - if modelName == consts.SingleModelName { - usesDefaultModel = true + t, err := parseAPITFLiveReloadingSummary(apiTFLiveReloadingSummary) + if err != nil { + return "error retrieving the model's input schema: " + errors.Message(err) + "\n" } + return t } - inputTitle := "input" - if usesDefaultModel { - inputTitle = "model input" + apiModelSummary, err := getAPIModelSummary(apiEndpoint) + if err != nil { + return "error retrieving the models' metadata schema: " + errors.Message(err) + "\n" } - t := table.Table{ - Headers: []table.Header{ - {Title: "model name", MaxWidth: 32, Hidden: usesDefaultModel}, - {Title: inputTitle, MaxWidth: 32}, - {Title: "type", MaxWidth: 10}, - {Title: "shape", MaxWidth: 20}, - }, - Rows: rows, + t, err := parseAPIModelSummary(apiModelSummary) + if err != nil { + return "error retrieving the models' metadata schema: " + errors.Message(err) + "\n" } + return t +} - return t.MustFormat() +func getModelFromModelID(modelID string) (modelName string, modelVersion int64, err error) { + splitIndex := strings.LastIndex(modelID, "-") + modelName = modelID[:splitIndex] + modelVersion, err = strconv.ParseInt(modelID[splitIndex+1:], 10, 64) + return } func makeRequest(request *http.Request) (http.Header, []byte, error) { @@ -324,7 +297,26 @@ func makeRequest(request *http.Request) (http.Header, []byte, error) { return response.Header, bodyBytes, nil } -func getAPISummary(apiEndpoint string) (*schema.APISummary, error) { +func getAPIModelSummary(apiEndpoint string) (*schema.APIModelSummary, error) { + req, err := http.NewRequest("GET", apiEndpoint, nil) + if err != nil { + return nil, errors.Wrap(err, "unable to request api summary") + } + req.Header.Set("Content-Type", "application/json") + _, response, err := makeRequest(req) + if err != nil { + return nil, err + } + + var apiModelSummary schema.APIModelSummary + err = json.DecodeWithNumber(response, &apiModelSummary) + if err != nil { + return nil, errors.Wrap(err, "unable to parse api summary response") + } + return &apiModelSummary, nil +} + +func getAPITFLiveReloadingSummary(apiEndpoint string) (*schema.APITFLiveReloadingSummary, error) { req, err := http.NewRequest("GET", apiEndpoint, nil) if err != nil { return nil, errors.Wrap(err, "unable to request api summary") @@ -335,17 +327,179 @@ func getAPISummary(apiEndpoint string) (*schema.APISummary, error) { return nil, err } - var apiSummary schema.APISummary - err = json.DecodeWithNumber(response, &apiSummary) + var apiTFLiveReloadingSummary schema.APITFLiveReloadingSummary + err = json.DecodeWithNumber(response, 
&apiTFLiveReloadingSummary) if err != nil { return nil, errors.Wrap(err, "unable to parse api summary response") } + return &apiTFLiveReloadingSummary, nil +} - for _, inputSignatures := range apiSummary.ModelSignatures { - for _, inputSignature := range inputSignatures { - inputSignature.Shape = cast.JSONNumbers(inputSignature.Shape) +func parseAPIModelSummary(summary *schema.APIModelSummary) (string, error) { + rows := make([][]interface{}, 0) + + for modelName, modelMetadata := range summary.ModelMetadata { + latestVersion := int64(0) + for _, version := range modelMetadata.Versions { + v, err := strconv.ParseInt(version, 10, 64) + if err != nil { + return "", err + } + if v > latestVersion { + latestVersion = v + } } + latestStrVersion := strconv.FormatInt(latestVersion, 10) + + for idx, version := range modelMetadata.Versions { + var latestTag string + if latestStrVersion == version { + latestTag = " (latest)" + } + + timestamp := modelMetadata.Timestamps[idx] + date := time.Unix(timestamp, 0) + + rows = append(rows, []interface{}{ + modelName, + version + latestTag, + date.Format(_timeFormat), + }) + } + } + + _, usesCortexDefaultModelName := summary.ModelMetadata[consts.SingleModelName] + + t := table.Table{ + Headers: []table.Header{ + { + Title: "model name", + MaxWidth: 32, + Hidden: usesCortexDefaultModelName, + }, + { + Title: "model version", + MaxWidth: 25, + }, + { + Title: "edit time", + MaxWidth: 32, + }, + }, + Rows: rows, + } + + return t.MustFormat(), nil +} + +func parseAPITFLiveReloadingSummary(summary *schema.APITFLiveReloadingSummary) (string, error) { + latestVersions := make(map[string]int64) + + numRows := 0 + models := make(map[string]schema.GenericModelMetadata, 0) + for modelID, modelMetadata := range summary.ModelMetadata { + timestamp := modelMetadata.Timestamp + modelName, modelVersion, err := getModelFromModelID(modelID) + if err != nil { + return "", err + } + if _, ok := models[modelName]; !ok { + models[modelName] = schema.GenericModelMetadata{ + Versions: []string{strconv.FormatInt(modelVersion, 10)}, + Timestamps: []int64{timestamp}, + } + } else { + model := models[modelName] + model.Versions = append(model.Versions, strconv.FormatInt(modelVersion, 10)) + model.Timestamps = append(model.Timestamps, timestamp) + models[modelName] = model + } + if _, ok := latestVersions[modelName]; !ok { + latestVersions[modelName] = modelVersion + } else if modelVersion > latestVersions[modelName] { + latestVersions[modelName] = modelVersion + } + numRows += len(modelMetadata.InputSignatures) + } + + rows := make([][]interface{}, 0, numRows) + for modelName, model := range models { + latestVersion := latestVersions[modelName] + + for _, modelVersion := range model.Versions { + modelID := fmt.Sprintf("%s-%s", modelName, modelVersion) + + inputSignatures := summary.ModelMetadata[modelID].InputSignatures + timestamp := summary.ModelMetadata[modelID].Timestamp + versionInt, err := strconv.ParseInt(modelVersion, 10, 64) + if err != nil { + return "", err + } + + var applicableTags string + if versionInt == latestVersion { + applicableTags = " (latest)" + } + + date := time.Unix(timestamp, 0) + + for inputName, inputSignature := range inputSignatures { + shapeStr := make([]string, len(inputSignature.Shape)) + for idx, dim := range inputSignature.Shape { + shapeStr[idx] = s.ObjFlatNoQuotes(dim) + } + shapeRowEntry := "" + if len(shapeStr) == 1 && shapeStr[0] == "scalar" { + shapeRowEntry = "scalar" + } else if len(shapeStr) == 1 && shapeStr[0] == "unknown" { + shapeRowEntry 
= "unknown" + } else { + shapeRowEntry = "(" + strings.Join(shapeStr, ", ") + ")" + } + rows = append(rows, []interface{}{ + modelName, + modelVersion + applicableTags, + inputName, + inputSignature.Type, + shapeRowEntry, + date.Format(_timeFormat), + }) + } + } + } + + _, usesCortexDefaultModelName := summary.ModelMetadata[consts.SingleModelName] + + t := table.Table{ + Headers: []table.Header{ + { + Title: "model name", + MaxWidth: 32, + Hidden: usesCortexDefaultModelName, + }, + { + Title: "model version", + MaxWidth: 25, + }, + { + Title: "model input", + MaxWidth: 32, + }, + { + Title: "type", + MaxWidth: 10, + }, + { + Title: "shape", + MaxWidth: 20, + }, + { + Title: "edit time", + MaxWidth: 32, + }, + }, + Rows: rows, } - return &apiSummary, nil + return t.MustFormat(), nil } diff --git a/cli/local/api.go b/cli/local/api.go index be4b254188..c1689cd58c 100644 --- a/cli/local/api.go +++ b/cli/local/api.go @@ -35,7 +35,7 @@ import ( var _deploymentID = "local" -func UpdateAPI(apiConfig *userconfig.API, configPath string, projectID string, deployDisallowPrompt bool, awsClient *aws.Client) (*schema.APIResponse, string, error) { +func UpdateAPI(apiConfig *userconfig.API, models []spec.CuratedModelResource, configPath string, projectID string, deployDisallowPrompt bool, awsClient *aws.Client) (*schema.APIResponse, string, error) { var incompatibleVersion string encounteredVersionMismatch := false prevAPISpec, err := FindAPISpec(apiConfig.Name) @@ -71,15 +71,12 @@ func UpdateAPI(apiConfig *userconfig.API, configPath string, projectID string, d return nil, "", err } - newAPISpec := spec.GetAPISpec(apiConfig, projectID, _deploymentID, "") + newAPISpec := spec.GetAPISpec(apiConfig, models, projectID, _deploymentID, "") - // apiConfig.Predictor.ModelPath was already added to apiConfig.Predictor.Models for ease of use - if len(apiConfig.Predictor.Models) > 0 { - localModelCaches, err := CacheModels(newAPISpec, awsClient) - if err != nil { + if newAPISpec != nil && newAPISpec.TotalLocalModelVersions() > 0 { + if err := CacheLocalModels(newAPISpec); err != nil { return nil, "", err } - newAPISpec.LocalModelCaches = localModelCaches } newAPISpec.LocalProjectDir = files.Dir(configPath) @@ -91,7 +88,7 @@ func UpdateAPI(apiConfig *userconfig.API, configPath string, projectID string, d if prevAPISpec != nil || len(prevAPIContainers) != 0 { err = errors.FirstError( DeleteAPI(newAPISpec.Name), - DeleteCachedModels(newAPISpec.Name, prevAPISpec.SubtractLocalModelIDs(newAPISpec)), + deleteCachedModels(newAPISpec.Name, prevAPISpec.SubtractModelIDs(newAPISpec)), ) if err != nil { return nil, "", err @@ -101,13 +98,13 @@ func UpdateAPI(apiConfig *userconfig.API, configPath string, projectID string, d err = writeAPISpec(newAPISpec) if err != nil { DeleteAPI(newAPISpec.Name) - DeleteCachedModels(newAPISpec.Name, newAPISpec.LocalModelIDs()) + deleteCachedModels(newAPISpec.Name, newAPISpec.ModelIDs()) return nil, "", err } if err := DeployContainers(newAPISpec, awsClient); err != nil { DeleteAPI(newAPISpec.Name) - DeleteCachedModels(newAPISpec.Name, newAPISpec.LocalModelIDs()) + deleteCachedModels(newAPISpec.Name, newAPISpec.ModelIDs()) return nil, "", err } @@ -162,7 +159,7 @@ func areAPIsEqual(a1, a2 *spec.API) bool { if a1.SpecID != a2.SpecID { return false } - if !strset.FromSlice(a1.LocalModelIDs()).IsEqual(strset.FromSlice(a2.LocalModelIDs())) { + if !strset.FromSlice(a1.ModelIDs()).IsEqual(strset.FromSlice(a2.ModelIDs())) { return false } return true @@ -183,6 +180,13 @@ func DeleteAPI(apiName string) 
error { errList = append(errList, err) } + if ContainersHaveAPINameVolume(containers) { + err = DeleteVolume(apiName) + if err != nil { + errList = append(errList, err) + } + } + _, err = FindAPISpec(apiName) if err == nil { _, err := files.DeleteDirIfPresent(filepath.Join(_localWorkspaceDir, "apis", apiName)) diff --git a/cli/local/delete.go b/cli/local/delete.go index f422da4d09..b1355a76cc 100644 --- a/cli/local/delete.go +++ b/cli/local/delete.go @@ -69,10 +69,14 @@ func Delete(apiName string, keepCache, deleteForce bool) (schema.DeleteResponse, if keepCache { err = DeleteAPI(apiName) } else { - err = errors.FirstError( - DeleteAPI(apiName), - DeleteCachedModels(apiName, apiSpec.LocalModelIDs()), - ) + if apiSpec != nil { + err = errors.FirstError( + DeleteAPI(apiName), + deleteCachedModels(apiName, apiSpec.ModelIDs()), + ) + } else { + err = DeleteAPI(apiName) + } } if err != nil { return schema.DeleteResponse{}, err diff --git a/cli/local/deploy.go b/cli/local/deploy.go index fb0dada6f1..44f3fd9d81 100644 --- a/cli/local/deploy.go +++ b/cli/local/deploy.go @@ -67,7 +67,8 @@ func Deploy(env cliconfig.Environment, configPath string, projectFileList []stri return nil, err } - err = ValidateLocalAPIs(apiConfigs, projectFiles, awsClient) + models := []spec.CuratedModelResource{} + err = ValidateLocalAPIs(apiConfigs, &models, projectFiles, awsClient) if err != nil { err = errors.Append(err, fmt.Sprintf("\n\napi configuration schema for Realtime API can be found at https://docs.cortex.dev/v/%s/deployments/realtime-api/api-configuration", consts.CortexVersionMinor)) return nil, err @@ -81,7 +82,7 @@ func Deploy(env cliconfig.Environment, configPath string, projectFileList []stri results := make([]schema.DeployResult, len(apiConfigs)) for i := range apiConfigs { apiConfig := apiConfigs[i] - api, msg, err := UpdateAPI(&apiConfig, configPath, projectID, deployDisallowPrompt, awsClient) + api, msg, err := UpdateAPI(&apiConfig, models, configPath, projectID, deployDisallowPrompt, awsClient) results[i].Message = msg if err != nil { results[i].Error = errors.Message(err) diff --git a/cli/local/docker_spec.go b/cli/local/docker_spec.go index 6595ce7a66..67769a058a 100644 --- a/cli/local/docker_spec.go +++ b/cli/local/docker_spec.go @@ -43,6 +43,8 @@ const ( _defaultPortStr = "8888" _tfServingPortStr = "9000" _tfServingEmptyModelConfig = "/etc/tfs/model_config_server.conf" + _tfServingMaxReloadTimes = "0" + _tfServingLoadTimeMicros = "30000000" // 30 seconds _tfServingBatchConfig = "/etc/tfs/batch_config.conf" _projectDir = "/mnt/project" _cacheDir = "/mnt/cache" @@ -85,8 +87,6 @@ func getAPIEnv(api *spec.API, awsClient *aws.Client) []string { "CORTEX_SERVING_PORT="+_defaultPortStr, "CORTEX_PROVIDER="+"local", "CORTEX_CACHE_DIR="+_cacheDir, - "CORTEX_MODEL_DIR="+_modelDir, - "CORTEX_MODELS="+strings.Join(api.ModelNames(), ","), "CORTEX_API_SPEC="+filepath.Join("/mnt/workspace", filepath.Base(api.Key)), "CORTEX_PROJECT_DIR="+_projectDir, "CORTEX_PROCESSES_PER_REPLICA="+s.Int32(api.Predictor.ProcessesPerReplica), @@ -95,6 +95,10 @@ func getAPIEnv(api *spec.API, awsClient *aws.Client) []string { "AWS_REGION="+awsClient.Region, ) + if api.Predictor.ModelPath != nil || api.Predictor.Models != nil { + envs = append(envs, "CORTEX_MODEL_DIR="+_modelDir) + } + cortexPythonPath := _projectDir if api.Predictor.PythonPath != nil { cortexPythonPath = filepath.Join(_projectDir, *api.Predictor.PythonPath) @@ -139,23 +143,33 @@ func deployPythonContainer(api *spec.API, awsClient *aws.Client) error { } } + mounts := 
[]mount.Mount{ + { + Type: mount.TypeBind, + Source: api.LocalProjectDir, + Target: _projectDir, + }, + { + Type: mount.TypeBind, + Source: filepath.Join(_localWorkspaceDir, filepath.Dir(api.Key)), + Target: _workspaceDir, + }, + } + + for _, modelCache := range api.LocalModelCaches { + mounts = append(mounts, mount.Mount{ + Type: mount.TypeBind, + Source: modelCache.HostPath, + Target: filepath.Join(_modelDir, modelCache.TargetPath), + }) + } + hostConfig := &container.HostConfig{ PortBindings: nat.PortMap{ _defaultPortStr + "/tcp": []nat.PortBinding{portBinding}, }, Resources: resources, - Mounts: []mount.Mount{ - { - Type: mount.TypeBind, - Source: api.LocalProjectDir, - Target: _projectDir, - }, - { - Type: mount.TypeBind, - Source: filepath.Join(_localWorkspaceDir, filepath.Dir(api.Key)), - Target: _workspaceDir, - }, - }, + Mounts: mounts, } containerConfig := &container.Config{ @@ -316,6 +330,11 @@ func deployTensorFlowContainers(api *spec.API, awsClient *aws.Client) error { } } + modelVolume := api.Name + if err := DeleteVolume(modelVolume); err != nil { + return errors.Wrap(err, api.Identify()) + } + mounts := []mount.Mount{} for _, modelCache := range api.LocalModelCaches { mounts = append(mounts, mount.Mount{ @@ -324,6 +343,11 @@ func deployTensorFlowContainers(api *spec.API, awsClient *aws.Client) error { Target: filepath.Join(_modelDir, modelCache.TargetPath), }) } + mounts = append(mounts, mount.Mount{ + Type: mount.TypeVolume, + Source: modelVolume, + Target: _modelDir, + }) serveHostConfig := &container.HostConfig{ Resources: serveResources, @@ -334,6 +358,9 @@ func deployTensorFlowContainers(api *spec.API, awsClient *aws.Client) error { cmdArgs := []string{ "--port=" + _tfServingPortStr, "--model_config_file=" + _tfServingEmptyModelConfig, + "--max_num_load_retries=" + _tfServingMaxReloadTimes, + "--load_retry_interval_micros=" + _tfServingLoadTimeMicros, + fmt.Sprintf(`--grpc_channel_arguments="grpc.max_concurrent_streams=%d"`, api.Predictor.ProcessesPerReplica*api.Predictor.ThreadsPerProcess+10), } if api.Predictor.ServerSideBatching != nil { envVars = append(envVars, @@ -526,3 +553,25 @@ func DeleteContainers(apiName string) error { } return nil } + +func ContainersHaveAPINameVolume(containers []dockertypes.Container) bool { + for _, container := range containers { + apiName := container.Labels["apiName"] + for _, mounted := range container.Mounts { + if mounted.Type == mount.TypeVolume && mounted.Name == apiName { + return true + } + } + } + + return false +} + +func DeleteVolume(volumeName string) error { + if _, err := docker.MustDockerClient().VolumeInspect(context.Background(), volumeName); err == nil { + if err := docker.MustDockerClient().VolumeRemove(context.Background(), volumeName, false); err != nil { + return err + } + } + return nil +} diff --git a/cli/local/model_cache.go b/cli/local/model_cache.go index 82b30dd650..996798679f 100644 --- a/cli/local/model_cache.go +++ b/cli/local/model_cache.go @@ -18,147 +18,127 @@ package local import ( "fmt" - "os" "path/filepath" "strings" - "time" - "github.com/cortexlabs/cortex/cli/types/flags" - "github.com/cortexlabs/cortex/pkg/lib/archive" - "github.com/cortexlabs/cortex/pkg/lib/aws" - "github.com/cortexlabs/cortex/pkg/lib/cron" + "github.com/cortexlabs/cortex/pkg/consts" "github.com/cortexlabs/cortex/pkg/lib/errors" "github.com/cortexlabs/cortex/pkg/lib/files" - "github.com/cortexlabs/cortex/pkg/lib/print" "github.com/cortexlabs/cortex/pkg/lib/sets/strset" s "github.com/cortexlabs/cortex/pkg/lib/strings" 
"github.com/cortexlabs/cortex/pkg/types/spec" "github.com/cortexlabs/cortex/pkg/types/userconfig" ) -func CacheModels(apiSpec *spec.API, awsClient *aws.Client) ([]*spec.LocalModelCache, error) { - modelPaths := make([]string, len(apiSpec.Predictor.Models)) - for i, modelResource := range apiSpec.Predictor.Models { - modelPaths[i] = modelResource.ModelPath - } +func CacheLocalModels(apiSpec *spec.API) error { + var err error + var wasAlreadyCached bool + var localModelCache *spec.LocalModelCache + localModelCaches := make([]*spec.LocalModelCache, 0) - uncachedModelCount := 0 + modelsThatWereCachedAlready := 0 + for i, model := range apiSpec.CuratedModelResources { + if model.S3Path { + continue + } - localModelCaches := make([]*spec.LocalModelCache, len(modelPaths)) - for i, modelPath := range modelPaths { - var err error - modelCacheID, err := modelCacheID(modelPath, awsClient) + localModelCache, wasAlreadyCached, err = cacheLocalModel(model) if err != nil { if apiSpec.Predictor.ModelPath != nil { - return nil, errors.Wrap(err, apiSpec.Identify(), userconfig.PredictorKey, userconfig.ModelPathKey) + return errors.Wrap(err, apiSpec.Identify(), userconfig.PredictorKey, userconfig.ModelPathKey) + } else if apiSpec.Predictor.Models != nil && apiSpec.Predictor.Models.Dir != nil { + return errors.Wrap(err, apiSpec.Identify(), userconfig.PredictorKey, userconfig.ModelsKey, userconfig.ModelsDirKey, apiSpec.CuratedModelResources[i].Name, *apiSpec.Predictor.Models.Dir) } - return nil, errors.Wrap(err, apiSpec.Identify(), userconfig.PredictorKey, userconfig.ModelsKey, apiSpec.Predictor.Models[i].Name, userconfig.ModelPathKey) + return errors.Wrap(err, apiSpec.Identify(), userconfig.PredictorKey, userconfig.ModelsKey, userconfig.ModelsPathsKey, apiSpec.CuratedModelResources[i].Name, userconfig.ModelPathKey) } - - localModelCache := spec.LocalModelCache{ - ID: modelCacheID, - HostPath: filepath.Join(_modelCacheDir, modelCacheID), - TargetPath: apiSpec.Predictor.Models[i].Name, + if wasAlreadyCached { + modelsThatWereCachedAlready++ } - - if !files.IsFile(filepath.Join(localModelCache.HostPath, "_SUCCESS")) { - err = cacheModel(modelPath, localModelCache, awsClient) - if err != nil { - if apiSpec.Predictor.ModelPath != nil { - return nil, errors.Wrap(err, apiSpec.Identify(), userconfig.PredictorKey, userconfig.ModelPathKey) - } - return nil, errors.Wrap(err, apiSpec.Identify(), userconfig.PredictorKey, userconfig.ModelsKey, apiSpec.Predictor.Models[i].Name, userconfig.ModelPathKey) - } - uncachedModelCount++ + if len(model.Versions) == 0 { + localModelCache.TargetPath = filepath.Join(apiSpec.CuratedModelResources[i].Name, "1") + } else { + localModelCache.TargetPath = apiSpec.CuratedModelResources[i].Name } - localModelCaches[i] = &localModelCache + localModelCaches = append(localModelCaches, localModelCache) } + apiSpec.LocalModelCaches = localModelCaches - if uncachedModelCount > 0 { - localPrintln("") // Newline to group all of the model information + if len(localModelCaches) > modelsThatWereCachedAlready { + fmt.Println("") // Newline to group all of the model information } - return localModelCaches, nil + return nil } -func modelCacheID(modelPath string, awsClient *aws.Client) (string, error) { - if strings.HasPrefix(modelPath, "s3://") { - awsClientForBucket, err := aws.NewFromClientS3Path(modelPath, awsClient) - if err != nil { - return "", err - } - bucket, prefix, err := aws.SplitS3Path(modelPath) - if err != nil { - return "", err - } - hash, err := awsClientForBucket.HashS3Dir(bucket, prefix, 
nil) - if err != nil { - return "", err - } - return hash, nil +func cacheLocalModel(model spec.CuratedModelResource) (*spec.LocalModelCache, bool, error) { + localModelCache := spec.LocalModelCache{} + var err error + + if model.S3Path { + return nil, false, nil } - hash, err := localModelHash(modelPath) + hash, err := localModelHash(model.ModelPath) if err != nil { - return "", err + return nil, false, err } - return hash, nil -} + localModelCache.ID = hash -func cacheModel(modelPath string, localModelCache spec.LocalModelCache, awsClient *aws.Client) error { - modelDir := localModelCache.HostPath + destModelDir := filepath.Join(_modelCacheDir, localModelCache.ID) - if files.IsFile(filepath.Join(modelDir, "_SUCCESS")) { - return nil + if files.IsDir(destModelDir) { + if len(model.Versions) == 0 { + localModelCache.HostPath = filepath.Join(destModelDir, "1") + } else { + localModelCache.HostPath = destModelDir + } + return &localModelCache, true, nil } - err := ResetModelCacheDir(modelDir) + err = resetModelCacheDir(destModelDir) if err != nil { - return err + return nil, false, err } - - if strings.HasPrefix(modelPath, "s3://") { - awsClientForBucket, err := aws.NewFromClientS3Path(modelPath, awsClient) - if err != nil { - return err + if len(model.Versions) == 0 { + if _, err := files.CreateDirIfMissing(filepath.Join(destModelDir, "1")); err != nil { + return nil, false, err } + } - err = downloadModel(modelPath, modelDir, awsClientForBucket) - if err != nil { - return err + if model.Name == consts.SingleModelName { + switch len(model.Versions) { + case 0: + fmt.Println("○ caching model ...") + case 1: + fmt.Println(fmt.Sprintf("○ caching model (version %d) ...", model.Versions[0])) + default: + fmt.Println(fmt.Sprintf("○ caching model (versions %s) ...", s.UserStrsAnd(model.Versions))) } + } else { - if strings.HasSuffix(modelPath, ".zip") { - err := unzipAndValidate(modelPath, modelPath, modelDir) - if err != nil { - return err - } - } else if strings.HasSuffix(modelPath, ".onnx") { - localPrintln(fmt.Sprintf("○ caching model %s ...", modelPath)) - err := files.CopyFileOverwrite(modelPath, filepath.Join(modelDir, filepath.Base(modelPath))) - if err != nil { - return err - } - } else { - localPrintln(fmt.Sprintf("○ caching model %s ...", modelPath)) - tfModelVersion := filepath.Base(modelPath) - err := files.CopyDirOverwrite(strings.TrimSuffix(modelPath, "/"), s.EnsureSuffix(filepath.Join(modelDir, tfModelVersion), "/")) - if err != nil { - return err - } + switch len(model.Versions) { + case 0: + fmt.Println(fmt.Sprintf("○ caching model %s ...", model.Name)) + case 1: + fmt.Println(fmt.Sprintf("○ caching model %s (version %d) ...", model.Name, model.Versions[0])) + default: + fmt.Println(fmt.Sprintf("○ caching model %s (versions %s) ...", model.Name, s.UserStrsAnd(model.Versions))) } } - err = files.MakeEmptyFile(filepath.Join(modelDir, "_SUCCESS")) - if err != nil { - return err + if len(model.Versions) == 0 { + destModelDir = filepath.Join(destModelDir, "1") + } + if err := files.CopyDirOverwrite(strings.TrimSuffix(model.ModelPath, "/"), s.EnsureSuffix(destModelDir, "/")); err != nil { + return nil, false, err } - return nil + localModelCache.HostPath = destModelDir + return &localModelCache, false, nil } -func DeleteCachedModels(apiName string, modelsToDelete []string) error { +func deleteCachedModels(apiName string, modelsToDelete []string) error { var errList []error modelsInUse := strset.New() apiSpecList, err := ListAPISpecs() @@ -178,13 +158,13 @@ func 
DeleteCachedModels(apiName string, modelsToDelete []string) error { strset.FromSlice(modelsToDelete), modelsInUse, ) - err = DeleteCachedModelsByID(toDeleteModels.Slice()) + err = deleteCachedModelsByID(toDeleteModels.Slice()) errList = append(errList, err) return errors.FirstError(errList...) } -func DeleteCachedModelsByID(modelIDs []string) error { +func deleteCachedModelsByID(modelIDs []string) error { errList := []error{} for _, modelID := range modelIDs { err := files.DeleteDir(filepath.Join(_modelCacheDir, modelID)) @@ -196,86 +176,6 @@ func DeleteCachedModelsByID(modelIDs []string) error { return errors.FirstError(errList...) } -func downloadModel(modelPath string, modelDir string, awsClientForBucket *aws.Client) error { - localPrintf("○ downloading model %s ", modelPath) - defer localPrint(" ✓\n") - if OutputType != flags.JSONOutputType { - dotCron := cron.Run(print.Dot, nil, 2*time.Second) - defer dotCron.Cancel() - } - - bucket, prefix, err := aws.SplitS3Path(modelPath) - if err != nil { - return err - } - - if strings.HasSuffix(modelPath, ".zip") || strings.HasSuffix(modelPath, ".onnx") { - localPath := filepath.Join(modelDir, filepath.Base(modelPath)) - err := awsClientForBucket.DownloadFileFromS3(bucket, prefix, localPath) - if err != nil { - return err - } - if strings.HasSuffix(modelPath, ".zip") { - err := unzipAndValidate(modelPath, localPath, modelDir) - if err != nil { - return err - } - err = os.Remove(localPath) - if err != nil { - return err - } - } - } else { - tfModelVersion := filepath.Base(prefix) - err := awsClientForBucket.DownloadDirFromS3(bucket, prefix, filepath.Join(modelDir, tfModelVersion), true, nil) - if err != nil { - return err - } - } - - return nil -} - -func unzipAndValidate(originalModelPath string, zipFile string, destPath string) error { - localPrintln(fmt.Sprintf("○ unzipping model %s ...", originalModelPath)) - tmpDir := filepath.Join(filepath.Dir(destPath), filepath.Base(destPath)+"-tmp") - err := files.CreateDir(tmpDir) - if err != nil { - return err - } - - _, err = archive.UnzipFileToDir(zipFile, tmpDir) - if err != nil { - return err - } - - // returns a tensorflow directory with the version as the suffix of the path - tensorflowDir, err := spec.GetTFServingExportFromLocalPath(tmpDir) - if err != nil { - return err - } - - isValid, err := spec.IsValidTensorFlowLocalDirectory(tensorflowDir) - if err != nil { - return err - } else if !isValid { - return ErrorInvalidTensorFlowZip() - } - - destPathWithVersion := filepath.Join(destPath, filepath.Base(tensorflowDir)) - err = os.Rename(strings.TrimSuffix(tensorflowDir, "/"), strings.TrimSuffix(destPathWithVersion, "/")) - if err != nil { - return errors.WithStack(err) - } - - err = files.DeleteDir(tmpDir) - if err != nil { - return err - } - - return nil -} - func localModelHash(modelPath string) (string, error) { var err error modelHash := "" @@ -297,7 +197,7 @@ func localModelHash(modelPath string) (string, error) { return modelHash, nil } -func ResetModelCacheDir(modelDir string) error { +func resetModelCacheDir(modelDir string) error { _, err := files.DeleteDirIfPresent(modelDir) if err != nil { return err diff --git a/cli/local/validations.go b/cli/local/validations.go index 6e202cece1..5e90e9c196 100644 --- a/cli/local/validations.go +++ b/cli/local/validations.go @@ -106,7 +106,7 @@ func (projectFiles ProjectFiles) ProjectDir() string { return projectFiles.projectRoot } -func ValidateLocalAPIs(apis []userconfig.API, projectFiles ProjectFiles, awsClient *aws.Client) error { +func 
ValidateLocalAPIs(apis []userconfig.API, models *[]spec.CuratedModelResource, projectFiles ProjectFiles, awsClient *aws.Client) error { if len(apis) == 0 { return spec.ErrorNoAPIs() } @@ -119,7 +119,7 @@ func ValidateLocalAPIs(apis []userconfig.API, projectFiles ProjectFiles, awsClie for i := range apis { api := &apis[i] - if err := spec.ValidateAPI(api, projectFiles, types.LocalProviderType, awsClient, nil); err != nil { + if err := spec.ValidateAPI(api, models, projectFiles, types.LocalProviderType, awsClient, nil); err != nil { return errors.Wrap(err, api.Identify()) } diff --git a/docs/deployments/realtime-api/api-configuration.md b/docs/deployments/realtime-api/api-configuration.md index 577874f138..df7ae6bd65 100644 --- a/docs/deployments/realtime-api/api-configuration.md +++ b/docs/deployments/realtime-api/api-configuration.md @@ -14,11 +14,20 @@ Reference the section below which corresponds to your Predictor type: [Python](# predictor: type: python path: # path to a python file with a PythonPredictor class definition, relative to the Cortex root (required) + model_path: # S3 path to an exported model directory (e.g. s3://my-bucket/exported_model/) (optional, cannot be provided along with 'models') + models: # use this to serve multiple models in a single API (optional, cannot be provided along with 'model_path') + dir: # S3 path to a directory containing multiple models (e.g. s3://my-bucket/models/) (either this or 'paths' must be provided) + paths: # list of S3 paths to exported model directories (either this or 'dir' must be provided) + - name: # unique name for the model (e.g. text-generator) (required) + model_path: # S3 path to an exported model directory (e.g. s3://my-bucket/exported_model/) (required) + ... + cache_size: # the number models to keep in memory (optional; all models are kept in memory by default) + disk_cache_size: # the number of models to keep on disk (optional; all models are kept on disk by default) processes_per_replica: # the number of parallel serving processes to run on each replica (default: 1) threads_per_process: # the number of threads per process (default: 1) config: # arbitrary dictionary passed to the constructor of the Predictor (optional) python_path: # path to the root of your Python folder that will be appended to PYTHONPATH (default: folder containing cortex.yaml) - image: # docker image to use for the Predictor (default: cortexlabs/python-predictor-cpu or cortexlabs/python-predictor-gpu based on compute) + image: # docker image to use for the Predictor (default: cortexlabs/python-predictor-cpu or cortexlabs/python-predictor-gpu based on compute) env: # dictionary of environment variables networking: endpoint: # the endpoint for the API (aws only) (default: ) @@ -27,7 +36,7 @@ Reference the section below which corresponds to your Predictor type: [Python](# compute: cpu: # CPU request per replica, e.g. 200m or 1 (200m is equivalent to 0.2) (default: 200m) gpu: # GPU request per replica (default: 0) - inf: # Inferentia ASIC request per replica (default: 0) + inf: # Inferentia ASIC request per replica (default: 0) mem: # memory request per replica, e.g. 200Mi or 1Gi (default: Null) monitoring: # (aws only) model_type: # must be "classification" or "regression", so responses can be interpreted correctly (i.e. 
categorical vs continuous) (required) @@ -50,7 +59,7 @@ Reference the section below which corresponds to your Predictor type: [Python](# max_unavailable: # maximum number of replicas that can be unavailable during an update; can be an absolute number, e.g. 5, or a percentage of desired replicas, e.g. 10% (default: 25%) ``` -See additional documentation for [parallelism](parallelism.md), [autoscaling](autoscaling.md), [compute](../compute.md), [networking](../networking.md), [prediction monitoring](prediction-monitoring.md), and [overriding API images](../system-packages.md). +See additional documentation for [models](models.md), [parallelism](parallelism.md), [autoscaling](autoscaling.md), [compute](../compute.md), [networking](../networking.md), [prediction monitoring](prediction-monitoring.md), and [overriding API images](../system-packages.md). ## TensorFlow Predictor @@ -60,13 +69,18 @@ See additional documentation for [parallelism](parallelism.md), [autoscaling](au predictor: type: tensorflow path: # path to a python file with a TensorFlowPredictor class definition, relative to the Cortex root (required) - model_path: # S3 path to an exported model (e.g. s3://my-bucket/exported_model) (either this or 'models' must be provided) + model_path: # S3 path to an exported model directory (e.g. s3://my-bucket/exported_model/) (either this or 'models' must be provided) signature_key: # name of the signature def to use for prediction (required if your model has more than one signature def) - models: # use this when multiple models per API are desired (either this or 'model_path' must be provided) - - name: # unique name for the model (e.g. text-generator) (required) - model_path: # S3 path to an exported model (e.g. s3://my-bucket/exported_model) (required) - signature_key: # name of the signature def to use for prediction (required if your model has more than one signature def) - ... + models: # use this to serve multiple models in a single API (either this or 'model_path' must be provided) + dir: # S3 path to a directory containing multiple models (e.g. s3://my-bucket/models/) (either this or 'paths' must be provided) + paths: # list of S3 paths to exported model directories (either this or 'dir' must be provided) + - name: # unique name for the model (e.g. text-generator) (required) + model_path: # S3 path to an exported model directory (e.g. s3://my-bucket/exported_model/) (required) + signature_key: # name of the signature def to use for prediction (required if your model has more than one signature def) + ... 
+ signature_key: # name of the signature def to use for prediction for 'dir'-specified models or for models specified using 'paths' that haven't had a signature key set + cache_size: # the number models to keep in memory (optional; all models are kept in memory by default) + disk_cache_size: # the number of models to keep on disk (optional; all models are kept on disk by default) server_side_batching: # (optional) max_batch_size: # the maximum number of requests to aggregate before running inference batch_interval: # the maximum amount of time to spend waiting for additional requests before running inference on the batch of requests @@ -74,8 +88,8 @@ See additional documentation for [parallelism](parallelism.md), [autoscaling](au threads_per_process: # the number of threads per process (default: 1) config: # arbitrary dictionary passed to the constructor of the Predictor (optional) python_path: # path to the root of your Python folder that will be appended to PYTHONPATH (default: folder containing cortex.yaml) - image: # docker image to use for the Predictor (default: cortexlabs/tensorflow-predictor) - tensorflow_serving_image: # docker image to use for the TensorFlow Serving container (default: cortexlabs/tensorflow-serving-gpu or cortexlabs/tensorflow-serving-cpu based on compute) + image: # docker image to use for the Predictor (default: cortexlabs/tensorflow-predictor) + tensorflow_serving_image: # docker image to use for the TensorFlow Serving container (default: cortexlabs/tensorflow-serving-gpu or cortexlabs/tensorflow-serving-cpu based on compute) env: # dictionary of environment variables networking: endpoint: # the endpoint for the API (aws only) (default: ) @@ -84,7 +98,7 @@ See additional documentation for [parallelism](parallelism.md), [autoscaling](au compute: cpu: # CPU request per replica, e.g. 200m or 1 (200m is equivalent to 0.2) (default: 200m) gpu: # GPU request per replica (default: 0) - inf: # Inferentia ASIC request per replica (default: 0) + inf: # Inferentia ASIC request per replica (default: 0) mem: # memory request per replica, e.g. 200Mi or 1Gi (default: Null) monitoring: # (aws only) model_type: # must be "classification" or "regression", so responses can be interpreted correctly (i.e. categorical vs continuous) (required) @@ -107,7 +121,7 @@ See additional documentation for [parallelism](parallelism.md), [autoscaling](au max_unavailable: # maximum number of replicas that can be unavailable during an update; can be an absolute number, e.g. 5, or a percentage of desired replicas, e.g. 10% (default: 25%) ``` -See additional documentation for [parallelism](parallelism.md), [autoscaling](autoscaling.md), [compute](../compute.md), [networking](../networking.md), [prediction monitoring](prediction-monitoring.md), and [overriding API images](../system-packages.md). +See additional documentation for [models](models.md), [parallelism](parallelism.md), [autoscaling](autoscaling.md), [compute](../compute.md), [networking](../networking.md), [prediction monitoring](prediction-monitoring.md), and [overriding API images](../system-packages.md). ## ONNX Predictor @@ -117,17 +131,21 @@ See additional documentation for [parallelism](parallelism.md), [autoscaling](au predictor: type: onnx path: # path to a python file with an ONNXPredictor class definition, relative to the Cortex root (required) - model_path: # S3 path to an exported model (e.g. 
s3://my-bucket/exported_model.onnx) (either this or 'models' must be provided) - models: # use this when multiple models per API are desired (either this or 'model_path' must be provided) - - name: # unique name for the model (e.g. text-generator) (required) - model_path: # S3 path to an exported model (e.g. s3://my-bucket/exported_model.onnx) (required) - signature_key: # name of the signature def to use for prediction (required if your model has more than one signature def) - ... + model_path: # S3 path to an exported model directory (e.g. s3://my-bucket/exported_model/) (either this or 'models' must be provided) + models: # use this to serve multiple models in a single API (either this or 'model_path' must be provided) + dir: # S3 path to a directory containing multiple models (e.g. s3://my-bucket/models/) (either this or 'paths' must be provided) + paths: # list of S3 paths to exported model directories (either this or 'dir' must be provided) + - name: # unique name for the model (e.g. text-generator) (required) + model_path: # S3 path to an exported model directory (e.g. s3://my-bucket/exported_model/) (required) + signature_key: # name of the signature def to use for prediction (required if your model has more than one signature def) + ... + cache_size: # the number models to keep in memory (optional; all models are kept in memory by default) + disk_cache_size: # the number of models to keep on disk (optional; all models are kept on disk by default) processes_per_replica: # the number of parallel serving processes to run on each replica (default: 1) threads_per_process: # the number of threads per process (default: 1) config: # arbitrary dictionary passed to the constructor of the Predictor (optional) python_path: # path to the root of your Python folder that will be appended to PYTHONPATH (default: folder containing cortex.yaml) - image: # docker image to use for the Predictor (default: cortexlabs/onnx-predictor-gpu or cortexlabs/onnx-predictor-cpu based on compute) + image: # docker image to use for the Predictor (default: cortexlabs/onnx-predictor-gpu or cortexlabs/onnx-predictor-cpu based on compute) env: # dictionary of environment variables networking: endpoint: # the endpoint for the API (aws only) (default: ) @@ -158,4 +176,4 @@ See additional documentation for [parallelism](parallelism.md), [autoscaling](au max_unavailable: # maximum number of replicas that can be unavailable during an update; can be an absolute number, e.g. 5, or a percentage of desired replicas, e.g. 10% (default: 25%) ``` -See additional documentation for [parallelism](parallelism.md), [autoscaling](autoscaling.md), [compute](../compute.md), [networking](../networking.md), [prediction monitoring](prediction-monitoring.md), and [overriding API images](../system-packages.md). +See additional documentation for [models](models.md), [parallelism](parallelism.md), [autoscaling](autoscaling.md), [compute](../compute.md), [networking](../networking.md), [prediction monitoring](prediction-monitoring.md), and [overriding API images](../system-packages.md). 
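For reference, here is an illustrative multi-model configuration for the ONNX predictor that composes the fields documented above; the API name, predictor file, and S3 paths are hypothetical placeholders rather than a prescribed layout:

```yaml
- name: classifier                # hypothetical API name
  kind: RealtimeAPI
  predictor:
    type: onnx
    path: predictor.py            # hypothetical file containing an ONNXPredictor class
    models:
      paths:
        - name: text-generator
          model_path: s3://my-bucket/models/text-generator/
        - name: sentiment-analyzer
          model_path: s3://my-bucket/models/sentiment-analyzer/
```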
diff --git a/docs/deployments/realtime-api/models.md b/docs/deployments/realtime-api/models.md new file mode 100644 index 0000000000..141119b806 --- /dev/null +++ b/docs/deployments/realtime-api/models.md @@ -0,0 +1,390 @@ +# Models + +_WARNING: you are on the master branch, please refer to the docs on the branch that matches your `cortex version`_ + +## Model directory format + +Whenever a model path is specified in an API configuration file, it should be a path to an S3 prefix (or a local directory if deploying locally) which contains your exported model. Directories may include a single model, or multiple folders each with a single model (note that a "single model" need not be a single file; there can be multiple files for a single model). When multiple folders are used, the folder names must be integer values, and will be interpreted as the model version. Model versions can be any integer, but are typically integer timestamps. It is always assumed that the highest version number is the latest version of your model. + +Each predictor type expects a different model format: + +### Python + +For the Python predictor, any model structure is accepted. Here is an example: + +```text + s3://my-bucket/models/text-generator/ + ├── model.pkl + └── data.txt +``` + +or for a versioned model: + +```text + s3://my-bucket/models/text-generator/ + ├── 1523423423/ (version number, usually a timestamp) + | ├── model.pkl + | └── data.txt + └── 2434389194/ (version number, usually a timestamp) + ├── model.pkl + └── data.txt +``` + +### TensorFlow + +For the TensorFlow predictor, the model path must be a SavedModel export: + +```text + s3://my-bucket/models/text-generator/ + ├── saved_model.pb + └── variables/ + ├── variables.index + ├── variables.data-00000-of-00003 + ├── variables.data-00001-of-00003 + └── variables.data-00002-of-... +``` + +or for a versioned model: + +```text + s3://my-bucket/models/text-generator/ + ├── 1523423423/ (version number, usually a timestamp) + | ├── saved_model.pb + | └── variables/ + | ├── variables.index + | ├── variables.data-00000-of-00003 + | ├── variables.data-00001-of-00003 + | └── variables.data-00002-of-... + └── 2434389194/ (version number, usually a timestamp) + ├── saved_model.pb + └── variables/ + ├── variables.index + ├── variables.data-00000-of-00003 + ├── variables.data-00001-of-00003 + └── variables.data-00002-of-... +``` + +#### Inferentia + +When Inferentia models are used, the directory structure is slightly different: + +```text + s3://my-bucket/models/text-generator/ + └── saved_model.pb +``` + +or for a versioned model: + +```text + s3://my-bucket/models/text-generator/ + ├── 1523423423/ (version number, usually a timestamp) + | └── saved_model.pb + └── 2434389194/ (version number, usually a timestamp) + └── saved_model.pb +``` + +### ONNX + +For the ONNX predictor, the model path must contain a single `*.onnx` file: + +```text + s3://my-bucket/models/text-generator/ + └── model.onnx +``` + +or for a versioned model: + +```text + s3://my-bucket/models/text-generator/ + ├── 1523423423/ (version number, usually a timestamp) + | └── model.onnx + └── 2434389194/ (version number, usually a timestamp) + └── model.onnx +``` + +## Single model + +The most common pattern is to serve a single model per API. The path to the model is specified in the `model_path` field in the `predictor` configuration. For example: + +```yaml +# cortex.yaml + +- name: iris-classifier + kind: RealtimeAPI + predictor: + # ... 
+ model_path: s3://my-bucket/models/text-generator/ +``` + +Note: for the Python predictor type, it is not necessary to specify the path to your model in `model_path`, since you can download and load it in your predictor's `__init__()` function. That said, it is necessary to use the `model_path` field to take advantage of [live model reloading](#live-model-reloading). + +## Multiple models + +It is possible to serve multiple models from a single API. The paths to the models are specified in the api configuration, either via the `models.paths` or `models.dir` field in the `predictor` configuration. For example: + +```yaml +# cortex.yaml + +- name: iris-classifier + kind: RealtimeAPI + predictor: + # ... + models: + paths: + - name: iris-classifier + path: s3://my-bucket/models/text-generator/ + # ... +``` + +or: + +```yaml +# cortex.yaml + +- name: iris-classifier + kind: RealtimeAPI + predictor: + # ... + models: + dir: s3://my-bucket/models/ +``` + +Note: for the Python predictor type, it is not necessary to specify the paths to your models in `models`, since you can download and load them in your predictor's `__init__()` function. That said, it is necessary to use the `models` field to take advantage of live reloading or multi model caching (see below). + +When using the `models.paths` field, each path must be a valid model directory (see above for valid model directory structures). + +When using the `models.dir` field, the directory provided may contain multiple subdirectories, each of which is a valid model directory. For example: + +```text + s3://my-bucket/models/ + ├── text-generator + | └── * (model files) + └── sentiment-analyzer + ├── 24753823/ + | └── * (model files) + └── 26234288/ + └── * (model files) +``` + +In this case, there are two models in the directory, one of which is named "text-generator", and the other is named "sentiment-analyzer". + + +Additional examples can be seen in the [multi model guide](../../guides/multi-model.md) and in [examples/model-caching](https://github.com/cortexlabs/cortex/tree/master/examples/model-caching) (remove the `cache_size` and `disk_cache_size` configurations in `cortex.yaml` to disable [multi model caching](#multi-model-caching)). + +## Live model reloading + +Live model reloading is a mechanism that periodically checks for updated models in the model path(s) provided in `predictor.model_path` or `predictor.models`. It is automatically enabled for all predictor types, including the Python predictor type (as long as model paths are specified via `model_path` or `models` in the `predictor` configuration). + +The following is a list of events that will trigger the API to update its model(s): + +* A new model is added to the model directory. +* A model is removed from the model directory. +* A model changes its directory structure. +* A file in the model directory is updated in-place. + + +Examples can be seen in [examples/live-reloading](https://github.com/cortexlabs/cortex/tree/master/examples/live-reloading). + +Usage varies based on the predictor type: + +### Python + +To use live model reloading with the Python predictor, the model path(s) must be specified in the API's `predictor` configuration (via the `model_path` or `models` field). 
When models are specified in this manner, your `PythonPredictor` class must implement the `load_model()` function, and models can be retrieved by using the `get_model()` method of the `python_client` that's passed to the predictor's constructor: + +```python +def get_model(model_name, model_version): + """ + Retrieve a model for inference. + + Args: + model_name (optional): Name of the model to retrieve (when multiple models are deployed in an API). + When predictor.models.paths is specified, model_name should be the name of one of the models listed in the API config. + When predictor.models.dir is specified, model_name should be the name of a top-level directory in the models dir. + model_version (string, optional): Version of the model to retrieve. Can be omitted or set to "latest" to select the highest version. + + Returns: + The model as loaded by the load_model() method. + """ +``` + +For example: + +```python +class PythonPredictor: + def __init__(self, config, python_client): + self.client = python_client + + def load_model(self, model_path): + # model_path is a path to your model's directory on disk + return load_from_disk(model_path) + + def predict(self, payload): + model = self.client.get_model() + return model.predict(payload) +``` + +When multiple models are being served in an API, `python_client.get_model()` can accept a model name: + +```python +class PythonPredictor: + # ... + + def predict(self, payload, query_params): + model = self.client.get_model(query_params["model"]) + return model.predict(payload) +``` + +`python_client.get_model()` can also accept a model version if a version other than the highest version number is desired: + +```python +class PythonPredictor: + # ... + + def predict(self, payload, query_params): + model = self.client.get_model(query_params["model"], query_params["version"]) + return model.predict(payload) +``` + +### TensorFlow + +When using the TensorFlow predictor, inference is performed by using the `predict()` method of the `tensorflow_client` that's passed to the predictor's constructor: + +```python +def predict(model_input, model_name, model_version) -> dict: + """ + Run prediction. + + Args: + model_input: Input to the model. + model_name (optional): Name of the model to retrieve (when multiple models are deployed in an API). + When predictor.models.paths is specified, model_name should be the name of one of the models listed in the API config. + When predictor.models.dir is specified, model_name should be the name of a top-level directory in the models dir. + model_version (string, optional): Version of the model to retrieve. Can be omitted or set to "latest" to select the highest version. + + Returns: + dict: TensorFlow Serving response converted to a dictionary. + """ +``` + +For example: + +```python +class TensorFlowPredictor: + def __init__(self, tensorflow_client, config): + self.client = tensorflow_client + + def predict(self, payload): + return self.client.predict(payload) +``` + +When multiple models are being served in an API, `tensorflow_client.predict()` can accept a model name: + +```python +class TensorFlowPredictor: + # ... + + def predict(self, payload, query_params): + return self.client.predict(payload, query_params["model"]) +``` + +`tensorflow_client.predict()` can also accept a model version if a version other than the highest version number is desired: + +```python +class TensorFlowPredictor: + # ... 
+ + def predict(self, payload, query_params): + return self.client.predict(payload, query_params["model"], query_params["version"]) +``` + +Note: when using Inferentia models with the TensorFlow predictor, live model reloading is only supported if `predictor.processes_per_replica` is set to 1 (the default value). + +### ONNX + +When using the ONNX predictor, inference is performed by using the `predict()` method of the `onnx_client` that's passed to the predictor's constructor: + +```python +def predict(model_input: Any, model_name: Optional[str] = None, model_version: str = "latest") -> Any: + """ + Run prediction. + + Args: + model_input: Input to the model. + model_name (optional): Name of the model to retrieve (when multiple models are deployed in an API). + When predictor.models.paths is specified, model_name should be the name of one of the models listed in the API config. + When predictor.models.dir is specified, model_name should be the name of a top-level directory in the models dir. + model_version (string, optional): Version of the model to retrieve. Can be omitted or set to "latest" to select the highest version. + + Returns: + The prediction returned from the model. + """ +``` + +For example: + +```python +class ONNXPredictor: + def __init__(self, onnx_client, config): + self.client = onnx_client + + def predict(self, payload): + return self.client.predict(payload) +``` + +When multiple models are being served in an API, `onnx_client.predict()` can accept a model name: + +```python +class ONNXPredictor: + # ... + + def predict(self, payload, query_params): + return self.client.predict(payload, query_params["model"]) +``` + +`onnx_client.predict()` can also accept a model version if a version other than the highest version number is desired: + +```python +class ONNXPredictor: + # ... + + def predict(self, payload, query_params): + return self.client.predict(payload, query_params["model"], query_params["version"]) +``` + +You can also retrieve information about the model by calling the `onnx_client`'s `get_model()` method (it supports model name and model version arguments, like its `predict()` method). This can be useful for retrieving the model's input/output signatures. For example, `self.client.get_model()` might look like this: + +```python +{ + "session": "", + "signatures": "['session'].get_inputs()", + "input_signatures": { + "": { + "shape": "", + "type": "" + } + ... + } +} +``` + +## Multi model caching + +Multi model caching allows each API replica to serve more models than would all fit into it's memory. It achieves this by keeping only a specified number of models in memory (and disk) at a time. When the in-memory model limit has been reached, the least recently accessed model is evicted from the cache. + +This feature can be useful when you have hundreds or thousands of models, when some models are frequently accessed while a larger portion of them are rarely used, or when running on smaller instances to control costs. + +The model cache is a two-layer cache, configured by the following parameters in the `predictor.models` configuration: + +* `cache_size` sets the number of models to keep in memory +* `disk_cache_size` sets the number of models to keep on disk (must be greater than or equal to `cache_size`) + +Both of these fields must be specified, in addition to either the `dir` or `paths` field (which specifies the model paths, see above for documentation). 
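As a minimal sketch of these fields in use (the API name, predictor file, bucket path, and cache sizes are arbitrary placeholders), a cached multi-model configuration might look like:

```yaml
- name: model-cache-example           # hypothetical API name
  kind: RealtimeAPI
  predictor:
    type: python
    path: predictor.py                # hypothetical
    models:
      dir: s3://my-bucket/models/     # hypothetical S3 directory containing many models
      cache_size: 5                   # keep at most 5 models in memory
      disk_cache_size: 10             # keep at most 10 models on disk (must be >= cache_size)
```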
+Multi-model caching is only supported if `predictor.processes_per_replica` is set to 1 (the default value).
+
+See [examples/model-caching](https://github.com/cortexlabs/cortex/tree/master/examples/model-caching) for examples.
+
+### Caveats
+
+Cortex periodically runs a background script (every 10 seconds) that counts the number of models in memory and on disk, and evicts the least recently used models if the count exceeds `cache_size` / `disk_cache_size`.
+
+The benefit of this approach is that there are no added steps on the critical path of inference. The limitation is that if many new models are requested between executions of the script, then until the script runs again, there may be more models in memory and/or on disk than the configured `cache_size` or `disk_cache_size` limits. This has the potential to lead to out-of-memory errors.
diff --git a/docs/deployments/realtime-api/predictors.md b/docs/deployments/realtime-api/predictors.md
index 1869a5a133..bd8623df9d 100644
--- a/docs/deployments/realtime-api/predictors.md
+++ b/docs/deployments/realtime-api/predictors.md
@@ -52,7 +52,7 @@ class PythonPredictor:
 # initialization code and variables can be declared here in global scope
 
 class PythonPredictor:
-    def __init__(self, config):
+    def __init__(self, config, python_client):
         """(Required) Called once before the API becomes available. Performs
         setup such as downloading/initializing the model or downloading a
         vocabulary.
@@ -61,8 +61,12 @@ class PythonPredictor:
             config (required): Dictionary passed from API configuration (if
                 specified). This may contain information on where to download
                 the model and/or metadata.
+            python_client (optional): Python client which is used to retrieve
+                models for prediction. This should be saved for use in predict().
+                Required when `predictor.model_path` or `predictor.models` is
+                specified in the api configuration.
         """
-        pass
+        self.client = python_client  # optional
 
     def predict(self, payload, query_params, headers):
         """(Required) Called once per request. Preprocesses the request payload
@@ -99,8 +103,31 @@ class PythonPredictor:
             headers (optional): A dictionary of the headers sent in the request.
         """
         pass
+
+    def load_model(self, model_path):
+        """(Optional) Called by Cortex to load a model when necessary.
+
+        This method is required when the `predictor.model_path` or `predictor.models`
+        field is specified in the api configuration.
+
+        Warning: this method must not make any modification to the model's
+        contents on disk.
+
+        Args:
+            model_path: The path to the model on disk.
+
+        Returns:
+            The loaded model from disk. The returned object is what
+            self.client.get_model() will return.
+        """
+        pass
 ```
+
+When explicit model paths are specified in the Python predictor's API configuration, Cortex provides a `python_client` to your Predictor's constructor. `python_client` is an instance of [PythonClient](https://github.com/cortexlabs/cortex/tree/master/pkg/workloads/cortex/lib/client/python.py) that is used to load model(s) (it calls the `load_model()` method of your predictor, which must be defined when using explicit model paths). It should be saved as an instance variable in your Predictor, and your `predict()` function should call `python_client.get_model()` to load your model for inference. Preprocessing of the JSON payload and postprocessing of predictions can be implemented in your `predict()` function as well.
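+
+As a minimal sketch of that flow (illustrative only; the pickled `model.pkl` file, the feature names, and the preprocessing/postprocessing steps are placeholders, not part of Cortex's API):
+
+```python
+import os
+import pickle
+
+
+class PythonPredictor:
+    def __init__(self, config, python_client):
+        self.client = python_client  # save the client so predict() can retrieve the model
+
+    def load_model(self, model_path):
+        # called by Cortex whenever the model needs to be (re)loaded from disk;
+        # here we assume, for illustration, a pickled scikit-learn-style model saved as model.pkl
+        with open(os.path.join(model_path, "model.pkl"), "rb") as f:
+            return pickle.load(f)
+
+    def predict(self, payload):
+        features = [payload["feature_a"], payload["feature_b"]]  # preprocessing (placeholder)
+        model = self.client.get_model()
+        prediction = model.predict([features])[0]
+        return {"prediction": float(prediction)}  # postprocessing (placeholder)
+```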
+ +When multiple models are defined using the Predictor's `models` field, the `python_client.get_model()` method expects an argument `model_name` which must hold the name of the model that you want to load (for example: `self.client.get_model("text-generator")`). There is also an optional second argument to specify the model version. See [models](models.md) and the [multi model guide](../../guides/multi-model.md#python-predictor) for more information. + For proper separation of concerns, it is recommended to use the constructor's `config` parameter for information such as from where to download the model and initialization files, or any configurable model parameters. You define `config` in your [API configuration](api-configuration.md), and it is passed through to your Predictor's constructor. Your API can accept requests with different types of payloads such as `JSON`-parseable, `bytes` or `starlette.datastructures.FormData` data. Navigate to the [API requests](#api-requests) section to learn about how headers can be used to change the type of `payload` that is passed into your `predict` method. @@ -134,6 +161,37 @@ class PythonPredictor: return self.tokenizer.decode(prediction[0]) ``` + +Here is the Predictor for [examples/live-reloading/python/mpg-estimator](https://github.com/cortexlabs/cortex/tree/feature/master/examples/live-reloading/python/mpg-estimator): + +```python +import mlflow.sklearn +import numpy as np + + +class PythonPredictor: + def __init__(self, config, python_client): + self.client = python_client + + def load_model(self, model_path): + return mlflow.sklearn.load_model(model_path) + + def predict(self, payload, query_params): + model_version = query_params.get("version") + + model = self.client.get_model(model_version=model_version) + model_input = [ + payload["cylinders"], + payload["displacement"], + payload["horsepower"], + payload["weight"], + payload["acceleration"], + ] + result = model.predict([model_input]).item() + + return {"prediction": result, "model": {"version": model_version}} +``` + ### Pre-installed packages The following Python packages are pre-installed in Python Predictors and can be used in your implementations: @@ -267,7 +325,7 @@ class TensorFlowPredictor: Cortex provides a `tensorflow_client` to your Predictor's constructor. `tensorflow_client` is an instance of [TensorFlowClient](https://github.com/cortexlabs/cortex/tree/master/pkg/workloads/cortex/lib/client/tensorflow.py) that manages a connection to a TensorFlow Serving container to make predictions using your model. It should be saved as an instance variable in your Predictor, and your `predict()` function should call `tensorflow_client.predict()` to make an inference with your exported TensorFlow model. Preprocessing of the JSON payload and postprocessing of predictions can be implemented in your `predict()` function as well. -When multiple models are defined using the Predictor's `models` field, the `tensorflow_client.predict()` method expects a second argument `model_name` which must hold the name of the model that you want to use for inference (for example: `self.client.predict(payload, "text-generator")`). See the [multi model guide](../../guides/multi-model.md#tensorflow-predictor) for more information. 
+When multiple models are defined using the Predictor's `models` field, the `tensorflow_client.predict()` method expects a second argument `model_name` which must hold the name of the model that you want to use for inference (for example: `self.client.predict(payload, "text-generator")`). There is also an optional third argument to specify the model version. See [models](models.md) and the [multi model guide](../../guides/multi-model.md#tensorflow-predictor) for more information. For proper separation of concerns, it is recommended to use the constructor's `config` parameter for information such as configurable model parameters or download links for initialization files. You define `config` in your [API configuration](api-configuration.md), and it is passed through to your Predictor's constructor. @@ -379,7 +437,7 @@ class ONNXPredictor: Cortex provides an `onnx_client` to your Predictor's constructor. `onnx_client` is an instance of [ONNXClient](https://github.com/cortexlabs/cortex/tree/master/pkg/workloads/cortex/lib/client/onnx.py) that manages an ONNX Runtime session to make predictions using your model. It should be saved as an instance variable in your Predictor, and your `predict()` function should call `onnx_client.predict()` to make an inference with your exported ONNX model. Preprocessing of the JSON payload and postprocessing of predictions can be implemented in your `predict()` function as well. -When multiple models are defined using the Predictor's `models` field, the `onnx_client.predict()` method expects a second argument `model_name` which must hold the name of the model that you want to use for inference (for example: `self.client.predict(model_input, "text-generator")`). See the [multi model guide](../../guides/multi-model.md#onnx-predictor) for more information. +When multiple models are defined using the Predictor's `models` field, the `onnx_client.predict()` method expects a second argument `model_name` which must hold the name of the model that you want to use for inference (for example: `self.client.predict(model_input, "text-generator")`). There is also an optional third argument to specify the model version. See [models](models.md) and the [multi model guide](../../guides/multi-model.md#onnx-predictor) for more information. For proper separation of concerns, it is recommended to use the constructor's `config` parameter for information such as configurable model parameters or download links for initialization files. You define `config` in your [API configuration](api-configuration.md), and it is passed through to your Predictor's constructor. diff --git a/docs/guides/multi-model.md b/docs/guides/multi-model.md index 31a6db19eb..79ed0507ab 100644 --- a/docs/guides/multi-model.md +++ b/docs/guides/multi-model.md @@ -7,12 +7,82 @@ It is possible to serve multiple models in the same Cortex API using any type of ## Python Predictor +### Specifying models in API config + + +The following template is based on the [live-reloading/python/mpg-estimator](https://github.com/cortexlabs/cortex/tree/master/examples/live-reloading/python/mpg-estimator) example. + +#### `cortex.yaml` + +Even though it looks as if there's only a single model served, there are actually 4 different versions saved in `s3://cortex-examples/sklearn/mpg-estimator/linreg/`. 
+
+```yaml
+- name: mpg-estimator
+  kind: RealtimeAPI
+  predictor:
+    type: python
+    path: predictor.py
+    model_path: s3://cortex-examples/sklearn/mpg-estimator/linreg/
+```
+
+#### `predictor.py`
+
+```python
+import mlflow.sklearn
+import numpy as np
+
+
+class PythonPredictor:
+    def __init__(self, config, python_client):
+        self.client = python_client
+
+    def load_model(self, model_path):
+        return mlflow.sklearn.load_model(model_path)
+
+    def predict(self, payload, query_params):
+        model_version = query_params.get("version")
+
+        # process the input
+        # ...
+
+        model = self.client.get_model(model_version=model_version)
+        result = model.predict(model_input)
+
+        return {"prediction": result, "model": {"version": model_version}}
+```
+
+#### Making predictions
+
+For convenience, we'll export our API's endpoint (yours will be different from mine):
+
+```bash
+$ api_endpoint=http://a36473270de8b46e79a769850dd3372d-c67035afa37ef878.elb.us-west-2.amazonaws.com/mpg-estimator
+```
+
+Next, we'll make a prediction using version `1` of the `mpg-estimator` model by specifying the model version as a query parameter:
+
+```bash
+$ curl "${api_endpoint}?version=1" -X POST -H "Content-Type: application/json" -d @sample.json
+
+{"prediction": 26.929889872154185, "model": {"version": "1"}}
+```
+
+Then we'll make a prediction using the 2nd version of the model (since the versions are duplicates of the same model, the prediction will be the same):
+
+```bash
+$ curl "${api_endpoint}?version=2" -X POST -H "Content-Type: application/json" -d @sample.json
+
+{"prediction": 26.929889872154185, "model": {"version": "2"}}
+```
+
+### Without specifying models in API config
+
 For the Python Predictor, the API configuration for a multi-model API is similar to single-model APIs. The Predictor's `config` field can be used to customize the behavior of the `predictor.py` implementation.
 
 The following template is based on the [pytorch/multi-model-text-analyzer](https://github.com/cortexlabs/cortex/tree/master/examples/pytorch/multi-model-text-analyzer) example.
 
-### `cortex.yaml`
+#### `cortex.yaml`
 
 ```yaml
 - name: multi-model-text-analyzer
@@ -24,7 +94,7 @@
 ...
 ```
 
-### `predictor.py`
+#### `predictor.py`
 
 Models should be loaded within the predictor's constructor. Query parameters are encouraged to be used when selecting the model for inference.
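+
+As a rough sketch of that pattern (this is not the example's actual code; the Hugging Face `transformers` pipelines and the `payload["text"]` field are assumptions made for illustration):
+
+```python
+from transformers import pipeline
+
+
+class PythonPredictor:
+    def __init__(self, config):
+        # load every model once, in the constructor, keyed by the name used in the query parameter
+        self.models = {
+            "sentiment": pipeline(task="sentiment-analysis"),
+            "summarizer": pipeline(task="summarization"),
+        }
+
+    def predict(self, payload, query_params):
+        model_name = query_params["model"]  # e.g. ?model=sentiment
+        if model_name not in self.models:
+            return {"error": f"unknown model: {model_name}"}
+        return self.models[model_name](payload["text"])[0]
+```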
@@ -59,7 +129,7 @@ class PythonPredictor: return JSONResponse({"error": f"unknown model: {model_name}"}, status_code=400) ``` -### Making predictions +#### Making predictions For convenience, we'll export our API's endpoint (yours will be different from mine): @@ -70,7 +140,7 @@ $ api_endpoint=http://a36473270de8b46e79a769850dd3372d-c67035afa37ef878.elb.us-w Next, we'll make a prediction using the sentiment analyzer model by specifying the model name as a query parameter: ```bash -$ curl ${api_endpoint}?model=sentiment -X POST -H "Content-Type: application/json" -d @sample-sentiment.json +$ curl "${api_endpoint}?model=sentiment" -X POST -H "Content-Type: application/json" -d @sample-sentiment.json {"label": "POSITIVE", "score": 0.9998506903648376} ``` @@ -78,7 +148,7 @@ $ curl ${api_endpoint}?model=sentiment -X POST -H "Content-Type: application/jso Then we'll make a prediction using the text summarizer model: ```bash -$ curl ${api_endpoint}?model=summarizer -X POST -H "Content-Type: application/json" -d @sample-summarizer.json +$ curl "${api_endpoint}?model=summarizer" -X POST -H "Content-Type: application/json" -d @sample-summarizer.json Machine learning is the study of algorithms and statistical models that computer systems use to perform a specific task. It is seen as a subset of artificial intelligence. Machine learning algorithms are used in a wide variety of applications, such as email filtering and computer vision. In its application across business problems, machine learning is also referred to as predictive analytics. ``` @@ -99,12 +169,13 @@ The following template is based on the [tensorflow/multi-model-classifier](https type: tensorflow path: predictor.py models: - - name: iris - model_path: s3://cortex-examples/tensorflow/iris-classifier/nn - - name: inception - model_path: s3://cortex-examples/tensorflow/image-classifier/inception - - name: resnet50 - model_path: s3://cortex-examples/tensorflow/multi-model-classifier/resnet50 + paths: + - name: inception + model_path: s3://cortex-examples/tensorflow/image-classifier/inception/ + - name: iris + model_path: s3://cortex-examples/tensorflow/iris-classifier/nn/ + - name: resnet50 + model_path: s3://cortex-examples/tensorflow/resnet50/ ... ``` @@ -182,12 +253,13 @@ The following template is based on the [onnx/multi-model-classifier](https://git type: onnx path: predictor.py models: - - name: resnet50 - model_path: s3://cortex-examples/onnx/resnet50/resnet50-v2-7.onnx - - name: mobilenet - model_path: s3://cortex-examples/onnx/mobilenet/mobilenetv2-7.onnx - - name: shufflenet - model_path: s3://cortex-examples/onnx/shufflenet/shufflenet-v2-10.onnx + paths: + - name: resnet50 + model_path: s3://cortex-examples/onnx/resnet50/ + - name: mobilenet + model_path: s3://cortex-examples/onnx/mobilenet/ + - name: shufflenet + model_path: s3://cortex-examples/onnx/shufflenet/ ... 
``` diff --git a/docs/summary.md b/docs/summary.md index e502c792b6..5e5d90c933 100644 --- a/docs/summary.md +++ b/docs/summary.md @@ -14,6 +14,7 @@ * [API configuration](deployments/realtime-api/api-configuration.md) * [API deployment](deployments/realtime-api/deployment.md) * [API statuses](deployments/realtime-api/statuses.md) + * [Models](deployments/realtime-api/models.md) * [Parallelism](deployments/realtime-api/parallelism.md) * [Autoscaling](deployments/realtime-api/autoscaling.md) * [Prediction monitoring](deployments/realtime-api/prediction-monitoring.md) diff --git a/examples/batch/onnx/cortex.yaml b/examples/batch/onnx/cortex.yaml index 6fb24b4b3e..b544a333c9 100644 --- a/examples/batch/onnx/cortex.yaml +++ b/examples/batch/onnx/cortex.yaml @@ -5,6 +5,6 @@ predictor: type: onnx path: predictor.py - model_path: s3://cortex-examples/image-classifier/alexnet_batch.onnx + model_path: s3://cortex-examples/image-classifier/alexnet_batch/ compute: cpu: 1 diff --git a/examples/batch/tensorflow/cortex.yaml b/examples/batch/tensorflow/cortex.yaml index aa00396c32..2433edc3f0 100644 --- a/examples/batch/tensorflow/cortex.yaml +++ b/examples/batch/tensorflow/cortex.yaml @@ -5,6 +5,6 @@ predictor: type: tensorflow path: predictor.py - model_path: s3://cortex-examples/tensorflow/image-classifier/inception + model_path: s3://cortex-examples/tensorflow/image-classifier/inception/ compute: cpu: 1 diff --git a/examples/live-reloading/onnx/README.md b/examples/live-reloading/onnx/README.md new file mode 100644 index 0000000000..3b70405073 --- /dev/null +++ b/examples/live-reloading/onnx/README.md @@ -0,0 +1,7 @@ +## Live-reloading model APIs + +_WARNING: you are on the master branch; please refer to examples on the branch corresponding to your `cortex version` (e.g. for version 0.20.*, run `git checkout -b 0.20` or switch to the `0.20` branch on GitHub)_ + +The model live-reloading feature is automatically enabled for the ONNX predictors. This means that any ONNX examples found in the [examples](../..) directory will already have this running. + +The live-reloading is a feature that reloads models at run-time from (a) specified S3 bucket(s) in the `cortex.yaml` config of each API. Models are added/removed from the API when the said models are added/removed from the S3 bucket(s) or reloaded when the models are edited. More on this in the [docs](insert-link). diff --git a/examples/live-reloading/python/mpg-estimator/cortex.yaml b/examples/live-reloading/python/mpg-estimator/cortex.yaml new file mode 100644 index 0000000000..5da8e22fff --- /dev/null +++ b/examples/live-reloading/python/mpg-estimator/cortex.yaml @@ -0,0 +1,8 @@ +# WARNING: you are on the master branch; please refer to examples on the branch corresponding to your `cortex version` (e.g. for version 0.20.*, run `git checkout -b 0.20` or switch to the `0.20` branch on GitHub) + +- name: mpg-estimator + kind: RealtimeAPI + predictor: + type: python + path: predictor.py + model_path: s3://cortex-examples/sklearn/mpg-estimator/linreg/ diff --git a/examples/live-reloading/python/mpg-estimator/predictor.py b/examples/live-reloading/python/mpg-estimator/predictor.py new file mode 100644 index 0000000000..3c78ba0bf0 --- /dev/null +++ b/examples/live-reloading/python/mpg-estimator/predictor.py @@ -0,0 +1,27 @@ +# WARNING: you are on the master branch; please refer to examples on the branch corresponding to your `cortex version` (e.g. 
for version 0.20.*, run `git checkout -b 0.20` or switch to the `0.20` branch on GitHub)
+
+import mlflow.sklearn
+import numpy as np
+
+
+class PythonPredictor:
+    def __init__(self, config, python_client):
+        self.client = python_client
+
+    def load_model(self, model_path):
+        return mlflow.sklearn.load_model(model_path)
+
+    def predict(self, payload, query_params):
+        model_version = query_params.get("version")
+
+        model = self.client.get_model(model_version=model_version)
+        model_input = [
+            payload["cylinders"],
+            payload["displacement"],
+            payload["horsepower"],
+            payload["weight"],
+            payload["acceleration"],
+        ]
+        result = model.predict([model_input]).item()
+
+        return {"prediction": result, "model": {"version": model_version}}
diff --git a/examples/live-reloading/python/mpg-estimator/requirements.txt b/examples/live-reloading/python/mpg-estimator/requirements.txt
new file mode 100644
index 0000000000..cbcad6b321
--- /dev/null
+++ b/examples/live-reloading/python/mpg-estimator/requirements.txt
@@ -0,0 +1,4 @@
+mlflow
+pandas
+numpy
+scikit-learn==0.21.3
diff --git a/examples/live-reloading/python/mpg-estimator/sample.json b/examples/live-reloading/python/mpg-estimator/sample.json
new file mode 100644
index 0000000000..2dbbca46dd
--- /dev/null
+++ b/examples/live-reloading/python/mpg-estimator/sample.json
@@ -0,0 +1,7 @@
+{
+  "cylinders": 4,
+  "displacement": 135,
+  "horsepower": 84,
+  "weight": 2490,
+  "acceleration": 15.7
+}
diff --git a/examples/live-reloading/tensorflow/README.md b/examples/live-reloading/tensorflow/README.md
new file mode 100644
index 0000000000..6918d8b114
--- /dev/null
+++ b/examples/live-reloading/tensorflow/README.md
@@ -0,0 +1,11 @@
+## Live-reloading model APIs
+
+_WARNING: you are on the master branch; please refer to examples on the branch corresponding to your `cortex version` (e.g. for version 0.20.*, run `git checkout -b 0.20` or switch to the `0.20` branch on GitHub)_
+
+The model live-reloading feature is automatically enabled (1) for the TensorFlow predictors. This means that any TensorFlow examples found in the [examples](../..) directory will already have this running.
+
+The live-reloading is a feature that reloads models at run-time from (a) specified S3 bucket(s) in the `cortex.yaml` config of each API. Models are added/removed from the API when the said models are added/removed from the S3 bucket(s) or reloaded when the models are edited. More on this in the [docs](insert-link).
+
+---
+
+*1: The live-reloading feature for the TensorFlow predictor is disabled when Inferentia resources (`compute.inf`) are added to the API and `processes_per_replica` > 1.*
diff --git a/examples/model-caching/onnx/multi-model-classifier/README.md b/examples/model-caching/onnx/multi-model-classifier/README.md
new file mode 100644
index 0000000000..a987ad76db
--- /dev/null
+++ b/examples/model-caching/onnx/multi-model-classifier/README.md
@@ -0,0 +1,77 @@
+# Multi-Model Classifier API
+
+_WARNING: you are on the master branch; please refer to examples on the branch corresponding to your `cortex version` (e.g. for version 0.20.*, run `git checkout -b 0.20` or switch to the `0.20` branch on GitHub)_
+
+This example deploys ResNet50, MobileNet and ShuffleNet models in one API. Query parameters are used for selecting the model and the version.
+
+Since model caching is enabled, there can only be 2 models loaded into memory - loading a 3rd one will lead to the removal of the least recently used one.
To witness the adding/removal process of models, check the logs of the API by running `cortex logs multi-model-classifier` once the API is up. + +The example can be run on both CPU and on GPU hardware. + +## Sample Prediction + +Deploy the model by running: + +```bash +cortex deploy +``` + +And wait for it to become live by tracking its status with `cortex get --watch`. + +Once the API has been successfully deployed, export the API's endpoint for convenience. You can get the API's endpoint by running `cortex get multi-model-classifier`. + +```bash +export ENDPOINT=your-api-endpoint +``` + +When making a prediction with [sample.json](sample.json), the following image will be used: + +![cat](https://i.imgur.com/213xcvs.jpg) + +### ResNet50 Classifier + +Make a request to the ResNet50 model: + +```bash +curl "${ENDPOINT}?model=resnet50" -X POST -H "Content-Type: application/json" -d @sample.json +``` + +The expected response is: + +```json +{"label": "tabby", "model": {"name": "resnet50", "version": "latest"}} +``` + +### MobileNet Classifier + +Make a request to the MobileNet model: + +```bash +curl "${ENDPOINT}?model=mobilenet" -X POST -H "Content-Type: application/json" -d @sample.json +``` + +The expected response is: + +```json +{"label": "tabby", "model": {"name": "mobilenet", "version": "latest"}} +``` + +### ShuffleNet Classifier + +At this point, there are 2 models loaded into memory (as specified by `cache_size`). Loading `ShuffleNet` as well will lead to the removal of the least recently used model - in this case, it will be the ResNet50 model that will get evicted. Since the `disk_cache_size` is set to 3, no model will be removed from disk. + +Make a request to the ShuffleNet model: + +```bash +curl "${ENDPOINT}?model=shufflenet" -X POST -H "Content-Type: application/json" -d @sample.json +``` + +The expected response is: + +```json +{"label": "Egyptian_cat", "model": {"name": "shufflenet", "version": "latest"}} +``` + +--- + +Now, inspect `cortex get multi-model-classifier` to see when and which models were removed in this process of making requests to different versions of the same model. diff --git a/examples/model-caching/onnx/multi-model-classifier/cortex.yaml b/examples/model-caching/onnx/multi-model-classifier/cortex.yaml new file mode 100644 index 0000000000..f93b137e77 --- /dev/null +++ b/examples/model-caching/onnx/multi-model-classifier/cortex.yaml @@ -0,0 +1,22 @@ +# WARNING: you are on the master branch; please refer to examples on the branch corresponding to your `cortex version` (e.g. 
for version 0.20.*, run `git checkout -b 0.20` or switch to the `0.20` branch on GitHub) + +- name: multi-model-classifier + kind: RealtimeAPI + predictor: + type: onnx + path: predictor.py + models: + paths: + - name: resnet50 + model_path: s3://cortex-examples/onnx/resnet50/ + - name: mobilenet + model_path: s3://cortex-examples/onnx/mobilenet/ + - name: shufflenet + model_path: s3://cortex-examples/onnx/shufflenet/ + cache_size: 2 + disk_cache_size: 3 + config: + image-classifier-classes: https://s3.amazonaws.com/deep-learning-models/image-models/imagenet_class_index.json + image-resize: 224 + compute: + mem: 2G diff --git a/examples/model-caching/onnx/multi-model-classifier/predictor.py b/examples/model-caching/onnx/multi-model-classifier/predictor.py new file mode 100644 index 0000000000..3492d5d05b --- /dev/null +++ b/examples/model-caching/onnx/multi-model-classifier/predictor.py @@ -0,0 +1,99 @@ +# WARNING: you are on the master branch; please refer to examples on the branch corresponding to your `cortex version` (e.g. for version 0.20.*, run `git checkout -b 0.20` or switch to the `0.20` branch on GitHub) + +import numpy as np +import cv2, requests +from scipy.special import softmax + + +def get_url_image(url_image): + """ + Get numpy image from URL image. + """ + resp = requests.get(url_image, stream=True).raw + image = np.asarray(bytearray(resp.read()), dtype="uint8") + image = cv2.imdecode(image, cv2.IMREAD_COLOR) + return image + + +def image_resize(image, width=None, height=None, inter=cv2.INTER_AREA): + """ + Resize a numpy image. + """ + dim = None + (h, w) = image.shape[:2] + + if width is None and height is None: + return image + + if width is None: + # calculate the ratio of the height and construct the dimensions + r = height / float(h) + dim = (int(w * r), height) + else: + # calculate the ratio of the width and construct the dimensions + r = width / float(w) + dim = (width, int(h * r)) + + resized = cv2.resize(image, dim, interpolation=inter) + + return resized + + +def preprocess(img_data): + """ + Normalize input for inference. + """ + # move pixel color dimension to position 0 + img = np.moveaxis(img_data, 2, 0) + + mean_vec = np.array([0.485, 0.456, 0.406]) + stddev_vec = np.array([0.229, 0.224, 0.225]) + norm_img_data = np.zeros(img.shape).astype("float32") + for i in range(img.shape[0]): + # for each pixel in each channel, divide the value by 255 to get value between [0, 1] and then normalize + norm_img_data[i, :, :] = (img[i, :, :] / 255 - mean_vec[i]) / stddev_vec[i] + + # extend to batch size of 1 + norm_img_data = norm_img_data[np.newaxis, ...] + return norm_img_data + + +def postprocess(results): + """ + Eliminates all dimensions of size 1, softmaxes the input and then returns the index of the element with the highest value. 
+ """ + squeezed = np.squeeze(results) + maxed = softmax(squeezed) + result = np.argmax(maxed) + return result + + +class ONNXPredictor: + def __init__(self, onnx_client, config): + # onnx client + self.client = onnx_client + + # for image classifiers + classes = requests.get(config["image-classifier-classes"]).json() + self.image_classes = [classes[str(k)][1] for k in range(len(classes))] + self.resize_value = config["image-resize"] + + def predict(self, payload, query_params): + # get request params + model_name = query_params["model"] + model_version = query_params.get("version", "latest") + img_url = payload["url"] + + # process the input + img = get_url_image(img_url) + img = image_resize(img, height=self.resize_value) + img = preprocess(img) + + # predict + results = self.client.predict(img, model_name, model_version)[0] + + # interpret result + result = postprocess(results) + predicted_label = self.image_classes[result] + + return {"label": predicted_label, "model": {"name": model_name, "version": model_version}} diff --git a/examples/model-caching/onnx/multi-model-classifier/requirements.txt b/examples/model-caching/onnx/multi-model-classifier/requirements.txt new file mode 100644 index 0000000000..212d089934 --- /dev/null +++ b/examples/model-caching/onnx/multi-model-classifier/requirements.txt @@ -0,0 +1,2 @@ +opencv-python==4.2.0.34 +scipy==1.4.1 diff --git a/examples/model-caching/onnx/multi-model-classifier/sample.json b/examples/model-caching/onnx/multi-model-classifier/sample.json new file mode 100644 index 0000000000..4ee3aa45df --- /dev/null +++ b/examples/model-caching/onnx/multi-model-classifier/sample.json @@ -0,0 +1,3 @@ +{ + "url": "https://i.imgur.com/213xcvs.jpg" +} diff --git a/examples/model-caching/python/mpg-estimator/README.md b/examples/model-caching/python/mpg-estimator/README.md new file mode 100644 index 0000000000..ba5c481d26 --- /dev/null +++ b/examples/model-caching/python/mpg-estimator/README.md @@ -0,0 +1,75 @@ +# MPG Estimator API + +_WARNING: you are on the master branch; please refer to examples on the branch corresponding to your `cortex version` (e.g. for version 0.20.*, run `git checkout -b 0.20` or switch to the `0.20` branch on GitHub)_ + +This example deploys an MPG estimator model of multiple versions in one API. Query parameters are used for selecting the model and the version. + +Since model caching is enabled, there can only be 2 models loaded into memory (counting the versioned models as well) - loading a 3rd one will lead to the removal of the least recently used one. To witness the adding/removal process of models, check the logs of the API by running `cortex logs mpg-estimator` once the API is up. + +The example can be run on both CPU and on GPU hardware. + +## Sample Prediction + +Deploy the model by running: + +```bash +cortex deploy +``` + +And wait for it to become live by tracking its status with `cortex get --watch`. + +Once the API has been successfully deployed, export the API's endpoint for convenience. You can get the API's endpoint by running `cortex get mpg-estimator`. 
+
+```bash
+export ENDPOINT=your-api-endpoint
+```
+
+### Version 1
+
+Make a request to version `1` of the `mpg-estimator` model:
+
+```bash
+curl "${ENDPOINT}?model=mpg-estimator&version=1" -X POST -H "Content-Type: application/json" -d @sample.json
+```
+
+The expected response is:
+
+```json
+{"prediction": 26.929889872154185, "model": {"name": "mpg-estimator", "version": "1"}}
+```
+
+### Version 2
+
+At this point, there is one model loaded into memory (as specified by `cache_size`). Loading another model version will lead to the removal of the least recently used model - in this case, it will be version 1 that will get evicted. Since the `disk_cache_size` is set to 2, no model will be removed from disk.
+
+Make a request to version `2` of the `mpg-estimator` model:
+
+```bash
+curl "${ENDPOINT}?model=mpg-estimator&version=2" -X POST -H "Content-Type: application/json" -d @sample.json
+```
+
+The expected response is:
+
+```json
+{"prediction": 26.929889872154185, "model": {"name": "mpg-estimator", "version": "2"}}
+```
+
+### Version 3
+
+With the following request, version 2 of the model will have to be evicted from memory. Since `disk_cache_size` is set to 2, this time version 1 of the model will get removed from disk.
+
+Make a request to version `3` of the `mpg-estimator` model:
+
+```bash
+curl "${ENDPOINT}?model=mpg-estimator&version=3" -X POST -H "Content-Type: application/json" -d @sample.json
+```
+
+The expected response is:
+
+```json
+{"prediction": 26.929889872154185, "model": {"name": "mpg-estimator", "version": "3"}}
+```
+
+---
+
+Now, inspect `cortex get mpg-estimator` to see when and which models were removed in this process of making requests to different versions of the same model. The same algorithm is applied to different models as well, not just to the versions of a specific model.
diff --git a/examples/model-caching/python/mpg-estimator/cortex.yaml b/examples/model-caching/python/mpg-estimator/cortex.yaml
new file mode 100644
index 0000000000..211943d2e3
--- /dev/null
+++ b/examples/model-caching/python/mpg-estimator/cortex.yaml
@@ -0,0 +1,13 @@
+# WARNING: you are on the master branch; please refer to examples on the branch corresponding to your `cortex version` (e.g. for version 0.20.*, run `git checkout -b 0.20` or switch to the `0.20` branch on GitHub)
+
+- name: mpg-estimator
+  kind: RealtimeAPI
+  predictor:
+    type: python
+    path: predictor.py
+    models:
+      paths:
+        - name: mpg-estimator
+          model_path: s3://cortex-examples/sklearn/mpg-estimator/linreg/
+      cache_size: 1
+      disk_cache_size: 2
diff --git a/examples/model-caching/python/mpg-estimator/predictor.py b/examples/model-caching/python/mpg-estimator/predictor.py
new file mode 100644
index 0000000000..6554041e27
--- /dev/null
+++ b/examples/model-caching/python/mpg-estimator/predictor.py
@@ -0,0 +1,28 @@
+# WARNING: you are on the master branch; please refer to examples on the branch corresponding to your `cortex version` (e.g.
for version 0.20.*, run `git checkout -b 0.20` or switch to the `0.20` branch on GitHub) + +import mlflow.sklearn +import numpy as np + + +class PythonPredictor: + def __init__(self, config, python_client): + self.client = python_client + + def load_model(self, model_path): + return mlflow.sklearn.load_model(model_path) + + def predict(self, payload, query_params): + model_name = query_params["model"] + model_version = query_params.get("version", "latest") + + model = self.client.get_model(model_name, model_version) + model_input = [ + payload["cylinders"], + payload["displacement"], + payload["horsepower"], + payload["weight"], + payload["acceleration"], + ] + result = model.predict([model_input]).item() + + return {"prediction": result, "model": {"name": model_name, "version": model_version}} diff --git a/examples/model-caching/python/mpg-estimator/requirements.txt b/examples/model-caching/python/mpg-estimator/requirements.txt new file mode 100644 index 0000000000..cbcad6b321 --- /dev/null +++ b/examples/model-caching/python/mpg-estimator/requirements.txt @@ -0,0 +1,4 @@ +mlflow +pandas +numpy +scikit-learn==0.21.3 diff --git a/examples/model-caching/python/mpg-estimator/sample.json b/examples/model-caching/python/mpg-estimator/sample.json new file mode 100644 index 0000000000..2dbbca46dd --- /dev/null +++ b/examples/model-caching/python/mpg-estimator/sample.json @@ -0,0 +1,7 @@ +{ + "cylinders": 4, + "displacement": 135, + "horsepower": 84, + "weight": 2490, + "acceleration": 15.7 +} diff --git a/examples/model-caching/tensorflow/multi-model-classifier/README.md b/examples/model-caching/tensorflow/multi-model-classifier/README.md new file mode 100644 index 0000000000..944a1d5768 --- /dev/null +++ b/examples/model-caching/tensorflow/multi-model-classifier/README.md @@ -0,0 +1,77 @@ +# Multi-Model Classifier API + +_WARNING: you are on the master branch; please refer to examples on the branch corresponding to your `cortex version` (e.g. for version 0.20.*, run `git checkout -b 0.20` or switch to the `0.20` branch on GitHub)_ + +This example deploys Iris, ResNet50 and Inception models in one API. Query parameters are used for selecting the model. + +Since model caching is enabled, there can only be 2 models loaded into memory - loading a 3rd one will lead to the removal of the least recently used one. To witness the adding/removal process of models, check the logs of the API by running `cortex logs multi-model-classifier` once the API is up. + +The example can be run on both CPU and on GPU hardware. + +## Sample Prediction + +Deploy the model by running: + +```bash +cortex deploy +``` + +And wait for it to become live by tracking its status with `cortex get --watch`. + +Once the API has been successfully deployed, export the APIs endpoint. You can get the API's endpoint by running `cortex get multi-model-classifier`. 
+ +```bash +export ENDPOINT=your-api-endpoint +``` + +When making a prediction with [sample-image.json](sample-image.json), the following image will be used: + +![sports car](https://i.imgur.com/zovGIKD.png) + +### ResNet50 Classifier + +Make a request to the ResNet50 model: + +```bash +curl "${ENDPOINT}?model=resnet50" -X POST -H "Content-Type: application/json" -d @sample-image.json +``` + +The expected response is: + +```json +{"label": "sports_car"} +``` + +### Inception Classifier + +Make a request to the Inception model: + +```bash +curl "${ENDPOINT}?model=inception" -X POST -H "Content-Type: application/json" -d @sample-image.json +``` + +The expected response is: + +```json +{"label": "sports_car"} +``` + +### Iris Classifier + +At this point, there are 2 models loaded into memory (as specified by `cache_size`). Loading the `iris` classifier will lead to the removal of the least recently used model - in this case, it will be the ResNet50 model that will get evicted. Since the `disk_cache_size` is set to 3, no model will be removed from disk. + +Make a request to the Iris model: + +```bash +curl "${ENDPOINT}?model=iris" -X POST -H "Content-Type: application/json" -d @sample-iris.json +``` + +The expected response is: + +```json +{"label": "setosa"} +``` + +--- + +Now, inspect `cortex get multi-model-classifier` to see when and which models were removed in this process of making requests to different versions of the same model. diff --git a/examples/model-caching/tensorflow/multi-model-classifier/cortex.yaml b/examples/model-caching/tensorflow/multi-model-classifier/cortex.yaml new file mode 100644 index 0000000000..466ae2e27a --- /dev/null +++ b/examples/model-caching/tensorflow/multi-model-classifier/cortex.yaml @@ -0,0 +1,32 @@ +# WARNING: you are on the master branch; please refer to examples on the branch corresponding to your `cortex version` (e.g. for version 0.20.*, run `git checkout -b 0.20` or switch to the `0.20` branch on GitHub) + +- name: multi-model-classifier + kind: RealtimeAPI + predictor: + type: tensorflow + path: predictor.py + models: + paths: + - name: inception + model_path: s3://cortex-examples/tensorflow/image-classifier/inception/ + - name: iris + model_path: s3://cortex-examples/tensorflow/iris-classifier/nn/ + - name: resnet50 + model_path: s3://cortex-examples/tensorflow/resnet50/ + cache_size: 2 + disk_cache_size: 3 + config: + models: + iris: + labels: ["setosa", "versicolor", "virginica"] + resnet50: + input_shape: [224, 224] + input_key: input + output_key: output + inception: + input_shape: [224, 224] + input_key: images + output_key: classes + image-classifier-classes: https://s3.amazonaws.com/deep-learning-models/image-models/imagenet_class_index.json + compute: + mem: 2G diff --git a/examples/model-caching/tensorflow/multi-model-classifier/predictor.py b/examples/model-caching/tensorflow/multi-model-classifier/predictor.py new file mode 100644 index 0000000000..d0e474369e --- /dev/null +++ b/examples/model-caching/tensorflow/multi-model-classifier/predictor.py @@ -0,0 +1,63 @@ +# WARNING: you are on the master branch; please refer to examples on the branch corresponding to your `cortex version` (e.g. for version 0.20.*, run `git checkout -b 0.20` or switch to the `0.20` branch on GitHub) + +import requests +import numpy as np +import cv2 + + +def get_url_image(url_image): + """ + Get numpy image from URL image. 
+ """ + resp = requests.get(url_image, stream=True).raw + image = np.asarray(bytearray(resp.read()), dtype="uint8") + image = cv2.imdecode(image, cv2.IMREAD_COLOR) + image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + return image + + +class TensorFlowPredictor: + def __init__(self, tensorflow_client, config): + self.client = tensorflow_client + + # for image classifiers + classes = requests.get(config["image-classifier-classes"]).json() + self.image_classes = [classes[str(k)][1] for k in range(len(classes))] + + # assign "models"' key value to self.config for ease of use + self.config = config["models"] + + # for iris classifier + self.iris_labels = self.config["iris"]["labels"] + + def predict(self, payload, query_params): + model_name = query_params["model"] + model_version = query_params.get("version", "latest") + predicted_label = None + + if model_name == "iris": + prediction = self.client.predict(payload["input"], model_name, model_version) + predicted_class_id = int(prediction["class_ids"][0]) + predicted_label = self.iris_labels[predicted_class_id] + + elif model_name in ["resnet50", "inception"]: + predicted_label = self.predict_image_classifier(model_name, payload["url"]) + + return {"label": predicted_label, "model": {"model": model_name, "version": model_version}} + + def predict_image_classifier(self, model, img_url): + img = get_url_image(img_url) + img = cv2.resize( + img, tuple(self.config[model]["input_shape"]), interpolation=cv2.INTER_NEAREST + ) + if model == "inception": + img = img.astype("float32") / 255 + img = {self.config[model]["input_key"]: img[np.newaxis, ...]} + + results = self.client.predict(img, model)[self.config[model]["output_key"]] + result = np.argmax(results) + if model == "inception": + result -= 1 + predicted_label = self.image_classes[result] + + return predicted_label diff --git a/examples/model-caching/tensorflow/multi-model-classifier/requirements.txt b/examples/model-caching/tensorflow/multi-model-classifier/requirements.txt new file mode 100644 index 0000000000..7e2fba5e6c --- /dev/null +++ b/examples/model-caching/tensorflow/multi-model-classifier/requirements.txt @@ -0,0 +1 @@ +Pillow diff --git a/examples/model-caching/tensorflow/multi-model-classifier/sample-image.json b/examples/model-caching/tensorflow/multi-model-classifier/sample-image.json new file mode 100644 index 0000000000..95200916c7 --- /dev/null +++ b/examples/model-caching/tensorflow/multi-model-classifier/sample-image.json @@ -0,0 +1,3 @@ +{ + "url": "https://i.imgur.com/zovGIKD.png" +} diff --git a/examples/model-caching/tensorflow/multi-model-classifier/sample-iris.json b/examples/model-caching/tensorflow/multi-model-classifier/sample-iris.json new file mode 100644 index 0000000000..67c03827f2 --- /dev/null +++ b/examples/model-caching/tensorflow/multi-model-classifier/sample-iris.json @@ -0,0 +1,8 @@ +{ + "input": { + "sepal_length": 5.2, + "sepal_width": 3.6, + "petal_length": 1.4, + "petal_width": 0.3 + } +} diff --git a/examples/onnx/iris-classifier/cortex.yaml b/examples/onnx/iris-classifier/cortex.yaml index 7bf965d590..9704ea822c 100644 --- a/examples/onnx/iris-classifier/cortex.yaml +++ b/examples/onnx/iris-classifier/cortex.yaml @@ -5,6 +5,6 @@ predictor: type: onnx path: predictor.py - model_path: s3://cortex-examples/onnx/iris-classifier/gbtree.onnx + model_path: s3://cortex-examples/onnx/iris-classifier/ monitoring: model_type: classification diff --git a/examples/onnx/multi-model-classifier/cortex.yaml b/examples/onnx/multi-model-classifier/cortex.yaml index 
61e43f6a11..d9f0d7b362 100644 --- a/examples/onnx/multi-model-classifier/cortex.yaml +++ b/examples/onnx/multi-model-classifier/cortex.yaml @@ -6,12 +6,13 @@ type: onnx path: predictor.py models: - - name: resnet50 - model_path: s3://cortex-examples/onnx/resnet50/resnet50-v2-7.onnx - - name: mobilenet - model_path: s3://cortex-examples/onnx/mobilenet/mobilenetv2-7.onnx - - name: shufflenet - model_path: s3://cortex-examples/onnx/shufflenet/shufflenet-v2-10.onnx + paths: + - name: resnet50 + model_path: s3://cortex-examples/onnx/resnet50/ + - name: mobilenet + model_path: s3://cortex-examples/onnx/mobilenet/ + - name: shufflenet + model_path: s3://cortex-examples/onnx/shufflenet/ config: image-classifier-classes: https://s3.amazonaws.com/deep-learning-models/image-models/imagenet_class_index.json image-resize: 224 diff --git a/examples/onnx/yolov5-youtube/cortex.yaml b/examples/onnx/yolov5-youtube/cortex.yaml index 1c170aff30..760e175ead 100644 --- a/examples/onnx/yolov5-youtube/cortex.yaml +++ b/examples/onnx/yolov5-youtube/cortex.yaml @@ -5,7 +5,7 @@ predictor: type: onnx path: predictor.py - model_path: s3://cortex-examples/onnx/yolov5-youtube/yolov5s.onnx + model_path: s3://cortex-examples/onnx/yolov5-youtube/ config: iou_threshold: 0.5 confidence_threshold: 0.6 diff --git a/examples/onnx/yolov5-youtube/predictor.py b/examples/onnx/yolov5-youtube/predictor.py index 5f8d53ac60..786d396b4f 100644 --- a/examples/onnx/yolov5-youtube/predictor.py +++ b/examples/onnx/yolov5-youtube/predictor.py @@ -16,8 +16,8 @@ class ONNXPredictor: def __init__(self, onnx_client, config): self.client = onnx_client # Get the input shape from the ONNX runtime - (signature,) = onnx_client.input_signatures.values() - _, _, height, width = signature["images"]["shape"] + (signature,) = onnx_client.get_model()["input_signatures"].values() + _, _, height, width = signature["shape"] self.input_size = (width, height) self.config = config with open("labels.json") as buf: diff --git a/examples/sklearn/mpg-estimator/cortex.yaml b/examples/sklearn/mpg-estimator/cortex.yaml index 3161d691d1..78a39277c3 100644 --- a/examples/sklearn/mpg-estimator/cortex.yaml +++ b/examples/sklearn/mpg-estimator/cortex.yaml @@ -6,6 +6,6 @@ type: python path: predictor.py config: - model: s3://cortex-examples/sklearn/mpg-estimator/linreg + model: s3://cortex-examples/sklearn/mpg-estimator/linreg/ monitoring: model_type: regression diff --git a/examples/sklearn/mpg-estimator/requirements.txt b/examples/sklearn/mpg-estimator/requirements.txt index 99c556c375..cbcad6b321 100644 --- a/examples/sklearn/mpg-estimator/requirements.txt +++ b/examples/sklearn/mpg-estimator/requirements.txt @@ -1,4 +1,4 @@ mlflow pandas numpy -scikit-learn +scikit-learn==0.21.3 diff --git a/examples/tensorflow/image-classifier-inception/cortex.yaml b/examples/tensorflow/image-classifier-inception/cortex.yaml index cc982ade94..4044bfb57f 100644 --- a/examples/tensorflow/image-classifier-inception/cortex.yaml +++ b/examples/tensorflow/image-classifier-inception/cortex.yaml @@ -5,7 +5,7 @@ predictor: type: tensorflow path: predictor.py - model_path: s3://cortex-examples/tensorflow/image-classifier/inception + model_path: s3://cortex-examples/tensorflow/image-classifier/inception/ monitoring: model_type: classification compute: diff --git a/examples/tensorflow/image-classifier-inception/cortex_server_side_batching.yaml b/examples/tensorflow/image-classifier-inception/cortex_server_side_batching.yaml index a8e567eaf0..5337fdbe99 100644 --- 
a/examples/tensorflow/image-classifier-inception/cortex_server_side_batching.yaml +++ b/examples/tensorflow/image-classifier-inception/cortex_server_side_batching.yaml @@ -5,7 +5,7 @@ predictor: type: tensorflow path: predictor.py - model_path: s3://cortex-examples/tensorflow/image-classifier/inception + model_path: s3://cortex-examples/tensorflow/image-classifier/inception/ server_side_batching: max_batch_size: 2 batch_interval: 0.2s diff --git a/examples/tensorflow/image-classifier-resnet50/cortex.yaml b/examples/tensorflow/image-classifier-resnet50/cortex.yaml index a7a27355cb..fc2e1b6820 100644 --- a/examples/tensorflow/image-classifier-resnet50/cortex.yaml +++ b/examples/tensorflow/image-classifier-resnet50/cortex.yaml @@ -5,7 +5,7 @@ predictor: type: tensorflow path: predictor.py - model_path: s3://cortex-examples/tensorflow/resnet50 + model_path: s3://cortex-examples/tensorflow/resnet50/ processes_per_replica: 4 threads_per_process: 16 config: diff --git a/examples/tensorflow/image-classifier-resnet50/cortex_gpu.yaml b/examples/tensorflow/image-classifier-resnet50/cortex_gpu.yaml index b4d937e974..f3898f2041 100644 --- a/examples/tensorflow/image-classifier-resnet50/cortex_gpu.yaml +++ b/examples/tensorflow/image-classifier-resnet50/cortex_gpu.yaml @@ -5,7 +5,7 @@ predictor: type: tensorflow path: predictor.py - model_path: s3://cortex-examples/tensorflow/resnet50 + model_path: s3://cortex-examples/tensorflow/resnet50/ processes_per_replica: 4 threads_per_process: 24 config: diff --git a/examples/tensorflow/image-classifier-resnet50/cortex_gpu_server_side_batching.yaml b/examples/tensorflow/image-classifier-resnet50/cortex_gpu_server_side_batching.yaml index 1c11f02770..4c259a126f 100644 --- a/examples/tensorflow/image-classifier-resnet50/cortex_gpu_server_side_batching.yaml +++ b/examples/tensorflow/image-classifier-resnet50/cortex_gpu_server_side_batching.yaml @@ -5,7 +5,7 @@ predictor: type: tensorflow path: predictor.py - model_path: s3://cortex-examples/tensorflow/resnet50 + model_path: s3://cortex-examples/tensorflow/resnet50/ server_side_batching: max_batch_size: 32 batch_interval: 0.1s diff --git a/examples/tensorflow/image-classifier-resnet50/cortex_inf.yaml b/examples/tensorflow/image-classifier-resnet50/cortex_inf.yaml index 34f80a77e6..43d85083be 100644 --- a/examples/tensorflow/image-classifier-resnet50/cortex_inf.yaml +++ b/examples/tensorflow/image-classifier-resnet50/cortex_inf.yaml @@ -5,7 +5,7 @@ predictor: type: tensorflow path: predictor.py - model_path: s3://cortex-examples/tensorflow/resnet50_neuron + model_path: s3://cortex-examples/tensorflow/resnet50_neuron/ processes_per_replica: 4 threads_per_process: 256 config: diff --git a/examples/tensorflow/image-classifier-resnet50/cortex_inf_server_side_batching.yaml b/examples/tensorflow/image-classifier-resnet50/cortex_inf_server_side_batching.yaml index 810e791481..7a73539957 100644 --- a/examples/tensorflow/image-classifier-resnet50/cortex_inf_server_side_batching.yaml +++ b/examples/tensorflow/image-classifier-resnet50/cortex_inf_server_side_batching.yaml @@ -5,7 +5,7 @@ predictor: type: tensorflow path: predictor.py - model_path: s3://cortex-examples/tensorflow/resnet50_neuron_batch_size_5 + model_path: s3://cortex-examples/tensorflow/resnet50_neuron_batch_size_5/ server_side_batching: max_batch_size: 5 batch_interval: 0.1s diff --git a/examples/tensorflow/iris-classifier/cortex.yaml b/examples/tensorflow/iris-classifier/cortex.yaml index 4ea55deaad..3d9f429def 100644 --- 
a/examples/tensorflow/iris-classifier/cortex.yaml +++ b/examples/tensorflow/iris-classifier/cortex.yaml @@ -5,6 +5,6 @@ predictor: type: tensorflow path: predictor.py - model_path: s3://cortex-examples/tensorflow/iris-classifier/nn + model_path: s3://cortex-examples/tensorflow/iris-classifier/nn/ monitoring: model_type: classification diff --git a/examples/tensorflow/license-plate-reader/cortex_full.yaml b/examples/tensorflow/license-plate-reader/cortex_full.yaml index 6ec82f5ee9..923e16a3a1 100644 --- a/examples/tensorflow/license-plate-reader/cortex_full.yaml +++ b/examples/tensorflow/license-plate-reader/cortex_full.yaml @@ -5,7 +5,7 @@ predictor: type: tensorflow path: predictor_yolo.py - model_path: s3://cortex-examples/tensorflow/license-plate-reader/yolov3_tf + model_path: s3://cortex-examples/tensorflow/license-plate-reader/yolov3_tf/ processes_per_replica: 4 threads_per_process: 3 signature_key: serving_default diff --git a/examples/tensorflow/multi-model-classifier/cortex.yaml b/examples/tensorflow/multi-model-classifier/cortex.yaml index bf755cbbdb..9c8ead69a5 100644 --- a/examples/tensorflow/multi-model-classifier/cortex.yaml +++ b/examples/tensorflow/multi-model-classifier/cortex.yaml @@ -6,12 +6,13 @@ type: tensorflow path: predictor.py models: - - name: inception - model_path: s3://cortex-examples/tensorflow/image-classifier/inception - - name: iris - model_path: s3://cortex-examples/tensorflow/iris-classifier/nn - - name: resnet50 - model_path: s3://cortex-examples/tensorflow/resnet50 + paths: + - name: inception + model_path: s3://cortex-examples/tensorflow/image-classifier/inception/ + - name: iris + model_path: s3://cortex-examples/tensorflow/iris-classifier/nn/ + - name: resnet50 + model_path: s3://cortex-examples/tensorflow/resnet50/ config: models: iris: diff --git a/examples/tensorflow/sentiment-analyzer/cortex.yaml b/examples/tensorflow/sentiment-analyzer/cortex.yaml index ad31185908..091da4b5d4 100644 --- a/examples/tensorflow/sentiment-analyzer/cortex.yaml +++ b/examples/tensorflow/sentiment-analyzer/cortex.yaml @@ -5,7 +5,7 @@ predictor: type: tensorflow path: predictor.py - model_path: s3://cortex-examples/tensorflow/sentiment-analyzer/bert + model_path: s3://cortex-examples/tensorflow/sentiment-analyzer/bert/ monitoring: model_type: classification compute: diff --git a/examples/tensorflow/text-generator/cortex.yaml b/examples/tensorflow/text-generator/cortex.yaml index af088d8e63..5117a2dad9 100644 --- a/examples/tensorflow/text-generator/cortex.yaml +++ b/examples/tensorflow/text-generator/cortex.yaml @@ -5,7 +5,7 @@ predictor: type: tensorflow path: predictor.py - model_path: s3://cortex-examples/tensorflow/text-generator/gpt-2/124M + model_path: s3://cortex-examples/tensorflow/text-generator/gpt-2/124M/ compute: cpu: 1 gpu: 1 diff --git a/examples/traffic-splitter/cortex.yaml b/examples/traffic-splitter/cortex.yaml index 00d3474d3e..de76cedc0d 100644 --- a/examples/traffic-splitter/cortex.yaml +++ b/examples/traffic-splitter/cortex.yaml @@ -5,7 +5,7 @@ predictor: type: onnx path: onnx_predictor.py - model_path: s3://cortex-examples/onnx/iris-classifier/gbtree.onnx + model_path: s3://cortex-examples/onnx/iris-classifier/ monitoring: model_type: classification @@ -14,7 +14,7 @@ predictor: type: tensorflow path: tensorflow_predictor.py - model_path: s3://cortex-examples/tensorflow/iris-classifier/nn + model_path: s3://cortex-examples/tensorflow/iris-classifier/nn/ monitoring: model_type: classification diff --git a/images/tensorflow-serving-inf/template.conf 
b/images/tensorflow-serving-inf/template.conf index 544f2c406f..0074549155 100644 --- a/images/tensorflow-serving-inf/template.conf +++ b/images/tensorflow-serving-inf/template.conf @@ -15,7 +15,7 @@ # limitations under the License. [program:tensorflow-$process] -command=tensorflow_model_server_neuron --port=$port --model_config_file=$TF_EMPTY_MODEL_CONFIG $TF_EXTRA_CMD_ARGS +command=tensorflow_model_server_neuron --port=$port --model_config_file=$TF_EMPTY_MODEL_CONFIG --max_num_load_retries=$TF_MAX_NUM_LOAD_RETRIES --load_retry_interval_micros=$TF_LOAD_RETRY_INTERVAL_MICROS --grpc_channel_arguments=$TF_GRPC_MAX_CONCURRENT_STREAMS $TF_EXTRA_CMD_ARGS stdout_logfile=/dev/fd/1 stdout_logfile_maxbytes=0 redirect_stderr=true diff --git a/pkg/lib/aws/s3.go b/pkg/lib/aws/s3.go index 5a3a1f91e0..79a5ada5a9 100644 --- a/pkg/lib/aws/s3.go +++ b/pkg/lib/aws/s3.go @@ -37,6 +37,7 @@ import ( "github.com/cortexlabs/cortex/pkg/lib/msgpack" "github.com/cortexlabs/cortex/pkg/lib/pointer" "github.com/cortexlabs/cortex/pkg/lib/sets/strset" + "github.com/cortexlabs/cortex/pkg/lib/slices" s "github.com/cortexlabs/cortex/pkg/lib/strings" ) @@ -126,6 +127,45 @@ func IsValidS3aPath(s3aPath string) bool { return true } +// List all S3 objects that are "depth" levels or deeper than the given "s3Path". +// Setting depth to 1 effectively translates to listing the objects one level or deeper than the given prefix (aka listing the directory contents). +// +// 1st returned value is the list of paths found at level . +// 2nd returned value is the list of paths found at all levels. +func (c *Client) GetNLevelsDeepFromS3Path(s3Path string, depth int, includeDirObjects bool, maxResults *int64) ([]string, []string, error) { + paths := strset.New() + + _, key, err := SplitS3Path(s3Path) + if err != nil { + return nil, nil, err + } + + allS3Objects, err := c.ListS3PathDir(s3Path, includeDirObjects, maxResults) + if err != nil { + return nil, nil, err + } + allPaths := ConvertS3ObjectsToKeys(allS3Objects...) + + keySplit := slices.RemoveEmpties(strings.Split(key, "/")) + for _, path := range allPaths { + pathSplit := slices.RemoveEmpties(strings.Split(path, "/")) + if len(pathSplit)-len(keySplit) >= depth { + computedPath := strings.Join(pathSplit[:len(keySplit)+depth], "/") + paths.Add(computedPath) + } + } + + return paths.Slice(), allPaths, nil +} + +func ConvertS3ObjectsToKeys(s3Objects ...*s3.Object) []string { + paths := make([]string, 0, len(s3Objects)) + for _, object := range s3Objects { + paths = append(paths, *object.Key) + } + return paths +} + func GetBucketRegionFromS3Path(s3Path string) (string, error) { bucket, _, err := SplitS3Path(s3Path) if err != nil { diff --git a/pkg/lib/files/files.go b/pkg/lib/files/files.go index 987f178e3a..9872208a06 100644 --- a/pkg/lib/files/files.go +++ b/pkg/lib/files/files.go @@ -26,6 +26,7 @@ import ( "net/http" "os" "os/exec" + "path" "path/filepath" "sort" "strings" @@ -33,6 +34,7 @@ import ( "github.com/cortexlabs/cortex/pkg/lib/errors" "github.com/cortexlabs/cortex/pkg/lib/prompt" "github.com/cortexlabs/cortex/pkg/lib/sets/strset" + "github.com/cortexlabs/cortex/pkg/lib/slices" s "github.com/cortexlabs/cortex/pkg/lib/strings" "github.com/denormal/go-gitignore" "github.com/mitchellh/go-homedir" @@ -628,6 +630,67 @@ func ErrorOnProjectSizeLimit(maxProjectSizeBytes int64) IgnoreFn { } } +// Retrieves the longest common path given a list of paths. +func LongestCommonPath(paths ...string) string { + + // Handle special cases. 
+ switch len(paths) { + case 0: + return "" + case 1: + return path.Clean(paths[0]) + } + + startsWithSlash := false + allStartWithSlash := true + + var splitPaths [][]string + shortestPathLength := -1 + for _, path := range paths { + if strings.HasPrefix(path, "/") { + startsWithSlash = true + } else { + allStartWithSlash = false + } + + splitPath := slices.RemoveEmpties(strings.Split(path, "/")) + splitPaths = append(splitPaths, splitPath) + + if len(splitPath) < shortestPathLength || shortestPathLength == -1 { + shortestPathLength = len(splitPath) + } + } + + commonPath := "" + numPaths := len(splitPaths) + + for level := 0; level < shortestPathLength; level++ { + element := splitPaths[0][level] + counter := 1 + for _, splitPath := range splitPaths[1:] { + if splitPath[level] != element { + break + } + counter++ + } + + if counter != numPaths { + break + } + + commonPath = filepath.Join(commonPath, element) + } + if commonPath != "" && startsWithSlash { + commonPath = s.EnsurePrefix(commonPath, "/") + commonPath = s.EnsureSuffix(commonPath, "/") + } + if commonPath == "" && allStartWithSlash { + return "/" + } + + return commonPath +} + type DirsOrder string var DirsSorted DirsOrder = "sorted" @@ -679,7 +742,7 @@ func FileTree(paths []string, cwd string, dirsOrder DirsOrder) string { dirPaths = DirPaths(paths, true) } - commonPrefix := s.LongestCommonPrefix(dirPaths...) + commonPrefix := LongestCommonPath(dirPaths...) paths, _ = s.TrimPrefixIfPresentInAll(paths, commonPrefix) var header string @@ -689,9 +752,11 @@ func FileTree(paths []string, cwd string, dirsOrder DirsOrder) string { } else if !didTrimCwd && commonPrefix == "" { header = "" } else if didTrimCwd && commonPrefix != "" { - header = "./" + commonPrefix + "\n" + header = "./" + commonPrefix + header = s.EnsureSingleOccurrenceCharSuffix(header, "/") + "\n" } else if !didTrimCwd && commonPrefix != "" { - header = commonPrefix + "\n" + header = commonPrefix + "/" + header = s.EnsureSingleOccurrenceCharSuffix(header, "/") + "\n" } tree := treeprint.New() diff --git a/pkg/lib/slices/float32.go b/pkg/lib/slices/float32.go index 17473e079b..d39faf8861 100644 --- a/pkg/lib/slices/float32.go +++ b/pkg/lib/slices/float32.go @@ -16,6 +16,10 @@ limitations under the License. package slices +import ( + s "github.com/cortexlabs/cortex/pkg/lib/strings" +) + func HasFloat32(list []float32, query float32) bool { for _, elem := range list { if elem == query { @@ -28,3 +32,11 @@ func HasFloat32(list []float32, query float32) bool { func CopyFloat32s(vals []float32) []float32 { return append(vals[:0:0], vals...) } + +func Float32ToString(vals []float32) []string { + stringSlice := []string{} + for _, elem := range vals { + stringSlice = append(stringSlice, s.Float32(elem)) + } + return stringSlice +} diff --git a/pkg/lib/slices/float64.go b/pkg/lib/slices/float64.go index 4353ec83c8..ee9595f08a 100644 --- a/pkg/lib/slices/float64.go +++ b/pkg/lib/slices/float64.go @@ -16,6 +16,10 @@ limitations under the License. package slices +import ( + s "github.com/cortexlabs/cortex/pkg/lib/strings" +) + func HasFloat64(list []float64, query float64) bool { for _, elem := range list { if elem == query { @@ -28,3 +32,11 @@ func HasFloat64(list []float64, query float64) bool { func CopyFloat64s(vals []float64) []float64 { return append(vals[:0:0], vals...) 
} + +func Float64ToString(vals []float64) []string { + stringSlice := []string{} + for _, elem := range vals { + stringSlice = append(stringSlice, s.Float64(elem)) + } + return stringSlice +} diff --git a/pkg/lib/slices/int.go b/pkg/lib/slices/int.go index 7c2bc414c3..8fd6b9c20d 100644 --- a/pkg/lib/slices/int.go +++ b/pkg/lib/slices/int.go @@ -16,6 +16,10 @@ limitations under the License. package slices +import ( + s "github.com/cortexlabs/cortex/pkg/lib/strings" +) + func HasInt(list []int, query int) bool { for _, elem := range list { if elem == query { @@ -42,3 +46,11 @@ func AreNGreaterThanZero(minCount int, val int, vals ...int) bool { } return false } + +func IntToString(vals []int) []string { + stringSlice := []string{} + for _, elem := range vals { + stringSlice = append(stringSlice, s.Int(elem)) + } + return stringSlice +} diff --git a/pkg/lib/slices/int32.go b/pkg/lib/slices/int32.go index d770896b3c..d91f69dcf3 100644 --- a/pkg/lib/slices/int32.go +++ b/pkg/lib/slices/int32.go @@ -16,6 +16,10 @@ limitations under the License. package slices +import ( + s "github.com/cortexlabs/cortex/pkg/lib/strings" +) + func HasInt32(list []int32, query int32) bool { for _, elem := range list { if elem == query { @@ -28,3 +32,11 @@ func HasInt32(list []int32, query int32) bool { func CopyInt32s(vals []int32) []int32 { return append(vals[:0:0], vals...) } + +func Int32ToString(vals []int32) []string { + stringSlice := []string{} + for _, elem := range vals { + stringSlice = append(stringSlice, s.Int32(elem)) + } + return stringSlice +} diff --git a/pkg/lib/slices/int64.go b/pkg/lib/slices/int64.go index 49710e862e..0a9a86096a 100644 --- a/pkg/lib/slices/int64.go +++ b/pkg/lib/slices/int64.go @@ -16,6 +16,12 @@ limitations under the License. package slices +import ( + "strconv" + + s "github.com/cortexlabs/cortex/pkg/lib/strings" +) + func HasInt64(list []int64, query int64) bool { for _, elem := range list { if elem == query { @@ -28,3 +34,35 @@ func HasInt64(list []int64, query int64) bool { func CopyInt64s(vals []int64) []int64 { return append(vals[:0:0], vals...) 
} + +func UniqueInt64(vals []int64) []int64 { + keys := make(map[int64]bool) + list := []int64{} + for _, entry := range vals { + if _, value := keys[entry]; !value { + keys[entry] = true + list = append(list, entry) + } + } + return list +} + +func Int64ToString(vals []int64) []string { + stringSlice := []string{} + for _, elem := range vals { + stringSlice = append(stringSlice, s.Int64(elem)) + } + return stringSlice +} + +func StringToInt64(vals []string) ([]int64, error) { + int64Slice := []int64{} + for _, elem := range vals { + int64Val, err := strconv.ParseInt(elem, 10, 64) + if err != nil { + return []int64{}, err + } + int64Slice = append(int64Slice, int64Val) + } + return int64Slice, nil +} diff --git a/pkg/lib/strings/operations.go b/pkg/lib/strings/operations.go index 1d0b22d299..e2f8086041 100644 --- a/pkg/lib/strings/operations.go +++ b/pkg/lib/strings/operations.go @@ -27,6 +27,14 @@ func ToTitle(str string) string { return strings.Title(strings.ToLower(str)) } +func EnsureSingleOccurrenceCharPrefix(str string, character string) string { + return character + strings.TrimLeft(str, character) +} + +func EnsureSingleOccurrenceCharSuffix(str string, character string) string { + return strings.TrimRight(str, character) + character +} + func EnsurePrefix(str string, prefix string) string { if prefix != "" && !strings.HasPrefix(str, prefix) { return prefix + str diff --git a/pkg/operator/operator/k8s.go b/pkg/operator/operator/k8s.go index f79ea2beb6..edb8d3d99d 100644 --- a/pkg/operator/operator/k8s.go +++ b/pkg/operator/operator/k8s.go @@ -59,6 +59,8 @@ const ( _tfBaseServingPortInt32, _tfBaseServingPortStr = int32(9000), "9000" _tfServingHost = "localhost" _tfServingEmptyModelConfig = "/etc/tfs/model_config_server.conf" + _tfServingMaxNumLoadRetries = "0" // maximum retries to load a model that didn't get loaded the first time + _tfServingLoadTimeMicros = "30000000" // 30 seconds (how much time a model can take to load into memory) _tfServingBatchConfig = "/etc/tfs/batch_config.conf" _apiReadinessFile = "/mnt/workspace/api_readiness.txt" _apiLivenessFile = "/mnt/workspace/api_liveness.txt" @@ -81,13 +83,13 @@ type downloadContainerConfig struct { } type downloadContainerArg struct { - From string `json:"from"` - To string `json:"to"` - Unzip bool `json:"unzip"` - ItemName string `json:"item_name"` // name of the item being downloaded, just for logging (if "" nothing will be logged) - TFModelVersionRename string `json:"tf_model_version_rename"` // e.g. 
passing in /mnt/model/1 will rename /mnt/model/* to /mnt/model/1 only if there is one item in /mnt/model/ - HideFromLog bool `json:"hide_from_log"` // if true, don't log where the file is being downloaded from - HideUnzippingLog bool `json:"hide_unzipping_log"` // if true, don't log when unzipping + From string `json:"from"` + To string `json:"to"` + ToFile bool `json:"to_file"` // whether "To" path reflects the path to a file or just the directory in which "From" object is copied to + Unzip bool `json:"unzip"` + ItemName string `json:"item_name"` // name of the item being downloaded, just for logging (if "" nothing will be logged) + HideFromLog bool `json:"hide_from_log"` // if true, don't log where the file is being downloaded from + HideUnzippingLog bool `json:"hide_unzipping_log"` // if true, don't log when unzipping } func InitContainer(api *spec.API) kcore.Container { @@ -414,6 +416,15 @@ func getEnvVars(api *spec.API, container string) []kcore.EnvVar { ) } + if api.Predictor.ModelPath != nil || api.Predictor.Models != nil { + envVars = append(envVars, + kcore.EnvVar{ + Name: "CORTEX_MODEL_DIR", + Value: path.Join(_emptyDirMountPath, "model"), + }, + ) + } + cortexPythonPath := path.Join(_emptyDirMountPath, "project") if api.Predictor.PythonPath != nil { cortexPythonPath = path.Join(_emptyDirMountPath, "project", *api.Predictor.PythonPath) @@ -423,29 +434,8 @@ func getEnvVars(api *spec.API, container string) []kcore.EnvVar { Value: cortexPythonPath, }) - if api.Predictor.Type == userconfig.ONNXPredictorType { - envVars = append(envVars, - kcore.EnvVar{ - Name: "CORTEX_MODEL_DIR", - Value: path.Join(_emptyDirMountPath, "model"), - }, - kcore.EnvVar{ - Name: "CORTEX_MODELS", - Value: strings.Join(api.ModelNames(), ","), - }, - ) - } - if api.Predictor.Type == userconfig.TensorFlowPredictorType { envVars = append(envVars, - kcore.EnvVar{ - Name: "CORTEX_MODEL_DIR", - Value: path.Join(_emptyDirMountPath, "model"), - }, - kcore.EnvVar{ - Name: "CORTEX_MODELS", - Value: strings.Join(api.ModelNames(), ","), - }, kcore.EnvVar{ Name: "CORTEX_TF_BASE_SERVING_PORT", Value: _tfBaseServingPortStr, @@ -511,14 +501,22 @@ func getEnvVars(api *spec.API, container string) []kcore.EnvVar { Name: "CORTEX_TF_BASE_SERVING_PORT", Value: _tfBaseServingPortStr, }, - kcore.EnvVar{ - Name: "CORTEX_MODEL_DIR", - Value: path.Join(_emptyDirMountPath, "model"), - }, kcore.EnvVar{ Name: "TF_EMPTY_MODEL_CONFIG", Value: _tfServingEmptyModelConfig, }, + kcore.EnvVar{ + Name: "TF_MAX_NUM_LOAD_RETRIES", + Value: _tfServingMaxNumLoadRetries, + }, + kcore.EnvVar{ + Name: "TF_LOAD_RETRY_INTERVAL_MICROS", + Value: _tfServingLoadTimeMicros, + }, + kcore.EnvVar{ + Name: "TF_GRPC_MAX_CONCURRENT_STREAMS", + Value: fmt.Sprintf(`--grpc_channel_arguments="grpc.max_concurrent_streams=%d"`, api.Predictor.ThreadsPerProcess+10), + }, ) } if container == APIContainerName { @@ -554,23 +552,6 @@ func tfDownloadArgs(api *spec.API) string { }, } - rootModelPath := path.Join(_emptyDirMountPath, "model") - for _, model := range api.Predictor.Models { - var itemName string - if model.Name == consts.SingleModelName { - itemName = "the model" - } else { - itemName = fmt.Sprintf("model %s", model.Name) - } - downloadConfig.DownloadArgs = append(downloadConfig.DownloadArgs, downloadContainerArg{ - From: model.ModelPath, - To: path.Join(rootModelPath, model.Name), - Unzip: strings.HasSuffix(model.ModelPath, ".zip"), - ItemName: itemName, - TFModelVersionRename: path.Join(rootModelPath, model.Name, "1"), - }) - } - downloadArgsBytes, _ := 
json.Marshal(downloadConfig) return base64.URLEncoding.EncodeToString(downloadArgsBytes) } @@ -609,21 +590,6 @@ func onnxDownloadArgs(api *spec.API) string { }, } - rootModelPath := path.Join(_emptyDirMountPath, "model") - for _, model := range api.Predictor.Models { - var itemName string - if model.Name == consts.SingleModelName { - itemName = "the model" - } else { - itemName = fmt.Sprintf("model %s", model.Name) - } - downloadConfig.DownloadArgs = append(downloadConfig.DownloadArgs, downloadContainerArg{ - From: model.ModelPath, - To: path.Join(rootModelPath, model.Name), - ItemName: itemName, - }) - } - downloadArgsBytes, _ := json.Marshal(downloadConfig) return base64.URLEncoding.EncodeToString(downloadArgsBytes) } @@ -650,6 +616,9 @@ func tensorflowServingContainer(api *spec.API, volumeMounts []kcore.VolumeMount, cmdArgs = []string{ "--port=" + _tfBaseServingPortStr, "--model_config_file=" + _tfServingEmptyModelConfig, + "--max_num_load_retries=" + _tfServingMaxNumLoadRetries, + "--load_retry_interval_micros=" + _tfServingLoadTimeMicros, + fmt.Sprintf(`--grpc_channel_arguments="grpc.max_concurrent_streams=%d"`, api.Predictor.ProcessesPerReplica*api.Predictor.ThreadsPerProcess+10), } if api.Predictor.ServerSideBatching != nil { cmdArgs = append(cmdArgs, diff --git a/pkg/operator/resources/batchapi/api.go b/pkg/operator/resources/batchapi/api.go index 0c695fb18a..5065c056ed 100644 --- a/pkg/operator/resources/batchapi/api.go +++ b/pkg/operator/resources/batchapi/api.go @@ -36,13 +36,13 @@ import ( klabels "k8s.io/apimachinery/pkg/labels" ) -func UpdateAPI(apiConfig *userconfig.API, projectID string) (*spec.API, string, error) { +func UpdateAPI(apiConfig *userconfig.API, models []spec.CuratedModelResource, projectID string) (*spec.API, string, error) { prevVirtualService, err := config.K8s.GetVirtualService(operator.K8sName(apiConfig.Name)) if err != nil { return nil, "", err } - api := spec.GetAPISpec(apiConfig, projectID, "", config.Cluster.ClusterName) // Deployment ID not needed for BatchAPI spec + api := spec.GetAPISpec(apiConfig, models, projectID, "", config.Cluster.ClusterName) // Deployment ID not needed for BatchAPI spec if prevVirtualService == nil { if err := config.AWS.UploadJSONToS3(api, config.Cluster.Bucket, api.Key); err != nil { diff --git a/pkg/operator/resources/realtimeapi/api.go b/pkg/operator/resources/realtimeapi/api.go index bf0b960c92..5a84c00960 100644 --- a/pkg/operator/resources/realtimeapi/api.go +++ b/pkg/operator/resources/realtimeapi/api.go @@ -41,7 +41,7 @@ func deploymentID() string { return k8s.RandomName()[:10] } -func UpdateAPI(apiConfig *userconfig.API, projectID string, force bool) (*spec.API, string, error) { +func UpdateAPI(apiConfig *userconfig.API, models []spec.CuratedModelResource, projectID string, force bool) (*spec.API, string, error) { prevDeployment, prevService, prevVirtualService, err := getK8sResources(apiConfig) if err != nil { return nil, "", err @@ -52,7 +52,7 @@ func UpdateAPI(apiConfig *userconfig.API, projectID string, force bool) (*spec.A deploymentID = prevDeployment.Labels["deploymentID"] } - api := spec.GetAPISpec(apiConfig, projectID, deploymentID, config.Cluster.ClusterName) + api := spec.GetAPISpec(apiConfig, models, projectID, deploymentID, config.Cluster.ClusterName) if prevDeployment == nil { if err := config.AWS.UploadJSONToS3(api, config.Cluster.Bucket, api.Key); err != nil { @@ -152,7 +152,7 @@ func RefreshAPI(apiName string, force bool) (string, error) { return "", err } - api = spec.GetAPISpec(api.API, 
api.ProjectID, deploymentID(), config.Cluster.ClusterName) + api = spec.GetAPISpec(api.API, api.CuratedModelResources, api.ProjectID, deploymentID(), config.Cluster.ClusterName) if err := config.AWS.UploadJSONToS3(api, config.Cluster.Bucket, api.Key); err != nil { return "", errors.Wrap(err, "upload api spec") diff --git a/pkg/operator/resources/resources.go b/pkg/operator/resources/resources.go index c22cc80eb9..fddebe01fa 100644 --- a/pkg/operator/resources/resources.go +++ b/pkg/operator/resources/resources.go @@ -91,7 +91,8 @@ func Deploy(projectBytes []byte, configFileName string, configBytes []byte, forc return nil, err } - err = ValidateClusterAPIs(apiConfigs, projectFiles) + models := []spec.CuratedModelResource{} + err = ValidateClusterAPIs(apiConfigs, &models, projectFiles) if err != nil { err = errors.Append(err, fmt.Sprintf("\n\napi configuration schema can be found here:\n → Realtime API: https://docs.cortex.dev/v/%s/deployments/realtime-api/api-configuration\n → Batch API: https://docs.cortex.dev/v/%s/deployments/batch-api/api-configuration\n → Traffic Splitter: https://docs.cortex.dev/v/%s/deployments/realtime-api/traffic-splitter", consts.CortexVersionMinor, consts.CortexVersionMinor, consts.CortexVersionMinor)) return nil, err @@ -113,7 +114,7 @@ func Deploy(projectBytes []byte, configFileName string, configBytes []byte, forc results := make([]schema.DeployResult, 0, len(apiConfigs)) for i := range apiConfigs { apiConfig := apiConfigs[i] - api, msg, err := UpdateAPI(&apiConfig, projectID, force) + api, msg, err := UpdateAPI(&apiConfig, models, projectID, force) result := schema.DeployResult{ Message: msg, @@ -130,7 +131,7 @@ func Deploy(projectBytes []byte, configFileName string, configBytes []byte, forc return results, nil } -func UpdateAPI(apiConfig *userconfig.API, projectID string, force bool) (*schema.APIResponse, string, error) { +func UpdateAPI(apiConfig *userconfig.API, models []spec.CuratedModelResource, projectID string, force bool) (*schema.APIResponse, string, error) { deployedResource, err := GetDeployedResourceByNameOrNil(apiConfig.Name) if err != nil { return nil, "", err @@ -144,9 +145,9 @@ func UpdateAPI(apiConfig *userconfig.API, projectID string, force bool) (*schema var msg string switch apiConfig.Kind { case userconfig.RealtimeAPIKind: - api, msg, err = realtimeapi.UpdateAPI(apiConfig, projectID, force) + api, msg, err = realtimeapi.UpdateAPI(apiConfig, models, projectID, force) case userconfig.BatchAPIKind: - api, msg, err = batchapi.UpdateAPI(apiConfig, projectID) + api, msg, err = batchapi.UpdateAPI(apiConfig, models, projectID) case userconfig.TrafficSplitterKind: api, msg, err = trafficsplitter.UpdateAPI(apiConfig, force) default: diff --git a/pkg/operator/resources/trafficsplitter/api.go b/pkg/operator/resources/trafficsplitter/api.go index 276075a971..c768be2902 100644 --- a/pkg/operator/resources/trafficsplitter/api.go +++ b/pkg/operator/resources/trafficsplitter/api.go @@ -37,7 +37,7 @@ func UpdateAPI(apiConfig *userconfig.API, force bool) (*spec.API, string, error) return nil, "", err } - api := spec.GetAPISpec(apiConfig, "", "", config.Cluster.ClusterName) + api := spec.GetAPISpec(apiConfig, nil, "", "", config.Cluster.ClusterName) if prevVirtualService == nil { if err := config.AWS.UploadJSONToS3(api, config.Cluster.Bucket, api.Key); err != nil { return nil, "", errors.Wrap(err, "upload api spec") diff --git a/pkg/operator/resources/validations.go b/pkg/operator/resources/validations.go index 6ff3052e25..cd56907d9d 100644 --- 
a/pkg/operator/resources/validations.go +++ b/pkg/operator/resources/validations.go @@ -77,7 +77,7 @@ func (projectFiles ProjectFiles) ProjectDir() string { return "./" } -func ValidateClusterAPIs(apis []userconfig.API, projectFiles spec.ProjectFiles) error { +func ValidateClusterAPIs(apis []userconfig.API, models *[]spec.CuratedModelResource, projectFiles spec.ProjectFiles) error { if len(apis) == 0 { return spec.ErrorNoAPIs() } @@ -102,7 +102,7 @@ func ValidateClusterAPIs(apis []userconfig.API, projectFiles spec.ProjectFiles) for i := range apis { api := &apis[i] if api.Kind == userconfig.RealtimeAPIKind || api.Kind == userconfig.BatchAPIKind { - if err := spec.ValidateAPI(api, projectFiles, types.AWSProviderType, config.AWS, config.K8s); err != nil { + if err := spec.ValidateAPI(api, models, projectFiles, types.AWSProviderType, config.AWS, config.K8s); err != nil { return errors.Wrap(err, api.Identify()) } if err := validateK8s(api, virtualServices, maxMem); err != nil { diff --git a/pkg/operator/schema/schema.go b/pkg/operator/schema/schema.go index 406cc8812b..424878bbfb 100644 --- a/pkg/operator/schema/schema.go +++ b/pkg/operator/schema/schema.go @@ -76,14 +76,30 @@ type ErrorResponse struct { Message string `json:"message"` } +type APITFLiveReloadingSummary struct { + Message string `json:"message"` + ModelMetadata map[string]TFModelIDMetadata `json:"model_metadata"` +} + +type TFModelIDMetadata struct { + DiskPath string `json:"disk_path"` + SignatureKey string `json:"signature_key"` + InputSignatures map[string]InputSignature `json:"input_signatures"` + Timestamp int64 `json:"timestamp"` + SignatureDef map[string]interface{} `json:"signature_def"` +} + type InputSignature struct { Shape []interface{} `json:"shape"` Type string `json:"type"` } -type InputSignatures map[string]InputSignature +type APIModelSummary struct { + Message string `json:"message"` + ModelMetadata map[string]GenericModelMetadata `json:"model_metadata"` +} -type APISummary struct { - Message string `json:"message"` - ModelSignatures map[string]InputSignatures `json:"model_signatures"` +type GenericModelMetadata struct { + Versions []string `json:"versions"` + Timestamps []int64 `json:"timestamps"` } diff --git a/pkg/types/spec/api.go b/pkg/types/spec/api.go index 68bbed2226..0d5ce253d1 100644 --- a/pkg/types/spec/api.go +++ b/pkg/types/spec/api.go @@ -31,18 +31,19 @@ import ( type API struct { *userconfig.API - ID string `json:"id"` - SpecID string `json:"spec_id"` - PredictorID string `json:"predictor_id"` - DeploymentID string `json:"deployment_id"` - Key string `json:"key"` - PredictorKey string `json:"predictor_key"` - LastUpdated int64 `json:"last_updated"` - MetadataRoot string `json:"metadata_root"` - ProjectID string `json:"project_id"` - ProjectKey string `json:"project_key"` - LocalModelCaches []*LocalModelCache `json:"local_model_cache"` // local only - LocalProjectDir string `json:"local_project_dir"` + ID string `json:"id"` + SpecID string `json:"spec_id"` + PredictorID string `json:"predictor_id"` + DeploymentID string `json:"deployment_id"` + Key string `json:"key"` + PredictorKey string `json:"predictor_key"` + LastUpdated int64 `json:"last_updated"` + MetadataRoot string `json:"metadata_root"` + ProjectID string `json:"project_id"` + ProjectKey string `json:"project_key"` + CuratedModelResources []CuratedModelResource `json:"curated_model_resources"` + LocalModelCaches []*LocalModelCache `json:"local_model_cache"` // local only + LocalProjectDir string `json:"local_project_dir"` } type 
LocalModelCache struct { @@ -51,6 +52,12 @@ type LocalModelCache struct { TargetPath string `json:"target_path"` } +type CuratedModelResource struct { + *userconfig.ModelResource + S3Path bool `json:"s3_path"` + Versions []int64 `json:"versions"` +} + /* APIID (uniquely identifies an api configuration for a given deployment) * SpecID (uniquely identifies api configuration specified by user) @@ -66,7 +73,7 @@ APIID (uniquely identifies an api configuration for a given deployment) * APIs * DeploymentID (used for refreshing a deployment) */ -func GetAPISpec(apiConfig *userconfig.API, projectID string, deploymentID string, clusterName string) *API { +func GetAPISpec(apiConfig *userconfig.API, models []CuratedModelResource, projectID string, deploymentID string, clusterName string) *API { var buf bytes.Buffer buf.WriteString(s.Obj(apiConfig.Resource)) @@ -89,47 +96,78 @@ func GetAPISpec(apiConfig *userconfig.API, projectID string, deploymentID string apiID := fmt.Sprintf("%s-%s-%s", MonotonicallyDecreasingID(), deploymentID, specID) // should be up to 60 characters long return &API{ - API: apiConfig, - ID: apiID, - SpecID: specID, - PredictorID: predictorID, - Key: Key(apiConfig.Name, apiID, clusterName), - PredictorKey: PredictorKey(apiConfig.Name, predictorID, clusterName), - DeploymentID: deploymentID, - LastUpdated: time.Now().Unix(), - MetadataRoot: MetadataRoot(apiConfig.Name, clusterName), - ProjectID: projectID, - ProjectKey: ProjectKey(projectID, clusterName), + API: apiConfig, + CuratedModelResources: models, + ID: apiID, + SpecID: specID, + PredictorID: predictorID, + Key: Key(apiConfig.Name, apiID, clusterName), + PredictorKey: PredictorKey(apiConfig.Name, predictorID, clusterName), + DeploymentID: deploymentID, + LastUpdated: time.Now().Unix(), + MetadataRoot: MetadataRoot(apiConfig.Name, clusterName), + ProjectID: projectID, + ProjectKey: ProjectKey(projectID, clusterName), } } -// Keep track of models in the model cache used by this API (local only) -func (api *API) LocalModelIDs() []string { - models := []string{} - if api != nil && len(api.LocalModelCaches) > 0 { - for _, localModelCache := range api.LocalModelCaches { - models = append(models, localModelCache.ID) +func TotalLocalModelVersions(models []CuratedModelResource) int { + totalLocalModelVersions := 0 + for _, model := range models { + if model.S3Path { + continue + } + if len(model.Versions) > 0 { + totalLocalModelVersions += len(model.Versions) + } else { + totalLocalModelVersions++ + } + } + return totalLocalModelVersions +} + +func TotalModelVersions(models []CuratedModelResource) int { + totalModelVersions := 0 + for _, model := range models { + if len(model.Versions) > 0 { + totalModelVersions += len(model.Versions) + } else { + totalModelVersions++ } } + return totalModelVersions +} + +func (api *API) TotalLocalModelVersions() int { + return TotalLocalModelVersions(api.CuratedModelResources) +} + +func (api *API) TotalModelVersions() int { + return TotalModelVersions(api.CuratedModelResources) +} +// Keep track of models in the model cache used by this API (local only) +func (api *API) ModelIDs() []string { + models := []string{} + for _, localModelCache := range api.LocalModelCaches { + models = append(models, localModelCache.ID) + } return models } func (api *API) ModelNames() []string { names := []string{} - if api != nil && len(api.Predictor.Models) > 0 { - for _, model := range api.Predictor.Models { - names = append(names, model.Name) - } + for _, model := range api.CuratedModelResources { + names = 
append(names, model.Name) } return names } -func (api *API) SubtractLocalModelIDs(apis ...*API) []string { - modelIDs := strset.FromSlice(api.LocalModelIDs()) +func (api *API) SubtractModelIDs(apis ...*API) []string { + modelIDs := strset.FromSlice(api.ModelIDs()) for _, a := range apis { - modelIDs.Remove(a.LocalModelIDs()...) + modelIDs.Remove(a.ModelIDs()...) } return modelIDs.Slice() } diff --git a/pkg/types/spec/errors.go b/pkg/types/spec/errors.go index fe9cdb160f..5c2fabde86 100644 --- a/pkg/types/spec/errors.go +++ b/pkg/types/spec/errors.go @@ -22,6 +22,7 @@ import ( "github.com/cortexlabs/cortex/pkg/consts" "github.com/cortexlabs/cortex/pkg/lib/errors" + "github.com/cortexlabs/cortex/pkg/lib/files" libmath "github.com/cortexlabs/cortex/pkg/lib/math" "github.com/cortexlabs/cortex/pkg/lib/sets/strset" s "github.com/cortexlabs/cortex/pkg/lib/strings" @@ -30,31 +31,44 @@ import ( ) const ( - ErrMalformedConfig = "spec.malformed_config" - ErrNoAPIs = "spec.no_apis" - ErrDuplicateName = "spec.duplicate_name" - ErrDuplicateEndpointInOneDeploy = "spec.duplicate_endpoint_in_one_deploy" - ErrDuplicateEndpoint = "spec.duplicate_endpoint" - ErrConflictingFields = "spec.conflicting_fields" - ErrSpecifyAllOrNone = "spec.specify_all_or_none" - ErrOneOfPrerequisitesNotDefined = "spec.one_of_prerequisites_not_defined" - ErrConfigGreaterThanOtherConfig = "spec.config_greater_than_other_config" - ErrMinReplicasGreaterThanMax = "spec.min_replicas_greater_than_max" - ErrInitReplicasGreaterThanMax = "spec.init_replicas_greater_than_max" - ErrInitReplicasLessThanMin = "spec.init_replicas_less_than_min" - ErrInvalidSurgeOrUnavailable = "spec.invalid_surge_or_unavailable" - ErrSurgeAndUnavailableBothZero = "spec.surge_and_unavailable_both_zero" - ErrFileNotFound = "spec.file_not_found" - ErrDirIsEmpty = "spec.dir_is_empty" - ErrMustBeRelativeProjectPath = "spec.must_be_relative_project_path" - ErrPythonPathNotFound = "spec.python_path_not_found" - ErrS3FileNotFound = "spec.s3_file_not_found" - ErrInvalidTensorFlowDir = "spec.invalid_tensorflow_dir" - ErrInvalidNeuronTensorFlowDir = "operator.invalid_neuron_tensorflow_dir" - ErrInvalidTensorFlowModelPath = "spec.invalid_tensorflow_model_path" - ErrMissingModel = "spec.missing_model" - ErrInvalidONNXModelPath = "spec.invalid_onnx_model_path" - ErrDuplicateModelNames = "spec.duplicate_model_names" + ErrMalformedConfig = "spec.malformed_config" + ErrNoAPIs = "spec.no_apis" + ErrDuplicateName = "spec.duplicate_name" + ErrDuplicateEndpointInOneDeploy = "spec.duplicate_endpoint_in_one_deploy" + ErrDuplicateEndpoint = "spec.duplicate_endpoint" + ErrConflictingFields = "spec.conflicting_fields" + ErrSpecifyOneOrTheOther = "spec.specify_one_or_the_other" + ErrSpecifyAllOrNone = "spec.specify_all_or_none" + ErrOneOfPrerequisitesNotDefined = "spec.one_of_prerequisites_not_defined" + ErrConfigGreaterThanOtherConfig = "spec.config_greater_than_other_config" + + ErrMinReplicasGreaterThanMax = "spec.min_replicas_greater_than_max" + ErrInitReplicasGreaterThanMax = "spec.init_replicas_greater_than_max" + ErrInitReplicasLessThanMin = "spec.init_replicas_less_than_min" + + ErrInvalidSurgeOrUnavailable = "spec.invalid_surge_or_unavailable" + ErrSurgeAndUnavailableBothZero = "spec.surge_and_unavailable_both_zero" + + ErrModelCachingNotSupportedWhenMultiprocessingEnabled = "spec.model_caching_not_supported_when_multiprocessing_enabled" + + ErrFileNotFound = "spec.file_not_found" + ErrDirIsEmpty = "spec.dir_is_empty" + ErrMustBeRelativeProjectPath = 
"spec.must_be_relative_project_path" + ErrPythonPathNotFound = "spec.python_path_not_found" + + ErrS3FileNotFound = "spec.s3_file_not_found" + ErrS3DirNotFound = "spec.s3_dir_not_found" + ErrS3DirIsEmpty = "spec.s3_dir_is_empty" + + ErrModelPathNotDirectory = "spec.model_path_not_directory" + ErrInvalidPythonModelPath = "spec.invalid_python_model_path" + ErrInvalidTensorFlowModelPath = "spec.invalid_tensorflow_model_path" + ErrInvalidONNXModelPath = "spec.invalid_onnx_model_path" + + ErrMissingModel = "spec.missing_model" + ErrDuplicateModelNames = "spec.duplicate_model_names" + ErrReservedModelName = "spec.reserved_model_name" + ErrFieldMustBeDefinedForPredictorType = "spec.field_must_be_defined_for_predictor_type" ErrFieldNotSupportedByPredictorType = "spec.field_not_supported_by_predictor_type" ErrNoAvailableNodeComputeLimit = "spec.no_available_node_compute_limit" @@ -76,6 +90,11 @@ const ( ErrUnexpectedDockerSecretData = "spec.unexpected_docker_secret_data" ) +var _modelCurrentStructure = ` + but its current structure is + +%s` + func ErrorMalformedConfig() error { return errors.WithStack(&errors.Error{ Kind: ErrMalformedConfig, @@ -128,6 +147,13 @@ func ErrorConflictingFields(fieldKeyA, fieldKeyB string) error { }) } +func ErrorSpecifyOneOrTheOther(fieldKeyA, fieldKeyB string) error { + return errors.WithStack(&errors.Error{ + Kind: ErrSpecifyOneOrTheOther, + Message: fmt.Sprintf("please specify either the %s field or %s field (cannot be both empty at the same time)", fieldKeyA, fieldKeyB), + }) +} + func ErrorSpecifyAllOrNone(val string, vals ...string) error { allVals := append([]string{val}, vals...) message := fmt.Sprintf("please specify all or none of %s", s.UserStrsAnd(allVals)) @@ -193,6 +219,15 @@ func ErrorSurgeAndUnavailableBothZero() error { }) } +func ErrorModelCachingNotSupportedWhenMultiprocessingEnabled(desiredProcesses int32) error { + const maxNumProcesses int32 = 1 + return errors.WithStack(&errors.Error{ + Kind: ErrModelCachingNotSupportedWhenMultiprocessingEnabled, + Message: fmt.Sprintf("when dynamic model caching is enabled (%s < provided models), the max value %s can take is %d, while currently it's set to %d", + userconfig.ModelsCacheSizeKey, userconfig.ProcessesPerReplicaKey, maxNumProcesses, desiredProcesses), + }) +} + func ErrorFileNotFound(path string) error { return errors.WithStack(&errors.Error{ Kind: ErrFileNotFound, @@ -224,45 +259,174 @@ func ErrorPythonPathNotFound(pythonPath string) error { func ErrorS3FileNotFound(path string) error { return errors.WithStack(&errors.Error{ Kind: ErrS3FileNotFound, - Message: fmt.Sprintf("%s: not found or insufficient permissions", path), + Message: fmt.Sprintf("%s: file not found or insufficient permissions", path), + }) +} + +func ErrorS3DirNotFound(path string) error { + return errors.WithStack(&errors.Error{ + Kind: ErrS3DirNotFound, + Message: fmt.Sprintf("%s: dir not found or insufficient permissions", path), }) } -var _tfExpectedStructMessage = `For TensorFlow models, the path must contain a directory with the following structure: - 1523423423/ (Version prefix, usually a timestamp) +func ErrorS3DirIsEmpty(path string) error { + return errors.WithStack(&errors.Error{ + Kind: ErrS3DirIsEmpty, + Message: fmt.Sprintf("%s: S3 directory is empty", path), + }) +} + +func ErrorModelPathNotDirectory(modelPath string) error { + return errors.WithStack(&errors.Error{ + Kind: ErrModelPathNotDirectory, + Message: fmt.Sprintf("%s: model path must be a directory", modelPath), + }) +} + +var _pythonModelTemplates = ` + %s 
+ ├── 1523423423/ (Version prefix) + | └── * // Model-specific files (i.e. model.h5, model.pkl, labels.json, etc) + └── 2434389194/ (Version prefix) + └── * // Model-specific files (i.e. model.h5, model.pkl, labels.json, etc) + +or like + + %s + └── * // Model-specific files (i.e. model.h5, model.pkl, labels.json, etc) +` + +func ErrorInvalidPythonModelPath(modelPath string, modelSubPaths []string) error { + message := fmt.Sprintf("%s: invalid %s model path. ", modelPath, userconfig.PythonPredictorType.CasedString()) + message += " " + fmt.Sprintf("For models provided for the %s predictor type, the path must be a directory with one of the following structures:\n", userconfig.PythonPredictorType) + + message += fmt.Sprintf(_pythonModelTemplates, modelPath, modelPath) + + if len(modelSubPaths) > 0 { + message += "\n" + "but its current structure is (limited to 50 sub-paths)" + "\n\n" + if len(modelSubPaths) > 50 { + message += s.Indent(files.FileTree(modelSubPaths[:50], "", files.DirsSorted), " ") + message += "\n ..." + } else { + message += s.Indent(files.FileTree(modelSubPaths, "", files.DirsSorted), " ") + } + } else { + message += "\n" + "but its current directory is empty" + } + + return errors.WithStack(&errors.Error{ + Kind: ErrInvalidPythonModelPath, + Message: message, + }) +} + +var _tfVersionedExpectedStructMessage = ` + %s + ├── 1523423423/ (Version prefix, usually a timestamp) + | ├── saved_model.pb + | └── variables/ + | ├── variables.index + | ├── variables.data-00000-of-00003 + | ├── variables.data-00001-of-00003 + | └── variables.data-00002-of-... + └── 2434389194/ (Version prefix, usually a timestamp) + ├── saved_model.pb + └── variables/ + ├── variables.index + ├── variables.data-00000-of-00003 + ├── variables.data-00001-of-00003 + └── variables.data-00002-of-... + +or like + + %s ├── saved_model.pb └── variables/ ├── variables.index ├── variables.data-00000-of-00003 ├── variables.data-00001-of-00003 - └── variables.data-00002-of-...` + └── variables.data-00002-of-... 
+` +var _neuronTfVersionedExpectedStructMessage = ` + %s + ├── 1523423423/ (Version prefix, usually a timestamp) + | └── saved_model.pb + └── 2434389194/ (Version prefix, usually a timestamp) + └── saved_model.pb + +or like + + %s + └── saved_model.pb +` + +func ErrorInvalidTensorFlowModelPath(modelPath string, neuronExport bool, modelSubPaths []string) error { + predictorType := userconfig.TensorFlowPredictorType.CasedString() + if neuronExport { + predictorType = "Neuron " + predictorType + } + message := fmt.Sprintf("%s: invalid %s model path.", modelPath, predictorType) + message += " " + fmt.Sprintf("For models provided for the %s predictor type, the path must be a directory with one of the following structures:\n", userconfig.TensorFlowPredictorType) -func ErrorInvalidTensorFlowDir(path string) error { - message := "invalid TensorFlow export directory.\n" - message += _tfExpectedStructMessage - return errors.WithStack(&errors.Error{ - Kind: ErrInvalidTensorFlowDir, - Message: message, - }) -} + if !neuronExport { + message += fmt.Sprintf(_tfVersionedExpectedStructMessage, modelPath, modelPath) + } else { + message += fmt.Sprintf(_neuronTfVersionedExpectedStructMessage, modelPath, modelPath) + } -var _neuronTfExpectedStructMessage = `For Neuron TensorFlow models, the path must contain a directory with the following structure: -1523423423/ (Version prefix, usually a timestamp) -└── saved_model.pb` + if len(modelSubPaths) > 0 { + message += "\n" + "but its current structure is (limited to 50 sub-paths)" + "\n\n" + if len(modelSubPaths) > 50 { + message += s.Indent(files.FileTree(modelSubPaths[:50], "", files.DirsSorted), " ") + message += "\n ..." + } else { + message += s.Indent(files.FileTree(modelSubPaths, "", files.DirsSorted), " ") + } + } else { + message += "\n" + "but its current directory is empty" + } -func ErrorInvalidNeuronTensorFlowDir(path string) error { - message := "invalid Neuron TensorFlow export directory.\n" - message += _neuronTfExpectedStructMessage return errors.WithStack(&errors.Error{ - Kind: ErrInvalidNeuronTensorFlowDir, + Kind: ErrInvalidTensorFlowModelPath, Message: message, }) } -func ErrorInvalidTensorFlowModelPath() error { +var _onnxVersionedExpectedStructMessage = ` + %s + ├── 1523423423/ (Version prefix) + | └── .onnx // ONNX-exported file + └── 2434389194/ (Version prefix) + └── .onnx // ONNX-exported file + +or like + + %s + └── .onnx // ONNX-exported file +` + +func ErrorInvalidONNXModelPath(modelPath string, modelSubPaths []string) error { + message := fmt.Sprintf("%s: invalid %s model path. ", modelPath, userconfig.ONNXPredictorType.CasedString()) + message += " " + fmt.Sprintf("For models provided for the %s predictor type, the path must be a directory with one of the following structures:\n", userconfig.ONNXPredictorType) + + message += fmt.Sprintf(_onnxVersionedExpectedStructMessage, modelPath, modelPath) + + if len(modelSubPaths) > 0 { + message += "\n" + "but its current structure is (limited to 50 sub-paths)" + "\n\n" + if len(modelSubPaths) > 50 { + message += s.Indent(files.FileTree(modelSubPaths[:50], "", files.DirsSorted), " ") + message += "\n ..."
+ } else { + message += s.Indent(files.FileTree(modelSubPaths, "", files.DirsSorted), " ") + } + } else { + message += "\n" + "but its current directory is empty" + } + return errors.WithStack(&errors.Error{ - Kind: ErrInvalidTensorFlowModelPath, - Message: "TensorFlow model path must be a directory or a zip file ending in `.zip`", + Kind: ErrInvalidONNXModelPath, + Message: message, }) } @@ -273,17 +437,17 @@ func ErrorMissingModel(predictorType userconfig.PredictorType) error { }) } -func ErrorInvalidONNXModelPath() error { +func ErrorDuplicateModelNames(duplicateModel string) error { return errors.WithStack(&errors.Error{ - Kind: ErrInvalidONNXModelPath, - Message: "onnx model path must be an onnx exported file ending in `.onnx`", + Kind: ErrDuplicateModelNames, + Message: fmt.Sprintf("cannot have multiple models with the same name (%s)", duplicateModel), }) } -func ErrorDuplicateModelNames(duplicateModel string) error { +func ErrorReservedModelName(reservedModel string) error { return errors.WithStack(&errors.Error{ - Kind: ErrDuplicateModelNames, - Message: fmt.Sprintf("cannot have multiple models with the same name (%s)", duplicateModel), + Kind: ErrReservedModelName, + Message: fmt.Sprintf("%s: is a reserved name; please specify a different model name", reservedModel), }) } diff --git a/pkg/types/spec/utils.go b/pkg/types/spec/utils.go new file mode 100644 index 0000000000..e2b74e6082 --- /dev/null +++ b/pkg/types/spec/utils.go @@ -0,0 +1,730 @@ +/* +Copyright 2020 Cortex Labs, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package spec + +import ( + "path/filepath" + "strconv" + "strings" + + "github.com/cortexlabs/cortex/pkg/lib/aws" + "github.com/cortexlabs/cortex/pkg/lib/errors" + "github.com/cortexlabs/cortex/pkg/lib/files" + "github.com/cortexlabs/cortex/pkg/lib/pointer" + "github.com/cortexlabs/cortex/pkg/lib/sets/strset" + "github.com/cortexlabs/cortex/pkg/lib/slices" + s "github.com/cortexlabs/cortex/pkg/lib/strings" + "github.com/cortexlabs/cortex/pkg/types/userconfig" +) + +func FindDuplicateNames(apis []userconfig.API) []userconfig.API { + names := make(map[string][]userconfig.API) + + for _, api := range apis { + names[api.Name] = append(names[api.Name], api) + } + + for name := range names { + if len(names[name]) > 1 { + return names[name] + } + } + + return nil +} + +func checkDuplicateModelNames(models []CuratedModelResource) error { + names := strset.New() + + for _, model := range models { + if names.Has(model.Name) { + return ErrorDuplicateModelNames(model.Name) + } + names.Add(model.Name) + } + + return nil +} + +func surgeOrUnavailableValidator(str string) (string, error) { + if strings.HasSuffix(str, "%") { + parsed, ok := s.ParseInt32(strings.TrimSuffix(str, "%")) + if !ok { + return "", ErrorInvalidSurgeOrUnavailable(str) + } + if parsed < 0 || parsed > 100 { + return "", ErrorInvalidSurgeOrUnavailable(str) + } + } else { + parsed, ok := s.ParseInt32(str) + if !ok { + return "", ErrorInvalidSurgeOrUnavailable(str) + } + if parsed < 0 { + return "", ErrorInvalidSurgeOrUnavailable(str) + } + } + + return str, nil +} + +// Verifies if modelName is found in models slice. +func isModelNameIn(models []userconfig.ModelResource, modelName string) bool { + for _, model := range models { + if model.Name == modelName { + return true + } + } + return false +} + +func modelResourceToCurated(modelResources []userconfig.ModelResource, projectDir string) ([]CuratedModelResource, error) { + models := []CuratedModelResource{} + for _, model := range modelResources { + isS3Path := strings.HasPrefix(model.ModelPath, "s3://") + if !isS3Path { + model.ModelPath = files.RelToAbsPath(model.ModelPath, projectDir) + } + + model.ModelPath = s.EnsureSuffix(model.ModelPath, "/") + + models = append(models, CuratedModelResource{ + ModelResource: &userconfig.ModelResource{ + Name: model.Name, + ModelPath: model.ModelPath, + SignatureKey: model.SignatureKey, + }, + S3Path: isS3Path, + }) + } + + return models, nil +} + +// List the model objects found in the S3/local path directory. +// +// The model name is determined from the objects' names found in the path directory. +// Path can either be an S3 path or a local system path - in the latter case, the returned paths will be in absolute form. 
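+// For example (an illustrative, hypothetical layout): given s3://my-bucket/models/ containing iris/ and resnet50/, this returns two ModelResources named "iris" and "resnet50" whose ModelPath fields point to the corresponding S3 directories.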
+func listModelResourcesFromPath(path string, projectFiles ProjectFiles, awsClient *aws.Client) ([]userconfig.ModelResource, error) { + models := []userconfig.ModelResource{} + + if aws.IsValidS3Path(path) { + awsClientForBucket, err := aws.NewFromClientS3Path(path, awsClient) + if err != nil { + return nil, err + } + + isDir, err := awsClientForBucket.IsS3PathDir(path) + if err != nil { + return nil, err + } + + if !isDir { + return nil, ErrorS3DirNotFound(path) + } + + modelPaths, _, err := awsClientForBucket.GetNLevelsDeepFromS3Path(path, 1, false, pointer.Int64(20000)) + if err != nil { + return nil, err + } + var bucket string + bucket, _, err = aws.SplitS3Path(path) + if err != nil { + return nil, err + } + + for _, modelPath := range modelPaths { + models = append(models, userconfig.ModelResource{ + Name: filepath.Base(modelPath), + ModelPath: aws.S3Path(bucket, modelPath), + }) + } + + } else { + path = files.RelToAbsPath(path, projectFiles.ProjectDir()) + + if err := files.CheckDir(path); err != nil { + return nil, err + } + + modelObjects, err := files.ListDir(path, true) + if err != nil { + return nil, err + } + + for _, modelName := range modelObjects { + models = append(models, userconfig.ModelResource{ + Name: modelName, + ModelPath: filepath.Join(path, modelName), + }) + } + } + + return models, nil +} + +// getPythonVersionsFromS3Path checks that the path contains a valid S3 directory for versioned Python models: +// - model-name +// - 1523423423/ (version prefix, usually a timestamp) +// - * +// - 2434389194/ (version prefix, usually a timestamp) +// - * +// ... +func getPythonVersionsFromS3Path(modelPath string, awsClientForBucket *aws.Client) ([]int64, error) { + isDir, err := awsClientForBucket.IsS3PathDir(modelPath) + if err != nil { + return nil, err + } + if !isDir { + return nil, ErrorModelPathNotDirectory(modelPath) + } + + modelSubPaths, allModelSubPaths, err := awsClientForBucket.GetNLevelsDeepFromS3Path(modelPath, 1, false, pointer.Int64(1000)) + if err != nil { + return nil, err + } + + if len(modelSubPaths) == 0 { + return nil, ErrorS3DirIsEmpty(modelPath) + } + + versions := []int64{} + for _, modelSubPath := range modelSubPaths { + keyParts := strings.Split(modelSubPath, "/") + versionStr := keyParts[len(keyParts)-1] + version, err := strconv.ParseInt(versionStr, 10, 64) + if err != nil { + return nil, ErrorInvalidPythonModelPath(modelPath, allModelSubPaths) + } + + modelVersionPath := aws.JoinS3Path(modelPath, versionStr) + if err := validatePythonS3ModelDir(modelVersionPath, awsClientForBucket); err != nil { + if errors.GetKind(err) == ErrDirIsEmpty { + continue + } + return nil, errors.Append(err, "\n\n"+ErrorInvalidPythonModelPath(modelPath, allModelSubPaths).Error()) + } + versions = append(versions, version) + } + + return slices.UniqueInt64(versions), nil +} + +func validatePythonS3ModelDir(modelPath string, awsClientForBucket *aws.Client) error { + isDir, err := awsClientForBucket.IsS3PathDir(modelPath) + if err != nil { + return err + } + if !isDir { + return ErrorModelPathNotDirectory(modelPath) + } + + modelSubPaths, _, err := awsClientForBucket.GetNLevelsDeepFromS3Path(modelPath, 1, false, pointer.Int64(1000)) + if err != nil { + return err + } + + if len(modelSubPaths) == 0 { + return ErrorS3DirIsEmpty(modelPath) + } + + return nil +} + +// getPythonVersionsFromLocalPath checks that the path contains a valid local directory for versioned Python models: +// - model-name +// - 1523423423/ (version prefix, usually a timestamp) +// - * +// - 
2434389194/ (version prefix, usually a timestamp) +// - * +// ... +func getPythonVersionsFromLocalPath(modelPath string) ([]int64, error) { + if !files.IsDir(modelPath) { + return nil, ErrorModelPathNotDirectory(modelPath) + } + + modelSubPaths, err := files.ListDirRecursive(modelPath, false) + if err != nil { + return nil, err + } else if len(modelSubPaths) == 0 { + return nil, ErrorDirIsEmpty(modelPath) + } + + basePathLength := len(slices.RemoveEmpties(strings.Split(modelPath, "/"))) + versions := []int64{} + for _, modelSubPath := range modelSubPaths { + pathParts := slices.RemoveEmpties(strings.Split(modelSubPath, "/")) + versionStr := pathParts[basePathLength] + version, err := strconv.ParseInt(versionStr, 10, 64) + if err != nil { + return nil, ErrorInvalidPythonModelPath(modelPath, modelSubPaths) + } + + modelVersionPath := filepath.Join(modelPath, versionStr) + if err := validatePythonLocalModelDir(modelVersionPath); err != nil { + if errors.GetKind(err) == ErrDirIsEmpty { + continue + } + return nil, errors.Append(err, "\n\n"+ErrorInvalidPythonModelPath(modelPath, modelSubPaths).Error()) + } + versions = append(versions, version) + } + + return slices.UniqueInt64(versions), nil +} + +func validatePythonLocalModelDir(modelPath string) error { + if !files.IsDir(modelPath) { + return ErrorModelPathNotDirectory(modelPath) + } + + if objects, err := files.ListDir(modelPath, false); err != nil { + return err + } else if len(objects) == 0 { + return ErrorDirIsEmpty(modelPath) + } + + return nil +} + +// getTFServingVersionsFromS3Path checks that the path contains a valid S3 directory for (Neuron) TensorFlow models: +// +// For TensorFlow models: +// - model-name +// - 1523423423/ (version prefix, usually a timestamp) +// - saved_model.pb +// - variables/ +// - variables.index +// - variables.data-00000-of-00001 (there are a variable number of these files) +// - 2434389194/ (version prefix, usually a timestamp) +// - saved_model.pb +// - variables/ +// - variables.index +// - variables.data-00000-of-00001 (there are a variable number of these files) +// ... +// +// For Neuron TensorFlow models: +// - model-name +// - 1523423423/ (version prefix, usually a timestamp) +// - saved_model.pb +// - 2434389194/ (version prefix, usually a timestamp) +// - saved_model.pb +// ... 
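+// For example (illustrative, hypothetical bucket): a versioned s3://my-bucket/models/iris/ holding 1523423423/ and 2434389194/ yields []int64{1523423423, 2434389194}; a version prefix that is not an integer, or whose contents do not match the expected structure, results in an invalid TensorFlow model path error.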
+// +func getTFServingVersionsFromS3Path(modelPath string, isNeuronExport bool, awsClientForBucket *aws.Client) ([]int64, error) { + isDir, err := awsClientForBucket.IsS3PathDir(modelPath) + if err != nil { + return nil, err + } + if !isDir { + return nil, ErrorModelPathNotDirectory(modelPath) + } + + modelSubPaths, allModelSubPaths, err := awsClientForBucket.GetNLevelsDeepFromS3Path(modelPath, 1, false, pointer.Int64(1000)) + if err != nil { + return nil, err + } + + if len(modelSubPaths) == 0 { + return nil, ErrorS3DirIsEmpty(modelPath) + } + + versions := []int64{} + for _, modelSubPath := range modelSubPaths { + keyParts := strings.Split(modelSubPath, "/") + versionStr := keyParts[len(keyParts)-1] + version, err := strconv.ParseInt(versionStr, 10, 64) + if err != nil { + return nil, ErrorInvalidTensorFlowModelPath(modelPath, isNeuronExport, allModelSubPaths) + } + + modelVersionPath := aws.JoinS3Path(modelPath, versionStr) + if err := validateTFServingS3ModelDir(modelVersionPath, isNeuronExport, awsClientForBucket); err != nil { + if errors.GetKind(err) == ErrDirIsEmpty { + continue + } + if errors.GetKind(err) != ErrInvalidTensorFlowModelPath { + return nil, errors.Append(err, "\n\n"+ErrorInvalidTensorFlowModelPath(modelPath, isNeuronExport, allModelSubPaths).Error()) + } + return nil, ErrorInvalidTensorFlowModelPath(modelPath, isNeuronExport, allModelSubPaths) + } + versions = append(versions, version) + } + + return slices.UniqueInt64(versions), nil +} + +func validateTFServingS3ModelDir(modelPath string, isNeuronExport bool, awsClientForBucket *aws.Client) error { + isDir, err := awsClientForBucket.IsS3PathDir(modelPath) + if err != nil { + return err + } + if !isDir { + return ErrorModelPathNotDirectory(modelPath) + } + + modelSubPaths, allModelSubPaths, err := awsClientForBucket.GetNLevelsDeepFromS3Path(modelPath, 1, false, pointer.Int64(1000)) + if err != nil { + return err + } + + if len(modelSubPaths) == 0 { + return ErrorS3DirIsEmpty(modelPath) + } + + if isNeuronExport { + if !isValidNeuronTensorFlowS3Directory(modelPath, awsClientForBucket) { + return ErrorInvalidTensorFlowModelPath(modelPath, isNeuronExport, allModelSubPaths) + } + } else { + if !isValidTensorFlowS3Directory(modelPath, awsClientForBucket) { + return ErrorInvalidTensorFlowModelPath(modelPath, isNeuronExport, allModelSubPaths) + } + } + + return nil +} + +// isValidTensorFlowS3Directory checks that the path contains a valid S3 directory for TensorFlow models +// Must contain the following structure: +// - 1523423423/ (version prefix, usually a timestamp) +// - saved_model.pb +// - variables/ +// - variables.index +// - variables.data-00000-of-00001 (there are a variable number of these files) +func isValidTensorFlowS3Directory(path string, awsClientForBucket *aws.Client) bool { + if valid, err := awsClientForBucket.IsS3PathFile( + aws.JoinS3Path(path, "saved_model.pb"), + aws.JoinS3Path(path, "variables/variables.index"), + ); err != nil || !valid { + return false + } + + if valid, err := awsClientForBucket.IsS3PathPrefix( + aws.JoinS3Path(path, "variables/variables.data-00000-of"), + ); err != nil || !valid { + return false + } + + return true +} + +// isValidNeuronTensorFlowS3Directory checks that the path contains a valid S3 directory for Neuron TensorFlow models +// Must contain the following structure: +// - 1523423423/ (version prefix, usually a timestamp) +// - saved_model.pb +func isValidNeuronTensorFlowS3Directory(path string, awsClient *aws.Client) bool { + if valid, err := awsClient.IsS3PathFile( 
+ aws.JoinS3Path(path, "saved_model.pb"), + ); err != nil || !valid { + return false + } + + return true +} + +// getTFServingVersionsFromLocalPath checks that the path contains a valid local directory for TensorFlow models: +// - model-name +// - 1523423423/ (version prefix, usually a timestamp) +// - saved_model.pb +// - variables/ +// - variables.index +// - variables.data-00000-of-00001 (there are a variable number of these files) +// - 2434389194/ (version prefix, usually a timestamp) +// - saved_model.pb +// - variables/ +// - variables.index +// - variables.data-00000-of-00001 (there are a variable number of these files) +// ... +func getTFServingVersionsFromLocalPath(modelPath string) ([]int64, error) { + if !files.IsDir(modelPath) { + return nil, ErrorModelPathNotDirectory(modelPath) + } + + modelSubPaths, err := files.ListDirRecursive(modelPath, false) + if err != nil { + return nil, err + } else if len(modelSubPaths) == 0 { + return nil, ErrorDirIsEmpty(modelPath) + } + + basePathLength := len(slices.RemoveEmpties(strings.Split(modelPath, "/"))) + versions := []int64{} + + for _, modelSubPath := range modelSubPaths { + pathParts := slices.RemoveEmpties(strings.Split(modelSubPath, "/")) + versionStr := pathParts[basePathLength] + version, err := strconv.ParseInt(versionStr, 10, 64) + if err != nil { + return nil, ErrorInvalidTensorFlowModelPath(modelPath, false, modelSubPaths) + } + + modelVersionPath := filepath.Join(modelPath, versionStr) + if err := validateTFServingLocalModelDir(modelVersionPath); err != nil { + if errors.GetKind(err) == ErrDirIsEmpty { + continue + } + return nil, errors.Append(err, "\n\n"+ErrorInvalidTensorFlowModelPath(modelPath, false, modelSubPaths).Error()) + } + + versions = append(versions, version) + } + + return slices.UniqueInt64(versions), nil +} + +func validateTFServingLocalModelDir(modelPath string) error { + if !files.IsDir(modelPath) { + return ErrorModelPathNotDirectory(modelPath) + } + + var versionObjects []string + var err error + if versionObjects, err = files.ListDir(modelPath, false); err != nil { + return err + } else if len(versionObjects) == 0 { + return ErrorDirIsEmpty(modelPath) + } + + if yes, err := isValidTensorFlowLocalDirectory(modelPath); !yes || err != nil { + return ErrorInvalidTensorFlowModelPath(modelPath, false, versionObjects) + } + + return nil +} + +// isValidTensorFlowLocalDirectory checks that the path contains a valid local directory for TensorFlow models +// Must contain the following structure: +// - 1523423423/ (version prefix, usually a timestamp) +// - saved_model.pb +// - variables/ +// - variables.index +// - variables.data-00000-of-00001 (there are a variable number of these files) +func isValidTensorFlowLocalDirectory(path string) (bool, error) { + paths, err := files.ListDirRecursive(path, true) + if err != nil { + return false, err + } + pathSet := strset.New(paths...) + + if !(pathSet.Has("saved_model.pb") && pathSet.Has("variables/variables.index")) { + return false, nil + } + + for _, path := range paths { + if strings.HasPrefix(path, "variables/variables.data-00000-of") { + return true, nil + } + } + + return false, nil +} + +// getONNXVersionsFromS3Path checks that the path contains a valid S3 directory for versioned ONNX models: +// - model-name +// - 1523423423/ (version prefix, usually a timestamp) +// - .onnx +// - 2434389194/ (version prefix, usually a timestamp) +// - .onnx +// ... 
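+// For example (illustrative, hypothetical bucket): s3://my-bucket/models/gbtree/ with 1523423423/model.onnx and 2434389194/model.onnx yields []int64{1523423423, 2434389194}; each version directory must contain exactly one object, and that object must end in .onnx.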
+func getONNXVersionsFromS3Path(modelPath string, awsClientForBucket *aws.Client) ([]int64, error) { + isDir, err := awsClientForBucket.IsS3PathDir(modelPath) + if err != nil { + return nil, err + } + if !isDir { + return nil, ErrorModelPathNotDirectory(modelPath) + } + + modelSubPaths, allModelSubPaths, err := awsClientForBucket.GetNLevelsDeepFromS3Path(modelPath, 1, false, pointer.Int64(1000)) + if err != nil { + return nil, err + } + + if len(modelSubPaths) == 0 { + return nil, ErrorS3DirIsEmpty(modelPath) + } + + versions := []int64{} + for _, modelSubPath := range modelSubPaths { + keyParts := strings.Split(modelSubPath, "/") + versionStr := keyParts[len(keyParts)-1] + version, err := strconv.ParseInt(versionStr, 10, 64) + if err != nil { + return nil, ErrorInvalidONNXModelPath(modelPath, allModelSubPaths) + } + + modelVersionPath := aws.JoinS3Path(modelPath, versionStr) + if err := validateONNXS3ModelDir(modelVersionPath, awsClientForBucket); err != nil { + if errors.GetKind(err) == ErrDirIsEmpty { + continue + } + if errors.GetKind(err) != ErrInvalidONNXModelPath { + return nil, errors.Append(err, "\n\n"+ErrorInvalidONNXModelPath(modelPath, allModelSubPaths).Error()) + } + return nil, ErrorInvalidONNXModelPath(modelPath, allModelSubPaths) + } + + versions = append(versions, version) + } + + return slices.UniqueInt64(versions), nil +} + +func validateONNXS3ModelDir(modelPath string, awsClientForBucket *aws.Client) error { + isDir, err := awsClientForBucket.IsS3PathDir(modelPath) + if err != nil { + return err + } + if !isDir { + return ErrorModelPathNotDirectory(modelPath) + } + + bucket, _, err := aws.SplitS3Path(modelPath) + if err != nil { + return err + } + + modelSubPaths, allModelSubPaths, err := awsClientForBucket.GetNLevelsDeepFromS3Path(modelPath, 1, false, pointer.Int64(1000)) + if err != nil { + return err + } + + if len(modelSubPaths) == 0 { + return ErrorS3DirIsEmpty(modelPath) + } + + numONNXFiles := 0 + for _, modelSubPath := range modelSubPaths { + if !strings.HasSuffix(modelSubPath, ".onnx") { + return ErrorInvalidONNXModelPath(modelPath, allModelSubPaths) + } + if isFile, err := awsClientForBucket.IsS3PathFile(aws.S3Path(bucket, modelSubPath)); err != nil { + return errors.Wrap(err, modelPath) + } else if !isFile { + return ErrorInvalidONNXModelPath(modelPath, allModelSubPaths) + } + numONNXFiles++ + } + + if numONNXFiles > 1 { + return ErrorInvalidONNXModelPath(modelPath, allModelSubPaths) + } + + return nil +} + +// getONNXVersionsFromLocalPath checks that the path contains a valid local directory for versioned ONNX models: +// - model-name +// - 1523423423/ (version prefix, usually a timestamp) +// - .onnx +// - 2434389194/ (version prefix, usually a timestamp) +// - .onnx +// ... 
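+// As with the S3 variant above, each version directory must hold exactly one .onnx file; empty version directories are skipped rather than treated as errors.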
+func getONNXVersionsFromLocalPath(modelPath string) ([]int64, error) { + if !files.IsDir(modelPath) { + return nil, ErrorModelPathNotDirectory(modelPath) + } + + modelSubPaths, err := files.ListDirRecursive(modelPath, false) + if err != nil { + return nil, err + } else if len(modelSubPaths) == 0 { + return nil, ErrorDirIsEmpty(modelPath) + } + + basePathLength := len(slices.RemoveEmpties(strings.Split(modelPath, "/"))) + versions := []int64{} + + for _, modelSubPath := range modelSubPaths { + pathParts := slices.RemoveEmpties(strings.Split(modelSubPath, "/")) + versionStr := pathParts[basePathLength] + version, err := strconv.ParseInt(versionStr, 10, 64) + if err != nil { + return nil, ErrorInvalidONNXModelPath(modelPath, modelSubPaths) + } + + modelVersionPath := filepath.Join(modelPath, versionStr) + if err := validateONNXLocalModelDir(modelVersionPath); err != nil { + if errors.GetKind(err) == ErrDirIsEmpty { + continue + } + return nil, errors.Append(err, "\n\n"+ErrorInvalidONNXModelPath(modelPath, modelSubPaths).Error()) + } + + versions = append(versions, version) + } + + return slices.UniqueInt64(versions), nil +} + +func validateONNXLocalModelDir(modelPath string) error { + if !files.IsDir(modelPath) { + return ErrorModelPathNotDirectory(modelPath) + } + + var versionObjects []string + var err error + if versionObjects, err = files.ListDir(modelPath, false); err != nil { + return err + } else if len(versionObjects) == 0 { + return ErrorDirIsEmpty(modelPath) + } + + numONNXFiles := 0 + for _, versionObject := range versionObjects { + if !strings.HasSuffix(versionObject, ".onnx") || !files.IsFile(versionObject) { + return ErrorInvalidONNXModelPath(modelPath, versionObjects) + } + numONNXFiles++ + } + + if numONNXFiles > 1 { + return ErrorInvalidONNXModelPath(modelPath, versionObjects) + } + + return nil +} + +func verifyTotalWeight(apis []*userconfig.TrafficSplit) error { + totalWeight := int32(0) + for _, api := range apis { + totalWeight += api.Weight + } + if totalWeight == 100 { + return nil + } + return errors.Wrap(ErrorIncorrectTrafficSplitterWeightTotal(totalWeight), userconfig.APIsKey) +} + +// areTrafficSplitterAPIsUnique gives error if the same API is used multiple times in TrafficSplitter +func areTrafficSplitterAPIsUnique(apis []*userconfig.TrafficSplit) error { + names := make(map[string][]userconfig.TrafficSplit) + for _, api := range apis { + names[api.Name] = append(names[api.Name], *api) + } + var notUniqueAPIs []string + for name := range names { + if len(names[name]) > 1 { + notUniqueAPIs = append(notUniqueAPIs, names[name][0].Name) + } + } + if len(notUniqueAPIs) > 0 { + return errors.Wrap(ErrorTrafficSplitterAPIsNotUnique(notUniqueAPIs), userconfig.APIsKey) + } + return nil +} diff --git a/pkg/types/spec/validations.go b/pkg/types/spec/validations.go index 44a117d11a..a416210b4b 100644 --- a/pkg/types/spec/validations.go +++ b/pkg/types/spec/validations.go @@ -20,8 +20,6 @@ import ( "context" "fmt" "math" - "path/filepath" - "strconv" "strings" "time" @@ -38,7 +36,6 @@ import ( libmath "github.com/cortexlabs/cortex/pkg/lib/math" "github.com/cortexlabs/cortex/pkg/lib/pointer" "github.com/cortexlabs/cortex/pkg/lib/regex" - "github.com/cortexlabs/cortex/pkg/lib/sets/strset" s "github.com/cortexlabs/cortex/pkg/lib/strings" libtime "github.com/cortexlabs/cortex/pkg/lib/time" "github.com/cortexlabs/cortex/pkg/lib/urls" @@ -161,8 +158,10 @@ func predictorValidation() *cr.StructFieldValidation { }, }, { - StructField: "ModelPath", - StringPtrValidation: 
&cr.StringPtrValidation{}, + StructField: "ModelPath", + StringPtrValidation: &cr.StringPtrValidation{ + Required: false, + }, }, { StructField: "PythonPath", @@ -502,6 +501,45 @@ func updateStrategyValidation(provider types.ProviderType) *cr.StructFieldValida func multiModelValidation() *cr.StructFieldValidation { return &cr.StructFieldValidation{ StructField: "Models", + StructValidation: &cr.StructValidation{ + Required: false, + DefaultNil: true, + StructFieldValidations: []*cr.StructFieldValidation{ + multiModelPathsValidation(), + { + StructField: "Dir", + StringPtrValidation: &cr.StringPtrValidation{ + Required: false, + }, + }, + { + StructField: "SignatureKey", + StringPtrValidation: &cr.StringPtrValidation{ + Required: false, + }, + }, + { + StructField: "CacheSize", + Int32PtrValidation: &cr.Int32PtrValidation{ + Required: false, + GreaterThan: pointer.Int32(0), + }, + }, + { + StructField: "DiskCacheSize", + Int32PtrValidation: &cr.Int32PtrValidation{ + Required: false, + GreaterThan: pointer.Int32(0), + }, + }, + }, + }, + } +} + +func multiModelPathsValidation() *cr.StructFieldValidation { + return &cr.StructFieldValidation{ + StructField: "Paths", StructListValidation: &cr.StructListValidation{ Required: false, TreatNullAsEmpty: true, @@ -566,28 +604,6 @@ func serverSideBatchingValidation() *cr.StructFieldValidation { } } -func surgeOrUnavailableValidator(str string) (string, error) { - if strings.HasSuffix(str, "%") { - parsed, ok := s.ParseInt32(strings.TrimSuffix(str, "%")) - if !ok { - return "", ErrorInvalidSurgeOrUnavailable(str) - } - if parsed < 0 || parsed > 100 { - return "", ErrorInvalidSurgeOrUnavailable(str) - } - } else { - parsed, ok := s.ParseInt32(str) - if !ok { - return "", ErrorInvalidSurgeOrUnavailable(str) - } - if parsed < 0 { - return "", ErrorInvalidSurgeOrUnavailable(str) - } - } - - return str, nil -} - var resourceStructValidation = cr.StructValidation{ AllowExtraFields: true, StructFieldValidations: resourceStructValidations, @@ -672,6 +688,7 @@ func ExtractAPIConfigs( func ValidateAPI( api *userconfig.API, + models *[]CuratedModelResource, projectFiles ProjectFiles, providerType types.ProviderType, awsClient *aws.Client, @@ -682,7 +699,7 @@ func ValidateAPI( api.Networking.Endpoint = pointer.String("/" + api.Name) } - if err := validatePredictor(api, projectFiles, providerType, awsClient, k8sClient); err != nil { + if err := validatePredictor(api, models, projectFiles, providerType, awsClient, k8sClient); err != nil { return errors.Wrap(err, userconfig.PredictorKey) } @@ -725,28 +742,37 @@ func ValidateTrafficSplitter( func validatePredictor( api *userconfig.API, + models *[]CuratedModelResource, projectFiles ProjectFiles, providerType types.ProviderType, awsClient *aws.Client, k8sClient *k8s.Client, // will be nil for local provider ) error { - predictor := api.Predictor + if predictor.Models != nil && predictor.ModelPath != nil { + return ErrorConflictingFields(userconfig.ModelPathKey, userconfig.ModelsKey) + } + if predictor.Models != nil { + if err := validateMultiModelsFields(api); err != nil { + return err + } + } + switch predictor.Type { case userconfig.PythonPredictorType: - if err := validatePythonPredictor(predictor); err != nil { + if err := validatePythonPredictor(predictor, models, providerType, projectFiles, awsClient); err != nil { return err } case userconfig.TensorFlowPredictorType: - if err := validateTensorFlowPredictor(api, providerType, projectFiles, awsClient); err != nil { + if err := validateTensorFlowPredictor(api, 
models, providerType, projectFiles, awsClient); err != nil { return err } if err := validateDockerImagePath(predictor.TensorFlowServingImage, providerType, awsClient, k8sClient); err != nil { return errors.Wrap(err, userconfig.TensorFlowServingImageKey) } case userconfig.ONNXPredictorType: - if err := validateONNXPredictor(predictor, providerType, projectFiles, awsClient); err != nil { + if err := validateONNXPredictor(predictor, models, providerType, projectFiles, awsClient); err != nil { return err } } @@ -784,365 +810,533 @@ func validatePredictor( return nil } -func validatePythonPredictor(predictor *userconfig.Predictor) error { - if predictor.SignatureKey != nil { - return ErrorFieldNotSupportedByPredictorType(userconfig.SignatureKeyKey, predictor.Type) +func validateMultiModelsFields(api *userconfig.API) error { + predictor := api.Predictor + + if len(predictor.Models.Paths) == 0 && predictor.Models.Dir == nil { + return errors.Wrap(ErrorSpecifyOneOrTheOther(userconfig.ModelsPathsKey, userconfig.ModelsDirKey), userconfig.ModelsKey) + } + if len(predictor.Models.Paths) > 0 && predictor.Models.Dir != nil { + return errors.Wrap(ErrorConflictingFields(userconfig.ModelsPathsKey, userconfig.ModelsDirKey), userconfig.ModelsKey) } - if predictor.ServerSideBatching != nil { - ErrorFieldNotSupportedByPredictorType(userconfig.ServerSideBatchingKey, predictor.Type) + if predictor.Models.CacheSize != nil && api.Kind != userconfig.RealtimeAPIKind { + return errors.Wrap(ErrorKeyIsNotSupportedForKind(userconfig.ModelsCacheSizeKey, api.Kind), userconfig.ModelsKey) + } + if predictor.Models.DiskCacheSize != nil && api.Kind != userconfig.RealtimeAPIKind { + return errors.Wrap(ErrorKeyIsNotSupportedForKind(userconfig.ModelsDiskCacheSizeKey, api.Kind), userconfig.ModelsKey) } - if predictor.ModelPath != nil { - return ErrorFieldNotSupportedByPredictorType(userconfig.ModelPathKey, userconfig.PythonPredictorType) + if (predictor.Models.CacheSize == nil && predictor.Models.DiskCacheSize != nil) || + (predictor.Models.CacheSize != nil && predictor.Models.DiskCacheSize == nil) { + return errors.Wrap(ErrorSpecifyAllOrNone(userconfig.ModelsCacheSizeKey, userconfig.ModelsDiskCacheSizeKey), userconfig.ModelsKey) } - if len(predictor.Models) > 0 { - return ErrorFieldNotSupportedByPredictorType(userconfig.ModelsKey, predictor.Type) + if predictor.Models.CacheSize != nil && predictor.Models.DiskCacheSize != nil { + if *predictor.Models.CacheSize > *predictor.Models.DiskCacheSize { + return errors.Wrap(ErrorConfigGreaterThanOtherConfig(userconfig.ModelsCacheSizeKey, *predictor.Models.CacheSize, userconfig.ModelsDiskCacheSizeKey, *predictor.Models.DiskCacheSize), userconfig.ModelsKey) + } + + if predictor.ProcessesPerReplica > 1 { + return ErrorModelCachingNotSupportedWhenMultiprocessingEnabled(predictor.ProcessesPerReplica) + } } + return nil +} + +func validatePythonPredictor(predictor *userconfig.Predictor, models *[]CuratedModelResource, providerType types.ProviderType, projectFiles ProjectFiles, awsClient *aws.Client) error { + if predictor.SignatureKey != nil { + return ErrorFieldNotSupportedByPredictorType(userconfig.SignatureKeyKey, predictor.Type) + } + if predictor.ServerSideBatching != nil { + return ErrorFieldNotSupportedByPredictorType(userconfig.ServerSideBatchingKey, predictor.Type) + } if predictor.TensorFlowServingImage != "" { return ErrorFieldNotSupportedByPredictorType(userconfig.TensorFlowServingImageKey, predictor.Type) } - return nil -} + hasSingleModel := predictor.ModelPath != nil + 
hasMultiModels := predictor.Models != nil -func validateTensorFlowPredictor(api *userconfig.API, providerType types.ProviderType, projectFiles ProjectFiles, awsClient *aws.Client) error { - predictor := api.Predictor + var modelWrapError func(error) error + var modelResources []userconfig.ModelResource - if predictor.ServerSideBatching != nil { - if api.Compute.Inf == 0 && predictor.ServerSideBatching.MaxBatchSize > predictor.ProcessesPerReplica*predictor.ThreadsPerProcess { - return ErrorInsufficientBatchConcurrencyLevel(predictor.ServerSideBatching.MaxBatchSize, predictor.ProcessesPerReplica, predictor.ThreadsPerProcess) + if hasSingleModel { + modelResources = []userconfig.ModelResource{ + { + Name: consts.SingleModelName, + ModelPath: *predictor.ModelPath, + }, } - if api.Compute.Inf > 0 && predictor.ServerSideBatching.MaxBatchSize > predictor.ThreadsPerProcess { - return ErrorInsufficientBatchConcurrencyLevelInf(predictor.ServerSideBatching.MaxBatchSize, predictor.ThreadsPerProcess) + *predictor.ModelPath = s.EnsureSuffix(*predictor.ModelPath, "/") + modelWrapError = func(err error) error { + return errors.Wrap(err, userconfig.ModelPathKey) } } + if hasMultiModels { + if predictor.Models.SignatureKey != nil { + return errors.Wrap(ErrorFieldNotSupportedByPredictorType(userconfig.SignatureKeyKey, predictor.Type), userconfig.ModelsKey) + } - if predictor.ModelPath == nil && len(predictor.Models) == 0 { - return ErrorMissingModel(predictor.Type) - } else if predictor.ModelPath != nil && len(predictor.Models) > 0 { - return ErrorConflictingFields(userconfig.ModelPathKey, userconfig.ModelsKey) - } else if predictor.ModelPath != nil { - modelResource := &userconfig.ModelResource{ - Name: consts.SingleModelName, - ModelPath: *predictor.ModelPath, - SignatureKey: predictor.SignatureKey, + if len(predictor.Models.Paths) > 0 { + modelWrapError = func(err error) error { + return errors.Wrap(err, userconfig.ModelsKey, userconfig.ModelsPathsKey) + } + + for _, path := range predictor.Models.Paths { + if path.SignatureKey != nil { + return errors.Wrap( + ErrorFieldNotSupportedByPredictorType(userconfig.SignatureKeyKey, predictor.Type), + userconfig.ModelsKey, + userconfig.ModelsPathsKey, + path.Name, + ) + } + (*path).ModelPath = s.EnsureSuffix((*path).ModelPath, "/") + modelResources = append(modelResources, *path) + } } - // place the model into predictor.Models for ease of use - predictor.Models = []*userconfig.ModelResource{modelResource} - } - if err := checkDuplicateModelNames(predictor.Models); err != nil { - return errors.Wrap(err, userconfig.ModelsKey) + if predictor.Models.Dir != nil { + modelWrapError = func(err error) error { + return errors.Wrap(err, userconfig.ModelsKey, userconfig.ModelsDirKey) + } + + *(predictor.Models.Dir) = s.EnsureSuffix(*(predictor.Models.Dir), "/") + + var err error + modelResources, err = listModelResourcesFromPath(*predictor.Models.Dir, projectFiles, awsClient) + if err != nil { + return modelWrapError(err) + } + } + } + var err error + *models, err = modelResourceToCurated(modelResources, projectFiles.ProjectDir()) + if err != nil { + return modelWrapError(err) } - for i := range predictor.Models { - if err := validateTensorFlowModel(predictor.Models[i], api, providerType, projectFiles, awsClient); err != nil { - if predictor.ModelPath == nil { - return errors.Wrap(err, userconfig.ModelsKey, predictor.Models[i].Name) + if hasMultiModels { + for _, model := range *models { + if model.Name == consts.SingleModelName { + return 
modelWrapError(ErrorReservedModelName(model.Name)) } - return err + } + } + + if err := checkDuplicateModelNames(*models); err != nil { + return modelWrapError(err) + } + + for i := range *models { + if err := validatePythonModel(&(*models)[i], providerType, projectFiles, awsClient); err != nil { + return modelWrapError(err) } } return nil } -func validateTensorFlowModel(modelResource *userconfig.ModelResource, api *userconfig.API, providerType types.ProviderType, projectFiles ProjectFiles, awsClient *aws.Client) error { - modelPath := modelResource.ModelPath +func validatePythonModel(modelResource *CuratedModelResource, providerType types.ProviderType, projectFiles ProjectFiles, awsClient *aws.Client) error { + modelName := modelResource.Name + if modelName == consts.SingleModelName { + modelName = "" + } - if strings.HasPrefix(modelPath, "s3://") { - awsClientForBucket, err := aws.NewFromClientS3Path(modelPath, awsClient) + if modelResource.S3Path { + awsClientForBucket, err := aws.NewFromClientS3Path(modelResource.ModelPath, awsClient) if err != nil { - return errors.Wrap(err, userconfig.ModelPathKey) + return errors.Wrap(err, modelName) } - modelPath, err := cr.S3PathValidator(modelPath) + _, err = cr.S3PathValidator(modelResource.ModelPath) if err != nil { - return errors.Wrap(err, userconfig.ModelPathKey) + return errors.Wrap(err, modelName) } - if strings.HasSuffix(modelPath, ".zip") { - if ok, err := awsClientForBucket.IsS3PathFile(modelPath); err != nil || !ok { - return errors.Wrap(ErrorS3FileNotFound(modelPath), userconfig.ModelPathKey) + versions, err := getPythonVersionsFromS3Path(modelResource.ModelPath, awsClientForBucket) + if err != nil { + if errors.GetKind(err) == ErrModelPathNotDirectory { + return errors.Wrap(err, modelName) } - } else { - isNeuronExport := api.Compute.Inf > 0 - exportPath, err := getTFServingExportFromS3Path(modelPath, isNeuronExport, awsClientForBucket) + + modelSubS3Objects, err := awsClientForBucket.ListS3PathDir(modelResource.ModelPath, false, pointer.Int64(1000)) if err != nil { - return errors.Wrap(err, userconfig.ModelPathKey) + return errors.Wrap(err, modelName) } - if exportPath == "" { - if isNeuronExport { - return errors.Wrap(ErrorInvalidNeuronTensorFlowDir(modelPath), userconfig.ModelPathKey) - } - return errors.Wrap(ErrorInvalidTensorFlowDir(modelPath), userconfig.ModelPathKey) + modelSubPaths := aws.ConvertS3ObjectsToKeys(modelSubS3Objects...) 
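+ // version detection failed, so re-run the directory validation and attach the full S3 listing to the error to show why the layout was rejected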
+ + if err = validatePythonS3ModelDir(modelResource.ModelPath, awsClientForBucket); err != nil { + return errors.Wrap(errors.Append(err, "\n\n"+ErrorInvalidPythonModelPath(modelResource.ModelPath, modelSubPaths).Error()), modelName) } - modelResource.ModelPath = exportPath } + modelResource.Versions = versions } else { if providerType == types.AWSProviderType { - return errors.Wrap(ErrorLocalModelPathNotSupportedByAWSProvider(), modelPath, userconfig.ModelPathKey) + return ErrorLocalModelPathNotSupportedByAWSProvider() } - var err error - if strings.HasPrefix(modelResource.ModelPath, "~/") { - modelPath, err = files.EscapeTilde(modelPath) - if err != nil { - return err - } - } else { - modelPath = files.RelToAbsPath(modelResource.ModelPath, projectFiles.ProjectDir()) - } - if strings.HasSuffix(modelPath, ".zip") { - if err := files.CheckFile(modelPath); err != nil { - return errors.Wrap(err, userconfig.ModelPathKey) + versions, err := getPythonVersionsFromLocalPath(modelResource.ModelPath) + if err != nil { + if errors.GetKind(err) == ErrModelPathNotDirectory { + return errors.Wrap(err, modelName) } - modelResource.ModelPath = modelPath - } else if files.IsDir(modelPath) { - path, err := GetTFServingExportFromLocalPath(modelPath) + + modelSubPaths, err := files.ListDirRecursive(modelResource.ModelPath, false) if err != nil { - return errors.Wrap(err, userconfig.ModelPathKey) - } else if path == "" { - return errors.Wrap(ErrorInvalidTensorFlowDir(modelPath), userconfig.ModelPathKey) + return errors.Wrap(err, modelName) + } + + if err = validatePythonLocalModelDir(modelResource.ModelPath); err != nil { + return errors.Wrap(errors.Append(err, "\n\n"+ErrorInvalidPythonModelPath(modelResource.ModelPath, modelSubPaths).Error()), modelName) } - modelResource.ModelPath = path - } else { - return errors.Wrap(ErrorInvalidTensorFlowModelPath(), userconfig.ModelPathKey, modelPath) } + modelResource.Versions = versions } return nil } -func validateONNXPredictor(predictor *userconfig.Predictor, providerType types.ProviderType, projectFiles ProjectFiles, awsClient *aws.Client) error { - if predictor.SignatureKey != nil { - return ErrorFieldNotSupportedByPredictorType(userconfig.SignatureKeyKey, predictor.Type) - } +func validateTensorFlowPredictor(api *userconfig.API, models *[]CuratedModelResource, providerType types.ProviderType, projectFiles ProjectFiles, awsClient *aws.Client) error { + predictor := api.Predictor if predictor.ServerSideBatching != nil { - return ErrorFieldNotSupportedByPredictorType(userconfig.ServerSideBatchingKey, predictor.Type) + if api.Compute.Inf == 0 && predictor.ServerSideBatching.MaxBatchSize > predictor.ProcessesPerReplica*predictor.ThreadsPerProcess { + return ErrorInsufficientBatchConcurrencyLevel(predictor.ServerSideBatching.MaxBatchSize, predictor.ProcessesPerReplica, predictor.ThreadsPerProcess) + } + if api.Compute.Inf > 0 && predictor.ServerSideBatching.MaxBatchSize > predictor.ThreadsPerProcess { + return ErrorInsufficientBatchConcurrencyLevelInf(predictor.ServerSideBatching.MaxBatchSize, predictor.ThreadsPerProcess) + } } - if predictor.ModelPath == nil && len(predictor.Models) == 0 { + hasSingleModel := predictor.ModelPath != nil + hasMultiModels := predictor.Models != nil + + if !hasSingleModel && !hasMultiModels { return ErrorMissingModel(predictor.Type) - } else if predictor.ModelPath != nil && len(predictor.Models) > 0 { - return ErrorConflictingFields(userconfig.ModelPathKey, userconfig.ModelsKey) - } else if predictor.ModelPath != nil { - modelResource := 
&userconfig.ModelResource{ - Name: consts.SingleModelName, - ModelPath: *predictor.ModelPath, - } - // place the model into predictor.Models for ease of use - predictor.Models = []*userconfig.ModelResource{modelResource} } - if err := checkDuplicateModelNames(predictor.Models); err != nil { - return errors.Wrap(err, userconfig.ModelsKey) + var modelWrapError func(error) error + var modelResources []userconfig.ModelResource + + if hasSingleModel { + modelResources = []userconfig.ModelResource{ + { + Name: consts.SingleModelName, + ModelPath: *predictor.ModelPath, + SignatureKey: predictor.SignatureKey, + }, + } + *predictor.ModelPath = s.EnsureSuffix(*predictor.ModelPath, "/") + modelWrapError = func(err error) error { + return errors.Wrap(err, userconfig.ModelPathKey) + } } + if hasMultiModels { + if len(predictor.Models.Paths) > 0 { + modelWrapError = func(err error) error { + return errors.Wrap(err, userconfig.ModelsKey, userconfig.ModelsPathsKey) + } - for i := range predictor.Models { - if predictor.Models[i].SignatureKey != nil { - return errors.Wrap(ErrorFieldNotSupportedByPredictorType(userconfig.SignatureKeyKey, predictor.Type), userconfig.ModelsKey, predictor.Models[i].Name) + for _, path := range predictor.Models.Paths { + if path.SignatureKey == nil && predictor.Models.SignatureKey != nil { + path.SignatureKey = predictor.Models.SignatureKey + } + (*path).ModelPath = s.EnsureSuffix((*path).ModelPath, "/") + modelResources = append(modelResources, *path) + } } - if err := validateONNXModel(predictor.Models[i], providerType, projectFiles, awsClient); err != nil { - if predictor.ModelPath == nil { - return errors.Wrap(err, userconfig.ModelsKey, predictor.Models[i].Name) + + if predictor.Models.Dir != nil { + modelWrapError = func(err error) error { + return errors.Wrap(err, userconfig.ModelsKey, userconfig.ModelsDirKey) + } + + *(predictor.Models.Dir) = s.EnsureSuffix(*(predictor.Models.Dir), "/") + + var err error + modelResources, err = listModelResourcesFromPath(*predictor.Models.Dir, projectFiles, awsClient) + if err != nil { + return modelWrapError(err) + } + if predictor.Models.SignatureKey != nil { + for i := range modelResources { + modelResources[i].SignatureKey = predictor.Models.SignatureKey + } } - return err + } + } + var err error + *models, err = modelResourceToCurated(modelResources, projectFiles.ProjectDir()) + if err != nil { + return err + } + + if hasMultiModels { + for _, model := range *models { + if model.Name == consts.SingleModelName { + return modelWrapError(ErrorReservedModelName(model.Name)) + } + } + } + + if err := checkDuplicateModelNames(*models); err != nil { + return modelWrapError(err) + } + + for i := range *models { + if err := validateTensorFlowModel(&(*models)[i], api, providerType, projectFiles, awsClient); err != nil { + return modelWrapError(err) } } return nil } -func validateONNXModel(modelResource *userconfig.ModelResource, providerType types.ProviderType, projectFiles ProjectFiles, awsClient *aws.Client) error { - modelPath := modelResource.ModelPath - var err error - if !strings.HasSuffix(modelPath, ".onnx") { - return errors.Wrap(ErrorInvalidONNXModelPath(), userconfig.ModelPathKey, modelPath) +func validateTensorFlowModel( + modelResource *CuratedModelResource, + api *userconfig.API, + providerType types.ProviderType, + projectFiles ProjectFiles, + awsClient *aws.Client, +) error { + + modelName := modelResource.Name + if modelName == consts.SingleModelName { + modelName = "" } - if strings.HasPrefix(modelPath, "s3://") { - 
awsClientForBucket, err := aws.NewFromClientS3Path(modelPath, awsClient) + if modelResource.S3Path { + awsClientForBucket, err := aws.NewFromClientS3Path(modelResource.ModelPath, awsClient) if err != nil { - return errors.Wrap(err, userconfig.ModelPathKey) + return errors.Wrap(err, modelName) } - modelPath, err := cr.S3PathValidator(modelPath) + _, err = cr.S3PathValidator(modelResource.ModelPath) if err != nil { - return errors.Wrap(err, userconfig.ModelPathKey) + return errors.Wrap(err, modelName) } - if ok, err := awsClientForBucket.IsS3PathFile(modelPath); err != nil || !ok { - return errors.Wrap(ErrorS3FileNotFound(modelPath), userconfig.ModelPathKey) + isNeuronExport := api.Compute.Inf > 0 + versions, err := getTFServingVersionsFromS3Path(modelResource.ModelPath, isNeuronExport, awsClientForBucket) + if err != nil { + if errors.GetKind(err) == ErrModelPathNotDirectory { + return errors.Wrap(err, modelName) + } + + modelSubS3Objects, err := awsClientForBucket.ListS3PathDir(modelResource.ModelPath, false, pointer.Int64(1000)) + if err != nil { + return errors.Wrap(err, modelName) + } + modelSubPaths := aws.ConvertS3ObjectsToKeys(modelSubS3Objects...) + + if err = validateTFServingS3ModelDir(modelResource.ModelPath, isNeuronExport, awsClientForBucket); err != nil { + if errors.GetKind(err) != ErrInvalidTensorFlowModelPath { + return errors.Wrap(errors.Append(err, "\n\n"+ErrorInvalidTensorFlowModelPath(modelResource.ModelPath, isNeuronExport, modelSubPaths).Error()), modelName) + } + return errors.Wrap(ErrorInvalidTensorFlowModelPath(modelResource.ModelPath, isNeuronExport, modelSubPaths), modelName) + } } + modelResource.Versions = versions } else { if providerType == types.AWSProviderType { - return errors.Wrap(ErrorLocalModelPathNotSupportedByAWSProvider(), modelPath, userconfig.ModelPathKey) + return ErrorLocalModelPathNotSupportedByAWSProvider() } - if strings.HasPrefix(modelResource.ModelPath, "~/") { - modelPath, err = files.EscapeTilde(modelPath) + versions, err := getTFServingVersionsFromLocalPath(modelResource.ModelPath) + if err != nil { + if errors.GetKind(err) == ErrModelPathNotDirectory { + return errors.Wrap(err, modelName) + } + + modelSubPaths, err := files.ListDirRecursive(modelResource.ModelPath, false) if err != nil { - return err + return errors.Wrap(err, modelName) + } + + if err = validateTFServingLocalModelDir(modelResource.ModelPath); err != nil { + if errors.GetKind(err) != ErrInvalidTensorFlowModelPath { + return errors.Wrap(errors.Append(err, "\n\n"+ErrorInvalidTensorFlowModelPath(modelResource.ModelPath, false, modelSubPaths).Error()), modelName) + } + return errors.Wrap(ErrorInvalidTensorFlowModelPath(modelResource.ModelPath, false, modelSubPaths), modelName) } - } else { - modelPath = files.RelToAbsPath(modelResource.ModelPath, projectFiles.ProjectDir()) - } - if err := files.CheckFile(modelPath); err != nil { - return errors.Wrap(err, userconfig.ModelPathKey) } - modelResource.ModelPath = modelPath + modelResource.Versions = versions } + return nil } -func getTFServingExportFromS3Path(path string, isNeuronExport bool, awsClientForBucket *aws.Client) (string, error) { - if isValidTensorFlowS3Directory(path, awsClientForBucket) { - return path, nil +func validateONNXPredictor(predictor *userconfig.Predictor, models *[]CuratedModelResource, providerType types.ProviderType, projectFiles ProjectFiles, awsClient *aws.Client) error { + if predictor.SignatureKey != nil { + return ErrorFieldNotSupportedByPredictorType(userconfig.SignatureKeyKey, predictor.Type) } - 
- bucket, _, err := aws.SplitS3Path(path) - if err != nil { - return "", err + if predictor.ServerSideBatching != nil { + return ErrorFieldNotSupportedByPredictorType(userconfig.ServerSideBatchingKey, predictor.Type) + } + if predictor.TensorFlowServingImage != "" { + return ErrorFieldNotSupportedByPredictorType(userconfig.TensorFlowServingImageKey, predictor.Type) } - objects, err := awsClientForBucket.ListS3PathDir(path, false, pointer.Int64(1000)) - if err != nil { - return "", err - } else if len(objects) == 0 { - return "", errors.Wrap(ErrorInvalidTensorFlowModelPath(), path) + hasSingleModel := predictor.ModelPath != nil + hasMultiModels := predictor.Models != nil + + if !hasSingleModel && !hasMultiModels { + return ErrorMissingModel(predictor.Type) } - highestVersion := int64(0) - var highestPath string - for _, object := range objects { - if !strings.HasSuffix(*object.Key, "saved_model.pb") { - continue - } + var modelWrapError func(error) error + var modelResources []userconfig.ModelResource - keyParts := strings.Split(*object.Key, "/") - versionStr := keyParts[len(keyParts)-1] - version, err := strconv.ParseInt(versionStr, 10, 64) - if err != nil { - version = 0 + if hasSingleModel { + modelResources = []userconfig.ModelResource{ + { + Name: consts.SingleModelName, + ModelPath: *predictor.ModelPath, + }, + } + *predictor.ModelPath = s.EnsureSuffix(*predictor.ModelPath, "/") + modelWrapError = func(err error) error { + return errors.Wrap(err, userconfig.ModelPathKey) + } + } + if hasMultiModels { + if predictor.Models.SignatureKey != nil { + return errors.Wrap(ErrorFieldNotSupportedByPredictorType(userconfig.SignatureKeyKey, predictor.Type), userconfig.ModelsKey) } - possiblePath := "s3://" + filepath.Join(bucket, filepath.Join(keyParts[:len(keyParts)-1]...)) + if len(predictor.Models.Paths) > 0 { + modelWrapError = func(err error) error { + return errors.Wrap(err, userconfig.ModelsKey, userconfig.ModelsPathsKey) + } - if version >= highestVersion { - if isNeuronExport && isValidNeuronTensorFlowS3Directory(possiblePath, awsClientForBucket) { - highestVersion = version - highestPath = possiblePath + for _, path := range predictor.Models.Paths { + if path.SignatureKey != nil { + return errors.Wrap( + ErrorFieldNotSupportedByPredictorType(userconfig.SignatureKeyKey, predictor.Type), + userconfig.ModelsKey, + userconfig.ModelsPathsKey, + path.Name, + ) + } + (*path).ModelPath = s.EnsureSuffix((*path).ModelPath, "/") + modelResources = append(modelResources, *path) } - if !isNeuronExport && isValidTensorFlowS3Directory(possiblePath, awsClientForBucket) { - highestVersion = version - highestPath = possiblePath + } + + if predictor.Models.Dir != nil { + modelWrapError = func(err error) error { + return errors.Wrap(err, userconfig.ModelsKey, userconfig.ModelsDirKey) + } + + *(predictor.Models.Dir) = s.EnsureSuffix(*(predictor.Models.Dir), "/") + + var err error + modelResources, err = listModelResourcesFromPath(*predictor.Models.Dir, projectFiles, awsClient) + if err != nil { + return modelWrapError(err) } } } + var err error + *models, err = modelResourceToCurated(modelResources, projectFiles.ProjectDir()) + if err != nil { + return err + } - return highestPath, nil -} + if hasMultiModels { + for _, model := range *models { + if model.Name == consts.SingleModelName { + return modelWrapError(ErrorReservedModelName(model.Name)) + } + } + } -// isValidTensorFlowS3Directory checks that the path contains a valid S3 directory for TensorFlow models -// Must contain the following structure: -// 
- 1523423423/ (version prefix, usually a timestamp) -// - saved_model.pb -// - variables/ -// - variables.index -// - variables.data-00000-of-00001 (there are a variable number of these files) -func isValidTensorFlowS3Directory(path string, awsClientForBucket *aws.Client) bool { - if valid, err := awsClientForBucket.IsS3PathFile( - aws.JoinS3Path(path, "saved_model.pb"), - aws.JoinS3Path(path, "variables/variables.index"), - ); err != nil || !valid { - return false - } - - if valid, err := awsClientForBucket.IsS3PathPrefix( - aws.JoinS3Path(path, "variables/variables.data-00000-of"), - ); err != nil || !valid { - return false - } - return true -} + if err := checkDuplicateModelNames(*models); err != nil { + return modelWrapError(err) + } -// isValidNeuronTensorFlowS3Directory checks that the path contains a valid S3 directory for Neuron TensorFlow models -// Must contain the following structure: -// - 1523423423/ (version prefix, usually a timestamp) -// - saved_model.pb -func isValidNeuronTensorFlowS3Directory(path string, awsClient *aws.Client) bool { - if valid, err := awsClient.IsS3PathFile( - aws.JoinS3Path(path, "saved_model.pb"), - ); err != nil || !valid { - return false + for i := range *models { + if err := validateONNXModel(&(*models)[i], providerType, projectFiles, awsClient); err != nil { + return modelWrapError(err) + } } - return true + return nil } -func GetTFServingExportFromLocalPath(path string) (string, error) { - if err := files.CheckDir(path); err != nil { - return "", err - } - paths, err := files.ListDirRecursive(path, false, files.IgnoreHiddenFiles, files.IgnoreHiddenFolders) - if err != nil { - return "", err - } +func validateONNXModel( + modelResource *CuratedModelResource, + providerType types.ProviderType, + projectFiles ProjectFiles, + awsClient *aws.Client, +) error { - if len(paths) == 0 { - return "", ErrorDirIsEmpty(path) + modelName := modelResource.Name + if modelName == consts.SingleModelName { + modelName = "" } - highestVersion := int64(0) - var highestPath string + if modelResource.S3Path { + awsClientForBucket, err := aws.NewFromClientS3Path(modelResource.ModelPath, awsClient) + if err != nil { + return errors.Wrap(err, modelName) + } - for _, path := range paths { - if strings.HasSuffix(path, "saved_model.pb") { - possiblePath := filepath.Dir(path) + _, err = cr.S3PathValidator(modelResource.ModelPath) + if err != nil { + return errors.Wrap(err, modelName) + } - versionStr := filepath.Base(possiblePath) - version, err := strconv.ParseInt(versionStr, 10, 64) - if err != nil { - version = 0 + versions, err := getONNXVersionsFromS3Path(modelResource.ModelPath, awsClientForBucket) + if err != nil { + if errors.GetKind(err) == ErrModelPathNotDirectory { + return errors.Wrap(err, modelName) } - validTFDirectory, err := IsValidTensorFlowLocalDirectory(possiblePath) + modelSubS3Objects, err := awsClientForBucket.ListS3PathDir(modelResource.ModelPath, false, pointer.Int64(1000)) if err != nil { - return "", err + return errors.Wrap(err, modelName) } - if version > highestVersion && validTFDirectory { - highestVersion = version - highestPath = possiblePath + modelSubPaths := aws.ConvertS3ObjectsToKeys(modelSubS3Objects...) 
+ + if err := validateONNXS3ModelDir(modelResource.ModelPath, awsClientForBucket); err != nil { + if errors.GetKind(err) != ErrInvalidONNXModelPath { + return errors.Wrap(errors.Append(err, "\n\n"+ErrorInvalidONNXModelPath(modelResource.ModelPath, modelSubPaths).Error()), modelName) + } + return errors.Wrap(ErrorInvalidONNXModelPath(modelResource.ModelPath, modelSubPaths), modelName) } } - } - - return highestPath, nil -} + modelResource.Versions = versions + } else { + if providerType == types.AWSProviderType { + return ErrorLocalModelPathNotSupportedByAWSProvider() + } -func IsValidTensorFlowLocalDirectory(path string) (bool, error) { - paths, err := files.ListDirRecursive(path, true, files.IgnoreHiddenFiles, files.IgnoreHiddenFolders) - if err != nil { - return false, err - } - pathSet := strset.New(paths...) + versions, err := getONNXVersionsFromLocalPath(modelResource.ModelPath) + if err != nil { + if errors.GetKind(err) == ErrModelPathNotDirectory { + return errors.Wrap(err, modelName) + } - if !(pathSet.Has("saved_model.pb") && pathSet.Has("variables/variables.index")) { - return false, nil - } + modelSubPaths, err := files.ListDirRecursive(modelResource.ModelPath, false) + if err != nil { + return errors.Wrap(err, modelName) + } - for _, path := range paths { - if strings.HasPrefix(path, "variables/variables.data-00000-of") { - return true, nil + if err := validateONNXLocalModelDir(modelResource.ModelPath); err != nil { + if errors.GetKind(err) != ErrInvalidONNXModelPath { + return errors.Wrap(errors.Append(err, "\n\n"+ErrorInvalidONNXModelPath(modelResource.ModelPath, modelSubPaths).Error()), modelName) + } + return errors.Wrap(ErrorInvalidONNXModelPath(modelResource.ModelPath, modelSubPaths), modelName) + } } + modelResource.Versions = versions } - return false, nil + return nil } func validatePythonPath(predictor *userconfig.Predictor, projectFiles ProjectFiles) error { @@ -1218,42 +1412,12 @@ func validateUpdateStrategy(updateStrategy *userconfig.UpdateStrategy) error { return nil } -func FindDuplicateNames(apis []userconfig.API) []userconfig.API { - names := make(map[string][]userconfig.API) - - for _, api := range apis { - names[api.Name] = append(names[api.Name], api) - } - - for name := range names { - if len(names[name]) > 1 { - return names[name] - } - } - - return nil -} - -func checkDuplicateModelNames(modelResources []*userconfig.ModelResource) error { - names := strset.New() - - for _, modelResource := range modelResources { - if names.Has(modelResource.Name) { - return ErrorDuplicateModelNames(modelResource.Name) - } - names.Add(modelResource.Name) - } - - return nil -} - func validateDockerImagePath( image string, providerType types.ProviderType, awsClient *aws.Client, - k8sClient *k8s.Client, // will be nil for local provider + k8sClient *k8s.Client, // will be nil for local provider) ) error { - if consts.DefaultImagePathsSet.Has(image) { return nil } @@ -1382,32 +1546,3 @@ func getDockerAuthStrFromK8s(dockerClient *docker.Client, k8sClient *k8s.Client) return dockerAuthStr, nil } - -func verifyTotalWeight(apis []*userconfig.TrafficSplit) error { - totalWeight := int32(0) - for _, api := range apis { - totalWeight += api.Weight - } - if totalWeight == 100 { - return nil - } - return errors.Wrap(ErrorIncorrectTrafficSplitterWeightTotal(totalWeight), userconfig.APIsKey) -} - -// areTrafficSplitterAPIsUnique gives error if the same API is used multiple times in TrafficSplitter -func areTrafficSplitterAPIsUnique(apis []*userconfig.TrafficSplit) error { - names := 
make(map[string][]userconfig.TrafficSplit) - for _, api := range apis { - names[api.Name] = append(names[api.Name], *api) - } - var notUniqueAPIs []string - for name := range names { - if len(names[name]) > 1 { - notUniqueAPIs = append(notUniqueAPIs, names[name][0].Name) - } - } - if len(notUniqueAPIs) > 0 { - return errors.Wrap(ErrorTrafficSplitterAPIsNotUnique(notUniqueAPIs), userconfig.APIsKey) - } - return nil -} diff --git a/pkg/types/userconfig/api.go b/pkg/types/userconfig/api.go index d11a1d3ab1..2da653d495 100644 --- a/pkg/types/userconfig/api.go +++ b/pkg/types/userconfig/api.go @@ -47,7 +47,8 @@ type Predictor struct { Type PredictorType `json:"type" yaml:"type"` Path string `json:"path" yaml:"path"` ModelPath *string `json:"model_path" yaml:"model_path"` - Models []*ModelResource `json:"models" yaml:"models"` + SignatureKey *string `json:"signature_key" yaml:"signature_key"` + Models *MultiModels `json:"models" yaml:"models"` ServerSideBatching *ServerSideBatching `json:"server_side_batching" yaml:"server_side_batching"` ProcessesPerReplica int32 `json:"processes_per_replica" yaml:"processes_per_replica"` ThreadsPerProcess int32 `json:"threads_per_process" yaml:"threads_per_process"` @@ -56,7 +57,14 @@ type Predictor struct { TensorFlowServingImage string `json:"tensorflow_serving_image" yaml:"tensorflow_serving_image"` Config map[string]interface{} `json:"config" yaml:"config"` Env map[string]string `json:"env" yaml:"env"` - SignatureKey *string `json:"signature_key" yaml:"signature_key"` +} + +type MultiModels struct { + Paths []*ModelResource `json:"paths" yaml:"paths"` + Dir *string `json:"dir" yaml:"dir"` + CacheSize *int32 `json:"cache_size" yaml:"cache_size"` + DiskCacheSize *int32 `json:"disk_cache_size" yaml:"disk_cache_size"` + SignatureKey *string `json:"signature_key" yaml:"signature_key"` } type TrafficSplit struct { @@ -119,12 +127,9 @@ func (api *API) Identify() string { func (api *API) ModelNames() []string { names := []string{} - if api != nil && len(api.Predictor.Models) > 0 { - for _, model := range api.Predictor.Models { - names = append(names, model.Name) - } + for _, model := range api.Predictor.Models.Paths { + names = append(names, model.Name) } - return names } @@ -359,11 +364,9 @@ func (predictor *Predictor) UserStr() string { if predictor.ModelPath != nil { sb.WriteString(fmt.Sprintf("%s: %s\n", ModelPathKey, *predictor.ModelPath)) } - if predictor.ModelPath == nil && len(predictor.Models) > 0 { + if predictor.ModelPath == nil && predictor.Models != nil { sb.WriteString(fmt.Sprintf("%s:\n", ModelsKey)) - for _, model := range predictor.Models { - sb.WriteString(fmt.Sprintf(s.Indent(model.UserStr(), " "))) - } + sb.WriteString(s.Indent(predictor.Models.UserStr(), " ")) } if predictor.SignatureKey != nil { sb.WriteString(fmt.Sprintf("%s: %s\n", SignatureKeyKey, *predictor.SignatureKey)) @@ -397,23 +400,48 @@ func (predictor *Predictor) UserStr() string { return sb.String() } -func (batch *ServerSideBatching) UserStr() string { +func (models *MultiModels) UserStr() string { var sb strings.Builder - sb.WriteString(fmt.Sprintf("%s: %s\n", MaxBatchSizeKey, s.Int32(batch.MaxBatchSize))) - sb.WriteString(fmt.Sprintf("%s: %s\n", BatchIntervalKey, batch.BatchInterval)) + + if models.Dir != nil { + sb.WriteString(fmt.Sprintf("%s: %s\n", ModelsDirKey, *models.Dir)) + } else if len(models.Paths) > 0 { + sb.WriteString(fmt.Sprintf("%s:\n", ModelsPathsKey)) + for _, model := range models.Paths { + modelUserStr := s.Indent(model.UserStr(), " ") + modelUserStr = 
modelUserStr[:2] + "-" + modelUserStr[3:] + sb.WriteString(modelUserStr) + } + } + if models.SignatureKey != nil { + sb.WriteString(fmt.Sprintf("%s: %s\n", ModelsDirKey, *models.SignatureKey)) + } + if models.CacheSize != nil { + sb.WriteString(fmt.Sprintf("%s: %s\n", ModelsCacheSizeKey, s.Int32(*models.CacheSize))) + } + if models.DiskCacheSize != nil { + sb.WriteString(fmt.Sprintf("%s: %s\n", ModelsDiskCacheSizeKey, s.Int32(*models.DiskCacheSize))) + } return sb.String() } func (model *ModelResource) UserStr() string { var sb strings.Builder - sb.WriteString(fmt.Sprintf("- %s: %s\n", ModelsNameKey, model.Name)) - sb.WriteString(fmt.Sprintf(s.Indent("%s: %s\n", " "), ModelPathKey, model.ModelPath)) + sb.WriteString(fmt.Sprintf("%s: %s\n", ModelsNameKey, model.Name)) + sb.WriteString(fmt.Sprintf("%s: %s\n", ModelPathKey, model.ModelPath)) if model.SignatureKey != nil { - sb.WriteString(fmt.Sprintf(s.Indent("%s: %s\n", " "), SignatureKeyKey, *model.SignatureKey)) + sb.WriteString(fmt.Sprintf("%s: %s\n", SignatureKeyKey, *model.SignatureKey)) } return sb.String() } +func (batch *ServerSideBatching) UserStr() string { + var sb strings.Builder + sb.WriteString(fmt.Sprintf("%s: %s\n", MaxBatchSizeKey, s.Int32(batch.MaxBatchSize))) + sb.WriteString(fmt.Sprintf("%s: %s\n", BatchIntervalKey, batch.BatchInterval)) + return sb.String() +} + func (monitoring *Monitoring) UserStr() string { var sb strings.Builder sb.WriteString(fmt.Sprintf("%s: %s\n", ModelTypeKey, monitoring.ModelType.String())) diff --git a/pkg/types/userconfig/config_key.go b/pkg/types/userconfig/config_key.go index ca7201e438..b55fa9d5a1 100644 --- a/pkg/types/userconfig/config_key.go +++ b/pkg/types/userconfig/config_key.go @@ -36,16 +36,22 @@ const ( PathKey = "path" ModelPathKey = "model_path" ServerSideBatchingKey = "server_side_batching" - ProcessesPerReplicaKey = "processes_per_replica" - ThreadsPerProcessKey = "threads_per_process" ModelsKey = "models" PythonPathKey = "python_path" ImageKey = "image" TensorFlowServingImageKey = "tensorflow_serving_image" + ProcessesPerReplicaKey = "processes_per_replica" + ThreadsPerProcessKey = "threads_per_process" ConfigKey = "config" EnvKey = "env" SignatureKeyKey = "signature_key" + // MultiModels + ModelsPathsKey = "paths" + ModelsDirKey = "dir" + ModelsCacheSizeKey = "cache_size" + ModelsDiskCacheSizeKey = "disk_cache_size" + // ServerSideBatching MaxBatchSizeKey = "max_batch_size" BatchIntervalKey = "batch_interval" diff --git a/pkg/types/userconfig/predictor_type.go b/pkg/types/userconfig/predictor_type.go index 6fb1efe7cf..9f14b25611 100644 --- a/pkg/types/userconfig/predictor_type.go +++ b/pkg/types/userconfig/predictor_type.go @@ -32,6 +32,13 @@ var _predictorTypes = []string{ "onnx", } +var _casedPredictorTypes = []string{ + "unknown", + "Python", + "TensorFlow", + "ONNX", +} + func PredictorTypeFromString(s string) PredictorType { for i := 0; i < len(_predictorTypes); i++ { if s == _predictorTypes[i] { @@ -49,6 +56,10 @@ func (t PredictorType) String() string { return _predictorTypes[t] } +func (t PredictorType) CasedString() string { + return _casedPredictorTypes[t] +} + // MarshalText satisfies TextMarshaler func (t PredictorType) MarshalText() ([]byte, error) { return []byte(t.String()), nil diff --git a/pkg/workloads/cortex/downloader/download.py b/pkg/workloads/cortex/downloader/download.py index 3ecaef2af9..58dffe48bf 100644 --- a/pkg/workloads/cortex/downloader/download.py +++ b/pkg/workloads/cortex/downloader/download.py @@ -19,7 +19,7 @@ from cortex.lib import util 
from cortex.lib.storage import S3 -from cortex.lib.log import cx_logger +from cortex.lib.log import cx_logger as logger def start(args): @@ -33,28 +33,27 @@ def start(args): if item_name != "": if download_arg.get("hide_from_log", False): - cx_logger().info("downloading {}".format(item_name)) + logger().info("downloading {}".format(item_name)) else: - cx_logger().info("downloading {} from {}".format(item_name, from_path)) - s3_client.download(prefix, to_path) + logger().info("downloading {} from {}".format(item_name, from_path)) + + if download_arg.get("to_file", False): + s3_client.download_file(prefix, to_path) + else: + s3_client.download(prefix, to_path) if download_arg.get("unzip", False): if item_name != "" and not download_arg.get("hide_unzipping_log", False): - cx_logger().info("unzipping {}".format(item_name)) - util.extract_zip( - os.path.join(to_path, os.path.basename(from_path)), delete_zip_file=True - ) - - if download_arg.get("tf_model_version_rename", "") != "": - dest = util.trim_suffix(download_arg["tf_model_version_rename"], "/") - dir_path = os.path.dirname(dest) - entries = os.listdir(dir_path) - if len(entries) == 1: - src = os.path.join(dir_path, entries[0]) - os.rename(src, dest) + logger().info("unzipping {}".format(item_name)) + if download_arg.get("to_file", False): + util.extract_zip(to_path, delete_zip_file=True) + else: + util.extract_zip( + os.path.join(to_path, os.path.basename(from_path)), delete_zip_file=True + ) if download_config.get("last_log", "") != "": - cx_logger().info(download_config["last_log"]) + logger().info(download_config["last_log"]) def main(): diff --git a/pkg/workloads/cortex/lib/type/model.py b/pkg/workloads/cortex/lib/api/__init__.py similarity index 55% rename from pkg/workloads/cortex/lib/type/model.py rename to pkg/workloads/cortex/lib/api/__init__.py index 59c9f0267d..50886a014b 100644 --- a/pkg/workloads/cortex/lib/type/model.py +++ b/pkg/workloads/cortex/lib/api/__init__.py @@ -12,22 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
- -class Model: - def __init__(self, name, model_path, base_path, signature_key=None): - self.name = name - self.model_path = model_path - self.base_path = base_path - self.signature_key = signature_key - - -def get_model_signature_map(models): - signature_keys = {} - for model in models: - signature_keys[model.name] = model.signature_key - - return signature_keys - - -def get_model_names(models): - return [model.name for model in models] +from cortex.lib.api.predictor import Predictor +from cortex.lib.api.monitoring import Monitoring +from cortex.lib.api.api import API, get_api, get_spec diff --git a/pkg/workloads/cortex/lib/type/api.py b/pkg/workloads/cortex/lib/api/api.py similarity index 78% rename from pkg/workloads/cortex/lib/type/api.py rename to pkg/workloads/cortex/lib/api/api.py index 9b92ded540..dfc14e117d 100644 --- a/pkg/workloads/cortex/lib/type/api.py +++ b/pkg/workloads/cortex/lib/api/api.py @@ -18,32 +18,35 @@ from pathlib import Path import json import threading - import datadog +from typing import Tuple, Union, Optional -from cortex.lib.log import cx_logger +from cortex.lib.log import cx_logger as logger from cortex.lib.exceptions import CortexException -from cortex.lib.type.predictor import Predictor -from cortex.lib.type.monitoring import Monitoring -from cortex.lib.storage import S3 +from cortex.lib.storage import LocalStorage, S3 + +from cortex.lib.api import Monitoring, Predictor class API: - def __init__(self, provider, storage, model_dir, cache_dir=".", **kwargs): + def __init__(self, provider, storage, api_spec, model_dir, cache_dir="."): self.provider = provider - self.id = kwargs["id"] - self.predictor_id = kwargs["predictor_id"] - self.deployment_id = kwargs["deployment_id"] - self.key = kwargs["key"] - self.metadata_root = kwargs["metadata_root"] - self.name = kwargs["name"] - self.predictor = Predictor(provider, model_dir, cache_dir, **kwargs["predictor"]) - self.monitoring = None - if kwargs.get("monitoring") is not None: - self.monitoring = Monitoring(**kwargs["monitoring"]) - - self.cache_dir = cache_dir self.storage = storage + self.api_spec = api_spec + self.cache_dir = cache_dir + + self.id = api_spec["id"] + self.predictor_id = api_spec["predictor_id"] + self.deployment_id = api_spec["deployment_id"] + + self.key = api_spec["key"] + self.metadata_root = api_spec["metadata_root"] + self.name = api_spec["name"] + self.predictor = Predictor(provider, api_spec, model_dir) + + self.monitoring = None + if self.api_spec.get("monitoring") is not None: + self.monitoring = Monitoring(**self.api_spec["monitoring"]) if provider != "local": host_ip = os.environ["HOST_IP"] @@ -55,7 +58,7 @@ def __init__(self, provider, storage, model_dir, cache_dir=".", **kwargs): def get_cached_classes(self): prefix = os.path.join(self.metadata_root, "classes") + "/" - class_paths = self.storage.search(prefix=prefix) + class_paths, _ = self.storage.search(prefix=prefix) class_set = set() for class_path in class_paths: encoded_class_name = class_path.split("/")[-1] @@ -114,7 +117,7 @@ def post_metrics(self, metrics): else: self.statsd.histogram(metric["MetricName"], value=metric["Value"], tags=tags) except: - cx_logger().warn("failure encountered while publishing metrics", exc_info=True) + logger().warn("failure encountered while publishing metrics", exc_info=True) def store_metrics_locally(self, status_code, total_time): status_code_series = int(status_code / 100) @@ -173,9 +176,41 @@ def prediction_metrics(self, dimensions, prediction_value): } -def get_spec(provider, storage, 
cache_dir, spec_path): +def get_api( + provider: str, + spec_path: str, + model_dir: str, + cache_dir: Optional[str], + bucket: Optional[str], + region: Optional[str], +) -> API: + storage, raw_api_spec = get_spec(provider, spec_path, cache_dir, bucket, region) + + api = API( + provider=provider, + storage=storage, + api_spec=raw_api_spec, + model_dir=model_dir, + cache_dir=cache_dir, + ) + + return api + + +def get_spec( + provider: str, + spec_path: str, + cache_dir: Optional[str], + bucket: Optional[str], + region: Optional[str], +) -> Tuple[Union[LocalStorage, S3], dict]: + if provider == "local": + storage = LocalStorage(cache_dir) + else: + storage = S3(bucket=bucket, region=region) + if provider == "local": - return read_json(spec_path) + return storage, read_json(spec_path) local_spec_path = os.path.join(cache_dir, "api_spec.json") @@ -183,9 +218,9 @@ def get_spec(provider, storage, cache_dir, spec_path): _, key = S3.deconstruct_s3_path(spec_path) storage.download_file(key, local_spec_path) - return read_json(local_spec_path) + return storage, read_json(local_spec_path) -def read_json(json_path): +def read_json(json_path: str): with open(json_path) as json_file: return json.load(json_file) diff --git a/pkg/workloads/cortex/lib/type/monitoring.py b/pkg/workloads/cortex/lib/api/monitoring.py similarity index 100% rename from pkg/workloads/cortex/lib/type/monitoring.py rename to pkg/workloads/cortex/lib/api/monitoring.py diff --git a/pkg/workloads/cortex/lib/api/predictor.py b/pkg/workloads/cortex/lib/api/predictor.py new file mode 100644 index 0000000000..8ea4fd5fdd --- /dev/null +++ b/pkg/workloads/cortex/lib/api/predictor.py @@ -0,0 +1,569 @@ +# Copyright 2020 Cortex Labs, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import imp +import inspect +import dill +import shutil +import datetime +import glob +from copy import deepcopy +from typing import Any, Optional, Union + +# types +from cortex.lib.type import ( + predictor_type_from_api_spec, + PredictorType, + PythonPredictorType, + TensorFlowPredictorType, + TensorFlowNeuronPredictorType, + ONNXPredictorType, +) + +# clients +from cortex.lib.client.python import PythonClient +from cortex.lib.client.tensorflow import TensorFlowClient +from cortex.lib.client.onnx import ONNXClient + + +# crons +from cortex.lib.model import ( + FileBasedModelsGC, + TFSAPIServingThreadUpdater, + ModelsGC, + ModelTreeUpdater, +) + +# structures +from cortex.lib.model import ( + ModelsHolder, + ModelsTree, # only when num workers = 1 +) + +# concurrency +from cortex.lib.concurrency import FileLock + +# model validation +from cortex.lib.model import validate_model_paths + +# misc +from cortex.lib.storage import S3 +from cortex.lib import util +from cortex.lib.log import refresh_logger, cx_logger as logger +from cortex.lib.exceptions import CortexException, UserException, UserRuntimeException +from cortex import consts + + +class Predictor: + """ + Class to validate/load the predictor class (PythonPredictor, TensorFlowPredictor, ONNXPredictor). + Also makes the specified models in cortex.yaml available to the predictor's implementation. + """ + + def __init__(self, provider: str, api_spec: dict, model_dir: str): + """ + Args: + provider: "local" or "aws". + api_spec: API configuration. + model_dir: Where the models are stored on disk. + """ + + self.provider = provider + + self.type = predictor_type_from_api_spec(api_spec) + self.path = api_spec["predictor"]["path"] + self.config = api_spec["predictor"].get("config", {}) + + self.api_spec = api_spec + + self.crons = [] + if not _are_models_specified(self.api_spec): + return + + self.model_dir = model_dir + + self.caching_enabled = self._is_model_caching_enabled() + self.multiple_processes = self.api_spec["predictor"]["processes_per_replica"] > 1 + + # model caching can only be enabled when processes_per_replica is 1 + # model side-reloading is supported for any number of processes_per_replica + + if self.caching_enabled: + self.models = ModelsHolder( + self.type, + self.model_dir, + mem_cache_size=self.api_spec["predictor"]["models"]["cache_size"], + disk_cache_size=self.api_spec["predictor"]["models"]["disk_cache_size"], + on_download_callback=model_downloader, + ) + elif not self.caching_enabled and self.type not in [ + TensorFlowPredictorType, + TensorFlowNeuronPredictorType, + ]: + self.models = ModelsHolder(self.type, self.model_dir) + else: + self.models = None + + if self.multiple_processes: + self.models_tree = None + else: + self.models_tree = ModelsTree() + + def initialize_client( + self, tf_serving_host: Optional[str] = None, tf_serving_port: Optional[str] = None + ) -> Union[PythonClient, TensorFlowClient, ONNXClient]: + """ + Initialize client that gives access to models specified in the API spec (cortex.yaml). + Only applies when models are provided in the API spec. + + Args: + tf_serving_host: Host of TF serving server. To be only used when the TensorFlow predictor is used. + tf_serving_port: Port of TF serving server. To be only used when the TensorFlow predictor is used. + + Return: + The client for the respective predictor type. 
+ """ + + signature_message = None + client = None + + if _are_models_specified(self.api_spec): + if self.type == PythonPredictorType: + client = PythonClient(self.api_spec, self.models, self.model_dir, self.models_tree) + + if self.type in [TensorFlowPredictorType, TensorFlowNeuronPredictorType]: + tf_serving_address = tf_serving_host + ":" + tf_serving_port + client = TensorFlowClient( + tf_serving_address, + self.api_spec, + self.models, + self.model_dir, + self.models_tree, + ) + if not self.caching_enabled: + cron = TFSAPIServingThreadUpdater(interval=5.0, client=client._client) + cron.start() + + if self.type == ONNXPredictorType: + client = ONNXClient(self.api_spec, self.models, self.model_dir, self.models_tree) + + return client + + def initialize_impl( + self, + project_dir: str, + client: Union[PythonClient, TensorFlowClient, ONNXClient], + job_spec: Optional[dict] = None, + ): + """ + Initialize predictor class as provided by the user. + + job_spec is a dictionary when the "kind" of the API is set to "BatchAPI". Otherwise, it's None. + """ + + # build args + class_impl = self.class_impl(project_dir) + constructor_args = inspect.getfullargspec(class_impl.__init__).args + config = deepcopy(self.config) + args = {} + if job_spec is not None and job_spec.get("config") is not None: + util.merge_dicts_in_place_overwrite(config, job_spec["config"]) + if "config" in constructor_args: + args["config"] = config + if "job_spec" in constructor_args: + args["job_spec"] = job_spec + + # initialize predictor class + try: + if self.type == PythonPredictorType: + if _are_models_specified(self.api_spec): + args["python_client"] = client + initialized_impl = class_impl(**args) + client.set_load_method(initialized_impl.load_model) + else: + initialized_impl = class_impl(**args) + if self.type in [TensorFlowPredictorType, TensorFlowNeuronPredictorType]: + args["tensorflow_client"] = client + initialized_impl = class_impl(**args) + if self.type == ONNXPredictorType: + args["onnx_client"] = client + initialized_impl = class_impl(**args) + except Exception as e: + raise UserRuntimeException(self.path, "__init__", str(e)) from e + finally: + refresh_logger() + + # initialize the crons if models have been specified and if the API kind is RealtimeAPI + if _are_models_specified(self.api_spec) and self.api_spec["kind"] == "RealtimeAPI": + if not self.multiple_processes and self.caching_enabled: + self.crons += [ + ModelTreeUpdater( + interval=10, + api_spec=self.api_spec, + tree=self.models_tree, + ondisk_models_dir=self.model_dir, + ), + ModelsGC( + interval=10, + api_spec=self.api_spec, + models=self.models, + tree=self.models_tree, + ), + ] + + if not self.caching_enabled and self.type in [PythonPredictorType, ONNXPredictorType]: + self.crons += [ + FileBasedModelsGC(interval=10, models=self.models, download_dir=self.model_dir) + ] + + for cron in self.crons: + cron.start() + + return initialized_impl + + def class_impl(self, project_dir): + if self.type in [TensorFlowPredictorType, TensorFlowNeuronPredictorType]: + target_class_name = "TensorFlowPredictor" + validations = TENSORFLOW_CLASS_VALIDATION + elif self.type == ONNXPredictorType: + target_class_name = "ONNXPredictor" + validations = ONNX_CLASS_VALIDATION + elif self.type == PythonPredictorType: + target_class_name = "PythonPredictor" + validations = PYTHON_CLASS_VALIDATION + + try: + with FileLock("/run/init_stagger.lock"): + impl = self._load_module("cortex_predictor", os.path.join(project_dir, self.path)) + except CortexException as e: + 
e.wrap("error in " + self.path) + raise + finally: + refresh_logger() + + try: + classes = inspect.getmembers(impl, inspect.isclass) + predictor_class = None + for class_df in classes: + if class_df[0] == target_class_name: + if predictor_class is not None: + raise UserException( + f"multiple definitions for {target_class_name} class found; please check your imports and class definitions and ensure that there is only one Predictor class definition" + ) + predictor_class = class_df[1] + if predictor_class is None: + raise UserException(f"{target_class_name} class is not defined") + _validate_impl(predictor_class, validations, self.api_spec) + if self.type == PythonPredictorType: + _validate_python_predictor_with_models(predictor_class, self.api_spec) + except CortexException as e: + e.wrap("error in " + self.path) + raise + return predictor_class + + def _load_module(self, module_name, impl_path): + if impl_path.endswith(".pickle"): + try: + impl = imp.new_module(module_name) + + with open(impl_path, "rb") as pickle_file: + pickled_dict = dill.load(pickle_file) + for key in pickled_dict: + setattr(impl, key, pickled_dict[key]) + except Exception as e: + raise UserException("unable to load pickle", str(e)) from e + else: + try: + impl = imp.load_source(module_name, impl_path) + except Exception as e: + raise UserException(str(e)) from e + + return impl + + def _is_model_caching_enabled(self) -> bool: + """ + Checks if model caching is enabled (models:cache_size and models:disk_cache_size). + """ + if ( + self.api_spec["predictor"]["models"] + and self.api_spec["predictor"]["models"]["cache_size"] is not None + and self.api_spec["predictor"]["models"]["disk_cache_size"] is not None + ): + return True + return False + + def __del__(self) -> None: + for cron in self.crons: + cron.stop() + for cron in self.crons: + cron.join() + + +def _are_models_specified(api_spec: dict) -> bool: + """ + Checks if models have been specified in the API spec (cortex.yaml). + + Args: + api_spec: API configuration. 
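+
+    Returns:
+        True when predictor.model_path, predictor.models.dir or a non-empty
+        predictor.models.paths is specified; False otherwise.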
+ """ + if api_spec["predictor"]["model_path"] is not None: + return True + + if api_spec["predictor"]["models"] and ( + api_spec["predictor"]["models"]["dir"] is not None + or len(api_spec["predictor"]["models"]["paths"]) > 0 + ): + return True + return False + + +PYTHON_CLASS_VALIDATION = { + "required": [ + { + "name": "__init__", + "required_args": ["self", "config"], + "optional_args": ["job_spec", "python_client"], + }, + { + "name": "predict", + "required_args": ["self"], + "optional_args": ["payload", "query_params", "headers", "batch_id"], + }, + ], + "optional": [ + {"name": "on_job_complete", "required_args": ["self"]}, + { + "name": "post_predict", + "required_args": ["self"], + "optional_args": ["response", "payload", "query_params", "headers"], + }, + { + "name": "load_model", + "required_args": ["self", "model_path"], + }, + ], +} + +TENSORFLOW_CLASS_VALIDATION = { + "required": [ + { + "name": "__init__", + "required_args": ["self", "tensorflow_client", "config"], + "optional_args": ["job_spec"], + }, + { + "name": "predict", + "required_args": ["self"], + "optional_args": ["payload", "query_params", "headers", "batch_id"], + }, + ], + "optional": [ + {"name": "on_job_complete", "required_args": ["self"]}, + { + "name": "post_predict", + "required_args": ["self"], + "optional_args": ["response", "payload", "query_params", "headers"], + }, + ], +} + +ONNX_CLASS_VALIDATION = { + "required": [ + { + "name": "__init__", + "required_args": ["self", "onnx_client", "config"], + "optional_args": ["job_spec"], + }, + { + "name": "predict", + "required_args": ["self"], + "optional_args": ["payload", "query_params", "headers", "batch_id"], + }, + ], + "optional": [ + {"name": "on_job_complete", "required_args": ["self"]}, + { + "name": "post_predict", + "required_args": ["self"], + "optional_args": ["response", "payload", "query_params", "headers"], + }, + ], +} + + +def _validate_impl(impl, impl_req, api_spec): + for optional_func_signature in impl_req.get("optional", []): + _validate_optional_fn_args(impl, optional_func_signature, api_spec) + + for required_func_signature in impl_req.get("required", []): + _validate_required_fn_args(impl, required_func_signature, api_spec) + + +def _validate_optional_fn_args(impl, func_signature, api_spec): + if getattr(impl, func_signature["name"], None): + _validate_required_fn_args(impl, func_signature, api_spec) + + +def _validate_required_fn_args(impl, func_signature, api_spec): + target_class_name = impl.__name__ + + fn = getattr(impl, func_signature["name"], None) + if not fn: + raise UserException( + f"class {target_class_name}", + f'required method "{func_signature["name"]}" is not defined', + ) + + if not callable(fn): + raise UserException( + f"class {target_class_name}", + f'"{func_signature["name"]}" is defined, but is not a method', + ) + + required_args = func_signature.get("required_args", []) + optional_args = func_signature.get("optional_args", []) + + argspec = inspect.getfullargspec(fn) + fn_str = f'{func_signature["name"]}({", ".join(argspec.args)})' + + for arg_name in required_args: + if arg_name not in argspec.args: + raise UserException( + f"class {target_class_name}", + f'invalid signature for method "{fn_str}"', + f'"{arg_name}" is a required argument, but was not provided', + ) + + if arg_name == "self": + if argspec.args[0] != "self": + raise UserException( + f"class {target_class_name}", + f'invalid signature for method "{fn_str}"', + f'"self" must be the first argument', + ) + + seen_args = [] + for arg_name in 
argspec.args: + if arg_name not in required_args and arg_name not in optional_args: + raise UserException( + f"class {target_class_name}", + f'invalid signature for method "{fn_str}"', + f'"{arg_name}" is not a supported argument', + ) + + if arg_name in seen_args: + raise UserException( + f"class {target_class_name}", + f'invalid signature for method "{fn_str}"', + f'"{arg_name}" is duplicated', + ) + + seen_args.append(arg_name) + + +def _validate_python_predictor_with_models(impl, api_spec): + target_class_name = impl.__name__ + + if _are_models_specified(api_spec): + constructor = getattr(impl, "__init__") + constructor_arg_spec = inspect.getfullargspec(constructor) + if "python_client" not in constructor_arg_spec.args: + raise UserException( + f"class {target_class_name}", + f'invalid signature for method "__init__"', + f'"python_client" is a required argument, but was not provided', + f'when the python predictor type is used and models are specified in the api spec, adding the "python_client" argument is required', + ) + + if getattr(impl, "load_model", None) is None: + raise UserException( + f"class {target_class_name}", + f'required method "load_model" is not defined', + f'when the python predictor type is used and models are specified in the api spec, adding the "load_model" method is required', + ) + + +def model_downloader( + predictor_type: PredictorType, + bucket_name: str, + model_name: str, + model_version: str, + model_path: str, + temp_dir: str, + model_dir: str, +) -> Optional[datetime.datetime]: + """ + Downloads model to disk. Validates the S3 model path and the downloaded model as well. + + Args: + bucket_name: Name of the bucket where the model is stored. + model_name: Name of the model. Is part of the model's local path. + model_version: Version of the model. Is part of the model's local path. + model_path: S3 model prefix to the versioned model. + temp_dir: Where to temporarily store the model for validation. + model_dir: The top directory of where all models are stored locally. + + Returns: + The model's timestamp. None if the model didn't pass the validation, if it doesn't exist or if there are not enough permissions. 
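+
+    Note:
+        predictor_type selects which validation rules are applied to the model's files.
+        As an illustration (names are examples only), a model downloaded with
+        model_name "iris" and model_version "2" ends up under <model_dir>/iris/2/
+        once it passes validation.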
+ """ + + logger().info( + f"downloading from bucket {bucket_name}/{model_path}, model {model_name} of version {model_version}, temporarily to {temp_dir} and then finally to {model_dir}" + ) + + s3_client = S3(bucket_name, client_config={}) + + # validate upstream S3 model + sub_paths, ts = s3_client.search(model_path) + try: + validate_model_paths(sub_paths, predictor_type, model_path) + except CortexException: + logger().info(f"failed validating model {model_name} of version {model_version}") + return None + + # download model to temp dir + temp_dest = os.path.join(temp_dir, model_name, model_version) + try: + s3_client.download_dir_contents(model_path, temp_dest) + except CortexException: + logger().info( + f"failed downloading model {model_name} of version {model_version} to temp dir {temp_dest}" + ) + shutil.rmtree(temp_dest) + return None + + # validate model + model_contents = glob.glob(temp_dest + "*/**", recursive=True) + model_contents = util.remove_non_empty_directory_paths(model_contents) + try: + validate_model_paths(model_contents, predictor_type, temp_dest) + except CortexException: + logger().info( + f"failed validating model {model_name} of version {model_version} from temp dir" + ) + shutil.rmtree(temp_dest) + return None + + # move model to dest dir + model_top_dir = os.path.join(model_dir, model_name) + ondisk_model_version = os.path.join(model_top_dir, model_version) + logger().info( + f"moving model {model_name} of version {model_version} to final dir {ondisk_model_version}" + ) + if os.path.isdir(ondisk_model_version): + shutil.rmtree(ondisk_model_version) + shutil.move(temp_dest, ondisk_model_version) + + return max(ts) diff --git a/pkg/workloads/cortex/lib/checkers/pod.py b/pkg/workloads/cortex/lib/checkers/pod.py index e0272aa39e..c7567f8ea6 100644 --- a/pkg/workloads/cortex/lib/checkers/pod.py +++ b/pkg/workloads/cortex/lib/checkers/pod.py @@ -13,6 +13,7 @@ # limitations under the License. import os, stat, time + from cortex import consts diff --git a/pkg/workloads/cortex/lib/client/onnx.py b/pkg/workloads/cortex/lib/client/onnx.py index cb948f3528..73df809891 100644 --- a/pkg/workloads/cortex/lib/client/onnx.py +++ b/pkg/workloads/cortex/lib/client/onnx.py @@ -12,79 +12,493 @@ # See the License for the specific language governing permissions and # limitations under the License. -import onnxruntime as rt +import os +import datetime +import threading as td +import multiprocessing as mp +from typing import Any, Tuple, Optional + +try: + import onnxruntime as rt + + onnx_dependencies_installed = True +except ImportError: + onnx_dependencies_installed = False import numpy as np -from cortex.lib.log import cx_logger +from cortex.lib.log import cx_logger as logger from cortex.lib import util -from cortex.lib.exceptions import UserRuntimeException, CortexException, UserException -from cortex.lib.type.model import Model, get_model_names +from cortex.lib.exceptions import UserRuntimeException, CortexException, UserException, WithBreak +from cortex.lib.model import ( + ModelsHolder, + LockedModel, + ModelsTree, + LockedModelsTree, + CuratedModelResources, + find_ondisk_model_info, + find_ondisk_models_with_lock, +) +from cortex.lib.concurrency import LockedFile from cortex import consts class ONNXClient: - def __init__(self, models): - """Setup ONNX runtime session. + def __init__( + self, + api_spec: dict, + models: ModelsHolder, + model_dir: str, + models_tree: Optional[ModelsTree], + lock_dir: Optional[str] = "/run/cron", + ): + """ + Setup ONNX runtime. 
Args: - models ([Model]): List of models deployed with ONNX container. + api_spec: API configuration. + + models: Holding all models into memory. + model_dir: Where the models are saved on disk. + + models_tree: A tree of the available models from upstream. + lock_dir: Where the resource locks are found. Only when processes_per_replica > 0 and caching disabled. """ + if not onnx_dependencies_installed: + raise NameError("onnx dependencies not installed") + + self._api_spec = api_spec self._models = models - self._model_names = get_model_names(models) + self._models_tree = models_tree + self._model_dir = model_dir + self._lock_dir = lock_dir - self._sessions = {} - self._signatures = {} - self._input_signatures = {} - for model in models: - self._sessions[model.name] = rt.InferenceSession(model.base_path) - self._signatures[model.name] = self._sessions[model.name].get_inputs() + self._spec_models = CuratedModelResources(api_spec["curated_model_resources"]) - metadata = {} - for meta in self._signatures[model.name]: - numpy_type = ONNX_TO_NP_TYPE.get(meta.type, meta.type) - metadata[meta.name] = {"shape": meta.shape, "type": numpy_type} - self._input_signatures[model.name] = metadata + if ( + self._api_spec["predictor"]["models"] + and self._api_spec["predictor"]["models"]["dir"] is not None + ): + self._models_dir = True + else: + self._models_dir = False + self._spec_model_names = self._spec_models.get_field("name") - def predict(self, model_input, model_name=None): - """Validate input, convert it to a dictionary of input_name to numpy.ndarray, and make a prediction. + # for when local models are used + self._spec_local_model_names = self._spec_models.get_local_model_names() + self._local_model_ts = int(datetime.datetime.now(datetime.timezone.utc).timestamp()) + + self._multiple_processes = self._api_spec["predictor"]["processes_per_replica"] > 1 + self._caching_enabled = self._is_model_caching_enabled() + + self._models.set_callback("load", self._load_model) + + def _validate_model_args( + self, model_name: Optional[str] = None, model_version: str = "latest" + ) -> Tuple[str, str]: + """ + Validate the model name and model version. Args: - model_input: Input to the model. - model_name: Model to use when multiple models are deployed in a single API. + model_name: Name of the model. + model_version: Model version to use. Can also be "latest" for picking the highest version. Returns: - numpy.ndarray: The prediction returned from the model. + The processed model_name, model_version tuple if they had to go through modification. + + Raises: + UserRuntimeException if the validation fails. 
""" - if consts.SINGLE_MODEL_NAME in self._model_names: - return self._run_inference(model_input, consts.SINGLE_MODEL_NAME) - if model_name is None: + if model_version != "latest" and not model_version.isnumeric(): raise UserRuntimeException( - "model_name was not specified, choose one of the following: {}".format( - self._model_names + "model_version must be either a parse-able numeric value or 'latest'" + ) + + # when predictor:model_path or predictor:models:paths is specified + if not self._models_dir: + + # when predictor:model_path is provided + if consts.SINGLE_MODEL_NAME in self._spec_model_names: + return consts.SINGLE_MODEL_NAME, model_version + + # when predictor:models:paths is specified + if model_name is None: + raise UserRuntimeException( + f"model_name was not specified, choose one of the following: {self._spec_model_names}" + ) + + if model_name not in self._spec_model_names: + raise UserRuntimeException( + f"'{model_name}' model wasn't found in the list of available models" ) + + # when predictor:models:dir is specified + if self._models_dir: + if model_name is None: + raise UserRuntimeException("model_name was not specified") + if not self._caching_enabled: + available_models = find_ondisk_models_with_lock(self._lock_dir) + if model_name not in available_models: + raise UserRuntimeException( + f"'{model_name}' model wasn't found in the list of available models" + ) + + return model_name, model_version + + def predict( + self, model_input: Any, model_name: Optional[str] = None, model_version: str = "latest" + ) -> Any: + """ + Validate input, convert it to a dictionary of input_name to numpy.ndarray, and make a prediction. + + Args: + model_input: Input to the model. + model_name (optional): Name of the model to retrieve (when multiple models are deployed in an API). + When predictor.models.paths is specified, model_name should be the name of one of the models listed in the API config. + When predictor.models.dir is specified, model_name should be the name of a top-level directory in the models dir. + model_version (string, optional): Version of the model to retrieve. Can be omitted or set to "latest" to select the highest version. + + Returns: + The prediction returned from the model. + + Raises: + UserRuntimeException if the validation fails. + """ + + model_name, model_version = self._validate_model_args(model_name, model_version) + return self._run_inference(model_input, model_name, model_version) + + def _run_inference(self, model_input: Any, model_name: str, model_version: str) -> Any: + """ + Run the inference on model model_name of version model_version. + """ + + model = self._get_model(model_name, model_version) + if model is None: + raise UserRuntimeException( + f"model {model_name} of version {model_version} wasn't found" + ) + + try: + input_dict = convert_to_onnx_input(model_input, model["signatures"], model_name) + return model["session"].run([], input_dict) + except Exception as e: + raise UserRuntimeException( + f"failed inference with model {model_name} of version {model_version}", str(e) + ) + + def get_model(self, model_name: Optional[str] = None, model_version: str = "latest") -> dict: + """ + Validate input and then return the model loaded into a dictionary. + The counting of tag calls is recorded with this method (just like with the predict method). + + Args: + model_name: Model to use when multiple models are deployed in a single API. + model_version: Model version to use. Can also be "latest" for picking the highest version. 
+ + Returns: + The model as returned by _load_model method. + + Raises: + UserRuntimeException if the validation fails. + """ + model_name, model_version = self._validate_model_args(model_name, model_version) + model = self._get_model(model_name, model_version) + if model is None: + raise UserRuntimeException( + f"model {model_name} of version {model_version} wasn't found" ) + return model + + def _get_model(self, model_name: str, model_version: str) -> Any: + """ + Checks if versioned model is on disk, then checks if model is in memory, + and if not, it loads it into memory, and returns the model. + + Args: + model_name: Name of the model, as it's specified in predictor:models:paths or in the other case as they are named on disk. + model_version: Version of the model, as it's found on disk. Can also infer the version number from the "latest" version tag. + + Exceptions: + RuntimeError: if another thread tried to load the model at the very same time. + + Returns: + The model as returned by self._load_model method. + None if the model wasn't found or if it didn't pass the validation. + """ + + model = None + tag = "" + if model_version == "latest": + tag = model_version + + if not self._caching_enabled: + # determine model version + if tag == "latest": + model_version = self._get_latest_model_version_from_disk(model_name) + model_id = model_name + "-" + model_version + + # grab shared access to versioned model + resource = os.path.join(self._lock_dir, model_id + ".txt") + with LockedFile(resource, "r", reader_lock=True) as f: + + # check model status + file_status = f.read() + if file_status == "" or file_status == "not-available": + raise WithBreak + + current_upstream_ts = int(file_status.split(" ")[1]) + update_model = False + + # grab shared access to models holder and retrieve model + with LockedModel(self._models, "r", model_name, model_version): + status, local_ts = self._models.has_model(model_name, model_version) + if status == "not-available" or ( + status == "in-memory" and local_ts != current_upstream_ts + ): + update_model = True + raise WithBreak + model, _ = self._models.get_model(model_name, model_version, tag) + + # load model into memory and retrieve it + if update_model: + with LockedModel(self._models, "w", model_name, model_version): + status, _ = self._models.has_model(model_name, model_version) + if status == "not-available" or ( + status == "in-memory" and local_ts != current_upstream_ts + ): + if status == "not-available": + logger().info( + f"loading model {model_name} of version {model_version} (thread {td.get_ident()})" + ) + else: + logger().info( + f"reloading model {model_name} of version {model_version} (thread {td.get_ident()})" + ) + try: + self._models.load_model( + model_name, + model_version, + current_upstream_ts, + [tag], + ) + except Exception as e: + raise UserRuntimeException( + f"failed (re-)loading model {model_name} of version {model_version} (thread {td.get_ident()})", + str(e), + ) + model, _ = self._models.get_model(model_name, model_version, tag) + + if not self._multiple_processes and self._caching_enabled: + # determine model version + try: + if tag == "latest": + model_version = self._get_latest_model_version_from_tree( + model_name, self._models_tree.model_info(model_name) + ) + except ValueError: + # if model_name hasn't been found + raise UserRuntimeException( + f"'{model_name}' model of tag {tag} wasn't found in the list of available models" + ) + + # grab shared access to model tree + available_model = True + with 
LockedModelsTree(self._models_tree, "r", model_name, model_version): + + # check if the versioned model exists + model_id = model_name + "-" + model_version + if model_id not in self._models_tree: + available_model = False + raise WithBreak + + # retrieve model tree's metadata + upstream_model = self._models_tree[model_id] + current_upstream_ts = int(upstream_model["timestamp"].timestamp()) + + if not available_model: + return None + + # grab shared access to models holder and retrieve model + update_model = False + with LockedModel(self._models, "r", model_name, model_version): + status, local_ts = self._models.has_model(model_name, model_version) + if status in ["not-available", "on-disk"] or ( + status != "not-available" + and local_ts != current_upstream_ts + and not (status == "in-memory" and model_name in self._spec_local_model_names) + ): + update_model = True + raise WithBreak + model, _ = self._models.get_model(model_name, model_version, tag) + + # download, load into memory the model and retrieve it + if update_model: + # grab exclusive access to models holder + with LockedModel(self._models, "w", model_name, model_version): - if model_name not in self._model_names: + # check model status + status, local_ts = self._models.has_model(model_name, model_version) + + # refresh disk model + if model_name not in self._spec_local_model_names and ( + status == "not-available" + or (status in ["on-disk", "in-memory"] and local_ts != current_upstream_ts) + ): + if status == "not-available": + logger().info( + f"model {model_name} of version {model_version} not found locally; continuing with the download..." + ) + elif status == "on-disk": + logger().info( + f"found newer model {model_name} of vesion {model_version} on the S3 upstream than the one on the disk" + ) + else: + logger().info( + f"found newer model {model_name} of vesion {model_version} on the S3 upstream than the one loaded into memory" + ) + + # remove model from disk and memory + if status == "on-disk": + logger().info( + f"removing model from disk for model {model_name} of version {model_version}" + ) + self._models.remove_model(model_name, model_version) + if status == "in-memory": + logger().info( + f"removing model from disk and memory for model {model_name} of version {model_version}" + ) + self._models.remove_model(model_name, model_version) + + # download model + logger().info( + f"downloading model {model_name} of version {model_version} from the S3 upstream" + ) + date = self._models.download_model( + upstream_model["bucket"], + model_name, + model_version, + upstream_model["path"], + ) + if not date: + raise WithBreak + current_upstream_ts = date.timestamp() + + # give the local model a timestamp initialized at start time + if model_name in self._spec_local_model_names: + current_upstream_ts = self._local_model_ts + + # load model + try: + logger().info( + f"loading model {model_name} of version {model_version} into memory" + ) + self._models.load_model( + model_name, + model_version, + current_upstream_ts, + [tag], + ) + except Exception as e: + raise UserRuntimeException( + f"failed (re-)loading model {model_name} of version {model_version} (thread {td.get_ident()})", + str(e), + ) + + # retrieve model + model, _ = self._models.get_model(model_name, model_version, tag) + + return model + + def _load_model(self, model_path: str) -> None: + """ + Load ONNX model from disk. + + Args: + model_path: Directory path to a model's version on disk. + + Not thread-safe, so this method cannot be called on its own. 
Must only be called by self._get_model method. + """ + + model_path = os.path.join(model_path, os.listdir(model_path)[0]) + model = { + "session": rt.InferenceSession(model_path), + } + model["signatures"] = model["session"].get_inputs() + metadata = {} + for meta in model["signatures"]: + numpy_type = ONNX_TO_NP_TYPE.get(meta.type, meta.type) + metadata[meta.name] = { + "shape": meta.shape, + "type": numpy_type, + } + model["input_signatures"] = metadata + + return model + + def _get_latest_model_version_from_disk(self, model_name: str) -> str: + """ + Get the highest version of a specific model name. + Must only be used when caching disabled and processes_per_replica > 0. + """ + versions, timestamps = find_ondisk_model_info(self._lock_dir, model_name) + if len(versions) == 0: raise UserRuntimeException( - "'{}' model wasn't found in the list of available models: {}".format( - model_name, self._model_names + "'{}' model's versions have been removed; add at least a version to the model to resume operations".format( + model_name ) ) + return str(max(map(lambda x: int(x), versions))) - return self._run_inference(model_input, model_name) + def _get_latest_model_version_from_tree(self, model_name: str, model_info: dict) -> str: + """ + Get the highest version of a specific model name. + Must only be used when processes_per_replica = 1 and caching is enabled. + """ + versions, timestamps = model_info["versions"], model_info["timestamps"] + return str(max(map(lambda x: int(x), versions))) - def _run_inference(self, model_input, model_name): - input_dict = convert_to_onnx_input(model_input, self._signatures[model_name], model_name) - return self._sessions[model_name].run([], input_dict) + def _is_model_caching_enabled(self) -> bool: + """ + Checks if model caching is enabled (models:cache_size and models:disk_cache_size). + """ + return ( + self._api_spec["predictor"]["models"] + and self._api_spec["predictor"]["models"]["cache_size"] is not None + and self._api_spec["predictor"]["models"]["disk_cache_size"] is not None + ) @property - def sessions(self): - return self._sessions + def metadata(self) -> dict: + """ + The returned dictionary will be like in the following example: + { + ... + "yolov3": { + "versions": [ + "2", + "1" + ], + "timestamps": [ + 1601668127, + 1601668127 + ] + } + ... + } + """ + if not self._caching_enabled: + return find_ondisk_models_with_lock(self._lock_dir, include_timestamps=True) + else: + models_info = self._models_tree.get_all_models_info() + for model_name in models_info.keys(): + del models_info[model_name]["bucket"] + del models_info[model_name]["model_paths"] + return models_info @property - def input_signatures(self): - return self._input_signatures + def caching(self) -> bool: + return self._caching_enabled # https://github.com/microsoft/onnxruntime/blob/v0.4.0/onnxruntime/python/onnxruntime_pybind_mlvalue.cc diff --git a/pkg/workloads/cortex/lib/client/python.py b/pkg/workloads/cortex/lib/client/python.py new file mode 100644 index 0000000000..c4a33fef35 --- /dev/null +++ b/pkg/workloads/cortex/lib/client/python.py @@ -0,0 +1,413 @@ +# Copyright 2020 Cortex Labs, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import datetime +import threading as td +import multiprocessing as mp +from typing import Any, Optional, Callable + +from cortex.lib.log import cx_logger as logger +from cortex.lib.exceptions import UserRuntimeException, CortexException, UserException, WithBreak +from cortex.lib.model import ( + ModelsHolder, + LockedModel, + ModelsTree, + LockedModelsTree, + CuratedModelResources, + find_ondisk_model_info, + find_ondisk_models_with_lock, +) +from cortex.lib.concurrency import LockedFile +from cortex import consts + + +class PythonClient: + def __init__( + self, + api_spec: dict, + models: ModelsHolder, + model_dir: str, + models_tree: Optional[ModelsTree], + lock_dir: Optional[str] = "/run/cron", + load_model_fn: Optional[Callable[[str], Any]] = None, + ): + """ + Setup Python model client. + + Args: + api_spec: API configuration. + + models: Holding all models into memory. + model_dir: Where the models are saved on disk. + + models_tree: A tree of the available models from upstream. + lock_dir: Where the resource locks are found. Only when processes_per_replica > 0 and caching disabled. + load_model_fn: Function to load model into memory. + """ + + self._api_spec = api_spec + self._models = models + self._models_tree = models_tree + self._model_dir = model_dir + self._lock_dir = lock_dir + + self._spec_models = CuratedModelResources(api_spec["curated_model_resources"]) + + if ( + self._api_spec["predictor"]["models"] + and self._api_spec["predictor"]["models"]["dir"] is not None + ): + self._models_dir = True + else: + self._models_dir = False + self._spec_model_names = self._spec_models.get_field("name") + + # for when local models are used + self._spec_local_model_names = self._spec_models.get_local_model_names() + self._local_model_ts = int(datetime.datetime.now(datetime.timezone.utc).timestamp()) + + self._multiple_processes = self._api_spec["predictor"]["processes_per_replica"] > 1 + self._caching_enabled = self._is_model_caching_enabled() + + if callable(load_model_fn): + self._models.set_callback("load", load_model_fn) + + def set_load_method(self, load_model_fn: Callable[[str], Any]) -> None: + self._models.set_callback("load", load_model_fn) + + def get_model(self, model_name: Optional[str] = None, model_version: str = "latest") -> Any: + """ + Retrieve a model for inference. + + Args: + model_name (optional): Name of the model to retrieve (when multiple models are deployed in an API). + When predictor.models.paths is specified, model_name should be the name of one of the models listed in the API config. + When predictor.models.dir is specified, model_name should be the name of a top-level directory in the models dir. + model_version (string, optional): Version of the model to retrieve. Can be omitted or set to "latest" to select the highest version. + + Returns: + The model as loaded by the load_model() method. 
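+
+        A sketch of a possible call from a user's PythonPredictor, assuming the client
+        was stored as self.client in __init__, that a model named "text-generator" is
+        listed in the API spec, and that the object returned by load_model() exposes a
+        predict() method:
+
+            def predict(self, payload):
+                model = self.client.get_model("text-generator", "latest")
+                return model.predict(payload)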
+ """ + + if model_version != "latest" and not model_version.isnumeric(): + raise UserRuntimeException( + "model_version must be either a parse-able numeric value or 'latest'" + ) + + # when predictor:model_path or predictor:models:paths is specified + if not self._models_dir: + + # when predictor:model_path is provided + if consts.SINGLE_MODEL_NAME in self._spec_model_names: + model_name = consts.SINGLE_MODEL_NAME + model = self._get_model(model_name, model_version) + if model is None: + raise UserRuntimeException( + f"model {model_name} of version {model_version} wasn't found" + ) + return model + + # when predictor:models:paths is specified + if model_name is None: + raise UserRuntimeException( + f"model_name was not specified, choose one of the following: {self._spec_model_names}" + ) + + if model_name not in self._spec_model_names: + raise UserRuntimeException( + f"'{model_name}' model wasn't found in the list of available models" + ) + + # when predictor:models:dir is specified + if self._models_dir: + if model_name is None: + raise UserRuntimeException("model_name was not specified") + if not self._caching_enabled: + available_models = find_ondisk_models_with_lock(self._lock_dir) + if model_name not in available_models: + raise UserRuntimeException( + f"'{model_name}' model wasn't found in the list of available models" + ) + + model = self._get_model(model_name, model_version) + if model is None: + raise UserRuntimeException( + f"model {model_name} of version {model_version} wasn't found" + ) + return model + + def _get_model(self, model_name: str, model_version: str) -> Any: + """ + Checks if versioned model is on disk, then checks if model is in memory, + and if not, it loads it into memory, and returns the model. + + Args: + model_name: Name of the model, as it's specified in predictor:models:paths or in the other case as they are named on disk. + model_version: Version of the model, as it's found on disk. Can also infer the version number from the "latest" tag. + + Exceptions: + RuntimeError: if another thread tried to load the model at the very same time. + + Returns: + The model as returned by self._load_model method. + None if the model wasn't found or if it didn't pass the validation. 
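+
+        Note:
+            When caching is disabled, on-disk availability and upstream timestamps are
+            read from the per-model lock files found under the lock directory.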
+ """ + + model = None + tag = "" + if model_version == "latest": + tag = model_version + + if not self._caching_enabled: + # determine model version + if tag == "latest": + model_version = self._get_latest_model_version_from_disk(model_name) + model_id = model_name + "-" + model_version + + # grab shared access to versioned model + resource = os.path.join(self._lock_dir, model_id + ".txt") + with LockedFile(resource, "r", reader_lock=True) as f: + + # check model status + file_status = f.read() + if file_status == "" or file_status == "not-available": + raise WithBreak + + current_upstream_ts = int(file_status.split(" ")[1]) + update_model = False + + # grab shared access to models holder and retrieve model + with LockedModel(self._models, "r", model_name, model_version): + status, local_ts = self._models.has_model(model_name, model_version) + if status == "not-available" or ( + status == "in-memory" and local_ts != current_upstream_ts + ): + update_model = True + raise WithBreak + model, _ = self._models.get_model(model_name, model_version, tag) + + # load model into memory and retrieve it + if update_model: + with LockedModel(self._models, "w", model_name, model_version): + status, _ = self._models.has_model(model_name, model_version) + if status == "not-available" or ( + status == "in-memory" and local_ts != current_upstream_ts + ): + if status == "not-available": + logger().info( + f"loading model {model_name} of version {model_version} (thread {td.get_ident()})" + ) + else: + logger().info( + f"reloading model {model_name} of version {model_version} (thread {td.get_ident()})" + ) + try: + self._models.load_model( + model_name, + model_version, + current_upstream_ts, + [tag], + ) + except Exception as e: + raise UserRuntimeException( + f"failed (re-)loading model {model_name} of version {model_version} (thread {td.get_ident()})", + str(e), + ) + model, _ = self._models.get_model(model_name, model_version, tag) + + if not self._multiple_processes and self._caching_enabled: + # determine model version + try: + if tag == "latest": + model_version = self._get_latest_model_version_from_tree( + model_name, self._models_tree.model_info(model_name) + ) + except ValueError: + # if model_name hasn't been found + raise UserRuntimeException( + f"'{model_name}' model of tag latest wasn't found in the list of available models" + ) + + # grab shared access to model tree + available_model = True + with LockedModelsTree(self._models_tree, "r", model_name, model_version): + + # check if the versioned model exists + model_id = model_name + "-" + model_version + if model_id not in self._models_tree: + available_model = False + raise WithBreak + + # retrieve model tree's metadata + upstream_model = self._models_tree[model_id] + current_upstream_ts = int(upstream_model["timestamp"].timestamp()) + + if not available_model: + return None + + # grab shared access to models holder and retrieve model + update_model = False + with LockedModel(self._models, "r", model_name, model_version): + status, local_ts = self._models.has_model(model_name, model_version) + if status in ["not-available", "on-disk"] or ( + status != "not-available" + and local_ts != current_upstream_ts + and not (status == "in-memory" and model_name in self._spec_local_model_names) + ): + update_model = True + raise WithBreak + model, _ = self._models.get_model(model_name, model_version, tag) + + # download, load into memory the model and retrieve it + if update_model: + # grab exclusive access to models holder + with LockedModel(self._models, 
"w", model_name, model_version): + + # check model status + status, local_ts = self._models.has_model(model_name, model_version) + + # refresh disk model + if model_name not in self._spec_local_model_names and ( + status == "not-available" + or (status in ["on-disk", "in-memory"] and local_ts != current_upstream_ts) + ): + if status == "not-available": + logger().info( + f"model {model_name} of version {model_version} not found locally; continuing with the download..." + ) + elif status == "on-disk": + logger().info( + f"found newer model {model_name} of vesion {model_version} on the S3 upstream than the one on the disk" + ) + else: + logger().info( + f"found newer model {model_name} of vesion {model_version} on the S3 upstream than the one loaded into memory" + ) + + # remove model from disk and memory + if status == "on-disk": + logger().info( + f"removing model from disk for model {model_name} of version {model_version}" + ) + self._models.remove_model(model_name, model_version) + if status == "in-memory": + logger().info( + f"removing model from disk and memory for model {model_name} of version {model_version}" + ) + self._models.remove_model(model_name, model_version) + + # download model + logger().info( + f"downloading model {model_name} of version {model_version} from the S3 upstream" + ) + date = self._models.download_model( + upstream_model["bucket"], + model_name, + model_version, + upstream_model["path"], + ) + if not date: + raise WithBreak + current_upstream_ts = date.timestamp() + + # give the local model a timestamp initialized at start time + if model_name in self._spec_local_model_names: + current_upstream_ts = self._local_model_ts + + # load model + try: + logger().info( + f"loading model {model_name} of version {model_version} into memory" + ) + self._models.load_model( + model_name, + model_version, + current_upstream_ts, + [tag], + ) + except Exception as e: + raise UserRuntimeException( + f"failed (re-)loading model {model_name} of version {model_version} (thread {td.get_ident()})", + str(e), + ) + + # retrieve model + model, _ = self._models.get_model(model_name, model_version, tag) + + return model + + def _get_latest_model_version_from_disk(self, model_name: str) -> str: + """ + Get the highest version for a specific model name. + Must only be used when processes_per_replica > 0 and caching disabled. + """ + versions, timestamps = find_ondisk_model_info(self._lock_dir, model_name) + if len(versions) == 0: + raise UserRuntimeException( + "'{}' model's versions have been removed; add at least a version to the model to resume operations".format( + model_name + ) + ) + return str(max(map(lambda x: int(x), versions))) + + def _get_latest_model_version_from_tree(self, model_name: str, model_info: dict) -> str: + """ + Get the highest version for a specific model name. + Must only be used when processes_per_replica = 1 and caching is enabled. + """ + versions, timestamps = model_info["versions"], model_info["timestamps"] + return str(max(map(lambda x: int(x), versions))) + + def _is_model_caching_enabled(self) -> bool: + """ + Checks if model caching is enabled (models:cache_size and models:disk_cache_size). + """ + return ( + self._api_spec["predictor"]["models"] + and self._api_spec["predictor"]["models"]["cache_size"] is not None + and self._api_spec["predictor"]["models"]["disk_cache_size"] is not None + ) + + @property + def metadata(self) -> dict: + """ + The returned dictionary will be like in the following example: + { + ... 
+ "yolov3": { + "versions": [ + "2", + "1" + ], + "timestamps": [ + 1601668127, + 1601668127 + ] + } + ... + } + """ + if not self._caching_enabled: + return find_ondisk_models_with_lock(self._lock_dir, include_timestamps=True) + else: + models_info = self._models_tree.get_all_models_info() + for model_name in models_info.keys(): + del models_info[model_name]["bucket"] + del models_info[model_name]["model_paths"] + return models_info + + @property + def caching(self) -> bool: + return self._caching_enabled diff --git a/pkg/workloads/cortex/lib/client/tensorflow.py b/pkg/workloads/cortex/lib/client/tensorflow.py index 98d5b3442b..59b51509db 100644 --- a/pkg/workloads/cortex/lib/client/tensorflow.py +++ b/pkg/workloads/cortex/lib/client/tensorflow.py @@ -12,320 +12,476 @@ # See the License for the specific language governing permissions and # limitations under the License. -import time -import sys +import os +import copy import grpc - -import tensorflow as tf -from tensorflow_serving.apis import predict_pb2 -from tensorflow_serving.apis import get_model_metadata_pb2 -from tensorflow_serving.apis import prediction_service_pb2_grpc -from google.protobuf import json_format - -from cortex.lib.exceptions import UserRuntimeException, UserException, CortexException -from cortex.lib.log import cx_logger -from cortex.lib.type.model import Model, get_model_signature_map, get_model_names +import threading as td +from typing import Any, Dict, Optional, List + +from cortex.lib.exceptions import UserRuntimeException, CortexException, UserException, WithBreak +from cortex.lib.model import ( + TensorFlowServingAPI, + ModelsHolder, + ModelsTree, + LockedModel, + LockedModelsTree, + CuratedModelResources, +) +from cortex.lib.log import cx_logger as logger from cortex import consts class TensorFlowClient: - def __init__(self, tf_serving_url, models): - """Setup gRPC connection to TensorFlow Serving container. + def __init__( + self, + tf_serving_url, + api_spec: dict, + models: Optional[ModelsHolder], + model_dir: Optional[str], + models_tree: Optional[ModelsTree], + ): + """ + Setup gRPC connection to TensorFlow Serving container. Args: - tf_serving_url (string): Localhost URL to TF Serving container. - models ([Model]): List of models deployed with TF serving container. + tf_serving_url: Localhost URL to TF Serving container (i.e. "localhost:9000") + api_spec: API configuration. + + models: Holding all models into memory. Only when processes_per_replica = 1 and caching enabled. + model_dir: Where the models are saved on disk. Only when processes_per_replica = 1 and caching enabled. + models_tree: A tree of the available models from upstream. Only when processes_per_replica = 1 and caching enabled. 
""" - self._tf_serving_url = tf_serving_url + + self.tf_serving_url = tf_serving_url + + self._api_spec = api_spec self._models = models - self._model_names = get_model_names(models) + self._models_tree = models_tree + self._model_dir = model_dir - channel = grpc.insecure_channel(tf_serving_url) - self._stub = prediction_service_pb2_grpc.PredictionServiceStub(channel) + self._spec_models = CuratedModelResources(api_spec["curated_model_resources"]) - self._signatures = get_signature_defs(self._stub, models) - parsed_signature_keys, parsed_signatures = extract_signatures( - self._signatures, get_model_signature_map(models) - ) - self._signature_keys = parsed_signature_keys - self._input_signatures = parsed_signatures + if ( + self._api_spec["predictor"]["models"] + and self._api_spec["predictor"]["models"]["dir"] is not None + ): + self._models_dir = True + else: + self._models_dir = False + self._spec_model_names = self._spec_models.get_field("name") + + self._multiple_processes = self._api_spec["predictor"]["processes_per_replica"] > 1 + self._caching_enabled = self._is_model_caching_enabled() + + if self._models: + self._models.set_callback("load", self._load_model) + self._models.set_callback("remove", self._remove_models) - def predict(self, model_input, model_name=None): - """Validate model_input, convert it to a Prediction Proto, and make a request to TensorFlow Serving. + self._client = TensorFlowServingAPI(tf_serving_url) + + def predict( + self, model_input: Any, model_name: Optional[str] = None, model_version: str = "latest" + ) -> dict: + """ + Validate model_input, convert it to a Prediction Proto, and make a request to TensorFlow Serving. Args: model_input: Input to the model. - model_name: Model to use when multiple models are deployed in a single API. + model_name (optional): Name of the model to retrieve (when multiple models are deployed in an API). + When predictor.models.paths is specified, model_name should be the name of one of the models listed in the API config. + When predictor.models.dir is specified, model_name should be the name of a top-level directory in the models dir. + model_version (string, optional): Version of the model to retrieve. Can be omitted or set to "latest" to select the highest version. Returns: dict: TensorFlow Serving response converted to a dictionary. 
""" - if consts.SINGLE_MODEL_NAME in self._model_names: - return self._run_inference(model_input, consts.SINGLE_MODEL_NAME) - - if model_name is None: - raise UserRuntimeException( - "model_name was not specified, choose one of the following: {}".format( - self._model_names - ) - ) - if model_name not in self._model_names: + if model_version != "latest" and not model_version.isnumeric(): raise UserRuntimeException( - "'{}' model wasn't found in the list of available models: {}".format( - model_name, self._model_names - ) + "model_version must be either a parse-able numeric value or 'latest'" ) - return self._run_inference(model_input, model_name) + # when predictor:model_path or predictor:models:paths is specified + if not self._models_dir: - def _run_inference(self, model_input, model_name): - input_signature = self._input_signatures[model_name] - signature = self._signatures[model_name] - signature_key = self._signature_keys[model_name] + # when predictor:model_path is provided + if consts.SINGLE_MODEL_NAME in self._spec_model_names: + return self._run_inference(model_input, consts.SINGLE_MODEL_NAME, model_version) - validate_model_input(input_signature, model_input, model_name) - prediction_request = create_prediction_request( - signature, signature_key, model_name, model_input - ) - response_proto = self._stub.Predict(prediction_request, timeout=300.0) - return parse_response_proto(response_proto) - - @property - def stub(self): - return self._stub - - @property - def input_signatures(self): - return self._input_signatures - - -DTYPE_TO_TF_TYPE = { - "DT_FLOAT": tf.float32, - "DT_DOUBLE": tf.float64, - "DT_INT32": tf.int32, - "DT_UINT8": tf.uint8, - "DT_INT16": tf.int16, - "DT_INT8": tf.int8, - "DT_STRING": tf.string, - "DT_COMPLEX64": tf.complex64, - "DT_INT64": tf.int64, - "DT_BOOL": tf.bool, - "DT_QINT8": tf.qint8, - "DT_QUINT8": tf.quint8, - "DT_QINT32": tf.qint32, - "DT_BFLOAT16": tf.bfloat16, - "DT_QINT16": tf.qint16, - "DT_QUINT16": tf.quint16, - "DT_UINT16": tf.uint16, - "DT_COMPLEX128": tf.complex128, - "DT_HALF": tf.float16, - "DT_RESOURCE": tf.resource, - "DT_VARIANT": tf.variant, - "DT_UINT32": tf.uint32, - "DT_UINT64": tf.uint64, -} - -DTYPE_TO_VALUE_KEY = { - "DT_INT32": "intVal", - "DT_INT64": "int64Val", - "DT_FLOAT": "floatVal", - "DT_STRING": "stringVal", - "DT_BOOL": "boolVal", - "DT_DOUBLE": "doubleVal", - "DT_HALF": "halfVal", - "DT_COMPLEX64": "scomplexVal", - "DT_COMPLEX128": "dcomplexVal", -} - - -def get_signature_defs(stub, models): - sigmaps = {} - for model in models: - sigmaps[model.name] = get_signature_def(stub, model) - - return sigmaps - - -def get_signature_def(stub, model): - limit = 2 - for i in range(limit): - try: - request = create_get_model_metadata_request(model.name) - resp = stub.GetModelMetadata(request, timeout=10.0) - sigAny = resp.metadata["signature_def"] - signature_def_map = get_model_metadata_pb2.SignatureDefMap() - sigAny.Unpack(signature_def_map) - sigmap = json_format.MessageToDict(signature_def_map) - return sigmap["signatureDef"] - except Exception as e: - print(e) - cx_logger().warn( - "unable to read model metadata for model '{}' - retrying ...".format(model.name) - ) - - time.sleep(5) - - raise CortexException( - "timeout: unable to read model metadata for model '{}'".format(model.name) - ) - - -def create_get_model_metadata_request(model_name): - get_model_metadata_request = get_model_metadata_pb2.GetModelMetadataRequest() - get_model_metadata_request.model_spec.name = model_name - 
get_model_metadata_request.metadata_field.append("signature_def") - return get_model_metadata_request + # when predictor:models:paths is specified + if model_name is None: + raise UserRuntimeException( + f"model_name was not specified, choose one of the following: {self._spec_model_names}" + ) + if model_name not in self._spec_model_names: + raise UserRuntimeException( + f"'{model_name}' model wasn't found in the list of available models" + ) -def extract_signatures(signature_defs, signature_keys): - parsed_signature_keys = {} - parsed_signatures = {} - for model_name in signature_defs: - parsed_signature_key, parsed_signature = extract_signature( - signature_defs[model_name], - signature_keys[model_name], - model_name, - ) - parsed_signature_keys[model_name] = parsed_signature_key - parsed_signatures[model_name] = parsed_signature + # when predictor:models:dir is specified + if self._models_dir and model_name is None: + raise UserRuntimeException("model_name was not specified") - return parsed_signature_keys, parsed_signatures + return self._run_inference(model_input, model_name, model_version) + def _run_inference(self, model_input: Any, model_name: str, model_version: str) -> dict: + """ + When processes_per_replica = 1 and caching enabled, check/load model and make prediction. + When processes_per_replica > 0 and caching disabled, attempt to make prediction regardless. -def extract_signature(signature_def, signature_key, model_name): - cx_logger().info("signature defs found in model '{}': {}".format(model_name, signature_def)) + Args: + model_input: Input to the model. + model_name: Name of the model, as it's specified in predictor:models:paths or in the other case as they are named on disk. + model_version: Version of the model, as it's found on disk. Can also infer the version number from the "latest" version tag. - available_keys = list(signature_def.keys()) - if len(available_keys) == 0: - raise UserException("unable to find signature defs in model '{}'".format(model_name)) + Returns: + The prediction. 
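+
+        Raises:
+            UserException: if the requested model or version cannot be found.
+            UserRuntimeException: if (re-)loading the model into memory fails.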
+ """ - if signature_key is None: - if len(available_keys) == 1: - cx_logger().info( - "signature_key was not configured by user, using signature key '{}' for model '{}' (found in the signature def map)".format( - available_keys[0], - model_name, + model = None + tag = "" + if model_version == "latest": + tag = model_version + + if not self._caching_enabled: + + # determine model version + if tag == "latest": + versions = self._client.poll_available_model_versions(model_name) + if len(versions) == 0: + raise UserException( + f"model '{model_name}' accessed with tag {tag} couldn't be found" + ) + model_version = str(max(map(lambda x: int(x), versions))) + model_id = model_name + "-" + model_version + + return self._client.predict(model_input, model_name, model_version) + + if not self._multiple_processes and self._caching_enabled: + + # determine model version + try: + if tag == "latest": + model_version = self._get_latest_model_version_from_tree( + model_name, self._models_tree.model_info(model_name) + ) + except ValueError: + # if model_name hasn't been found + raise UserRuntimeException( + f"'{model_name}' model of tag {tag} wasn't found in the list of available models" ) - ) - signature_key = available_keys[0] - elif "predict" in signature_def: - cx_logger().info( - "signature_key was not configured by user, using signature key 'predict' for model '{}' (found in the signature def map)".format( - model_name - ) - ) - signature_key = "predict" - else: - raise UserException( - "signature_key was not configured by user, please specify one the following keys '{}' for model '{}' (found in the signature def map)".format( - ", ".join(available_keys), model_name - ) - ) - else: - if signature_def.get(signature_key) is None: - possibilities_str = "key: '{}'".format(available_keys[0]) - if len(available_keys) > 1: - possibilities_str = "keys: '{}'".format("', '".join(available_keys)) - - raise UserException( - "signature_key '{}' was not found in signature def map for model '{}', but found the following {}".format( - signature_key, model_name, possibilities_str + + models_stats = [] + for model_id in self._models.get_model_ids(): + models_stats = self._models.has_model_id(model_id) + + # grab shared access to model tree + available_model = True + logger().info(f"grabbing access to model {model_name} of version {model_version}") + with LockedModelsTree(self._models_tree, "r", model_name, model_version): + + # check if the versioned model exists + model_id = model_name + "-" + model_version + if model_id not in self._models_tree: + available_model = False + logger().info(f"model {model_name} of version {model_version} is not available") + raise WithBreak + + # retrieve model tree's metadata + upstream_model = self._models_tree[model_id] + current_upstream_ts = int(upstream_model["timestamp"].timestamp()) + logger().info(f"model {model_name} of version {model_version} is available") + + if not available_model: + if tag == "": + raise UserException( + f"model '{model_name}' of version '{model_version}' couldn't be found" + ) + raise UserException( + f"model '{model_name}' accessed with tag '{tag}' couldn't be found" ) - ) - signature_def_val = signature_def.get(signature_key) + # grab shared access to models holder and retrieve model + update_model = False + prediction = None + tfs_was_unresponsive = False + with LockedModel(self._models, "r", model_name, model_version): + logger().info(f"checking the {model_name} {model_version} status") + status, local_ts = self._models.has_model(model_name, 
model_version) + if status in ["not-available", "on-disk"] or ( + status != "not-available" and local_ts != current_upstream_ts + ): + logger().info( + f"model {model_name} of version {model_version} is not loaded (with status {status} or different timestamp)" + ) + update_model = True + raise WithBreak + + # run prediction + logger().info( + f"run the prediction on model {model_name} of version {model_version}" + ) + self._models.get_model(model_name, model_version, tag) + try: + prediction = self._client.predict(model_input, model_name, model_version) + except grpc.RpcError as e: + # effectively when it got restarted + if len(self._client.poll_available_model_versions(model_name)) > 0: + raise + tfs_was_unresponsive = True + + # remove model from disk and memory references if TFS gets unresponsive + if tfs_was_unresponsive: + with LockedModel(self._models, "w", model_name, model_version): + available_versions = self._client.poll_available_model_versions(model_name) + status, _ = self._models.has_model(model_name, model_version) + if not (status == "in-memory" and model_version not in available_versions): + raise WithBreak + + logger().info( + f"removing model {model_name} of version {model_version} because TFS got unresponsive" + ) + self._models.remove_model(model_name, model_version) + + # download, load into memory the model and retrieve it + if update_model: + # grab exclusive access to models holder + with LockedModel(self._models, "w", model_name, model_version): + + # check model status + status, local_ts = self._models.has_model(model_name, model_version) + + # refresh disk model + if status == "not-available" or ( + status in ["on-disk", "in-memory"] and local_ts != current_upstream_ts + ): + # unload model from TFS + if status == "in-memory": + try: + logger().info( + f"unloading model {model_name} of version {model_version} from TFS" + ) + self._models.unload_model(model_name, model_version) + except Exception: + logger().info( + f"failed unloading model {model_name} of version {model_version} from TFS" + ) + raise + + # remove model from disk and references + if status in ["on-disk", "in-memory"]: + logger().info( + f"removing model references from memory and from disk for model {model_name} of version {model_version}" + ) + self._models.remove_model(model_name, model_version) + + # download model + logger().info( + f"downloading model {model_name} of version {model_version} from the S3 upstream" + ) + date = self._models.download_model( + upstream_model["bucket"], + model_name, + model_version, + upstream_model["path"], + ) + if not date: + raise WithBreak + current_upstream_ts = date.timestamp() + + # load model + try: + logger().info( + f"loading model {model_name} of version {model_version} into memory" + ) + self._models.load_model( + model_name, + model_version, + current_upstream_ts, + [tag], + kwargs={ + "model_name": model_name, + "model_version": model_version, + "signature_key": self._determine_model_signature_key(model_name), + }, + ) + except Exception as e: + raise UserRuntimeException( + f"failed (re-)loading model {model_name} of version {model_version} (thread {td.get_ident()})", + str(e), + ) + + # run prediction + self._models.get_model(model_name, model_version, tag) + prediction = self._client.predict(model_input, model_name, model_version) + + return prediction + + def _load_model( + self, model_path: str, model_name: str, model_version: str, signature_key: Optional[str] + ) -> Any: + """ + Loads model into TFS. + Must only be used when caching enabled. 
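+
+        Args:
+            model_path: Disk path to the model's version directory.
+            model_name: Name under which the model is registered with TFS.
+            model_version: Version of the model to load.
+            signature_key: Optional signature key to use for the model.
+
+        Returns:
+            A short confirmation string once the model has been added to TFS.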
+ """ - if signature_def_val.get("inputs") is None: - raise UserException( - "unable to find 'inputs' in signature def '{}' for model '{}'".format( - signature_key, model_name + try: + model_dir = os.path.split(model_path)[0] + self._client.add_single_model( + model_name, model_version, model_dir, signature_key, timeout=30.0, max_retries=3 ) - ) + except Exception as e: + self._client.remove_single_model(model_name, model_version) + raise - parsed_signature = {} - for input_name, input_metadata in signature_def_val["inputs"].items(): - if input_metadata["tensorShape"] == {}: - # a scalar with rank 0 and empty shape - shape = "scalar" - elif input_metadata["tensorShape"].get("unknownRank", False): - # unknown rank and shape - # - # unknownRank is set to True if the model input has no rank - # it may lead to an undefined behavior if unknownRank is only checked for its presence - # so it also gets to be tested against its value - shape = "unknown" - elif input_metadata["tensorShape"].get("dim", None): - # known rank and known/unknown shape - shape = [int(dim["size"]) for dim in input_metadata["tensorShape"]["dim"]] - else: - raise UserException( - "invalid 'tensorShape' specification for input '{}' in signature key '{}' for model '{}'", - input_name, - signature_key, - model_name, - ) + return "loaded tensorflow model" - parsed_signature[input_name] = { - "shape": shape if type(shape) == list else [shape], - "type": DTYPE_TO_TF_TYPE[input_metadata["dtype"]].name, - } - return signature_key, parsed_signature + def _remove_models(self, model_ids: List[str]) -> None: + """ + Remove models from TFS. + Must only be used when caching enabled. + """ + logger().info(f"unloading models with model IDs {model_ids} from TFS") + models = {} + for model_id in model_ids: + model_name, model_version = model_id.rsplit("-", maxsplit=1) + if model_name not in models: + models[model_name] = [model_version] + else: + models[model_name].append(model_version) -def create_prediction_request(signature_def, signature_key, model_name, model_input): - prediction_request = predict_pb2.PredictRequest() - prediction_request.model_spec.name = model_name - prediction_request.model_spec.signature_name = signature_key + model_names = [] + model_versions = [] + for model_name, versions in models.items(): + model_names.append(model_name) + model_versions.append(versions) - for column_name, value in model_input.items(): - if signature_def[signature_key]["inputs"][column_name]["tensorShape"] == {}: - shape = "scalar" - elif signature_def[signature_key]["inputs"][column_name]["tensorShape"].get( - "unknownRank", False - ): - # unknownRank is set to True if the model input has no rank - # it may lead to an undefined behavior if unknownRank is only checked for its presence - # so it also gets to be tested against its value - shape = "unknown" - else: - shape = [] - for dim in signature_def[signature_key]["inputs"][column_name]["tensorShape"]["dim"]: - shape.append(int(dim["size"])) + self._client.remove_models(model_names, model_versions) - sig_type = signature_def[signature_key]["inputs"][column_name]["dtype"] + def _determine_model_signature_key(self, model_name: str) -> Optional[str]: + """ + Determine what's the signature key for a given model from API spec. 
+ """ + if self._models_dir: + return self._api_spec["predictor"]["models"]["signature_key"] + return self._spec_models[model_name]["signature_key"] - try: - tensor_proto = tf.compat.v1.make_tensor_proto(value, dtype=DTYPE_TO_TF_TYPE[sig_type]) - prediction_request.inputs[column_name].CopyFrom(tensor_proto) - except Exception as e: - if shape == "scalar": - raise UserException( - 'key "{}"'.format(column_name), "expected to be a scalar", str(e) - ) from e - elif shape == "unknown": - raise UserException( - 'key "{}"'.format(column_name), "can be of any rank and shape", str(e) - ) from e - else: - raise UserException( - 'key "{}"'.format(column_name), "expected shape {}".format(shape), str(e) - ) from e + def _get_latest_model_version_from_tree(self, model_name: str, model_info: dict) -> str: + """ + Get the highest version for a specific model name. + Must only be used when processes_per_replica = 1 and caching is enabled. + """ + versions, timestamps = model_info["versions"], model_info["timestamps"] + return str(max(map(lambda x: int(x), versions))) - return prediction_request + def _is_model_caching_enabled(self) -> bool: + """ + Checks if model caching is enabled (models:cache_size and models:disk_cache_size). + """ + return ( + self._api_spec["predictor"]["models"] + and self._api_spec["predictor"]["models"]["cache_size"] is not None + and self._api_spec["predictor"]["models"]["disk_cache_size"] is not None + ) + @property + def metadata(self) -> dict: + """ + When caching is disabled, the returned dictionary will be like in the following example: + { + ... + "image-classifier-inception-1569014553": { + "disk_path": "/mnt/model/image-classifier-inception/1569014553", + "signature_def": { + "predict": { + "inputs": { + "images": { + "name": "images:0", + "dtype": "DT_FLOAT", + "tensorShape": { + "dim": [ + { + "size": "-1" + }, + { + "size": "-1" + }, + { + "size": "-1" + }, + { + "size": "3" + } + ] + } + } + }, + "outputs": { + "classes": { + "name": "module_apply_default/InceptionV3/Logits/SpatialSqueeze:0", + "dtype": "DT_FLOAT", + "tensorShape": { + "dim": [ + { + "size": "-1" + }, + { + "size": "1001" + } + ] + } + } + }, + "methodName": "tensorflow/serving/predict" + } + }, + "signature_key": "predict", + "input_signature": { + "images": { + "shape": [ + -1, + -1, + -1, + 3 + ], + "type": "float32" + } + }, + "timestamp": 1602025473 + } + ... + } -def parse_response_proto(response_proto): - results_dict = json_format.MessageToDict(response_proto) - outputs = results_dict["outputs"] - outputs_simplified = {} - for key in outputs: - value_key = DTYPE_TO_VALUE_KEY[outputs[key]["dtype"]] - outputs_simplified[key] = outputs[key][value_key] - return outputs_simplified + Or when the caching is enabled, the following represents the kind of returned dictionary: + { + ... + "image-classifier-inception": { + "versions": [ + "1569014553", + "1569014559" + ], + "timestamps": [ + "1601668127", + "1601668120" + ] + } + ... 
+ } + """ + if not self._caching_enabled: + # the models dictionary has another field for each key entry + # called timestamp inserted by TFSAPIServingThreadUpdater thread + return self._client.models + else: + models_info = self._models_tree.get_all_models_info() + for model_name in models_info.keys(): + del models_info[model_name]["bucket"] + del models_info[model_name]["model_paths"] + return models_info -def validate_model_input(input_signature, model_input, model_name): - for input_name, _ in input_signature.items(): - if input_name not in model_input: - raise UserException("missing key '{}' for model '{}'".format(input_name, model_name)) + @property + def caching(self) -> bool: + return self._caching_enabled diff --git a/pkg/workloads/cortex/lib/concurrency/__init__.py b/pkg/workloads/cortex/lib/concurrency/__init__.py new file mode 100644 index 0000000000..ce9c41c65f --- /dev/null +++ b/pkg/workloads/cortex/lib/concurrency/__init__.py @@ -0,0 +1,16 @@ +# Copyright 2020 Cortex Labs, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from cortex.lib.concurrency.files import FileLock, LockedFile, get_locked_files +from cortex.lib.concurrency.threading import ReadWriteLock, LockRead, LockWrite diff --git a/pkg/workloads/cortex/lib/concurrency/files.py b/pkg/workloads/cortex/lib/concurrency/files.py new file mode 100644 index 0000000000..6d0416b061 --- /dev/null +++ b/pkg/workloads/cortex/lib/concurrency/files.py @@ -0,0 +1,195 @@ +# Copyright 2020 Cortex Labs, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os, fcntl, time +from typing import List + +from cortex.lib.exceptions import CortexException, WithBreak + + +class FileLock: + def __init__(self, lock_file: str, timeout: float = None, reader_lock: bool = False): + """ + Lock for files. Not thread-safe. Instantiate one lock per thread. + + lock_file - File to use as lock. + timeout - If used, a timeout exception will be raised if the lock can't be acquired. Measured in seconds. + reader_lock - When set to true, a shared lock (LOCK_SH) will be used. Otherwise, an exclusive lock (LOCK_EX) is used. + """ + self._lock_file = lock_file + self._file_handle = None + + self.timeout = timeout + self.reader_lock = reader_lock + self._time_loop = 0.001 + + # create lock if it doesn't exist + with open(self._lock_file, "w+") as f: + pass + + def acquire(self): + """ + To acquire the lock to resource. 
+ """ + if self._file_handle: + return + + if not self.timeout: + self._file_handle = open(self._lock_file, "w") + if self.reader_lock: + fcntl.flock(self._file_handle, fcntl.LOCK_SH) + else: + fcntl.flock(self._file_handle, fcntl.LOCK_EX) + else: + start = time.time() + acquired = False + while start + self.timeout >= time.time(): + try: + self._file_handle = open(self._lock_file, "w") + if self.reader_lock: + fcntl.flock(self._file_handle, fcntl.LOCK_SH | fcntl.LOCK_NB) + else: + fcntl.flock(self._file_handle, fcntl.LOCK_EX | fcntl.LOCK_NB) + acquired = True + break + except OSError: + time.sleep(self._time_loop) + + if not acquired: + self._file_handle = None + raise TimeoutError( + "{} ms timeout on acquiring {} lock".format( + int(self.timeout * 1000), self._lock_file + ) + ) + + def release(self): + """ + To release the lock to resource. + """ + if not self._file_handle: + return + + fd = self._file_handle + self._file_handle = None + fcntl.flock(fd, fcntl.LOCK_UN) + fd.close() + + def __enter__(self): + self.acquire() + return self + + def __exit__(self, exc_type, exc_value, traceback): + self.release() + return None + + def __del__(self): + self.release() + return None + + +class LockedFile: + """ + Create a lock-based file. + """ + + def __init__( + self, + filename: str, + mode: str, + timeout: float = None, + reader_lock: bool = False, + create_file_if_not_found: bool = True, + ): + """ + Open file with locked access to it - either with exclusive or shared lock. + + Args: + filename: Name of the file to open. + mode: Open mode for the file - same modes as for the built-in open function. + timeout: If set, it will try to acquire the lock for this amount of seconds. + reader_lock: Whether to use a shared lock or not. + create_file_if_not_found: Creates the file if it doesn't already exist. + """ + self.dir_path, self.basename = os.path.split(filename) + if self.basename == "": + raise CortexException(f"{filename} does not represent a path to file") + if not self.basename.startswith("."): + self.lockname = "." 
+ self.basename + else: + self.lockname = self.basename + + self.filename = filename + self.mode = mode + self.timeout = timeout + self.reader_lock = reader_lock + self.create_file_if_not_found = create_file_if_not_found + + def __enter__(self): + lockfilepath = os.path.join(self.dir_path, self.lockname + ".lock") + self._lock = FileLock(lockfilepath, self.timeout, self.reader_lock) + self._lock.acquire() + try: + self._fd = open(self.filename, self.mode) + return self._fd + except FileNotFoundError: + if not self.create_file_if_not_found: + raise + except Exception as e: + self._lock.release() + raise e + try: + # w write mode + # r read mode + # a append mode + # + # w+ create file if it doesn't exist and open it in (over)write mode + # [it overwrites the file if it already exists] + # r+ open an existing file in read+write mode + # a+ create file if it doesn't exist and open it in append mode + if self.create_file_if_not_found and self.mode not in ["a+", "w+"]: + open(self.filename, "a+").close() + self._fd = open(self.filename, self.mode) + except Exception as e: + self._lock.release() + raise e + return self._fd + + def __exit__(self, exc_type, exc_value, traceback) -> bool: + # sometimes the `__del__` isn't run right away when the context manager exits + self.__del__() + + if exc_value is not None and exc_type is not WithBreak: + return False + return True + + def __del__(self): + if hasattr(self, "_fd"): + self._fd.close() + + if hasattr(self, "_lock"): + self._lock.release() + + +def get_locked_files(lock_dir: str) -> List[str]: + files = [os.path.basename(file) for file in os.listdir(lock_dir)] + locks = [f for f in files if f.endswith(".lock")] + + locked_files = [] + for lock in locks: + locked_file = os.path.splitext(lock)[0] + locked_file = locked_file[1:] # to ignore the added "." + locked_files.append(locked_file) + + return locked_files diff --git a/pkg/workloads/cortex/lib/concurrency/threading.py b/pkg/workloads/cortex/lib/concurrency/threading.py new file mode 100644 index 0000000000..840c441c54 --- /dev/null +++ b/pkg/workloads/cortex/lib/concurrency/threading.py @@ -0,0 +1,208 @@ +# Copyright 2020 Cortex Labs, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import threading as td +from typing import Optional + + +class ReadWriteLock: + """ + Locking object allowing for write once, read many operations. + + The lock must not be acquired multiple times in a single thread without paired release calls. + + Can set different priority policies: "r" for read-preferring RW lock allowing for maximum concurrency + or can be set to "w" for write-preferring RW lock to prevent from starving the writer. + """ + + def __init__(self, prefer: str = "r"): + """ + "r" for read-preferring RW lock. + + "w" for write-preferring RW lock. + """ + self._prefer = prefer + self._write_preferred = td.Event() + self._write_preferred.set() + self._read_allowed = td.Condition(td.RLock()) + self._readers = [] + # a single writer is supported despite the fact that this is a list. 
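LockedFile combines the lock with the file open itself: it guards a file through a sibling ".<name>.lock" file, hands back the opened descriptor, and its __exit__ swallows WithBreak so a reader can bail out of the with-block cleanly. A usage sketch mirroring how the crons below use it (the resource name is illustrative):

```python
from cortex.lib.concurrency import LockedFile
from cortex.lib.exceptions import WithBreak

resource = "iris-classifier-1.txt"

# writer: record the model's status under an exclusive lock
with LockedFile(resource, "w") as f:
    f.write("available 1602025473")

# reader: exit early with WithBreak when the model isn't usable
with LockedFile(resource, "r", reader_lock=True) as f:
    status = f.read()
    if status == "" or status.startswith("not-available"):
        raise WithBreak
    timestamp = int(status.split(" ")[1])
    print(timestamp)  # 1602025473
```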
+ self._writers = [] + + def acquire(self, mode: str, timeout: Optional[float] = None) -> bool: + """ + Acquire a lock. + + Args: + mode: "r" for read lock, "w" for write lock. + timeout: How many seconds to wait to acquire the lock. + + Returns: + Whether the mode was valid or not. + """ + if not timeout: + acquire_timeout = -1 + else: + acquire_timeout = timeout + + if mode == "r": + # wait until "w" has been released + if self._prefer == "w": + if not self._write_preferred.wait(timeout): + self._throw_timeout_error(timeout, mode) + + # finish acquiring once all writers have released + if not self._read_allowed.acquire(timeout=acquire_timeout): + self._throw_timeout_error(timeout, mode) + # while loop only relevant when prefer == "r" + # but it's necessary when the preference policy is changed + while len(self._writers) > 0: + if not self._read_allowed.wait(timeout): + self._read_allowed.release() + self._throw_timeout_error(timeout, mode) + + self._readers.append(td.get_ident()) + self._read_allowed.release() + + elif mode == "w": + # stop "r" acquirers from acquiring + if self._prefer == "w": + self._write_preferred.clear() + + # acquire once all readers have released + if not self._read_allowed.acquire(timeout=acquire_timeout): + self._write_preferred.set() + self._throw_timeout_error(timeout, mode) + while len(self._readers) > 0: + if not self._read_allowed.wait(timeout): + self._read_allowed.release() + self._write_preferred.set() + self._throw_timeout_error(timeout, mode) + self._writers.append(td.get_ident()) + else: + return False + + return True + + def release(self, mode: str) -> bool: + """ + Releases a lock. + + Args: + mode: "r" for read lock, "w" for write lock. + + Returns: + Whether the mode was valid or not. + """ + if mode == "r": + # release and let writers acquire + self._read_allowed.acquire() + if not len(self._readers) - 1: + self._read_allowed.notifyAll() + self._readers.remove(td.get_ident()) + self._read_allowed.release() + + elif mode == "w": + # release and let readers acquire + self._writers.remove(td.get_ident()) + # notify all only relevant when prefer == "r" + # but it's necessary when the preference policy is changed + self._read_allowed.notifyAll() + self._read_allowed.release() + + # let "r" acquirers acquire again + if self._prefer == "w": + self._write_preferred.set() + else: + return False + + return True + + def set_preference_policy(self, prefer: str) -> bool: + """ + Change preference policy dynamically. + + When readers have acquired the lock, the policy change is immediate. + When a writer has acquired the lock, the policy change will block until the writer releases the lock. + + Args: + prefer: "r" for read-preferring RW lock, "w" for write-preferring RW lock. + + Returns: + True when the policy has been changed, false otherwise. 
+ """ + if self._prefer == prefer: + return False + + self._read_allowed.acquire() + self._prefer = prefer + self._write_preferred.set() + self._read_allowed.release() + + return True + + def _throw_timeout_error(self, timeout: float, mode: str) -> None: + raise TimeoutError( + "{} ms timeout on acquiring '{}' lock in {} thread".format( + int(timeout * 1000), mode, td.get_ident() + ) + ) + + +class LockRead: + """ + To be used as: + + ```python + rw_lock = ReadWriteLock() + with LockRead(rw_lock): + # code + ``` + """ + + def __init__(self, lock: ReadWriteLock, timeout: Optional[float] = None): + self._lock = lock + self._timeout = timeout + + def __enter__(self): + self._lock.acquire("r", self._timeout) + return self + + def __exit__(self, exc_type, exc_value, traceback): + self._lock.release("r") + return False + + +class LockWrite: + """ + To be used as: + + ```python + rw_lock = ReadWriteLock() + with LockWrite(rw_lock): + # code + ``` + """ + + def __init__(self, lock: ReadWriteLock, timeout: Optional[float] = None): + self._lock = lock + self._timeout = timeout + + def __enter__(self): + self._lock.acquire("w", self._timeout) + return self + + def __exit__(self, exc_type, exc_value, traceback): + self._lock.release("w") + return False diff --git a/pkg/workloads/cortex/lib/exceptions.py b/pkg/workloads/cortex/lib/exceptions.py index d0e80fe51a..92744f0e42 100644 --- a/pkg/workloads/cortex/lib/exceptions.py +++ b/pkg/workloads/cortex/lib/exceptions.py @@ -15,6 +15,14 @@ from collections import deque +class WithBreak(Exception): + """ + Gracefully exit with clauses. + """ + + pass + + class CortexException(Exception): def __init__(self, *messages): super().__init__(": ".join(messages)) diff --git a/pkg/workloads/cortex/lib/log.py b/pkg/workloads/cortex/lib/log.py index 31827cac04..7bc9d3be8b 100644 --- a/pkg/workloads/cortex/lib/log.py +++ b/pkg/workloads/cortex/lib/log.py @@ -16,9 +16,9 @@ import sys import time import http +import datetime as dt from cortex.lib import stringify -import datetime as dt class CortexFormatter(logging.Formatter): @@ -84,7 +84,7 @@ def register_logger(name, show_pid=True): logger.propagate = False logger.addHandler(handler) - logger.setLevel(logging.DEBUG) + logger.setLevel(logging.INFO) return logger diff --git a/pkg/workloads/cortex/lib/model/__init__.py b/pkg/workloads/cortex/lib/model/__init__.py new file mode 100644 index 0000000000..5a92139fd8 --- /dev/null +++ b/pkg/workloads/cortex/lib/model/__init__.py @@ -0,0 +1,35 @@ +# Copyright 2020 Cortex Labs, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from cortex.lib.model.model import ModelsHolder, LockedGlobalModelsGC, LockedModel, ids_to_models +from cortex.lib.model.tfs import TensorFlowServingAPI, TensorFlowServingAPIClones +from cortex.lib.model.tree import ModelsTree, LockedModelsTree +from cortex.lib.model.type import CuratedModelResources +from cortex.lib.model.validation import ( + validate_models_dir_paths, + validate_model_paths, + ModelVersion, +) +from cortex.lib.model.cron import ( + FileBasedModelsTreeUpdater, + FileBasedModelsGC, + find_ondisk_models_with_lock, + find_ondisk_model_ids_with_lock, + find_ondisk_model_info, + TFSModelLoader, + TFSAPIServingThreadUpdater, + find_ondisk_models, + ModelsGC, + ModelTreeUpdater, +) diff --git a/pkg/workloads/cortex/lib/model/cron.py b/pkg/workloads/cortex/lib/model/cron.py new file mode 100644 index 0000000000..d056d0a837 --- /dev/null +++ b/pkg/workloads/cortex/lib/model/cron.py @@ -0,0 +1,1706 @@ +# Copyright 2020 Cortex Labs, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import threading as td +import multiprocessing as mp +import time +import datetime +import glob +import shutil +import itertools +import json +import grpc +import copy +from concurrent.futures import ThreadPoolExecutor +from typing import Dict, List, Tuple, Any, Union, Callable, Optional + +from cortex.lib import util +from cortex.lib.log import cx_logger as logger +from cortex.lib.concurrency import LockedFile, get_locked_files +from cortex.lib.storage import S3, LocalStorage +from cortex.lib.exceptions import CortexException, WithBreak +from cortex.lib.type import ( + predictor_type_from_api_spec, + PythonPredictorType, + TensorFlowPredictorType, + TensorFlowNeuronPredictorType, + ONNXPredictorType, + PredictorType, +) + +from cortex.lib.model import ( + TensorFlowServingAPI, + TensorFlowServingAPIClones, + validate_models_dir_paths, + validate_model_paths, + ModelsHolder, + ids_to_models, + LockedGlobalModelsGC, + LockedModel, + CuratedModelResources, + ModelVersion, + ModelsTree, + LockedModelsTree, +) + + +class AbstractLoopingThread(td.Thread): + """ + Abstract class of the td.Thread class. + + Takes a method and keeps calling it in a loop every certain interval. + """ + + def __init__(self, interval: int, runnable: Callable[[], None]): + td.Thread.__init__(self, daemon=True) + + self._interval = interval + self._runnable = runnable + + if not callable(self._runnable): + raise ValueError("runnable parameter must be a callable method") + + self._event_stopper = td.Event() + self._stopped = False + + def run(self): + """ + td.Thread-specific method. + """ + + while not self._event_stopper.is_set(): + self._runnable() + time.sleep(self._interval) + self._stopped = True + + def stop(self, blocking: bool = False): + """ + Stop the thread. + + Args: + blocking: Whether to wait until the thread is stopped or not. + """ + + self._event_stopper.set() + if blocking: + self.join() + + def join(self): + """ + Block until the thread finishes. 
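AbstractLoopingThread simply re-runs a callable on a fixed interval until stop() is called. A usage sketch, assuming it can be imported from cortex.lib.model.cron where it is defined (it is not re-exported by the package __init__ above):

```python
import time

from cortex.lib.model.cron import AbstractLoopingThread


def heartbeat():
    print("cron heartbeat")


cron = AbstractLoopingThread(interval=1, runnable=heartbeat)
cron.start()
time.sleep(3)             # let it tick a few times
cron.stop(blocking=True)  # set the stop event and wait for the loop to exit
```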
+ """ + + while not self._stopped: + time.sleep(0.001) + + +def find_all_s3_models( + is_dir_used: bool, + models_dir: str, + predictor_type: PredictorType, + s3_paths: List[str], + s3_model_names: List[str], +) -> Tuple[ + List[str], + Dict[str, List[str]], + List[str], + List[List[str]], + List[List[datetime.datetime]], + List[str], +]: + """ + Get updated information on all models that are currently present on the S3 upstreams. + Information on the available models, versions, last edit times, the subpaths of each model, and so on. + + Args: + is_dir_used: Whether predictor:models:dir is used or not. + models_dir: The value of predictor:models:dir in case it's present. Ignored when not required. + predictor_type: The predictor type. + s3_paths: The S3 model paths as they are specified in predictor:model_path/predictor:models:dir/predictor:models:paths is used. Ignored when not required. + s3_model_names: The S3 model names as they are specified in predictor:models:paths:name when predictor:models:paths is used or the default name of the model when predictor:model_path is used. Ignored when not required. + + Returns: The tuple with the following elements: + model_names - a list with the names of the models (i.e. bert, gpt-2, etc) and they are unique + versions - a dictionary with the keys representing the model names and the values being lists of versions that each model has. + For non-versioned model paths ModelVersion.NOT_PROVIDED, the list will be empty. + model_paths - a list with the prefix of each model. + sub_paths - a list of filepaths lists for each file of each model. + timestamps - a list of timestamps lists representing the last edit time of each versioned model. + bucket_names - a list of the bucket names of each model. + """ + + # validate models stored in S3 that were specified with predictor:models:dir field + if is_dir_used: + bucket_name, models_path = S3.deconstruct_s3_path(models_dir) + s3_client = S3(bucket_name, client_config={}) + sub_paths, timestamps = s3_client.search(models_path) + model_paths, ooa_ids = validate_models_dir_paths(sub_paths, predictor_type, models_path) + model_names = [os.path.basename(model_path) for model_path in model_paths] + + model_paths = [ + model_path for model_path in model_paths if os.path.basename(model_path) in model_names + ] + model_paths = [ + model_path + "/" * (not model_path.endswith("/")) for model_path in model_paths + ] + + bucket_names = len(model_paths) * [bucket_name] + sub_paths = len(model_paths) * [sub_paths] + timestamps = len(model_paths) * [timestamps] + + # validate models stored in S3 that were specified with predictor:models:paths field + if not is_dir_used: + sub_paths = [] + ooa_ids = [] + model_paths = [] + model_names = [] + timestamps = [] + bucket_names = [] + for idx, path in enumerate(s3_paths): + if S3.is_valid_s3_path(path): + bucket_name, model_path = S3.deconstruct_s3_path(path) + s3_client = S3(bucket_name, client_config={}) + sb, model_path_ts = s3_client.search(model_path) + try: + ooa_ids.append(validate_model_paths(sb, predictor_type, model_path)) + except CortexException: + continue + model_paths.append(model_path) + model_names.append(s3_model_names[idx]) + bucket_names.append(bucket_name) + sub_paths += [sb] + timestamps += [model_path_ts] + + # determine the detected versions for each model + # if the model was not versioned, then leave the version list empty + versions = {} + for model_path, model_name, model_ooa_ids, bucket_sub_paths in zip( + model_paths, model_names, ooa_ids, sub_paths + 
): + if ModelVersion.PROVIDED not in model_ooa_ids: + versions[model_name] = [] + continue + + model_sub_paths = [os.path.relpath(sub_path, model_path) for sub_path in bucket_sub_paths] + model_versions_paths = [path for path in model_sub_paths if not path.startswith("../")] + model_versions = [ + util.get_leftmost_part_of_path(model_version_path) + for model_version_path in model_versions_paths + ] + model_versions = list(set(model_versions)) + versions[model_name] = model_versions + + # pick up the max timestamp for each versioned model + aux_timestamps = [] + for model_path, model_name, bucket_sub_paths, sub_path_timestamps in zip( + model_paths, model_names, sub_paths, timestamps + ): + model_ts = [] + if len(versions[model_name]) == 0: + masks = list( + map( + lambda x: x.startswith(model_path + "/" * (model_path[-1] != "/")), + bucket_sub_paths, + ) + ) + model_ts = [max(itertools.compress(sub_path_timestamps, masks))] + + for version in versions[model_name]: + masks = list( + map( + lambda x: x.startswith(os.path.join(model_path, version) + "/"), + bucket_sub_paths, + ) + ) + model_ts.append(max(itertools.compress(sub_path_timestamps, masks))) + + aux_timestamps.append(model_ts) + + timestamps = aux_timestamps # type: List[List[datetime.datetime]] + + # model_names - a list with the names of the models (i.e. bert, gpt-2, etc) and they are unique + # versions - a dictionary with the keys representing the model names and the values being lists of versions that each model has. + # For non-versioned model paths ModelVersion.NOT_PROVIDED, the list will be empty + # model_paths - a list with the prefix of each model + # sub_paths - a list of filepaths lists for each file of each model + # timestamps - a list of timestamps lists representing the last edit time of each versioned model + + return model_names, versions, model_paths, sub_paths, timestamps, bucket_names + + +class FileBasedModelsTreeUpdater(mp.Process): + """ + Monitors the S3 path(s)/dir and continuously updates the file-based tree. + The model paths are validated - the bad paths are ignored. + When a new model is found, it updates the tree and downloads it - likewise when a model is removed. + """ + + def __init__( + self, + interval: int, + api_spec: dict, + download_dir: str, + temp_dir: str = "/tmp/cron", + lock_dir: str = "/run/cron", + ): + """ + Args: + interval: How often to update the models tree. Measured in seconds. + api_spec: Identical copy of pkg.type.spec.api.API. + download_dir: Path to where the models are stored. + temp_dir: Path to where the models are temporarily stored. + lock_dir: Path to where the resource locks are stored. 
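find_all_s3_models derives a model's versions by taking every key relative to the model's prefix and keeping the leftmost path component. A simplified standalone sketch of that derivation (get_leftmost_part_of_path is approximated inline; the keys are made up):

```python
import os
from typing import List


def detect_versions(model_prefix: str, keys: List[str]) -> List[str]:
    # keep each key relative to the model prefix and take its first path component
    relative = [os.path.relpath(key, model_prefix) for key in keys]
    relative = [path for path in relative if not path.startswith("../")]
    return sorted({path.split(os.sep)[0] for path in relative})


keys = [
    "models/iris/1569014553/saved_model.pb",
    "models/iris/1569014553/variables/variables.index",
    "models/iris/1569014559/saved_model.pb",
]
print(detect_versions("models/iris", keys))  # ['1569014553', '1569014559']
```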
+ """ + + mp.Process.__init__(self, daemon=True) + + self._interval = interval + self._api_spec = api_spec + self._download_dir = download_dir + self._temp_dir = temp_dir + self._lock_dir = lock_dir + + self._s3_paths = [] + self._spec_models = CuratedModelResources(self._api_spec["curated_model_resources"]) + self._local_model_names = self._spec_models.get_local_model_names() + self._s3_model_names = self._spec_models.get_s3_model_names() + for model_name in self._s3_model_names: + self._s3_paths.append(self._spec_models[model_name]["model_path"]) + + if ( + self._api_spec["predictor"]["model_path"] is None + and self._api_spec["predictor"]["models"] is not None + and self._api_spec["predictor"]["models"]["dir"] is not None + ): + self._is_dir_used = True + self._models_dir = self._api_spec["predictor"]["models"]["dir"] + else: + self._is_dir_used = False + self._models_dir = None + + self._predictor_type = predictor_type_from_api_spec(self._api_spec) + + try: + os.mkdir(self._lock_dir) + except FileExistsError: + pass + + self._ran_once = mp.Event() + self._event_stopper = mp.Event() + self._stopped = mp.Event() + + def run(self): + """ + mp.Process-specific method. + """ + + self._make_local_models_available() + + while not self._event_stopper.is_set(): + self._update_models_tree() + if not self._ran_once.is_set(): + self._ran_once.set() + time.sleep(self._interval) + self._stopped.set() + + def stop(self, blocking: bool = False): + """ + Trigger the process of stopping the process. + + Args: + blocking: Whether to wait until the process is stopped or not. + """ + + self._event_stopper.set() + if blocking: + self.join() + + def join(self): + """ + Block until the process exits. + """ + + while not self._stopped.is_set(): + time.sleep(0.001) + + def ran_once(self) -> bool: + """ + Tells whether the FileBasedModelsTreeUpdater loop has run at least once. + """ + + return self._ran_once.is_set() + + def _make_local_models_available(self) -> None: + """ + Make local models (provided through predictor:model_path, models:paths or models:dir) available on disk. 
+ """ + + timestamp_utc = datetime.datetime.now(datetime.timezone.utc).timestamp() + + if len(self._local_model_names) == 1: + message = "local model " + elif len(self._local_model_names) > 1: + message = "local models " + else: + return + + for idx, local_model_name in enumerate(self._local_model_names): + versions = self._spec_models[local_model_name]["versions"] + if len(versions) == 0: + resource = os.path.join(self._lock_dir, local_model_name + "-" + "1" + ".txt") + with LockedFile(resource, "w") as f: + f.write("available " + str(int(timestamp_utc))) + for ondisk_version in versions: + resource = os.path.join( + self._lock_dir, local_model_name + "-" + ondisk_version + ".txt" + ) + with LockedFile(resource, "w") as f: + f.write("available " + str(int(timestamp_utc))) + + message += f"{local_model_name} " + if len(versions) == 1: + message += f"(version {versions[0]})" + elif len(versions) > 1: + message += f"(versions {','.join(versions)})" + + if idx + 1 < len(self._local_model_names): + message += ", " + else: + message += "now available on disk" + + logger().info(message) + + def _update_models_tree(self) -> None: + # don't update when the models:dir is a local path + if self._is_dir_used and not self._models_dir.startswith("s3://"): + return + + # get updated/validated paths/versions of the S3 models + ( + model_names, + versions, + model_paths, + sub_paths, + timestamps, + bucket_names, + ) = find_all_s3_models( + self._is_dir_used, + self._models_dir, + self._predictor_type, + self._s3_paths, + self._s3_model_names, + ) + + # update models on the local disk if changes have been detected + # a model is updated if its directory tree has changed, if it's not present or if it doesn't exist on the upstream + with ThreadPoolExecutor(max_workers=5) as executor: + futures = [] + for idx, (model_name, bucket_name, bucket_sub_paths) in enumerate( + zip(model_names, bucket_names, sub_paths) + ): + futures += [ + executor.submit( + self._refresh_model, + idx, + model_name, + model_paths[idx], + versions[model_name], + timestamps[idx], + bucket_sub_paths, + bucket_name, + ) + ] + + [future.result() for future in futures] + + # remove models that no longer appear in model_names + for model_name, versions in find_ondisk_models_with_lock(self._lock_dir).items(): + if model_name in model_names or model_name in self._local_model_names: + continue + for ondisk_version in versions: + resource = os.path.join(self._lock_dir, model_name + "-" + ondisk_version + ".txt") + ondisk_model_version_path = os.path.join( + self._download_dir, model_name, ondisk_version + ) + with LockedFile(resource, "w+") as f: + shutil.rmtree(ondisk_model_version_path) + f.write("not-available") + shutil.rmtree(os.path.join(self._download_dir, model_name)) + + logger().debug(f"{self.__class__.__name__} cron heartbeat") + + def _refresh_model( + self, + idx: int, + model_name: str, + model_path: str, + versions: List[str], + timestamps: List[datetime.datetime], + sub_paths: List[str], + bucket_name: str, + ) -> None: + s3_client = S3(bucket_name, client_config={}) + + ondisk_model_path = os.path.join(self._download_dir, model_name) + for version, model_ts in zip(versions, timestamps): + + # for the lock file + resource = os.path.join(self._lock_dir, model_name + "-" + version + ".txt") + + # check if a model update is mandated + update_model = False + ondisk_model_version_path = os.path.join(ondisk_model_path, version) + if os.path.exists(ondisk_model_version_path): + local_paths = glob.glob(ondisk_model_version_path + 
"*/**", recursive=True) + local_paths = util.remove_non_empty_directory_paths(local_paths) + local_paths = [ + os.path.relpath(local_path, ondisk_model_version_path) + for local_path in local_paths + ] + local_paths = [path for path in local_paths if not path.startswith("../")] + + s3_model_version_path = os.path.join(model_path, version) + s3_paths = [ + os.path.relpath(sub_path, s3_model_version_path) for sub_path in sub_paths + ] + s3_paths = [path for path in s3_paths if not path.startswith("../")] + s3_paths = util.remove_non_empty_directory_paths(s3_paths) + + # update if the paths don't match + if set(local_paths) != set(s3_paths): + update_model = True + + # update if the timestamp is newer + with LockedFile(resource, "r", reader_lock=True) as f: + file_status = f.read() + if file_status == "" or file_status == "not-available": + raise WithBreak + current_model_ts = int(file_status.split(" ")[1]) + if current_model_ts < int(model_ts.timestamp()): + update_model = True + else: + update_model = True + + if update_model: + # download to a temp directory + temp_dest = os.path.join(self._temp_dir, model_name, version) + s3_src = os.path.join(model_path, version) + s3_client.download_dir_contents(s3_src, temp_dest) + + # validate the downloaded model + model_contents = glob.glob(temp_dest + "*/**", recursive=True) + model_contents = util.remove_non_empty_directory_paths(model_contents) + try: + validate_model_paths(model_contents, self._predictor_type, temp_dest) + passed_validation = True + except CortexException: + passed_validation = False + shutil.rmtree(temp_dest) + logger().debug( + f"failed validating model {model_name} of version {version} found at {S3.construct_s3_path(bucket_name, s3_src)} path" + ) + + # move the model to its destination directory + if passed_validation: + with LockedFile(resource, "w+") as f: + if os.path.exists(ondisk_model_version_path): + shutil.rmtree(ondisk_model_version_path) + shutil.move(temp_dest, ondisk_model_version_path) + f.write("available " + str(int(model_ts.timestamp()))) + + # remove the temp model directory if it exists + model_temp_dest = os.path.join(self._temp_dir, model_name) + if os.path.exists(model_temp_dest): + os.rmdir(model_temp_dest) + + # remove model versions if they are not found on the upstream + # except when the model version found on disk is 1 and the number of detected versions on the upstream is 0, + # thus indicating the 1-version on-disk model must be a model that came without a version + if os.path.exists(ondisk_model_path): + ondisk_model_versions = glob.glob(ondisk_model_path + "*/**") + ondisk_model_versions = [ + os.path.relpath(path, ondisk_model_path) for path in ondisk_model_versions + ] + for ondisk_version in ondisk_model_versions: + if ondisk_version not in versions and (ondisk_version != "1" or len(versions) > 0): + resource = os.path.join( + self._lock_dir, model_name + "-" + ondisk_version + ".txt" + ) + ondisk_model_version_path = os.path.join(ondisk_model_path, ondisk_version) + with LockedFile(resource, "w+") as f: + shutil.rmtree(ondisk_model_version_path) + f.write("not-available") + + # remove the model directory if there are no models left + if len(glob.glob(ondisk_model_path + "*/**")) == 0: + shutil.rmtree(ondisk_model_path) + + # if it's a non-versioned model ModelVersion.NOT_PROVIDED + if len(versions) == 0 and len(sub_paths) > 0: + + # for the lock file + resource = os.path.join(self._lock_dir, model_name + "-" + "1" + ".txt") + model_ts = int(timestamps[0].timestamp()) + + # check if a model 
update is mandated + update_model = False + ondisk_model_version_path = os.path.join(ondisk_model_path, "1") + if os.path.exists(ondisk_model_version_path): + local_paths = glob.glob(ondisk_model_version_path + "*/**", recursive=True) + local_paths = util.remove_non_empty_directory_paths(local_paths) + local_paths = [ + os.path.relpath(local_path, ondisk_model_version_path) + for local_path in local_paths + ] + local_paths = [path for path in local_paths if not path.startswith("../")] + + s3_model_version_path = model_path + s3_paths = [ + os.path.relpath(sub_path, s3_model_version_path) for sub_path in sub_paths + ] + s3_paths = [path for path in s3_paths if not path.startswith("../")] + s3_paths = util.remove_non_empty_directory_paths(s3_paths) + + # update if the paths don't match + if set(local_paths) != set(s3_paths): + update_model = True + + # update if the timestamp is newer + with LockedFile(resource, "r", reader_lock=True) as f: + file_status = f.read() + if file_status == "" or file_status == "not-available": + raise WithBreak() + current_model_ts = int(file_status.split(" ")[1]) + if current_model_ts < model_ts: + update_model = True + else: + update_model = True + + if not update_model: + return + + # download to a temp directory + temp_dest = os.path.join(self._temp_dir, model_name) + s3_client.download_dir_contents(model_path, temp_dest) + + # validate the downloaded model + model_contents = glob.glob(temp_dest + "*/**", recursive=True) + model_contents = util.remove_non_empty_directory_paths(model_contents) + try: + validate_model_paths(model_contents, self._predictor_type, temp_dest) + passed_validation = True + except CortexException: + passed_validation = False + shutil.rmtree(temp_dest) + logger().debug( + f"failed validating model {model_name} of version {version} found at {S3.construct_s3_path(bucket_name, model_path)} path" + ) + + # move the model to its destination directory + if passed_validation: + with LockedFile(resource, "w+") as f: + if os.path.exists(ondisk_model_version_path): + shutil.rmtree(ondisk_model_version_path) + shutil.move(temp_dest, ondisk_model_version_path) + f.write("available " + str(model_ts)) + + +class FileBasedModelsGC(AbstractLoopingThread): + """ + GC for models that no longer exist on disk. To be used with FileBasedModelsTreeUpdater. + + There has to be a FileBasedModelsGC cron for each API process. + + This needs to run on the API process because the FileBasedModelsTreeUpdater process cannot + unload the models from the API process' memory by itself. API process has to rely on this cron to do this periodically. + + This is for the case when the FileBasedModelsTreeUpdater process has removed models from disk and there are still models loaded into the API process' memory. + """ + + def __init__( + self, + interval: int, + models: ModelsHolder, + download_dir: str, + lock_dir: str = "/run/cron", + ): + """ + Args: + interval: How often to run the GC. Measured in seconds. + download_dir: Path to where the models are stored. + lock_dir: Path to where the resource locks are stored. 
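The refresh path above always downloads to a temporary directory, validates the result, and only then swaps it into place while holding the exclusive lock, so readers never observe a half-written model directory. A self-contained sketch of that swap (a contextmanager stands in for LockedFile; names and values are made up):

```python
import contextlib
import os
import shutil
import tempfile


@contextlib.contextmanager
def dummy_lock(status_path: str):
    # stand-in for LockedFile(resource, "w+"): yields a writable status file
    with open(status_path, "w") as f:
        yield f


def install_model_version(temp_dest: str, final_dest: str, status_path: str, ts: int):
    with dummy_lock(status_path) as f:
        if os.path.exists(final_dest):
            shutil.rmtree(final_dest)
        shutil.move(temp_dest, final_dest)
        f.write(f"available {ts}")


root = tempfile.mkdtemp()
temp_dest = os.path.join(root, "tmp", "iris", "1")
os.makedirs(temp_dest)
open(os.path.join(temp_dest, "saved_model.pb"), "w").close()

final_dest = os.path.join(root, "models", "iris", "1")
os.makedirs(os.path.dirname(final_dest))
install_model_version(temp_dest, final_dest, os.path.join(root, "iris-1.txt"), 1602025473)
print(os.listdir(final_dest))  # ['saved_model.pb']
```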
+ """ + AbstractLoopingThread.__init__(self, interval, self._run_gc) + + self._models = models + self._download_dir = download_dir + self._lock_dir = lock_dir + + def _run_gc(self): + on_disk_model_ids = find_ondisk_model_ids_with_lock(self._lock_dir) + in_memory_model_ids = self._models.get_model_ids() + + logger().debug(f"{self.__class__.__name__} cron heartbeat") + + for in_memory_id in in_memory_model_ids: + if in_memory_id in on_disk_model_ids: + continue + with LockedModel(self._models, "w", model_id=in_memory_id): + if self._models.has_model_id(in_memory_id)[0] == "in-memory": + model_name, model_version = in_memory_id.rsplit("-", maxsplit=1) + logger().info( + f"removing model {model_name} of version {model_version} from memory as it's no longer present on disk/S3 (thread {td.get_ident()})" + ) + self._models.remove_model_by_id( + in_memory_id, mem=True, disk=False, del_reference=True + ) + + +def find_ondisk_models_with_lock( + lock_dir: str, include_timestamps: bool = False +) -> Union[Dict[str, List[str]], Dict[str, Dict[str, Any]]]: + """ + Returns all available models from the disk. + To be used in conjunction with FileBasedModelsTreeUpdater/FileBasedModelsGC. + + Can be used for Python/TensorFlow/ONNX clients. + + Args: + lock_dir: Path to where the resource locks are stored. + include_timestamps: Whether to include timestamps for each version of each model. + + Returns: + Dictionary with available model names and their associated versions when include_timestamps is False. + { + "model-A": ["177", "245", "247"], + "model-B": ["1"], + ... + } + + Dictionary with available model names and their associated versions/timestamps when include_timestamps is True. + { + "model-A": { + "versions": ["177", "245", "247"], + "timestamps": [1602198945, 1602198946, 1602198947] + } + "model-B": { + "versions": ["1"], + "timestamps": [1602198567] + }, + ... + } + """ + models = {} + + for locked_file in get_locked_files(lock_dir): + with LockedFile(os.path.join(lock_dir, locked_file), "r", reader_lock=True) as f: + status = f.read() + + if status.startswith("available"): + timestamp = int(status.split(" ")[1]) + _model_name, _model_version = os.path.splitext(locked_file)[0].rsplit("-", maxsplit=1) + if _model_name not in models: + if include_timestamps: + models[_model_name] = {"versions": [_model_version], "timestamps": [timestamp]} + else: + models[_model_name] = [_model_version] + else: + if include_timestamps: + models[_model_name]["versions"] += [_model_version] + models[_model_name]["timestamps"] += [timestamp] + else: + models[_model_name] += [_model_version] + + return models + + +def find_ondisk_model_ids_with_lock(lock_dir: str) -> List[str]: + """ + Returns all available model IDs from the disk. + To be used in conjunction with FileBasedModelsTreeUpdater/FileBasedModelsGC. + + Can be used for Python/TensorFlow/ONNX clients. + + Args: + lock_dir: Path to where the resource locks are stored. + + Returns: + A list with all model IDs present on disk. + """ + model_ids = [] + + for locked_file in get_locked_files(lock_dir): + with LockedFile(os.path.join(lock_dir, locked_file), "r", reader_lock=True) as f: + status = f.read() + + if status.startswith("available"): + model_id = os.path.splitext(locked_file)[0] + model_ids.append(model_id) + + return model_ids + + +def find_ondisk_model_info(lock_dir: str, model_name: str) -> Tuple[List[str], List[int]]: + """ + Returns all available versions/timestamps of a model from the disk. 
+ To be used in conjunction with FileBasedModelsTreeUpdater/FileBasedModelsGC. + + Can be used for Python/TensorFlow/ONNX clients. + + Args: + lock_dir: Path to where the resource locks are stored. + model_name: Name of the model as specified in predictor:models:paths:name, _cortex_default when predictor:model_path is set or the discovered model names when predictor:models:dir is used. + + Returns: + 2-element tuple made of a list with the available versions and a list with the corresponding timestamps for each model. Empty when the model is not available. + """ + versions = [] + timestamps = [] + + for locked_file in get_locked_files(lock_dir): + _model_name, _model_version = os.path.splitext(locked_file)[0].rsplit("-", maxsplit=1) + if _model_name != model_name: + continue + + with LockedFile(os.path.join(lock_dir, locked_file), "r", reader_lock=True) as f: + status = f.read() + if not status.startswith("available"): + continue + + current_upstream_ts = int(status.split(" ")[1]) + timestamps.append(current_upstream_ts) + versions.append(_model_version) + + return (versions, timestamps) + + +class TFSModelLoader(mp.Process): + """ + Monitors the S3 path(s)/dir and continuously updates the models on TFS. + The model paths are validated - the bad paths are ignored. + When a new model is found, it updates the tree, downloads it and loads it into memory - likewise when a model is removed. + """ + + def __init__( + self, + interval: int, + api_spec: dict, + tfs_model_dir: str, + download_dir: str, + address: Optional[str] = None, + addresses: Optional[List[str]] = None, + temp_dir: str = "/tmp/cron", + lock_dir: str = "/run/cron", + ): + """ + Args: + interval: How often to update the models tree. Measured in seconds. + api_spec: Identical copy of pkg.type.spec.api.API. + address: An address with the "host:port" format to where TFS is located. + addresses: A list of addresses with the "host:port" format to where the TFS servers are located. + tfs_model_dir: Path to where the models are stored within the TFS container. + download_dir: Path to where the models are stored. + temp_dir: Directory where models are temporarily stored. + lock_dir: Directory in which model timestamps are stored. 
+ """ + + if address and addresses: + raise ValueError("address and addresses arguments cannot be passed in at the same time") + if not address and not addresses: + raise ValueError("must pass in at least one of the two arguments: address or addresses") + + mp.Process.__init__(self, daemon=True) + + self._interval = interval + self._api_spec = api_spec + self._tfs_model_dir = tfs_model_dir + self._download_dir = download_dir + self._temp_dir = temp_dir + self._lock_dir = lock_dir + + if address: + self._tfs_address = address + self._tfs_addresses = None + else: + self._tfs_address = None + self._tfs_addresses = addresses + + self._s3_paths = [] + self._spec_models = CuratedModelResources(self._api_spec["curated_model_resources"]) + self._local_model_names = self._spec_models.get_local_model_names() + self._s3_model_names = self._spec_models.get_s3_model_names() + for model_name in self._s3_model_names: + self._s3_paths.append(self._spec_models[model_name]["model_path"]) + + if ( + self._api_spec["predictor"]["model_path"] is None + and self._api_spec["predictor"]["models"] is not None + and self._api_spec["predictor"]["models"]["dir"] is not None + ): + self._is_dir_used = True + self._models_dir = self._api_spec["predictor"]["models"]["dir"] + else: + self._is_dir_used = False + self._models_dir = None + + if self._api_spec["predictor"]["type"] == "tensorflow": + if self._api_spec["compute"]["inf"] > 0: + self._predictor_type = TensorFlowNeuronPredictorType + else: + self._predictor_type = TensorFlowPredictorType + else: + raise CortexException( + "'tensorflow' predictor type is the only allowed type for this cron" + ) + + self._ran_once = mp.Event() + self._event_stopper = mp.Event() + self._stopped = mp.Event() + + # keeps an old record of the model timestamps + self._old_ts_state = {} + + def run(self): + """ + mp.Process-specific method. + """ + + if self._tfs_address: + self._client = TensorFlowServingAPI(self._tfs_address) + else: + self._client = TensorFlowServingAPIClones(self._tfs_addresses) + + # wait until TFS is responsive + while not self._client.is_tfs_accessible(): + self._reset_when_tfs_unresponsive() + time.sleep(1.0) + + self._load_local_models() + + while not self._event_stopper.is_set(): + success = self._update_models() + if success and not self._ran_once.is_set(): + self._ran_once.set() + logger().debug(f"{self.__class__.__name__} cron heartbeat") + time.sleep(self._interval) + self._stopped.set() + + def stop(self, blocking: bool = False): + """ + Trigger the process of stopping the process. + + Args: + blocking: Whether to wait until the process is stopped or not. + """ + + self._event_stopper.set() + if blocking: + self.join() + + def join(self): + """ + Block until the process exits. + """ + + while not self._stopped.is_set(): + time.sleep(0.001) + + def ran_once(self) -> bool: + """ + Tells whether the TFS loader loop has run at least once. 
+ """ + + return self._ran_once.is_set() + + def _update_models(self) -> bool: + # don't update when the models:dir is a local path + if self._is_dir_used and not self._models_dir.startswith("s3://"): + return True + + # get updated/validated paths/versions of the S3 models + ( + model_names, + versions, + model_paths, + sub_paths, + timestamps, + bucket_names, + ) = find_all_s3_models( + self._is_dir_used, + self._models_dir, + self._predictor_type, + self._s3_paths, + self._s3_model_names, + ) + + # update models on the local disk if changes have been detected + # a model is updated if its directory tree has changed, if it's not present or if it doesn't exist on the upstream + with ThreadPoolExecutor(max_workers=5) as executor: + futures = [] + for idx, (model_name, bucket_name, bucket_sub_paths) in enumerate( + zip(model_names, bucket_names, sub_paths) + ): + futures += [ + executor.submit( + self._refresh_model, + idx, + model_name, + model_paths[idx], + versions[model_name], + timestamps[idx], + bucket_sub_paths, + bucket_name, + ) + ] + [future.result() for future in futures] + + # remove models that no longer appear in model_names + for model_name, model_versions in find_ondisk_models(self._download_dir).items(): + if model_name in model_names or model_name in self._local_model_names: + continue + for ondisk_version in model_versions: + ondisk_model_version_path = os.path.join( + self._download_dir, model_name, ondisk_version + ) + shutil.rmtree(ondisk_model_version_path) + shutil.rmtree(os.path.join(self._download_dir, model_name)) + self._client.remove_models([model_name], [model_versions]) + + # check tfs connection + if not self._client.is_tfs_accessible(): + self._reset_when_tfs_unresponsive() + return False + + # remove versioned models from TFS that no longer exist on disk + tfs_model_ids = self._client.get_registered_model_ids() + ondisk_models = find_ondisk_models(self._download_dir) + ondisk_model_ids = [] + for model_name, model_versions in ondisk_models.items(): + for model_version in model_versions: + ondisk_model_ids.append(f"{model_name}-{model_version}") + for tfs_model_id in tfs_model_ids: + if tfs_model_id not in ondisk_model_ids: + try: + model_name, model_version = tfs_model_id.rsplit("-", maxsplit=1) + self._client.remove_single_model(model_name, model_version) + logger().info( + "model '{}' of version '{}' has been unloaded".format( + model_name, model_version + ) + ) + except gprc.RpcError as error: + if error.code() == grpc.StatusCode.UNAVAILABLE: + logger().warning( + "TFS server unresponsive after trying to load model '{}' of version '{}': {}".format( + model_name, model_version, str(e) + ) + ) + self._reset_when_tfs_unresponsive() + return False + + # # update TFS models + current_ts_state = {} + for model_name, model_versions in ondisk_models.items(): + if model_name in self._local_model_names: + continue + try: + ts = self._update_tfs_model( + model_name, model_versions, timestamps, model_names, versions + ) + except grpc.RpcError: + return False + current_ts_state = {**current_ts_state, **ts} + + # save model timestamp states + for model_id, ts in current_ts_state.items(): + self._old_ts_state[model_id] = ts + + # remove model timestamps that no longer exist + loaded_model_ids = self._client.models.keys() + aux_ts_state = self._old_ts_state.copy() + for model_id in self._old_ts_state.keys(): + if model_id not in loaded_model_ids: + del aux_ts_state[model_id] + self._old_ts_state = aux_ts_state + + # save model timestamp states to disk + # could be cast 
to a short-lived thread + # required for printing the model stats when cortex getting + resource = os.path.join(self._lock_dir, "model_timestamps.json") + with open(resource, "w") as f: + json.dump(self._old_ts_state, f, indent=2) + + # save model stats for TFS to disk + resource = os.path.join(self._lock_dir, "models_tfs.json") + with open(resource, "w") as f: + json.dump(self._client.models, f, indent=2) + + return True + + def _refresh_model( + self, + idx: int, + model_name: str, + model_path: str, + versions: List[str], + timestamps: List[datetime.datetime], + sub_paths: List[str], + bucket_name: str, + ) -> None: + s3_client = S3(bucket_name, client_config={}) + + ondisk_model_path = os.path.join(self._download_dir, model_name) + for version, model_ts in zip(versions, timestamps): + + # check if a model update is mandated + update_model = False + ondisk_model_version_path = os.path.join(ondisk_model_path, version) + if os.path.exists(ondisk_model_version_path): + local_paths = glob.glob(ondisk_model_version_path + "*/**", recursive=True) + local_paths = util.remove_non_empty_directory_paths(local_paths) + local_paths = [ + os.path.relpath(local_path, ondisk_model_version_path) + for local_path in local_paths + ] + local_paths = [path for path in local_paths if not path.startswith("../")] + + s3_model_version_path = os.path.join(model_path, version) + s3_paths = [ + os.path.relpath(sub_path, s3_model_version_path) for sub_path in sub_paths + ] + s3_paths = [path for path in s3_paths if not path.startswith("../")] + s3_paths = util.remove_non_empty_directory_paths(s3_paths) + + if set(local_paths) != set(s3_paths): + update_model = True + + model_id = f"{model_name}-{version}" + if self._is_this_a_newer_model_id(model_id, int(model_ts.timestamp())): + update_model = True + else: + update_model = True + + if update_model: + # download to a temp directory + temp_dest = os.path.join(self._temp_dir, model_name, version) + s3_src = os.path.join(model_path, version) + s3_client.download_dir_contents(s3_src, temp_dest) + + # validate the downloaded model + model_contents = glob.glob(temp_dest + "*/**", recursive=True) + model_contents = util.remove_non_empty_directory_paths(model_contents) + try: + validate_model_paths(model_contents, self._predictor_type, temp_dest) + passed_validation = True + except CortexException: + passed_validation = False + shutil.rmtree(temp_dest) + logger().debug( + f"failed validating model {model_name} of version {version} found at {S3.construct_s3_path(bucket_name, model_path)} path" + ) + + # move the model to its destination directory + if passed_validation: + if os.path.exists(ondisk_model_version_path): + shutil.rmtree(ondisk_model_version_path) + shutil.move(temp_dest, ondisk_model_version_path) + + # remove the temp model directory if it exists + model_temp_dest = os.path.join(self._temp_dir, model_name) + if os.path.exists(model_temp_dest): + os.rmdir(model_temp_dest) + + # remove model versions if they are not found on the upstream + # except when the model version found on disk is 1 and the number of detected versions on the upstream is 0, + # thus indicating the 1-version on-disk model must be a model that came without a version + if os.path.exists(ondisk_model_path): + ondisk_model_versions = glob.glob(ondisk_model_path + "*/**") + ondisk_model_versions = [ + os.path.relpath(path, ondisk_model_path) for path in ondisk_model_versions + ] + for ondisk_version in ondisk_model_versions: + if ondisk_version not in versions and (ondisk_version != "1" or 
len(versions) > 0): + ondisk_model_version_path = os.path.join(ondisk_model_path, ondisk_version) + shutil.rmtree(ondisk_model_version_path) + + if len(glob.glob(ondisk_model_path + "*/**")) == 0: + shutil.rmtree(ondisk_model_path) + + # if it's a non-versioned model ModelVersion.NOT_PROVIDED + if len(versions) == 0 and len(sub_paths) > 0: + + model_ts = timestamps[0] + + # check if a model update is mandated + update_model = False + ondisk_model_version_path = os.path.join(ondisk_model_path, "1") + if os.path.exists(ondisk_model_version_path): + local_paths = glob.glob(ondisk_model_version_path + "*/**", recursive=True) + local_paths = util.remove_non_empty_directory_paths(local_paths) + local_paths = [ + os.path.relpath(local_path, ondisk_model_version_path) + for local_path in local_paths + ] + local_paths = [path for path in local_paths if not path.startswith("../")] + + s3_model_version_path = model_path + s3_paths = [ + os.path.relpath(sub_path, s3_model_version_path) for sub_path in sub_paths + ] + s3_paths = [path for path in s3_paths if not path.startswith("../")] + s3_paths = util.remove_non_empty_directory_paths(s3_paths) + + # update if the paths don't match + if set(local_paths) != set(s3_paths): + update_model = True + + model_id = f"{model_name}-1" + if self._is_this_a_newer_model_id(model_id, int(model_ts.timestamp())): + update_model = True + else: + update_model = True + + if not update_model: + return + + # download to a temp directory + temp_dest = os.path.join(self._temp_dir, model_name) + s3_client.download_dir_contents(model_path, temp_dest) + + # validate the downloaded model + model_contents = glob.glob(temp_dest + "*/**", recursive=True) + model_contents = util.remove_non_empty_directory_paths(model_contents) + try: + validate_model_paths(model_contents, self._predictor_type, temp_dest) + passed_validation = True + except CortexException: + passed_validation = False + shutil.rmtree(temp_dest) + logger().debug( + f"failed validating model {model_name} of version {version} found at {S3.construct_s3_path(bucket_name, model_path)} path" + ) + + # move the model to its destination directory + if passed_validation: + if os.path.exists(ondisk_model_version_path): + shutil.rmtree(ondisk_model_version_path) + shutil.move(temp_dest, ondisk_model_version_path) + + def _update_tfs_model( + self, + model_name: str, + model_versions: List[str], + _s3_timestamps: List[List[datetime.datetime]], + _s3_model_names: List[str], + _s3_versions: Dict[str, List[str]], + ) -> Optional[dict]: + """ + Compares the existing models from TFS with those present on disk. + Does the loading/unloading/reloading of models. + + From the _s3_timestamps, _s3_model_names, _s3_versions params, only the fields of the respective model name are used. 
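
        A sketch of the returned timestamp state, using hypothetical model names and timestamps:

            {"iris-classifier-1": 1602265812, "iris-classifier-2": 1602265890}

        where each key is a "<model-name>-<model-version>" ID and each value is that model's upstream timestamp.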
+        """
+
+        # to prevent overwriting mistakes
+        s3_timestamps = copy.deepcopy(_s3_timestamps)
+        s3_model_names = copy.deepcopy(_s3_model_names)
+        s3_versions = copy.deepcopy(_s3_versions)
+
+        current_ts_state = {}
+
+        # get the right order of model versions with respect to the model ts order
+        model_timestamps = s3_timestamps[s3_model_names.index(model_name)]
+        filtered_model_versions = []
+        if len(s3_versions[model_name]) == 0:
+            filtered_model_versions = ["1"] * len(model_timestamps)
+        else:
+            for idx in range(len(model_timestamps)):
+                if s3_versions[model_name][idx] in model_versions:
+                    filtered_model_versions.append(s3_versions[model_name][idx])
+
+        for model_version, model_ts in zip(filtered_model_versions, model_timestamps):
+            model_ts = int(model_ts.timestamp())
+
+            # remove outdated model
+            model_id = f"{model_name}-{model_version}"
+            is_model_outdated = False
+            first_time_load = False
+            if model_id in self._old_ts_state and self._old_ts_state[model_id] != model_ts:
+                try:
+                    self._client.remove_single_model(model_name, model_version)
+                except grpc.RpcError as error:
+                    if error.code() == grpc.StatusCode.UNAVAILABLE:
+                        logger().warning(
+                            "TFS server unresponsive after trying to unload model '{}' of version '{}': {}".format(
+                                model_name, model_version, str(error)
+                            )
+                        )
+                    logger().warning("TFS server is unresponsive")
+                    raise
+                is_model_outdated = True
+            elif model_id not in self._old_ts_state:
+                first_time_load = True
+
+            if not is_model_outdated and not first_time_load:
+                continue
+
+            # load model
+            model_disk_path = os.path.join(self._tfs_model_dir, model_name)
+            try:
+                self._client.add_single_model(
+                    model_name,
+                    model_version,
+                    model_disk_path,
+                    self._determine_model_signature_key(model_name),
+                    timeout=30.0,
+                )
+            except Exception as e:
+                try:
+                    self._client.remove_single_model(model_name, model_version)
+                    logger().warning(
+                        "model '{}' of version '{}' couldn't be loaded: {}".format(
+                            model_name, model_version, str(e)
+                        )
+                    )
+                except grpc.RpcError as error:
+                    if error.code() == grpc.StatusCode.UNAVAILABLE:
+                        logger().warning(
+                            "TFS server unresponsive after trying to load model '{}' of version '{}': {}".format(
+                                model_name, model_version, str(e)
+                            )
+                        )
+                        self._reset_when_tfs_unresponsive()
+                raise
+
+            # save timestamp of loaded model
+            current_ts_state[model_id] = model_ts
+            if is_model_outdated:
+                logger().info(
+                    "model '{}' of version '{}' has been reloaded".format(model_name, model_version)
+                )
+            elif first_time_load:
+                logger().info(
+                    "model '{}' of version '{}' has been loaded".format(model_name, model_version)
+                )
+
+            is_model_outdated = False
+            first_time_load = False
+
+        return current_ts_state
+
+    def _load_local_models(self) -> None:
+        for model_name in self._local_model_names:
+            for model_version in self._spec_models[model_name]["versions"]:
+                model_disk_path = os.path.join(self._tfs_model_dir, model_name)
+                try:
+                    self._client.add_single_model(
+                        model_name,
+                        model_version,
+                        model_disk_path,
+                        self._determine_model_signature_key(model_name),
+                        timeout=30.0,
+                    )
+                except Exception as e:
+                    try:
+                        self._client.remove_single_model(model_name, model_version)
+                        logger().warning(
+                            "model '{}' of version '{}' couldn't be loaded: {}".format(
+                                model_name, model_version, str(e)
+                            )
+                        )
+                    except grpc.RpcError as error:
+                        if error.code() == grpc.StatusCode.UNAVAILABLE:
+                            logger().warning(
+                                "TFS server unresponsive after trying to load model '{}' of version '{}': {}".format(
+                                    model_name, model_version, str(e)
+                                )
+                            )
+                        self._reset_when_tfs_unresponsive()
+                        return 
None + + def _is_this_a_newer_model_id(self, model_id: str, timestamp: int) -> bool: + return model_id in self._old_ts_state and self._old_ts_state[model_id] < timestamp + + def _determine_model_signature_key(self, model_name: str) -> Optional[str]: + if self._models_dir: + signature_key = self._api_spec["predictor"]["models"]["signature_key"] + else: + signature_key = self._spec_models[model_name]["signature_key"] + + return signature_key + + def _reset_when_tfs_unresponsive(self): + logger().warning("TFS server is unresponsive") + + if self._tfs_address: + self._client = TensorFlowServingAPI(self._tfs_address) + else: + self._client = TensorFlowServingAPIClones(self._tfs_addresses) + + resource = os.path.join(self._lock_dir, "models_tfs.json") + with open(resource, "w") as f: + json.dump(self._client.models, f, indent=2) + + +class TFSAPIServingThreadUpdater(AbstractLoopingThread): + """ + When live reloading and the TensorFlow predictor are used, the serving container + needs to have a way of accessing the models' metadata which is generated using the TFSModelLoader cron. + + This cron runs on each serving process and periodically reads the exported metadata from the TFSModelLoader cron. + This is then fed into each serving process. + """ + + def __init__( + self, + interval: int, + client: TensorFlowServingAPI, + lock_dir: str = "/run/cron", + ): + AbstractLoopingThread.__init__(self, interval, self._run_tfs) + + self._client = client + self._lock_dir = lock_dir + + def _run_tfs(self) -> None: + resource_models = os.path.join(self._lock_dir, "models_tfs.json") + + try: + with open(resource_models, "r") as f: + models = json.load(f) + except Exception: + return + + resource_ts = os.path.join(self._lock_dir, "model_timestamps.json") + try: + with open(resource_ts, "r") as f: + timestamps = json.load(f) + except Exception: + return + + non_intersecting_model_ids = set(models.keys()).symmetric_difference(timestamps.keys()) + for non_intersecting_model_id in non_intersecting_model_ids: + if non_intersecting_model_id in models: + del models[non_intersecting_model_id] + if non_intersecting_model_id in timestamps: + del timestamps[non_intersecting_model_id] + + for model_id in timestamps.keys(): + models[model_id]["timestamp"] = timestamps[model_id] + + self._client.models = models + + +def find_ondisk_models(models_dir: str) -> Dict[str, List[str]]: + """ + Returns all available models from the disk. + To be used in conjunction with TFSModelLoader. + + This function should never be used for determining whether a model has to be loaded or not. + Can be used for Python/TensorFlow/ONNX clients. + + Args: + models_dir: Path to where the models are stored. + + Returns: + Dictionary with available model names and their associated versions. + { + "model-A": [177, 245, 247], + "model-B": [1], + ... + } + """ + + models = {} + model_names = [os.path.basename(file) for file in os.listdir(models_dir)] + + for model_name in model_names: + model_versions = os.listdir(os.path.join(models_dir, model_name)) + models[model_name] = model_versions + + return models + + +class ModelsGC(AbstractLoopingThread): + """ + GC for models loaded into memory and/or stored on disk. + + If the number of models exceeds the cache size, then evict the LRU models. + Also removes models that are no longer present in the model tree. + """ + + def __init__( + self, + interval: int, + api_spec: dict, + models: ModelsHolder, + tree: ModelsTree, + ): + """ + Args: + interval: How often to update the models tree. Measured in seconds. 
+            api_spec: Identical copy of pkg.type.spec.api.API.
+            models: The object holding all models in memory / on disk.
+            tree: Model tree representation of the available models on the S3 upstream.
+        """
+
+        AbstractLoopingThread.__init__(self, interval, self._run_gc)
+
+        self._api_spec = api_spec
+        self._models = models
+        self._tree = tree
+
+        self._spec_models = CuratedModelResources(self._api_spec["curated_model_resources"])
+        self._local_model_names = self._spec_models.get_local_model_names()
+        self._local_model_versions = [
+            self._spec_models.get_versions_for(model_name) for model_name in self._local_model_names
+        ]
+        self._local_model_ids = []
+        for model_name, versions in zip(self._local_model_names, self._local_model_versions):
+            if len(versions) == 0:
+                self._local_model_ids.append(f"{model_name}-1")
+                continue
+            for version in versions:
+                self._local_model_ids.append(f"{model_name}-{version}")
+
+        # timeout (in seconds) for acquiring exclusive access to all models
+        self._lock_timeout = 10.0
+
+        self._event_stopper = td.Event()
+        self._stopped = False
+
+    def _run_gc(self) -> None:
+
+        # are there any models to collect (aka remove) from cache
+        with LockedGlobalModelsGC(self._models, "r"):
+            collectible, _, _ = self._models.garbage_collect(
+                exclude_disk_model_ids=self._local_model_ids, dry_run=True
+            )
+        if not collectible:
+            self._remove_stale_models()
+            return
+
+        # try to grab exclusive access to all models with shared access preference
+        # and if it works, remove excess models from cache
+        self._models.set_global_preference_policy("r")
+        with LockedGlobalModelsGC(self._models, "w", timeout=self._lock_timeout) as lg:
+            acquired = lg.acquired
+            if not acquired:
+                raise WithBreak
+
+            _, memory_evicted_model_ids, disk_evicted_model_ids = self._models.garbage_collect(
+                exclude_disk_model_ids=self._local_model_ids
+            )
+
+        # otherwise, grab exclusive access to all models with exclusive access preference
+        # and remove excess models from cache
+        if not acquired:
+            self._models.set_global_preference_policy("w")
+            with LockedGlobalModelsGC(self._models, "w"):
+                _, memory_evicted_model_ids, disk_evicted_model_ids = self._models.garbage_collect(
+                    exclude_disk_model_ids=self._local_model_ids
+                )
+            self._models.set_global_preference_policy("r")
+
+        memory_evicted_models = ids_to_models(memory_evicted_model_ids)
+        disk_evicted_models = ids_to_models(disk_evicted_model_ids)
+
+        self._log_removed_models(memory_evicted_models, memory=True)
+        self._log_removed_models(disk_evicted_models, disk=True)
+
+        self._remove_stale_models()
+
+    def _remove_stale_models(self) -> None:
+        """
+        Remove in-memory and on-disk models that no longer appear on the S3 upstream.
+        """
+
+        # get available upstream S3 model IDs
+        s3_model_names = self._tree.get_model_names()
+        s3_model_versions = [
+            self._tree.model_info(model_name)["versions"] for model_name in s3_model_names
+        ]
+        s3_model_ids = []
+        for model_name, model_versions in zip(s3_model_names, s3_model_versions):
+            if len(model_versions) == 0:
+                continue
+            for model_version in model_versions:
+                s3_model_ids.append(f"{model_name}-{model_version}")
+
+        # get model IDs loaded into memory or on disk.
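+        # illustrative sketch with hypothetical IDs: if the S3 upstream only holds
+        # ["iris-2", "resnet50-1"] while ["iris-1", "iris-2", "resnet50-1"] are present
+        # locally, then "iris-1" becomes a ghost model ID and is removed below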
+ with LockedGlobalModelsGC(self._models, "r"): + present_model_ids = self._models.get_model_ids() + + # exclude local models from removal + present_model_ids = list(set(present_model_ids) - set(self._local_model_ids)) + + # remove models that don't exist in the S3 upstream + ghost_model_ids = list(set(present_model_ids) - set(s3_model_ids)) + for model_id in ghost_model_ids: + model_name, model_version = model_id.rsplit("-", maxsplit=1) + with LockedModel(self._models, "w", model_name, model_version): + status, ts = self._models.has_model(model_name, model_version) + if status == "in-memory": + logger().info( + f"unloading stale model {model_name} of version {model_version} using the garbage collector" + ) + self._models.unload_model(model_name, model_version) + if status in ["in-memory", "on-disk"]: + logger().info( + f"removing stale model {model_name} of version {model_version} using the garbage collector" + ) + self._models.remove_model(model_name, model_version) + + def _log_removed_models( + self, models: Dict[str, List[str]], memory: bool = False, disk: bool = False + ) -> None: + """ + Log the removed models from disk/memory. + """ + + if len(models) == 0: + return None + + if len(models) > 1: + message = "models " + else: + message = "model " + + for idx, (model_name, versions) in enumerate(models.items()): + message += f"{model_name} " + if len(versions) == 1: + message += f"(version {versions[0]})" + else: + message += f"(versions {','.join(versions)})" + if idx + 1 < len(models): + message += ", " + else: + if memory: + message += " removed from the memory cache using the garbage collector" + if disk: + message += " removed from the disk cache using the garbage collector" + + logger().info(message) + + +class ModelTreeUpdater(AbstractLoopingThread): + """ + Model tree updater. Updates a local representation of all available models from the S3 upstreams. + """ + + def __init__(self, interval: int, api_spec: dict, tree: ModelsTree, ondisk_models_dir: str): + """ + Args: + interval: How often to update the models tree. Measured in seconds. + api_spec: Identical copy of pkg.type.spec.api.API. + tree: Model tree representation of the available models on the S3 upstream. + ondisk_models_dir: Where the models are stored on disk. Necessary when local models are used. 
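
        Example (a minimal sketch with hypothetical values; the api_spec dict is assumed to come from the
        serving container, and the cron is assumed to be started like a regular thread):

            tree = ModelsTree()
            updater = ModelTreeUpdater(
                interval=10,
                api_spec=api_spec,
                tree=tree,
                ondisk_models_dir="/mnt/model",
            )
            updater.start()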
+ """ + + AbstractLoopingThread.__init__(self, interval, self._update_models_tree) + + self._api_spec = api_spec + self._tree = tree + self._ondisk_models_dir = ondisk_models_dir + + self._s3_paths = [] + self._spec_models = CuratedModelResources(self._api_spec["curated_model_resources"]) + self._s3_model_names = self._spec_models.get_s3_model_names() + for model_name in self._s3_model_names: + self._s3_paths.append(self._spec_models[model_name]["model_path"]) + + if ( + self._api_spec["predictor"]["model_path"] is None + and self._api_spec["predictor"]["models"] is not None + and self._api_spec["predictor"]["models"]["dir"] is not None + ): + self._is_dir_used = True + self._models_dir = self._api_spec["predictor"]["models"]["dir"] + else: + self._is_dir_used = False + self._models_dir = None + + self._predictor_type = predictor_type_from_api_spec(self._api_spec) + + self._make_local_models_available() + + def _make_local_models_available(self): + timestamp_utc = datetime.datetime.now(datetime.timezone.utc) + + for model_name in self._spec_models.get_local_model_names(): + model = self._spec_models[model_name] + + if len(model["versions"]) == 0: + model_version = "1" + ondisk_model_version_path = os.path.join( + self._ondisk_models_dir, model_name, model_version + ) + ondisk_paths = glob.glob(ondisk_model_version_path + "*/**", recursive=True) + ondisk_paths = util.remove_non_empty_directory_paths(ondisk_paths) + # removable is set to false to prevent the local models from being removed + self._tree.update_model( + bucket="", + model_name=model_name, + model_version=model_version, + model_path=ondisk_model_version_path, + sub_paths=ondisk_paths, + timestamp=timestamp_utc, + removable=False, + ) + + for model_version in model["versions"]: + ondisk_model_version_path = os.path.join( + self._ondisk_models_dir, model_name, model_version + ) + ondisk_paths = glob.glob(ondisk_model_version_path + "*/**", recursive=True) + ondisk_paths = util.remove_non_empty_directory_paths(ondisk_paths) + # removable is set to false to prevent the local models from being removed + self._tree.update_model( + bucket="", + model_name=model_name, + model_version=model_version, + model_path=ondisk_model_version_path, + sub_paths=ondisk_paths, + timestamp=timestamp_utc, + removable=False, + ) + + def _update_models_tree(self) -> None: + # don't update when the models:dir is a local path + if self._is_dir_used and not self._models_dir.startswith("s3://"): + return + + # get updated/validated paths/versions of the S3 models + ( + model_names, + versions, + model_paths, + sub_paths, + timestamps, + bucket_names, + ) = find_all_s3_models( + self._is_dir_used, + self._models_dir, + self._predictor_type, + self._s3_paths, + self._s3_model_names, + ) + + # update model tree + self._tree.update_models( + model_names, versions, model_paths, sub_paths, timestamps, bucket_names + ) + + logger().debug(f"{self.__class__.__name__} cron heartbeat") diff --git a/pkg/workloads/cortex/lib/model/model.py b/pkg/workloads/cortex/lib/model/model.py new file mode 100644 index 0000000000..36771c1745 --- /dev/null +++ b/pkg/workloads/cortex/lib/model/model.py @@ -0,0 +1,584 @@ +# Copyright 2020 Cortex Labs, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import time
+import shutil
+import datetime
+import threading as td
+from typing import Dict, List, Any, Tuple, Callable, AbstractSet, Optional
+
+from cortex.lib.log import cx_logger as logger
+from cortex.lib.concurrency import ReadWriteLock
+from cortex.lib.exceptions import WithBreak, CortexException
+from cortex.lib.type import PredictorType
+
+
+class ModelsHolder:
+    """
+    Class to hold models in memory and references for those on disk.
+    Can limit the number of models in memory/on-disk based on an LRU policy - by default, it's disabled.
+    """
+
+    def __init__(
+        self,
+        predictor_type: PredictorType,
+        model_dir: str,
+        temp_dir: str = "/tmp/cron",
+        mem_cache_size: int = -1,
+        disk_cache_size: int = -1,
+        on_download_callback: Optional[Callable[[str, str, str, int], datetime.datetime]] = None,
+        on_load_callback: Optional[Callable[[str], Any]] = None,
+        on_remove_callback: Optional[Callable[[List[str]], None]] = None,
+    ):
+        """
+        Args:
+            predictor_type: The predictor type. Can be PythonPredictor, TensorFlowPredictor or ONNXPredictor.
+            model_dir: Where models are saved on disk.
+            temp_dir: Where models are temporarily stored for validation.
+            mem_cache_size: The size of the cache for in-memory models. For negative values, the cache is disabled.
+            disk_cache_size: The size of the cache for on-disk models. For negative values, the cache is disabled.
+            on_download_callback(<predictor-type>, <bucket>, <model-name>, <model-version>, <model-path>, <temp-dir>, <model-dir>): Function to be called for downloading a model to disk. Returns the downloaded model's upstream timestamp, otherwise a negative number is returned.
+            on_load_callback(<disk-path>, **kwargs): Function to be called when a model is loaded from disk. Returns the actual model. May throw exceptions if it doesn't work.
+            on_remove_callback(<list-of-model-ids>, **kwargs): Function to be called when the GC is called. E.g. for the TensorFlow Predictor, the function would communicate with TFS to unload models.
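
        Example (a minimal sketch; the callback names and paths below are hypothetical placeholders):

            models = ModelsHolder(
                predictor_type,
                model_dir="/mnt/model",
                mem_cache_size=10,
                disk_cache_size=20,
                on_download_callback=my_download_fn,
                on_load_callback=my_load_fn,
                on_remove_callback=my_remove_fn,
            )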
+ """ + self._predictor_type = predictor_type + self._model_dir = model_dir + self._temp_dir = temp_dir + + if mem_cache_size > 0 and disk_cache_size > 0 and mem_cache_size > disk_cache_size: + raise RuntimeError( + f"mem_cache_size ({mem_cache_size}) must be equal or smaller than disk_cache_size ({disk_cache_size})" + ) + + if mem_cache_size == 0 or disk_cache_size == 0: + raise RuntimeError( + "mem_cache_size or disk_cache_size can't be set to 0; must be negative to disable the cache or positive to have it enabled" + ) + + self._mem_cache_size = mem_cache_size + self._disk_cache_size = disk_cache_size + + self._download_callback = on_download_callback + self._load_callback = on_load_callback + self._remove_callback = on_remove_callback + + self._models = {} # maps the model ID to the model that's placed in memory + self._timestamps = {} # maps the model ID to the last access time of the model + self._locks = {} # maps the model ID to the underlying lock for each model + + self._create_lock = ( + td.RLock() + ) # to ensure atomicity when 2 threads are trying to create locks for the same model ID that doesn't exist in self._locks + self._global_lock = ReadWriteLock() + + def set_callback(self, ctype: str, callback: Callable) -> None: + """ + Sets a callback. + + Args: + ctype: "download", "load" or "remove" callback type - see the constructor to mark each one. + callback: The actual callback. + """ + if ctype == "download": + self._download_callback = callback + if ctype == "load": + self._load_callback = callback + if ctype == "remove": + self._remove_callback = callback + + def global_acquire(self, mode: str, timeout: Optional[float] = None) -> None: + """ + Acquire shared/exclusive (R/W) access over all models. + + Use "w" when wanting to acquire exclusive access for the GC (method garbage_collect), or "r" when wanting to grant shared access for any other method to be called (i.e. get_model_ids). + + Args: + mode: "r" for read lock, "w" for write lock. + timeout: How many seconds to wait to acquire the lock. + """ + self._global_lock.acquire(mode, timeout) + + def global_release(self, mode: str) -> None: + """ + Release shared/exclusive (R/W) access over all models. + + Args: + mode: "r" for read lock, "w" for write lock. + """ + self._global_lock.release(mode) + + def model_acquire(self, mode: str, model_name: str, model_version: str) -> None: + """ + Acquire shared/exclusive (R/W) access for a specific model. + + Args: + mode: "r" for read lock, "w" for write lock. + model_name: The name of the model. + model_version: The version of the model. + + When mode is "r", only the following methods can be called: + * has_model + * get_model + + When mode is "w", the methods available for "r" can be called plus the following ones: + * load_model + * download_model + * remove_model + """ + model_id = f"{model_name}-{model_version}" + + if not model_id in self._locks: + lock = ReadWriteLock() + self._create_lock.acquire() + if model_id not in self._locks: + self._locks[model_id] = lock + self._create_lock.release() + + self._locks[model_id].acquire(mode) + + def model_release(self, mode: str, model_name: str, model_version: str) -> None: + """ + Release shared/exclusive (R/W) access for a specific model. + + Args: + mode: "r" for read lock, "w" for write lock. + model_name: The name of the model. + model_version: The version of the model. 
+        """
+        model_id = f"{model_name}-{model_version}"
+        self._locks[model_id].release(mode)
+
+    def set_global_preference_policy(self, prefer: str) -> bool:
+        """
+        Wrapper of cortex.lib.concurrency.ReadWriteLock.set_preference_policy.
+        """
+        return self._global_lock.set_preference_policy(prefer)
+
+    def get_model_names_by_tag_count(self, tag: str, count: int) -> Tuple[List[str], List[int]]:
+        """
+        Filter model names by the tag count based on the most recently used model version.
+
+        Locking is already done within the method.
+
+        Args:
+            tag: Tag as passed on in load_model method. If tag is not found, then the model is not considered.
+            count: How many appearances a tag has to make for a given model name to be selected.
+
+        Returns:
+            List of model names that abide by the method's selection rule.
+            List of timestamps representing the latest upstream timestamp that abide by the method's selection rule.
+        """
+
+        models = {}
+        with LockedGlobalModelsGC(self, "r"):
+            models_ids = self.get_model_ids()
+
+        for model_id in models_ids:
+
+            model_name, model_version = model_id.rsplit("-", maxsplit=1)
+            with LockedModel(self, "r", model_name, model_version):
+                if not self.has_model_id(model_id):
+                    raise WithBreak
+
+                tag_count = self._models[model_id]["metadata"]["consecutive_tag_count"][tag]
+                ts = self._timestamps[model_id]
+
+                if model_name not in models or models[model_name]["timestamp"] < ts:
+                    models[model_name] = {"timestamp": ts, "count": tag_count}
+
+        filtered_model_names = []
+        filtered_model_ts = []
+        for model_name, v in models.items():
+            if v["count"] >= count:
+                filtered_model_names.append(model_name)
+                filtered_model_ts.append(v["timestamp"])
+
+        return filtered_model_names, filtered_model_ts
+
+    def has_model(self, model_name: str, model_version: str) -> Tuple[str, int]:
+        """
+        Verifies if a model is loaded into memory / on disk.
+
+        Args:
+            model_name: The name of the model.
+            model_version: The version of the model.
+
+        Returns:
+            "in-memory" and the upstream timestamp of the model when the model is loaded into memory. "in-memory" also implies "on-disk".
+            "on-disk" and the upstream timestamp of the model when the model is saved to disk.
+            "not-available" and 0 for the upstream timestamp when the model is not available.
+        """
+        model_id = f"{model_name}-{model_version}"
+        if model_id in self._models:
+            if self._models[model_id]["model"] is not None:
+                return "in-memory", self._models[model_id]["upstream_timestamp"]
+            else:
+                return "on-disk", self._models[model_id]["upstream_timestamp"]
+        return "not-available", 0
+
+    def has_model_id(self, model_id: str) -> Tuple[str, int]:
+        """
+        Wrapper for has_model method.
+        """
+        model_name, model_version = model_id.rsplit("-", maxsplit=1)
+        return self.has_model(model_name, model_version)
+
+    def get_model(
+        self, model_name: str, model_version: str, version_tag: str = ""
+    ) -> Tuple[Any, int]:
+        """
+        Retrieves a model from memory.
+
+        If the returned model is None, but the upstream timestamp is positive, then it means the model is present on disk.
+        If the returned model is None and the upstream timestamp is 0, then the model is not present.
+        If the returned model is not None, then the upstream timestamp will also be positive.
+
+        Args:
+            model_name: The name of the model.
+            model_version: The version of the model.
+            version_tag: The tag associated with the given model. If the tag is present, its count will be increased by one. 
+ + Returns: + The model and the model's upstream timestamp. + """ + model_id = f"{model_name}-{model_version}" + + if model_id in self._models: + self._timestamps[model_id] = time.time() + + if version_tag in self._models[model_id]["metadata"]["consecutive_tag_count"]: + self._models[model_id]["metadata"]["consecutive_tag_count"][version_tag] += 1 + else: + for tag in self._models[model_id]["metadata"]["consecutive_tag_count"]: + self._models[model_id]["metadata"]["consecutive_tag_count"][tag] = 0 + + return self._models[model_id]["model"], self._models[model_id]["upstream_timestamp"] + + return None, 0 + + def load_model( + self, + model_name: str, + model_version: str, + upstream_timestamp: int, + tags: List[str] = [], + kwargs: dict = {}, + ) -> None: + """ + Loads a given model into memory. + It is assumed the model already exists on disk. The model must be downloaded externally or with download_model method. + + Args: + model_name: The name of the model. + model_version: The version of the model. + upstream_timestamp: When was this model last modified on the upstream source (e.g. S3). + tags: List of tags to initialize the model with. + kwargs: Extra arguments to pass into the loading callback. + + Raises: + RuntimeError if a load callback isn't set. Can also raise exception if the load callback raises. + """ + + if self._load_callback: + model_id = f"{model_name}-{model_version}" + disk_path = os.path.join(self._model_dir, model_name, model_version) + + model = { + "model": self._load_callback(disk_path, **kwargs), + "disk_path": disk_path, + "upstream_timestamp": upstream_timestamp, + "metadata": { + "consecutive_tag_count": {}, + }, + } + if len(tags) > 0: + for tag in tags: + model["metadata"]["consecutive_tag_count"][tag] = 0 + + self._models[model_id] = model + else: + raise RuntimeError( + "a load callback must be provided; use set_callback to set a callback" + ) + + def download_model( + self, + bucket: str, + model_name: str, + model_version: str, + model_path: str, + ) -> datetime.datetime: + """ + Download a model to disk. To be called before load_model method is called. + + To be used when the caching is enabled. + It is assumed that when caching is disabled, an external mechanism is responsible for downloading/removing models to/from disk. + + Args: + bucket: The upstream model's S3 bucket name. + model_name: The name of the model. + model_version: The version of the model. + model_path: Path to the model as discovered in models:dir or specified in models:paths. + + Returns: + Returns the downloaded model's upstream timestamp, otherwise None is returned if it fails. + + Raises: + Exceptions if the download callback raises any. + """ + if self._download_callback: + return self._download_callback( + self._predictor_type, + bucket, + model_name, + model_version, + model_path, + self._temp_dir, + self._model_dir, + ) + raise RuntimeError( + "a download callback must be provided; use set_callback to set a callback" + ) + + def unload_model(self, model_name: str, model_version: str, kwargs: dict = {}) -> None: + """ + Unloads a model from memory. If applicable, it gets called before remove_model/remove_model_by_id. + + Args: + model_name: The name of the model. + model_version: The version of the model. + kwargs: Passable arguments to the remove callback. + + Raises: + Exceptions if the remove callback raises any. 
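
        Example (a sketch of the usual unload-then-remove sequence, mirroring how the GC uses it; the
        model name and version below are hypothetical):

            with LockedModel(models, "w", "iris-classifier", "2"):
                models.unload_model("iris-classifier", "2")
                models.remove_model("iris-classifier", "2")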
+        """
+
+        if self._remove_callback:
+            model_id = f"{model_name}-{model_version}"
+            self._remove_callback([model_id], **kwargs)
+
+    def remove_model(self, model_name: str, model_version: str) -> None:
+        """
+        Removes a model from memory and disk if it exists.
+        """
+        model_id = f"{model_name}-{model_version}"
+        self.remove_model_by_id(model_id, True, True)
+
+    def remove_model_by_id(
+        self, model_id: str, mem: bool, disk: bool, del_reference: bool = False
+    ) -> None:
+        """
+        Remove a model from this object and/or from disk.
+
+        Args:
+            model_id: The model ID to remove.
+            mem: Whether to remove the model from memory or not.
+            disk: Whether to remove the model from disk or not.
+            del_reference: Whether to remove the model reference or not. Don't touch this unless you know what you're doing.
+        """
+        if model_id not in self._models:
+            return None
+
+        if mem:
+            # remove model from memory (but keep it on disk)
+            self._models[model_id]["model"] = None
+
+        if disk:
+            disk_path = self._models[model_id]["disk_path"]
+            shutil.rmtree(disk_path)
+
+        if disk or del_reference:
+            del self._models[model_id]
+            del self._timestamps[model_id]
+
+    def garbage_collect(
+        self, exclude_disk_model_ids: List[str] = [], dry_run: bool = False
+    ) -> Tuple[bool, List[str], List[str]]:
+        """
+        Removes stale in-memory and on-disk models based on LRU policy.
+        Also calls the "remove" callback before removing the models from this object. The callback must not raise any exceptions.
+
+        Must be called with a write lock unless dry_run is set to true.
+
+        Args:
+            exclude_disk_model_ids: Model IDs to exclude from removing from disk. Necessary for locally-provided models.
+            dry_run: Just test if there are any models to remove. If set to true, this method can then be called with a read lock.
+
+        Returns:
+            A 3-element tuple. First element tells whether models had to be collected. The 2nd and 3rd elements contain the model IDs that were removed from memory and disk respectively.
+        """
+        collected = False
+        if self._mem_cache_size <= 0 or self._disk_cache_size <= 0:
+            return collected, [], []
+
+        stale_mem_model_ids = self._lru_model_ids(self._mem_cache_size, filter_in_mem=True)
+        stale_disk_model_ids = self._lru_model_ids(
+            self._disk_cache_size - len(exclude_disk_model_ids), filter_in_mem=False
+        )
+
+        if self._remove_callback and not dry_run:
+            self._remove_callback(stale_mem_model_ids)
+
+        # don't delete excluded model IDs from disk
+        stale_disk_model_ids = list(set(stale_disk_model_ids) - set(exclude_disk_model_ids))
+        stale_disk_model_ids = stale_disk_model_ids[
+            len(stale_disk_model_ids) - self._disk_cache_size :
+        ]
+
+        if not dry_run:
+            logger().info(
+                f"unloading models {stale_mem_model_ids} from memory using the garbage collector"
+            )
+            logger().info(
+                f"unloading models {stale_disk_model_ids} from disk using the garbage collector"
+            )
+            for model_id in stale_mem_model_ids:
+                self.remove_model_by_id(model_id, mem=True, disk=False)
+            for model_id in stale_disk_model_ids:
+                self.remove_model_by_id(model_id, mem=False, disk=True)
+
+        if len(stale_mem_model_ids) > 0 or len(stale_disk_model_ids) > 0:
+            collected = True
+
+        return collected, stale_mem_model_ids, stale_disk_model_ids
+
+    def get_model_ids(self) -> List[str]:
+        """
+        Gets a list of all loaded model IDs (in memory or on disk).
+        """
+        return list(self._models.keys())
+
+    def _lru_model_ids(self, threshold: int, filter_in_mem: bool) -> List[str]:
+        """
+        Sort model IDs by last access and return those ranked at or beyond the specified threshold (i.e. the least recently used ones). 
+ + Args: + threshold: The memory cache size or the disk cache size. + filter_in_mem: In the counting process, set whether to only look at models loaded in memory or not. True for only looking at models loaded in memory and on disk. + + Returns: + A list of stale model IDs. + """ + copied_timestamps = self._timestamps.copy() + timestamps = { + k: v + for k, v in sorted(copied_timestamps.items(), key=lambda item: item[1], reverse=True) + } + model_ids = [] + for counter, model_id in enumerate(timestamps): + # skip models if they are not loaded in memory but on disk + if filter_in_mem and self._models[model_id]["model"] is None: + continue + if counter >= threshold: + model_ids.append(model_id) + + return model_ids + + +def ids_to_models(model_ids: List[str]) -> Dict[str, List[str]]: + """ + Convert model IDs (MODEL_NAME-MODEL_VERSION) to a dictionary with its keys being + the model names and its values being lists of the associated versions for each given model name. + """ + + models = {} + for model_id in model_ids: + model_name, model_version = model_id.rsplit("-", maxsplit=1) + if model_name not in models: + models[model_name] = [model_version] + else: + models[model_name].append(model_version) + return models + + +class LockedGlobalModelsGC: + """ + Applies global exclusive lock (R/W) on the models holder. + + For running the GC for all loaded models (or present on disk). + This is the locking implementation for the stop-the-world GC. + + The context manager can be exited by raising cortex.lib.exceptions.WithBreak. + """ + + def __init__( + self, + models: ModelsHolder, + mode: str = "w", + prefer: str = "r", + timeout: Optional[float] = None, + ): + self._models = models + self._mode = mode + self._timeout = timeout + + def __enter__(self): + self.acquired = True + try: + self._models.global_acquire(self._mode, self._timeout) + except TimeoutError: + self.acquired = False + return self + + def __exit__(self, exc_type, exc_value, traceback) -> bool: + self._models.global_release(self._mode) + + if exc_value is not None and exc_type is not WithBreak: + return False + return True + + +class LockedModel: + """ + For granting shared/exclusive (R/W) access to a model resource (model name + model version). + Also applies global read lock on the models holder. + + The context manager can be exited by raising cortex.lib.exceptions.WithBreak. + """ + + def __init__( + self, + models: ModelsHolder, + mode: str, + model_name: str = "", + model_version: str = "", + model_id: str = "", + ): + """ + mode can be "r" for read or "w" for write. + """ + self._models = models + self._mode = mode + if model_id != "": + self._model_name, self._model_version = model_id.rsplit("-", maxsplit=1) + else: + self._model_name = model_name + self._model_version = model_version + + def __enter__(self): + self._models.global_acquire("r") + self._models.model_acquire(self._mode, self._model_name, self._model_version) + return self + + def __exit__(self, exc_type, exc_value, traceback) -> bool: + self._models.model_release(self._mode, self._model_name, self._model_version) + self._models.global_release("r") + + if exc_value is not None and exc_type is not WithBreak: + return False + return True diff --git a/pkg/workloads/cortex/lib/model/tfs.py b/pkg/workloads/cortex/lib/model/tfs.py new file mode 100644 index 0000000000..c38a9ce2de --- /dev/null +++ b/pkg/workloads/cortex/lib/model/tfs.py @@ -0,0 +1,754 @@ +# Copyright 2020 Cortex Labs, Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import time +import grpc +import copy +from typing import Any, Optional, Dict, List, Tuple + +from cortex.lib.exceptions import CortexException, UserException +from cortex.lib.log import cx_logger as logger + +# TensorFlow types +def _define_types() -> Tuple[Dict[str, Any], Dict[str, str]]: + return ( + { + "DT_FLOAT": tf.float32, + "DT_DOUBLE": tf.float64, + "DT_INT32": tf.int32, + "DT_UINT8": tf.uint8, + "DT_INT16": tf.int16, + "DT_INT8": tf.int8, + "DT_STRING": tf.string, + "DT_COMPLEX64": tf.complex64, + "DT_INT64": tf.int64, + "DT_BOOL": tf.bool, + "DT_QINT8": tf.qint8, + "DT_QUINT8": tf.quint8, + "DT_QINT32": tf.qint32, + "DT_BFLOAT16": tf.bfloat16, + "DT_QINT16": tf.qint16, + "DT_QUINT16": tf.quint16, + "DT_UINT16": tf.uint16, + "DT_COMPLEX128": tf.complex128, + "DT_HALF": tf.float16, + "DT_RESOURCE": tf.resource, + "DT_VARIANT": tf.variant, + "DT_UINT32": tf.uint32, + "DT_UINT64": tf.uint64, + }, + { + "DT_INT32": "intVal", + "DT_INT64": "int64Val", + "DT_FLOAT": "floatVal", + "DT_STRING": "stringVal", + "DT_BOOL": "boolVal", + "DT_DOUBLE": "doubleVal", + "DT_HALF": "halfVal", + "DT_COMPLEX64": "scomplexVal", + "DT_COMPLEX128": "dcomplexVal", + }, + ) + + +# for TensorFlowServingAPI +try: + import tensorflow as tf + from tensorflow_serving.apis import predict_pb2 + from tensorflow_serving.apis import get_model_metadata_pb2 + from tensorflow_serving.apis import prediction_service_pb2_grpc + from tensorflow_serving.apis import model_service_pb2_grpc + from tensorflow_serving.apis import model_management_pb2 + from tensorflow_serving.apis import get_model_status_pb2 + from tensorflow_serving.config import model_server_config_pb2 + from tensorflow_serving.sources.storage_path.file_system_storage_path_source_pb2 import ( + FileSystemStoragePathSourceConfig, + ) + + ServableVersionPolicy = FileSystemStoragePathSourceConfig.ServableVersionPolicy + Specific = FileSystemStoragePathSourceConfig.ServableVersionPolicy.Specific + from google.protobuf import json_format + + tensorflow_dependencies_installed = True + DTYPE_TO_TF_TYPE, DTYPE_TO_VALUE_KEY = _define_types() + predictRequestClass = predict_pb2.PredictRequest + +except ImportError: + tensorflow_dependencies_installed = False + predictRequestClass = Any + + +class TensorFlowServingAPI: + def __init__(self, address: str): + """ + TensorFlow Serving API for loading/unloading/reloading TF models and for running predictions. + + Extra arguments passed to the tensorflow/serving container: + * --max_num_load_retries=0 + * --load_retry_interval_micros=30000000 # 30 seconds + * --grpc_channel_arguments="grpc.max_concurrent_streams=*" when inf == 0, otherwise + * --grpc_channel_arguments="grpc.max_concurrent_streams=" when inf > 0. + + Args: + address: An address with the "host:port" format. 
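
        Example (a sketch; the port below is only illustrative):

            api = TensorFlowServingAPI("localhost:9000")
            if api.is_tfs_accessible():
                ...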
+ """ + + if not tensorflow_dependencies_installed: + raise NameError("tensorflow_serving_api and tensorflow packages not installed") + + self.address = address + self.models = ( + {} + ) # maps the model ID to the model metadata (signature def, signature key and so on) + + self.channel = grpc.insecure_channel(self.address) + self._service = model_service_pb2_grpc.ModelServiceStub(self.channel) + self._pred = prediction_service_pb2_grpc.PredictionServiceStub(self.channel) + + def is_tfs_accessible(self) -> bool: + """ + Tests whether TFS is accessible or not. + """ + request = get_model_status_pb2.GetModelStatusRequest() + request.model_spec.name = "test-model-name" + + try: + self._service.GetModelStatus(request, timeout=10.0) + except grpc.RpcError as error: + if error.code() in [grpc.StatusCode.UNAVAILABLE, grpc.StatusCode.DEADLINE_EXCEEDED]: + return False + return True + + def add_single_model( + self, + model_name: str, + model_version: str, + model_disk_path: str, + signature_key: Optional[str] = None, + timeout: Optional[float] = None, + max_retries: int = 0, + ) -> None: + """ + Wrapper for add_models method. + """ + self.add_models( + [model_name], + [[model_version]], + [model_disk_path], + [signature_key], + timeout=timeout, + max_retries=max_retries, + ) + + def remove_single_model( + self, + model_name: str, + model_version: str, + timeout: Optional[float] = None, + ) -> None: + """ + Wrapper for remove_models method. + """ + self.remove_models([model_name], [[model_version]], timeout) + + def add_models( + self, + model_names: List[str], + model_versions: List[List[str]], + model_disk_paths: List[str], + signature_keys: List[Optional[str]], + skip_if_present: bool = False, + timeout: Optional[float] = None, + max_retries: int = 0, + ) -> None: + """ + Add models to TFS. If they can't be loaded, use remove_models to remove them from TFS. + + Args: + model_names: List of model names to add. + model_versions: List of lists - each element is a list of versions for a given model name. + model_disk_paths: The common model disk path of multiple versioned models of the same model name (i.e. modelA/ for modelA/1 and modelA/2). + skip_if_present: If the models are already loaded, don't make a new request to TFS. + signature_keys: The signature keys as set in cortex.yaml. If an element is set to None, then "predict" key will be assumed. + max_retries: How many times to call ReloadConfig before giving up. + Raises: + grpc.RpcError in case something bad happens while communicating. + StatusCode.DEADLINE_EXCEEDED when timeout is encountered. StatusCode.UNAVAILABLE when the service is unreachable. + cortex.lib.exceptions.CortexException if a non-0 response code is returned (i.e. model couldn't be loaded). + cortex.lib.exceptions.UserException when a model couldn't be validated for the signature def. 
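
        Example (a sketch with hypothetical names and paths, loading two versions of one model with the
        default "predict" signature key):

            api.add_models(
                model_names=["iris-classifier"],
                model_versions=[["1", "2"]],
                model_disk_paths=["/mnt/model/iris-classifier"],
                signature_keys=[None],
                timeout=30.0,
            )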
+ """ + + request = model_management_pb2.ReloadConfigRequest() + model_server_config = model_server_config_pb2.ModelServerConfig() + + num_added_models = 0 + for model_name, versions, model_disk_path in zip( + model_names, model_versions, model_disk_paths + ): + for model_version in versions: + versioned_model_disk_path = os.path.join(model_disk_path, model_version) + num_added_models += self._add_model_to_dict( + model_name, model_version, versioned_model_disk_path + ) + + if skip_if_present and num_added_models == 0: + return + + config_list = model_server_config_pb2.ModelConfigList() + current_model_names = self._get_model_names() + for model_name in current_model_names: + versions, model_disk_path = self._get_model_info(model_name) + versions = [int(version) for version in versions] + model_config = config_list.config.add() + model_config.name = model_name + model_config.base_path = model_disk_path + model_config.model_version_policy.CopyFrom( + ServableVersionPolicy(specific=Specific(versions=versions)) + ) + model_config.model_platform = "tensorflow" + + model_server_config.model_config_list.CopyFrom(config_list) + request.config.CopyFrom(model_server_config) + + while max_retries >= 0: + max_retries -= 1 + try: + # to prevent HandleReloadConfigRequest from + # throwing an exception (TFS has some race-condition bug) + time.sleep(0.125) + response = self._service.HandleReloadConfigRequest(request, timeout) + break + except grpc.RpcError as err: + # to prevent HandleReloadConfigRequest from + # throwing another exception on the next run + time.sleep(0.125) + raise + + if not (response and response.status.error_code == 0): + if response: + raise CortexException( + "couldn't load user-requested models {} - failed with error code {}: {}".format( + model_names, response.status.error_code, response.status.error_message + ) + ) + else: + raise CortexException("couldn't load user-requested models") + + # get models metadata + for model_name, versions, signature_key in zip(model_names, model_versions, signature_keys): + for model_version in versions: + self._load_model_signatures(model_name, model_version, signature_key) + + def remove_models( + self, + model_names: List[str], + model_versions: List[List[str]], + timeout: Optional[float] = None, + ) -> None: + """ + Remove models to TFS. + + Args: + model_names: List of model names to add. + model_versions: List of lists - each element is a list of versions for a given model name. + Raises: + grpc.RpcError in case something bad happens while communicating. + StatusCode.DEADLINE_EXCEEDED when timeout is encountered. StatusCode.UNAVAILABLE when the service is unreachable. + cortex.lib.exceptions.CortexException if a non-0 response code is returned (i.e. model couldn't be unloaded). 
+ """ + + request = model_management_pb2.ReloadConfigRequest() + model_server_config = model_server_config_pb2.ModelServerConfig() + + for model_name, versions in zip(model_names, model_versions): + for model_version in versions: + self._remove_model_from_dict(model_name, model_version) + + config_list = model_server_config_pb2.ModelConfigList() + remaining_model_names = self._get_model_names() + for model_name in remaining_model_names: + versions, model_disk_path = self._get_model_info(model_name) + versions = [int(version) for version in versions] + model_config = config_list.config.add() + model_config.name = model_name + model_config.base_path = model_disk_path + model_config.model_version_policy.CopyFrom( + ServableVersionPolicy(specific=Specific(versions=versions)) + ) + model_config.model_platform = "tensorflow" + + model_server_config.model_config_list.CopyFrom(config_list) + request.config.CopyFrom(model_server_config) + + response = self._service.HandleReloadConfigRequest(request, timeout) + + if not (response and response.status.error_code == 0): + if response: + raise CortexException( + "couldn't unload user-requested models {} - failed with error code {}: {}".format( + model_names, response.status.error_code, response.status.error_message + ) + ) + else: + raise CortexException("couldn't unload user-requested models") + + def poll_available_model_versions(self, model_name: str) -> List[str]: + """ + Gets the available model versions from TFS. + + Args: + model_name: The model name to check for versions. + + Returns: + List of the available versions for the given model from TFS. + """ + request = get_model_status_pb2.GetModelStatusRequest() + request.model_spec.name = model_name + + versions = [] + + try: + for model in self._service.GetModelStatus(request).model_version_status: + if model.state == get_model_status_pb2.ModelVersionStatus.AVAILABLE: + versions.append(str(model.version)) + except grpc.RpcError as e: + pass + + return versions + + def get_registered_model_ids(self) -> List[str]: + """ + Get the registered model IDs (doesn't poll the TFS server). + """ + return list(self.models.keys()) + + def predict( + self, model_input: Any, model_name: str, model_version: str, timeout: float = 300.0 + ) -> Any: + """ + Args: + model_input: The input to run the prediction on - as passed by the user. + model_name: Name of the model. + model_version: Version of the model. + timeout: How many seconds to wait for the prediction to run before timing out. + + Raises: + UserException when the model input is not valid or when the model's shape doesn't match that of the input's. + grpc.RpcError in case something bad happens while communicating - should not happen. + + Returns: + The prediction. 
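
        Example (a sketch; the input key and values are hypothetical and depend on the model's signature def):

            output = api.predict({"input": [1.0, 2.0, 3.0]}, "iris-classifier", "1")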
+ """ + + model_id = f"{model_name}-{model_version}" + + signature_def = self.models[model_id]["signature_def"] + signature_key = self.models[model_id]["signature_key"] + input_signatures = self.models[model_id]["input_signatures"] + + # validate model input + for input_name, _ in input_signatures.items(): + if input_name not in model_input: + raise UserException( + "missing key '{}' for model '{}' of version '{}'".format( + input_name, model_name, model_version + ) + ) + + # create prediction request + prediction_request = self._create_prediction_request( + signature_def, signature_key, model_name, model_version, model_input + ) + + # run prediction + response_proto = self._pred.Predict(prediction_request, timeout=timeout) + + # interpret response message + results_dict = json_format.MessageToDict(response_proto) + outputs = results_dict["outputs"] + outputs_simplified = {} + for key in outputs: + value_key = DTYPE_TO_VALUE_KEY[outputs[key]["dtype"]] + outputs_simplified[key] = outputs[key][value_key] + + # return parsed response + return outputs_simplified + + def _remove_model_from_dict(self, model_name: str, model_version: str) -> Tuple[bool, str]: + model_id = f"{model_name}-{model_version}" + try: + model = copy.deepcopy(self.models[model_id]) + del self.models[model_id] + return True, model + except KeyError: + pass + return False, "" + + def _add_model_to_dict(self, model_name: str, model_version: str, model_disk_path: str) -> bool: + model_id = f"{model_name}-{model_version}" + if model_id not in self.models: + self.models[model_id] = { + "disk_path": model_disk_path, + } + return True + return False + + def _load_model_signatures( + self, model_name: str, model_version: str, signature_key: Optional[str] = None + ) -> None: + """ + Queries the signature defs from TFS. + + Args: + model_name: Name of the model. + model_version: Version of the model. + signature_key: Signature key of the model as passed in with predictor:signature_key, predictor:models:paths:signature_key or predictor:models:signature_key. + When set to None, "predict" is the assumed key. + + Raises: + cortex.lib.exceptions.UserException when the signature def can't be validated. 
+ """ + + # create model metadata request + request = get_model_metadata_pb2.GetModelMetadataRequest() + request.model_spec.name = model_name + request.model_spec.version.value = int(model_version) + request.metadata_field.append("signature_def") + + # get signature def + last_idx = 0 + for times in range(100): + try: + resp = self._pred.GetModelMetadata(request) + break + except grpc.RpcError as e: + # it has been observed that it may take a little bit of time + # until a model gets to be accessible with TFS (even though it's already loaded in) + time.sleep(0.3) + last_idx = times + if last_idx == 99: + raise UserException( + "couldn't find model '{}' of version '{}' to extract the signature def".format( + model_name, model_version + ) + ) + + sigAny = resp.metadata["signature_def"] + signature_def_map = get_model_metadata_pb2.SignatureDefMap() + sigAny.Unpack(signature_def_map) + sigmap = json_format.MessageToDict(signature_def_map) + signature_def = sigmap["signatureDef"] + + # extract signature key and input signature + signature_key, input_signatures = self._extract_signatures( + signature_def, signature_key, model_name, model_version + ) + + model_id = f"{model_name}-{model_version}" + self.models[model_id]["signature_def"] = signature_def + self.models[model_id]["signature_key"] = signature_key + self.models[model_id]["input_signatures"] = input_signatures + + def _get_model_names(self) -> List[str]: + return list(set([model_id.rsplit("-", maxsplit=1)[0] for model_id in self.models])) + + def _get_model_info(self, model_name: str) -> Tuple[List[str], str]: + model_disk_path = "" + versions = [] + for model_id in self.models: + _model_name, model_version = model_id.rsplit("-", maxsplit=1) + if _model_name == model_name: + versions.append(model_version) + if model_disk_path == "": + model_disk_path = os.path.dirname(self.models[model_id]["disk_path"]) + + return versions, model_disk_path + + def _extract_signatures( + self, signature_def, signature_key, model_name: str, model_version: str + ): + logger().info( + "signature defs found in model '{}' for version '{}': {}".format( + model_name, model_version, signature_def + ) + ) + + available_keys = list(signature_def.keys()) + if len(available_keys) == 0: + raise UserException( + "unable to find signature defs in model '{}' of version '{}'".format( + model_name, model_version + ) + ) + + if signature_key is None: + if len(available_keys) == 1: + logger().info( + "signature_key was not configured by user, using signature key '{}' for model '{}' of version '{}' (found in the signature def map)".format( + available_keys[0], + model_name, + model_version, + ) + ) + signature_key = available_keys[0] + elif "predict" in signature_def: + logger().info( + "signature_key was not configured by user, using signature key 'predict' for model '{}' of version '{}' (found in the signature def map)".format( + model_name, + model_version, + ) + ) + signature_key = "predict" + else: + raise UserException( + "signature_key was not configured by user, please specify one the following keys '{}' for model '{}' of version '{}' (found in the signature def map)".format( + ", ".join(available_keys), model_name, model_version + ) + ) + else: + if signature_def.get(signature_key) is None: + possibilities_str = "key: '{}'".format(available_keys[0]) + if len(available_keys) > 1: + possibilities_str = "keys: '{}'".format("', '".join(available_keys)) + + raise UserException( + "signature_key '{}' was not found in signature def map for model '{}' of version '{}', but 
found the following {}".format( + signature_key, model_name, model_version, possibilities_str + ) + ) + + signature_def_val = signature_def.get(signature_key) + + if signature_def_val.get("inputs") is None: + raise UserException( + "unable to find 'inputs' in signature def '{}' for model '{}'".format( + signature_key, model_name + ) + ) + + parsed_signatures = {} + for input_name, input_metadata in signature_def_val["inputs"].items(): + if input_metadata["tensorShape"] == {}: + # a scalar with rank 0 and empty shape + shape = "scalar" + elif input_metadata["tensorShape"].get("unknownRank", False): + # unknown rank and shape + # + # unknownRank is set to True if the model input has no rank + # it may lead to an undefined behavior if unknownRank is only checked for its presence + # so it also gets to be tested against its value + shape = "unknown" + elif input_metadata["tensorShape"].get("dim", None): + # known rank and known/unknown shape + shape = [int(dim["size"]) for dim in input_metadata["tensorShape"]["dim"]] + else: + raise UserException( + "invalid 'tensorShape' specification for input '{}' in signature key '{}' for model '{}'", + input_name, + signature_key, + model_name, + ) + + parsed_signatures[input_name] = { + "shape": shape if type(shape) == list else [shape], + "type": DTYPE_TO_TF_TYPE[input_metadata["dtype"]].name, + } + return signature_key, parsed_signatures + + def _create_prediction_request( + self, + signature_def: dict, + signature_key: str, + model_name: str, + model_version: int, + model_input: Any, + ) -> predictRequestClass: + prediction_request = predict_pb2.PredictRequest() + prediction_request.model_spec.name = model_name + prediction_request.model_spec.version.value = int(model_version) + prediction_request.model_spec.signature_name = signature_key + + for column_name, value in model_input.items(): + if signature_def[signature_key]["inputs"][column_name]["tensorShape"] == {}: + shape = "scalar" + elif signature_def[signature_key]["inputs"][column_name]["tensorShape"].get( + "unknownRank", False + ): + # unknownRank is set to True if the model input has no rank + # it may lead to an undefined behavior if unknownRank is only checked for its presence + # so it also gets to be tested against its value + shape = "unknown" + else: + shape = [] + for dim in signature_def[signature_key]["inputs"][column_name]["tensorShape"][ + "dim" + ]: + shape.append(int(dim["size"])) + + sig_type = signature_def[signature_key]["inputs"][column_name]["dtype"] + + try: + tensor_proto = tf.compat.v1.make_tensor_proto( + value, dtype=DTYPE_TO_TF_TYPE[sig_type] + ) + prediction_request.inputs[column_name].CopyFrom(tensor_proto) + except Exception as e: + if shape == "scalar": + raise UserException( + 'key "{}"'.format(column_name), "expected to be a scalar", str(e) + ) from e + elif shape == "unknown": + raise UserException( + 'key "{}"'.format(column_name), "can be of any rank and shape", str(e) + ) from e + else: + raise UserException( + 'key "{}"'.format(column_name), "expected shape {}".format(shape), str(e) + ) from e + + return prediction_request + + +class TensorFlowServingAPIClones: + """ + TFS API to load/unload models from multiple TFS server clones. Built on top of TensorFlowServingAPI. + """ + + def __init__(self, addresses: List[str]): + """ + Args: + addresses: A list of addresses with the "host:port" format. 
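
        Example (a sketch; the ports below are only illustrative):

            clones = TensorFlowServingAPIClones(["localhost:9000", "localhost:9001"])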
+ """ + + if len(addresses) == 0: + raise ValueError("addresses list must have at least one address") + self._clients = [TensorFlowServingAPI(address) for address in addresses] + + def is_tfs_accessible(self) -> bool: + """ + Tests whether all TFS servers are accessible or not. + """ + return all([client.is_tfs_accessible() for client in self._clients]) + + def add_single_model( + self, + model_name: str, + model_version: str, + model_disk_path: str, + signature_key: Optional[str] = None, + timeout: Optional[float] = None, + max_retries: int = 0, + ) -> None: + """ + Wrapper for add_models method. + """ + for client in self._clients: + client.add_single_model( + model_name, model_version, model_disk_path, signature_key, timeout, max_retries + ) + + def remove_single_model( + self, + model_name: str, + model_version: str, + timeout: Optional[float] = None, + ) -> None: + """ + Wrapper for remove_models method. + """ + for client in self._clients: + client.remove_single_model(model_name, model_version, timeout) + + def add_models( + self, + model_names: List[str], + model_versions: List[List[str]], + model_disk_paths: List[str], + signature_keys: List[Optional[str]], + skip_if_present: bool = False, + timeout: Optional[float] = None, + max_retries: int = 0, + ) -> None: + """ + Add the same models to multiple TFS servers. If they can't be loaded, use remove_models to remove them from TFS. + + Args: + model_names: List of model names to add. + model_versions: List of lists - each element is a list of versions for a given model name. + model_disk_paths: The common model disk path of multiple versioned models of the same model name (i.e. modelA/ for modelA/1 and modelA/2). + skip_if_present: If the models are already loaded, don't make a new request to TFS. + signature_keys: The signature keys as set in cortex.yaml. If an element is set to None, then "predict" key will be assumed. + max_retries: How many times to call ReloadConfig before giving up. + Raises: + grpc.RpcError in case something bad happens while communicating. + StatusCode.DEADLINE_EXCEEDED when timeout is encountered. StatusCode.UNAVAILABLE when the service is unreachable. + cortex.lib.exceptions.CortexException if a non-0 response code is returned (i.e. model couldn't be loaded). + cortex.lib.exceptions.UserException when a model couldn't be validated for the signature def. + """ + for client in self._clients: + client.add_models( + model_names, + model_versions, + model_disk_paths, + signature_keys, + skip_if_present, + timeout, + max_retries, + ) + + def remove_models( + self, + model_names: List[str], + model_versions: List[List[str]], + timeout: Optional[float] = None, + ) -> None: + """ + Remove the same models from multiple TFS servers. + + Args: + model_names: List of model names to add. + model_versions: List of lists - each element is a list of versions for a given model name. + Raises: + grpc.RpcError in case something bad happens while communicating. + StatusCode.DEADLINE_EXCEEDED when timeout is encountered. StatusCode.UNAVAILABLE when the service is unreachable. + cortex.lib.exceptions.CortexException if a non-0 response code is returned (i.e. model couldn't be unloaded). + """ + for client in self._clients: + client.remove_models(model_names, model_versions, timeout) + + def poll_available_model_versions(self, model_name: str) -> List[str]: + """ + Gets the available model versions from TFS. + Since all TFS servers are assumed to have the same models in memory, it makes sense to just poll one. 
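A hedged usage sketch of `TensorFlowServingAPIClones` follows; the addresses, model name, versions, and disk path are illustrative, and the import path is an assumption rather than something stated in this diff.

```python
from cortex.lib.model.tfs import TensorFlowServingAPIClones  # assumed import path

clones = TensorFlowServingAPIClones(["localhost:9000", "localhost:9001"])

if clones.is_tfs_accessible():
    # every call is replayed against each TFS clone so they stay in sync
    clones.add_models(
        model_names=["modelA"],
        model_versions=[["1", "2"]],
        model_disk_paths=["/mnt/model/modelA"],
        signature_keys=[None],  # None falls back to the "predict" signature key
        skip_if_present=True,
        timeout=60.0,
    )
    # all clones are assumed identical, so only the first one is polled
    print(clones.poll_available_model_versions("modelA"))
```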
+ + Args: + model_name: The model name to check for versions. + + Returns: + List of the available versions for the given model from TFS. + """ + + return self._clients[0].poll_available_model_versions(model_name) + + def get_registered_model_ids(self) -> List[str]: + """ + Get the registered model IDs (doesn't poll the TFS server). + Since all TFS servers are assumed to have the same models in memory, it makes sense to just poll one. + """ + return self._clients[0].get_registered_model_ids() + + @property + def models(self) -> dict: + return self._clients[0].models diff --git a/pkg/workloads/cortex/lib/model/tree.py b/pkg/workloads/cortex/lib/model/tree.py new file mode 100644 index 0000000000..0e6d193652 --- /dev/null +++ b/pkg/workloads/cortex/lib/model/tree.py @@ -0,0 +1,376 @@ +# Copyright 2020 Cortex Labs, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import time +import datetime +import shutil +import threading as td +from typing import List, Dict, Any, Tuple, Callable, AbstractSet + +from cortex.lib.log import cx_logger as logger +from cortex.lib.concurrency import ReadWriteLock +from cortex.lib.exceptions import WithBreak + + +class ModelsTree: + """ + Model tree for S3-provided models. + """ + + def __init__(self): + self.models = {} + self._locks = {} + self._create_lock = td.RLock() + self._removable = set() + + def acquire(self, mode: str, model_name: str, model_version: str) -> None: + """ + Acquire shared/exclusive (R/W) access for a specific model. Use this when multiple threads are used. + + Args: + mode: "r" for read lock, "w" for write lock. + model_name: The name of the model. + model_version: The version of the model. + """ + model_id = f"{model_name}-{model_version}" + + if not model_id in self._locks: + lock = ReadWriteLock() + self._create_lock.acquire() + if model_id not in self._locks: + self._locks[model_id] = lock + self._create_lock.release() + + self._locks[model_id].acquire(mode) + + def release(self, mode: str, model_name: str, model_version: str) -> None: + """ + Release shared/exclusive (R/W) access for a specific model. Use this when multiple threads are used. + + Args: + mode: "r" for read lock, "w" for write lock. + model_name: The name of the model. + model_version: The version of the model. + """ + model_id = f"{model_name}-{model_version}" + self._locks[model_id].release(mode) + + def update_models( + self, + model_names: List[str], + model_versions: Dict[str, List[str]], + model_paths: List[str], + sub_paths: List[List[str]], + timestamps: List[List[datetime.datetime]], + bucket_names: List[str], + ) -> Tuple[AbstractSet[str], AbstractSet[str]]: + """ + Updates the model tree with the latest from the upstream and removes stale models. + + Locking is not required. Locking is already done within the method. + + Args: + model_names: The unique names of the models as discovered in models:dir or specified in models:paths. + model_versions: The detected versions of each model. If the list is empty, then version "1" should be assumed. 
The dictionary keys represent the models' names. + model_paths: S3 model paths to each model. + sub_paths: A list of filepaths lists for each file of each model. + timestamps: When was each versioned model updated the last time on the upstream. When no versions are passed, a timestamp is still expected. + bucket_names: A list with the bucket_names required for each model. + + Returns: + The loaded model IDs ("-": , + } + And where "versions" represents the available versions of a model and each "timestamps" element is the corresponding + last-edit time of each versioned model. + """ + + current_model_ids = set() + updated_model_ids = set() + for idx in range(len(model_names)): + model_name = model_names[idx] + + if len(model_versions[model_name]) == 0: + model_id = f"{model_name}-1" + with LockedModelsTree(self, "w", model_name, "1"): + updated = self.update_model( + bucket_names[idx], + model_name, + "1", + model_paths[idx], + sub_paths[idx], + timestamps[idx][0], + True, + ) + current_model_ids.add(model_id) + if updated: + updated_model_ids.add(model_id) + + for v_idx, model_version in enumerate(model_versions[model_name]): + model_id = f"{model_name}-{model_version}" + with LockedModelsTree(self, "w", model_name, model_version): + updated = self.update_model( + bucket_names[idx], + model_name, + model_version, + os.path.join(model_paths[idx], model_version) + "/", + sub_paths[idx], + timestamps[idx][v_idx], + True, + ) + current_model_ids.add(model_id) + if updated: + updated_model_ids.add(model_id) + + old_model_ids = set(self.models.keys()) - current_model_ids + + for old_model_id in old_model_ids: + model_name, model_version = old_model_id.rsplit("-", maxsplit=1) + if old_model_id not in self._removable: + continue + with LockedModelsTree(self, "w", model_name, model_version): + del self.models[old_model_id] + self._removable = self._removable - set([old_model_id]) + + return old_model_ids, updated_model_ids + + def update_model( + self, + bucket: str, + model_name: str, + model_version: str, + model_path: str, + sub_paths: List[str], + timestamp: datetime.datetime, + removable: bool, + ) -> None: + """ + Updates the model tree with the given model. + + Locking is required. + + Args: + bucket: The S3 bucket on which the model is stored. + model_name: The unique name of the model as discovered in models:dir or specified in models:paths. + model_version: A detected version of the model. + model_path: The model path to the versioned model. + sub_paths: A list of filepaths for each file of the model. + timestamp: When was the model path updated the last time. + removable: If update_models method is allowed to remove the model. + + Returns: + True if the model wasn't in the tree or if the timestamp is newer. False otherwise. + """ + + model_id = f"{model_name}-{model_version}" + has_changed = False + if model_id not in self.models: + has_changed = True + elif self.models[model_id]["timestamp"] < timestamp: + has_changed = True + + if has_changed or model_id in self.models: + self.models[model_id] = { + "bucket": bucket, + "path": model_path, + "sub_paths": sub_paths, + "timestamp": timestamp, + } + if removable: + self._removable.add(model_id) + else: + self._removable = self._removable - set([model_id]) + + return has_changed + + def model_info(self, model_name: str) -> dict: + """ + Gets model info about the available versions and model timestamps. + + Locking is not required. + + Returns: + A dict with keys "bucket", "model_paths, "versions" and "timestamps". 
+ "model_paths" contains the S3 prefixes of each versioned model, "versions" represents the available versions of the model, + and each "timestamps" element is the corresponding last-edit time of each versioned model. + + Empty lists are returned if the model is not found. + + Example of returned dictionary for model_name. + ```json + { + "bucket": "bucket-0", + "model_paths": ["modelA/1", "modelA/4", "modelA/7", ...], + "versions": [1,4,7, ...], + "timestamps": [12884999, 12874449, 12344931, ...] + } + ``` + """ + + info = { + "model_paths": [], + "versions": [], + "timestamps": [], + } + + # to ensure atomicity + models = self.models.copy() + for model_id in models: + _model_name, model_version = model_id.rsplit("-", maxsplit=1) + if _model_name == model_name: + if "bucket" not in info: + info["bucket"] = models[model_id]["bucket"] + info["model_paths"] += [os.path.join(models[model_id]["path"], model_version)] + info["versions"] += [model_version] + info["timestamps"] += [models[model_id]["timestamp"]] + + return info + + def get_model_names(self) -> List[str]: + """ + Gets the available model names. + + Locking is not required. + + Returns: + List of all model names. + """ + model_names = set() + + # to ensure atomicity + models = self.models.copy() + for model_id in models: + model_name = model_id.rsplit("-", maxsplit=1)[0] + model_names.add(model_name) + + return list(model_names) + + def get_all_models_info(self) -> dict: + """ + Gets model info about the available versions and model timestamps. + + Locking is not required. + + It's like model_info method, but for all model names. + + Example of returned dictionary. + ```json + { + ... + "modelA": { + "bucket": "bucket-0", + "model_paths": ["modelA/1", "modelA/4", "modelA/7", ...], + "versions": ["1","4","7", ...], + "timestamps": [12884999, 12874449, 12344931, ...] + } + ... + } + ``` + """ + + models_info = {} + # to ensure atomicity + models = self.models.copy() + + # extract model names + model_names = set() + for model_id in models: + model_name = model_id.rsplit("-", maxsplit=1)[0] + model_names.add(model_name) + model_names = list(model_names) + + # build models info dictionary + for model_name in model_names: + model_info = { + "model_paths": [], + "versions": [], + "timestamps": [], + } + for model_id in models: + _model_name, model_version = model_id.rsplit("-", maxsplit=1) + if _model_name == model_name: + if "bucket" not in model_info: + model_info["bucket"] = models[model_id]["bucket"] + model_info["model_paths"] += [ + os.path.join(models[model_id]["path"], model_version) + ] + model_info["versions"] += [model_version] + model_info["timestamps"] += [int(models[model_id]["timestamp"].timestamp())] + + models_info[model_name] = model_info + + return models_info + + def __getitem__(self, model_id: str) -> dict: + """ + Each value of a key (model ID) is a dictionary with the following format: + { + "bucket": , + "path": , + "sub_paths": , + "timestamp": + } + + Locking is required. + """ + return self.models[model_id].copy() + + def __contains__(self, model_id: str) -> bool: + """ + Each value of a key (model ID) is a dictionary with the following format: + { + "bucket": , + "path": , + "sub_paths": , + "timestamp": + } + + Locking is required. + """ + return model_id in self.models + + +class LockedModelsTree: + """ + When acquiring shared/exclusive (R/W) access to a model resource (model name + version). + + Locks just for a specific model. 
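As a hedged sketch of how the tree and its lock wrapper defined in this file are intended to be used together (the import path for the tree module is assumed; the bucket, model name, and paths are illustrative):

```python
import datetime

from cortex.lib.exceptions import WithBreak
from cortex.lib.model.tree import ModelsTree, LockedModelsTree  # assumed import path

tree = ModelsTree()

# exclusive ("w") access while registering a version; raising WithBreak exits the
# with-block without the exception propagating (see LockedModelsTree.__exit__)
with LockedModelsTree(tree, "w", "modelA", "1"):
    if "modelA-1" in tree:
        raise WithBreak
    tree.update_model(
        bucket="example-bucket",
        model_name="modelA",
        model_version="1",
        model_path="modelA/1/",
        sub_paths=["modelA/1/saved_model.pb"],
        timestamp=datetime.datetime.now(),
        removable=True,
    )

# available versions and timestamps registered for modelA
print(tree.model_info("modelA"))
```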
Apply read lock when granting shared access or write lock when it's exclusive access (for adding/removing operations). + + The context manager can be exited by raising cortex.lib.exceptions.WithBreak. + """ + + def __init__(self, tree: ModelsTree, mode: str, model_name: str, model_version: str): + """ + mode can be "r" for read or "w" for write. + """ + self._tree = tree + self._mode = mode + self._model_name = model_name + self._model_version = model_version + + def __enter__(self): + self._tree.acquire(self._mode, self._model_name, self._model_version) + return self + + def __exit__(self, exc_type, exc_value, traceback) -> bool: + self._tree.release(self._mode, self._model_name, self._model_version) + + if exc_value is not None and exc_type is not WithBreak: + return False + return True diff --git a/pkg/workloads/cortex/lib/model/type.py b/pkg/workloads/cortex/lib/model/type.py new file mode 100644 index 0000000000..b4641f42dd --- /dev/null +++ b/pkg/workloads/cortex/lib/model/type.py @@ -0,0 +1,139 @@ +# Copyright 2020 Cortex Labs, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List, Optional + + +class CuratedModelResources: + def __init__(self, curated_model_resources: List[dict]): + """ + curated_model_resources must have the format enforced by the CLI's validation process of cortex.yaml. + curated_model_resources is an identical copy of pkg.type.spec.api.API.CuratedModelResources. + + An example of curated_model_resources object: + [ + { + 'model_path': 's3://cortex-examples/models/tensorflow/transformer/', + 'name': 'modelB', + 's3_path': True, + 'signature_key': None, + 'versions': [1554540232] + }, + ... + ] + """ + self._models = curated_model_resources + + for res in self._models: + if not res["versions"]: + res["versions"] = [] + else: + res["versions"] = [str(version) for version in res["versions"]] + + def is_local(self, name: str) -> Optional[bool]: + """ + Checks if the model has been made available from the local disk. + + Args: + name: Name of the model as specified in predictor:models:paths:name or if a single model is specified, _cortex_default. + + Returns: + If the model is local. None if the model wasn't found. + """ + for model in self._models: + if model["name"] == name: + return not model["s3_path"] + return None + + def get_field(self, field: str) -> List[str]: + """ + Get a list of the values of each models' specified field. + + Args: + field: name, s3_path, signature_key or versions. + + Returns: + A list with the specified value of each model. + """ + return [model[field] for model in self._models] + + def get_versions_for(self, name: str) -> Optional[List[str]]: + """ + Get versions for a given model name. + + Args: + name: Name of the model (_cortex_default for predictor:model_path) or predictor:models:paths:name. + + Returns: + Versions for a given model. None if the model wasn't found. 
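A minimal usage sketch of `CuratedModelResources`, built from the example object shown in its class docstring; the import path is assumed, and the assertions only restate behavior visible in the methods above.

```python
from cortex.lib.model.type import CuratedModelResources  # assumed import path

resources = CuratedModelResources([
    {
        "model_path": "s3://cortex-examples/models/tensorflow/transformer/",
        "name": "modelB",
        "s3_path": True,
        "signature_key": None,
        "versions": [1554540232],
    }
])

assert resources.is_local("modelB") is False                   # served from S3, not local disk
assert resources.get_versions_for("modelB") == ["1554540232"]  # versions are normalized to strings
assert resources.get_s3_model_names() == ["modelB"]
assert "modelB" in resources
```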
+ """ + versions = [] + model_not_found = True + for i, _ in enumerate(self._models): + if self._models[i]["name"] == name: + versions = self._models[i]["versions"] + model_not_found = False + break + + if model_not_found: + return None + return [str(version) for version in versions] + + def get_local_model_names(self) -> List[str]: + """ + Get locally-provided models as specified with predictor:model_path, predictor:models:paths or predictor:models:dir. + + Returns: + A list of names of all local models. + """ + local_model_names = [] + for model_name in self.get_field("name"): + if self.is_local(model_name): + local_model_names.append(model_name) + + return local_model_names + + def get_s3_model_names(self) -> List[str]: + """ + Get S3-provided models as specified with predictor:model_path, predictor:models:paths or predictor:models:dir. + + Returns: + A list of names of all models available from S3. + """ + s3_model_names = [] + for model_name in self.get_field("name"): + if not self.is_local(model_name): + s3_model_names.append(model_name) + + return s3_model_names + + def __getitem__(self, name: str) -> dict: + """ + Gets the model resource for a given model name. + """ + for model in self._models: + if model["name"] == name: + return model + + raise KeyError(f"model resource {name} does not exit") + + def __contains__(self, name: str) -> bool: + """ + Checks if there's a model resource whose name is the provided one. + """ + try: + self[name] + return True + except KeyError: + return False diff --git a/pkg/workloads/cortex/lib/model/validation.py b/pkg/workloads/cortex/lib/model/validation.py new file mode 100644 index 0000000000..433b998e89 --- /dev/null +++ b/pkg/workloads/cortex/lib/model/validation.py @@ -0,0 +1,527 @@ +# Copyright 2020 Cortex Labs, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import operator +import uuid +import collections +from enum import IntEnum +from typing import List, Any, Tuple +from fnmatch import fnmatchcase + +from cortex.lib import util +from cortex.lib.storage import S3, LocalStorage +from cortex.lib.log import cx_logger as logger +from cortex.lib.exceptions import CortexException +from cortex.lib.type import ( + PythonPredictorType, + TensorFlowPredictorType, + TensorFlowNeuronPredictorType, + ONNXPredictorType, + PredictorType, +) + + +class TemplatePlaceholder(collections.namedtuple("TemplatePlaceholder", "placeholder priority")): + """ + Placeholder type that denotes an operation, a text placeholder, etc. + + Accessible properties: type, priority. 
+ """ + + def __new__(cls, placeholder: str, priority: int): + return super(cls, TemplatePlaceholder).__new__(cls, "<" + placeholder + ">", priority) + + def __str__(self) -> str: + return str(self.placeholder) + + def __repr__(self) -> str: + return str(self.placeholder) + + @property + def type(self) -> str: + return str(self.placeholder).strip("<>") + + +class GenericPlaceholder( + collections.namedtuple("GenericPlaceholder", "placeholder value priority") +): + """ + Generic placeholder. + + Can hold any value. + Can be of one type only: generic. + + Accessible properties: placeholder, value, type, priority. + """ + + def __new__(cls, value: str): + return super(cls, GenericPlaceholder).__new__(cls, "", value, 0) + + def __eq__(self, other) -> bool: + return isinstance(other, GenericPlaceholder) + + def __hash__(self): + return hash((self.placeholder, self.value)) + + def __str__(self) -> str: + return f"<{self.type}>" + str(self.value) + f"" + + def __repr__(self) -> str: + return f"<{self.type}>" + str(self.value) + f"" + + @property + def type(self) -> str: + return str(self.placeholder).strip("<>") + + +class PlaceholderGroup: + """ + Order-based addition of placeholder types. + Can use AnyPlaceholder, GenericPlaceholder, SinglePlaceholder. + + Accessible properties: parts, type, priority. + """ + + def __init__(self, *args, priority=0): + self.parts = args + self.priority = priority + + def __getitem__(self, index: int): + return self.parts[index] + + def __len__(self) -> int: + return len(self.parts) + + def __str__(self) -> str: + return "" + str(self.parts) + "" + + def __repr__(self) -> str: + return "" + str(self.parts) + "" + + @property + def type(self) -> str: + return str(self.parts) + + +class OneOfAllPlaceholder: + """ + Can be any of the provided alternatives. + + Accessible properties: parts, type, priority, ID. + """ + + def __init__(self, ID: Any = None): + self._placeholder = TemplatePlaceholder("oneofall", priority=-1) + if not ID: + ID = uuid.uuid4().int + self.ID = ID + + def __str__(self) -> str: + return str(self._placeholder) + + def __repr__(self) -> str: + return str(self._placeholder) + + @property + def type(self) -> str: + return str(self._placeholder).strip("<>") + + @property + def priority(self) -> int: + return self._placeholder.priority + + +IntegerPlaceholder = TemplatePlaceholder("integer", priority=1) # the path name must be an integer +SinglePlaceholder = TemplatePlaceholder( + "single", priority=2 +) # can only have a single occurrence of this, but its name can take any form +AnyPlaceholder = TemplatePlaceholder( + "any", priority=4 +) # the path can be any file or any directory (with multiple subdirectories) + + +class ModelVersion(IntEnum): + NOT_PROVIDED = 1 # for models provided without a specific version + PROVIDED = 2 # for models provided with version directories (1, 2, 452, etc). 
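Before the `ModelTemplate` definition that follows, a hedged sketch of the kind of layout these placeholders are meant to accept; the file names are illustrative and the expected outcome is stated in comments rather than asserted.

```python
# Illustrative TensorFlow SavedModel layout with an integer version directory.
# Under the ModelVersion.PROVIDED alternative of the TensorFlow template defined below,
# validate_model_paths would be expected to accept this layout and return the matching
# OneOfAllPlaceholder ID; a layout missing saved_model.pb would raise CortexException.
paths = [
    "modelA/1568195247/saved_model.pb",
    "modelA/1568195247/variables/variables.index",
    "modelA/1568195247/variables/variables.data-00000-of-00001",
]
# ooa_ids = validate_model_paths(paths, TensorFlowPredictorType, "modelA")
```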
+ + +# to be used when predictor:model_path, predictor:models:paths or predictor:models:dir is used +ModelTemplate = { + PythonPredictorType: { + OneOfAllPlaceholder(ModelVersion.PROVIDED): { + IntegerPlaceholder: AnyPlaceholder, + }, + OneOfAllPlaceholder(ModelVersion.NOT_PROVIDED): { + AnyPlaceholder: None, + }, + }, + TensorFlowPredictorType: { + OneOfAllPlaceholder(ModelVersion.PROVIDED): { + IntegerPlaceholder: { + AnyPlaceholder: None, + GenericPlaceholder("saved_model.pb"): None, + GenericPlaceholder("variables"): { + GenericPlaceholder("variables.index"): None, + PlaceholderGroup( + GenericPlaceholder("variables.data-00000-of-"), AnyPlaceholder + ): None, + AnyPlaceholder: None, + }, + }, + }, + OneOfAllPlaceholder(ModelVersion.NOT_PROVIDED): { + AnyPlaceholder: None, + GenericPlaceholder("saved_model.pb"): None, + GenericPlaceholder("variables"): { + GenericPlaceholder("variables.index"): None, + PlaceholderGroup( + GenericPlaceholder("variables.data-00000-of-"), AnyPlaceholder + ): None, + AnyPlaceholder: None, + }, + }, + }, + TensorFlowNeuronPredictorType: { + OneOfAllPlaceholder(ModelVersion.PROVIDED): { + IntegerPlaceholder: { + GenericPlaceholder("saved_model.pb"): None, + AnyPlaceholder: None, + } + }, + OneOfAllPlaceholder(ModelVersion.NOT_PROVIDED): { + GenericPlaceholder("saved_model.pb"): None, + AnyPlaceholder: None, + }, + }, + ONNXPredictorType: { + OneOfAllPlaceholder(ModelVersion.PROVIDED): { + IntegerPlaceholder: { + PlaceholderGroup(SinglePlaceholder, GenericPlaceholder(".onnx")): None, + }, + }, + OneOfAllPlaceholder(ModelVersion.NOT_PROVIDED): { + PlaceholderGroup(SinglePlaceholder, GenericPlaceholder(".onnx")): None, + }, + }, +} + + +def json_model_template_representation(model_template) -> dict: + dct = {} + if model_template is None: + return None + if isinstance(model_template, dict): + if any(isinstance(x, OneOfAllPlaceholder) for x in model_template): + oneofall_placeholder_index = 0 + for key in model_template: + if isinstance(key, OneOfAllPlaceholder): + dct[ + str(key) + f"-{oneofall_placeholder_index}" + ] = json_model_template_representation(model_template[key]) + oneofall_placeholder_index += 1 + else: + dct[str(key)] = json_model_template_representation(model_template[key]) + return dct + else: + return str(model_template) + + +def _single_model_pattern(predictor_type: PredictorType) -> dict: + """ + To be used when predictor:model_path or predictor:models:paths in cortex.yaml is used. + """ + return ModelTemplate[predictor_type] + + +def validate_models_dir_paths( + paths: List[str], predictor_type: PredictorType, common_prefix: str +) -> Tuple[List[str], List[List[int]]]: + """ + Validates the models paths based on the given predictor type. + To be used when predictor:models:dir in cortex.yaml is used. + + Args: + paths: A list of all paths for a given S3/local prefix. Must be underneath the common prefix. + predictor_type: The predictor type. + common_prefix: The common prefix of the directory which holds all models. AKA predictor:models:dir. + + Returns: + List with the prefix of each model that's valid. + List with the OneOfAllPlaceholder IDs validated for each valid model. 
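A hedged reading of `validate_models_dir_paths` as documented above: given a models:dir listing with one conforming model and one that matches no template alternative, only the conforming prefix is expected back, with the failure logged and skipped. The names below are illustrative.

```python
# Illustrative models:dir layout; outcomes are stated as comments, not asserted.
paths = [
    "models/modelA/1/saved_model.pb",
    "models/modelA/1/variables/variables.index",
    "models/modelA/1/variables/variables.data-00000-of-00001",
    "models/broken/readme.txt",
]
# valid_prefixes, ooa_ids = validate_models_dir_paths(paths, TensorFlowPredictorType, "models")
# valid_prefixes would be expected to be ["models/modelA"]; "models/broken" matches neither
# template alternative for the tensorflow predictor type, so it is logged and skipped.
```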
+ """ + if len(paths) == 0: + raise CortexException( + f"{predictor_type} predictor at '{common_prefix}'", "model top path can't be empty" + ) + + rel_paths = [os.path.relpath(top_path, common_prefix) for top_path in paths] + rel_paths = [path for path in rel_paths if not path.startswith("../")] + + model_names = [util.get_leftmost_part_of_path(path) for path in rel_paths] + model_names = list(set(model_names)) + + valid_model_prefixes = [] + ooa_valid_key_ids = [] + for model_name in model_names: + try: + ooa_valid_key_ids.append(validate_model_paths(rel_paths, predictor_type, model_name)) + valid_model_prefixes.append(os.path.join(common_prefix, model_name)) + except CortexException as e: + logger().debug(f"failed validating model {model_name}: {str(e)}") + continue + + return valid_model_prefixes, ooa_valid_key_ids + + +def validate_model_paths( + paths: List[str], predictor_type: PredictorType, common_prefix: str +) -> List[int]: + """ + To be used when predictor:model_path or predictor:models:paths in cortex.yaml is used. + + Args: + paths: A list of all paths for a given S3/local prefix. Must be the top directory of a model. + predictor_type: Predictor type. Can be PythonPredictorType, TensorFlowPredictorType, TensorFlowNeuronPredictorType or ONNXPredictorType. + common_prefix: The common prefix of the directory which holds all models. + + Returns: + List of all OneOfAllPlaceholder IDs that had been validated. + + Exception: + CortexException if the paths don't match the model's template. + """ + if len(paths) == 0: + raise CortexException( + f"{predictor_type} predictor at '{common_prefix}'", "model path can't be empty" + ) + + def _validate_model_paths(pattern: Any, paths: List[str], common_prefix: str) -> None: + rel_paths = [os.path.relpath(path, common_prefix) for path in paths] + rel_paths = [path for path in rel_paths if not path.startswith("../")] + + objects = [util.get_leftmost_part_of_path(path) for path in rel_paths] + objects = list(set(objects)) + visited_objects = len(objects) * [False] + + ooa_valid_key_ids = [] # OneOfAllPlaceholder IDs that are valid + + if pattern is None: + if len(objects) == 1 and objects[0] == ".": + return ooa_valid_key_ids + raise CortexException( + f"{predictor_type} predictor at '{common_prefix}'", + "template doesn't specify a substructure for the given path", + ) + if not isinstance(pattern, dict): + pattern = {pattern: None} + + keys = list(pattern.keys()) + keys.sort(key=operator.attrgetter("priority")) + + try: + if any(isinstance(x, OneOfAllPlaceholder) for x in keys) and not all( + isinstance(x, OneOfAllPlaceholder) for x in keys + ): + raise CortexException( + f"{predictor_type} predictor at '{common_prefix}'", + f"{OneOfAllPlaceholder()} is a mutual-exclusive key with all other keys", + ) + elif all(isinstance(x, OneOfAllPlaceholder) for x in keys): + num_keys = len(keys) + num_validation_failures = 0 + + for key_id, key in enumerate(keys): + if key == IntegerPlaceholder: + _validate_integer_placeholder(keys, key_id, objects, visited_objects) + elif key == AnyPlaceholder: + _validate_any_placeholder(keys, key_id, objects, visited_objects) + elif key == SinglePlaceholder: + _validate_single_placeholder(keys, key_id, objects, visited_objects) + elif isinstance(key, GenericPlaceholder): + _validate_generic_placeholder(keys, key_id, objects, visited_objects, key) + elif isinstance(key, PlaceholderGroup): + _validate_group_placeholder(keys, key_id, objects, visited_objects) + elif isinstance(key, OneOfAllPlaceholder): + try: + 
_validate_model_paths(pattern[key], paths, common_prefix) + ooa_valid_key_ids.append(key.ID) + except CortexException: + num_validation_failures += 1 + else: + raise CortexException("found a non-placeholder object in model template") + + except CortexException as e: + raise CortexException(f"{predictor_type} predictor at '{common_prefix}'", str(e)) + + if ( + all(isinstance(x, OneOfAllPlaceholder) for x in keys) + and num_validation_failures == num_keys + ): + raise CortexException( + f"couldn't validate for any of the {OneOfAllPlaceholder()} placeholders" + ) + if all(isinstance(x, OneOfAllPlaceholder) for x in keys): + return ooa_valid_key_ids + + unvisited_paths = [] + for idx, visited in enumerate(visited_objects): + if visited is False: + untraced_common_prefix = os.path.join(common_prefix, objects[idx]) + untraced_paths = [os.path.relpath(path, untraced_common_prefix) for path in paths] + untraced_paths = [ + os.path.join(objects[idx], path) + for path in untraced_paths + if not path.startswith("../") + ] + unvisited_paths += untraced_paths + if len(unvisited_paths) > 0: + raise CortexException( + f"{predictor_type} predictor model at '{common_prefix}'", + "unexpected path(s) for " + str(unvisited_paths), + ) + + aggregated_ooa_valid_key_ids = [] + for obj_id, key_id in enumerate(visited_objects): + obj = objects[obj_id] + key = keys[key_id] + + new_common_prefix = os.path.join(common_prefix, obj) + sub_pattern = pattern[key] + + if key != AnyPlaceholder: + aggregated_ooa_valid_key_ids += _validate_model_paths( + sub_pattern, paths, new_common_prefix + ) + + return aggregated_ooa_valid_key_ids + + pattern = _single_model_pattern(predictor_type) + return _validate_model_paths(pattern, paths, common_prefix) + + +def _validate_integer_placeholder( + placeholders: list, key_id: int, objects: List[str], visited: list +) -> None: + appearances = 0 + for idx, obj in enumerate(objects): + if obj.isnumeric() and visited[idx] is False: + visited[idx] = key_id + appearances += 1 + + if appearances > 1 and len(placeholders) > 1: + raise CortexException(f"too many {IntegerPlaceholder} appearances in path") + if appearances == 0: + raise CortexException(f"{IntegerPlaceholder} not found in path") + + +def _validate_any_placeholder( + placeholders: list, + key_id: int, + objects: List[str], + visited: list, +) -> None: + for idx, obj in enumerate(objects): + if visited[idx] is False and obj != ".": + visited[idx] = key_id + + +def _validate_single_placeholder( + placeholders: list, key_id: int, objects: List[str], visited: list +) -> None: + if len(placeholders) > 1 or len(objects) > 1: + raise CortexException(f"only a single {SinglePlaceholder} is allowed per directory") + if len(visited) > 0 and visited[0] is False: + visited[0] = key_id + + +def _validate_generic_placeholder( + placeholders: list, + key_id: int, + objects: List[str], + visited: list, + generical: GenericPlaceholder, +) -> None: + found = False + for idx, obj in enumerate(objects): + if obj == generical.value: + if visited[idx] is False: + visited[idx] = key_id + found = True + return + + if not found: + raise CortexException(f"{generical.type} placeholder for {generical} wasn't found") + + +def _validate_group_placeholder( + placeholders: list, key_id: int, objects: List[str], visited: list +) -> None: + """ + Can use AnyPlaceholder, GenericPlaceholder, SinglePlaceholder. + + The minimum number of placeholders a group must hold is 2. + + The accepted formats are: + - ... AnyPlaceholder, GenericPlaceholder, AnyPlaceholder, ... 
+ - ... SinglePlaceholder, GenericPlaceholder, SinglePlaceholder, ... + + AnyPlaceholder and SinglePlaceholder cannot be mixed together in one group. + """ + + placeholder_group = placeholders[key_id] + + if len(placeholder_group) < 2: + raise CortexException(f"{placeholder_group} must come with at least 2 placeholders") + + for placeholder in placeholder_group: + if placeholder not in [AnyPlaceholder, SinglePlaceholder] and not isinstance( + placeholder, GenericPlaceholder + ): + raise CortexException( + f'{placeholder_group} must have a combination of the following placeholder types: {AnyPlaceholder}, {SinglePlaceholder}, {GenericPlaceholder("").placeholder}' + ) + + if {AnyPlaceholder, SinglePlaceholder}.issubset(set(placeholder_group)): + raise CortexException( + f"{placeholder_group} cannot have a mix of the following placeholder types: {AnyPlaceholder} and {SinglePlaceholder}" + ) + + group_len = len(placeholder_group) + for idx in range(group_len): + if idx + 1 < group_len: + a = placeholder_group[idx] + b = placeholder_group[idx + 1] + if a == b: + raise CortexException( + f'{placeholder_group} cannot accept the same type to be specified consecutively ({AnyPlaceholder}, {SinglePlaceholder} or {GenericPlaceholder("").placeholder})' + ) + + pattern = "" + for placeholder in placeholder_group: + if placeholder in [AnyPlaceholder, SinglePlaceholder]: + pattern += "*" + if isinstance(placeholder, GenericPlaceholder): + pattern += str(placeholder.value) + + num_occurences = 0 + for idx, obj in enumerate(objects): + if visited[idx] is False and fnmatchcase(obj, pattern): + visited[idx] = key_id + num_occurences += 1 + + if SinglePlaceholder in placeholder_group and num_occurences > 1: + raise CortexException( + f"{placeholder_group} must match once (not {num_occurences} times) because {SinglePlaceholder} is present" + ) diff --git a/pkg/workloads/cortex/lib/server/tensorflow.py b/pkg/workloads/cortex/lib/server/tensorflow.py deleted file mode 100644 index 33fe4dc17b..0000000000 --- a/pkg/workloads/cortex/lib/server/tensorflow.py +++ /dev/null @@ -1,112 +0,0 @@ -# Copyright 2020 Cortex Labs, Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import grpc -import time -import threading - -from tensorflow_serving.apis import model_service_pb2_grpc -from tensorflow_serving.apis import model_management_pb2 -from tensorflow_serving.config import model_server_config_pb2 - -from cortex.lib.exceptions import CortexException -from cortex.lib.log import cx_logger - - -class TensorFlowServing: - def __init__(self, address): - self.address = address - self.model_platform = "tensorflow" - self.channel = grpc.insecure_channel(self.address) - self.stub = model_service_pb2_grpc.ModelServiceStub(self.channel) - self.timeout = 600 # gRPC timeout in seconds - - def add_models_config(self, names, base_paths, replace_models=False): - request = model_management_pb2.ReloadConfigRequest() - model_server_config = model_server_config_pb2.ModelServerConfig() - - # create model(s) configuration - config_list = model_server_config_pb2.ModelConfigList() - for i, name in enumerate(names): - model_config = config_list.config.add() - model_config.name = name - model_config.base_path = base_paths[i] - model_config.model_platform = self.model_platform - - if replace_models: - model_server_config.model_config_list.CopyFrom(config_list) - request.config.CopyFrom(model_server_config) - else: - model_server_config.model_config_list.MergeFrom(config_list) - request.config.MergeFrom(model_server_config) - - loaded_models = threading.Event() - - def log_loading_models(): - while not loaded_models.is_set(): - time.sleep(2) - cx_logger().info("model(s) still loading ...") - - log_thread = threading.Thread(target=log_loading_models, daemon=True) - log_thread.start() - - timeout_error_limit = 3 - timeout_error_counter = 0 - generic_error_limit = 200 - generic_error_counter = 0 - - # request TFS to load models - response = None - while True: - try: - # this request doesn't return until all models have been successfully loaded - response = self.stub.HandleReloadConfigRequest(request, self.timeout) - break - except Exception as e: - if not ( - isinstance(e, grpc.RpcError) - and e.code() in [grpc.StatusCode.UNAVAILABLE, grpc.StatusCode.DEADLINE_EXCEEDED] - ): - print(e) # unexpected error - - if isinstance(e, grpc.RpcError) and e.code() == grpc.StatusCode.DEADLINE_EXCEEDED: - timeout_error_counter += 1 - else: - generic_error_counter += 1 - - if timeout_error_counter >= timeout_error_limit: - break - if generic_error_counter >= generic_error_limit: - break - - time.sleep(1.0) - - loaded_models.set() - log_thread.join() - - # report error or success - if response and response.status.error_code == 0: - cx_logger().info("successfully loaded {} models into TF-Serving".format(names)) - else: - if response: - raise CortexException( - "couldn't load user-requested models - failed with error code {}: {}".format( - response.status.error_code, response.status.error_message - ) - ) - else: - raise CortexException("couldn't load user-requested models") - - def add_model_config(self, name, base_path, replace_model=False): - self.add_models_config([name], [base_path], replace_model) diff --git a/pkg/workloads/cortex/lib/storage/__init__.py b/pkg/workloads/cortex/lib/storage/__init__.py index c5b132c6bb..6c67e49a67 100644 --- a/pkg/workloads/cortex/lib/storage/__init__.py +++ b/pkg/workloads/cortex/lib/storage/__init__.py @@ -14,4 +14,3 @@ from cortex.lib.storage.local import LocalStorage from cortex.lib.storage.s3 import S3 -from cortex.lib.storage.concurrency import FileLock diff --git a/pkg/workloads/cortex/lib/storage/concurrency.py b/pkg/workloads/cortex/lib/storage/concurrency.py 
deleted file mode 100644 index 0a6b3b781c..0000000000 --- a/pkg/workloads/cortex/lib/storage/concurrency.py +++ /dev/null @@ -1,90 +0,0 @@ -#!/bin/bash - -# Copyright 2020 Cortex Labs, Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os, fcntl, time - - -class FileLock: - def __init__(self, lock_file, timeout=None): - """ - Lock for files. Not thread-safe. Instantiate one lock per thread. - - lock_file - File to use as lock. - timeout - If used, a timeout exception will be raised if the lock can't be acquired. Measured in seconds. - """ - self._lock_file = lock_file - self._file_handle = None - - self.timeout = timeout - self._time_loop = 0.001 - - # create lock if it doesn't exist - with open(self._lock_file, "w+") as f: - pass - - def acquire(self): - """ - To acquire rw access to resource. - """ - if self._file_handle: - return - - if not self.timeout: - self._file_handle = open(self._lock_file, "w") - fcntl.lockf(self._file_handle, fcntl.LOCK_EX) - else: - start = time.time() - acquired = False - while start + self.timeout >= time.time(): - try: - self._file_handle = open(self._lock_file, "w") - fcntl.lockf(self._file_handle, fcntl.LOCK_EX | fcntl.LOCK_NB) - acquired = True - break - except OSError: - time.sleep(self._time_loop) - - if not acquired: - self._file_handle = None - raise TimeoutError( - "{} ms timeout on acquiring {} lock".format( - int(self.timeout * 1000), self._lock_file - ) - ) - - def release(self): - """ - To release rw access to resource. 
- """ - if not self._file_handle: - return - - fd = self._file_handle - self._file_handle = None - fcntl.lockf(fd, fcntl.LOCK_UN) - fd.close() - - def __enter__(self): - self.acquire() - return self - - def __exit__(self, exc_type, exc_value, traceback): - self.release() - return None - - def __del__(self): - self.release() - return None diff --git a/pkg/workloads/cortex/lib/storage/local.py b/pkg/workloads/cortex/lib/storage/local.py index 0b6d371ab7..3a8e84d9ee 100644 --- a/pkg/workloads/cortex/lib/storage/local.py +++ b/pkg/workloads/cortex/lib/storage/local.py @@ -16,9 +16,9 @@ import pickle import json import msgpack -from pathlib import Path import shutil import time +from pathlib import Path from cortex.lib import util from cortex.lib.exceptions import CortexException diff --git a/pkg/workloads/cortex/lib/storage/s3.py b/pkg/workloads/cortex/lib/storage/s3.py index e498269ce5..3dad153446 100644 --- a/pkg/workloads/cortex/lib/storage/s3.py +++ b/pkg/workloads/cortex/lib/storage/s3.py @@ -19,6 +19,8 @@ import json import msgpack import time +import datetime +from typing import Dict, List, Tuple from cortex.lib import util from cortex.lib.exceptions import CortexException @@ -38,12 +40,27 @@ def __init__(self, bucket=None, region=None, client_config={}): self.s3 = boto3.client("s3", **client_config) @staticmethod - def deconstruct_s3_path(s3_path): + def construct_s3_path(bucket_name: str, prefix: str) -> str: + return f"s3://{bucket_name}/{prefix}" + + @staticmethod + def deconstruct_s3_path(s3_path) -> Tuple[str, str]: path = util.trim_prefix(s3_path, "s3://") bucket = path.split("/")[0] key = os.path.join(*path.split("/")[1:]) return (bucket, key) + @staticmethod + def is_valid_s3_path(path: str) -> bool: + if not path.startswith("s3://"): + return False + parts = path[5:].split("/") + if len(parts) < 2: + return False + if parts[0] == "" or parts[1] == "": + return False + return True + def blob_path(self, key): return os.path.join("s3://", self.bucket, key) @@ -87,7 +104,7 @@ def _get_matching_s3_objects_generator(self, prefix="", suffix=""): def _get_matching_s3_keys_generator(self, prefix="", suffix=""): for obj in self._get_matching_s3_objects_generator(prefix, suffix): - yield obj["Key"] + yield obj["Key"], obj["LastModified"] def put_object(self, body, key): self.s3.put_object(Bucket=self.bucket, Key=key, Body=body) @@ -126,8 +143,20 @@ def _read_bytes_from_s3_single(self, key, allow_missing=False, ext_bucket=None): return byte_array.strip() - def search(self, prefix="", suffix=""): - return list(self._get_matching_s3_keys_generator(prefix, suffix)) + def search(self, prefix="", suffix="") -> Tuple[List[str], List[datetime.datetime]]: + paths = [] + timestamps = [] + + timestamp_map = {} + for key, ts in self._get_matching_s3_keys_generator(prefix, suffix): + timestamp_map[key] = ts + + filtered_keys = util.remove_non_empty_directory_paths(list(timestamp_map.keys())) + for key in filtered_keys: + paths.append(key) + timestamps.append(timestamp_map[key]) + + return paths, timestamps def put_str(self, str_val, key): self.put_object(str_val, key) @@ -185,7 +214,7 @@ def download_dir(self, prefix, local_dir): def download_dir_contents(self, prefix, local_dir): util.mkdir_p(local_dir) prefix = util.ensure_suffix(prefix, "/") - for key in self._get_matching_s3_keys_generator(prefix): + for key, _ in self._get_matching_s3_keys_generator(prefix): if key.endswith("/"): continue rel_path = util.trim_prefix(key, prefix) diff --git a/pkg/workloads/cortex/lib/type/__init__.py 
b/pkg/workloads/cortex/lib/type/__init__.py index d385fe7b2f..c628d1dd9e 100644 --- a/pkg/workloads/cortex/lib/type/__init__.py +++ b/pkg/workloads/cortex/lib/type/__init__.py @@ -12,11 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -from cortex.lib.type.api import API, get_spec -from cortex.lib.type.predictor import Predictor -from cortex.lib.type.monitoring import Monitoring -from cortex.lib.type.model import ( - Model, - get_model_signature_map, - get_model_names, +from cortex.lib.type.type import ( + PythonPredictorType, + TensorFlowPredictorType, + TensorFlowNeuronPredictorType, + ONNXPredictorType, + PredictorType, + predictor_type_from_string, + predictor_type_from_api_spec, ) diff --git a/pkg/workloads/cortex/lib/type/predictor.py b/pkg/workloads/cortex/lib/type/predictor.py deleted file mode 100644 index ca45d52a67..0000000000 --- a/pkg/workloads/cortex/lib/type/predictor.py +++ /dev/null @@ -1,372 +0,0 @@ -# Copyright 2020 Cortex Labs, Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import imp -import inspect -from copy import deepcopy - -import dill - -from cortex.lib.log import refresh_logger, cx_logger -from cortex.lib.exceptions import CortexException, UserException, UserRuntimeException -from cortex.lib.type.model import Model, get_model_signature_map -from cortex.lib.storage.concurrency import FileLock -from cortex import consts -from cortex.lib import util - - -class Predictor: - def __init__(self, provider, model_dir, cache_dir, **kwargs): - self.provider = provider - self.type = kwargs["type"] - self.path = kwargs["path"] - self.python_path = kwargs.get("python_path") - self.config = kwargs.get("config", {}) - self.env = kwargs.get("env") - - self.model_dir = model_dir - self.models = [] - if kwargs.get("models"): - for model in kwargs["models"]: - self.models += [ - Model( - name=model["name"], - model_path=model["model_path"], - base_path=self._compute_model_basepath(model["model_path"], model["name"]), - signature_key=model.get("signature_key"), - ) - ] - - self.cache_dir = cache_dir - - def initialize_client(self, tf_serving_host=None, tf_serving_port=None): - signature_message = None - if self.type == "onnx": - from cortex.lib.client.onnx import ONNXClient - - client = ONNXClient(self.models) - if self.models[0].name == consts.SINGLE_MODEL_NAME: - signature_message = "ONNX model signature: {}".format( - client.input_signatures[consts.SINGLE_MODEL_NAME] - ) - else: - signature_message = "ONNX model signatures: {}".format(client.input_signatures) - cx_logger().info(signature_message) - return client - elif self.type == "tensorflow": - from cortex.lib.client.tensorflow import TensorFlowClient - - for model in self.models: - validate_model_dir(model.base_path) - - tf_serving_address = tf_serving_host + ":" + tf_serving_port - client = TensorFlowClient(tf_serving_address, self.models) - if self.models[0].name == consts.SINGLE_MODEL_NAME: - signature_message = "TensorFlow model signature: 
{}".format( - client.input_signatures[consts.SINGLE_MODEL_NAME] - ) - else: - signature_message = "TensorFlow model signatures: {}".format( - client.input_signatures - ) - cx_logger().info(signature_message) - return client - - return None - - def initialize_impl(self, project_dir, client=None, api_spec=None, job_spec=None): - class_impl = self.class_impl(project_dir) - constructor_args = inspect.getfullargspec(class_impl.__init__).args - - args = {} - - config = deepcopy(api_spec["predictor"]["config"]) - if job_spec is not None and job_spec.get("config") is not None: - util.merge_dicts_in_place_overwrite(config, job_spec["config"]) - - if "config" in constructor_args: - args["config"] = config - if "job_spec" in constructor_args: - args["job_spec"] = job_spec - - try: - if self.type == "onnx": - args["onnx_client"] = client - return class_impl(**args) - elif self.type == "tensorflow": - args["tensorflow_client"] = client - return class_impl(**args) - else: - return class_impl(**args) - except Exception as e: - raise UserRuntimeException(self.path, "__init__", str(e)) from e - finally: - refresh_logger() - - def get_target_and_validations(self): - target_class_name = None - validations = None - - if self.type == "tensorflow": - target_class_name = "TensorFlowPredictor" - validations = TENSORFLOW_CLASS_VALIDATION - elif self.type == "onnx": - target_class_name = "ONNXPredictor" - validations = ONNX_CLASS_VALIDATION - elif self.type == "python": - target_class_name = "PythonPredictor" - validations = PYTHON_CLASS_VALIDATION - - return target_class_name, validations - - def class_impl(self, project_dir): - target_class_name, validations = self.get_target_and_validations() - - try: - with FileLock("/run/init_stagger.lock"): - impl = self._load_module("cortex_predictor", os.path.join(project_dir, self.path)) - except CortexException as e: - e.wrap("error in " + self.path) - raise - finally: - refresh_logger() - - try: - classes = inspect.getmembers(impl, inspect.isclass) - predictor_class = None - for class_df in classes: - if class_df[0] == target_class_name: - if predictor_class is not None: - raise UserException( - "multiple definitions for {} class found; please check your imports and class definitions and ensure that there is only one Predictor class definition".format( - target_class_name - ) - ) - predictor_class = class_df[1] - if predictor_class is None: - raise UserException("{} class is not defined".format(target_class_name)) - - _validate_impl(predictor_class, validations) - except CortexException as e: - e.wrap("error in " + self.path) - raise - return predictor_class - - def _load_module(self, module_name, impl_path): - if impl_path.endswith(".pickle"): - try: - impl = imp.new_module(module_name) - - with open(impl_path, "rb") as pickle_file: - pickled_dict = dill.load(pickle_file) - for key in pickled_dict: - setattr(impl, key, pickled_dict[key]) - except Exception as e: - raise UserException("unable to load pickle", str(e)) from e - else: - try: - impl = imp.load_source(module_name, impl_path) - except Exception as e: - raise UserException(str(e)) from e - - return impl - - def _compute_model_basepath(self, model_path, model_name): - base_path = os.path.join(self.model_dir, model_name) - if self.type == "onnx": - base_path = os.path.join(base_path, os.path.basename(model_path)) - return base_path - - -PYTHON_CLASS_VALIDATION = { - "required": [ - {"name": "__init__", "required_args": ["self", "config"], "optional_args": ["job_spec"]}, - { - "name": "predict", - "required_args": 
["self"], - "optional_args": ["payload", "query_params", "headers", "batch_id"], - }, - ], - "optional": [ - {"name": "on_job_complete", "required_args": ["self"]}, - { - "name": "post_predict", - "required_args": ["self"], - "optional_args": ["response", "payload", "query_params", "headers"], - }, - ], -} - -TENSORFLOW_CLASS_VALIDATION = { - "required": [ - { - "name": "__init__", - "required_args": ["self", "tensorflow_client", "config"], - "optional_args": ["job_spec"], - }, - { - "name": "predict", - "required_args": ["self"], - "optional_args": ["payload", "query_params", "headers", "batch_id"], - }, - ], - "optional": [ - {"name": "on_job_complete", "required_args": ["self"]}, - { - "name": "post_predict", - "required_args": ["self"], - "optional_args": ["response", "payload", "query_params", "headers"], - }, - ], -} - -ONNX_CLASS_VALIDATION = { - "required": [ - { - "name": "__init__", - "required_args": ["self", "onnx_client", "config"], - "optional_args": ["job_spec"], - }, - { - "name": "predict", - "required_args": ["self"], - "optional_args": ["payload", "query_params", "headers", "batch_id"], - }, - ], - "optional": [ - {"name": "on_job_complete", "required_args": ["self"]}, - { - "name": "post_predict", - "required_args": ["self"], - "optional_args": ["response", "payload", "query_params", "headers"], - }, - ], -} - - -def _validate_impl(impl, impl_req): - for optional_func_signature in impl_req.get("optional", []): - _validate_optional_fn_args(impl, optional_func_signature) - - for required_func_signature in impl_req.get("required", []): - _validate_required_fn_args(impl, required_func_signature) - - -def _validate_optional_fn_args(impl, func_signature): - if getattr(impl, func_signature["name"], None): - _validate_required_fn_args(impl, func_signature) - - -def _validate_required_fn_args(impl, func_signature): - fn = getattr(impl, func_signature["name"], None) - if not fn: - raise UserException(f'required function "{func_signature["name"]}" is not defined') - - if not callable(fn): - raise UserException(f'"{func_signature["name"]}" is defined, but is not a function') - - argspec = inspect.getfullargspec(fn) - - required_args = func_signature.get("required_args", []) - optional_args = func_signature.get("optional_args", []) - fn_str = f'{func_signature["name"]}({", ".join(argspec.args)})' - - for arg_name in required_args: - if arg_name not in argspec.args: - raise UserException( - f'invalid signature for function "{fn_str}": "{arg_name}" is a required argument, but was not provided' - ) - - if arg_name == "self": - if argspec.args[0] != "self": - raise UserException( - f'invalid signature for function "{fn_str}": "self" must be the first argument' - ) - - seen_args = [] - for arg_name in argspec.args: - if arg_name not in required_args and arg_name not in optional_args: - raise UserException( - f'invalid signature for function "{fn_str}": "{arg_name}" is not a supported argument' - ) - - if arg_name in seen_args: - raise UserException( - f'invalid signature for function "{fn_str}": "{arg_name}" is duplicated' - ) - - seen_args.append(arg_name) - - -def uses_neuron_savedmodel(): - return os.getenv("CORTEX_ACTIVE_NEURON") != None - - -def get_expected_dir_structure(): - if uses_neuron_savedmodel(): - return neuron_tf_expected_dir_structure - return tf_expected_dir_structure - - -tf_expected_dir_structure = """tensorflow model directories must have the following structure: - 1523423423/ (version prefix, usually a timestamp) - ├── saved_model.pb - └── variables/ - ├── 
variables.index - ├── variables.data-00000-of-00003 - ├── variables.data-00001-of-00003 - └── variables.data-00002-of-...`""" - -neuron_tf_expected_dir_structure = """neuron tensorflow model directories must have the following structure: - 1523423423/ (version prefix, usually a timestamp) - └── saved_model.pb`""" - - -def validate_model_dir(model_dir): - version = None - for file_name in os.listdir(model_dir): - if file_name.isdigit(): - version = file_name - break - - if version is None: - cx_logger().error(get_expected_dir_structure()) - raise UserException("no top-level version folder found") - - if not os.path.isdir(os.path.join(model_dir, version)): - cx_logger().error(get_expected_dir_structure()) - raise UserException("no top-level version folder found") - - if not os.path.isfile(os.path.join(model_dir, version, "saved_model.pb")): - cx_logger().error(get_expected_dir_structure()) - raise UserException('expected a "saved_model.pb" file') - - if not uses_neuron_savedmodel(): - if not os.path.isdir(os.path.join(model_dir, version, "variables")): - cx_logger().error(tf_expected_dir_structure) - raise UserException('expected a "variables" directory') - - if not os.path.isfile(os.path.join(model_dir, version, "variables", "variables.index")): - cx_logger().error(tf_expected_dir_structure) - raise UserException('expected a "variables/variables.index" file') - - for file_name in os.listdir(os.path.join(model_dir, version, "variables")): - if file_name.startswith("variables.data-00000-of"): - return - - cx_logger().error(tf_expected_dir_structure) - raise UserException( - 'expected at least one variables data file, starting with "variables.data-00000-of-"' - ) diff --git a/pkg/workloads/cortex/lib/type/type.py b/pkg/workloads/cortex/lib/type/type.py new file mode 100644 index 0000000000..d38fc1b9d6 --- /dev/null +++ b/pkg/workloads/cortex/lib/type/type.py @@ -0,0 +1,64 @@ +# Copyright 2020 Cortex Labs, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import collections + + +class PredictorType(collections.namedtuple("PredictorType", "type")): + def __str__(self) -> str: + return str(self.type) + + def __repr__(self) -> str: + return str(self.type) + + +PythonPredictorType = PredictorType("python") + +TensorFlowPredictorType = PredictorType("tensorflow") +TensorFlowNeuronPredictorType = PredictorType("tensorflow-neuron") + +ONNXPredictorType = PredictorType("onnx") + + +def predictor_type_from_string(predictor_type: str) -> PredictorType: + """ + Get predictor type from string. + + Args: + predictor_type: "python", "tensorflow", "onnx" or "tensorflow-neuron" + + Raises: + ValueError if predictor_type does not hold the right value. 
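A small usage sketch of `predictor_type_from_string` as documented above; the failing lookup shows the ValueError path, and the import matches the re-exports added to `cortex/lib/type/__init__.py` in this diff.

```python
from cortex.lib.type import predictor_type_from_string, TensorFlowPredictorType

# string values from an API spec map onto the PredictorType constants
assert predictor_type_from_string("tensorflow") == TensorFlowPredictorType

# anything outside the four supported values raises ValueError
try:
    predictor_type_from_string("pytorch")
except ValueError as e:
    print(e)
```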
+ """ + predictor_types = [ + PythonPredictorType, + TensorFlowPredictorType, + ONNXPredictorType, + TensorFlowNeuronPredictorType, + ] + for candidate in predictor_types: + if str(candidate) == predictor_type: + return candidate + raise ValueError( + "predictor_type can only be 'python', 'tensorflow', 'onnx' or 'tensorflow-neuron'" + ) + + +def predictor_type_from_api_spec(api_spec: dict) -> PredictorType: + """ + Get predictor type from API spec. + """ + if api_spec["compute"]["inf"] > 0 and api_spec["predictor"]["type"] == TensorFlowPredictorType: + return predictor_type_from_string("tensorflow-neuron") + return predictor_type_from_string(api_spec["predictor"]["type"]) diff --git a/pkg/workloads/cortex/lib/util.py b/pkg/workloads/cortex/lib/util.py index 43adad5713..ebc98f5df8 100644 --- a/pkg/workloads/cortex/lib/util.py +++ b/pkg/workloads/cortex/lib/util.py @@ -21,7 +21,7 @@ import inspect from inspect import Parameter from copy import deepcopy -from typing import Any +from typing import List, Any def has_method(object, method: str): @@ -82,6 +82,59 @@ def ensure_suffix(string, suffix): return string + suffix +def get_leftmost_part_of_path(path: str) -> str: + """ + Gets the leftmost part of a path. + + If a path looks like + /models/tensorflow/iris/15559399 + + Then this function will return + /models/ + """ + has_leading_slash = False + if path.startswith("/"): + path = path[1:] + has_leading_slash = True + + basename = "" + while path: + path, basename = os.path.split(path) + + return "/" * has_leading_slash + basename + + +def remove_non_empty_directory_paths(paths: List[str]) -> List[str]: + """ + Eliminates dir paths from the tree that are not empty. + + If paths looks like: + models/tensorflow/ + models/tensorflow/iris/1569001258 + models/tensorflow/iris/1569001258/saved_model.pb + + Then after calling this function, it will look like: + models/tensorflow/iris/1569001258/saved_model.pb + """ + new_paths = [] + + split_paths = [list(filter(lambda x: x != "", path.split("/"))) for path in paths] + create_set_from_list = lambda l: set([(idx, split) for idx, split in enumerate(l)]) + split_set_paths = [create_set_from_list(split_path) for split_path in split_paths] + + for id_a, a in enumerate(split_set_paths): + matches = 0 + for id_b, b in enumerate(split_set_paths): + if id_a == id_b: + continue + if a.issubset(b): + matches += 1 + if matches == 0: + new_paths.append(paths[id_a]) + + return new_paths + + def merge_dicts_in_place_overwrite(*dicts): """Merge dicts, right into left, with overwriting. 
First dict is updated in place""" dicts = list(dicts) diff --git a/pkg/workloads/cortex/serve/init/bootloader.sh b/pkg/workloads/cortex/serve/init/bootloader.sh index 1054f5a46e..5e051ed54b 100755 --- a/pkg/workloads/cortex/serve/init/bootloader.sh +++ b/pkg/workloads/cortex/serve/init/bootloader.sh @@ -35,6 +35,7 @@ cd /mnt/project # if the container restarted, ensure that it is not perceived as ready rm -rf /mnt/workspace/api_readiness.txt +rm -rf /mnt/workspace/init_script_run.txt rm -rf /mnt/workspace/proc-*-ready.txt # allow for the liveness check to pass until the API is running @@ -84,6 +85,13 @@ if [ -f "/mnt/project/requirements.txt" ]; then fi create_s6_service() { + # good pages to read about s6-overlay + # https://wiki.gentoo.org/wiki/S6#Process_supervision + # https://skarnet.org/software/s6/s6-svscanctl.html + # http://skarnet.org/software/s6/s6-svc.html + # http://skarnet.org/software/s6/servicedir.html + # http://www.troubleshooters.com/linux/execline.htm + service_name=$1 cmd=$2 @@ -97,7 +105,8 @@ create_s6_service() { dest_script="$dest_dir/finish" echo "#!/usr/bin/execlineb -S0" > $dest_script - echo "s6-svscanctl -t /var/run/s6/services" >> $dest_script + echo "ifelse { s6-test \${1} -ne 0 } { s6-svscanctl -t /var/run/s6/services }" >> $dest_script + echo "s6-svc -O /var/run/s6/services/$service_name" >> $dest_script chmod +x $dest_script } @@ -126,5 +135,5 @@ else create_s6_service "batch" "$source_env_file_cmd && exec env PYTHONUNBUFFERED=TRUE env PYTHONPATH=$PYTHONPATH:$CORTEX_PYTHON_PATH /opt/conda/envs/env/bin/python /src/cortex/serve/start/batch.py" fi -# run the python initialization script -/opt/conda/envs/env/bin/python /src/cortex/serve/init/script.py +# create the python initialization service +create_s6_service "py_init" "/opt/conda/envs/env/bin/python /src/cortex/serve/init/script.py" diff --git a/pkg/workloads/cortex/serve/init/script.py b/pkg/workloads/cortex/serve/init/script.py index e72cd8290e..ac7e65f7c8 100644 --- a/pkg/workloads/cortex/serve/init/script.py +++ b/pkg/workloads/cortex/serve/init/script.py @@ -13,36 +13,81 @@ # limitations under the License. 
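The bootloader change above clears /mnt/workspace/init_script_run.txt on every container (re)start and runs the Python initialization script as its own s6 service ("py_init") instead of invoking it inline; the serving and batch entrypoints further down block on that flag file before they start. A minimal sketch of both sides of that handshake, using the flag path and polling interval shown in this diff (the helper names are illustrative):

import pathlib
import time

_INIT_FLAG = pathlib.Path("/mnt/workspace/init_script_run.txt")  # flag path used throughout this diff


def signal_init_done() -> None:
    # producer side (init script): create the flag once the model crons finish their first pass
    _INIT_FLAG.touch()


def wait_for_init(poll_seconds: float = 0.2) -> None:
    # consumer side (serve/batch/server entrypoints): block until the py_init service has run
    while not _INIT_FLAG.is_file():
        time.sleep(poll_seconds)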
import os +import time import json -from cortex.lib.type import get_spec -from cortex.lib.storage import S3, LocalStorage +from cortex.lib.type import ( + predictor_type_from_api_spec, + PythonPredictorType, + TensorFlowPredictorType, + TensorFlowNeuronPredictorType, + ONNXPredictorType, +) +from cortex.lib.model import ( + FileBasedModelsTreeUpdater, # only when num workers > 1 + TFSModelLoader, +) +from cortex.lib.api import get_spec from cortex.lib.checkers.pod import wait_neuron_rtd -def load_tensorflow_serving_models(): +def prepare_tfs_servers_api(api_spec: dict, model_dir: str) -> TFSModelLoader: # get TFS address-specific details - model_dir = os.environ["CORTEX_MODEL_DIR"] tf_serving_host = os.getenv("CORTEX_TF_SERVING_HOST", "localhost") tf_base_serving_port = int(os.getenv("CORTEX_TF_BASE_SERVING_PORT", "9000")) - # get models from environment variable - models = os.environ["CORTEX_MODELS"].split(",") - models = [model.strip() for model in models] - - from cortex.lib.server.tensorflow import TensorFlowServing - # determine if multiple TF processes are required num_processes = 1 - has_multiple_servers = os.getenv("CORTEX_MULTIPLE_TF_SERVERS") - if has_multiple_servers: + has_multiple_tf_servers = os.getenv("CORTEX_MULTIPLE_TF_SERVERS") + if has_multiple_tf_servers: num_processes = int(os.environ["CORTEX_PROCESSES_PER_REPLICA"]) # initialize models for each TF process - base_paths = [os.path.join(model_dir, name) for name in models] + addresses = [] for w in range(int(num_processes)): - tfs = TensorFlowServing(f"{tf_serving_host}:{tf_base_serving_port+w}") - tfs.add_models_config(models, base_paths, replace_models=False) + addresses.append(f"{tf_serving_host}:{tf_base_serving_port+w}") + + if len(addresses) == 1: + return TFSModelLoader( + interval=10, + api_spec=api_spec, + address=addresses[0], + tfs_model_dir=model_dir, + download_dir=model_dir, + ) + return TFSModelLoader( + interval=10, + api_spec=api_spec, + addresses=addresses, + tfs_model_dir=model_dir, + download_dir=model_dir, + ) + + +def are_models_specified(api_spec: dict) -> bool: + """ + Checks if models have been specified in the API spec (cortex.yaml). + + Args: + api_spec: API configuration. 
+ """ + if api_spec["predictor"]["model_path"] is not None: + return True + + if api_spec["predictor"]["models"] and ( + api_spec["predictor"]["models"]["dir"] is not None + or len(api_spec["predictor"]["models"]["paths"]) > 0 + ): + return True + return False + + +def is_model_caching_enabled(api_spec: dir) -> bool: + return ( + api_spec["predictor"]["models"] + and api_spec["predictor"]["models"]["cache_size"] is not None + and api_spec["predictor"]["models"]["disk_cache_size"] is not None + ) def main(): @@ -52,10 +97,10 @@ def main(): wait_neuron_rtd() # strictly for Inferentia - has_multiple_servers = os.getenv("CORTEX_MULTIPLE_TF_SERVERS") - if has_multiple_servers: + has_multiple_tf_servers = os.getenv("CORTEX_MULTIPLE_TF_SERVERS") + num_processes = int(os.environ["CORTEX_PROCESSES_PER_REPLICA"]) + if has_multiple_tf_servers: base_serving_port = int(os.environ["CORTEX_TF_BASE_SERVING_PORT"]) - num_processes = int(os.environ["CORTEX_PROCESSES_PER_REPLICA"]) used_ports = {} for w in range(int(num_processes)): used_ports[str(base_serving_port + w)] = False @@ -63,18 +108,70 @@ def main(): json.dump(used_ports, f) # get API spec - cache_dir = os.environ["CORTEX_CACHE_DIR"] provider = os.environ["CORTEX_PROVIDER"] spec_path = os.environ["CORTEX_API_SPEC"] - if provider == "local": - storage = LocalStorage(os.getenv("CORTEX_CACHE_DIR")) - else: - storage = S3(bucket=os.environ["CORTEX_BUCKET"], region=os.environ["AWS_REGION"]) - raw_api_spec = get_spec(provider, storage, cache_dir, spec_path) - - # load tensorflow models into TFS - if raw_api_spec["predictor"]["type"] == "tensorflow": - load_tensorflow_serving_models() + cache_dir = os.getenv("CORTEX_CACHE_DIR") # when it's deployed locally + bucket = os.getenv("CORTEX_BUCKET") # when it's deployed to AWS + region = os.getenv("AWS_REGION") # when it's deployed to AWS + _, api_spec = get_spec(provider, spec_path, cache_dir, bucket, region) + + predictor_type = predictor_type_from_api_spec(api_spec) + multiple_processes = api_spec["predictor"]["processes_per_replica"] > 1 + caching_enabled = is_model_caching_enabled(api_spec) + model_dir = os.getenv("CORTEX_MODEL_DIR") + + # start live-reloading when model caching not enabled > 1 + cron = None + if not caching_enabled: + # create cron dirs if they don't exist + os.makedirs("/run/cron", exist_ok=True) + os.makedirs("/tmp/cron", exist_ok=True) + + # prepare crons + if predictor_type in [PythonPredictorType, ONNXPredictorType] and are_models_specified( + api_spec + ): + cron = FileBasedModelsTreeUpdater( + interval=10, + api_spec=api_spec, + download_dir=model_dir, + ) + cron.start() + elif predictor_type == TensorFlowPredictorType: + tf_serving_port = os.getenv("CORTEX_TF_BASE_SERVING_PORT", "9000") + tf_serving_host = os.getenv("CORTEX_TF_SERVING_HOST", "localhost") + cron = TFSModelLoader( + interval=10, + api_spec=api_spec, + address=f"{tf_serving_host}:{tf_serving_port}", + tfs_model_dir=model_dir, + download_dir=model_dir, + ) + cron.start() + elif predictor_type == TensorFlowNeuronPredictorType: + cron = prepare_tfs_servers_api(api_spec, model_dir) + cron.start() + + # wait until the cron finishes its first pass + if cron: + while not cron.ran_once(): + time.sleep(0.25) + + # disable live reloading when the BatchAPI kind is used + # disable live reloading for the TF predictor when Inferentia is used and when multiple processes are used (num procs > 1) + if api_spec["kind"] != "RealtimeAPI" or ( + predictor_type == TensorFlowNeuronPredictorType + and has_multiple_tf_servers + and 
num_processes > 1 + ): + cron.stop() + + # to syncronize with the other serving processes + open("/mnt/workspace/init_script_run.txt", "a").close() + + # don't exit the script if the cron is running + while cron and cron.is_alive(): + time.sleep(0.25) if __name__ == "__main__": diff --git a/pkg/workloads/cortex/serve/requirements.txt b/pkg/workloads/cortex/serve/requirements.txt index d4e38eabd6..9771067fd8 100644 --- a/pkg/workloads/cortex/serve/requirements.txt +++ b/pkg/workloads/cortex/serve/requirements.txt @@ -1,4 +1,5 @@ boto3==1.14.53 +grpcio==1.32.0 datadog==0.39.0 dill==0.3.2 fastapi==0.61.1 diff --git a/pkg/workloads/cortex/serve/serve.py b/pkg/workloads/cortex/serve/serve.py index 2d4265743d..4cf18b8eca 100644 --- a/pkg/workloads/cortex/serve/serve.py +++ b/pkg/workloads/cortex/serve/serve.py @@ -33,9 +33,10 @@ from starlette.exceptions import HTTPException as StarletteHTTPException from cortex.lib import util -from cortex.lib.type import API, get_spec -from cortex.lib.log import cx_logger -from cortex.lib.storage import S3, LocalStorage, FileLock +from cortex.lib.api import API, get_api +from cortex.lib.log import cx_logger as logger +from cortex.lib.concurrency import LockedFile +from cortex.lib.storage import S3, LocalStorage from cortex.lib.exceptions import UserRuntimeException API_SUMMARY_MESSAGE = ( @@ -210,7 +211,7 @@ def predict(request: Request): tasks.add_task(api.upload_class, class_name=predicted_value) local_cache["class_set"].add(predicted_value) except: - cx_logger().warn("unable to record prediction metric", exc_info=True) + logger().warn("unable to record prediction metric", exc_info=True) if util.has_method(predictor_impl, "post_predict"): kwargs = build_post_predict_kwargs(prediction, request) @@ -255,8 +256,10 @@ def build_post_predict_kwargs(response, request: Request): def get_summary(): response = {"message": API_SUMMARY_MESSAGE} - if hasattr(local_cache["client"], "input_signatures"): - response["model_signatures"] = local_cache["client"].input_signatures + if hasattr(local_cache["client"], "metadata"): + client = local_cache["client"] + predictor = local_cache["api"].predictor + response["model_metadata"] = client.metadata return response @@ -270,48 +273,39 @@ def start(): def start_fn(): - cache_dir = os.environ["CORTEX_CACHE_DIR"] provider = os.environ["CORTEX_PROVIDER"] - spec_path = os.environ["CORTEX_API_SPEC"] project_dir = os.environ["CORTEX_PROJECT_DIR"] + spec_path = os.environ["CORTEX_API_SPEC"] model_dir = os.getenv("CORTEX_MODEL_DIR") + cache_dir = os.getenv("CORTEX_CACHE_DIR") + bucket = os.getenv("CORTEX_BUCKET") + region = os.getenv("AWS_REGION") + tf_serving_port = os.getenv("CORTEX_TF_BASE_SERVING_PORT", "9000") tf_serving_host = os.getenv("CORTEX_TF_SERVING_HOST", "localhost") - if provider == "local": - storage = LocalStorage(os.getenv("CORTEX_CACHE_DIR")) - else: - storage = S3(bucket=os.environ["CORTEX_BUCKET"], region=os.environ["AWS_REGION"]) - has_multiple_servers = os.getenv("CORTEX_MULTIPLE_TF_SERVERS") if has_multiple_servers: - with FileLock("/run/used_ports.json.lock"): - with open("/run/used_ports.json", "r+") as f: - used_ports = json.load(f) - for port in used_ports.keys(): - if not used_ports[port]: - tf_serving_port = port - used_ports[port] = True - break - f.seek(0) - json.dump(used_ports, f) - f.truncate() + with LockedFile("/run/used_ports.json", "r+") as f: + used_ports = json.load(f) + for port in used_ports.keys(): + if not used_ports[port]: + tf_serving_port = port + used_ports[port] = True + break + f.seek(0) 
+ json.dump(used_ports, f) + f.truncate() try: - raw_api_spec = get_spec(provider, storage, cache_dir, spec_path) - api = API( - provider=provider, - storage=storage, - model_dir=model_dir, - cache_dir=cache_dir, - **raw_api_spec, - ) + api = get_api(provider, spec_path, model_dir, cache_dir, bucket, region) + client = api.predictor.initialize_client( tf_serving_host=tf_serving_host, tf_serving_port=tf_serving_port ) - cx_logger().info("loading the predictor from {}".format(api.predictor.path)) - predictor_impl = api.predictor.initialize_impl(project_dir, client, raw_api_spec, None) + logger().info("loading the predictor from {}".format(api.predictor.path)) + predictor_impl = api.predictor.initialize_impl(project_dir, client) local_cache["api"] = api local_cache["provider"] = provider @@ -328,7 +322,7 @@ def start_fn(): predict_route = "/predict" local_cache["predict_route"] = predict_route except: - cx_logger().exception("failed to start api") + logger().exception("failed to start api") sys.exit(1) if ( @@ -339,7 +333,7 @@ def start_fn(): try: local_cache["class_set"] = api.get_cached_classes() except: - cx_logger().warn("an error occurred while attempting to load classes", exc_info=True) + logger().warn("an error occurred while attempting to load classes", exc_info=True) app.add_api_route(local_cache["predict_route"], predict, methods=["POST"]) app.add_api_route(local_cache["predict_route"], get_summary, methods=["GET"]) diff --git a/pkg/workloads/cortex/serve/start/batch.py b/pkg/workloads/cortex/serve/start/batch.py index 69e5305d96..57b9be37ce 100644 --- a/pkg/workloads/cortex/serve/start/batch.py +++ b/pkg/workloads/cortex/serve/start/batch.py @@ -20,15 +20,17 @@ import json import threading import math +import pathlib import boto3 import botocore from cortex import consts from cortex.lib import util -from cortex.lib.type import API, get_spec -from cortex.lib.log import cx_logger -from cortex.lib.storage import S3, LocalStorage, FileLock +from cortex.lib.api import API, get_spec, get_api +from cortex.lib.log import cx_logger as logger +from cortex.lib.concurrency import LockedFile +from cortex.lib.storage import S3, LocalStorage from cortex.lib.exceptions import UserRuntimeException API_LIVENESS_UPDATE_PERIOD = 5 # seconds @@ -127,7 +129,7 @@ def handle_on_complete(message): if should_run_on_job_complete: # double check that the queue is still empty (except for the job_complete message) if not_visible_count <= 1: - cx_logger().info("executing on_job_complete") + logger().info("executing on_job_complete") predictor_impl.on_job_complete() sqs_client.delete_message(QueueUrl=queue_url, ReceiptHandle=receipt_handle) return True @@ -164,7 +166,7 @@ def sqs_loop(): if response.get("Messages") is None or len(response["Messages"]) == 0: if no_messages_found_in_previous_iteration: - cx_logger().info("no batches left in queue, exiting...") + logger().info("no batches left in queue, exiting...") return else: no_messages_found_in_previous_iteration = True @@ -179,14 +181,14 @@ def sqs_loop(): if "MessageAttributes" in message and "job_complete" in message["MessageAttributes"]: handled_on_complete = handle_on_complete(message) if handled_on_complete: - cx_logger().info("no batches left in queue, job has been completed") + logger().info("no batches left in queue, job has been completed") return else: # sometimes on_job_complete message will be released if there are other messages still to be processed continue try: - cx_logger().info(f"processing batch {message['MessageId']}") + 
logger().info(f"processing batch {message['MessageId']}") start_time = time.time() @@ -201,12 +203,15 @@ def sqs_loop(): api_spec.post_metrics( [failed_counter_metric(), time_per_batch_metric(time.time() - start_time)] ) - cx_logger().exception("failed to process batch") + logger().exception("failed to process batch") finally: sqs_client.delete_message(QueueUrl=queue_url, ReceiptHandle=receipt_handle) def start(): + while not pathlib.Path("/mnt/workspace/init_script_run.txt").is_file(): + time.sleep(0.2) + cache_dir = os.environ["CORTEX_CACHE_DIR"] provider = os.environ["CORTEX_PROVIDER"] api_spec_path = os.environ["CORTEX_API_SPEC"] @@ -217,45 +222,42 @@ def start(): tf_serving_port = os.getenv("CORTEX_TF_BASE_SERVING_PORT", "9000") tf_serving_host = os.getenv("CORTEX_TF_SERVING_HOST", "localhost") - storage = S3(bucket=os.environ["CORTEX_BUCKET"], region=os.environ["AWS_REGION"]) + bucket = os.getenv("CORTEX_BUCKET") + region = os.getenv("AWS_REGION") has_multiple_servers = os.getenv("CORTEX_MULTIPLE_TF_SERVERS") if has_multiple_servers: - with FileLock("/run/used_ports.json.lock"): - with open("/run/used_ports.json", "r+") as f: - used_ports = json.load(f) - for port in used_ports.keys(): - if not used_ports[port]: - tf_serving_port = port - used_ports[port] = True - break - f.seek(0) - json.dump(used_ports, f) - f.truncate() - - raw_api_spec = get_spec(provider, storage, cache_dir, api_spec_path) + with LockedFile("/run/used_ports.json", "r+") as f: + used_ports = json.load(f) + for port in used_ports.keys(): + if not used_ports[port]: + tf_serving_port = port + used_ports[port] = True + break + f.seek(0) + json.dump(used_ports, f) + f.truncate() + + api = get_api(provider, api_spec_path, model_dir, cache_dir, bucket, region) + storage, api_spec = get_spec(provider, api_spec_path, cache_dir, bucket, region) job_spec = get_job_spec(storage, cache_dir, job_spec_path) - api = API( - provider=provider, storage=storage, model_dir=model_dir, cache_dir=cache_dir, **raw_api_spec - ) - client = api.predictor.initialize_client( tf_serving_host=tf_serving_host, tf_serving_port=tf_serving_port ) - cx_logger().info("loading the predictor from {}".format(api.predictor.path)) - predictor_impl = api.predictor.initialize_impl(project_dir, client, raw_api_spec, job_spec) + logger().info("loading the predictor from {}".format(api.predictor.path)) + predictor_impl = api.predictor.initialize_impl(project_dir, client, job_spec) local_cache["api_spec"] = api local_cache["provider"] = provider local_cache["job_spec"] = job_spec local_cache["predictor_impl"] = predictor_impl local_cache["predict_fn_args"] = inspect.getfullargspec(predictor_impl.predict).args - local_cache["sqs_client"] = boto3.client("sqs", region_name=os.environ["AWS_REGION"]) + local_cache["sqs_client"] = boto3.client("sqs", region_name=region) open("/mnt/workspace/api_readiness.txt", "a").close() - cx_logger().info("polling for batches...") + logger().info("polling for batches...") sqs_loop() diff --git a/pkg/workloads/cortex/serve/start/server.py b/pkg/workloads/cortex/serve/start/server.py index 564b0bb7c9..3acee0e1ba 100644 --- a/pkg/workloads/cortex/serve/start/server.py +++ b/pkg/workloads/cortex/serve/start/server.py @@ -13,6 +13,8 @@ # limitations under the License. 
import sys +import pathlib +import time import uvicorn import yaml @@ -24,6 +26,9 @@ def main(): with open("/src/cortex/serve/log_config.yaml", "r") as f: log_config = yaml.load(f, yaml.FullLoader) + while not pathlib.Path("/mnt/workspace/init_script_run.txt").is_file(): + time.sleep(0.2) + uvicorn.run( "cortex.serve.wsgi:app", uds=uds,
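For reference, the PredictorType helpers introduced in pkg/workloads/cortex/lib/type/type.py can be exercised as below. This is a short usage sketch: the api_spec fragment is an illustrative stub rather than a full API spec, and it assumes predictor_type_from_string is exported from cortex.lib.type alongside the names script.py already imports:

from cortex.lib.type import (
    ONNXPredictorType,
    TensorFlowNeuronPredictorType,
    predictor_type_from_api_spec,
    predictor_type_from_string,
)

# map a raw string (e.g. from the API spec) to its PredictorType singleton
assert predictor_type_from_string("tensorflow-neuron") is TensorFlowNeuronPredictorType

# derive the type from a (stubbed) API spec; without Inferentia compute the string is used as-is
api_spec = {"compute": {"inf": 0}, "predictor": {"type": "onnx"}}
assert predictor_type_from_api_spec(api_spec) is ONNXPredictorType

# an unsupported value raises ValueError
try:
    predictor_type_from_string("sklearn")
except ValueError:
    pass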