|
| 1 | +# Custom docker images |
| 2 | + |
| 3 | +!!! warning
| 4 | + This feature is currently in beta, and the API is likely to change. Please contact us if you are interested |
| 5 | + in using this feature. |
| 6 | + |
| 7 | +If you need more customization than what cloudpickle or zip artifacts can offer, or if you already have a pre-built
| 8 | +docker image, then you can create a Model Bundle with that docker image. You will need to modify your image to run a |
| 9 | +web server that exposes HTTP port 5005. |
| 10 | + |
| 11 | +In our example below, we assume that you have some existing Python function `my_inference_fn` that can be imported. |
| 12 | +If you need to invoke some other binary (e.g. a custom C++ binary), then you can shell out to the OS to call that binary; |
| 13 | +subsequent versions of this document will have native examples for non-Python binaries. |
| 14 | + |
| 15 | +For the choice of web server, we recommend [FastAPI](https://fastapi.tiangolo.com/) due to its speed and ergonomics.
| 16 | +Any web server would work, although we give examples with FastAPI. |
| 17 | + |
| 18 | +## Step 1: Install FastAPI |
| 19 | + |
| 20 | +You can add `fastapi` to the `requirements.txt` file that gets installed as part of your Dockerfile. Alternatively, |
| 21 | +you can add `pip install fastapi` to the Dockerfile directly. |
| 22 | + |
| 23 | +## Step 2: Set up a web server application |
| 24 | + |
| 25 | +Inside your project workspace, create a `server.py` file with these contents: |
| 26 | + |
| 27 | +```py |
| 28 | +# test='skip' |
| 29 | +from fastapi import FastAPI |
| 30 | + |
| 31 | +from pydantic import BaseModel |
| 32 | + |
| 33 | +app = FastAPI() |
| 34 | + |
| 35 | +class MyRequestSchema(BaseModel): |
| 36 | + inputs: str |
| 37 | + |
| 38 | + |
| 39 | +class MyResponseSchema(BaseModel): |
| 40 | + response: str |
| 41 | + |
| 42 | +def my_inference_fn(req: MyRequestSchema) -> MyResponseSchema: |
| 43 | + # This is an example inference function - you can instead import a function from your own codebase, |
| 44 | + # or shell out to the OS, etc. |
| 45 | + resp = req.inputs + "_hello" |
| 46 | + return MyResponseSchema(response=resp) |
| 47 | + |
| 48 | +@app.post("/predict")
| 49 | +async def predict(request: MyRequestSchema) -> MyResponseSchema: |
| 50 | + response = my_inference_fn(request) |
| 51 | + return response |
| 52 | + |
| 53 | +@app.get("/readyz") |
| 54 | +def readyz(): |
| 55 | + return "ok" |
| 56 | +``` |
| 57 | + |
| 58 | +## Step 3: Rebuild and push your image |
| 59 | + |
| 60 | +Build your updated Dockerfile and push the image to a location that is accessible by Scale. For instance, if you are |
| 61 | +using AWS ECR, please make sure that the necessary cross-account permissions allow Scale to pull your docker image. |
| 62 | + |
| 63 | +## Step 4: Deploy! |
| 64 | + |
| 65 | +Now you can upload your docker image as a Model Bundle, and then create a Model Endpoint referencing that Model Bundle. |
| 66 | + |
| 67 | + |
| 68 | +```py |
| 69 | +# test='skip' |
| 70 | +import os |
| 71 | + |
| 72 | +from launch import LaunchClient |
| 73 | + |
| 74 | +from server import MyRequestSchema, MyResponseSchema # Defined as part of your server.py |
| 75 | + |
| 76 | +client = LaunchClient(api_key=os.getenv("LAUNCH_API_KEY")) |
| 77 | + |
| 78 | +model_bundle_name = "my_bundle_name" |
| 79 | + |
| 80 | +client.create_model_bundle_from_runnable_image_v2( |
| 81 | + model_bundle_name=model_bundle_name, |
| 82 | + request_schema=MyRequestSchema, |
| 83 | + response_schema=MyResponseSchema, |
| 84 | + repository="$YOUR_ECR_REPO", |
| 85 | + tag="$YOUR_IMAGE_TAG", |
| 86 | + command=[ |
| 87 | + "dumb-init", |
| 88 | + "--", |
| 89 | + "uvicorn", |
| 90 | +    "server:app",  # uvicorn takes a "<module>:<attribute>" import string; ensure server.py is importable (e.g. set WORKDIR or PYTHONPATH)
| 91 | +    "--port",
| 92 | +    "5005",
| 93 | +    "--host",
| 94 | +    "::",
| 95 | + ], |
| 96 | + readiness_initial_delay_seconds=120, |
| 97 | + env={}, |
| 98 | +) |
| 99 | + |
| 100 | +client.create_model_endpoint( |
| 101 | + endpoint_name=f"endpoint-{model_bundle_name}", |
| 102 | + model_bundle=model_bundle_name, |
| 103 | + endpoint_type="async", |
| 104 | + min_workers=0, |
| 105 | + max_workers=1, |
| 106 | + per_worker=1, |
| 107 | + memory="30Gi", |
| 108 | + storage="40Gi", |
| 109 | + cpus=4, |
| 110 | + gpus=1, |
| 111 | + gpu_type="nvidia-ampere-a10", |
| 112 | + update_if_exists=True, |
| 113 | +) |
| 114 | +``` |