From 9b7f346a294c64673f302a0440e0bd9036c93cf2 Mon Sep 17 00:00:00 2001 From: Matthew Deng Date: Mon, 8 Jan 2024 13:54:52 -0800 Subject: [PATCH 1/2] [serve] remove GPT-J example Signed-off-by: Matthew Deng --- ci/ray_ci/ml.tests.yml | 1 - .../data/examples/gptj_batch_prediction.ipynb | 2 - doc/source/ray-air/examples/BUILD | 25 -- .../ray-air/examples/gptj_serving.ipynb | 241 ------------------ doc/source/ray-overview/examples.rst | 7 - doc/source/splash.html | 1 + 6 files changed, 1 insertion(+), 276 deletions(-) delete mode 100644 doc/source/ray-air/examples/BUILD delete mode 100644 doc/source/ray-air/examples/gptj_serving.ipynb diff --git a/ci/ray_ci/ml.tests.yml b/ci/ray_ci/ml.tests.yml index 78358eea3277e..cd2bee90bff5a 100644 --- a/ci/ray_ci/ml.tests.yml +++ b/ci/ray_ci/ml.tests.yml @@ -3,4 +3,3 @@ flaky_tests: - //python/ray/train:horovod_cifar_pbt_example # doc tests # gpu tests - - //doc/source/ray-air/examples:gptj_serving diff --git a/doc/source/data/examples/gptj_batch_prediction.ipynb b/doc/source/data/examples/gptj_batch_prediction.ipynb index 35d820d9949a0..5c8fffc573844 100644 --- a/doc/source/data/examples/gptj_batch_prediction.ipynb +++ b/doc/source/data/examples/gptj_batch_prediction.ipynb @@ -13,8 +13,6 @@ "\n", "It is highly recommended to read [Ray Train Key Concepts](train-key-concepts) and [Ray Data Key Concepts](data_key_concepts) before starting this example.\n", "\n", - "If you are interested in serving (online inference), see {doc}`/ray-air/examples/gptj_serving`.\n", - "\n", "```{note}\n", "In order to run this example, make sure your Ray cluster has access to at least one GPU with 16 or more GBs of memory. The amount of memory needed will depend on the model.\n", "```" diff --git a/doc/source/ray-air/examples/BUILD b/doc/source/ray-air/examples/BUILD deleted file mode 100644 index 79e5af9508238..0000000000000 --- a/doc/source/ray-air/examples/BUILD +++ /dev/null @@ -1,25 +0,0 @@ -load("//bazel:python.bzl", "py_test_run_all_subdirectory") -load("//bazel:python.bzl", "py_test_run_all_notebooks") - - -filegroup( - name = "air_examples", - srcs = glob(["*.ipynb", "*.py"]), - visibility = ["//doc:__subpackages__"] -) - -# -------------------------------------------------------------------- -# Test all doc/source/ray-air/examples notebooks. -# -------------------------------------------------------------------- - -# GPU Tests - -py_test_run_all_notebooks( - size = "large", - include = [ - "gptj_serving.ipynb", - ], - exclude = [], - data = ["//doc/source/ray-air/examples:air_examples"], - tags = ["exclusive", "team:ml", "gpu", "ray_air"], -) diff --git a/doc/source/ray-air/examples/gptj_serving.ipynb b/doc/source/ray-air/examples/gptj_serving.ipynb deleted file mode 100644 index acd4d838266ba..0000000000000 --- a/doc/source/ray-air/examples/gptj_serving.ipynb +++ /dev/null @@ -1,241 +0,0 @@ -{ - "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# GPT-J-6B Serving with Ray Serve\n", - "\n", - "In this example, we will showcase how to use the Ray Serve for **GPT-J serving (online inference)**. GPT-J is a GPT-2-like causal language model trained on the Pile dataset. This particular model has 6 billion parameters. For more information on GPT-J, click [here](https://huggingface.co/docs/transformers/model_doc/gptj).\n", - "\n", - "We will use Ray Serve for online inference and a pretrained model from Hugging Face hub. 
Note that you can easily adapt this example to use other similar models.\n", - "\n", - "It is highly recommended to read about [Ray Serve Key Concepts](serve-key-concepts) before starting this example.\n", - "\n", - "If you are interested in batch prediction (offline inference), see {doc}`/data/examples/gptj_batch_prediction`.\n", - "\n", - "```{note}\n", - "In order to run this example, make sure your Ray cluster has access to at least one GPU with 16 or more GBs of memory. The amount of memory needed will depend on the model.\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "model_id = \"EleutherAI/gpt-j-6B\"\n", - "revision = \"float16\" # use float16 weights to fit in 16GB GPUs\n", - "prompt = (\n", - " \"In a shocking finding, scientists discovered a herd of unicorns living in a remote, \"\n", - " \"previously unexplored valley, in the Andes Mountains. Even more surprising to the \"\n", - " \"researchers was the fact that the unicorns spoke perfect English.\"\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "import ray" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We define a {ref}`runtime environment ` to ensure that the Ray workers have access to all the necessary packages. You can omit the `runtime_env` argument if you have all of the packages already installed on each node in your cluster." - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "ray.init(\n", - " runtime_env={\n", - " \"pip\": [\n", - " \"accelerate>=0.16.0\",\n", - " \"transformers>=4.26.0\",\n", - " \"numpy<1.24\", # remove when mlflow updates beyond 2.2\n", - " \"torch\",\n", - " ]\n", - " }\n", - ")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Setting up basic serving with Ray Serve is very similar to {doc}`batch inference with Ray Data `. First, we define a callable class that will serve as the [Serve deployment](serve-key-concepts-deployment). At runtime, a deployment consists of a number of *replicas*, which are individual copies of the class or function that are started in separate Ray Actors (processes). The number of replicas can be scaled up or down (or even autoscaled) to match the incoming request load.\n", - "\n", - "We make sure to set the deployment to use 1 GPU by setting `\"num_gpus\"` in `ray_actor_options`. We load the model in `__init__`, which will allow us to save time by initializing a model just once and then use it to handle multiple requests.\n", - "\n", - "```{tip}\n", - "If you want to use inter-node model parallelism, you can also increase `num_gpus`. As we have created the model with `device_map=\"auto\"`, it will be automatically placed on correct devices. 
Note that this requires nodes with multiple GPUs.\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "\n", - "from ray import serve\n", - "from starlette.requests import Request\n", - "\n", - "\n", - "@serve.deployment(ray_actor_options={\"num_gpus\": 1})\n", - "class PredictDeployment:\n", - " def __init__(self, model_id: str, revision: str = None):\n", - " from transformers import AutoModelForCausalLM, AutoTokenizer\n", - " import torch\n", - "\n", - " self.model = AutoModelForCausalLM.from_pretrained(\n", - " model_id,\n", - " revision=revision,\n", - " torch_dtype=torch.float16,\n", - " low_cpu_mem_usage=True,\n", - " device_map=\"auto\", # automatically makes use of all GPUs available to the Actor\n", - " )\n", - " self.tokenizer = AutoTokenizer.from_pretrained(model_id)\n", - "\n", - " def generate(self, text: str) -> pd.DataFrame:\n", - " input_ids = self.tokenizer(text, return_tensors=\"pt\").input_ids.to(\n", - " self.model.device\n", - " )\n", - "\n", - " gen_tokens = self.model.generate(\n", - " input_ids,\n", - " do_sample=True,\n", - " temperature=0.9,\n", - " max_length=100,\n", - " )\n", - " return pd.DataFrame(\n", - " self.tokenizer.batch_decode(gen_tokens), columns=[\"responses\"]\n", - " )\n", - "\n", - " async def __call__(self, http_request: Request) -> str:\n", - " json_request: str = await http_request.json()\n", - " prompts = []\n", - " for prompt in json_request:\n", - " text = prompt[\"text\"]\n", - " if isinstance(text, list):\n", - " prompts.extend(text)\n", - " else:\n", - " prompts.append(text)\n", - " return self.generate(prompts)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can now `bind` the deployment with our arguments, and use {meth}`~ray.serve.run` to start it.\n", - "\n", - "```{note}\n", - "If you were running this script outside of a Jupyter notebook, the recommended way is to use the [`serve run` CLI command](serve-cli). In this case, you would remove the `serve.run(deployment)` line, and instead start the deployment by calling `serve run FILENAME:deployment`.\n", - "\n", - "For more information, see [Serve Development Workflow](serve-dev-workflow).\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "RayServeSyncHandle(deployment='PredictDeployment')" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "deployment = PredictDeployment.bind(model_id=model_id, revision=revision)\n", - "serve.run(deployment)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's try submitting a request to our deployment. We will use the same prompt as before, and send a POST request. The deployment will generate a response and return it." - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "(ServeReplica:PredictDeployment pid=651, ip=10.0.8.161) The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. 
Please pass your input's `attention_mask` to obtain reliable results.\n", - "(ServeReplica:PredictDeployment pid=651, ip=10.0.8.161) Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[{'responses': 'In a shocking finding, scientists discovered a herd of unicorns living in a remote, previously unexplored valley, in the Andes Mountains. Even more surprising to the researchers was the fact that the unicorns spoke perfect English.\\n\\nThe findings come from a recent expedition to the region of Cordillera del Divisor, in northern Peru. The region was previously known to have an unusually high number of native animals.\\n\\n\"Our team was conducting a population census of the region’'}]\n" - ] - } - ], - "source": [ - "import requests\n", - "\n", - "prompt = (\n", - " \"In a shocking finding, scientists discovered a herd of unicorns living in a remote, \"\n", - " \"previously unexplored valley, in the Andes Mountains. Even more surprising to the \"\n", - " \"researchers was the fact that the unicorns spoke perfect English.\"\n", - ")\n", - "\n", - "sample_input = {\"text\": prompt}\n", - "\n", - "output = requests.post(\"http://localhost:8000/\", json=[sample_input]).json()\n", - "print(output)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "name": "python", - "version": "3.8.10 (default, Nov 14 2022, 12:59:47) \n[GCC 9.4.0]" - }, - "orig_nbformat": 4, - "orphan": true, - "vscode": { - "interpreter": { - "hash": "3c0d54d489a08ae47a06eae2fd00ff032d6cddb527c382959b7b2575f6a8167f" - } - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/doc/source/ray-overview/examples.rst b/doc/source/ray-overview/examples.rst index 631dfcefa52c7..875f378da686f 100644 --- a/doc/source/ray-overview/examples.rst +++ b/doc/source/ray-overview/examples.rst @@ -131,13 +131,6 @@ Ray Examples How to use Ray Data to do batch prediction with the Hugging Face Transformers GPT-J model - .. grid-item-card:: :bdg-secondary:`Code example` - :class-item: gallery-item serving large-language-models generative-ai - :link: /ray-air/examples/gptj_serving - :link-type: doc - - How to use Ray Serve to do online serving with the Hugging Face Transformers GPT-J model - .. grid-item-card:: :bdg-secondary:`Code example` :class-item: gallery-item computer-vision training generative-ai :link: /train/examples/pytorch/dreambooth_finetuning diff --git a/doc/source/splash.html b/doc/source/splash.html index b4131ea57c92d..eb806402bfcdd 100644 --- a/doc/source/splash.html +++ b/doc/source/splash.html @@ -328,6 +328,7 @@

Scaling with Ray

API references
+

From: Matthew Deng
Date: Mon, 8 Jan 2024 14:32:56 -0800
Subject: [PATCH 2/2] example

Signed-off-by: Matthew Deng
---
 doc/source/splash.html | 76 +++++++++++++++++++++++++++---------------
 1 file changed, 50 insertions(+), 26 deletions(-)

diff --git a/doc/source/splash.html b/doc/source/splash.html
index eb806402bfcdd..77079d2ac5cb8 100644
--- a/doc/source/splash.html
+++ b/doc/source/splash.html
@@ -285,40 +285,65 @@

Scaling with Ray

>
           
-import pandas as pd
+from io import BytesIO
+from fastapi import FastAPI
+from fastapi.responses import Response
+import torch
 
 from ray import serve
-from starlette.requests import Request
+from ray.serve.handle import DeploymentHandle
 
 
-@serve.deployment(ray_actor_options={"num_gpus": 1})
-class PredictDeployment:
-    def __init__(self, model_id: str, revision: str = None):
-        from transformers import AutoModelForCausalLM, AutoTokenizer
-        import torch
+app = FastAPI()
 
-        self.model = AutoModelForCausalLM.from_pretrained(
-            model_id,
-            …
-        )
-        self.tokenizer = AutoTokenizer.from_pretrained(model_id)
 
-    def generate(self, text: str) -> pd.DataFrame:
-        input_ids = self.tokenizer(text, return_tensors="pt").input_ids.to(
-            self.model.device
-        )
+@serve.deployment(num_replicas=1)
+@serve.ingress(app)
+class APIIngress:
+    def __init__(self, diffusion_model_handle: DeploymentHandle) -> None:
+        self.handle = diffusion_model_handle
+
+    @app.get(
+        "/imagine",
+        responses={200: {"content": {"image/png": {}}}},
+        response_class=Response,
+    )
+    async def generate(self, prompt: str, img_size: int = 512):
+        assert len(prompt), "prompt parameter cannot be empty"
+
+        image = await self.handle.generate.remote(prompt, img_size=img_size)
+        file_stream = BytesIO()
+        image.save(file_stream, "PNG")
+        return Response(content=file_stream.getvalue(), media_type="image/png")
+
+
+@serve.deployment(
+    ray_actor_options={"num_gpus": 1},
+    autoscaling_config={"min_replicas": 0, "max_replicas": 2},
+)
+class StableDiffusionV2:
+    def __init__(self):
+        from diffusers import EulerDiscreteScheduler, StableDiffusionPipeline
+
+        model_id = "stabilityai/stable-diffusion-2"
 
-        gen_tokens = self.model.generate(
-            input_ids,
-            …
+        scheduler = EulerDiscreteScheduler.from_pretrained(
+            model_id, subfolder="scheduler"
         )
-        return pd.DataFrame(
-            self.tokenizer.batch_decode(gen_tokens), columns=["responses"]
+        self.pipe = StableDiffusionPipeline.from_pretrained(
+            model_id, scheduler=scheduler, revision="fp16", torch_dtype=torch.float16
         )
+        self.pipe = self.pipe.to("cuda")
+
+    def generate(self, prompt: str, img_size: int = 512):
+        assert len(prompt), "prompt parameter cannot be empty"
+
+        with torch.autocast("cuda"):
+            image = self.pipe(prompt, height=img_size, width=img_size).images[0]
+            return image
+
 
-    async def __call__(self, http_request: Request) -> str:
-        prompts: list[str] = await http_request.json()["prompts"]
-        return self.generate(prompts)
+entrypoint = APIIngress.bind(StableDiffusionV2.bind())
           
         
-
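
For context, here is a minimal client-side sketch of how the Stable Diffusion example added above could be exercised once deployed. It assumes the new deployment graph is saved as `stable_diffusion.py` (a hypothetical filename, not part of this patch) and started with `serve run stable_diffusion:entrypoint`; the prompt is illustrative.

```python
# Sketch: query the /imagine endpoint exposed by APIIngress above.
# Assumes the app is running locally on Serve's default port 8000.
import requests

resp = requests.get(
    "http://localhost:8000/imagine",
    params={"prompt": "a watercolor painting of a lighthouse", "img_size": 512},
)
resp.raise_for_status()

# The endpoint returns raw PNG bytes, so write them straight to disk.
with open("output.png", "wb") as f:
    f.write(resp.content)
```

Because the `StableDiffusionV2` deployment is configured with `autoscaling_config={"min_replicas": 0, "max_replicas": 2}`, the GPU replica may be started on demand, so the first request after a period of inactivity can take noticeably longer while the model loads.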