
Commit

Merge branch 'main' into quantizer
sambarnes authored Mar 19, 2024
2 parents 5f233c7 + 754d41f commit d313c8e
Showing 4 changed files with 61 additions and 98 deletions.
20 changes: 20 additions & 0 deletions .github/ISSUE_TEMPLATE/bug_report.md
@@ -0,0 +1,20 @@
---
name: Bug report
about: Create a report to help us improve
title: ''
labels: ''
assignees: ''

---

**Describe the bug**
A clear and concise description of what the bug is.

**To Reproduce**
If applicable, provide steps to reproduce the behavior:

**Expected behavior**
A clear and concise description of what you expected to happen.

**Additional context**
Add any other context about the problem here.
62 changes: 35 additions & 27 deletions modal/runner/containers/vllm_unified.py
@@ -10,7 +10,6 @@
get_logger,
get_observability_secrets,
)
from shared.protocol import GPUType
from shared.volumes import (
does_model_exist,
get_model_path,
@@ -31,12 +30,6 @@ def _make_container(
"""Helper function to create a container with the given GPU configuration."""

num_gpus = gpu.count
if isinstance(gpu, modal.gpu.A100):
gpu_type = GPUType.A100_80G if gpu.memory == 80 else GPUType.A100_40G
elif isinstance(gpu, modal.gpu.H100):
gpu_type = GPUType.H100_80G
else:
raise ValueError(f"Unknown GPU type: {gpu}")

# Avoid wasting resources & money in dev
if keep_warm and is_env_dev():
@@ -63,21 +56,13 @@ def __init__(self):
ray.init(num_gpus=num_gpus, ignore_reinit_error=True)

super().__init__(
gpu_type=gpu_type,
params=VllmParams(
model=str(model_path),
tensor_parallel_size=num_gpus,
**vllm_opts,
),
)

# For any containers with keep_warm, we need to skip cold-start usage
# billing. This is because the first request might be minutes after
# the container is started, and we don't want to record that time as
# usage.
if keep_warm:
self.is_first_request = False

# Performance improvement from https://github.com/vllm-project/vllm/issues/2073#issuecomment-1853422529
if num_gpus > 1:
import subprocess
@@ -118,29 +103,44 @@ def __init__(self):
# Automatically populated by _make_container.
REGISTERED_CONTAINERS = {}

_phi2 = "TheBloke/phi-2-GPTQ"
VllmContainer_MicrosoftPhi2 = _make_container(
name="VllmContainer_MicrosoftPhi2",
model_name="microsoft/phi-2",
gpu=modal.gpu.A100(count=1, memory=40),
concurrent_inputs=120,
model_name=_phi2,
gpu=modal.gpu.A10G(count=1),
concurrent_inputs=4,
max_containers=5,
quantization="GPTQ",
)

_neural_chat = "TheBloke/neural-chat-7b-v3-1-GPTQ"
VllmContainer_IntelNeuralChat7B = _make_container(
name="VllmContainer_IntelNeuralChat7B",
model_name="Intel/neural-chat-7b-v3-1",
gpu=modal.gpu.A100(count=1, memory=40),
concurrent_inputs=100,
model_name=_neural_chat,
gpu=modal.gpu.A10G(count=1),
concurrent_inputs=4,
max_containers=5,
quantization="GPTQ",
)

_psyfighter = "TheBloke/Psyfighter-13B-GPTQ"
VllmContainer_JebCarterPsyfighter13B = _make_container(
"VllmContainer_JebCarterPsyfighter13B",
model_name="jebcarter/Psyfighter-13B",
gpu=modal.gpu.A100(count=1, memory=40),
concurrent_inputs=32,
model_name=_psyfighter,
gpu=modal.gpu.A10G(count=1),
concurrent_inputs=4,
max_containers=5,
quantization="GPTQ",
)

_psyfighter2 = "TheBloke/LLaMA2-13B-Psyfighter2-GPTQ"
VllmContainer_KoboldAIPsyfighter2 = _make_container(
name="VllmContainer_KoboldAIPsyfighter2",
model_name="KoboldAI/LLaMA2-13B-Psyfighter2",
gpu=modal.gpu.A100(count=1, memory=40),
concurrent_inputs=32,
model_name=_psyfighter2,
gpu=modal.gpu.A10G(count=1),
concurrent_inputs=4,
max_containers=5,
quantization="GPTQ",
)

_noromaid = "TheBloke/Noromaid-v0.1-mixtral-8x7b-Instruct-v3-GPTQ"
@@ -193,7 +193,15 @@ def __init__(self):
# A re-mapping of model names to their respective quantized models.
# From the outside, the model name is the original, but internally,
# we use the quantized model name.
#
# NOTE: When serving quantized models, the throughput can suffer a ton
# at high batch sizes. Read this thread to learn why:
# https://github.com/vllm-project/vllm/issues/1002#issuecomment-1712824199
QUANTIZED_MODELS = {
"microsoft/phi-2": _phi2,
"Intel/neural-chat-7b-v3-1": _neural_chat,
"jebcarter/Psyfighter-13B": _psyfighter,
"KoboldAI/LLaMA2-13B-Psyfighter2": _psyfighter2,
"NeverSleep/Noromaid-v0.1-mixtral-8x7b-Instruct-v3": _noromaid,
"jondurbin/bagel-34b-v0.2": _bagel,
"sophosympatheia/Midnight-Rose-70B-v2.0.3": _midnight_rose,
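To make the remapping concrete: callers keep sending the original model name, and the runner swaps in the quantized repository before a container is chosen. The sketch below is illustrative only; the resolve_model_name helper is not part of this commit, and the dict is trimmed to two entries from the mapping above.

# Illustrative sketch (not from this commit): resolve a public model name to
# the quantized model that actually backs it, falling back to the original
# name when no quantized variant is registered.
QUANTIZED_MODELS = {
    "microsoft/phi-2": "TheBloke/phi-2-GPTQ",
    "Intel/neural-chat-7b-v3-1": "TheBloke/neural-chat-7b-v3-1-GPTQ",
}


def resolve_model_name(requested: str) -> str:
    return QUANTIZED_MODELS.get(requested, requested)


assert resolve_model_name("microsoft/phi-2") == "TheBloke/phi-2-GPTQ"
assert resolve_model_name("unknown/model") == "unknown/model"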
50 changes: 5 additions & 45 deletions modal/runner/engines/vllm.py
@@ -11,7 +11,6 @@
)
from shared.protocol import (
CompletionPayload,
GPUType,
ResponseBody,
Usage,
create_error_text,
@@ -62,14 +61,7 @@ class VllmParams(BaseModel):


class VllmEngine(BaseEngine):
def __init__(
self,
gpu_type: GPUType,
params: VllmParams,
):
self.gpu_type = gpu_type
self.is_first_request = True
self.t_cold_start = time.time()
def __init__(self, params: VllmParams):
self.engine = None
self.engine_args = AsyncEngineArgs(
**params.dict(),
@@ -81,44 +73,14 @@ def startup(self):
with timer("engine init", model=self.engine_args.model):
self.engine = AsyncLLMEngine.from_engine_args(self.engine_args)

@property
def gpu_count(self) -> int:
return self.engine_args.tensor_parallel_size

@property
def cost_per_second(self) -> float:
return self.gpu_count * self.gpu_type.cost_per_second

# @method()
# async def tokenize_prompt(self, payload: Payload) -> List[int]:
# return self.tokenizer(payload.prompt).input_ids

# @method()
# async def max_model_len(self) -> int:
# engine_model_config = await self.engine.get_model_config()
# return engine_model_config.max_model_len

@method()
async def generate(self, payload: CompletionPayload, params):
assert self.engine is not None, "Engine not initialized"

# Track usage as a running total. For the first request to the
# container, cold-start time is included in the usage duration.
t_start_inference = time.time()
t_start_usage_duration = t_start_inference
if self.is_first_request:
self.is_first_request = False
t_start_usage_duration = self.t_cold_start

resp = ResponseBody(
text="",
usage=Usage(
prompt_tokens=0,
completion_tokens=0,
duration=0.0,
gpu_type=self.gpu_type,
gpu_count=self.gpu_count,
),
usage=Usage(prompt_tokens=0, completion_tokens=0),
)

try:
@@ -134,7 +96,6 @@ async def generate(self, payload: CompletionPayload, params):
finish_reason = current.outputs[0].finish_reason
resp.usage.prompt_tokens = len(current.prompt_token_ids)
resp.usage.completion_tokens = len(current.outputs[0].token_ids)
resp.usage.duration = time.time() - t_start_usage_duration

# Non-streaming requests continue generating w/o yielding intermediate results
if not payload.stream:
@@ -158,15 +119,14 @@ async def generate(self, payload: CompletionPayload, params):
data = resp.json(ensure_ascii=False)
yield sse(data) if payload.stream else data

duration = time.time() - t_start_inference
logger.info(
"Completed generation",
extra={
"model": self.engine_args.model,
"tokens": resp.usage.completion_tokens,
"tps": resp.usage.completion_tokens
/ (time.time() - t_start_inference),
"duration": resp.usage.duration,
"cost": resp.usage.duration * self.cost_per_second,
"tps": resp.usage.completion_tokens / duration,
"duration": duration,
},
)
except Exception as err:
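For orientation, the engine constructor now takes only the vLLM parameters; GPU type, cost, and cold-start bookkeeping are gone. Below is a minimal usage sketch with placeholder values, ignoring Modal's container lifecycle; the import path is assumed from the repo layout.

# Minimal sketch, not how the runner actually invokes this class in production.
from runner.engines.vllm import VllmEngine, VllmParams  # assumed import path

params = VllmParams(
    model="/models/TheBloke/phi-2-GPTQ",  # hypothetical local model path
    tensor_parallel_size=1,
)
engine = VllmEngine(params=params)
engine.startup()  # builds the AsyncLLMEngine from the stored engine args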
27 changes: 1 addition & 26 deletions modal/shared/protocol.py
@@ -1,29 +1,8 @@
from enum import Enum
from typing import Final, List, Optional, Union
from typing import List, Optional, Union

from fastapi.responses import JSONResponse, PlainTextResponse
from pydantic import BaseModel

_COST_PER_SECOND_A100_40G: Final[float] = 0.001036
_COST_PER_SECOND_A100_80G: Final[float] = 0.001553
_COST_PER_SECOND_H100_80G: Final[float] = 0.002125


class GPUType(Enum):
A100_40G = "A100_40G"
A100_80G = "A100_80G"
H100_80G = "H100_80G"

@property
def cost_per_second(self) -> float:
match self:
case GPUType.A100_40G:
return _COST_PER_SECOND_A100_40G
case GPUType.A100_80G:
return _COST_PER_SECOND_A100_80G
case GPUType.H100_80G:
return _COST_PER_SECOND_H100_80G


# https://github.com/vllm-project/vllm/blob/320a622ec4d098f2da5d097930f4031517e7327b/vllm/sampling_params.py#L7-L52
# Lines were sorted for consistency
@@ -68,10 +47,6 @@ class Usage(BaseModel):
prompt_tokens: int
completion_tokens: int

duration: float
gpu_type: GPUType
gpu_count: int


class ResponseBody(BaseModel):
text: str
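Taken together, the wire protocol now reports only token counts; duration, GPU type, and cost are no longer surfaced. The resulting models look roughly like this; the usage field on ResponseBody is inferred from how the engine populates it, and any other ResponseBody fields are omitted.

# Approximate post-change shape of the shared models (reconstructed, not verbatim).
from pydantic import BaseModel


class Usage(BaseModel):
    prompt_tokens: int
    completion_tokens: int


class ResponseBody(BaseModel):
    text: str
    usage: Usage  # filled in by the engine during generation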
