diff --git a/docker/compose/docker-compose.gemma-4b-gpu.yml b/docker/compose/docker-compose.gemma-4b-gpu.yml
new file mode 100644
index 00000000..8cb630df
--- /dev/null
+++ b/docker/compose/docker-compose.gemma-4b-gpu.yml
@@ -0,0 +1,52 @@
+services:
+  gemma_4b_it_gpu:
+    image: nillion/nilai-vllm:latest
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+    ipc: host
+    ulimits:
+      memlock: -1
+      stack: 67108864
+    env_file:
+      - .env # must include HUGGINGFACE_HUB_TOKEN
+    restart: unless-stopped
+    depends_on:
+      etcd:
+        condition: service_healthy
+    command: >
+      --model "google/gemma-3-4b-it"
+      --trust-remote-code
+      --gpu-memory-utilization 0.85
+      --max-model-len 60000
+      --tensor-parallel-size 1
+      --uvicorn-log-level warning
+      --chat-template-content-format "openai"
+      --dtype "bfloat16"
+    environment:
+      - SVC_HOST=gemma_4b_it_gpu
+      - SVC_PORT=8000
+      - ETCD_HOST=etcd
+      - ETCD_PORT=2379
+      - TOOL_SUPPORT=true
+      - ENABLE_MULTIMODAL=true
+    volumes:
+      - hugging_face_models:/root/.cache/huggingface
+    networks:
+      - backend_net
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
+      interval: 30s
+      retries: 3
+      start_period: 90s
+      timeout: 10s
+
+volumes:
+  hugging_face_models:
+
+networks:
+  backend_net:
diff --git a/docker/vllm.Dockerfile b/docker/vllm.Dockerfile
index 01ba019f..99ac788f 100644
--- a/docker/vllm.Dockerfile
+++ b/docker/vllm.Dockerfile
@@ -1,4 +1,4 @@
-FROM vllm/vllm-openai:v0.7.3
+FROM vllm/vllm-openai:latest
 
 # # Specify model name and path during build
 # ARG MODEL_NAME=llama_1b_cpu
@@ -14,13 +14,17 @@ COPY --link . /daemon/
 WORKDIR /daemon/nilai-models/
 
 RUN apt-get update && \
-    apt-get install build-essential -y && \
-    pip install uv && \
+    apt-get install -y ffmpeg libsm6 libxext6 libgl1 build-essential && \
+    pip install uv pillow torchvision torchaudio && \
     uv sync && \
     apt-get clean && \
     apt-get autoremove && \
     rm -rf /var/lib/apt/lists/*
 
+# Install dependencies for multimodal models
+RUN pip install pillow ftfy regex
+RUN pip install git+https://github.com/huggingface/transformers@v4.49.0-Gemma-3
+
 # Expose port 8000 for incoming requests
 EXPOSE 8000
 
diff --git a/nilai-api/src/nilai_api/config/mainnet.py b/nilai-api/src/nilai_api/config/mainnet.py
index 2a27af1a..8ac9f2e2 100644
--- a/nilai-api/src/nilai_api/config/mainnet.py
+++ b/nilai-api/src/nilai_api/config/mainnet.py
@@ -10,6 +10,7 @@
     "cognitivecomputations/Dolphin3.0-Llama3.1-8B": 30,
     "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B": 5,
     "hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4": 5,
+    "google/gemma-3-4b-it": 5,
 }
 
 # It defines the number of requests allowed for each user for a given time frame.
diff --git a/nilai-api/src/nilai_api/config/testnet.py b/nilai-api/src/nilai_api/config/testnet.py
index 8efcb7df..98f62910 100644
--- a/nilai-api/src/nilai_api/config/testnet.py
+++ b/nilai-api/src/nilai_api/config/testnet.py
@@ -10,6 +10,7 @@
     "cognitivecomputations/Dolphin3.0-Llama3.1-8B": 5,
     "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B": 5,
     "hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4": 5,
+    "google/gemma-3-4b-it": 5,
 }
 
 # It defines the number of requests allowed for each user for a given time frame.
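The compose service above enables multimodal input (`ENABLE_MULTIMODAL=true`) and registers `google/gemma-3-4b-it` in both rate-limit tables. For reviewers who want to smoke-test the deployment, a minimal sketch follows; the base URL, the placeholder API key, and the image URL are assumptions, with the request going through the OpenAI-compatible chat completions interface that vLLM exposes.

```python
# Minimal smoke test for the new google/gemma-3-4b-it service (a sketch).
# Assumes an OpenAI-compatible endpoint is reachable at http://localhost:8000/v1;
# adjust host, port, and authentication to your deployment.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="unused")

response = client.chat.completions.create(
    model="google/gemma-3-4b-it",
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe this image in one sentence."},
                {"type": "image_url", "image_url": {"url": "https://example.com/sample.png"}},
            ],
        }
    ],
    max_tokens=256,
)
print(response.choices[0].message.content)
```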
diff --git a/nilai-api/src/nilai_api/routers/private.py b/nilai-api/src/nilai_api/routers/private.py
index 86a6ed04..6755e77d 100644
--- a/nilai-api/src/nilai_api/routers/private.py
+++ b/nilai-api/src/nilai_api/routers/private.py
@@ -238,16 +238,28 @@ async def chat_completion_stream_generator() -> AsyncGenerator[str, None]:
             chat_completion_stream_generator(),
             media_type="text/event-stream",  # Ensure client interprets as Server-Sent Events
         )
+
     client = OpenAI(base_url=model_url, api_key="")
-    response = client.chat.completions.create(
-        model=req.model,
-        messages=req.messages,  # type: ignore
-        stream=req.stream,
-        top_p=req.top_p,
-        temperature=req.temperature,
-        max_tokens=req.max_tokens,
-        tools=req.tools,  # type: ignore
-    )  # type: ignore
+    if req.response_format:
+        response = client.beta.chat.completions.parse(
+            model=req.model,
+            messages=req.messages,
+            top_p=req.top_p,
+            temperature=req.temperature,
+            max_tokens=req.max_tokens,
+            tools=req.tools,
+            response_format=req.response_format,
+        )
+    else:
+        response = client.chat.completions.create(
+            model=req.model,
+            messages=req.messages,  # type: ignore
+            stream=req.stream,
+            top_p=req.top_p,
+            temperature=req.temperature,
+            max_tokens=req.max_tokens,
+            tools=req.tools,  # type: ignore
+        )  # type: ignore
 
     model_response = SignedChatCompletion(
         **response.model_dump(),
diff --git a/packages/nilai-common/src/nilai_common/api_model.py b/packages/nilai-common/src/nilai_common/api_model.py
index 8576d931..5260eeb1 100644
--- a/packages/nilai-common/src/nilai_common/api_model.py
+++ b/packages/nilai-common/src/nilai_common/api_model.py
@@ -1,5 +1,5 @@
 import uuid
-from typing import List, Optional, Literal, Iterable
+from typing import List, Optional, Literal, Union
 
 from openai.types.chat import ChatCompletion, ChatCompletionMessage
 from openai.types.chat.chat_completion import Choice as OpenaAIChoice
@@ -21,8 +21,23 @@
 ]
 
 
+# Define ImageUrl for image content
+class ImageUrl(BaseModel):
+    url: str
+
+
+# Define MessageContent for multimodal content
+class MessageContent(BaseModel):
+    type: Literal["text", "image_url"]
+    text: Optional[str] = None
+    image_url: Optional[ImageUrl] = None
+
+
+# Define Message as a standalone class
 class Message(ChatCompletionMessage):
-    role: Literal["system", "user", "assistant", "tool"]  # type: ignore
+    role: Literal["system", "user", "assistant", "tool"]
+    content: Union[str, List[MessageContent]]
+    name: Optional[str] = None
 
 
 class Choice(OpenaAIChoice):
@@ -34,10 +49,11 @@ class ChatRequest(BaseModel):
     messages: List[Message] = Field(..., min_length=1)
     temperature: Optional[float] = Field(default=0.2, ge=0.0, le=5.0)
     top_p: Optional[float] = Field(default=0.95, ge=0.0, le=1.0)
-    max_tokens: Optional[int] = Field(default=2048, ge=1, le=100000)
+    max_tokens: Optional[int] = Field(default=10000, ge=1, le=100000)
     stream: Optional[bool] = False
-    tools: Optional[Iterable[ChatCompletionToolParam]] = None
+    tools: List[ChatCompletionToolParam] = Field(default_factory=list)
     nilrag: Optional[dict] = {}
+    response_format: Optional[dict] = {}
 
 
 class SignedChatCompletion(ChatCompletion):
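To illustrate how the new fields in `api_model.py` compose, here is a sketch that builds a `ChatRequest` with multimodal content and a `response_format` payload; when `response_format` is non-empty, the router forwards it to `client.beta.chat.completions.parse` as shown in the `private.py` hunk above. The JSON-schema payload and the image URL are illustrative, not fixed by this diff.

```python
# Illustrative construction of a ChatRequest using the models added in this diff.
# The json_schema payload and image URL are examples only.
from nilai_common.api_model import ChatRequest, ImageUrl, Message, MessageContent

request = ChatRequest(
    model="google/gemma-3-4b-it",
    messages=[
        Message(
            role="user",
            content=[
                MessageContent(type="text", text="Which animal is in the picture?"),
                MessageContent(type="image_url", image_url=ImageUrl(url="https://example.com/sample.png")),
            ],
        )
    ],
    # A non-empty response_format triggers the structured-output path
    # (client.beta.chat.completions.parse) in routers/private.py.
    response_format={
        "type": "json_schema",
        "json_schema": {
            "name": "animal_answer",
            "schema": {
                "type": "object",
                "properties": {"animal": {"type": "string"}},
                "required": ["animal"],
            },
        },
    },
)
print(request.model_dump_json(indent=2))
```

Validation of the dict-typed `response_format` is left to the downstream OpenAI client here; if stricter typing is wanted later, the field could be narrowed to the SDK's response-format params.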