[WIP][AQUA][GPU Shape Recommendation] Support for Service Managed Models #1252
base: main
Changes from all commits
```diff
@@ -78,15 +78,25 @@
 IN_FLIGHT_QUANTIZATION = {"4bit"}  # vLLM only supports 4bit in-flight-quantization

+VLLM_PARAMS_KEY = "VLLM_PARAMS"
+VLLM_ENV_KEY = "VLLM"
+QUANT_FLAG = "--quantization"
+MAX_MODEL_LEN_FLAG = "--max-model-len"

 TROUBLESHOOT_MSG = "The selected model is too large to fit on standard GPU shapes with the current configuration.\nAs troubleshooting, we have suggested the two largest available GPU shapes using the smallest quantization level ('4bit') to maximize chances of fitting the model. "

 VLLM_PARAMS = {
     "max_model_len": "--max-model-len",
     "in_flight_quant": "--quantization bitsandbytes --load-format bitsandbytes",
+    "trust_remote_code": "--trust-remote-code"
 }

+DEFAULT_WEIGHT_SIZE = "bfloat16"

+DEFAULT_MAX_SEQ_LEN = 4096

 DEFAULT_WEIGHT_SIZE = "float32"

 BITS_AND_BYTES_8BIT = "8bit"
 BITS_AND_BYTES_4BIT = "4bit"
```

Review comment (on the new `DEFAULT_WEIGHT_SIZE = "bfloat16"`): It looks like we already have such a variable, three lines below?
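For orientation, here is a quick sketch of how these flag constants compose into a vLLM argument string. It is illustrative only and assumes the constants above are in scope; the actual assembly lives in `construct_deployment_params` in the next file.

```python
# Illustrative composition of the constants above into vLLM CLI arguments.
max_len = 4096  # e.g. DEFAULT_MAX_SEQ_LEN
params = [
    f"{VLLM_PARAMS['max_model_len']} {max_len}",
    VLLM_PARAMS["in_flight_quant"],
]
print(" ".join(params))
# --max-model-len 4096 --quantization bitsandbytes --load-format bitsandbytes
```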
```diff
@@ -131,6 +131,10 @@ def construct_deployment_params(self) -> str:
     # vLLM only supports 4bit in-flight quantization
     params.append(VLLM_PARAMS["in_flight_quant"])

+    # add trust-remote-code if custom modules are specified
+    if c.trust_remote_code:
+        params.append(VLLM_PARAMS["trust_remote_code"])

     params = " ".join(params) if params else ""
     return params
```

Review comment (on `if c.trust_remote_code:`): Could we use a more meaningful name for the config variable?
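A sketch of the rename the reviewer suggests; `llm_config` is a hypothetical name, and the parts of the method body not shown in the hunk are reconstructed, so treat this as illustrative rather than the PR's code:

```python
def construct_deployment_params(self) -> str:
    llm_config = self.llm_config  # hypothetical attribute; the PR calls this `c`
    params = []
    if llm_config.quantization in IN_FLIGHT_QUANTIZATION:  # assumed guard
        # vLLM only supports 4bit in-flight quantization
        params.append(VLLM_PARAMS["in_flight_quant"])
    # add trust-remote-code if custom modules are specified
    if llm_config.trust_remote_code:
        params.append(VLLM_PARAMS["trust_remote_code"])
    return " ".join(params) if params else ""
```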
```diff
@@ -11,6 +11,7 @@
 from ads.aqua.shaperecommend.constants import (
     BITS_AND_BYTES_4BIT,
     BITS_AND_BYTES_8BIT,
+    DEFAULT_MAX_SEQ_LEN,
     DEFAULT_WEIGHT_SIZE,
     NEXT_QUANT,
     QUANT_MAPPING,
```
```diff
@@ -42,7 +43,7 @@ class LLMConfig(BaseModel):
         description="Dimension of each attention head. Typically hidden_size // num_attention_heads.",
     )
     max_seq_len: Optional[int] = Field(
-        4096, description="Maximum input sequence length (context window)."
+        DEFAULT_MAX_SEQ_LEN, description="Maximum input sequence length (context window)."
     )
     weight_dtype: Optional[str] = Field(
         DEFAULT_WEIGHT_SIZE,
```
```diff
@@ -74,7 +75,9 @@ class LLMConfig(BaseModel):
         None, description="For MoE architectures, size of the MLP activation layer."
     )

-    tie_word_embeddings: Optional[bool] = Field(None)
+    tie_word_embeddings: Optional[bool] = Field(True, description="if True, input and output embedding matrices share the same parameters in memory.")
+
+    trust_remote_code: Optional[bool] = Field(False, description="if True, the model requires custom code to operate.")

     @property
     def bytes_per_parameter(self) -> float:
```
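Context for the new `tie_word_embeddings` field: when embeddings are tied, the output head reuses the input embedding matrix, so a memory estimate should count those parameters only once. A rough sketch, using numbers from the Mistral fixture added later in this PR:

```python
tie_word_embeddings = False             # the fixture sets "tie_word_embeddings": false
vocab_size, hidden_size = 131072, 5120  # values from the fixture
embedding_params = vocab_size * hidden_size  # ~0.67B parameters per matrix
# tied: one shared matrix; untied: separate input and output matrices
total_embedding = embedding_params if tie_word_embeddings else 2 * embedding_params
```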
```diff
@@ -207,6 +210,17 @@ def validate_model_support(cls, raw: dict) -> ValueError:
             "Encoder-decoder models (ex. T5, Gemma) and encoder-only (BERT) are not supported at this time."
         )

+    @staticmethod
+    def get_bool(raw, key, default=False):
+        val = raw.get(key)
+        if val is None:
+            return default
+        if isinstance(val, bool):
+            return val
+        if isinstance(val, str):
+            return val.lower() == "true"
+        return bool(val)
+
     @classmethod
     def from_raw_config(cls, raw: dict) -> "LLMConfig":
         """
```
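For illustration, here is how `get_bool` normalizes the mixed value types that show up in raw `config.json` files (the dicts below are made up):

```python
LLMConfig.get_bool({"tie_word_embeddings": True}, "tie_word_embeddings")     # True
LLMConfig.get_bool({"tie_word_embeddings": "False"}, "tie_word_embeddings")  # False ("false" != "true")
LLMConfig.get_bool({}, "tie_word_embeddings", default=True)                  # True (key absent -> default)
LLMConfig.get_bool({"use_cache": 1}, "use_cache")                            # True (falls through to bool())
```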
```diff
@@ -257,6 +271,10 @@ def from_raw_config(cls, raw: dict) -> "LLMConfig":
             "intermediate_size"
         )

+        tie_word_embeddings = LLMConfig.get_bool(raw, "tie_word_embeddings", True)
+
+        trust_remote_code = "auto_map" in raw  # trust-remote-code is always needed when this key is present
+
         # Type safety: minimal assertion
         if None in [
             num_hidden_layers,
```

Review comment (on the assertion below the new lines): Could you add more description for the section below? Can the error be more specific?
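One way to address the request for a more specific error; the exact set of required fields is an assumption, since only `num_hidden_layers` is visible in the hunk:

```python
# Sketch: report exactly which required config fields are missing.
required = {
    "num_hidden_layers": num_hidden_layers,
    "hidden_size": hidden_size,  # assumed to be part of the check
    "vocab_size": vocab_size,    # assumed to be part of the check
}
missing = [name for name, value in required.items() if value is None]
if missing:
    raise ValueError(
        f"Model config is missing required field(s): {', '.join(missing)}. "
        "Verify the model's config.json is complete."
    )
```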
```diff
@@ -280,4 +298,6 @@ def from_raw_config(cls, raw: dict) -> "LLMConfig":
             max_seq_len=int(max_seq_len),
             num_local_experts=num_local_experts,
             intermediate_size=intermediate_size,
+            tie_word_embeddings=tie_word_embeddings,
+            trust_remote_code=trust_remote_code
         )
```
```diff
@@ -93,6 +93,25 @@ def which_shapes(
         shapes = self.valid_compute_shapes(compartment_id=request.compartment_id)

+        ds_model = self._validate_model_ocid(request.model_id)
+
+        model_name = ds_model.display_name if ds_model.display_name else ""
+
+        if request.deployment_config:
+            shape_recommendation_report = (
+                ShapeRecommendationReport.from_deployment_config(
+                    request.deployment_config, model_name, shapes
+                )
+            )
+
+        else:
+            data = self._get_model_config(ds_model)
+
+            llm_config = LLMConfig.from_raw_config(data)
+
+            shape_recommendation_report = self._summarize_shapes_for_seq_lens(
+                llm_config, shapes, model_name
+            )

         data = self._get_model_config(ds_model)

         llm_config = LLMConfig.from_raw_config(data)
```

Review comment (on `self._validate_model_ocid`): I noticed that this method also retrieves model details, but the function name doesn't suggest that it returns the model config as well. I'd recommend separating the retrieval of model details from this function to make its purpose clearer and more consistent.

Review comment (on `which_shapes`): Could you update the description for this method? Specifically the input params.

Reply (on the `if request.deployment_config:` branch): We check if a deployment_config was successfully obtained, and if so we will generate the report immediately.

Review comment (on the repeated `data = self._get_model_config(ds_model)` lines): Not sure why we would need to repeat the same lines again?
```diff
@@ -102,7 +121,7 @@ def which_shapes(
         shape_recommendation_report = self._summarize_shapes_for_seq_lens(
             llm_config, shapes, model_name
         )

         if request.generate_table and shape_recommendation_report.recommendations:
             shape_recommendation_report = self._rich_diff_table(
                 shape_recommendation_report
```
```diff
@@ -248,13 +267,19 @@ def _rich_diff_table(shape_report: ShapeRecommendationReport) -> Table:
         else:
             total_memory = f"CPU: {str(shape.memory_in_gbs)}"

+        if model:
+            model_size = str(model.total_model_gb)
+        else:
+            model_size = "Using Pre-Defined Config"

         table.add_row(
             shape.name,
             str(shape.available),
             str(shape.shape_series),
             str(gpu.gpu_count),
             total_memory,
-            str(model.total_model_gb),
+            model_size,
             deploy.quantization,
             recommendation,
         )
```

Review comment (on `model_size = "Using Pre-Defined Config"`): Maybe we should use …
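The fallback can also be written as a conditional expression, which some reviewers prefer for simple either/or assignments. This is a style sketch only; the truncated comment above may have suggested something else entirely:

```python
model_size = str(model.total_model_gb) if model else "Using Pre-Defined Config"
```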
```diff
@@ -2,11 +2,22 @@
 # Copyright (c) 2025 Oracle and/or its affiliates.
 # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/

+import json
+
 from typing import List, Optional

 from pydantic import BaseModel, Field

 from ads.aqua.common.entities import ComputeShapeSummary
+from ads.aqua.modeldeployment.config_loader import AquaDeploymentConfig
+from ads.aqua.shaperecommend.constants import (
+    DEFAULT_WEIGHT_SIZE,
+    MAX_MODEL_LEN_FLAG,
+    QUANT_FLAG,
+    QUANT_MAPPING,
+    VLLM_ENV_KEY,
+    VLLM_PARAMS_KEY,
+)
 from ads.aqua.shaperecommend.constants import QUANT_MAPPING
 from ads.aqua.shaperecommend.estimator import MemoryEstimator
 from ads.config import COMPARTMENT_OCID
```
```diff
@@ -30,6 +41,10 @@ class RequestRecommend(BaseModel):
         COMPARTMENT_OCID, description="The OCID of user's compartment"
     )

+    deployment_config: Optional[AquaDeploymentConfig] = Field(
+        {}, description="The deployment configuration for model (only available for service models)."
+    )

     class Config:
         protected_namespaces = ()
```

Review comment (on `deployment_config`): I think it would be better to do: …
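The suggestion above is cut off in the source. One common alternative for a field like this, avoiding a mutable `{}` default that does not match the declared `Optional[AquaDeploymentConfig]` type, would be the following (an assumption, not necessarily the reviewer's intent):

```python
deployment_config: Optional[AquaDeploymentConfig] = Field(
    None,
    description="The deployment configuration for the model (service models only).",
)
```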
```diff
@@ -42,7 +57,9 @@ class DeploymentParams(BaseModel):  # noqa: N801
     quantization: Optional[str] = Field(
         None, description="Type of quantization (e.g. 4bit)."
     )
+    max_model_len: Optional[int] = Field(None, description="Maximum length of input sequence.")
+    max_model_len: int = Field(..., description="Maximum length of input sequence.")

     params: str = Field(
         ..., description="Runtime parameters for deployment with vLLM, etc."
     )
```

Review comment (on the duplicated `max_model_len` fields): Merge conflict?
```diff
@@ -68,6 +85,13 @@ class ModelConfig(BaseModel):
     The configuration for a model based on specific set of deployment parameters and memory capacity of shape.
     """

+    deployment_params: DeploymentParams = Field(
+        ..., description="Parameters for deployment."
+    )
+    model_details: Optional[ModelDetail] = Field(None, description="Details about the model.")
+
+    recommendation: Optional[str] = Field("", description="GPU recommendation for the model.")

     model_details: ModelDetail = Field(..., description="Details about the model.")
     deployment_params: DeploymentParams = Field(
         ..., description="Parameters for deployment."
```
```diff
@@ -231,3 +255,89 @@ class ShapeRecommendationReport(BaseModel):
         None,
         description="Details for troubleshooting if no shapes fit the current model.",
     )

+    @classmethod
+    def from_deployment_config(cls, deployment_config: AquaDeploymentConfig, model_name: str, valid_shapes: List[ComputeShapeSummary]) -> "ShapeRecommendationReport":
+        """
+        Creates a ShapeRecommendationReport from an AquaDeploymentConfig, extracting recommended
+        model configurations for each valid compute shape.
+
+        Parameters
+        ----------
+        deployment_config : AquaDeploymentConfig
+            The object containing per-shape deployment configurations.
+        model_name : str
+            The name of the model for which to generate recommendations.
+        valid_shapes : list of ComputeShapeSummary
+            List of compute shapes to evaluate and recommend deployment configurations for.
+
+        Returns
+        -------
+        ShapeRecommendationReport
+            Report containing recommendations for each valid compute shape.
+
+        Notes
+        -----
+        For service models, this method interprets pre-set deployment configurations to derive
+        recommendations for each allowed compute shape, including environment variables, quantization,
+        and maximum model length parameters.
+        """
+        recs = []
+        for shape in valid_shapes:
+            current_config = deployment_config.configuration.get(shape.name)
+            if not current_config:
+                continue
+
+            quantization = None
+            max_model_len = None
+            recommendation = ""
+            current_params = current_config.parameters.get(VLLM_PARAMS_KEY)
+            current_env = current_config.env.get(VLLM_ENV_KEY)
+
+            if current_params:
+                param_list = current_params.split()
+
+                if QUANT_FLAG in param_list:
+                    idx = param_list.index(QUANT_FLAG)
+                    if idx + 1 < len(param_list):
+                        quantization = param_list[idx + 1]
+
+                if MAX_MODEL_LEN_FLAG in param_list:
+                    idx = param_list.index(MAX_MODEL_LEN_FLAG)
+                    if idx + 1 < len(param_list):
+                        try:
+                            max_model_len = int(param_list[idx + 1])
+                        except ValueError:
+                            max_model_len = None
+
+            if current_env:
+                recommendation += f"ENV: {json.dumps(current_env)}\n\n"
+
+            if not current_params and not current_env:  # model works with default params and no extra env variables
+                recommendation += "No override PARAMS and ENV variables needed. \n\n"
+
+            recommendation += "Model fits well within the allowed compute shape."
+
+            deployment_params = DeploymentParams(
+                quantization=quantization if quantization else DEFAULT_WEIGHT_SIZE,
+                max_model_len=max_model_len,
+                params=current_params if current_params else "",
+            )
+
+            # need to adjust for multiple configs per shape
+            configuration = [
+                ModelConfig(
+                    deployment_params=deployment_params,
+                    recommendation=recommendation,
+                )
+            ]
+
+            recs.append(
+                ShapeReport(
+                    shape_details=shape,
+                    configurations=configuration,
+                )
+            )
+
+        return ShapeRecommendationReport(
+            display_name=model_name,
+            recommendations=recs,
+        )
```

Review comment (on the `from_deployment_config` signature): Please use the formatter to format the code.
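A hypothetical call site for the new classmethod; argument values are illustrative, and `request` and `shapes` are assumed to be in scope as in `which_shapes` above:

```python
report = ShapeRecommendationReport.from_deployment_config(
    deployment_config=request.deployment_config,
    model_name="example-service-model",  # illustrative display name
    valid_shapes=shapes,
)
for shape_report in report.recommendations:
    cfg = shape_report.configurations[0]
    print(shape_report.shape_details.name, "->", cfg.recommendation)
```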
New file (@@ -0,0 +1,27 @@), a Mistral model configuration JSON:

```json
{
  "architectures": [
    "MistralForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "pad_token_id": 11,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 5120,
  "initializer_range": 0.02,
  "intermediate_size": 32768,
  "max_position_embeddings": 131072,
  "model_type": "mistral",
  "num_attention_heads": 32,
  "num_hidden_layers": 40,
  "num_key_value_heads": 8,
  "rms_norm_eps": 1e-05,
  "rope_theta": 1000000000.0,
  "sliding_window": null,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.53.1",
  "use_cache": true,
  "vocab_size": 131072
}
```
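A hedged sketch of how this fixture flows through the new code path; the file path is illustrative, and the expected values follow from the diff logic above:

```python
import json

with open("config.json") as f:  # path to the fixture above; illustrative
    raw = json.load(f)

llm_config = LLMConfig.from_raw_config(raw)
print(llm_config.tie_word_embeddings)  # False -- parsed via LLMConfig.get_bool
print(llm_config.trust_remote_code)    # False -- the fixture has no "auto_map" key
```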
Review comment: Could you update the description and add acceptable params for kwargs?