3 changes: 3 additions & 0 deletions ads/aqua/modeldeployment/deployment.py
@@ -1297,6 +1297,9 @@ def recommend_shape(self, **kwargs) -> Union[Table, ShapeRecommendationReport]:
AquaValueError
If model type is unsupported by tool (no recommendation report generated)
"""
deployment_config = self.get_deployment_config(model_id=kwargs.get("model_id"))
Review comment (Member): Could you update the description and add acceptable params for kwargs?
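A possible docstring update along those lines, as a sketch: the parameter names come from this diff (model_id, deployment_config, generate_table, compartment_id); the descriptions are assumptions, not from the PR.

    def recommend_shape(self, **kwargs) -> Union[Table, ShapeRecommendationReport]:
        """
        Recommend compute shapes for a model.

        Keyword Arguments
        -----------------
        model_id : str
            OCID of the model to generate shape recommendations for.
        deployment_config : AquaDeploymentConfig, optional
            Pre-fetched deployment configuration; populated here for service models.
        generate_table : bool, optional
            If True, render the recommendations as a rich Table.
        compartment_id : str, optional
            Compartment OCID used when listing valid compute shapes.
        """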

kwargs["deployment_config"] = deployment_config

try:
request = RequestRecommend(**kwargs)
except ValidationError as e:
10 changes: 10 additions & 0 deletions ads/aqua/shaperecommend/constants.py
@@ -78,15 +78,25 @@

IN_FLIGHT_QUANTIZATION = {"4bit"} # vLLM only supports 4bit in-flight-quantization

VLLM_PARAMS_KEY = "VLLM_PARAMS"
VLLM_ENV_KEY = "VLLM"
QUANT_FLAG = "--quantization"
MAX_MODEL_LEN_FLAG = "--max-model-len"

TROUBLESHOOT_MSG = "The selected model is too large to fit on standard GPU shapes with the current configuration.\nAs troubleshooting, we have suggested the two largest available GPU shapes using the smallest quantization level ('4bit') to maximize chances of fitting the model. "

VLLM_PARAMS = {
"max_model_len": "--max-model-len",
"in_flight_quant": "--quantization bitsandbytes --load-format bitsandbytes",
"trust_remote_code": "--trust-remote-code"
}

DEFAULT_WEIGHT_SIZE = "bfloat16"
Review comment (Member): It looks like we already have such a variable, three lines below?
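If the "bfloat16" default is the intended change, one consolidation sketch that resolves the duplication is to keep a single definition:

    DEFAULT_MAX_SEQ_LEN = 4096
    DEFAULT_WEIGHT_SIZE = "bfloat16"  # single definition; the older "float32" line below would be removed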

DEFAULT_MAX_SEQ_LEN = 4096

DEFAULT_WEIGHT_SIZE = "float32"


BITS_AND_BYTES_8BIT = "8bit"
BITS_AND_BYTES_4BIT = "4bit"

4 changes: 4 additions & 0 deletions ads/aqua/shaperecommend/estimator.py
@@ -131,6 +131,10 @@ def construct_deployment_params(self) -> str:
# vLLM only supports 4bit in-flight quantization
params.append(VLLM_PARAMS["in_flight_quant"])

# add trust-remote-code if custom modules are specified
if c.trust_remote_code:
Review comment (Member): Could we use a more meaningful name for the config variable? c -> llm_config?
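A sketch of the suggested rename, assuming `c` is the LLMConfig instance in scope:

    # add trust-remote-code if custom modules are specified
    if llm_config.trust_remote_code:
        params.append(VLLM_PARAMS["trust_remote_code"])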

params.append(VLLM_PARAMS["trust_remote_code"])

params = " ".join(params) if params else ""
return params

24 changes: 22 additions & 2 deletions ads/aqua/shaperecommend/llm_config.py
@@ -11,6 +11,7 @@
from ads.aqua.shaperecommend.constants import (
BITS_AND_BYTES_4BIT,
BITS_AND_BYTES_8BIT,
DEFAULT_MAX_SEQ_LEN,
DEFAULT_WEIGHT_SIZE,
NEXT_QUANT,
QUANT_MAPPING,
@@ -42,7 +43,7 @@ class LLMConfig(BaseModel):
description="Dimension of each attention head. Typically hidden_size // num_attention_heads.",
)
max_seq_len: Optional[int] = Field(
4096, description="Maximum input sequence length (context window)."
DEFAULT_MAX_SEQ_LEN, description="Maximum input sequence length (context window)."
)
weight_dtype: Optional[str] = Field(
DEFAULT_WEIGHT_SIZE,
@@ -74,7 +75,9 @@ class LLMConfig(BaseModel):
None, description="For MoE architectures, size of the MLP activation layer."
)

tie_word_embeddings: Optional[bool] = Field(None)
tie_word_embeddings: Optional[bool] = Field(True, description="If True, input and output embedding matrices share the same parameters in memory.")

trust_remote_code: Optional[bool] = Field(False, description="If True, the model requires custom code to operate.")

@property
def bytes_per_parameter(self) -> float:
@@ -207,6 +210,17 @@ def validate_model_support(cls, raw: dict) -> ValueError:
"Encoder-decoder models (ex. T5, Gemma) and encoder-only (BERT) are not supported at this time."
)

@staticmethod
def get_bool(raw, key, default=False):
Review comment (Member): In ads/common/utils we already have a parse_bool function; maybe it can be reused?

val = raw.get(key)
if val is None:
return default
if isinstance(val, bool):
return val
if isinstance(val, str):
return val.lower() == "true"
return bool(val)
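A possible reuse of that shared helper, per the comment above. This assumes ads.common.utils.parse_bool coerces values such as the string "true" to a bool; the exact signature is an assumption, not verified here.

    from ads.common.utils import parse_bool  # assumed location per the review comment

    @staticmethod
    def get_bool(raw: dict, key: str, default: bool = False) -> bool:
        # Fall back to the default when the key is absent; otherwise delegate coercion.
        val = raw.get(key)
        return default if val is None else parse_bool(val)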

@classmethod
def from_raw_config(cls, raw: dict) -> "LLMConfig":
"""
@@ -257,6 +271,10 @@ def from_raw_config(cls, raw: dict) -> "LLMConfig":
"intermediate_size"
)

tie_word_embeddings = LLMConfig.get_bool(raw, "tie_word_embeddings", True)

trust_remote_code = "auto_map" in raw # trust-remote-code is always needed when this key is present

Review comment (Member): Could you add more description for the section below? Can the error be more specific?

        if None in [
            num_hidden_layers,
            hidden_size,
            vocab_size,
            num_attention_heads,
            head_dim,
        ]:
            raise ValueError("Missing required value in model config.")
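One way to make the error more specific, sketched against the fields checked in this block:

    required = {
        "num_hidden_layers": num_hidden_layers,
        "hidden_size": hidden_size,
        "vocab_size": vocab_size,
        "num_attention_heads": num_attention_heads,
        "head_dim": head_dim,
    }
    missing = [name for name, value in required.items() if value is None]
    if missing:
        raise ValueError(
            f"Model config is missing required field(s): {', '.join(missing)}. "
            "Verify that the model's config.json defines each of these keys."
        )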

# Type safety: minimal assertion
if None in [
num_hidden_layers,
@@ -280,4 +298,6 @@ def from_raw_config(cls, raw: dict) -> "LLMConfig":
max_seq_len=int(max_seq_len),
num_local_experts=num_local_experts,
intermediate_size=intermediate_size,
tie_word_embeddings=tie_word_embeddings,
trust_remote_code=trust_remote_code
)
29 changes: 27 additions & 2 deletions ads/aqua/shaperecommend/recommend.py
@@ -93,6 +93,25 @@ def which_shapes(
shapes = self.valid_compute_shapes(compartment_id=request.compartment_id)

ds_model = self._validate_model_ocid(request.model_id)
Review comment (Member): I noticed that this method also retrieves model details, but the function name doesn't suggest that it returns the model config as well. I'd recommend separating the retrieval of model details from this function to make its purpose clearer and more consistent.

Review comment (Member): Could you update the description for this method? Specifically input params.
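A sketch of the suggested separation; the retrieval helper name is hypothetical and not part of this PR:

    # Hypothetical split: validation stays narrow, retrieval gets its own helper.
    self._validate_model_ocid(request.model_id)            # raises on a malformed OCID
    ds_model = self._get_model_details(request.model_id)   # hypothetical helper returning the Data Science model
    model_name = ds_model.display_name or ""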

model_name = ds_model.display_name if ds_model.display_name else ""

if request.deployment_config:
shape_recommendation_report = (
ShapeRecommendationReport.from_deployment_config(
Author comment (Member): We check if a deployment_config was successfully obtained, and if so we generate the report immediately.

request.deployment_config, model_name, shapes
)
)

else:
data = self._get_model_config(ds_model)
Review comment (Member): Not sure why we would need to repeat the same lines again?


llm_config = LLMConfig.from_raw_config(data)

shape_recommendation_report = self._summarize_shapes_for_seq_lens(
llm_config, shapes, model_name
)

data = self._get_model_config(ds_model)

llm_config = LLMConfig.from_raw_config(data)
@@ -102,7 +121,7 @@ def which_shapes(
shape_recommendation_report = self._summarize_shapes_for_seq_lens(
llm_config, shapes, model_name
)

if request.generate_table and shape_recommendation_report.recommendations:
shape_recommendation_report = self._rich_diff_table(
shape_recommendation_report
@@ -248,13 +267,19 @@ def _rich_diff_table(shape_report: ShapeRecommendationReport) -> Table:
else:
total_memory = f"CPU: {str(shape.memory_in_gbs)}"


if model:
model_size = str(model.total_model_gb)
else:
model_size = "Using Pre-Defined Config"
Review comment (Member): Maybe we should use "-" instead? Should we add a total model gb param in the service configs?
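A compact sketch of that suggestion:

    model_size = str(model.total_model_gb) if model else "-"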


table.add_row(
shape.name,
str(shape.available),
str(shape.shape_series),
str(gpu.gpu_count),
total_memory,
str(model.total_model_gb),
model_size,
deploy.quantization,
recommendation,
)
110 changes: 110 additions & 0 deletions ads/aqua/shaperecommend/shape_report.py
@@ -2,11 +2,22 @@
# Copyright (c) 2025 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/

import json

from typing import List, Optional

from pydantic import BaseModel, Field

from ads.aqua.common.entities import ComputeShapeSummary
from ads.aqua.modeldeployment.config_loader import AquaDeploymentConfig
from ads.aqua.shaperecommend.constants import (
DEFAULT_WEIGHT_SIZE,
MAX_MODEL_LEN_FLAG,
QUANT_FLAG,
QUANT_MAPPING,
VLLM_ENV_KEY,
VLLM_PARAMS_KEY,
)
from ads.aqua.shaperecommend.constants import QUANT_MAPPING
from ads.aqua.shaperecommend.estimator import MemoryEstimator
from ads.config import COMPARTMENT_OCID
@@ -30,6 +41,10 @@ class RequestRecommend(BaseModel):
COMPARTMENT_OCID, description="The OCID of user's compartment"
)

deployment_config: Optional[AquaDeploymentConfig] = Field(
Review comment (Member): I think it would be better to do: default=None
{}, description="The deployment configuration for model (only available for service models)."
)
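A sketch of the field with the suggested default:

    deployment_config: Optional[AquaDeploymentConfig] = Field(
        default=None,
        description="The deployment configuration for the model (only available for service models).",
    )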

class Config:
protected_namespaces = ()

@@ -42,7 +57,9 @@ class DeploymentParams(BaseModel):  # noqa: N801
quantization: Optional[str] = Field(
None, description="Type of quantization (e.g. 4bit)."
)
max_model_len: Optional[int] = Field(None, description="Maximum length of input sequence.")
Review comment (Member): Merge conflict?

max_model_len: int = Field(..., description="Maximum length of input sequence.")
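If this is a conflict, one resolution sketch keeps the optional variant, since from_deployment_config below can legitimately produce a None max_model_len:

    max_model_len: Optional[int] = Field(
        None, description="Maximum length of input sequence."
    )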

params: str = Field(
..., description="Runtime parameters for deployment with vLLM, etc."
)
@@ -68,6 +85,13 @@ class ModelConfig(BaseModel):
The configuration for a model based on specific set of deployment parameters and memory capacity of shape.
"""

deployment_params: DeploymentParams = Field(
..., description="Parameters for deployment."
)
model_details: Optional[ModelDetail] = Field(None, description="Details about the model.")

recommendation: Optional[str] = Field("", description="GPU recommendation for the model.")

model_details: ModelDetail = Field(..., description="Details about the model.")
deployment_params: DeploymentParams = Field(
..., description="Parameters for deployment."
@@ -231,3 +255,89 @@ class ShapeRecommendationReport(BaseModel):
None,
description="Details for troubleshooting if no shapes fit the current model.",
)

@classmethod
def from_deployment_config(cls, deployment_config: AquaDeploymentConfig, model_name: str, valid_shapes: List[ComputeShapeSummary]) -> "ShapeRecommendationReport":
Review comment (Member): Please use the formatter to format the code.
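For reference, a formatted version of this signature (e.g. as black would wrap it) might look like:

    @classmethod
    def from_deployment_config(
        cls,
        deployment_config: AquaDeploymentConfig,
        model_name: str,
        valid_shapes: List[ComputeShapeSummary],
    ) -> "ShapeRecommendationReport":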

"""
Creates a ShapeRecommendationReport from an AquaDeploymentConfig, extracting recommended
model configurations for each valid compute shape.

Parameters
----------
deployment_config : AquaDeploymentConfig
The object containing per-shape deployment configurations.
model_name : str
The name of the model for which to generate recommendations.
valid_shapes : list of ComputeShapeSummary
List of compute shapes to evaluate and recommend deployment configurations for.

Returns
-------
ShapeRecommendationReport
Report containing recommendations for each valid compute shape.

Notes
-----
For service models, this method interprets pre-set deployment configurations to derive
recommendations for each allowed compute shape, including environment variables, quantization,
and maximum model length parameters.
"""

recs = []
for shape in valid_shapes:
current_config = deployment_config.configuration.get(shape.name)
if not current_config:
continue

quantization = None
max_model_len = None
recommendation = ""
current_params = current_config.parameters.get(VLLM_PARAMS_KEY)
current_env = current_config.env.get(VLLM_ENV_KEY)

if current_params:
param_list = current_params.split()

if QUANT_FLAG in param_list:
idx = param_list.index(QUANT_FLAG)
if idx + 1 < len(param_list):
quantization = param_list[idx + 1]

if MAX_MODEL_LEN_FLAG in param_list:
idx = param_list.index(MAX_MODEL_LEN_FLAG)
if idx + 1 < len(param_list):
try:
max_model_len = int(param_list[idx + 1])
except ValueError:
max_model_len = None

if current_env:
recommendation += f"ENV: {json.dumps(current_env)}\n\n"

if not current_params and not current_env: # model works with default params and no extra env variables
recommendation += "No override PARAMS and ENV variables needed. \n\n"

recommendation += "Model fits well within the allowed compute shape."

deployment_params = DeploymentParams(
quantization=quantization if quantization else DEFAULT_WEIGHT_SIZE,
max_model_len=max_model_len,
params=current_params if current_params else "",
)

# need to adjust for multiple configs per shape
configuration = [ModelConfig(
deployment_params=deployment_params,
recommendation=recommendation,
)]

recs.append(ShapeReport(
shape_details=shape,
configurations=configuration
)
)

return ShapeRecommendationReport(
display_name=model_name,
recommendations=recs
)
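A minimal usage sketch of this classmethod; the variable names are assumptions and not part of the PR:

    # Hypothetical caller: build a report straight from a service model's config.
    report = ShapeRecommendationReport.from_deployment_config(
        deployment_config=deployment_config,  # AquaDeploymentConfig fetched upstream
        model_name="my-service-model",        # hypothetical display name
        valid_shapes=shapes,                  # List[ComputeShapeSummary] from valid_compute_shapes()
    )
    for rec in report.recommendations:
        print(rec.shape_details.name, rec.configurations[0].recommendation)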
New file (27 additions; file path not shown in this view):
@@ -0,0 +1,27 @@
{
"architectures": [
"MistralForCausalLM"
],
"attention_dropout": 0.0,
"bos_token_id": 1,
"eos_token_id": 2,
"pad_token_id": 11,
"head_dim": 128,
"hidden_act": "silu",
"hidden_size": 5120,
"initializer_range": 0.02,
"intermediate_size": 32768,
"max_position_embeddings": 131072,
"model_type": "mistral",
"num_attention_heads": 32,
"num_hidden_layers": 40,
"num_key_value_heads": 8,
"rms_norm_eps": 1e-05,
"rope_theta": 1000000000.0,
"sliding_window": null,
"tie_word_embeddings": false,
"torch_dtype": "bfloat16",
"transformers_version": "4.53.1",
"use_cache": true,
"vocab_size": 131072
}