[WIP][AQUA][GPU Shape Recommendation] Support for Service Managed Models #1252
base: main
Changes from all commits
```diff
@@ -78,15 +78,25 @@
 IN_FLIGHT_QUANTIZATION = {"4bit"}  # vLLM only supports 4bit in-flight-quantization

+VLLM_PARAMS_KEY = "VLLM_PARAMS"
+VLLM_ENV_KEY = "VLLM"
+QUANT_FLAG = "--quantization"
+MAX_MODEL_LEN_FLAG = "--max-model-len"

 TROUBLESHOOT_MSG = "The selected model is too large to fit on standard GPU shapes with the current configuration.\nAs troubleshooting, we have suggested the two largest available GPU shapes using the smallest quantization level ('4bit') to maximize chances of fitting the model. "

 VLLM_PARAMS = {
     "max_model_len": "--max-model-len",
     "in_flight_quant": "--quantization bitsandbytes --load-format bitsandbytes",
+    "trust_remote_code": "--trust-remote-code"
 }

+DEFAULT_WEIGHT_SIZE = "bfloat16"

+DEFAULT_MAX_SEQ_LEN = 4096

 DEFAULT_WEIGHT_SIZE = "float32"

 BITS_AND_BYTES_8BIT = "8bit"
 BITS_AND_BYTES_4BIT = "4bit"
```

Review comment (on the new `DEFAULT_WEIGHT_SIZE = "bfloat16"`): It looks like we already have such a variable, three lines below?
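For orientation, here is a quick sketch of how these flag constants compose into a vLLM argument string. It is illustrative only and assumes the constants above are in scope; the actual assembly lives in `construct_deployment_params` in the next file.

```python
# Illustrative composition of the constants above into vLLM CLI arguments.
max_len = 4096  # e.g. DEFAULT_MAX_SEQ_LEN
params = [
    f"{VLLM_PARAMS['max_model_len']} {max_len}",
    VLLM_PARAMS["in_flight_quant"],
]
print(" ".join(params))
# --max-model-len 4096 --quantization bitsandbytes --load-format bitsandbytes
```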
```diff
@@ -131,6 +131,10 @@ def construct_deployment_params(self) -> str:
     # vLLM only supports 4bit in-flight quantization
     params.append(VLLM_PARAMS["in_flight_quant"])

+    # add trust-remote-code if custom modules are specified
+    if c.trust_remote_code:
+        params.append(VLLM_PARAMS["trust_remote_code"])

     params = " ".join(params) if params else ""
     return params
```

Review comment (on `if c.trust_remote_code:`): Could we use a more meaningful name for the config variable?
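A sketch of the rename the reviewer suggests; `llm_config` is a hypothetical name, and the parts of the method body not shown in the hunk are reconstructed, so treat this as illustrative rather than the PR's code:

```python
def construct_deployment_params(self) -> str:
    llm_config = self.llm_config  # hypothetical attribute; the PR calls this `c`
    params = []
    if llm_config.quantization in IN_FLIGHT_QUANTIZATION:  # assumed guard
        # vLLM only supports 4bit in-flight quantization
        params.append(VLLM_PARAMS["in_flight_quant"])
    # add trust-remote-code if custom modules are specified
    if llm_config.trust_remote_code:
        params.append(VLLM_PARAMS["trust_remote_code"])
    return " ".join(params) if params else ""
```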
```diff
@@ -11,6 +11,7 @@
 from ads.aqua.shaperecommend.constants import (
     BITS_AND_BYTES_4BIT,
     BITS_AND_BYTES_8BIT,
+    DEFAULT_MAX_SEQ_LEN,
     DEFAULT_WEIGHT_SIZE,
     NEXT_QUANT,
     QUANT_MAPPING,
```
```diff
@@ -42,7 +43,7 @@ class LLMConfig(BaseModel):
         description="Dimension of each attention head. Typically hidden_size // num_attention_heads.",
     )
     max_seq_len: Optional[int] = Field(
-        4096, description="Maximum input sequence length (context window)."
+        DEFAULT_MAX_SEQ_LEN, description="Maximum input sequence length (context window)."
     )
     weight_dtype: Optional[str] = Field(
         DEFAULT_WEIGHT_SIZE,
```
```diff
@@ -74,7 +75,9 @@ class LLMConfig(BaseModel):
         None, description="For MoE architectures, size of the MLP activation layer."
     )

-    tie_word_embeddings: Optional[bool] = Field(None)
+    tie_word_embeddings: Optional[bool] = Field(True, description="if True, input and output embedding matrices share the same parameters in memory.")
+
+    trust_remote_code: Optional[bool] = Field(False, description="if True, the model requires custom code to operate.")

     @property
     def bytes_per_parameter(self) -> float:
```
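Context for the new `tie_word_embeddings` field: when embeddings are tied, the output head reuses the input embedding matrix, so a memory estimate should count those parameters only once. A rough sketch, using numbers from the Mistral fixture added later in this PR:

```python
tie_word_embeddings = False             # the fixture sets "tie_word_embeddings": false
vocab_size, hidden_size = 131072, 5120  # values from the fixture
embedding_params = vocab_size * hidden_size  # ~0.67B parameters per matrix
# tied: one shared matrix; untied: separate input and output matrices
total_embedding = embedding_params if tie_word_embeddings else 2 * embedding_params
```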
```diff
@@ -207,6 +210,17 @@ def validate_model_support(cls, raw: dict) -> ValueError:
             "Encoder-decoder models (ex. T5, Gemma) and encoder-only (BERT) are not supported at this time."
         )

+    @staticmethod
+    def get_bool(raw, key, default=False):
+        val = raw.get(key)
+        if val is None:
+            return default
+        if isinstance(val, bool):
+            return val
+        if isinstance(val, str):
+            return val.lower() == "true"
+        return bool(val)
+
     @classmethod
     def from_raw_config(cls, raw: dict) -> "LLMConfig":
         """
```
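For illustration, here is how `get_bool` normalizes the mixed value types that show up in raw `config.json` files (the dicts below are made up):

```python
LLMConfig.get_bool({"tie_word_embeddings": True}, "tie_word_embeddings")     # True
LLMConfig.get_bool({"tie_word_embeddings": "False"}, "tie_word_embeddings")  # False ("false" != "true")
LLMConfig.get_bool({}, "tie_word_embeddings", default=True)                  # True (key absent -> default)
LLMConfig.get_bool({"use_cache": 1}, "use_cache")                            # True (falls through to bool())
```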
```diff
@@ -257,6 +271,10 @@ def from_raw_config(cls, raw: dict) -> "LLMConfig":
             "intermediate_size"
         )

+        tie_word_embeddings = LLMConfig.get_bool(raw, "tie_word_embeddings", True)
+
+        trust_remote_code = "auto_map" in raw  # trust-remote-code is always needed when this key is present
+
         # Type safety: minimal assertion
         if None in [
             num_hidden_layers,
```

Review comment (on the assertion below the new lines): Could you add more description for the section below? Can the error be more specific?
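One way to address the request for a more specific error; the exact set of required fields is an assumption, since only `num_hidden_layers` is visible in the hunk:

```python
# Sketch: report exactly which required config fields are missing.
required = {
    "num_hidden_layers": num_hidden_layers,
    "hidden_size": hidden_size,  # assumed to be part of the check
    "vocab_size": vocab_size,    # assumed to be part of the check
}
missing = [name for name, value in required.items() if value is None]
if missing:
    raise ValueError(
        f"Model config is missing required field(s): {', '.join(missing)}. "
        "Verify the model's config.json is complete."
    )
```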
```diff
@@ -280,4 +298,6 @@ def from_raw_config(cls, raw: dict) -> "LLMConfig":
             max_seq_len=int(max_seq_len),
             num_local_experts=num_local_experts,
             intermediate_size=intermediate_size,
+            tie_word_embeddings=tie_word_embeddings,
+            trust_remote_code=trust_remote_code
         )
```
```diff
@@ -93,6 +93,25 @@ def which_shapes(
         shapes = self.valid_compute_shapes(compartment_id=request.compartment_id)

+        ds_model = self._validate_model_ocid(request.model_id)
+
+        model_name = ds_model.display_name if ds_model.display_name else ""
+
+        if request.deployment_config:
+            shape_recommendation_report = (
+                ShapeRecommendationReport.from_deployment_config(
+                    request.deployment_config, model_name, shapes
+                )
+            )
+
+        else:
+            data = self._get_model_config(ds_model)
+
+            llm_config = LLMConfig.from_raw_config(data)
+
+            shape_recommendation_report = self._summarize_shapes_for_seq_lens(
+                llm_config, shapes, model_name
+            )

         data = self._get_model_config(ds_model)

         llm_config = LLMConfig.from_raw_config(data)
```

Review comment (on `self._validate_model_ocid`): I noticed that this method also retrieves model details, but the function name doesn't suggest that it returns the model config as well. I'd recommend separating the retrieval of model details from this function to make its purpose clearer and more consistent.

Review comment (on `which_shapes`): Could you update the description for this method? Specifically the input params.

Reply (on the `if request.deployment_config:` branch): We check if a deployment_config was successfully obtained, and if so we will generate the report immediately.

Review comment (on the repeated `data = self._get_model_config(ds_model)` lines): Not sure why we would need to repeat the same lines again?
```diff
@@ -102,7 +121,7 @@ def which_shapes(
         shape_recommendation_report = self._summarize_shapes_for_seq_lens(
             llm_config, shapes, model_name
         )

         if request.generate_table and shape_recommendation_report.recommendations:
             shape_recommendation_report = self._rich_diff_table(
                 shape_recommendation_report
```
```diff
@@ -248,13 +267,19 @@ def _rich_diff_table(shape_report: ShapeRecommendationReport) -> Table:
         else:
             total_memory = f"CPU: {str(shape.memory_in_gbs)}"

+        if model:
+            model_size = str(model.total_model_gb)
+        else:
+            model_size = "Using Pre-Defined Config"

         table.add_row(
             shape.name,
             str(shape.available),
             str(shape.shape_series),
             str(gpu.gpu_count),
             total_memory,
-            str(model.total_model_gb),
+            model_size,
             deploy.quantization,
             recommendation,
         )
```

Review comment (on `model_size = "Using Pre-Defined Config"`): Maybe we should use …
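The fallback can also be written as a conditional expression, which some reviewers prefer for simple either/or assignments. This is a style sketch only; the truncated comment above may have suggested something else entirely:

```python
model_size = str(model.total_model_gb) if model else "Using Pre-Defined Config"
```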
```diff
@@ -2,11 +2,22 @@
 # Copyright (c) 2025 Oracle and/or its affiliates.
 # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/

+import json
+
 from typing import List, Optional

 from pydantic import BaseModel, Field

 from ads.aqua.common.entities import ComputeShapeSummary
+from ads.aqua.modeldeployment.config_loader import AquaDeploymentConfig
+from ads.aqua.shaperecommend.constants import (
+    DEFAULT_WEIGHT_SIZE,
+    MAX_MODEL_LEN_FLAG,
+    QUANT_FLAG,
+    QUANT_MAPPING,
+    VLLM_ENV_KEY,
+    VLLM_PARAMS_KEY,
+)
 from ads.aqua.shaperecommend.constants import QUANT_MAPPING
 from ads.aqua.shaperecommend.estimator import MemoryEstimator
 from ads.config import COMPARTMENT_OCID
```
```diff
@@ -30,6 +41,10 @@ class RequestRecommend(BaseModel):
         COMPARTMENT_OCID, description="The OCID of user's compartment"
     )

+    deployment_config: Optional[AquaDeploymentConfig] = Field(
+        {}, description="The deployment configuration for model (only available for service models)."
+    )

     class Config:
         protected_namespaces = ()
```

Review comment (on `deployment_config`): I think it would be better to do: …
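The suggestion above is cut off in the source. One common alternative for a field like this, avoiding a mutable `{}` default that does not match the declared `Optional[AquaDeploymentConfig]` type, would be the following (an assumption, not necessarily the reviewer's intent):

```python
deployment_config: Optional[AquaDeploymentConfig] = Field(
    None,
    description="The deployment configuration for the model (service models only).",
)
```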
```diff
@@ -42,7 +57,9 @@ class DeploymentParams(BaseModel):  # noqa: N801
     quantization: Optional[str] = Field(
         None, description="Type of quantization (e.g. 4bit)."
     )
+    max_model_len: Optional[int] = Field(None, description="Maximum length of input sequence.")
+    max_model_len: int = Field(..., description="Maximum length of input sequence.")

     params: str = Field(
         ..., description="Runtime parameters for deployment with vLLM, etc."
     )
```

Review comment (on the duplicated `max_model_len` fields): Merge conflict?
```diff
@@ -68,6 +85,13 @@ class ModelConfig(BaseModel):
     The configuration for a model based on specific set of deployment parameters and memory capacity of shape.
     """

+    deployment_params: DeploymentParams = Field(
+        ..., description="Parameters for deployment."
+    )
+    model_details: Optional[ModelDetail] = Field(None, description="Details about the model.")
+
+    recommendation: Optional[str] = Field("", description="GPU recommendation for the model.")

     model_details: ModelDetail = Field(..., description="Details about the model.")
     deployment_params: DeploymentParams = Field(
         ..., description="Parameters for deployment."
```
```diff
@@ -231,3 +255,89 @@ class ShapeRecommendationReport(BaseModel):
         None,
         description="Details for troubleshooting if no shapes fit the current model.",
     )

+    @classmethod
+    def from_deployment_config(cls, deployment_config: AquaDeploymentConfig, model_name: str, valid_shapes: List[ComputeShapeSummary]) -> "ShapeRecommendationReport":
+        """
+        Creates a ShapeRecommendationReport from an AquaDeploymentConfig, extracting recommended
+        model configurations for each valid compute shape.
+
+        Parameters
+        ----------
+        deployment_config : AquaDeploymentConfig
+            The object containing per-shape deployment configurations.
+        model_name : str
+            The name of the model for which to generate recommendations.
+        valid_shapes : list of ComputeShapeSummary
+            List of compute shapes to evaluate and recommend deployment configurations for.
+
+        Returns
+        -------
+        ShapeRecommendationReport
+            Report containing recommendations for each valid compute shape.
+
+        Notes
+        -----
+        For service models, this method interprets pre-set deployment configurations to derive
+        recommendations for each allowed compute shape, including environment variables, quantization,
+        and maximum model length parameters.
+        """
+        recs = []
+        for shape in valid_shapes:
+            current_config = deployment_config.configuration.get(shape.name)
+            if not current_config:
+                continue
+
+            quantization = None
+            max_model_len = None
+            recommendation = ""
+            current_params = current_config.parameters.get(VLLM_PARAMS_KEY)
+            current_env = current_config.env.get(VLLM_ENV_KEY)
+
+            if current_params:
+                param_list = current_params.split()
+
+                if QUANT_FLAG in param_list:
+                    idx = param_list.index(QUANT_FLAG)
+                    if idx + 1 < len(param_list):
+                        quantization = param_list[idx + 1]
+
+                if MAX_MODEL_LEN_FLAG in param_list:
+                    idx = param_list.index(MAX_MODEL_LEN_FLAG)
+                    if idx + 1 < len(param_list):
+                        try:
+                            max_model_len = int(param_list[idx + 1])
+                        except ValueError:
+                            max_model_len = None
+
+            if current_env:
+                recommendation += f"ENV: {json.dumps(current_env)}\n\n"
+
+            if not current_params and not current_env:  # model works with default params and no extra env variables
+                recommendation += "No override PARAMS and ENV variables needed. \n\n"
+
+            recommendation += "Model fits well within the allowed compute shape."
+
+            deployment_params = DeploymentParams(
+                quantization=quantization if quantization else DEFAULT_WEIGHT_SIZE,
+                max_model_len=max_model_len,
+                params=current_params if current_params else "",
+            )
+
+            # need to adjust for multiple configs per shape
+            configuration = [
+                ModelConfig(
+                    deployment_params=deployment_params,
+                    recommendation=recommendation,
+                )
+            ]
+
+            recs.append(
+                ShapeReport(
+                    shape_details=shape,
+                    configurations=configuration,
+                )
+            )
+
+        return ShapeRecommendationReport(
+            display_name=model_name,
+            recommendations=recs,
+        )
```

Review comment (on the `from_deployment_config` signature): Please use the formatter to format the code.
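A hypothetical call site for the new classmethod; argument values are illustrative, and `request` and `shapes` are assumed to be in scope as in `which_shapes` above:

```python
report = ShapeRecommendationReport.from_deployment_config(
    deployment_config=request.deployment_config,
    model_name="example-service-model",  # illustrative display name
    valid_shapes=shapes,
)
for shape_report in report.recommendations:
    cfg = shape_report.configurations[0]
    print(shape_report.shape_details.name, "->", cfg.recommendation)
```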
New file (@@ -0,0 +1,27 @@), a Mistral model configuration JSON:

```json
{
  "architectures": [
    "MistralForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "pad_token_id": 11,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 5120,
  "initializer_range": 0.02,
  "intermediate_size": 32768,
  "max_position_embeddings": 131072,
  "model_type": "mistral",
  "num_attention_heads": 32,
  "num_hidden_layers": 40,
  "num_key_value_heads": 8,
  "rms_norm_eps": 1e-05,
  "rope_theta": 1000000000.0,
  "sliding_window": null,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.53.1",
  "use_cache": true,
  "vocab_size": 131072
}
```
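A hedged sketch of how this fixture flows through the new code path; the file path is illustrative, and the expected values follow from the diff logic above:

```python
import json

with open("config.json") as f:  # path to the fixture above; illustrative
    raw = json.load(f)

llm_config = LLMConfig.from_raw_config(raw)
print(llm_config.tie_word_embeddings)  # False -- parsed via LLMConfig.get_bool
print(llm_config.trust_remote_code)    # False -- the fixture has no "auto_map" key
```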
Review comment: Could you update the description and add acceptable params for kwargs?