[VLM] Server Scenario, performance and accuracy (#2388)

johncalesp · web-flow · commit 32b6d66dcf6b · 2025-11-18T08:52:20.000+04:00
* initial server version

* change description for parameter use_token_latencies

* [Automated Commit] Format Codebase

* changes based on PR comments
diff --git a/multimodal/vl2l/README.md b/multimodal/vl2l/README.md
@@ -107,22 +107,28 @@ docker run --gpus all \                                 # Use all the GPUs on th
 Performance only mode:
 
 ```bash
-mlperf-inf-mm-vl2l --settings.senario offline --settings.mode performance_only
+mlperf-inf-mm-vl2l --settings.test.scenario offline --settings.test.mode performance_only
 ```
 
 Accuracy only mode:
 
-TBD
+```bash
+mlperf-inf-mm-vl2l --settings.test.scenario offline --settings.test.mode accuracy_only
+```
 
 ### Run the benchmark for the Server scenario
 
 Performance only mode:
 
-TBD
+```bash
+mlperf-inf-mm-vl2l --settings.test.scenario server --settings.test.mode performance_only
+```
 
 Accuracy only mode:
 
-TBD
+```bash
+mlperf-inf-mm-vl2l --settings.test.scenario server --settings.test.mode accuracy_only
+```
 
 ## Developer Guide
 
diff --git a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/cli.py b/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/cli.py
@@ -5,11 +5,12 @@
 import sys
 from datetime import timedelta
 from enum import StrEnum, auto
+from pathlib import Path
 from typing import Annotated
 
 import mlperf_loadgen as lg
 from loguru import logger
-from pydantic import BaseModel, Field, field_validator
+from pydantic import BaseModel, DirectoryPath, Field, field_validator
 from pydantic_typer import Typer
 from typer import Option
 
@@ -74,6 +75,36 @@ def __init__(self, test_mode: TestMode) -> None:
         super().__init__(f"Unknown test mode: {test_mode}")
 
 
+class LoggingMode(StrEnum):
+    """Specifies when logging should be sampled and stringified."""
+
+    ASYNC_POLL = auto()
+    """Logs are serialized and output on an IOThread that polls for new logs
+    at a fixed interval. This is the only mode currently implemented."""
+
+    END_OF_TEST_ONLY = auto()
+    """Not implemented."""
+
+    SYNCHRONOUS = auto()
+    """Not implemented."""
+
+    def to_lgtype(self) -> lg.LoggingMode:
+        """Convert to the LoadGen type; unmapped modes raise an error."""
+        match self:
+            case LoggingMode.ASYNC_POLL:
+                return lg.LoggingMode.AsyncPoll
+            case _:  # The exception requires the offending mode as argument.
+                raise UnknownLoggingModeValueError(self)
+
+
+class UnknownLoggingModeValueError(ValueError):
+    """The exception raised when an unknown logging mode is encountered."""
+
+    def __init__(self, test_mode: LoggingMode) -> None:
+        """Initialize the exception with the offending logging mode."""
+        super().__init__(f"Unknown logging mode: {test_mode}")
+
+
 class TestSettings(BaseModel):
     """The test settings for the MLPerf inference LoadGen."""
 
@@ -102,27 +133,73 @@ class TestSettings(BaseModel):
         ),
     ] = 100
 
+    server_expected_qps: Annotated[
+        float,
+        Field(
+            description="The expected QPS for the server scenario. "
+            "Loadgen will try to send as many request as necessary "
+            "to achieve this value.",
+        ),
+    ] = 1
+
+    server_target_latency: Annotated[
+        timedelta,
+        Field(description="""Expected latency constraint for Server scenario.
+        This is a constraint that we expect depending
+              on the argument server_expected_qps.
+        When server_expected_qps increases, we expect the latency to also increase.
+        When server_expected_qps decreases, we expect the latency to also decrease."""),
+    ] = timedelta(seconds=1)
+
+    server_ttft_latency: Annotated[
+        timedelta,
+        Field(description="""Time to First Token (TTFT)
+              latency constraint result validation"
+              (used when use_token_latencies is enabled)."""),
+    ] = timedelta(seconds=1)
+
+    server_tpot_latency: Annotated[
+        timedelta,
+        Field(description="""Time per Output Token (TPOT)
+              latency constraint result validation"
+              (used when use_token_latencies is enabled)."""),
+    ] = timedelta(seconds=1)
+
     min_duration: Annotated[
         timedelta,
         Field(
-            description=(
-                "The minimum testing duration (in seconds or ISO 8601 format like"
-                " PT5S)."
-            ),
+            description="""The minimum testing duration
+                (in seconds or ISO 8601 format like PT5S).
+                The benchmark runs until this value has been met.""",
         ),
     ] = timedelta(seconds=5)
 
+    min_query_count: Annotated[
+        int,
+        Field(
+            description="""The minimum testing query count.
+            The benchmark runs until this value has been met.""",
+        ),
+    ] = 100
+
     use_token_latencies: Annotated[
         bool,
         Field(
-            description="When set to True, LoadGen will track TTFT and TPOT.",
+            description="""By default,
+            the Server scenario will use server_target_latency as the constraint.
+            When set to True, the Server scenario will use server_ttft_latency
+            and server_tpot_latency as the constraint.""",
         ),
-    ] = True
+    ] = False
 
-    @field_validator("min_duration", mode="before")
+    @field_validator("server_target_latency",
+                     "server_ttft_latency",
+                     "server_tpot_latency",
+                     "min_duration",
+                     mode="before")
     @classmethod
-    def parse_min_duration(cls, value: timedelta |
-                           float | str) -> timedelta | str:
+    def parse_timedelta(cls, value: timedelta |
+                        float | str) -> timedelta | str:
         """Parse timedelta from seconds (int/float/str) or ISO 8601 format."""
         if isinstance(value, timedelta):
             return value
@@ -144,12 +221,133 @@ def to_lgtype(self) -> lg.TestSettings:
         settings.scenario = self.scenario.to_lgtype()
         settings.mode = self.mode.to_lgtype()
         settings.offline_expected_qps = self.offline_expected_qps
+        settings.server_target_qps = self.server_expected_qps
+        settings.server_target_latency_ns = round(
+            self.server_target_latency.total_seconds() * 1e9)
+        settings.ttft_latency = round(
+            self.server_ttft_latency.total_seconds() * 1e9)
+        settings.tpot_latency = round(
+            self.server_tpot_latency.total_seconds() * 1e9)
         settings.min_duration_ms = round(
             self.min_duration.total_seconds() * 1000)
+        settings.min_query_count = self.min_query_count
         settings.use_token_latencies = self.use_token_latencies
         return settings
 
 
+class LogOutputSettings(BaseModel):
+    """The test log output settings for the MLPerf inference LoadGen."""
+    outdir: Annotated[
+        DirectoryPath,
+        Field(
+            description="Where to save the output files from the benchmark.",
+        ),
+    ] = Path("output")
+    prefix: Annotated[
+        str,
+        Field(
+            description="Modify the filenames of the logs with a prefix.",
+        ),
+    ] = "mlperf_log_"
+    suffix: Annotated[
+        str,
+        Field(
+            description="Modify the filenames of the logs with a suffix.",
+        ),
+    ] = ""
+    prefix_with_datetime: Annotated[
+        bool,
+        Field(
+            description="Modify the filenames of the logs with a datetime.",
+        ),
+    ] = False
+    copy_detail_to_stdout: Annotated[
+        bool,
+        Field(
+            description="Print details of performance test to stdout.",
+        ),
+    ] = False
+    copy_summary_to_stdout: Annotated[
+        bool,
+        Field(
+            description="Print results of performance test to terminal.",
+        ),
+    ] = True
+
+    @field_validator("outdir", mode="before")
+    @classmethod
+    def parse_directory_field(cls, value: str | Path) -> Path:
+        """Create the output directory (with parents) and return its path."""
+        path = Path(value)
+        path.mkdir(parents=True, exist_ok=True)  # allow a nested outdir
+        return path
+
+    def to_lgtype(self) -> lg.LogOutputSettings:
+        """Convert the log output settings to its corresponding LoadGen type."""
+        log_output_settings = lg.LogOutputSettings()
+        log_output_settings.outdir = self.outdir.as_posix()
+        log_output_settings.prefix = self.prefix
+        log_output_settings.suffix = self.suffix
+        log_output_settings.prefix_with_datetime = self.prefix_with_datetime
+        log_output_settings.copy_detail_to_stdout = self.copy_detail_to_stdout
+        log_output_settings.copy_summary_to_stdout = self.copy_summary_to_stdout
+        return log_output_settings
+
+
+class LogSettings(BaseModel):
+    """The test log settings for the MLPerf inference LoadGen."""
+    # A factory is used so that the default is a LogOutputSettings instance.
+    log_output: Annotated[
+        LogOutputSettings,
+        Field(
+            description="Log output settings",
+            default_factory=LogOutputSettings,
+        ),
+    ]
+    log_mode: Annotated[
+        LoggingMode,
+        Field(
+            description="""How and when logging should be
+            sampled and stringified at runtime""",
+        ),
+    ] = LoggingMode.ASYNC_POLL
+    enable_trace: Annotated[
+        bool,
+        Field(description="Enable trace"),
+    ] = True
+
+    def to_lgtype(self) -> lg.LogSettings:
+        """Convert log settings to its corresponding LoadGen type."""
+        log_settings = lg.LogSettings()
+        log_settings.log_output = self.log_output.to_lgtype()
+        log_settings.log_mode = self.log_mode.to_lgtype()
+        log_settings.enable_trace = self.enable_trace
+        return log_settings
+
+
+class Settings(BaseModel):
+    """Combine the settings for the test and logging of LoadGen."""
+    test: Annotated[
+        TestSettings,
+        Field(
+            description="Test settings parameters.",
+            default_factory=TestSettings,
+        ),
+    ]
+
+    logging: Annotated[
+        LogSettings,
+        Field(
+            description="Test logging parameters",
+            default_factory=LogSettings,
+        ),
+    ]
+
+    def to_lgtype(self) -> tuple[lg.TestSettings, lg.LogSettings]:
+        """Return the LoadGen (test settings, log settings) pair."""
+        return self.test.to_lgtype(), self.logging.to_lgtype()
+
+
 class Model(BaseModel):
     """Specifies the model to use for the VL2L benchmark."""
 
@@ -211,7 +409,7 @@ class Endpoint(BaseModel):
 @app.command()
 def main(
     *,
-    settings: TestSettings,
+    settings: Settings,
     model: Model,
     dataset: Dataset,
     endpoint: Endpoint,
@@ -234,17 +432,18 @@ def main(
         "Running VL2L benchmark with OpenAI API endpoint: {}",
         endpoint)
     logger.info("Running VL2L benchmark with random seed: {}", random_seed)
-    lg_settings = settings.to_lgtype()
+    test_settings, log_settings = settings.to_lgtype()
     task = ShopifyGlobalCatalogue(
         dataset_cli=dataset,
         model_cli=model,
         endpoint_cli=endpoint,
+        scenario=settings.test.scenario,
         random_seed=random_seed,
     )
     sut = task.construct_sut()
     qsl = task.construct_qsl()
     logger.info("Starting the VL2L benchmark with LoadGen...")
-    lg.StartTest(sut, qsl, lg_settings)
+    lg.StartTestWithLogSettings(sut, qsl, test_settings, log_settings)
     logger.info("The VL2L benchmark with LoadGen completed.")
     lg.DestroyQSL(qsl)
     lg.DestroySUT(sut)
diff --git a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/task.py b/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/task.py