
Commit dedbe33

committed: reflect the latest version of the dataset
1 parent 586146b commit dedbe33

4 files changed: +162 -88 lines changed


multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/cli.py

Lines changed: 7 additions & 4 deletions

@@ -27,6 +27,11 @@
 
 @app.command()
 def evaluate(
+    *,
+    random_seed: Annotated[
+        int,
+        Option(help="The seed for the random number generator used by the benchmark."),
+    ] = 12345,
     filename: Annotated[
         FilePath,
         Option(
@@ -37,7 +42,7 @@ def evaluate(
 ) -> None:
     """Evaluate the accuracy of the VLM responses."""
     logger.info("Evaluating the accuracy file")
-    run_evaluation(filename=filename, dataset=dataset)
+    run_evaluation(random_seed=random_seed, filename=filename, dataset=dataset)
 
 
 @benchmark_app.command(name="endpoint")
@@ -78,9 +83,7 @@ def _run_benchmark(
     """Run the VL2L benchmark."""
     logger.info("Running VL2L benchmark with settings: {}", settings)
     logger.info("Running VL2L benchmark with dataset: {}", dataset)
-    logger.info(
-        "Running VL2L benchmark with OpenAI API endpoint: {}",
-        endpoint)
+    logger.info("Running VL2L benchmark with OpenAI API endpoint: {}", endpoint)
     logger.info("Running VL2L benchmark with random seed: {}", random_seed)
     test_settings, log_settings = settings.to_lgtype()
     task = ShopifyGlobalCatalogue(
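
Note on the hunks above: the new --random-seed option (default 12345) is threaded from the CLI into run_evaluation, so the random baselines computed during evaluation become reproducible. A minimal sketch of the property being relied on, using only NumPy (the values drawn from are illustrative):

import numpy as np

# Two generators built from the same seed produce identical draws, so the
# random category / is_secondhand / brand baselines are stable across runs.
rng_a = np.random.default_rng(seed=12345)
rng_b = np.random.default_rng(seed=12345)

assert rng_a.choice([True, False]) == rng_b.choice([True, False])
assert rng_a.choice(["brand-a", "brand-b", "brand-c"]) == rng_b.choice(["brand-a", "brand-b", "brand-c"])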

multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/evaluation.py

Lines changed: 98 additions & 55 deletions

@@ -11,7 +11,7 @@
 from hiclass.metrics import f1  # type: ignore[import-untyped]
 from loguru import logger
 from pydantic import ValidationError
-from rapidfuzz import fuzz
+from rapidfuzz import fuzz  # type: ignore[import-untyped]
 from sklearn.metrics import f1_score  # type: ignore[import-untyped]
 from tabulate import tabulate
 
@@ -22,24 +22,23 @@
 
 from .schema import ProductMetadata
 
-# Initialize the Generator
-# As of NumPy 1.17+,
-# this isolates the random state,
-# which is safer for reproducibility and parallel processing.
-rng = np.random.default_rng()
+_TRUE_CATEGORY_PAD = "<|__TRUE_CATEGORY_PAD__|>"
+_PRED_CATEGORY_PAD = "<|__PRED_CATEGORY_PAD__|>"
+_PRED_BRAND_PAD = "<|__PRED_BRAND_PAD__|>"
+_CATEGORY_SEPARATOR = " > "
 
 
 def get_hierarchical_components(
     predicted_path: str,
     true_path: str,
-    separator: str = " > ",
+    separator: str = _CATEGORY_SEPARATOR,
 ) -> tuple[int, int, int]:
     """Calculates the components for Hierarchical Precision.
 
     Args:
         predicted_path: Categories predicted by the VLM.
         true_path: Ground truth categories.
-        separator: String used to separate each category.
+        separator: The separator used to separate each level of the category.
 
     Returns:
         Tuple of number of intersections,
@@ -58,8 +57,7 @@ def get_hierarchical_components(
     intersection_count = 0
 
     # Iterate through the paths simultaneously
-    for pred_cat, true_cat in zip(
-            predicted_categories, true_categories, strict=False):
+    for pred_cat, true_cat in zip(predicted_categories, true_categories, strict=False):
         if pred_cat == true_cat:
             intersection_count += 1
         else:
@@ -72,12 +70,15 @@ def get_hierarchical_components(
     return intersection_count, pred_length, true_length
 
 
-def calculate_hierarchical_f1(data: list[tuple[str, str]]) -> float:
+def calculate_hierarchical_f1(
+    data: list[tuple[str, str]],
+    separator: str = _CATEGORY_SEPARATOR,
+) -> float:
     """Calculates the aggregate hF scores for a list of samples.
 
     Args:
-        data: A list of tuples, where each tuple is
-            (predicted_path_str, true_path_str).
+        data: A list of tuples, where each tuple is (predicted_path_str, true_path_str).
+        separator: The separator used to split the paths into levels of the category.
 
     Returns:
         F1 score
@@ -89,8 +90,9 @@ def calculate_hierarchical_f1(data: list[tuple[str, str]]) -> float:
     # 1. Aggregate the components across all samples
     for pred_path, true_path in data:
         intersection, pred_len, true_len = get_hierarchical_components(
-            pred_path,
-            true_path,
+            predicted_path=pred_path,
+            true_path=true_path,
+            separator=separator,
         )
 
         total_intersection += intersection
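
For context on the hunks above: get_hierarchical_components counts, per sample, how many category levels of the predicted path agree with the true path (the common prefix, per the visible loop), plus the two path lengths, and calculate_hierarchical_f1 micro-averages those components into precision, recall, and F1. A small self-contained sketch of that computation; the paths and the exact aggregation details are illustrative, reconstructed from the visible code rather than copied from the module:

def hierarchical_f1(pairs: list[tuple[str, str]], separator: str = " > ") -> float:
    """Sketch of a micro-averaged hierarchical F1 over (predicted, true) path pairs."""
    total_inter = total_pred = total_true = 0
    for pred_path, true_path in pairs:
        pred = pred_path.split(separator)
        true = true_path.split(separator)
        inter = 0
        for pred_cat, true_cat in zip(pred, true):
            if pred_cat != true_cat:
                break  # stop at the first level that disagrees
            inter += 1
        total_inter += inter
        total_pred += len(pred)
        total_true += len(true)
    precision = total_inter / total_pred  # sketch only: no zero-division guard
    recall = total_inter / total_true
    return 2 * precision * recall / (precision + recall)


# One sample: the first two levels match, the leaf does not -> hP = hR = 2/3.
print(hierarchical_f1([("Apparel > Shoes > Boots", "Apparel > Shoes > Sneakers")]))  # ~0.667
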
@@ -156,20 +158,25 @@ def calculate_secondhand_f1(data: list[tuple[bool, bool]]) -> float:
     return f1_score(y_src, y_pred)
 
 
-def calculate_hiclass_f1(data: list[tuple[str, str]]) -> float:
+def calculate_hiclass_f1(
+    data: list[tuple[str, str]],
+    separator: str = _CATEGORY_SEPARATOR,
+) -> float:
     """Alt method to calculate hierarchical F1.
 
     Args:
-        data: List of tuples of predicted and true values
+        data: List of tuples of predicted and true values
+        separator: The separator used to split the paths into levels of the category.
+
     Returs:
         f1 score
     """
     y_pred_raw = []
     y_true_raw = []
 
     for pred, src in data:
-        path1 = pred.split(" > ")
-        path2 = src.split(" > ")
+        path1 = pred.split(separator)
+        path2 = src.split(separator)
 
         y_pred_raw.append(path1)
         y_true_raw.append(path2)
@@ -182,11 +189,11 @@ def calculate_hiclass_f1(data: list[tuple[str, str]]) -> float:
     for i in range(len(y_true_raw)):
         # Pad Truth
         pad_len_true = max_len - len(y_true_raw[i])
-        y_true_raw[i] += [""] * pad_len_true
+        y_true_raw[i] += [_TRUE_CATEGORY_PAD] * pad_len_true
 
         # Pad Prediction
         pad_len_pred = max_len - len(y_pred_raw[i])
-        y_pred_raw[i] += [""] * pad_len_pred
+        y_pred_raw[i] += [_PRED_CATEGORY_PAD] * pad_len_pred
 
     # 4. Convert to numpy arrays
     y_true = np.array(y_true_raw)
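
On the padding change above: hiclass works on equal-length label rows, so shorter paths are padded up to max_len. Padding both truth and prediction with the same filler (previously "") can let padded depths count as agreements; giving each side its own sentinel avoids that. A tiny illustration of the padding step with toy paths (not dataset values), mirroring the in-place list padding shown in the diff:

_TRUE_PAD = "<|__TRUE_CATEGORY_PAD__|>"
_PRED_PAD = "<|__PRED_CATEGORY_PAD__|>"

true_paths = [["Apparel", "Shoes"], ["Apparel", "Shoes", "Sneakers"]]
pred_paths = [["Apparel"], ["Apparel", "Shoes", "Sneakers"]]

max_len = max(len(path) for path in true_paths + pred_paths)
for paths, pad in ((true_paths, _TRUE_PAD), (pred_paths, _PRED_PAD)):
    for path in paths:
        path += [pad] * (max_len - len(path))  # pad in place, as in the diff

# With distinct sentinels, the padded depths of the first sample disagree
# instead of matching "for free" as two identical empty strings would.
print(true_paths[0])
print(pred_paths[0])
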
@@ -196,8 +203,9 @@ def calculate_hiclass_f1(data: list[tuple[str, str]]) -> float:
     return f1(y_true, y_pred)
 
 
-def run_evaluation(filename: FilePath, dataset: DatasetCLI) -> None:
+def run_evaluation(random_seed: int, filename: FilePath, dataset: DatasetCLI) -> None:
     """Main function to run the evaluation."""
+    rng = np.random.default_rng(seed=random_seed)
     with Path.open(filename) as f:
         model_output = json.load(f)
 
@@ -207,26 +215,43 @@ def run_evaluation(filename: FilePath, dataset: DatasetCLI) -> None:
         split="+".join(dataset.split),
     )
 
+    num_unparsable_responses = 0
     category_dataset_pred_src = []
     category_rand_pred_src = []
     is_secondhand_pred_src = []
     is_secondhand_rand_pred_src = []
     brand_pred_src = []
 
+    all_possible_brands = set()
+
     for elem in model_output:
         idx = elem["qsl_idx"]
         response = bytes.fromhex(elem["data"]).decode("utf-8")
+        ground_truth_item = original_data[idx]
+        all_possible_brands.add(ground_truth_item["ground_truth_brand"])
         try:
             pred_item = ProductMetadata.model_validate_json(response)
         except ValidationError:
-            logger.exception(
+            num_unparsable_responses += 1
+            pred_item = ProductMetadata(
+                category=_CATEGORY_SEPARATOR.join(
+                    [_PRED_CATEGORY_PAD]
+                    * len(
+                        ground_truth_item["ground_truth_category"].split(
+                            _CATEGORY_SEPARATOR,
+                        ),
+                    ),
+                ),
+                brand=_PRED_BRAND_PAD,
+                is_secondhand=rng.choice([True, False], size=1).tolist()[0],
+            )
+            logger.error(
                 "Response\n{}\n(for the sample at index {}) cannot be validated against"
-                " the expected schema\n{}\n. Thus, this submission result is invalid.",
+                " the expected schema. Overwriting this response into \n{}\n",
                 response,
                 idx,
-                json.dumps(ProductMetadata.model_json_schema(), indent=2),
+                pred_item,
             )
-        ground_truth_item = original_data[idx]
         category_dataset_pred_src.append(
             (pred_item.category, ground_truth_item["ground_truth_category"]),
         )
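
On the fallback above: previously an unparsable response was only logged as invalidating the submission; now it is counted and replaced by a deliberately non-matching prediction, built from a category of pad tokens with the same depth as the ground truth (so length accounting stays consistent while nothing intersects), a pad brand, and a seeded coin flip for is_secondhand. A compact sketch of that construction; the ProductMetadata field set here is a minimal stand-in inferred from the diff, not the project's actual schema module:

import numpy as np
from pydantic import BaseModel


class ProductMetadata(BaseModel):  # minimal stand-in for the project's schema
    category: str
    brand: str
    is_secondhand: bool


rng = np.random.default_rng(seed=12345)
separator = " > "
ground_truth_category = "Apparel > Shoes > Sneakers"  # illustrative ground truth
depth = len(ground_truth_category.split(separator))

fallback = ProductMetadata(
    category=separator.join(["<|__PRED_CATEGORY_PAD__|>"] * depth),  # same depth, zero overlap
    brand="<|__PRED_BRAND_PAD__|>",  # can never equal a real ground-truth brand
    is_secondhand=bool(rng.choice([True, False])),  # seeded coin flip
)
print(fallback)
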
@@ -236,48 +261,66 @@ def run_evaluation(filename: FilePath, dataset: DatasetCLI) -> None:
                 ground_truth_item["ground_truth_is_secondhand"],
             ),
         )
+        brand_pred_src.append(
+            (pred_item.brand, ground_truth_item["ground_truth_brand"]),
+        )
         # random category selection
         # Uniform distribution is the default
-        rand_cat = rng.choice(ground_truth_item["potential_product_categories"],
-                              size=1).tolist()[0]
-        category_rand_pred_src.append((rand_cat,
-                                       ground_truth_item["ground_truth_category"]))
-
+        rand_cat = rng.choice(ground_truth_item["potential_product_categories"])
+        category_rand_pred_src.append(
+            (rand_cat, ground_truth_item["ground_truth_category"]),
+        )
         # random is_secondhand selection
-        rand_is_secondhand = rng.choice([True, False], size=1).tolist()[0]
-        is_secondhand_rand_pred_src.append((rand_is_secondhand,
-                                            ground_truth_item["ground_truth_is_secondhand"]))
-
-        brand_pred_src.append((pred_item.brand,
-                               ground_truth_item["ground_truth_brand"]))
+        rand_is_secondhand = rng.choice([True, False])
+        is_secondhand_rand_pred_src.append(
+            (rand_is_secondhand, ground_truth_item["ground_truth_is_secondhand"]),
+        )
 
     category_f1_score = calculate_hierarchical_f1(category_dataset_pred_src)
     hiclass_f1_score = calculate_hiclass_f1(category_dataset_pred_src)
     is_secondhand_f1_score = calculate_secondhand_f1(is_secondhand_pred_src)
     brand_score = calculate_brand_f1_score(brand_pred_src)
 
     rand_cat_f1_score = calculate_hierarchical_f1(category_rand_pred_src)
-    rand_hiclass_f1_score = calculate_hierarchical_f1(category_rand_pred_src)
-    rand_is_seconhand_f1_score = calculate_secondhand_f1(
-        is_secondhand_rand_pred_src)
-
-    data = [
-        ["category", category_f1_score, hiclass_f1_score,
-         rand_cat_f1_score, rand_hiclass_f1_score, 0],
-        ["is_secondhand", is_secondhand_f1_score, 0,
-         rand_is_seconhand_f1_score, 0, 0],
-        ["brand", 0, 0, 0, 0, brand_score],
-    ]
+    rand_hiclass_f1_score = calculate_hiclass_f1(category_rand_pred_src)
+    rand_is_seconhand_f1_score = calculate_secondhand_f1(is_secondhand_rand_pred_src)
+    rand_brand_score = calculate_brand_f1_score(
+        [
+            (
+                rng.choice(list(all_possible_brands)),
+                original_data[elem["qsl_idx"]]["ground_truth_brand"],
+            )
+            for elem in model_output
+        ],
+    )
 
     logger.info(
-        "Results:\n{}",
+        "{} responses cannot be parsed against the expected schema. Results:\n{}",
+        num_unparsable_responses,
         tabulate(
-            data,
-            headers=["Fields", "F1 Score",
-                     "HiClass F1 Score",
-                     "F1 Score Random Selection",
-                     "HiClass F1 Score Random Selection",
-                     "Brand F1 Score"],
+            [
+                [
+                    "From accuracy file",
+                    category_f1_score,
+                    hiclass_f1_score,
+                    brand_score,
+                    is_secondhand_f1_score,
+                ],
+                [
+                    "Random selection",
+                    rand_cat_f1_score,
+                    rand_hiclass_f1_score,
+                    rand_brand_score,
+                    rand_is_seconhand_f1_score,
+                ],
+            ],
+            headers=[
+                "Results",
+                "Category hierarchical F1 Score",
+                "Category HiClass F1 Score",
+                "Brand F1 Score",
+                "Is_secondhand F1 Score",
+            ],
             tablefmt="fancy_grid",
         ),
     )
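
Net effect of the last hunk: the summary table now has one row per source of predictions (the accuracy file vs. random selection) and one column per metric, so each score can be read directly against its chance-level baseline, and the count of unparsable responses is reported alongside it. A sketch of the resulting layout with placeholder numbers, not real results:

from tabulate import tabulate

print(
    tabulate(
        [
            ["From accuracy file", 0.81, 0.79, 0.64, 0.92],  # placeholder scores
            ["Random selection", 0.22, 0.20, 0.01, 0.50],  # placeholder scores
        ],
        headers=[
            "Results",
            "Category hierarchical F1 Score",
            "Category HiClass F1 Score",
            "Brand F1 Score",
            "Is_secondhand F1 Score",
        ],
        tablefmt="fancy_grid",
    ),
)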
