Add Hub verification token to evaluation metadata #1142

Merged · 13 commits · Nov 4, 2022
10 changes: 7 additions & 3 deletions src/huggingface_hub/repocard.py
@@ -27,6 +27,7 @@

from .constants import REPOCARD_NAME
from .utils import EntryNotFoundError, validate_hf_hub_args
from .utils._deprecation import _deprecate_positional_args
from .utils.logging import get_logger


@@ -533,6 +534,7 @@ def metadata_save(local_path: Union[str, Path], data: Dict) -> None:
readme.close()


@_deprecate_positional_args(version="0.12")
def metadata_eval_result(
model_pretty_name: str,
task_pretty_name: str,
@@ -547,6 +549,7 @@ def metadata_eval_result(
dataset_config: Optional[str] = None,
dataset_split: Optional[str] = None,
dataset_revision: Optional[str] = None,
metrics_verification_token: Optional[str] = None,
) -> Dict:
"""
Creates a metadata dict with the result from a model evaluated on a dataset.
@@ -572,16 +575,16 @@
The name of the metric configuration used in `load_metric()`.
Example: bleurt-large-512 in `load_metric("bleurt", "bleurt-large-512")`.
metrics_verified (`bool`, *optional*, defaults to `False`):
If true, indicates that evaluation was generated by Hugging Face (vs. self-reported).
If a user tries to push self-reported metric results with verified=True, the push
will be rejected.
Indicates whether the metrics originate from Hugging Face's [evaluation service](https://huggingface.co/spaces/autoevaluate/model-evaluator) or not. Automatically computed by Hugging Face, do not set.
dataset_config (`str`, *optional*):
Example: fr. The name of the dataset configuration used in `load_dataset()`.
dataset_split (`str`, *optional*):
Example: test. The name of the dataset split used in `load_dataset()`.
dataset_revision (`str`, *optional*):
Example: 5503434ddd753f426f4b38109466949a1217c2bb. The name of the dataset revision
used in `load_dataset()`.
metrics_verification_token (`str`, *optional*):
A JSON Web Token that is used to verify whether the metrics originate from Hugging Face's [evaluation service](https://huggingface.co/spaces/autoevaluate/model-evaluator) or not.

Returns:
`dict`: a metadata dict with the result from a model evaluated on a dataset.
@@ -649,6 +652,7 @@ def metadata_eval_result(
dataset_type=dataset_id,
metric_config=metrics_config,
verified=metrics_verified,
verifyToken=metrics_verification_token,
dataset_config=dataset_config,
dataset_split=dataset_split,
dataset_revision=dataset_revision,
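For context, a minimal usage sketch of `metadata_eval_result` after this change. All values are made up for illustration, and with the new `_deprecate_positional_args` decorator, arguments should be passed as keywords:

```python
from huggingface_hub import metadata_eval_result

# Positional arguments are deprecated as of the
# @_deprecate_positional_args(version="0.12") decorator above,
# so everything is passed by keyword.
metadata = metadata_eval_result(
    model_pretty_name="RoBERTa fine-tuned on ReactionGIF",  # illustrative
    task_pretty_name="Text Classification",
    task_id="text-classification",
    metrics_pretty_name="Accuracy",
    metrics_id="accuracy",
    metrics_value=0.2662,
    dataset_pretty_name="ReactionGIF",
    dataset_id="julien-c/reactiongif",  # illustrative repo id
    dataset_config="default",
    dataset_split="test",
    # Both verification fields are set by Hugging Face's evaluation
    # service; a self-reported result should leave them unset.
    metrics_verified=False,
    metrics_verification_token=None,
)
```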
12 changes: 10 additions & 2 deletions src/huggingface_hub/repocard_data.py
@@ -51,7 +51,9 @@ class EvalResult:
metric_args (`Dict[str, Any]`, *optional*):
The arguments passed during `Metric.compute()`. Example for `bleu`: max_order: 4
verified (`bool`, *optional*):
If true, indicates that evaluation was generated by Hugging Face (vs. self-reported).
Indicates whether the metrics originate from Hugging Face's [evaluation service](https://huggingface.co/spaces/autoevaluate/model-evaluator) or not. Automatically computed by Hugging Face, do not set.
verifyToken (`str`, *optional*):
A JSON Web Token that is used to verify whether the metrics originate from Hugging Face's [evaluation service](https://huggingface.co/spaces/autoevaluate/model-evaluator) or not.
"""

# Required
@@ -113,9 +115,12 @@ class EvalResult:
# Example for `bleu`: max_order: 4
metric_args: Optional[Dict[str, Any]] = None

# If true, indicates that evaluation was generated by Hugging Face (vs. self-reported).
# Automatically computed, do not set. Dynamically overridden by Hugging Face in API calls to indicate if evaluation was verified by Hugging Face.
verified: Optional[bool] = None

# Generated by Hugging Face to verify the results are valid.
verifyToken: Optional[str] = None
Contributor:
@lewtun Sorry to ask for a last change, but would it be possible to use snake_case here? I think keeping consistency within the hfh parameters is more important than keeping consistency with the server naming.

This would require changing both the getter and the setter in

# in model_index_to_eval_results
verify_token = metric.get("verifyToken")
(...)
verify_token=verify_token,

and

# in eval_results_to_model_index
"verifyToken": result.verify_token,

Member Author:

Thanks, I agree snake_case is nicer :)

Fixed in f0d1cc4 and 6995d2a



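Putting the `EvalResult` change together, here is a minimal construction sketch at the revision shown in this diff, where the attribute is still the camelCase `verifyToken` (commits f0d1cc4 and 6995d2a later rename it to `verify_token`, per the review above). All values are illustrative:

```python
from huggingface_hub.repocard_data import EvalResult

result = EvalResult(
    task_type="text-classification",  # required
    dataset_type="imdb",              # required
    dataset_name="IMDb",              # required
    metric_type="accuracy",           # required
    metric_value=0.93,                # required
    # Provenance fields: populated by Hugging Face's evaluation service,
    # never set by hand. The token is a JWT the Hub can verify.
    verified=True,
    verifyToken="<jwt-issued-by-the-evaluation-service>",  # illustrative
)
```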
@dataclass
class CardData:
@@ -416,6 +421,7 @@ def model_index_to_eval_results(
metric_args = metric.get("args")
metric_config = metric.get("config")
verified = metric.get("verified")
verifyToken = metric.get("verifyToken")

eval_result = EvalResult(
task_type=task_type, # Required
@@ -432,6 +438,7 @@
metric_args=metric_args,
metric_config=metric_config,
verified=verified,
verifyToken=verifyToken,
)
eval_results.append(eval_result)
return name, eval_results
@@ -521,6 +528,7 @@ def eval_results_to_model_index(
"config": result.metric_config,
"args": result.metric_args,
"verified": result.verified,
"verifyToken": result.verifyToken,
}
for result in results
],
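Finally, a rough round-trip sketch under the same naming caveat, showing that the new `verifyToken` key flows through both converters at this revision (the model-index dict below is illustrative):

```python
from huggingface_hub.repocard_data import (
    eval_results_to_model_index,
    model_index_to_eval_results,
)

model_index = [
    {
        "name": "my-model",  # illustrative
        "results": [
            {
                "task": {"type": "text-classification"},
                "dataset": {"type": "imdb", "name": "IMDb"},
                "metrics": [
                    {
                        "type": "accuracy",
                        "value": 0.93,
                        "verified": True,
                        "verifyToken": "<jwt>",  # illustrative
                    }
                ],
            }
        ],
    }
]

# Parse model-index metadata into EvalResult objects...
name, eval_results = model_index_to_eval_results(model_index)

# ...and serialize back; the token survives the round trip.
index = eval_results_to_model_index(name, eval_results)
assert index[0]["results"][0]["metrics"][0]["verifyToken"] == "<jwt>"
```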