From c95110839cc09d58a200f53cf4b97a51607320a3 Mon Sep 17 00:00:00 2001
From: Lewis Tunstall <lewis.c.tunstall@gmail.com>
Date: Mon, 21 Nov 2022 08:39:00 +0100
Subject: [PATCH 1/9] Fix metadata_update for verified evaluations

---
 src/huggingface_hub/repocard.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/huggingface_hub/repocard.py b/src/huggingface_hub/repocard.py
index f88c932a89..970cd58911 100644
--- a/src/huggingface_hub/repocard.py
+++ b/src/huggingface_hub/repocard.py
@@ -807,6 +807,11 @@ def metadata_update(
                                 )
                             result_found = True
                             existing_result.metric_value = new_result.metric_value
+                            # If metric is from Hugging Face's evaluation service
+                            # (https://huggingface.co/spaces/autoevaluate/model-evaluator),
+                            # we also need to update the verification token.
+                            if new_result.verified is True:
+                                existing_result.verify_token = new_result.verify_token
                     if not result_found:
                         card.data.eval_results.append(new_result)
         else:

From 23467e630964ea827c189d65c909dee07c4a9e33 Mon Sep 17 00:00:00 2001
From: Lewis Tunstall <lewis.c.tunstall@gmail.com>
Date: Mon, 21 Nov 2022 11:07:02 +0100
Subject: [PATCH 2/9] Add dataset metadata to model index conversion

---
 src/huggingface_hub/repocard.py      |  5 +----
 src/huggingface_hub/repocard_data.py | 22 +++++++++++++++++-----
 2 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/src/huggingface_hub/repocard.py b/src/huggingface_hub/repocard.py
index 970cd58911..74c3debac1 100644
--- a/src/huggingface_hub/repocard.py
+++ b/src/huggingface_hub/repocard.py
@@ -807,10 +807,7 @@ def metadata_update(
                                 )
                             result_found = True
                             existing_result.metric_value = new_result.metric_value
-                            # If metric is from Hugging Face's evaluation service
-                            # (https://huggingface.co/spaces/autoevaluate/model-evaluator),
-                            # we also need to update the verification token.
-                            if new_result.verified is True:
+                            if existing_result.verified is True:
                                 existing_result.verify_token = new_result.verify_token
                     if not result_found:
                         card.data.eval_results.append(new_result)
diff --git a/src/huggingface_hub/repocard_data.py b/src/huggingface_hub/repocard_data.py
index 843d609278..c4a33b67b4 100644
--- a/src/huggingface_hub/repocard_data.py
+++ b/src/huggingface_hub/repocard_data.py
@@ -129,7 +129,9 @@ def is_equal_except_value(self, other: "EvalResult") -> bool:
         for key, _ in self.__dict__.items():
             if key == "metric_value":
                 continue
-            if getattr(self, key) != getattr(other, key):
+            # For metrics computed by Hugging Face's evaluation service, `verify_token` is derived from `metric_value`,
+            # so we exclude it here in the comparison.
+            if key != "verify_token" and getattr(self, key) != getattr(other, key):
                 return False
         return True
 
@@ -514,12 +516,22 @@ def eval_results_to_model_index(
     # Here, we make a map of those pairs and the associated EvalResults.
     task_and_ds_types_map = defaultdict(list)
     for eval_result in eval_results:
-        task_and_ds_pair = (eval_result.task_type, eval_result.dataset_type)
+        task_and_ds_pair = (
+            eval_result.task_type,
+            eval_result.dataset_type,
+            eval_result.dataset_config,
+            eval_result.dataset_split,
+        )
         task_and_ds_types_map[task_and_ds_pair].append(eval_result)
 
     # Use the map from above to generate the model index data.
     model_index_data = []
-    for (task_type, dataset_type), results in task_and_ds_types_map.items():
+    for (
+        task_type,
+        dataset_type,
+        dataset_config,
+        dataset_split,
+    ), results in task_and_ds_types_map.items():
         data = {
             "task": {
                 "type": task_type,
@@ -528,8 +540,8 @@ def eval_results_to_model_index(
             "dataset": {
                 "name": results[0].dataset_name,
                 "type": dataset_type,
-                "config": results[0].dataset_config,
-                "split": results[0].dataset_split,
+                "config": dataset_config,
+                "split": dataset_split,
                 "revision": results[0].dataset_revision,
                 "args": results[0].dataset_args,
             },

From 49b5d36d71e95af1f30341a1c88636470c076a5e Mon Sep 17 00:00:00 2001
From: Lewis Tunstall <lewis.c.tunstall@gmail.com>
Date: Sun, 27 Nov 2022 15:35:51 +0100
Subject: [PATCH 3/9] Add regression test for #1208

---
 .../cards/sample_simple_model_index.md        | 19 ++++++++++++++++---
 tests/test_repocard.py                        | 10 ++++++++++
 2 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/tests/fixtures/cards/sample_simple_model_index.md b/tests/fixtures/cards/sample_simple_model_index.md
index 15d26c7069..c0e6ff9bdb 100644
--- a/tests/fixtures/cards/sample_simple_model_index.md
+++ b/tests/fixtures/cards/sample_simple_model_index.md
@@ -8,7 +8,7 @@ tags:
 datasets:
 - beans
 metrics:
-- acc
+- accuracy
 model-index:
 - name: my-cool-model
   results:
@@ -18,12 +18,25 @@ model-index:
       type: beans
       name: Beans
     metrics:
-    - type: acc
+    - type: accuracy
       value: 0.9
+  - task:
+      type: image-classification
+    dataset:
+      type: beans
+      name: Beans
+      config: default
+      split: test
+      revision: 5503434ddd753f426f4b38109466949a1217c2bb
+      args:
+        date: 20220120
+    metrics:
+    - type: f1
+      value: 0.66
 ---
 
 # my-cool-model
 
 ## Model description
 
-You can embed local or remote images using `![](...)`
+This is a test model card with multiple evaluations across different (dataset, metric) configurations.
diff --git a/tests/test_repocard.py b/tests/test_repocard.py
index f035226811..d90a9a6e8e 100644
--- a/tests/test_repocard.py
+++ b/tests/test_repocard.py
@@ -766,6 +766,16 @@ def test_model_card_with_invalid_model_index(self):
         )
         self.assertIsNone(card.data.eval_results)
 
+    def test_model_card_with_model_index(self):
+        """Test that loading a model card with multiple evaluations is consistent with `metadata_load`.
+
+        Regression test for https://github.com/huggingface/huggingface_hub/issues/1208
+        """
+        sample_path = SAMPLE_CARDS_DIR / "sample_simple_model_index.md"
+        card = ModelCard.load(sample_path)
+        metadata = metadata_load(sample_path)
+        self.assertDictEqual(card.data.to_dict(), metadata)
+
     def test_load_model_card_from_file(self):
         sample_path = SAMPLE_CARDS_DIR / "sample_simple.md"
         card = ModelCard.load(sample_path)

From cfaee4ead4414b8397fff5a14dacc1a2ff02f133 Mon Sep 17 00:00:00 2001
From: Lewis Tunstall <lewis.c.tunstall@gmail.com>
Date: Sun, 27 Nov 2022 17:47:24 +0100
Subject: [PATCH 4/9] Add regression test for #1214

---
 tests/test_repocard.py | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/tests/test_repocard.py b/tests/test_repocard.py
index d90a9a6e8e..6575332339 100644
--- a/tests/test_repocard.py
+++ b/tests/test_repocard.py
@@ -127,7 +127,7 @@
       value: 0.2662102282047272
       name: Accuracy
       config: default
-      verified: false
+      verified: true
 ---
 """
 
@@ -322,6 +322,20 @@ def test_update_existing_result_with_overwrite(self):
         updated_metadata = metadata_load(self.repo_path / self.REPO_NAME / "README.md")
         self.assertDictEqual(updated_metadata, new_metadata)
 
+    def test_update_verification(self):
+        """Tests whether updating the verification fields updates in-place.
+
+        Regression test for https://github.com/huggingface/huggingface_hub/issues/1210
+        """
+        new_metadata = copy.deepcopy(self.existing_metadata)
+        new_metadata["model-index"][0]["results"][0]["metrics"][0][
+            "verifyToken"
+        ] = "1234"
+        metadata_update(self.repo_id, new_metadata, token=self._token, overwrite=True)
+        self.repo.git_pull()
+        updated_metadata = metadata_load(self.repo_path / self.REPO_NAME / "README.md")
+        self.assertDictEqual(updated_metadata, new_metadata)
+
     def test_metadata_update_upstream(self):
         new_metadata = copy.deepcopy(self.existing_metadata)
         new_metadata["model-index"][0]["results"][0]["metrics"][0]["value"] = 0.1

From 8e4eae2792c9107ba2ffd7486cda9729906e24c1 Mon Sep 17 00:00:00 2001
From: Lewis Tunstall <lewis.c.tunstall@gmail.com>
Date: Sun, 27 Nov 2022 18:03:55 +0100
Subject: [PATCH 5/9] Use unique ID

---
 src/huggingface_hub/repocard_data.py | 39 ++++++++++++++--------------
 tests/test_repocard.py               |  8 +++---
 2 files changed, 24 insertions(+), 23 deletions(-)

diff --git a/src/huggingface_hub/repocard_data.py b/src/huggingface_hub/repocard_data.py
index c4a33b67b4..b71460507e 100644
--- a/src/huggingface_hub/repocard_data.py
+++ b/src/huggingface_hub/repocard_data.py
@@ -121,6 +121,18 @@ class EvalResult:
     # A JSON Web Token that is used to verify whether the metrics originate from Hugging Face's [evaluation service](https://huggingface.co/spaces/autoevaluate/model-evaluator) or not.
     verify_token: Optional[str] = None
 
+    @property
+    def unique_identifier(self) -> Tuple:
+        """Returns a tuple that uniquely identifies this evaluation."""
+        return (
+            self.task_type,
+            self.dataset_type,
+            self.dataset_config,
+            self.dataset_split,
+            self.dataset_revision,
+            self.dataset_args,
+        )
+
     def is_equal_except_value(self, other: "EvalResult") -> bool:
         """
         Return True if `self` and `other` describe exactly the same metric but with a
@@ -516,34 +528,23 @@ def eval_results_to_model_index(
     # Here, we make a map of those pairs and the associated EvalResults.
     task_and_ds_types_map = defaultdict(list)
     for eval_result in eval_results:
-        task_and_ds_pair = (
-            eval_result.task_type,
-            eval_result.dataset_type,
-            eval_result.dataset_config,
-            eval_result.dataset_split,
-        )
-        task_and_ds_types_map[task_and_ds_pair].append(eval_result)
+        task_and_ds_types_map[eval_result.unique_identifier].append(eval_result)
 
     # Use the map from above to generate the model index data.
     model_index_data = []
-    for (
-        task_type,
-        dataset_type,
-        dataset_config,
-        dataset_split,
-    ), results in task_and_ds_types_map.items():
+    for eval_result_identifier, results in task_and_ds_types_map.items():
         data = {
             "task": {
-                "type": task_type,
+                "type": eval_result_identifier[0],
                 "name": results[0].task_name,
             },
             "dataset": {
                 "name": results[0].dataset_name,
-                "type": dataset_type,
-                "config": dataset_config,
-                "split": dataset_split,
-                "revision": results[0].dataset_revision,
-                "args": results[0].dataset_args,
+                "type": eval_result_identifier[1],
+                "config": eval_result_identifier[2],
+                "split": eval_result_identifier[3],
+                "revision": eval_result_identifier[4],
+                "args": eval_result_identifier[5],
             },
             "metrics": [
                 {
diff --git a/tests/test_repocard.py b/tests/test_repocard.py
index 6575332339..4f3a14c27f 100644
--- a/tests/test_repocard.py
+++ b/tests/test_repocard.py
@@ -248,7 +248,7 @@ def test_metadata_eval_result(self):
             metrics_id="accuracy",
             metrics_value=0.2662102282047272,
             metrics_config="default",
-            metrics_verified=False,
+            metrics_verified=True,
             dataset_pretty_name="ReactionGIF",
             dataset_id="julien-c/reactiongif",
             dataset_config="default",
@@ -322,8 +322,8 @@ def test_update_existing_result_with_overwrite(self):
         updated_metadata = metadata_load(self.repo_path / self.REPO_NAME / "README.md")
         self.assertDictEqual(updated_metadata, new_metadata)
 
-    def test_update_verification(self):
-        """Tests whether updating the verification fields updates in-place.
+    def test_update_verify_token(self):
+        """Tests whether updating the verification token updates in-place.
 
         Regression test for https://github.com/huggingface/huggingface_hub/issues/1210
         """
@@ -848,7 +848,7 @@ def test_model_card_from_template_eval_results(self):
                         metric_value=0.2662102282047272,
                         metric_name="Accuracy",
                         metric_config="default",
-                        verified=False,
+                        verified=True,
                     ),
                 ],
                 model_name="RoBERTa fine-tuned on ReactionGIF",

From 2c00a83ddbc6dcc7794893275af3f0de7b6e3f1e Mon Sep 17 00:00:00 2001
From: Lewis Tunstall <lewis.c.tunstall@gmail.com>
Date: Sun, 27 Nov 2022 18:09:36 +0100
Subject: [PATCH 6/9] Fix typing

---
 src/huggingface_hub/repocard_data.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/huggingface_hub/repocard_data.py b/src/huggingface_hub/repocard_data.py
index b71460507e..e126f65c31 100644
--- a/src/huggingface_hub/repocard_data.py
+++ b/src/huggingface_hub/repocard_data.py
@@ -122,7 +122,7 @@ class EvalResult:
     verify_token: Optional[str] = None
 
     @property
-    def unique_identifier(self) -> Tuple:
+    def unique_identifier(self) -> tuple:
         """Returns a tuple that uniquely identifies this evaluation."""
         return (
             self.task_type,

From da6ae02f6efdd775803921d49cb42319f504252f Mon Sep 17 00:00:00 2001
From: Lewis Tunstall <lewis.c.tunstall@gmail.com>
Date: Mon, 28 Nov 2022 12:34:35 +0100
Subject: [PATCH 7/9] Fix hashing

---
 src/huggingface_hub/repocard_data.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/huggingface_hub/repocard_data.py b/src/huggingface_hub/repocard_data.py
index e126f65c31..84cc34d611 100644
--- a/src/huggingface_hub/repocard_data.py
+++ b/src/huggingface_hub/repocard_data.py
@@ -130,7 +130,6 @@ def unique_identifier(self) -> tuple:
             self.dataset_config,
             self.dataset_split,
             self.dataset_revision,
-            self.dataset_args,
         )
 
     def is_equal_except_value(self, other: "EvalResult") -> bool:
@@ -544,7 +543,7 @@ def eval_results_to_model_index(
                 "config": eval_result_identifier[2],
                 "split": eval_result_identifier[3],
                 "revision": eval_result_identifier[4],
-                "args": eval_result_identifier[5],
+                "args": results[0].dataset_args,
             },
             "metrics": [
                 {

From b2bb6d6cc3391977bda02e17073166791a418db2 Mon Sep 17 00:00:00 2001
From: Lewis Tunstall <lewis.c.tunstall@gmail.com>
Date: Mon, 28 Nov 2022 21:46:00 +0100
Subject: [PATCH 8/9] Refactor

---
 src/huggingface_hub/repocard_data.py | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/src/huggingface_hub/repocard_data.py b/src/huggingface_hub/repocard_data.py
index 84cc34d611..532053c9ce 100644
--- a/src/huggingface_hub/repocard_data.py
+++ b/src/huggingface_hub/repocard_data.py
@@ -531,18 +531,20 @@ def eval_results_to_model_index(
 
     # Use the map from above to generate the model index data.
     model_index_data = []
-    for eval_result_identifier, results in task_and_ds_types_map.items():
+    for results in task_and_ds_types_map.values():
+        # All items from `results` share same metadata
+        sample_result = results[0]
         data = {
             "task": {
-                "type": eval_result_identifier[0],
-                "name": results[0].task_name,
+                "type": sample_result.task_type,
+                "name": sample_result.task_name,
             },
             "dataset": {
-                "name": results[0].dataset_name,
-                "type": eval_result_identifier[1],
-                "config": eval_result_identifier[2],
-                "split": eval_result_identifier[3],
-                "revision": eval_result_identifier[4],
+                "name": sample_result.dataset_name,
+                "type": sample_result.dataset_type,
+                "config": sample_result.dataset_config,
+                "split": sample_result.dataset_split,
+                "revision": sample_result.dataset_revision,
                 "args": results[0].dataset_args,
             },
             "metrics": [

From ad02d4556d035ba6a6b374fcb5a65df4c9ac8efe Mon Sep 17 00:00:00 2001
From: Lewis Tunstall <lewis.c.tunstall@gmail.com>
Date: Tue, 29 Nov 2022 06:14:01 +0100
Subject: [PATCH 9/9] Fix args

---
 src/huggingface_hub/repocard_data.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/huggingface_hub/repocard_data.py b/src/huggingface_hub/repocard_data.py
index 532053c9ce..bc70766db6 100644
--- a/src/huggingface_hub/repocard_data.py
+++ b/src/huggingface_hub/repocard_data.py
@@ -545,7 +545,7 @@ def eval_results_to_model_index(
                 "config": sample_result.dataset_config,
                 "split": sample_result.dataset_split,
                 "revision": sample_result.dataset_revision,
-                "args": results[0].dataset_args,
+                "args": sample_result.dataset_args,
             },
             "metrics": [
                 {