From c95110839cc09d58a200f53cf4b97a51607320a3 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Mon, 21 Nov 2022 08:39:00 +0100 Subject: [PATCH 1/9] Fix metadata_update for verified evaluations --- src/huggingface_hub/repocard.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/huggingface_hub/repocard.py b/src/huggingface_hub/repocard.py index f88c932a89..970cd58911 100644 --- a/src/huggingface_hub/repocard.py +++ b/src/huggingface_hub/repocard.py @@ -807,6 +807,11 @@ def metadata_update( ) result_found = True existing_result.metric_value = new_result.metric_value + # If metric is from Hugging Face's evaluation service + # (https://huggingface.co/spaces/autoevaluate/model-evaluator), + # we also need to update the verification token. + if new_result.verified is True: + existing_result.verify_token = new_result.verify_token if not result_found: card.data.eval_results.append(new_result) else: From 23467e630964ea827c189d65c909dee07c4a9e33 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Mon, 21 Nov 2022 11:07:02 +0100 Subject: [PATCH 2/9] Add dataset metadata to model index conversion --- src/huggingface_hub/repocard.py | 5 +---- src/huggingface_hub/repocard_data.py | 22 +++++++++++++++++----- 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/src/huggingface_hub/repocard.py b/src/huggingface_hub/repocard.py index 970cd58911..74c3debac1 100644 --- a/src/huggingface_hub/repocard.py +++ b/src/huggingface_hub/repocard.py @@ -807,10 +807,7 @@ def metadata_update( ) result_found = True existing_result.metric_value = new_result.metric_value - # If metric is from Hugging Face's evaluation service - # (https://huggingface.co/spaces/autoevaluate/model-evaluator), - # we also need to update the verification token. - if new_result.verified is True: + if existing_result.verified is True: existing_result.verify_token = new_result.verify_token if not result_found: card.data.eval_results.append(new_result) diff --git a/src/huggingface_hub/repocard_data.py b/src/huggingface_hub/repocard_data.py index 843d609278..c4a33b67b4 100644 --- a/src/huggingface_hub/repocard_data.py +++ b/src/huggingface_hub/repocard_data.py @@ -129,7 +129,9 @@ def is_equal_except_value(self, other: "EvalResult") -> bool: for key, _ in self.__dict__.items(): if key == "metric_value": continue - if getattr(self, key) != getattr(other, key): + # For metrics computed by Hugging Face's evaluation service, `verify_token` is derived from `metric_value`, + # so we exclude it here in the comparison. + if key != "verify_token" and getattr(self, key) != getattr(other, key): return False return True @@ -514,12 +516,22 @@ def eval_results_to_model_index( # Here, we make a map of those pairs and the associated EvalResults. task_and_ds_types_map = defaultdict(list) for eval_result in eval_results: - task_and_ds_pair = (eval_result.task_type, eval_result.dataset_type) + task_and_ds_pair = ( + eval_result.task_type, + eval_result.dataset_type, + eval_result.dataset_config, + eval_result.dataset_split, + ) task_and_ds_types_map[task_and_ds_pair].append(eval_result) # Use the map from above to generate the model index data. model_index_data = [] - for (task_type, dataset_type), results in task_and_ds_types_map.items(): + for ( + task_type, + dataset_type, + dataset_config, + dataset_split, + ), results in task_and_ds_types_map.items(): data = { "task": { "type": task_type, @@ -528,8 +540,8 @@ def eval_results_to_model_index( "dataset": { "name": results[0].dataset_name, "type": dataset_type, - "config": results[0].dataset_config, - "split": results[0].dataset_split, + "config": dataset_config, + "split": dataset_split, "revision": results[0].dataset_revision, "args": results[0].dataset_args, }, From 49b5d36d71e95af1f30341a1c88636470c076a5e Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Sun, 27 Nov 2022 15:35:51 +0100 Subject: [PATCH 3/9] Add regression test for #1208 --- .../cards/sample_simple_model_index.md | 19 ++++++++++++++++--- tests/test_repocard.py | 10 ++++++++++ 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/tests/fixtures/cards/sample_simple_model_index.md b/tests/fixtures/cards/sample_simple_model_index.md index 15d26c7069..c0e6ff9bdb 100644 --- a/tests/fixtures/cards/sample_simple_model_index.md +++ b/tests/fixtures/cards/sample_simple_model_index.md @@ -8,7 +8,7 @@ tags: datasets: - beans metrics: -- acc +- accuracy model-index: - name: my-cool-model results: @@ -18,12 +18,25 @@ model-index: type: beans name: Beans metrics: - - type: acc + - type: accuracy value: 0.9 + - task: + type: image-classification + dataset: + type: beans + name: Beans + config: default + split: test + revision: 5503434ddd753f426f4b38109466949a1217c2bb + args: + date: 20220120 + metrics: + - type: f1 + value: 0.66 --- # my-cool-model ## Model description -You can embed local or remote images using `![](...)` +This is a test model card with multiple evaluations across different (dataset, metric) configurations. diff --git a/tests/test_repocard.py b/tests/test_repocard.py index f035226811..d90a9a6e8e 100644 --- a/tests/test_repocard.py +++ b/tests/test_repocard.py @@ -766,6 +766,16 @@ def test_model_card_with_invalid_model_index(self): ) self.assertIsNone(card.data.eval_results) + def test_model_card_with_model_index(self): + """Test that loading a model card with multiple evaluations is consistent with `metadata_load`. + + Regression test for https://github.com/huggingface/huggingface_hub/issues/1208 + """ + sample_path = SAMPLE_CARDS_DIR / "sample_simple_model_index.md" + card = ModelCard.load(sample_path) + metadata = metadata_load(sample_path) + self.assertDictEqual(card.data.to_dict(), metadata) + def test_load_model_card_from_file(self): sample_path = SAMPLE_CARDS_DIR / "sample_simple.md" card = ModelCard.load(sample_path) From cfaee4ead4414b8397fff5a14dacc1a2ff02f133 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Sun, 27 Nov 2022 17:47:24 +0100 Subject: [PATCH 4/9] Add regression test for #1214 --- tests/test_repocard.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/tests/test_repocard.py b/tests/test_repocard.py index d90a9a6e8e..6575332339 100644 --- a/tests/test_repocard.py +++ b/tests/test_repocard.py @@ -127,7 +127,7 @@ value: 0.2662102282047272 name: Accuracy config: default - verified: false + verified: true --- """ @@ -322,6 +322,20 @@ def test_update_existing_result_with_overwrite(self): updated_metadata = metadata_load(self.repo_path / self.REPO_NAME / "README.md") self.assertDictEqual(updated_metadata, new_metadata) + def test_update_verification(self): + """Tests whether updating the verification fields updates in-place. + + Regression test for https://github.com/huggingface/huggingface_hub/issues/1210 + """ + new_metadata = copy.deepcopy(self.existing_metadata) + new_metadata["model-index"][0]["results"][0]["metrics"][0][ + "verifyToken" + ] = "1234" + metadata_update(self.repo_id, new_metadata, token=self._token, overwrite=True) + self.repo.git_pull() + updated_metadata = metadata_load(self.repo_path / self.REPO_NAME / "README.md") + self.assertDictEqual(updated_metadata, new_metadata) + def test_metadata_update_upstream(self): new_metadata = copy.deepcopy(self.existing_metadata) new_metadata["model-index"][0]["results"][0]["metrics"][0]["value"] = 0.1 From 8e4eae2792c9107ba2ffd7486cda9729906e24c1 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Sun, 27 Nov 2022 18:03:55 +0100 Subject: [PATCH 5/9] Use unique ID --- src/huggingface_hub/repocard_data.py | 39 ++++++++++++++-------------- tests/test_repocard.py | 8 +++--- 2 files changed, 24 insertions(+), 23 deletions(-) diff --git a/src/huggingface_hub/repocard_data.py b/src/huggingface_hub/repocard_data.py index c4a33b67b4..b71460507e 100644 --- a/src/huggingface_hub/repocard_data.py +++ b/src/huggingface_hub/repocard_data.py @@ -121,6 +121,18 @@ class EvalResult: # A JSON Web Token that is used to verify whether the metrics originate from Hugging Face's [evaluation service](https://huggingface.co/spaces/autoevaluate/model-evaluator) or not. verify_token: Optional[str] = None + @property + def unique_identifier(self) -> Tuple: + """Returns a tuple that uniquely identifies this evaluation.""" + return ( + self.task_type, + self.dataset_type, + self.dataset_config, + self.dataset_split, + self.dataset_revision, + self.dataset_args, + ) + def is_equal_except_value(self, other: "EvalResult") -> bool: """ Return True if `self` and `other` describe exactly the same metric but with a @@ -516,34 +528,23 @@ def eval_results_to_model_index( # Here, we make a map of those pairs and the associated EvalResults. task_and_ds_types_map = defaultdict(list) for eval_result in eval_results: - task_and_ds_pair = ( - eval_result.task_type, - eval_result.dataset_type, - eval_result.dataset_config, - eval_result.dataset_split, - ) - task_and_ds_types_map[task_and_ds_pair].append(eval_result) + task_and_ds_types_map[eval_result.unique_identifier].append(eval_result) # Use the map from above to generate the model index data. model_index_data = [] - for ( - task_type, - dataset_type, - dataset_config, - dataset_split, - ), results in task_and_ds_types_map.items(): + for eval_result_identifier, results in task_and_ds_types_map.items(): data = { "task": { - "type": task_type, + "type": eval_result_identifier[0], "name": results[0].task_name, }, "dataset": { "name": results[0].dataset_name, - "type": dataset_type, - "config": dataset_config, - "split": dataset_split, - "revision": results[0].dataset_revision, - "args": results[0].dataset_args, + "type": eval_result_identifier[1], + "config": eval_result_identifier[2], + "split": eval_result_identifier[3], + "revision": eval_result_identifier[4], + "args": eval_result_identifier[5], }, "metrics": [ { diff --git a/tests/test_repocard.py b/tests/test_repocard.py index 6575332339..4f3a14c27f 100644 --- a/tests/test_repocard.py +++ b/tests/test_repocard.py @@ -248,7 +248,7 @@ def test_metadata_eval_result(self): metrics_id="accuracy", metrics_value=0.2662102282047272, metrics_config="default", - metrics_verified=False, + metrics_verified=True, dataset_pretty_name="ReactionGIF", dataset_id="julien-c/reactiongif", dataset_config="default", @@ -322,8 +322,8 @@ def test_update_existing_result_with_overwrite(self): updated_metadata = metadata_load(self.repo_path / self.REPO_NAME / "README.md") self.assertDictEqual(updated_metadata, new_metadata) - def test_update_verification(self): - """Tests whether updating the verification fields updates in-place. + def test_update_verify_token(self): + """Tests whether updating the verification token updates in-place. Regression test for https://github.com/huggingface/huggingface_hub/issues/1210 """ @@ -848,7 +848,7 @@ def test_model_card_from_template_eval_results(self): metric_value=0.2662102282047272, metric_name="Accuracy", metric_config="default", - verified=False, + verified=True, ), ], model_name="RoBERTa fine-tuned on ReactionGIF", From 2c00a83ddbc6dcc7794893275af3f0de7b6e3f1e Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Sun, 27 Nov 2022 18:09:36 +0100 Subject: [PATCH 6/9] Fix typing --- src/huggingface_hub/repocard_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/huggingface_hub/repocard_data.py b/src/huggingface_hub/repocard_data.py index b71460507e..e126f65c31 100644 --- a/src/huggingface_hub/repocard_data.py +++ b/src/huggingface_hub/repocard_data.py @@ -122,7 +122,7 @@ class EvalResult: verify_token: Optional[str] = None @property - def unique_identifier(self) -> Tuple: + def unique_identifier(self) -> tuple: """Returns a tuple that uniquely identifies this evaluation.""" return ( self.task_type, From da6ae02f6efdd775803921d49cb42319f504252f Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Mon, 28 Nov 2022 12:34:35 +0100 Subject: [PATCH 7/9] Fix hashing --- src/huggingface_hub/repocard_data.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/huggingface_hub/repocard_data.py b/src/huggingface_hub/repocard_data.py index e126f65c31..84cc34d611 100644 --- a/src/huggingface_hub/repocard_data.py +++ b/src/huggingface_hub/repocard_data.py @@ -130,7 +130,6 @@ def unique_identifier(self) -> tuple: self.dataset_config, self.dataset_split, self.dataset_revision, - self.dataset_args, ) def is_equal_except_value(self, other: "EvalResult") -> bool: @@ -544,7 +543,7 @@ def eval_results_to_model_index( "config": eval_result_identifier[2], "split": eval_result_identifier[3], "revision": eval_result_identifier[4], - "args": eval_result_identifier[5], + "args": results[0].dataset_args, }, "metrics": [ { From b2bb6d6cc3391977bda02e17073166791a418db2 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Mon, 28 Nov 2022 21:46:00 +0100 Subject: [PATCH 8/9] Refactor --- src/huggingface_hub/repocard_data.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/src/huggingface_hub/repocard_data.py b/src/huggingface_hub/repocard_data.py index 84cc34d611..532053c9ce 100644 --- a/src/huggingface_hub/repocard_data.py +++ b/src/huggingface_hub/repocard_data.py @@ -531,18 +531,20 @@ def eval_results_to_model_index( # Use the map from above to generate the model index data. model_index_data = [] - for eval_result_identifier, results in task_and_ds_types_map.items(): + for results in task_and_ds_types_map.values(): + # All items from `results` share same metadata + sample_result = results[0] data = { "task": { - "type": eval_result_identifier[0], - "name": results[0].task_name, + "type": sample_result.task_type, + "name": sample_result.task_name, }, "dataset": { - "name": results[0].dataset_name, - "type": eval_result_identifier[1], - "config": eval_result_identifier[2], - "split": eval_result_identifier[3], - "revision": eval_result_identifier[4], + "name": sample_result.dataset_name, + "type": sample_result.dataset_type, + "config": sample_result.dataset_config, + "split": sample_result.dataset_split, + "revision": sample_result.dataset_revision, "args": results[0].dataset_args, }, "metrics": [ From ad02d4556d035ba6a6b374fcb5a65df4c9ac8efe Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Tue, 29 Nov 2022 06:14:01 +0100 Subject: [PATCH 9/9] Fix args --- src/huggingface_hub/repocard_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/huggingface_hub/repocard_data.py b/src/huggingface_hub/repocard_data.py index 532053c9ce..bc70766db6 100644 --- a/src/huggingface_hub/repocard_data.py +++ b/src/huggingface_hub/repocard_data.py @@ -545,7 +545,7 @@ def eval_results_to_model_index( "config": sample_result.dataset_config, "split": sample_result.dataset_split, "revision": sample_result.dataset_revision, - "args": results[0].dataset_args, + "args": sample_result.dataset_args, }, "metrics": [ {