From 78dd0f48010c5b61538ac5e96ee57dba00dd7850 Mon Sep 17 00:00:00 2001
From: jnanliu
Date: Mon, 23 Dec 2024 10:45:39 +0000
Subject: [PATCH 1/2] update implementation of G-Pass@k

---
 metrics/gpassk/README.md        |  90 ++++++++++++++++++
 metrics/gpassk/app.py           |   6 ++
 metrics/gpassk/gpassk.py        | 161 ++++++++++++++++++++++++++++++++
 metrics/gpassk/requirements.txt |   3 +
 4 files changed, 260 insertions(+)
 create mode 100644 metrics/gpassk/README.md
 create mode 100644 metrics/gpassk/app.py
 create mode 100644 metrics/gpassk/gpassk.py
 create mode 100644 metrics/gpassk/requirements.txt

diff --git a/metrics/gpassk/README.md b/metrics/gpassk/README.md
new file mode 100644
index 00000000..bdc4686f
--- /dev/null
+++ b/metrics/gpassk/README.md
@@ -0,0 +1,90 @@
+---
+title: G-Pass@k
+emoji: 🤗
+colorFrom: blue
+colorTo: red
+sdk: gradio
+sdk_version: 3.19.1
+app_file: app.py
+pinned: false
+tags:
+- evaluate
+- metric
+description: >-
+  G-Pass@$k$ is a generalization of the Pass@$k$ metric, which evaluates both the stability and potential of large language models (LLMs) in reasoning tasks, described in https://arxiv.org/abs/2412.13147.
+---
+
+# Metric Card for GPassK
+
+## Metric Description
+G-Pass@$k$ is a generalization of the Pass@$k$ metric, which evaluates both the stability and potential of large language models (LLMs) in reasoning tasks.
+
+Given a threshold $\tau$, the G-Pass@$k_{\tau}$ measures the probability that a model will pass at least $m = \lceil \tau \cdot k \rceil$ out of $k$ attempts,
+where $c$ is the number of correct solutions and $n$ is the total number of generations.
+
+$$
+    \text{G-Pass@}k_{\tau} = \left[ \sum_{j = \lceil \tau \cdot k \rceil}^{c} \frac{\binom{c}{j} \cdot \binom{n - c}{k - j}}{\binom{n}{k}} \right]
+$$
+
+mG-Pass@$k$ extends the concept of G-Pass@$k_{\tau}$ by integrating over all thresholds from 0.5 to 1.0,
+effectively calculating the area under the curve of G-Pass@$k_{\tau}$.
+This provides an overall measure of how well the LLM performs across different levels of stringency.
+
+$$
+    \text{mG-Pass@}k = 2\int_{0.5}^{1.0} \text{G-Pass@}k_{\tau} d \tau = \frac{2}{k} \sum_{i= \lceil 0.5 \cdot k \rceil + 1}^{k} \text{G-Pass@}k_{\frac{i}{k}}
+$$
+
+## How to Use
+
+### Inputs
+- **predictions** (List[List[str]]): list of generations to evaluate. Each prediction should be a list of string with several model-generated solutions.
+- **references** (List[str]): list of answer for each prediction.
+- **k** (List[int]): list of number of attempts to consider in evaluation (Default: [4, 8, 16]).
+- **thresholds** (List[float]): list of thresholds to consider in evaluation (Default: [0.25, 0.5, 0.75, 1.0]).
+- **check_correct_fn** (Callable): function to check if a prediction is correct. It should have two parameters: `pred` and `ref` and output a boolean
+
+### Output Values
+
+The G-Pass@k metric returns one dict:
+`g_pass_at_k`: dict with scores for each $k$ and threshold, and mG-Pass@$k$.
+
+These metrics can take on any value between 0 and 1, inclusive. Higher scores are better.
+
+#### Values from Popular Papers
+The [leaderboard](https://open-compass.github.io/GPassK/) contains performance of several open-source and closed-source LLMs on the mathematical task.
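+
+Since G-Pass@$k_{\tau}$ is the upper tail of a hypergeometric distribution, it can be computed directly with `scipy`, which is what this module does internally. The minimal sketch below is illustrative only (the helper name is ours, not part of the module's API); it reproduces the `G-Pass@4_0.5` value from the example in the next section, where 8 of the 16 sampled generations are correct:
+
+```python
+from math import ceil
+
+from scipy.stats import hypergeom
+
+
+def g_pass_at_k_tau(n: int, c: int, k: int, tau: float) -> float:
+    """P(at least ceil(tau * k) correct) when drawing k of the n
+    generations (c of which are correct) without replacement."""
+    m = max(ceil(tau * k), 1)
+    # sf(m - 1) is P(X >= m) for X ~ Hypergeom(population n, successes c, draws k).
+    return hypergeom.sf(m - 1, n, c, k)
+
+
+print(g_pass_at_k_tau(n=16, c=8, k=4, tau=0.5))  # 0.7153846..., i.e. G-Pass@4_0.5
+```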
+
+### Examples
+```python
+from evaluate import load
+g_pass_at_k_evaluator = evaluate.load("gpassk")
+predictions = [["a", "b", "a", "a", "b", "a", "b", "c", "a", "c", "b", "a", "a", "b", "a", "b"]]
+references = ["a"]
+check_correct_fn = lambda pred, ref: pred == ref
+g_pass_at_k = g_pass_at_k_evaluator.compute(predictions=predictions,
+                                            references=references, k=[4, 8], check_correct_fn=check_correct_fn)
+print(g_pass_at_k)
+{
+    'G-Pass@4_0.25': 0.9615384615384616, 'G-Pass@4_0.5': 0.7153846153846154,
+    'G-Pass@4_0.75': 0.2846153846153846, 'G-Pass@4_1.0': 0.038461538461538464,
+    'G-Pass@8_0.25': 0.9949494949494949, 'G-Pass@8_0.5': 0.6903651903651904,
+    'G-Pass@8_0.75': 0.06596736596736597, 'G-Pass@8_1.0': 7.77000777000777e-05,
+    'mG-Pass@4': 0.16153846153846152, 'mG-Pass@8': 0.09518259518259518
+}
+```
+
+## Citation
+```bibtex
+@misc{liu2024llmscapablestablereasoning,
+  title={Are Your LLMs Capable of Stable Reasoning?},
+  author={Junnan Liu and Hongwei Liu and Linchen Xiao and Ziyi Wang and Kuikun Liu and Songyang Gao and Wenwei Zhang and Songyang Zhang and Kai Chen},
+  year={2024},
+  eprint={2412.13147},
+  archivePrefix={arXiv},
+  primaryClass={cs.AI},
+  url={https://arxiv.org/abs/2412.13147},
+}
+```
+
+## Further References
+
+- [GPassK on GitHub](https://github.com/open-compass/GPassK/)
diff --git a/metrics/gpassk/app.py b/metrics/gpassk/app.py
new file mode 100644
index 00000000..ba08b4c3
--- /dev/null
+++ b/metrics/gpassk/app.py
@@ -0,0 +1,6 @@
+import evaluate
+from evaluate.utils import launch_gradio_widget
+
+
+module = evaluate.load("gpassk")
+launch_gradio_widget(module)
\ No newline at end of file
diff --git a/metrics/gpassk/gpassk.py b/metrics/gpassk/gpassk.py
new file mode 100644
index 00000000..1c437fa5
--- /dev/null
+++ b/metrics/gpassk/gpassk.py
@@ -0,0 +1,161 @@
+# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Implementation of the G-Pass@k metric described in https://arxiv.org/abs/2412.13147."""
+from typing import List, Callable
+from functools import partial
+import inspect
+
+import datasets
+import numpy as np
+from scipy.stats import hypergeom
+
+import evaluate
+
+
+_CITATION = """\
+@misc{liu2024llmscapablestablereasoning,
+  title={Are Your LLMs Capable of Stable Reasoning?},
+  author={Junnan Liu and Hongwei Liu and Linchen Xiao and Ziyi Wang and Kuikun Liu and Songyang Gao and Wenwei Zhang and Songyang Zhang and Kai Chen},
+  year={2024},
+  eprint={2412.13147},
+  archivePrefix={arXiv},
+  primaryClass={cs.AI},
+  url={https://arxiv.org/abs/2412.13147},
+}
+"""
+
+
+_DESCRIPTION = """\
+G-Pass@:math:`k` is a generalization of the Pass@:math:`k` metric, which evaluates both the stability and potential of large language models (LLMs) in reasoning tasks.
+
+Given a threshold :math:`\tau`, the G-Pass@:math:`k_{\tau}` measures the probability that a model will pass at least :math:`m = \lceil \tau \cdot k \rceil` out of :math:`k` attempts,
+where :math:`c` is the number of correct solutions and :math:`n` is the total number of generations.
+
+.. math::
+    \text{G-Pass@}k_{\tau} = \left[ \sum_{j = \lceil \tau \cdot k \rceil}^{c} \frac{\binom{c}{j} \cdot \binom{n - c}{k - j}}{\binom{n}{k}} \right]
+
+mG-Pass@:math:`k` extends the concept of G-Pass@:math:`k_{\tau}` by integrating over all thresholds from 0.5 to 1.0,
+effectively calculating the area under the curve of G-Pass@:math:`k_{\tau}`.
+This provides an overall measure of how well the LLM performs across different levels of stringency.
+
+.. math::
+    \text{mG-Pass@}k = 2\int_{0.5}^{1.0} \text{G-Pass@}k_{\tau} d \tau = \frac{2}{k} \sum_{i= \lceil 0.5 \cdot k \rceil + 1}^{k} \text{G-Pass@}k_{\frac{i}{k}}
+
+"""
+
+
+_KWARGS_DESCRIPTION = """
+Computes G-Pass@k and mG-Pass@k scores for model-generated predictions against references.
+Args:
+    predictions: list of generations to evaluate. Each prediction should be a
+        list of strings with several model-generated solutions.
+    references: list of answers, one for each prediction.
+    k: list of number of attempts to consider in evaluation (Default: [4, 8, 16]).
+    thresholds: list of thresholds to consider in evaluation (Default: [0.25, 0.5, 0.75, 1.0]).
+    check_correct_fn: function to check if a prediction is correct.
+        It should take two parameters, `pred` and `ref`, and return a boolean.
+Returns:
+    g_pass_at_k: dict with scores for each k and threshold, and mG-Pass@k.
+Examples:
+    >>> g_pass_at_k_evaluator = evaluate.load("gpassk")
+    >>> predictions = [["a", "b", "a", "a", "b", "a", "b", "c", "a", "c", "b", "a", "a", "b", "a", "b"]]
+    >>> references = ["a"]
+    >>> check_correct_fn = lambda pred, ref: pred == ref
+    >>> g_pass_at_k = g_pass_at_k_evaluator.compute(predictions=predictions,
+    ...     references=references, k=[4, 8], check_correct_fn=check_correct_fn)
+    >>> print(g_pass_at_k)
+    {'G-Pass@4_0.25': 0.9615384615384616, 'G-Pass@4_0.5': 0.7153846153846154,
+    'G-Pass@4_0.75': 0.2846153846153846, 'G-Pass@4_1.0': 0.038461538461538464,
+    'G-Pass@8_0.25': 0.9949494949494949, 'G-Pass@8_0.5': 0.6903651903651904,
+    'G-Pass@8_0.75': 0.06596736596736597, 'G-Pass@8_1.0': 7.77000777000777e-05,
+    'mG-Pass@4': 0.16153846153846152, 'mG-Pass@8': 0.09518259518259518}
+"""
+
+
+@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
+class GPassK(evaluate.Metric):
+
+    def _info(self):
+        return evaluate.MetricInfo(
+            module_type="metric",
+            description=_DESCRIPTION,
+            citation=_CITATION,
+            inputs_description=_KWARGS_DESCRIPTION,
+            features=datasets.Features({
+                'predictions': datasets.Value('int64'),
+                'references': datasets.Value('int64'),
+            }),
+            homepage="https://open-compass.github.io/GPassK/",
+            codebase_urls=["https://github.com/open-compass/GPassK"],
+            reference_urls=["http://arxiv.org/abs/2412.13147"]
+        )
+
+    def _compute(self,
+                 predictions: List[List[str]],
+                 references: List[str],
+                 k=[4, 8, 16],
+                 thresholds=[0.25, 0.5, 0.75, 1.0],
+                 check_correct_fn: Callable = None):
+        """Compute GPassK metric."""
+
+        if check_correct_fn is None:
+            raise ValueError('`check_correct_fn` is required for GPassK metric')
+
+        sig = inspect.signature(check_correct_fn)
+        if len(sig.parameters) != 2:
+            raise ValueError(f'`check_correct_fn` should have exactly 2 parameters, got {len(sig.parameters)}')
+        for name in sig.parameters:
+            if name not in ['pred', 'ref']:
+                raise ValueError(f'`check_correct_fn` should have only `pred` and `ref` as parameters, got {name}')
+
+        n_list, c_list = [], []
+        for preds, ref in zip(predictions, references):
+            labels = list(map(partial(check_correct_fn, ref=ref), preds))
+            n = len(preds)
+            c = sum(labels)
+            n_list.append(n)
+            c_list.append(c)
+
+        g_pass_at_k = {
+            f"G-Pass@{k_i}_{t}": np.mean([compute_g_pass_at_k(n, c, k_i, t) for n, c in zip(n_list, c_list)]).item()
+            for k_i in k
+            for t in thresholds
+        }
+        g_pass_at_k.update({
+            f"mG-Pass@{k_i}": np.mean([compute_mg_pass_at_k(n, c, k_i) for n, c in zip(n_list, c_list)]).item()
+            for k_i in k
+        })
+        return g_pass_at_k
+
+
+def _compute_g_pass_at_k(n, c, k, m):
+    # Probability of drawing at least m correct solutions when sampling k of the
+    # n generations (c of which are correct) without replacement: the survival
+    # function of a hypergeometric distribution evaluated at m - 1.
+    if m > min(c, k) or k > n or c < 0 or n <= 0 or m < 0:
+        return 0.0
+    return hypergeom.sf(m - 1, n, c, k)
+
+
+def compute_g_pass_at_k(n, c, k, t):
+    # At least a fraction t of the k attempts must be correct (minimum one).
+    m = max(int(np.ceil(k * t)), 1)
+    return _compute_g_pass_at_k(n, c, k, m)
+
+
+def compute_mg_pass_at_k(n, c, k):
+    # Discretized area under the G-Pass@k_tau curve for tau in (0.5, 1.0].
+    l, r = int(np.ceil(k * 0.5)), k
+
+    mg_pass_at_k = 0.0
+    for i in range(l + 1, r + 1):
+        mg_pass_at_k += _compute_g_pass_at_k(n, c, k, i)
+    mg_pass_at_k = 2 * mg_pass_at_k / k
+
+    return mg_pass_at_k
diff --git a/metrics/gpassk/requirements.txt b/metrics/gpassk/requirements.txt
new file mode 100644
index 00000000..52df037e
--- /dev/null
+++ b/metrics/gpassk/requirements.txt
@@ -0,0 +1,3 @@
+git+https://github.com/huggingface/evaluate@main
+scipy==1.14.1
+numpy==2.2.1
\ No newline at end of file

From 58e82e08872842d295b19cbff97d5833197a7624 Mon Sep 17 00:00:00 2001
From: jnanliu
Date: Mon, 23 Dec 2024 12:04:33 +0000
Subject: [PATCH 2/2] fix readme & metric input features

---
 metrics/gpassk/README.md | 70 ++++++++++++++++++++++++++--------------
 metrics/gpassk/gpassk.py |  6 ++--
 2 files changed, 49 insertions(+), 27 deletions(-)

diff --git a/metrics/gpassk/README.md b/metrics/gpassk/README.md
index bdc4686f..0f95e013 100644
--- a/metrics/gpassk/README.md
+++ b/metrics/gpassk/README.md
@@ -17,58 +17,78 @@
 # Metric Card for GPassK
 
 ## Metric Description
-G-Pass@$k$ is a generalization of the Pass@$k$ metric, which evaluates both the stability and potential of large language models (LLMs) in reasoning tasks.
+G-Pass@ $k$ is a generalization of the Pass@$k$ metric, which evaluates both the stability and potential of large language models (LLMs) in reasoning tasks.
 
-Given a threshold $\tau$, the G-Pass@$k_{\tau}$ measures the probability that a model will pass at least $m = \lceil \tau \cdot k \rceil$ out of $k$ attempts,
-where $c$ is the number of correct solutions and $n$ is the total number of generations.
+- **G-Pass@k**: Evaluates the probability that an LLM will correctly solve at least $ m = \lceil \tau \cdot k \rceil $ out of $ k $ attempts, where:
+  - $ \tau $: The threshold indicating the minimum proportion of correct solutions.
+  - $ c $: The number of correct solutions generated by the model.
+  - $ n $: The total number of generations or trials.
 
 $$
-    \text{G-Pass@}k_{\tau} = \left[ \sum_{j = \lceil \tau \cdot k \rceil}^{c} \frac{\binom{c}{j} \cdot \binom{n - c}{k - j}}{\binom{n}{k}} \right]
+\text{G-Pass@}k_{\tau} = \left[ \sum_{j = \lceil \tau \cdot k \rceil}^{c} \frac{\binom{c}{j} \cdot \binom{n - c}{k - j}}{\binom{n}{k}} \right]
 $$
 
-mG-Pass@$k$ extends the concept of G-Pass@$k_{\tau}$ by integrating over all thresholds from 0.5 to 1.0,
-effectively calculating the area under the curve of G-Pass@$k_{\tau}$.
-This provides an overall measure of how well the LLM performs across different levels of stringency.
+- **mG-Pass@k**: Extends G-Pass@$ k_{\tau} $ by integrating over all thresholds from 0.5 to 1.0, effectively calculating the area under the curve of G-Pass@$ k_{\tau} $. This provides an overall measure of the LLM's performance across different stringency levels.
 
 $$
-    \text{mG-Pass@}k = 2\int_{0.5}^{1.0} \text{G-Pass@}k_{\tau} d \tau = \frac{2}{k} \sum_{i= \lceil 0.5 \cdot k \rceil + 1}^{k} \text{G-Pass@}k_{\frac{i}{k}}
+\text{mG-Pass@}k = 2\int_{0.5}^{1.0} \text{G-Pass@}k_{\tau} d \tau = \frac{2}{k} \sum_{i= \lceil 0.5 \cdot k \rceil + 1}^{k} \text{G-Pass@}k_{\frac{i}{k}}
 $$
 
+
 ## How to Use
 
 ### Inputs
-- **predictions** (List[List[str]]): list of generations to evaluate. Each prediction should be a list of string with several model-generated solutions.
-- **references** (List[str]): list of answer for each prediction.
-- **k** (List[int]): list of number of attempts to consider in evaluation (Default: [4, 8, 16]).
-- **thresholds** (List[float]): list of thresholds to consider in evaluation (Default: [0.25, 0.5, 0.75, 1.0]).
-- **check_correct_fn** (Callable): function to check if a prediction is correct. It should have two parameters: `pred` and `ref` and output a boolean
+To use the G-Pass@k metric, provide the following inputs:
+
+- **predictions** (`List[List[str]]`): A list of lists, where each sublist contains multiple model-generated solutions for a single task.
+- **references** (`List[str]`): A list of correct answers corresponding to each set of predictions.
+- **k** (`List[int]`): A list of integers representing the number of attempts to consider in the evaluation. Defaults to `[4, 8, 16]`.
+- **thresholds** (`List[float]`): A list of floating-point numbers representing the thresholds to consider in the evaluation. Defaults to `[0.25, 0.5, 0.75, 1.0]`.
+- **check_correct_fn** (`Callable`): A function that checks if a prediction matches the reference. It should take two parameters (`pred`, `ref`) and return a boolean value.
 
 ### Output Values
 
-The G-Pass@k metric returns one dict:
-`g_pass_at_k`: dict with scores for each $k$ and threshold, and mG-Pass@$k$.
+The G-Pass@k metric returns a dictionary with scores for each combination of $ k $ and threshold, as well as the mG-Pass@k score.
 
-These metrics can take on any value between 0 and 1, inclusive. Higher scores are better.
+All metrics produce values between 0 and 1, inclusive. A higher score indicates better performance.
 
 #### Values from Popular Papers
-The [leaderboard](https://open-compass.github.io/GPassK/) contains performance of several open-source and closed-source LLMs on the mathematical task.
+For a comprehensive overview of how various open-source and closed-source LLMs perform on this metric, visit the [leaderboard](https://open-compass.github.io/GPassK/).
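+
+As a quick sanity check, G-Pass@$ k_{\tau} $ can also be estimated by simulation, since it is exactly the chance that at least $ \lceil \tau \cdot k \rceil $ of $ k $ generations drawn without replacement are correct. The sketch below is illustrative only (the helper name is ours, not part of this module) and should converge to the analytic `G-Pass@4_0.5` value from the example that follows:
+
+```python
+import math
+import random
+
+
+def mc_g_pass_at_k(labels, k, tau, trials=200_000):
+    # labels: one boolean per generation, True where the solution is correct.
+    m = max(math.ceil(tau * k), 1)
+    hits = 0
+    for _ in range(trials):
+        sample = random.sample(labels, k)  # draw k generations without replacement
+        if sum(sample) >= m:
+            hits += 1
+    return hits / trials
+
+
+labels = [True] * 8 + [False] * 8  # 8 of 16 generations correct, as in the example
+print(mc_g_pass_at_k(labels, k=4, tau=0.5))  # ~0.715, close to G-Pass@4_0.5
+```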
### Examples +Below is an example of how to compute the G-Pass@k metric using Python: + ```python from evaluate import load -g_pass_at_k_evaluator = evaluate.load("gpassk") + +g_pass_at_k_evaluator = load("gpassk") predictions = [["a", "b", "a", "a", "b", "a", "b", "c", "a", "c", "b", "a", "a", "b", "a", "b"]] references = ["a"] check_correct_fn = lambda pred, ref: pred == ref -g_pass_at_k = g_pass_at_k_evaluator.compute(predictions=predictions, - references=references, k=[4, 8], check_correct_fn=check_correct_fn) + +g_pass_at_k = g_pass_at_k_evaluator.compute( + predictions=predictions, + references=references, + k=[4, 8], + check_correct_fn=check_correct_fn +) + print(g_pass_at_k) +``` + +Sample output: +```json { - 'G-Pass@4_0.25': 0.9615384615384616, 'G-Pass@4_0.5': 0.7153846153846154, - 'G-Pass@4_0.75': 0.2846153846153846, 'G-Pass@4_1.0': 0.038461538461538464, - 'G-Pass@8_0.25': 0.9949494949494949, 'G-Pass@8_0.5': 0.6903651903651904, - 'G-Pass@8_0.75': 0.06596736596736597, 'G-Pass@8_1.0': 7.77000777000777e-05, - 'mG-Pass@4': 0.16153846153846152, 'mG-Pass@8': 0.09518259518259518 + "G-Pass@4_0.25": 0.9615384615384616, + "G-Pass@4_0.5": 0.7153846153846154, + "G-Pass@4_0.75": 0.2846153846153846, + "G-Pass@4_1.0": 0.038461538461538464, + "G-Pass@8_0.25": 0.9949494949494949, + "G-Pass@8_0.5": 0.6903651903651904, + "G-Pass@8_0.75": 0.06596736596736597, + "G-Pass@8_1.0": 7.77000777000777e-05, + "mG-Pass@4": 0.16153846153846152, + "mG-Pass@8": 0.09518259518259518 } ``` diff --git a/metrics/gpassk/gpassk.py b/metrics/gpassk/gpassk.py index 1c437fa5..f2684586 100644 --- a/metrics/gpassk/gpassk.py +++ b/metrics/gpassk/gpassk.py @@ -93,8 +93,10 @@ def _info(self): citation=_CITATION, inputs_description=_KWARGS_DESCRIPTION, features=datasets.Features({ - 'predictions': datasets.Value('int64'), - 'references': datasets.Value('int64'), + 'predictions': datasets.Sequence(datasets.Value('string')), + 'references': datasets.Value('string'), + 'k': datasets.Sequence(datasets.Value('int32')), + 'thresholds': datasets.Sequence(datasets.Value('float')) }), homepage="https://open-compass.github.io/GPassK/", codebase_urls=["https://github.com/open-compass/GPassK"],