From 78dd0f48010c5b61538ac5e96ee57dba00dd7850 Mon Sep 17 00:00:00 2001
From: jnanliu
Date: Mon, 23 Dec 2024 10:45:39 +0000
Subject: [PATCH 1/2] update implementation of G-Pass@k

---
 metrics/gpassk/README.md        |  90 ++++++++++++++++++
 metrics/gpassk/app.py           |   6 ++
 metrics/gpassk/gpassk.py        | 161 ++++++++++++++++++++++++++++++++
 metrics/gpassk/requirements.txt |   3 +
 4 files changed, 260 insertions(+)
 create mode 100644 metrics/gpassk/README.md
 create mode 100644 metrics/gpassk/app.py
 create mode 100644 metrics/gpassk/gpassk.py
 create mode 100644 metrics/gpassk/requirements.txt

diff --git a/metrics/gpassk/README.md b/metrics/gpassk/README.md
new file mode 100644
index 00000000..bdc4686f
--- /dev/null
+++ b/metrics/gpassk/README.md
@@ -0,0 +1,90 @@
+---
+title: G-Pass@k
+emoji: 🤗
+colorFrom: blue
+colorTo: red
+sdk: gradio
+sdk_version: 3.19.1
+app_file: app.py
+pinned: false
+tags:
+- evaluate
+- metric
+description: >-
+  G-Pass@$k$ is a generalization of the Pass@$k$ metric, which evaluates both the stability and potential of large language models (LLMs) in reasoning tasks, described in https://arxiv.org/abs/2412.13147.
+---
+
+# Metric Card for GPassK
+
+## Metric Description
+G-Pass@$k$ is a generalization of the Pass@$k$ metric, which evaluates both the stability and potential of large language models (LLMs) in reasoning tasks.
+
+Given a threshold $\tau$, the G-Pass@$k_{\tau}$ measures the probability that a model will pass at least $m = \lceil \tau \cdot k \rceil$ out of $k$ attempts,
+where $c$ is the number of correct solutions and $n$ is the total number of generations.
+
+$$
+    \text{G-Pass@}k_{\tau} = \left[ \sum_{j = \lceil \tau \cdot k \rceil}^{c} \frac{\binom{c}{j} \cdot \binom{n - c}{k - j}}{\binom{n}{k}} \right]
+$$
+
+mG-Pass@$k$ extends the concept of G-Pass@$k_{\tau}$ by integrating over all thresholds from 0.5 to 1.0,
+effectively calculating the area under the curve of G-Pass@$k_{\tau}$.
+This provides an overall measure of how well the LLM performs across different levels of stringency.
+
+$$
+    \text{mG-Pass@}k = 2\int_{0.5}^{1.0} \text{G-Pass@}k_{\tau} d \tau = \frac{2}{k} \sum_{i= \lceil 0.5 \cdot k \rceil + 1}^{k} \text{G-Pass@}k_{\frac{i}{k}}
+$$
+
+## How to Use
+
+### Inputs
+- **predictions** (List[List[str]]): list of generations to evaluate. Each prediction should be a list of string with several model-generated solutions.
+- **references** (List[str]): list of answer for each prediction.
+- **k** (List[int]): list of number of attempts to consider in evaluation (Default: [4, 8, 16]).
+- **thresholds** (List[float]): list of thresholds to consider in evaluation (Default: [0.25, 0.5, 0.75, 1.0]).
+- **check_correct_fn** (Callable): function to check if a prediction is correct. It should have two parameters: `pred` and `ref` and output a boolean
+
+### Output Values
+
+The G-Pass@k metric returns one dict:
+`g_pass_at_k`: dict with scores for each $k$ and threshold, and mG-Pass@$k$.
+
+These metrics can take on any value between 0 and 1, inclusive. Higher scores are better.
+
+#### Values from Popular Papers
+The [leaderboard](https://open-compass.github.io/GPassK/) contains performance of several open-source and closed-source LLMs on the mathematical task.
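+
+Since G-Pass@$k_{\tau}$ is the upper tail of a hypergeometric distribution, it can be computed directly with `scipy`, which is what this module does internally. The minimal sketch below is illustrative only (the helper name is ours, not part of the module's API); it reproduces the `G-Pass@4_0.5` value from the example in the next section, where 8 of the 16 sampled generations are correct:
+
+```python
+from math import ceil
+
+from scipy.stats import hypergeom
+
+
+def g_pass_at_k_tau(n: int, c: int, k: int, tau: float) -> float:
+    """P(at least ceil(tau * k) correct) when drawing k of the n
+    generations (c of which are correct) without replacement."""
+    m = max(ceil(tau * k), 1)
+    # sf(m - 1) is P(X >= m) for X ~ Hypergeom(population n, successes c, draws k).
+    return hypergeom.sf(m - 1, n, c, k)
+
+
+print(g_pass_at_k_tau(n=16, c=8, k=4, tau=0.5))  # 0.7153846..., i.e. G-Pass@4_0.5
+```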
+
+### Examples
+```python
+from evaluate import load
+g_pass_at_k_evaluator = evaluate.load("gpassk")
+predictions = [["a", "b", "a", "a", "b", "a", "b", "c", "a", "c", "b", "a", "a", "b", "a", "b"]]
+references = ["a"]
+check_correct_fn = lambda pred, ref: pred == ref
+g_pass_at_k = g_pass_at_k_evaluator.compute(predictions=predictions,
+                                            references=references, k=[4, 8], check_correct_fn=check_correct_fn)
+print(g_pass_at_k)
+{
+    'G-Pass@4_0.25': 0.9615384615384616, 'G-Pass@4_0.5': 0.7153846153846154,
+    'G-Pass@4_0.75': 0.2846153846153846, 'G-Pass@4_1.0': 0.038461538461538464,
+    'G-Pass@8_0.25': 0.9949494949494949, 'G-Pass@8_0.5': 0.6903651903651904,
+    'G-Pass@8_0.75': 0.06596736596736597, 'G-Pass@8_1.0': 7.77000777000777e-05,
+    'mG-Pass@4': 0.16153846153846152, 'mG-Pass@8': 0.09518259518259518
+}
+```
+
+## Citation
+```bibtex
+@misc{liu2024llmscapablestablereasoning,
+  title={Are Your LLMs Capable of Stable Reasoning?},
+  author={Junnan Liu and Hongwei Liu and Linchen Xiao and Ziyi Wang and Kuikun Liu and Songyang Gao and Wenwei Zhang and Songyang Zhang and Kai Chen},
+  year={2024},
+  eprint={2412.13147},
+  archivePrefix={arXiv},
+  primaryClass={cs.AI},
+  url={https://arxiv.org/abs/2412.13147},
+}
+```
+
+## Further References
+
+- [GPassK on GitHub](https://github.com/open-compass/GPassK/)
diff --git a/metrics/gpassk/app.py b/metrics/gpassk/app.py
new file mode 100644
index 00000000..ba08b4c3
--- /dev/null
+++ b/metrics/gpassk/app.py
@@ -0,0 +1,6 @@
+import evaluate
+from evaluate.utils import launch_gradio_widget
+
+
+module = evaluate.load("gpassk")
+launch_gradio_widget(module)
\ No newline at end of file
diff --git a/metrics/gpassk/gpassk.py b/metrics/gpassk/gpassk.py
new file mode 100644
index 00000000..1c437fa5
--- /dev/null
+++ b/metrics/gpassk/gpassk.py
@@ -0,0 +1,161 @@
+# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Implementation of the G-Pass@k metric described in https://arxiv.org/abs/2412.13147."""
+from typing import List, Callable
+from functools import partial
+import inspect
+
+import datasets
+import numpy as np
+from scipy.stats import hypergeom
+
+import evaluate
+
+
+_CITATION = """\
+@misc{liu2024llmscapablestablereasoning,
+  title={Are Your LLMs Capable of Stable Reasoning?},
+  author={Junnan Liu and Hongwei Liu and Linchen Xiao and Ziyi Wang and Kuikun Liu and Songyang Gao and Wenwei Zhang and Songyang Zhang and Kai Chen},
+  year={2024},
+  eprint={2412.13147},
+  archivePrefix={arXiv},
+  primaryClass={cs.AI},
+  url={https://arxiv.org/abs/2412.13147},
+}
+"""
+
+
+_DESCRIPTION = """\
+G-Pass@:math:`k` is a generalization of the Pass@:math:`k` metric, which evaluates both the stability and potential of large language models (LLMs) in reasoning tasks.
+
+Given a threshold :math:`\tau`, the G-Pass@:math:`k_{\tau}` measures the probability that a model will pass at least :math:`m = \lceil \tau \cdot k \rceil` out of :math:`k` attempts,
+where :math:`c` is the number of correct solutions and :math:`n` is the total number of generations.
+
+.. math::
+    \text{G-Pass@}k_{\tau} = \left[ \sum_{j = \lceil \tau \cdot k \rceil}^{c} \frac{\binom{c}{j} \cdot \binom{n - c}{k - j}}{\binom{n}{k}} \right]
+
+mG-Pass@:math:`k` extends the concept of G-Pass@:math:`k_{\tau}` by integrating over all thresholds from 0.5 to 1.0,
+effectively calculating the area under the curve of G-Pass@:math:`k_{\tau}`.
+This provides an overall measure of how well the LLM performs across different levels of stringency.
+
+.. math::
+    \text{mG-Pass@}k = 2\int_{0.5}^{1.0} \text{G-Pass@}k_{\tau} d \tau = \frac{2}{k} \sum_{i= \lceil 0.5 \cdot k \rceil + 1}^{k} \text{G-Pass@}k_{\frac{i}{k}}
+
+"""
+
+
+_KWARGS_DESCRIPTION = """
+Computes G-Pass@k and mG-Pass@k scores for model-generated predictions against references.
+Args:
+    predictions: list of generations to evaluate. Each prediction should be a
+        list of strings with several model-generated solutions.
+    references: list of answers, one for each prediction.
+    k: list of number of attempts to consider in evaluation (Default: [4, 8, 16]).
+    thresholds: list of thresholds to consider in evaluation (Default: [0.25, 0.5, 0.75, 1.0]).
+    check_correct_fn: function to check if a prediction is correct.
+        It should take two parameters, `pred` and `ref`, and return a boolean.
+Returns:
+    g_pass_at_k: dict with scores for each k and threshold, and mG-Pass@k.
+Examples:
+    >>> g_pass_at_k_evaluator = evaluate.load("gpassk")
+    >>> predictions = [["a", "b", "a", "a", "b", "a", "b", "c", "a", "c", "b", "a", "a", "b", "a", "b"]]
+    >>> references = ["a"]
+    >>> check_correct_fn = lambda pred, ref: pred == ref
+    >>> g_pass_at_k = g_pass_at_k_evaluator.compute(predictions=predictions,
+    ...     references=references, k=[4, 8], check_correct_fn=check_correct_fn)
+    >>> print(g_pass_at_k)
+    {'G-Pass@4_0.25': 0.9615384615384616, 'G-Pass@4_0.5': 0.7153846153846154,
+    'G-Pass@4_0.75': 0.2846153846153846, 'G-Pass@4_1.0': 0.038461538461538464,
+    'G-Pass@8_0.25': 0.9949494949494949, 'G-Pass@8_0.5': 0.6903651903651904,
+    'G-Pass@8_0.75': 0.06596736596736597, 'G-Pass@8_1.0': 7.77000777000777e-05,
+    'mG-Pass@4': 0.16153846153846152, 'mG-Pass@8': 0.09518259518259518}
+"""
+
+
+@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
+class GPassK(evaluate.Metric):
+
+    def _info(self):
+        return evaluate.MetricInfo(
+            module_type="metric",
+            description=_DESCRIPTION,
+            citation=_CITATION,
+            inputs_description=_KWARGS_DESCRIPTION,
+            features=datasets.Features({
+                'predictions': datasets.Value('int64'),
+                'references': datasets.Value('int64'),
+            }),
+            homepage="https://open-compass.github.io/GPassK/",
+            codebase_urls=["https://github.com/open-compass/GPassK"],
+            reference_urls=["http://arxiv.org/abs/2412.13147"]
+        )
+
+    def _compute(self,
+                 predictions: List[List[str]],
+                 references: List[str],
+                 k=[4, 8, 16],
+                 thresholds=[0.25, 0.5, 0.75, 1.0],
+                 check_correct_fn: Callable = None):
+        """Compute GPassK metric."""
+
+        if check_correct_fn is None:
+            raise ValueError('`check_correct_fn` is required for GPassK metric')
+
+        sig = inspect.signature(check_correct_fn)
+        if len(sig.parameters) != 2:
+            raise ValueError(f'`check_correct_fn` should have exactly 2 parameters, got {len(sig.parameters)}')
+        for name in sig.parameters:
+            if name not in ['pred', 'ref']:
+                raise ValueError(f'`check_correct_fn` should have only `pred` and `ref` as parameters, got {name}')
+
+        n_list, c_list = [], []
+        for preds, ref in zip(predictions, references):
+            labels = list(map(partial(check_correct_fn, ref=ref), preds))
+            n = len(preds)
+            c = sum(labels)
+            n_list.append(n)
+            c_list.append(c)
+
+        g_pass_at_k = {
+            f"G-Pass@{k_i}_{t}": np.mean([compute_g_pass_at_k(n, c, k_i, t) for n, c in zip(n_list, c_list)]).item()
+            for k_i in k
+            for t in thresholds
+        }
+        g_pass_at_k.update({
+            f"mG-Pass@{k_i}": np.mean([compute_mg_pass_at_k(n, c, k_i) for n, c in zip(n_list, c_list)]).item()
+            for k_i in k
+        })
+        return g_pass_at_k
+
+
+def _compute_g_pass_at_k(n, c, k, m):
+    # Probability of drawing at least m correct solutions when sampling k of the
+    # n generations (c of which are correct) without replacement: the survival
+    # function of a hypergeometric distribution evaluated at m - 1.
+    if m > min(c, k) or k > n or c < 0 or n <= 0 or m < 0:
+        return 0.0
+    return hypergeom.sf(m - 1, n, c, k)
+
+
+def compute_g_pass_at_k(n, c, k, t):
+    # At least a fraction t of the k attempts must be correct (minimum one).
+    m = max(int(np.ceil(k * t)), 1)
+    return _compute_g_pass_at_k(n, c, k, m)
+
+
+def compute_mg_pass_at_k(n, c, k):
+    # Discretized area under the G-Pass@k_tau curve for tau in (0.5, 1.0].
+    l, r = int(np.ceil(k * 0.5)), k
+
+    mg_pass_at_k = 0.0
+    for i in range(l + 1, r + 1):
+        mg_pass_at_k += _compute_g_pass_at_k(n, c, k, i)
+    mg_pass_at_k = 2 * mg_pass_at_k / k
+
+    return mg_pass_at_k
diff --git a/metrics/gpassk/requirements.txt b/metrics/gpassk/requirements.txt
new file mode 100644
index 00000000..52df037e
--- /dev/null
+++ b/metrics/gpassk/requirements.txt
@@ -0,0 +1,3 @@
+git+https://github.com/huggingface/evaluate@main
+scipy==1.14.1
+numpy==2.2.1
\ No newline at end of file

From 58e82e08872842d295b19cbff97d5833197a7624 Mon Sep 17 00:00:00 2001
From: jnanliu
Date: Mon, 23 Dec 2024 12:04:33 +0000
Subject: [PATCH 2/2] fix readme & metric input features

---
 metrics/gpassk/README.md | 70 ++++++++++++++++++++++++++--------------
 metrics/gpassk/gpassk.py |  6 ++--
 2 files changed, 49 insertions(+), 27 deletions(-)

diff --git a/metrics/gpassk/README.md b/metrics/gpassk/README.md
index bdc4686f..0f95e013 100644
--- a/metrics/gpassk/README.md
+++ b/metrics/gpassk/README.md
@@ -17,58 +17,78 @@
 # Metric Card for GPassK
 
 ## Metric Description
-G-Pass@$k$ is a generalization of the Pass@$k$ metric, which evaluates both the stability and potential of large language models (LLMs) in reasoning tasks.
+G-Pass@ $k$ is a generalization of the Pass@$k$ metric, which evaluates both the stability and potential of large language models (LLMs) in reasoning tasks.
 
-Given a threshold $\tau$, the G-Pass@$k_{\tau}$ measures the probability that a model will pass at least $m = \lceil \tau \cdot k \rceil$ out of $k$ attempts,
-where $c$ is the number of correct solutions and $n$ is the total number of generations.
+- **G-Pass@k**: Evaluates the probability that an LLM will correctly solve at least $ m = \lceil \tau \cdot k \rceil $ out of $ k $ attempts, where:
+  - $ \tau $: The threshold indicating the minimum proportion of correct solutions.
+  - $ c $: The number of correct solutions generated by the model.
+  - $ n $: The total number of generations or trials.
 
 $$
-    \text{G-Pass@}k_{\tau} = \left[ \sum_{j = \lceil \tau \cdot k \rceil}^{c} \frac{\binom{c}{j} \cdot \binom{n - c}{k - j}}{\binom{n}{k}} \right]
+\text{G-Pass@}k_{\tau} = \left[ \sum_{j = \lceil \tau \cdot k \rceil}^{c} \frac{\binom{c}{j} \cdot \binom{n - c}{k - j}}{\binom{n}{k}} \right]
 $$
 
-mG-Pass@$k$ extends the concept of G-Pass@$k_{\tau}$ by integrating over all thresholds from 0.5 to 1.0,
-effectively calculating the area under the curve of G-Pass@$k_{\tau}$.
-This provides an overall measure of how well the LLM performs across different levels of stringency.
+- **mG-Pass@k**: Extends G-Pass@$ k_{\tau} $ by integrating over all thresholds from 0.5 to 1.0, effectively calculating the area under the curve of G-Pass@$ k_{\tau} $. This provides an overall measure of the LLM's performance across different stringency levels.
 
 $$
-    \text{mG-Pass@}k = 2\int_{0.5}^{1.0} \text{G-Pass@}k_{\tau} d \tau = \frac{2}{k} \sum_{i= \lceil 0.5 \cdot k \rceil + 1}^{k} \text{G-Pass@}k_{\frac{i}{k}}
+\text{mG-Pass@}k = 2\int_{0.5}^{1.0} \text{G-Pass@}k_{\tau} d \tau = \frac{2}{k} \sum_{i= \lceil 0.5 \cdot k \rceil + 1}^{k} \text{G-Pass@}k_{\frac{i}{k}}
 $$
 
+
 ## How to Use
 
 ### Inputs
-- **predictions** (List[List[str]]): list of generations to evaluate. Each prediction should be a list of string with several model-generated solutions.
-- **references** (List[str]): list of answer for each prediction.
-- **k** (List[int]): list of number of attempts to consider in evaluation (Default: [4, 8, 16]).
-- **thresholds** (List[float]): list of thresholds to consider in evaluation (Default: [0.25, 0.5, 0.75, 1.0]).
-- **check_correct_fn** (Callable): function to check if a prediction is correct. It should have two parameters: `pred` and `ref` and output a boolean
+To use the G-Pass@k metric, provide the following inputs:
+
+- **predictions** (`List[List[str]]`): A list of lists, where each sublist contains multiple model-generated solutions for a single task.
+- **references** (`List[str]`): A list of correct answers corresponding to each set of predictions.
+- **k** (`List[int]`): A list of integers representing the number of attempts to consider in the evaluation. Defaults to `[4, 8, 16]`.
+- **thresholds** (`List[float]`): A list of floating-point numbers representing the thresholds to consider in the evaluation. Defaults to `[0.25, 0.5, 0.75, 1.0]`.
+- **check_correct_fn** (`Callable`): A function that checks if a prediction matches the reference. It should take two parameters (`pred`, `ref`) and return a boolean value.
 
 ### Output Values
 
-The G-Pass@k metric returns one dict:
-`g_pass_at_k`: dict with scores for each $k$ and threshold, and mG-Pass@$k$.
+The G-Pass@k metric returns a dictionary with scores for each combination of $ k $ and threshold, as well as the mG-Pass@k score.
 
-These metrics can take on any value between 0 and 1, inclusive. Higher scores are better.
+All metrics produce values between 0 and 1, inclusive. A higher score indicates better performance.
 
 #### Values from Popular Papers
-The [leaderboard](https://open-compass.github.io/GPassK/) contains performance of several open-source and closed-source LLMs on the mathematical task.
+For a comprehensive overview of how various open-source and closed-source LLMs perform on this metric, visit the [leaderboard](https://open-compass.github.io/GPassK/).
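+
+As a quick sanity check, G-Pass@$ k_{\tau} $ can also be estimated by simulation, since it is exactly the chance that at least $ \lceil \tau \cdot k \rceil $ of $ k $ generations drawn without replacement are correct. The sketch below is illustrative only (the helper name is ours, not part of this module) and should converge to the analytic `G-Pass@4_0.5` value from the example that follows:
+
+```python
+import math
+import random
+
+
+def mc_g_pass_at_k(labels, k, tau, trials=200_000):
+    # labels: one boolean per generation, True where the solution is correct.
+    m = max(math.ceil(tau * k), 1)
+    hits = 0
+    for _ in range(trials):
+        sample = random.sample(labels, k)  # draw k generations without replacement
+        if sum(sample) >= m:
+            hits += 1
+    return hits / trials
+
+
+labels = [True] * 8 + [False] * 8  # 8 of 16 generations correct, as in the example
+print(mc_g_pass_at_k(labels, k=4, tau=0.5))  # ~0.715, close to G-Pass@4_0.5
+```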
### Examples +Below is an example of how to compute the G-Pass@k metric using Python: + ```python from evaluate import load -g_pass_at_k_evaluator = evaluate.load("gpassk") + +g_pass_at_k_evaluator = load("gpassk") predictions = [["a", "b", "a", "a", "b", "a", "b", "c", "a", "c", "b", "a", "a", "b", "a", "b"]] references = ["a"] check_correct_fn = lambda pred, ref: pred == ref -g_pass_at_k = g_pass_at_k_evaluator.compute(predictions=predictions, - references=references, k=[4, 8], check_correct_fn=check_correct_fn) + +g_pass_at_k = g_pass_at_k_evaluator.compute( + predictions=predictions, + references=references, + k=[4, 8], + check_correct_fn=check_correct_fn +) + print(g_pass_at_k) +``` + +Sample output: +```json { - 'G-Pass@4_0.25': 0.9615384615384616, 'G-Pass@4_0.5': 0.7153846153846154, - 'G-Pass@4_0.75': 0.2846153846153846, 'G-Pass@4_1.0': 0.038461538461538464, - 'G-Pass@8_0.25': 0.9949494949494949, 'G-Pass@8_0.5': 0.6903651903651904, - 'G-Pass@8_0.75': 0.06596736596736597, 'G-Pass@8_1.0': 7.77000777000777e-05, - 'mG-Pass@4': 0.16153846153846152, 'mG-Pass@8': 0.09518259518259518 + "G-Pass@4_0.25": 0.9615384615384616, + "G-Pass@4_0.5": 0.7153846153846154, + "G-Pass@4_0.75": 0.2846153846153846, + "G-Pass@4_1.0": 0.038461538461538464, + "G-Pass@8_0.25": 0.9949494949494949, + "G-Pass@8_0.5": 0.6903651903651904, + "G-Pass@8_0.75": 0.06596736596736597, + "G-Pass@8_1.0": 7.77000777000777e-05, + "mG-Pass@4": 0.16153846153846152, + "mG-Pass@8": 0.09518259518259518 } ``` diff --git a/metrics/gpassk/gpassk.py b/metrics/gpassk/gpassk.py index 1c437fa5..f2684586 100644 --- a/metrics/gpassk/gpassk.py +++ b/metrics/gpassk/gpassk.py @@ -93,8 +93,10 @@ def _info(self): citation=_CITATION, inputs_description=_KWARGS_DESCRIPTION, features=datasets.Features({ - 'predictions': datasets.Value('int64'), - 'references': datasets.Value('int64'), + 'predictions': datasets.Sequence(datasets.Value('string')), + 'references': datasets.Value('string'), + 'k': datasets.Sequence(datasets.Value('int32')), + 'thresholds': datasets.Sequence(datasets.Value('float')) }), homepage="https://open-compass.github.io/GPassK/", codebase_urls=["https://github.com/open-compass/GPassK"],