Bump the pip group across 1 directory with 3 updates (#3070)
Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Yifan Mai <yifan@cs.stanford.edu>
dependabot[bot] and yifanmai authored Oct 21, 2024
1 parent 4fbe7d4 commit bec3727
Showing 7 changed files with 82 additions and 25 deletions.
requirements.txt (7 changes: 3 additions & 4 deletions)
@@ -167,7 +167,7 @@ natsort==8.4.0
 nest-asyncio==1.6.0
 networkx==3.2.1
 ninja==1.11.1.1
-nltk==3.8.1
+nltk==3.9.1
 nncf==2.13.0
 nodeenv==1.9.1
 NudeNet==2.0.9
@@ -178,8 +178,7 @@ onnx==1.16.2
 onnxruntime==1.19.2
 open_clip_torch==2.26.1
 openai==1.48.0
-OpenCC==1.1.7
-opencv-python==4.7.0.72
+opencv-python==4.8.1.78
 opencv-python-headless==4.10.0.84
 openvino==2024.4.0
 openvino-telemetry==2024.1.0
@@ -274,7 +273,7 @@ soupsieve==2.6
 spacy==3.7.6
 spacy-legacy==3.0.12
 spacy-loggers==1.0.5
-sqlitedict==1.7.0
+sqlitedict==2.1.0
 srsly==2.4.8
 surge-api==1.1.4
 sympy==1.11.1
setup.cfg (9 changes: 5 additions & 4 deletions)
@@ -32,14 +32,15 @@ install_requires=
     importlib-resources~=5.10
     Mako~=1.2
     numpy~=1.23
+    pandas~=2.0
     pyhocon~=0.3.59
     retrying~=1.3
     spacy~=3.5
     tqdm~=4.64
     zstandard~=0.18.0
     # sqlitedict==2.0.0 is slow! https://github.com/RaRe-Technologies/sqlitedict/issues/152
     # Keep sqlitedict version at 1.7.0.
-    sqlitedict~=1.7
+    sqlitedict>=2.1.0,<3.0
     bottle~=0.12.23

     # Basic Scenarios
@@ -48,7 +49,7 @@ install_requires=
     pyarrow-hotfix~=0.6 # Hotfix for CVE-2023-47248

     # Basic metrics
-    nltk~=3.7,<3.8.2 # See https://github.com/stanford-crfm/helm/issues/2926
+    nltk~=3.7,!=3.9.0 # Cannot use 3.9.0 due to https://github.com/nltk/nltk/issues/3308
     rouge-score~=0.1.2
     scipy~=1.10
     uncertainty-calibration~=0.1.4
@@ -218,7 +219,7 @@ image2struct =
     html2text~=2024.2.26

     # Metrics
-    opencv-python~=4.7.0.68
+    opencv-python>=4.7.0.68,<4.8.2.0
     lpips~=0.1.4
     imagehash~=4.3.1 # for caching
@@ -254,7 +255,7 @@ heim =
     lpips~=0.1.4
     multilingual-clip~=1.0.10
     NudeNet~=2.0.9
-    opencv-python~=4.7.0.68
+    opencv-python>=4.7.0.68,<4.8.2.0
     pytorch-fid~=0.3.0
     tensorflow~=2.11
     timm~=0.6.12
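For reference, the new version specifiers can be sanity-checked against PEP 440 semantics; a minimal sketch (illustrative, not part of this commit) assuming the third-party packaging library is installed, with the pins copied from the diff above:

from packaging.specifiers import SpecifierSet

# New sqlitedict pin accepts 2.1.0 and rejects the slow 2.0.0 release.
assert "2.1.0" in SpecifierSet(">=2.1.0,<3.0")
assert "2.0.0" not in SpecifierSet(">=2.1.0,<3.0")
# The nltk pin allows 3.9.1 (which ships punkt_tab) but excludes the broken 3.9.0.
assert "3.9.1" in SpecifierSet("~=3.7,!=3.9.0")
assert "3.9.0" not in SpecifierSet("~=3.7,!=3.9.0")
# opencv-python is now a bounded range rather than a compatible-release pin.
assert "4.8.1.78" in SpecifierSet(">=4.7.0.68,<4.8.2.0")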
src/helm/benchmark/metrics/bias_metrics.py (15 changes: 12 additions & 3 deletions)
@@ -8,9 +8,18 @@

 from helm.common.request import RequestResult, GeneratedOutput
 from helm.benchmark.adaptation.request_state import RequestState
-from .statistic import Stat
-from .metric_name import MetricName
-from .bias_word_lists import GENDER_TO_WORD_LISTS, RACE_TO_NAME_LISTS, ADJECTIVE_LIST, PROFESSION_LIST
+from helm.benchmark.metrics.nltk_helper import install_nltk_resources
+from helm.benchmark.metrics.statistic import Stat
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.bias_word_lists import (
+    GENDER_TO_WORD_LISTS,
+    RACE_TO_NAME_LISTS,
+    ADJECTIVE_LIST,
+    PROFESSION_LIST,
+)
+
+
+install_nltk_resources()


 class BiasMetric(EvaluateInstancesMetric):
src/helm/benchmark/metrics/evaluate_reference_metrics.py (27 changes: 13 additions & 14 deletions)
@@ -1,32 +1,31 @@
 from dataclasses import replace
+from functools import partial
 from typing import Callable, Dict, List, Optional, Set, Tuple, cast
+import re
+import string
+
+from nltk.metrics.scores import f_measure
+from nltk.tokenize import word_tokenize
+from nltk.translate.bleu_score import sentence_bleu
+from rouge_score import rouge_scorer
 import numpy as np
-from functools import partial

 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.metrics import code_metrics_helper
 from helm.benchmark.metrics.cleva_metrics_helper import ChineseTokenizer
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.nltk_helper import install_nltk_resources
 from helm.benchmark.metrics.statistic import Stat
 from helm.benchmark.scenarios.code_scenario import CodeReference
+from helm.benchmark.scenarios.math_scenario import is_equiv, is_equiv_chain_of_thought
 from helm.benchmark.scenarios.scenario import Reference
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.common.request import GeneratedOutput
-from helm.benchmark.scenarios.math_scenario import is_equiv, is_equiv_chain_of_thought
-from nltk.metrics.scores import f_measure
-from nltk.translate.bleu_score import sentence_bleu
-from nltk.tokenize import word_tokenize
-from rouge_score import rouge_scorer
-import re
-import string
-from . import code_metrics_helper
-import nltk


-try:
-    nltk.data.find("tokenizers/punkt")
-except LookupError:
-    nltk.download("punkt")  # Required for rouge
+install_nltk_resources()


def pass_at_k_estimator(n: int, c: int, k: int) -> float:
src/helm/benchmark/metrics/nltk_helper.py (30 changes: 30 additions & 0 deletions)
@@ -0,0 +1,30 @@
+import nltk
+from importlib.metadata import version
+
+
+def install_nltk_resources():
+    """Install resources for nltk tokenizers, which is required for bleu and rouge scores."""
+    # Install "punkt_tab" for nltk>=3.9.1 or "punkt" for nltk<=3.8.1
+    #
+    # "punkt" is no longer supported for newer versions of nltk due to a security issue
+    # and has been replaced by "punkt_tab". For more information, see:
+    #
+    # - https://github.com/stanford-crfm/helm/issues/2926
+    # - https://github.com/nltk/nltk/issues/3293
+    # - https://github.com/nltk/nltk/issues/3266
+    # - https://nvd.nist.gov/vuln/detail/CVE-2024-39705
+    #
+    # TODO: Remove support for nltk<=3.8.1 and only install "punkt_tab"
+    nltk_major_version, nltk_minor_version = [int(v) for v in version("nltk").split(".")[0:2]]
+    if nltk_major_version < 3:
+        raise Exception("nltk version <3 is not supported")
+    if nltk_minor_version >= 9:
+        try:
+            nltk.data.find("tokenizers/punkt_tab")
+        except LookupError:
+            nltk.download("punkt_tab")
+    else:
+        try:
+            nltk.data.find("tokenizers/punkt")
+        except LookupError:
+            nltk.download("punkt")
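For reference, a minimal usage sketch (illustrative, not part of this commit) of how the modules above consume this helper; the call is idempotent, since nltk.data.find() succeeds on every run after the first download:

from helm.benchmark.metrics.nltk_helper import install_nltk_resources
from nltk.tokenize import word_tokenize

install_nltk_resources()  # downloads punkt/punkt_tab only if missing
print(word_tokenize("Punkt models split text into tokens."))
# ['Punkt', 'models', 'split', 'text', 'into', 'tokens', '.']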
src/helm/benchmark/metrics/test_bias_metrics.py (4 changes: 4 additions & 0 deletions)
@@ -2,6 +2,10 @@
 from typing import Callable, List, Optional

 from .bias_metrics import BiasMetric
+from helm.benchmark.metrics.nltk_helper import install_nltk_resources
+
+
+install_nltk_resources()


 @dataclass(frozen=True)
src/helm/benchmark/metrics/test_evaluate_reference_metrics.py (15 changes: 15 additions & 0 deletions)
@@ -1,4 +1,7 @@
+import pytest
 from helm.benchmark.metrics.evaluate_reference_metrics import (
+    bleu_1,
+    chinese_bleu_1,
     exact_match,
     exact_match_indicator,
     final_number_exact_match,
@@ -28,3 +31,15 @@ def test_final_number_exact_match():
     assert final_number_exact_match("34.2", "2") == 0
     assert final_number_exact_match("342", "342.") == 1
     assert final_number_exact_match("3,420", "3420") == 1
+
+
+def test_bleu_1():
+    assert bleu_1("the quick brown fox jumped over the lazy dog", "the fox jumped over the dog") == pytest.approx(
+        0.6065306597126334
+    )
+
+
+def test_chinese_bleu_1():
+    assert chinese_bleu_1(
+        "太祖武皇帝,沛國譙人也,姓曹,諱操,字孟德,漢相國參之後。", "太祖武皇帝,沛國譙人也,漢相國參之後。"
+    ) == pytest.approx(0.5907775139012316)

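For reference (not part of this commit): the expected value in test_bleu_1 is exactly the BLEU brevity penalty exp(1 - 9/6) = exp(-0.5) ≈ 0.60653, since all six hypothesis unigrams appear in the nine-token reference and clipped unigram precision is therefore 1.0. A minimal sketch reproducing it with nltk directly, assuming bleu_1 tokenizes with word_tokenize and scores with weights (1, 0, 0, 0):

from math import exp, isclose

from nltk.tokenize import word_tokenize
from nltk.translate.bleu_score import sentence_bleu

ref = word_tokenize("the quick brown fox jumped over the lazy dog")  # 9 tokens
hyp = word_tokenize("the fox jumped over the dog")  # 6 tokens

# All 6 hypothesis unigrams match the reference after clipping ("the" occurs
# twice in both), so precision is 6/6 and only the brevity penalty remains.
score = sentence_bleu([ref], hyp, weights=(1, 0, 0, 0))
assert isclose(score, exp(1 - 9 / 6))  # 0.6065306597126334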