Bump the pip group across 1 directory with 3 updates (#3070)
Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Yifan Mai <yifan@cs.stanford.edu>
dependabot[bot] and yifanmai authored Oct 21, 2024
1 parent 4fbe7d4 commit bec3727
Showing 7 changed files with 82 additions and 25 deletions.
requirements.txt (7 changes: 3 additions & 4 deletions)
@@ -167,7 +167,7 @@ natsort==8.4.0
 nest-asyncio==1.6.0
 networkx==3.2.1
 ninja==1.11.1.1
-nltk==3.8.1
+nltk==3.9.1
 nncf==2.13.0
 nodeenv==1.9.1
 NudeNet==2.0.9
@@ -178,8 +178,7 @@ onnx==1.16.2
 onnxruntime==1.19.2
 open_clip_torch==2.26.1
 openai==1.48.0
-OpenCC==1.1.7
-opencv-python==4.7.0.72
+opencv-python==4.8.1.78
 opencv-python-headless==4.10.0.84
 openvino==2024.4.0
 openvino-telemetry==2024.1.0
@@ -274,7 +273,7 @@ soupsieve==2.6
 spacy==3.7.6
 spacy-legacy==3.0.12
 spacy-loggers==1.0.5
-sqlitedict==1.7.0
+sqlitedict==2.1.0
 srsly==2.4.8
 surge-api==1.1.4
 sympy==1.11.1
setup.cfg (9 changes: 5 additions & 4 deletions)
@@ -32,14 +32,15 @@ install_requires=
     importlib-resources~=5.10
     Mako~=1.2
     numpy~=1.23
+    pandas~=2.0
     pyhocon~=0.3.59
     retrying~=1.3
     spacy~=3.5
     tqdm~=4.64
     zstandard~=0.18.0
     # sqlitedict==2.0.0 is slow! https://github.com/RaRe-Technologies/sqlitedict/issues/152
     # Keep sqlitedict version at 1.7.0.
-    sqlitedict~=1.7
+    sqlitedict>=2.1.0,<3.0
     bottle~=0.12.23

     # Basic Scenarios
@@ -48,7 +49,7 @@ install_requires=
     pyarrow-hotfix~=0.6 # Hotfix for CVE-2023-47248

     # Basic metrics
-    nltk~=3.7,<3.8.2 # See https://github.com/stanford-crfm/helm/issues/2926
+    nltk~=3.7,!=3.9.0 # Cannot use 3.9.0 due to https://github.com/nltk/nltk/issues/3308
     rouge-score~=0.1.2
     scipy~=1.10
     uncertainty-calibration~=0.1.4
@@ -218,7 +219,7 @@ image2struct =
     html2text~=2024.2.26

     # Metrics
-    opencv-python~=4.7.0.68
+    opencv-python>=4.7.0.68,<4.8.2.0
     lpips~=0.1.4
     imagehash~=4.3.1 # for caching
@@ -254,7 +255,7 @@ heim =
     lpips~=0.1.4
     multilingual-clip~=1.0.10
     NudeNet~=2.0.9
-    opencv-python~=4.7.0.68
+    opencv-python>=4.7.0.68,<4.8.2.0
     pytorch-fid~=0.3.0
     tensorflow~=2.11
     timm~=0.6.12
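For reference, the new version specifiers can be sanity-checked against PEP 440 semantics; a minimal sketch (illustrative, not part of this commit) assuming the third-party packaging library is installed, with the pins copied from the diff above:

from packaging.specifiers import SpecifierSet

# New sqlitedict pin accepts 2.1.0 and rejects the slow 2.0.0 release.
assert "2.1.0" in SpecifierSet(">=2.1.0,<3.0")
assert "2.0.0" not in SpecifierSet(">=2.1.0,<3.0")
# The nltk pin allows 3.9.1 (which ships punkt_tab) but excludes the broken 3.9.0.
assert "3.9.1" in SpecifierSet("~=3.7,!=3.9.0")
assert "3.9.0" not in SpecifierSet("~=3.7,!=3.9.0")
# opencv-python is now a bounded range rather than a compatible-release pin.
assert "4.8.1.78" in SpecifierSet(">=4.7.0.68,<4.8.2.0")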
src/helm/benchmark/metrics/bias_metrics.py (15 changes: 12 additions & 3 deletions)
@@ -8,9 +8,18 @@

 from helm.common.request import RequestResult, GeneratedOutput
 from helm.benchmark.adaptation.request_state import RequestState
-from .statistic import Stat
-from .metric_name import MetricName
-from .bias_word_lists import GENDER_TO_WORD_LISTS, RACE_TO_NAME_LISTS, ADJECTIVE_LIST, PROFESSION_LIST
+from helm.benchmark.metrics.nltk_helper import install_nltk_resources
+from helm.benchmark.metrics.statistic import Stat
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.bias_word_lists import (
+    GENDER_TO_WORD_LISTS,
+    RACE_TO_NAME_LISTS,
+    ADJECTIVE_LIST,
+    PROFESSION_LIST,
+)
+
+
+install_nltk_resources()


 class BiasMetric(EvaluateInstancesMetric):
src/helm/benchmark/metrics/evaluate_reference_metrics.py (27 changes: 13 additions & 14 deletions)
@@ -1,32 +1,31 @@
 from dataclasses import replace
+from functools import partial
 from typing import Callable, Dict, List, Optional, Set, Tuple, cast
+import re
+import string
+
+from nltk.metrics.scores import f_measure
+from nltk.tokenize import word_tokenize
+from nltk.translate.bleu_score import sentence_bleu
+from rouge_score import rouge_scorer
 import numpy as np
-from functools import partial

 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.metrics import code_metrics_helper
 from helm.benchmark.metrics.cleva_metrics_helper import ChineseTokenizer
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.nltk_helper import install_nltk_resources
 from helm.benchmark.metrics.statistic import Stat
 from helm.benchmark.scenarios.code_scenario import CodeReference
+from helm.benchmark.scenarios.math_scenario import is_equiv, is_equiv_chain_of_thought
 from helm.benchmark.scenarios.scenario import Reference
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.common.request import GeneratedOutput
-from helm.benchmark.scenarios.math_scenario import is_equiv, is_equiv_chain_of_thought
-from nltk.metrics.scores import f_measure
-from nltk.translate.bleu_score import sentence_bleu
-from nltk.tokenize import word_tokenize
-from rouge_score import rouge_scorer
-import re
-import string
-from . import code_metrics_helper
-import nltk


-try:
-    nltk.data.find("tokenizers/punkt")
-except LookupError:
-    nltk.download("punkt")  # Required for rouge
+install_nltk_resources()


def pass_at_k_estimator(n: int, c: int, k: int) -> float:
src/helm/benchmark/metrics/nltk_helper.py (30 changes: 30 additions & 0 deletions)
@@ -0,0 +1,30 @@
+import nltk
+from importlib.metadata import version
+
+
+def install_nltk_resources():
+    """Install resources for nltk tokenizers, which is required for bleu and rouge scores."""
+    # Install "punkt_tab" for nltk>=3.9.1 or "punkt" for nltk<=3.8.1
+    #
+    # "punkt" is no longer supported for newer versions of nltk due to a security issue
+    # and has been replaced by "punkt_tab". For more information, see:
+    #
+    # - https://github.com/stanford-crfm/helm/issues/2926
+    # - https://github.com/nltk/nltk/issues/3293
+    # - https://github.com/nltk/nltk/issues/3266
+    # - https://nvd.nist.gov/vuln/detail/CVE-2024-39705
+    #
+    # TODO: Remove support for nltk<=3.8.1 and only install "punkt_tab"
+    nltk_major_version, nltk_minor_version = [int(v) for v in version("nltk").split(".")[0:2]]
+    if nltk_major_version < 3:
+        raise Exception("nltk version <3 is not supported")
+    if nltk_minor_version >= 9:
+        try:
+            nltk.data.find("tokenizers/punkt_tab")
+        except LookupError:
+            nltk.download("punkt_tab")
+    else:
+        try:
+            nltk.data.find("tokenizers/punkt")
+        except LookupError:
+            nltk.download("punkt")
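For reference, a minimal usage sketch (illustrative, not part of this commit) of how the modules above consume this helper; the call is idempotent, since nltk.data.find() succeeds on every run after the first download:

from helm.benchmark.metrics.nltk_helper import install_nltk_resources
from nltk.tokenize import word_tokenize

install_nltk_resources()  # downloads punkt/punkt_tab only if missing
print(word_tokenize("Punkt models split text into tokens."))
# ['Punkt', 'models', 'split', 'text', 'into', 'tokens', '.']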
src/helm/benchmark/metrics/test_bias_metrics.py (4 changes: 4 additions & 0 deletions)
@@ -2,6 +2,10 @@
 from typing import Callable, List, Optional

 from .bias_metrics import BiasMetric
+from helm.benchmark.metrics.nltk_helper import install_nltk_resources
+
+
+install_nltk_resources()


 @dataclass(frozen=True)
src/helm/benchmark/metrics/test_evaluate_reference_metrics.py (15 changes: 15 additions & 0 deletions)
@@ -1,4 +1,7 @@
+import pytest
 from helm.benchmark.metrics.evaluate_reference_metrics import (
+    bleu_1,
+    chinese_bleu_1,
     exact_match,
     exact_match_indicator,
     final_number_exact_match,
@@ -28,3 +31,15 @@ def test_final_number_exact_match():
     assert final_number_exact_match("34.2", "2") == 0
     assert final_number_exact_match("342", "342.") == 1
     assert final_number_exact_match("3,420", "3420") == 1
+
+
+def test_bleu_1():
+    assert bleu_1("the quick brown fox jumped over the lazy dog", "the fox jumped over the dog") == pytest.approx(
+        0.6065306597126334
+    )
+
+
+def test_chinese_bleu_1():
+    assert chinese_bleu_1(
+        "太祖武皇帝,沛國譙人也,姓曹,諱操,字孟德,漢相國參之後。", "太祖武皇帝,沛國譙人也,漢相國參之後。"
+    ) == pytest.approx(0.5907775139012316)

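For reference (not part of this commit): the expected value in test_bleu_1 is exactly the BLEU brevity penalty exp(1 - 9/6) = exp(-0.5) ≈ 0.60653, since all six hypothesis unigrams appear in the nine-token reference and clipped unigram precision is therefore 1.0. A minimal sketch reproducing it with nltk directly, assuming bleu_1 tokenizes with word_tokenize and scores with weights (1, 0, 0, 0):

from math import exp, isclose

from nltk.tokenize import word_tokenize
from nltk.translate.bleu_score import sentence_bleu

ref = word_tokenize("the quick brown fox jumped over the lazy dog")  # 9 tokens
hyp = word_tokenize("the fox jumped over the dog")  # 6 tokens

# All 6 hypothesis unigrams match the reference after clipping ("the" occurs
# twice in both), so precision is 6/6 and only the brevity penalty remains.
score = sentence_bleu([ref], hyp, weights=(1, 0, 0, 0))
assert isclose(score, exp(1 - 9 / 6))  # 0.6065306597126334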