Commit

add mistral / mixtral / gemini
YannDubs committed Jan 3, 2024
1 parent c4def7d commit b4f37e9
Showing 16 changed files with 250 additions and 17 deletions.
9 changes: 8 additions & 1 deletion setup.py
@@ -23,7 +23,14 @@
]
PACKAGES_ANALYSIS = ["seaborn", "matplotlib", "jupyterlab"]
PACKAGES_LOCAL = ["accelerate", "transformers", "bitsandbytes", "xformers", "peft", "optimum", "einops"]
PACKAGES_ALL_API = ["anthropic>=0.3.3", "huggingface_hub", "cohere", "replicate", "boto3>=1.28.58"]
PACKAGES_ALL_API = [
    "anthropic>=0.3.3",
    "huggingface_hub",
    "cohere",
    "replicate",
    "boto3>=1.28.58",
    "google-generativeai",
]
PACKAGES_ALL = PACKAGES_LOCAL + PACKAGES_ALL_API + PACKAGES_ANALYSIS + PACKAGES_DEV

setuptools.setup(
17 changes: 14 additions & 3 deletions src/alpaca_eval/constants.py
@@ -10,6 +10,8 @@
BASE_DIR = Path(__file__).parents[2]

### API specific ###
API_MAX_CONCURRENCY = int(os.environ.get("API_MAX_CONCURRENCY", 5))

OPENAI_MAX_CONCURRENCY = int(os.environ.get("OPENAI_MAX_CONCURRENCY", 5))
OPENAI_CLIENT_CONFIG_PATH = os.environ.get("OPENAI_CLIENT_CONFIG_PATH", BASE_DIR / "client_configs/openai_configs.yaml")
# the following is for backward compatibility, the recommended way is to use OPENAI_CLIENT_CONFIG_PATH
@@ -22,7 +24,9 @@
#

ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", None)
ANTHROPIC_MAX_CONCURRENCY = int(os.environ.get("ANTHROPIC_MAX_CONCURRENCY", 1))
ANTHROPIC_MAX_CONCURRENCY = int(os.environ.get("ANTHROPIC_MAX_CONCURRENCY", API_MAX_CONCURRENCY))

GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", None)

COHERE_API_KEY = os.environ.get("COHERE_API_KEY", None)

@@ -33,7 +37,7 @@

IS_ALPACA_EVAL_2 = ast.literal_eval(os.environ.get("IS_ALPACA_EVAL_2", "True"))
ANNOTATOR_CONFIG_AE1 = "alpaca_eval_gpt4"
ANNOTATOR_CONFIG_AE2 = "weighted_alpaca_eval_gpt4_turbo"
ANNOTATOR_CONFIG_AE2 = "alpaca_eval_gpt4_turbo_fn" # "weighted_alpaca_eval_gpt4_turbo"
DEFAULT_ANNOTATOR_CONFIG = ANNOTATOR_CONFIG_AE2 if IS_ALPACA_EVAL_2 else ANNOTATOR_CONFIG_AE1
DEFAULT_CACHE_DIR = None
EVALUATORS_CONFIG_DIR = CURRENT_DIR / "evaluators_configs"
@@ -131,6 +135,10 @@ def ALPACAFARM_GOLD_ANNOTATIONS():
    (str(ALPACAEVAL_REFERENCE_OUTPUTS_1), "chatgpt_fn"): ALPACAEVAL_1_LEADERBOARD_PATHS / "chatgpt_fn_leaderboard.csv",
    (str(ALPACAEVAL_REFERENCE_OUTPUTS_2), ANNOTATOR_CONFIG_AE2): ALPACAEVAL_2_LEADERBOARD_PATHS
    / f"{ANNOTATOR_CONFIG_AE2}_leaderboard.csv",
    (str(ALPACAEVAL_REFERENCE_OUTPUTS_2), "weighted_alpaca_eval_gpt4_turbo"): ALPACAEVAL_2_LEADERBOARD_PATHS
    / f"weighted_alpaca_eval_gpt4_turbo_leaderboard.csv",
    (str(ALPACAEVAL_REFERENCE_OUTPUTS_2), "alpaca_eval_cot_gpt4_turbo_fn"): ALPACAEVAL_2_LEADERBOARD_PATHS
    / f"alpaca_eval_cot_gpt4_turbo_fn_leaderboard.csv",
    # needs to add the non default config. ie either with or without the logprob
}

@@ -172,8 +180,11 @@ def ALPACAFARM_GOLD_ANNOTATIONS():
"Yi-34B-Chat",
"llama-2-70b-chat-hf",
"claude-2",
# "cohere",
"cohere",
"chatgpt",
"gemini-pro",
"Mixtral-8x7B-Instruct-v0.1",
"Mistral-7B-Instruct-v0.2"
# "vicuna-33b-v1.3",
# "llama-2-13b-chat-hf",
# "llama-2-7b-chat-hf",
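Because the new constants are read straight from the environment, no code change is needed to configure the Gemini decoder or the shared concurrency default. A minimal sketch (the values are placeholders, not from this commit):

import os

# Placeholders: set these before importing alpaca_eval so constants.py picks them up.
os.environ.setdefault("GOOGLE_API_KEY", "<your-gemini-api-key>")
os.environ.setdefault("API_MAX_CONCURRENCY", "8")          # shared default for API decoders
os.environ.setdefault("ANTHROPIC_MAX_CONCURRENCY", "4")    # otherwise falls back to API_MAX_CONCURRENCY

from alpaca_eval import constants

print(constants.API_MAX_CONCURRENCY, constants.GOOGLE_API_KEY is not None)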
10 changes: 10 additions & 0 deletions src/alpaca_eval/decoders/__init__.py
@@ -17,6 +17,16 @@ def get_fn_completions(name: Union[str, Callable]) -> Callable:

        return anthropic_completions

    elif name == "google_completions":
        try:
            from .google import google_completions
        except ImportError as e:
            packages = ["google.generativeai"]
            logging.exception(f"You need {packages} to use google_completions. Error:")
            raise e

        return google_completions

    elif name == "openai_completions":
        try:
            from .openai import openai_completions
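A short usage sketch for the new registry entry (assumes `google-generativeai` is installed and `GOOGLE_API_KEY` is set; the prompt and kwargs are made up for illustration):

from alpaca_eval.decoders import get_fn_completions

fn_completions = get_fn_completions("google_completions")  # raises with a hint if the package is missing
out = fn_completions(["Reply with one word: hello?"], model_name="gemini-pro", max_output_tokens=32)
print(out["completions"], out["price_per_example"])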
2 changes: 1 addition & 1 deletion src/alpaca_eval/decoders/anthropic.py
@@ -43,7 +43,7 @@ def anthropic_completions(
    n_examples = len(prompts)
    if n_examples == 0:
        logging.info("No samples to annotate.")
        return []
        return dict(completions=[], price_per_example=[], time_per_example=[], completions_all=[])
    else:
        to_log = f"Using `anthropic_completions` on {n_examples} prompts using {model_name} and num_procs={num_procs}."
        logging.info(to_log)
2 changes: 1 addition & 1 deletion src/alpaca_eval/decoders/cohere.py
@@ -96,6 +96,6 @@ def _cohere_completion_helper(
            return text, num_tokens

        except CohereError as e:
            print(f"Try #{trynum+1}/{max_tries}: Error running prompt {repr(prompt)}: {e}")
            logging.info(f"Try #{trynum+1}/{max_tries}: Error running prompt {repr(prompt)}: {e}")

    return " ", 0  # placeholder response for errors, doesn't allow empty string
148 changes: 148 additions & 0 deletions src/alpaca_eval/decoders/google.py
@@ -0,0 +1,148 @@
import functools
import logging
import multiprocessing
import random
import time
from typing import Optional, Sequence, Union

import google.generativeai as genai
import numpy as np
import tqdm

from .. import constants, utils

__all__ = ["google_completions"]


def google_completions(
    prompts: Sequence[str],
    max_output_tokens: Union[int, Sequence[int]] = 2048,
    model_name="gemini-pro",
    num_procs: int = constants.API_MAX_CONCURRENCY,
    **decoding_kwargs,
) -> dict[str, list]:
    """Decode with the Google Generative AI (Gemini) API.

    Parameters
    ----------
    prompts : list of str
        Prompts to get completions for.
    max_output_tokens : int or list of int, optional
        Number of tokens to sample for each prompt. If a list, must be the same length as `prompts`.
    model_name : str, optional
        Name of the model to use for decoding.
    num_procs : int, optional
        Number of parallel processes to use for decoding.
    decoding_kwargs :
        Additional kwargs to pass to `genai.types.GenerationConfig`.
    """
    num_procs = num_procs or constants.API_MAX_CONCURRENCY

    n_examples = len(prompts)
    if n_examples == 0:
        logging.info("No samples to annotate.")
        return dict(completions=[], price_per_example=[], time_per_example=[], completions_all=[])
    else:
        to_log = f"Using `google_completions` on {n_examples} prompts using {model_name} and num_procs={num_procs}."
        logging.info(to_log)

    if isinstance(max_output_tokens, int):
        max_output_tokens = [max_output_tokens] * n_examples

    inputs = zip(prompts, max_output_tokens)

    kwargs = dict(model_name=model_name, **decoding_kwargs)
    kwargs_to_log = {k: v for k, v in kwargs.items() if "api_key" not in k}
    logging.info(f"Kwargs to completion: {kwargs_to_log}")
    with utils.Timer() as t:
        if num_procs == 1:
            responses = [_google_completion_helper(inp, **kwargs) for inp in tqdm.tqdm(inputs, desc="prompts")]
        else:
            with multiprocessing.Pool(num_procs) as p:
                partial_completion_helper = functools.partial(_google_completion_helper, **kwargs)
                responses = list(
                    tqdm.tqdm(
                        p.imap(partial_completion_helper, inputs),
                        desc="prompts",
                        total=len(prompts),
                    )
                )
    logging.info(f"Completed {n_examples} examples in {t}.")

    # the Google API does not return token counts here, so the price is estimated from character counts
    price = [_get_price(len(p), len(r), model_name) for p, r in zip(prompts, responses)]

    avg_time = [t.duration / n_examples] * len(responses)

    return dict(completions=responses, price_per_example=price, time_per_example=avg_time, completions_all=responses)


def _google_completion_helper(
    args: tuple[str, int],
    sleep_time: int = 2,
    temperature: Optional[float] = 0.7,
    model_name: str = "gemini-pro",
    google_api_keys: Optional[Sequence[str]] = None,
    max_tries=10,
    **kwargs,
):
    prompt, max_output_tokens = args

    google_api_keys = google_api_keys or (constants.GOOGLE_API_KEY,)
    google_api_key = random.choice(google_api_keys)

    genai.configure(api_key=google_api_key)
    model = genai.GenerativeModel(model_name)
    n_tries = 0

    while True:
        try:
            response = model.generate_content(
                prompt,
                generation_config=genai.types.GenerationConfig(
                    temperature=temperature,
                    max_output_tokens=max_output_tokens,
                    **kwargs,
                ),
                # don't block anything for evaluation
                safety_settings={
                    "HARM_CATEGORY_HARASSMENT": "block_none",
                    "HARM_CATEGORY_DANGEROUS_CONTENT": "block_none",
                    "HARM_CATEGORY_HATE_SPEECH": "block_none",
                    "HARM_CATEGORY_SEXUALLY_EXPLICIT": "block_none",
                },
            )
            text = response.text
            # num_tokens = model.count_tokens(text)

            return text

        # error code 429 is rate limit
        except Exception as e:
            if "429" in str(e):
                logging.info(f"Rate limit reached. Sleeping {sleep_time} seconds.")
                time.sleep(sleep_time)

            else:
                # TODO: better catching of errors when rate limits
                logging.exception(f"Unknown error, so we are retrying. Retry #{n_tries}/{max_tries}. Error:")
                time.sleep(sleep_time)
                n_tries += 1
                if n_tries > max_tries:
                    break

    return ""


def _get_price(n_in_char: int, n_out_char: int, model: str) -> float:
    """Returns the price (in USD) for one example, estimated from input/output character counts."""
    if model == "gemini-pro":
        return (n_in_char * 0.00025 + n_out_char * 0.0005) / 1000

    else:
        logging.warning(f"Unknown model {model} for computing price per token.")
        return np.nan
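As a sanity check on the character-based pricing above (hypothetical lengths; the per-1k-character rates are the ones hard-coded in `_get_price`):

# 1,000 input characters and 500 output characters with gemini-pro:
n_in_char, n_out_char = 1_000, 500
price = (n_in_char * 0.00025 + n_out_char * 0.0005) / 1000
assert abs(price - 0.0005) < 1e-12  # i.e. $0.0005 for this made-up example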
23 changes: 17 additions & 6 deletions src/alpaca_eval/decoders/openai.py
@@ -30,6 +30,7 @@ def openai_completions(
    is_strip: bool = True,
    num_procs: Optional[int] = constants.OPENAI_MAX_CONCURRENCY,
    batch_size: Optional[int] = None,
    price_per_token: Optional[float] = None,
    **decoding_kwargs,
) -> dict[str, list]:
    r"""Get openai completions for the given prompts. Allows additional parameters such as tokens to avoid and
@@ -58,6 +59,9 @@
    is_strip : bool, optional
        Whether to strip trailing and leading spaces from the prompts.

    price_per_token : float, optional
        Price per token for the model. If not provided, we will try to infer it from the model name.

    decoding_kwargs :
        Additional kwargs to pass to `openai.Completion` or `openai.ChatCompletion`.
@@ -159,7 +163,7 @@ def openai_completions(
    completions_text = [completion["text"] for completion in completions_all]

    price = [
        completion["total_tokens"] * _get_price_per_token(model_name)
        completion["total_tokens"] * _get_price_per_token(model_name, price_per_token)
        for completion_batch in completions
        for completion in completion_batch
    ]
@@ -185,17 +189,22 @@ def _openai_completion_helper(
    openai_api_keys: Optional[Sequence[str]] = constants.OPENAI_API_KEYS,
    openai_api_base: Optional[str] = os.getenv("OPENAI_API_BASE") if os.getenv("OPENAI_API_BASE") else openai.base_url,
    ############################
    client_kwargs: Optional[dict[str, Any]] = None,
    **kwargs,
):
    client_kwargs = client_kwargs or dict()
    prompt_batch, max_tokens = args
    all_clients = utils.get_all_clients(
        client_config_path,
        model_name=kwargs["model"],
        get_backwards_compatible_configs=_get_backwards_compatible_configs,
        default_client_class="openai.OpenAI",
        openai_organization_ids=openai_organization_ids,
        openai_api_keys=openai_api_keys,
        openai_api_base=openai_api_base,
        backward_compatibility_kwargs=dict(
            openai_organization_ids=openai_organization_ids,
            openai_api_keys=openai_api_keys,
            openai_api_base=openai_api_base,
        ),
        **client_kwargs,
    )

    # randomly select the client
@@ -337,9 +346,11 @@ def _string_to_dict(to_convert):
    return {s.split("=", 1)[0]: s.split("=", 1)[1] for s in to_convert.split(" ") if len(s) > 0}


def _get_price_per_token(model):
def _get_price_per_token(model, price_per_token=None):
    """Returns the price per token for a given model"""
    if "gpt-4-1106" in model:
    if price_per_token is not None:
        return float(price_per_token)
    elif "gpt-4-1106" in model:
        return (
            0.01 / 1000
        )  # that's not completely true because decoding is 0.03 but close enough given that most is context
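To illustrate the new override with hypothetical numbers: when a model config supplies `price_per_token`, the per-example cost is simply total tokens times that rate, bypassing the name-based lookup.

# e.g. the Together-hosted Mixtral config further down sets price_per_token: 6e-7
total_tokens = 1_500
price_per_token = 6e-7
print(total_tokens * price_per_token)  # 0.0009 USD for this made-up completion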
11 changes: 8 additions & 3 deletions src/alpaca_eval/main.py
@@ -127,9 +127,14 @@ def evaluate(

if max_instances is not None:
# first we shuffle both outputs with a fix seed => more representative
seed = 123
model_outputs = model_outputs.sample(frac=1, random_state=seed)
reference_outputs = reference_outputs.sample(frac=1, random_state=seed)
if len(model_outputs) != len(reference_outputs):
logging.warning(
"model_outputs and reference_outputs have different lengths, so we cannot shuffle before taking the first max_instances."
)
else:
seed = 123
model_outputs = model_outputs.sample(frac=1, random_state=seed)
reference_outputs = reference_outputs.sample(frac=1, random_state=seed)

model_outputs = model_outputs[:max_instances]
reference_outputs = reference_outputs[:max_instances]
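A toy sketch of the guarded shuffle-then-truncate behaviour above (the frames and values are made up):

import pandas as pd

model_outputs = pd.DataFrame({"output": list("abcdef")})
reference_outputs = pd.DataFrame({"output": list("uvwxyz")})
seed, max_instances = 123, 3

if len(model_outputs) == len(reference_outputs):
    # same length => shuffle both with the same seed so the pairs stay aligned
    model_outputs = model_outputs.sample(frac=1, random_state=seed)
    reference_outputs = reference_outputs.sample(frac=1, random_state=seed)

model_outputs = model_outputs[:max_instances]
reference_outputs = reference_outputs[:max_instances]
print(model_outputs["output"].tolist(), reference_outputs["output"].tolist())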
2 changes: 2 additions & 0 deletions src/alpaca_eval/metrics.py
@@ -79,6 +79,8 @@ def describe_head2head(self, predictions: npt.ArrayLike) -> dict[str, float]:
            n_wins=n_wins,
            n_wins_base=n_wins_base,
            n_draws=n_draws,
            # note that n_draws will happen more often for weighted win rate because you can get 1.5 somewhat often due
            # to float precision
            n_total=n_total,
        )

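To make the draw comment concrete (hypothetical preference values; treating an exact 1.5 as a draw follows the comment above, while the 1-vs-2 win convention is an assumption about the surrounding code):

import numpy as np

preferences = np.array([2.0, 1.0, 1.5, 1.8, 1.5])
n_draws = int((preferences == 1.5).sum())      # 2: exact ties, more common with weighted win rates
n_wins = int((preferences > 1.5).sum())        # annotator preferred the model
n_wins_base = int((preferences < 1.5).sum())   # annotator preferred the baseline
print(n_wins, n_wins_base, n_draws, len(preferences))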
12 changes: 12 additions & 0 deletions src/alpaca_eval/models_configs/Mistral-7B-Instruct-v0.2/configs.yaml
@@ -0,0 +1,12 @@
Mistral-7B-Instruct-v0.2:
  prompt_template: "Mixtral-8x7B-Instruct-v0.1/togetherai_prompt.txt" # together already deals with prompt
  fn_completions: "openai_completions"
  completions_kwargs:
    model_name: "mistralai/Mistral-7B-Instruct-v0.2"
    max_tokens: 4096
    requires_chatml: True
    price_per_token: 2e-7
    client_kwargs:
      base_url: 'https://api.together.xyz'
  pretty_name: "Mistral 7B v0.2"
  link: "https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2"
12 changes: 12 additions & 0 deletions src/alpaca_eval/models_configs/Mixtral-8x7B-Instruct-v0.1/configs.yaml
@@ -0,0 +1,12 @@
Mixtral-8x7B-Instruct-v0.1:
  prompt_template: "Mixtral-8x7B-Instruct-v0.1/togetherai_prompt.txt" # together already deals with prompt
  fn_completions: "openai_completions"
  completions_kwargs:
    model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1"
    max_tokens: 4096
    requires_chatml: True
    price_per_token: 6e-7
    client_kwargs:
      base_url: 'https://api.together.xyz'
  pretty_name: "Mixtral 8x7B v0.1"
  link: "https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1"
3 changes: 3 additions & 0 deletions src/alpaca_eval/models_configs/Mixtral-8x7B-Instruct-v0.1/togetherai_prompt.txt
@@ -0,0 +1,3 @@
<|im_start|>user
{instruction}
<|im_end|>
@@ -8,5 +8,7 @@ Yi-34B-Chat-Verified:
    num_procs: 1
    max_tokens: 3500
    top_p: 0.8
    client_kwargs:
      base_url: "http://api.01ww.xyz/v1"
  pretty_name: "Yi 34B Chat"
  link: "https://huggingface.co/01-ai/Yi-34B-Chat"
8 changes: 8 additions & 0 deletions src/alpaca_eval/models_configs/gemini-pro/configs.yaml
@@ -0,0 +1,8 @@
gemini-pro:
  prompt_template: "gemini-pro/prompt.txt"
  fn_completions: "google_completions"
  completions_kwargs:
    model_name: "gemini-pro"
    max_output_tokens: 2048
    candidate_count: 1
  pretty_name: "Gemini Pro"
1 change: 1 addition & 0 deletions src/alpaca_eval/models_configs/gemini-pro/prompt.txt
@@ -0,0 +1 @@
{instruction}