add a basic bfcl command-line interface (ShishirPatil#621)
Add a simple CLI wrapping `openfunctions_evaluation.py` (`bfcl run`) and `eval_runner.py` (`bfcl evaluate`).

```
➜ bfcl
                                                                                                                             
 Usage: bfcl [OPTIONS] COMMAND [ARGS]...                                                                                     
                                                                                                                             
╭─ Options ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
│ --install-completion            Install completion for the current shell.                                                 │
│ --show-completion               Show completion for the current shell, to copy it or customize the installation.          │
│ --help                -h        Show this message and exit.                                                               │
╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
╭─ Commands ────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
│ models            List available models.                                                                                  │
│ test-categories   List available test categories.                                                                         │
│ run               Run one or more models on a test-category (same as openfunctions_evaluation).                           │
│ results           List the results available for evaluation.                                                              │
│ evaluate          Evaluate results from run of one or more models on a test-category (same as eval_runner).               │
│ scores            Display the leaderboard.                                                                                │
╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯

➜ bfcl run -h
                                                                                                                    
 Usage: bfcl run [OPTIONS]                                                                                          
                                                                                                                    
 Run one or more models on a test-category (same as openfunctions_evaluation).                                      
                                                                                                                    
╭─ Options ────────────────────────────────────────────────────────────────────────────────────────────────────────╮
│ --model                           TEXT     A list of model names to evaluate.                                    │
│                                            [default: gorilla-openfunctions-v2]                                   │
│ --test-category                   TEXT     A list of test categories to run the evaluation on. [default: all]    │
│ --api-sanity-check        -c               Perform the REST API status sanity check before running the           │
│                                            evaluation.                                                           │
│ --temperature                     FLOAT    The temperature parameter for the model. [default: 0.001]             │
│ --top-p                           FLOAT    The top-p parameter for the model. [default: 1.0]                     │
│ --max-tokens                      INTEGER  The maximum number of tokens for the model. [default: 1200]           │
│ --num-gpus                        INTEGER  The number of GPUs to use. [default: 1]                               │
│ --timeout                         INTEGER  The timeout for the model in seconds. [default: 60]                   │
│ --num-threads                     INTEGER  The number of threads to use. [default: 1]                            │
│ --gpu-memory-utilization          FLOAT    The GPU memory utilization. [default: 0.9]                            │
│ --help                    -h               Show this message and exit.                                           │
╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯


➜ bfcl evaluate -h
                                                                                                                    
 Usage: bfcl evaluate [OPTIONS]                                                                                     
                                                                                                                    
 Evaluate results from run of one or more models on a test-category (same as eval_runner).                          
                                                                                                                    
╭─ Options ────────────────────────────────────────────────────────────────────────────────────────────────────────╮
│ *  --model                     TEXT  A list of model names to evaluate. [default: None] [required]               │
│ *  --test-category             TEXT  A list of test categories to run the evaluation on. [default: None]         │
│                                      [required]                                                                  │
│    --api-sanity-check  -c            Perform the REST API status sanity check before running the evaluation.     │
│    --help              -h            Show this message and exit.                                                 │
╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
```
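
For orientation, here is a sketch of how the commands might be chained end to end. It uses the command names and defaults from the help text above; the model name and the `ast` test category are illustrative values taken from the README examples further down in this diff.

```bash
# See which models and test categories are available.
bfcl models
bfcl test-categories

# Generate responses for one model on a single test category
# (wraps openfunctions_evaluation.py).
bfcl run --model gorilla-openfunctions-v2 --test-category ast

# Inspect the result directories that exist, then score them
# (wraps eval_runner.py).
bfcl results
bfcl evaluate --model gorilla-openfunctions-v2 --test-category ast

# Print the leaderboard assembled from the score CSVs.
bfcl scores
```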

---------

Co-authored-by: Huanzhi (Hans) Mao <huanzhimao@gmail.com>
2 people authored and VishnuSuresh27 committed Nov 11, 2024
1 parent 198cfd9 commit 26d2293
Showing 10 changed files with 534 additions and 259 deletions.
1 change: 1 addition & 0 deletions berkeley-function-call-leaderboard/CHANGELOG.md
@@ -18,6 +18,7 @@ All notable changes to the Berkeley Function Calling Leaderboard will be documen
- `MadeAgents/Hammer2.0-3b`
- `MadeAgents/Hammer2.0-1.5b`
- `MadeAgents/Hammer2.0-0.5b`
+- [Oct 10, 2024] [#621](https://github.com/ShishirPatil/gorilla/pull/621), [#675](https://github.com/ShishirPatil/gorilla/pull/675): Add a basic command-line interface for ease of use.
- [Oct 5, 2024] [#633](https://github.com/ShishirPatil/gorilla/pull/633): Add new model `openbmb/MiniCPM3-4B` to the leaderboard.
- [Oct 5, 2024] [#642](https://github.com/ShishirPatil/gorilla/pull/642): Add the following new models to the leaderboard:
- `Qwen/Qwen2.5-7B-Instruct`
16 changes: 8 additions & 8 deletions berkeley-function-call-leaderboard/README.md
@@ -101,7 +101,7 @@ If decided to run locally-hosted model, the generation script uses vLLM and ther
Use the following command for LLM inference of the evaluation dataset with specific models.

```bash
-python openfunctions_evaluation.py --model MODEL_NAME --test-category TEST_CATEGORY --num-threads 1
+bfcl generate --model MODEL_NAME --test-category TEST_CATEGORY --num-threads 1
```

You can optionally specify the number of threads to use for _parallel inference_ by setting the `--num-threads` flag to speed up inference for **hosted models**, not applicable for OSS models.
@@ -112,7 +112,7 @@ If no `MODEL_NAME` is provided, the model `gorilla-openfunctions-v2` will be use

### Models Available

-Below is _a table of models we support_ to run our leaderboard evaluation against. If the models support function calling (FC), we will follow its function calling format provided by official documentation. Otherwise, we use a consistent system message to prompt the model to generate function calls in the right format.
+Below is _a table of models we support_ to run our leaderboard evaluation against. If the models support function calling (FC), we will follow its function calling format provided by official documentation. Otherwise, we use a consistent system message to prompt the model to generate function calls in the right format. You can also use the `bfcl models` command to list all available models.

|Model | Type |
|---|---|
@@ -198,7 +198,7 @@ For `Databrick-DBRX-instruct`, you need to create a Databrick Azure workspace an

In the following two sections, the optional `--test-category` parameter can be used to specify the category of tests to run. You can specify multiple categories separated by spaces. Available options include:

-- Available test groups:
+- Available test groups (you can also use the `bfcl test-categories` command to list them):
- `all`: All test categories.
- This is the default option if no test category is provided.
- `multi_turn`: All multi-turn test categories.
@@ -249,7 +249,7 @@ In the following two sections, the optional `--test-category` parameter can be u
Navigate to the `gorilla/berkeley-function-call-leaderboard/bfcl/eval_checker` directory and run the `eval_runner.py` script with the desired parameters. The basic syntax is as follows:

```bash
-python eval_runner.py --model MODEL_NAME --test-category TEST_CATEGORY
+bfcl evaluate --model MODEL_NAME --test-category TEST_CATEGORY
```

For available options for `MODEL_NAME` and `TEST_CATEGORY`, please refer to the [Models Available](#models-available) and [Available Test Category](#available-test-category) section.
@@ -261,25 +261,25 @@ If no `MODEL_NAME` is provided, all available model results will be evaluated by
If you want to run all tests for the `gorilla-openfunctions-v2` model, you can use the following command:

```bash
-python eval_runner.py --model gorilla-openfunctions-v2
+bfcl evaluate --model gorilla-openfunctions-v2
```

If you want to evaluate all offline tests (do not require RapidAPI keys) for OpenAI GPT-3.5, you can use the following command:

```bash
-python eval_runner.py --model gpt-3.5-turbo-0125 --test-category ast
+bfcl evaluate --model gpt-3.5-turbo-0125 --test-category ast
```

If you want to run the `rest` tests for a few Claude models, you can use the following command:

```bash
-python eval_runner.py --model claude-3-5-sonnet-20240620 claude-3-opus-20240229 claude-3-sonnet-20240229 --test-category rest
+bfcl evaluate --model claude-3-5-sonnet-20240620 claude-3-opus-20240229 claude-3-sonnet-20240229 --test-category rest
```

If you want to run `live_simple` and `javascript` tests for a few models and `gorilla-openfunctions-v2`, you can use the following command:

```bash
-python eval_runner.py --model gorilla-openfunctions-v2 claude-3-5-sonnet-20240620 gpt-4-0125-preview gemini-1.5-pro-preview-0514 --test-category live_simple javascript
+bfcl evaluate --model gorilla-openfunctions-v2 claude-3-5-sonnet-20240620 gpt-4-0125-preview gemini-1.5-pro-preview-0514 --test-category live_simple javascript
```

### Model-Specific Optimization
231 changes: 231 additions & 0 deletions berkeley-function-call-leaderboard/bfcl/__main__.py
@@ -0,0 +1,231 @@
import csv
from collections import namedtuple
from datetime import datetime
from typing import List

import typer
from bfcl._llm_response_generation import main as generation_main
from bfcl.constant import DOTENV_PATH, RESULT_PATH, SCORE_PATH, TEST_COLLECTION_MAPPING
from bfcl.eval_checker import eval_runner
from bfcl.model_handler.handler_map import HANDLER_MAP
from dotenv import load_dotenv
from tabulate import tabulate


class ExecutionOrderGroup(typer.core.TyperGroup):
    def list_commands(self, ctx):
        return [
            "models",
            "test-categories",
            "generate",
            "results",
            "evaluate",
            "scores",
        ]


cli = typer.Typer(
    context_settings=dict(help_option_names=["-h", "--help"]),
    no_args_is_help=True,
    cls=ExecutionOrderGroup,
)


@cli.command()
def test_categories():
    """
    List available test categories.
    """
    table = tabulate(
        [
            (category, "\n".join(test for test in tests))
            for category, tests in TEST_COLLECTION_MAPPING.items()
        ],
        headers=["Test category", "Test names"],
        tablefmt="grid",
    )
    print(table)


@cli.command()
def models():
    """
    List available models.
    """
    table = tabulate(
        [[model] for model in HANDLER_MAP.keys()],
        tablefmt="plain",
        colalign=("left",),
    )
    print(table)


@cli.command()
def generate(
    model: List[str] = typer.Option(
        ["gorilla-openfunctions-v2"], help="A list of model names to evaluate."
    ),
    test_category: List[str] = typer.Option(
        ["all"], help="A list of test categories to run the evaluation on."
    ),
    api_sanity_check: bool = typer.Option(
        False,
        "--api-sanity-check",
        "-c",
        help="Perform the REST API status sanity check before running the evaluation.",
    ),
    temperature: float = typer.Option(
        0.001, help="The temperature parameter for the model."
    ),
    include_debugging_log: bool = typer.Option(
        False,
        help="Include debugging log in the response file to see model's interaction with the state machine.",
    ),
    num_gpus: int = typer.Option(1, help="The number of GPUs to use."),
    num_threads: int = typer.Option(1, help="The number of threads to use."),
    gpu_memory_utilization: float = typer.Option(
        0.9, help="The GPU memory utilization."
    ),
):
    """
    Generate the LLM response for one or more models on a test-category (same as openfunctions_evaluation.py).
    """
    generationArgs = namedtuple(
        "generationArgs",
        [
            "model",
            "test_category",
            "api_sanity_check",
            "temperature",
            "include_debugging_log",
            "num_gpus",
            "num_threads",
            "gpu_memory_utilization",
        ],
    )

    load_dotenv(dotenv_path=DOTENV_PATH, verbose=True, override=True)  # Load the .env file
    generation_main(
        generationArgs(
            model=model,
            test_category=test_category,
            api_sanity_check=api_sanity_check,
            temperature=temperature,
            include_debugging_log=include_debugging_log,
            num_gpus=num_gpus,
            num_threads=num_threads,
            gpu_memory_utilization=gpu_memory_utilization,
        )
    )


@cli.command()
def results():
    """
    List the results available for evaluation.
    """

    def display_name(name: str):
        """
        Undo the / -> _ transformation if it happened.

        Args:
            name (str): The name of the model in the result directory.

        Returns:
            str: The original name of the model.
        """
        if name not in HANDLER_MAP:
            candidate = name.replace("_", "/")
            if candidate in HANDLER_MAP:
                return candidate
            print(f"Unknown model name: {name}")
        return name

    result_dir = RESULT_PATH

    results_data = []
    for dir in result_dir.iterdir():
        # Check if it is a directory and not a file
        if not dir.is_dir():
            continue

        results_data.append(
            (
                display_name(dir.name),
                datetime.fromtimestamp(dir.stat().st_ctime).strftime(
                    "%Y-%m-%d %H:%M:%S"
                ),
            )
        )

    print(
        tabulate(
            results_data,
            headers=["Model name", "Creation time"],
            tablefmt="pretty",
        )
    )


@cli.command()
def evaluate(
    model: List[str] = typer.Option(None, help="A list of model names to evaluate."),
    test_category: List[str] = typer.Option(
        None, help="A list of test categories to run the evaluation on."
    ),
    api_sanity_check: bool = typer.Option(
        False,
        "--api-sanity-check",
        "-c",
        help="Perform the REST API status sanity check before running the evaluation.",
    ),
):
    """
    Evaluate results from run of one or more models on a test-category (same as eval_runner.py).
    """

    load_dotenv(dotenv_path=DOTENV_PATH, verbose=True, override=True)  # Load the .env file
    eval_runner.main(model, test_category, api_sanity_check)


@cli.command()
def scores():
    """
    Display the leaderboard.
    """

    def truncate(text, length=22):
        return (text[:length] + "...") if len(text) > length else text

    # files = ["./score/data_non_live.csv", "./score/data_live.csv", "./score/data_overall.csv"]
    file = SCORE_PATH / "data_overall.csv"

    selected_columns = [
        "Rank",
        "Model",
        "Overall Acc",
        "Non-Live AST Acc",
        "Non-Live Exec Acc",
        "Live Acc",
        "Multi Turn Acc",
        "Relevance Detection",
        "Irrelevance Detection",
    ]

    if file.exists():
        with open(file, newline="") as csvfile:
            reader = csv.reader(csvfile)
            headers = next(reader)  # Read the header row
            column_indices = [headers.index(col) for col in selected_columns]
            data = [
                [row[i] for i in column_indices] for row in reader
            ]  # Read the rest of the data
            selected_columns = selected_columns[:-2] + ["Relevance", "Irrelevance"]  # Shorten the column names
            print(tabulate(data, headers=selected_columns, tablefmt="grid"))
    else:
        print(f"\nFile {file} not found.\n")


if __name__ == "__main__":
    cli()
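
Since the file is the package's `__main__.py` and ends by calling `cli()`, the same commands should also be reachable through Python's `-m` switch once the package is installed. A minimal usage sketch follows; the command names come from the Typer definitions above, while the model and test-category values are illustrative.

```bash
# Equivalent to the `bfcl` console script, driven through the package's __main__ module.
python -m bfcl models
python -m bfcl test-categories

# Uses the declared defaults: gorilla-openfunctions-v2 on the "all" test collection.
python -m bfcl generate

# Score the generated results, then print the leaderboard table from the score CSVs.
python -m bfcl evaluate --model gorilla-openfunctions-v2 --test-category ast
python -m bfcl scores
```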