Skip to content

Commit

Permalink
[BFCL] Chore: Improve Inference Log Readability (#746)
Browse files Browse the repository at this point in the history
This PR improves the readability of the inference log, making it read
more like a conversation. Instructions on how to
interpret/understand the log are also included. The inference log is
included as part of the result file.

The score file is also improved in structure for better readability.

This will not change the leaderboard score. 





Here is an example inference log.
```
"inference_log": [
    {
        "begin_of_turn_query": [
            {
                "role": "user",
                "content": "First cd within the 'ResearchDocs' directory and then locate any files titled 'report.csv'."
            }
        ],
        "step_0": [
            {
                "role": "assistant",
                "content": [
                    {
                        "cd": "{\"folder\":\"ResearchDocs\"}"
                    }
                ]
            },
            {
                "role": "handler_log",
                "content": "Successfully decoded model response.",
                "model_response_decoded": [
                    "cd(folder='ResearchDocs')"
                ]
            },
            {
                "role": "tool",
                "content": "{\"current_working_directory\": \"ResearchDocs\"}"
            }
        ],
        "step_1": [
            {
                "role": "assistant",
                "content": [
                    {
                        "find": "{\"path\":\"ResearchDocs\",\"name\":\"report.csv\"}"
                    }
                ]
            },
            {
                "role": "handler_log",
                "content": "Successfully decoded model response.",
                "model_response_decoded": [
                    "find(path='ResearchDocs',name='report.csv')"
                ]
            },
            {
                "role": "tool",
                "content": "{\"matches\": [\"ResearchDocs/report.csv\"]}"
            }
        ],
        "step_2": [
            {
                "role": "assistant",
                "content": "The file titled `report.csv` has been located in the `ResearchDocs` directory."
            },
            {
                "role": "handler_log",
                "content": "Error decoding the model response. Proceed to next turn.",
                "error": "'str' object has no attribute 'items'"
            }
        ]
    },
    {
        "begin_of_turn_query": [
            {
                "role": "user",
                "content": "Once you locate 'report.csv' in the 'ResearchDocs' directory, delve into it to find lines that reference the 'Quarterly Financial Overview'."
            }
        ],
        "step_0": [
            {
                "role": "assistant",
                "content": [
                    {
                        "grep": "{\"file_name\":\"report.csv\",\"pattern\":\"Quarterly Financial Overview\"}"
                    }
                ]
            },
            {
                "role": "handler_log",
                "content": "Successfully decoded model response.",
                "model_response_decoded": [
                    "grep(file_name='report.csv',pattern='Quarterly Financial Overview')"
                ]
            },
            {
                "role": "tool",
                "content": "{\"matching_lines\": [\"Line 1: Introduction Line 2: Quarterly Financial Overview Line 3: Details Line 4: More Details Line 5: Quarterly Financial Overview Line 6: Conclusion Line 7: Quarterly Financial Overview Line 8: Quarter has been successful. Line 9: Quarterly Financial Overview Line 10: Final Thoughts\"]}"
            }
        ],
        "step_1": [
            {
                "role": "assistant",
                "content": "The following lines in `report.csv` reference the \"Quarterly Financial Overview\":\n\n- Line 2: Quarterly Financial Overview\n- Line 5: Quarterly Financial Overview\n- Line 7: Quarterly Financial Overview\n- Line 9: Quarterly Financial Overview"
            },
            {
                "role": "handler_log",
                "content": "Error decoding the model response. Proceed to next turn.",
                "error": "'str' object has no attribute 'items'"
            }
        ]
    },
    {
        "begin_of_turn_query": [
            {
                "role": "user",
                "content": "Extract and display the last five lines from the 'report.csv' file in the 'ResearchDocs' directory, as these need to be emphasized for recent updates."
            }
        ],
        "step_0": [
            {
                "role": "assistant",
                "content": [
                    {
                        "tail": "{\"file_name\":\"report.csv\",\"lines\":5}"
                    }
                ]
            },
            {
                "role": "handler_log",
                "content": "Successfully decoded model response.",
                "model_response_decoded": [
                    "tail(file_name='report.csv',lines=5)"
                ]
            },
            {
                "role": "tool",
                "content": "{\"last_lines\": \"Line 1: Introduction Line 2: Quarterly Financial Overview Line 3: Details Line 4: More Details Line 5: Quarterly Financial Overview Line 6: Conclusion Line 7: Quarterly Financial Overview Line 8: Quarter has been successful. Line 9: Quarterly Financial Overview Line 10: Final Thoughts\"}"
            }
        ],
        "step_1": [
            {
                "role": "assistant",
                "content": "The last five lines from the `report.csv` file are as follows:\n\n- Line 6: Conclusion\n- Line 7: Quarterly Financial Overview\n- Line 8: Quarter has been successful.\n- Line 9: Quarterly Financial Overview\n- Line 10: Final Thoughts"
            },
            {
                "role": "handler_log",
                "content": "Error decoding the model response. Proceed to next turn.",
                "error": "'str' object has no attribute 'items'"
            }
        ]
    },
    {
        "begin_of_turn_query": [
            {
                "role": "user",
                "content": "Logging in as USR001. Lastly, upon completion of our file review, kindly message my colleague, John Levy, add him as new contact, that 'Latest Quarter Performance has been well.'"
            }
        ],
        "step_0": [
            {
                "role": "assistant",
                "content": [
                    {
                        "message_login": "{\"user_id\": \"USR001\"}"
                    },
                    {
                        "add_contact": "{\"user_name\": \"John Levy\"}"
                    }
                ]
            },
            {
                "role": "handler_log",
                "content": "Successfully decoded model response.",
                "model_response_decoded": [
                    "message_login(user_id='USR001')",
                    "add_contact(user_name='John Levy')"
                ]
            },
            {
                "role": "tool",
                "content": "{\"login_status\": true, \"message\": \"User 'USR001' logged in successfully.\"}"
            },
            {
                "role": "tool",
                "content": "{\"added_status\": true, \"user_id\": \"USR005\", \"message\": \"Contact 'John Levy' added successfully.\"}"
            }
        ],
        "step_1": [
            {
                "role": "assistant",
                "content": [
                    {
                        "get_user_id": "{\"user\":\"John Levy\"}"
                    }
                ]
            },
            {
                "role": "handler_log",
                "content": "Successfully decoded model response.",
                "model_response_decoded": [
                    "get_user_id(user='John Levy')"
                ]
            },
            {
                "role": "tool",
                "content": "{\"user_id\": \"USR005\"}"
            }
        ],
        "step_2": [
            {
                "role": "assistant",
                "content": [
                    {
                        "send_message": "{\"receiver_id\":\"USR005\",\"message\":\"Latest Quarter Performance has been well.\"}"
                    }
                ]
            },
            {
                "role": "handler_log",
                "content": "Successfully decoded model response.",
                "model_response_decoded": [
                    "send_message(receiver_id='USR005',message='Latest Quarter Performance has been well.')"
                ]
            },
            {
                "role": "tool",
                "content": "{\"sent_status\": true, \"message_id\": {\"new_id\": 67410}, \"message\": \"Message sent to 'USR005' successfully.\"}"
            }
        ],
        "step_3": [
            {
                "role": "assistant",
                "content": "You have been successfully logged in as **USR001**, and your colleague **John Levy** has been added as a new contact. The message \"Latest Quarter Performance has been well.\" has been sent to him successfully."
            },
            {
                "role": "handler_log",
                "content": "Error decoding the model response. Proceed to next turn.",
                "error": "'str' object has no attribute 'items'"
            }
        ]
    }
]
```
  • Loading branch information
HuanzhiMao authored Nov 11, 2024
1 parent 52cf553 commit a5f32a8
Show file tree
Hide file tree
Showing 13 changed files with 405 additions and 132 deletions.
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -29,5 +29,8 @@ berkeley-function-call-leaderboard/score/
berkeley-function-call-leaderboard/.env
!berkeley-function-call-leaderboard/.env.example

# Ignore multi turn ground truth conversation log
berkeley-function-call-leaderboard/utils/ground_truth_conversation/

.direnv/
.venv
.venv
25 changes: 25 additions & 0 deletions berkeley-function-call-leaderboard/LOG_GUIDE.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Guide to Inference Logs

> An inference log is included along with the LLM response in the results file to help you analyze and debug the model's performance, and to better understand the model's behavior. To enable a more detailed log, use the `--include-state-log` flag and/or the `--include-input-log` flag in the generation command.
## Log Structure

The log is structured as a list, representing a conversational interaction between the model, system, and user. There are five types of roles in the log:

1. **`user`**: Represents the user's input or query.
2. **`assistant`**: Represents the model's raw response.
3. **`tool`**: Represents the output of a function execution, if the model makes a valid function call. Each function call results in a separate `tool` entry.
4. **`state_info`**: Represents the state of the backend API system at the end of each turn. The initial state is also included at the beginning of the log. This entry is available only if the `--include-state-log` flag is set in the generation command.
5. **`handler_log`**: Represents internal logs from the inference pipeline. These entries indicate various stages and events within the pipeline, including:

- **inference_input**: Snapshot of the fully-transformed input just before it's sent to the model API endpoint. Useful for debugging input integrity and format.
- Available only if the `--include-input-log` flag is set in the generation command.
- This section can be verbose and may affect log readability; it is generally not necessary for most analyses.
- **decode_success**: Indicates the successful decoding of the model's raw response, with the decoded response included in the `model_response_decoded` field. Following this, any function calls are executed, and the current turn continues.
- **empty_response**: Indicates that the model handler returned an empty response (e.g., no function call) based on the decoding strategy. When this occurs, the pipeline proceeds to the next turn.
- **decode_failure**: Indicates a failure in decoding the raw model response, with the raw response included in the `model_response_decoded` field. The pipeline then proceeds to the next turn.
  - **force_quit**: Indicates that the model handler has forcefully ended the conversation after the model made 20 unsuccessful attempts (i.e., steps) within one turn or task; the count resets at the beginning of each turn. No further turns are processed for this entry.

## Single Turn Categories

For single-turn categories, the only log entry available is the inference input (under the `handler_log` role), because there is no multi-step interaction with the backend system.
3 changes: 3 additions & 0 deletions berkeley-function-call-leaderboard/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,9 @@ For available options for `MODEL_NAME` and `TEST_CATEGORY`, please refer to the

If no `MODEL_NAME` is provided, the model `gorilla-openfunctions-v2` will be used by default. If no `TEST_CATEGORY` is provided, all test categories will be run by default.

> An inference log will be included along with the LLM response to help you analyze and debug the model's performance, and to better understand the model's behavior. To see a more verbose log, you can set the `--include-state-log` and/or the `--include-input-log` flags in the generation command.
> Please refer to the `LOG_GUIDE.md` file for more information on how to interpret the inference logs and what each flag does.
#### For API-hosted models:

```bash
Expand Down
24 changes: 12 additions & 12 deletions berkeley-function-call-leaderboard/bfcl/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,18 +68,18 @@ def generate(
test_category: List[str] = typer.Option(
["all"], help="A list of test categories to run the evaluation on."
),
api_sanity_check: bool = typer.Option(
False,
"--api-sanity-check",
"-c",
help="Perform the REST API status sanity check before running the evaluation.",
),
temperature: float = typer.Option(
0.001, help="The temperature parameter for the model."
),
include_debugging_log: bool = typer.Option(
include_input_log: bool = typer.Option(
False,
"--include-input-log",
help="Include the fully-transformed input to the model inference endpoint in the inference log; only relevant for debugging input integrity and format.",
),
include_state_log: bool = typer.Option(
False,
help="Include debugging log in the response file to see model's interaction with the state machine.",
"--include-state-log",
help="Include info about the state of each API system after each turn in the inference log; only relevant for multi-turn categories.",
),
num_gpus: int = typer.Option(1, help="The number of GPUs to use."),
num_threads: int = typer.Option(1, help="The number of threads to use."),
Expand All @@ -98,9 +98,9 @@ def generate(
[
"model",
"test_category",
"api_sanity_check",
"temperature",
"include_debugging_log",
"include_input_log",
"include_state_log",
"num_gpus",
"num_threads",
"gpu_memory_utilization",
Expand All @@ -113,9 +113,9 @@ def generate(
generationArgs(
model=model,
test_category=test_category,
api_sanity_check=api_sanity_check,
temperature=temperature,
include_debugging_log=include_debugging_log,
include_input_log=include_input_log,
include_state_log=include_state_log,
num_gpus=num_gpus,
num_threads=num_threads,
gpu_memory_utilization=gpu_memory_utilization,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,8 @@ def get_args():

# Parameters for the model that you want to test.
parser.add_argument("--temperature", type=float, default=0.001)
parser.add_argument("--include-debugging-log", "-d", action="store_true", default=False)
parser.add_argument("--include-input-log", action="store_true", default=False)
parser.add_argument("--include-state-log", action="store_true", default=False)
parser.add_argument("--num-threads", default=1, type=int)
parser.add_argument("--num-gpus", default=1, type=int)
parser.add_argument("--backend", default="vllm", type=str, choices=["vllm", "sglang"])
Expand Down Expand Up @@ -153,7 +154,7 @@ def process_multi_turn_test_case(test_cases, test_category):
return test_cases


def multi_threaded_inference(handler, test_case, include_debugging_log):
def multi_threaded_inference(handler, test_case, include_input_log, include_state_log):

assert type(test_case["function"]) is list

Expand All @@ -162,7 +163,7 @@ def multi_threaded_inference(handler, test_case, include_debugging_log):
while True:
try:
result, metadata = handler.inference(
copy.deepcopy(test_case), include_debugging_log
copy.deepcopy(test_case), include_input_log, include_state_log
)
break # Success, exit the loop
except Exception as e:
Expand Down Expand Up @@ -215,7 +216,8 @@ def generate_results(args, model_name, test_cases_total):
num_gpus=args.num_gpus,
gpu_memory_utilization=args.gpu_memory_utilization,
backend=args.backend,
include_debugging_log=args.include_debugging_log,
include_input_log=args.include_input_log,
include_state_log=args.include_state_log,
)

else:
Expand All @@ -230,7 +232,8 @@ def generate_results(args, model_name, test_cases_total):
multi_threaded_inference,
handler,
test_case,
args.include_debugging_log,
args.include_input_log,
args.include_state_log,
)
futures.append(future)

Expand Down
2 changes: 2 additions & 0 deletions berkeley-function-call-leaderboard/bfcl/constant.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
POSSIBLE_ANSWER_PATH = "../data/possible_answer/"
SCORE_PATH = "../score/"
DOTENV_PATH = "../.env"
UTILS_PATH = "../utils/"

VERSION_PREFIX = "BFCL_v3"

Expand Down Expand Up @@ -196,6 +197,7 @@
POSSIBLE_ANSWER_PATH = (script_dir / POSSIBLE_ANSWER_PATH).resolve()
SCORE_PATH = (script_dir / SCORE_PATH).resolve()
DOTENV_PATH = (script_dir / DOTENV_PATH).resolve()
UTILS_PATH = (script_dir / UTILS_PATH).resolve()

RESULT_PATH.mkdir(parents=True, exist_ok=True)
SCORE_PATH.mkdir(parents=True, exist_ok=True)
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,7 @@ def multi_turn_runner(
temp["model_result_raw"] = multi_turn_model_result_list
temp["model_result_decoded"] = multi_turn_model_result_list_decoded
temp["possible_answer"] = multi_turn_ground_truth_list
temp["inference_log"] = model_result[i].get("inference_log", "")
result.append(temp)
else:
correct_count += 1
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ def multi_turn_checker(

# Note that we combine all the sub-step results into a single list, for easier comparison
single_turn_model_execution_results = []
single_turn_model_execution_results_uncombined = []
single_turn_ground_truth_execution_results = []
model_instances = {} # Will be overwritten in the for loop
single_step_model_execution_results = [] # Will be overwritten in the for loop
Expand All @@ -51,6 +52,7 @@ def multi_turn_checker(
)
)
single_turn_model_execution_results.extend(single_step_model_execution_results)
single_turn_model_execution_results_uncombined.append(single_step_model_execution_results)

# Execute the ground truth function calls
single_turn_ground_truth_execution_results, ground_truth_instances = (
Expand All @@ -70,7 +72,7 @@ def multi_turn_checker(
all_turn_model_execution_results.extend(single_turn_model_execution_results)
execution_results.append(
{
"model": single_turn_model_execution_results,
"model": single_turn_model_execution_results_uncombined,
"ground_truth": single_turn_ground_truth_execution_results,
}
)
Expand Down Expand Up @@ -210,8 +212,8 @@ def response_checker(
"error_type": "multi_turn:execution_response_mismatch",
"details": {
"missing_items": missing_items,
"model_response": model_response_list,
"ground_truth_response": ground_truth_response_list,
"model_response (including all previous turns)": model_response_list,
"ground_truth_response (only the current turn)": ground_truth_response_list,
},
}

Expand Down
Loading

0 comments on commit a5f32a8

Please sign in to comment.