44 changes: 33 additions & 11 deletions README.md
@@ -7,7 +7,7 @@

## X-MAS-Bench

1. Specify your model configs in `./configs/X-MAS_Bench_config.json`:
### 1. Specify your model configs in `./configs/X-MAS_Bench_config.json`:
```
"gpt-4o-mini-2024-07-18": {
"model_list": [
@@ -17,16 +17,22 @@
}
```

2. Inference on a dataset (the outputs will be saved under "./X-MAS-Bench/results/")
### 2. Inference on a dataset (the outputs will be saved under "./X-MAS-Bench/results/")
```
# bash scripts/infer_X-MAS_Bench.sh
python X-MAS-Bench/infer_direct.py --model_name <model_name> --model_config <config_path> --test_dataset_name <dataset_name>

python X-MAS-Bench/infer_qa.py --model_name <model_name> --model_config <config_path> --test_dataset_name <dataset_name>
```
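For example, a concrete QA run might look like this (model and dataset names are illustrative; the config path follows step 1):
```
python X-MAS-Bench/infer_qa.py --model_name gpt-4o-mini-2024-07-18 --model_config ./configs/X-MAS_Bench_config.json --test_dataset_name MedMCQA
```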
If you want to infer with **revise, aggregation or evaluation**, please make sure you have first replaced the contents of "./X-MAS-Bench/results/" with the **source files** from [Google Drive](https://drive.google.com/file/d/1ijQCzujXdYZDV95vWJHUpvOQFWVgUbXF/view?usp=sharing).
Download the .zip file named results.zip into the "./X-MAS-Bench/results/" path and unzip it there.
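A minimal sketch of that step, assuming results.zip has already been downloaded from the link above:
```
# unpack the provided source results into the expected directory
mkdir -p ./X-MAS-Bench/results/
unzip results.zip -d ./X-MAS-Bench/results/
```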

3. Evaluate on a dataset (the outputs will be saved under "./X-MAS-Bench/results/")
If you want to infer with **planning**, please make sure you have loaded the default source models **"llama-3.1-8b-instruct", "qwen-2.5-7b-instruct" and "qwen-2.5-14b-instruct"**.
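A planning run presumably follows the same CLI as the other inference scripts; the exact flag set for infer_planning.py is an assumption here:
```
# hypothetical planning invocation
python X-MAS-Bench/infer_planning.py --model_name <model_name> --model_config <config_path> --test_dataset_name <dataset_name>
```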

### 3. Evaluate on a dataset (the outputs will be saved under "./X-MAS-Bench/results/")
```
# bash scripts/eval_X-MAS_Bench.sh
python X-MAS-Bench/eval_bench.py --model_name <eval_model_name> --model_config <config_path> --dataset_name <dataset_name> --infer_name <infer_name> --eval_mode bench-test

python X-MAS-Bench/eval_bench.py --eval_model_name <eval_model_name> --model_config <config_path> --dataset_name <dataset_name> --model_name <model_name> --function_name <function_name> --eval_mode bench-test
# We use llama-3.1-70b-instruct as <eval_model_name>
```
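For example (values are illustrative; <function_name> is one of the functions above, e.g. qa):
```
python X-MAS-Bench/eval_bench.py --eval_model_name llama-3.1-70b-instruct --model_config ./configs/X-MAS_Bench_config.json --dataset_name MedMCQA --model_name qwen-2.5-32b-instruct --function_name qa --eval_mode bench-test
```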

@@ -35,7 +41,7 @@ You can download the .zip file named results.zip to the "./X-MAS-Bench/results/"

## X-MAS-Design

1. Specify your model configs in `./configs/X-MAS_Design_config.json`:
### 1. Specify your model configs in `./configs/X-MAS_Design_config.json`:
```
"gpt-4o-mini-2024-07-18": {
"model_list": [
@@ -45,21 +51,37 @@ You can download the .zip file named results.zip to the "./X-MAS-Bench/results/"
}
```

2. Inference on a dataset (the outputs will be saved under "./X-MAS-Design/results/")
### 2. Inference on a dataset (the outputs will be saved under "./X-MAS-Design/results/")
```
# bash scripts/infer_X-MAS_Design.sh

# (Parallel)
python X-MAS-Design/inference_X-MAS.py --method_name <method_name> --model_name <model_name> --test_dataset_name <test_dataset_name> --model_api_config <model_api_config>
python X-MAS-Design/inference_mas.py --model_api_config <model_api_config> --method_name <method_name> --test_dataset_name <test_dataset_name>


# Or (Sequential)
python X-MAS-Design/inference_X-MAS.py --method_name <method_name> --model_name <model_name> --test_dataset_name <test_dataset_name> --model_api_config <model_api_config> --sequential
python X-MAS-Design/inference_mas.py --model_api_config <model_api_config> --method_name <method_name> --test_dataset_name <test_dataset_name> --sequential
```
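For example, a parallel run of one of the provided methods (method and dataset names are illustrative):
```
python X-MAS-Design/inference_mas.py --model_api_config ./configs/X-MAS_Design_config.json --method_name llm_debate --test_dataset_name MedMCQA
```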

3. Evaluate on a dataset (the outputs will be saved under "./X-MAS-Design/results/")
You can change the default model list in the method_config file of the corresponding method, e.g., the model list in "./X-MAS-Design/methods/llm_debate/configs/config_main.yaml" for "./X-MAS-Design/methods/llm_debate/llm_debate_main.py".

If you want to run a method such as llm_debate with your own config or settings, modify them accordingly and run the following commands.

```
bash scripts/eval_X-MAS_Design.sh
# bash scripts/infer_X-MAS_Design.sh

# (Parallel)
python X-MAS-Design/inference_mas.py --model_api_config <model_api_config> --method_name <method_name> --method_config_name <method_config_name> --test_dataset_name <test_dataset_name>

# Or (Sequential)
python X-MAS-Design/inference_mas.py --model_api_config <model_api_config> --method_name <method_name> --method_config_name <method_config_name> --test_dataset_name <test_dataset_name> --sequential
```

### 3. Evaluate on a dataset (the outputs will be saved under "./X-MAS-Design/results/")
```
# bash scripts/eval_X-MAS_Design.sh

python X-MAS-Design/eval_mas.py --eval_model_name llama-3.1-70b-instruct --model_api_config <model_api_config> --method_name <method_name> --method_config_name <method_config_name> --test_dataset_name <test_dataset_name> --eval_mode bench-test
```
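For example (values are illustrative; config_main assumes the method config name matches the YAML file stem mentioned in step 2):
```
python X-MAS-Design/eval_mas.py --eval_model_name llama-3.1-70b-instruct --model_api_config ./configs/X-MAS_Design_config.json --method_name llm_debate --method_config_name config_main --test_dataset_name MedMCQA --eval_mode bench-test
```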

## Citation
14 changes: 7 additions & 7 deletions X-MAS-Bench/eval_bench.py
@@ -658,7 +658,7 @@ def load_source_data(source_dir):
print(f"Loaded {len(source_map)} source data items.")
return source_map

def get_evaluation(eval_data, model_url_list, model_name, dataset_name, infer_name, max_workers=4, sequential=False):
def get_evaluation(eval_data, model_url_list, model_name, dataset_name, function_name, infer_name, max_workers=4, sequential=False):
"""
Batch evaluation function that uses parallel processing to speed up the scoring process.

@@ -668,7 +668,7 @@ def get_evaluation(eval_data, model_url_list, model_name, dataset_name, infer_na
"""
# print(eval_data)
if "evaluate" in infer_name:
source_dir = f"./X-MAS-Bench/results/{dataset_name}/qwen2.5-32b-instruct_direct_eval.json"
source_dir = f"./X-MAS-Bench/results/{dataset_name}/qa/qwen2.5-32b-instruct_qa_eval.json"
source_map = load_source_data(source_dir)
else:
source_map = {}
@@ -704,7 +704,8 @@ def get_evaluation(eval_data, model_url_list, model_name, dataset_name, infer_na
if __name__ == "__main__":

parser = argparse.ArgumentParser()
parser.add_argument("--model_name", type=str, default="llama-3.1-70b-instruct", help="the LLM for judgement")
parser.add_argument("--eval_model_name", type=str, default="llama-3.1-70b-instruct", help="the LLM for judgement")
parser.add_argument("--model_name", type=str, default="qwen-2.5-32b-instruct", help="the LLM to be judged")
parser.add_argument("--function_name", type=str, default="direct", help="the function for judgement")
parser.add_argument("--model_config", type=str, default="")
parser.add_argument("--dataset_name", type=str, default="MedMCQA")
@@ -713,7 +714,7 @@ def get_evaluation(eval_data, model_url_list, model_name, dataset_name, infer_na
parser.add_argument("--dry_run", action="store_true")
parser.add_argument("--sequential", action="store_true")
args = parser.parse_args()
# args.infer_name = f"{args.model_name}_{args.function_name}.jsonl"
args.infer_name = f"{args.model_name}_{args.function_name}.jsonl"
print("="*50)
print(json.dumps(vars(args), indent=4))

@@ -735,8 +736,7 @@ def get_evaluation(eval_data, model_url_list, model_name, dataset_name, infer_na

# print('-'*20 + f"\n>> Evaluating {i}-th dataset: {dataset_name}")
if args.eval_mode == "bench-test":
infer_path = f"./X-MAS-Bench/results/{dataset_name}/{args.infer_name}"
# infer_path = f"./results/{dataset_name}/{args.infer_name}"
infer_path = f"./X-MAS-Bench/results/{dataset_name}/{args.function_name}/{args.infer_name}"


save_eval_path = infer_path.replace(".jsonl", "_eval.json")
@@ -782,7 +782,7 @@ def get_evaluation(eval_data, model_url_list, model_name, dataset_name, infer_na

print(f">> Running Loaded {len(eval_data)} samples")

eval_content_list, score_list = get_evaluation(eval_data, model_url_list, args.model_name, dataset_name, args.infer_name, max_workers, args.sequential)
eval_content_list, score_list = get_evaluation(eval_data, model_url_list, args.model_name, dataset_name, args.function_name, args.infer_name, max_workers, args.sequential)

# mapping the response back to the original query
for i, eval_content, score in zip(range(len(eval_data)), eval_content_list, score_list):
@@ -47,15 +47,15 @@ def aggregate_init_answers(query, answer_dict, shuffle_index):
def get_sample_pool(test_dataset_name, aggregate_model_names):
query_dict = defaultdict(dict)
for model_name in aggregate_model_names:
with open(f"X-MAS-Bench/results/{test_dataset_name}/{model_name}_direct.jsonl", "r") as f:
with open(f"X-MAS-Bench/results/{test_dataset_name}/qa/{model_name}_qa.jsonl", "r") as f:
for line in f:
sample = json.loads(line)
query = sample["query"]
query_dict[query][model_name] = sample['generated_output']

shuffle_matrix = create_shuffled_matrix(len(query_dict), len(aggregate_model_names))
sample_pool = []
with open(f"X-MAS-Bench/results/{test_dataset_name}/{aggregate_model_names[0]}_direct.jsonl", "r") as f:
with open(f"X-MAS-Bench/results/{test_dataset_name}/qa/{aggregate_model_names[0]}_qa.jsonl", "r") as f:
for i, line in enumerate(f):
sample = json.loads(line)
query = sample["query"]
@@ -130,8 +130,8 @@ def process_sample(sample):
try:

# ================== Define the output files ==================
output_logging = f"X-MAS-Bench/results/{test_dataset_name}/log/{args.model_name}_aggregate.txt"
output_json = f"X-MAS-Bench/results/{test_dataset_name}/{args.model_name}_aggregate.jsonl"
output_logging = f"X-MAS-Bench/results/{test_dataset_name}/aggregation/log/{args.model_name}_aggregation.txt"
output_json = f"X-MAS-Bench/results/{test_dataset_name}/aggregation/{args.model_name}_aggregation.jsonl"
output_dir_log = os.path.dirname(output_logging)
output_dir_json = os.path.dirname(output_json)
os.makedirs(output_dir_log, exist_ok=True)
@@ -43,7 +43,7 @@ def evaluate_init_answers(query, answer):

def get_sample_pool(test_dataset_name):
sample_pool = []
with open(f"X-MAS-Bench/results/{test_dataset_name}/qwen2.5-32b-instruct/direct/qwen2.5-32b-instruct_direct.jsonl", "r") as f:
with open(f"X-MAS-Bench/results/{test_dataset_name}/qa/qwen2.5-32b-instruct_qa.jsonl", "r") as f:

for i, line in enumerate(f):
sample = json.loads(line)
@@ -113,8 +113,8 @@ def process_sample(sample):


# ================== Define the output files ==================
output_logging = f"X-MAS-Bench/results/{test_dataset_name}/log/{args.model_name}_evaluate.txt"
output_json = f"X-MAS-Bench/results/{test_dataset_name}/{args.model_name}_evaluate.jsonl"
output_logging = f"X-MAS-Bench/results/{test_dataset_name}/evaluation/log/{args.model_name}_evaluation.txt"
output_json = f"X-MAS-Bench/results/{test_dataset_name}/evaluation/{args.model_name}_evaluation.jsonl"
output_dir_log = os.path.dirname(output_logging)
output_dir_json = os.path.dirname(output_json)
os.makedirs(output_dir_log, exist_ok=True)
23 changes: 12 additions & 11 deletions X-MAS-Bench/infer_plan.py → X-MAS-Bench/infer_planning.py
@@ -74,16 +74,17 @@ def plan_init_answers(query):
return str


def get_sample_pool(test_dataset_name):
def get_sample_pool(test_dataset_name):
test_data_path = f"X-MAS-Bench/benchmarks/{test_dataset_name}.json"
with open(test_data_path, "r") as f:
samples = json.load(f)

sample_pool = []
with open(f"X-MAS-Bench/results/{test_dataset_name}/qwen2.5-32b-instruct/direct/qwen2.5-32b-instruct_direct.jsonl", "r") as f:
for i, line in enumerate(f):
sample = json.loads(line)
query = sample["query"]
sample_copy = deepcopy(sample)
sample_copy["plan_query"] = plan_init_answers(query)
del sample_copy["generated_output"]
sample_pool.append(sample_copy)
for i,sample in enumerate(samples):
query = sample["query"]
sample_copy = deepcopy(sample)
sample_copy["plan_query"] = plan_init_answers(query)
sample_pool.append(sample_copy)

return sample_pool

@@ -193,8 +194,8 @@ def output_trans(response):


# ================== Define the output files ==================
output_logging = f"X-MAS-Bench/results/{test_dataset_name}/log/{args.model_name}_plan.txt"
output_json = f"X-MAS-Bench/results/{test_dataset_name}/{args.model_name}_plan.jsonl"
output_logging = f"X-MAS-Bench/results/{test_dataset_name}/planning/log/{args.model_name}_planning.txt"
output_json = f"X-MAS-Bench/results/{test_dataset_name}/planning/{args.model_name}_planning.jsonl"
output_dir_log = os.path.dirname(output_logging)
output_dir_json = os.path.dirname(output_json)
os.makedirs(output_dir_log, exist_ok=True)
6 changes: 3 additions & 3 deletions X-MAS-Bench/infer_direct.py → X-MAS-Bench/infer_qa.py
@@ -72,8 +72,8 @@ def process_sample(sample):
test_dataset_name = args.test_dataset_name
try:
# ================== Define the output files ==================
output_logging = f"X-MAS-Bench/results/{test_dataset_name}/log/{args.model_name}_direct.txt"
output_json = f"X-MAS-Bench/results/{test_dataset_name}/{args.model_name}_direct.jsonl"
output_logging = f"X-MAS-Bench/results/{test_dataset_name}/qa/log/{args.model_name}_qa.txt"
output_json = f"X-MAS-Bench/results/{test_dataset_name}/qa/{args.model_name}_qa.jsonl"
test_data_path = f"X-MAS-Bench/benchmarks/{test_dataset_name}.json"

output_dir_log = os.path.dirname(output_logging)
@@ -116,4 +116,4 @@ def process_sample(sample):
for _ in tqdm(executor.map(process_sample, sample_pool), total=len(sample_pool), desc=f"Processing queries with {args.model_name} on {test_dataset_name}"):
pass
except Exception as e:
print(f"direct Traceback: {traceback.format_exc()}")
print(f"qa Traceback: {traceback.format_exc()}")
6 changes: 3 additions & 3 deletions X-MAS-Bench/infer_revise.py
@@ -44,7 +44,7 @@ def revise_init_answers(query, answer):
def get_sample_pool(test_dataset_name):
# query_dict = defaultdict(dict)
sample_pool = []
with open(f"X-MAS-Bench/results/{test_dataset_name}/qwen2.5-32b-instruct/direct/qwen2.5-32b-instruct_direct.jsonl", "r") as f:
with open(f"X-MAS-Bench/results/{test_dataset_name}/qa/qwen2.5-32b-instruct_qa.jsonl", "r") as f:
for i, line in enumerate(f):
sample = json.loads(line)
query = sample["query"]
@@ -106,8 +106,8 @@ def process_sample(sample):


# ================== Define the output files ==================
output_logging = f"X-MAS-Bench/results/{test_dataset_name}/log/{args.model_name}_revise.txt"
output_json = f"X-MAS-Bench/results/{test_dataset_name}/{args.model_name}_revise.jsonl"
output_logging = f"X-MAS-Bench/results/{test_dataset_name}/revise/log/{args.model_name}_revise.txt"
output_json = f"X-MAS-Bench/results/{test_dataset_name}/revise/{args.model_name}_revise.jsonl"
output_dir_log = os.path.dirname(output_logging)
output_dir_json = os.path.dirname(output_json)
os.makedirs(output_dir_log, exist_ok=True)