submit_get_refiner_teacher_data.py
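"""Driver script: runs get_refiner_teacher_data.py for every teacher model
over the configured train or eval tasks."""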
import os
import argparse
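# Use the local Hugging Face cache only (no hub downloads) and start
# vLLM workers with the "spawn" multiprocessing method.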
os.environ["HF_HUB_OFFLINE"] = "1"
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
# NUM_GPUS = 4
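# Training-split tasks; these are used unless --task_dir points at eval data.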
TRAIN_TASKS = [
    "musique_train",
    "arc_c_train",
    "triviaqa_train",
    "hotpotqa_train",
    "pubhealth_train",
]
EVAL_TASKS = [
    "musique_dev",
    "popqa",
    "triviaqa",
    "hotpotqa_dev_distractor",
    "2wiki_dev",
    # "arc_c",
]
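# Teacher model checkpoints and the short inference names used to tag their
# outputs; the two tuples are zipped pairwise in run_task.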
teacher_model_names = (
    "Qwen/Qwen2-72B-Instruct",
    "meta-llama/Meta-Llama-3.1-70B-Instruct",
    "meta-llama/Llama-2-70b-chat-hf",
    "meta-llama/Meta-Llama-3-70B-Instruct",
    "dnhkng/RYS-XLarge",
)
teacher_model_inference_names = (
    "Qwen2_72B",
    "Llama_3.1_70b",
    "Llama_2_70b",
    "Llama_3_70b",
    "RYS_XLarge",
)

def parse_args():
    parser = argparse.ArgumentParser(description="Collect refiner teacher data for the train and evaluation tasks")
    parser.add_argument(
        "--task_dir", type=str, default=None, help="path to the directory containing the task data"
    )
    parser.add_argument(
        "--top_n", type=int, default=10, help="number of top retrieved passages"
    )
    args = parser.parse_args()
    return args

def run_task(
        task,
        top_n=10,
        task_dir=None):
    """Find the input file for `task` in `task_dir` and run every teacher model over it."""
    if task_dir is None:
        task_dir = "./train_data/"
    matched_file_name = None
    for file_name in os.listdir(task_dir):
        if file_name.startswith(task):
            matched_file_name = file_name
            if f"{task}_exemplar_top{top_n}.jsonl" in file_name:
                print(f"Continue recording with {file_name}")
                break
    if matched_file_name is None:
        raise FileNotFoundError(f"{task} task not evaluated: no matching file found in {task_dir}")
    file_path = os.path.abspath(os.path.join(task_dir, matched_file_name))
    output_path = os.path.abspath(os.path.join(task_dir, f"{task}_exemplar_top{top_n}.jsonl"))
    model_list = zip(teacher_model_names, teacher_model_inference_names)
    for model, inference in model_list:
        print(f"Run refiner teacher models using {file_path} on {inference}")
        os.system(f"""
python ./get_refiner_teacher_data.py \
    --model_name_or_path {model} \
    --per_gpu_eval_batch_size 12765 \
    --task {task} \
    --top_n {top_n} \
    --inference_name {inference} \
    --input "{file_path}" \
    --output "{output_path}"
""")
if __name__ == '__main__':
    args = parse_args()
    lst_tasks = TRAIN_TASKS
    if isinstance(args.task_dir, str) and "eval_data" in args.task_dir:
        lst_tasks = EVAL_TASKS
    for task in lst_tasks:
        run_task(task=task,
                 top_n=args.top_n,
                 task_dir=args.task_dir)
# python ./submit_get_refiner_teacher_data.py --top_n 10 --task_dir ./eval_data