
Commit 7542214

latest
1 parent 40f4725 commit 7542214

7 files changed (+49 −66 lines)


README.md

+2 −3

@@ -39,13 +39,12 @@ python eval_wizard.py
 
 Process the jsonl file to extract code samples from model completions
 
-**Note**: the replit base model does not go through this process
+**Note**: the replit base + instruct model does not go through this process
 
 ```sh
 # replace args for various models:
 # --path results/wizard --out_path results/wizard/eval.jsonl
 # --path results/opencode --out_path results/opencode/eval.jsonl
-# --path results/replit_instruct --out_path results/replit_instruct/eval.jsonl
 
 python process_eval.py --path results/wizard --out_path results/wizard/processed.jsonl --add_prompt
 ```
@@ -56,7 +55,7 @@ Then get the results
 # replace args for various models:
 # results/wizard/processed.jsonl
 # results/opencode/processed.jsonl
-# results/replit_instruct/processed.jsonl
+# results/replit_instruct/eval.jsonl
 # results/replit/eval.jsonl
 
 evaluate_functional_correctness results/wizard/processed.jsonl

core/evaluation.py

+8 −1

@@ -4,6 +4,7 @@
     PreTrainedTokenizer,
 )
 from tqdm import tqdm
+import itertools
 import typing
 
 BatchGenerator = typing.Callable[
@@ -17,13 +18,19 @@ def run_eval(
     num_samples_per_task: int,
     out_path: str,
     generate_batch_completion: BatchGenerator,
+    is_starcoder: bool = False,
 ):
     problems = read_problems()
+    # problems = dict(itertools.islice(problems.items(), 20))
     samples = []
     pbar = tqdm(total=len(problems) * num_samples_per_task)
 
     for task_id in problems:
-        prompt = problems[task_id]["prompt"].replace("    ", "\t")
+        if is_starcoder:
+            prompt = problems[task_id]["prompt"].replace("    ", "\t")
+        else:
+            prompt = problems[task_id]["prompt"]
+
         batch_completions = generate_batch_completion(
             model, tokenizer, prompt, num_samples_per_task
         )
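
For context, a minimal sketch of what the new flag changes, assuming the whitespace collapsed in the diff above was 4-space indentation (the prompt value below is hypothetical):

```python
# With is_starcoder=True, run_eval rewrites the HumanEval prompt's 4-space
# indentation as tabs before handing it to generate_batch_completion.
prompt = 'def f(x):\n    """doc"""\n    return x\n'
starcoder_prompt = prompt.replace("    ", "\t")
assert starcoder_prompt == 'def f(x):\n\t"""doc"""\n\treturn x\n'

# Hypothetical caller (model/tokenizer setup elided), mirroring the eval scripts below:
# run_eval(model, tokenizer, num_samples_per_task, out_path,
#          generate_batch_completion, True)
```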

eval_opencode.py

+9 −2

@@ -54,7 +54,9 @@ def tokenize_opencode(tokenizer: PreTrainedTokenizer, prompt: str):
 
     # verbose, but follows what is shown in the readme
     user = tokenizer("User:")
-    prompt_text = tokenizer(f"""Create a Python script for this problem: {prompt}""")
+    prompt_text = tokenizer(
+        f"""Complete the following Python code without any tests or explanation\n{prompt}"""
+    )
     eot_token = tokenizer("<|end_of_turn|>")
     assistant = tokenizer("Assistant:")
 
@@ -111,5 +113,10 @@ def convert_to_tensors(opencode_tokens: list[dict], device: torch.device):
     )
 
     run_eval(
-        model, tokenizer, num_samples_per_task, out_path, generate_batch_completion
+        model,
+        tokenizer,
+        num_samples_per_task,
+        out_path,
+        generate_batch_completion,
+        True,
     )
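
For reference, a rough flat-string equivalent of the separately tokenized pieces above, assuming they are concatenated in the order user / prompt / end-of-turn / assistant (the actual script assembles token ids, and the placeholder prompt below is hypothetical):

```python
prompt = "def add(a, b):\n    ..."  # placeholder HumanEval prompt
flat = (
    "User: "
    f"Complete the following Python code without any tests or explanation\n{prompt}"
    "<|end_of_turn|>Assistant:"
)
```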

eval_replit.py

+11 −22

@@ -13,27 +13,12 @@
 TOKEN = ""
 
 
-# references: https://github.com/declare-lab/instruct-eval
-def count_indent(text: str) -> int:
-    count = 0
-    for char in text:
-        if char == " ":
-            count += 1
-        else:
-            break
-    return count
-
-
-def fix_indents(text: str, multiple: int = 2) -> str:
-    outputs = []
-    for line in text.split("\n"):
-        while count_indent(line) % multiple != 0:
-            line = " " + line
-        outputs.append(line)
-    return "\n".join(outputs)
+def fix_indents(text: str) -> str:
+    return text.replace("\t", "    ")
 
 
 def filter_code(completion: str) -> str:
+    # The program tends to overwrite, we only take the first function
     completion = completion.lstrip("\n")
     return completion.split("\n\n")[0]
 
@@ -42,7 +27,7 @@ def filter_code(completion: str) -> str:
 def generate_batch_completion(
     model: PreTrainedModel, tokenizer: PreTrainedTokenizer, prompt, batch_size
 ) -> list[str]:
-    prompt_input = f"""Complete the following Python code without any additional tests or explanations\n{prompt}"""
+    prompt_input = f"""Complete the following Python code without any tests or explanation\n{prompt}"""
 
     input_batch = [prompt_input for _ in range(batch_size)]
     inputs = tokenizer(input_batch, return_tensors="pt").to(model.device)
@@ -59,13 +44,13 @@ def generate_batch_completion(
         pad_token_id=tokenizer.pad_token_id,
     )
 
-    output = tokenizer.batch_decode(
+    batch_completions = tokenizer.batch_decode(
         [ids[input_ids_cutoff:] for ids in generated_ids],
         skip_special_tokens=True,
         clean_up_tokenization_spaces=False,
     )
 
-    return [filter_code(fix_indents(sample)) for sample in output]
+    return [filter_code(fix_indents(completion)) for completion in batch_completions]
 
 
 if __name__ == "__main__":
@@ -91,5 +76,9 @@ def generate_batch_completion(
     )
 
     run_eval(
-        model, tokenizer, num_samples_per_task, out_path, generate_batch_completion
+        model,
+        tokenizer,
+        num_samples_per_task,
+        out_path,
+        generate_batch_completion,
     )
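
A quick illustration of the post-processing above, using a made-up completion rather than real model output:

```python
raw = "\ndef add(a, b):\n\treturn a + b\n\nprint(add(1, 2))"

# fix_indents rewrites tabs as 4-space indentation; filter_code strips leading
# newlines and keeps only the text before the first blank line, dropping the
# extra driver code the model tends to append.
cleaned = filter_code(fix_indents(raw))
assert cleaned == "def add(a, b):\n    return a + b"
```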

eval_replit_instruct.py

+2 −16

@@ -13,25 +13,11 @@
 TOKEN = ""
 
 
-# references: https://github.com/nlpxucan/WizardLM/tree/main/WizardCoder
-def format_output(output: str):
-    try:
-        return output.replace("\t", "    ")
-    except:
-        return ""
-
-
 @torch.inference_mode()
 def generate_batch_completion(
     model: PreTrainedModel, tokenizer: PreTrainedTokenizer, prompt: str, batch_size: int
 ) -> list[str]:
-    prompt_input = f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.
-
-### Instruction:
-Create a Python script for this problem:
-{prompt}
-
-### Response:"""
+    prompt_input = f"""### Instruction:\nComplete the following Python code without any tests or explanation\n{prompt}\n\n### Response:"""
 
     input_batch = [prompt_input for _ in range(batch_size)]
     inputs = tokenizer(input_batch, return_tensors="pt").to(model.device)
@@ -54,7 +40,7 @@ def generate_batch_completion(
         clean_up_tokenization_spaces=False,
     )
 
-    return [format_output(out) for out in output]
+    return output
 
 
 if __name__ == "__main__":
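
Rendered with its newlines expanded, the new instruct prompt looks roughly like this (the placeholder prompt is hypothetical, not a real HumanEval task; the same format is used in eval_wizard.py below):

```python
prompt = "def add(a: int, b: int) -> int:\n    ..."
prompt_input = f"""### Instruction:\nComplete the following Python code without any tests or explanation\n{prompt}\n\n### Response:"""
print(prompt_input)
# ### Instruction:
# Complete the following Python code without any tests or explanation
# def add(a: int, b: int) -> int:
#     ...
#
# ### Response:
```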

eval_wizard.py

+8 −17

@@ -13,25 +13,11 @@
 TOKEN = ""
 
 
-# references: https://github.com/nlpxucan/WizardLM/tree/main/WizardCoder
-def format_output(output: str):
-    try:
-        return output.replace("\t", "    ")
-    except:
-        return ""
-
-
 @torch.inference_mode()
 def generate_batch_completion(
     model: PreTrainedModel, tokenizer: PreTrainedTokenizer, prompt: str, batch_size: int
 ) -> list[str]:
-    prompt_input = f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.
-
-### Instruction:
-Create a Python script for this problem:
-{prompt}
-
-### Response:"""
+    prompt_input = f"""### Instruction:\nComplete the following Python code without any tests or explanation\n{prompt}\n\n### Response:"""
 
     input_batch = [prompt_input for _ in range(batch_size)]
     inputs = tokenizer(input_batch, return_tensors="pt").to(model.device)
@@ -53,7 +39,7 @@ def generate_batch_completion(
         skip_special_tokens=True,
     )
 
-    return [format_output(out) for out in output]
+    return output
 
 
 if __name__ == "__main__":
@@ -81,5 +67,10 @@ def generate_batch_completion(
     )
 
     run_eval(
-        model, tokenizer, num_samples_per_task, out_path, generate_batch_completion
+        model,
+        tokenizer,
+        num_samples_per_task,
+        out_path,
+        generate_batch_completion,
+        True,
     )

human-eval/human_eval/evaluation.py

+9 −5

@@ -13,7 +13,7 @@
 def estimate_pass_at_k(
     num_samples: Union[int, List[int], np.ndarray],
     num_correct: Union[List[int], np.ndarray],
-    k: int
+    k: int,
 ) -> np.ndarray:
     """
     Estimates pass@k of each problem and returns them in an array.
@@ -33,7 +33,9 @@ def estimator(n: int, c: int, k: int) -> float:
         assert len(num_samples) == len(num_correct)
         num_samples_it = iter(num_samples)
 
-    return np.array([estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)])
+    return np.array(
+        [estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)]
+    )
 
 
 def evaluate_functional_correctness(
@@ -52,7 +54,6 @@ def evaluate_functional_correctness(
 
     # Check the generated samples against test suites.
    with ThreadPoolExecutor(max_workers=n_workers) as executor:
-
         futures = []
         completion_id = Counter()
         n_samples = 0
@@ -86,8 +87,11 @@ def evaluate_functional_correctness(
     correct = np.array(correct)
 
     ks = k
-    pass_at_k = {f"pass@{k}": estimate_pass_at_k(total, correct, k).mean()
-                 for k in ks if (total >= k).all()}
+    pass_at_k = {
+        f"pass@{k}": estimate_pass_at_k(total, correct, k).mean()
+        for k in ks
+        if (total >= k).all()
+    }
 
     # Finally, save the results in one file:
     def combine_results():
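
The `estimator` named in the hunk above is the unbiased pass@k estimator used by the human-eval harness, 1 - C(n-c, k) / C(n, k); a minimal standalone sketch following the upstream human-eval implementation:

```python
import numpy as np

def estimator(n: int, c: int, k: int) -> float:
    # Probability that at least one of k samples drawn (without replacement)
    # from n generations is correct, given that c of the n are correct.
    if n - c < k:
        return 1.0
    return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))

# 10 samples per task, 3 correct: pass@1 reduces to c / n.
assert abs(estimator(10, 3, 1) - 0.3) < 1e-9
```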
