diff --git a/bigcodebench/evaluate.py b/bigcodebench/evaluate.py
index 6e94672..6805253 100644
--- a/bigcodebench/evaluate.py
+++ b/bigcodebench/evaluate.py
@@ -277,6 +277,32 @@ def stucking_checker():
     if not os.path.isfile(result_path):
         with open(result_path, "w") as f:
             json.dump(results, f, indent=2)
+
+    pass_at_k_path = result_path.replace("_eval_results.json", "_pass_at_k.json")
+    pass_at_k["model"] = flags.samples.split("/")[-1].replace(".jsonl", "")
+    pass_at_k["subset"] = flags.subset
+
+    def save_pass_at_k():
+        with open(pass_at_k_path, "w") as f:
+            json.dump(pass_at_k, f, indent=2)
+
+    if os.path.isfile(pass_at_k_path):
+        saved_pass_at_k = json.load(open(pass_at_k_path, "r"))
+        # compare saved_pass_at_k with pass_at_k
+        for k in saved_pass_at_k.keys():
+            if pass_at_k[k] != saved_pass_at_k[k]:
+                cprint(f"Warning: {k} is different from the saved one", "yellow")
+
+        # ask user whether to save the pass@k
+        decision = ""
+        while decision.lower() not in ["y", "n"]:
+            print(f"Save pass@k to {pass_at_k_path}? [Y/N]")
+            decision = input()
+        if decision.lower() == "y":
+            save_pass_at_k()
+
+    else:
+        save_pass_at_k()
 
 
 def main():
diff --git a/bigcodebench/generate.py b/bigcodebench/generate.py
index 7edebfc..ca21560 100644
--- a/bigcodebench/generate.py
+++ b/bigcodebench/generate.py
@@ -35,7 +35,7 @@ def codegen(
 
     if model.is_direct_completion() and subset == "instruct":
         raise Exception("Base model does not support direct completion for instruct tasks")
-    
+
     # create save_path if it doesn't exist, e.g., a/b.jsonl
     dirname = os.path.dirname(save_path)
     if not os.path.exists(dirname) and dirname != "":
@@ -118,6 +118,8 @@ def main():
     parser.add_argument("--base_url", default=None, type=str)
     parser.add_argument("--tp", default=1, type=int)
     parser.add_argument("--trust_remote_code", action="store_true")
+    parser.add_argument("--tokenizer_name", default=None, type=str)
+
 
     args = parser.parse_args()
 
@@ -145,7 +147,8 @@ def main():
         temperature=args.temperature,
         base_url=args.base_url,
         tp=args.tp,
-        trust_remote_code=args.trust_remote_code
+        trust_remote_code=args.trust_remote_code,
+        tokenizer_name=args.tokenizer_name
     )
 
     if not args.save_path:
@@ -161,7 +164,7 @@ def main():
         strip_newlines=args.strip_newlines,
         n_samples=args.n_samples,
         resume=args.resume,
-        id_range=args.id_range,
+        id_range=args.id_range
     )
 
 
diff --git a/bigcodebench/model.py b/bigcodebench/model.py
index 19110d5..b2b89fa 100644
--- a/bigcodebench/model.py
+++ b/bigcodebench/model.py
@@ -92,6 +92,7 @@ def __init__(
         max_new_tokens: int = 1280,
         dtype: str = "bfloat16",  # default
         trust_remote_code: bool = False,
+        tokenizer_name: str = None,
     ) -> None:
         print("Initializing a decoder model: {} ...".format(name))
         self.name = name
@@ -102,6 +103,7 @@ def __init__(
         self.max_new_tokens = max_new_tokens
         self.dtype = dtype
         self.trust_remote_code = trust_remote_code
+        self.tokenizer_name = tokenizer_name
 
     @abstractmethod
     def codegen(
@@ -129,11 +131,13 @@ def __init__(self, name: str, dataset: str, tp: int, **kwargs) -> None:
             "dtype": self.dtype,
             "trust_remote_code": self.trust_remote_code,
         }
-
-        self.tokenizer = AutoTokenizer.from_pretrained(self.name, **kwargs)
+        if self.tokenizer_name is None:
+            self.tokenizer_name = self.name
+
+        self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name, **kwargs)
         if self.tokenizer.chat_template is None:
             self.eos += extra_eos_for_direct_completion(dataset)
-        self.llm = LLM(model=name, max_model_len=2048, **kwargs)
+        self.llm = LLM(model=name, max_model_len=2048, tokenizer=self.tokenizer_name, **kwargs)
 
     def is_direct_completion(self) -> bool:
         return self.tokenizer.chat_template is None
@@ -185,9 +189,12 @@ def __init__(self, name: str, dataset: str, **kwargs):
         kwargs["torch_dtype"] = getattr(torch, self.dtype)
         self.skip_special_tokens = True
 
-        print(f"{kwargs = }")
+        print(f"{kwargs = }", self.tokenizer_name)
+
+        if self.tokenizer_name is None:
+            self.tokenizer_name = self.name
 
-        self.tokenizer = AutoTokenizer.from_pretrained(name, **kwargs)
+        self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name, **kwargs)
         if self.tokenizer.chat_template is None:
             self.eos += extra_eos_for_direct_completion(dataset)
 
@@ -253,7 +260,7 @@ def __init__(self, name: str, **kwargs):
         super().__init__(name=name, **kwargs)
         self.eos += ["\n```\n"]
         print(f"EOS strings: {self.eos}")
-        self.tokenizer = AutoTokenizer.from_pretrained(self.name, **kwargs)
+        self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name if self.tokenizer_name else self.name, **kwargs)
 
     def codegen(
         self, prompt: str, do_sample: bool = True, num_samples: int = 200
@@ -486,6 +493,7 @@ def make_model(
     tp=1,
     base_url=None,
     trust_remote_code=False,
+    tokenizer_name=None,
 ):
     if backend == "vllm":
         return GeneralVllmDecoder(
@@ -495,6 +503,7 @@ def make_model(
             dataset=dataset,
             tp=tp,
             trust_remote_code=trust_remote_code,
+            tokenizer_name=tokenizer_name,
         )
     elif backend == "hf":
         return GenenralHfTorchDecoder(
@@ -503,6 +512,7 @@ def make_model(
             temperature=temperature,
             dataset=dataset,
             trust_remote_code=trust_remote_code,
+            tokenizer_name=tokenizer_name,
         )
     elif backend == "openai":
         return OpenAIChatDecoder(
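Below, outside the diff, is a minimal sketch of the tokenizer-override pattern the model.py changes introduce. It is illustrative only: the helper name build_engine and its simplified signature are assumptions rather than code from the repository; the fallback to the model name and the tokenizer= hand-off to vLLM mirror the edited GeneralVllmDecoder.__init__.

# A minimal sketch of the pattern this diff introduces, not code from the repo:
# the helper name `build_engine` and its simplified signature are illustrative.
# When no tokenizer_name is given, fall back to the model name; pass the resolved
# name to both AutoTokenizer and vLLM so prompt templating and generation agree.
from typing import Optional

from transformers import AutoTokenizer
from vllm import LLM


def build_engine(name: str, tokenizer_name: Optional[str] = None,
                 trust_remote_code: bool = False):
    tokenizer_name = tokenizer_name or name  # same fallback as the diff
    tokenizer = AutoTokenizer.from_pretrained(
        tokenizer_name, trust_remote_code=trust_remote_code
    )
    llm = LLM(
        model=name,
        max_model_len=2048,
        tokenizer=tokenizer_name,
        trust_remote_code=trust_remote_code,
    )
    return tokenizer, llm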