diff --git a/LeanCopilotTests/ModelAPIs.lean b/LeanCopilotTests/ModelAPIs.lean
index 652dc38..1af675a 100644
--- a/LeanCopilotTests/ModelAPIs.lean
+++ b/LeanCopilotTests/ModelAPIs.lean
@@ -99,4 +99,27 @@ def reproverExternalEncoder : ExternalEncoder := {
 -- Go to ./python and run `uvicorn server:app --port 23337`
 #eval encode reproverExternalEncoder "n : ℕ\n⊢ gcd n n = n"
+
+/--
+LLM APIs: OpenAI, Claude, etc.
+-/
+def gpt4 : ExternalGenerator := {
+  name := "gpt4"
+  host := "localhost"
+  port := 23337
+}
+
+#eval generate gpt4 "n : ℕ\n⊢ gcd n n = n"
+
+/--
+Math LLMs: InternLM, DeepSeekMath, etc.
+-/
+def internLM : ExternalGenerator := {
+  name := "InternLM"
+  host := "localhost"
+  port := 23337
+}
+
+#eval generate internLM "n : ℕ\n⊢ gcd n n = n"
+
 -/
diff --git a/python/README.md b/python/README.md
index c8f5cd4..7933cea 100644
--- a/python/README.md
+++ b/python/README.md
@@ -7,7 +7,7 @@ Python Server for External Models
 conda create --name lean-copilot python=3.10 python numpy
 conda activate lean-copilot
 pip install torch --index-url https://download.pytorch.org/whl/cu121 # Depending on whether you have CUDA and the CUDA version; see https://pytorch.org/.
-pip install fastapi uvicorn loguru transformers openai
+pip install fastapi uvicorn loguru transformers openai anthropic google-generativeai vllm
 ```
diff --git a/python/external_models/__init__.py b/python/external_models/__init__.py
new file mode 100644
index 0000000..193c35a
--- /dev/null
+++ b/python/external_models/__init__.py
@@ -0,0 +1,5 @@
+from .oai_runner import OpenAIRunner
+from .hf_runner import HFTacticGenerator
+from .vllm_runner import VLLMTacticGenerator
+from .claude_runner import ClaudeRunner
+from .gemini_runner import GeminiRunner
\ No newline at end of file
diff --git a/python/external_models/claude_runner.py b/python/external_models/claude_runner.py
new file mode 100644
index 0000000..bf01b8e
--- /dev/null
+++ b/python/external_models/claude_runner.py
@@ -0,0 +1,59 @@
+import torch
+import numpy as np
+from loguru import logger
+from typing import List, Tuple
+from abc import ABC, abstractmethod
+from transformers import (
+    AutoModelForCausalLM,
+    AutoModelForSeq2SeqLM,
+    AutoTokenizer,
+    AutoModelForTextEncoding,
+)
+import os
+try:
+    from anthropic import Anthropic
+except ImportError:
+    pass
+from .external_parser import *
+
+
+class ClaudeRunner(Generator, Transformer):
+    client = Anthropic(api_key=os.getenv("ANTHROPIC_KEY"))
+
+    def __init__(self, **args):
+        self.client_kwargs: dict = {
+            "model": args['model'],
+            "temperature": args['temperature'],
+            "max_tokens": args['max_tokens'],
+            "top_p": args['top_p'],
+        }
+        self.name = self.client_kwargs["model"]
+
+    def generate(self, input: str, target_prefix: str = "") -> List[Tuple[str, float]]:
+        prompt = pre_process_input(self.name, input + target_prefix)
+
+        try:
+            response = self.client.messages.create(
+                messages=[{"role": "user", "content": prompt}],
+                **self.client_kwargs,
+            )
+            content = response.content[0].text
+        except Exception as e:
+            raise e
+
+        results = [(post_process_output(self.name, content), 1.0)]  # Claude returns one completion per request
+        return choices_dedup(results)
+
+
+if __name__ == "__main__":
+
+    generation_kwargs = {"model": "claude-3-opus-20240229",
+                         "temperature": 0.9,
+                         "max_tokens": 1024,
+                         "top_p": 0.9,
+                         }
+
+    model = ClaudeRunner(**generation_kwargs)
+    print(model.generate("n : ℕ\n⊢ gcd n n = n"))
diff --git a/python/external_models/external_parser.py b/python/external_models/external_parser.py
new file mode 100644
index 0000000..11235d8
--- /dev/null
+++ b/python/external_models/external_parser.py
@@ -0,0 +1,81 @@
+import os
+import torch
+import argparse
+import numpy as np
+from loguru import logger
+from typing import List, Tuple
+from abc import ABC, abstractmethod
+from transformers import (
+    AutoModelForCausalLM,
+    AutoModelForSeq2SeqLM,
+    AutoTokenizer,
+    AutoModelForTextEncoding,
+)
+
+
+def get_cuda_if_available():
+    return torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+
+def pre_process_input(model_name, input):
+    if model_name == "internlm/internlm2-math-plus-1_8b":
+        prompt = "My LEAN 4 state is:\n```lean\n" + input + \
+            "```\nPlease predict a possible tactic to help me prove the theorem."
+        prompt = f"""<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"""
+    elif model_name == "gpt-3.5-turbo" or model_name == "gpt-4-turbo-preview":
+        prompt = 'Here is a theorem you need to prove in Lean:\n' + \
+            input + '\nNow you should suggest one line tactic in lean code:'
+    elif 'gemini' in model_name or "claude" in model_name:
+        prompt = 'Here is a theorem you need to prove in Lean:\n' + \
+            input + '\nNow you should suggest one line tactic in lean code:'
+    else:
+        raise NotImplementedError(f"External model '{model_name}' not supported")
+    return prompt
+
+
+def post_process_output(model_name, output):
+    if model_name == "internlm/internlm2-math-plus-1_8b":
+        result = output.split(
+            'assistant')[-1].split('lean')[-1].split('```')[0].split('\n')[1]
+    elif model_name == "gpt-3.5-turbo" or model_name == "gpt-4-turbo-preview":
+        result = output.split('lean')[-1].split('```')[0].split('\n')[1]
+    elif 'gemini' in model_name or "claude" in model_name:
+        result = output.split('lean')[-1].split('```')[0].split('\n')[1]
+    else:
+        raise NotImplementedError(f"External model '{model_name}' not supported")
+    return result
+
+
+def choices_dedup(output_list: List[Tuple[str, float]]) -> List[Tuple[str, float]]:
+    unique_data = {}
+    for item in output_list:
+        if item[0] not in unique_data or item[1] > unique_data[item[0]]:
+            unique_data[item[0]] = item[1]
+    sorted_data = sorted(unique_data.items(), key=lambda x: x[1], reverse=True)
+    return sorted_data
+
+
+class Generator(ABC):
+    @abstractmethod
+    def generate(self, input: str, target_prefix: str = "") -> List[Tuple[str, float]]:
+        pass
+
+
+class Encoder(ABC):
+    @abstractmethod
+    def encode(self, input: str) -> np.ndarray:
+        pass
+
+
+class Transformer:
+    def cuda(self) -> None:
+        self.model.cuda()
+
+    def cpu(self) -> None:
+        self.model.cpu()
+
+    @property
+    def device(self) -> torch.device:
+        return self.model.device
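These helpers are shared by all runners: `pre_process_input` wraps a goal state in a model-specific prompt, `post_process_output` pulls the single tactic line out of a ```lean fenced reply, and `choices_dedup` keeps the best score per distinct tactic and sorts descending. A quick sanity check of the expected round trip (not part of the patch; assumes it is run from `python/external_models/` so the module imports directly, and the completion text is made up for illustration):

```python
from external_parser import post_process_output, choices_dedup

# A typical chat completion wraps the tactic in a ```lean fenced block.
raw = "Sure! Here is a tactic:\n```lean\nsimp [Nat.gcd_self]\n```"
tactic = post_process_output("gpt-4-turbo-preview", raw)
assert tactic == "simp [Nat.gcd_self]"

# Duplicate tactics keep their highest score and come back sorted by score, descending.
ranked = choices_dedup([("simp [Nat.gcd_self]", 0.3), ("rfl", 0.5), ("simp [Nat.gcd_self]", 0.9)])
assert ranked == [("simp [Nat.gcd_self]", 0.9), ("rfl", 0.5)]
```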
+ { + "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", + "threshold": "BLOCK_NONE", + }, + { + "category": "HARM_CATEGORY_DANGEROUS_CONTENT", + "threshold": "BLOCK_NONE", + },] + def __init__(self, **args): + + self.client_kwargs: dict[str | str] = { + "model": args['model'], + "temperature": args['temperature'], + "max_tokens": args['max_tokens'], + "top_p": args['top_p'], + + } + self.name = self.client_kwargs["model"] + + self.client = genai.GenerativeModel(args['model']) + self.generation_config = GenerationConfig( + candidate_count=1, + max_output_tokens=args['max_tokens'], + temperature=args['temperature'], + top_p=args['top_p'], + ) + def generate(self, input: str, target_prefix: str = "") -> List[Tuple[str, float]]: + prompt = pre_process_input(self.name, input + target_prefix) + + + response = self.client.generate_content( + prompt, + generation_config=self.generation_config, + safety_settings=GeminiRunner.safety_settings, + ) + + + + results = [(post_process_output(self.name, response.text),1.0)]# current gemini only supports one output + return choices_dedup(results) + + +if __name__ == "__main__": + + generation_kwargs = {"model": 'gemini-1.0-pro', + "temperature": 0.9, + "max_tokens": 1024, + "top_p": 0.9, + } + + model = GeminiRunner(**generation_kwargs) + print(model.generate("n : ℕ\n⊢ gcd n n = n")) diff --git a/python/external_models/hf_runner.py b/python/external_models/hf_runner.py new file mode 100644 index 0000000..94dadcc --- /dev/null +++ b/python/external_models/hf_runner.py @@ -0,0 +1,96 @@ +import torch +import numpy as np +from loguru import logger +from typing import List, Tuple +from abc import ABC, abstractmethod +from transformers import ( + AutoModelForCausalLM, + AutoModelForSeq2SeqLM, + AutoTokenizer, + AutoModelForTextEncoding, +) +import os +import numpy as np + +import openai +from openai import OpenAI +from .external_parser import * + + +class HFTacticGenerator(Generator, Transformer): + def __init__( + self, + **args + ) -> None: + self.name = args['model'] + self.tokenizer = AutoTokenizer.from_pretrained( + self.name, trust_remote_code=True) + device = args['device'] + if device == "auto": + device = get_cuda_if_available() + else: + device = torch.device(device) + logger.info(f"Loading {self.name} on {device}") + self.model = AutoModelForCausalLM.from_pretrained( + self.name, trust_remote_code=True).to(device) + + self.generation_args: dict[str | str] = { + "do_sample": args["do_sample"], + "temperature": args['temperature'], # chat default is 0.8 + "max_new_tokens": args['max_new_tokens'], + "top_p": args['top_p'], # chat default is 0.8 + # "length_penalty": args["length_penalty"], + "num_return_sequences": args['num_return_sequences'], + # "num_beams": self.num_return_sequences, + # Here if we use beam search for llms the output are not diverse(few tactics are provided). 
+ "output_scores": args["output_scores"], + "output_logits": args["output_logits"], + "return_dict_in_generate": args["return_dict_in_generate"], + } + + def generate(self, input: str, target_prefix: str = "") -> List[Tuple[str, float]]: + prompt = input + target_prefix + '''prompt= 'Here is a theorom you need to prove in Lean:\n'+prompt+'\nNow you should suggest one line tactic in lean code:' + prompt = f"""<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n""" + ''' + prompt = pre_process_input(self.name, prompt) + + self.model = self.model.eval() + + tokenized_input = self.tokenizer(prompt, return_tensors="pt") + eos_token_id = [self.tokenizer.eos_token_id, + self.tokenizer.convert_tokens_to_ids(["<|im_end|>"])[0]] + outputs = self.model.generate( + tokenized_input.input_ids.to(self.device), + eos_token_id=eos_token_id, + **self.generation_args + ) + response = self.tokenizer.batch_decode( + outputs['sequences'], skip_special_tokens=True) + result = [] + index = 0 + for out, score in zip(response, outputs.scores): + out = post_process_output(self.name, out) + result.append((out, score[index].exp().sum().log().cpu().item())) + index += 1 + result = choices_dedup(result) + return result + + +if __name__ == "__main__": + + generation_kwargs = {"model": "internlm/internlm2-math-plus-1_8b", + "temperature": 0.6, + "max_new_tokens": 1024, + "top_p": 0.9, + "length_penalty": 0, + "num_return_sequences": 64, + "do_sample": True, + "output_scores": True, + "output_logits": False, + "return_dict_in_generate": True, + "device": "auto", + } + model = HFTacticGenerator(**generation_kwargs) + model.cuda() + print(model.generate("n : ℕ\n⊢ gcd n n = n")) diff --git a/python/external_models/oai_runner.py b/python/external_models/oai_runner.py new file mode 100644 index 0000000..b2231c6 --- /dev/null +++ b/python/external_models/oai_runner.py @@ -0,0 +1,87 @@ +import torch +import numpy as np +from loguru import logger +from typing import List, Tuple +from abc import ABC, abstractmethod +from transformers import ( + AutoModelForCausalLM, + AutoModelForSeq2SeqLM, + AutoTokenizer, + AutoModelForTextEncoding, +) +import os +import numpy as np +import openai +from openai import OpenAI +from .external_parser import * + + +class OpenAIRunner(Generator, Transformer): + client = OpenAI( + api_key=os.getenv("OPENAI_API_KEY"), + ) + + def __init__(self, **args): + self.client_kwargs: dict[str | str] = { + "model": args['model'], + "temperature": args['temperature'], + "max_tokens": args['max_tokens'], + "top_p": args['top_p'], + "frequency_penalty": 0, + "presence_penalty": 0, + "n": args['num_return_sequences'], + "timeout": args['openai_timeout'], + # "stop": args.stop, --> stop is only used for base models currently + } + self.name = self.client_kwargs["model"] + + def generate(self, input: str, target_prefix: str = "") -> List[Tuple[str, float]]: + prompt = pre_process_input(self.name, input + target_prefix) + prompt = [ + {"role": "user", "content": f"{prompt}"}, + + ] + try: + response = OpenAIRunner.client.chat.completions.create( + messages=prompt, + logprobs=True, + **self.client_kwargs, + ) + except ( + openai.APIError, + openai.RateLimitError, + openai.InternalServerError, + openai.OpenAIError, + openai.APIStatusError, + openai.APITimeoutError, + openai.InternalServerError, + openai.APIConnectionError, + ) as e: + print("Exception: ", repr(e)) + print("Consider reducing the number of parallel processes.") + return OpenAIRunner.generate(self, input, target_prefix) + except Exception as e: + 
print(f"Failed to run the model for {prompt}!") + print("Exception: ", repr(e)) + raise e + + results = [(post_process_output(self.name, c.message.content), np.exp(-np.mean([ + token.logprob for token in c.logprobs.content])))for c in response.choices] + return choices_dedup(results) + + +if __name__ == "__main__": + + generation_kwargs = {"model": "gpt-4-turbo-preview", + "temperature": 0.9, + "max_tokens": 1024, + "top_p": 0.9, + "frequency_penalty": 0, + "presence_penalty": 0, + "num_return_sequences": 16, + "openai_timeout": 45, + # "stop": args.stop, --> stop is only used for base models currently + } + + model = OpenAIRunner(**generation_kwargs) + print(model.generate("n : ℕ\n⊢ gcd n n = n")) diff --git a/python/external_models/vllm_runner.py b/python/external_models/vllm_runner.py new file mode 100644 index 0000000..6ac5206 --- /dev/null +++ b/python/external_models/vllm_runner.py @@ -0,0 +1,114 @@ +import torch +import numpy as np +from loguru import logger +from typing import List, Tuple +from abc import ABC, abstractmethod +from transformers import ( + AutoModelForCausalLM, + AutoModelForSeq2SeqLM, + AutoTokenizer, + AutoModelForTextEncoding, +) +import os +import numpy as np + +try: + from vllm import LLM, SamplingParams +except ImportError as e: + print("Cannot import vllm") + pass + +from .external_parser import * + + +class VLLMTacticGenerator(Generator, Transformer): + def __init__( + self, + **args + ) -> None: + + self.name = args['model'] + self.llm = LLM( + model=self.name, + tokenizer=self.name, + tensor_parallel_size=args["tensor_parallel_size"], + # dtype=args.dtype, + enforce_eager=True, + max_model_len=4096, + disable_custom_all_reduce=False, + # enable_prefix_caching=args.enable_prefix_caching, + trust_remote_code=True, + ) + self.sampling_params = SamplingParams( + n=args['n'], + max_tokens=args['max_tokens'], + temperature=args['temperature'], + top_p=args['top_p'], + frequency_penalty=0, + presence_penalty=0, + ) + + self.tokenizer = AutoTokenizer.from_pretrained( + self.name, trust_remote_code=True) + device = args['device'] + if device == "auto": + device = get_cuda_if_available() + else: + device = torch.device(device) + logger.info(f"Loading {self.name} on {device}") + # self.model = AutoModelForCausalLM.from_pretrained(self.name, trust_remote_code=True).to(device) + + '''self.generation_args: dict[str | str] = { + "do_sample":args["do_sample"], + "temperature": args['temperature'],#chat default is 0.8 + "max_new_tokens": args['max_new_tokens'], + "top_p": args['top_p'],#chat default is 0.8 + #"length_penalty": args["length_penalty"], + "num_return_sequences": args['num_return_sequences'], + #"num_beams": self.num_return_sequences, + ##Here if we use beam search for llms the output are not diverse(few tactics are provided). 
+ "output_scores":args["output_scores"], + "output_logits":args["output_logits"], + "return_dict_in_generate":args["return_dict_in_generate"], + }''' + + def generate(self, input: str, target_prefix: str = "") -> List[Tuple[str, float]]: + prompt = input + target_prefix + '''prompt= 'Here is a theorom you need to prove in Lean:\n'+prompt+'\nNow you should suggest one line tactic in lean code:' + prompt = f"""<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n""" + ''' + prompt = pre_process_input(self.name, prompt) + + # self.model = self.model.eval() + + vllm_outputs = self.llm.generate(prompt, self.sampling_params) + # pdb.set_trace() + # print(vllm_outputs) + # exit() + result = [] + for output in vllm_outputs[0].outputs: # bsz=1 for now + out = output.text.split('<|im_end|>')[0] + result.append((post_process_output(self.name, out), + np.exp(output.cumulative_logprob))) + + result = choices_dedup(result) + return result + + +if __name__ == "__main__": + + generation_kwargs = {"model": "internlm/internlm2-math-plus-1_8b", + "tensor_parallel_size": 2, + "temperature": 0.6, + "max_tokens": 1024, + "top_p": 0.9, + "length_penalty": 0, + "n": 32, + "do_sample": True, + "output_scores": True, + "output_logits": False, + "return_dict_in_generate": True, + "device": "auto", + } + model = VLLMTacticGenerator(**generation_kwargs) + print(model.generate("n : ℕ\n⊢ gcd n n = n")) diff --git a/python/server.py b/python/server.py index 81a4b14..84c9733 100644 --- a/python/server.py +++ b/python/server.py @@ -3,13 +3,39 @@ from pydantic import BaseModel from models import * +from external_models import * app = FastAPI() models = { # "EleutherAI/llemma_7b": DecoderOnlyTransformer( # "EleutherAI/llemma_7b", num_return_sequences=2, max_length=64, device="auto" - #), + # ), + + "gpt4": OpenAIRunner( + model="gpt-4-turbo-preview", + temperature=0.9, + max_tokens=1024, + top_p=0.9, + frequency_penalty=0, + presence_penalty=0, + num_return_sequences=16, + openai_timeout=45, + ), + "InternLM": VLLMTacticGenerator( + model="internlm/internlm2-math-plus-1_8b", + tensor_parallel_size=2, + temperature=0.6, + max_tokens=1024, + top_p=0.9, + length_penalty=0, + n=32, + do_sample=True, + output_scores=True, + output_logits=False, + return_dict_in_generate=True, + device="auto", + ), "wellecks/llmstep-mathlib4-pythia2.8b": PythiaTacticGenerator( num_return_sequences=32, max_length=1024, device="auto" ),