support external models: vllm huggingface openai #97

Merged Sep 1, 2024 · 9 commits
Changes from 8 commits
23 changes: 23 additions & 0 deletions LeanCopilotTests/ModelAPIs.lean
@@ -99,4 +99,27 @@ def reproverExternalEncoder : ExternalEncoder := {

-- Go to ./python and run `uvicorn server:app --port 23337`
#eval encode reproverExternalEncoder "n : ℕ\n⊢ gcd n n = n"

/--
LLM APIs: OpenAI, Claude, etc.
-/
def gpt4 : ExternalGenerator := {
name := "gpt4"
host := "localhost"
port := 23337
}

#eval generate gpt4 "n : ℕ\n⊢ gcd n n = n"

/--
Math LLMs: InternLM, DeepSeekMath, etc.
-/
def internLM : ExternalGenerator := {
name := "InternLM"
host := "localhost"
port := 23337
}

#eval generate internLM "n : ℕ\n⊢ gcd n n = n"

-/
2 changes: 1 addition & 1 deletion python/README.md
@@ -7,7 +7,7 @@ Python Server for External Models
conda create --name lean-copilot python=3.10 python numpy
conda activate lean-copilot
pip install torch --index-url https://download.pytorch.org/whl/cu121 # Depending on whether you have CUDA and the CUDA version; see https://pytorch.org/.
pip install fastapi uvicorn loguru transformers openai
pip install fastapi uvicorn loguru transformers openai anthropic google-generativeai vllm
```
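
Then, from this directory, start the server that the `ExternalGenerator` definitions in `LeanCopilotTests/ModelAPIs.lean` point at (port 23337):

```
uvicorn server:app --port 23337
```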


5 changes: 5 additions & 0 deletions python/external_models/__init__.py
@@ -0,0 +1,5 @@
from .oai_runner import OpenAIRunner
from .hf_runner import HFTacticGenerator
from .vllm_runner import VLLMTacticGenerator
from .claude_runner import ClaudeRunner
from .gemini_runner import GeminiRunner
59 changes: 59 additions & 0 deletions python/external_models/claude_runner.py
@@ -0,0 +1,59 @@
import os
from typing import Any, List, Tuple

try:
    from anthropic import Anthropic
except ImportError:
    pass

from .external_parser import *


class ClaudeRunner(Generator, Transformer):
client = Anthropic(api_key=os.getenv("ANTHROPIC_KEY"))

    def __init__(self, **args):
        self.client_kwargs: dict[str, Any] = {
            "model": args['model'],
            "temperature": args['temperature'],
            "max_tokens": args['max_tokens'],
            "top_p": args['top_p'],
        }
        self.name = self.client_kwargs["model"]

    def generate(self, input: str, target_prefix: str = "") -> List[Tuple[str, float]]:
        prompt = pre_process_input(self.name, input + target_prefix)

        # Claude 3 models are served through the Messages API; model, temperature,
        # max_tokens, and top_p are passed straight from client_kwargs.
        response = self.client.messages.create(
            messages=[{"role": "user", "content": prompt}],
            **self.client_kwargs,
        )
        content = response.content[0].text

        # The API returns a single completion, so it is given a fixed score of 1.0.
        results = [(post_process_output(self.name, content), 1.0)]
        return choices_dedup(results)


if __name__ == "__main__":

generation_kwargs = {"model": "claude-3-opus",
"temperature": 0.9,
"max_tokens": 1024,
"top_p": 0.9,
}

model = ClaudeRunner(**generation_kwargs)
print(model.generate("n : ℕ\n⊢ gcd n n = n"))
81 changes: 81 additions & 0 deletions python/external_models/external_parser.py
@@ -0,0 +1,81 @@
import os
import torch
import argparse
import numpy as np
from loguru import logger
from typing import List, Tuple
from abc import ABC, abstractmethod
from transformers import (
AutoModelForCausalLM,
AutoModelForSeq2SeqLM,
AutoTokenizer,
AutoModelForTextEncoding,
)


def get_cuda_if_available():
return torch.device("cuda" if torch.cuda.is_available() else "cpu")


def pre_process_input(model_name, input):
    if model_name == "internlm/internlm2-math-plus-1_8b":
        prompt = "My LEAN 4 state is:\n```lean\n" + input + \
            "```\nPlease predict a possible tactic to help me prove the theorem."
        prompt = f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
    elif model_name in ("gpt-3.5-turbo", "gpt-4-turbo-preview") \
            or "gemini" in model_name or "claude" in model_name:
        prompt = "Here is a theorem you need to prove in Lean:\n" + \
            input + "\nNow you should suggest one line tactic in lean code:"
    else:
        raise NotImplementedError(f"External model '{model_name}' not supported")
    return prompt


def post_process_output(model_name, output):
    if model_name == "internlm/internlm2-math-plus-1_8b":
        # Keep the assistant turn, then the first line inside its ```lean fence.
        result = output.split(
            'assistant')[-1].split('lean')[-1].split('```')[0].split('\n')[1]
    elif model_name in ("gpt-3.5-turbo", "gpt-4-turbo-preview") \
            or "gemini" in model_name or "claude" in model_name:
        # Keep the first line inside the reply's ```lean fence.
        result = output.split('lean')[-1].split('```')[0].split('\n')[1]
    else:
        raise NotImplementedError(f"External model '{model_name}' not supported")
    return result
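# Example: a reply like "```lean\nsimp [Nat.gcd_self]\n```" is reduced by
# post_process_output above to the single tactic line "simp [Nat.gcd_self]".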


def choices_dedup(output_list: List[tuple[str, float]]) -> List[tuple[str, float]]:
unique_data = {}
for item in output_list:
if item[0] not in unique_data or item[1] > unique_data[item[0]]:
unique_data[item[0]] = item[1]
sorted_data = sorted(unique_data.items(), key=lambda x: x[1], reverse=True)
return sorted_data
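# Example (hypothetical scores): choices_dedup([("simp", 0.2), ("rfl", 0.5), ("simp", 0.7)])
# keeps the best score per distinct suggestion and returns [("simp", 0.7), ("rfl", 0.5)].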


class Generator(ABC):
@abstractmethod
def generate(self, input: str, target_prefix: str = "") -> List[Tuple[str, float]]:
pass


class Encoder(ABC):
@abstractmethod
def encode(self, input: str) -> np.ndarray:
pass


class Transformer:
def cuda(self) -> None:
self.model.cuda()

def cpu(self) -> None:
self.model.cpu()

@property
def device(self) -> torch.device:
return self.model.device


85 changes: 85 additions & 0 deletions python/external_models/gemini_runner.py
@@ -0,0 +1,85 @@
import os
from typing import Any, List, Tuple

from .external_parser import *

try:
    import google.generativeai as genai
    from google.generativeai import GenerationConfig
except ImportError:
    pass


class GeminiRunner(Generator, Transformer):
    # Configure the shared API key once at class-definition time.
    genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
safety_settings = [
{
"category": "HARM_CATEGORY_HARASSMENT",
"threshold": "BLOCK_NONE",
},
{
"category": "HARM_CATEGORY_HATE_SPEECH",
"threshold": "BLOCK_NONE",
},
{
"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
"threshold": "BLOCK_NONE",
},
{
"category": "HARM_CATEGORY_DANGEROUS_CONTENT",
"threshold": "BLOCK_NONE",
        },
    ]

    def __init__(self, **args):
        self.client_kwargs: dict[str, Any] = {
            "model": args['model'],
            "temperature": args['temperature'],
            "max_tokens": args['max_tokens'],
            "top_p": args['top_p'],
        }
        self.name = self.client_kwargs["model"]

self.client = genai.GenerativeModel(args['model'])
self.generation_config = GenerationConfig(
candidate_count=1,
max_output_tokens=args['max_tokens'],
temperature=args['temperature'],
top_p=args['top_p'],
)

    def generate(self, input: str, target_prefix: str = "") -> List[Tuple[str, float]]:
        prompt = pre_process_input(self.name, input + target_prefix)

        response = self.client.generate_content(
            prompt,
            generation_config=self.generation_config,
            safety_settings=GeminiRunner.safety_settings,
        )

        # A single candidate is requested (candidate_count=1), so it gets a fixed score of 1.0.
        results = [(post_process_output(self.name, response.text), 1.0)]
        return choices_dedup(results)


if __name__ == "__main__":

generation_kwargs = {"model": 'gemini-1.0-pro',
"temperature": 0.9,
"max_tokens": 1024,
"top_p": 0.9,
}

model = GeminiRunner(**generation_kwargs)
print(model.generate("n : ℕ\n⊢ gcd n n = n"))
101 changes: 101 additions & 0 deletions python/external_models/hf_runner.py
@@ -0,0 +1,101 @@
import torch
from typing import Any, List, Tuple
from loguru import logger
from transformers import AutoModelForCausalLM, AutoTokenizer

from .external_parser import *


class HFTacticGenerator(Generator, Transformer):
def __init__(
self,
**args
) -> None:
self.name = args['model']
self.tokenizer = AutoTokenizer.from_pretrained(
self.name, trust_remote_code=True)
device = args['device']
if device == "auto":
device = get_cuda_if_available()
else:
device = torch.device(device)
logger.info(f"Loading {self.name} on {device}")
self.model = AutoModelForCausalLM.from_pretrained(
self.name, trust_remote_code=True).to(device)

        self.generation_args: dict[str, Any] = {
            "do_sample": args["do_sample"],
            "temperature": args['temperature'],  # chat default is 0.8
            "max_new_tokens": args['max_new_tokens'],
            "top_p": args['top_p'],  # chat default is 0.8
            # "length_penalty": args["length_penalty"],
            "num_return_sequences": args['num_return_sequences'],
            # "num_beams": self.num_return_sequences,
            # Beam search is not used here: with LLMs it tends to return only a few
            # distinct tactics, so sampling gives more diverse suggestions.
            "output_scores": args["output_scores"],
            "output_logits": args["output_logits"],
            "return_dict_in_generate": args["return_dict_in_generate"],
        }

    def generate(self, input: str, target_prefix: str = "") -> List[Tuple[str, float]]:
        prompt = pre_process_input(self.name, input + target_prefix)

self.model = self.model.eval()

tokenized_input = self.tokenizer(prompt, return_tensors="pt")
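        # Stop generation at the model's EOS token or the chat end-of-turn marker <|im_end|>.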
eos_token_id = [self.tokenizer.eos_token_id,
self.tokenizer.convert_tokens_to_ids(["<|im_end|>"])[0]]
outputs = self.model.generate(
tokenized_input.input_ids.to(self.device),
eos_token_id=eos_token_id,
**self.generation_args
)
# outputs = outputs[0].tolist()[len(tokenized_input.input_ids[0]) :]
response = self.tokenizer.batch_decode(
outputs['sequences'], skip_special_tokens=True)
# response = response.split("<|im_end|>")[0]
        result = []
        # With beam search this would instead iterate over zip(response, outputs.sequences_scores).
        index = 0
        for out, score in zip(response, outputs.scores):
            out = post_process_output(self.name, out)
            result.append((out, score[index].exp().sum().log().cpu().item()))
            # With beam search: result.append((out, score.exp().item()))
            index += 1
result = choices_dedup(result)
return result


if __name__ == "__main__":

generation_kwargs = {"model": "internlm/internlm2-math-plus-1_8b",
"temperature": 0.6,
"max_new_tokens": 1024,
"top_p": 0.9,
"length_penalty": 0,
"num_return_sequences": 64,
"do_sample": True,
"output_scores": True,
"output_logits": False,
"return_dict_in_generate": True,
"device": "auto",
}
model = HFTacticGenerator(**generation_kwargs)
model.cuda()
print(model.generate("n : ℕ\n⊢ gcd n n = n"))