support external models: vllm huggingface openai #97

Merged Sep 1, 2024 · 9 commits
Changes from 8 commits
23 changes: 23 additions & 0 deletions LeanCopilotTests/ModelAPIs.lean
@@ -99,4 +99,27 @@ def reproverExternalEncoder : ExternalEncoder := {

-- Go to ./python and run `uvicorn server:app --port 23337`
#eval encode reproverExternalEncoder "n : ℕ\n⊢ gcd n n = n"

/--
LLM APIs: OpenAI, Claude, etc.
-/
def gpt4 : ExternalGenerator := {
name := "gpt4"
host := "localhost"
port := 23337
}

#eval generate gpt4 "n : ℕ\n⊢ gcd n n = n"

/--
Math LLMs: InternLM, DeepSeekMath, etc.
-/
def internLM : ExternalGenerator := {
name := "InternLM"
host := "localhost"
port := 23337
}

#eval generate internLM "n : ℕ\n⊢ gcd n n = n"

-/
2 changes: 1 addition & 1 deletion python/README.md
@@ -7,7 +7,7 @@ Python Server for External Models
conda create --name lean-copilot python=3.10 python numpy
conda activate lean-copilot
pip install torch --index-url https://download.pytorch.org/whl/cu121 # Depending on whether you have CUDA and the CUDA version; see https://pytorch.org/.
pip install fastapi uvicorn loguru transformers openai
pip install fastapi uvicorn loguru transformers openai anthropic google-generativeai vllm
```
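
Then, from this directory, start the server that the `ExternalGenerator` definitions in `LeanCopilotTests/ModelAPIs.lean` point at (port 23337):

```
uvicorn server:app --port 23337
```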


5 changes: 5 additions & 0 deletions python/external_models/__init__.py
@@ -0,0 +1,5 @@
from .oai_runner import OpenAIRunner
from .hf_runner import HFTacticGenerator
from .vllm_runner import VLLMTacticGenerator
from .claude_runner import ClaudeRunner
from .gemini_runner import GeminiRunner
59 changes: 59 additions & 0 deletions python/external_models/claude_runner.py
@@ -0,0 +1,59 @@
import os
from typing import Any, List, Tuple

try:
    from anthropic import Anthropic
except ImportError:
    pass

from .external_parser import *


class ClaudeRunner(Generator, Transformer):
client = Anthropic(api_key=os.getenv("ANTHROPIC_KEY"))

    def __init__(self, **args):
        self.client_kwargs: dict[str, Any] = {
            "model": args['model'],
            "temperature": args['temperature'],
            "max_tokens": args['max_tokens'],
            "top_p": args['top_p'],
        }
        self.name = self.client_kwargs["model"]

    def generate(self, input: str, target_prefix: str = "") -> List[Tuple[str, float]]:
        prompt = pre_process_input(self.name, input + target_prefix)

        # Claude 3 models are served through the Messages API; model, temperature,
        # max_tokens, and top_p are passed straight from client_kwargs.
        response = self.client.messages.create(
            messages=[{"role": "user", "content": prompt}],
            **self.client_kwargs,
        )
        content = response.content[0].text

        # The API returns a single completion, so it is given a fixed score of 1.0.
        results = [(post_process_output(self.name, content), 1.0)]
        return choices_dedup(results)


if __name__ == "__main__":

generation_kwargs = {"model": "claude-3-opus",
"temperature": 0.9,
"max_tokens": 1024,
"top_p": 0.9,
}

model = ClaudeRunner(**generation_kwargs)
print(model.generate("n : ℕ\n⊢ gcd n n = n"))
81 changes: 81 additions & 0 deletions python/external_models/external_parser.py
@@ -0,0 +1,81 @@
import os
import torch
import argparse
import numpy as np
from loguru import logger
from typing import List, Tuple
from abc import ABC, abstractmethod
from transformers import (
AutoModelForCausalLM,
AutoModelForSeq2SeqLM,
AutoTokenizer,
AutoModelForTextEncoding,
)


def get_cuda_if_available():
return torch.device("cuda" if torch.cuda.is_available() else "cpu")


def pre_process_input(model_name, input):
    if model_name == "internlm/internlm2-math-plus-1_8b":
        prompt = "My LEAN 4 state is:\n```lean\n" + input + \
            "```\nPlease predict a possible tactic to help me prove the theorem."
        prompt = f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
    elif model_name in ("gpt-3.5-turbo", "gpt-4-turbo-preview") \
            or "gemini" in model_name or "claude" in model_name:
        prompt = "Here is a theorem you need to prove in Lean:\n" + \
            input + "\nNow you should suggest one line tactic in lean code:"
    else:
        raise NotImplementedError(f"External model '{model_name}' not supported")
    return prompt


def post_process_output(model_name, output):
    if model_name == "internlm/internlm2-math-plus-1_8b":
        # Keep the assistant turn, then the first line inside its ```lean fence.
        result = output.split(
            'assistant')[-1].split('lean')[-1].split('```')[0].split('\n')[1]
    elif model_name in ("gpt-3.5-turbo", "gpt-4-turbo-preview") \
            or "gemini" in model_name or "claude" in model_name:
        # Keep the first line inside the reply's ```lean fence.
        result = output.split('lean')[-1].split('```')[0].split('\n')[1]
    else:
        raise NotImplementedError(f"External model '{model_name}' not supported")
    return result
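# Example: a reply like "```lean\nsimp [Nat.gcd_self]\n```" is reduced by
# post_process_output above to the single tactic line "simp [Nat.gcd_self]".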


def choices_dedup(output_list: List[tuple[str, float]]) -> List[tuple[str, float]]:
unique_data = {}
for item in output_list:
if item[0] not in unique_data or item[1] > unique_data[item[0]]:
unique_data[item[0]] = item[1]
sorted_data = sorted(unique_data.items(), key=lambda x: x[1], reverse=True)
return sorted_data
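# Example (hypothetical scores): choices_dedup([("simp", 0.2), ("rfl", 0.5), ("simp", 0.7)])
# keeps the best score per distinct suggestion and returns [("simp", 0.7), ("rfl", 0.5)].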


class Generator(ABC):
@abstractmethod
def generate(self, input: str, target_prefix: str = "") -> List[Tuple[str, float]]:
pass


class Encoder(ABC):
@abstractmethod
def encode(self, input: str) -> np.ndarray:
pass


class Transformer:
def cuda(self) -> None:
self.model.cuda()

def cpu(self) -> None:
self.model.cpu()

@property
def device(self) -> torch.device:
return self.model.device


85 changes: 85 additions & 0 deletions python/external_models/gemini_runner.py
@@ -0,0 +1,85 @@
import os
from typing import Any, List, Tuple

from .external_parser import *

try:
    import google.generativeai as genai
    from google.generativeai import GenerationConfig
except ImportError:
    pass


class GeminiRunner(Generator, Transformer):
    # Configure the shared API key once at class-definition time.
    genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
safety_settings = [
{
"category": "HARM_CATEGORY_HARASSMENT",
"threshold": "BLOCK_NONE",
},
{
"category": "HARM_CATEGORY_HATE_SPEECH",
"threshold": "BLOCK_NONE",
},
{
"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
"threshold": "BLOCK_NONE",
},
{
"category": "HARM_CATEGORY_DANGEROUS_CONTENT",
"threshold": "BLOCK_NONE",
        },
    ]

    def __init__(self, **args):
        self.client_kwargs: dict[str, Any] = {
            "model": args['model'],
            "temperature": args['temperature'],
            "max_tokens": args['max_tokens'],
            "top_p": args['top_p'],
        }
        self.name = self.client_kwargs["model"]

self.client = genai.GenerativeModel(args['model'])
self.generation_config = GenerationConfig(
candidate_count=1,
max_output_tokens=args['max_tokens'],
temperature=args['temperature'],
top_p=args['top_p'],
)

    def generate(self, input: str, target_prefix: str = "") -> List[Tuple[str, float]]:
        prompt = pre_process_input(self.name, input + target_prefix)

        response = self.client.generate_content(
            prompt,
            generation_config=self.generation_config,
            safety_settings=GeminiRunner.safety_settings,
        )

        # A single candidate is requested (candidate_count=1), so it gets a fixed score of 1.0.
        results = [(post_process_output(self.name, response.text), 1.0)]
        return choices_dedup(results)


if __name__ == "__main__":

generation_kwargs = {"model": 'gemini-1.0-pro',
"temperature": 0.9,
"max_tokens": 1024,
"top_p": 0.9,
}

model = GeminiRunner(**generation_kwargs)
print(model.generate("n : ℕ\n⊢ gcd n n = n"))
101 changes: 101 additions & 0 deletions python/external_models/hf_runner.py
@@ -0,0 +1,101 @@
import torch
from typing import Any, List, Tuple
from loguru import logger
from transformers import AutoModelForCausalLM, AutoTokenizer

from .external_parser import *


class HFTacticGenerator(Generator, Transformer):
def __init__(
self,
**args
) -> None:
self.name = args['model']
self.tokenizer = AutoTokenizer.from_pretrained(
self.name, trust_remote_code=True)
device = args['device']
if device == "auto":
device = get_cuda_if_available()
else:
device = torch.device(device)
logger.info(f"Loading {self.name} on {device}")
self.model = AutoModelForCausalLM.from_pretrained(
self.name, trust_remote_code=True).to(device)

        self.generation_args: dict[str, Any] = {
            "do_sample": args["do_sample"],
            "temperature": args['temperature'],  # chat default is 0.8
            "max_new_tokens": args['max_new_tokens'],
            "top_p": args['top_p'],  # chat default is 0.8
            # "length_penalty": args["length_penalty"],
            "num_return_sequences": args['num_return_sequences'],
            # "num_beams": self.num_return_sequences,
            # Beam search is not used here: with LLMs it tends to return only a few
            # distinct tactics, so sampling gives more diverse suggestions.
            "output_scores": args["output_scores"],
            "output_logits": args["output_logits"],
            "return_dict_in_generate": args["return_dict_in_generate"],
        }

    def generate(self, input: str, target_prefix: str = "") -> List[Tuple[str, float]]:
        prompt = pre_process_input(self.name, input + target_prefix)

self.model = self.model.eval()

tokenized_input = self.tokenizer(prompt, return_tensors="pt")
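        # Stop generation at the model's EOS token or the chat end-of-turn marker <|im_end|>.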
eos_token_id = [self.tokenizer.eos_token_id,
self.tokenizer.convert_tokens_to_ids(["<|im_end|>"])[0]]
outputs = self.model.generate(
tokenized_input.input_ids.to(self.device),
eos_token_id=eos_token_id,
**self.generation_args
)
# outputs = outputs[0].tolist()[len(tokenized_input.input_ids[0]) :]
response = self.tokenizer.batch_decode(
outputs['sequences'], skip_special_tokens=True)
# response = response.split("<|im_end|>")[0]
        result = []
        # With beam search this would instead iterate over zip(response, outputs.sequences_scores).
        index = 0
        for out, score in zip(response, outputs.scores):
            out = post_process_output(self.name, out)
            result.append((out, score[index].exp().sum().log().cpu().item()))
            # With beam search: result.append((out, score.exp().item()))
            index += 1
result = choices_dedup(result)
return result


if __name__ == "__main__":

generation_kwargs = {"model": "internlm/internlm2-math-plus-1_8b",
"temperature": 0.6,
"max_new_tokens": 1024,
"top_p": 0.9,
"length_penalty": 0,
"num_return_sequences": 64,
"do_sample": True,
"output_scores": True,
"output_logits": False,
"return_dict_in_generate": True,
"device": "auto",
}
model = HFTacticGenerator(**generation_kwargs)
model.cuda()
print(model.generate("n : ℕ\n⊢ gcd n n = n"))