diff --git a/README.md b/README.md
index ab24691c..0a74bda8 100644
--- a/README.md
+++ b/README.md
@@ -76,28 +76,28 @@ Use Claude 3 with Vision to see how it stacks up to GPT-4-Vision at operating a
 operate -m claude-3
 ```
 
-#### Try LLaVa Hosted Through Ollama `-m llava`
-If you wish to experiment with the Self-Operating Computer Framework using LLaVA on your own machine, you can with Ollama!
+#### Try a Model Hosted Through Ollama `-m llama3.2-vision`
+If you wish to experiment with the Self-Operating Computer Framework using a locally hosted model such as LLaVA, you can do so with Ollama!
 *Note: Ollama currently only supports MacOS and Linux. Windows now in Preview*
 
 First, install Ollama on your machine from https://ollama.ai/download.
 
-Once Ollama is installed, pull the LLaVA model:
+Once Ollama is installed, pull a vision model:
 ```
-ollama pull llava
+ollama pull llama3.2-vision
 ```
 This will download the model on your machine which takes approximately 5 GB of storage.
 
-When Ollama has finished pulling LLaVA, start the server:
+When Ollama has finished pulling `llama3.2-vision`, start the server:
 ```
 ollama serve
 ```
 
-That's it! Now start `operate` and select the LLaVA model:
+That's it! Now start `operate` and select the model:
 ```
-operate -m llava
+operate -m llama3.2-vision
 ```
 
-**Important:** Error rates when using LLaVA are very high. This is simply intended to be a base to build off of as local multimodal models improve over time.
+**Important:** Error rates when using self-hosted models are very high. This is simply intended as a base to build on as local multimodal models improve over time.
 
 Learn more about Ollama at its [GitHub Repository](https://www.github.com/ollama/ollama)
diff --git a/operate/models/apis.py b/operate/models/apis.py
index d0ccb0c4..1b8a2fc4 100644
--- a/operate/models/apis.py
+++ b/operate/models/apis.py
@@ -50,14 +50,11 @@ async def get_next_action(model, messages, objective, session_id):
         return "coming soon"
     if model == "gemini-pro-vision":
         return call_gemini_pro_vision(messages, objective), None
-    if model == "llava":
-        operation = call_ollama_llava(messages)
-        return operation, None
     if model == "claude-3":
         operation = await call_claude_3_with_ocr(messages, objective, model)
         return operation, None
-    raise ModelNotRecognizedException(model)
-
+    operation = call_ollama(model, messages)
+    return operation, None
 
 def call_gpt_4o(messages):
     if config.verbose:
@@ -557,10 +554,9 @@ async def call_gpt_4o_labeled(messages, objective, model):
         traceback.print_exc()
         return call_gpt_4o(messages)
 
-
-def call_ollama_llava(messages):
+def call_ollama(model, messages):
     if config.verbose:
-        print("[call_ollama_llava]")
+        print(f"[call_ollama] model {model}")
     time.sleep(1)
     try:
-        model = config.initialize_ollama()
+        client = config.initialize_ollama()
@@ -579,7 +575,7 @@ def call_ollama_llava(messages):
 
         if config.verbose:
             print(
-                "[call_ollama_llava] user_prompt",
+                "[call_ollama] user_prompt",
                 user_prompt,
             )
 
@@ -590,8 +586,8 @@ def call_ollama_llava(messages):
         }
         messages.append(vision_message)
 
-        response = model.chat(
-            model="llava",
+        response = client.chat(
+            model=model,
             messages=messages,
         )
 
@@ -607,7 +603,7 @@ def call_ollama_llava(messages):
         assistant_message = {"role": "assistant", "content": content}
         if config.verbose:
             print(
-                "[call_ollama_llava] content",
+                "[call_ollama] content",
                 content,
             )
         content = json.loads(content)
@@ -624,7 +620,7 @@ def call_ollama_llava(messages):
 
     except Exception as e:
         print(
-            f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA}[llava] That did not work. Trying again {ANSI_RESET}",
+            f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA}[{model}] That did not work. Trying again {ANSI_RESET}",
             e,
         )
         print(
@@ -633,7 +629,7 @@ def call_ollama_llava(messages):
         )
         if config.verbose:
             traceback.print_exc()
-        return call_ollama_llava(messages)
+        return call_ollama(model, messages)
 
 
 async def call_claude_3_with_ocr(messages, objective, model):
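For reference, the new fallback path boils down to the following. This is a minimal standalone sketch, not the project's exact code: it assumes the `ollama` Python package is installed, `ollama serve` is running on the default host, and the named model has already been pulled. The function name `call_local_vision_model`, the prompt text, and the `screenshot.png` path are placeholders for illustration.

```python
# Minimal sketch of the generic Ollama call path introduced above.
# Assumes `pip install ollama`, a running `ollama serve`, and a pulled model.
import ollama


def call_local_vision_model(model: str, prompt: str, screenshot_path: str) -> str:
    """Send one prompt plus a screenshot to an Ollama-hosted vision model."""
    client = ollama.Client(host="http://localhost:11434")  # default Ollama host
    response = client.chat(
        model=model,
        messages=[
            {
                "role": "user",
                "content": prompt,
                # Vision-capable models read image files listed here.
                "images": [screenshot_path],
            }
        ],
    )
    return response["message"]["content"]


if __name__ == "__main__":
    # "screenshot.png" is a placeholder path.
    print(call_local_vision_model("llama3.2-vision", "Describe this screenshot.", "screenshot.png"))
```

One consequence of the dispatch change: because `get_next_action` now forwards any unmatched model name to Ollama instead of raising `ModelNotRecognizedException`, a mistyped `-m` value surfaces as an Ollama "model not found" error rather than being rejected up front.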