diff --git a/README.md b/README.md index 8d65fe2e..92e6293b 100644 --- a/README.md +++ b/README.md @@ -108,6 +108,31 @@ Start `operate` with the SoM model operate -m gpt-4-with-som ``` +### Locally Hosted LLaVA Through Ollama +If you wish to experiment with the Self-Operating Computer Framework using LLaVA on your own machine, you can do so with Ollama! +*Note: Ollama currently only supports macOS and Linux* + +First, install Ollama on your machine from https://ollama.ai/download. + +Once Ollama is installed, pull the LLaVA model: +``` +ollama pull llava +``` +This will download the model on your machine which takes approximately 5 GB of storage. + +When Ollama has finished pulling LLaVA, start the server: +``` +ollama serve +``` + +That's it! Now start `operate` and select the LLaVA model: +``` +operate -m llava +``` +**Important:** Error rates when using LLaVA are very high. This is simply intended to be a base to build off of as local multimodal models improve over time. + +Learn more about Ollama at its [GitHub Repository](https://www.github.com/ollama/ollama). + ### Voice Mode `--voice` The framework supports voice inputs for the objective. Try voice by following the instructions below. 
**Clone the repo** to a directory on your computer: diff --git a/operate/models/apis.py b/operate/models/apis.py index eec539e0..d81f597c 100644 --- a/operate/models/apis.py +++ b/operate/models/apis.py @@ -5,7 +5,7 @@ import traceback import io import easyocr - +import ollama from PIL import Image from ultralytics import YOLO @@ -53,6 +53,9 @@ async def get_next_action(model, messages, objective, session_id): return "coming soon" elif model == "gemini-pro-vision": return call_gemini_pro_vision(messages, objective), None + elif model == "llava": + operation = call_ollama_llava(messages), None + return operation raise ModelNotRecognizedException(model) @@ -464,6 +467,86 @@ async def call_gpt_4_vision_preview_labeled(messages, objective): return call_gpt_4_vision_preview(messages) +def call_ollama_llava(messages): + if VERBOSE: + print("[call_ollama_llava]") + time.sleep(1) + try: + screenshots_dir = "screenshots" + if not os.path.exists(screenshots_dir): + os.makedirs(screenshots_dir) + + screenshot_filename = os.path.join(screenshots_dir, "screenshot.png") + # Call the function to capture the screen with the cursor + capture_screen_with_cursor(screenshot_filename) + + if len(messages) == 1: + user_prompt = get_user_first_message_prompt() + else: + user_prompt = get_user_prompt() + + if VERBOSE: + print( + "[call_ollama_llava] user_prompt", + user_prompt, + ) + + vision_message = { + "role": "user", + "content": user_prompt, + "images": [screenshot_filename], + } + messages.append(vision_message) + + response = ollama.chat( + model="llava", + messages=messages, + ) + + # Important: Remove the image path from the message history. + # Ollama will attempt to load each image reference and will + # eventually timeout. 
+    messages[-1]["images"] = None + + content = response['message']['content'].strip() + + if content.startswith("```json"): + content = content[len("```json") :] # Remove starting ```json + if content.endswith("```"): + content = content[: -len("```")] # Remove ending + + assistant_message = {"role": "assistant", "content": content} + if VERBOSE: + print( + "[call_ollama_llava] content", + content, + ) + content = json.loads(content) + + messages.append(assistant_message) + + return content + + except ollama.ResponseError as e: + print( + f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Operate] Couldn't connect to Ollama. With Ollama installed, run `ollama pull llava` then `ollama serve`{ANSI_RESET}", + e, + ) + + except Exception as e: + print( + f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA}[Operate] That did not work. Trying again {ANSI_RESET}", + e, + ) + print( + f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] AI response was {ANSI_RESET}", + locals().get("content", "[unavailable]"), + ) + if VERBOSE: + traceback.print_exc() + return call_ollama_llava(messages) + + def get_last_assistant_message(messages): """ Retrieve the last message from the assistant in the messages array. 
diff --git a/requirements.txt b/requirements.txt index f2727e69..a4435b83 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,53 +1,102 @@ +aiohttp==3.9.1 +aiosignal==1.3.1 annotated-types==0.6.0 anyio==3.7.1 +attrs==23.2.0 +cachetools==5.3.2 certifi==2023.7.22 charset-normalizer==3.3.2 colorama==0.4.6 contourpy==1.2.0 cycler==0.12.1 distro==1.8.0 +easyocr==1.7.1 EasyProcess==1.1 entrypoint2==1.1 exceptiongroup==1.1.3 +filelock==3.13.1 fonttools==4.44.0 +frozenlist==1.4.1 +fsspec==2024.2.0 +google-ai-generativelanguage==0.4.0 +google-api-core==2.16.2 +google-auth==2.27.0 +google-generativeai==0.3.0 +googleapis-common-protos==1.62.0 +grpcio==1.60.1 +grpcio-status==1.60.1 h11==0.14.0 httpcore==1.0.2 -httpx==0.25.1 +httpx==0.25.2 idna==3.4 +imageio==2.33.1 importlib-resources==6.1.1 +Jinja2==3.1.3 kiwisolver==1.4.5 +lazy_loader==0.3 +MarkupSafe==2.1.5 matplotlib==3.8.1 MouseInfo==0.1.3 +mpmath==1.3.0 mss==9.0.1 +multidict==6.0.5 +networkx==3.2.1 +ninja==1.11.1.1 numpy==1.26.1 +ollama==0.1.6 openai==1.2.3 +opencv-python==4.9.0.80 +opencv-python-headless==4.9.0.80 packaging==23.2 +pandas==2.2.0 Pillow==10.1.0 prompt-toolkit==3.0.39 +proto-plus==1.23.0 +protobuf==4.25.2 +psutil==5.9.8 +py-cpuinfo==9.0.0 +pyasn1==0.5.1 +pyasn1-modules==0.3.0 PyAutoGUI==0.9.54 +pyclipper==1.3.0.post5 pydantic==2.4.2 pydantic_core==2.10.1 PyGetWindow==0.0.9 PyMsgBox==1.0.9 +pyobjc-core==10.1 +pyobjc-framework-Cocoa==10.1 +pyobjc-framework-Quartz==10.1 pyparsing==3.1.1 pyperclip==1.8.2 PyRect==0.2.0 pyscreenshot==3.1 PyScreeze==0.1.29 -python3-xlib==0.15 +python-bidi==0.4.2 python-dateutil==2.8.2 python-dotenv==1.0.0 +python3-xlib==0.15 pytweening==1.0.7 +pytz==2024.1 +PyYAML==6.0.1 requests==2.31.0 +rsa==4.9 rubicon-objc==0.4.7 +scikit-image==0.22.0 +scipy==1.12.0 +seaborn==0.13.2 +shapely==2.0.2 six==1.16.0 sniffio==1.3.0 +sympy==1.12 +thop==0.1.1.post2209072238 +tifffile==2024.1.30 +torch==2.2.0 +torchvision==0.17.0 tqdm==4.66.1 typing_extensions==4.8.0 +tzdata==2023.4 
+ultralytics==8.0.227 urllib3==2.0.7 wcwidth==0.2.9 +yarl==1.9.4 zipp==3.17.0 -google-generativeai==0.3.0 -aiohttp==3.9.1 -ultralytics==8.0.227 -easyocr==1.7.1 \ No newline at end of file