Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,31 @@ Start `operate` with the SoM model
operate -m gpt-4-with-som
```

### Locally Hosted LLaVA Through Ollama
If you wish to experiment with the Self-Operating Computer Framework using LLaVA on your own machine, you can with Ollama!
*Note: Ollama currently only supports macOS and Linux*

First, install Ollama on your machine from https://ollama.ai/download.

Once Ollama is installed, pull the LLaVA model:
```
ollama pull llava
```
This will download the model on your machine which takes approximately 5 GB of storage.

When Ollama has finished pulling LLaVA, start the server:
```
ollama serve
```

That's it! Now start `operate` and select the LLaVA model:
```
operate -m llava
```
**Important:** Error rates when using LLaVA are very high. This is simply intended to be a base to build off of as local multimodal models improve over time.

Learn more about Ollama at its [GitHub Repository](https://www.github.com/ollama/ollama)

### Voice Mode `--voice`
The framework supports voice inputs for the objective. Try voice by following the instructions below.
**Clone the repo** to a directory on your computer:
Expand Down
85 changes: 84 additions & 1 deletion operate/models/apis.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import traceback
import io
import easyocr

import ollama

from PIL import Image
from ultralytics import YOLO
Expand Down Expand Up @@ -53,6 +53,9 @@ async def get_next_action(model, messages, objective, session_id):
return "coming soon"
elif model == "gemini-pro-vision":
return call_gemini_pro_vision(messages, objective), None
elif model == "llava":
operation = call_ollama_llava(messages), None
return operation

raise ModelNotRecognizedException(model)

Expand Down Expand Up @@ -464,6 +467,86 @@ async def call_gpt_4_vision_preview_labeled(messages, objective):
return call_gpt_4_vision_preview(messages)


def call_ollama_llava(messages):
    """Ask a locally hosted LLaVA model (via Ollama) for the next operation.

    Captures a screenshot of the current screen, appends it to the
    conversation as a vision message, and sends the full history to the
    local ``llava`` model through the Ollama client.

    Args:
        messages: Mutable conversation history. This function appends the
            new user (vision) message and the assistant's reply in place.

    Returns:
        The assistant's reply parsed from JSON (the list of operations to
        perform), or ``None`` when the Ollama server is unreachable.
    """
    if VERBOSE:
        print("[call_ollama_llava]")
    time.sleep(1)
    # Bug fix: initialize `content` before the try block. Previously, if an
    # exception was raised before `content` was assigned (e.g. the screenshot
    # capture or `ollama.chat` call failing), the generic handler below
    # crashed with UnboundLocalError while printing the "AI response was"
    # message, masking the real error and aborting the retry.
    content = ""
    try:
        screenshots_dir = "screenshots"
        if not os.path.exists(screenshots_dir):
            os.makedirs(screenshots_dir)

        screenshot_filename = os.path.join(screenshots_dir, "screenshot.png")
        # Call the function to capture the screen with the cursor
        capture_screen_with_cursor(screenshot_filename)

        # First turn gets the objective-setting prompt; later turns a
        # shorter follow-up prompt.
        if len(messages) == 1:
            user_prompt = get_user_first_message_prompt()
        else:
            user_prompt = get_user_prompt()

        if VERBOSE:
            print(
                "[call_ollama_llava] user_prompt",
                user_prompt,
            )

        vision_message = {
            "role": "user",
            "content": user_prompt,
            "images": [screenshot_filename],
        }
        messages.append(vision_message)

        response = ollama.chat(
            model="llava",
            messages=messages,
        )

        # Important: Remove the image path from the message history.
        # Ollama will attempt to load each image reference and will
        # eventually timeout.
        messages[-1]["images"] = None

        content = response["message"]["content"].strip()

        # Strip an optional markdown code fence around the JSON payload.
        if content.startswith("```json"):
            content = content[len("```json") :]  # Remove starting ```json
        if content.endswith("```"):
            content = content[: -len("```")]  # Remove ending

        assistant_message = {"role": "assistant", "content": content}
        if VERBOSE:
            print(
                "[call_ollama_llava] content",
                content,
            )
        # May raise json.JSONDecodeError; handled by the generic retry below.
        content = json.loads(content)

        messages.append(assistant_message)

        return content

    except ollama.ResponseError as e:
        # Server not running / model not pulled: tell the user how to fix it
        # and stop — retrying would fail the same way.
        print(
            f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Operate] Couldn't connect to Ollama. With Ollama installed, run `ollama pull llava` then `ollama serve`{ANSI_RESET}",
            e,
        )
        return None

    except Exception as e:
        # Transient failure (often malformed JSON from the model): report
        # and retry recursively with the same history.
        print(
            f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA}[Operate] That did not work. Trying again {ANSI_RESET}",
            e,
        )
        print(
            f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] AI response was {ANSI_RESET}",
            content,
        )
        if VERBOSE:
            traceback.print_exc()
        return call_ollama_llava(messages)


def get_last_assistant_message(messages):
"""
Retrieve the last message from the assistant in the messages array.
Expand Down
61 changes: 55 additions & 6 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,53 +1,102 @@
aiohttp==3.9.1
aiosignal==1.3.1
annotated-types==0.6.0
anyio==3.7.1
attrs==23.2.0
cachetools==5.3.2
certifi==2023.7.22
charset-normalizer==3.3.2
colorama==0.4.6
contourpy==1.2.0
cycler==0.12.1
distro==1.8.0
easyocr==1.7.1
EasyProcess==1.1
entrypoint2==1.1
exceptiongroup==1.1.3
filelock==3.13.1
fonttools==4.44.0
frozenlist==1.4.1
fsspec==2024.2.0
google-ai-generativelanguage==0.4.0
google-api-core==2.16.2
google-auth==2.27.0
google-generativeai==0.3.0
googleapis-common-protos==1.62.0
grpcio==1.60.1
grpcio-status==1.60.1
h11==0.14.0
httpcore==1.0.2
httpx==0.25.1
httpx==0.25.2
idna==3.4
imageio==2.33.1
importlib-resources==6.1.1
Jinja2==3.1.3
kiwisolver==1.4.5
lazy_loader==0.3
MarkupSafe==2.1.5
matplotlib==3.8.1
MouseInfo==0.1.3
mpmath==1.3.0
mss==9.0.1
multidict==6.0.5
networkx==3.2.1
ninja==1.11.1.1
numpy==1.26.1
ollama==0.1.6
openai==1.2.3
opencv-python==4.9.0.80
opencv-python-headless==4.9.0.80
packaging==23.2
pandas==2.2.0
Pillow==10.1.0
prompt-toolkit==3.0.39
proto-plus==1.23.0
protobuf==4.25.2
psutil==5.9.8
py-cpuinfo==9.0.0
pyasn1==0.5.1
pyasn1-modules==0.3.0
PyAutoGUI==0.9.54
pyclipper==1.3.0.post5
pydantic==2.4.2
pydantic_core==2.10.1
PyGetWindow==0.0.9
PyMsgBox==1.0.9
pyobjc-core==10.1
pyobjc-framework-Cocoa==10.1
pyobjc-framework-Quartz==10.1
pyparsing==3.1.1
pyperclip==1.8.2
PyRect==0.2.0
pyscreenshot==3.1
PyScreeze==0.1.29
python-bidi==0.4.2
python-dateutil==2.8.2
python-dotenv==1.0.0
python3-xlib==0.15
pytweening==1.0.7
pytz==2024.1
PyYAML==6.0.1
requests==2.31.0
rsa==4.9
rubicon-objc==0.4.7
scikit-image==0.22.0
scipy==1.12.0
seaborn==0.13.2
shapely==2.0.2
six==1.16.0
sniffio==1.3.0
sympy==1.12
thop==0.1.1.post2209072238
tifffile==2024.1.30
torch==2.2.0
torchvision==0.17.0
tqdm==4.66.1
typing_extensions==4.8.0
tzdata==2023.4
ultralytics==8.0.227
urllib3==2.0.7
wcwidth==0.2.9
yarl==1.9.4
zipp==3.17.0