ollama · rylativity · Dec 22, 2024
diff --git a/examples/multimodal-raw-generate.py b/examples/multimodal-raw-generate.py
@@ -0,0 +1,41 @@
+import sys
+import random
+import httpx
+
+from ollama import generate
+
+if len(sys.argv) > 1:
+  num = int(sys.argv[1])
+else:  # no comic requested: look up the latest number only now, then pick one at random
+  latest = httpx.get('https://xkcd.com/info.0.json')
+  latest.raise_for_status()
+  num = random.randint(1, latest.json().get('num'))
+
+comic = httpx.get(f'https://xkcd.com/{num}/info.0.json')
+comic.raise_for_status()
+info = comic.json()  # decode the comic metadata once and reuse it below
+
+print(f'xkcd #{info.get("num")}: {info.get("alt")}')
+print(f'link: https://xkcd.com/{num}')
+print('---')
+
+raw = httpx.get(info.get('img'))  # download the comic image itself
+raw.raise_for_status()
+
+for response in generate(model='llama3.2-vision',
+                         prompt='explain this comic:',
+                         raw=True,
+                         prepend_images_to_raw_prompt=True,  # have the image placeholders added for us
+                         images=[raw.content],
+                         stream=True):
+  print(response['response'], end='', flush=True)  # print each streamed chunk as it arrives
+
+# Alternatively, to place the image at a specific position in the prompt yourself, use the call below.
+# Note: the `[img-0]<image>` tag may appear anywhere in the prompt; the `0` is the index of the image in the `images` list.
+
+# for response in generate(model='llama3.2-vision',
+#                          prompt='[img-0]<image>explain this comic:',
+#                          raw=True,
+#                          images=[raw.content],
+#                          stream=True):
+#   print(response['response'], end='', flush=True)
diff --git a/ollama/_client.py b/ollama/_client.py
@@ -188,6 +188,7 @@ def generate(
     context: Optional[Sequence[int]] = None,
     stream: Literal[False] = False,
     raw: bool = False,
+    prepend_images_to_raw_prompt: Optional[bool] = False,
     format: Optional[Union[Literal['', 'json'], JsonSchemaValue]] = None,
     images: Optional[Sequence[Union[str, bytes]]] = None,
     options: Optional[Union[Mapping[str, Any], Options]] = None,
@@ -206,6 +207,7 @@ def generate(
     context: Optional[Sequence[int]] = None,
     stream: Literal[True] = True,
     raw: bool = False,
+    prepend_images_to_raw_prompt: Optional[bool] = False,
     format: Optional[Union[Literal['', 'json'], JsonSchemaValue]] = None,
     images: Optional[Sequence[Union[str, bytes]]] = None,
     options: Optional[Union[Mapping[str, Any], Options]] = None,
@@ -223,6 +225,7 @@ def generate(
     context: Optional[Sequence[int]] = None,
     stream: bool = False,
     raw: Optional[bool] = None,
+    prepend_images_to_raw_prompt: Optional[bool] = False,
     format: Optional[Union[Literal['', 'json'], JsonSchemaValue]] = None,
     images: Optional[Sequence[Union[str, bytes]]] = None,
     options: Optional[Union[Mapping[str, Any], Options]] = None,
@@ -251,6 +254,7 @@ def generate(
         context=context,
         stream=stream,
         raw=raw,
+        prepend_images_to_raw_prompt=prepend_images_to_raw_prompt,
         format=format,
         images=[Image(value=image) for image in images] if images else None,
         options=options,

diff --git a/ollama/_types.py b/ollama/_types.py
@@ -204,6 +204,9 @@ class GenerateRequest(BaseGenerateRequest):
 
   raw: Optional[bool] = None
 
+  prepend_images_to_raw_prompt: Optional[bool] = None
+  'Whether to automatically add properly formatted and necessary image placeholders to top of raw prompt'
+
   images: Optional[Sequence[Image]] = None
   'Image data for multimodal models.'