Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions lib/crewai/src/crewai/agent/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from crewai.agents.cache.cache_handler import CacheHandler
from crewai.agents.crew_agent_executor import CrewAgentExecutor
from crewai.events.event_bus import crewai_event_bus
from crewai.multimodal import Image, MultipartContent
from crewai.events.types.knowledge_events import (
KnowledgeQueryCompletedEvent,
KnowledgeQueryFailedEvent,
Expand Down Expand Up @@ -213,6 +214,10 @@ class Agent(BaseAgent):
default=None,
description="A2A (Agent-to-Agent) configuration for delegating tasks to remote agents. Can be a single A2AConfig or a dict mapping agent IDs to configs.",
)
multipart_context: list[str | Image] | MultipartContent | None = Field(
default=None,
description="Multimodal context for the agent. Can be a list of text strings and Image objects, or a MultipartContent instance. This content is added to the agent's system prompt.",
)

@model_validator(mode="before")
def validate_from_repository(cls, v: Any) -> dict[str, Any] | None | Any: # noqa: N805
Expand Down
6 changes: 6 additions & 0 deletions lib/crewai/src/crewai/multimodal/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
"""Multimodal content support for CrewAI agents and tasks."""

# Re-export the package's classes so callers can import them directly
# from ``crewai.multimodal``.
from crewai.multimodal.image import Image
from crewai.multimodal.multipart_content import MultipartContent

# Names exported by ``from crewai.multimodal import *``.
__all__ = ["Image", "MultipartContent"]
245 changes: 245 additions & 0 deletions lib/crewai/src/crewai/multimodal/image.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,245 @@
"""Image class for handling various image formats in multimodal contexts."""

from __future__ import annotations

import base64
import mimetypes
from pathlib import Path
from typing import Any, Literal
from urllib.parse import urlparse

from pydantic import BaseModel, Field, field_validator


class Image(BaseModel):
    """Represents an image in various formats for multimodal content.

    Supports:
    - URLs (http://, https://)
    - Data URLs (data:image/...;base64,...)
    - Local file paths (absolute, relative, ~, file://)
    - Raw base64 strings
    - Binary data

    When ``source_type`` is omitted it is inferred from ``source`` (see
    ``infer_source_type``). ``source`` must be declared before
    ``source_type`` so that it is already validated when the inference runs.

    Attributes:
        source: The image source (URL, file path, or data)
        source_type: Type of source (url, file, data_url, base64, binary)
        media_type: MIME type of the image (e.g., 'image/png')
        placeholder: Optional placeholder name for interpolation at runtime
    """

    source: str | bytes | None = Field(
        default=None,
        description="Image source: URL, file path, base64 string, or binary data"
    )
    source_type: Literal["url", "file", "data_url", "base64", "binary"] | None = Field(
        default=None,
        # Pydantic skips field validators for defaulted fields unless asked;
        # without this the inference below never runs when the caller omits
        # source_type and the field silently stays None.
        validate_default=True,
        description="Type of the image source"
    )
    media_type: str = Field(
        default="image/png",
        description="MIME type of the image"
    )
    placeholder: str | None = Field(
        default=None,
        description="Placeholder name for runtime interpolation (e.g., '{user_image}')"
    )

    @field_validator("source_type", mode="before")
    @classmethod
    def infer_source_type(cls, v: Any, info: Any) -> str:
        """Infer the source type from ``source`` when none was provided.

        Args:
            v: The supplied ``source_type`` value (``None`` when omitted).
            info: Pydantic validation info; ``info.data`` holds the fields
                validated so far, including ``source``.

        Returns:
            The explicit value if one was given, otherwise one of
            ``"url"``, ``"binary"``, ``"data_url"``, ``"base64"`` or ``"file"``.
        """
        if v is not None:
            return v

        source = info.data.get("source")
        if source is None:
            return "url"  # Default

        if isinstance(source, bytes):
            return "binary"

        source_str = str(source)

        # Check for data URL
        if source_str.startswith("data:"):
            return "data_url"

        # Check for HTTP(S) URL
        if source_str.startswith(("http://", "https://")):
            return "url"

        # Check for file:// URL
        if source_str.startswith("file://"):
            return "file"

        # Long strings that decode as strict base64 are treated as image data
        if cls._looks_like_base64(source_str):
            return "base64"

        # Default to file path
        return "file"

    @staticmethod
    def _looks_like_base64(candidate: str) -> bool:
        """Return True if ``candidate`` is a long, strictly valid base64 string.

        "/" is part of the base64 alphabet (JPEG payloads start with "/9j/"),
        so the presence of a slash cannot be used to rule base64 out. File
        paths are told apart because they normally contain characters outside
        the alphabet (".", "\\", ":", spaces), which strict decoding rejects.
        """
        if len(candidate) <= 100:
            return False

        # Tolerate payloads whose trailing "=" padding was stripped.
        padded = candidate + "=" * (-len(candidate) % 4)
        try:
            base64.b64decode(padded, validate=True)
        except ValueError:  # binascii.Error is a ValueError subclass
            return False
        return True

    @staticmethod
    def _resolve_local_path(location: str | Path) -> Path:
        """Turn a path or ``file://`` URL into an absolute local ``Path``.

        ``file://`` URLs are percent-decoded (and drive letters restored on
        Windows) before ``~`` expansion and resolution.
        """
        from urllib.request import url2pathname

        text = str(location)
        if text.startswith("file://"):
            text = url2pathname(urlparse(text).path)
        return Path(text).expanduser().resolve()

    @classmethod
    def from_url(cls, url: str, media_type: str = "image/png") -> Image:
        """Create an Image from a URL.

        Args:
            url: HTTP(S) URL to the image
            media_type: MIME type of the image

        Returns:
            Image instance
        """
        return cls(source=url, source_type="url", media_type=media_type)

    @classmethod
    def from_file(cls, file_path: str | Path, media_type: str | None = None) -> Image:
        """Create an Image from a local file path.

        The path is resolved to an absolute path immediately; the file itself
        is not read until ``to_data_url`` is called.

        Args:
            file_path: Path (or ``file://`` URL) of the local image file
            media_type: MIME type (guessed from the file name if None,
                falling back to 'image/png')

        Returns:
            Image instance
        """
        path = cls._resolve_local_path(file_path)

        if media_type is None:
            media_type = mimetypes.guess_type(str(path))[0] or "image/png"

        return cls(source=str(path), source_type="file", media_type=media_type)

    @classmethod
    def from_base64(cls, base64_string: str, media_type: str = "image/png") -> Image:
        """Create an Image from a base64 string.

        Args:
            base64_string: Base64-encoded image data
            media_type: MIME type of the image

        Returns:
            Image instance
        """
        return cls(source=base64_string, source_type="base64", media_type=media_type)

    @classmethod
    def from_binary(cls, binary_data: bytes, media_type: str = "image/png") -> Image:
        """Create an Image from binary data.

        Args:
            binary_data: Raw image bytes
            media_type: MIME type of the image

        Returns:
            Image instance
        """
        return cls(source=binary_data, source_type="binary", media_type=media_type)

    @classmethod
    def from_placeholder(cls, placeholder: str, media_type: str = "image/png") -> Image:
        """Create an Image placeholder for runtime interpolation.

        Args:
            placeholder: Placeholder name (e.g., 'user_image')
            media_type: Expected MIME type of the image

        Returns:
            Image instance
        """
        return cls(
            source=None,
            source_type="url",  # Will be replaced at runtime
            media_type=media_type,
            placeholder=placeholder
        )

    def to_data_url(self) -> str:
        """Convert the image to a data URL format.

        Reads local files and converts base64/binary to proper data URL format.
        Returns existing HTTP(S) URLs and data URLs unchanged.

        For file sources, ``media_type`` is updated in place from the file
        name when it still holds the 'image/png' default.

        Returns:
            Data URL string (data:image/...;base64,...), or the original URL
            for ``url`` sources

        Raises:
            ValueError: If the image is an unresolved placeholder, if source
                is None, or if a file source points at a non-file path
            FileNotFoundError: If source is a file that doesn't exist
        """
        if self.placeholder:
            raise ValueError(
                f"Cannot convert placeholder '{self.placeholder}' to data URL. "
                "Replace placeholder with actual image data first."
            )

        if self.source is None:
            raise ValueError("Image source is None")

        # Data URLs are already in the target format; HTTP(S) URLs are
        # returned as-is (some providers support URLs directly)
        if self.source_type in ("data_url", "url"):
            return str(self.source)

        # Binary data
        if self.source_type == "binary":
            base64_data = base64.b64encode(self.source).decode("utf-8")  # type: ignore
            return f"data:{self.media_type};base64,{base64_data}"

        # Base64 string
        if self.source_type == "base64":
            return f"data:{self.media_type};base64,{str(self.source)}"

        # File path (plain path or file:// URL)
        if self.source_type == "file":
            file_path = self._resolve_local_path(str(self.source))

            if not file_path.exists():
                raise FileNotFoundError(f"Image file not found: {file_path}")

            if not file_path.is_file():
                raise ValueError(f"Path is not a file: {file_path}")

            image_data = base64.b64encode(file_path.read_bytes()).decode("utf-8")

            # Update media type from the file name while it is still the default
            guessed_type = mimetypes.guess_type(str(file_path))[0]
            if guessed_type and self.media_type == "image/png":
                self.media_type = guessed_type

            return f"data:{self.media_type};base64,{image_data}"

        # Fallback
        return str(self.source)

    def to_message_content(self) -> dict[str, Any]:
        """Convert image to LLM message content format.

        Returns a dict compatible with most LLM providers' multimodal format.

        Returns:
            Dictionary with type and image_url fields

        Raises:
            ValueError: Propagated from ``to_data_url``
            FileNotFoundError: Propagated from ``to_data_url``
        """
        return {
            "type": "image_url",
            "image_url": {
                "url": self.to_data_url()
            }
        }

    def __str__(self) -> str:
        """String representation of the image."""
        if self.placeholder:
            return f"Image(placeholder={self.placeholder})"
        return f"Image({self.source_type}:{str(self.source)[:50]}...)"
92 changes: 92 additions & 0 deletions lib/crewai/src/crewai/multimodal/multipart_content.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
"""Multipart content class for handling mixed text and media content."""

from __future__ import annotations

from typing import Any

from pydantic import BaseModel, Field

from crewai.multimodal.image import Image


class MultipartContent(BaseModel):
    """Ordered collection of text snippets and images.

    Lets agents and tasks carry compound context in which plain text is
    interleaved with images, and converts that context to the list-of-parts
    message layout used by multimodal LLM APIs.

    Attributes:
        parts: Content parts in insertion order (``str`` for text,
            ``Image`` for images)
    """

    parts: list[str | Image] = Field(
        default_factory=list,
        description="List of content parts (text strings or Image objects)"
    )

    def add_text(self, text: str) -> None:
        """Append a text part to the end of the content.

        Args:
            text: Text to append
        """
        self.parts.append(text)

    def add_image(self, image: Image) -> None:
        """Append an image part to the end of the content.

        Args:
            image: Image to append
        """
        self.parts.append(image)

    def to_message_content(self) -> list[dict[str, Any]]:
        """Render every part as an LLM message content entry.

        Text becomes ``{"type": "text", "text": ...}``; images are rendered
        through ``Image.to_message_content``. Parts of any other type are
        skipped.

        Returns:
            One dict per rendered part, in the original order
        """
        return [
            {"type": "text", "text": item}
            if isinstance(item, str)
            else item.to_message_content()
            for item in self.parts
            if isinstance(item, (str, Image))
        ]

    def get_text_only(self) -> str:
        """Join all text parts with newlines, dropping the images.

        Handy as a fallback for text-only processing.

        Returns:
            The text parts joined by a newline (empty string if there are none)
        """
        return "\n".join(item for item in self.parts if isinstance(item, str))

    def has_images(self) -> bool:
        """Report whether at least one part is an Image.

        Returns:
            True as soon as an Image part is found, otherwise False
        """
        for item in self.parts:
            if isinstance(item, Image):
                return True
        return False

    def __len__(self) -> int:
        """Number of parts (text and images together)."""
        return len(self.parts)

    def __str__(self) -> str:
        """Summarise the content as counts of text parts and images."""
        n_text = len([item for item in self.parts if isinstance(item, str)])
        n_images = len([item for item in self.parts if isinstance(item, Image)])
        return f"MultipartContent({n_text} text parts, {n_images} images)"
5 changes: 5 additions & 0 deletions lib/crewai/src/crewai/task.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
TaskFailedEvent,
TaskStartedEvent,
)
from crewai.multimodal import Image, MultipartContent
from crewai.security import Fingerprint, SecurityConfig
from crewai.tasks.output_format import OutputFormat
from crewai.tasks.task_output import TaskOutput
Expand Down Expand Up @@ -191,6 +192,10 @@ class Task(BaseModel):
default=None,
description="Whether this task should append 'Trigger Payload: {crewai_trigger_payload}' to the task description when crewai_trigger_payload exists in crew inputs.",
)
multipart_context: list[str | Image] | MultipartContent | None = Field(
default=None,
description="Multimodal context for the task. Can be a list of text strings and Image objects, or a MultipartContent instance. This content is added to the task's context.",
)
_guardrail: GuardrailCallable | None = PrivateAttr(default=None)
_guardrails: list[GuardrailCallable] = PrivateAttr(
default_factory=list,
Expand Down
1 change: 1 addition & 0 deletions lib/crewai/tests/multimodal/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""Init file for multimodal tests."""
Loading