Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Multimodal: <img x.jpg> will only detect filename, and ignore HTML #54

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion autogen/agentchat/contrib/img_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,7 @@ def gpt4v_formatter(prompt: str, img_format: str = "uri") -> List[Union[str, dic
image_count = 0

# Find all image tags
for parsed_tag in utils.parse_tags_from_content("img", prompt):
for parsed_tag in utils.parse_tags_from_content("img", prompt, strict_filepath_match=True):
image_location = parsed_tag["attr"]["src"]
try:
if img_format == "pil":
Expand Down
29 changes: 20 additions & 9 deletions autogen/agentchat/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,9 @@ def aggregate_summary(usage_summary: Dict[str, Any], agent_summary: Dict[str, An
}


def parse_tags_from_content(tag: str, content: Union[str, List[Dict[str, Any]]]) -> List[Dict[str, Dict[str, str]]]:
def parse_tags_from_content(
tag: str, content: Union[str, List[Dict[str, Any]]], strict_filepath_match=False
) -> List[Dict[str, Dict[str, str]]]:
"""Parses HTML style tags from message contents.

The parsing is done by looking for patterns in the text that match the format of HTML tags. The tag to be parsed is
Expand All @@ -119,31 +121,40 @@ def parse_tags_from_content(tag: str, content: Union[str, List[Dict[str, Any]]])
tag (str): The HTML style tag to be parsed.
content (Union[str, List[Dict[str, Any]]]): The message content to parse. Can be a string or a list of content
items.
strict_filepath_match (bool, optional): If False (default), the parser matches everything between the tag name
and the closing '>', including quotation marks and spaces. If True, a tag is matched only when its content is a
single token containing no spaces, quotation marks, or '>'; any other tag is skipped entirely rather than
partially parsed. This is useful for picking up bare filenames or URLs while ignoring HTML-like tags with attributes.

Returns:
List[Dict[str, str]]: A list of dictionaries, where each dictionary represents a parsed tag. Each dictionary
contains three key-value pairs: 'type' which is the tag, 'attr' which is a dictionary of the parsed attributes,
List[Dict[str, Union[str, Dict[str, str], re.Match]]]: A list of dictionaries, where each dictionary represents a parsed tag. Each dictionary
contains three key-value pairs: 'tag' which is the tag name, 'attr' which is a dictionary of the parsed attributes,
and 'match' which is a regular expression match object.

Raises:
ValueError: If the content is not a string or a list.
"""
results = []
if isinstance(content, str):
results.extend(_parse_tags_from_text(tag, content))
results.extend(_parse_tags_from_text(tag, content, strict_filepath_match))
# Handles case for multimodal messages.
elif isinstance(content, list):
for item in content:
if item.get("type") == "text":
results.extend(_parse_tags_from_text(tag, item["text"]))
results.extend(_parse_tags_from_text(tag, item["text"], strict_filepath_match))
else:
raise ValueError(f"content must be str or list, but got {type(content)}")

return results


def _parse_tags_from_text(tag: str, text: str) -> List[Dict[str, str]]:
pattern = re.compile(f"<{tag} (.*?)>")
def _parse_tags_from_text(
tag: str, text: str, strict_filepath_match: bool = False
) -> List[Dict[str, Union[str, Dict[str, str], re.Match]]]:
# In strict mode the tag content must be a single token with no quotation marks, spaces, or '>'; otherwise the tag is not matched.
if strict_filepath_match:
pattern = re.compile(f"<{tag} ([^\"' >]+)>")
else:
pattern = re.compile(f"<{tag} (.*?)>")

results = []
for match in re.finditer(pattern, text):
Expand All @@ -154,12 +165,12 @@ def _parse_tags_from_text(tag: str, text: str) -> List[Dict[str, str]]:
return results


def _parse_attributes_from_tags(tag_content: str):
def _parse_attributes_from_tags(tag_content: str) -> Dict[str, str]:
pattern = r"([^ ]+)"
attrs = re.findall(pattern, tag_content)
reconstructed_attrs = _reconstruct_attributes(attrs)

def _append_src_value(content, value):
def _append_src_value(content: Dict[str, str], value: str) -> None:
if "src" in content:
content["src"] += f" {value}"
else:
Expand Down
60 changes: 52 additions & 8 deletions test/agentchat/test_agentchat_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,38 +13,80 @@
# Parametrized cases for the tag-parsing test. Each case is a dict with:
#   "message": the text that is scanned for tags,
#   "strict_filepath_match": the flag forwarded to parse_tags_from_content,
#   "expected": the parsed tags the test compares against.
# Every (message, strict_filepath_match) combination is listed exactly once.
TAG_PARSING_TESTS = [
    # --- Default (non-strict) matching: everything between "<tag " and ">" is parsed. ---
    {
        "message": "Hello agent, can you take a look at this image <img http://example.com/image.png>",
        "strict_filepath_match": False,
        "expected": [{"tag": "img", "attr": {"src": "http://example.com/image.png"}}],
    },
    {
        "message": "Can you transcribe this audio? <audio http://example.com/au=dio.mp3>",
        "strict_filepath_match": False,
        "expected": [{"tag": "audio", "attr": {"src": "http://example.com/au=dio.mp3"}}],
    },
    {
        "message": "Can you describe what's in this image <img url='http://example.com/=image.png'>",
        "strict_filepath_match": False,
        "expected": [{"tag": "img", "attr": {"url": "http://example.com/=image.png"}}],
    },
    {
        "message": "Can you describe what's in this image <img http://example.com/image.png> and transcribe this audio? <audio http://example.com/audio.mp3>",
        "strict_filepath_match": False,
        "expected": [
            {"tag": "img", "attr": {"src": "http://example.com/image.png"}},
            {"tag": "audio", "attr": {"src": "http://example.com/audio.mp3"}},
        ],
    },
    {
        "message": "Can you generate this audio? <audio text='Hello I'm a robot' prompt='whisper'>",
        "strict_filepath_match": False,
        "expected": [{"tag": "audio", "attr": {"text": "Hello I'm a robot", "prompt": "whisper"}}],
    },
    {
        "message": "Can you describe what's in this image <img http://example.com/image.png width='100'> and this image <img http://hello.com/image=.png>?",
        "strict_filepath_match": False,
        "expected": [
            {"tag": "img", "attr": {"src": "http://example.com/image.png", "width": "100"}},
            {"tag": "img", "attr": {"src": "http://hello.com/image=.png"}},
        ],
    },
    {
        "message": "Text with no tags",
        "strict_filepath_match": False,
        "expected": [],
    },
    {
        "message": 'Can you generate this audio? <audio text="Hello I\'m a robot" prompt="whisper">',
        "strict_filepath_match": False,
        "expected": [{"tag": "audio", "attr": {"text": "Hello I'm a robot", "prompt": "whisper"}}],
    },
    {
        "message": 'Complex nested quotes <img src="http://example.com/image.png" alt="A "quoted" description">',
        "strict_filepath_match": False,
        "expected": [{"tag": "img", "attr": {"src": "http://example.com/image.png", "alt": 'A "quoted" description'}}],
    },
    # --- Strict matching: only a bare, quote-free, space-free token is accepted. ---
    {
        "message": 'Can you generate this audio? <audio text="Hello I\'m a robot" prompt="whisper">',
        "strict_filepath_match": True,
        "expected": [],  # Empty because strict_filepath_match=True doesn't match tags with spaces or quotes
    },
    {
        "message": 'Complex nested quotes <img src="http://example.com/image.png" alt="A "quoted" description">',
        "strict_filepath_match": True,
        "expected": [],  # Empty because strict_filepath_match=True doesn't match tags with spaces or quotes
    },
    {
        "message": "Complex nested quotes <img http://example.com/image.png>",
        "strict_filepath_match": True,
        "expected": [{"tag": "img", "attr": {"src": "http://example.com/image.png"}}],
    },
]


Expand All @@ -54,31 +96,33 @@ def _delete_unused_keys(d: Dict) -> None:


@pytest.mark.parametrize("test_case", TAG_PARSING_TESTS)
def test_tag_parsing(test_case: Dict[str, Union[str, List[Dict[str, Union[str, Dict[str, str]]]]]]) -> None:
"""Test the tag_parsing function."""
def test_tag_parsing(test_case: Dict[str, Union[str, bool, List[Dict[str, Union[str, Dict[str, str]]]]]]) -> None:
"""Test the strict_filepath_match feature in tag parsing."""
message = test_case["message"]
strict_filepath_match = test_case["strict_filepath_match"]
expected = test_case["expected"]
tags = ["img", "audio", "random"]
tags = ["img", "audio"]

result = []
for tag in tags:
parsed_tags = agentchat.utils.parse_tags_from_content(tag, message)
parsed_tags = agentchat.utils.parse_tags_from_content(tag, message, strict_filepath_match=strict_filepath_match)
for item in parsed_tags:
_delete_unused_keys(item)

result.extend(parsed_tags)
print(result, expected, strict_filepath_match)
assert result == expected

result = []
for tag in tags:
content = [{"type": "text", "text": message}]
parsed_tags = agentchat.utils.parse_tags_from_content(tag, content)
parsed_tags = agentchat.utils.parse_tags_from_content(tag, content, strict_filepath_match=strict_filepath_match)
for item in parsed_tags:
_delete_unused_keys(item)

result.extend(parsed_tags)
print(result, expected)
assert result == expected


if __name__ == "__main__":
test_tag_parsing(TAG_PARSING_TESTS[0])
for test_case in TAG_PARSING_TESTS:
test_tag_parsing(test_case)
Loading