autogenhub · BabyCNM · Oct 5, 2024 · Oct 5, 2024 · Oct 9, 2024 · Oct 20, 2024
diff --git a/autogen/agentchat/contrib/img_utils.py b/autogen/agentchat/contrib/img_utils.py
@@ -207,7 +207,7 @@ def gpt4v_formatter(prompt: str, img_format: str = "uri") -> List[Union[str, dic
     image_count = 0
 
     # Find all image tags
-    for parsed_tag in utils.parse_tags_from_content("img", prompt):
+    for parsed_tag in utils.parse_tags_from_content("img", prompt, strict_filepath_match=True):
         image_location = parsed_tag["attr"]["src"]
         try:
             if img_format == "pil":

diff --git a/autogen/agentchat/utils.py b/autogen/agentchat/utils.py
@@ -102,7 +102,9 @@ def aggregate_summary(usage_summary: Dict[str, Any], agent_summary: Dict[str, An
     }
 
 
-def parse_tags_from_content(tag: str, content: Union[str, List[Dict[str, Any]]]) -> List[Dict[str, Dict[str, str]]]:
+def parse_tags_from_content(
+    tag: str, content: Union[str, List[Dict[str, Any]]], strict_filepath_match: bool = False
+) -> List[Dict[str, Union[str, Dict[str, str], re.Match]]]:
     """Parses HTML style tags from message contents.
 
     The parsing is done by looking for patterns in the text that match the format of HTML tags. The tag to be parsed is
@@ -119,31 +121,40 @@ def parse_tags_from_content(tag: str, content: Union[str, List[Dict[str, Any]]])
         tag (str): The HTML style tag to be parsed.
         content (Union[str, List[Dict[str, Any]]]): The message content to parse. Can be a string or a list of content
             items.
+        strict_filepath_match (bool, optional): If False (default), the parser matches all characters between the tag
+            name and the first closing '>', including quotation marks and spaces. If True, it only matches a tag whose
+            content is a single token without spaces, quotes, or '>', which suits bare file paths and URLs.
 
     Returns:
-        List[Dict[str, str]]: A list of dictionaries, where each dictionary represents a parsed tag. Each dictionary
-            contains three key-value pairs: 'type' which is the tag, 'attr' which is a dictionary of the parsed attributes,
+        List[Dict[str, Union[str, Dict[str, str], re.Match]]]: A list of dictionaries, where each dictionary represents a parsed tag. Each dictionary
+            contains three key-value pairs: 'tag' which is the tag name, 'attr' which is a dictionary of the parsed attributes,
             and 'match' which is a regular expression match object.
 
     Raises:
         ValueError: If the content is not a string or a list.
     """
     results = []
     if isinstance(content, str):
-        results.extend(_parse_tags_from_text(tag, content))
+        results.extend(_parse_tags_from_text(tag, content, strict_filepath_match))
     # Handles case for multimodal messages.
     elif isinstance(content, list):
         for item in content:
             if item.get("type") == "text":
-                results.extend(_parse_tags_from_text(tag, item["text"]))
+                results.extend(_parse_tags_from_text(tag, item["text"], strict_filepath_match))
     else:
         raise ValueError(f"content must be str or list, but got {type(content)}")
 
     return results
 
 
-def _parse_tags_from_text(tag: str, text: str) -> List[Dict[str, str]]:
-    pattern = re.compile(f"<{tag} (.*?)>")
+def _parse_tags_from_text(
+    tag: str, text: str, strict_filepath_match: bool = False
+) -> List[Dict[str, Union[str, Dict[str, str], re.Match]]]:
+    # In strict mode the tag content must not contain quotes, spaces, or '>'; otherwise the tag is not matched.
+    if strict_filepath_match:
+        pattern = re.compile(f"<{tag} ([^\"' >]+)>")
+    else:
+        pattern = re.compile(f"<{tag} (.*?)>")
 
     results = []
     for match in re.finditer(pattern, text):
@@ -154,12 +165,12 @@ def _parse_tags_from_text(tag: str, text: str) -> List[Dict[str, str]]:
     return results
 
 
-def _parse_attributes_from_tags(tag_content: str):
+def _parse_attributes_from_tags(tag_content: str) -> Dict[str, str]:
     pattern = r"([^ ]+)"
     attrs = re.findall(pattern, tag_content)
     reconstructed_attrs = _reconstruct_attributes(attrs)
 
-    def _append_src_value(content, value):
+    def _append_src_value(content: Dict[str, str], value: str) -> None:
         if "src" in content:
             content["src"] += f" {value}"
         else:

diff --git a/test/agentchat/test_agentchat_utils.py b/test/agentchat/test_agentchat_utils.py
@@ -13,38 +13,70 @@
 TAG_PARSING_TESTS = [
     {
         "message": "Hello agent, can you take a look at this image <img http://example.com/image.png>",
+        "strict_filepath_match": False,
         "expected": [{"tag": "img", "attr": {"src": "http://example.com/image.png"}}],
     },
     {
         "message": "Can you transcribe this audio? <audio http://example.com/au=dio.mp3>",
+        "strict_filepath_match": False,
         "expected": [{"tag": "audio", "attr": {"src": "http://example.com/au=dio.mp3"}}],
     },
     {
         "message": "Can you describe what's in this image <img url='http://example.com/=image.png'>",
+        "strict_filepath_match": False,
         "expected": [{"tag": "img", "attr": {"url": "http://example.com/=image.png"}}],
     },
     {
         "message": "Can you describe what's in this image <img http://example.com/image.png> and transcribe this audio? <audio http://example.com/audio.mp3>",
+        "strict_filepath_match": False,
         "expected": [
             {"tag": "img", "attr": {"src": "http://example.com/image.png"}},
             {"tag": "audio", "attr": {"src": "http://example.com/audio.mp3"}},
         ],
     },
     {
         "message": "Can you generate this audio? <audio text='Hello I'm a robot' prompt='whisper'>",
+        "strict_filepath_match": False,
         "expected": [{"tag": "audio", "attr": {"text": "Hello I'm a robot", "prompt": "whisper"}}],
     },
     {
         "message": "Can you describe what's in this image <img http://example.com/image.png width='100'> and this image <img http://hello.com/image=.png>?",
+        "strict_filepath_match": False,
         "expected": [
             {"tag": "img", "attr": {"src": "http://example.com/image.png", "width": "100"}},
             {"tag": "img", "attr": {"src": "http://hello.com/image=.png"}},
         ],
     },
     {
         "message": "Text with no tags",
+        "strict_filepath_match": False,
         "expected": [],
     },
+    {
+        "message": 'Can you generate this audio? <audio text="Hello I\'m a robot" prompt="whisper">',
+        "strict_filepath_match": False,
+        "expected": [{"tag": "audio", "attr": {"text": "Hello I'm a robot", "prompt": "whisper"}}],
+    },
+    {
+        "message": 'Complex nested quotes <img src="http://example.com/image.png" alt="A "quoted" description">',
+        "strict_filepath_match": False,
+        "expected": [{"tag": "img", "attr": {"src": "http://example.com/image.png", "alt": 'A "quoted" description'}}],
+    },
+    {
+        "message": "Complex nested quotes <img http://example.com/image.png>",
+        "strict_filepath_match": True,
+        "expected": [{"tag": "img", "attr": {"src": "http://example.com/image.png"}}],
+    },
+    {
+        "message": 'Can you generate this audio? <audio text="Hello I\'m a robot" prompt="whisper">',
+        "strict_filepath_match": True,
+        "expected": [],  # Empty because strict_filepath_match=True doesn't match tags with spaces or quotes
+    },
+    {
+        "message": 'Complex nested quotes <img src="http://example.com/image.png" alt="A "quoted" description">',
+        "strict_filepath_match": True,
+        "expected": [],  # Empty because strict_filepath_match=True doesn't match tags with spaces or quotes
+    },
 ]
 
 
@@ -54,31 +86,33 @@ def _delete_unused_keys(d: Dict) -> None:
 
 
 @pytest.mark.parametrize("test_case", TAG_PARSING_TESTS)
-def test_tag_parsing(test_case: Dict[str, Union[str, List[Dict[str, Union[str, Dict[str, str]]]]]]) -> None:
-    """Test the tag_parsing function."""
+def test_tag_parsing(test_case: Dict[str, Union[str, bool, List[Dict[str, Union[str, Dict[str, str]]]]]]) -> None:
+    """Test tag parsing on str and list content with the given strict_filepath_match setting."""
     message = test_case["message"]
+    strict_filepath_match = test_case["strict_filepath_match"]
     expected = test_case["expected"]
-    tags = ["img", "audio", "random"]
+    tags = ["img", "audio"]
 
     result = []
     for tag in tags:
-        parsed_tags = agentchat.utils.parse_tags_from_content(tag, message)
+        parsed_tags = agentchat.utils.parse_tags_from_content(tag, message, strict_filepath_match=strict_filepath_match)
         for item in parsed_tags:
             _delete_unused_keys(item)
 
         result.extend(parsed_tags)
     assert result == expected
 
     result = []
     for tag in tags:
         content = [{"type": "text", "text": message}]
-        parsed_tags = agentchat.utils.parse_tags_from_content(tag, content)
+        parsed_tags = agentchat.utils.parse_tags_from_content(tag, content, strict_filepath_match=strict_filepath_match)
         for item in parsed_tags:
             _delete_unused_keys(item)
 
         result.extend(parsed_tags)
     assert result == expected
 
 
 if __name__ == "__main__":
-    test_tag_parsing(TAG_PARSING_TESTS[0])
+    for test_case in TAG_PARSING_TESTS:
+        test_tag_parsing(test_case)