From a51ffc9147962a71f6a18d0a03be1b35f078f6dc Mon Sep 17 00:00:00 2001
From: AswanthManoj <aswanthmanoj51@gmail.com>
Date: Fri, 23 Aug 2024 16:13:55 +0530
Subject: [PATCH] Added structural cue chunking strategy based on JinaAI's
 tokenizer chunking

---
 omniparse/chunking/__init__.py | 157 +++++++++++++++++++++++++++++++++
 1 file changed, 157 insertions(+)

diff --git a/omniparse/chunking/__init__.py b/omniparse/chunking/__init__.py
index 66429f0..748c6d8 100644
--- a/omniparse/chunking/__init__.py
+++ b/omniparse/chunking/__init__.py
@@ -111,3 +111,160 @@ def chunk(self, text: str) -> list:
         for i in range(0, len(words), self.step):
             chunks.append(" ".join(words[i : i + self.window_size]))
         return chunks
+
+
+# Structural cue based chunking
+class StructuralCueChunking(ChunkingStrategy):
+    """
+    Inspired by https://jina.ai/tokenizer/#chunking which leverage common structural cues 
+    and build a set of rules and heuristics which should perform exceptionally well across 
+    diverse types of content, including Markdown, HTML, LaTeX, and more, 
+    ensuring accurate segmentation of text into meaningful chunks.
+    
+    Reference: https://gist.github.com/JeremiahZhang/2f8ae87dad836b25f40c02b8c43d16ec
+    Original x post: https://x.com/JinaAI_/status/1823756993108304135
+    """
+    def __init__(self, max_chunk_size: int=500, **kwargs):
+        """
+        Args:
+            max_chunk_size (int, optional): The maximum size of a chunk. Defaults to 500.
+            **kwargs: Additional keyword arguments.
+
+        Returns:
+            None
+        """
+        import regex
+        self.MAX_TABLE_ROWS = 20
+        self.LOOKAHEAD_RANGE = 100
+        self.MAX_HEADING_LENGTH = 7
+        self.MAX_SENTENCE_LENGTH = 400
+        self.MAX_NESTED_LIST_ITEMS = 6
+        self.MAX_BLOCKQUOTE_LINES = 15
+        self.MAX_NESTED_PARENTHESES = 5
+        self.MAX_LIST_INDENT_SPACES = 7
+        self.MAX_LIST_ITEM_LENGTH = 200
+        self.MAX_TABLE_CELL_LENGTH = 200
+        self.MAX_MATH_BLOCK_LENGTH = 500
+        self.MAX_PARAGRAPH_LENGTH = 1000
+        self.MAX_QUOTED_TEXT_LENGTH = 300
+        self.MAX_INDENTED_CODE_LINES = 20
+        self.MAX_CODE_BLOCK_LENGTH = 1500
+        self.MAX_HTML_TABLE_LENGTH = 2000
+        self.MAX_MATH_INLINE_LENGTH = 100
+        self.MAX_CODE_LANGUAGE_LENGTH = 20
+        self.MIN_HORIZONTAL_RULE_LENGTH = 3
+        self.max_chunk_size = max_chunk_size
+        self.MAX_BLOCKQUOTE_LINE_LENGTH = 200
+        self.MAX_HEADING_CONTENT_LENGTH = 200
+        self.MAX_STANDALONE_LINE_LENGTH = 800
+        self.MAX_HEADING_UNDERLINE_LENGTH = 200
+        self.MAX_HTML_TAG_CONTENT_LENGTH = 1000
+        self.MAX_HTML_TAG_ATTRIBUTES_LENGTH = 100
+        self.MAX_PARENTHETICAL_CONTENT_LENGTH = 200
+        self.MAX_HTML_HEADING_ATTRIBUTES_LENGTH = 100
+        
+        self.pattern = self.__pattern__()
+        
+    def __pattern__(self) -> str:
+        
+        # 1. Headings (Setext-style, Markdown, and HTML-style, with length constraints)
+        heading_regex = rf"""(?:^(?:[#*=-]{{1,{self.MAX_HEADING_LENGTH}}}|\w[^\r\n]{{0,{self.MAX_HEADING_CONTENT_LENGTH}}}\r?\n[-=]{{2,{self.MAX_HEADING_UNDERLINE_LENGTH}}}|<h[1-6][^>]{{0,{self.MAX_HTML_HEADING_ATTRIBUTES_LENGTH}}}>)[^\r\n]{{1,{self.MAX_HEADING_CONTENT_LENGTH}}}(?:</h[1-6]>)?(?:\r?\n|$))"""
+
+        # 2. New pattern for citations
+        citation_regex = rf"(?:\[[0-9]+\][^\r\n]{{1,{self.MAX_STANDALONE_LINE_LENGTH}}})"
+
+        # 3. List items (bulleted, numbered, lettered, or task lists, including nested, up to three levels, with length constraints)
+        list_item_regex = rf"(?:(?:^|\r?\n)[ \t]{{0,3}}(?:[-*+•]|\d{{1,3}}\.\w\.|\[[ xX]\])[ \t]+(?:(?:\b[^\r\n]{{1,{self.MAX_LIST_ITEM_LENGTH}}}\b(?:[.!?…]|\.{{3}}|[\u2026\u2047-\u2049]|[\p{{Emoji_Presentation}}\p{{Extended_Pictographic}}])(?=\s|$))|(?:\b[^\r\n]{{1,{self.MAX_LIST_ITEM_LENGTH}}}\b(?=[\r\n]|$))|(?:\b[^\r\n]{{1,{self.MAX_LIST_ITEM_LENGTH}}}\b(?=[.!?…]|\.{{3}}|[\u2026\u2047-\u2049]|[\p{{Emoji_Presentation}}\p{{Extended_Pictographic}}])(?:.{{1,{self.LOOKAHEAD_RANGE}}}(?:[.!?…]|\.{{3}}|[\u2026\u2047-\u2049]|[\p{{Emoji_Presentation}}\p{{Extended_Pictographic}}])(?=\s|$))?))"
+        list_item_regex += rf"(?:(?:\r?\n[ \t]{{2,5}}(?:[-*+•]|\d{{1,3}}\.\w\.|\[[ xX]\])[ \t]+(?:(?:\b[^\r\n]{{1,{self.MAX_LIST_ITEM_LENGTH}}}\b(?:[.!?…]|\.{{3}}|[\u2026\u2047-\u2049]|[\p{{Emoji_Presentation}}\p{{Extended_Pictographic}}])(?=\s|$))|(?:\b[^\r\n]{{1,${self.MAX_LIST_ITEM_LENGTH}}}\b(?=[\r\n]|$))|(?:\b[^\r\n]{{1,${self.MAX_LIST_ITEM_LENGTH}}}\b(?=[.!?…]|\.{{3}}|[\u2026\u2047-\u2049]|[\p{{Emoji_Presentation}}\p{{Extended_Pictographic}}])(?:.{{1,{self.LOOKAHEAD_RANGE}}}(?:[.!?…]|\.{{3}}|[\u2026\u2047-\u2049]|[\p{{Emoji_Presentation}}\p{{Extended_Pictographic}}])(?=\s|$))?)))"
+        list_item_regex += rf"{{0,{self.MAX_NESTED_LIST_ITEMS}}}(?:\r?\n[ \t]{{4,{self.MAX_LIST_INDENT_SPACES}}}(?:[-*+•]|\d{{1,3}}\.\w\.|\[[ xX]\])[ \t]+(?:(?:\b[^\r\n]{{1,{self.MAX_LIST_ITEM_LENGTH}}}\b(?:[.!?…]|\.{{3}}|[\u2026\u2047-\u2049]|[\p{{Emoji_Presentation}}\p{{Extended_Pictographic}}])(?=\s|$))|(?:\b[^\r\n]{{1,{self.MAX_LIST_ITEM_LENGTH}}}\b(?=[\r\n]|$))|(?:\b[^\r\n]{{1,{self.MAX_LIST_ITEM_LENGTH}}}\b(?=[.!?…]|\.{{3}}|[\u2026\u2047-\u2049]|[\p{{Emoji_Presentation}}\p{{Extended_Pictographic}}])(?:.{{1,{self.LOOKAHEAD_RANGE}}}(?:[.!?…]|\.{{3}}|[\u2026\u2047-\u2049]|[\p{{Emoji_Presentation}}\p{{Extended_Pictographic}}])(?=\s|$))?)))"
+        list_item_regex += rf"{{0,{self.MAX_NESTED_LIST_ITEMS}}})?)"
+
+        # 4. Block quotes (including nested quotes and citations, up to three levels, with length constraints)
+        block_regex = rf"(?:(?:^>(?:>|\s{{2,}}){{0,2}}(?:(?:\b[^\r\n]{{0,{self.MAX_BLOCKQUOTE_LINE_LENGTH}}}\b(?:[.!?…]|\.{{3}}|[\u2026\u2047-\u2049]|[\p{{Emoji_Presentation}}\p{{Extended_Pictographic}}])(?=\s|$))|(?:\b[^\r\n]{{0,{self.MAX_BLOCKQUOTE_LINE_LENGTH}}}\b(?=[\r\n]|$))|(?:\b[^\r\n]{{0,{self.MAX_BLOCKQUOTE_LINE_LENGTH}}}\b(?=[.!?…]|\.{{3}}|[\u2026\u2047-\u2049]|[\p{{Emoji_Presentation}}\p{{Extended_Pictographic}}])(?:.{{1,{self.LOOKAHEAD_RANGE}}}(?:[.!?…]|\.{{3}}|[\u2026\u2047-\u2049]|[\p{{Emoji_Presentation}}\p{{Extended_Pictographic}}])(?=\s|$))?))\r?\n?){{1,{self.MAX_BLOCKQUOTE_LINES}}})"
+
+        # 5. Code blocks (fenced, indented, or HTML pre/code tags, with length constraints)
+        code_block_regex = rf"(?:(?:^|\r?\n)(?:\`\`\`|~~~)(?:\w{{0,{self.MAX_CODE_LANGUAGE_LENGTH}}})?\r?\n[\s\S]{{0,{self.MAX_CODE_BLOCK_LENGTH}}}?(?:\`\`\`|~~~)\r?\n?"
+        code_block_regex += rf"|(?:(?:^|\r?\n)(?: {{4}}|\t)[^\r\n]{{0,{self.MAX_LIST_ITEM_LENGTH}}}(?:\r?\n(?: {{4}}|\t)[^\r\n]{{0,{self.MAX_LIST_ITEM_LENGTH}}}){{0,{self.MAX_INDENTED_CODE_LINES}}}\r?\n?)"
+        code_block_regex += rf"|(?:<pre>(?:<code>)?[\s\S]{{0,{self.MAX_CODE_BLOCK_LENGTH}}}?(?:</code>)?</pre>))"
+
+        # 6. Tables (Markdown, grid tables, and HTML tables, with length constraints)
+        table_regex = rf"(?:(?:^|\r?\n)(?:\|[^\r\n]{{0,{self.MAX_TABLE_CELL_LENGTH}}}\|(?:\r?\n\|[-:]{{1,{self.MAX_TABLE_CELL_LENGTH}}}\|){{0,1}}(?:\r?\n\|[^\r\n]{{0,{self.MAX_TABLE_CELL_LENGTH}}}\|){{0,{self.MAX_TABLE_ROWS}}}"
+        table_regex += rf"|<table>[\s\S]{{0,{self.MAX_HTML_TABLE_LENGTH}}}?</table>))"
+
+        # 7. Horizontal rules (Markdown and HTML hr tag)
+        horizontal_rule_regex = rf"(?:^(?:[-*_]){{{self.MIN_HORIZONTAL_RULE_LENGTH},}}\s*$|<hr\s*/?>)"
+
+        # 8. Standalone lines or phrases (including single-line blocks and HTML elements, with length constraints)
+        single_line_regex = rf"(?:^(?:<[a-zA-Z][^>]{{0,{self.MAX_HTML_TAG_ATTRIBUTES_LENGTH}}}>)?(?:(?:[^\r\n]{{1,{self.MAX_STANDALONE_LINE_LENGTH}}}(?:[.!?…]|\.\.\.|[\u2026\u2047-\u2049]|[\p{{Emoji_Presentation}}\p{{Extended_Pictographic}}])(?=\s|$))|(?:[^\r\n]{{1,{self.MAX_STANDALONE_LINE_LENGTH}}}(?=[\r\n]|$))|(?:[^\r\n]{{1,{self.MAX_STANDALONE_LINE_LENGTH}}}(?=[.!?…]|\.\.\.|[\u2026\u2047-\u2049]|[\p{{Emoji_Presentation}}\p{{Extended_Pictographic}}])(?:.{{1,{self.LOOKAHEAD_RANGE}}}(?:[.!?…]|\.\.\.|[\u2026\u2047-\u2049]|[\p{{Emoji_Presentation}}\p{{Extended_Pictographic}}])(?=\s|$))?))(?:</[a-zA-Z]+>)?(?:\r?\n|$))"
+
+        # 9. Sentences or phrases ending with punctuation (including ellipsis and Unicode punctuation)
+        sentence_regex = rf"(?:(?:[^\r\n]{{1,{self.MAX_SENTENCE_LENGTH}}}(?:[.!?…]|\.\.\.|[\u2026\u2047-\u2049]|[\p{{Emoji_Presentation}}\p{{Extended_Pictographic}}])(?=\s|$))|(?:[^\r\n]{{1,{self.MAX_SENTENCE_LENGTH}}}(?=[\r\n]|$))|(?:[^\r\n]{{1,{self.MAX_SENTENCE_LENGTH}}}(?=[.!?…]|\.\.\.|[\u2026\u2047-\u2049]|[\p{{Emoji_Presentation}}\p{{Extended_Pictographic}}])(?:.{{1,{self.LOOKAHEAD_RANGE}}}(?:[.!?…]|\.\.\.|[\u2026\u2047-\u2049]|[\p{{Emoji_Presentation}}\p{{Extended_Pictographic}}])(?=\s|$))?))"
+
+        # 10. Quoted text, parenthetical phrases, or bracketed content (with length constraints)
+        quoted_text = "(?:"
+        quoted_text += rf"(?<!\w)\"\"\"[^\"]{{0,{self.MAX_QUOTED_TEXT_LENGTH}}}\"\"\"(?!\w)"
+        quoted_text += rf"""|(?<!\w)(?P<quote>['"\`'"])[^\r\n]{{0,{self.MAX_QUOTED_TEXT_LENGTH}}}(?P=quote)(?!\w)"""
+        quoted_text += rf"|\([^\r\n()]{{0,{self.MAX_PARENTHETICAL_CONTENT_LENGTH}}}(?:\([^\r\n()]{{0,{self.MAX_PARENTHETICAL_CONTENT_LENGTH}}}\)[^\r\n()]{{0,{self.MAX_PARENTHETICAL_CONTENT_LENGTH}}}){{0,{self.MAX_NESTED_PARENTHESES}}}\)"
+        quoted_text += rf"|\[[^\r\n\[\]]{{0,{self.MAX_PARENTHETICAL_CONTENT_LENGTH}}}(?:\[[^\r\n\[\]]{{0,{self.MAX_PARENTHETICAL_CONTENT_LENGTH}}}\][^\r\n\[\]]{{0,{self.MAX_PARENTHETICAL_CONTENT_LENGTH}}}){{0,{self.MAX_NESTED_PARENTHESES}}}\]"
+        quoted_text += rf"|\$[^\r\n$]{{0,{self.MAX_MATH_INLINE_LENGTH}}}\$"
+        quoted_text += rf"|\`[^\`\r\n]{{0,{self.MAX_MATH_INLINE_LENGTH}}}\`"
+        quoted_text += ")"
+
+        # 11. Paragraphs (with length constraints)
+        paragraph_regex = rf"(?:(?:^|\r?\n\r?\n)(?:<p>)?(?:(?:[^\r\n]{{1,{self.MAX_PARAGRAPH_LENGTH}}}(?:[.!?…]|\.{{3}}|[\u2026\u2047-\u2049]|[\p{{Emoji_Presentation}}\p{{Extended_Pictographic}}])(?=\s|$))|(?:[^\r\n]{{1,{self.MAX_PARAGRAPH_LENGTH}}}(?=[\r\n]|$))|(?:[^\r\n]{{1,{self.MAX_PARAGRAPH_LENGTH}}}(?=[.!?…]|\.{{3}}|[\u2026\u2047-\u2049]|[\p{{Emoji_Presentation}}\p{{Extended_Pictographic}}])(?:.{{1,{self.LOOKAHEAD_RANGE}}}(?:[.!?…]|\.{{3}}|[\u2026\u2047-\u2049]|[\p{{Emoji_Presentation}}\p{{Extended_Pictographic}}])(?=\s|$))?))(?:</p>)?(?=\r?\n\r?\n|$))"
+
+        # 12. HTML-like tags and their content (including self-closing tags and attributes, with length constraints)
+        html_like_regex = rf"(?:<[a-zA-Z][^>]{{0,{self.MAX_HTML_TAG_ATTRIBUTES_LENGTH}}}(?:>[\s\S]{{0,{self.MAX_HTML_TAG_CONTENT_LENGTH}}}?</[a-zA-Z]+>|\s*/>))"
+
+        #13. LaTeX-style math expressions (inline and block, with length constraints)
+        latex_regex = rf"(?:(?:\$\$[\s\S]{{0,{self.MAX_MATH_BLOCK_LENGTH}}}?\$\$)|(?:\$[^\$\r\n]{{0,{self.MAX_MATH_INLINE_LENGTH}}}\$))"
+
+        # 14. Fallback for any remaining content (with length constraints)
+        fallback_regex = rf"(?:(?:[^\r\n]{{1,{self.MAX_STANDALONE_LINE_LENGTH}}}(?:[.!?…]|\.{{3}}|[\u2026\u2047-\u2049]|[\p{{Emoji_Presentation}}\p{{Extended_Pictographic}}])(?=\s|$))|(?:[^\r\n]{{1,{self.MAX_STANDALONE_LINE_LENGTH}}}(?=[\r\n]|$))|(?:[^\r\n]{{1,{self.MAX_STANDALONE_LINE_LENGTH}}}(?=[.!?…]|\.{{3}}|[\u2026\u2047-\u2049]|[\p{{Emoji_Presentation}}\p{{Extended_Pictographic}}])(?:.{{1,{self.LOOKAHEAD_RANGE}}}(?:[.!?…]|\.{{3}}|[\u2026\u2047-\u2049]|[\p{{Emoji_Presentation}}\p{{Extended_Pictographic}}])(?=\s|$))?))"
+        
+        return re.compile('|'.join((f"({heading_regex}", citation_regex, list_item_regex, block_regex, code_block_regex, table_regex, horizontal_rule_regex, single_line_regex, sentence_regex, quoted_text, paragraph_regex, html_like_regex, latex_regex, f"{fallback_regex})")), re.MULTILINE | re.DOTALL)
+
+    def chunk(self, text: str) -> list:
+        """
+        Breaks down a given text into smaller chunks based on common stuctural cues and maximum chunk size.
+        
+        Args:
+            text (str): The input text to be chunked.
+        
+        Returns:
+            list: A list of chunked text, where each chunk is a string.
+        """
+        chunks = re.findall(self.pattern, text)
+        
+        temp_chunk = ""
+        final_chunks = []
+        
+        for chunk in chunks:
+            chunk=chunk[0]
+            if len(temp_chunk) + len(chunk) > self.max_chunk_size:
+                final_chunks.append(temp_chunk.strip())
+                temp_chunk = chunk
+            else:
+                temp_chunk += chunk
+        
+        if temp_chunk:
+            final_chunks.append(temp_chunk.strip())
+        
+        # If a chunk is too large, break it down further
+        refined_chunks = []
+        for chunk in final_chunks:
+            if len(chunk) > self.max_chunk_size:
+                sentences = re.split(r'(?<=[.!?]) +', chunk)  # Split by sentence
+                temp_chunk = ""
+                for sentence in sentences:
+                    if len(temp_chunk) + len(sentence) > self.max_chunk_size:
+                        refined_chunks.append(temp_chunk.strip())
+                        temp_chunk = sentence
+                    else:
+                        temp_chunk += f" {sentence}"
+                if temp_chunk:
+                    refined_chunks.append(temp_chunk.strip())
+            else:
+                refined_chunks.append(chunk)
+        
+        return refined_chunks
\ No newline at end of file