fix(ocr_mkcontent): improve language detection and content formatting (…

…#458) Optimize the language detection logic to enhance content formatting and address issues with long word segmentation. Language detection now classifies a text span as English when at least half of its characters are ASCII letters (a–z, A–Z); any other non-empty text is reported as unknown, and empty text as empty. The length above which a word is split by long-word segmentation is lowered from 15 to 10 characters. Formatting rules for content now consult a list of languages (initially Chinese, Japanese, and Korean) for which no space is added between content segments such as inline equations and text spans, improving the handling of Asian languages. The impact of these changes includes improved accuracy in language detection, better segmentation of long words, and more appropriate spacing in content formatting for multiple languages.
opendatalab · Aug 20, 2024 · 66e3ce9 · 66e3ce9
1 parent f4316f0
commit 66e3ce9
Showing 1 changed file with 19 additions and 7 deletions.
diff --git a/magic_pdf/dict2md/ocr_mkcontent.py b/magic_pdf/dict2md/ocr_mkcontent.py
@@ -14,7 +14,7 @@ def split_long_words(text):
     for i in range(len(segments)):
         words = re.findall(r'\w+|[^\w]', segments[i], re.UNICODE)
         for j in range(len(words)):
-            if len(words[j]) > 15:
+            if len(words[j]) > 10:
                 words[j] = ' '.join(wordninja.split(words[j]))
         segments[i] = ''.join(words)
     return ' '.join(segments)
@@ -147,6 +147,18 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
 
 
 def merge_para_with_text(para_block):
+    def detect_language(text):
+        """Classify ``text`` as 'en', "unknown" or "empty" by its ASCII-letter share."""
+        # Total number of characters that belong to runs of ASCII letters.
+        ascii_letters = sum(map(len, re.findall(r'[a-zA-Z]+', text)))
+        total_chars = len(text)
+        if total_chars == 0:
+            return "empty"
+        # At least half of all characters being ASCII letters counts as English.
+        if ascii_letters / total_chars >= 0.5:
+            return 'en'
+        return "unknown"
+
     para_text = ''
     for line in para_block['lines']:
         line_text = ""
@@ -162,7 +174,8 @@ def merge_para_with_text(para_block):
             content = ''
             if span_type == ContentType.Text:
                 content = span['content']
-                language = detect_lang(content)
+                # language = detect_lang(content)
+                language = detect_language(content)
                 if language == 'en':  # 只对英文长词进行分词处理，中文分词会丢失文本
                     content = ocr_escape_special_markdown_char(split_long_words(content))
                 else:
@@ -171,12 +184,12 @@ def merge_para_with_text(para_block):
                 content = f" ${span['content']}$ "
             elif span_type == ContentType.InterlineEquation:
                 content = f"\n$$\n{span['content']}\n$$\n"
-
             if content != '':
-                if 'zh' in line_lang:  # 遇到一些一个字一个span的文档，这种单字语言判断不准，需要用整行文本判断
-                    para_text += content  # 中文语境下，content间不需要空格分隔
+                langs = ['zh', 'ja', 'ko']
+                if line_lang in langs:  # 遇到一些一个字一个span的文档，这种单字语言判断不准，需要用整行文本判断
+                    para_text += content  # 中文/日语/韩文语境下，content间不需要空格分隔
                 else:
-                    para_text += content + ' '  # 英文语境下 content间需要空格分隔
+                    para_text += content + ' '  # 西方文本语境下 content间需要空格分隔
     return para_text
 
 
@@ -202,7 +215,6 @@ def para_to_standard_format(para, img_buket_path):
                 elif span_type == ContentType.InlineEquation:
                     content = f"${span['content']}$"
                     inline_equation_num += 1
-
                 if language == 'en':  # 英文语境下 content间需要空格分隔
                     para_text += content + ' '
                 else:  # 中文语境下，content间不需要空格分隔