Skip to content

Commit

Permalink
fix(ocr_mkcontent): improve language detection and content formatting (
Browse files Browse the repository at this point in the history
…#458)

Optimize the language detection logic to enhance content formatting. This
change addresses issues with long word segmentation: words longer than 10 characters
(previously 15) are now split. Language detection now uses a threshold: a text is
treated as English when English letters make up at least half of its characters.
Formatting rules for content have been updated to consider a list of languages (initially
including Chinese, Japanese, and Korean) where no space is added between content segments
for inline equations and text spans, improving the handling of Asian languages.

The impact of these changes includes improved accuracy in language detection, better
segmentation of long words, and more appropriate spacing in content formatting for multiple
languages.
  • Loading branch information
myhloli authored Aug 20, 2024
1 parent f4316f0 commit 66e3ce9
Showing 1 changed file with 19 additions and 7 deletions.
26 changes: 19 additions & 7 deletions magic_pdf/dict2md/ocr_mkcontent.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ def split_long_words(text):
for i in range(len(segments)):
words = re.findall(r'\w+|[^\w]', segments[i], re.UNICODE)
for j in range(len(words)):
if len(words[j]) > 15:
if len(words[j]) > 10:
words[j] = ' '.join(wordninja.split(words[j]))
segments[i] = ''.join(words)
return ' '.join(segments)
Expand Down Expand Up @@ -147,6 +147,18 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):


def merge_para_with_text(para_block):
def detect_language(text):
    """Classify *text* by its share of ASCII letters.

    Returns 'en' when runs of a-z / A-Z account for at least half of
    all characters in *text* (whitespace, digits and punctuation count
    toward the total), "unknown" when they account for less, and
    "empty" for a zero-length *text*.
    """
    # Scan first so a non-string argument fails here, before len().
    ascii_letter_count = sum(map(len, re.findall(r'[a-zA-Z]+', text)))
    total_chars = len(text)
    if total_chars <= 0:
        return "empty"
    return 'en' if ascii_letter_count / total_chars >= 0.5 else "unknown"

para_text = ''
for line in para_block['lines']:
line_text = ""
Expand All @@ -162,7 +174,8 @@ def merge_para_with_text(para_block):
content = ''
if span_type == ContentType.Text:
content = span['content']
language = detect_lang(content)
# language = detect_lang(content)
language = detect_language(content)
if language == 'en': # 只对英文长词进行分词处理,中文分词会丢失文本
content = ocr_escape_special_markdown_char(split_long_words(content))
else:
Expand All @@ -171,12 +184,12 @@ def merge_para_with_text(para_block):
content = f" ${span['content']}$ "
elif span_type == ContentType.InterlineEquation:
content = f"\n$$\n{span['content']}\n$$\n"

if content != '':
if 'zh' in line_lang: # 遇到一些一个字一个span的文档,这种单字语言判断不准,需要用整行文本判断
para_text += content # 中文语境下,content间不需要空格分隔
langs = ['zh', 'ja', 'ko']
if line_lang in langs: # 遇到一些一个字一个span的文档,这种单字语言判断不准,需要用整行文本判断
para_text += content # 中文/日语/韩文语境下,content间不需要空格分隔
else:
para_text += content + ' ' # 英文语境下 content间需要空格分隔
para_text += content + ' ' # 西方文本语境下 content间需要空格分隔
return para_text


Expand All @@ -202,7 +215,6 @@ def para_to_standard_format(para, img_buket_path):
elif span_type == ContentType.InlineEquation:
content = f"${span['content']}$"
inline_equation_num += 1

if language == 'en': # 英文语境下 content间需要空格分隔
para_text += content + ' '
else: # 中文语境下,content间不需要空格分隔
Expand Down

0 comments on commit 66e3ce9

Please sign in to comment.