Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: replace \u0002, \u0003 in common text #521

Merged
merged 2 commits into from
Sep 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,10 @@ tmp/
tmp
.vscode
.vscode/
/tests/
ocr_demo

/app/common/__init__.py
/magic_pdf/config/__init__.py
source.dev.env

tmp
21 changes: 21 additions & 0 deletions magic_pdf/dict2md/ocr_mkcontent.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,20 @@
import re


def __is_hyphen_at_line_end(line):
    """Report whether ``line`` ends with an ASCII letter followed by a hyphen.

    Whitespace after the hyphen (spaces, tabs, a trailing newline) is
    ignored. Only the ASCII letters A-Z and a-z count as letters, so a digit,
    a symbol, or a non-ASCII character directly before the hyphen does not
    qualify.

    Args:
        line (str): The text to check.

    Returns:
        bool: True if the text ends with a letter immediately followed by a
            hyphen (optionally followed by whitespace), False otherwise.
    """
    # One letter before the hyphen is enough: requiring "one or more" letters
    # accepts exactly the same strings under re.search, but makes the engine
    # re-scan every letter run it meets before it can fail.
    return re.search(r'[A-Za-z]-\s*$', line) is not None


def split_long_words(text):
segments = text.split(' ')
for i in range(len(segments)):
Expand Down Expand Up @@ -184,10 +198,17 @@ def detect_language(text):
content = f" ${span['content']}$ "
elif span_type == ContentType.InterlineEquation:
content = f"\n$$\n{span['content']}\n$$\n"

if content != '':
langs = ['zh', 'ja', 'ko']
if line_lang in langs: # 遇到一些一个字一个span的文档,这种单字语言判断不准,需要用整行文本判断
para_text += content # 中文/日语/韩文语境下,content间不需要空格分隔
elif line_lang == 'en':
# 如果是前一行带有-连字符,那么末尾不应该加空格
if __is_hyphen_at_line_end(para_text):
para_text += content
else:
para_text += content + ' '
else:
para_text += content + ' ' # 西方文本语境下 content间需要空格分隔
return para_text
Expand Down
19 changes: 18 additions & 1 deletion magic_pdf/pdf_parse_union_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,23 @@ def remove_horizontal_overlap_block_which_smaller(all_bboxes):
return is_useful_block_horz_overlap, all_bboxes


def __replace_STX_ETX(text_str: str) -> str:
    """Replace the control characters U+0002 and U+0003 with apostrophes.

    Per the original author's note, these characters show up as garbage in
    text extracted with pymupdf and were quotation marks in the source
    document. The issue has so far only been observed in English text, not
    in Chinese text.

    Args:
        text_str (str): Raw extracted text.

    Returns:
        str: The text with every U+0002 and U+0003 replaced by ``'``. Empty
            or otherwise falsy input (e.g. ``None``) is returned unchanged.
    """
    if not text_str:
        return text_str
    # Map both control characters in a single pass over the string.
    quote_table = str.maketrans({'\u0002': "'", '\u0003': "'"})
    return text_str.translate(quote_table)


def txt_spans_extract(pdf_page, inline_equations, interline_equations):
text_raw_blocks = pdf_page.get_text("dict", flags=fitz.TEXTFLAGS_TEXT)["blocks"]
char_level_text_blocks = pdf_page.get_text("rawdict", flags=fitz.TEXTFLAGS_TEXT)[
Expand All @@ -63,7 +80,7 @@ def txt_spans_extract(pdf_page, inline_equations, interline_equations):
spans.append(
{
"bbox": list(span["bbox"]),
"content": span["text"],
"content": __replace_STX_ETX(span["text"]),
"type": ContentType.Text,
"score": 1.0,
}
Expand Down
28 changes: 28 additions & 0 deletions tests/test_para/test_hyphen_at_line_end.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@

from magic_pdf.dict2md.ocr_mkcontent import __is_hyphen_at_line_end


def test_hyphen_at_line_end():
    """Check whether a line is detected as ending with a hyphen."""
    # Lines that end with an ASCII letter followed by a hyphen.
    should_match = (
        "I am zhang-",
        "you are zhang- ",
        "math-",
        "This is a TEST-",
        "This is a TESTing-",
        "美国人 hello-",
    )
    # Lines whose hyphen is preceded by a symbol, digit, or CJK character.
    should_not_match = (
        "This is a TEST$-",
        "This is a TEST21-",
        "中国人-",
        "美国人 hello人-",
        "this is 123-",
    )

    expectations = [(text, True) for text in should_match]
    expectations += [(text, False) for text in should_not_match]

    for text, expected in expectations:
        if expected:
            assert __is_hyphen_at_line_end(text)
        else:
            assert not __is_hyphen_at_line_end(text)
Loading