Merge pull request #10 from hiroi-sora/main

一些修改
qwedc001 · Feb 28, 2024 · 26930e3 · 26930e3
2 parents 37b945a + 283fb7e
commit 26930e3
Show file tree

Hide file tree

Showing 4 changed files with 330 additions and 103 deletions.
diff --git a/README.md b/README.md
@@ -1,4 +1,4 @@
-<h1 align="center">适用于 Umi-OCR 文字识别工具 的 Tesseract 插件</h1>
+<h1 align="center">适用于 Umi-OCR 文字识别工具 的 TesseractOCR 插件</h1>
 
 <p align="center">
   <a href="https://github.com/qwedc001/tesseractOCR_umi_plugin/releases/latest">
@@ -9,11 +9,33 @@
   </a>
 </p>
 
+## 插件说明
+
+将本插件加载进 [Umi-OCR](https://github.com/hiroi-sora/Umi-OCR) 即可使用。
+
+与其他插件（如PaddleOCR）相比， TesseractOCR 有这些 **优点** ：
+
+- 👍 英文语言的识别准确率高，且不易出现空格丢失现象。
+- 👍 自带段落分析模型，对书籍/论文排版的识别精度非常高。
+- 👍 允许同时勾选多个语言库（如中文+英文+日文）进行识别。
+- 👍 使用 fast 模型库时，识别速度比 Paddle 更快。
+
+TesseractOCR 有这些 **缺点** ：
+
+- 🙁 英文以外的语言（包括中文、日文……），准确度较差。
+
+TesseractOCR 的 **适用场景** ：
+
+- 纯英文内容。
+- 需要解析文章排版，如PDF识别时。
+
 ## 开始使用
 
 ### 对于用户
 
-下载 release 中已经打包好的插件，放入 Umi-OCR 的 plugins 文件夹中即可使用。
+1. 下载 release 中已经打包好的插件，放入 `Umi-OCR/UmiOCR-data/plugins` 文件夹中。
+2. 启动 Umi-OCR ，将 **全局设置** → **文字识别** → **当前接口** 改为 `TesseractOCR` ，然后 **点击** `应用修改` 。
+3. 在各个标签页（如`批量OCR`）中，将 **设置** → **排版解析方案** 改为 `不做处理` ，以便启用TesseractOCR自带的排版解析模型。
 
 ### 对于开发者
 
@@ -35,17 +57,11 @@ pip install Pillow,pytesseract
 
 release 包中内置有中英日以及数学识别语言库，如果您所需的语言不在其中，您可以前往 [Tesseract_Fast](https://github.com/tesseract-ocr/tessdata_fast) 或者 [Tesseract_best](https://github.com/tesseract-ocr/tessdata_best) 寻找您所需要的语言库，下载后将其放入 engine/tessdata 文件夹中即可。
 
-## 关于 Umi-OCR 项目结构
-
-### 各仓库：
-
-- [主仓库](https://github.com/hiroi-sora/Umi-OCR)
-- [插件库](https://github.com/hiroi-sora/Umi-OCR_plugins) -> [本插件项目](https://github.com/qwedc001/tesseractOCR_umi_plugin)👈
-- [Win 运行库](https://github.com/hiroi-sora/Umi-OCR_runtime_windows)
-
 ### 工程结构：
 
-`**` 后缀表示本仓库(`插件仓库`)包含的内容。
+`**` 后缀表示本仓库(`插件仓库`)包含的代码文件。
+
+其他文件请在Release包中获取。
 
 ```
 tesseractOCR_umi_plugin

diff --git a/api_tesseractocr.py b/api_tesseractocr.py
@@ -1,106 +1,168 @@
 import os
-import sys
 import site
 import base64
 from PIL import Image
 from io import BytesIO
 import traceback
+import unicodedata
 
 # 当前目录
 CurrentDir = os.path.dirname(os.path.abspath(__file__))
 # 依赖包目录
 SitePackages = os.path.join(CurrentDir, "site-packages")
 
-ModelDir = os.path.join(CurrentDir,"engine/tessdata/")
+ModelDir = os.path.join(CurrentDir, "engine/tessdata/")
+
 
 class Api:
     def __init__(self, globalArgd):
         self.tesseractOcr = None
-        self.accuracy = float(globalArgd['accur'])
+        self.accuracy = float(globalArgd["accur"])
 
-    def get_select_languages(self,argd) -> list:
+    def get_select_languages(self, argd) -> list:
         selects = []
-        for k,flag in argd.items():
+        for k, flag in argd.items():
             if k.startswith("language.") and flag:
                 language = k[9:]
-                if (language == 'chi_sim' or language == "chi_tra") and argd['vert']:
-                        selects.append(language+"_vert")
+                if (language == "chi_sim" or language == "chi_tra") and argd["vert"]:
+                    selects.append(language + "_vert")
                 selects.append(language)
         return selects
 
-    # 获取两个连续单词的分隔符。letter1为单词1结尾字母，letter2为单词2结尾字母
-    def _word_separator(self, letter1, letter2):
-        # 判断结尾和开头，是否属于汉藏语族
-        # 汉藏语族：行间无需分割符。印欧语族：则两行之间需加空格。
-        ranges = [
-            (0x4E00, 0x9FFF),  # 汉字
-            (0x3040, 0x30FF),  # 日文
-            (0xAC00, 0xD7AF),  # 韩文
-            (0xFF01, 0xFF5E),  # 全角字符
-        ]
-        fa, fb = False, False
-        for l, r in ranges:
-            if l <= ord(letter1) <= r:
-                fa = True
-            if l <= ord(letter2) <= r:
-                fb = True
-        if fa and fb: # 两个字符都是汉藏语族，才没有空格
+    @staticmethod  # 按 key 取一行的内容
+    def _get_r(row, key):
+        tessKey = {  # TesseractOCR 结果表格下标与键的映射
+            "level": 0,
+            "page_num": 1,
+            "block_num": 2,
+            "par_num": 3,
+            "line_num": 4,
+            "word_num": 5,
+            "left": 6,
+            "top": 7,
+            "width": 8,
+            "height": 9,
+            "conf": 10,
+            "text": 11,
+        }
+        if key == "text":
+            return str(row[tessKey[key]])
+        elif key == "conf":
+            return float(row[tessKey[key]])
+        else:
+            return int(row[tessKey[key]])
+
+    @staticmethod  # 传入前句尾字符和后句首字符，返回分隔符
+    def _word_separator(letter1, letter2):
+
+        # 判断Unicode字符是否属于中文、日文或韩文字符集
+        def is_cjk(character):
+            cjk_unicode_ranges = [
+                (0x4E00, 0x9FFF),  # 中文
+                (0x3040, 0x30FF),  # 日文
+                (0x1100, 0x11FF),  # 韩文
+                (0x3130, 0x318F),  # 韩文兼容字母
+                (0xAC00, 0xD7AF),  # 韩文音节
+                # 全角符号
+                (0x3000, 0x303F),  # 中文符号和标点
+                (0xFE30, 0xFE4F),  # 中文兼容形式标点
+                (0xFF00, 0xFFEF),  # 半角和全角形式字符
+            ]
+            return any(
+                start <= ord(character) <= end for start, end in cjk_unicode_ranges
+            )
+
+        if is_cjk(letter1) and is_cjk(letter2):
             return ""
 
-        # 特殊情况：字母2为缩写，如 n't。或者字母2为结尾符号，意味着OCR错误分割。
-        if letter2 in {r"'", ",", ".", "!", "?", ";", ":"}:
+        # 特殊情况：前文为连字符。
+        if letter1 == "-":
+            return ""
+        # 特殊情况：后文为任意标点符号。
+        if unicodedata.category(letter2).startswith("P"):
             return ""
-        # 其它正常情况，如 俩单词 或 一单词一汉字，加空格
+        # 其它正常情况加空格
         return " "
-
-    def calcBox(self,left,right):
+
+    @staticmethod  # 测试用：打印结果表格
+    def _test_print_table(res):
+        # ['level', 'page_num', 'block_num', 'par_num', 'line_num', 'word_num', 'left', 'top', 'width', 'height', 'conf', 'text']
+        print("原始输出：\n")
+        # 计算每列的最大宽度
+        col_widths = [max(len(str(item)) for item in col) for col in zip(*res)]
+        for row in res:
+            s = " ".join(str(item).ljust(col_widths[i]) for i, item in enumerate(row))
+            print(s)
+
+    @staticmethod  # 测试用：打印结果字典
+    def _test_print_datas(datas):
+        for d in datas:
+            print(f'{d["score"]:.3f}|{d["text"]}|【{repr(d["end"])}】')
+
+    def calcBox(self, left, right):
         topLeft = left[0]
         topRight = right[1] if right else left[1]
         bottomLeft = left[3]
         bottomRight = right[2] if right else left[2]
-        return [topLeft,topRight, bottomRight, bottomLeft]
+        return [topLeft, topRight, bottomRight, bottomLeft]
 
-    def standardize(self,res):
-        # ['level', 'page_num', 'block_num', 'par_num', 'line_num', 'word_num', 'left', 'top', 'width', 'height', 'conf', 'text']
+    def standardize(self, res):
+        # self._test_print_table(res)
         datas = []
-        curString = ""
-        curLeftBox = None
-        curRightBox = None
-        scores = []
-        for item in res[2:]: # 第一行为固定的提示表头
-            text = item[11]
-            score = float(item[10])
-            level = int(item[0]) # level 为 5 时为单词，依据此进行组句
-            left,top,width,height = int(item[6]), int(item[7]), int(item[8]), int(item[9])
-            topLeft = [left,top]
-            topRight = [left+width,top]
-            bottomLeft = [left,top+height]
-            bottomRight = [left+width,top+height]
-            box = [topLeft,topRight, bottomRight, bottomLeft]
-            if level != 5 and len(scores) != 0 and not curString.isspace():
-                final = 0
-                for i in range(len(scores)):
-                    final += scores[i]
-                datas.append({"text": curString, "score": final / len(scores), "box": self.calcBox(curLeftBox,curRightBox), "end": ''})
-                curRightBox = None
-                curLeftBox = None
-                scores = []
-                curString = ""
-            if level == 3 and len(datas):
-                datas[-1]["end"]='\n'
+        # 当前行的信息
+        data = None
+        text = ""
+        score = 0
+        num = 0
+        last_level = -1
+        # 遍历所有行
+        for index in range(1, len(res)):
+            row = res[index]
+            level = self._get_r(row, "level")
+            # 结束上一行
+            if last_level == 5 and level != 5:
+                if not text.isspace() or not text:  # 跳过纯空格或空行
+                    data["text"] = text
+                    data["score"] = score / (max(num, 1) * 100)
+                    # 若 level 不是 line ，说明新一行不属于同一自然段，结尾要换行
+                    if level != 4:
+                        data["end"] = "\n"
+                    datas.append(data)
+            # 发现新的一行
+            if level == 4:
+                left = self._get_r(row, "left")
+                top = self._get_r(row, "top")
+                width = self._get_r(row, "width")
+                height = self._get_r(row, "height")
+                data = {
+                    "box": [
+                        [left, top],
+                        [left + width, top],
+                        [left + width, top + height],
+                        [left, top + height],
+                    ],
+                }
+                score = 0
+                num = 0
+                text = ""
+            # 补充当前行
             if level == 5:
-                if score <= self.accuracy:
-                    continue
-                if curString == "": # 开头不做处理
-                    curLeftBox = box
-                    curString = text
-                else:
-                    curRightBox = box
-                    curString += self._word_separator(curString[-1],text[-1])+text
-                scores.append(score)
-                continue
-            else: # 多个非 level5 相连则不做处理，直接跳过即可
+                sep = ""
+                now_text = self._get_r(row, "text")
+                if text and now_text:  # 获取间隔符
+                    sep = self._word_separator(text[-1], now_text[0])
+                text += sep + now_text
+                score += self._get_r(row, "conf")
+                num += 1
+            last_level = level
+        # 遍历所有结果，补充 ["end"] 参数
+        for index in range(len(datas) - 1):
+            d1 = datas[index]
+            if "end" in d1:  # 跳过已有
                 continue
+            d2 = datas[index + 1]  # 下一行
+            d1["end"] = self._word_separator(d1["text"][-1], d2["text"][0])
+        # self._test_print_datas(datas)
         if datas:
             out = {"code": 100, "data": datas}
         else:
@@ -109,16 +171,21 @@ def standardize(self,res):
 
     # 获取OcrHandle 实例
     def start(self, argd):
-        self.psm = "--psm 3" if argd['psm'] else "--psm 6" # psm 3: 自动分页 psm6: 单文本块分页  magic number来源：tesseract docs
+        self.psm = (
+            "--psm 3" if argd["psm"] else "--psm 6"
+        )  # psm 3: 自动分页 psm6: 单文本块分页  magic number来源：tesseract docs
         try:
             langs = self.get_select_languages(argd)
             self.languages = "+".join(langs)
             if self.tesseractOcr:  # 引擎已启动，则跳过再启动
                 return ""
             site.addsitedir(SitePackages)  # 依赖库到添加python搜索路径
             import pytesseract
-            pytesseract.pytesseract.tesseract_cmd = os.path.join(CurrentDir,'engine/tesseract.exe')
-            self.tesseractOcr=pytesseract
+
+            pytesseract.pytesseract.tesseract_cmd = os.path.join(
+                CurrentDir, "engine/tesseract.exe"
+            )
+            self.tesseractOcr = pytesseract
             return ""
         except Exception as e:
             self.tesseractOcr = None
@@ -133,8 +200,17 @@ def _run(self, img: Image):
             res = {"code": 201, "data": "tesseractOcr not initialized."}
         else:
             try:
-                res = [item.split('\t') for item in self.tesseractOcr.image_to_data(img, lang=self.languages,config=self.psm).split('\n')][:-1] # TODO: 此处tesseract docs实际上给出的command line example很少，所以此处的config以最重要的psm先代替，其他的需要再多研究一下docs再加入
-                res.append([-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,""]) # 确保所有的文字都被正确append
+                res = [
+                    item.split("\t")
+                    for item in self.tesseractOcr.image_to_data(
+                        img, lang=self.languages, config=self.psm
+                    ).split("\n")
+                ][
+                    :-1
+                ]  # TODO: 此处tesseract docs实际上给出的command line example很少，所以此处的config以最重要的psm先代替，其他的需要再多研究一下docs再加入
+                res.append(
+                    [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, ""]
+                )  # 确保所有的文字都被正确append
                 res = self.standardize(res)
             except Exception as e:
                 traceback.print_exc()

diff --git a/i18n.csv b/i18n.csv
@@ -6,5 +6,9 @@ TesseractOCR（本地）,TesseractOCR (Local),TesseractOCR（本地）,Tesseract
 语言,Language,語言,言語
 请在仅当文本内容包含多语言时再勾选额外识别语言，否则可能会出现识别精度下降问题。,Please check the extra recognition language only when the text content contains multiple languages or the recognition accuracy may be reduced.,僅當文本內容包含多語言時再勾選額外識別語言，否則可能會出現識別精度下降問題。,テキストに複数の言語が含まれている場合にのみ、追加の認識言語をチェックしてください。それ以外の場合、認識精度が低下する可能性があります。
 自动识别排版,Auto detect the layout,自動識別排版,レイアウトの自動検出
-设置分段格式为自动识别多块文本块排版格式，否则采用单文本块格式识别（只建议在确定无多栏识别场景时关闭，否则可能会出现识别排版错误）,Set the paragraph format to automatically detect the layout of multiple text blocks or the single text block format will be used for recognition (It is only recommended to turn off when it is determined that there is no multi-column recognition scene or the recognition layout may be incorrect),將分段格式設置為自動識別多塊文本塊排版格式，否則採用單文本塊格式識別（只建議在確定無多欄識別場景時關閉，否則可能會出現識別排版錯誤）,段落の形式を複数のテキストブロックのレイアウトを自動的に検出するか、単一のテキストブロックの形式を認識するかに設定します（複数の列認識シーンがないことが確認されている場合にのみオフにすることをお勧めします。それ以外の場合、認識レイアウトが正しくない可能性があります）
+"除非图像中只有一句文本，否则应保持开启。
同时，建议将[排版解析方案]设为“不做处理”。","Keep this option enabled unless the image contains only a single sentence of text.
Also, it is recommended to set the [Layout Analysis Scheme] to ""No processing"".","除非影像中只有一句文字，否則應保持開啟。
同時，建議將[排版解析方案]設為“不做處理”。","画像内のテキストが1文だけの場合を除き、有効のままにしてください。
+また、［組版解析スキーム］を「処理しない」にすることをお勧めします。"
 开启竖版识别,Enable vertical text recognition,開啟豎版識別,垂直テキスト認識を有効にする