在热词替换之后再次调整中英文之间的空格，避免中英文互相替换后出现空格异常。

HaujetZhao · Dec 17, 2023 · bc795df · bc795df
1 parent dadfce9
commit bc795df
Show file tree

Hide file tree

Showing 3 changed files with 52 additions and 39 deletions.
diff --git a/core_client.py b/core_client.py
@@ -1,5 +1,6 @@
 # coding: utf-8
 
+from globs_var import *
 import os
 import sys
 import platform
@@ -348,6 +349,12 @@ async def do_recognize():
         decoding_results = hot_sub_en.热词替换(decoding_results)
     if hot_rule: 
         decoding_results = hot_sub_rule.热词替换(decoding_results)
+
+    result_0 = decoding_results
+
+    # 调整中英空格排版
+    if format_spell:
+        decoding_results = result_1 = en_in_zh.sub(adjust_space, decoding_results)
 
     # 打印结果
     if paste:   
@@ -362,7 +369,9 @@ async def do_recognize():
         keyboard.write(decoding_results)
 
     # 终端显示结果
-    console.print(f'识别结果：[green4]{decoding_results}')
+    console.print(f'识别结果：[green4]{result_0}')
+    if format_spell:
+        console.print(f'    调整中英空格排版：{result_1}')
     console.print(f'    录音时长：{len(samples1) / 16000: >8.2f}s')
     console.print(f'    识别时长：{t2 - t1: >8.2f}s')
     console.print(f'    Real Time Factor: {(t2-t1) / (len(samples1)/16000): >5.2f}')

diff --git a/core_server.py b/core_server.py
@@ -1,4 +1,5 @@
 
+from globs_var import *
 from os import path, sep, mkdir, makedirs, getcwd, chdir
 import sys
 if 'BASE_DIR' not in globals():
@@ -15,8 +16,6 @@
 from pathlib import Path
 import time
 import asyncio
-import re
-from string import digits, ascii_letters
 
 import numpy as np
 import websockets
@@ -33,7 +32,6 @@
 
 format_num      = True      # 输出时是否将中文数字转为阿拉伯数字
 format_punc     = True      # 输出时是否启用标点符号引擎（在 MacOS 上标点引擎似乎有问题，应当改为 False）
-format_spell    = True      # 输出时是否调整中英之间的空格
 
 model_dir = Path() / 'models'
 paraformer_path = Path() / 'models' / 'paraformer-offline-zh' / 'model.onnx'
@@ -66,41 +64,6 @@ class args:
 
 # ========================================================================
 
-en_in_zh = re.compile(r"""(?ix)    # i 表示忽略大小写，x 表示开启注释模式
-    ([\u4e00-\u9fa5]|[a-z0-9]+\s)?      # 左侧是中文，或者英文加空格
-    ([a-z0-9 ]+)                    # 中间是一个或多个「英文数字加空格」
-    ([\u4e00-\u9fa5]|[a-z0-9]+)?       # 右是中文，或者英文加空格
-""")
-
-def adjust_space(original: re.Match):
-    left : str = original.group(1)
-    center : str = original.group(2)
-    right : str = original.group(3)
-    # 如果拼写字母中间有空格，就把空格都去掉
-    if center:
-        final = re.sub(r'((\d) )?(\b\w) ?(?!\w{2})', r'\2\3', center).strip()
-        # 测试地址 https://regex101.com/r/1Vtu7V/1
-        # final = re.sub(r'(\b\w) (?!\w{2})', r'\1', original.group(2)).strip()
-
-    # 如果英文的左边有汉字或英文，给两组之间加上空格
-    if left :
-        if left.strip(digits) == left and center.lstrip(digits) == center :  # 左侧结尾不是数字，中间开头不是数字
-            final = ' ' + final
-        final = left.rstrip() + final
-
-    # 如果英文左边的汉字被前一个组消费了，就要手动去看一下前一个字是不是中文
-    elif re.match(r'[\u4e00-\u9fa5]', original.string[original.start(2) - 1]): 
-        if center.lstrip(digits) == center:     # 确保中间开头不是数字
-            final = ' ' + final
-
-    # 如果英文的右边有汉字，给中英之间加上空格
-    if right:
-        if center.rstrip(digits) == center:     # 确保中间结尾不是数字
-            final += ' '
-        final += right.lstrip()
-
-    return final
-
 async def ws_serve(websocket, path):
     global loop
     global format_num, format_punc, format_spell

diff --git a/globs_var.py b/globs_var.py
@@ -0,0 +1,41 @@
+format_spell    = True      # 输出时是否调整中英之间的空格
+
+import re
+from string import digits, ascii_letters
+
+# ========================================================================
+
+en_in_zh = re.compile(r"""(?ix)    # i 表示忽略大小写，x 表示开启注释模式
+    ([\u4e00-\u9fa5]|[a-z0-9]+\s)?      # 左侧是中文，或者英文加空格
+    ([a-z0-9 ]+)                    # 中间是一个或多个「英文数字加空格」
+    ([\u4e00-\u9fa5]|[a-z0-9]+)?       # 右是中文，或者英文加空格
+""")
+
+def adjust_space(original: re.Match):
+    left : str = original.group(1)
+    center : str = original.group(2)
+    right : str = original.group(3)
+    # 如果拼写字母中间有空格，就把空格都去掉
+    if center:
+        final = re.sub(r'((\d) )?(\b\w) ?(?!\w{2})', r'\2\3', center).strip()
+        # 测试地址 https://regex101.com/r/1Vtu7V/1
+        # final = re.sub(r'(\b\w) (?!\w{2})', r'\1', original.group(2)).strip()
+
+    # 如果英文的左边有汉字或英文，给两组之间加上空格
+    if left :
+        if left.strip(digits) == left and center.lstrip(digits) == center :  # 左侧结尾不是数字，中间开头不是数字
+            final = ' ' + final
+        final = left.rstrip() + final
+
+    # 如果英文左边的汉字被前一个组消费了，就要手动去看一下前一个字是不是中文
+    elif re.match(r'[\u4e00-\u9fa5]', original.string[original.start(2) - 1]): 
+        if center.lstrip(digits) == center:     # 确保中间开头不是数字
+            final = ' ' + final
+
+    # 如果英文的右边有汉字，给中英之间加上空格
+    if right:
+        if center.rstrip(digits) == center:     # 确保中间结尾不是数字
+            final += ' '
+        final += right.lstrip()
+
+    return final