Skip to content

Commit

Permalink
设置热词替换之后再次调整中英文之间的空格,避免中英相互替换后空格异常。
Browse files Browse the repository at this point in the history
  • Loading branch information
shandianchengzi committed Dec 17, 2023
1 parent dadfce9 commit bc795df
Show file tree
Hide file tree
Showing 3 changed files with 52 additions and 39 deletions.
11 changes: 10 additions & 1 deletion core_client.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# coding: utf-8

from globs_var import *
import os
import sys
import platform
Expand Down Expand Up @@ -348,6 +349,12 @@ async def do_recognize():
decoding_results = hot_sub_en.热词替换(decoding_results)
if hot_rule:
decoding_results = hot_sub_rule.热词替换(decoding_results)

result_0 = decoding_results

# 调整中英空格排版
if format_spell:
decoding_results = result_1 = en_in_zh.sub(adjust_space, decoding_results)

# 打印结果
if paste:
Expand All @@ -362,7 +369,9 @@ async def do_recognize():
keyboard.write(decoding_results)

# 终端显示结果
console.print(f'识别结果:[green4]{decoding_results}')
console.print(f'识别结果:[green4]{result_0}')
if format_spell:
console.print(f' 调整中英空格排版:{result_1}')
console.print(f' 录音时长:{len(samples1) / 16000: >8.2f}s')
console.print(f' 识别时长:{t2 - t1: >8.2f}s')
console.print(f' Real Time Factor: {(t2-t1) / (len(samples1)/16000): >5.2f}')
Expand Down
39 changes: 1 addition & 38 deletions core_server.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@

from globs_var import *
from os import path, sep, mkdir, makedirs, getcwd, chdir
import sys
if 'BASE_DIR' not in globals():
Expand All @@ -15,8 +16,6 @@
from pathlib import Path
import time
import asyncio
import re
from string import digits, ascii_letters

import numpy as np
import websockets
Expand All @@ -33,7 +32,6 @@

# Output-formatting switches and model locations for the server.
format_num = True # Whether to convert Chinese numerals to Arabic digits on output
format_punc = True # Whether to enable the punctuation engine on output (it appears to have problems on macOS; set to False there)
format_spell = True # Whether to adjust the spacing between Chinese and English on output

# Model paths are built from Path(), i.e. relative to the current working directory.
model_dir = Path() / 'models'
paraformer_path = Path() / 'models' / 'paraformer-offline-zh' / 'model.onnx'
Expand Down Expand Up @@ -66,41 +64,6 @@ class args:

# ========================================================================

# Pattern used to find runs of Latin letters / digits that sit next to Chinese
# text or other Latin words, so the spacing around them can be normalised.
#   flags   (?ix): case-insensitive, verbose mode (whitespace and "#" comments
#                  inside the pattern are ignored, except inside [...] classes)
#   group 1 (optional): one CJK character in U+4E00..U+9FA5, or a Latin/digit
#                  word followed by one whitespace character
#   group 2 (required): one or more Latin letters, digits or spaces
#   group 3 (optional): one CJK character, or a Latin/digit word
# The Chinese comments below are part of the pattern string itself and are
# therefore left untouched. NOTE(review): the inline comment on group 3 says
# "English plus space", but the pattern for group 3 has no trailing whitespace.
en_in_zh = re.compile(r"""(?ix) # i 表示忽略大小写,x 表示开启注释模式
([\u4e00-\u9fa5]|[a-z0-9]+\s)? # 左侧是中文,或者英文加空格
([a-z0-9 ]+) # 中间是一个或多个「英文数字加空格」
([\u4e00-\u9fa5]|[a-z0-9]+)? # 右是中文,或者英文加空格
""")

def adjust_space(original: re.Match) -> str:
    """Return the replacement text for one match, with spacing normalised.

    Meant to be passed as the ``repl`` callable to ``en_in_zh.sub``. The match
    is expected to expose three groups: an optional left neighbour (group 1),
    a run of Latin letters / digits / spaces (group 2) and an optional right
    neighbour (group 3).

    Args:
        original: The match object produced by the three-group pattern.

    Returns:
        The rebuilt text for the whole match: separately spelled letters in
        the middle run are joined, and a single space is placed between the
        middle run and its neighbours unless the touching edge is a digit.
    """
    left = original.group(1)    # None when the optional group did not match
    center = original.group(2)
    right = original.group(3)   # None when the optional group did not match

    # Pre-initialised so the function never hits an unbound local, even if it
    # is used with a pattern whose middle group can be empty.
    final = ''
    if center:
        # Join letters that were spelled out one by one ("a b c" -> "abc").
        # A character followed by two or more word characters is left alone.
        # Pattern playground: https://regex101.com/r/1Vtu7V/1
        final = re.sub(r'((\d) )?(\b\w) ?(?!\w{2})', r'\2\3', center).strip()

    if left:
        # A left neighbour was captured: separate it from the middle run with
        # one space, unless stripping digits changes either side.
        if left.strip(digits) == left and center.lstrip(digits) == center:
            final = ' ' + final
        final = left.rstrip() + final
    elif original.start(2) > 0 and re.match(
        r'[\u4e00-\u9fa5]', original.string[original.start(2) - 1]
    ):
        # The Chinese character to the left may already have been consumed by
        # the previous match, so look at the preceding character directly.
        # The start(2) > 0 guard matters: at index 0 there is no preceding
        # character, and index -1 would wrap around to the END of the string.
        if center.lstrip(digits) == center:  # middle run does not start with a digit
            final = ' ' + final

    if right:
        # A right neighbour was captured: add a space unless the middle run
        # ends with a digit.
        if center.rstrip(digits) == center:
            final += ' '
        final += right.lstrip()

    return final

async def ws_serve(websocket, path):
global loop
global format_num, format_punc, format_spell
Expand Down
41 changes: 41 additions & 0 deletions globs_var.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
format_spell = True # Whether to adjust the spacing between Chinese and English on output

import re
from string import digits, ascii_letters

# ========================================================================

# Pattern used to find runs of Latin letters / digits that sit next to Chinese
# text or other Latin words, so the spacing around them can be normalised.
#   flags   (?ix): case-insensitive, verbose mode (whitespace and "#" comments
#                  inside the pattern are ignored, except inside [...] classes)
#   group 1 (optional): one CJK character in U+4E00..U+9FA5, or a Latin/digit
#                  word followed by one whitespace character
#   group 2 (required): one or more Latin letters, digits or spaces
#   group 3 (optional): one CJK character, or a Latin/digit word
# The Chinese comments below are part of the pattern string itself and are
# therefore left untouched. NOTE(review): the inline comment on group 3 says
# "English plus space", but the pattern for group 3 has no trailing whitespace.
en_in_zh = re.compile(r"""(?ix) # i 表示忽略大小写,x 表示开启注释模式
([\u4e00-\u9fa5]|[a-z0-9]+\s)? # 左侧是中文,或者英文加空格
([a-z0-9 ]+) # 中间是一个或多个「英文数字加空格」
([\u4e00-\u9fa5]|[a-z0-9]+)? # 右是中文,或者英文加空格
""")

def adjust_space(original: re.Match) -> str:
    """Return the replacement text for one match, with spacing normalised.

    Meant to be passed as the ``repl`` callable to ``en_in_zh.sub``. The match
    is expected to expose three groups: an optional left neighbour (group 1),
    a run of Latin letters / digits / spaces (group 2) and an optional right
    neighbour (group 3).

    Args:
        original: The match object produced by the three-group pattern.

    Returns:
        The rebuilt text for the whole match: separately spelled letters in
        the middle run are joined, and a single space is placed between the
        middle run and its neighbours unless the touching edge is a digit.
    """
    left = original.group(1)    # None when the optional group did not match
    center = original.group(2)
    right = original.group(3)   # None when the optional group did not match

    # Pre-initialised so the function never hits an unbound local, even if it
    # is used with a pattern whose middle group can be empty.
    final = ''
    if center:
        # Join letters that were spelled out one by one ("a b c" -> "abc").
        # A character followed by two or more word characters is left alone.
        # Pattern playground: https://regex101.com/r/1Vtu7V/1
        final = re.sub(r'((\d) )?(\b\w) ?(?!\w{2})', r'\2\3', center).strip()

    if left:
        # A left neighbour was captured: separate it from the middle run with
        # one space, unless stripping digits changes either side.
        if left.strip(digits) == left and center.lstrip(digits) == center:
            final = ' ' + final
        final = left.rstrip() + final
    elif original.start(2) > 0 and re.match(
        r'[\u4e00-\u9fa5]', original.string[original.start(2) - 1]
    ):
        # The Chinese character to the left may already have been consumed by
        # the previous match, so look at the preceding character directly.
        # The start(2) > 0 guard matters: at index 0 there is no preceding
        # character, and index -1 would wrap around to the END of the string.
        if center.lstrip(digits) == center:  # middle run does not start with a digit
            final = ' ' + final

    if right:
        # A right neighbour was captured: add a space unless the middle run
        # ends with a digit.
        if center.rstrip(digits) == center:
            final += ' '
        final += right.lstrip()

    return final

0 comments on commit bc795df

Please sign in to comment.