V0.2.0: support keeping the original text alongside the translation, add translation error handling, add Word and PowerPoint support
infrost committed Aug 17, 2024
1 parent 92cd3b1 commit 32d95bd
Showing 11 changed files with 332 additions and 73 deletions.
Binary file modified Lib/__pycache__/__init__.cpython-312.pyc
Binary file not shown.
Binary file modified Lib/__pycache__/compose.cpython-312.pyc
Binary file not shown.
Binary file added Lib/__pycache__/config.cpython-312.pyc
Binary file not shown.
Binary file modified Lib/__pycache__/data_process.cpython-312.pyc
Binary file not shown.
Binary file modified Lib/__pycache__/extract.cpython-312.pyc
Binary file not shown.
127 changes: 121 additions & 6 deletions Lib/compose.py
@@ -1,7 +1,8 @@
import os
import zipfile
import xml.etree.ElementTree as ET
from lxml import etree as ET
import tempfile
from Lib.config import config

def read_strings_from_file(file_path):
with open(file_path, 'r', encoding='utf-8') as f:
@@ -20,7 +21,8 @@ def update_shared_strings_in_xlsx(file_path, strings):
print("sharedStrings.xml 文件不存在。")
return

# Parse the XML file
# Parse the XML file, preserving the original formatting
#parser = ET.XMLParser(remove_blank_text=True, recover=True)
tree = ET.parse(shared_strings_path)
root = tree.getroot()

@@ -30,14 +32,20 @@ def update_shared_strings_in_xlsx(file_path, strings):
# Make sure the number of provided strings matches the number of <t> tags
if len(strings) != len(t_elements):
print(f"Warning: the number of provided strings ({len(strings)}) does not match the number of existing <t> tags ({len(t_elements)}).")
else:
print("All tags matched")

# Replace the text of the <t> tags in order
for t_element, new_string in zip(t_elements, strings):
t_element.text = new_string
if config.get("save_original", False): # 检查 config["save_original"] 的值
t_element.text = t_element.text + new_string if t_element.text else new_string
else:
t_element.text = new_string

# Write the modified XML back to the file without changing the formatting
with open(shared_strings_path, 'wb') as f:
tree.write(f, xml_declaration=True, encoding='UTF-8', pretty_print=False)

# Write the modified XML back to the file
tree.write(shared_strings_path, xml_declaration=True, encoding='UTF-8')

# Zip the modified files back into an .xlsx
new_xlsx_path = file_path.replace('.xlsx', '_translated.xlsx')

@@ -52,6 +60,98 @@ def update_shared_strings_in_xlsx(file_path, strings):
arcname = os.path.relpath(file_path, tmpdirname)
zip_ref.write(file_path, arcname)

def update_shared_strings_in_docx(file_path, strings):
with tempfile.TemporaryDirectory() as tmpdirname:
with zipfile.ZipFile(file_path, 'r') as zip_ref:
zip_ref.extractall(tmpdirname)

shared_strings_path = os.path.join(tmpdirname, 'word', 'document.xml')

if not os.path.exists(shared_strings_path):
print("错误:document.xml 文件不存在。")
return

parser = ET.XMLParser(remove_blank_text=True)
tree = ET.parse(shared_strings_path, parser)
root = tree.getroot()

namespaces = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}
t_elements = root.xpath('//w:t', namespaces=namespaces)

if len(strings) != len(t_elements):
print(f"警告: 提供的字符串数量 ({len(strings)}) 与现有 <t> 标签数量 ({len(t_elements)}) 不匹配。")

for t_element, new_string in zip(t_elements, strings):
if config.get("save_original", False):
t_element.text = (t_element.text or '') + new_string
else:
t_element.text = new_string

tree.write(shared_strings_path, xml_declaration=True, encoding='UTF-8', pretty_print=True)

new_docx_path = file_path.replace('.docx', '_translated.docx')

if os.path.exists(new_docx_path):
os.remove(new_docx_path)

with zipfile.ZipFile(new_docx_path, 'w') as zip_ref:
for foldername, subfolders, filenames in os.walk(tmpdirname):
for filename in filenames:
file_path = os.path.join(foldername, filename)
arcname = os.path.relpath(file_path, tmpdirname)
zip_ref.write(file_path, arcname)

def update_shared_strings_in_pptx(file_path, strings):
with tempfile.TemporaryDirectory() as tmpdirname:
with zipfile.ZipFile(file_path, 'r') as zip_ref:
zip_ref.extractall(tmpdirname)

slides_dir = os.path.join(tmpdirname, 'ppt', 'slides')

if not os.path.exists(slides_dir):
print("错误:slides 目录不存在。")
return

parser = ET.XMLParser(remove_blank_text=True)
slide_files = sorted([f for f in os.listdir(slides_dir) if f.endswith('.xml')])

string_index = 0

for slide_file in slide_files:
slide_path = os.path.join(slides_dir, slide_file)
tree = ET.parse(slide_path, parser)
root = tree.getroot()

namespaces = {'a': 'http://schemas.openxmlformats.org/drawingml/2006/main'}
t_elements = root.xpath('//a:t', namespaces=namespaces)

for t_element in t_elements:
if string_index < len(strings):
t_element.text = strings[string_index]
string_index += 1
else:
break

# Write the updated content back to the slide file
tree.write(slide_path, xml_declaration=True, encoding='UTF-8', pretty_print=True)

if string_index < len(strings):
print(f"警告: 提供的字符串数量 ({len(strings)}) 超过了现有 <a:t> 标签数量 ({string_index}),未能完全替换。")

new_pptx_path = file_path.replace('.pptx', '_translated.pptx')

if os.path.exists(new_pptx_path):
os.remove(new_pptx_path)

with zipfile.ZipFile(new_pptx_path, 'w') as zip_ref:
for foldername, subfolders, filenames in os.walk(tmpdirname):
for filename in filenames:
file_path = os.path.join(foldername, filename)
arcname = os.path.relpath(file_path, tmpdirname)
zip_ref.write(file_path, arcname)

print(f"更新后的文件已保存为: {new_pptx_path}")

def compose_file(file_type, input_path):

result_file_path = './out/translated_result.txt'
@@ -64,3 +164,18 @@ def compose_file(file_type, input_path):
update_shared_strings_in_xlsx(input_path, result_strings)
print(f"生成翻译后的电子表格...")

if file_type == "Word":
update_shared_strings_in_docx(input_path, result_strings)
print(f"生成翻译后的word文档...")

if file_type == "PowerPoint":
update_shared_strings_in_pptx(input_path, result_strings)
print(f"生成翻译后的ppt文档...")

def dev_mode():
file_type = "PowerPoint"
input_path = r"C:\Users\skyts\Desktop\开会班组员工.pptx"
compose_file (file_type, input_path)
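
For orientation, a minimal usage sketch of the new dispatch in compose_file (a hedged example: the file paths are illustrative, and the type string used by the spreadsheet branch sits outside this hunk, so only the two branches added here are shown):

from Lib.compose import compose_file

# compose_file reads ./out/translated_result.txt and writes the result
# next to the input file as <name>_translated.docx / .pptx.
compose_file("Word", "./samples/report.docx")        # illustrative path
compose_file("PowerPoint", "./samples/slides.pptx")  # illustrative path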



29 changes: 29 additions & 0 deletions Lib/config.py
@@ -0,0 +1,29 @@
import os
import json

# Default configuration
default_config = {
"save_original": False, #保留原文件值
"version": "0.2.0",
"dev": False
}

# Path to the configuration file
config_path = './config.json'

def load_config():
"""加载配置,如果config.json不存在则创建并使用默认配置。"""
if not os.path.exists(config_path):
# If it does not exist, create config.json and write the default configuration
with open(config_path, 'w') as config_file:
json.dump(default_config, config_file, indent=4)
return default_config
else:
# If it exists, load config.json and merge it over the defaults
with open(config_path, 'r') as config_file:
user_config = json.load(config_file)
# Merge user values over the defaults
return {**default_config, **user_config}

# Load the configuration at import time for external use
config = load_config()
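
As a quick sketch of how the merged configuration is consumed elsewhere (mirroring the config.get("save_original", False) check added to Lib/compose.py above; the print lines are illustrative only):

from Lib.config import config

# config is the defaults dict overridden by any values found in ./config.json
print(config["version"])                      # "0.2.0" unless overridden
keep_original = config.get("save_original", False)
print("Keep original text:", keep_original)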
65 changes: 47 additions & 18 deletions Lib/data_process.py
@@ -21,6 +21,7 @@
import httpx
import json
import os
import time
output_dir = './out'
os.makedirs(output_dir, exist_ok=True)

@@ -56,32 +57,60 @@ def process_file(file_path, source_lang, target_lang):
strings_array.append('\n'.join(current_string))
alternative_index = 0 # used to name the alternatives files
with open('./out/translated_result.txt', 'w', encoding='utf-8') as result_file:
process_block_count = 1
for s in strings_array:
print(f"正在处理...\n{s}")
print(f"正在处理第{process_block_count}个块...")
process_block_count = process_block_count + 1
json_array = str(s)
data = {
"text": s,
"source_lang": source_lang,
"target_lang": target_lang
}
post_data = json.dumps(data)
try:
# Send the POST request and print the result
r = httpx.post(url=deeplx_api, data=post_data)
response_data = r.json()

# Save the 'data' content to translated_result.txt
result_file.write(response_data['data'] + '\n')
print(f"Received data {response_data}")
retry_count = 0
max_retries = 5
success = False
while retry_count < max_retries:
try:
# Send the POST request and print the result
r = httpx.post(url=deeplx_api, data=post_data)
response_data = r.json()

# Check whether 'data' is present
if 'data' in response_data:
# Save the 'data' content to translated_result.txt
result_file.write(response_data['data'] + '\n')
success = True
#print(f"Received data {response_data}")

# If alternatives exist, save each one to a separate file
if "alternatives" in response_data and response_data["alternatives"] is not None:
alternatives = response_data["alternatives"]
print(alternatives)
for alternative in alternatives:
with open(f'./out/alternatives({alternative_index}).txt', 'w', encoding='utf-8') as alt_file:
alt_file.write(alternative + '\n')
alternative_index += 1
break # exit the retry loop after a successful request

else:
raise ValueError("未找到返回数据,可能是因为请求过于频繁")

except (httpx.RequestError, ValueError) as exc:
retry_count += 1
if retry_count < max_retries:
print(f"Error occurred: {exc}. Retrying in 2 seconds... ({retry_count}/{max_retries})")
time.sleep(2)
else:
print(f"Failed after {max_retries} retries. Moving on to the next string.")

if not success:
error_lines = s.splitlines()
error_messages = [f"Error: {error_line}" for error_line in error_lines]
error_message = "\n".join(error_messages) + "\n"
result_file.write(error_message)
print(f"Failed after {max_retries} retries. Moving on to the next string.")

# If alternatives exist, save each one to a separate file
if "alternatives" in response_data and response_data["alternatives"] is not None :
alternatives = response_data["alternatives"]
print(alternatives)
for alternative in alternatives:
with open(f'./out/alternatives({alternative_index}).txt', 'w', encoding='utf-8') as alt_file:
alt_file.write(alternative + '\n')
alternative_index += 1

except httpx.RequestError as exc:
print(f"An error occurred while requesting {exc.request.url!r}.")
