V0.2.0: support keeping the original text alongside the translation, add translation error handling, add Word and PowerPoint support
infrost committed Aug 17, 2024
1 parent 92cd3b1 commit 32d95bd
Showing 11 changed files with 332 additions and 73 deletions.
Binary file modified Lib/__pycache__/__init__.cpython-312.pyc
Binary file not shown.
Binary file modified Lib/__pycache__/compose.cpython-312.pyc
Binary file not shown.
Binary file added Lib/__pycache__/config.cpython-312.pyc
Binary file not shown.
Binary file modified Lib/__pycache__/data_process.cpython-312.pyc
Binary file not shown.
Binary file modified Lib/__pycache__/extract.cpython-312.pyc
Binary file not shown.
127 changes: 121 additions & 6 deletions Lib/compose.py
@@ -1,7 +1,8 @@
import os
import zipfile
import xml.etree.ElementTree as ET
from lxml import etree as ET
import tempfile
from Lib.config import config

def read_strings_from_file(file_path):
with open(file_path, 'r', encoding='utf-8') as f:
@@ -20,7 +21,8 @@ def update_shared_strings_in_xlsx(file_path, strings):
print("sharedStrings.xml 文件不存在。")
return

# Parse the XML file
# Parse the XML file, preserving the original formatting
#parser = ET.XMLParser(remove_blank_text=True, recover=True)
tree = ET.parse(shared_strings_path)
root = tree.getroot()

@@ -30,14 +32,20 @@ def update_shared_strings_in_xlsx(file_path, strings):
# Make sure the number of provided strings matches the number of <t> tags
if len(strings) != len(t_elements):
print(f"Warning: the number of provided strings ({len(strings)}) does not match the number of existing <t> tags ({len(t_elements)}).")
else:
print("All tags matched")

# Replace the text of the <t> tags in order
for t_element, new_string in zip(t_elements, strings):
t_element.text = new_string
if config.get("save_original", False): # 检查 config["save_original"] 的值
t_element.text = t_element.text + new_string if t_element.text else new_string
else:
t_element.text = new_string

# Write the modified XML back to the file without changing the formatting
with open(shared_strings_path, 'wb') as f:
tree.write(f, xml_declaration=True, encoding='UTF-8', pretty_print=False)

# Write the modified XML back to the file
tree.write(shared_strings_path, xml_declaration=True, encoding='UTF-8')

# Zip the modified files back into an .xlsx
new_xlsx_path = file_path.replace('.xlsx', '_translated.xlsx')

@@ -52,6 +60,98 @@ def update_shared_strings_in_xlsx(file_path, strings):
arcname = os.path.relpath(file_path, tmpdirname)
zip_ref.write(file_path, arcname)

def update_shared_strings_in_docx(file_path, strings):
with tempfile.TemporaryDirectory() as tmpdirname:
with zipfile.ZipFile(file_path, 'r') as zip_ref:
zip_ref.extractall(tmpdirname)

shared_strings_path = os.path.join(tmpdirname, 'word', 'document.xml')

if not os.path.exists(shared_strings_path):
print("错误:document.xml 文件不存在。")
return

parser = ET.XMLParser(remove_blank_text=True)
tree = ET.parse(shared_strings_path, parser)
root = tree.getroot()

namespaces = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}
t_elements = root.xpath('//w:t', namespaces=namespaces)

if len(strings) != len(t_elements):
print(f"警告: 提供的字符串数量 ({len(strings)}) 与现有 <t> 标签数量 ({len(t_elements)}) 不匹配。")

for t_element, new_string in zip(t_elements, strings):
if config.get("save_original", False):
t_element.text = (t_element.text or '') + new_string
else:
t_element.text = new_string

tree.write(shared_strings_path, xml_declaration=True, encoding='UTF-8', pretty_print=True)

new_docx_path = file_path.replace('.docx', '_translated.docx')

if os.path.exists(new_docx_path):
os.remove(new_docx_path)

with zipfile.ZipFile(new_docx_path, 'w') as zip_ref:
for foldername, subfolders, filenames in os.walk(tmpdirname):
for filename in filenames:
file_path = os.path.join(foldername, filename)
arcname = os.path.relpath(file_path, tmpdirname)
zip_ref.write(file_path, arcname)

def update_shared_strings_in_pptx(file_path, strings):
with tempfile.TemporaryDirectory() as tmpdirname:
with zipfile.ZipFile(file_path, 'r') as zip_ref:
zip_ref.extractall(tmpdirname)

slides_dir = os.path.join(tmpdirname, 'ppt', 'slides')

if not os.path.exists(slides_dir):
print("错误:slides 目录不存在。")
return

parser = ET.XMLParser(remove_blank_text=True)
slide_files = sorted([f for f in os.listdir(slides_dir) if f.endswith('.xml')])

string_index = 0

for slide_file in slide_files:
slide_path = os.path.join(slides_dir, slide_file)
tree = ET.parse(slide_path, parser)
root = tree.getroot()

namespaces = {'a': 'http://schemas.openxmlformats.org/drawingml/2006/main'}
t_elements = root.xpath('//a:t', namespaces=namespaces)

for t_element in t_elements:
if string_index < len(strings):
t_element.text = strings[string_index]
string_index += 1
else:
break

# Write the updated content back to the slide file
tree.write(slide_path, xml_declaration=True, encoding='UTF-8', pretty_print=True)

if string_index < len(strings):
print(f"警告: 提供的字符串数量 ({len(strings)}) 超过了现有 <a:t> 标签数量 ({string_index}),未能完全替换。")

new_pptx_path = file_path.replace('.pptx', '_translated.pptx')

if os.path.exists(new_pptx_path):
os.remove(new_pptx_path)

with zipfile.ZipFile(new_pptx_path, 'w') as zip_ref:
for foldername, subfolders, filenames in os.walk(tmpdirname):
for filename in filenames:
file_path = os.path.join(foldername, filename)
arcname = os.path.relpath(file_path, tmpdirname)
zip_ref.write(file_path, arcname)

print(f"更新后的文件已保存为: {new_pptx_path}")

def compose_file(file_type, input_path):

result_file_path = './out/translated_result.txt'
@@ -64,3 +164,18 @@ def compose_file(file_type, input_path):
update_shared_strings_in_xlsx(input_path, result_strings)
print(f"生成翻译后的电子表格...")

if file_type == "Word":
update_shared_strings_in_docx(input_path, result_strings)
print(f"生成翻译后的word文档...")

if file_type == "PowerPoint":
update_shared_strings_in_pptx(input_path, result_strings)
print(f"生成翻译后的ppt文档...")

def dev_mode():
file_type = "PowerPoint"
input_path = r"C:\Users\skyts\Desktop\开会班组员工.pptx"
compose_file (file_type, input_path)
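
For orientation, a minimal usage sketch of the new dispatch in compose_file (a hedged example: the file paths are illustrative, and the type string used by the spreadsheet branch sits outside this hunk, so only the two branches added here are shown):

from Lib.compose import compose_file

# compose_file reads ./out/translated_result.txt and writes the result
# next to the input file as <name>_translated.docx / .pptx.
compose_file("Word", "./samples/report.docx")        # illustrative path
compose_file("PowerPoint", "./samples/slides.pptx")  # illustrative path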



29 changes: 29 additions & 0 deletions Lib/config.py
@@ -0,0 +1,29 @@
import os
import json

# Default configuration
default_config = {
"save_original": False, #保留原文件值
"version": "0.2.0",
"dev": False
}

# Path to the configuration file
config_path = './config.json'

def load_config():
"""加载配置,如果config.json不存在则创建并使用默认配置。"""
if not os.path.exists(config_path):
# If it does not exist, create config.json and write the default configuration
with open(config_path, 'w') as config_file:
json.dump(default_config, config_file, indent=4)
return default_config
else:
# If it exists, load config.json and merge it over the defaults
with open(config_path, 'r') as config_file:
user_config = json.load(config_file)
# Merge user values over the defaults
return {**default_config, **user_config}

# Load the configuration at import time for external use
config = load_config()
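
As a quick sketch of how the merged configuration is consumed elsewhere (mirroring the config.get("save_original", False) check added to Lib/compose.py above; the print lines are illustrative only):

from Lib.config import config

# config is the defaults dict overridden by any values found in ./config.json
print(config["version"])                      # "0.2.0" unless overridden
keep_original = config.get("save_original", False)
print("Keep original text:", keep_original)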
65 changes: 47 additions & 18 deletions Lib/data_process.py
@@ -21,6 +21,7 @@
import httpx
import json
import os
import time
output_dir = './out'
os.makedirs(output_dir, exist_ok=True)

@@ -56,32 +57,60 @@ def process_file(file_path, source_lang, target_lang):
strings_array.append('\n'.join(current_string))
alternative_index = 0 # used to name the alternatives files
with open('./out/translated_result.txt', 'w', encoding='utf-8') as result_file:
process_block_count = 1
for s in strings_array:
print(f"正在处理...\n{s}")
print(f"正在处理第{process_block_count}个块...")
process_block_count = process_block_count + 1
json_array = str(s)
data = {
"text": s,
"source_lang": source_lang,
"target_lang": target_lang
}
post_data = json.dumps(data)
try:
# Send the POST request and print the result
r = httpx.post(url=deeplx_api, data=post_data)
response_data = r.json()

# Save the 'data' content to translated_result.txt
result_file.write(response_data['data'] + '\n')
print(f"Received data {response_data}")
retry_count = 0
max_retries = 5
success = False
while retry_count < max_retries:
try:
# Send the POST request and print the result
r = httpx.post(url=deeplx_api, data=post_data)
response_data = r.json()

# Check whether 'data' is present
if 'data' in response_data:
# Save the 'data' content to translated_result.txt
result_file.write(response_data['data'] + '\n')
success = True
#print(f"Received data {response_data}")

# If alternatives exist, save each one to a separate file
if "alternatives" in response_data and response_data["alternatives"] is not None:
alternatives = response_data["alternatives"]
print(alternatives)
for alternative in alternatives:
with open(f'./out/alternatives({alternative_index}).txt', 'w', encoding='utf-8') as alt_file:
alt_file.write(alternative + '\n')
alternative_index += 1
break # exit the retry loop after a successful request

else:
raise ValueError("未找到返回数据,可能是因为请求过于频繁")

except (httpx.RequestError, ValueError) as exc:
retry_count += 1
if retry_count < max_retries:
print(f"Error occurred: {exc}. Retrying in 2 seconds... ({retry_count}/{max_retries})")
time.sleep(2)
else:
print(f"Failed after {max_retries} retries. Moving on to the next string.")

if not success:
error_lines = s.splitlines()
error_messages = [f"Error: {error_line}" for error_line in error_lines]
error_message = "\n".join(error_messages) + "\n"
result_file.write(error_message)
print(f"Failed after {max_retries} retries. Moving on to the next string.")

# If alternatives exist, save each one to a separate file
if "alternatives" in response_data and response_data["alternatives"] is not None :
alternatives = response_data["alternatives"]
print(alternatives)
for alternative in alternatives:
with open(f'./out/alternatives({alternative_index}).txt', 'w', encoding='utf-8') as alt_file:
alt_file.write(alternative + '\n')
alternative_index += 1

except httpx.RequestError as exc:
print(f"An error occurred while requesting {exc.request.url!r}.")
