Skip to content

Commit

Permalink
debug
Browse files Browse the repository at this point in the history
修复了法语的《》表示引用但是会被spacy错误视为句子结束分割的bug

逐步测试各个语言中
  • Loading branch information
Huanshere committed Sep 12, 2024
1 parent 9e42551 commit 93ae6c7
Show file tree
Hide file tree
Showing 5 changed files with 33 additions and 49 deletions.
10 changes: 5 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -62,15 +62,15 @@ https://github.com/user-attachments/assets/0f5d5878-bfa5-41e4-ade1-d2b81d925a7d

| 输入语言 | 支持程度 | 示例视频 |
|---------|---------|---------|
| 英语 | ⭐⭐⭐ | [英转中 demo](https://github.com/user-attachments/assets/127373bb-c152-4b7a-8d9d-e586b2c62b4b) |
| 日语 | ⭐⭐ | |
| 俄语 | ⭐⭐ | |
| 中文 | | |
| 英语 | 🤩 | [英转中 demo](https://github.com/user-attachments/assets/127373bb-c152-4b7a-8d9d-e586b2c62b4b) |
| 俄语 | 😊 | [俄转中 demo](https://github.com/user-attachments/assets/25264b5b-6931-4d39-948c-5a1e4ce42fa7) |
| 日语 | 😖 | |
| 中文 | 😖 | |
| 法语 | ❓ (尚未测试) | |
| 德语 | ❓ (尚未测试) | |
| 西班牙语 | ❓ (尚未测试) | |

- 输出语言支持:VideoLingo 支持翻译成所有语言
- 输出语言支持:VideoLingo 支持翻译成 Claude 支持的所有语言

## 🙏 致谢

Expand Down
38 changes: 13 additions & 25 deletions config.example.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,33 +94,21 @@
# Spacy model
# Spacy 模型
SPACY_MODEL_MAP = {
"en": "en_core_web_sm",
"zh": "zh_core_web_sm",
"es": "es_core_news_lg",
"fr": "fr_core_news_lg",
"de": "de_core_news_lg",
"it": "it_core_news_lg",
"ja": "ja_core_news_lg",
"pt": "pt_core_news_lg",
"nl": "nl_core_news_lg",
"el": "el_core_news_lg",
"ru": "ru_core_news_lg",
"ar": "ar_core_news_lg",
"hi": "hi_core_news_lg",
"ko": "ko_core_news_lg",
"pl": "pl_core_news_lg",
"uk": "uk_core_news_lg",
"vi": "vi_core_news_lg",
"tr": "tr_core_news_lg",
"th": "th_core_news_lg",
"ro": "ro_core_news_lg",
"da": "da_core_news_lg",
"fi": "fi_core_news_lg",
"hu": "hu_core_news_lg",
"nb": "nb_core_news_lg",
"sv": "sv_core_news_lg"
"en": "en_core_web_md",
"ru": "ru_core_news_md",
"fr": "fr_core_news_md",

# "es": "es_core_news_md",
# "de": "de_core_news_md",
# "it": "it_core_news_md",


# Not supported
# "zh": "zh_core_web_md",
# "ja": "ja_core_news_md",
}


# 使用空格分割的语言
# Languages that split with space
LANGUAGE_SPLIT_WITH_SPACE = ['en', 'es', 'fr', 'de', 'it', 'pt', 'nl', 'el', 'ru', 'ar', 'hi', 'pl', 'uk', 'vi', 'tr', 'ro', 'da', 'fi', 'hu', 'nb', 'sv']
Expand Down
13 changes: 7 additions & 6 deletions core/all_whisper_methods/whisperX.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,26 +71,27 @@ def transcribe_audio(audio_file: str) -> Dict:

def process_transcription(result: Dict) -> pd.DataFrame:
all_words = []
# save to debug as json
with open('output/log/debug.json', 'a', encoding='utf-8') as f:
json.dump(result, f, ensure_ascii=False, indent=4)
for segment in result['segments']:
for word in segment['words']:
if 'start' not in word and 'end' not in word:
if all_words:
# 合并到前一个词
# Merge with the previous word
all_words[-1]['text'] = f'{all_words[-1]["text"][:-1]}{word["word"]}"'
else:
# 如果是第一个词,暂时保存,等待下一个有时间戳的词
# If it's the first word, temporarily save it and wait for the next word with a timestamp
temp_word = word["word"]
else:
# 正常情况,有开始和结束时间
# Normal case, with start and end times
word_dict = {
'text': f'"{temp_word}{word["word"]}"' if 'temp_word' in locals() else f'"{word["word"]}"',
'start': word.get('start', all_words[-1]['end'] if all_words else 0),
'end': word['end'],
'score': word.get('score', 0)
}

# ! For French, we need to convert guillemets to empty strings
word_dict['text'] = word_dict['text'].replace('»', '').replace('«', '')

all_words.append(word_dict)
if 'temp_word' in locals():
del temp_word
Expand Down
7 changes: 6 additions & 1 deletion core/all_whisper_methods/whisperXapi.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,10 +66,14 @@ def transcribe_audio(audio_base64: str) -> Dict:
except Exception as e:
raise Exception(f"Error accessing whisperX API: {e} Please check your Replicate API key and internet connection.\n")


def process_transcription(result: Dict) -> pd.DataFrame:
all_words = []
for segment in result['segments']:
for word in segment['words']:
# ! For French, we need to convert guillemets to empty strings
word["word"] = word["word"].replace('»', '').replace('«', '')

if 'start' not in word and 'end' not in word:
if all_words:
# Merge with the previous word
Expand All @@ -80,11 +84,12 @@ def process_transcription(result: Dict) -> pd.DataFrame:
else:
# Normal case, with start and end times
word_dict = {
'text': f'"{temp_word}{word["word"]}"' if 'temp_word' in locals() else f'"{word["word"]}"',
'text': f'{temp_word}{word["word"]}' if 'temp_word' in locals() else f'{word["word"]}',
'start': word.get('start', all_words[-1]['end'] if all_words else 0),
'end': word['end'],
'score': word.get('score', 0)
}

all_words.append(word_dict)
if 'temp_word' in locals():
del temp_word
Expand Down
14 changes: 2 additions & 12 deletions core/spacy_utils/split_by_mark.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ def split_by_mark(nlp):
chunks = pd.read_excel("output/log/cleaned_chunks.xlsx")
chunks.text = chunks.text.apply(lambda x: x.strip('"'))

# 用 joiner 拼接
# join with joiner
input_text = joiner.join(chunks.text.to_list())

doc = nlp(input_text)
Expand All @@ -29,15 +29,5 @@ def split_by_mark(nlp):
print("💾 Sentences split by punctuation marks saved to → `sentences_by_mark.txt`")

if __name__ == "__main__":
# nlp = init_nlp()
# split_by_mark(nlp)

s = """そうで。"""
nlp = init_nlp()
doc = nlp(s)
print(doc)
assert doc.has_annotation("SENT_START")

sentences_by_mark = [sent.text for sent in doc.sents]
print(sentences_by_mark)

split_by_mark(nlp)

0 comments on commit 93ae6c7

Please sign in to comment.