Skip to content

Commit

Permalink
debug
Browse files Browse the repository at this point in the history
修复了法语的《》表示引用但是会被spacy错误视为句子结束分割的bug

逐步测试各个语言中
  • Loading branch information
Huanshere committed Sep 12, 2024
1 parent 9e42551 commit 93ae6c7
Show file tree
Hide file tree
Showing 5 changed files with 33 additions and 49 deletions.
10 changes: 5 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -62,15 +62,15 @@ https://github.com/user-attachments/assets/0f5d5878-bfa5-41e4-ade1-d2b81d925a7d

| 输入语言 | 支持程度 | 示例视频 |
|---------|---------|---------|
| 英语 | ⭐⭐⭐ | [英转中 demo](https://github.com/user-attachments/assets/127373bb-c152-4b7a-8d9d-e586b2c62b4b) |
| 日语 | ⭐⭐ | |
| 俄语 | ⭐⭐ | |
| 中文 | | |
| 英语 | 🤩 | [英转中 demo](https://github.com/user-attachments/assets/127373bb-c152-4b7a-8d9d-e586b2c62b4b) |
| 俄语 | 😊 | [俄转中 demo](https://github.com/user-attachments/assets/25264b5b-6931-4d39-948c-5a1e4ce42fa7) |
| 日语 | 😖 | |
| 中文 | 😖 | |
| 法语 | ❓ (尚未测试) | |
| 德语 | ❓ (尚未测试) | |
| 西班牙语 | ❓ (尚未测试) | |

- 输出语言支持:VideoLingo 支持翻译成所有语言
- 输出语言支持:VideoLingo 支持翻译成 Claude 支持的所有语言

## 🙏 致谢

Expand Down
38 changes: 13 additions & 25 deletions config.example.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,33 +94,21 @@
# Spacy model
# Spacy 模型
SPACY_MODEL_MAP = {
"en": "en_core_web_sm",
"zh": "zh_core_web_sm",
"es": "es_core_news_lg",
"fr": "fr_core_news_lg",
"de": "de_core_news_lg",
"it": "it_core_news_lg",
"ja": "ja_core_news_lg",
"pt": "pt_core_news_lg",
"nl": "nl_core_news_lg",
"el": "el_core_news_lg",
"ru": "ru_core_news_lg",
"ar": "ar_core_news_lg",
"hi": "hi_core_news_lg",
"ko": "ko_core_news_lg",
"pl": "pl_core_news_lg",
"uk": "uk_core_news_lg",
"vi": "vi_core_news_lg",
"tr": "tr_core_news_lg",
"th": "th_core_news_lg",
"ro": "ro_core_news_lg",
"da": "da_core_news_lg",
"fi": "fi_core_news_lg",
"hu": "hu_core_news_lg",
"nb": "nb_core_news_lg",
"sv": "sv_core_news_lg"
"en": "en_core_web_md",
"ru": "ru_core_news_md",
"fr": "fr_core_news_md",

# "es": "es_core_news_md",
# "de": "de_core_news_md",
# "it": "it_core_news_md",


# Not supported
# "zh": "zh_core_web_md",
# "ja": "ja_core_news_md",
}


# 使用空格分割的语言
# Languages that split with space
LANGUAGE_SPLIT_WITH_SPACE = ['en', 'es', 'fr', 'de', 'it', 'pt', 'nl', 'el', 'ru', 'ar', 'hi', 'pl', 'uk', 'vi', 'tr', 'ro', 'da', 'fi', 'hu', 'nb', 'sv']
Expand Down
13 changes: 7 additions & 6 deletions core/all_whisper_methods/whisperX.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,26 +71,27 @@ def transcribe_audio(audio_file: str) -> Dict:

def process_transcription(result: Dict) -> pd.DataFrame:
all_words = []
# save to debug as json
with open('output/log/debug.json', 'a', encoding='utf-8') as f:
json.dump(result, f, ensure_ascii=False, indent=4)
for segment in result['segments']:
for word in segment['words']:
if 'start' not in word and 'end' not in word:
if all_words:
# 合并到前一个词
# Merge with the previous word
all_words[-1]['text'] = f'{all_words[-1]["text"][:-1]}{word["word"]}"'
else:
# 如果是第一个词,暂时保存,等待下一个有时间戳的词
# If it's the first word, temporarily save it and wait for the next word with a timestamp
temp_word = word["word"]
else:
# 正常情况,有开始和结束时间
# Normal case, with start and end times
word_dict = {
'text': f'"{temp_word}{word["word"]}"' if 'temp_word' in locals() else f'"{word["word"]}"',
'start': word.get('start', all_words[-1]['end'] if all_words else 0),
'end': word['end'],
'score': word.get('score', 0)
}

# ! For French, we need to convert guillemets to empty strings
word_dict['text'] = word_dict['text'].replace('»', '').replace('«', '')

all_words.append(word_dict)
if 'temp_word' in locals():
del temp_word
Expand Down
7 changes: 6 additions & 1 deletion core/all_whisper_methods/whisperXapi.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,10 +66,14 @@ def transcribe_audio(audio_base64: str) -> Dict:
except Exception as e:
raise Exception(f"Error accessing whisperX API: {e} Please check your Replicate API key and internet connection.\n")


def process_transcription(result: Dict) -> pd.DataFrame:
all_words = []
for segment in result['segments']:
for word in segment['words']:
# ! For French, we need to convert guillemets to empty strings
word["word"] = word["word"].replace('»', '').replace('«', '')

if 'start' not in word and 'end' not in word:
if all_words:
# Merge with the previous word
Expand All @@ -80,11 +84,12 @@ def process_transcription(result: Dict) -> pd.DataFrame:
else:
# Normal case, with start and end times
word_dict = {
'text': f'"{temp_word}{word["word"]}"' if 'temp_word' in locals() else f'"{word["word"]}"',
'text': f'{temp_word}{word["word"]}' if 'temp_word' in locals() else f'{word["word"]}',
'start': word.get('start', all_words[-1]['end'] if all_words else 0),
'end': word['end'],
'score': word.get('score', 0)
}

all_words.append(word_dict)
if 'temp_word' in locals():
del temp_word
Expand Down
14 changes: 2 additions & 12 deletions core/spacy_utils/split_by_mark.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ def split_by_mark(nlp):
chunks = pd.read_excel("output/log/cleaned_chunks.xlsx")
chunks.text = chunks.text.apply(lambda x: x.strip('"'))

# 用 joiner 拼接
# join with joiner
input_text = joiner.join(chunks.text.to_list())

doc = nlp(input_text)
Expand All @@ -29,15 +29,5 @@ def split_by_mark(nlp):
print("💾 Sentences split by punctuation marks saved to → `sentences_by_mark.txt`")

if __name__ == "__main__":
# nlp = init_nlp()
# split_by_mark(nlp)

s = """そうで。"""
nlp = init_nlp()
doc = nlp(s)
print(doc)
assert doc.has_annotation("SENT_START")

sentences_by_mark = [sent.text for sent in doc.sents]
print(sentences_by_mark)

split_by_mark(nlp)

0 comments on commit 93ae6c7

Please sign in to comment.