# test.py -- forked from FunAudioLLM/CosyVoice
import torchaudio
import os
import torch
from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2
from cosyvoice.utils.file_utils import load_wav
from funasr import AutoModel
from funasr.utils.postprocess_utils import rich_transcription_postprocess
# Model identifier passed to AutoModel(model=...) below.
model_dir = "iic/SenseVoiceSmall"

# Runtime equivalent of `export PYTHONPATH=third_party/Matcha-TTS`.
# PYTHONPATH is only read when an interpreter starts, so changing os.environ
# affects child processes but not this process; the directory is therefore
# also appended to sys.path so imports executed after this point can see it.
import sys

current_path = os.environ.get('PYTHONPATH', '')
Matcha_path = 'third_party/Matcha-TTS'
# Compare whole path entries instead of doing a substring search on the raw
# string, so a longer entry that merely contains this text is not mistaken
# for the directory itself.
_pythonpath_entries = [entry for entry in current_path.split(os.pathsep) if entry]
if Matcha_path not in _pythonpath_entries:
    # Keep any existing value untouched; skip the separator when PYTHONPATH
    # was unset so no empty trailing entry is produced.
    if current_path:
        os.environ['PYTHONPATH'] = Matcha_path + os.pathsep + current_path
    else:
        os.environ['PYTHONPATH'] = Matcha_path
if Matcha_path not in sys.path:
    sys.path.append(Matcha_path)
# Speech-recognition model; used further down to transcribe each reference
# clip into the prompt text. The options are gathered into named dicts first
# and handed to the constructor in a single call.
_vad_options = {"max_single_segment_time": 30000}
_sensevoice_options = dict(
    model=model_dir,
    trust_remote_code=True,
    remote_code="/data/megastore/Projects/DuJing/code/SenseVoice/model.py",
    vad_model="fsmn-vad",
    vad_kwargs=_vad_options,
    device="cuda:0",
)
sensevoice = AutoModel(**_sensevoice_options)
# cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-SFT')
# # sft usage
# print(cosyvoice.list_avaliable_spks())
# output = cosyvoice.inference_sft('那有哪些美剧是不太适合学英语的呢?我来给大家举几个例子吧。第一个《破产姐妹》,我不知道为什么总有人推荐这一部,我先声明一下,我真的很喜欢很喜欢破产姐妹,它真的很下饭。我大学有一段时间就是天天去食堂打包吃的,然后回到宿舍,我就边看边吃,甚至听到她那个片头曲,我就会很有食欲,但是我真的真的没有办法用它来学英语。一个是语速太快了;第二全是开车的台词,你说是生活中、考试试中哪儿会用到?所以我觉得破产姐妹下饭必备,学英语还是算了。', '中文女')
# torchaudio.save('sft.wav', output['tts_speech'], 24000)
# output = cosyvoice.inference_sft('The problems of efficiency today are less drastic but more chronic, they can also prolong the evils that they were intended to solve and took the electronic medical record. It seemed to be the answer to the problem of doctors handwriting, and it had the benefit of providing much better data for treatments. In practice, it has meant much more electronic paperwork and physicians are now complaining that they have less rather than more time to see patients individually. The obsession with efficiency can actually make us less efficient.', '英文男')
# torchaudio.save('sft-en.wav', output['tts_speech'], 24000)
# cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-25Hz')
# Speech-synthesis model driven by the zero-shot cloning loop below, built
# from the 'pretrained_models/CosyVoice2-0.5B' path.
cosyvoice = CosyVoice2('pretrained_models/CosyVoice2-0.5B')
# zero_shot usage
# Reference recordings whose voices are cloned; the loop below makes one pass
# per file.
ref_audios = [
    '/data/megastore/SHARE/TTS/ref_audios/jim_10s.wav',
    '/data/megastore/SHARE/TTS/ref_audios/caikangyong_10s.wav',
    '/data/megastore/SHARE/TTS/ref_audios/ASR_lingli_10s.wav',
    '/data/megastore/SHARE/TTS/ref_audios/ASR_lola_10s.wav',
    '/data/megastore/SHARE/TTS/ref_audios/jyr/ASR_jiangyueren_10s.wav',
    '/data/megastore/SHARE/TTS/ref_audios/dubbing.wav',
    '/data/megastore/SHARE/TTS/ref_audios/cute2.wav',
]

# Passages to synthesise. Each entry of `texts` is a (language tag, passage)
# pair; the tag decides which sentence terminator the loop splits on.
_ZH_PASSAGE = '那有哪些美剧是不太适合学英语的呢?我来给大家举几个例子吧。第一个《破产姐妹》,我不知道为什么总有人推荐这一部,我先声明一下,我真的很喜欢很喜欢破产姐妹,它真的很下饭。我大学有一段时间就是天天去食堂打包吃的,然后回到宿舍,我就边看边吃,甚至听到她那个片头曲,我就会很有食欲,但是我真的真的没有办法用它来学英语。一个是语速太快了;第二全是开车的台词,你说是生活中、考试试中哪儿会用到?所以我觉得破产姐妹下饭必备,学英语还是算了。'
_EN_PASSAGE = 'The problems of efficiency today are less drastic but more chronic, they can also prolong the evils that they were intended to solve and took the electronic medical record. It seemed to be the answer to the problem of doctors handwriting, and it had the benefit of providing much better data for treatments. In practice, it has meant much more electronic paperwork and physicians are now complaining that they have less rather than more time to see patients individually. The obsession with efficiency can actually make us less efficient.'
texts = [
    ('zh', _ZH_PASSAGE),
    ('en', _EN_PASSAGE),
]
# ---------------------------------------------------------------------------
# Zero-shot voice cloning.
#
# For every reference recording: load it, transcribe it with SenseVoice to
# obtain the prompt text, then synthesise each sentence of each passage in
# `texts` with CosyVoice and write one wav file per sentence.
#
# NOTE(review): the nesting below was reconstructed from the statements
# themselves; saving inside the per-sentence loop is implied by the output
# file name being derived from the sentence text -- confirm against the
# intended behaviour.
# ---------------------------------------------------------------------------

# Sample rate requested from load_wav for the prompt audio.
PROMPT_SAMPLE_RATE = 16000
# Upper bound on prompt length in samples (10 s worth of samples if load_wav
# returns audio at the requested 16 kHz rate).
MAX_PROMPT_SAMPLES = 160000
# Sample rate written into the output wav files.
# NOTE(review): presumably matches the CosyVoice2 output rate -- confirm.
OUTPUT_SAMPLE_RATE = 24000
# Sentence terminator used both to split a passage and to re-terminate each
# sentence, keyed by language tag.
SENTENCE_TERMINATORS = {'zh': "。", 'en': "."}


def _split_sentences(lang, long_text):
    """Split *long_text* into stripped, non-empty, re-terminated sentences.

    Args:
        lang: Language tag; must be a key of ``SENTENCE_TERMINATORS``.
        long_text: Passage to split on that language's terminator.

    Returns:
        A list of sentences, each ending with the terminator that was removed
        by the split.

    Raises:
        NotImplementedError: If *lang* has no configured terminator.
    """
    try:
        terminator = SENTENCE_TERMINATORS[lang]
    except KeyError:
        raise NotImplementedError(f'unsupported language tag: {lang!r}') from None
    stripped = (piece.strip() for piece in long_text.split(terminator))
    return [piece + terminator for piece in stripped if piece]


for ref_audio in ref_audios:
    print(f'Cloning the reference audio: {ref_audio}')
    # File stem of the reference clip; used as the prefix of every output file.
    ref_name = os.path.splitext(os.path.basename(ref_audio))[0]

    prompt_speech_16k = load_wav(ref_audio, PROMPT_SAMPLE_RATE)
    print(prompt_speech_16k.size())
    # Cap the prompt length along dimension 1.
    if prompt_speech_16k.size(1) > MAX_PROMPT_SAMPLES:
        prompt_speech_16k = prompt_speech_16k[:, :MAX_PROMPT_SAMPLES]

    # Transcribe the prompt; its text is passed to the synthesiser together
    # with the audio.
    res = sensevoice.generate(
        input=prompt_speech_16k[0],
        cache={},
        language="auto",  # "zh", "en", "yue", "ja", "ko", "nospeech"
        use_itn=True,
        batch_size_s=60,
        merge_vad=True,
        merge_length_s=15,
    )
    prompt_text = rich_transcription_postprocess(res[0]["text"])
    print("prompt_text:", prompt_text)

    for lang, long_text in texts:
        for text in _split_sentences(lang, long_text):
            # stream=True yields the audio in pieces; collect them all.
            chunks = [
                piece['tts_speech']
                for piece in cosyvoice.inference_zero_shot(
                    text,
                    prompt_text,
                    prompt_speech_16k,
                    stream=True)
            ]
            if not chunks:
                # torch.cat rejects an empty list; skip rather than abort the
                # remaining sentences and reference clips.
                print(f'No audio generated for: {text}')
                continue
            wave = torch.cat(chunks, dim=-1)
            torchaudio.save(f'{ref_name}_{text[0:10]}.wav', wave, OUTPUT_SAMPLE_RATE)
# # cross_lingual usage
# prompt_speech_16k = load_wav('cross_lingual_prompt.wav', 16000)
# output = cosyvoice.inference_cross_lingual('<|en|>And then later on, fully acquiring that company. So keeping management in line, interest in line with the asset that\'s coming into the family is a reason why sometimes we don\'t buy the whole thing.', prompt_speech_16k)
# torchaudio.save('cross_lingual.wav', output['tts_speech'], 24000)
#
# cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-Instruct')
# # instruct usage
# output = cosyvoice.inference_instruct('在面对挑战时,他展现了非凡的<strong>勇气</strong>与<strong>智慧</strong>。', '中文男', 'Theo \'Crimson\', is a fiery, passionate rebel leader. Fights with fervor for justice, but struggles with impulsiveness.')
# torchaudio.save('instruct.wav', output['tts_speech'], 24000)