Skip to content

Commit 6d6cf10

Browse files
committed
fix: tap voice in play to end playback (#14) (#38)
1 parent 4851f7c commit 6d6cf10

File tree

5 files changed

+149
-58
lines changed

5 files changed

+149
-58
lines changed

src/config.ts

+37
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,43 @@ export const azureRegions = [
224224
'westus2',
225225
]
226226

227+
/**
 * Display labels (Chinese) for speaking-style identifiers, keyed by the style id.
 *
 * NOTE(review): the keys look like the style names accepted by Azure's
 * `mstts:express-as` element — confirm against the caller that consumes this map.
 *
 * The type is declared on the binding rather than asserted with `as`, so the
 * literal is still checked against `Record<string, string>`.
 */
export const voiceStyleMap: Record<string, string> = {
  'advertisement_upbeat': '产品推广',
  'affectionate': '亲切',
  'angry': '生气',
  'assistant': '数字助理',
  'calm': '冷静',
  'chat': '轻松',
  'cheerful': '愉悦',
  'customerservice': '热情',
  'depressed': '沮丧',
  'disgruntled': '抱怨',
  'documentary-narration': '记录片',
  'embarrassed': '犹豫',
  'empathetic': '关心',
  'envious': '钦佩',
  'excited': '乐观',
  'fearful': '不安',
  'friendly': '友好',
  'gentle': '温和',
  'hopeful': '温暖',
  'lyrical': '感伤',
  'narration-professional': '客观',
  'narration-relaxed': '舒缓',
  'newscast': '新闻播报',
  'newscast-casual': '随意新闻',
  'newscast-formal': '权威新闻',
  'poetry-reading': '诗歌',
  'sad': '悲伤',
  'serious': '严肃',
  'shouting': '大喊',
  'sports_commentary': '轻松体育赛事',
  'sports_commentary_excited': '紧张体育赛事',
  'whispering': '柔和',
  'terrified': '害怕',
  'unfriendly': '无情',
}
263+
227264
export const openaiModels = [
228265
'gpt-3.5-turbo',
229266
'gpt-3.5-turbo-0301',

src/hooks/useSpeechService.ts

+36-28
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@ import type { VoiceInfo } from 'microsoft-cognitiveservices-speech-sdk'
22
import {
33
AudioConfig,
44
CancellationErrorCode,
5-
ResultReason,
65
SpeakerAudioDestination,
76
SpeechConfig,
87
SpeechRecognizer,
@@ -52,6 +51,7 @@ export const useSpeechService = ({ langs = <const>['fr-FR', 'ja-JP', 'en-US', 'z
5251

5352
// const isFetchAllVoices = ref(false) // 是否在请求所有语音列表
5453
const rate = ref(1) // 语速 (0,2]
54+
const style = ref('Neural') // 情感
5555

5656
let mediaRecorder: MediaRecorder | null
5757
const chunks: Blob[] = []
@@ -61,35 +61,15 @@ export const useSpeechService = ({ langs = <const>['fr-FR', 'ja-JP', 'en-US', 'z
6161

6262
const recognizer = ref<SpeechRecognizer>(new SpeechRecognizer(speechConfig.value))
6363
const synthesizer = ref<SpeechSynthesizer>(new SpeechSynthesizer(speechConfig.value))
64-
6564
// 引入变量,触发 SpeechSynthesizer 实例的重新创建
6665
const count = ref(0)
67-
66+
const player = ref(new SpeakerAudioDestination())
6867
watch([language, voiceName, count, azureKey, azureRegion, ttsPassword], ([lang, voice]) => {
6968
speechConfig.value = SpeechConfig.fromSubscription(resultAzureKey.value, resultAzureRegion.value)
7069
speechConfig.value.speechRecognitionLanguage = lang
7170
speechConfig.value.speechSynthesisLanguage = lang
7271
speechConfig.value.speechSynthesisVoiceName = voice
7372
console.log(lang, voice)
74-
75-
// 通过playback结束事件来判断播放结束
76-
const player = new SpeakerAudioDestination()
77-
player.onAudioStart = function (_) {
78-
if (isSynthesError.value) return
79-
isPlaying.value = true
80-
isPlayend.value = false
81-
console.log('playback started')
82-
}
83-
player.onAudioEnd = function (_) {
84-
console.log('playback finished')
85-
isPlaying.value = false
86-
isPlayend.value = true
87-
}
88-
89-
const audioConfig = AudioConfig.fromDefaultMicrophoneInput()
90-
const audioConfiga = AudioConfig.fromSpeakerOutput(player)
91-
recognizer.value = new SpeechRecognizer(speechConfig.value, audioConfig)
92-
synthesizer.value = new SpeechSynthesizer(speechConfig.value, audioConfiga)
9373
}, {
9474
immediate: true,
9575
})
@@ -103,6 +83,7 @@ export const useSpeechService = ({ langs = <const>['fr-FR', 'ja-JP', 'en-US', 'z
10383
mediaRecorder = new MediaRecorder(stream)
10484

10585
mediaRecorder.ondataavailable = (e) => {
86+
console.log(chunks, 'c')
10687
chunks.push(e.data)
10788
}
10889

@@ -117,6 +98,9 @@ export const useSpeechService = ({ langs = <const>['fr-FR', 'ja-JP', 'en-US', 'z
11798
}
11899

119100
const startRecognizeSpeech = async (cb?: (text: string) => void) => {
101+
const audioConfig = AudioConfig.fromDefaultMicrophoneInput()
102+
recognizer.value = new SpeechRecognizer(speechConfig.value, audioConfig)
103+
120104
isRecognizReadying.value = true
121105

122106
recognizer.value.canceled = () => {
@@ -143,7 +127,6 @@ export const useSpeechService = ({ langs = <const>['fr-FR', 'ja-JP', 'en-US', 'z
143127
isRecognizReadying.value = false
144128
isRecognizing.value = false
145129
}
146-
147130
recognizer.value.startContinuousRecognitionAsync(async () => {
148131
await audioRecorder()
149132
isRecognizing.value = true
@@ -160,8 +143,7 @@ export const useSpeechService = ({ langs = <const>['fr-FR', 'ja-JP', 'en-US', 'z
160143

161144
// 停止语音识别
162145
const stopRecognizeSpeech = (): Promise<void> => {
163-
mediaRecorder!.stop()
164-
146+
mediaRecorder?.stop()
165147
isRecognizReadying.value = false
166148
return new Promise((resolve, reject) => {
167149
recognizer.value.stopContinuousRecognitionAsync(() => {
@@ -211,18 +193,23 @@ export const useSpeechService = ({ langs = <const>['fr-FR', 'ja-JP', 'en-US', 'z
211193
})
212194
}
213195

214-
const ssmlToSpeak = async (text: string, { voice, voiceRate, lang }: { voice?: string; voiceRate?: number; lang?: string } = {}) => {
196+
const ssmlToSpeak = async (text: string, { voice, voiceRate, lang, voiceStyle }: { voice?: string; voiceRate?: number; lang?: string; voiceStyle?: string } = {}) => {
197+
applySynthesizerConfiguration()
198+
215199
isSynthesizing.value = true
216200
isSynthesError.value = false
217201
const targetLang = lang || speechConfig.value.speechSynthesisLanguage
218202
const targetVoice = voice || speechConfig.value.speechSynthesisVoiceName
219203
const targetRate = voiceRate || rate.value
204+
const targetFeel = voiceStyle || style.value
220205

221206
const ssml = `
222-
<speak version="1.0" xmlns="https://www.w3.org/2001/10/synthesis" xml:lang="${targetLang}">
207+
<speak version="1.0" xmlns:mstts="https://www.w3.org/2001/mstts" xmlns="https://www.w3.org/2001/10/synthesis" xml:lang="${targetLang}">
223208
<voice name="${targetVoice}">
224209
<prosody rate="${targetRate}">
225-
${text}
210+
<mstts:express-as style="${targetFeel}" styledegree="1.5">
211+
${text}
212+
</mstts:express-as>
226213
</prosody>
227214
</voice>
228215
</speak>`
@@ -274,6 +261,25 @@ export const useSpeechService = ({ langs = <const>['fr-FR', 'ja-JP', 'en-US', 'z
274261
return res.voices
275262
}
276263

264+
/**
 * Rebuilds the speaker output used for speech synthesis.
 *
 * Replaces `player` with a fresh `SpeakerAudioDestination`, wires its audio
 * start/end callbacks to the `isPlaying` / `isPlayend` flags, and replaces
 * `synthesizer` with a new `SpeechSynthesizer` that writes to that player
 * using the current `speechConfig`.
 */
function applySynthesizerConfiguration() {
  player.value = new SpeakerAudioDestination()
  const destination = player.value

  // Playback completion is detected through the player's audio-end event.
  destination.onAudioEnd = () => {
    console.log('playback finished....')
    isPlaying.value = false
    isPlayend.value = true
  }

  // Leave the flags untouched when synthesis has already been marked as failed.
  destination.onAudioStart = () => {
    if (!isSynthesError.value) {
      isPlaying.value = true
      isPlayend.value = false
      console.log('playback started.....')
    }
  }

  synthesizer.value = new SpeechSynthesizer(
    speechConfig.value,
    AudioConfig.fromSpeakerOutput(destination),
  )
}
282+
277283
return {
278284
languages,
279285
language,
@@ -292,6 +298,8 @@ export const useSpeechService = ({ langs = <const>['fr-FR', 'ja-JP', 'en-US', 'z
292298
allVoices,
293299
isSynthesizing,
294300
rate,
301+
style,
295302
audioBlob,
303+
player,
296304
}
297305
}

0 commit comments

Comments
 (0)