
Commit 1235b52

liuxinqi authored and committed
feat: audio support record (#29)
1 parent 0674a39 commit 1235b52

5 files changed (+126 −16 lines)

src/hooks/useSpeechService.ts (+35 −9)

@@ -2,6 +2,7 @@ import type { VoiceInfo } from 'microsoft-cognitiveservices-speech-sdk'
 import {
   AudioConfig,
   CancellationErrorCode,
+  ResultReason,
   SpeakerAudioDestination,
   SpeechConfig,
   SpeechRecognizer,
@@ -52,6 +53,10 @@ export const useSpeechService = ({ langs = <const>['fr-FR', 'ja-JP', 'en-US', 'z
   // const isFetchAllVoices = ref(false) // whether the full voice list is being fetched
   const rate = ref(1) // speech rate, range (0, 2]
 
+  let mediaRecorder: MediaRecorder | null
+  const chunks: Blob[] = []
+  const audioBlob = ref<Blob>(new Blob())
+
   const allVoices = ref<VoiceInfo[]>([])
 
   const recognizer = ref<SpeechRecognizer>(new SpeechRecognizer(speechConfig.value))
@@ -89,13 +94,29 @@ export const useSpeechService = ({ langs = <const>['fr-FR', 'ja-JP', 'en-US', 'z
     immediate: true,
   })
 
-  // watch([azureKey, azureRegion], () => {
-  //   if (isFetchAllVoice && allVoices.value.length === 0)
-  //     getVoices()
-  // })
-
   // speech recognition
-  const startRecognizeSpeech = (cb?: (text: string) => void) => {
+
+  const audioRecorder = async () => {
+    // for now the recording is saved via MediaRecorder; this may later change to saving directly through SpeechRecognizer
+
+    const stream = await navigator.mediaDevices.getUserMedia({ audio: true })
+    mediaRecorder = new MediaRecorder(stream)
+
+    mediaRecorder.ondataavailable = (e) => {
+      chunks.push(e.data)
+    }
+
+    mediaRecorder.onstop = (e) => {
+      const blob = new Blob(chunks, { type: 'audio/wav' })
+      audioBlob.value = blob
+      mediaRecorder = null
+      chunks.length = 0
+    }
+
+    mediaRecorder.start()
+  }
+
+  const startRecognizeSpeech = async (cb?: (text: string) => void) => {
     isRecognizReadying.value = true
 
     recognizer.value.canceled = () => {
@@ -105,9 +126,10 @@ export const useSpeechService = ({ langs = <const>['fr-FR', 'ja-JP', 'en-US', 'z
       console.log('Recognize result: ', e.result.text)
       cb && cb(e.result.text)
     }
-    recognizer.value.recognizing = (s, e) => {
-      console.log('Recognize recognizing', e.result.text)
+    recognizer.value.recognizing = (s, event) => {
+      console.log('Recognize recognizing', event.result.text)
     }
+
     recognizer.value.sessionStopped = (s, e) => {
       console.log('\n Session stopped event.')
       recognizer.value.stopContinuousRecognitionAsync()
@@ -122,7 +144,8 @@ export const useSpeechService = ({ langs = <const>['fr-FR', 'ja-JP', 'en-US', 'z
       isRecognizing.value = false
     }
 
-    recognizer.value.startContinuousRecognitionAsync(() => {
+    recognizer.value.startContinuousRecognitionAsync(async () => {
+      await audioRecorder()
       isRecognizing.value = true
       isRecognizReadying.value = false
       console.log('Recognize...')
@@ -137,6 +160,8 @@ export const useSpeechService = ({ langs = <const>['fr-FR', 'ja-JP', 'en-US', 'z
 
   // stop speech recognition
   const stopRecognizeSpeech = (): Promise<void> => {
+    mediaRecorder!.stop()
+
     isRecognizReadying.value = false
     return new Promise((resolve, reject) => {
       recognizer.value.stopContinuousRecognitionAsync(() => {
@@ -267,5 +292,6 @@ export const useSpeechService = ({ langs = <const>['fr-FR', 'ja-JP', 'en-US', 'z
     allVoices,
     isSynthesizing,
     rate,
+    audioBlob,
   }
 }
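A note on the recording pattern above: MediaRecorder emits compressed chunks (typically audio/webm in Chromium-based browsers), so labeling the assembled Blob as audio/wav renames the container without transcoding; playback through an object URL still works because the browser sniffs the actual codec. A minimal standalone sketch of the same collect-on-stop pattern, with illustrative names that are not part of the commit:

// Hypothetical helper (not from the commit): record for a fixed duration
// and resolve with the assembled Blob.
async function recordOnce(durationMs: number): Promise<Blob> {
  const stream = await navigator.mediaDevices.getUserMedia({ audio: true })
  const recorder = new MediaRecorder(stream)
  const parts: Blob[] = []

  // each dataavailable event carries one recorded chunk
  recorder.ondataavailable = e => parts.push(e.data)

  const done = new Promise<Blob>((resolve) => {
    recorder.onstop = () => {
      stream.getTracks().forEach(t => t.stop()) // release the microphone
      resolve(new Blob(parts, { type: recorder.mimeType })) // keep the real mime type
    }
  })

  recorder.start()
  setTimeout(() => recorder.stop(), durationMs)
  return done
}

Note that the commit's audioRecorder never stops the tracks of the stream it opens, so the browser's recording indicator stays lit after mediaRecorder.stop(); stopping the tracks, as sketched here, releases the device.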

src/pages/Home/components/Content.vue (+59 −5)

@@ -1,7 +1,7 @@
 <script setup lang="ts">
 import Button from '@/components/Button.vue'
 import { generatTranslate, generateText } from '@/server/api'
-import { verifyOpenKey } from '@/utils'
+import { base64ToBlob, blobToBase64, verifyOpenKey } from '@/utils'
 import { useConversationStore } from '@/stores'
 
 interface Translates {
@@ -27,6 +27,7 @@ const {
   stopRecognizeSpeech,
   ssmlToSpeak,
   isSynthesizing,
+  audioBlob,
 } = useSpeechService({ langs: store.allLanguage as any, isFetchAllVoice: false })
 
 // states
@@ -106,10 +107,11 @@ async function onSubmit() {
 
   store.changeConversations([
     ...currentChatMessages.value,
-    { content: message.value, role: 'user' },
+    { content: message.value, role: 'user', audioBlob: await blobToBase64(audioBlob.value) },
   ])
+  const tempCurrentChatMessages = currentChatMessages.value.map(x => ({ content: x.content, role: x.role })) // the audioBlob must be stripped from the outgoing request
   const systemMessage = currentChatMessages.value[0]
-  const relativeMessage = [...chatMessages.value, { content: message.value, role: 'user' }].slice(-(Number(chatRememberCount.value))) // keep only the most recent messages
+  const relativeMessage = [...tempCurrentChatMessages, { content: message.value, role: 'user' }].slice(-(Number(chatRememberCount.value))) // keep only the most recent messages
   const prompts = [systemMessage, ...relativeMessage] as ChatMessage[]
 
   message.value = ''
@@ -134,13 +136,40 @@ async function onSubmit() {
   store.changeLoading(false)
 }
 
+// assistant speak
 function speak(content: string, index: number) {
+  restartAudio()
   if (isPlaying.value || isSynthesizing.value) return
   speakIndex.value = index
   text.value = content
   ssmlToSpeak(content)
 }
 
+// user speak
+let audio = new Audio()
+
+function restartAudio() {
+  audio.pause()
+  audio.currentTime = 0
+  isPlaying.value = false
+  // audio.play()
+}
+
+function userSpeak(audioData: string, index: number) {
+  if (isPlaying.value || isSynthesizing.value) return
+  speakIndex.value = index
+  audio = new Audio(URL.createObjectURL(base64ToBlob(audioData)))
+  audio.play()
+  audio.onplay = () => {
+    isPlaying.value = true
+  }
+
+  audio.onended = () => {
+    isPlaying.value = false
+    speakIndex.value = -1
+  }
+}
+
 const recognize = async () => {
   try {
     console.log('isRecognizing', isRecognizing.value)
@@ -194,15 +223,15 @@ const translate = async (text: string, i: number) => {
       <div class="w-10 h-10">
         <img w-full h-full object-fill rounded-full :src="item.role === 'user' ? selfAvatar : currentAvatar" alt="">
       </div>
-
       <div style="flex-basis:fit-content" mx-2>
         <p p-2 my-2 chat-box>
           {{ item.content }}
         </p>
-        <p v-show="item.role === 'assistant' && translates[item.content + i]?.isShow " p-2 my-2 chat-box>
+        <p v-show=" translates[item.content + i]?.isShow " p-2 my-2 chat-box>
          {{ translates[item.content + i]?.result }}
         </p>
 
+        <!-- assistant -->
         <p v-if="item.role === 'assistant'" mt-2 flex>
           <template v-if="speakIndex !== i">
             <span class="chat-btn" @click="speak(item.content, i)">
@@ -224,6 +253,31 @@ const translate = async (text: string, i: number) => {
             <i icon-btn i-eos-icons:bubble-loading />
           </span>
         </p>
+
+        <!-- user -->
+        <p v-else mt-2 flex>
+          <template v-if="item.audioBlob">
+            <template v-if="speakIndex !== i">
+              <span class="chat-btn" @click="userSpeak(item.audioBlob, i)">
+                <i icon-btn rotate-270 i-ic:sharp-wifi />
+              </span>
+            </template>
+            <template v-else>
+              <span v-if="isPlaying" class="chat-btn" @click="restartAudio()">
+                <i icon-btn rotate-270 i-svg-spinners:wifi-fade />
+              </span>
+              <span v-else class="chat-btn" @click="userSpeak(item.audioBlob, i)">
+                <i icon-btn rotate-270 i-ic:sharp-wifi />
+              </span>
+            </template>
+          </template>
+          <span v-if="!isTranslating || translateIndex !== i" ml-1 class="chat-btn" @click="translate(item.content, i)">
+            <i icon-btn i-carbon:ibm-watson-language-translator />
+          </span>
+          <span v-else ml-1 class="chat-btn">
+            <i icon-btn i-eos-icons:bubble-loading />
+          </span>
+        </p>
       </div>
     </div>
   </template>
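One detail worth noting in userSpeak: every call creates a fresh object URL and never revokes it, so each replay pins another Blob reference in memory until the page unloads. A sketch of the same playback flow that releases the URL when the clip ends (a hypothetical wrapper, not part of the commit; it reuses base64ToBlob from src/utils):

// Illustrative helper: play a base64 data URL and revoke the object URL afterwards.
function playBase64(dataUrl: string, onStateChange: (playing: boolean) => void): HTMLAudioElement {
  const url = URL.createObjectURL(base64ToBlob(dataUrl))
  const player = new Audio(url)
  player.onplay = () => onStateChange(true)
  player.onended = () => {
    URL.revokeObjectURL(url) // free the Blob once playback finishes
    onStateChange(false)
  }
  player.play()
  return player
}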

src/stores/index.ts (+2 −2)

@@ -24,7 +24,7 @@ export interface Conversation {
   key: Key // name; unique identifier
   name: string // display name
   desc: string
-  chatMessages: ChatMessage[] // chat messages
+  chatMessages: ChatMessageWithAudioUrl[] // chat messages
   language: string // tts stt
   voice: string // see https://aka.ms/speech/tts-languages
   avatar: string // user avatar
@@ -66,7 +66,7 @@ export const useConversationStore = defineStore('conversation', {
     },
   },
   actions: {
-    changeConversations(chatMessages: ChatMessage[]) {
+    changeConversations(chatMessages: ChatMessageWithAudioUrl[]) {
       this.chatMessages(this.currentKey)!.chatMessages = chatMessages
     },
     changeCurrentKey(key: Key) {

src/types..d.ts (+4)

@@ -27,3 +27,7 @@ interface ImagePayload {
   n?: number
   size?: string
 }
+
+interface ChatMessageWithAudioUrl extends ChatMessage {
+  audioBlob?: string // base64
+}
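Because audioBlob is optional, this is a strict widening: every existing ChatMessage is already a valid ChatMessageWithAudioUrl, which is why src/stores/index.ts can swap the array type without migrating stored conversations. A quick illustration (the ChatMessage shape here is assumed for the sketch; the real one lives elsewhere in the repo):

// Assumed shape, for illustration only.
interface ChatMessage { role: string; content: string }
interface ChatMessageWithAudioUrl extends ChatMessage { audioBlob?: string }

const plain: ChatMessage = { role: 'user', content: 'hi' }
const widened: ChatMessageWithAudioUrl = plain // OK: audioBlob may simply be absent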

src/utils/tools.ts (+26)

@@ -25,3 +25,29 @@ export async function fetchWithTimeout(
 
   return response
 }
+
+export function blobToBase64(blob: Blob) {
+  return new Promise<string>((resolve, reject) => {
+    const reader = new FileReader()
+    if (blob.size === 0)
+      return resolve('')
+
+    reader.readAsDataURL(blob)
+    reader.onload = function () {
+      const dataUrl = reader.result
+      resolve(dataUrl!.toString())
+    }
+    reader.onerror = reject
+  })
+}
+
+export function base64ToBlob(dataUrl: string) {
+  const arr = dataUrl.split(',')
+  const mime = arr[0].match(/:(.*?);/)![1]
+  const bstr = atob(arr[1])
+  let n = bstr.length
+  const u8arr = new Uint8Array(n)
+  while (n--)
+    u8arr[n] = bstr.charCodeAt(n)
+  return new Blob([u8arr], { type: mime })
+}
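These two helpers are inverses over data URLs of the form data:<mime>;base64,<payload>, which is exactly what FileReader.readAsDataURL produces. A round-trip check, assuming a browser environment:

// Round-trip sketch (illustrative, browser only): Blob -> data URL -> Blob.
async function roundTripDemo() {
  const original = new Blob(['hello'], { type: 'text/plain' })
  const dataUrl = await blobToBase64(original) // 'data:text/plain;base64,aGVsbG8='
  const restored = base64ToBlob(dataUrl)
  console.log(restored.type, restored.size) // 'text/plain' 5
}

One caveat: blobToBase64 resolves to '' for an empty Blob, and base64ToBlob would throw on that input (its mime regex matches nothing), so callers rely on the empty string being filtered out first; the template's v-if="item.audioBlob" in Content.vue does exactly that.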
