perf: tts buffer

c121914yu · Nov 16, 2023 · 7ca6e3f · 7ca6e3f
1 parent db14c06
commit 7ca6e3f
Show file tree

Hide file tree

Showing 16 changed files with 125 additions and 84 deletions.
diff --git a/packages/global/core/ai/model.d.ts b/packages/global/core/ai/model.d.ts
@@ -29,5 +29,7 @@ export type AudioSpeechModelType = {
   model: string;
   name: string;
   price: number;
-  voices: { label: string; value: string }[];
+  baseUrl?: string;
+  key?: string;
+  voices: { label: string; value: string; bufferId: string }[];
 };
diff --git a/packages/global/core/ai/model.ts b/packages/global/core/ai/model.ts
@@ -107,12 +107,12 @@ export const defaultAudioSpeechModels: AudioSpeechModelType[] = [
     name: 'OpenAI TTS1',
     price: 0,
     voices: [
-      { label: 'Alloy', value: 'Alloy' },
-      { label: 'Echo', value: 'Echo' },
-      { label: 'Fable', value: 'Fable' },
-      { label: 'Onyx', value: 'Onyx' },
-      { label: 'Nova', value: 'Nova' },
-      { label: 'Shimmer', value: 'Shimmer' }
+      { label: 'Alloy', value: 'Alloy', bufferId: 'openai-Alloy' },
+      { label: 'Echo', value: 'Echo', bufferId: 'openai-Echo' },
+      { label: 'Fable', value: 'Fable', bufferId: 'openai-Fable' },
+      { label: 'Onyx', value: 'Onyx', bufferId: 'openai-Onyx' },
+      { label: 'Nova', value: 'Nova', bufferId: 'openai-Nova' },
+      { label: 'Shimmer', value: 'Shimmer', bufferId: 'openai-Shimmer' }
     ]
   }
 ];
diff --git a/packages/global/core/ai/speech/constant.ts b/packages/global/core/ai/speech/constant.ts
diff --git a/packages/global/core/app/type.d.ts b/packages/global/core/app/type.d.ts
@@ -1,7 +1,6 @@
 import { ModuleItemType } from '../module/type';
 import { AppTypeEnum } from './constants';
 import { PermissionTypeEnum } from '../../support/permission/constant';
-import { Text2SpeechVoiceEnum } from '../ai/speech/constant';
 
 export interface AppSchema {
   _id: string;

diff --git a/packages/global/core/chat/type.d.ts b/packages/global/core/chat/type.d.ts
@@ -39,7 +39,6 @@ export type ChatItemSchema = {
   userFeedback?: string;
   adminFeedback?: AdminFbkType;
   [TaskResponseKeyEnum.responseData]?: ChatHistoryItemResType[];
-  tts?: Buffer;
 };
 
 export type AdminFbkType = {

diff --git a/packages/service/common/buffer/tts/schema.ts b/packages/service/common/buffer/tts/schema.ts
@@ -0,0 +1,35 @@
+import { connectionMongo, type Model } from '../../../common/mongo';
+const { Schema, model, models } = connectionMongo;
+import { TTSBufferSchemaType } from './type.d';
+
+export const collectionName = 'ttsbuffers';
+
+const TTSBufferSchema = new Schema({ // cache of synthesized speech, looked up by bufferId + text
+  bufferId: { // cache key of the voice (taken from the voice config, e.g. 'openai-Alloy')
+    type: String,
+    required: true
+  },
+  text: { // the input text that was converted to speech
+    type: String,
+    required: true
+  },
+  buffer: { // audio data handed to onSuccess by text2Speech
+    type: Buffer,
+    required: true
+  },
+  createTime: { // creation timestamp; the TTL index below is built on this field
+    type: Date,
+    default: () => new Date()
+  }
+});
+
+try {
+  TTSBufferSchema.index({ bufferId: 1 }); // ascending index on the cache key used for lookups
+  // TTL index: cached audio expires about 24 hours after its createTime
+  TTSBufferSchema.index({ createTime: 1 }, { expireAfterSeconds: 24 * 60 * 60 });
+} catch (error) {
+  console.log(error); // index definition failures are logged only, never rethrown
+}
+
+export const MongoTTSBuffer: Model<TTSBufferSchemaType> =
+  models[collectionName] || model(collectionName, TTSBufferSchema); // reuse the registered model if one exists, else register it
diff --git a/packages/service/common/buffer/tts/type.d.ts b/packages/service/common/buffer/tts/type.d.ts
@@ -0,0 +1,5 @@
+export type TTSBufferSchemaType = { // shape of one cached text-to-speech result
+  bufferId: string; // cache key of the voice that produced the audio
+  text: string; // input text that was synthesized
+  buffer: Buffer; // synthesized audio data
+};
diff --git a/packages/service/core/ai/audio/speech.ts b/packages/service/core/ai/audio/speech.ts
@@ -1,28 +1,31 @@
 import type { NextApiResponse } from 'next';
 import { getAIApi } from '../config';
 import { defaultAudioSpeechModels } from '../../../../global/core/ai/model';
-import { Text2SpeechVoiceEnum } from '@fastgpt/global/core/ai/speech/constant';
+import { UserModelSchema } from '@fastgpt/global/support/user/type';
 
 export async function text2Speech({
   res,
   onSuccess,
   onError,
-  model = defaultAudioSpeechModels[0].model,
-  voice = Text2SpeechVoiceEnum.alloy,
   input,
-  speed = 1
+  model = defaultAudioSpeechModels[0].model,
+  voice,
+  speed = 1,
+  props
 }: {
   res: NextApiResponse;
   onSuccess: (e: { model: string; buffer: Buffer }) => void;
   onError: (e: any) => void;
-  model?: string;
-  voice?: `${Text2SpeechVoiceEnum}`;
   input: string;
+  model: string;
+  voice: string;
   speed?: number;
+  props?: UserModelSchema['openaiAccount'];
 }) {
-  const ai = getAIApi();
+  const ai = getAIApi(props);
   const response = await ai.audio.speech.create({
     model,
+    // @ts-ignore
     voice,
     input,
     response_format: 'mp3',

diff --git a/packages/service/core/chat/chatItemSchema.ts b/packages/service/core/chat/chatItemSchema.ts
@@ -68,9 +68,6 @@ const ChatItemSchema = new Schema({
   [TaskResponseKeyEnum.responseData]: {
     type: Array,
     default: []
-  },
-  tts: {
-    type: Buffer
   }
 });
 

diff --git a/projects/app/data/config.json b/projects/app/data/config.json
@@ -103,13 +103,15 @@
       "model": "tts-1",
       "name": "OpenAI TTS1",
       "price": 0,
+      "baseUrl": "https://api.openai.com/v1",
+      "key": "",
       "voices": [
-        { "label": "Alloy", "value": "Alloy" },
-        { "label": "Echo", "value": "Echo" },
-        { "label": "Fable", "value": "Fable" },
-        { "label": "Onyx", "value": "Onyx" },
-        { "label": "Nova", "value": "Nova" },
-        { "label": "Shimmer", "value": "Shimmer" }
+        { "label": "Alloy", "value": "alloy", "bufferId": "openai-Alloy" },
+        { "label": "Echo", "value": "echo", "bufferId": "openai-Echo" },
+        { "label": "Fable", "value": "fable", "bufferId": "openai-Fable" },
+        { "label": "Onyx", "value": "onyx", "bufferId": "openai-Onyx" },
+        { "label": "Nova", "value": "nova", "bufferId": "openai-Nova" },
+        { "label": "Shimmer", "value": "shimmer", "bufferId": "openai-Shimmer" }
       ]
     }
   ]

diff --git a/projects/app/src/global/core/chat/api.d.ts b/projects/app/src/global/core/chat/api.d.ts
@@ -1,7 +1,6 @@
 import type { AppTTSConfigType } from '@/types/app';
 
 export type GetChatSpeechProps = {
-  chatItemId?: string;
   ttsConfig: AppTTSConfigType;
   input: string;
 };
diff --git a/projects/app/src/pages/api/core/chat/item/getSpeech.ts b/projects/app/src/pages/api/core/chat/item/getSpeech.ts
@@ -1,12 +1,13 @@
 import type { NextApiRequest, NextApiResponse } from 'next';
 import { jsonRes } from '@fastgpt/service/common/response';
 import { connectToDatabase } from '@/service/mongo';
-import { MongoChatItem } from '@fastgpt/service/core/chat/chatItemSchema';
 import { GetChatSpeechProps } from '@/global/core/chat/api.d';
 import { text2Speech } from '@fastgpt/service/core/ai/audio/speech';
 import { pushAudioSpeechBill } from '@/service/support/wallet/bill/push';
 import { authCert } from '@fastgpt/service/support/permission/auth/common';
 import { authType2BillSource } from '@/service/support/wallet/bill/utils';
+import { getAudioSpeechModel } from '@/service/core/ai/model';
+import { MongoTTSBuffer } from '@fastgpt/service/common/buffer/tts/schema';
 
 /* 
 1. get tts from chatItem store
@@ -18,31 +19,43 @@ import { authType2BillSource } from '@/service/support/wallet/bill/utils';
 export default async function handler(req: NextApiRequest, res: NextApiResponse) {
   try {
     await connectToDatabase();
-    const { chatItemId, ttsConfig, input } = req.body as GetChatSpeechProps;
+    const { ttsConfig, input } = req.body as GetChatSpeechProps;
+
+    if (!ttsConfig.model || !ttsConfig.voice) {
+      throw new Error('model or voice not found');
+    }
 
     const { teamId, tmbId, authType } = await authCert({ req, authToken: true });
 
-    const chatItem = await (async () => {
-      if (!chatItemId) return null;
-      return await MongoChatItem.findOne(
-        {
-          dataId: chatItemId
-        },
-        'tts'
-      );
-    })();
+    const ttsModel = getAudioSpeechModel(ttsConfig.model);
+    const voiceData = ttsModel.voices.find((item) => item.value === ttsConfig.voice);
+
+    if (!voiceData) {
+      throw new Error('voice not found');
+    }
+
+    const ttsBuffer = await MongoTTSBuffer.findOne(
+      {
+        bufferId: voiceData.bufferId,
+        text: input
+      },
+      'buffer'
+    );
 
-    if (chatItem?.tts) {
-      return jsonRes(res, {
-        data: chatItem.tts
-      });
+    if (ttsBuffer?.buffer) {
+      return res.end(new Uint8Array(ttsBuffer.buffer.buffer));
     }
 
     await text2Speech({
+      res,
+      input,
       model: ttsConfig.model,
       voice: ttsConfig.voice,
-      input,
-      res,
+      props: {
+        // temp code
+        baseUrl: ttsModel.baseUrl || '',
+        key: ttsModel.key || ''
+      },
       onSuccess: async ({ model, buffer }) => {
         try {
           pushAudioSpeechBill({
@@ -53,9 +66,11 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse)
             source: authType2BillSource({ authType })
           });
 
-          if (!chatItem) return;
-          chatItem.tts = buffer;
-          await chatItem.save();
+          await MongoTTSBuffer.create({
+            bufferId: voiceData.bufferId,
+            text: input,
+            buffer
+          });
         } catch (error) {}
       },
       onError: (err) => {

diff --git a/projects/app/src/pages/api/system/getInitData.ts b/projects/app/src/pages/api/system/getInitData.ts
@@ -35,7 +35,11 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse)
       cqModels: global.cqModels,
       extractModels: global.extractModels,
       vectorModels: global.vectorModels,
-      audioSpeechModels: global.audioSpeechModels,
+      audioSpeechModels: global.audioSpeechModels.map((item) => ({
+        ...item,
+        baseUrl: undefined,
+        key: undefined
+      })),
       priceMd: global.priceMd,
       systemVersion: global.systemVersion || '0.0.0'
     }

diff --git a/projects/app/src/pages/app/detail/components/TTSSelect.tsx b/projects/app/src/pages/app/detail/components/TTSSelect.tsx
@@ -6,10 +6,10 @@ import React, { useCallback, useMemo } from 'react';
 import { useTranslation } from 'next-i18next';
 import MySelect from '@/components/Select';
 import { TTSTypeEnum } from '@/constants/app';
-import { Text2SpeechVoiceEnum, openaiTTSModel } from '@fastgpt/global/core/ai/speech/constant';
 import { AppTTSConfigType } from '@/types/app';
 import { useAudioPlay } from '@/web/common/utils/voice';
 import { useLoading } from '@/web/common/hooks/useLoading';
+import { audioSpeechModels } from '@/web/common/system/staticData';
 
 const TTSSelect = ({
   value,
@@ -37,10 +37,16 @@ const TTSSelect = ({
       if (e === TTSTypeEnum.none || e === TTSTypeEnum.web) {
         onChange({ type: e as `${TTSTypeEnum}` });
       } else {
+        const audioModel = audioSpeechModels.find((item) =>
+          item.voices.find((voice) => voice.value === e)
+        );
+        if (!audioModel) {
+          return;
+        }
         onChange({
           type: TTSTypeEnum.model,
-          model: openaiTTSModel,
-          voice: e as `${Text2SpeechVoiceEnum}`,
+          model: audioModel.model,
+          voice: e,
           speed: 1
         });
       }
@@ -77,12 +83,7 @@ const TTSSelect = ({
         list={[
           { label: t('core.app.tts.Close'), value: TTSTypeEnum.none },
           { label: t('core.app.tts.Web'), value: TTSTypeEnum.web },
-          { label: 'Alloy', value: Text2SpeechVoiceEnum.alloy },
-          { label: 'Echo', value: Text2SpeechVoiceEnum.echo },
-          { label: 'Fable', value: Text2SpeechVoiceEnum.fable },
-          { label: 'Onyx', value: Text2SpeechVoiceEnum.onyx },
-          { label: 'Nova', value: Text2SpeechVoiceEnum.nova },
-          { label: 'Shimmer', value: Text2SpeechVoiceEnum.shimmer }
+          ...audioSpeechModels.map((item) => item.voices).flat()
         ]}
         onchange={onclickChange}
       />

diff --git a/projects/app/src/types/app.d.ts b/projects/app/src/types/app.d.ts
@@ -15,7 +15,6 @@ import type { FlowModuleTemplateType, ModuleItemType } from '@fastgpt/global/cor
 import type { ChatSchema } from '@fastgpt/global/core/chat/type';
 import type { AppSchema } from '@fastgpt/global/core/app/type';
 import { ChatModelType } from '@/constants/model';
-import { Text2SpeechVoiceEnum } from '@fastgpt/global/core/ai/speech/constant';
 
 export interface ShareAppItem {
   _id: string;
@@ -40,7 +39,7 @@ export type VariableItemType = {
 export type AppTTSConfigType = {
   type: 'none' | 'web' | 'model';
   model?: string;
-  voice?: `${Text2SpeechVoiceEnum}`;
+  voice?: string;
   speed?: number;
 };
 

diff --git a/projects/app/src/web/common/utils/voice.ts b/projects/app/src/web/common/utils/voice.ts
@@ -60,23 +60,21 @@ export const useAudioPlay = (props?: { ttsConfig?: AppTTSConfigType }) => {
           });
           setAudioLoading(false);
 
-          if (response.headers.get('Content-Type') === 'application/json') {
-            const { data } = (await response.json()) as { data: Buffer };
-            console.log(data);
-
-            playAudioBuffer({ audio, buffer: data });
-            return resolve({ buffer: data });
-          }
-
           if (!response.body || !response.ok) {
-            throw new Error('Speech error');
+            const data = await response.json();
+            toast({
+              status: 'error',
+              title: getErrText(data, t('core.chat.Audio Speech Error'))
+            });
+            return reject(data);
           }
 
           const audioBuffer = await readAudioStream({
             audio,
             stream: response.body,
             contentType: 'audio/mpeg'
           });
+
           resolve({
             buffer: audioBuffer
           });