From 1b9d0e1a7caaf09aa16f60178bf27bb58ad90c23 Mon Sep 17 00:00:00 2001 From: Jaeyoun Nam Date: Sat, 3 Jun 2023 14:07:14 +0900 Subject: [PATCH 1/5] feat: tts using ElevenLabs (not streaming) (#10) --- apps/api/.envrc.example | 2 ++ apps/api/main.py | 36 ++++++++++++++++++++++++++++++++++++ apps/web/src/pages/index.tsx | 29 ++++++++++++++++++++++++----- 3 files changed, 62 insertions(+), 5 deletions(-) diff --git a/apps/api/.envrc.example b/apps/api/.envrc.example index 99cef89..e334382 100644 --- a/apps/api/.envrc.example +++ b/apps/api/.envrc.example @@ -1,3 +1,5 @@ export OPENAI_API_KEY=your-key export SUPABASE_BASE_URL=https://zwxffmhaivlzfxonxdwq.supabase.co export SUPABASE_ANON_KEY=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6Inp3eGZmbWhhaXZsemZ4b254ZHdxIiwicm9sZSI6ImFub24iLCJpYXQiOjE2ODQwNDAyNjcsImV4cCI6MTk5OTYxNjI2N30.97SWP08kPH05hjMyuswjrqHqQ1-dCHcp_LxN1y3MY70 +export ELEVENLABS_API_KEY=your-key +export ELEVENLABS_VOICE_ID=EXAVITQu4vr4xnSDxMaL diff --git a/apps/api/main.py b/apps/api/main.py index ca1d23f..70bc4b5 100644 --- a/apps/api/main.py +++ b/apps/api/main.py @@ -11,8 +11,11 @@ from fastapi import FastAPI, Request, WebSocket, WebSocketDisconnect, File, Form, UploadFile from fastapi.middleware.cors import CORSMiddleware +from fastapi.responses import FileResponse + from fastapi.templating import Jinja2Templates from langchain.vectorstores import VectorStore +import requests from callback import QuestionGenCallbackHandler, StreamingLLMCallbackHandler from query_data import get_chain @@ -70,6 +73,39 @@ async def transcriptions(audioData: UploadFile,model: Annotated[str, Form()]): transcription = openai.Audio.transcribe("whisper-1", audio_file) return transcription +@app.post("/tts") +async def tts(text: Annotated[str, Form()]): + OUTPUT_FILE = "./tts/speech.mp3" + CHUNK_SIZE = 1024 + elevenlabs_api_key = os.environ["ELEVENLABS_API_KEY"] + voice_id = os.environ["ELEVENLABS_VOICE_ID"] + tts_url = f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}/stream" + headers = { + "Accept": "application/json", + "xi-api-key": elevenlabs_api_key + } + headers["Content-Type"] = "application/json" + + data = { + "text": text, + "model_id": "eleven_monolingual_v1", + "voice_settings": { + "stability": 0, + "similarity_boost":0 + } + } + + response = requests.post(tts_url, json=data, headers=headers, stream=True) + + with open(OUTPUT_FILE, 'wb+') as f: + for chunk in response.iter_content(chunk_size=CHUNK_SIZE): + if chunk: + f.write(chunk) + + return FileResponse(OUTPUT_FILE, media_type="audio/mp3") + + + @app.websocket("/chat") async def websocket_endpoint(websocket: WebSocket): await websocket.accept() diff --git a/apps/web/src/pages/index.tsx b/apps/web/src/pages/index.tsx index 2aac32a..37359f0 100644 --- a/apps/web/src/pages/index.tsx +++ b/apps/web/src/pages/index.tsx @@ -8,6 +8,8 @@ import axios from "axios"; export default function Home() { const [ws, setWs] = useState(null); const [isProcessing, setIsProcessing] = useState(false); + const audioRef = useRef(null); + const onTranscribe = async (blob: Blob) => { if (childRef.current) { @@ -28,7 +30,7 @@ export default function Home() { const { text } = await response.data; // you must return result from your server in Transcript format if (childRef.current) { - console.log("finish") + console.log("finish"); childRef.current.finish(); } @@ -65,8 +67,6 @@ export default function Home() { end: () => {}, }); - const delay = (ms: number) => new Promise((res) => setTimeout(res, ms)); - const listen = () => { // stt if (childRef.current) { @@ -88,7 +88,7 @@ export default function Home() { const endpoint = "ws://localhost:9000/chat"; const ws = new WebSocket(endpoint); - ws.onmessage = function (event) { + ws.onmessage = async function (event) { const messages = document.getElementById("messages"); const data = JSON.parse(event.data); if (data.sender === "bot") { @@ -99,7 +99,6 @@ export default function Home() { p.innerHTML = "JANOT: "; div.appendChild(p); messages.appendChild(div); - } else if (data.type === "stream") { const p = messages.lastChild.lastChild as HTMLParagraphElement; if (data.message === "\n") { @@ -109,6 +108,25 @@ export default function Home() { } } else if (data.type === "info") { } else if (data.type === "end") { + const p = messages.lastChild.lastChild as HTMLParagraphElement; + const finalText = (p.innerHTML).split("JANOT: ")[1]; + const enableTTS = false; + // Call tts + if (enableTTS) { + const formdata = new FormData(); + formdata.append("text", finalText); + const url = "http://localhost:9000/tts"; + const response = await axios.post(url, formdata, { + headers: { "Content-Type": "multipart/form-data" }, + responseType: "blob", + }); + const audioURL = URL.createObjectURL(response.data); + const audioElement = audioRef.current; + audioElement.src = audioURL; + audioElement.play(); + } + + if (childRef.current) { console.log("end"); childRef.current.end(); @@ -176,6 +194,7 @@ export default function Home() { className="overflow-auto text-center text-xl font-thin tracking-tight text-white" > +