-
Notifications
You must be signed in to change notification settings - Fork 5
/
main.py
139 lines (124 loc) · 4.56 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
# Standard library
import os
import datetime
# Third-party clients
from openai import OpenAI
from elevenlabs.client import ElevenLabs, Voice
from elevenlabs import stream
import argparse
from dataclasses import asdict
# Project-local conversation message dataclass
from models import Message
import speech_recognition as sr
import logging
from pathlib import Path
logging.basicConfig(level=logging.INFO)
import dotenv
# Load API keys (OPENAI_API_KEY, ELEVEN_API_KEY, ELEVENLABS_VOICE_ID) from .env
dotenv.load_dotenv('.env')
# Clients read their API keys from the environment populated above.
oai_client = OpenAI()
elevenlabs_client = ElevenLabs()
# Model configuration
CHAT_MODEL = "gpt-4o"
TTS_MODEL = "tts-1"
MODEL_TEMPERATURE = 0.5
AUDIO_MODEL = "whisper-1"
# ElevenLabs voice to speak with; None if the env var is unset.
VOICE_ID = os.getenv("ELEVENLABS_VOICE_ID")
def ask_gpt_chat(prompt: str, messages: list[Message]):
    """Return the chat model's reply for the given system prompt and history.

    Args:
        prompt: System prompt placed at the head of the conversation.
        messages: Prior conversation turns, oldest first.

    Returns:
        The assistant's reply text.
    """
    payload = [{"role": "system", "content": prompt}]
    payload.extend(asdict(message) for message in messages)
    completion = oai_client.chat.completions.create(
        model=CHAT_MODEL,
        messages=payload,
        temperature=MODEL_TEMPERATURE,
    )
    return completion.choices[0].message.content
def setup_prompt(prompt_file: str = 'prompts/vet_prompt.md') -> str:
    """Read the system prompt for the chat model from a file.

    Args:
        prompt_file: Path to the prompt file. ``None`` falls back to the
            default vet prompt — the CLI passes ``args.prompt_file`` through
            unconditionally, which is ``None`` when ``-pf`` is omitted, so an
            explicit ``None`` must not crash ``open()``.

    Returns:
        The prompt file's contents.
    """
    if prompt_file is None:
        prompt_file = 'prompts/vet_prompt.md'
    with open(prompt_file, encoding='utf-8') as f:
        return f.read()
def get_transcription(file_path: str):
    """Transcribe an audio file with the OpenAI Whisper API.

    Args:
        file_path: Path to the audio file (e.g. a WAV recording).

    Returns:
        The transcribed text.
    """
    # Context manager fixes the original file-handle leak: the handle is now
    # closed even if the API call raises.
    with open(file_path, "rb") as audio_file:
        transcription = oai_client.audio.transcriptions.create(
            model=AUDIO_MODEL,
            file=audio_file
        )
    return transcription.text
def record():
    """Record one utterance from the microphone, save and transcribe it.

    Side effects: writes ``./recordings/<ts>.wav`` and
    ``./transcripts/<ts>.txt``.

    Returns:
        The transcript text of the recorded audio.
    """
    # Create output directories up front so a fresh checkout doesn't crash
    # on the first write below.
    os.makedirs("./recordings", exist_ok=True)
    os.makedirs("./transcripts", exist_ok=True)
    # load the speech recognizer with CLI settings
    r = sr.Recognizer()
    m = sr.Microphone()
    with m as source:
        # Calibrate the energy threshold against current background noise.
        r.adjust_for_ambient_noise(source)
        logging.info('Listening...')
        audio = r.listen(source)
    # write audio to a WAV file named by the current timestamp
    timestamp = datetime.datetime.now().timestamp()
    wav_path = f"./recordings/{timestamp}.wav"
    with open(wav_path, "wb") as f:
        f.write(audio.get_wav_data())
    transcript = get_transcription(wav_path)
    with open(f"./transcripts/{timestamp}.txt", "w") as f:
        f.write(transcript)
    return transcript
def oai_text_to_speech(text: str):
    """Synthesize speech with the OpenAI TTS API and save it as an mp3.

    Args:
        text: The text to speak.

    Returns:
        Path to the generated mp3 file under ``outputs/``.
    """
    timestamp = datetime.datetime.now().timestamp()
    out_dir = Path(__file__).parent / "outputs"
    # The directory may not exist on a fresh checkout; create it so
    # write_to_file below doesn't fail.
    out_dir.mkdir(parents=True, exist_ok=True)
    speech_file_path = out_dir / f"{timestamp}.mp3"
    response = oai_client.audio.speech.create(
        model=TTS_MODEL,
        voice="nova",
        input=text
    )
    response.write_to_file(speech_file_path)
    return speech_file_path
def elevenlabs_text_to_speech(text: str):
    """Stream ElevenLabs TTS audio for *text* straight to the speakers."""
    voice = Voice(voice_id=VOICE_ID)
    audio_stream = elevenlabs_client.generate(text=text, voice=voice, stream=True)
    stream(audio_stream)
def clean_up(messages=None):
    """Delete intermediate audio/transcript files and save the conversation.

    Args:
        messages: Conversation messages (objects with ``role`` and
            ``content`` attributes) to persist. Defaults to the module-global
            ``conversation_messages`` built by the ``__main__`` loop — the
            original referenced that global directly, which raised NameError
            when the module was imported rather than run as a script.
    """
    logging.info('Exiting...')
    if messages is None:
        # Fall back to the global built in the __main__ loop, if any.
        messages = globals().get('conversation_messages', [])
    # Delete all the recordings, transcripts, and synthesized outputs.
    # Guard with isdir: the directories may not exist on an early exit.
    for directory in ('./recordings', './transcripts', './outputs'):
        if os.path.isdir(directory):
            for file in os.listdir(directory):
                os.remove(os.path.join(directory, file))
    # Save the conversation log.
    os.makedirs('logs', exist_ok=True)
    timestamp = datetime.datetime.now().timestamp()
    with open(f'logs/conversation_{timestamp}.txt', 'w') as f:
        for message in messages:
            f.write(f"{message.role}: {message.content}\n")
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-pf", "--prompt_file", help="Specify the prompt file to use.", type=str)
    parser.add_argument("-tts", "--tts_type", help="Specify the TTS type to use.", type=str, default="openai", choices=["openai", "elevenlabs"])
    args = parser.parse_args()
    # argparse leaves prompt_file as None when -pf is omitted; passing None
    # through to setup_prompt() would crash open(), so only forward it when
    # it was actually supplied.
    prompt = setup_prompt(args.prompt_file) if args.prompt_file else setup_prompt()
    # argparse's default already guarantees "openai"; no fallback needed.
    tts_type = args.tts_type
    conversation_messages = []
    # Conversation loop: listen -> transcribe -> ask GPT -> speak, until the
    # caller says "bye" or Ctrl-C is pressed.
    while True:
        try:
            user_input = record()
            logging.info(f'Receiver: {user_input}')
            conversation_messages.append(Message(role="user", content=user_input))
            answer = ask_gpt_chat(prompt, conversation_messages)
            logging.info(f'Caller: {answer}')
            logging.info('Playing audio...')
            if tts_type == "elevenlabs":
                # ElevenLabs path streams audio directly to the speakers.
                elevenlabs_text_to_speech(answer)
            else:
                audio_file = oai_text_to_speech(answer)
                # Play the audio file (afplay is macOS-only).
                os.system(f"afplay {audio_file}")
            conversation_messages.append(Message(role="assistant", content=answer))
            # End the call when the caller says goodbye.
            if 'bye' in user_input.lower():
                clean_up()
                break
        except KeyboardInterrupt:
            clean_up()
            break