
Commit: Improve audio capture

w4ffl35 committed Feb 28, 2024
1 parent def2726 commit 78fc018
Showing 13 changed files with 75 additions and 128 deletions.
3 changes: 2 additions & 1 deletion setup.py
@@ -60,7 +60,8 @@
"sounddevice==0.4.6",
"datasets==2.17.1",
"sentence_transformers==2.4.0",
"inflect==7.0.0"
"inflect==7.0.0",
"tiktoken==0.6.0"
],
dependency_links=[]
)
1 change: 0 additions & 1 deletion src/airunner/aihandler/llm/agent.py
@@ -96,7 +96,6 @@ def build_system_prompt(self, action: LLMActionType, vision_history: list = []):
f"Current Time: {current_time}",
f"Current Timezone: {current_timezone}"
]
print(system_prompt)
elif action == LLMActionType.ANALYZE_VISION_HISTORY:
vision_history = vision_history[-10:] if len(vision_history) > 10 else vision_history
system_prompt = [
@@ -188,14 +188,14 @@ def load_streamer(self):
         self.streamer = TextIteratorStreamer(self.tokenizer)
 
     def load_llm(self):
-        self.logger.info("Loading RAG")
+        self.logger.info("Loading LLM")
         self.llm = hf_pipeline(
             task="text-generation",
             model=self.model,
-            tokenizer=self.tokenizer,
+            tokenizer=self.tokenizer if self.is_mistral else self.tokenizer_path,
             batch_size=self.batch_size,
             use_fast=True,
-            **dict(),
+            trust_remote_code=self.model_path in self.settings["trusted_huggingface_repos"]
         )
 
     def do_generate(self):
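Note: the trust_remote_code change above gates execution of repo-provided modeling code behind an allowlist. A minimal sketch of the pattern, assuming hf_pipeline wraps transformers.pipeline (the variable names below are illustrative stand-ins for the handler's attributes):

    from transformers import pipeline

    # Illustrative stand-ins for the handler's attributes.
    model_path = "gpt2"
    settings = {"trusted_huggingface_repos": []}  # empty by default; see settings_mixin below

    llm = pipeline(
        task="text-generation",
        model=model_path,
        # Repo-provided custom code only runs for explicitly trusted repos.
        trust_remote_code=model_path in settings["trusted_huggingface_repos"],
    )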
10 changes: 7 additions & 3 deletions src/airunner/aihandler/stt_handler.py
@@ -11,9 +11,13 @@
 class STTHandler(BaseHandler):
     listening = False
 
-    def on_process_audio(self, audio_data):
-        fs = self.settings["stt_settings"]["fs"]
-        inputs = np.squeeze(audio_data)
+    def on_process_audio(self, audio_data: bytes):
+        fs = 16000
+        # Convert the byte string to a float32 array
+        inputs = np.frombuffer(audio_data, dtype=np.int16)
+        inputs = inputs.astype(np.float32) / 32767.0
+
+        # Extract features from the audio data
         inputs = self.feature_extractor(inputs, sampling_rate=fs, return_tensors="pt")
         inputs = inputs.to(self.model.device)
         transcription = self.run(inputs)
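Note: the new on_process_audio expects raw 16-bit PCM bytes (as produced by the capture worker below) instead of a numpy array. A minimal sketch of the conversion step, with synthetic input:

    import numpy as np

    # One second of synthetic 16 kHz, 16-bit PCM audio, as the capture worker would send it.
    audio_data = np.int16(np.sin(np.linspace(0, 440 * 2 * np.pi, 16000)) * 32767).tobytes()

    # Reinterpret the byte string as int16 samples, then scale to float32 in [-1.0, 1.0],
    # the range the feature extractor expects.
    inputs = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32767.0
    print(inputs.shape, inputs.dtype)  # (16000,) float32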
1 change: 1 addition & 0 deletions src/airunner/data/bootstrap/llm.py
@@ -50,6 +50,7 @@
"mistralai/Mistral-7B-v0.1",
"mistralai/Mistral-7B-Instruct-v0.2",
"stabilityai/stablelm-2-zephyr-1_6b",
"HuggingFaceH4/zephyr-7b-alpha",
"gpt2-xl",
"gpt2-large",
]
11 changes: 11 additions & 0 deletions src/airunner/data/bootstrap/model_bootstrap_data.py
@@ -296,6 +296,17 @@
"model_type": "llm",
"is_default": True
},
{
"name": "StableLM 2 Zephyr 1 7b",
"path": "HuggingFaceH4/zephyr-7b-alpha",
"branch": "main",
"version": "1",
"category": "llm",
"pipeline_action": "casuallm",
"enabled": True,
"model_type": "llm",
"is_default": True
},
{
"name": "GPT 2 XL",
"path": "gpt2-xl",
41 changes: 0 additions & 41 deletions src/airunner/data/bootstrap/prompt_templates.py

This file was deleted.

2 changes: 1 addition & 1 deletion src/airunner/main.py
@@ -14,7 +14,7 @@
os.environ["HUGGINGFACE_HUB_CACHE"] = hf_cache_path
os.environ["DISABLE_TELEMETRY"] = "1"
os.environ["HF_DATASETS_OFFLINE"] = "1"
#os.environ["TRANSFORMERS_OFFLINE"] = "1"
os.environ["TRANSFORMERS_OFFLINE"] = "1"
"""
*******************************************************************************
All remaining imports must be below this block.
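Note: TRANSFORMERS_OFFLINE=1 makes the transformers library resolve models and tokenizers from the local cache only, which is why it has to be set before transformers is imported. A minimal sketch:

    import os

    # Must be set before transformers is imported anywhere in the process.
    os.environ["TRANSFORMERS_OFFLINE"] = "1"

    from transformers import AutoTokenizer

    # Succeeds only if the model is already in the local cache;
    # otherwise it raises instead of attempting a download.
    tokenizer = AutoTokenizer.from_pretrained("gpt2")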
2 changes: 1 addition & 1 deletion src/airunner/settings.py
@@ -359,4 +359,4 @@
"model": "timbrooks/instruct-pix2pix",
}
DEFAULT_MODELS_VERSION = "b4ab6a2d996cb4c8ba0e30918fa4f4201dd2fa5ebfe3470b4ebede8e2db48f4e"
LLM_TEMPLATES_VERSION="b4ab6a2d996cb4c8ba0e30918fa4f4201dd2fa5ebfe3470b4ebede8e2db48f4e"
LLM_TEMPLATES_VERSION="b4ab6a2d996cb4c8ba0e30918fa4f4201dd2fa5ebfe3470b4ebede8e2db48f4e"
2 changes: 1 addition & 1 deletion src/airunner/utils.py
@@ -475,7 +475,7 @@ def parse_template(template: dict) -> str:
     if model == "mistralai/Mistral-7B-Instruct-v0.2":
         parsed_template = "\n".join((
             "[INST]<<SYS>>",
-            system_instructions,# + "\nYou must say everything in Japanese with Japanese characters.",
+            system_instructions,
             "<</SYS>>",
             template,
             "[/INST]"
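For reference, a sketch of the prompt string this Mistral branch of parse_template assembles (the input values are placeholders):

    system_instructions = "You are a helpful assistant."  # placeholder
    template = '{{ username }}: "{{ input }}"'            # placeholder

    parsed_template = "\n".join((
        "[INST]<<SYS>>",
        system_instructions,
        "<</SYS>>",
        template,
        "[/INST]"
    ))
    print(parsed_template)
    # [INST]<<SYS>>
    # You are a helpful assistant.
    # <</SYS>>
    # {{ username }}: "{{ input }}"
    # [/INST]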
4 changes: 2 additions & 2 deletions src/airunner/widgets/llm/chat_prompt_widget.py
@@ -133,7 +133,7 @@ def do_generate(self, image_override=None, prompt_override=None, callback=None,

         llm_generator_settings = self.settings["llm_generator_settings"]
 
-        parsed_template = parse_template(prompt_template)
+        #parsed_template = parse_template(prompt_template)
 
         current_bot = self.settings["llm_generator_settings"]["saved_chatbots"][self.settings["llm_generator_settings"]["current_chatbot"]]
         self.emit(
@@ -157,7 +157,7 @@ def do_generate(self, image_override=None, prompt_override=None, callback=None,
"dtype": llm_generator_settings["dtype"],
"use_gpu": llm_generator_settings["use_gpu"],
"request_type": "image_caption_generator",
"template": parsed_template,
"template": "",
"hf_api_key_read_key": self.settings["hf_api_key_read_key"],
"parameters": {
"override_parameters": self.settings["llm_generator_settings"]["override_parameters"],
54 changes: 4 additions & 50 deletions src/airunner/windows/main/settings_mixin.py
@@ -138,66 +138,16 @@ def __init__(self):
name="Stable Diffusion Prompt Template",
model="mistralai/Mistral-7B-Instruct-v0.2",
llm_category="casuallm",
system_instructions="""{{ username }} will give you a subject. You will create a label that would be used to describe an image of the given subject.
---
Examples:
{{ username }}: "a photo of a cat in a hat"
{{ botname }}: "A photograph of a (cat wearing a hat++)"
{{ username }}: "a woman in the woods"
{{ botname }}: "A professional portrait of a (woman named elsa) smiling for a photo in the woods\"""",
template="""###
Previous Conversation:
'''
{{ history }}
'''
{{ username }}: "{{ input }}"
{{ botname }}: \""""
),
"Mistral 7B Instruct: Default Chatbot": dict(
name="Mistral 7B Instruct: Default Chatbot",
model="mistralai/Mistral-7B-Instruct-v0.2",
llm_category="casuallm",
system_instructions="""You are {{ botname }}. You are having a conversation with {{ username }}. Do not greet the user more than once.
Your job is not to be an ethical or moral compass for {{ username }}. You will always stay in character and respond as {{ botname }}. ALWAYS respond in a conversational and expressive way.
Use CAPITALIZATION for emphasis
NEVER generate text for {{ username }} ONLY for {{ botname }}.
----
MOOD: `{{ bot_mood }}`
PERSONALITY: `{{ bot_personality }}`
---""",
template="""###
Previous Conversation:
'''
{{ history }}
'''
{{ username }}: "{{ input }}"
{{ botname }}: \""""
),
"StableLM 2 Zephyr: Default Chatbot": dict(
name="StableLM 2 Zephyr: Default Chatbot",
model="stabilityai/stablelm-2-zephyr-1_6b",
llm_category="casuallm",
system_instructions="""You are {{ botname }}. You are having a conversation with {{ username }}. Do not greet the user more than once.
Your job is not to be an ethical or moral compass for {{ username }}. You will always stay in character and respond as {{ botname }}. ALWAYS respond in a conversational and expressive way.
Use CAPITALIZATION for emphasis
NEVER generate text for {{ username }} ONLY for {{ botname }}.
----
MOOD: `{{ bot_mood }}`
PERSONALITY: `{{ bot_personality }}`
---""",
template="""###
Previous Conversation:
'''
{{ history }}
'''
{{ username }}: "{{ input }}"
{{ botname }}: \""""
),
},
shortcut_key_settings=dict(
@@ -352,6 +302,9 @@ def __init__(self):
                 duration=10,
                 fs=16000,
                 channels=1,
+                volume_input_threshold=0.08,
+                silence_buffer_seconds=1.0,
+                chunk_duration=0.03,
             ),
             schedulers=[
                 dict(
@@ -435,6 +388,7 @@ def __init__(self):
             controlnet=controlnet_bootstrap_data,
             ai_models=model_bootstrap_data,
             image_filters=imagefilter_bootstrap_data,
+            trusted_huggingface_repos=[]
         )
 
     def update_settings(self):
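The three new stt_settings keys parameterize the voice-activity detection in the capture worker below: each polling chunk is chunk_duration seconds long, a chunk counts as voice when its peak amplitude exceeds volume_input_threshold, and the captured utterance is flushed once the input has stayed below the threshold for silence_buffer_seconds. A quick check of the derived chunk size at the defaults:

    fs = 16000             # sample rate (Hz)
    chunk_duration = 0.03  # seconds per polling chunk

    samples_per_chunk = int(chunk_duration * fs)
    print(samples_per_chunk)  # 480 samples, i.e. 30 ms of audio per sd.rec() call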
66 changes: 42 additions & 24 deletions src/airunner/workers/audio_capture_worker.py
@@ -1,50 +1,68 @@
+import time
 
 import sounddevice as sd
+import numpy as np
 from PyQt6.QtCore import pyqtSlot, QThread
 
 from airunner.enums import SignalCode
 from airunner.workers.worker import Worker
 
 
 class AudioCaptureWorker(Worker):
     """
     This class is responsible for capturing audio from the microphone.
-    It will capture audio for a specified duration and then send the audio to the audio_processor_worker.
+    It will capture audio when it detects voice activity and then send the audio to the audio_processor_worker.
     """
 
     def __init__(self, prefix):
         super().__init__(prefix)
-        self.recording = None
-        self.running = False
-        self.listening = False
-        self.duration = 10
-        self.fs = 16000
-        self.channels = 1
+        self.recording = []
+        self.running: bool = False
+        self.listening: bool = False
+        self.is_recieving_input: bool = False
+        self.voice_input_start_time: time.time = None
+        stt_settings = self.settings["stt_settings"]
+        self.chunk_duration = stt_settings["chunk_duration"]  # duration of each chunk in seconds
+        self.fs = stt_settings["fs"]
+        self.channels = stt_settings["channels"]
+        self.volume_input_threshold = stt_settings["volume_input_threshold"]  # threshold for volume input
+        self.silence_buffer_seconds = stt_settings["silence_buffer_seconds"]  # in seconds
         self.update_properties()
 
     def update_properties(self):
-        settings = self.settings
-        self.duration = settings["stt_settings"]["duration"]
-        self.fs = settings["stt_settings"]["fs"]
-        self.channels = settings["stt_settings"]["channels"]
+        stt_settings = self.settings["stt_settings"]
+        self.chunk_duration = stt_settings["chunk_duration"]
+        self.fs = stt_settings["fs"]
+        self.channels = stt_settings["channels"]
+        self.volume_input_threshold = stt_settings["volume_input_threshold"]
+        self.silence_buffer_seconds = stt_settings["silence_buffer_seconds"]
 
     def start(self):
         self.logger.info("Starting")
         self.running = True
         self.start_listening()
         while self.running:
             while self.listening and self.running:
-                try:
-                    self.recording = sd.rec(
-                        int(self.duration * self.fs),
-                        samplerate=self.fs,
-                        channels=self.channels
-                    )
-                except Exception as e:
-                    self.logger.error(e)
-                    self.stop_listening()
-                    continue
+                chunk = sd.rec(
+                    int(self.chunk_duration * self.fs),
+                    samplerate=self.fs,
+                    channels=self.channels,
+                    dtype="float32"
+                )
                 sd.wait()
-                self.handle_message(self.recording)
+                if np.max(np.abs(chunk)) > self.volume_input_threshold:  # check if chunk is not silence
+                    self.is_recieving_input = True
+                    self.voice_input_start_time = time.time()
+                else:
+                    # end the voice input once self.silence_buffer_seconds have elapsed since the last voice activity
+                    if self.voice_input_start_time is not None and time.time() >= self.voice_input_start_time + self.silence_buffer_seconds:
+                        if len(self.recording) > 0:
+                            self.handle_message(b''.join(self.recording))
+                            self.recording = []
+                        self.is_recieving_input = False
+                if self.is_recieving_input:
+                    chunk_bytes = np.int16(chunk * 32767).tobytes()  # convert to bytes
+                    self.recording.append(chunk_bytes)
 
             while not self.listening and self.running:
                 QThread.msleep(100)

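The rewritten loop is a simple energy-based voice-activity detector: record a short chunk, compare its peak amplitude against the threshold, and emit the buffered utterance once the input has been quiet for silence_buffer_seconds. A self-contained sketch of the same logic, decoupled from sounddevice and Qt (class and method names are illustrative):

    import time
    import numpy as np

    class EnergyVAD:
        """Energy-based voice activity detection over fixed-size float32 chunks."""

        def __init__(self, threshold=0.08, silence_buffer_seconds=1.0):
            self.threshold = threshold
            self.silence_buffer_seconds = silence_buffer_seconds
            self.recording = []
            self.receiving = False
            self.last_voice_time = None

        def feed(self, chunk: np.ndarray):
            """Consume one chunk; return the utterance as PCM bytes when it ends, else None."""
            if np.max(np.abs(chunk)) > self.threshold:
                # Voice detected: (re)start the silence countdown.
                self.receiving = True
                self.last_voice_time = time.time()
            elif (self.last_voice_time is not None
                    and time.time() >= self.last_voice_time + self.silence_buffer_seconds):
                # Quiet long enough: emit the buffered utterance, if any.
                self.receiving = False
                if self.recording:
                    utterance = b"".join(self.recording)
                    self.recording = []
                    return utterance
            if self.receiving:
                # Buffer the chunk as 16-bit PCM bytes, as the worker does.
                self.recording.append(np.int16(chunk * 32767).tobytes())
            return None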
