
Commit: Improve audio capture

w4ffl35 committed Feb 28, 2024
1 parent def2726 commit 78fc018
Showing 13 changed files with 75 additions and 128 deletions.
3 changes: 2 additions & 1 deletion setup.py
@@ -60,7 +60,8 @@
"sounddevice==0.4.6",
"datasets==2.17.1",
"sentence_transformers==2.4.0",
"inflect==7.0.0"
"inflect==7.0.0",
"tiktoken==0.6.0"
],
dependency_links=[]
)
1 change: 0 additions & 1 deletion src/airunner/aihandler/llm/agent.py
@@ -96,7 +96,6 @@ def build_system_prompt(self, action: LLMActionType, vision_history: list = []):
f"Current Time: {current_time}",
f"Current Timezone: {current_timezone}"
]
print(system_prompt)
elif action == LLMActionType.ANALYZE_VISION_HISTORY:
vision_history = vision_history[-10:] if len(vision_history) > 10 else vision_history
system_prompt = [
@@ -188,14 +188,14 @@ def load_streamer(self):
         self.streamer = TextIteratorStreamer(self.tokenizer)
 
     def load_llm(self):
-        self.logger.info("Loading RAG")
+        self.logger.info("Loading LLM")
         self.llm = hf_pipeline(
             task="text-generation",
             model=self.model,
-            tokenizer=self.tokenizer,
+            tokenizer=self.tokenizer if self.is_mistral else self.tokenizer_path,
             batch_size=self.batch_size,
             use_fast=True,
-            **dict(),
+            trust_remote_code=self.model_path in self.settings["trusted_huggingface_repos"]
         )
 
     def do_generate(self):
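Note: the trust_remote_code change above gates execution of repo-provided modeling code behind an allowlist. A minimal sketch of the pattern, assuming hf_pipeline wraps transformers.pipeline (the variable names below are illustrative stand-ins for the handler's attributes):

    from transformers import pipeline

    # Illustrative stand-ins for the handler's attributes.
    model_path = "gpt2"
    settings = {"trusted_huggingface_repos": []}  # empty by default; see settings_mixin below

    llm = pipeline(
        task="text-generation",
        model=model_path,
        # Repo-provided custom code only runs for explicitly trusted repos.
        trust_remote_code=model_path in settings["trusted_huggingface_repos"],
    )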
10 changes: 7 additions & 3 deletions src/airunner/aihandler/stt_handler.py
@@ -11,9 +11,13 @@
 class STTHandler(BaseHandler):
     listening = False
 
-    def on_process_audio(self, audio_data):
-        fs = self.settings["stt_settings"]["fs"]
-        inputs = np.squeeze(audio_data)
+    def on_process_audio(self, audio_data: bytes):
+        fs = 16000
+        # Convert the byte string to a float32 array
+        inputs = np.frombuffer(audio_data, dtype=np.int16)
+        inputs = inputs.astype(np.float32) / 32767.0
+
+        # Extract features from the audio data
         inputs = self.feature_extractor(inputs, sampling_rate=fs, return_tensors="pt")
         inputs = inputs.to(self.model.device)
         transcription = self.run(inputs)
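Note: the new on_process_audio expects raw 16-bit PCM bytes (as produced by the capture worker below) instead of a numpy array. A minimal sketch of the conversion step, with synthetic input:

    import numpy as np

    # One second of synthetic 16 kHz, 16-bit PCM audio, as the capture worker would send it.
    audio_data = np.int16(np.sin(np.linspace(0, 440 * 2 * np.pi, 16000)) * 32767).tobytes()

    # Reinterpret the byte string as int16 samples, then scale to float32 in [-1.0, 1.0],
    # the range the feature extractor expects.
    inputs = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32767.0
    print(inputs.shape, inputs.dtype)  # (16000,) float32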
1 change: 1 addition & 0 deletions src/airunner/data/bootstrap/llm.py
@@ -50,6 +50,7 @@
"mistralai/Mistral-7B-v0.1",
"mistralai/Mistral-7B-Instruct-v0.2",
"stabilityai/stablelm-2-zephyr-1_6b",
"HuggingFaceH4/zephyr-7b-alpha",
"gpt2-xl",
"gpt2-large",
]
11 changes: 11 additions & 0 deletions src/airunner/data/bootstrap/model_bootstrap_data.py
@@ -296,6 +296,17 @@
"model_type": "llm",
"is_default": True
},
{
"name": "StableLM 2 Zephyr 1 7b",
"path": "HuggingFaceH4/zephyr-7b-alpha",
"branch": "main",
"version": "1",
"category": "llm",
"pipeline_action": "casuallm",
"enabled": True,
"model_type": "llm",
"is_default": True
},
{
"name": "GPT 2 XL",
"path": "gpt2-xl",
41 changes: 0 additions & 41 deletions src/airunner/data/bootstrap/prompt_templates.py

This file was deleted.

2 changes: 1 addition & 1 deletion src/airunner/main.py
@@ -14,7 +14,7 @@
os.environ["HUGGINGFACE_HUB_CACHE"] = hf_cache_path
os.environ["DISABLE_TELEMETRY"] = "1"
os.environ["HF_DATASETS_OFFLINE"] = "1"
#os.environ["TRANSFORMERS_OFFLINE"] = "1"
os.environ["TRANSFORMERS_OFFLINE"] = "1"
"""
*******************************************************************************
All remaining imports must be below this block.
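Note: TRANSFORMERS_OFFLINE=1 makes the transformers library resolve models and tokenizers from the local cache only, which is why it has to be set before transformers is imported. A minimal sketch:

    import os

    # Must be set before transformers is imported anywhere in the process.
    os.environ["TRANSFORMERS_OFFLINE"] = "1"

    from transformers import AutoTokenizer

    # Succeeds only if the model is already in the local cache;
    # otherwise it raises instead of attempting a download.
    tokenizer = AutoTokenizer.from_pretrained("gpt2")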
2 changes: 1 addition & 1 deletion src/airunner/settings.py
@@ -359,4 +359,4 @@
"model": "timbrooks/instruct-pix2pix",
}
DEFAULT_MODELS_VERSION = "b4ab6a2d996cb4c8ba0e30918fa4f4201dd2fa5ebfe3470b4ebede8e2db48f4e"
LLM_TEMPLATES_VERSION="b4ab6a2d996cb4c8ba0e30918fa4f4201dd2fa5ebfe3470b4ebede8e2db48f4e"
LLM_TEMPLATES_VERSION="b4ab6a2d996cb4c8ba0e30918fa4f4201dd2fa5ebfe3470b4ebede8e2db48f4e"
2 changes: 1 addition & 1 deletion src/airunner/utils.py
@@ -475,7 +475,7 @@ def parse_template(template: dict) -> str:
     if model == "mistralai/Mistral-7B-Instruct-v0.2":
         parsed_template = "\n".join((
             "[INST]<<SYS>>",
-            system_instructions,# + "\nYou must say everything in Japanese with Japanese characters.",
+            system_instructions,
             "<</SYS>>",
             template,
             "[/INST]"
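For reference, a sketch of the prompt string this Mistral branch of parse_template assembles (the input values are placeholders):

    system_instructions = "You are a helpful assistant."  # placeholder
    template = '{{ username }}: "{{ input }}"'            # placeholder

    parsed_template = "\n".join((
        "[INST]<<SYS>>",
        system_instructions,
        "<</SYS>>",
        template,
        "[/INST]"
    ))
    print(parsed_template)
    # [INST]<<SYS>>
    # You are a helpful assistant.
    # <</SYS>>
    # {{ username }}: "{{ input }}"
    # [/INST]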
4 changes: 2 additions & 2 deletions src/airunner/widgets/llm/chat_prompt_widget.py
@@ -133,7 +133,7 @@ def do_generate(self, image_override=None, prompt_override=None, callback=None,

         llm_generator_settings = self.settings["llm_generator_settings"]
 
-        parsed_template = parse_template(prompt_template)
+        #parsed_template = parse_template(prompt_template)
 
         current_bot = self.settings["llm_generator_settings"]["saved_chatbots"][self.settings["llm_generator_settings"]["current_chatbot"]]
         self.emit(
@@ -157,7 +157,7 @@ def do_generate(self, image_override=None, prompt_override=None, callback=None,
"dtype": llm_generator_settings["dtype"],
"use_gpu": llm_generator_settings["use_gpu"],
"request_type": "image_caption_generator",
"template": parsed_template,
"template": "",
"hf_api_key_read_key": self.settings["hf_api_key_read_key"],
"parameters": {
"override_parameters": self.settings["llm_generator_settings"]["override_parameters"],
54 changes: 4 additions & 50 deletions src/airunner/windows/main/settings_mixin.py
@@ -138,66 +138,16 @@ def __init__(self):
name="Stable Diffusion Prompt Template",
model="mistralai/Mistral-7B-Instruct-v0.2",
llm_category="casuallm",
system_instructions="""{{ username }} will give you a subject. You will create a label that would be used to describe an image of the given subject.
---
Examples:
{{ username }}: "a photo of a cat in a hat"
{{ botname }}: "A photograph of a (cat wearing a hat++)"
{{ username }}: "a woman in the woods"
{{ botname }}: "A professional portrait of a (woman named elsa) smiling for a photo in the woods\"""",
template="""###
Previous Conversation:
'''
{{ history }}
'''
{{ username }}: "{{ input }}"
{{ botname }}: \""""
),
"Mistral 7B Instruct: Default Chatbot": dict(
name="Mistral 7B Instruct: Default Chatbot",
model="mistralai/Mistral-7B-Instruct-v0.2",
llm_category="casuallm",
system_instructions="""You are {{ botname }}. You are having a conversation with {{ username }}. Do not greet the user more than once.
Your job is not to be an ethical or moral compass for {{ username }}. You will always stay in character and respond as {{ botname }}. ALWAYS respond in a conversational and expressive way.
Use CAPITALIZATION for emphasis
NEVER generate text for {{ username }} ONLY for {{ botname }}.
----
MOOD: `{{ bot_mood }}`
PERSONALITY: `{{ bot_personality }}`
---""",
template="""###
Previous Conversation:
'''
{{ history }}
'''
{{ username }}: "{{ input }}"
{{ botname }}: \""""
),
"StableLM 2 Zephyr: Default Chatbot": dict(
name="StableLM 2 Zephyr: Default Chatbot",
model="stabilityai/stablelm-2-zephyr-1_6b",
llm_category="casuallm",
system_instructions="""You are {{ botname }}. You are having a conversation with {{ username }}. Do not greet the user more than once.
Your job is not to be an ethical or moral compass for {{ username }}. You will always stay in character and respond as {{ botname }}. ALWAYS respond in a conversational and expressive way.
Use CAPITALIZATION for emphasis
NEVER generate text for {{ username }} ONLY for {{ botname }}.
----
MOOD: `{{ bot_mood }}`
PERSONALITY: `{{ bot_personality }}`
---""",
template="""###
Previous Conversation:
'''
{{ history }}
'''
{{ username }}: "{{ input }}"
{{ botname }}: \""""
),
},
shortcut_key_settings=dict(
@@ -352,6 +302,9 @@ def __init__(self):
                 duration=10,
                 fs=16000,
                 channels=1,
+                volume_input_threshold=0.08,
+                silence_buffer_seconds=1.0,
+                chunk_duration=0.03,
             ),
             schedulers=[
                 dict(
@@ -435,6 +388,7 @@ def __init__(self):
             controlnet=controlnet_bootstrap_data,
             ai_models=model_bootstrap_data,
             image_filters=imagefilter_bootstrap_data,
+            trusted_huggingface_repos=[]
         )
 
     def update_settings(self):
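The three new stt_settings keys parameterize the voice-activity detection in the capture worker below: each polling chunk is chunk_duration seconds long, a chunk counts as voice when its peak amplitude exceeds volume_input_threshold, and the captured utterance is flushed once the input has stayed below the threshold for silence_buffer_seconds. A quick check of the derived chunk size at the defaults:

    fs = 16000             # sample rate (Hz)
    chunk_duration = 0.03  # seconds per polling chunk

    samples_per_chunk = int(chunk_duration * fs)
    print(samples_per_chunk)  # 480 samples, i.e. 30 ms of audio per sd.rec() call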
66 changes: 42 additions & 24 deletions src/airunner/workers/audio_capture_worker.py
@@ -1,50 +1,68 @@
+import time
 
 import sounddevice as sd
+import numpy as np
 from PyQt6.QtCore import pyqtSlot, QThread
 
 from airunner.enums import SignalCode
 from airunner.workers.worker import Worker
 
 
 class AudioCaptureWorker(Worker):
     """
     This class is responsible for capturing audio from the microphone.
-    It will capture audio for a specified duration and then send the audio to the audio_processor_worker.
+    It will capture audio when it detects voice activity and then send the audio to the audio_processor_worker.
     """
 
     def __init__(self, prefix):
         super().__init__(prefix)
-        self.recording = None
-        self.running = False
-        self.listening = False
-        self.duration = 10
-        self.fs = 16000
-        self.channels = 1
+        self.recording = []
+        self.running: bool = False
+        self.listening: bool = False
+        self.is_recieving_input: bool = False
+        self.voice_input_start_time: time.time = None
+        stt_settings = self.settings["stt_settings"]
+        self.chunk_duration = stt_settings["chunk_duration"]  # duration of each chunk in seconds
+        self.fs = stt_settings["fs"]
+        self.channels = stt_settings["channels"]
+        self.volume_input_threshold = stt_settings["volume_input_threshold"]  # threshold for volume input
+        self.silence_buffer_seconds = stt_settings["silence_buffer_seconds"]  # in seconds
         self.update_properties()
 
     def update_properties(self):
-        settings = self.settings
-        self.duration = settings["stt_settings"]["duration"]
-        self.fs = settings["stt_settings"]["fs"]
-        self.channels = settings["stt_settings"]["channels"]
+        stt_settings = self.settings["stt_settings"]
+        self.chunk_duration = stt_settings["chunk_duration"]
+        self.fs = stt_settings["fs"]
+        self.channels = stt_settings["channels"]
+        self.volume_input_threshold = stt_settings["volume_input_threshold"]
+        self.silence_buffer_seconds = stt_settings["silence_buffer_seconds"]
 
     def start(self):
         self.logger.info("Starting")
         self.running = True
         self.start_listening()
         while self.running:
             while self.listening and self.running:
-                try:
-                    self.recording = sd.rec(
-                        int(self.duration * self.fs),
-                        samplerate=self.fs,
-                        channels=self.channels
-                    )
-                except Exception as e:
-                    self.logger.error(e)
-                    self.stop_listening()
-                    continue
+                chunk = sd.rec(
+                    int(self.chunk_duration * self.fs),
+                    samplerate=self.fs,
+                    channels=self.channels,
+                    dtype="float32"
+                )
                 sd.wait()
-                self.handle_message(self.recording)
+                if np.max(np.abs(chunk)) > self.volume_input_threshold:  # check if chunk is not silence
+                    self.is_recieving_input = True
+                    self.voice_input_start_time = time.time()
+                else:
+                    # end the voice input once self.silence_buffer_seconds have elapsed since the last voice activity
+                    if self.voice_input_start_time is not None and time.time() >= self.voice_input_start_time + self.silence_buffer_seconds:
+                        if len(self.recording) > 0:
+                            self.handle_message(b''.join(self.recording))
+                            self.recording = []
+                        self.is_recieving_input = False
+                if self.is_recieving_input:
+                    chunk_bytes = np.int16(chunk * 32767).tobytes()  # convert to bytes
+                    self.recording.append(chunk_bytes)
 
             while not self.listening and self.running:
                 QThread.msleep(100)

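The rewritten loop is a simple energy-based voice-activity detector: record a short chunk, compare its peak amplitude against the threshold, and emit the buffered utterance once the input has been quiet for silence_buffer_seconds. A self-contained sketch of the same logic, decoupled from sounddevice and Qt (class and method names are illustrative):

    import time
    import numpy as np

    class EnergyVAD:
        """Energy-based voice activity detection over fixed-size float32 chunks."""

        def __init__(self, threshold=0.08, silence_buffer_seconds=1.0):
            self.threshold = threshold
            self.silence_buffer_seconds = silence_buffer_seconds
            self.recording = []
            self.receiving = False
            self.last_voice_time = None

        def feed(self, chunk: np.ndarray):
            """Consume one chunk; return the utterance as PCM bytes when it ends, else None."""
            if np.max(np.abs(chunk)) > self.threshold:
                # Voice detected: (re)start the silence countdown.
                self.receiving = True
                self.last_voice_time = time.time()
            elif (self.last_voice_time is not None
                    and time.time() >= self.last_voice_time + self.silence_buffer_seconds):
                # Quiet long enough: emit the buffered utterance, if any.
                self.receiving = False
                if self.recording:
                    utterance = b"".join(self.recording)
                    self.recording = []
                    return utterance
            if self.receiving:
                # Buffer the chunk as 16-bit PCM bytes, as the worker does.
                self.recording.append(np.int16(chunk * 32767).tobytes())
            return None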
