diff --git a/.gitignore b/.gitignore
index 10b09df..289adb5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,6 +12,7 @@ uniteai.egg-info/
 test.md
 test.txt
 *.log
+debug_transcription.wav
 
 # VSCode
 .vscode/
diff --git a/Makefile b/Makefile
index 59be611..72ad802 100644
--- a/Makefile
+++ b/Makefile
@@ -9,7 +9,7 @@ watch-tests:
 		pytest --capture=no; \
 	done
 
-upload:
+publish_pypi:
 	rm -r dist
 	python -m build
 	python -m twine upload dist/*
diff --git a/clients/vscode/package.json b/clients/vscode/package.json
index d5cecad..498c32c 100644
--- a/clients/vscode/package.json
+++ b/clients/vscode/package.json
@@ -3,7 +3,7 @@
   "description": "Use AI in your Editor.",
   "author": "uniteai",
   "license": "Apache-2.0",
-  "version": "0.1.11",
+  "version": "0.1.12",
   "icon": "icon.jpeg",
   "repository": {
     "type": "git",
diff --git a/clients/vscode/uniteai-0.1.11.vsix b/clients/vscode/uniteai-0.1.11.vsix
deleted file mode 100644
index 1bc1118..0000000
Binary files a/clients/vscode/uniteai-0.1.11.vsix and /dev/null differ
diff --git a/clients/vscode/uniteai-0.1.12.vsix b/clients/vscode/uniteai-0.1.12.vsix
new file mode 100644
index 0000000..e9f7591
Binary files /dev/null and b/clients/vscode/uniteai-0.1.12.vsix differ
diff --git a/pyproject.toml b/pyproject.toml
index 0ada283..260eda8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "uniteai"
-version = "0.1.9"
+version = "0.1.10"
 description = "AI, Inside your Editor."
 readme = "README.md"
 license = "Apache-2.0"
diff --git a/todo/021_efficient_realtime_transcription.md b/todo/021_efficient_realtime_transcription.md
new file mode 100644
index 0000000..819d8af
--- /dev/null
+++ b/todo/021_efficient_realtime_transcription.md
@@ -0,0 +1,12 @@
+# 021: Efficient Realtime Transcription
+
+As of recent commits, during a transcription window, the entire audio is saved in memory, and the whole thing is repeatedly transcribed. Inefficient.
+
+
+## Options:
+
+* Freeze transcription of earlier portions, and only re-recognize the latest portions. Perhaps a sliding window would work, but then each window must overlap with previous windows so that, e.g., words aren't cut in half, and some effort will be needed to properly align the transcribed text with the audio. This seems like a huge ergonomic improvement, but perhaps technically tough.
+
+* Check the RMS energy level of audio chunks to find the start/stop of phrases, and cut out silence.
+
+* Cut out noise? Or perhaps `whisper` was trained on enough noisy data that it already deals well with it, in which case denoising would just add a significant inefficiency.
diff --git a/todo/CANCELLED 009_add_emacs_marker_for_transcription.md b/todo/CANCELLED 009_add_emacs_marker_for_transcription.md
deleted file mode 100644
index 84b7c09..0000000
--- a/todo/CANCELLED 009_add_emacs_marker_for_transcription.md
+++ /dev/null
@@ -1,71 +0,0 @@
-# 009: Add Emacs Marker for Transcription :CANCELLED:
-
-- NOTES: This is cancelled because of how `002_newline_management` was solved, namely, giving the LSP a tagged block that it gets to control.
-
-The emacs marker can keep track of what point should be written to.
-
-## An example of dealing with a marker
-
-```elisp
-;;;;;;;;;;
-;; Marker Test
-
-(defvar-local my-global-marker nil)
-
-(defun my-initialize-marker ()
-  "Set the global marker to the beginning of the buffer."
-  (setq my-global-marker (point-min-marker)))
-
-(add-hook 'find-file-hook 'my-initialize-marker)
-
-(defvar-local my-marker-overlay nil
-  "Overlay for the marker set by `marker-set-command'.")
-
-(defun get-marker-column (marker)
-  "Get column number of a marker"
-  (save-excursion
-    (goto-char marker)
-    (current-column)))
-
-(defun marker-update-command()
-  (interactive)
-  ;; Report marker
-  (let* ((doc (eglot--TextDocumentIdentifier))
-         (line (line-number-at-pos my-global-marker))
-         (character (get-marker-column my-global-marker))
-         (params `(:emacsMarker (:line ,line :character ,character))))
-    (eglot-execute-command (eglot--current-server-or-lose) 'command.markerSet (vector doc params)))
-
-  ;; Remove the old overlay, if any
-  (when (overlayp my-marker-overlay)
-    (delete-overlay my-marker-overlay))
-
-  ;; Create a new overlay at the marker's position
-  (let ((marker-pos (marker-position my-global-marker)))
-    (setq my-marker-overlay (make-overlay marker-pos (1+ marker-pos)))
-    (overlay-put my-marker-overlay 'face 'highlight))
-  )
-
-(defun marker-set-command ()
-  "Send an Emacs marker to the LSP server."
-  (interactive)
-  (setq my-global-marker (point-marker))
-  (marker-update-command))
-
-(defun marker-get-command ()
-  "Get the Emacs marker from the LSP server."
-  (interactive)
-  (let* ((doc (eglot--TextDocumentIdentifier))
-         (marker my-global-marker)
-         (line (line-number-at-pos marker))
-         (character (current-column))
-         (params `(:emacsMarker (:line ,line :character ,character))))
-    (eglot-execute-command (eglot--current-server-or-lose) 'command.markerGet (vector doc params))))
-
-(defun my-after-change-function (begin end length)
-  "Call `marker-set-command' if the current buffer is managed by Eglot."
-  (when (bound-and-true-p eglot--managed-mode)
-    (marker-update-command)))
-
-(add-hook 'after-change-functions #'my-after-change-function)
-```
diff --git a/todo/CANCELLED 010_add_emacs_marker_for_llm.md b/todo/CANCELLED 010_add_emacs_marker_for_llm.md
deleted file mode 100644
index 925d1fc..0000000
--- a/todo/CANCELLED 010_add_emacs_marker_for_llm.md
+++ /dev/null
@@ -1,7 +0,0 @@
-# 010: Add Emacs Marker for LLM :CANCELLED:
-
-
-NOTES: This is cancelled because of how `002_newline_management` was solved, namely, giving the LSP a tagged block that it gets to control.
-
-
-The emacs marker can keep track of what point should be written to.
diff --git a/todo/019_realtime_transcription.md b/todo/DONE_019_realtime_transcription.md
similarity index 69%
rename from todo/019_realtime_transcription.md
rename to todo/DONE_019_realtime_transcription.md
index c3fd3bc..750b292 100644
--- a/todo/019_realtime_transcription.md
+++ b/todo/DONE_019_realtime_transcription.md
@@ -3,3 +3,10 @@
 * Is there a library?
 
 * If not, what if we fired off multiple threads to listen at different time-scales, and combine the results? For instance, a short timeout could catch every 1 second of audio and optimistically transcribe it; then, when the long-timescale listening thread returns, its transcription will likely yield a better result, so we can override previous misses. These audio chunks can be thrown in the same queue, tagged, and we can drain short-timescale chunks off the queue if there's a more recent long-timescale chunk.
+
+
+RESULT:
+
+I've opted for recording the entire audio stream, and not doing any processing before `recognize`.
+
+There are definitely still efficiency gains to be had, so I'll make a new ticket, but this works well enough for short transcription runs for now.
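Ticket 021's second bullet (and the TODO left in `transcription_` below) float an RMS-based silence gate as the follow-up optimization. A minimal sketch of that idea, not part of this commit: it uses the stdlib `audioop.rms` (deprecated since Python 3.11), and the threshold value is a made-up placeholder that would need tuning alongside `energy_threshold`.

```python
# Sketch of the RMS silence gate proposed in ticket 021 (not in this diff).
# `audioop` is stdlib but deprecated since Python 3.11; SILENCE_RMS is a
# hypothetical placeholder threshold, to be tuned per microphone.
import audioop

SILENCE_RMS = 500


def is_speech(buffer: bytes, sample_width: int) -> bool:
    '''True if this chunk's RMS energy suggests speech rather than silence.'''
    return audioop.rms(buffer, sample_width) >= SILENCE_RMS


# Inside `transcription_`, silent chunks could then be dropped before joining:
#   audios = [b for b in audios if is_speech(b, self.sample_width)]
```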
diff --git a/todo/020_fix_vscode_client.md b/todo/DONE_020_fix_vscode_client.md
similarity index 100%
rename from todo/020_fix_vscode_client.md
rename to todo/DONE_020_fix_vscode_client.md
diff --git a/uniteai/common.py b/uniteai/common.py
index 7e0f799..a76e579 100644
--- a/uniteai/common.py
+++ b/uniteai/common.py
@@ -43,27 +43,6 @@ def mk_logger(name, level):
     return logger
 
 
-
-##################################################
-
-class ThreadSafeCounter:
-    '''
-    A threadsafe incrementable integer.
-    '''
-
-    def __init__(self):
-        self.value = 0
-        self._lock = Lock()
-
-    def increment(self):
-        with self._lock:
-            self.value += 1
-            return self.value
-
-    def get(self):
-        return self.value
-
-
 ##################################################
 # Dict helpers
 
diff --git a/uniteai/transcription.py b/uniteai/transcription.py
index 97e8156..dbc24a2 100644
--- a/uniteai/transcription.py
+++ b/uniteai/transcription.py
@@ -6,54 +6,37 @@
 '''
 
 from thespian.actors import Actor
-from typing import List
-import pygls
 from pygls.server import LanguageServer
 from lsprotocol.types import (
-    ApplyWorkspaceEditParams,
     CodeAction,
     CodeActionKind,
     CodeActionParams,
     Command,
     Position,
-    Range,
     TextDocumentIdentifier,
-    VersionedTextDocumentIdentifier,
-    TextEdit,
-    WorkspaceEdit,
-    DidChangeTextDocumentParams,
 )
-import sys
 import logging
-from pygls.protocol import default_converter
-import requests
-import json
-from concurrent.futures import ThreadPoolExecutor
-import openai
-import yaml
-from threading import Thread, Lock, Event
+from threading import Thread, Event
 from queue import Queue, Empty
 import speech_recognition as sr
 import re
 import numpy as np
 import time
-from dataclasses import dataclass
-from typing import List, Tuple
-import re
-import itertools
 import argparse
+import threading
+from functools import partial
 
-from uniteai.common import ThreadSafeCounter, mk_logger, find_block, get_nested
+from uniteai.common import mk_logger, find_block, get_nested
 from uniteai.edit import BlockJob, cleanup_block, init_block
 
-
 START_TAG = ':START_TRANSCRIPTION:'
 END_TAG = ':END_TRANSCRIPTION:'
 NAME = 'transcription'
 
 # A custom logger for just this feature. You can tune the log level to turn
 # on/off just this feature's logs.
-log = mk_logger(NAME, logging.DEBUG)
+log_level = logging.DEBUG
+log = mk_logger(NAME, log_level)
 
 
 ##################################################
@@ -67,22 +50,17 @@ def __init__(self,
                  model_path,
                  model_size,
                  volume_threshold):
-        self.model_type = model_type
         self.model_path = model_path
         self.model_size = model_size
 
         # Recognizer
         self.r = sr.Recognizer()
+        self.mic = sr.Microphone()
+        self.sample_rate = None
+        self.sample_width = None
         self.r.energy_threshold = volume_threshold
         self.r.dynamic_energy_threshold = False
-        self.audio_queue = Queue()
-
-        # Keep track of the iteration when a thread was started. That way, if
-        # it had a blocking operation (like `r.listen`) that should have been
-        # terminated, but couldn't because the thread was blocked, well, now we
-        # can deprecate that thread.
-        self.transcription_counter = ThreadSafeCounter()
 
     def recognize(self, audio):
         log.debug(f'MODEL_TYPE: {self.model_type}')
@@ -106,106 +84,156 @@ def recognize(self, audio):
 
     def _warmup(self):
         '''
         Warm up, intended for a separate thread.
        '''
-        empty_audio = sr.AudioData(np.zeros(10), sample_rate=1, sample_width=1)
-        self.recognize(empty_audio)
-        logging.info('Warmed up transcription model')
-        # TODO: Transcription needs to be tuned better to deal with ambient
-        #       noise, and appropriate volume levels
-        #
-        logging.info('Adjusting thresholds for ambient noise')
-        with sr.Microphone() as source:
-            self.r.adjust_for_ambient_noise(source)
+        # Get some mic params
+        with self.mic as source:
+            self.sample_rate = source.SAMPLE_RATE
+            self.sample_width = source.SAMPLE_WIDTH
+        # Get model into memory
+        empty_audio = sr.AudioData(np.zeros(10), sample_rate=1, sample_width=1)
+        self.recognize(empty_audio)
+        log.info(f'Warmed up. sample_rate={self.sample_rate}, sample_width={self.sample_width}')
 
     def warmup(self):
         '''Load whisper model into memory.'''
-        logging.info('Warming up whisper in separate thread')
+        log.info('Warming up transcription model in separate thread')
        warmup_thread = Thread(target=self._warmup)
        warmup_thread.daemon = True
        warmup_thread.start()
 
-    def listen(self, should_stop):
-        def callback(r, audio):
-            log.debug('LISTENING CALLBACK called')
-            self.audio_queue.put(audio, block=False)
-        stop_listening_fn = self.r.listen_in_background(
-            sr.Microphone(),
-            callback
-        )
-        return stop_listening_fn
-
-    def transcription_worker(self, uri, edits, should_stop,
-                             transcription_worker_is_running):
-        transcription_worker_is_running.set()
-        running_transcription = ""
+    def listen_(self,
+                queue: Queue,
+                should_stop: Event):
+        with sr.Microphone() as s:
+            while not should_stop.is_set():
+                buf = s.stream.read(s.CHUNK)
+                queue.put(buf)
+
+    def transcription_(self,
+                       audio_queue,
+                       transcription_callback,
+                       finished_callback,
+                       should_stop):
+        audios = []
         while not should_stop.is_set():
             try:
                 # non-blocking, to more frequently allow the
                 # `stop_transcription` signal to end this thread.
-                audio = self.audio_queue.get(False)
+                buffer = audio_queue.get(False)
+
+                # TODO: can we more intelligently separate silence from speech?
+                # energy = audioop.rms(buffer, self.sample_width)
+                audios.append(buffer)
+                try:
+                    while True:
+                        buffer = audio_queue.get(False)
+                        audios.append(buffer)
+                except Empty:
+                    pass
+
+                log.debug(f'len audios: {len(audios)}')
+
             except Empty:
                 time.sleep(0.2)
                 continue
             try:
+                audio = sr.audio.AudioData(
+                    b''.join(audios),
+                    self.sample_rate,
+                    self.sample_width
+                )
+                # Debug audio: the audio gets sliced into regular chunks; how
+                # does it sound when stitched back together?
+                if log_level == logging.DEBUG:
+                    with open("debug_transcription.wav", "wb") as output_file:
+                        output_file.write(audio.get_wav_data())
+
+                # break out if needed
+                if should_stop.is_set():
+                    break
+
+                # Speech-to-text
                 x = self.recognize(audio)
+
+                # Nothing recognized
                 if not x:
                     continue
                 x = x.strip()
-                log.debug(f'TRANSCRIPTION: {x}')
                 if filter_out(x):
                     continue
-                # Add space to respect next loop of transcription
-                running_transcription += x + ' '
-                job = BlockJob(
-                    uri=uri,
-                    start_tag=START_TAG,
-                    end_tag=END_TAG,
-                    text=f'\n{running_transcription}\n',
-                    strict=False,
-                )
-                edits.add_job(NAME, job)
+
+                # break out if needed
+                if should_stop.is_set():
+                    break
+
+                transcription_callback(x)
+
             except sr.UnknownValueError:
                 log.debug("ERROR: could not understand audio")
-            self.audio_queue.task_done()
+            audio_queue.task_done()
 
-        cleanup_block(NAME, [START_TAG, END_TAG], uri, edits)
-        transcription_worker_is_running.clear()
+        finished_callback()
         log.debug('DONE TRANSCRIBING')
 
+    def go(self,
+           transcription_callback,
+           finished_callback):
+        audio_queue = Queue()
+        should_stop = Event()
+
+        # Listener Thread
+        l_thread = threading.Thread(
+            target=self.listen_,
+            args=(audio_queue, should_stop))
+        l_thread.daemon = True
+        l_thread.start()
+
+        # Transcription Thread
+        t_thread = threading.Thread(
+            target=self.transcription_,
+            args=(audio_queue,
+                  transcription_callback,
+                  finished_callback,
+                  should_stop))
+        t_thread.daemon = True
+        t_thread.start()
+
+        def stop_fn():
+            log.debug('stop_fn called')
+            should_stop.set()
+            l_thread.join()
+            t_thread.join()
+
+        return stop_fn
+
 
 ##################################################
 # Actor
 
 class TranscriptionActor(Actor):
     def __init__(self):
-        self.transcription_worker_is_running = Event()
-        self.should_stop = Event()
+        self.is_running = Event()
         self.tags = [START_TAG, END_TAG]
         self.speech_recognition = None  # set during initialization
-        self.executor = ThreadPoolExecutor(max_workers=5)
-        self.transcription_thread_future = None  # set during set_config/start
         self.model_path = None
         self.model_size = None
         self.volume_threshold = None
-        self.stop_listening_fn = lambda x,y: None
+        self.stop_fn = lambda: None
 
     def receiveMessage(self, msg, sender):
         command = msg.get('command')
         edits = msg.get('edits')
-        tw_set = self.transcription_worker_is_running.is_set()
+        tw_set = self.is_running.is_set()
         log.debug(f'''
 %%%%%%%%%%
 ACTOR RECV: {msg["command"]}
 
 ACTOR STATE:
 transcription_worker is running={tw_set}
-should_stop: {self.should_stop.is_set()}
-transcription_thread_future: {self.transcription_thread_future}
 
 EDITS STATE:
 job_thread alive: {edits.job_thread.is_alive() if edits and edits.job_thread else "NOT STARTED"}
@@ -264,43 +292,40 @@ def receiveMessage(self, msg, sender):
             # load the model into GPU
             self.speech_recognition.warmup()
 
-    def start(self, uri, cursor_pos, edits):
-        tw_set = self.transcription_worker_is_running.is_set()
-        if tw_set:
-            log.info(f'WARN: ON_START_BUT_RUNNING. '
-                     f'transcription_worker is running={tw_set}')
-            return
-        log.debug('ACTOR START')
-        self.should_stop.clear()
-
-        # Audio Listener
-        self.stop_listening_fn = self.speech_recognition.listen(self.should_stop)
-
-        # Transcriber
-        self.transcription_thread_future = self.executor.submit(
-            self.speech_recognition.transcription_worker,
-            uri, edits, self.should_stop, self.transcription_worker_is_running)
+    def transcription_callback(self, edits, uri, text):
+        # Overwrite the tagged block with the latest full transcription
+        log.debug(f'TRANSCRIBED: {text}')
+        job = BlockJob(
+            uri=uri,
+            start_tag=START_TAG,
+            end_tag=END_TAG,
+            text=f'\n{text}\n',
+            strict=False,
+        )
+        edits.add_job(NAME, job)
+
+    def finished_callback(self, edits, uri):
+        log.debug(f'FINISHED CALLBACK: {uri}')
+        cleanup_block(NAME, [START_TAG, END_TAG], uri, edits)
+
+    def start(self, uri, cursor_pos, edits):
+        if self.is_running.is_set():
+            log.info('WARN: ON_START_BUT_RUNNING.')
+            return False
+        self.stop_fn = self.speech_recognition.go(
+            partial(self.transcription_callback, edits, uri),
+            partial(self.finished_callback, edits, uri))
+        self.is_running.set()
         log.debug('START CAN RETURN')
 
     def stop(self):
         log.debug('ACTOR STOP')
-        tw_set = self.transcription_worker_is_running.is_set()
-        if not tw_set:
-            log.info('WARN: ON_STOP_BUT_STOPPED'
-                     f'transcription_worker is running={tw_set}')
+        if not self.is_running.is_set():
+            log.info('WARN: ON_STOP_BUT_STOPPED')
             return False
-
-        self.should_stop.set()
-        self.stop_listening_fn(wait_for_stop=False)
-
-        if self.transcription_thread_future:
-            log.debug('Waiting for audio `transcription_thread_future` to terminate')
-            self.transcription_thread_future.result()  # block, wait to finish
-            self.transcription_thread_future = None  # reset
-
-        self.should_stop.clear()
-        self.stop_listening_fn = lambda x,y: None
+        self.stop_fn()
+        self.is_running.clear()
+        self.stop_fn = lambda: None
         log.debug('FINALLY STOPPED')
@@ -327,8 +352,6 @@ def filter_alphanum(x: str) -> str:
 
 
 def filter_out(x: str) -> bool:
     x = filter_alphanum(x)
-    # if len(x) < 4:  # weed out short utterances
-    #     return True
     return x.strip().lower() in filter_list
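For reference, a minimal usage sketch of the `go`/`stop_fn` API this diff introduces. The recognizer's class name and constructor values aren't visible in these hunks, so `Transcription`, the model arguments, and the sleep duration are placeholders; the callbacks here just print instead of editing a buffer the way `TranscriptionActor` does.

```python
# Usage sketch only: class name and constructor args are assumptions, since
# this diff shows only __init__(model_path, model_size, volume_threshold).
import time
from uniteai.transcription import Transcription  # hypothetical import

t = Transcription(model_path='/path/to/model',  # placeholder
                  model_size='small',           # placeholder
                  volume_threshold=300)         # placeholder
t.warmup()  # loads the model and reads mic params in a daemon thread

# go() spawns the listener and transcription threads and returns a closure
# that sets should_stop and joins both threads.
stop_fn = t.go(
    transcription_callback=lambda text: print('so far:', text),
    finished_callback=lambda: print('done'),
)
time.sleep(10)  # speak into the mic for a bit
stop_fn()
```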