Skip to content

Commit 7f52c9e

Browse files
committed
sync/tts_cache
port and improve the new TTS cache from mycroft-core
1 parent ea841db commit 7f52c9e

File tree

5 files changed

+477
-92
lines changed

5 files changed

+477
-92
lines changed

ovos_plugin_manager/templates/tts.py

Lines changed: 162 additions & 88 deletions
Original file line numberDiff line numberDiff line change
@@ -21,45 +21,35 @@
2121
# would hang here
2222
engine.playback.stop()
2323
"""
24-
import hashlib
25-
import os.path
24+
import inspect
2625
import random
2726
import re
28-
from os.path import isfile, join
27+
import subprocess
28+
from os.path import isfile, join, splitext
29+
from pathlib import Path
2930
from queue import Queue, Empty
3031
from threading import Thread
3132
from time import time, sleep
32-
import subprocess
33-
import os
34-
from inspect import signature
3533

34+
import requests
35+
from phoneme_guesser.exceptions import FailedToGuessPhonemes
36+
37+
from ovos_plugin_manager.utils.tts_cache import TextToSpeechCache, hash_sentence
3638
from ovos_utils import resolve_resource_file
39+
from ovos_utils.configuration import read_mycroft_config
3740
from ovos_utils.enclosure.api import EnclosureAPI
41+
from ovos_utils.file_utils import get_cache_directory
3842
from ovos_utils.lang.phonemes import get_phonemes
39-
from phoneme_guesser.exceptions import FailedToGuessPhonemes
4043
from ovos_utils.lang.visimes import VISIMES
4144
from ovos_utils.log import LOG
4245
from ovos_utils.messagebus import Message, FakeBus as BUS
46+
from ovos_utils.metrics import Stopwatch
4347
from ovos_utils.signal import check_for_signal, create_signal
4448
from ovos_utils.sound import play_mp3, play_wav
45-
from ovos_utils.metrics import Stopwatch
46-
from ovos_utils.configuration import read_mycroft_config
4749

4850
EMPTY_PLAYBACK_QUEUE_TUPLE = (None, None, None, None, None)
4951

5052

51-
def get_cache_directory(folder):
52-
if os.name == 'nt':
53-
import tempfile
54-
return tempfile.mkdtemp(folder)
55-
else:
56-
from memory_tempfile import MemoryTempfile
57-
tempfile = MemoryTempfile(fallback=True)
58-
path = os.path.join(tempfile.gettempdir(), folder)
59-
if not os.path.exists(path):
60-
os.makedirs(path)
61-
return path
62-
6353
class PlaybackThread(Thread):
6454
"""Thread class for playing back tts audio and sending
6555
viseme data to enclosure.
@@ -206,7 +196,6 @@ class TTS:
206196
def __init__(self, lang="en-us", config=None, validator=None,
207197
audio_ext='wav', phonetic_spelling=True, ssml_tags=None):
208198
self.log_timestamps = False
209-
super(TTS, self).__init__()
210199
if not config:
211200
try:
212201
config_core = read_mycroft_config() or {}
@@ -217,29 +206,33 @@ def __init__(self, lang="en-us", config=None, validator=None,
217206

218207
self.stopwatch = Stopwatch()
219208
self.tts_name = self.__class__.__name__
220-
self.bus = BUS()
209+
self.bus = BUS() # initialized in "init" step
221210
self.lang = lang or config.get("lang") or 'en-us'
222211
self.config = config or {}
223212
self.validator = validator or TTSValidator(self)
224213
self.phonetic_spelling = phonetic_spelling
225214
self.audio_ext = audio_ext
226215
self.ssml_tags = ssml_tags or []
216+
self.log_timestamps = self.config.get("log_timestamps", False)
227217

228218
self.voice = self.config.get("voice")
229-
self.cache_dir = get_cache_directory(self.tts_name)
230-
self.filename = join(self.cache_dir, 'tts.' + self.audio_ext)
219+
# TODO can self.filename be deprecated ? is it used anywhere at all?
220+
cache_dir = get_cache_directory(self.tts_name)
221+
self.filename = join(cache_dir, 'tts.' + self.audio_ext)
231222
self.enclosure = None
232223
random.seed()
233224
self.queue = Queue()
234225
self.playback = PlaybackThread(self.queue)
235-
# NOTE playback start call has been omitted and moved to init method
236-
# init is called by mycroft, but non mycroft usage wont call it,
237-
# meaning outside mycroft the enclosure is not set, bus is dummy and
238-
# playback thread is not used, playback queue is not wanted
239-
# if some module is calling get_tts (which is the correct usage)
240-
self.clear_cache()
226+
# NOTE: self.playback.start() was moved to init method
227+
# playback queue is not wanted if we only care about get_tts
228+
# init is called by mycroft, but non-mycroft usage won't call it,
229+
# outside mycroft the enclosure is not set, bus is dummy and
230+
# playback thread is not used
241231
self.spellings = self.load_spellings()
242-
self.log_timestamps = self.config.get("log_timestamps", False)
232+
self.cache = TextToSpeechCache(
233+
self.config, self.tts_name, self.audio_ext
234+
)
235+
self.clear_cache()
243236
self.handle_metric({"metric_type": "tts.init"})
244237

245238
def handle_metric(self, metadata=None):
@@ -294,6 +287,7 @@ def end_audio(self, listen=False):
294287
if listen:
295288
self.bus.emit(Message('mycroft.mic.listen'))
296289

290+
self.cache.curate()
297291
# This check will clear the "signal"
298292
check_for_signal("isSpeaking")
299293
self.stopwatch.stop()
@@ -412,58 +406,114 @@ def execute(self, sentence, ident=None, listen=False, **kwargs):
412406
# Re-raise to allow the Exception to be handled externally as well.
413407
raise
414408

415-
def _execute(self, sentence, ident, listen, **kwargs):
409+
def _replace_phonetic_spellings(self, sentence):
416410
if self.phonetic_spelling:
417411
for word in re.findall(r"[\w']+", sentence):
418412
if word.lower() in self.spellings:
419-
sentence = sentence.replace(word,
420-
self.spellings[word.lower()])
413+
spelled = self.spellings[word.lower()]
414+
sentence = sentence.replace(word, spelled)
415+
return sentence
421416

417+
def _execute(self, sentence, ident, listen, **kwargs):
    """Turn a sentence into audio chunk by chunk and queue it for playback.

    Arguments:
        sentence (str): text to be spoken
        ident: identifier placed unchanged in every queued entry
        listen (bool): listen flag, only attached to the last chunk
        **kwargs: extra options forwarded to the synth step
    """
    sentence = self._replace_phonetic_spellings(sentence)
    pieces = self._preprocess_sentence(sentence)
    # Only the final chunk carries the listen flag, the others get False
    last_idx = len(pieces) - 1
    chunks = [(piece, listen if idx == last_idx else False)
              for idx, piece in enumerate(pieces)]
    self.handle_metric({"metric_type": "tts.preprocessed",
                        "n_chunks": len(chunks)})

    # synth -> queue for playback
    for chunk, listen_flag in chunks:
        chunk_hash = hash_sentence(chunk)
        if chunk_hash in self.cache:
            # already synthesized before, reuse the cached audio
            audio_file, phonemes = self._get_from_cache(chunk, chunk_hash)
        else:
            # unseen chunk, synthesize it (this also caches the result)
            audio_file, phonemes = self._synth(chunk, chunk_hash, **kwargs)

        viseme = None
        if phonemes:
            viseme = self.viseme(phonemes)
        audio_ext = self._determine_ext(audio_file)
        entry = (audio_ext, str(audio_file), viseme, ident, listen_flag)
        self.queue.put(entry)
        self.handle_metric({"metric_type": "tts.queued"})
466440

441+
def _determine_ext(self, audio_file):
442+
# determine audio_ext on the fly
443+
# do not use the ext defined in the plugin since it might not match
444+
# some plugins support multiple extensions
445+
# or have caches in different extensions
446+
try:
447+
_, audio_ext = splitext(str(audio_file))
448+
return audio_ext
449+
except:
450+
return self.audio_ext
451+
452+
def _synth(self, sentence, sentence_hash=None, **kwargs):
453+
self.handle_metric({"metric_type": "tts.synth.start"})
454+
sentence_hash = sentence_hash or hash_sentence(sentence)
455+
audio = self.cache.define_audio_file(sentence_hash)
456+
457+
# parse requested language for this TTS request
458+
# NOTE: this is ovos only functionality, not in mycroft-core!
459+
lang = kwargs.get("lang")
460+
if not lang and kwargs.get("message"):
461+
# get lang from message object if possible
462+
try:
463+
lang = kwargs["message"].data.get("lang") or \
464+
kwargs["message"].context.get("lang")
465+
except: # not a mycroft message object
466+
pass
467+
kwargs["lang"] = lang or self.lang
468+
469+
# filter kwargs per plugin, different plugins expose different options
470+
# mycroft-core -> no kwargs
471+
# ovos -> lang
472+
# neon-core -> message
473+
kwargs = {k: v for k, v in kwargs.items()
474+
if k in inspect.signature(self.get_tts).parameters
475+
and k not in ["sentence", "wav_file"]}
476+
477+
# finally do the TTS synth
478+
audio.path, phonemes = self.get_tts(sentence, str(audio), **kwargs)
479+
self.handle_metric({"metric_type": "tts.synth.finished"})
480+
# cache sentence + phonemes
481+
self._cache_sentence(sentence, audio, phonemes, sentence_hash)
482+
return audio, phonemes
483+
484+
def _cache_phonemes(self, sentence, phonemes=None, sentence_hash=None):
485+
sentence_hash = sentence_hash or hash_sentence(sentence)
486+
if not phonemes:
487+
try: # TODO debug why get_phonemes fails in the first place
488+
phonemes = get_phonemes(sentence)
489+
self.handle_metric({"metric_type": "tts.phonemes.guess"})
490+
except (ImportError, FailedToGuessPhonemes):
491+
pass
492+
if phonemes:
493+
return self.save_phonemes(sentence_hash, phonemes)
494+
return None
495+
496+
def _cache_sentence(self, sentence, audio_file, phonemes=None, sentence_hash=None):
497+
sentence_hash = sentence_hash or hash_sentence(sentence)
498+
# RANT: why do you hate strings ChrisV?
499+
if isinstance(audio_file.path, str):
500+
audio_file.path = Path(audio_file.path)
501+
pho_file = self._cache_phonemes(sentence, phonemes, sentence_hash)
502+
self.cache.cached_sentences[sentence_hash] = (audio_file, pho_file)
503+
self.handle_metric({"metric_type": "tts.synth.cached"})
504+
505+
def _get_from_cache(self, sentence, sentence_hash=None):
    """Fetch the cached audio file and phonemes of a sentence.

    When the cache entry holds no phoneme file, phonemes are guessed from
    the sentence, saved, and the cache entry is updated so the guess is
    not repeated on the next hit.

    Arguments:
        sentence (str): sentence to look up
        sentence_hash (str): cache key, computed from the sentence if unset

    Returns:
        tuple: (audio_file, phonemes); phonemes is None when unavailable

    Raises:
        KeyError: propagated from the lookup when the hash is not a key
                  of ``self.cache.cached_sentences`` (a plain-dict
                  assumption, confirm against TextToSpeechCache)
    """
    sentence_hash = sentence_hash or hash_sentence(sentence)
    phonemes = None
    audio_file, pho_file = self.cache.cached_sentences[sentence_hash]
    LOG.info(f"Found {audio_file.name} in TTS cache")
    if not pho_file:
        # guess phonemes from sentence + cache them
        # the hash must go by keyword: the second positional parameter of
        # _cache_phonemes is ``phonemes``, not the hash
        pho_file = self._cache_phonemes(sentence,
                                        sentence_hash=sentence_hash)
        if pho_file:
            # remember the new phoneme file alongside the audio
            self.cache.cached_sentences[sentence_hash] = (audio_file,
                                                          pho_file)
    if pho_file:
        phonemes = pho_file.load()
    return audio_file, phonemes
516+
467517
def viseme(self, phonemes):
468518
"""Create visemes from phonemes.
469519
@@ -492,7 +542,7 @@ def viseme(self, phonemes):
492542

493543
def clear_cache(self):
    """Empty the TTS cache by delegating to ``self.cache.clear()``."""
    tts_cache = self.cache
    tts_cache.clear()
496546

497547
def save_phonemes(self, key, phonemes):
498548
"""Cache phonemes
@@ -501,29 +551,18 @@ def save_phonemes(self, key, phonemes):
501551
key (str): Hash key for the sentence
502552
phonemes (str): phoneme string to save
503553
"""
504-
pho_file = os.path.join(self.cache_dir, key + ".pho")
505-
try:
506-
with open(pho_file, "w") as cachefile:
507-
cachefile.write(phonemes)
508-
except Exception:
509-
LOG.exception("Failed to write {} to cache".format(pho_file))
510-
pass
554+
phoneme_file = self.cache.define_phoneme_file(key)
555+
phoneme_file.save(phonemes)
556+
return phoneme_file
511557

512558
def load_phonemes(self, key):
    """Read the phonemes stored in the cache under a key.

    Arguments:
        key (str): Key identifying phoneme cache

    Returns:
        the result of ``load()`` on the phoneme file the cache defines
        for this key
    """
    return self.cache.define_phoneme_file(key).load()
527566

528567
def stop(self):
529568
try:
@@ -628,3 +667,38 @@ def get_tts(self, sentence, wav_file, lang=None):
628667
files, phonemes = self.sentence_to_files(sentence)
629668
wav_file = self.concat(files, wav_file)
630669
return wav_file, phonemes
670+
671+
672+
class RemoteTTSException(Exception):
    """Base error type for remote TTS engines."""
674+
675+
676+
class RemoteTTSTimeoutException(RemoteTTSException):
    """Remote TTS error type for timeouts."""
678+
679+
680+
class RemoteTTS(TTS):
    """
    Abstract class for a Remote TTS engine implementation.
    This class is only provided for backwards compatibility
    Usage is discouraged
    """

    def __init__(self, lang, config, url, api_path, validator):
        """Set up the remote engine.

        Arguments:
            lang (str): language passed on to the TTS base class
            config (dict): engine configuration; its "url" entry, when
                           present, takes precedence over ``url``
            url (str): default base url of the remote service
            api_path (str): path appended to the base url on every request
            validator: validator passed on to the TTS base class
        """
        super(RemoteTTS, self).__init__(lang, config, validator)
        self.api_path = api_path
        self.auth = None
        # tolerate config=None, the base class accepts it as well
        self.url = (config or {}).get('url', url).rstrip('/')

    def build_request_params(self, sentence):
        """Build the query parameters of the synth request.

        Meant to be overridden; this default returns None, so the request
        is sent without query parameters.
        """
        return None

    def get_tts(self, sentence, wav_file, lang=None):
        """Request the audio from the remote service and write it to disk.

        Arguments:
            sentence (str): text to synthesize
            wav_file (str): path the received audio is written to
            lang (str): accepted for interface compatibility, unused here

        Returns:
            tuple: (wav_file, None), no phonemes are provided

        Raises:
            RemoteTTSException: the service answered with a status code
                                other than 200
        """
        # NOTE(review): verify=False turns off TLS certificate validation;
        # kept as is for backwards compatibility, consider enabling it
        r = requests.get(
            self.url + self.api_path,
            params=self.build_request_params(sentence),
            timeout=10, verify=False, auth=self.auth)
        if r.status_code != 200:
            # callers unpack a (wav_file, phonemes) tuple, so a bare None
            # would only surface as an unrelated TypeError further up
            raise RemoteTTSException(
                "Backend returned HTTP status {}".format(r.status_code))
        with open(wav_file, 'wb') as f:
            f.write(r.content)
        return wav_file, None
File renamed without changes.

0 commit comments

Comments
 (0)