|
1 | | -# Copyright 2021 Mycroft AI Inc. |
2 | | -# |
3 | | -# Licensed under the Apache License, Version 2.0 (the "License"); |
4 | | -# you may not use this file except in compliance with the License. |
5 | | -# You may obtain a copy of the License at |
6 | | -# |
7 | | -# http://www.apache.org/licenses/LICENSE-2.0 |
8 | | -# |
9 | | -# Unless required by applicable law or agreed to in writing, software |
10 | | -# distributed under the License is distributed on an "AS IS" BASIS, |
11 | | -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | | -# See the License for the specific language governing permissions and |
13 | | -# limitations under the License. |
14 | | -"""TTS cache maintenance. |
15 | | -
|
16 | | -There are two types of cache available to a TTS engine. Both are comprised of |
17 | | -audio and phoneme files. TTS engines can use the cache to improve performance |
18 | | -by not performing inference on sentences in the cache. |
19 | | -
|
20 | | -The first type of cache is a persistent cache. The cache is considered |
21 | | -persistent because the files are stored in a location that is not cleared on |
22 | | -reboot. TTS inference on these sentences should only need to occur once. The |
23 | | -persistent cache contains commonly spoken sentences. |
24 | | -
|
25 | | -The second cache type is a temporary cache stored in the /tmp directory, |
26 | | -which is cleared when a device is rebooted. Sentences are added to this cache |
27 | | -on the fly every time a TTS engine returns audio for a sentence that is not |
28 | | -already cached. |
29 | 1 | """ |
import base64
import hashlib
import json
import re
from pathlib import Path
from typing import List, Optional, Set, Tuple
from urllib import parse

import requests

from mycroft.util.file_utils import (
    ensure_directory_exists, get_cache_directory, curate_cache
)
from mycroft.util.log import LOG
44 | | - |
45 | | - |
def _get_mimic2_audio(sentence: str, url: str) -> Tuple[bytes, str]:
    """Use the Mimic2 API to retrieve the audio for a sentence.

    Args:
        sentence: The sentence to be cached
        url: Mimic2 endpoint; the URL-quoted sentence and the
            ``&visimes=True`` flag are appended to it verbatim.

    Returns:
        A tuple of the decoded audio bytes and the "visimes" value taken
        from the JSON response.

    Raises:
        requests.HTTPError: if the API answers with an error status.
    """
    LOG.debug("Retrieving Mimic2 audio for sentence \"{}\"".format(sentence))
    mimic2_url = url + parse.quote(sentence) + '&visimes=True'
    response = requests.get(mimic2_url)
    # Fail with a clear HTTP error instead of a confusing JSON/KeyError when
    # the service rejects the request.
    response.raise_for_status()
    response_data = response.json()
    audio = base64.b64decode(response_data["audio_base64"])
    phonemes = response_data["visimes"]

    return audio, phonemes
60 | | - |
61 | | - |
def hash_sentence(sentence: str) -> str:
    """Convert the sentence into a hash value used for the file name.

    Args:
        sentence: The sentence to be cached

    Returns:
        Hex MD5 digest of the UTF-8 encoded sentence. Characters that cannot
        be encoded are dropped before hashing.
    """
    # MD5 is used purely to derive a file name, not for security.
    encoded_sentence = sentence.encode("utf-8", "ignore")
    return hashlib.md5(encoded_sentence).hexdigest()
72 | | - |
73 | | - |
def hash_from_path(path: Path) -> str:
    """Returns hash from a given path.

    Simply removes the final extension and the folder structure, leaving
    the hash.

    Args:
        path: path to get hash from

    Returns:
        Hash reference for file.
    """
    # ``stem`` is the final path component without its last suffix; unlike
    # ``with_suffix('')`` it does not raise on a path with an empty name.
    return path.stem
86 | | - |
87 | | - |
class AudioFile:
    """Reference to one TTS audio file inside a cache directory.

    The file is named ``<sentence_hash>.<file_type>``.
    """

    def __init__(self, cache_dir: Path, sentence_hash: str, file_type: str):
        self.name = "{}.{}".format(sentence_hash, file_type)
        self.path = cache_dir / self.name

    def save(self, audio: bytes):
        """Write a TTS cache file containing the audio to be spoken.

        Failures are logged and otherwise ignored.

        Args:
            audio: TTS inference of a sentence
        """
        try:
            with self.path.open("wb") as cache_file:
                cache_file.write(audio)
        except Exception:
            LOG.exception(f"Failed to write {self.name} to cache")

    def exists(self):
        """Tell whether the audio file is present on disk."""
        return self.path.exists()
107 | | - |
108 | | - |
class PhonemeFile:
    """Reference to one cached phoneme file, named ``<sentence_hash>.pho``.

    The phonemes are stored on disk as JSON.
    """

    def __init__(self, cache_dir: Path, sentence_hash: str):
        self.name = f"{sentence_hash}.pho"
        self.path = cache_dir.joinpath(self.name)

    def load(self) -> Optional[List]:
        """Load phonemes from cache file.

        Returns:
            The decoded phonemes, or None when the file does not exist or
            cannot be read or decoded (the failure is logged).
        """
        if not self.path.exists():
            # Nothing cached; previously this fell through to
            # json.loads(None) and raised TypeError.
            return None
        try:
            with open(self.path) as phoneme_file:
                return json.loads(phoneme_file.read().strip())
        except Exception:
            LOG.exception("Failed to read phoneme from cache")
            return None

    def save(self, phonemes):
        """Write a TTS cache file containing the phoneme to be displayed.

        Failures are logged and otherwise ignored.

        Args:
            phonemes: instructions for how to make the mouth on a device move
        """
        try:
            rec = json.dumps(phonemes)
            with open(self.path, "w") as phoneme_file:
                phoneme_file.write(rec)
        except Exception:
            # LOG.exception keeps the traceback, matching AudioFile.save.
            LOG.exception(f"Failed to write {self.name} to cache")

    def exists(self):
        """Tell whether the phoneme file is present on disk."""
        return self.path.exists()
141 | | - |
142 | | - |
class TextToSpeechCache:
    """Class for all persistent and temporary caching operations.

    ``cached_sentences`` maps a sentence hash to an
    ``(audio_file, phoneme_file)`` tuple; ``phoneme_file`` may be None.
    """

    def __init__(self, tts_config, tts_name, audio_file_type):
        """Set up the cache directories.

        Args:
            tts_config: TTS configuration mapping; the optional
                "preloaded_cache" key names the persistent cache directory
                and "min_free_percent" tunes curation.
            tts_name: name of the TTS engine, used for the temporary cache
                sub-directory
            audio_file_type: file extension of the cached audio files
        """
        self.config = tts_config
        self.tts_name = tts_name
        if "preloaded_cache" in self.config:
            self.persistent_cache_dir = Path(self.config["preloaded_cache"])
            ensure_directory_exists(
                str(self.persistent_cache_dir), permissions=0o755
            )
        else:
            self.persistent_cache_dir = None
        self.temporary_cache_dir = Path(
            get_cache_directory("tts/" + tts_name)
        )
        ensure_directory_exists(
            str(self.temporary_cache_dir), permissions=0o755
        )
        self.audio_file_type = audio_file_type
        self.resource_dir = Path(__file__).parent.parent.joinpath("res")
        self.cached_sentences = {}
        # Threshold handed to curate_cache() when curating the temporary
        # cache.
        self.min_free_percent = self.config.get("min_free_percent", 75)

    def __contains__(self, sha):
        """The cache contains a SHA if it knows of it and it exists on disk."""
        if sha not in self.cached_sentences:
            return False  # Doesn't know of it
        # Audio file must exist, phonemes are optional.
        audio, phonemes = self.cached_sentences[sha]
        return audio.exists() and (phonemes is None or phonemes.exists())

    def load_persistent_cache(self):
        """Load the contents of dialog files to the persistent cache directory.

        Parse the dialog files in the resource directory into sentences. Then
        add the audio for each sentence to the cache directory. Does nothing
        when no persistent cache directory is configured.

        NOTE: There may be files pre-loaded in the persistent cache directory
        prior to run time, such as pre-recorded audio files.  This will add
        files that do not already exist.

        ANOTHER NOTE: Mimic2 is the only TTS engine that supports
        downloading missing files.  This logic will need to change if another
        TTS engine implements it.
        """
        if self.persistent_cache_dir is None:
            return
        LOG.info("Adding dialog resources to persistent TTS cache...")
        self._load_existing_audio_files()
        self._load_existing_phoneme_files()
        dialogs = self._collect_dialogs()
        sentences = self._parse_dialogs(dialogs)
        for sentence in sentences:
            self._load_sentence(sentence)
        LOG.info("Persistent TTS cache files added successfully.")

    def _load_existing_audio_files(self):
        """Find the TTS audio files already in the persistent cache."""
        glob_pattern = "*." + self.audio_file_type
        for file_path in self.persistent_cache_dir.glob(glob_pattern):
            sentence_hash = file_path.name.split(".")[0]
            audio_file = AudioFile(
                self.persistent_cache_dir, sentence_hash, self.audio_file_type
            )
            self.cached_sentences[sentence_hash] = audio_file, None

    def _load_existing_phoneme_files(self):
        """Find the TTS phoneme files already in the persistent cache.

        A phoneme file is no good without an audio file to pair it with.  If
        no audio file matches, do not load the phoneme.
        """
        for file_path in self.persistent_cache_dir.glob("*.pho"):
            sentence_hash = file_path.name.split(".")[0]
            cached_sentence = self.cached_sentences.get(sentence_hash)
            if cached_sentence is not None:
                audio_file = cached_sentence[0]
                phoneme_file = PhonemeFile(
                    self.persistent_cache_dir, sentence_hash
                )
                self.cached_sentences[sentence_hash] = audio_file, phoneme_file

    def _collect_dialogs(self) -> List:
        """Build a list of the lines found in the dialog files.

        The lines are read from *.dialog files present in
        mycroft/res/text/en-us and stripped of surrounding whitespace.
        Splitting into sentences and de-duplication happen in
        ``_parse_dialogs``.
        """
        dialogs = []
        dialog_directory = Path(self.resource_dir, "text", "en-us")
        for dialog_file_path in dialog_directory.glob("*.dialog"):
            with open(dialog_file_path) as dialog_file:
                dialogs.extend(dialog.strip() for dialog in dialog_file)

        return dialogs

    @staticmethod
    def _parse_dialogs(dialogs: List[str]) -> Set[str]:
        """Split each dialog in the resources directory into sentences.

        Do not consider sentences with special characters other than
        punctuation
            example : <<< LOADING <<<

        Empty or whitespace-only sentences (blank dialog lines, remnants of
        a trailing split) are skipped as there is nothing to synthesize.

        Args:
            dialogs: a list of the records in the dialog resource files

        Returns:
            The set of unique sentences eligible for caching.
        """
        sentences = set()
        dialog_split_regex = re.compile(r"(?<=\.|\;|\?)\s")
        special_characters_regex = re.compile(r"[@#$%^*()<>/|}{~:]")
        for dialog in dialogs:
            for sentence in dialog_split_regex.split(dialog):
                if not sentence.strip():
                    continue
                if special_characters_regex.search(sentence) is None:
                    sentences.add(sentence)

        return sentences

    def _load_sentence(self, sentence: str):
        """Build audio and phoneme files for each sentence to be cached.

        Perform TTS inference on sentences parsed from dialog files.  Store
        the results in the persistent cache directory.

        ASSUMPTION: The only TTS that supports persistent cache right now is
        Mimic2.  This method assumes a call to the Mimic2 API.  If other TTS
        engines want to take advantage of the persistent cache, this logic
        will need to be more dynamic.
        """
        # TODO support multiple engines
        if self.tts_name != "Mimic2":
            return

        sentence_hash = hash_sentence(sentence)
        if sentence_hash in self.cached_sentences:
            return
        LOG.info("Adding \"{}\" to cache".format(sentence))
        try:
            mimic2_url = self.config["url"]
            audio, phonemes = _get_mimic2_audio(sentence, mimic2_url)
        except Exception:
            log_msg = "Failed to get audio for sentence \"{}\""
            LOG.exception(log_msg.format(sentence))
        else:
            self._add_to_persistent_cache(sentence_hash, audio, phonemes)

    def _add_to_persistent_cache(
            self, sentence_hash: str, audio: bytes, phonemes: str
    ):
        """Add an audio/phoneme file pair to the persistent cache."""
        audio_file = AudioFile(
            self.persistent_cache_dir, sentence_hash, self.audio_file_type
        )
        audio_file.save(audio)
        if phonemes is None:
            phoneme_file = None
        else:
            phoneme_file = PhonemeFile(
                self.persistent_cache_dir, sentence_hash
            )
            phoneme_file.save(phonemes)
        self.cached_sentences[sentence_hash] = audio_file, phoneme_file

    def clear(self):
        """Remove all files from the temporary cache.

        Files directly in the cache directory and files one level down are
        deleted; the sub-directories themselves are left in place.
        """
        for cache_file_path in self.temporary_cache_dir.iterdir():
            if cache_file_path.is_dir():
                for sub_path in cache_file_path.iterdir():
                    if sub_path.is_file():
                        sub_path.unlink()
            elif cache_file_path.is_file():
                cache_file_path.unlink()

    def curate(self):
        """Remove cache data if disk space is running low."""
        files_removed = curate_cache(self.temporary_cache_dir,
                                     min_free_percent=self.min_free_percent)

        # Forget every sentence whose files were removed from disk.
        hashes = {hash_from_path(Path(path)) for path in files_removed}
        for sentence_hash in hashes:
            self.cached_sentences.pop(sentence_hash, None)

    def define_audio_file(self, sentence_hash: str) -> AudioFile:
        """Build an instance of an object representing an audio file."""
        return AudioFile(
            self.temporary_cache_dir, sentence_hash, self.audio_file_type
        )

    def define_phoneme_file(self, sentence_hash: str) -> PhonemeFile:
        """Build an instance of an object representing a phoneme file."""
        return PhonemeFile(self.temporary_cache_dir, sentence_hash)
| 2 | +NOTE: this is dead code! do not use! |
| 3 | +This file is only present to ensure backwards compatibility |
| 4 | +in case someone is importing from here |
| 5 | +This is only meant for 3rd party code expecting ovos-core |
| 6 | +to be a drop in replacement for mycroft-core |
| 7 | +""" |
| 8 | +from ovos_plugin_manager.utils.tts_cache import hash_sentence, hash_from_path,\ |
| 9 | + AudioFile, PhonemeFile, TextToSpeechCache |
0 commit comments