Skip to content

Commit

Permalink
[googletts] Replace custom TTS cache with common TTS cache (#15208)
Browse files Browse the repository at this point in the history
* [googletts] Replace custom TTS cache with common TTS cache

--------

Signed-off-by: Gwendal Roulleau <gwendal.roulleau@gmail.com>
  • Loading branch information
dalgwen authored Jul 10, 2023
1 parent 72c0e1f commit 2899421
Show file tree
Hide file tree
Showing 6 changed files with 38 additions and 170 deletions.
8 changes: 1 addition & 7 deletions bundles/org.openhab.voice.googletts/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,7 @@

Google Cloud TTS Service uses the non-free Google Cloud Text-to-Speech API to convert text or Speech Synthesis Markup Language (SSML) input into audio data of natural human speech.
It provides multiple voices, available in different languages and variants and applies DeepMind’s groundbreaking research in WaveNet and Google’s powerful neural networks.
The implementation caches the converted texts to reduce the load on the API and make the conversion faster.
You can find them in the `$OPENHAB_USERDATA/cache/org.openhab.voice.googletts` folder.
The Google Cloud TTS service uses the openHAB TTS cache to cache audio files produced from the most recent queries in order to reduce traffic, improve performance and reduce the number of requests.
Be aware that using this service may incur costs on your Google Cloud account.
You can find pricing information on the [documentation page](https://cloud.google.com/text-to-speech/#pricing-summary).

Expand Down Expand Up @@ -47,10 +46,6 @@ It is recommended to clear this configuration parameter afterwards.
* **Pitch** - The pitch of selected voice, up to 20 semitones.
* **Volume Gain** - The volume of the output between 16dB and -96dB.
* **Speaking Rate** - The speaking rate can be 4x faster or slower than the normal rate.
* **Purge Cache** - Purges the cache e.g. after testing different voice configuration parameters.

When enabled the cache is purged once.
Make sure to disable this setting again so the cache is maintained after restarts.

In case you would like to set up the service via a text file, create a new file in `$OPENHAB_ROOT/conf/services` named `googletts.cfg`

Expand All @@ -63,7 +58,6 @@ org.openhab.voice.googletts:authcode=XXXXX
org.openhab.voice.googletts:pitch=0
org.openhab.voice.googletts:volumeGain=0
org.openhab.voice.googletts:speakingRate=1
org.openhab.voice.googletts:purgeCache=false
```

### Default Text-to-Speech and Voice Configuration
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,17 +12,9 @@
*/
package org.openhab.voice.googletts.internal;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.math.BigInteger;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Base64;
import java.util.Dictionary;
import java.util.HashMap;
Expand Down Expand Up @@ -69,10 +61,6 @@
*/
class GoogleCloudAPI {

private static final char EXTENSION_SEPARATOR = '.';
private static final char UNIX_SEPARATOR = '/';
private static final char WINDOWS_SEPARATOR = '\\';

private static final String BEARER = "Bearer ";

private static final String GCP_AUTH_URI = "https://accounts.google.com/o/oauth2/auth";
Expand Down Expand Up @@ -103,11 +91,6 @@ class GoogleCloudAPI {
*/
private final Map<Locale, Set<GoogleTTSVoice>> voices = new HashMap<>();

/**
* Cache folder
*/
private File cacheFolder;

/**
* Configuration
*/
Expand All @@ -122,12 +105,10 @@ class GoogleCloudAPI {
/**
* Constructor.
*
* @param cacheFolder Service cache folder
*/
GoogleCloudAPI(ConfigurationAdmin configAdmin, OAuthFactory oAuthFactory, File cacheFolder) {
GoogleCloudAPI(ConfigurationAdmin configAdmin, OAuthFactory oAuthFactory) {
this.configAdmin = configAdmin;
this.oAuthFactory = oAuthFactory;
this.cacheFolder = cacheFolder;
}

/**
Expand Down Expand Up @@ -161,15 +142,6 @@ void setConfig(GoogleTTSConfig config) {
} else {
voices.clear();
}

// maintain cache
if (config.purgeCache) {
File[] files = cacheFolder.listFiles();
if (files != null && files.length > 0) {
Arrays.stream(files).forEach(File::delete);
}
logger.debug("Cache purged.");
}
}

public void dispose() {
Expand Down Expand Up @@ -341,97 +313,32 @@ private List<GoogleTTSVoice> listVoices() throws AuthenticationException, Commun
* @param codec Requested codec
* @return Google audio encoding name to use for the requested codec.
*/
private String[] getFormatForCodec(String codec) {
private String getFormatForCodec(String codec) {
switch (codec) {
case AudioFormat.CODEC_MP3:
return new String[] { AudioEncoding.MP3.toString(), "mp3" };
return AudioEncoding.MP3.toString();
case AudioFormat.CODEC_PCM_SIGNED:
return new String[] { AudioEncoding.LINEAR16.toString(), "wav" };
return AudioEncoding.LINEAR16.toString();
default:
throw new IllegalArgumentException("Audio format " + codec + " is not yet supported");
}
}

public byte[] synthesizeSpeech(String text, GoogleTTSVoice voice, String codec) {
String[] format = getFormatForCodec(codec);
String fileNameInCache = getUniqueFilenameForText(text, voice.getTechnicalName());
File audioFileInCache = new File(cacheFolder, fileNameInCache + "." + format[1]);
String format = getFormatForCodec(codec);
try {
// check if in cache
if (audioFileInCache.exists()) {
logger.debug("Audio file {} was found in cache.", audioFileInCache.getName());
return Files.readAllBytes(audioFileInCache.toPath());
}

// if not in cache, get audio data and put to cache
byte[] audio = synthesizeSpeechByGoogle(text, voice, format[0]);
if (audio != null) {
saveAudioAndTextToFile(text, audioFileInCache, audio, voice.getTechnicalName());
}
return audio;
return synthesizeSpeechByGoogle(text, voice, format);
} catch (AuthenticationException | CommunicationException e) {
logger.warn("Error initializing Google Cloud TTS service: {}", e.getMessage());
if (oAuthService != null) {
oAuthFactory.ungetOAuthService(GoogleTTSService.SERVICE_PID);
oAuthService = null;
}
voices.clear();
} catch (FileNotFoundException e) {
logger.warn("Could not write file {} to cache: {}", audioFileInCache, e.getMessage());
} catch (IOException e) {
logger.debug("An unexpected IOException occurred: {}", e.getMessage());
}
return null;
}

/**
* Create cache entry.
*
* @param text Converted text.
* @param cacheFile Cache entry file.
* @param audio Byte array of the audio.
* @param voiceName Used voice
* @throws FileNotFoundException
* @throws IOException in case of file handling exceptions
*/
private void saveAudioAndTextToFile(String text, File cacheFile, byte[] audio, String voiceName)
throws IOException, FileNotFoundException {
logger.debug("Caching audio file {}", cacheFile.getName());
try (FileOutputStream audioFileOutputStream = new FileOutputStream(cacheFile)) {
audioFileOutputStream.write(audio);
}

// write text to file for transparency too
// this allows to know which contents is in which audio file
String textFileName = removeExtension(cacheFile.getName()) + ".txt";
logger.debug("Caching text file {}", textFileName);
try (FileOutputStream textFileOutputStream = new FileOutputStream(new File(cacheFolder, textFileName))) {
// @formatter:off
StringBuilder sb = new StringBuilder("Config: ")
.append(config.toConfigString())
.append(",voice=")
.append(voiceName)
.append(System.lineSeparator())
.append("Text: ")
.append(text)
.append(System.lineSeparator());
// @formatter:on
textFileOutputStream.write(sb.toString().getBytes(StandardCharsets.UTF_8));
}
}

/**
* Removes the extension of a file name.
*
* @param fileName the file name to remove the extension of
* @return the filename without the extension
*/
private String removeExtension(String fileName) {
int extensionPos = fileName.lastIndexOf(EXTENSION_SEPARATOR);
int lastSeparator = Math.max(fileName.lastIndexOf(UNIX_SEPARATOR), fileName.lastIndexOf(WINDOWS_SEPARATOR));
return lastSeparator > extensionPos ? fileName : fileName.substring(0, extensionPos);
}

/**
* Call Google service to synthesize the required text
*
Expand Down Expand Up @@ -476,25 +383,6 @@ private byte[] synthesizeSpeechByGoogle(String text, GoogleTTSVoice voice, Strin
return null;
}

/**
* Gets a unique filename for a given text, by creating an MD5 hash of it. It
* will be preceded by the locale.
* <p>
* Sample: "en-US_00a2653ac5f77063bc4ea2fee87318d3"
*/
private String getUniqueFilenameForText(String text, String voiceName) {
try {
MessageDigest md = MessageDigest.getInstance("MD5");
byte[] bytesOfMessage = (config.toConfigString() + text).getBytes(StandardCharsets.UTF_8);
String fileNameHash = String.format("%032x", new BigInteger(1, md.digest(bytesOfMessage)));
return voiceName + "_" + fileNameHash;
} catch (NoSuchAlgorithmException e) {
// should not happen
logger.error("Could not create MD5 hash for '{}'", text, e);
return null;
}
}

boolean isInitialized() {
return oAuthService != null;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,15 +44,10 @@ class GoogleTTSConfig {
*/
public Double speakingRate = 1d;

/**
* Purge cache after configuration changes.
*/
public Boolean purgeCache = Boolean.FALSE;

@Override
public String toString() {
return "GoogleTTSConfig{pitch=" + pitch + ", speakingRate=" + speakingRate + ", volumeGainDb=" + volumeGainDb
+ ", purgeCache=" + purgeCache + '}';
+ '}';
}

String toConfigString() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,24 +15,29 @@
import static org.openhab.voice.googletts.internal.GoogleTTSService.*;

import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.math.BigInteger;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.Collections;
import java.util.HashSet;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

import org.eclipse.jdt.annotation.NonNull;
import org.eclipse.jdt.annotation.NonNullByDefault;
import org.eclipse.jdt.annotation.Nullable;
import org.openhab.core.OpenHAB;
import org.openhab.core.audio.AudioFormat;
import org.openhab.core.audio.AudioStream;
import org.openhab.core.audio.ByteArrayAudioStream;
import org.openhab.core.audio.utils.AudioWaveUtils;
import org.openhab.core.auth.client.oauth2.OAuthFactory;
import org.openhab.core.config.core.ConfigurableService;
import org.openhab.core.voice.AbstractCachedTTSService;
import org.openhab.core.voice.TTSCache;
import org.openhab.core.voice.TTSException;
import org.openhab.core.voice.TTSService;
import org.openhab.core.voice.Voice;
Expand All @@ -52,10 +57,11 @@
*
* @author Gabor Bicskei - Initial contribution
*/
@Component(configurationPid = SERVICE_PID, property = Constants.SERVICE_PID + "=" + SERVICE_PID)
@Component(configurationPid = SERVICE_PID, property = Constants.SERVICE_PID + "="
+ SERVICE_PID, service = TTSService.class)
@ConfigurableService(category = SERVICE_CATEGORY, label = SERVICE_NAME
+ " Text-to-Speech", description_uri = SERVICE_CATEGORY + ":" + SERVICE_ID)
public class GoogleTTSService implements TTSService {
public class GoogleTTSService extends AbstractCachedTTSService {
/**
* Service name
*/
Expand All @@ -76,11 +82,6 @@ public class GoogleTTSService implements TTSService {
*/
static final String SERVICE_PID = "org.openhab." + SERVICE_CATEGORY + "." + SERVICE_ID;

/**
* Cache folder under $userdata
*/
private static final String CACHE_FOLDER_NAME = "cache";

/**
* Configuration parameters
*/
Expand All @@ -90,7 +91,6 @@ public class GoogleTTSService implements TTSService {
private static final String PARAM_PITCH = "pitch";
private static final String PARAM_SPEAKING_RATE = "speakingRate";
private static final String PARAM_VOLUME_GAIN_DB = "volumeGainDb";
private static final String PARAM_PURGE_CACHE = "purgeCache";

/**
* Logger.
Expand All @@ -117,8 +117,9 @@ public class GoogleTTSService implements TTSService {
private final GoogleTTSConfig config = new GoogleTTSConfig();

@Activate
public GoogleTTSService(final @Reference ConfigurationAdmin configAdmin,
final @Reference OAuthFactory oAuthFactory) {
public GoogleTTSService(final @Reference ConfigurationAdmin configAdmin, final @Reference OAuthFactory oAuthFactory,
@Reference TTSCache ttsCache, Map<String, Object> config) {
super(ttsCache);
this.configAdmin = configAdmin;
this.oAuthFactory = oAuthFactory;
}
Expand All @@ -128,15 +129,7 @@ public GoogleTTSService(final @Reference ConfigurationAdmin configAdmin,
*/
@Activate
protected void activate(Map<String, Object> config) {
// create cache folder
File userData = new File(OpenHAB.getUserDataFolder());
File cacheFolder = new File(new File(userData, CACHE_FOLDER_NAME), SERVICE_PID);
if (!cacheFolder.exists()) {
cacheFolder.mkdirs();
}
logger.debug("Using cache folder {}", cacheFolder.getAbsolutePath());

apiImpl = new GoogleCloudAPI(configAdmin, oAuthFactory, cacheFolder);
apiImpl = new GoogleCloudAPI(configAdmin, oAuthFactory);
updateConfig(config);
}

Expand Down Expand Up @@ -236,13 +229,6 @@ private void updateConfig(Map<String, Object> newConfig) {
config.volumeGainDb = Double.parseDouble(param);
}

// purgeCache
param = newConfig.containsKey(PARAM_PURGE_CACHE) ? newConfig.get(PARAM_PURGE_CACHE).toString() : null;
if (param != null) {
config.purgeCache = Boolean.parseBoolean(param);
}
logger.trace("New configuration: {}", config.toString());

if (config.clientId != null && !config.clientId.isEmpty() && config.clientSecret != null
&& !config.clientSecret.isEmpty()) {
apiImpl.setConfig(config);
Expand Down Expand Up @@ -313,7 +299,7 @@ public Set<AudioFormat> getSupportedFormats() {
* @throws TTSException in case the service is unavailable or a parameter is invalid.
*/
@Override
public AudioStream synthesize(String text, Voice voice, AudioFormat requestedFormat) throws TTSException {
public AudioStream synthesizeForCache(String text, Voice voice, AudioFormat requestedFormat) throws TTSException {
logger.debug("Synthesize '{}' for voice '{}' in format {}", text, voice.getUID(), requestedFormat);
// Validate known api key
if (!apiImpl.isInitialized()) {
Expand Down Expand Up @@ -361,4 +347,19 @@ private AudioFormat parseAudioFormat(byte[] audio) throws TTSException {
throw new TTSException("Cannot parse WAV format", e);
}
}

/**
 * Builds the key identifying the audio for a synthesis request in the TTS cache.
 * <p>
 * The key is the voice's technical name, an underscore, and the MD5 hash (32 hex digits) of the
 * current service configuration string, the text and the requested format.
 *
 * @param text the text to synthesize
 * @param voice the voice to use; must be a {@link GoogleTTSVoice}
 * @param requestedFormat the requested audio format
 * @return the cache key for this combination of configuration, text, voice and format
 */
@Override
public @NonNull String getCacheKey(@NonNull String text, @NonNull Voice voice,
        @NonNull AudioFormat requestedFormat) {
    String voiceName = ((GoogleTTSVoice) voice).getTechnicalName();
    String keyMaterial = config.toConfigString() + text + requestedFormat;
    try {
        MessageDigest md = MessageDigest.getInstance("MD5");
        byte[] digest = md.digest(keyMaterial.getBytes(StandardCharsets.UTF_8));
        return voiceName + "_" + String.format("%032x", new BigInteger(1, digest));
    } catch (NoSuchAlgorithmException e) {
        // should not happen
        logger.warn("Could not create MD5 hash for '{}'", text, e);
        // A constant fallback key would make every request share a single cache entry and
        // return the wrong audio, so still derive the key from the request itself.
        return voiceName + "_nomd5algorithm_" + Integer.toHexString(keyMaterial.hashCode());
    }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -43,13 +43,5 @@
<description>Speaking rate can be 4x faster or slower than the normal rate.</description>
<default>1</default>
</parameter>
<parameter name="purgeCache" type="boolean">
<advanced>true</advanced>
<label>Purge Cache</label>
<description>Purges the cache e.g. after testing different voice configuration parameters. When enabled the cache is
purged once. Make sure to disable this setting again so the cache is maintained after restarts.</description>
<default>false</default>
</parameter>
</config-description>

</config-description:config-descriptions>
Loading

0 comments on commit 2899421

Please sign in to comment.