From 415c57658f8a9777d2e1a275d70acec640cfb7ec Mon Sep 17 00:00:00 2001 From: RazvanP Date: Mon, 11 Mar 2024 11:26:30 +0200 Subject: [PATCH 1/3] JIT-12948 set the Google transcription model to latest_long as the default --- .../transcription/GoogleCloudTranscriptionService.java | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/jitsi/jigasi/transcription/GoogleCloudTranscriptionService.java b/src/main/java/org/jitsi/jigasi/transcription/GoogleCloudTranscriptionService.java index 107a0f20f..921faa64e 100644 --- a/src/main/java/org/jitsi/jigasi/transcription/GoogleCloudTranscriptionService.java +++ b/src/main/java/org/jitsi/jigasi/transcription/GoogleCloudTranscriptionService.java @@ -262,8 +262,12 @@ private RecognitionConfig getRecognitionConfig(TranscriptionRequest request) "encoding"); } - // set the model to use. It will default to a cheaper model with - // lower performance when not set. + // set the default model to "latest_long" instead of "default" + // https://cloud.google.com/speech-to-text/docs/transcription-model#transcription_models + // and https://cloud.google.com/speech-to-text/docs/latest-models#pricing for pricing + builder.setModel("latest_long"); + + // set the model to video if (useVideoModel) { if (logger.isDebugEnabled()) From 38fe12ee2f3a391518613aaf9ff71b0ef888c7d0 Mon Sep 17 00:00:00 2001 From: RazvanP Date: Mon, 11 Mar 2024 13:52:04 +0200 Subject: [PATCH 2/3] JIT-12948 allow the selection of any google model --- .../GoogleCloudTranscriptionService.java | 38 +++++++------------ 1 file changed, 13 insertions(+), 25 deletions(-) diff --git a/src/main/java/org/jitsi/jigasi/transcription/GoogleCloudTranscriptionService.java b/src/main/java/org/jitsi/jigasi/transcription/GoogleCloudTranscriptionService.java index 921faa64e..59320f70b 100644 --- a/src/main/java/org/jitsi/jigasi/transcription/GoogleCloudTranscriptionService.java +++ b/src/main/java/org/jitsi/jigasi/transcription/GoogleCloudTranscriptionService.java @@ -180,16 +180,15 @@ public class GoogleCloudTranscriptionService private final static int STREAMING_SESSION_TIMEOUT_MS = 2000; /** - * Property name to determine whether to use the Google Speech API's - * video model + * Property name to determine which Google Speech API model to use */ - private final static String P_NAME_USE_VIDEO_MODEL - = "org.jitsi.jigasi.transcription.USE_VIDEO_MODEL"; + private final static String GOOGLE_MODEL + = "org.jitsi.jigasi.transcription.google_model"; /** - * The default value for the property USE_VIDEO_MODEL + * The default value for the property GOOGLE_MODEL */ - private final static boolean DEFAULT_VALUE_USE_VIDEO_MODEL = false; + private final static String DEFAULT_VALUE_GOOGLE_MODEL = "latest_long"; /** * Check whether the given string contains a supported language tag @@ -229,10 +228,9 @@ public boolean supportsLanguageRouting() private List speechContexts = null; /** - * Whether to use the more expensive video model when making - * requests. + * The model used for STT */ - private boolean useVideoModel; + private final String useModel; /** * Creates the RecognitionConfig the Google service uses based @@ -262,19 +260,10 @@ private RecognitionConfig getRecognitionConfig(TranscriptionRequest request) "encoding"); } - // set the default model to "latest_long" instead of "default" - // https://cloud.google.com/speech-to-text/docs/transcription-model#transcription_models - // and https://cloud.google.com/speech-to-text/docs/latest-models#pricing for pricing - builder.setModel("latest_long"); - - // set the model to video - if (useVideoModel) + builder.setModel(useModel); + if (logger.isDebugEnabled()) { - if (logger.isDebugEnabled()) - { - logger.debug("Using the more expensive video model"); - } - builder.setModel("video"); + logger.debug("Using model " + useModel); } // set the Language tag @@ -296,8 +285,8 @@ private RecognitionConfig getRecognitionConfig(TranscriptionRequest request) */ public GoogleCloudTranscriptionService() { - useVideoModel = JigasiBundleActivator.getConfigurationService() - .getBoolean(P_NAME_USE_VIDEO_MODEL, DEFAULT_VALUE_USE_VIDEO_MODEL); + useModel = JigasiBundleActivator.getConfigurationService() + .getString(GOOGLE_MODEL, DEFAULT_VALUE_GOOGLE_MODEL); } /** @@ -704,8 +693,7 @@ private ApiStreamObserver createObserver( StreamingRecognitionConfig.newBuilder() .setConfig(config) .setInterimResults(RETRIEVE_INTERIM_RESULTS) - .setSingleUtterance(!useVideoModel && - SINGLE_UTTERANCE_ONLY) + .setSingleUtterance(SINGLE_UTTERANCE_ONLY) .build(); // StreamingCallable manages sending the audio and receiving From 368f5b94284d57e70cdf6d023d28866dfbcf2d76 Mon Sep 17 00:00:00 2001 From: RazvanP Date: Wed, 20 Mar 2024 11:02:55 +0200 Subject: [PATCH 3/3] JIT-12948 remove single utterance --- .../GoogleCloudTranscriptionService.java | 56 +++++-------------- 1 file changed, 13 insertions(+), 43 deletions(-) diff --git a/src/main/java/org/jitsi/jigasi/transcription/GoogleCloudTranscriptionService.java b/src/main/java/org/jitsi/jigasi/transcription/GoogleCloudTranscriptionService.java index 59320f70b..4c8d271bd 100644 --- a/src/main/java/org/jitsi/jigasi/transcription/GoogleCloudTranscriptionService.java +++ b/src/main/java/org/jitsi/jigasi/transcription/GoogleCloudTranscriptionService.java @@ -165,12 +165,6 @@ public class GoogleCloudTranscriptionService */ private final static boolean RETRIEVE_INTERIM_RESULTS = true; - /** - * Whether the Google Cloud API only listens for a single utterance - * or continuous to listen once an utterance is over - */ - private final static boolean SINGLE_UTTERANCE_ONLY = true; - /** * The amount of ms after which a StreamingRecognize session will be closed * when no new audio is given. This is to make sure the session retrieves @@ -194,7 +188,7 @@ public class GoogleCloudTranscriptionService * Check whether the given string contains a supported language tag * * @param tag the language tag - * @throws UnsupportedOperationException when the google cloud API does not + * @throws UnsupportedOperationException when the Google cloud API does not * support the given language */ private static void validateLanguageTag(String tag) @@ -280,7 +274,7 @@ private RecognitionConfig getRecognitionConfig(TranscriptionRequest request) } /** - * Create a TranscriptionService which will send audio to the google cloud + * Create a TranscriptionService which will send audio to the Google cloud * platform to get a transcription */ public GoogleCloudTranscriptionService() @@ -428,7 +422,7 @@ public class GoogleCloudStreamingRecognitionSession private RequestApiStreamObserverManager requestManager; /** - * A single thread which is used to sent all requests to the API. + * A single thread which is used to send all requests to the API. * This is needed to reliably sent the first request to the service */ private ExecutorService service = Executors.newSingleThreadExecutor(); @@ -506,7 +500,7 @@ public void addTranscriptionListener(TranscriptionListener listener) private static class GoogleCloudCostLogger { /** - * The length of a cost interval of the google cloud speech-to-text API + * The length of a cost interval of the Google cloud speech-to-text API */ private final static int INTERVAL_LENGTH_MS = 15000; @@ -556,7 +550,7 @@ private static class GoogleCloudCostLogger /** * Tell the {@link GoogleCloudCostLogger} that a certain length of audio - * was send. + * was sent. * * @param ms the length of the audio chunk sent to the API */ @@ -650,13 +644,13 @@ private class RequestApiStreamObserverManager private boolean stopped = false; /** - * Used to log the cost of every request which is send + * Used to log the cost of every request which is sent */ private final GoogleCloudCostLogger costLogger; /** * Create a new RequestApiStreamObserverManager, which will try - * to mimic a streaming session of indefinite lenth + * to mimic a streaming session of indefinite length * * @param client the SpeechClient with which to open new sessions * @param debugName extra text which will be added to logs @@ -679,7 +673,7 @@ private ApiStreamObserver createObserver( RecognitionConfig config) { // Each observer gets its own responseObserver to be able to - // to get an unique ID + // get a unique ID ResponseApiStreamingObserver responseObserver = new ResponseApiStreamingObserver( @@ -693,7 +687,6 @@ private ApiStreamObserver createObserver( StreamingRecognitionConfig.newBuilder() .setConfig(config) .setInterimResults(RETRIEVE_INTERIM_RESULTS) - .setSingleUtterance(SINGLE_UTTERANCE_ONLY) .build(); // StreamingCallable manages sending the audio and receiving @@ -914,16 +907,12 @@ public void onNext(StreamingRecognizeResponse message) if (logger.isDebugEnabled()) logger.debug( debugName + ": received error from StreamingRecognizeResponse: " - + message.getError().getMessage()); + + message.getError().getMessage()); requestManager.terminateCurrentSession(); return; } - // This will happen when SINGLE_UTTERANCE is set to true - // and the server has detected the end of the user's speech - // utterance. - if (isEndOfSingleUtteranceMessage(message) || - message.getResultsCount() == 0) + if (message.getResultsCount() == 0) { if (logger.isDebugEnabled()) logger.debug( @@ -936,14 +925,14 @@ public void onNext(StreamingRecognizeResponse message) List results = message.getResultsList(); StreamingRecognitionResult result = results.get(0); - // If empty, the session has reached it's time limit and + // If empty, the session has reached its time limit and // nothing new was said, but there should be an error in the message // so this is never supposed to happen if (result.getAlternativesList().isEmpty()) { logger.warn( debugName + ": received a list of alternatives which" - + " was empty"); + + " was empty"); requestManager.terminateCurrentSession(); return; } @@ -973,28 +962,9 @@ else if (logger.isDebugEnabled()) } } - /** - * Get whether a {@link StreamingRecognizeResponse} has an - * {@link StreamingRecognizeResponse#speechEventType_} of - * {@link StreamingRecognizeResponse.SpeechEventType# - * END_OF_SINGLE_UTTERANCE} - * - * @param message the message to check - * @return true if the message has the eventType - * {@link StreamingRecognizeResponse.SpeechEventType - * #END_OF_SINGLE_UTTERANCE}, false otherwise - */ - private boolean isEndOfSingleUtteranceMessage( - StreamingRecognizeResponse message) - { - return message.getSpeechEventType(). - equals(StreamingRecognizeResponse.SpeechEventType. - END_OF_SINGLE_UTTERANCE); - } - /** * Handle a single {@link StreamingRecognitionResult} by creating - * a {@link TranscriptionResult} based on the result and notifying all + * a {@link TranscriptionResult} based on the result and notifying * all registered {@link TranscriptionListener}s * * @param result the result to handle