From 415c57658f8a9777d2e1a275d70acec640cfb7ec Mon Sep 17 00:00:00 2001
From: RazvanP <razvan.purdel@8x8.com>
Date: Mon, 11 Mar 2024 11:26:30 +0200
Subject: [PATCH 1/3] JIT-12948 set the Google transcription model to
 latest_long as the default

---
 .../transcription/GoogleCloudTranscriptionService.java    | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/src/main/java/org/jitsi/jigasi/transcription/GoogleCloudTranscriptionService.java b/src/main/java/org/jitsi/jigasi/transcription/GoogleCloudTranscriptionService.java
index 107a0f20f..921faa64e 100644
--- a/src/main/java/org/jitsi/jigasi/transcription/GoogleCloudTranscriptionService.java
+++ b/src/main/java/org/jitsi/jigasi/transcription/GoogleCloudTranscriptionService.java
@@ -262,8 +262,12 @@ private RecognitionConfig getRecognitionConfig(TranscriptionRequest request)
                     "encoding");
         }
 
-        // set the model to use. It will default to a cheaper model with
-        // lower performance when not set.
+        // set the default model to "latest_long" instead of "default"
+        // https://cloud.google.com/speech-to-text/docs/transcription-model#transcription_models
+        // and https://cloud.google.com/speech-to-text/docs/latest-models#pricing for pricing
+        builder.setModel("latest_long");
+
+        // set the model to video
         if (useVideoModel)
         {
             if (logger.isDebugEnabled())

From 38fe12ee2f3a391518613aaf9ff71b0ef888c7d0 Mon Sep 17 00:00:00 2001
From: RazvanP <razvan.purdel@8x8.com>
Date: Mon, 11 Mar 2024 13:52:04 +0200
Subject: [PATCH 2/3] JIT-12948 allow the selection of any google model

---
 .../GoogleCloudTranscriptionService.java      | 38 +++++++------------
 1 file changed, 13 insertions(+), 25 deletions(-)

diff --git a/src/main/java/org/jitsi/jigasi/transcription/GoogleCloudTranscriptionService.java b/src/main/java/org/jitsi/jigasi/transcription/GoogleCloudTranscriptionService.java
index 921faa64e..59320f70b 100644
--- a/src/main/java/org/jitsi/jigasi/transcription/GoogleCloudTranscriptionService.java
+++ b/src/main/java/org/jitsi/jigasi/transcription/GoogleCloudTranscriptionService.java
@@ -180,16 +180,15 @@ public class GoogleCloudTranscriptionService
     private final static int STREAMING_SESSION_TIMEOUT_MS = 2000;
 
     /**
-     * Property name to determine whether to use the Google Speech API's
-     * video model
+     * Property name to determine which Google Speech API model to use
      */
-    private final static String P_NAME_USE_VIDEO_MODEL
-        = "org.jitsi.jigasi.transcription.USE_VIDEO_MODEL";
+    private final static String GOOGLE_MODEL
+        = "org.jitsi.jigasi.transcription.google_model";
 
     /**
-     * The default value for the property USE_VIDEO_MODEL
+     * The default value for the property GOOGLE_MODEL
      */
-    private final static boolean DEFAULT_VALUE_USE_VIDEO_MODEL = false;
+    private final static String DEFAULT_VALUE_GOOGLE_MODEL = "latest_long";
 
     /**
      * Check whether the given string contains a supported language tag
@@ -229,10 +228,9 @@ public boolean supportsLanguageRouting()
     private List<SpeechContext> speechContexts = null;
 
     /**
-     * Whether to use the more expensive video model when making
-     * requests.
+     * The name of the Google Speech-to-Text model set on each request
      */
-    private boolean useVideoModel;
+    private final String useModel;
 
     /**
      * Creates the RecognitionConfig the Google service uses based
@@ -262,19 +260,10 @@ private RecognitionConfig getRecognitionConfig(TranscriptionRequest request)
                     "encoding");
         }
 
-        // set the default model to "latest_long" instead of "default"
-        // https://cloud.google.com/speech-to-text/docs/transcription-model#transcription_models
-        // and https://cloud.google.com/speech-to-text/docs/latest-models#pricing for pricing
-        builder.setModel("latest_long");
-
-        // set the model to video
-        if (useVideoModel)
+        builder.setModel(useModel);
+        if (logger.isDebugEnabled())
         {
-            if (logger.isDebugEnabled())
-            {
-                logger.debug("Using the more expensive video model");
-            }
-            builder.setModel("video");
+            logger.debug("Using model " + useModel);
         }
 
         // set the Language tag
@@ -296,8 +285,8 @@ private RecognitionConfig getRecognitionConfig(TranscriptionRequest request)
      */
     public GoogleCloudTranscriptionService()
     {
-        useVideoModel = JigasiBundleActivator.getConfigurationService()
-            .getBoolean(P_NAME_USE_VIDEO_MODEL, DEFAULT_VALUE_USE_VIDEO_MODEL);
+        useModel = JigasiBundleActivator.getConfigurationService()
+            .getString(GOOGLE_MODEL, DEFAULT_VALUE_GOOGLE_MODEL);
     }
 
     /**
@@ -704,8 +693,7 @@ private ApiStreamObserver<StreamingRecognizeRequest> createObserver(
                 StreamingRecognitionConfig.newBuilder()
                     .setConfig(config)
                     .setInterimResults(RETRIEVE_INTERIM_RESULTS)
-                    .setSingleUtterance(!useVideoModel &&
-                                            SINGLE_UTTERANCE_ONLY)
+                    .setSingleUtterance(SINGLE_UTTERANCE_ONLY)
                     .build();
 
             // StreamingCallable manages sending the audio and receiving

From 368f5b94284d57e70cdf6d023d28866dfbcf2d76 Mon Sep 17 00:00:00 2001
From: RazvanP <razvan.purdel@8x8.com>
Date: Wed, 20 Mar 2024 11:02:55 +0200
Subject: [PATCH 3/3] JIT-12948 remove single utterance

---
 .../GoogleCloudTranscriptionService.java      | 56 +++++--------------
 1 file changed, 13 insertions(+), 43 deletions(-)

diff --git a/src/main/java/org/jitsi/jigasi/transcription/GoogleCloudTranscriptionService.java b/src/main/java/org/jitsi/jigasi/transcription/GoogleCloudTranscriptionService.java
index 59320f70b..4c8d271bd 100644
--- a/src/main/java/org/jitsi/jigasi/transcription/GoogleCloudTranscriptionService.java
+++ b/src/main/java/org/jitsi/jigasi/transcription/GoogleCloudTranscriptionService.java
@@ -165,12 +165,6 @@ public class GoogleCloudTranscriptionService
      */
     private final static boolean RETRIEVE_INTERIM_RESULTS = true;
 
-    /**
-     * Whether the Google Cloud API only listens for a single utterance
-     * or continuous to listen once an utterance is over
-     */
-    private final static boolean SINGLE_UTTERANCE_ONLY = true;
-
     /**
      * The amount of ms after which a StreamingRecognize session will be closed
      * when no new audio is given. This is to make sure the session retrieves
@@ -194,7 +188,7 @@ public class GoogleCloudTranscriptionService
      * Check whether the given string contains a supported language tag
      *
      * @param tag the language tag
-     * @throws UnsupportedOperationException when the google cloud API does not
+     * @throws UnsupportedOperationException when the Google cloud API does not
      * support the given language
      */
     private static void validateLanguageTag(String tag)
@@ -280,7 +274,7 @@ private RecognitionConfig getRecognitionConfig(TranscriptionRequest request)
     }
 
     /**
-     * Create a TranscriptionService which will send audio to the google cloud
+     * Create a TranscriptionService which will send audio to the Google cloud
      * platform to get a transcription
      */
     public GoogleCloudTranscriptionService()
@@ -428,7 +422,7 @@ public class GoogleCloudStreamingRecognitionSession
         private RequestApiStreamObserverManager requestManager;
 
         /**
-         * A single thread which is used to sent all requests to the API.
+         * A single thread which is used to send all requests to the API.
          * This is needed to reliably sent the first request to the service
          */
         private ExecutorService service = Executors.newSingleThreadExecutor();
@@ -506,7 +500,7 @@ public void addTranscriptionListener(TranscriptionListener listener)
     private static class GoogleCloudCostLogger
     {
         /**
-         * The length of a cost interval of the google cloud speech-to-text API
+         * The length of a cost interval of the Google cloud speech-to-text API
          */
         private final static int INTERVAL_LENGTH_MS = 15000;
 
@@ -556,7 +550,7 @@ private static class GoogleCloudCostLogger
 
         /**
          * Tell the {@link GoogleCloudCostLogger} that a certain length of audio
-         * was send.
+         * was sent.
          *
          * @param ms the length of the audio chunk sent to the API
          */
@@ -650,13 +644,13 @@ private class RequestApiStreamObserverManager
         private boolean stopped = false;
 
         /**
-         * Used to log the cost of every request which is send
+         * Used to log the cost of every request which is sent
          */
         private final GoogleCloudCostLogger costLogger;
 
         /**
          * Create a new RequestApiStreamObserverManager, which will try
-         * to mimic a streaming session of indefinite lenth
+         * to mimic a streaming session of indefinite length
          *
          * @param client the SpeechClient with which to open new sessions
          * @param debugName extra text which will be added to logs
@@ -679,7 +673,7 @@ private ApiStreamObserver<StreamingRecognizeRequest> createObserver(
             RecognitionConfig config)
         {
             // Each observer gets its own responseObserver to be able to
-            // to get an unique ID
+            // get a unique ID
             ResponseApiStreamingObserver<StreamingRecognizeResponse>
                 responseObserver =
                 new ResponseApiStreamingObserver<StreamingRecognizeResponse>(
@@ -693,7 +687,6 @@ private ApiStreamObserver<StreamingRecognizeRequest> createObserver(
                 StreamingRecognitionConfig.newBuilder()
                     .setConfig(config)
                     .setInterimResults(RETRIEVE_INTERIM_RESULTS)
-                    .setSingleUtterance(SINGLE_UTTERANCE_ONLY)
                     .build();
 
             // StreamingCallable manages sending the audio and receiving
@@ -914,16 +907,12 @@ public void onNext(StreamingRecognizeResponse message)
                 if (logger.isDebugEnabled())
                     logger.debug(
                         debugName + ": received error from StreamingRecognizeResponse: "
-                        + message.getError().getMessage());
+                             + message.getError().getMessage());
                 requestManager.terminateCurrentSession();
                 return;
             }
 
-            // This will happen when SINGLE_UTTERANCE is set to true
-            // and the server has detected the end of the user's speech
-            // utterance.
-            if (isEndOfSingleUtteranceMessage(message) ||
-                message.getResultsCount() == 0)
+            if (message.getResultsCount() == 0)
             {
                 if (logger.isDebugEnabled())
                     logger.debug(
@@ -936,14 +925,14 @@ public void onNext(StreamingRecognizeResponse message)
             List<StreamingRecognitionResult> results = message.getResultsList();
             StreamingRecognitionResult result = results.get(0);
 
-            // If empty, the session has reached it's time limit and
+            // If empty, the session has reached its time limit and
             // nothing new was said, but there should be an error in the message
             // so this is never supposed to happen
             if (result.getAlternativesList().isEmpty())
             {
                 logger.warn(
                     debugName + ": received a list of alternatives which"
-                              + " was empty");
+                            + " was empty");
                 requestManager.terminateCurrentSession();
                 return;
             }
@@ -973,28 +962,9 @@ else if (logger.isDebugEnabled())
             }
         }
 
-        /**
-         * Get whether a {@link StreamingRecognizeResponse} has an
-         * {@link StreamingRecognizeResponse#speechEventType_} of
-         * {@link StreamingRecognizeResponse.SpeechEventType#
-         * END_OF_SINGLE_UTTERANCE}
-         *
-         * @param message the message to check
-         * @return true if the message has the eventType
-         * {@link StreamingRecognizeResponse.SpeechEventType
-         * #END_OF_SINGLE_UTTERANCE}, false otherwise
-         */
-        private boolean isEndOfSingleUtteranceMessage(
-            StreamingRecognizeResponse message)
-        {
-            return message.getSpeechEventType().
-                equals(StreamingRecognizeResponse.SpeechEventType.
-                    END_OF_SINGLE_UTTERANCE);
-        }
-
         /**
          * Handle a single {@link StreamingRecognitionResult} by creating
-         * a {@link TranscriptionResult} based on the result and notifying all
+         * a {@link TranscriptionResult} based on the result and notifying
          * all registered {@link TranscriptionListener}s
          *
          * @param result the result to handle