latest_long as the default google ts model #526

Merged (3 commits) on Mar 20, 2024
@@ -165,12 +165,6 @@ public class GoogleCloudTranscriptionService
*/
private final static boolean RETRIEVE_INTERIM_RESULTS = true;

-/**
- * Whether the Google Cloud API only listens for a single utterance
- * or continuous to listen once an utterance is over
- */
-private final static boolean SINGLE_UTTERANCE_ONLY = true;
-
/**
* The amount of ms after which a StreamingRecognize session will be closed
* when no new audio is given. This is to make sure the session retrieves
@@ -180,22 +174,21 @@ public class GoogleCloudTranscriptionService
private final static int STREAMING_SESSION_TIMEOUT_MS = 2000;

/**
- * Property name to determine whether to use the Google Speech API's
- * video model
+ * Property name to determine which Google Speech API model to use
*/
-private final static String P_NAME_USE_VIDEO_MODEL
-    = "org.jitsi.jigasi.transcription.USE_VIDEO_MODEL";
+private final static String GOOGLE_MODEL
+    = "org.jitsi.jigasi.transcription.google_model";

/**
- * The default value for the property USE_VIDEO_MODEL
+ * The default value for the property GOOGLE_MODEL
*/
-private final static boolean DEFAULT_VALUE_USE_VIDEO_MODEL = false;
+private final static String DEFAULT_VALUE_GOOGLE_MODEL = "latest_long";

/**
* Check whether the given string contains a supported language tag
*
* @param tag the language tag
- * @throws UnsupportedOperationException when the google cloud API does not
+ * @throws UnsupportedOperationException when the Google cloud API does not
* support the given language
*/
private static void validateLanguageTag(String tag)
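A note on configuration: with this change the model is no longer a boolean video-model switch but a free-form string. For example, a deployment that prefers the short-utterance model could set org.jitsi.jigasi.transcription.google_model=latest_short in its Jigasi configuration (latest_short is only an illustrative value; any model name accepted by the Cloud Speech-to-Text API can be used), and when the property is not set the service now defaults to latest_long.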
@@ -229,10 +222,9 @@ public boolean supportsLanguageRouting()
private List<SpeechContext> speechContexts = null;

/**
- * Whether to use the more expensive video model when making
- * requests.
+ * The model used for STT
*/
-private boolean useVideoModel;
+private final String useModel;

/**
* Creates the RecognitionConfig the Google service uses based
@@ -262,15 +254,10 @@ private RecognitionConfig getRecognitionConfig(TranscriptionRequest request)
"encoding");
}

-// set the model to use. It will default to a cheaper model with
-// lower performance when not set.
-if (useVideoModel)
+builder.setModel(useModel);
+if (logger.isDebugEnabled())
{
-    if (logger.isDebugEnabled())
-    {
-        logger.debug("Using the more expensive video model");
-    }
-    builder.setModel("video");
+    logger.debug("Using model " + useModel);
}

// set the Language tag
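To make the effect of this hunk concrete: the request configuration now always carries the configured model instead of conditionally switching to "video". The sketch below shows roughly what getRecognitionConfig produces after the change; it is an illustration, not the Jigasi code itself, and it assumes the com.google.cloud.speech.v1 classes plus example values (LINEAR16, 16000 Hz, en-US) where the real code derives them from the TranscriptionRequest.

    import com.google.cloud.speech.v1.RecognitionConfig;
    import com.google.cloud.speech.v1.RecognitionConfig.AudioEncoding;

    class RecognitionConfigSketch
    {
        // Builds a config the way the patched service does: the model always
        // comes from the configured property (default "latest_long").
        static RecognitionConfig buildConfig(String useModel)
        {
            return RecognitionConfig.newBuilder()
                .setEncoding(AudioEncoding.LINEAR16) // example; taken from the audio format in the real code
                .setSampleRateHertz(16000)           // example; likewise request-derived
                .setModel(useModel)                  // e.g. "latest_long"
                .setLanguageCode("en-US")            // example; the real code sets the participant's language tag
                .build();
        }
    }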
@@ -287,13 +274,13 @@ private RecognitionConfig getRecognitionConfig(TranscriptionRequest request)
}

/**
- * Create a TranscriptionService which will send audio to the google cloud
+ * Create a TranscriptionService which will send audio to the Google cloud
* platform to get a transcription
*/
public GoogleCloudTranscriptionService()
{
-useVideoModel = JigasiBundleActivator.getConfigurationService()
-    .getBoolean(P_NAME_USE_VIDEO_MODEL, DEFAULT_VALUE_USE_VIDEO_MODEL);
+useModel = JigasiBundleActivator.getConfigurationService()
-    .getString(GOOGLE_MODEL, DEFAULT_VALUE_GOOGLE_MODEL);
}

/**
@@ -435,7 +422,7 @@ public class GoogleCloudStreamingRecognitionSession
private RequestApiStreamObserverManager requestManager;

/**
- * A single thread which is used to sent all requests to the API.
+ * A single thread which is used to send all requests to the API.
* This is needed to reliably sent the first request to the service
*/
private ExecutorService service = Executors.newSingleThreadExecutor();
@@ -513,7 +500,7 @@ public void addTranscriptionListener(TranscriptionListener listener)
private static class GoogleCloudCostLogger
{
/**
- * The length of a cost interval of the google cloud speech-to-text API
+ * The length of a cost interval of the Google cloud speech-to-text API
*/
private final static int INTERVAL_LENGTH_MS = 15000;

@@ -563,7 +550,7 @@ private static class GoogleCloudCostLogger

/**
* Tell the {@link GoogleCloudCostLogger} that a certain length of audio
- * was send.
+ * was sent.
*
* @param ms the length of the audio chunk sent to the API
*/
@@ -657,13 +644,13 @@ private class RequestApiStreamObserverManager
private boolean stopped = false;

/**
- * Used to log the cost of every request which is send
+ * Used to log the cost of every request which is sent
*/
private final GoogleCloudCostLogger costLogger;

/**
* Create a new RequestApiStreamObserverManager, which will try
- * to mimic a streaming session of indefinite lenth
+ * to mimic a streaming session of indefinite length
*
* @param client the SpeechClient with which to open new sessions
* @param debugName extra text which will be added to logs
@@ -686,7 +673,7 @@ private ApiStreamObserver<StreamingRecognizeRequest> createObserver(
RecognitionConfig config)
{
// Each observer gets its own responseObserver to be able to
-// to get an unique ID
+// get a unique ID
ResponseApiStreamingObserver<StreamingRecognizeResponse>
responseObserver =
new ResponseApiStreamingObserver<StreamingRecognizeResponse>(
@@ -700,8 +687,6 @@ private ApiStreamObserver<StreamingRecognizeRequest> createObserver(
StreamingRecognitionConfig.newBuilder()
.setConfig(config)
.setInterimResults(RETRIEVE_INTERIM_RESULTS)
-    .setSingleUtterance(!useVideoModel &&
-        SINGLE_UTTERANCE_ONLY)
.build();

// StreamingCallable manages sending the audio and receiving
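With the single-utterance flag gone, the streaming configuration carries only the recognition config and the interim-results setting. Roughly, the first request on each stream carries that configuration and every later request carries audio, as in the sketch below (an illustration using the com.google.cloud.speech.v1 request types, not the actual observer code).

    import com.google.cloud.speech.v1.RecognitionConfig;
    import com.google.cloud.speech.v1.StreamingRecognitionConfig;
    import com.google.cloud.speech.v1.StreamingRecognizeRequest;
    import com.google.protobuf.ByteString;

    final class StreamingRequestSketch
    {
        // The first request only configures the stream; no single-utterance toggle anymore.
        static StreamingRecognizeRequest firstRequest(RecognitionConfig config)
        {
            StreamingRecognitionConfig streamingConfig =
                StreamingRecognitionConfig.newBuilder()
                    .setConfig(config)
                    .setInterimResults(true) // RETRIEVE_INTERIM_RESULTS in the real code
                    .build();
            return StreamingRecognizeRequest.newBuilder()
                .setStreamingConfig(streamingConfig)
                .build();
        }

        // Every following request carries a chunk of raw audio.
        static StreamingRecognizeRequest audioRequest(byte[] audioChunk)
        {
            return StreamingRecognizeRequest.newBuilder()
                .setAudioContent(ByteString.copyFrom(audioChunk))
                .build();
        }
    }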
@@ -922,16 +907,12 @@ public void onNext(StreamingRecognizeResponse message)
if (logger.isDebugEnabled())
logger.debug(
debugName + ": received error from StreamingRecognizeResponse: "
+ message.getError().getMessage());
+ message.getError().getMessage());
requestManager.terminateCurrentSession();
return;
}

-// This will happen when SINGLE_UTTERANCE is set to true
-// and the server has detected the end of the user's speech
-// utterance.
-if (isEndOfSingleUtteranceMessage(message) ||
-    message.getResultsCount() == 0)
+if (message.getResultsCount() == 0)
{
if (logger.isDebugEnabled())
logger.debug(
@@ -944,14 +925,14 @@ public void onNext(StreamingRecognizeResponse message)
List<StreamingRecognitionResult> results = message.getResultsList();
StreamingRecognitionResult result = results.get(0);

-// If empty, the session has reached it's time limit and
+// If empty, the session has reached its time limit and
// nothing new was said, but there should be an error in the message
// so this is never supposed to happen
if (result.getAlternativesList().isEmpty())
{
logger.warn(
debugName + ": received a list of alternatives which"
+ " was empty");
+ " was empty");
requestManager.terminateCurrentSession();
return;
}
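Since END_OF_SINGLE_UTTERANCE events no longer need special handling, the checks in onNext reduce to: propagate errors, ignore responses with no results, and skip results whose alternatives list is empty. A condensed sketch of that flow follows (a simplification for illustration; the real method also logs and terminates the session in the error paths).

    import com.google.cloud.speech.v1.StreamingRecognitionResult;
    import com.google.cloud.speech.v1.StreamingRecognizeResponse;

    final class ResponseHandlingSketch
    {
        // Returns the first usable result, or null when there is nothing to transcribe yet.
        static StreamingRecognitionResult firstUsableResult(StreamingRecognizeResponse message)
        {
            if (message.getResultsCount() == 0)
            {
                return null; // nothing new was recognised; keep the session open
            }
            StreamingRecognitionResult result = message.getResultsList().get(0);
            if (result.getAlternativesList().isEmpty())
            {
                return null; // unexpected; the real code warns and terminates the session
            }
            return result;
        }
    }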
@@ -981,28 +962,9 @@ else if (logger.isDebugEnabled())
}
}

-/**
- * Get whether a {@link StreamingRecognizeResponse} has an
- * {@link StreamingRecognizeResponse#speechEventType_} of
- * {@link StreamingRecognizeResponse.SpeechEventType#
- * END_OF_SINGLE_UTTERANCE}
- *
- * @param message the message to check
- * @return true if the message has the eventType
- * {@link StreamingRecognizeResponse.SpeechEventType
- * #END_OF_SINGLE_UTTERANCE}, false otherwise
- */
-private boolean isEndOfSingleUtteranceMessage(
-    StreamingRecognizeResponse message)
-{
-    return message.getSpeechEventType().
-        equals(StreamingRecognizeResponse.SpeechEventType.
-            END_OF_SINGLE_UTTERANCE);
-}
-
/**
* Handle a single {@link StreamingRecognitionResult} by creating
- * a {@link TranscriptionResult} based on the result and notifying all
+ * a {@link TranscriptionResult} based on the result and notifying
* all registered {@link TranscriptionListener}s
*
* @param result the result to handle