From dd52b3a15e4c0742961fbc7a2623b1098dfbb728 Mon Sep 17 00:00:00 2001
From: Kristin Grace Galvin
Date: Thu, 16 May 2019 11:56:35 -0700
Subject: [PATCH] samples: Updated Infinite streaming sample (#1422)

* adding result end time and color-coded result output

* updated infinite streaming sample to use result_end_time

* lint fixes

* more comments and formatting updates

* Fix speech test.

* More test adjustments.
---
 .../speech/InfiniteStreamRecognize.java      | 151 +++++++++++++++---
 .../java/com/example/speech/Recognize.java   |   1 -
 .../java/com/example/speech/RecognizeIT.java |  12 +-
 3 files changed, 137 insertions(+), 27 deletions(-)

diff --git a/speech/snippets/src/main/java/com/example/speech/InfiniteStreamRecognize.java b/speech/snippets/src/main/java/com/example/speech/InfiniteStreamRecognize.java
index 7844e23af5e..c281c298da9 100644
--- a/speech/snippets/src/main/java/com/example/speech/InfiniteStreamRecognize.java
+++ b/speech/snippets/src/main/java/com/example/speech/InfiniteStreamRecognize.java
@@ -20,14 +20,17 @@
 import com.google.api.gax.rpc.ClientStream;
 import com.google.api.gax.rpc.ResponseObserver;
 import com.google.api.gax.rpc.StreamController;
-import com.google.cloud.speech.v1.RecognitionConfig;
-import com.google.cloud.speech.v1.SpeechClient;
-import com.google.cloud.speech.v1.SpeechRecognitionAlternative;
-import com.google.cloud.speech.v1.StreamingRecognitionConfig;
-import com.google.cloud.speech.v1.StreamingRecognitionResult;
-import com.google.cloud.speech.v1.StreamingRecognizeRequest;
-import com.google.cloud.speech.v1.StreamingRecognizeResponse;
+import com.google.cloud.speech.v1p1beta1.RecognitionConfig;
+import com.google.cloud.speech.v1p1beta1.SpeechClient;
+import com.google.cloud.speech.v1p1beta1.SpeechRecognitionAlternative;
+import com.google.cloud.speech.v1p1beta1.StreamingRecognitionConfig;
+import com.google.cloud.speech.v1p1beta1.StreamingRecognitionResult;
+import com.google.cloud.speech.v1p1beta1.StreamingRecognizeRequest;
+import com.google.cloud.speech.v1p1beta1.StreamingRecognizeResponse;
 import com.google.protobuf.ByteString;
+import com.google.protobuf.Duration;
+import java.lang.Math;
+import java.text.DecimalFormat;
 import java.util.ArrayList;
 import java.util.concurrent.BlockingQueue;
 import java.util.concurrent.LinkedBlockingQueue;
@@ -39,11 +42,29 @@ public class InfiniteStreamRecognize {
 
+  private static final int STREAMING_LIMIT = 10000; // 10 seconds
+
+  public static final String RED = "\033[0;31m";
+  public static final String GREEN = "\033[0;32m";
+  public static final String YELLOW = "\033[0;33m";
+
   // Creating shared object
   private static volatile BlockingQueue<byte[]> sharedQueue = new LinkedBlockingQueue();
   private static TargetDataLine targetDataLine;
   private static int BYTES_PER_BUFFER = 6400; // buffer size in bytes
 
+  private static int restartCounter = 0;
+  private static ArrayList<ByteString> audioInput = new ArrayList<ByteString>();
+  private static ArrayList<ByteString> lastAudioInput = new ArrayList<ByteString>();
+  private static int resultEndTimeInMS = 0;
+  private static int isFinalEndTime = 0;
+  private static int finalRequestEndTime = 0;
+  private static boolean newStream = true;
+  private static double bridgingOffset = 0;
+  private static boolean lastTranscriptWasFinal = false;
+  private static StreamController referenceToStreamController;
+  private static ByteString tempByteString;
+
   public static void main(String... args) {
     try {
       infiniteStreamingRecognize();
@@ -60,6 +81,7 @@ class MicBuffer implements Runnable {
 
       @Override
       public void run() {
+        System.out.println(YELLOW);
         System.out.println("Start speaking...Press Ctrl-C to stop");
         targetDataLine.start();
         byte[] data = new byte[BYTES_PER_BUFFER];
@@ -88,24 +110,48 @@ public void run() {
 
             ArrayList<StreamingRecognizeResponse> responses = new ArrayList<>();
 
-            public void onStart(StreamController controller) {}
+            public void onStart(StreamController controller) {
+              referenceToStreamController = controller;
+            }
 
             public void onResponse(StreamingRecognizeResponse response) {
+
               responses.add(response);
+
               StreamingRecognitionResult result = response.getResultsList().get(0);
-              // There can be several alternative transcripts for a given chunk of speech. Just
-              // use the first (most likely) one here.
+
+              Duration resultEndTime = result.getResultEndTime();
+
+              resultEndTimeInMS = (int) ((resultEndTime.getSeconds() * 1000)
+                  + (resultEndTime.getNanos() / 1000000));
+
+              double correctedTime = resultEndTimeInMS - bridgingOffset
+                  + (STREAMING_LIMIT * restartCounter);
+              DecimalFormat format = new DecimalFormat("0.#");
+
               SpeechRecognitionAlternative alternative = result.getAlternativesList().get(0);
-              System.out.printf("Transcript : %s\n", alternative.getTranscript());
-            }
+              if (result.getIsFinal()) {
+                System.out.print(GREEN);
+                System.out.print("\033[2K\r");
+                System.out.printf("%s: %s\n", format.format(correctedTime),
+                    alternative.getTranscript());
 
-            public void onComplete() {
-              System.out.println("Done");
-            }
+                isFinalEndTime = resultEndTimeInMS;
+                lastTranscriptWasFinal = true;
+              } else {
+                System.out.print(RED);
+                System.out.print("\033[2K\r");
+                System.out.printf("%s: %s", format.format(correctedTime),
+                    alternative.getTranscript());
 
-            public void onError(Throwable t) {
-              System.out.println(t);
+                lastTranscriptWasFinal = false;
+              }
             }
+
+            public void onComplete() {}
+
+            public void onError(Throwable t) {}
+
           };
 
       clientStream = client.streamingRecognizeCallable().splitCall(responseObserver);
@@ -116,8 +162,12 @@ public void onError(Throwable t) {
               .setLanguageCode("en-US")
               .setSampleRateHertz(16000)
               .build();
+
       StreamingRecognitionConfig streamingRecognitionConfig =
-          StreamingRecognitionConfig.newBuilder().setConfig(recognitionConfig).build();
+          StreamingRecognitionConfig.newBuilder()
+              .setConfig(recognitionConfig)
+              .setInterimResults(true)
+              .build();
 
       StreamingRecognizeRequest request =
           StreamingRecognizeRequest.newBuilder()
@@ -151,9 +201,28 @@ public void onError(Throwable t) {
 
         long estimatedTime = System.currentTimeMillis() - startTime;
 
-        if (estimatedTime >= 55000) {
+        if (estimatedTime >= STREAMING_LIMIT) {
           clientStream.closeSend();
 
+          referenceToStreamController.cancel(); // remove Observer
+
+          if (resultEndTimeInMS > 0) {
+            finalRequestEndTime = isFinalEndTime;
+          }
+          resultEndTimeInMS = 0;
+
+          lastAudioInput = null;
+          lastAudioInput = audioInput;
+          audioInput = new ArrayList<ByteString>();
+
+          restartCounter++;
+
+          if (!lastTranscriptWasFinal) {
+            System.out.print('\n');
+          }
+
+          newStream = true;
+
           clientStream = client.streamingRecognizeCallable().splitCall(responseObserver);
 
           request =
@@ -161,13 +230,55 @@ public void onError(Throwable t) {
                   .setStreamingConfig(streamingRecognitionConfig)
                   .build();
 
+          System.out.println(YELLOW);
+          System.out.printf("%d: RESTARTING REQUEST\n", restartCounter * STREAMING_LIMIT);
+
           startTime = System.currentTimeMillis();
 
         } else {
+
+          if ((newStream) && (lastAudioInput.size() > 0)) {
+            // if this is the first audio from a new request
+            // calculate amount of unfinalized audio from last request
+            // resend the audio to the speech client before incoming audio
+            double chunkTime = STREAMING_LIMIT / lastAudioInput.size();
+            // ms length of each chunk in previous request audio arrayList
+            if (chunkTime != 0) {
+              if (bridgingOffset < 0) {
+                // bridging Offset accounts for time of resent audio
+                // calculated from last request
+                bridgingOffset = 0;
+              }
+              if (bridgingOffset > finalRequestEndTime) {
+                bridgingOffset = finalRequestEndTime;
+              }
+              int chunksFromMS = (int) Math.floor((finalRequestEndTime
+                  - bridgingOffset) / chunkTime);
+              // chunks from MS is number of chunks to resend
+              bridgingOffset = (int) Math.floor((lastAudioInput.size()
+                  - chunksFromMS) * chunkTime);
+              // set bridging offset for next request
+              for (int i = chunksFromMS; i < lastAudioInput.size(); i++) {
+
+                request =
+                    StreamingRecognizeRequest.newBuilder()
+                        .setAudioContent(lastAudioInput.get(i))
+                        .build();
+                clientStream.send(request);
+              }
+            }
+            newStream = false;
+          }
+
+          tempByteString = ByteString.copyFrom(sharedQueue.take());
+
           request =
               StreamingRecognizeRequest.newBuilder()
-                  .setAudioContent(ByteString.copyFrom(sharedQueue.take()))
+                  .setAudioContent(tempByteString)
                   .build();
+
+          audioInput.add(tempByteString);
+
         }
 
         clientStream.send(request);
diff --git a/speech/snippets/src/main/java/com/example/speech/Recognize.java b/speech/snippets/src/main/java/com/example/speech/Recognize.java
index d8845d499be..0797e8211d0 100644
--- a/speech/snippets/src/main/java/com/example/speech/Recognize.java
+++ b/speech/snippets/src/main/java/com/example/speech/Recognize.java
@@ -699,7 +699,6 @@ public void onError(Throwable t) {
           System.out.println("Stop speaking.");
           targetDataLine.stop();
           targetDataLine.close();
-          break;
         }
         request =
             StreamingRecognizeRequest.newBuilder()
diff --git a/speech/snippets/src/test/java/com/example/speech/RecognizeIT.java b/speech/snippets/src/test/java/com/example/speech/RecognizeIT.java
index 2eef6f05807..0aa8db5d2ca 100644
--- a/speech/snippets/src/test/java/com/example/speech/RecognizeIT.java
+++ b/speech/snippets/src/test/java/com/example/speech/RecognizeIT.java
@@ -113,22 +113,22 @@ public void testStreamRecognize() throws Exception {
   @Test
   public void testAutoPunctuation() throws Exception {
     Recognize.transcribeFileWithAutomaticPunctuation(audioFileName);
-    String got = bout.toString();
-    assertThat(got).contains("How old is the Brooklyn Bridge?");
+    String got = bout.toString().toLowerCase();
+    assertThat(got).contains("how old is the brooklyn bridge");
   }
 
   @Test
   public void testGcsAutoPunctuation() throws Exception {
     Recognize.transcribeGcsWithAutomaticPunctuation(gcsAudioPath);
-    String got = bout.toString();
-    assertThat(got).contains("How old is the Brooklyn Bridge?");
+    String got = bout.toString().toLowerCase();
+    assertThat(got).contains("how old is the brooklyn bridge");
  }
 
   @Test
   public void testStreamAutoPunctuation() throws Exception {
     Recognize.streamingTranscribeWithAutomaticPunctuation(audioFileName);
-    String got = bout.toString();
-    assertThat(got).contains("How old is the Brooklyn Bridge?");
+    String got = bout.toString().toLowerCase();
+    assertThat(got).contains("how old is the brooklyn bridge");
   }
 
   @Test