Due to API backend changes, update the samples to match #1595

Merged — 1 commit, Oct 1, 2019
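This PR tracks a backend change in the Speech-to-Text v1p1beta1 surface: speaker-diarization settings move off RecognitionConfig into a dedicated SpeakerDiarizationConfig message, the google-cloud-speech client is bumped from 0.56.0-beta to 1.20.0, and the samples and tests are updated to print one line per speaker. A minimal before/after sketch of the configuration change, reconstructed from the diff below (the 8 kHz LINEAR16 settings are simply the ones the samples use, and the imports are the com.google.cloud.speech.v1p1beta1 classes already shown in Recognize.java):

    // Before: diarization flags set directly on RecognitionConfig
    RecognitionConfig oldConfig =
        RecognitionConfig.newBuilder()
            .setEncoding(AudioEncoding.LINEAR16)
            .setLanguageCode("en-US")
            .setSampleRateHertz(8000)
            .setEnableSpeakerDiarization(true)
            .setDiarizationSpeakerCount(2)
            .build();

    // After: diarization settings carried by a SpeakerDiarizationConfig message
    SpeakerDiarizationConfig diarizationConfig =
        SpeakerDiarizationConfig.newBuilder()
            .setEnableSpeakerDiarization(true)
            .setMinSpeakerCount(2)
            .setMaxSpeakerCount(2)
            .build();
    RecognitionConfig newConfig =
        RecognitionConfig.newBuilder()
            .setEncoding(AudioEncoding.LINEAR16)
            .setLanguageCode("en-US")
            .setSampleRateHertz(8000)
            .setDiarizationConfig(diarizationConfig)
            .build();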
2 changes: 1 addition & 1 deletion speech/beta/pom.xml
@@ -40,7 +40,7 @@
<dependency>
<groupId>com.google.cloud</groupId>
<artifactId>google-cloud-speech</artifactId>
<version>0.56.0-beta</version>
<version>1.20.0</version>
</dependency>
<!-- [END speech_quickstart_dependencies] -->

107 changes: 77 additions & 30 deletions speech/beta/src/main/java/com/example/speech/Recognize.java
@@ -27,16 +27,18 @@
import com.google.cloud.speech.v1p1beta1.RecognitionMetadata.MicrophoneDistance;
import com.google.cloud.speech.v1p1beta1.RecognitionMetadata.RecordingDeviceType;
import com.google.cloud.speech.v1p1beta1.RecognizeResponse;
import com.google.cloud.speech.v1p1beta1.SpeakerDiarizationConfig;
import com.google.cloud.speech.v1p1beta1.SpeechClient;

import com.google.cloud.speech.v1p1beta1.SpeechRecognitionAlternative;
import com.google.cloud.speech.v1p1beta1.SpeechRecognitionResult;
import com.google.cloud.speech.v1p1beta1.WordInfo;
import com.google.protobuf.ByteString;

import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;

public class Recognize {

@@ -154,32 +156,52 @@ public static void transcribeDiarization(String fileName) throws Exception {
RecognitionAudio recognitionAudio =
RecognitionAudio.newBuilder().setContent(ByteString.copyFrom(content)).build();

SpeakerDiarizationConfig speakerDiarizationConfig = SpeakerDiarizationConfig.newBuilder()
.setEnableSpeakerDiarization(true)
.setMinSpeakerCount(2)
.setMaxSpeakerCount(2)
.build();

// Configure request to enable Speaker diarization
RecognitionConfig config =
RecognitionConfig.newBuilder()
RecognitionConfig config = RecognitionConfig.newBuilder()
.setEncoding(AudioEncoding.LINEAR16)
.setLanguageCode("en-US")
.setSampleRateHertz(8000)
.setEnableSpeakerDiarization(true)
.setDiarizationSpeakerCount(2)
.setDiarizationConfig(speakerDiarizationConfig)
.build();

// Perform the transcription request
RecognizeResponse recognizeResponse = speechClient.recognize(config, recognitionAudio);

// Print out the results
for (SpeechRecognitionResult result : recognizeResponse.getResultsList()) {
// There can be several alternative transcripts for a given chunk of speech. Just
// use the first (most likely) one here.
SpeechRecognitionAlternative alternative = result.getAlternatives(0);
System.out.format("Transcript : %s\n", alternative.getTranscript());
// The words array contains the entire transcript up until that point.
// Referencing the last spoken word to get the associated Speaker tag
System.out.format(
"Speaker Tag %s: %s\n",
alternative.getWords((alternative.getWordsCount() - 1)).getSpeakerTag(),
alternative.getTranscript());
// Speaker Tags are only included in the last result object, which has only one alternative.
SpeechRecognitionAlternative alternative =
recognizeResponse.getResults(
recognizeResponse.getResultsCount() - 1).getAlternatives(0);

// The alternative is made up of WordInfo objects that contain the speaker_tag.
WordInfo wordInfo = alternative.getWords(0);
int currentSpeakerTag = wordInfo.getSpeakerTag();

// Group consecutive words spoken by the same speaker; when the speaker changes,
// start a new line with the new speaker tag and their words.
StringBuilder speakerWords = new StringBuilder(
String.format("Speaker %d: %s", wordInfo.getSpeakerTag(), wordInfo.getWord()));

for (int i = 1; i < alternative.getWordsCount(); i++) {
wordInfo = alternative.getWords(i);
if (currentSpeakerTag == wordInfo.getSpeakerTag()) {
speakerWords.append(" ");
speakerWords.append(wordInfo.getWord());
} else {
speakerWords.append(
String.format("\nSpeaker %d: %s",
wordInfo.getSpeakerTag(),
wordInfo.getWord()));
currentSpeakerTag = wordInfo.getSpeakerTag();
}
}

System.out.println(speakerWords.toString());
}
}
// [END speech_transcribe_diarization_beta]
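For reference, a hypothetical caller of the updated local-file sample; the path below is a placeholder for any mono 8 kHz LINEAR16 recording with two speakers, and the printed output takes the per-speaker form the new code builds:

    // Not part of this PR: illustrative invocation only.
    public static void main(String[] args) throws Exception {
        // Placeholder path; substitute a local two-speaker recording.
        Recognize.transcribeDiarization("./resources/commercial_mono.wav");
        // Expected output shape:
        //   Speaker 1: ...
        //   Speaker 2: ...
    }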
@@ -192,14 +214,19 @@ public static void transcribeDiarization(String fileName) throws Exception {
*/
public static void transcribeDiarizationGcs(String gcsUri) throws Exception {
try (SpeechClient speechClient = SpeechClient.create()) {
SpeakerDiarizationConfig speakerDiarizationConfig = SpeakerDiarizationConfig.newBuilder()
.setEnableSpeakerDiarization(true)
.setMinSpeakerCount(2)
.setMaxSpeakerCount(2)
.build();

// Configure request to enable Speaker diarization
RecognitionConfig config =
RecognitionConfig.newBuilder()
.setEncoding(AudioEncoding.LINEAR16)
.setLanguageCode("en-US")
.setSampleRateHertz(8000)
.setEnableSpeakerDiarization(true)
.setDiarizationSpeakerCount(2)
.setDiarizationConfig(speakerDiarizationConfig)
.build();

// Set the remote path for the audio file
@@ -214,17 +241,37 @@ public static void transcribeDiarizationGcs(String gcsUri) throws Exception {
Thread.sleep(10000);
}

for (SpeechRecognitionResult result : response.get().getResultsList()) {
// There can be several alternative transcripts for a given chunk of speech. Just
// use the first (most likely) one here.
SpeechRecognitionAlternative alternative = result.getAlternatives(0);
// The words array contains the entire transcript up until that point.
// Referencing the last spoken word to get the associated Speaker tag
System.out.format(
"Speaker Tag %s:%s\n",
alternative.getWords((alternative.getWordsCount() - 1)).getSpeakerTag(),
alternative.getTranscript());
// Speaker Tags are only included in the last result object, which has only one alternative.
LongRunningRecognizeResponse longRunningRecognizeResponse = response.get();
SpeechRecognitionAlternative alternative =
longRunningRecognizeResponse.getResults(
longRunningRecognizeResponse.getResultsCount() - 1)
.getAlternatives(0);

// The alternative is made up of WordInfo objects that contain the speaker_tag.
WordInfo wordInfo = alternative.getWords(0);
int currentSpeakerTag = wordInfo.getSpeakerTag();

// Group consecutive words spoken by the same speaker; when the speaker changes,
// start a new line with the new speaker tag and their words.
StringBuilder speakerWords = new StringBuilder(
String.format("Speaker %d: %s", wordInfo.getSpeakerTag(), wordInfo.getWord()));

for (int i = 1; i < alternative.getWordsCount(); i++) {
wordInfo = alternative.getWords(i);
if (currentSpeakerTag == wordInfo.getSpeakerTag()) {
speakerWords.append(" ");
speakerWords.append(wordInfo.getWord());
} else {
speakerWords.append(
String.format("\nSpeaker %d: %s",
wordInfo.getSpeakerTag(),
wordInfo.getWord()));
currentSpeakerTag = wordInfo.getSpeakerTag();
}
}

System.out.println(speakerWords.toString());
}
}
// [END speech_transcribe_diarization_gcs_beta]
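Similarly for the GCS variant, using the Cloud Storage object the integration test below points at; the wrapping main method is illustrative, not part of this change:

    public static void main(String[] args) throws Exception {
        // Object path taken from RecognizeIT (bucket cloud-samples-data).
        Recognize.transcribeDiarizationGcs("gs://cloud-samples-data/speech/commercial_mono.wav");
    }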
@@ -454,7 +501,7 @@ public static void transcribeWordLevelConfidenceGcs(String gcsUri) throws Exception {
RecognitionConfig config =
RecognitionConfig.newBuilder()
.setEncoding(AudioEncoding.FLAC)
.setSampleRateHertz(16000)
.setSampleRateHertz(44100)
.setLanguageCode("en-US")
.setEnableWordConfidence(true)
.build();
10 changes: 6 additions & 4 deletions speech/beta/src/test/java/com/example/speech/RecognizeIT.java
@@ -30,7 +30,7 @@
@RunWith(JUnit4.class)
@SuppressWarnings("checkstyle:abbreviationaswordinname")
public class RecognizeIT {
private static final String BUCKET = "cloud-samples-tests";
private static final String BUCKET = "cloud-samples-data";

private ByteArrayOutputStream bout;
private PrintStream out;
@@ -39,7 +39,7 @@ public class RecognizeIT {
private String audioFileName = "./resources/audio.raw";
private String multiChannelAudioFileName = "./resources/commercial_stereo.wav";
private String gcsMultiChannelAudioPath = "gs://" + BUCKET + "/speech/commercial_stereo.wav";
private String gcsAudioPath = "gs://" + BUCKET + "/speech/brooklyn.flac";
private String gcsAudioPath = "gs://" + BUCKET + "/speech/brooklyn_bridge.flac";
private String gcsDiarizationAudioPath = "gs://" + BUCKET + "/speech/commercial_mono.wav";

// The path to the video file to transcribe
@@ -71,14 +71,16 @@ public void testMetadata() throws Exception {
public void testTranscribeDiarization() throws Exception {
Recognize.transcribeDiarization(recognitionAudioFile);
String got = bout.toString();
assertThat(got).contains("Speaker Tag 2:");
assertThat(got).contains("Speaker 1: I'm here");
assertThat(got).contains("Speaker 2: hi I'd like to buy a Chrome Cast");
Contributor review comment: I'd bet at some point Chromecast will be a single word.

Contributor review comment: The odds are this is too brittle a test, but I'll approve it.

}
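Given the reviewer's note above that the exact transcript makes this assertion brittle, a looser hypothetical variant (not part of this PR) would key only on the diarization labels:

    assertThat(got).contains("Speaker 1:");
    assertThat(got).contains("Speaker 2:");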

@Test
public void testTranscribeDiarizationGcs() throws Exception {
Recognize.transcribeDiarizationGcs(gcsDiarizationAudioPath);
String got = bout.toString();
assertThat(got).contains("Speaker Tag 2:");
assertThat(got).contains("Speaker 1: I'm here");
assertThat(got).contains("Speaker 2: hi I'd like to buy a Chrome Cast");
}

@Test