diff --git a/speech/beta/pom.xml b/speech/beta/pom.xml index d2dcacf27e2..3eaafe59edb 100644 --- a/speech/beta/pom.xml +++ b/speech/beta/pom.xml @@ -40,7 +40,7 @@ com.google.cloud google-cloud-speech - 0.56.0-beta + 1.20.0 diff --git a/speech/beta/src/main/java/com/example/speech/Recognize.java b/speech/beta/src/main/java/com/example/speech/Recognize.java index 7c8aaccd74a..8f0d1cd7059 100644 --- a/speech/beta/src/main/java/com/example/speech/Recognize.java +++ b/speech/beta/src/main/java/com/example/speech/Recognize.java @@ -27,16 +27,18 @@ import com.google.cloud.speech.v1p1beta1.RecognitionMetadata.MicrophoneDistance; import com.google.cloud.speech.v1p1beta1.RecognitionMetadata.RecordingDeviceType; import com.google.cloud.speech.v1p1beta1.RecognizeResponse; +import com.google.cloud.speech.v1p1beta1.SpeakerDiarizationConfig; import com.google.cloud.speech.v1p1beta1.SpeechClient; + import com.google.cloud.speech.v1p1beta1.SpeechRecognitionAlternative; import com.google.cloud.speech.v1p1beta1.SpeechRecognitionResult; +import com.google.cloud.speech.v1p1beta1.WordInfo; import com.google.protobuf.ByteString; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.util.ArrayList; -import java.util.List; public class Recognize { @@ -154,32 +156,52 @@ public static void transcribeDiarization(String fileName) throws Exception { RecognitionAudio recognitionAudio = RecognitionAudio.newBuilder().setContent(ByteString.copyFrom(content)).build(); + SpeakerDiarizationConfig speakerDiarizationConfig = SpeakerDiarizationConfig.newBuilder() + .setEnableSpeakerDiarization(true) + .setMinSpeakerCount(2) + .setMaxSpeakerCount(2) + .build(); + // Configure request to enable Speaker diarization - RecognitionConfig config = - RecognitionConfig.newBuilder() + RecognitionConfig config = RecognitionConfig.newBuilder() .setEncoding(AudioEncoding.LINEAR16) .setLanguageCode("en-US") .setSampleRateHertz(8000) - .setEnableSpeakerDiarization(true) - .setDiarizationSpeakerCount(2) + .setDiarizationConfig(speakerDiarizationConfig) .build(); // Perform the transcription request RecognizeResponse recognizeResponse = speechClient.recognize(config, recognitionAudio); - // Print out the results - for (SpeechRecognitionResult result : recognizeResponse.getResultsList()) { - // There can be several alternative transcripts for a given chunk of speech. Just - // use the first (most likely) one here. - SpeechRecognitionAlternative alternative = result.getAlternatives(0); - System.out.format("Transcript : %s\n", alternative.getTranscript()); - // The words array contains the entire transcript up until that point. - // Referencing the last spoken word to get the associated Speaker tag - System.out.format( - "Speaker Tag %s: %s\n", - alternative.getWords((alternative.getWordsCount() - 1)).getSpeakerTag(), - alternative.getTranscript()); + // Speaker Tags are only included in the last result object, which has only one alternative. + SpeechRecognitionAlternative alternative = + recognizeResponse.getResults( + recognizeResponse.getResultsCount() - 1).getAlternatives(0); + + // The alternative is made up of WordInfo objects that contain the speaker_tag. + WordInfo wordInfo = alternative.getWords(0); + int currentSpeakerTag = wordInfo.getSpeakerTag(); + + // For each word, get all the words associated with one speaker, once the speaker changes, + // add a new line with the new speaker and their spoken words. + StringBuilder speakerWords = new StringBuilder( + String.format("Speaker %d: %s", wordInfo.getSpeakerTag(), wordInfo.getWord())); + + for (int i = 1; i < alternative.getWordsCount(); i++) { + wordInfo = alternative.getWords(i); + if (currentSpeakerTag == wordInfo.getSpeakerTag()) { + speakerWords.append(" "); + speakerWords.append(wordInfo.getWord()); + } else { + speakerWords.append( + String.format("\nSpeaker %d: %s", + wordInfo.getSpeakerTag(), + wordInfo.getWord())); + currentSpeakerTag = wordInfo.getSpeakerTag(); + } } + + System.out.println(speakerWords.toString()); } } // [END speech_transcribe_diarization_beta] @@ -192,14 +214,19 @@ public static void transcribeDiarization(String fileName) throws Exception { */ public static void transcribeDiarizationGcs(String gcsUri) throws Exception { try (SpeechClient speechClient = SpeechClient.create()) { + SpeakerDiarizationConfig speakerDiarizationConfig = SpeakerDiarizationConfig.newBuilder() + .setEnableSpeakerDiarization(true) + .setMinSpeakerCount(2) + .setMaxSpeakerCount(2) + .build(); + // Configure request to enable Speaker diarization RecognitionConfig config = RecognitionConfig.newBuilder() .setEncoding(AudioEncoding.LINEAR16) .setLanguageCode("en-US") .setSampleRateHertz(8000) - .setEnableSpeakerDiarization(true) - .setDiarizationSpeakerCount(2) + .setDiarizationConfig(speakerDiarizationConfig) .build(); // Set the remote path for the audio file @@ -214,17 +241,37 @@ public static void transcribeDiarizationGcs(String gcsUri) throws Exception { Thread.sleep(10000); } - for (SpeechRecognitionResult result : response.get().getResultsList()) { - // There can be several alternative transcripts for a given chunk of speech. Just - // use the first (most likely) one here. - SpeechRecognitionAlternative alternative = result.getAlternatives(0); - // The words array contains the entire transcript up until that point. - // Referencing the last spoken word to get the associated Speaker tag - System.out.format( - "Speaker Tag %s:%s\n", - alternative.getWords((alternative.getWordsCount() - 1)).getSpeakerTag(), - alternative.getTranscript()); + // Speaker Tags are only included in the last result object, which has only one alternative. + LongRunningRecognizeResponse longRunningRecognizeResponse = response.get(); + SpeechRecognitionAlternative alternative = + longRunningRecognizeResponse.getResults( + longRunningRecognizeResponse.getResultsCount() - 1) + .getAlternatives(0); + + // The alternative is made up of WordInfo objects that contain the speaker_tag. + WordInfo wordInfo = alternative.getWords(0); + int currentSpeakerTag = wordInfo.getSpeakerTag(); + + // For each word, get all the words associated with one speaker, once the speaker changes, + // add a new line with the new speaker and their spoken words. + StringBuilder speakerWords = new StringBuilder( + String.format("Speaker %d: %s", wordInfo.getSpeakerTag(), wordInfo.getWord())); + + for (int i = 1; i < alternative.getWordsCount(); i++) { + wordInfo = alternative.getWords(i); + if (currentSpeakerTag == wordInfo.getSpeakerTag()) { + speakerWords.append(" "); + speakerWords.append(wordInfo.getWord()); + } else { + speakerWords.append( + String.format("\nSpeaker %d: %s", + wordInfo.getSpeakerTag(), + wordInfo.getWord())); + currentSpeakerTag = wordInfo.getSpeakerTag(); + } } + + System.out.println(speakerWords.toString()); } } // [END speech_transcribe_diarization_gcs_beta] @@ -454,7 +501,7 @@ public static void transcribeWordLevelConfidenceGcs(String gcsUri) throws Except RecognitionConfig config = RecognitionConfig.newBuilder() .setEncoding(AudioEncoding.FLAC) - .setSampleRateHertz(16000) + .setSampleRateHertz(44100) .setLanguageCode("en-US") .setEnableWordConfidence(true) .build(); diff --git a/speech/beta/src/test/java/com/example/speech/RecognizeIT.java b/speech/beta/src/test/java/com/example/speech/RecognizeIT.java index 5219b58dc79..4d7acf747e0 100644 --- a/speech/beta/src/test/java/com/example/speech/RecognizeIT.java +++ b/speech/beta/src/test/java/com/example/speech/RecognizeIT.java @@ -30,7 +30,7 @@ @RunWith(JUnit4.class) @SuppressWarnings("checkstyle:abbreviationaswordinname") public class RecognizeIT { - private static final String BUCKET = "cloud-samples-tests"; + private static final String BUCKET = "cloud-samples-data"; private ByteArrayOutputStream bout; private PrintStream out; @@ -39,7 +39,7 @@ public class RecognizeIT { private String audioFileName = "./resources/audio.raw"; private String multiChannelAudioFileName = "./resources/commercial_stereo.wav"; private String gcsMultiChannelAudioPath = "gs://" + BUCKET + "/speech/commercial_stereo.wav"; - private String gcsAudioPath = "gs://" + BUCKET + "/speech/brooklyn.flac"; + private String gcsAudioPath = "gs://" + BUCKET + "/speech/brooklyn_bridge.flac"; private String gcsDiarizationAudioPath = "gs://" + BUCKET + "/speech/commercial_mono.wav"; // The path to the video file to transcribe @@ -71,14 +71,16 @@ public void testMetadata() throws Exception { public void testTranscribeDiarization() throws Exception { Recognize.transcribeDiarization(recognitionAudioFile); String got = bout.toString(); - assertThat(got).contains("Speaker Tag 2:"); + assertThat(got).contains("Speaker 1: I'm here"); + assertThat(got).contains("Speaker 2: hi I'd like to buy a Chrome Cast"); } @Test public void testTranscribeDiarizationGcs() throws Exception { Recognize.transcribeDiarizationGcs(gcsDiarizationAudioPath); String got = bout.toString(); - assertThat(got).contains("Speaker Tag 2:"); + assertThat(got).contains("Speaker 1: I'm here"); + assertThat(got).contains("Speaker 2: hi I'd like to buy a Chrome Cast"); } @Test