Due to API backend changes, update the samples to match #1595

Merged — 1 commit, Oct 1, 2019
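This PR tracks a backend change in the Speech-to-Text v1p1beta1 surface: speaker-diarization settings move off RecognitionConfig into a dedicated SpeakerDiarizationConfig message, the google-cloud-speech client is bumped from 0.56.0-beta to 1.20.0, and the samples and tests are updated to print one line per speaker. A minimal before/after sketch of the configuration change, reconstructed from the diff below (the 8 kHz LINEAR16 settings are simply the ones the samples use, and the imports are the com.google.cloud.speech.v1p1beta1 classes already shown in Recognize.java):

    // Before: diarization flags set directly on RecognitionConfig
    RecognitionConfig oldConfig =
        RecognitionConfig.newBuilder()
            .setEncoding(AudioEncoding.LINEAR16)
            .setLanguageCode("en-US")
            .setSampleRateHertz(8000)
            .setEnableSpeakerDiarization(true)
            .setDiarizationSpeakerCount(2)
            .build();

    // After: diarization settings carried by a SpeakerDiarizationConfig message
    SpeakerDiarizationConfig diarizationConfig =
        SpeakerDiarizationConfig.newBuilder()
            .setEnableSpeakerDiarization(true)
            .setMinSpeakerCount(2)
            .setMaxSpeakerCount(2)
            .build();
    RecognitionConfig newConfig =
        RecognitionConfig.newBuilder()
            .setEncoding(AudioEncoding.LINEAR16)
            .setLanguageCode("en-US")
            .setSampleRateHertz(8000)
            .setDiarizationConfig(diarizationConfig)
            .build();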
2 changes: 1 addition & 1 deletion speech/beta/pom.xml
@@ -40,7 +40,7 @@
<dependency>
<groupId>com.google.cloud</groupId>
<artifactId>google-cloud-speech</artifactId>
<version>0.56.0-beta</version>
<version>1.20.0</version>
</dependency>
<!-- [END speech_quickstart_dependencies] -->

107 changes: 77 additions & 30 deletions speech/beta/src/main/java/com/example/speech/Recognize.java
@@ -27,16 +27,18 @@
import com.google.cloud.speech.v1p1beta1.RecognitionMetadata.MicrophoneDistance;
import com.google.cloud.speech.v1p1beta1.RecognitionMetadata.RecordingDeviceType;
import com.google.cloud.speech.v1p1beta1.RecognizeResponse;
import com.google.cloud.speech.v1p1beta1.SpeakerDiarizationConfig;
import com.google.cloud.speech.v1p1beta1.SpeechClient;

import com.google.cloud.speech.v1p1beta1.SpeechRecognitionAlternative;
import com.google.cloud.speech.v1p1beta1.SpeechRecognitionResult;
import com.google.cloud.speech.v1p1beta1.WordInfo;
import com.google.protobuf.ByteString;

import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;

public class Recognize {

@@ -154,32 +156,52 @@ public static void transcribeDiarization(String fileName) throws Exception {
RecognitionAudio recognitionAudio =
RecognitionAudio.newBuilder().setContent(ByteString.copyFrom(content)).build();

SpeakerDiarizationConfig speakerDiarizationConfig = SpeakerDiarizationConfig.newBuilder()
.setEnableSpeakerDiarization(true)
.setMinSpeakerCount(2)
.setMaxSpeakerCount(2)
.build();

// Configure request to enable Speaker diarization
RecognitionConfig config =
RecognitionConfig.newBuilder()
RecognitionConfig config = RecognitionConfig.newBuilder()
.setEncoding(AudioEncoding.LINEAR16)
.setLanguageCode("en-US")
.setSampleRateHertz(8000)
.setEnableSpeakerDiarization(true)
.setDiarizationSpeakerCount(2)
.setDiarizationConfig(speakerDiarizationConfig)
.build();

// Perform the transcription request
RecognizeResponse recognizeResponse = speechClient.recognize(config, recognitionAudio);

// Print out the results
for (SpeechRecognitionResult result : recognizeResponse.getResultsList()) {
// There can be several alternative transcripts for a given chunk of speech. Just
// use the first (most likely) one here.
SpeechRecognitionAlternative alternative = result.getAlternatives(0);
System.out.format("Transcript : %s\n", alternative.getTranscript());
// The words array contains the entire transcript up until that point.
// Referencing the last spoken word to get the associated Speaker tag
System.out.format(
"Speaker Tag %s: %s\n",
alternative.getWords((alternative.getWordsCount() - 1)).getSpeakerTag(),
alternative.getTranscript());
// Speaker Tags are only included in the last result object, which has only one alternative.
SpeechRecognitionAlternative alternative =
recognizeResponse.getResults(
recognizeResponse.getResultsCount() - 1).getAlternatives(0);

// The alternative is made up of WordInfo objects that contain the speaker_tag.
WordInfo wordInfo = alternative.getWords(0);
int currentSpeakerTag = wordInfo.getSpeakerTag();

// Group consecutive words spoken by the same speaker; when the speaker changes,
// start a new line with the new speaker tag and their words.
StringBuilder speakerWords = new StringBuilder(
String.format("Speaker %d: %s", wordInfo.getSpeakerTag(), wordInfo.getWord()));

for (int i = 1; i < alternative.getWordsCount(); i++) {
wordInfo = alternative.getWords(i);
if (currentSpeakerTag == wordInfo.getSpeakerTag()) {
speakerWords.append(" ");
speakerWords.append(wordInfo.getWord());
} else {
speakerWords.append(
String.format("\nSpeaker %d: %s",
wordInfo.getSpeakerTag(),
wordInfo.getWord()));
currentSpeakerTag = wordInfo.getSpeakerTag();
}
}

System.out.println(speakerWords.toString());
}
}
// [END speech_transcribe_diarization_beta]
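For reference, a hypothetical caller of the updated local-file sample; the path below is a placeholder for any mono 8 kHz LINEAR16 recording with two speakers, and the printed output takes the per-speaker form the new code builds:

    // Not part of this PR: illustrative invocation only.
    public static void main(String[] args) throws Exception {
        // Placeholder path; substitute a local two-speaker recording.
        Recognize.transcribeDiarization("./resources/commercial_mono.wav");
        // Expected output shape:
        //   Speaker 1: ...
        //   Speaker 2: ...
    }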
@@ -192,14 +214,19 @@ public static void transcribeDiarization(String fileName) throws Exception {
*/
public static void transcribeDiarizationGcs(String gcsUri) throws Exception {
try (SpeechClient speechClient = SpeechClient.create()) {
SpeakerDiarizationConfig speakerDiarizationConfig = SpeakerDiarizationConfig.newBuilder()
.setEnableSpeakerDiarization(true)
.setMinSpeakerCount(2)
.setMaxSpeakerCount(2)
.build();

// Configure request to enable Speaker diarization
RecognitionConfig config =
RecognitionConfig.newBuilder()
.setEncoding(AudioEncoding.LINEAR16)
.setLanguageCode("en-US")
.setSampleRateHertz(8000)
.setEnableSpeakerDiarization(true)
.setDiarizationSpeakerCount(2)
.setDiarizationConfig(speakerDiarizationConfig)
.build();

// Set the remote path for the audio file
@@ -214,17 +241,37 @@ public static void transcribeDiarizationGcs(String gcsUri) throws Exception {
Thread.sleep(10000);
}

for (SpeechRecognitionResult result : response.get().getResultsList()) {
// There can be several alternative transcripts for a given chunk of speech. Just
// use the first (most likely) one here.
SpeechRecognitionAlternative alternative = result.getAlternatives(0);
// The words array contains the entire transcript up until that point.
// Referencing the last spoken word to get the associated Speaker tag
System.out.format(
"Speaker Tag %s:%s\n",
alternative.getWords((alternative.getWordsCount() - 1)).getSpeakerTag(),
alternative.getTranscript());
// Speaker Tags are only included in the last result object, which has only one alternative.
LongRunningRecognizeResponse longRunningRecognizeResponse = response.get();
SpeechRecognitionAlternative alternative =
longRunningRecognizeResponse.getResults(
longRunningRecognizeResponse.getResultsCount() - 1)
.getAlternatives(0);

// The alternative is made up of WordInfo objects that contain the speaker_tag.
WordInfo wordInfo = alternative.getWords(0);
int currentSpeakerTag = wordInfo.getSpeakerTag();

// Group consecutive words spoken by the same speaker; when the speaker changes,
// start a new line with the new speaker tag and their words.
StringBuilder speakerWords = new StringBuilder(
String.format("Speaker %d: %s", wordInfo.getSpeakerTag(), wordInfo.getWord()));

for (int i = 1; i < alternative.getWordsCount(); i++) {
wordInfo = alternative.getWords(i);
if (currentSpeakerTag == wordInfo.getSpeakerTag()) {
speakerWords.append(" ");
speakerWords.append(wordInfo.getWord());
} else {
speakerWords.append(
String.format("\nSpeaker %d: %s",
wordInfo.getSpeakerTag(),
wordInfo.getWord()));
currentSpeakerTag = wordInfo.getSpeakerTag();
}
}

System.out.println(speakerWords.toString());
}
}
// [END speech_transcribe_diarization_gcs_beta]
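Similarly for the GCS variant, using the Cloud Storage object the integration test below points at; the wrapping main method is illustrative, not part of this change:

    public static void main(String[] args) throws Exception {
        // Object path taken from RecognizeIT (bucket cloud-samples-data).
        Recognize.transcribeDiarizationGcs("gs://cloud-samples-data/speech/commercial_mono.wav");
    }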
@@ -454,7 +501,7 @@ public static void transcribeWordLevelConfidenceGcs(String gcsUri) throws Exception {
RecognitionConfig config =
RecognitionConfig.newBuilder()
.setEncoding(AudioEncoding.FLAC)
.setSampleRateHertz(16000)
.setSampleRateHertz(44100)
.setLanguageCode("en-US")
.setEnableWordConfidence(true)
.build();
10 changes: 6 additions & 4 deletions speech/beta/src/test/java/com/example/speech/RecognizeIT.java
@@ -30,7 +30,7 @@
@RunWith(JUnit4.class)
@SuppressWarnings("checkstyle:abbreviationaswordinname")
public class RecognizeIT {
private static final String BUCKET = "cloud-samples-tests";
private static final String BUCKET = "cloud-samples-data";

private ByteArrayOutputStream bout;
private PrintStream out;
@@ -39,7 +39,7 @@ public class RecognizeIT {
private String audioFileName = "./resources/audio.raw";
private String multiChannelAudioFileName = "./resources/commercial_stereo.wav";
private String gcsMultiChannelAudioPath = "gs://" + BUCKET + "/speech/commercial_stereo.wav";
private String gcsAudioPath = "gs://" + BUCKET + "/speech/brooklyn.flac";
private String gcsAudioPath = "gs://" + BUCKET + "/speech/brooklyn_bridge.flac";
private String gcsDiarizationAudioPath = "gs://" + BUCKET + "/speech/commercial_mono.wav";

// The path to the video file to transcribe
@@ -71,14 +71,16 @@ public void testMetadata() throws Exception {
public void testTranscribeDiarization() throws Exception {
Recognize.transcribeDiarization(recognitionAudioFile);
String got = bout.toString();
assertThat(got).contains("Speaker Tag 2:");
assertThat(got).contains("Speaker 1: I'm here");
assertThat(got).contains("Speaker 2: hi I'd like to buy a Chrome Cast");
Contributor review comment: I'd bet at some point Chromecast will be a single word.

Contributor review comment: The odds are this is too brittle a test, but I'll approve it.

}
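Given the reviewer's note above that the exact transcript makes this assertion brittle, a looser hypothetical variant (not part of this PR) would key only on the diarization labels:

    assertThat(got).contains("Speaker 1:");
    assertThat(got).contains("Speaker 2:");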

@Test
public void testTranscribeDiarizationGcs() throws Exception {
Recognize.transcribeDiarizationGcs(gcsDiarizationAudioPath);
String got = bout.toString();
assertThat(got).contains("Speaker Tag 2:");
assertThat(got).contains("Speaker 1: I'm here");
assertThat(got).contains("Speaker 2: hi I'd like to buy a Chrome Cast");
}

@Test