Skip to content

Commit

Permalink
feat: add alternative_language_codes to RecognitionConfig (#824)
Browse files Browse the repository at this point in the history
- [ ] Regenerate this pull request now.

PiperOrigin-RevId: 413453425

Source-Link: googleapis/googleapis@2b47b24

Source-Link: googleapis/googleapis-gen@7ffe6e0
Copy-Tag: eyJwIjoiLmdpdGh1Yi8uT3dsQm90LnlhbWwiLCJoIjoiN2ZmZTZlMGExYmY2M2Q4NTQwMDA5Y2U2OTg2NjBlYmI3MWM1NGZmMSJ9

feat: add WEBM_OPUS codec
feat: add SpeechAdaptation configuration
feat: add word confidence
feat: add spoken punctuation and spoken emojis
feat: add hint boost in SpeechContext
  • Loading branch information
gcf-owl-bot[bot] authored Dec 6, 2021
1 parent a5e2021 commit f32f412
Show file tree
Hide file tree
Showing 8 changed files with 3,086 additions and 417 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package google.cloud.speech.v1;
import "google/api/annotations.proto";
import "google/api/client.proto";
import "google/api/field_behavior.proto";
import "google/cloud/speech/v1/resource.proto";
import "google/longrunning/operations.proto";
import "google/protobuf/any.proto";
import "google/protobuf/duration.proto";
Expand Down Expand Up @@ -181,7 +182,8 @@ message RecognitionConfig {
// a lossless encoding (`FLAC` or `LINEAR16`). The accuracy of the speech
// recognition can be reduced if lossy codecs are used to capture or transmit
// audio, particularly if background noise is present. Lossy codecs include
// `MULAW`, `AMR`, `AMR_WB`, `OGG_OPUS`, `SPEEX_WITH_HEADER_BYTE`, `MP3`.
// `MULAW`, `AMR`, `AMR_WB`, `OGG_OPUS`, `SPEEX_WITH_HEADER_BYTE`, `MP3`,
// and `WEBM_OPUS`.
//
// The `FLAC` and `WAV` audio file formats include a header that describes the
// included audio content. You can request recognition for `WAV` files that
Expand Down Expand Up @@ -236,6 +238,11 @@ message RecognitionConfig {
// is replaced with a single byte containing the block length. Only Speex
// wideband is supported. `sample_rate_hertz` must be 16000.
SPEEX_WITH_HEADER_BYTE = 7;

// Opus encoded audio frames in WebM container
// ([OggOpus](https://wiki.xiph.org/OggOpus)). `sample_rate_hertz` must be
// one of 8000, 12000, 16000, 24000, or 48000.
WEBM_OPUS = 9;
}

// Encoding of audio data sent in all `RecognitionAudio` messages.
Expand Down Expand Up @@ -279,6 +286,20 @@ message RecognitionConfig {
// of the currently supported language codes.
string language_code = 3 [(google.api.field_behavior) = REQUIRED];

// A list of up to 3 additional
// [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tags,
// listing possible alternative languages of the supplied audio.
// See [Language
// Support](https://cloud.google.com/speech-to-text/docs/languages) for a list
// of the currently supported language codes. If alternative languages are
// listed, recognition result will contain recognition in the most likely
// language detected including the main language_code. The recognition result
// will include the language tag of the language detected in the audio. Note:
// This feature is only supported for Voice Command and Voice Search use cases
// and performance may vary for other use cases (e.g., phone call
// transcription).
repeated string alternative_language_codes = 18;

// Maximum number of recognition hypotheses to be returned.
// Specifically, the maximum number of `SpeechRecognitionAlternative` messages
// within each `SpeechRecognitionResult`.
Expand All @@ -293,6 +314,13 @@ message RecognitionConfig {
// won't be filtered out.
bool profanity_filter = 5;

// Speech adaptation configuration improves the accuracy of speech
// recognition. For more information, see the [speech
// adaptation](https://cloud.google.com/speech-to-text/docs/adaptation)
// documentation.
// When speech adaptation is set it supersedes the `speech_contexts` field.
SpeechAdaptation adaptation = 20;

// Array of [SpeechContext][google.cloud.speech.v1.SpeechContext].
// A means to provide context to assist the speech recognition. For more
// information, see
Expand All @@ -306,12 +334,33 @@ message RecognitionConfig {
// `false`.
bool enable_word_time_offsets = 8;

// If `true`, the top result includes a list of words and the
// confidence for those words. If `false`, no word-level confidence
// information is returned. The default is `false`.
bool enable_word_confidence = 15;

// If 'true', adds punctuation to recognition result hypotheses.
// This feature is only available in select languages. Setting this for
// requests in other languages has no effect at all.
// The default 'false' value does not add punctuation to result hypotheses.
bool enable_automatic_punctuation = 11;

// The spoken punctuation behavior for the call.
// If not set, uses default behavior based on model of choice,
// e.g. command_and_search will enable spoken punctuation by default.
// If 'true', replaces spoken punctuation with the corresponding symbols in
// the request. For example, "how are you question mark" becomes "how are
// you?". See https://cloud.google.com/speech-to-text/docs/spoken-punctuation
// for support. If 'false', spoken punctuation is not replaced.
google.protobuf.BoolValue enable_spoken_punctuation = 22;

// The spoken emoji behavior for the call.
// If not set, uses default behavior based on model of choice.
// If 'true', adds spoken emoji formatting for the request. This will replace
// spoken emojis with the corresponding Unicode symbols in the final
// transcript. If 'false', spoken emojis are not replaced.
google.protobuf.BoolValue enable_spoken_emojis = 23;

// Config to enable speaker diarization and set additional
// parameters to make diarization better suited for your application.
// Note: When this is enabled, we send all the words from the beginning of the
Expand Down Expand Up @@ -537,6 +586,16 @@ message SpeechContext {
// improves the likelihood of correctly transcribing audio that includes
// months.
repeated string phrases = 1;

// Hint Boost. Positive value will increase the probability that a specific
// phrase will be recognized over other similar sounding phrases. The higher
// the boost, the higher the chance of false positive recognition as well.
// Negative boost values would correspond to anti-biasing. Anti-biasing is not
// enabled, so negative boost will simply be ignored. Though `boost` can
// accept a wide range of positive values, most use cases are best served with
// values between 0 and 20. We recommend using a binary search approach to
// finding the optimal value for your use case.
float boost = 4;
}

// Contains audio data in the encoding specified in the `RecognitionConfig`.
Expand Down Expand Up @@ -587,6 +646,12 @@ message LongRunningRecognizeResponse {

// When available, billed audio seconds for the corresponding request.
google.protobuf.Duration total_billed_time = 3;

// Original output config if present in the request.
TranscriptOutputConfig output_config = 6;

// If the transcript output fails this field contains the relevant error.
google.rpc.Status output_error = 7;
}

// Describes the progress of a long-running `LongRunningRecognize` call. It is
Expand Down Expand Up @@ -723,11 +788,10 @@ message StreamingRecognitionResult {
// For audio_channel_count = N, its output values can range from '1' to 'N'.
int32 channel_tag = 5;

// The [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag of
// the language in this result. This language code was detected to have the
// most likelihood of being spoken in the audio.
string language_code = 6
[(google.api.field_behavior) = OUTPUT_ONLY];
// Output only. The [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag
// of the language in this result. This language code was detected to have
// the most likelihood of being spoken in the audio.
string language_code = 6 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// A speech recognition result corresponding to a portion of the audio.
Expand All @@ -742,6 +806,15 @@ message SpeechRecognitionResult {
// recognized result for the audio from that channel.
// For audio_channel_count = N, its output values can range from '1' to 'N'.
int32 channel_tag = 2;

// Time offset of the end of this result relative to the
// beginning of the audio.
google.protobuf.Duration result_end_time = 4;

// Output only. The [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag
// of the language in this result. This language code was detected to have
// the most likelihood of being spoken in the audio.
string language_code = 5 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Alternative hypotheses (a.k.a. n-best list).
Expand Down Expand Up @@ -785,6 +858,15 @@ message WordInfo {
// The word corresponding to this set of information.
string word = 3;

// The confidence estimate between 0.0 and 1.0. A higher number
// indicates an estimated greater likelihood that the recognized words are
// correct. This field is set only for the top alternative of a non-streaming
// result, or of a streaming result where `is_final=true`.
// This field is not guaranteed to be accurate and users should not rely on it
// to be always provided.
// The default of 0.0 is a sentinel value indicating `confidence` was not set.
float confidence = 4;

// Output only. A distinct integer value is assigned for every speaker within
// the audio. This field specifies which one of those speakers was detected to
// have spoken this word. Value ranges from '1' to diarization_speaker_count.
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.speech.v1;

import "google/api/resource.proto";
import "google/protobuf/timestamp.proto";
import "google/api/annotations.proto";

option cc_enable_arenas = true;
option go_package = "google.golang.org/genproto/googleapis/cloud/speech/v1;speech";
option java_multiple_files = true;
option java_outer_classname = "SpeechResourceProto";
option java_package = "com.google.cloud.speech.v1";
option objc_class_prefix = "GCS";

// A named group of words or phrases expressing one common concept that is
// likely to appear in your audio -- for example, a list of passenger ship
// names. The items of a CustomClass can be substituted into placeholders that
// you set in PhraseSet phrases.
message CustomClass {
  option (google.api.resource) = {
    type: "speech.googleapis.com/CustomClass"
    pattern: "projects/{project}/locations/{location}/customClasses/{custom_class}"
  };

  // A single entry belonging to the class.
  message ClassItem {
    // The value of this class item.
    string value = 1;
  }

  // Resource name of this custom class.
  string name = 1;

  // When this custom class is a resource, `custom_class_id` is the resource
  // id of the CustomClass. The id is case sensitive.
  string custom_class_id = 2;

  // The class items that make up this class.
  repeated ClassItem items = 3;
}

// Supplies "hints" that bias the speech recognizer toward specific words and
// phrases in its results.
message PhraseSet {
  option (google.api.resource) = {
    type: "speech.googleapis.com/PhraseSet"
    pattern: "projects/{project}/locations/{location}/phraseSets/{phrase_set}"
  };

  // A phrase made up of words and phrase "hints" that the speech recognition
  // should be more likely to recognize. Use it to improve accuracy for
  // specific words and phrases -- for example, when specific commands are
  // typically spoken by the user -- or to add additional words to the
  // vocabulary of the recognizer. See
  // [usage limits](https://cloud.google.com/speech-to-text/quotas#content).
  //
  // List items can also include pre-built or custom classes containing groups
  // of words that represent common concepts that occur in natural language.
  // For example, rather than providing a phrase hint for every month of the
  // year (e.g. "i was born in january", "i was born in february", ...), using
  // the pre-built `$MONTH` class improves the likelihood of correctly
  // transcribing audio that includes months (e.g. "i was born in $month").
  // To refer to a pre-built class, use the class' symbol prepended with `$`,
  // e.g. `$MONTH`. To refer to a custom class that was defined inline in the
  // request, set the class's `custom_class_id` to a string unique to all class
  // resources and inline classes, then use the class' id wrapped in `${...}`,
  // e.g. `${my-months}`. To refer to a custom class resource, use the class'
  // id wrapped in `${}` (e.g. `${my-months}`).
  //
  // Speech-to-Text supports three locations: `global`, `us` (US North
  // America), and `eu` (Europe). If you are calling the
  // `speech.googleapis.com` endpoint, use the `global` location. To specify a
  // region, use a [regional endpoint](/speech-to-text/docs/endpoints) with
  // matching `us` or `eu` location value.
  message Phrase {
    // The text of the phrase.
    string value = 1;

    // Hint Boost for this phrase; it overrides the boost set at the phrase
    // set level. A positive value increases the probability that this phrase
    // is recognized over other similar sounding phrases. The higher the
    // boost, the higher the chance of false positive recognition as well.
    // Negative boost is simply ignored. Although `boost` accepts a wide range
    // of positive values, most use cases are best served with values between
    // 0 and 20. We recommend a binary search approach to find the optimal
    // value for your use case. Speech recognition will skip PhraseSets with a
    // boost value of 0.
    float boost = 2;
  }

  // Resource name of this phrase set.
  string name = 1;

  // A list of words and phrases.
  repeated Phrase phrases = 2;

  // Hint Boost for the whole phrase set. A positive value increases the
  // probability that a specific phrase is recognized over other similar
  // sounding phrases. The higher the boost, the higher the chance of false
  // positive recognition as well. Negative boost values would correspond to
  // anti-biasing; anti-biasing is not enabled, so negative boost is simply
  // ignored. Although `boost` accepts a wide range of positive values, most
  // use cases are best served with values between 0 (exclusive) and 20. We
  // recommend a binary search approach to find the optimal value for your use
  // case. Speech recognition will skip PhraseSets with a boost value of 0.
  float boost = 4;
}

// Configuration for speech adaptation.
message SpeechAdaptation {
  // The phrase sets to apply. To supply hints inline, leave the phrase set's
  // `name` blank and populate its remaining fields. Any phrase set can use
  // any custom class.
  repeated PhraseSet phrase_sets = 1;

  // Resource names of the phrase sets to use.
  repeated string phrase_set_references = 2 [
    (google.api.resource_reference) = {
      type: "speech.googleapis.com/PhraseSet"
    }
  ];

  // The custom classes to apply. To supply a class inline, leave the class'
  // `name` blank and populate its remaining fields, giving it a unique
  // `custom_class_id`. Phrase hints refer to an inline defined class by its
  // `custom_class_id`.
  repeated CustomClass custom_classes = 3;
}
Loading

0 comments on commit f32f412

Please sign in to comment.