Skip to content

Commit

Permalink
feat: add transcript normalization + m4a audio format support (#11937)
Browse files Browse the repository at this point in the history
- [ ] Regenerate this pull request now.

BEGIN_COMMIT_OVERRIDE
feat: add transcript normalization + m4a audio format support
docs: clarify alternatives for deprecated fields
docs: deprecate `BatchRecognizeFileResult.uri` in favor of
`cloud_storage_result.native_format_uri`
docs: deprecate `BatchRecognizeFileResult.transcript` in favor of
`inline_result.transcript`
END_COMMIT_OVERRIDE


PiperOrigin-RevId: 577926708

Source-Link:
googleapis/googleapis@37e816b

Source-Link:
googleapis/googleapis-gen@e12bd7b
Copy-Tag:
eyJwIjoicGFja2FnZXMvZ29vZ2xlLWNsb3VkLXNwZWVjaC8uT3dsQm90LnlhbWwiLCJoIjoiZTEyYmQ3YmRiYmI5ZDJlNDE4YTkyMjA3NWQyM2Y3N2E4YzFlNzQ4NSJ9

---------

Co-authored-by: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com>
  • Loading branch information
gcf-owl-bot[bot] and gcf-owl-bot[bot] authored Oct 30, 2023
1 parent 80b7a92 commit 8536b20
Show file tree
Hide file tree
Showing 5 changed files with 184 additions and 13 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
BatchRecognizeResponse,
BatchRecognizeResults,
BatchRecognizeTranscriptionMetadata,
CloudStorageResult,
Config,
CreateCustomClassRequest,
CreatePhraseSetRequest,
Expand All @@ -43,6 +44,7 @@
GetPhraseSetRequest,
GetRecognizerRequest,
InlineOutputConfig,
InlineResult,
ListCustomClassesRequest,
ListCustomClassesResponse,
ListPhraseSetsRequest,
Expand All @@ -67,6 +69,7 @@
StreamingRecognitionResult,
StreamingRecognizeRequest,
StreamingRecognizeResponse,
TranscriptNormalization,
UndeleteCustomClassRequest,
UndeletePhraseSetRequest,
UndeleteRecognizerRequest,
Expand All @@ -87,6 +90,7 @@
"BatchRecognizeResponse",
"BatchRecognizeResults",
"BatchRecognizeTranscriptionMetadata",
"CloudStorageResult",
"Config",
"CreateCustomClassRequest",
"CreatePhraseSetRequest",
Expand All @@ -102,6 +106,7 @@
"GetPhraseSetRequest",
"GetRecognizerRequest",
"InlineOutputConfig",
"InlineResult",
"ListCustomClassesRequest",
"ListCustomClassesResponse",
"ListPhraseSetsRequest",
Expand All @@ -127,6 +132,7 @@
"StreamingRecognitionResult",
"StreamingRecognizeRequest",
"StreamingRecognizeResponse",
"TranscriptNormalization",
"UndeleteCustomClassRequest",
"UndeletePhraseSetRequest",
"UndeleteRecognizerRequest",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
BatchRecognizeResponse,
BatchRecognizeResults,
BatchRecognizeTranscriptionMetadata,
CloudStorageResult,
Config,
CreateCustomClassRequest,
CreatePhraseSetRequest,
Expand All @@ -37,6 +38,7 @@
GetPhraseSetRequest,
GetRecognizerRequest,
InlineOutputConfig,
InlineResult,
ListCustomClassesRequest,
ListCustomClassesResponse,
ListPhraseSetsRequest,
Expand All @@ -61,6 +63,7 @@
StreamingRecognitionResult,
StreamingRecognizeRequest,
StreamingRecognizeResponse,
TranscriptNormalization,
UndeleteCustomClassRequest,
UndeletePhraseSetRequest,
UndeleteRecognizerRequest,
Expand All @@ -80,6 +83,7 @@
"BatchRecognizeResponse",
"BatchRecognizeResults",
"BatchRecognizeTranscriptionMetadata",
"CloudStorageResult",
"Config",
"CreateCustomClassRequest",
"CreatePhraseSetRequest",
Expand All @@ -95,6 +99,7 @@
"GetPhraseSetRequest",
"GetRecognizerRequest",
"InlineOutputConfig",
"InlineResult",
"ListCustomClassesRequest",
"ListCustomClassesResponse",
"ListPhraseSetsRequest",
Expand All @@ -119,6 +124,7 @@
"StreamingRecognitionResult",
"StreamingRecognizeRequest",
"StreamingRecognizeResponse",
"TranscriptNormalization",
"UndeleteCustomClassRequest",
"UndeletePhraseSetRequest",
"UndeleteRecognizerRequest",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
"ExplicitDecodingConfig",
"SpeakerDiarizationConfig",
"RecognitionFeatures",
"TranscriptNormalization",
"SpeechAdaptation",
"RecognitionConfig",
"RecognizeRequest",
Expand All @@ -56,6 +57,8 @@
"RecognitionOutputConfig",
"BatchRecognizeResponse",
"BatchRecognizeResults",
"CloudStorageResult",
"InlineResult",
"BatchRecognizeFileResult",
"BatchRecognizeTranscriptionMetadata",
"BatchRecognizeMetadata",
Expand Down Expand Up @@ -587,9 +590,14 @@ class Recognizer(proto.Message):
User-settable, human-readable name for the
Recognizer. Must be 63 characters or less.
model (str):
Optional. Which model to use for recognition requests.
Select the model best suited to your domain to get best
results.
Optional. This field is now deprecated. Prefer the
[``model``][google.cloud.speech.v2.RecognitionConfig.model]
field in the
[``RecognitionConfig``][google.cloud.speech.v2.RecognitionConfig]
message.
Which model to use for recognition requests. Select the
model best suited to your domain to get best results.
Guidance for choosing which model to use can be found in the
`Transcription Models
Expand All @@ -598,7 +606,13 @@ class Recognizer(proto.Message):
`Table Of Supported
Models <https://cloud.google.com/speech-to-text/v2/docs/speech-to-text-supported-languages>`__.
language_codes (MutableSequence[str]):
Optional. The language of the supplied audio as a
Optional. This field is now deprecated. Prefer the
[``language_codes``][google.cloud.speech.v2.RecognitionConfig.language_codes]
field in the
[``RecognitionConfig``][google.cloud.speech.v2.RecognitionConfig]
message.
The language of the supplied audio as a
`BCP-47 <https://www.rfc-editor.org/rfc/bcp/bcp47.txt>`__
language tag.
Expand Down Expand Up @@ -772,6 +786,8 @@ class AutoDetectDecodingConfig(proto.Message):
- WEBM_OPUS: Opus audio frames in a WebM container.
- M4A: M4A audio format.
"""


Expand Down Expand Up @@ -991,6 +1007,56 @@ class MultiChannelMode(proto.Enum):
)


class TranscriptNormalization(proto.Message):
    r"""Configuration for transcription normalization.

    Transcription normalization automatically replaces parts of the
    transcript with phrases of your choosing. For StreamingRecognize,
    the normalization only applies to stable partial transcripts
    (stability > 0.8) and to final transcripts.

    Attributes:
        entries (MutableSequence[google.cloud.speech_v2.types.TranscriptNormalization.Entry]):
            The replacement entries. Replacement is performed with one
            entry at a time. For example, the second entry in
            ["cat" => "dog", "mountain cat" => "mountain dog"] is never
            applied, because the first entry is always processed
            before it. At most 100 entries.
    """

    class Entry(proto.Message):
        r"""One replacement rule.

        Attributes:
            search (str):
                The text to replace. Max length is 100 characters.
            replace (str):
                The text to replace it with. Max length is 100
                characters.
            case_sensitive (bool):
                Whether the search is case sensitive.
        """

        search: str = proto.Field(proto.STRING, number=1)
        replace: str = proto.Field(proto.STRING, number=2)
        case_sensitive: bool = proto.Field(proto.BOOL, number=3)

    entries: MutableSequence[Entry] = proto.RepeatedField(
        proto.MESSAGE, number=1, message=Entry
    )


class SpeechAdaptation(proto.Message):
r"""Provides "hints" to the speech recognizer to favor specific
words and phrases in the results. PhraseSets can be specified as
Expand Down Expand Up @@ -1109,6 +1175,13 @@ class RecognitionConfig(proto.Message):
Speech adaptation context that weights
recognizer predictions for specific words and
phrases.
transcript_normalization (google.cloud.speech_v2.types.TranscriptNormalization):
Optional. Use transcription normalization to
automatically replace parts of the transcript
with phrases of your choosing. For
StreamingRecognize, this normalization only
applies to stable partial transcripts (stability
> 0.8) and final transcripts.
"""

auto_decoding_config: "AutoDetectDecodingConfig" = proto.Field(
Expand Down Expand Up @@ -1141,6 +1214,11 @@ class RecognitionConfig(proto.Message):
number=6,
message="SpeechAdaptation",
)
transcript_normalization: "TranscriptNormalization" = proto.Field(
proto.MESSAGE,
number=11,
message="TranscriptNormalization",
)


class RecognizeRequest(proto.Message):
Expand Down Expand Up @@ -1820,29 +1898,73 @@ class BatchRecognizeResults(proto.Message):
)


class BatchRecognizeFileResult(proto.Message):
r"""Final results for a single file.
class CloudStorageResult(proto.Message):
    r"""Final recognition results that were written to Cloud Storage.

    Attributes:
        uri (str):
            Cloud Storage URI to which the recognition results were
            written.
    """

    uri: str = proto.Field(proto.STRING, number=1)


class InlineResult(proto.Message):
    r"""Final results that are returned inline in the recognition response.

    Attributes:
        transcript (google.cloud.speech_v2.types.BatchRecognizeResults):
            Transcript for the audio file.
    """

    transcript: "BatchRecognizeResults" = proto.Field(
        proto.MESSAGE, number=1, message="BatchRecognizeResults"
    )


class BatchRecognizeFileResult(proto.Message):
r"""Final results for a single file.
This message has `oneof`_ fields (mutually exclusive fields).
For each oneof, at most one member field can be set at the same time.
Setting any member of the oneof automatically clears all other
members.
.. _oneof: https://proto-plus-python.readthedocs.io/en/stable/fields.html#oneofs-mutually-exclusive-fields
Attributes:
error (google.rpc.status_pb2.Status):
Error if one was encountered.
metadata (google.cloud.speech_v2.types.RecognitionResponseMetadata):
transcript (google.cloud.speech_v2.types.BatchRecognizeResults):
The transcript for the audio file. This is populated only
when
cloud_storage_result (google.cloud.speech_v2.types.CloudStorageResult):
Recognition results written to Cloud Storage. This is
populated only when
[GcsOutputConfig][google.cloud.speech.v2.GcsOutputConfig] is
set in the
[RecognitionOutputConfig][google.cloud.speech.v2.RecognitionOutputConfig].
This field is a member of `oneof`_ ``result``.
inline_result (google.cloud.speech_v2.types.InlineResult):
Recognition results. This is populated only when
[InlineOutputConfig][google.cloud.speech.v2.InlineOutputConfig]
is set in the
[RecognitionOutputConfig][google.cloud.speech.v2.RecognitionOutputConfig].
This field is a member of `oneof`_ ``result``.
uri (str):
Deprecated. Use ``cloud_storage_result.native_format_uri``
instead.
transcript (google.cloud.speech_v2.types.BatchRecognizeResults):
Deprecated. Use ``inline_result.transcript`` instead.
"""

uri: str = proto.Field(
proto.STRING,
number=1,
)
error: status_pb2.Status = proto.Field(
proto.MESSAGE,
number=2,
Expand All @@ -1853,6 +1975,22 @@ class BatchRecognizeFileResult(proto.Message):
number=3,
message="RecognitionResponseMetadata",
)
cloud_storage_result: "CloudStorageResult" = proto.Field(
proto.MESSAGE,
number=5,
oneof="result",
message="CloudStorageResult",
)
inline_result: "InlineResult" = proto.Field(
proto.MESSAGE,
number=6,
oneof="result",
message="InlineResult",
)
uri: str = proto.Field(
proto.STRING,
number=1,
)
transcript: "BatchRecognizeResults" = proto.Field(
proto.MESSAGE,
number=4,
Expand Down
3 changes: 3 additions & 0 deletions packages/google-cloud-speech/noxfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,10 @@

BLACK_VERSION = "black[jupyter]==23.7.0"
ISORT_VERSION = "isort==5.11.0"

LINT_PATHS = ["docs", "google", "tests", "noxfile.py", "setup.py"]


DEFAULT_PYTHON_VERSION = "3.9"

UNIT_TEST_PYTHON_VERSIONS: List[str] = ["3.7", "3.8", "3.9", "3.10", "3.11"]
Expand Down Expand Up @@ -89,6 +91,7 @@ def lint(session):
"--check",
*LINT_PATHS,
)

session.run("flake8", "google", "tests")


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6871,6 +6871,15 @@ def test_create_recognizer_rest(request_type):
}
],
},
"transcript_normalization": {
"entries": [
{
"search": "search_value",
"replace": "replace_value",
"case_sensitive": True,
}
]
},
},
"annotations": {},
"state": 2,
Expand Down Expand Up @@ -7941,6 +7950,15 @@ def test_update_recognizer_rest(request_type):
}
],
},
"transcript_normalization": {
"entries": [
{
"search": "search_value",
"replace": "replace_value",
"case_sensitive": True,
}
]
},
},
"annotations": {},
"state": 2,
Expand Down

0 comments on commit 8536b20

Please sign in to comment.