From 5f926299a606cf8f4a092de4b43106e0c9fa0707 Mon Sep 17 00:00:00 2001 From: Thomas Schultz Date: Thu, 27 Oct 2016 13:41:52 -0400 Subject: [PATCH] Add speech streaming recognition. --- .../google/cloud/speech/_gax.py | 91 ++++++ .../google/cloud/speech/client.py | 86 ++++++ .../unit_tests/test__gax.py | 36 +++ .../unit_tests/test_client.py | 288 ++++++++++++++++-- 4 files changed, 476 insertions(+), 25 deletions(-) diff --git a/packages/google-cloud-python-speech/google/cloud/speech/_gax.py b/packages/google-cloud-python-speech/google/cloud/speech/_gax.py index c24f8acd365b..3cad482dd53a 100644 --- a/packages/google-cloud-python-speech/google/cloud/speech/_gax.py +++ b/packages/google-cloud-python-speech/google/cloud/speech/_gax.py @@ -106,6 +106,97 @@ def async_recognize(self, sample, language_code=None, return Operation.from_pb(response, self) + def streaming_recognize(self, sample, language_code=None, + max_alternatives=None, profanity_filter=None, + speech_context=None, single_utterance=False, + interim_results=False): + """Streaming speech recognition. + + .. note:: + + Streaming recognition requests are limited to 1 minute of audio. + See: https://cloud.google.com/speech/limits#content + + Yields ``StreamingRecognizeResponse`` instances containing + results and metadata from the streaming request. + + :type sample: :class:`~google.cloud.speech.sample.Sample` + :param sample: Instance of ``Sample`` containing audio information. + + :type language_code: str + :param language_code: (Optional) The language of the supplied audio as + BCP-47 language tag. Example: ``'en-GB'``. + If omitted, defaults to ``'en-US'``. + + :type max_alternatives: int + :param max_alternatives: (Optional) Maximum number of recognition + hypotheses to be returned. The server may + return fewer than ``max_alternatives``. + Valid values are 0-30. A value of 0 or 1 + will return a maximum of 1.
Defaults to 1. + + :type profanity_filter: bool + :param profanity_filter: If True, the server will attempt to filter + out profanities, replacing all but the + initial character in each filtered word with + asterisks, e.g. ``'f***'``. If False or + omitted, profanities won't be filtered out. + + :type speech_context: list + :param speech_context: A list of strings (max 50) containing words and + phrases "hints" so that the speech recognition + is more likely to recognize them. This can be + used to improve the accuracy for specific words + and phrases. This can also be used to add new + words to the vocabulary of the recognizer. + + :type single_utterance: bool + :param single_utterance: (Optional) If false or omitted, the recognizer + will perform continuous recognition + (continuing to process audio even if the user + pauses speaking) until the client closes the + output stream (gRPC API) or when the maximum + time limit has been reached. Multiple + SpeechRecognitionResults with the is_final + flag set to true may be returned. + If true, the recognizer will detect a single + spoken utterance. When it detects that the + user has paused or stopped speaking, it will + return an END_OF_UTTERANCE event and cease + recognition. It will return no more than one + SpeechRecognitionResult with the is_final flag + set to true. + + :type interim_results: bool + :param interim_results: (Optional) If true, interim results (tentative + hypotheses) may be returned as they become + available (these interim results are indicated + with the ``is_final=False`` flag). If false or + omitted, only ``is_final=True`` result(s) are + returned. + + :raises: :class:`ValueError` if ``sample.content`` is not a file-like + object, or if the stream is closed. + + :rtype: :class:`~google.cloud.grpc.speech.v1beta1\ + .cloud_speech_pb2.StreamingRecognizeResponse` + :returns: ``StreamingRecognizeResponse`` instances.
+ """ + if getattr(sample.content, 'closed', None) is None: + raise ValueError('Please use file-like object for data stream.') + if sample.content.closed: + raise ValueError('Stream is closed.') + + requests = _stream_requests(sample, language_code=language_code, + max_alternatives=max_alternatives, + profanity_filter=profanity_filter, + speech_context=speech_context, + single_utterance=single_utterance, + interim_results=interim_results) + api = self._gapic_api + responses = api.streaming_recognize(requests) + return responses + def sync_recognize(self, sample, language_code=None, max_alternatives=None, profanity_filter=None, speech_context=None): """Synchronous Speech Recognition. diff --git a/packages/google-cloud-python-speech/google/cloud/speech/client.py b/packages/google-cloud-python-speech/google/cloud/speech/client.py index a321d92cce41..94716086b3a2 100644 --- a/packages/google-cloud-python-speech/google/cloud/speech/client.py +++ b/packages/google-cloud-python-speech/google/cloud/speech/client.py @@ -159,6 +159,92 @@ def speech_api(self): self._speech_api = _JSONSpeechAPI(self) return self._speech_api + def streaming_recognize(self, sample, language_code=None, + max_alternatives=None, profanity_filter=None, + speech_context=None, single_utterance=False, + interim_results=False): + """Streaming speech recognition. + + .. note:: + + Streaming recognition requests are limited to 1 minute of audio. + See: https://cloud.google.com/speech/limits#content + + Yields: list of :class:`~google.cloud.speech.alternative.Alternative` + containing results and metadata from the streaming request. + + :type sample: :class:`~google.cloud.speech.sample.Sample` + :param sample: Instance of ``Sample`` containing audio information. + + :type language_code: str + :param language_code: (Optional) The language of the supplied audio as + BCP-47 language tag. Example: ``'en-GB'``. + If omitted, defaults to ``'en-US'``.
+ + :type max_alternatives: int + :param max_alternatives: (Optional) Maximum number of recognition + hypotheses to be returned. The server may + return fewer than ``max_alternatives``. + Valid values are 0-30. A value of 0 or 1 + will return a maximum of 1. Defaults to 1. + + :type profanity_filter: bool + :param profanity_filter: If True, the server will attempt to filter + out profanities, replacing all but the + initial character in each filtered word with + asterisks, e.g. ``'f***'``. If False or + omitted, profanities won't be filtered out. + + :type speech_context: list + :param speech_context: A list of strings (max 50) containing words and + phrases "hints" so that the speech recognition + is more likely to recognize them. This can be + used to improve the accuracy for specific words + and phrases. This can also be used to add new + words to the vocabulary of the recognizer. + + :type single_utterance: bool + :param single_utterance: (Optional) If false or omitted, the recognizer + will perform continuous recognition + (continuing to process audio even if the user + pauses speaking) until the client closes the + output stream (gRPC API) or when the maximum + time limit has been reached. Multiple + SpeechRecognitionResults with the is_final + flag set to true may be returned. + If true, the recognizer will detect a single + spoken utterance. When it detects that the + user has paused or stopped speaking, it will + return an END_OF_UTTERANCE event and cease + recognition. It will return no more than one + SpeechRecognitionResult with the is_final flag + set to true. + + :type interim_results: bool + :param interim_results: (Optional) If true, interim results (tentative + hypotheses) may be returned as they become + available (these interim results are indicated + with the ``is_final=False`` flag). If false or + omitted, only ``is_final=True`` result(s) are + returned. + + :raises: :class:`EnvironmentError` if gRPC is not available.
+ """ + if not self._use_gax: + raise EnvironmentError('gRPC is required to use this API.') + + responses = self.speech_api.streaming_recognize(sample, language_code, + max_alternatives, + profanity_filter, + speech_context, + single_utterance, + interim_results) + for response in responses: + for result in response.results: + if result.is_final or interim_results: + yield [Alternative.from_pb(alternative) + for alternative in result.alternatives] + def sync_recognize(self, sample, language_code=None, max_alternatives=None, profanity_filter=None, speech_context=None): diff --git a/packages/google-cloud-python-speech/unit_tests/test__gax.py b/packages/google-cloud-python-speech/unit_tests/test__gax.py index 0bc600ea86b6..31af01178613 100644 --- a/packages/google-cloud-python-speech/unit_tests/test__gax.py +++ b/packages/google-cloud-python-speech/unit_tests/test__gax.py @@ -15,6 +15,35 @@ import unittest +class TestGAPICSpeechAPI(unittest.TestCase): + SAMPLE_RATE = 16000 + + def _getTargetClass(self): + from google.cloud.speech._gax import GAPICSpeechAPI + + return GAPICSpeechAPI + + def _makeOne(self, *args, **kw): + return self._getTargetClass()(*args, **kw) + + def test_use_bytes_instead_of_file_like_object(self): + from google.cloud import speech + from google.cloud.speech.sample import Sample + + credentials = {} + client = speech.Client(credentials=credentials, use_gax=True) + client.connection = _Connection() + client.connection.credentials = credentials + + sample = Sample(content=b'', encoding=speech.Encoding.FLAC, + sample_rate=self.SAMPLE_RATE) + + api = self._makeOne(client) + with self.assertRaises(ValueError): + api.streaming_recognize(sample) + self.assertEqual(client.connection._requested, []) + + class TestSpeechGAXMakeRequests(unittest.TestCase): SAMPLE_RATE = 16000 HINTS = ['hi'] @@ -137,3 +166,10 @@ def test_stream_requests(self): self.assertEqual(streaming_request.audio_content, self.AUDIO_CONTENT) 
self.assertIsInstance(config_request.streaming_config, StreamingRecognitionConfig) + + +class _Connection(object): + + def __init__(self, *responses): + self._responses = responses + self._requested = [] diff --git a/packages/google-cloud-python-speech/unit_tests/test_client.py b/packages/google-cloud-python-speech/unit_tests/test_client.py index 049a018bc0fd..b108fcaab579 100644 --- a/packages/google-cloud-python-speech/unit_tests/test_client.py +++ b/packages/google-cloud-python-speech/unit_tests/test_client.py @@ -15,33 +15,56 @@ import unittest -class TestClient(unittest.TestCase): - SAMPLE_RATE = 16000 - HINTS = ['hi'] - AUDIO_SOURCE_URI = 'gs://sample-bucket/sample-recording.flac' - AUDIO_CONTENT = '/9j/4QNURXhpZgAASUkq' +def _make_result(alternatives=()): + from google.cloud.grpc.speech.v1beta1 import cloud_speech_pb2 - @staticmethod - def _make_result(alternatives): - from google.cloud.grpc.speech.v1beta1 import cloud_speech_pb2 + return cloud_speech_pb2.SpeechRecognitionResult( + alternatives=[ + cloud_speech_pb2.SpeechRecognitionAlternative( + transcript=alternative['transcript'], + confidence=alternative['confidence'], + ) for alternative in alternatives + ], + ) - return cloud_speech_pb2.SpeechRecognitionResult( - alternatives=[ - cloud_speech_pb2.SpeechRecognitionAlternative( - transcript=alternative['transcript'], - confidence=alternative['confidence'], - ) for alternative in alternatives - ], - ) - def _make_sync_response(self, *results): - from google.cloud.grpc.speech.v1beta1 import cloud_speech_pb2 +def _make_streaming_result(alternatives=(), is_final=True): + from google.cloud.grpc.speech.v1beta1 import cloud_speech_pb2 - response = cloud_speech_pb2.SyncRecognizeResponse( - results=results, - ) + return cloud_speech_pb2.StreamingRecognitionResult( + alternatives=[ + cloud_speech_pb2.SpeechRecognitionAlternative( + transcript=alternative['transcript'], + confidence=alternative['confidence'], + ) for alternative in alternatives + ], + 
is_final=is_final, + ) - return response + +def _make_streaming_response(*results): + from google.cloud.grpc.speech.v1beta1 import cloud_speech_pb2 + + response = cloud_speech_pb2.StreamingRecognizeResponse( + results=results, + ) + return response + + +def _make_sync_response(*results): + from google.cloud.grpc.speech.v1beta1 import cloud_speech_pb2 + + response = cloud_speech_pb2.SyncRecognizeResponse( + results=results, + ) + return response + + +class TestClient(unittest.TestCase): + SAMPLE_RATE = 16000 + HINTS = ['hi'] + AUDIO_SOURCE_URI = 'gs://sample-bucket/sample-recording.flac' + AUDIO_CONTENT = '/9j/4QNURXhpZgAASUkq' def _getTargetClass(self): from google.cloud.speech.client import Client @@ -226,7 +249,7 @@ def make_channel(*args): return channel_obj def speech_api(channel=None): - return _MockGAPICSpeechAPI(response=self._make_sync_response(), + return _MockGAPICSpeechAPI(response=_make_sync_response(), channel=channel) host = 'foo.apis.invalid' @@ -261,6 +284,7 @@ def test_sync_recognize_with_gax(self): client.connection = _Connection() client.connection.credentials = creds client._speech_api = None + alternatives = [{ 'transcript': 'testing 1 2 3', 'confidence': 0.9224355, @@ -268,7 +292,7 @@ def test_sync_recognize_with_gax(self): 'transcript': 'testing 4 5 6', 'confidence': 0.0123456, }] - result = self._make_result(alternatives) + result = _make_result(alternatives) channel_args = [] channel_obj = object() @@ -279,7 +303,7 @@ def make_channel(*args): def speech_api(channel=None): return _MockGAPICSpeechAPI( - response=self._make_sync_response(result), + response=_make_sync_response(result), channel=channel) host = 'foo.apis.invalid' @@ -395,6 +419,214 @@ def speech_api(channel=None): self.assertFalse(operation.complete) self.assertIsNone(operation.response) + def test_streaming_depends_on_gax(self): + from google.cloud._testing import _Monkey + + credentials = _Credentials() + client = self._makeOne(credentials=credentials, use_gax=False) + 
client.connection = _Connection() + + with self.assertRaises(EnvironmentError): + list(client.streaming_recognize({})) + + def test_streaming_closed_stream(self): + from io import BytesIO + + from google.cloud._testing import _Monkey + + from google.cloud.speech import _gax + from google.cloud.speech.encoding import Encoding + + stream = BytesIO(b'Some audio data...') + credentials = _Credentials() + client = self._makeOne(credentials=credentials) + client.connection = _Connection() + client.connection.credentials = credentials + + channel_args = [] + channel_obj = object() + + def make_channel(*args): + channel_args.append(args) + return channel_obj + + def speech_api(channel=None): + return _MockGAPICSpeechAPI(channel=channel) + + host = 'foo.apis.invalid' + speech_api.SERVICE_ADDRESS = host + + stream.close() + + sample = client.sample(content=stream, + encoding=Encoding.LINEAR16, + sample_rate=self.SAMPLE_RATE) + + with _Monkey(_gax, SpeechApi=speech_api, + make_secure_channel=make_channel): + client._speech_api = _gax.GAPICSpeechAPI(client) + + with self.assertRaises(ValueError): + list(client.streaming_recognize(sample)) + + def test_stream_recognize_interim_results(self): + from io import BytesIO + + from google.cloud._testing import _Monkey + + from google.cloud.speech import _gax + from google.cloud.speech.encoding import Encoding + + stream = BytesIO(b'Some audio data...') + credentials = _Credentials() + client = self._makeOne(credentials=credentials) + client.connection = _Connection() + client.connection.credentials = credentials + + alternatives = [{ + 'transcript': 'testing streaming 1 2 3', + 'confidence': 0.9224355, + }, { + 'transcript': 'testing streaming 4 5 6', + 'confidence': 0.0123456, + }] + first_response = _make_streaming_response( + _make_streaming_result([], is_final=False)) + second_response = _make_streaming_response( + _make_streaming_result(alternatives, is_final=False)) + last_response = _make_streaming_response( + 
_make_streaming_result(alternatives, is_final=True)) + responses = [first_response, second_response, last_response] + + channel_args = [] + channel_obj = object() + + def make_channel(*args): + channel_args.append(args) + return channel_obj + + def speech_api(channel=None): + return _MockGAPICSpeechAPI(channel=channel, response=responses) + + host = 'foo.apis.invalid' + speech_api.SERVICE_ADDRESS = host + + with _Monkey(_gax, SpeechApi=speech_api, + make_secure_channel=make_channel): + client._speech_api = _gax.GAPICSpeechAPI(client) + + sample = client.sample(content=stream, + encoding=Encoding.LINEAR16, + sample_rate=self.SAMPLE_RATE) + + results = list(client.streaming_recognize(sample, + interim_results=True)) + self.assertEqual(results[0], []) + self.assertEqual(results[1][0].transcript, + alternatives[0]['transcript']) + self.assertEqual(results[1][0].confidence, + alternatives[0]['confidence']) + self.assertEqual(results[1][1].transcript, + alternatives[1]['transcript']) + self.assertEqual(results[1][1].confidence, + alternatives[1]['confidence']) + + def test_stream_recognize(self): + from io import BytesIO + + from google.cloud._testing import _Monkey + + from google.cloud.speech import _gax + from google.cloud.speech.encoding import Encoding + + stream = BytesIO(b'Some audio data...') + credentials = _Credentials() + client = self._makeOne(credentials=credentials) + client.connection = _Connection() + client.connection.credentials = credentials + + alternatives = [{ + 'transcript': 'testing streaming 1 2 3', + 'confidence': 0.9224355, + }, { + 'transcript': 'testing streaming 4 5 6', + 'confidence': 0.0123456, + }] + + first_response = _make_streaming_response( + _make_streaming_result(alternatives=alternatives, is_final=False)) + last_response = _make_streaming_response( + _make_streaming_result(alternatives=alternatives, is_final=True)) + responses = [first_response, last_response] + + channel_args = [] + channel_obj = object() + + def 
make_channel(*args): + channel_args.append(args) + return channel_obj + + def speech_api(channel=None): + return _MockGAPICSpeechAPI(channel=channel, response=responses) + + host = 'foo.apis.invalid' + speech_api.SERVICE_ADDRESS = host + + with _Monkey(_gax, SpeechApi=speech_api, + make_secure_channel=make_channel): + client._speech_api = _gax.GAPICSpeechAPI(client) + + sample = client.sample(content=stream, + encoding=Encoding.LINEAR16, + sample_rate=self.SAMPLE_RATE) + + results = list(client.streaming_recognize(sample)) + self.assertEqual(len(results), 1) + self.assertEqual(results[0][0].transcript, + alternatives[0]['transcript']) + self.assertEqual(results[0][0].confidence, + alternatives[0]['confidence']) + + def test_stream_recognize_no_results(self): + from io import BytesIO + + from google.cloud._testing import _Monkey + + from google.cloud.speech import _gax + from google.cloud.speech.encoding import Encoding + + stream = BytesIO(b'Some audio data...') + credentials = _Credentials() + client = self._makeOne(credentials=credentials) + client.connection = _Connection() + client.connection.credentials = credentials + + responses = [_make_streaming_response()] + + channel_args = [] + channel_obj = object() + + def make_channel(*args): + channel_args.append(args) + return channel_obj + + def speech_api(channel=None): + return _MockGAPICSpeechAPI(channel=channel, response=responses) + + host = 'foo.apis.invalid' + speech_api.SERVICE_ADDRESS = host + + with _Monkey(_gax, SpeechApi=speech_api, + make_secure_channel=make_channel): + client._speech_api = _gax.GAPICSpeechAPI(client) + + sample = client.sample(content=stream, + encoding=Encoding.LINEAR16, + sample_rate=self.SAMPLE_RATE) + + results = list(client.streaming_recognize(sample)) + self.assertEqual(results, []) + def test_speech_api_with_gax(self): from google.cloud._testing import _Monkey @@ -469,8 +701,14 @@ def async_recognize(self, config, audio): def sync_recognize(self, config, audio): self.config = 
config self.audio = audio + return self._response + def streaming_recognize(self, requests): + self._requests = requests + for response in self._response: + yield response + class _Credentials(object):