From 5f926299a606cf8f4a092de4b43106e0c9fa0707 Mon Sep 17 00:00:00 2001
From: Thomas Schultz <daspecster@gmail.com>
Date: Thu, 27 Oct 2016 13:41:52 -0400
Subject: [PATCH] Add speech streaming recognition.

---
 .../google/cloud/speech/_gax.py               |  91 ++++++
 .../google/cloud/speech/client.py             |  86 ++++++
 .../unit_tests/test__gax.py                   |  36 +++
 .../unit_tests/test_client.py                 | 288 ++++++++++++++++--
 4 files changed, 476 insertions(+), 25 deletions(-)

diff --git a/packages/google-cloud-python-speech/google/cloud/speech/_gax.py b/packages/google-cloud-python-speech/google/cloud/speech/_gax.py
index c24f8acd365b..3cad482dd53a 100644
--- a/packages/google-cloud-python-speech/google/cloud/speech/_gax.py
+++ b/packages/google-cloud-python-speech/google/cloud/speech/_gax.py
@@ -106,6 +106,97 @@ def async_recognize(self, sample, language_code=None,
 
         return Operation.from_pb(response, self)
 
+    def streaming_recognize(self, sample, language_code=None,
+                            max_alternatives=None, profanity_filter=None,
+                            speech_context=None, single_utterance=False,
+                            interim_results=False):
+        """Streaming speech recognition.
+
+        .. note::
+
+            Streaming recognition requests are limited to 1 minute of audio.
+            See: https://cloud.google.com/speech/limits#content
+
+        Returns the responses of the GAPIC ``streaming_recognize`` call
+        as-is; ``sample.content`` must be an open file-like object.
+
+        :type sample: :class:`~google.cloud.speech.sample.Sample`
+        :param sample: Instance of ``Sample`` containing audio information.
+
+        :type language_code: str
+        :param language_code: (Optional) The language of the supplied audio as
+                              BCP-47 language tag. Example: ``'en-GB'``.
+                              If omitted, defaults to ``'en-US'``.
+
+        :type max_alternatives: int
+        :param max_alternatives: (Optional) Maximum number of recognition
+                                 hypotheses to be returned. The server may
+                                 return fewer than maxAlternatives.
+                                 Valid values are 0-30. A value of 0 or 1
+                                 will return a maximum of 1. Defaults to 1.
+
+        :type profanity_filter: bool
+        :param profanity_filter: If True, the server will attempt to filter
+                                 out profanities, replacing all but the
+                                 initial character in each filtered word with
+                                 asterisks, e.g. ``'f***'``. If False or
+                                 omitted, profanities won't be filtered out.
+
+        :type speech_context: list
+        :param speech_context: A list of strings (max 50) containing words and
+                               phrases "hints" so that the speech recognition
+                               is more likely to recognize them. This can be
+                               used to improve the accuracy for specific words
+                               and phrases. This can also be used to add new
+                               words to the vocabulary of the recognizer.
+
+        :type single_utterance: bool
+        :param single_utterance: (Optional) If false or omitted, the recognizer
+                                 will perform continuous recognition
+                                 (continuing to process audio even if the user
+                                 pauses speaking) until the client closes the
+                                 output stream (gRPC API) or when the maximum
+                                 time limit has been reached. Multiple
+                                 SpeechRecognitionResults with the is_final
+                                 flag set to true may be returned.
+                                 If true, the recognizer will detect a single
+                                 spoken utterance. When it detects that the
+                                 user has paused or stopped speaking, it will
+                                 return an END_OF_UTTERANCE event and cease
+                                 recognition. It will return no more than one
+                                 SpeechRecognitionResult with the is_final flag
+                                 set to true.
+
+        :type interim_results: bool
+        :param interim_results: (Optional) If true, interim results (tentative
+                                hypotheses) may be returned as they become
+                                available (these interim results are indicated
+                                with the ``is_final=False`` flag). If false or
+                                omitted, only ``is_final=True`` result(s) are
+                                returned.
+
+        :raises: :class:`ValueError` if sample.content is not a file-like
+                 object. :class:`ValueError` if stream has closed.
+
+        :rtype: :class:`~google.cloud.grpc.speech.v1beta1\
+                       .cloud_speech_pb2.StreamingRecognizeResponse`
+        :returns: ``StreamingRecognizeResponse`` instances.
+        """
+        if getattr(sample.content, 'closed', None) is None:  # e.g. bytes
+            raise ValueError('Please use file-like object for data stream.')
+        if sample.content.closed:
+            raise ValueError('Stream is closed.')
+
+        requests = _stream_requests(sample, language_code=language_code,
+                                    max_alternatives=max_alternatives,
+                                    profanity_filter=profanity_filter,
+                                    speech_context=speech_context,
+                                    single_utterance=single_utterance,
+                                    interim_results=interim_results)
+        api = self._gapic_api
+        responses = api.streaming_recognize(requests)
+        return responses  # returned unconsumed; iteration is left to caller
+
     def sync_recognize(self, sample, language_code=None, max_alternatives=None,
                        profanity_filter=None, speech_context=None):
         """Synchronous Speech Recognition.
diff --git a/packages/google-cloud-python-speech/google/cloud/speech/client.py b/packages/google-cloud-python-speech/google/cloud/speech/client.py
index a321d92cce41..94716086b3a2 100644
--- a/packages/google-cloud-python-speech/google/cloud/speech/client.py
+++ b/packages/google-cloud-python-speech/google/cloud/speech/client.py
@@ -159,6 +159,92 @@ def speech_api(self):
                 self._speech_api = _JSONSpeechAPI(self)
         return self._speech_api
 
+    def streaming_recognize(self, sample, language_code=None,
+                            max_alternatives=None, profanity_filter=None,
+                            speech_context=None, single_utterance=False,
+                            interim_results=False):
+        """Streaming speech recognition.
+
+        .. note::
+
+            Streaming recognition requests are limited to 1 minute of audio.
+            See: https://cloud.google.com/speech/limits#content
+
+        Yields: list of :class:`~google.cloud.speech.alternative.Alternative`
+                instances, one list per final (or requested interim) result.
+
+        :type sample: :class:`~google.cloud.speech.sample.Sample`
+        :param sample: Instance of ``Sample`` containing audio information.
+
+        :type language_code: str
+        :param language_code: (Optional) The language of the supplied audio as
+                              BCP-47 language tag. Example: ``'en-GB'``.
+                              If omitted, defaults to ``'en-US'``.
+
+        :type max_alternatives: int
+        :param max_alternatives: (Optional) Maximum number of recognition
+                                 hypotheses to be returned. The server may
+                                 return fewer than maxAlternatives.
+                                 Valid values are 0-30. A value of 0 or 1
+                                 will return a maximum of 1. Defaults to 1.
+
+        :type profanity_filter: bool
+        :param profanity_filter: If True, the server will attempt to filter
+                                 out profanities, replacing all but the
+                                 initial character in each filtered word with
+                                 asterisks, e.g. ``'f***'``. If False or
+                                 omitted, profanities won't be filtered out.
+
+        :type speech_context: list
+        :param speech_context: A list of strings (max 50) containing words and
+                               phrases "hints" so that the speech recognition
+                               is more likely to recognize them. This can be
+                               used to improve the accuracy for specific words
+                               and phrases. This can also be used to add new
+                               words to the vocabulary of the recognizer.
+
+        :type single_utterance: bool
+        :param single_utterance: (Optional) If false or omitted, the recognizer
+                                 will perform continuous recognition
+                                 (continuing to process audio even if the user
+                                 pauses speaking) until the client closes the
+                                 output stream (gRPC API) or when the maximum
+                                 time limit has been reached. Multiple
+                                 SpeechRecognitionResults with the is_final
+                                 flag set to true may be returned.
+                                 If true, the recognizer will detect a single
+                                 spoken utterance. When it detects that the
+                                 user has paused or stopped speaking, it will
+                                 return an END_OF_UTTERANCE event and cease
+                                 recognition. It will return no more than one
+                                 SpeechRecognitionResult with the is_final flag
+                                 set to true.
+
+        :type interim_results: bool
+        :param interim_results: (Optional) If true, interim results (tentative
+                                hypotheses) may be returned as they become
+                                available (these interim results are indicated
+                                with the ``is_final=False`` flag). If false or
+                                omitted, only ``is_final=True`` result(s) are
+                                returned.
+
+        :raises: EnvironmentError (on first iteration) if gRPC is not in use.
+        """
+        if not self._use_gax:
+            raise EnvironmentError('gRPC is required to use this API.')
+
+        responses = self.speech_api.streaming_recognize(sample, language_code,
+                                                        max_alternatives,
+                                                        profanity_filter,
+                                                        speech_context,
+                                                        single_utterance,
+                                                        interim_results)
+        for response in responses:
+            for result in response.results:
+                if result.is_final or interim_results:  # drop unwanted interims
+                    yield [Alternative.from_pb(alternative)
+                           for alternative in result.alternatives]
+
     def sync_recognize(self, sample, language_code=None,
                        max_alternatives=None, profanity_filter=None,
                        speech_context=None):
diff --git a/packages/google-cloud-python-speech/unit_tests/test__gax.py b/packages/google-cloud-python-speech/unit_tests/test__gax.py
index 0bc600ea86b6..31af01178613 100644
--- a/packages/google-cloud-python-speech/unit_tests/test__gax.py
+++ b/packages/google-cloud-python-speech/unit_tests/test__gax.py
@@ -15,6 +15,35 @@
 import unittest
 
 
+class TestGAPICSpeechAPI(unittest.TestCase):
+    SAMPLE_RATE = 16000
+
+    def _getTargetClass(self):
+        from google.cloud.speech._gax import GAPICSpeechAPI
+
+        return GAPICSpeechAPI
+
+    def _makeOne(self, *args, **kw):
+        return self._getTargetClass()(*args, **kw)
+
+    def test_use_bytes_instead_of_file_like_object(self):
+        from google.cloud import speech
+        from google.cloud.speech.sample import Sample
+
+        credentials = {}
+        client = speech.Client(credentials=credentials, use_gax=True)
+        client.connection = _Connection()
+        client.connection.credentials = credentials
+
+        sample = Sample(content=b'', encoding=speech.Encoding.FLAC,
+                        sample_rate=self.SAMPLE_RATE)
+
+        api = self._makeOne(client)
+        with self.assertRaises(ValueError):  # bytes lacks ``closed``
+            api.streaming_recognize(sample)
+        self.assertEqual(client.connection._requested, [])  # nothing recorded
+
+
 class TestSpeechGAXMakeRequests(unittest.TestCase):
     SAMPLE_RATE = 16000
     HINTS = ['hi']
@@ -137,3 +166,10 @@ def test_stream_requests(self):
         self.assertEqual(streaming_request.audio_content, self.AUDIO_CONTENT)
         self.assertIsInstance(config_request.streaming_config,
                               StreamingRecognitionConfig)
+
+
+class _Connection(object):  # minimal stand-in for the client connection
+
+    def __init__(self, *responses):
+        self._responses = responses  # canned responses, stored as given
+        self._requested = []  # starts empty; a test asserts it stays so
diff --git a/packages/google-cloud-python-speech/unit_tests/test_client.py b/packages/google-cloud-python-speech/unit_tests/test_client.py
index 049a018bc0fd..b108fcaab579 100644
--- a/packages/google-cloud-python-speech/unit_tests/test_client.py
+++ b/packages/google-cloud-python-speech/unit_tests/test_client.py
@@ -15,33 +15,56 @@
 import unittest
 
 
-class TestClient(unittest.TestCase):
-    SAMPLE_RATE = 16000
-    HINTS = ['hi']
-    AUDIO_SOURCE_URI = 'gs://sample-bucket/sample-recording.flac'
-    AUDIO_CONTENT = '/9j/4QNURXhpZgAASUkq'
+def _make_result(alternatives=()):
+    from google.cloud.grpc.speech.v1beta1 import cloud_speech_pb2
 
-    @staticmethod
-    def _make_result(alternatives):
-        from google.cloud.grpc.speech.v1beta1 import cloud_speech_pb2
+    return cloud_speech_pb2.SpeechRecognitionResult(
+        alternatives=[
+            cloud_speech_pb2.SpeechRecognitionAlternative(
+                transcript=alternative['transcript'],
+                confidence=alternative['confidence'],
+            ) for alternative in alternatives
+        ],
+    )
 
-        return cloud_speech_pb2.SpeechRecognitionResult(
-            alternatives=[
-                cloud_speech_pb2.SpeechRecognitionAlternative(
-                    transcript=alternative['transcript'],
-                    confidence=alternative['confidence'],
-                ) for alternative in alternatives
-            ],
-        )
 
-    def _make_sync_response(self, *results):
-        from google.cloud.grpc.speech.v1beta1 import cloud_speech_pb2
+def _make_streaming_result(alternatives=(), is_final=True):
+    from google.cloud.grpc.speech.v1beta1 import cloud_speech_pb2
 
-        response = cloud_speech_pb2.SyncRecognizeResponse(
-            results=results,
-        )
+    return cloud_speech_pb2.StreamingRecognitionResult(
+        alternatives=[
+            cloud_speech_pb2.SpeechRecognitionAlternative(
+                transcript=alternative['transcript'],
+                confidence=alternative['confidence'],
+            ) for alternative in alternatives
+        ],
+        is_final=is_final,
+    )
 
-        return response
+
+def _make_streaming_response(*results):  # test helper: wrap results in pb
+    from google.cloud.grpc.speech.v1beta1 import cloud_speech_pb2
+
+    response = cloud_speech_pb2.StreamingRecognizeResponse(
+        results=results,
+    )
+    return response
+
+
+def _make_sync_response(*results):  # test helper: wrap results in pb
+    from google.cloud.grpc.speech.v1beta1 import cloud_speech_pb2
+
+    response = cloud_speech_pb2.SyncRecognizeResponse(
+        results=results,
+    )
+    return response
+
+
+class TestClient(unittest.TestCase):
+    SAMPLE_RATE = 16000
+    HINTS = ['hi']
+    AUDIO_SOURCE_URI = 'gs://sample-bucket/sample-recording.flac'
+    AUDIO_CONTENT = '/9j/4QNURXhpZgAASUkq'
 
     def _getTargetClass(self):
         from google.cloud.speech.client import Client
@@ -226,7 +249,7 @@ def make_channel(*args):
             return channel_obj
 
         def speech_api(channel=None):
-            return _MockGAPICSpeechAPI(response=self._make_sync_response(),
+            return _MockGAPICSpeechAPI(response=_make_sync_response(),
                                        channel=channel)
 
         host = 'foo.apis.invalid'
@@ -261,6 +284,7 @@ def test_sync_recognize_with_gax(self):
         client.connection = _Connection()
         client.connection.credentials = creds
         client._speech_api = None
+
         alternatives = [{
             'transcript': 'testing 1 2 3',
             'confidence': 0.9224355,
@@ -268,7 +292,7 @@ def test_sync_recognize_with_gax(self):
             'transcript': 'testing 4 5 6',
             'confidence': 0.0123456,
         }]
-        result = self._make_result(alternatives)
+        result = _make_result(alternatives)
 
         channel_args = []
         channel_obj = object()
@@ -279,7 +303,7 @@ def make_channel(*args):
 
         def speech_api(channel=None):
             return _MockGAPICSpeechAPI(
-                response=self._make_sync_response(result),
+                response=_make_sync_response(result),
                 channel=channel)
 
         host = 'foo.apis.invalid'
@@ -395,6 +419,214 @@ def speech_api(channel=None):
         self.assertFalse(operation.complete)
         self.assertIsNone(operation.response)
 
+    def test_streaming_depends_on_gax(self):
+        from google.cloud._testing import _Monkey  # NOTE(review): unused
+
+        credentials = _Credentials()
+        client = self._makeOne(credentials=credentials, use_gax=False)
+        client.connection = _Connection()
+
+        with self.assertRaises(EnvironmentError):
+            list(client.streaming_recognize({}))  # generator: must iterate
+
+    def test_streaming_closed_stream(self):
+        from io import BytesIO
+
+        from google.cloud._testing import _Monkey
+
+        from google.cloud.speech import _gax
+        from google.cloud.speech.encoding import Encoding
+
+        stream = BytesIO(b'Some audio data...')
+        credentials = _Credentials()
+        client = self._makeOne(credentials=credentials)
+        client.connection = _Connection()
+        client.connection.credentials = credentials
+
+        channel_args = []
+        channel_obj = object()
+
+        def make_channel(*args):
+            channel_args.append(args)
+            return channel_obj
+
+        def speech_api(channel=None):
+            return _MockGAPICSpeechAPI(channel=channel)
+
+        host = 'foo.apis.invalid'
+        speech_api.SERVICE_ADDRESS = host
+
+        stream.close()  # closed before the sample is built and sent
+
+        sample = client.sample(content=stream,
+                               encoding=Encoding.LINEAR16,
+                               sample_rate=self.SAMPLE_RATE)
+
+        with _Monkey(_gax, SpeechApi=speech_api,
+                     make_secure_channel=make_channel):
+            client._speech_api = _gax.GAPICSpeechAPI(client)
+
+        with self.assertRaises(ValueError):
+            list(client.streaming_recognize(sample))  # raised once iterated
+
+    def test_stream_recognize_interim_results(self):
+        from io import BytesIO
+
+        from google.cloud._testing import _Monkey
+
+        from google.cloud.speech import _gax
+        from google.cloud.speech.encoding import Encoding
+
+        stream = BytesIO(b'Some audio data...')
+        credentials = _Credentials()
+        client = self._makeOne(credentials=credentials)
+        client.connection = _Connection()
+        client.connection.credentials = credentials
+
+        alternatives = [{
+            'transcript': 'testing streaming 1 2 3',
+            'confidence': 0.9224355,
+        }, {
+            'transcript': 'testing streaming 4 5 6',
+            'confidence': 0.0123456,
+        }]
+        first_response = _make_streaming_response(
+            _make_streaming_result([], is_final=False))  # no alternatives
+        second_response = _make_streaming_response(
+            _make_streaming_result(alternatives, is_final=False))
+        last_response = _make_streaming_response(
+            _make_streaming_result(alternatives, is_final=True))
+        responses = [first_response, second_response, last_response]
+
+        channel_args = []
+        channel_obj = object()
+
+        def make_channel(*args):
+            channel_args.append(args)
+            return channel_obj
+
+        def speech_api(channel=None):
+            return _MockGAPICSpeechAPI(channel=channel, response=responses)
+
+        host = 'foo.apis.invalid'
+        speech_api.SERVICE_ADDRESS = host
+
+        with _Monkey(_gax, SpeechApi=speech_api,
+                     make_secure_channel=make_channel):
+            client._speech_api = _gax.GAPICSpeechAPI(client)
+
+        sample = client.sample(content=stream,
+                               encoding=Encoding.LINEAR16,
+                               sample_rate=self.SAMPLE_RATE)
+
+        results = list(client.streaming_recognize(sample,
+                                                  interim_results=True))
+        self.assertEqual(results[0], [])  # empty interim is still yielded
+        self.assertEqual(results[1][0].transcript,
+                         alternatives[0]['transcript'])
+        self.assertEqual(results[1][0].confidence,
+                         alternatives[0]['confidence'])
+        self.assertEqual(results[1][1].transcript,
+                         alternatives[1]['transcript'])
+        self.assertEqual(results[1][1].confidence,
+                         alternatives[1]['confidence'])
+
+    def test_stream_recognize(self):
+        from io import BytesIO
+
+        from google.cloud._testing import _Monkey
+
+        from google.cloud.speech import _gax
+        from google.cloud.speech.encoding import Encoding
+
+        stream = BytesIO(b'Some audio data...')
+        credentials = _Credentials()
+        client = self._makeOne(credentials=credentials)
+        client.connection = _Connection()
+        client.connection.credentials = credentials
+
+        alternatives = [{
+            'transcript': 'testing streaming 1 2 3',
+            'confidence': 0.9224355,
+        }, {
+            'transcript': 'testing streaming 4 5 6',
+            'confidence': 0.0123456,
+        }]
+
+        first_response = _make_streaming_response(
+            _make_streaming_result(alternatives=alternatives, is_final=False))
+        last_response = _make_streaming_response(
+            _make_streaming_result(alternatives=alternatives, is_final=True))
+        responses = [first_response, last_response]
+
+        channel_args = []
+        channel_obj = object()
+
+        def make_channel(*args):
+            channel_args.append(args)
+            return channel_obj
+
+        def speech_api(channel=None):
+            return _MockGAPICSpeechAPI(channel=channel, response=responses)
+
+        host = 'foo.apis.invalid'
+        speech_api.SERVICE_ADDRESS = host
+
+        with _Monkey(_gax, SpeechApi=speech_api,
+                     make_secure_channel=make_channel):
+            client._speech_api = _gax.GAPICSpeechAPI(client)
+
+        sample = client.sample(content=stream,
+                               encoding=Encoding.LINEAR16,
+                               sample_rate=self.SAMPLE_RATE)
+
+        results = list(client.streaming_recognize(sample))
+        self.assertEqual(len(results), 1)  # the interim result is dropped
+        self.assertEqual(results[0][0].transcript,
+                         alternatives[0]['transcript'])
+        self.assertEqual(results[0][0].confidence,
+                         alternatives[0]['confidence'])
+
+    def test_stream_recognize_no_results(self):
+        from io import BytesIO
+
+        from google.cloud._testing import _Monkey
+
+        from google.cloud.speech import _gax
+        from google.cloud.speech.encoding import Encoding
+
+        stream = BytesIO(b'Some audio data...')
+        credentials = _Credentials()
+        client = self._makeOne(credentials=credentials)
+        client.connection = _Connection()
+        client.connection.credentials = credentials
+
+        responses = [_make_streaming_response()]  # one response, no results
+
+        channel_args = []
+        channel_obj = object()
+
+        def make_channel(*args):
+            channel_args.append(args)
+            return channel_obj
+
+        def speech_api(channel=None):
+            return _MockGAPICSpeechAPI(channel=channel, response=responses)
+
+        host = 'foo.apis.invalid'
+        speech_api.SERVICE_ADDRESS = host
+
+        with _Monkey(_gax, SpeechApi=speech_api,
+                     make_secure_channel=make_channel):
+            client._speech_api = _gax.GAPICSpeechAPI(client)
+
+        sample = client.sample(content=stream,
+                               encoding=Encoding.LINEAR16,
+                               sample_rate=self.SAMPLE_RATE)
+
+        results = list(client.streaming_recognize(sample))
+        self.assertEqual(results, [])  # nothing to yield without results
+
     def test_speech_api_with_gax(self):
         from google.cloud._testing import _Monkey
 
@@ -469,8 +701,14 @@ def async_recognize(self, config, audio):
     def sync_recognize(self, config, audio):
         self.config = config
         self.audio = audio
+
         return self._response
 
+    def streaming_recognize(self, requests):  # mock: replays canned data
+        self._requests = requests  # stored only; not iterated here
+        for response in self._response:
+            yield response
+
 
 class _Credentials(object):