Add speech streaming recognition.

googleapis · Nov 4, 2016 · 494b689 · 494b689
1 parent 3de5bc0
commit 494b689
Show file tree

Hide file tree

Showing 6 changed files with 478 additions and 18 deletions.
diff --git a/docs/speech-usage.rst b/docs/speech-usage.rst
@@ -151,5 +151,84 @@ words to the vocabulary of the recognizer.
     transcript: Hello, this is a test
     confidence: 0.81
 
+
+Streaming Recognition
+---------------------
+
+The :meth:`~google.cloud.speech.Client.streaming_recognize` method converts
+speech data to possible text alternatives on the fly.
+
+.. note::
+    Streaming recognition requests are limited to 1 minute of audio.
+
+    See: https://cloud.google.com/speech/limits#content
+
+.. code-block:: python
+
+    >>> from google.cloud import speech
+    >>> client = speech.Client()
+    >>> with open('./hello.wav', 'rb') as stream:
+    ...     sample = client.sample(content=stream,
+    ...                            encoding=speech.Encoding.LINEAR16,
+    ...                            sample_rate=16000)
+    ...     results = list(client.streaming_recognize(sample))
+    >>> print(results[0][0].transcript)
+    hello
+    >>> print(results[0][0].confidence)
+    0.973458576
+
+
+By default the recognizer will perform continuous recognition
+(continuing to process audio even if the user pauses speaking) until the client
+closes the output stream or the maximum time limit has been reached.
+
+If you only want to recognize a single utterance you can set
+``single_utterance`` to ``True`` and only one result will be returned.
+
+See: `Single Utterance`_
+
+.. code-block:: python
+
+    >>> with open('./hello_pause_goodbye.wav', 'rb') as stream:
+    ...     sample = client.sample(content=stream,
+    ...                            encoding=speech.Encoding.LINEAR16,
+    ...                            sample_rate=16000)
+    ...     response = client.streaming_recognize(sample,
+    ...                                           single_utterance=True)
+    ...     results = list(response)
+    >>> print(results[0][0].transcript)
+    hello
+    >>> print(results[0][0].confidence)
+    0.96523453546
+
+
+If ``interim_results`` is set to ``True``, interim results
+(tentative hypotheses) may be returned as they become available.
+
+.. code-block:: python
+
+    >>> from google.cloud import speech
+    >>> client = speech.Client()
+    >>> with open('./hello.wav', 'rb') as stream:
+    ...     sample = client.sample(content=stream,
+    ...                            encoding=speech.Encoding.LINEAR16,
+    ...                            sample_rate=16000)
+    ...     for response in client.streaming_recognize(sample,
+    ...                                                interim_results=True):
+    ...         print('=' * 20)
+    ...         print(response[0].transcript)
+    ...         print(response[0].confidence)
+    ====================
+    he
+    None
+    ====================
+    hell
+    None
+    ====================
+    hello
+    0.973458576
+
+
+.. _Single Utterance: https://cloud.google.com/speech/reference/rpc/google.cloud.speech.v1beta1#streamingrecognitionconfig
 .. _sync_recognize: https://cloud.google.com/speech/reference/rest/v1beta1/speech/syncrecognize
 .. _Speech Asynchronous Recognize: https://cloud.google.com/speech/reference/rest/v1beta1/speech/asyncrecognize
diff --git a/speech/google/cloud/speech/_gax.py b/speech/google/cloud/speech/_gax.py
@@ -101,6 +101,95 @@ def async_recognize(self, sample, language_code=None,
 
         return Operation.from_pb(response, self)
 
+    def streaming_recognize(self, sample, language_code=None,
+                            max_alternatives=None, profanity_filter=None,
+                            speech_context=None, single_utterance=False,
+                            interim_results=False):
+        """Streaming speech recognition.
+
+        .. note::
+
+            Streaming recognition requests are limited to 1 minute of audio.
+            See: https://cloud.google.com/speech/limits#content
+
+        Builds the streaming requests for ``sample`` and hands them to the
+        underlying API object. The API's responses are returned as-is; no
+        conversion of the results is performed in this method.
+
+        :type sample: :class:`~google.cloud.speech.sample.Sample`
+        :param sample: Instance of ``Sample`` containing audio information.
+                       Its ``content`` must expose a ``closed`` attribute
+                       (i.e. be a file-like object).
+
+        :type language_code: str
+        :param language_code: (Optional) The language of the supplied audio as
+                              BCP-47 language tag. Example: ``'en-GB'``.
+                              If omitted, defaults to ``'en-US'``.
+
+        :type max_alternatives: int
+        :param max_alternatives: (Optional) Maximum number of recognition
+                                 hypotheses to be returned. The server may
+                                 return fewer than maxAlternatives.
+                                 Valid values are 0-30. A value of 0 or 1
+                                 will return a maximum of 1. Defaults to 1.
+
+        :type profanity_filter: bool
+        :param profanity_filter: If True, the server will attempt to filter
+                                 out profanities, replacing all but the
+                                 initial character in each filtered word with
+                                 asterisks, e.g. ``'f***'``. If False or
+                                 omitted, profanities won't be filtered out.
+
+        :type speech_context: list
+        :param speech_context: A list of strings (max 50) containing words and
+                               phrases "hints" so that the speech recognition
+                               is more likely to recognize them. This can be
+                               used to improve the accuracy for specific words
+                               and phrases. This can also be used to add new
+                               words to the vocabulary of the recognizer.
+
+        :type single_utterance: bool
+        :param single_utterance: (Optional) If false or omitted, the recognizer
+                                 will perform continuous recognition
+                                 (continuing to process audio even if the user
+                                 pauses speaking) until the client closes the
+                                 output stream (gRPC API) or when the maximum
+                                 time limit has been reached. Multiple
+                                 SpeechRecognitionResults with the is_final
+                                 flag set to true may be returned.
+                                 If true, the recognizer will detect a single
+                                 spoken utterance. When it detects that the
+                                 user has paused or stopped speaking, it will
+                                 return an END_OF_UTTERANCE event and cease
+                                 recognition. It will return no more than one
+                                 SpeechRecognitionResult with the is_final flag
+                                 set to true.
+
+        :type interim_results: bool
+        :param interim_results: (Optional) If true, interim results (tentative
+                                hypotheses) may be returned as they become
+                                available (these interim results are indicated
+                                with the is_final=false flag). If false or
+                                omitted, only is_final=true result(s) are
+                                returned.
+
+        :raises: :class:`ValueError` if the sample's content stream has
+            already been closed.
+
+        :rtype: :class:`~google.cloud.grpc.speech.v1beta1\
+                       .cloud_speech_pb2.StreamingRecognizeResponse`
+        :returns: The object returned by the API's ``streaming_recognize``
+                  call, unmodified; presumably an iterable of
+                  ``StreamingRecognizeResponse`` instances (confirm against
+                  the GAPIC client).
+        """
+        # Refuse to start a request once the audio stream has been closed.
+        if sample.content.closed:
+            raise ValueError('Stream is closed.')
+
+        # ``_stream_requests`` is not shown in this hunk; presumably it builds
+        # the request iterable (config first, then audio) from these options.
+        requests = _stream_requests(sample, language_code=language_code,
+                                    max_alternatives=max_alternatives,
+                                    profanity_filter=profanity_filter,
+                                    speech_context=speech_context,
+                                    single_utterance=single_utterance,
+                                    interim_results=interim_results)
+        api = self._gapic_api
+        responses = api.streaming_recognize(requests)
+        return responses
+
     def sync_recognize(self, sample, language_code=None, max_alternatives=None,
                        profanity_filter=None, speech_context=None):
         """Synchronous Speech Recognition.

diff --git a/speech/google/cloud/speech/client.py b/speech/google/cloud/speech/client.py
@@ -159,6 +159,91 @@ def speech_api(self):
                 self._speech_api = _JSONSpeechAPI(self)
         return self._speech_api
 
+    def streaming_recognize(self, sample, language_code=None,
+                            max_alternatives=None, profanity_filter=None,
+                            speech_context=None, single_utterance=False,
+                            interim_results=False):
+        """Streaming speech recognition.
+
+        .. note::
+
+            Streaming recognition requests are limited to 1 minute of audio.
+            See: https://cloud.google.com/speech/limits#content
+
+        Yields: list of :class:`~google.cloud.speech.alternative.Alternatives`
+                containing results and metadata from the streaming request.
+
+        :type sample: :class:`~google.cloud.speech.sample.Sample`
+        :param sample: Instance of ``Sample`` containing audio information.
+
+        :type language_code: str
+        :param language_code: (Optional) The language of the supplied audio as
+                              BCP-47 language tag. Example: ``'en-GB'``.
+                              If omitted, defaults to ``'en-US'``.
+
+        :type max_alternatives: int
+        :param max_alternatives: (Optional) Maximum number of recognition
+                                 hypotheses to be returned. The server may
+                                 return fewer than maxAlternatives.
+                                 Valid values are 0-30. A value of 0 or 1
+                                 will return a maximum of 1. Defaults to 1
+
+        :type profanity_filter: bool
+        :param profanity_filter: If True, the server will attempt to filter
+                                 out profanities, replacing all but the
+                                 initial character in each filtered word with
+                                 asterisks, e.g. ``'f***'``. If False or
+                                 omitted, profanities won't be filtered out.
+
+        :type speech_context: list
+        :param speech_context: A list of strings (max 50) containing words and
+                               phrases "hints" so that the speech recognition
+                               is more likely to recognize them. This can be
+                               used to improve the accuracy for specific words
+                               and phrases. This can also be used to add new
+                               words to the vocabulary of the recognizer.
+
+        :type single_utterance: bool
+        :param single_utterance: (Optional) If false or omitted, the recognizer
+                                 will perform continuous recognition
+                                 (continuing to process audio even if the user
+                                 pauses speaking) until the client closes the
+                                 output stream (gRPC API) or when the maximum
+                                 time limit has been reached. Multiple
+                                 SpeechRecognitionResults with the is_final
+                                 flag set to true may be returned.
+                                 If true, the recognizer will detect a single
+                                 spoken utterance. When it detects that the
+                                 user has paused or stopped speaking, it will
+                                 return an END_OF_UTTERANCE event and cease
+                                 recognition. It will return no more than one
+                                 SpeechRecognitionResult with the is_final flag
+                                 set to true.
+
+        :type interim_results: bool
+        :param interim_results: (Optional) If true, interim results (tentative
+                                hypotheses) may be returned as they become
+                                available (these interim results are indicated
+                                with the is_final=false flag). If false or
+                                omitted, only is_final=true result(s) are
+                                returned.
+        """
+        if not self._use_gax:
+            raise EnvironmentError('gRPC is required to use this API.')
+
+        responses = self.speech_api.streaming_recognize(sample, language_code,
+                                                        max_alternatives,
+                                                        profanity_filter,
+                                                        speech_context,
+                                                        single_utterance,
+                                                        interim_results)
+        for response in responses:
+            results = getattr(response, 'results', [])
+            if results or interim_results:
+                for result in results:
+                    yield [Alternative.from_pb(alternative)
+                           for alternative in result.alternatives]
+
     def sync_recognize(self, sample, language_code=None,
                        max_alternatives=None, profanity_filter=None,
                        speech_context=None):

diff --git a/speech/google/cloud/speech/sample.py b/speech/google/cloud/speech/sample.py
@@ -47,8 +47,8 @@ class Sample(object):
     default_encoding = Encoding.FLAC
     default_sample_rate = 16000
 
-    def __init__(self, content=None, source_uri=None,
-                 encoding=None, sample_rate=None):
+    def __init__(self, content=None, source_uri=None, encoding=None,
+                 sample_rate=None):
 
         no_source = content is None and source_uri is None
         both_source = content is not None and source_uri is not None