# ---------------------------------------------------------------------------
# From speech/google/cloud/speech/_gax.py
# (added after ``sync_recognize`` and before ``_make_streaming_request``).
# ---------------------------------------------------------------------------
def _stream_requests(sample, language_code=None, max_alternatives=None,
                     profanity_filter=None, speech_context=None,
                     single_utterance=None, interim_results=None):
    """Yield the request messages for one streaming recognition call.

    The first message is the configuration request built by
    ``_make_streaming_request``.  Every later message is a
    ``StreamingRecognizeRequest`` wrapping the result of one
    ``sample.content.read(sample.chunk_size)`` call; iteration stops at
    the first read that returns no data.

    :type sample: :class:`~google.cloud.speech.sample.Sample`
    :param sample: Instance of ``Sample`` containing audio information.

    :type language_code: str
    :param language_code: (Optional) The language of the supplied audio as
                          BCP-47 language tag. Example: ``'en-GB'``.
                          If omitted, defaults to ``'en-US'``.

    :type max_alternatives: int
    :param max_alternatives: (Optional) Maximum number of recognition
                             hypotheses to be returned. The server may
                             return fewer than maxAlternatives.
                             Valid values are 0-30. A value of 0 or 1
                             will return a maximum of 1. Defaults to 1

    :type profanity_filter: bool
    :param profanity_filter: (Optional) If True, the server will attempt to
                             filter out profanities, replacing all but the
                             initial character in each filtered word with
                             asterisks, e.g. ``'f***'``. If False or
                             omitted, profanities won't be filtered out.

    :type speech_context: list
    :param speech_context: (Optional) A list of strings (max 50) containing
                           words and phrases "hints" so that the speech
                           recognition is more likely to recognize them.
                           This can be used to improve the accuracy for
                           specific words and phrases. This can also be
                           used to add new words to the vocabulary of the
                           recognizer.

    :type single_utterance: bool
    :param single_utterance: (Optional) If false or omitted, the recognizer
                             will perform continuous recognition
                             (continuing to process audio even if the user
                             pauses speaking) until the client closes the
                             output stream (gRPC API) or when the maximum
                             time limit has been reached. Multiple
                             SpeechRecognitionResults with the is_final
                             flag set to true may be returned.

                             If true, the recognizer will detect a single
                             spoken utterance. When it detects that the
                             user has paused or stopped speaking, it will
                             return an END_OF_UTTERANCE event and cease
                             recognition. It will return no more than one
                             SpeechRecognitionResult with the is_final flag
                             set to true.

    :type interim_results: bool
    :param interim_results: (Optional) If true, interim results (tentative
                            hypotheses) may be returned as they become
                            available (these interim results are indicated
                            with the is_final=false flag). If false or
                            omitted, only is_final=true result(s) are
                            returned.
    """
    # The config request MUST go first and not contain any audio data.
    yield _make_streaming_request(
        sample, language_code=language_code,
        max_alternatives=max_alternatives,
        profanity_filter=profanity_filter, speech_context=speech_context,
        single_utterance=single_utterance, interim_results=interim_results)

    # Prime the loop with one read, then keep reading until the stream
    # hands back an empty (falsy) chunk.
    chunk = sample.content.read(sample.chunk_size)
    while chunk:
        yield StreamingRecognizeRequest(audio_content=chunk)
        chunk = sample.content.read(sample.chunk_size)


# ---------------------------------------------------------------------------
# From speech/google/cloud/speech/sample.py
# (a property added between ``__init__`` and ``source_uri``; the enclosing
# class header is outside this view, so it is shown at module level here).
# ---------------------------------------------------------------------------
@property
def chunk_size(self):
    """Chunk size to send over gRPC. ~100ms

    :rtype: int
    :returns: One tenth of ``sample_rate``, truncated to an integer.
    """
    tenth_of_rate = self.sample_rate / 10.0
    return int(tenth_of_rate)


# ---------------------------------------------------------------------------
# From speech/unit_tests/test__gax.py
# (The same diff also renames ``TestSpeechGAX`` to
# ``TestSpeechGAXMakeRequests`` and turns its ``AUDIO_CONTENT`` into a bytes
# literal; that class is only partially visible and is not reproduced.)
# ---------------------------------------------------------------------------
import unittest


class TestSpeechGAXMakeRequestsStream(unittest.TestCase):
    """Tests for the ``_stream_requests`` request generator."""

    SAMPLE_RATE = 16000
    HINTS = ['hi']
    AUDIO_CONTENT = b'/9j/4QNURXhpZgAASUkq'

    def _callFUT(self, sample, language_code, max_alternatives,
                 profanity_filter, speech_context, single_utterance,
                 interim_results):
        """Import the function under test and call it."""
        from google.cloud.speech._gax import _stream_requests

        options = {
            'language_code': language_code,
            'max_alternatives': max_alternatives,
            'profanity_filter': profanity_filter,
            'speech_context': speech_context,
            'single_utterance': single_utterance,
            'interim_results': interim_results,
        }
        return _stream_requests(sample=sample, **options)

    def test_stream_requests(self):
        from io import BytesIO
        from google.cloud import speech
        from google.cloud.speech.sample import Sample
        from google.cloud.grpc.speech.v1beta1.cloud_speech_pb2 import (
            SpeechContext,
            StreamingRecognitionConfig,
            StreamingRecognizeRequest,
        )

        audio_stream = BytesIO(self.AUDIO_CONTENT)
        sample = Sample(content=audio_stream,
                        encoding=speech.Encoding.FLAC,
                        sample_rate=self.SAMPLE_RATE)
        hints = SpeechContext(phrases=self.HINTS)

        streaming_requests = self._callFUT(
            sample=sample,
            language_code='US-en',
            max_alternatives=2,
            profanity_filter=True,
            speech_context=hints,
            single_utterance=True,
            interim_results=False)

        # Drain the generator, then type-check every message it produced.
        all_requests = list(streaming_requests)
        for request in all_requests:
            self.assertIsInstance(request, StreamingRecognizeRequest)

        # The audio fits in a single chunk, so expect exactly one config
        # request followed by one audio request.
        self.assertEqual(len(all_requests), 2)
        config_request, audio_request = all_requests

        # ``audio_content`` isn't set by _make_streaming_request().
        # The first request can only have `streaming_config` set.
        # The following requests can only have `audio_content` set.
        self.assertEqual(config_request.audio_content, b'')
        self.assertEqual(audio_request.audio_content, self.AUDIO_CONTENT)
        self.assertIsInstance(config_request.streaming_config,
                              StreamingRecognitionConfig)