From 7fb59d416dfa52683d65a4089b9c0d9bd4ec93a2 Mon Sep 17 00:00:00 2001 From: Thomas Schultz Date: Thu, 10 Nov 2016 11:29:26 -0500 Subject: [PATCH] Add stability information to streaming results. Fixes #2702. --- docs/speech-usage.rst | 28 +++++++---- speech/google/cloud/speech/client.py | 71 +++++++++++++++++++++++++++- speech/unit_tests/test_client.py | 35 +++++++++----- system_tests/speech.py | 18 +++---- 4 files changed, 120 insertions(+), 32 deletions(-) diff --git a/docs/speech-usage.rst b/docs/speech-usage.rst index f73475ae57eae..aedd5fa9990ca 100644 --- a/docs/speech-usage.rst +++ b/docs/speech-usage.rst @@ -171,10 +171,10 @@ speech data to possible text alternatives on the fly. ... sample = client.sample(content=stream, ... encoding=speech.Encoding.LINEAR16, ... sample_rate=16000) - ... alternatives = list(client.streaming_recognize(sample)) - >>> print(alternatives[0].transcript) + ... results = list(client.streaming_recognize(sample)) + >>> print(results[0].alternatives[0].transcript) 'hello' - >>> print(alternatives[0].confidence) + >>> print(results[0].alternatives[0].confidence) 0.973458576 @@ -196,10 +196,10 @@ See: `Single Utterance`_ ... sample_rate=16000) ... responses = client.streaming_recognize(sample, ... single_utterance=True) - ... alternatives = list(responses) - >>> print(alternatives[0].transcript) + ... results = list(responses) + >>> print(results[0].alternatives[0].transcript) hello - >>> print(alternatives[0].confidence) + >>> print(results[0].alternatives[0].confidence) 0.96523453546 @@ -214,20 +214,28 @@ If ``interim_results`` is set to :data:`True`, interim results ... sample = client.sample(content=stream, ... encoding=speech.Encoding.LINEAR16, ... sample_rate=16000) - ... for alternatives in client.streaming_recognize(sample, - ... interim_results=True): + ... for results in client.streaming_recognize(sample, + ... interim_results=True): ... print('=' * 20) - ... print(alternatives[0].transcript) - ... 
print(alternatives[0].confidence) + ... print(results[0].alternatives[0].transcript) + ... print(results[0].alternatives[0].confidence) + ... print(results[0].is_final) + ... print(results[0].stability) ==================== 'he' None + False + 0.113245 ==================== 'hell' None + False + 0.132454 ==================== 'hello' 0.973458576 + True + 0.982345 .. _Single Utterance: https://cloud.google.com/speech/reference/rpc/google.cloud.speech.v1beta1#streamingrecognitionconfig diff --git a/speech/google/cloud/speech/client.py b/speech/google/cloud/speech/client.py index 94716086b3a25..85e7521485f30 100644 --- a/speech/google/cloud/speech/client.py +++ b/speech/google/cloud/speech/client.py @@ -242,8 +242,7 @@ def streaming_recognize(self, sample, language_code=None, for response in responses: for result in response.results: if result.is_final or interim_results: - yield [Alternative.from_pb(alternative) - for alternative in result.alternatives] + yield StreamingSpeechResult.from_pb(result) def sync_recognize(self, sample, language_code=None, max_alternatives=None, profanity_filter=None, @@ -299,6 +298,74 @@ def sync_recognize(self, sample, language_code=None, profanity_filter, speech_context) +class StreamingSpeechResult(object): + """Streaming speech result representation. + + :type alternatives: list + :param alternatives: List of protobuf speech alternatives. + + :type is_final: bool + :param is_final: Boolean indicator of results finality. + + :type stability: float + :param stability: 0.0-1.0 stability score for the results returned. + + :rtype: :class:`~google.cloud.speech.client.StreamingSpeechResult` + :returns: Instance of ``StreamingSpeechResult``. 
+ """ + def __init__(self, alternatives, is_final=False, stability=0.0): + self._alternatives = [Alternative.from_pb(alternative) + for alternative in alternatives] + self._is_final = is_final + self._stability = stability + + @classmethod + def from_pb(cls, response): + """Factory: construct instance of ``StreamingSpeechResult``. + + :type response: :class:`~google.cloud.grpc.speech.v1beta1\ + .cloud_speech_pb2.StreamingRecognitionResult` + :param response: Instance of ``StreamingRecognitionResult`` protobuf. + + :rtype: :class:`~google.cloud.speech.client.StreamingSpeechResult` + :returns: Instance of ``StreamingSpeechResult``. + """ + alternatives = response.alternatives + is_final = response.is_final + stability = response.stability + return cls(alternatives=alternatives, is_final=is_final, + stability=stability) + + @property + def alternatives(self): + """List of alternative transcripts. + + :rtype: list of :class:`~google.cloud.speech.alternative.Alternative` + :returns: List of ``Alternative`` instances. + """ + return self._alternatives + + @property + def is_final(self): + """Boolean indicator of result finality. + + :rtype: bool + :returns: True if this result is final and no more processing will + occur. False if more processing will be done and results + may change. + """ + return self._is_final + + @property + def stability(self): + """Result stability indicator. + + :rtype: float + :returns: 0.0-1.0 value indicating the stability of the current result. + """ + return self._stability + + class _JSONSpeechAPI(object): """Speech API for interacting with the JSON/REST version of the API. 
diff --git a/speech/unit_tests/test_client.py b/speech/unit_tests/test_client.py index b108fcaab5792..c371fafe00241 100644 --- a/speech/unit_tests/test_client.py +++ b/speech/unit_tests/test_client.py @@ -28,7 +28,7 @@ def _make_result(alternatives=()): ) -def _make_streaming_result(alternatives=(), is_final=True): +def _make_streaming_result(alternatives=(), is_final=True, stability=1.0): from google.cloud.grpc.speech.v1beta1 import cloud_speech_pb2 return cloud_speech_pb2.StreamingRecognitionResult( @@ -39,6 +39,7 @@ def _make_streaming_result(alternatives=(), is_final=True): ) for alternative in alternatives ], is_final=is_final, + stability=stability, ) @@ -476,6 +477,7 @@ def test_stream_recognize_interim_results(self): from google.cloud.speech import _gax from google.cloud.speech.encoding import Encoding + from google.cloud.speech.client import StreamingSpeechResult stream = BytesIO(b'Some audio data...') credentials = _Credentials() @@ -491,11 +493,13 @@ def test_stream_recognize_interim_results(self): 'confidence': 0.0123456, }] first_response = _make_streaming_response( - _make_streaming_result([], is_final=False)) + _make_streaming_result([], is_final=False, stability=0.122435)) second_response = _make_streaming_response( - _make_streaming_result(alternatives, is_final=False)) + _make_streaming_result(alternatives, is_final=False, + stability=0.1432343)) last_response = _make_streaming_response( - _make_streaming_result(alternatives, is_final=True)) + _make_streaming_result(alternatives, is_final=True, + stability=0.9834534)) responses = [first_response, second_response, last_response] channel_args = [] @@ -521,15 +525,24 @@ def speech_api(channel=None): results = list(client.streaming_recognize(sample, interim_results=True)) - self.assertEqual(results[0], []) - self.assertEqual(results[1][0].transcript, + + self.assertIsInstance(results[0], StreamingSpeechResult) + self.assertEqual(results[0].alternatives, []) + self.assertEqual(results[0].stability, 
0.122435) + self.assertEqual(results[1].stability, 0.1432343) + self.assertEqual(results[1].alternatives[0].transcript, alternatives[0]['transcript']) - self.assertEqual(results[1][0].confidence, + self.assertEqual(results[1].alternatives[0].confidence, alternatives[0]['confidence']) - self.assertEqual(results[1][1].transcript, + self.assertEqual(results[1].alternatives[1].transcript, alternatives[1]['transcript']) - self.assertEqual(results[1][1].confidence, + self.assertEqual(results[1].alternatives[1].confidence, alternatives[1]['confidence']) + self.assertEqual(results[2].stability, 0.9834534) + self.assertEqual(results[2].alternatives[0].transcript, + alternatives[0]['transcript']) + self.assertEqual(results[2].alternatives[0].confidence, + alternatives[0]['confidence']) def test_stream_recognize(self): from io import BytesIO @@ -582,9 +595,9 @@ def speech_api(channel=None): results = list(client.streaming_recognize(sample)) self.assertEqual(len(results), 1) - self.assertEqual(results[0][0].transcript, + self.assertEqual(results[0].alternatives[0].transcript, alternatives[0]['transcript']) - self.assertEqual(results[0][0].confidence, + self.assertEqual(results[0].alternatives[0].confidence, alternatives[0]['confidence']) def test_stream_recognize_no_results(self): diff --git a/system_tests/speech.py b/system_tests/speech.py index 175674dbc96b5..25db94cf98e1f 100644 --- a/system_tests/speech.py +++ b/system_tests/speech.py @@ -127,15 +127,15 @@ def _make_streaming_request(self, file_obj, single_utterance=True, single_utterance=single_utterance, interim_results=interim_results) - def _check_results(self, results, num_results=1): - self.assertEqual(len(results), num_results) - top_result = results[0] + def _check_results(self, alternatives, num_results=1): + self.assertEqual(len(alternatives), num_results) + top_result = alternatives[0] self.assertIsInstance(top_result, Alternative) self.assertEqual(top_result.transcript, 'hello ' + self.ASSERT_TEXT) 
self.assertGreater(top_result.confidence, 0.90) if num_results == 2: - second_alternative = results[1] + second_alternative = alternatives[1] self.assertIsInstance(second_alternative, Alternative) self.assertEqual(second_alternative.transcript, self.ASSERT_TEXT) self.assertIsNone(second_alternative.confidence) @@ -192,7 +192,7 @@ def test_stream_recognize(self): with open(AUDIO_FILE, 'rb') as file_obj: for results in self._make_streaming_request(file_obj): - self._check_results(results) + self._check_results(results.alternatives) def test_stream_recognize_interim_results(self): if not Config.USE_GAX: @@ -207,12 +207,12 @@ def test_stream_recognize_interim_results(self): interim_results=True) responses = list(recognize) for response in responses: - if response[0].transcript: - self.assertIn(response[0].transcript, + if response.alternatives[0].transcript: + self.assertIn(response.alternatives[0].transcript, extras + self.ASSERT_TEXT) self.assertGreater(len(responses), 5) - self._check_results(responses[-1]) + self._check_results(responses[-1].alternatives) def test_stream_recognize_single_utterance(self): if not Config.USE_GAX: @@ -221,4 +221,4 @@ def test_stream_recognize_single_utterance(self): with open(AUDIO_FILE, 'rb') as file_obj: for results in self._make_streaming_request( file_obj, single_utterance=False): - self._check_results(results) + self._check_results(results.alternatives)