diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 00abede8b1d..03196caee29 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -83,6 +83,7 @@ /run/django/**/* @glasnt @GoogleCloudPlatform/aap-dpes @GoogleCloudPlatform/python-samples-reviewers /secretmanager/**/* @GoogleCloudPlatform/aap-dpes @GoogleCloudPlatform/python-samples-reviewers /securitycenter/**/* @GoogleCloudPlatform/dee-infra @GoogleCloudPlatform/python-samples-reviewers +/speech/**/* @GoogleCloudPlatform/dee-data-ai @GoogleCloudPlatform/python-samples-reviewers /storage/**/* @GoogleCloudPlatform/cloud-storage-dpes @GoogleCloudPlatform/python-samples-reviewers /storagetransfer/**/* @GoogleCloudPlatform/cloud-storage-dpes @GoogleCloudPlatform/python-samples-reviewers /texttospeech/**/* @GoogleCloudPlatform/dee-data-ai @GoogleCloudPlatform/python-samples-reviewers diff --git a/.github/blunderbuss.yml b/.github/blunderbuss.yml index 75b1f0c8808..bbb22a359a4 100644 --- a/.github/blunderbuss.yml +++ b/.github/blunderbuss.yml @@ -95,6 +95,7 @@ assign_issues_by: - 'api: texttospeech' - 'api: retail' - 'api: dialogflow' + - 'api: speech' to: - GoogleCloudPlatform/dee-data-ai - labels: diff --git a/speech/README.rst b/speech/README.rst deleted file mode 100644 index 06789313ca2..00000000000 --- a/speech/README.rst +++ /dev/null @@ -1,3 +0,0 @@ -These samples have been moved. - -https://github.com/googleapis/python-speech/tree/main/samples diff --git a/speech/microphone/README.rst b/speech/microphone/README.rst new file mode 100644 index 00000000000..e3185b312d4 --- /dev/null +++ b/speech/microphone/README.rst @@ -0,0 +1,86 @@ +.. This file is automatically generated. Do not edit this file directly. + +Google Cloud Speech API Python Samples +=============================================================================== + +.. 
image:: https://gstatic.com/cloudssh/images/open-btn.png + :target: https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/GoogleCloudPlatform/python-docs-samples&page=editor&open_in_editor=speech/microphone/README.rst + + +This directory contains samples for Google Cloud Speech API. The `Google Cloud Speech API`_ enables easy integration of Google speech recognition technologies into developer applications. Send audio and receive a text transcription from the Cloud Speech API service. + +- See the `migration guide`_ for information about migrating to Python client library v0.27. + +.. _migration guide: https://cloud.google.com/speech/docs/python-client-migration + + + + +.. _Google Cloud Speech API: https://cloud.google.com/speech/docs/ + + + + + +Setup +------------------------------------------------------------------------------- + + +Authentication +++++++++++++++ + +This sample requires you to have authentication setup. Refer to the +`Authentication Getting Started Guide`_ for instructions on setting up +credentials for applications. + +.. _Authentication Getting Started Guide: + https://cloud.google.com/docs/authentication/getting-started + +Install Dependencies +++++++++++++++++++++ + +#. Clone python-docs-samples and change directory to the sample directory you want to use. + + .. code-block:: bash + + $ git clone https://github.com/googleapis/python-speech.git + +#. Install `pip`_ and `virtualenv`_ if you do not already have them. You may want to refer to the `Python Development Environment Setup Guide`_ for Google Cloud Platform for instructions. + + .. _Python Development Environment Setup Guide: + https://cloud.google.com/python/setup + +#. Create a virtualenv. Samples are compatible with Python 2.7 and 3.4+. + + .. code-block:: bash + + $ virtualenv env + $ source env/bin/activate + +#. Install the dependencies needed to run the samples. + + .. code-block:: bash + + $ pip install -r requirements.txt + +.. 
_pip: https://pip.pypa.io/ +.. _virtualenv: https://virtualenv.pypa.io/ + + + +The client library +------------------------------------------------------------------------------- + +This sample uses the `Google Cloud Client Library for Python`_. +You can read the documentation for more details on API usage and use GitHub +to `browse the source`_ and `report issues`_. + +.. _Google Cloud Client Library for Python: + https://googlecloudplatform.github.io/google-cloud-python/ +.. _browse the source: + https://github.com/GoogleCloudPlatform/google-cloud-python +.. _report issues: + https://github.com/GoogleCloudPlatform/google-cloud-python/issues + + +.. _Google Cloud SDK: https://cloud.google.com/sdk/ \ No newline at end of file diff --git a/speech/microphone/README.rst.in b/speech/microphone/README.rst.in new file mode 100644 index 00000000000..11831cca2df --- /dev/null +++ b/speech/microphone/README.rst.in @@ -0,0 +1,24 @@ +# This file is used to generate README.rst + +product: + name: Google Cloud Speech API + short_name: Cloud Speech API + url: https://cloud.google.com/speech/docs/ + description: > + The `Google Cloud Speech API`_ enables easy integration of Google speech + recognition technologies into developer applications. Send audio and receive + a text transcription from the Cloud Speech API service. + + + - See the `migration guide`_ for information about migrating to Python client library v0.27. + + + .. 
_migration guide: https://cloud.google.com/speech/docs/python-client-migration + +setup: +- auth +- install_deps + +cloud_client_library: true + +folder: speech/microphone \ No newline at end of file diff --git a/speech/microphone/requirements-test.txt b/speech/microphone/requirements-test.txt new file mode 100644 index 00000000000..1e6b7c5eaa1 --- /dev/null +++ b/speech/microphone/requirements-test.txt @@ -0,0 +1,2 @@ +pytest==7.2.0 +mock==5.0.1 diff --git a/speech/microphone/requirements.txt b/speech/microphone/requirements.txt new file mode 100644 index 00000000000..bf6af2131d4 --- /dev/null +++ b/speech/microphone/requirements.txt @@ -0,0 +1,4 @@ +google-cloud-speech==2.16.2 +pyaudio==0.2.13 +six==1.16.0 + diff --git a/speech/microphone/resources/quit.raw b/speech/microphone/resources/quit.raw new file mode 100644 index 00000000000..a01dfc45a59 Binary files /dev/null and b/speech/microphone/resources/quit.raw differ diff --git a/speech/microphone/transcribe_streaming_infinite.py b/speech/microphone/transcribe_streaming_infinite.py new file mode 100644 index 00000000000..0092e92a72d --- /dev/null +++ b/speech/microphone/transcribe_streaming_infinite.py @@ -0,0 +1,299 @@ +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Google Cloud Speech API sample application using the streaming API. + +NOTE: This module requires the dependencies `pyaudio` and `termcolor`. 
+To install using pip: + + pip install pyaudio + pip install termcolor + +Example usage: + python transcribe_streaming_infinite.py +""" + +# [START speech_transcribe_infinite_streaming] + +import re +import sys +import time + +from google.cloud import speech +import pyaudio +from six.moves import queue + +# Audio recording parameters +STREAMING_LIMIT = 240000 # 4 minutes +SAMPLE_RATE = 16000 +CHUNK_SIZE = int(SAMPLE_RATE / 10) # 100ms + +RED = "\033[0;31m" +GREEN = "\033[0;32m" +YELLOW = "\033[0;33m" + + +def get_current_time(): + """Return Current Time in MS.""" + + return int(round(time.time() * 1000)) + + +class ResumableMicrophoneStream: + """Opens a recording stream as a generator yielding the audio chunks.""" + + def __init__(self, rate, chunk_size): + self._rate = rate + self.chunk_size = chunk_size + self._num_channels = 1 + self._buff = queue.Queue() + self.closed = True + self.start_time = get_current_time() + self.restart_counter = 0 + self.audio_input = [] + self.last_audio_input = [] + self.result_end_time = 0 + self.is_final_end_time = 0 + self.final_request_end_time = 0 + self.bridging_offset = 0 + self.last_transcript_was_final = False + self.new_stream = True + self._audio_interface = pyaudio.PyAudio() + self._audio_stream = self._audio_interface.open( + format=pyaudio.paInt16, + channels=self._num_channels, + rate=self._rate, + input=True, + frames_per_buffer=self.chunk_size, + # Run the audio stream asynchronously to fill the buffer object. + # This is necessary so that the input device's buffer doesn't + # overflow while the calling thread makes network requests, etc. + stream_callback=self._fill_buffer, + ) + + def __enter__(self): + + self.closed = False + return self + + def __exit__(self, type, value, traceback): + + self._audio_stream.stop_stream() + self._audio_stream.close() + self.closed = True + # Signal the generator to terminate so that the client's + # streaming_recognize method will not block the process termination. 
+ self._buff.put(None) + self._audio_interface.terminate() + + def _fill_buffer(self, in_data, *args, **kwargs): + """Continuously collect data from the audio stream, into the buffer.""" + + self._buff.put(in_data) + return None, pyaudio.paContinue + + def generator(self): + """Stream Audio from microphone to API and to local buffer""" + + while not self.closed: + data = [] + + if self.new_stream and self.last_audio_input: + + chunk_time = STREAMING_LIMIT / len(self.last_audio_input) + + if chunk_time != 0: + + if self.bridging_offset < 0: + self.bridging_offset = 0 + + if self.bridging_offset > self.final_request_end_time: + self.bridging_offset = self.final_request_end_time + + chunks_from_ms = round( + (self.final_request_end_time - self.bridging_offset) + / chunk_time + ) + + self.bridging_offset = round( + (len(self.last_audio_input) - chunks_from_ms) * chunk_time + ) + + for i in range(chunks_from_ms, len(self.last_audio_input)): + data.append(self.last_audio_input[i]) + + self.new_stream = False + + # Use a blocking get() to ensure there's at least one chunk of + # data, and stop iteration if the chunk is None, indicating the + # end of the audio stream. + chunk = self._buff.get() + self.audio_input.append(chunk) + + if chunk is None: + return + data.append(chunk) + # Now consume whatever other data's still buffered. + while True: + try: + chunk = self._buff.get(block=False) + + if chunk is None: + return + data.append(chunk) + self.audio_input.append(chunk) + + except queue.Empty: + break + + yield b"".join(data) + + +def listen_print_loop(responses, stream): + """Iterates through server responses and prints them. + + The responses passed is a generator that will block until a response + is provided by the server. + + Each response may contain multiple results, and each result may contain + multiple alternatives; for details, see https://goo.gl/tjCPAU. Here we + print only the transcription for the top alternative of the top result. 
+ + In this case, responses are provided for interim results as well. If the + response is an interim one, print a line feed at the end of it, to allow + the next result to overwrite it, until the response is a final one. For the + final one, print a newline to preserve the finalized transcription. + """ + + for response in responses: + + if get_current_time() - stream.start_time > STREAMING_LIMIT: + stream.start_time = get_current_time() + break + + if not response.results: + continue + + result = response.results[0] + + if not result.alternatives: + continue + + transcript = result.alternatives[0].transcript + + result_seconds = 0 + result_micros = 0 + + if result.result_end_time.seconds: + result_seconds = result.result_end_time.seconds + + if result.result_end_time.microseconds: + result_micros = result.result_end_time.microseconds + + stream.result_end_time = int((result_seconds * 1000) + (result_micros / 1000)) + + corrected_time = ( + stream.result_end_time + - stream.bridging_offset + + (STREAMING_LIMIT * stream.restart_counter) + ) + # Display interim results, but with a carriage return at the end of the + # line, so subsequent lines will overwrite them. + + if result.is_final: + + sys.stdout.write(GREEN) + sys.stdout.write("\033[K") + sys.stdout.write(str(corrected_time) + ": " + transcript + "\n") + + stream.is_final_end_time = stream.result_end_time + stream.last_transcript_was_final = True + + # Exit recognition if any of the transcribed phrases could be + # one of our keywords. 
+ if re.search(r"\b(exit|quit)\b", transcript, re.I): + sys.stdout.write(YELLOW) + sys.stdout.write("Exiting...\n") + stream.closed = True + break + + else: + sys.stdout.write(RED) + sys.stdout.write("\033[K") + sys.stdout.write(str(corrected_time) + ": " + transcript + "\r") + + stream.last_transcript_was_final = False + + +def main(): + """start bidirectional streaming from microphone input to speech API""" + + client = speech.SpeechClient() + config = speech.RecognitionConfig( + encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16, + sample_rate_hertz=SAMPLE_RATE, + language_code="en-US", + max_alternatives=1, + ) + + streaming_config = speech.StreamingRecognitionConfig( + config=config, interim_results=True + ) + + mic_manager = ResumableMicrophoneStream(SAMPLE_RATE, CHUNK_SIZE) + print(mic_manager.chunk_size) + sys.stdout.write(YELLOW) + sys.stdout.write('\nListening, say "Quit" or "Exit" to stop.\n\n') + sys.stdout.write("End (ms) Transcript Results/Status\n") + sys.stdout.write("=====================================================\n") + + with mic_manager as stream: + + while not stream.closed: + sys.stdout.write(YELLOW) + sys.stdout.write( + "\n" + str(STREAMING_LIMIT * stream.restart_counter) + ": NEW REQUEST\n" + ) + + stream.audio_input = [] + audio_generator = stream.generator() + + requests = ( + speech.StreamingRecognizeRequest(audio_content=content) + for content in audio_generator + ) + + responses = client.streaming_recognize(streaming_config, requests) + + # Now, put the transcription responses to use. 
+ listen_print_loop(responses, stream) + + if stream.result_end_time > 0: + stream.final_request_end_time = stream.is_final_end_time + stream.result_end_time = 0 + stream.last_audio_input = [] + stream.last_audio_input = stream.audio_input + stream.audio_input = [] + stream.restart_counter = stream.restart_counter + 1 + + if not stream.last_transcript_was_final: + sys.stdout.write("\n") + stream.new_stream = True + + +if __name__ == "__main__": + + main() + +# [END speech_transcribe_infinite_streaming] diff --git a/speech/microphone/transcribe_streaming_mic.py b/speech/microphone/transcribe_streaming_mic.py new file mode 100644 index 00000000000..9d01bfd8603 --- /dev/null +++ b/speech/microphone/transcribe_streaming_mic.py @@ -0,0 +1,196 @@ +# Copyright 2017 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Google Cloud Speech API sample application using the streaming API. + +NOTE: This module requires the additional dependency `pyaudio`. 
To install +using pip: + + pip install pyaudio + +Example usage: + python transcribe_streaming_mic.py +""" + +# [START speech_transcribe_streaming_mic] +from __future__ import division + +import re +import sys + +from google.cloud import speech + +import pyaudio +from six.moves import queue + +# Audio recording parameters +RATE = 16000 +CHUNK = int(RATE / 10) # 100ms + + +class MicrophoneStream(object): + """Opens a recording stream as a generator yielding the audio chunks.""" + + def __init__(self, rate, chunk): + self._rate = rate + self._chunk = chunk + + # Create a thread-safe buffer of audio data + self._buff = queue.Queue() + self.closed = True + + def __enter__(self): + self._audio_interface = pyaudio.PyAudio() + self._audio_stream = self._audio_interface.open( + format=pyaudio.paInt16, + # The API currently only supports 1-channel (mono) audio + # https://goo.gl/z757pE + channels=1, + rate=self._rate, + input=True, + frames_per_buffer=self._chunk, + # Run the audio stream asynchronously to fill the buffer object. + # This is necessary so that the input device's buffer doesn't + # overflow while the calling thread makes network requests, etc. + stream_callback=self._fill_buffer, + ) + + self.closed = False + + return self + + def __exit__(self, type, value, traceback): + self._audio_stream.stop_stream() + self._audio_stream.close() + self.closed = True + # Signal the generator to terminate so that the client's + # streaming_recognize method will not block the process termination. + self._buff.put(None) + self._audio_interface.terminate() + + def _fill_buffer(self, in_data, frame_count, time_info, status_flags): + """Continuously collect data from the audio stream, into the buffer.""" + self._buff.put(in_data) + return None, pyaudio.paContinue + + def generator(self): + while not self.closed: + # Use a blocking get() to ensure there's at least one chunk of + # data, and stop iteration if the chunk is None, indicating the + # end of the audio stream. 
+ chunk = self._buff.get() + if chunk is None: + return + data = [chunk] + + # Now consume whatever other data's still buffered. + while True: + try: + chunk = self._buff.get(block=False) + if chunk is None: + return + data.append(chunk) + except queue.Empty: + break + + yield b"".join(data) + + +def listen_print_loop(responses): + """Iterates through server responses and prints them. + + The responses passed is a generator that will block until a response + is provided by the server. + + Each response may contain multiple results, and each result may contain + multiple alternatives; for details, see https://goo.gl/tjCPAU. Here we + print only the transcription for the top alternative of the top result. + + In this case, responses are provided for interim results as well. If the + response is an interim one, print a line feed at the end of it, to allow + the next result to overwrite it, until the response is a final one. For the + final one, print a newline to preserve the finalized transcription. + """ + num_chars_printed = 0 + for response in responses: + if not response.results: + continue + + # The `results` list is consecutive. For streaming, we only care about + # the first result being considered, since once it's `is_final`, it + # moves on to considering the next utterance. + result = response.results[0] + if not result.alternatives: + continue + + # Display the transcription of the top alternative. + transcript = result.alternatives[0].transcript + + # Display interim results, but with a carriage return at the end of the + # line, so subsequent lines will overwrite them. 
+ # + # If the previous result was longer than this one, we need to print + # some extra spaces to overwrite the previous result + overwrite_chars = " " * (num_chars_printed - len(transcript)) + + if not result.is_final: + sys.stdout.write(transcript + overwrite_chars + "\r") + sys.stdout.flush() + + num_chars_printed = len(transcript) + + else: + print(transcript + overwrite_chars) + + # Exit recognition if any of the transcribed phrases could be + # one of our keywords. + if re.search(r"\b(exit|quit)\b", transcript, re.I): + print("Exiting..") + break + + num_chars_printed = 0 + + +def main(): + # See http://g.co/cloud/speech/docs/languages + # for a list of supported languages. + language_code = "en-US" # a BCP-47 language tag + + client = speech.SpeechClient() + config = speech.RecognitionConfig( + encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16, + sample_rate_hertz=RATE, + language_code=language_code, + ) + + streaming_config = speech.StreamingRecognitionConfig( + config=config, interim_results=True + ) + + with MicrophoneStream(RATE, CHUNK) as stream: + audio_generator = stream.generator() + requests = ( + speech.StreamingRecognizeRequest(audio_content=content) + for content in audio_generator + ) + + responses = client.streaming_recognize(streaming_config, requests) + + # Now, put the transcription responses to use. + listen_print_loop(responses) + + +if __name__ == "__main__": + main() +# [END speech_transcribe_streaming_mic] diff --git a/speech/microphone/transcribe_streaming_mic_test.py b/speech/microphone/transcribe_streaming_mic_test.py new file mode 100644 index 00000000000..77b125cce6e --- /dev/null +++ b/speech/microphone/transcribe_streaming_mic_test.py @@ -0,0 +1,73 @@ +# Copyright 2017 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import re +import threading +import time + +import mock + +RESOURCES = os.path.join(os.path.dirname(__file__), "resources") + + +class MockPyAudio(object): + def __init__(self, audio_filename): + self.audio_filename = audio_filename + + def __call__(self, *args): + return self + + def open(self, stream_callback, rate, *args, **kwargs): + self.rate = rate + self.closed = threading.Event() + self.stream_thread = threading.Thread( + target=self.stream_audio, + args=(self.audio_filename, stream_callback, self.closed), + ) + self.stream_thread.start() + return self + + def close(self): + self.closed.set() + + def stop_stream(self): + pass + + def terminate(self): + pass + + def stream_audio(self, audio_filename, callback, closed, num_frames=512): + with open(audio_filename, "rb") as audio_file: + while not closed.is_set(): + # Approximate realtime by sleeping for the appropriate time for + # the requested number of frames + time.sleep(num_frames / float(self.rate)) + # audio is 16-bit samples, whereas python byte is 8-bit + num_bytes = 2 * num_frames + chunk = audio_file.read(num_bytes) or b"\0" * num_bytes + callback(chunk, None, None, None) + + +@mock.patch.dict( + "sys.modules", + pyaudio=mock.MagicMock(PyAudio=MockPyAudio(os.path.join(RESOURCES, "quit.raw"))), +) +def test_main(capsys): + import transcribe_streaming_mic + + transcribe_streaming_mic.main() + out, err = capsys.readouterr() + + assert re.search(r"quit", out, re.DOTALL | re.I) diff --git a/speech/snippets/README.rst b/speech/snippets/README.rst new file mode 100644 index 
00000000000..692fc77a354 --- /dev/null +++ b/speech/snippets/README.rst @@ -0,0 +1,328 @@ +.. This file is automatically generated. Do not edit this file directly. + +Google Cloud Speech API Python Samples +=============================================================================== + +.. image:: https://gstatic.com/cloudssh/images/open-btn.png + :target: https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/GoogleCloudPlatform/python-docs-samples&page=editor&open_in_editor=speech/cloud-client/README.rst + + +This directory contains samples for Google Cloud Speech API. The `Google Cloud Speech API`_ enables easy integration of Google speech recognition technologies into developer applications. Send audio and receive a text transcription from the Cloud Speech API service. + +- See the `migration guide`_ for information about migrating to Python client library v0.27. + +.. _migration guide: https://cloud.google.com/speech/docs/python-client-migration + + + + +.. _Google Cloud Speech API: https://cloud.google.com/speech/docs/ + + + + + +Setup +------------------------------------------------------------------------------- + + +Authentication +++++++++++++++ + +This sample requires you to have authentication setup. Refer to the +`Authentication Getting Started Guide`_ for instructions on setting up +credentials for applications. + +.. _Authentication Getting Started Guide: + https://cloud.google.com/docs/authentication/getting-started + +Install Dependencies +++++++++++++++++++++ + +#. Clone python-docs-samples and change directory to the sample directory you want to use. + + .. code-block:: bash + + $ git clone https://github.com/googleapis/python-speech.git + +#. Install `pip`_ and `virtualenv`_ if you do not already have them. You may want to refer to the `Python Development Environment Setup Guide`_ for Google Cloud Platform for instructions. + + .. _Python Development Environment Setup Guide: + https://cloud.google.com/python/setup + +#. 
Create a virtualenv. Samples are compatible with Python 2.7 and 3.4+. + + .. code-block:: bash + + $ virtualenv env + $ source env/bin/activate + +#. Install the dependencies needed to run the samples. + + .. code-block:: bash + + $ pip install -r requirements.txt + +.. _pip: https://pip.pypa.io/ +.. _virtualenv: https://virtualenv.pypa.io/ + +Samples +------------------------------------------------------------------------------- + +Quickstart ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +.. image:: https://gstatic.com/cloudssh/images/open-btn.png + :target: https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/GoogleCloudPlatform/python-docs-samples&page=editor&open_in_editor=speech/cloud-client/quickstart.py,speech/cloud-client/README.rst + + + + +To run this sample: + +.. code-block:: bash + + $ python quickstart.py + + +Transcribe ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +.. image:: https://gstatic.com/cloudssh/images/open-btn.png + :target: https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/GoogleCloudPlatform/python-docs-samples&page=editor&open_in_editor=speech/cloud-client/transcribe.py,speech/cloud-client/README.rst + + + + +To run this sample: + +.. code-block:: bash + + $ python transcribe.py + + usage: transcribe.py [-h] path + + Google Cloud Speech API sample application using the REST API for batch + processing. + + Example usage: + python transcribe.py resources/audio.raw + python transcribe.py gs://cloud-samples-tests/speech/brooklyn.flac + + positional arguments: + path File or GCS path for audio file to be recognized + + optional arguments: + -h, --help show this help message and exit + + + +Transcribe async ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +.. 
image:: https://gstatic.com/cloudssh/images/open-btn.png + :target: https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/GoogleCloudPlatform/python-docs-samples&page=editor&open_in_editor=speech/cloud-client/transcribe_async.py,speech/cloud-client/README.rst + + + + +To run this sample: + +.. code-block:: bash + + $ python transcribe_async.py + + usage: transcribe_async.py [-h] path + + Google Cloud Speech API sample application using the REST API for async + batch processing. + Example usage: + python transcribe_async.py resources/audio.raw + python transcribe_async.py gs://cloud-samples-tests/speech/vr.flac + + positional arguments: + path File or GCS path for audio file to be recognized + + optional arguments: + -h, --help show this help message and exit + + + +Transcribe with word time offsets ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +.. image:: https://gstatic.com/cloudssh/images/open-btn.png + :target: https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/GoogleCloudPlatform/python-docs-samples&page=editor&open_in_editor=speech/cloud-client/transcribe_word_time_offsets.py,speech/cloud-client/README.rst + + + + +To run this sample: + +.. code-block:: bash + + $ python transcribe_word_time_offsets.py + + usage: transcribe_word_time_offsets.py [-h] path + + Google Cloud Speech API sample that demonstrates word time offsets. + + Example usage: + python transcribe_word_time_offsets.py resources/audio.raw + python transcribe_word_time_offsets.py gs://cloud-samples-tests/speech/vr.flac + + positional arguments: + path File or GCS path for audio file to be recognized + + optional arguments: + -h, --help show this help message and exit + + + +Transcribe Streaming ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +.. 
image:: https://gstatic.com/cloudssh/images/open-btn.png + :target: https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/GoogleCloudPlatform/python-docs-samples&page=editor&open_in_editor=speech/cloud-client/transcribe_streaming.py,speech/cloud-client/README.rst + + + + +To run this sample: + +.. code-block:: bash + + $ python transcribe_streaming.py + + usage: transcribe_streaming.py [-h] stream + + Google Cloud Speech API sample application using the streaming API. + + Example usage: + python transcribe_streaming.py resources/audio.raw + + positional arguments: + stream File to stream to the API + + optional arguments: + -h, --help show this help message and exit + + + +Transcribe Enhanced Models ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +.. image:: https://gstatic.com/cloudssh/images/open-btn.png + :target: https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/GoogleCloudPlatform/python-docs-samples&page=editor&open_in_editor=speech/cloud-client/transcribe_enhanced_model.py,speech/cloud-client/README.rst + + + + +To run this sample: + +.. code-block:: bash + + $ python transcribe_enhanced_model.py + + usage: transcribe_enhanced_model.py [-h] path + + Google Cloud Speech API sample that demonstrates enhanced models + and recognition metadata. + + Example usage: + python transcribe_enhanced_model.py resources/commercial_mono.wav + + positional arguments: + path File to stream to the API + + optional arguments: + -h, --help show this help message and exit + + + +Transcribe Automatic Punctuation ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +.. 
image:: https://gstatic.com/cloudssh/images/open-btn.png + :target: https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/GoogleCloudPlatform/python-docs-samples&page=editor&open_in_editor=speech/cloud-client/transcribe_auto_punctuation.py,speech/cloud-client/README.rst + + + + +To run this sample: + +.. code-block:: bash + + $ python transcribe_auto_punctuation.py + + usage: transcribe_auto_punctuation.py [-h] path + + Google Cloud Speech API sample that demonstrates auto punctuation + and recognition metadata. + + Example usage: + python transcribe_auto_punctuation.py resources/commercial_mono.wav + + positional arguments: + path File to stream to the API + + optional arguments: + -h, --help show this help message and exit + + + +Beta Samples ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +.. image:: https://gstatic.com/cloudssh/images/open-btn.png + :target: https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/GoogleCloudPlatform/python-docs-samples&page=editor&open_in_editor=speech/cloud-client/beta_snippets.py,speech/cloud-client/README.rst + + + + +To run this sample: + +.. code-block:: bash + + $ python beta_snippets.py + + usage: beta_snippets.py [-h] command + + Google Cloud Speech API sample that demonstrates enhanced models + and recognition metadata. + + Example usage: + python beta_snippets.py enhanced-model + python beta_snippets.py metadata + python beta_snippets.py punctuation + python beta_snippets.py diarization + python beta_snippets.py multi-channel + python beta_snippets.py multi-language + python beta_snippets.py word-level-conf + + positional arguments: + command + + optional arguments: + -h, --help show this help message and exit + + + + + +The client library +------------------------------------------------------------------------------- + +This sample uses the `Google Cloud Client Library for Python`_. 
+You can read the documentation for more details on API usage and use GitHub +to `browse the source`_ and `report issues`_. + +.. _Google Cloud Client Library for Python: + https://googlecloudplatform.github.io/google-cloud-python/ +.. _browse the source: + https://github.com/GoogleCloudPlatform/google-cloud-python +.. _report issues: + https://github.com/GoogleCloudPlatform/google-cloud-python/issues + + +.. _Google Cloud SDK: https://cloud.google.com/sdk/ \ No newline at end of file diff --git a/speech/snippets/README.rst.in b/speech/snippets/README.rst.in new file mode 100644 index 00000000000..9447e48548d --- /dev/null +++ b/speech/snippets/README.rst.in @@ -0,0 +1,49 @@ +# This file is used to generate README.rst + +product: + name: Google Cloud Speech API + short_name: Cloud Speech API + url: https://cloud.google.com/speech/docs/ + description: > + The `Google Cloud Speech API`_ enables easy integration of Google speech + recognition technologies into developer applications. Send audio and receive + a text transcription from the Cloud Speech API service. + + + - See the `migration guide`_ for information about migrating to Python client library v0.27. + + + .. 
_migration guide: https://cloud.google.com/speech/docs/python-client-migration + +setup: +- auth +- install_deps + +samples: +- name: Quickstart + file: quickstart.py +- name: Transcribe + file: transcribe.py + show_help: true +- name: Transcribe async + file: transcribe_async.py + show_help: true +- name: Transcribe with word time offsets + file: transcribe_word_time_offsets.py + show_help: true +- name: Transcribe Streaming + file: transcribe_streaming.py + show_help: true +- name: Transcribe Enhanced Models + file: transcribe_enhanced_model.py + show_help: true +- name: Transcribe Automatic Punctuation + file: transcribe_auto_punctuation.py + show_help: true +- name: Beta Samples + file: beta_snippets.py + show_help: true + +cloud_client_library: true + +folder: speech/cloud-client \ No newline at end of file diff --git a/speech/snippets/adaptation_v2_custom_class_reference.py b/speech/snippets/adaptation_v2_custom_class_reference.py new file mode 100644 index 00000000000..a21baa50385 --- /dev/null +++ b/speech/snippets/adaptation_v2_custom_class_reference.py @@ -0,0 +1,96 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
# [START speech_adaptation_v2_custom_class_reference]
from google.cloud.speech_v2 import SpeechClient
from google.cloud.speech_v2.types import cloud_speech


def adaptation_v2_custom_class_reference(
    project_id, recognizer_id, phrase_set_id, custom_class_id, audio_file
):
    """Improve recognition accuracy with a persistent CustomClass that is
    referenced from a persistent PhraseSet.

    Args:
        project_id: Google Cloud project ID that owns the resources.
        recognizer_id: ID for the Recognizer to create.
        phrase_set_id: ID for the persistent PhraseSet to create.
        custom_class_id: ID for the persistent CustomClass to create.
        audio_file: path to a local audio file to transcribe.

    Returns:
        cloud_speech.RecognizeResponse: the recognition results.
    """
    # Instantiates a client
    client = SpeechClient()

    request = cloud_speech.CreateRecognizerRequest(
        parent=f"projects/{project_id}/locations/global",
        recognizer_id=recognizer_id,
        recognizer=cloud_speech.Recognizer(
            language_codes=["en-US"], model="latest_short"
        ),
    )

    # Creates a Recognizer and waits for the long-running operation to finish
    operation = client.create_recognizer(request=request)
    recognizer = operation.result()

    # Reads the audio file as bytes (io.open is just an alias of open)
    with open(audio_file, "rb") as f:
        content = f.read()

    # Create a persistent CustomClass to reference in phrases
    request = cloud_speech.CreateCustomClassRequest(
        parent=f"projects/{project_id}/locations/global",
        custom_class_id=custom_class_id,
        custom_class=cloud_speech.CustomClass(items=[{"value": "fare"}]),
    )

    operation = client.create_custom_class(request=request)
    custom_class = operation.result()

    # Create a persistent PhraseSet whose phrase references the CustomClass
    # by resource name using the ${class_name} syntax
    request = cloud_speech.CreatePhraseSetRequest(
        parent=f"projects/{project_id}/locations/global",
        phrase_set_id=phrase_set_id,
        phrase_set=cloud_speech.PhraseSet(
            phrases=[{"value": f"${{{custom_class.name}}}", "boost": 20}]
        ),
    )

    operation = client.create_phrase_set(request=request)
    phrase_set = operation.result()

    # Add a reference of the PhraseSet into the recognition request
    adaptation = cloud_speech.SpeechAdaptation(
        phrase_sets=[
            cloud_speech.SpeechAdaptation.AdaptationPhraseSet(
                phrase_set=phrase_set.name
            )
        ]
    )
    config = cloud_speech.RecognitionConfig(
        auto_decoding_config={}, adaptation=adaptation
    )

    request = cloud_speech.RecognizeRequest(
        recognizer=recognizer.name, config=config, content=content
    )

    # Transcribes the audio into text
    response = client.recognize(request=request)

    for result in response.results:
        print(f"Transcript: {result.alternatives[0].transcript}")

    return response


# [END speech_adaptation_v2_custom_class_reference]


if __name__ == "__main__":
    # Fix: the sample previously called the function with no arguments,
    # which always raised TypeError. Parse the required arguments instead.
    import argparse

    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("project_id")
    parser.add_argument("recognizer_id")
    parser.add_argument("phrase_set_id")
    parser.add_argument("custom_class_id")
    parser.add_argument("audio_file")
    args = parser.parse_args()
    adaptation_v2_custom_class_reference(
        args.project_id,
        args.recognizer_id,
        args.phrase_set_id,
        args.custom_class_id,
        args.audio_file,
    )
+ +import os +import re +from uuid import uuid4 + +from google.cloud.speech_v2 import SpeechClient +from google.cloud.speech_v2.types import cloud_speech + +import adaptation_v2_custom_class_reference + +RESOURCES = os.path.join(os.path.dirname(__file__), "resources") + + +def delete_recognizer(name): + client = SpeechClient() + request = cloud_speech.DeleteRecognizerRequest(name=name) + client.delete_recognizer(request=request) + + +def delete_phrase_set(name): + client = SpeechClient() + request = cloud_speech.DeletePhraseSetRequest(name=name) + client.delete_phrase_set(request=request) + + +def delete_custom_class(name): + client = SpeechClient() + request = cloud_speech.DeleteCustomClassRequest(name=name) + client.delete_custom_class(request=request) + + +def test_adaptation_v2_custom_class_reference(capsys): + project_id = os.getenv("GOOGLE_CLOUD_PROJECT") + + recognizer_id = "recognizer-" + str(uuid4()) + phrase_set_id = "phrase-set-" + str(uuid4()) + custom_class_id = "custom-class-" + str(uuid4()) + response = ( + adaptation_v2_custom_class_reference.adaptation_v2_custom_class_reference( + project_id, + recognizer_id, + phrase_set_id, + custom_class_id, + os.path.join(RESOURCES, "fair.wav"), + ) + ) + + assert re.search( + r"the word is fare", + response.results[0].alternatives[0].transcript, + re.DOTALL | re.I, + ) + + delete_recognizer( + f"projects/{project_id}/locations/global/recognizers/{recognizer_id}" + ) + + delete_phrase_set( + f"projects/{project_id}/locations/global/phraseSets/{phrase_set_id}" + ) + + delete_custom_class( + f"projects/{project_id}/locations/global/customClasses/{custom_class_id}" + ) diff --git a/speech/snippets/adaptation_v2_inline_custom_class.py b/speech/snippets/adaptation_v2_inline_custom_class.py new file mode 100644 index 00000000000..0574a63149e --- /dev/null +++ b/speech/snippets/adaptation_v2_inline_custom_class.py @@ -0,0 +1,75 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the 
# [START speech_adaptation_v2_inline_custom_class]
from google.cloud.speech_v2 import SpeechClient
from google.cloud.speech_v2.types import cloud_speech


def adaptation_v2_inline_custom_class(project_id, recognizer_id, audio_file):
    """Improve recognition accuracy with an inline CustomClass referenced
    from an inline PhraseSet.

    Args:
        project_id: Google Cloud project ID that owns the recognizer.
        recognizer_id: ID for the Recognizer to create.
        audio_file: path to a local audio file to transcribe.

    Returns:
        cloud_speech.RecognizeResponse: the recognition results.
    """
    # Instantiates a client
    client = SpeechClient()

    request = cloud_speech.CreateRecognizerRequest(
        parent=f"projects/{project_id}/locations/global",
        recognizer_id=recognizer_id,
        recognizer=cloud_speech.Recognizer(
            language_codes=["en-US"], model="latest_short"
        ),
    )

    # Creates a Recognizer and waits for the long-running operation to finish
    operation = client.create_recognizer(request=request)
    recognizer = operation.result()

    # Reads the audio file as bytes (io.open is just an alias of open)
    with open(audio_file, "rb") as f:
        content = f.read()

    # Build an inline phrase set whose phrase references the inline
    # custom class "fare" via the ${class_name} syntax
    phrase_set = cloud_speech.PhraseSet(phrases=[{"value": "${fare}", "boost": 20}])
    custom_class = cloud_speech.CustomClass(name="fare", items=[{"value": "fare"}])
    adaptation = cloud_speech.SpeechAdaptation(
        phrase_sets=[
            cloud_speech.SpeechAdaptation.AdaptationPhraseSet(
                inline_phrase_set=phrase_set
            )
        ],
        custom_classes=[custom_class],
    )
    config = cloud_speech.RecognitionConfig(
        auto_decoding_config={}, adaptation=adaptation
    )

    request = cloud_speech.RecognizeRequest(
        recognizer=recognizer.name, config=config, content=content
    )

    # Transcribes the audio into text
    response = client.recognize(request=request)

    for result in response.results:
        print(f"Transcript: {result.alternatives[0].transcript}")

    return response


# [END speech_adaptation_v2_inline_custom_class]


if __name__ == "__main__":
    # Fix: the sample previously called the function with no arguments,
    # which always raised TypeError. Parse the required arguments instead.
    import argparse

    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("project_id")
    parser.add_argument("recognizer_id")
    parser.add_argument("audio_file")
    args = parser.parse_args()
    adaptation_v2_inline_custom_class(
        args.project_id, args.recognizer_id, args.audio_file
    )
+ +import os +import re +from uuid import uuid4 + +from google.cloud.speech_v2 import SpeechClient +from google.cloud.speech_v2.types import cloud_speech + +import adaptation_v2_inline_custom_class + +RESOURCES = os.path.join(os.path.dirname(__file__), "resources") + + +def delete_recognizer(name): + client = SpeechClient() + request = cloud_speech.DeleteRecognizerRequest(name=name) + client.delete_recognizer(request=request) + + +def test_adaptation_v2_inline_custom_class(capsys): + project_id = os.getenv("GOOGLE_CLOUD_PROJECT") + + recognizer_id = "recognizer-" + str(uuid4()) + response = adaptation_v2_inline_custom_class.adaptation_v2_inline_custom_class( + project_id, recognizer_id, os.path.join(RESOURCES, "fair.wav") + ) + + assert re.search( + r"the word is fare", + response.results[0].alternatives[0].transcript, + re.DOTALL | re.I, + ) + + delete_recognizer( + f"projects/{project_id}/locations/global/recognizers/{recognizer_id}" + ) diff --git a/speech/snippets/adaptation_v2_inline_phrase_set.py b/speech/snippets/adaptation_v2_inline_phrase_set.py new file mode 100644 index 00000000000..12f68c2fb4b --- /dev/null +++ b/speech/snippets/adaptation_v2_inline_phrase_set.py @@ -0,0 +1,73 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +# [START speech_adaptation_v2_inline_phrase_set] +import io + +from google.cloud.speech_v2 import SpeechClient +from google.cloud.speech_v2.types import cloud_speech + + +def adaptation_v2_inline_phrase_set(project_id, recognizer_id, audio_file): + # Instantiates a client + client = SpeechClient() + + request = cloud_speech.CreateRecognizerRequest( + parent=f"projects/{project_id}/locations/global", + recognizer_id=recognizer_id, + recognizer=cloud_speech.Recognizer( + language_codes=["en-US"], model="latest_short" + ), + ) + + # Creates a Recognizer + operation = client.create_recognizer(request=request) + recognizer = operation.result() + + # Reads a file as bytes + with io.open(audio_file, "rb") as f: + content = f.read() + + # Build inline phrase set to produce a more accurate transcript + phrase_set = cloud_speech.PhraseSet(phrases=[{"value": "fare", "boost": 10}]) + adaptation = cloud_speech.SpeechAdaptation( + phrase_sets=[ + cloud_speech.SpeechAdaptation.AdaptationPhraseSet( + inline_phrase_set=phrase_set + ) + ] + ) + config = cloud_speech.RecognitionConfig( + auto_decoding_config={}, adaptation=adaptation + ) + + request = cloud_speech.RecognizeRequest( + recognizer=recognizer.name, config=config, content=content + ) + + # Transcribes the audio into text + response = client.recognize(request=request) + + for result in response.results: + print("Transcript: {}".format(result.alternatives[0].transcript)) + + return response + + +# [END speech_adaptation_v2_inline_phrase_set] + + +if __name__ == "__main__": + adaptation_v2_inline_phrase_set() diff --git a/speech/snippets/adaptation_v2_inline_phrase_set_test.py b/speech/snippets/adaptation_v2_inline_phrase_set_test.py new file mode 100644 index 00000000000..688d2e148d6 --- /dev/null +++ b/speech/snippets/adaptation_v2_inline_phrase_set_test.py @@ -0,0 +1,49 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in 
compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import re +from uuid import uuid4 + +from google.cloud.speech_v2 import SpeechClient +from google.cloud.speech_v2.types import cloud_speech + +import adaptation_v2_inline_phrase_set + +RESOURCES = os.path.join(os.path.dirname(__file__), "resources") + + +def delete_recognizer(name): + client = SpeechClient() + request = cloud_speech.DeleteRecognizerRequest(name=name) + client.delete_recognizer(request=request) + + +def test_adaptation_v2_inline_phrase_set(capsys): + project_id = os.getenv("GOOGLE_CLOUD_PROJECT") + + recognizer_id = "recognizer-" + str(uuid4()) + response = adaptation_v2_inline_phrase_set.adaptation_v2_inline_phrase_set( + project_id, recognizer_id, os.path.join(RESOURCES, "fair.wav") + ) + + assert re.search( + r"the word is fare", + response.results[0].alternatives[0].transcript, + re.DOTALL | re.I, + ) + + delete_recognizer( + f"projects/{project_id}/locations/global/recognizers/{recognizer_id}" + ) diff --git a/speech/snippets/adaptation_v2_phrase_set_reference.py b/speech/snippets/adaptation_v2_phrase_set_reference.py new file mode 100644 index 00000000000..4d9f37948fa --- /dev/null +++ b/speech/snippets/adaptation_v2_phrase_set_reference.py @@ -0,0 +1,84 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
# [START speech_adaptation_v2_phrase_set_reference]
from google.cloud.speech_v2 import SpeechClient
from google.cloud.speech_v2.types import cloud_speech


def adaptation_v2_phrase_set_reference(
    project_id, recognizer_id, phrase_set_id, audio_file
):
    """Improve recognition accuracy with a persistent PhraseSet referenced
    by resource name from the recognition request.

    Args:
        project_id: Google Cloud project ID that owns the resources.
        recognizer_id: ID for the Recognizer to create.
        phrase_set_id: ID for the persistent PhraseSet to create.
        audio_file: path to a local audio file to transcribe.

    Returns:
        cloud_speech.RecognizeResponse: the recognition results.
    """
    # Instantiates a client
    client = SpeechClient()

    request = cloud_speech.CreateRecognizerRequest(
        parent=f"projects/{project_id}/locations/global",
        recognizer_id=recognizer_id,
        recognizer=cloud_speech.Recognizer(
            language_codes=["en-US"], model="latest_short"
        ),
    )

    # Creates a Recognizer and waits for the long-running operation to finish
    operation = client.create_recognizer(request=request)
    recognizer = operation.result()

    # Reads the audio file as bytes (io.open is just an alias of open)
    with open(audio_file, "rb") as f:
        content = f.read()

    # Create a persistent PhraseSet to reference in a recognition request
    request = cloud_speech.CreatePhraseSetRequest(
        parent=f"projects/{project_id}/locations/global",
        phrase_set_id=phrase_set_id,
        phrase_set=cloud_speech.PhraseSet(phrases=[{"value": "fare", "boost": 10}]),
    )

    operation = client.create_phrase_set(request=request)
    phrase_set = operation.result()

    # Add a reference of the PhraseSet into the recognition request
    adaptation = cloud_speech.SpeechAdaptation(
        phrase_sets=[
            cloud_speech.SpeechAdaptation.AdaptationPhraseSet(
                phrase_set=phrase_set.name
            )
        ]
    )
    config = cloud_speech.RecognitionConfig(
        auto_decoding_config={}, adaptation=adaptation
    )

    request = cloud_speech.RecognizeRequest(
        recognizer=recognizer.name, config=config, content=content
    )

    # Transcribes the audio into text
    response = client.recognize(request=request)

    for result in response.results:
        print(f"Transcript: {result.alternatives[0].transcript}")

    return response


# [END speech_adaptation_v2_phrase_set_reference]


if __name__ == "__main__":
    # Fix: the sample previously called the function with no arguments,
    # which always raised TypeError. Parse the required arguments instead.
    import argparse

    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("project_id")
    parser.add_argument("recognizer_id")
    parser.add_argument("phrase_set_id")
    parser.add_argument("audio_file")
    args = parser.parse_args()
    adaptation_v2_phrase_set_reference(
        args.project_id, args.recognizer_id, args.phrase_set_id, args.audio_file
    )
+ +import os +import re +from uuid import uuid4 + +from google.cloud.speech_v2 import SpeechClient +from google.cloud.speech_v2.types import cloud_speech + +import adaptation_v2_phrase_set_reference + +RESOURCES = os.path.join(os.path.dirname(__file__), "resources") + + +def delete_recognizer(name): + client = SpeechClient() + request = cloud_speech.DeleteRecognizerRequest(name=name) + client.delete_recognizer(request=request) + + +def delete_phrase_set(name): + client = SpeechClient() + request = cloud_speech.DeletePhraseSetRequest(name=name) + client.delete_phrase_set(request=request) + + +def test_adaptation_v2_phrase_set_reference(capsys): + project_id = os.getenv("GOOGLE_CLOUD_PROJECT") + + recognizer_id = "recognizer-" + str(uuid4()) + phrase_set_id = "phrase-set-" + str(uuid4()) + response = adaptation_v2_phrase_set_reference.adaptation_v2_phrase_set_reference( + project_id, recognizer_id, phrase_set_id, os.path.join(RESOURCES, "fair.wav") + ) + + assert re.search( + r"the word is fare", + response.results[0].alternatives[0].transcript, + re.DOTALL | re.I, + ) + + delete_recognizer( + f"projects/{project_id}/locations/global/recognizers/{recognizer_id}" + ) + + delete_phrase_set( + f"projects/{project_id}/locations/global/phraseSets/{phrase_set_id}" + ) diff --git a/speech/snippets/beta_snippets.py b/speech/snippets/beta_snippets.py new file mode 100644 index 00000000000..5ebb06d9ac4 --- /dev/null +++ b/speech/snippets/beta_snippets.py @@ -0,0 +1,355 @@ +# Copyright 2018 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
"""Google Cloud Speech API sample that demonstrates enhanced models
and recognition metadata.

Example usage:
    python beta_snippets.py enhanced-model
    python beta_snippets.py metadata
    python beta_snippets.py punctuation
    python beta_snippets.py diarization
    python beta_snippets.py multi-channel
    python beta_snippets.py multi-language
    python beta_snippets.py word-level-conf
    python beta_snippets.py spoken-punctuation-emojis
"""

import argparse
import io


def transcribe_file_with_enhanced_model():
    """Transcribe the given audio file using an enhanced model."""
    # [START speech_transcribe_enhanced_model_beta]
    from google.cloud import speech_v1p1beta1 as speech

    client = speech.SpeechClient()

    speech_file = "resources/commercial_mono.wav"

    with open(speech_file, "rb") as audio_file:
        content = audio_file.read()

    audio = speech.RecognitionAudio(content=content)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=8000,
        language_code="en-US",
        use_enhanced=True,
        # A model must be specified to use enhanced model.
        model="phone_call",
    )

    response = client.recognize(config=config, audio=audio)

    for idx, result in enumerate(response.results):
        top_alternative = result.alternatives[0]
        print("-" * 20)
        print(f"First alternative of result {idx}")
        print(f"Transcript: {top_alternative.transcript}")
    # [END speech_transcribe_enhanced_model_beta]


def transcribe_file_with_metadata():
    """Send a request that includes recognition metadata."""
    # [START speech_transcribe_recognition_metadata_beta]
    from google.cloud import speech_v1p1beta1 as speech

    client = speech.SpeechClient()

    speech_file = "resources/commercial_mono.wav"

    with open(speech_file, "rb") as audio_file:
        content = audio_file.read()

    # Here we construct a recognition metadata object.
    # Most metadata fields are specified as enums that can be found
    # in speech.enums.RecognitionMetadata
    metadata = speech.RecognitionMetadata()
    metadata.interaction_type = speech.RecognitionMetadata.InteractionType.DISCUSSION
    metadata.microphone_distance = (
        speech.RecognitionMetadata.MicrophoneDistance.NEARFIELD
    )
    metadata.recording_device_type = (
        speech.RecognitionMetadata.RecordingDeviceType.SMARTPHONE
    )

    # Some metadata fields are free form strings
    metadata.recording_device_name = "Pixel 2 XL"
    # And some are integers, for instance the 6 digit NAICS code
    # https://www.naics.com/search/
    metadata.industry_naics_code_of_audio = 519190

    audio = speech.RecognitionAudio(content=content)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=8000,
        language_code="en-US",
        # Add this in the request to send metadata.
        metadata=metadata,
    )

    response = client.recognize(config=config, audio=audio)

    for idx, result in enumerate(response.results):
        top_alternative = result.alternatives[0]
        print("-" * 20)
        print(f"First alternative of result {idx}")
        print(f"Transcript: {top_alternative.transcript}")
    # [END speech_transcribe_recognition_metadata_beta]


def transcribe_file_with_auto_punctuation():
    """Transcribe the given audio file with auto punctuation enabled."""
    # [START speech_transcribe_auto_punctuation_beta]
    from google.cloud import speech_v1p1beta1 as speech

    client = speech.SpeechClient()

    speech_file = "resources/commercial_mono.wav"

    with open(speech_file, "rb") as audio_file:
        content = audio_file.read()

    audio = speech.RecognitionAudio(content=content)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=8000,
        language_code="en-US",
        # Enable automatic punctuation
        enable_automatic_punctuation=True,
    )

    response = client.recognize(config=config, audio=audio)

    for idx, result in enumerate(response.results):
        top_alternative = result.alternatives[0]
        print("-" * 20)
        print(f"First alternative of result {idx}")
        print(f"Transcript: {top_alternative.transcript}")
    # [END speech_transcribe_auto_punctuation_beta]


def transcribe_file_with_diarization():
    """Transcribe the given audio file synchronously with diarization."""
    # [START speech_transcribe_diarization_beta]
    from google.cloud import speech_v1p1beta1 as speech

    client = speech.SpeechClient()

    speech_file = "resources/commercial_mono.wav"

    with open(speech_file, "rb") as audio_file:
        content = audio_file.read()

    audio = speech.RecognitionAudio(content=content)

    diarization_config = speech.SpeakerDiarizationConfig(
        enable_speaker_diarization=True,
        min_speaker_count=2,
        max_speaker_count=10,
    )

    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=8000,
        language_code="en-US",
        diarization_config=diarization_config,
    )

    print("Waiting for operation to complete...")
    response = client.recognize(config=config, audio=audio)

    # The transcript within each result is separate and sequential per result.
    # However, the words list within an alternative includes all the words
    # from all the results thus far. Thus, to get all the words with speaker
    # tags, you only have to take the words list from the last result:
    words_info = response.results[-1].alternatives[0].words

    # Printing out the output:
    for word_info in words_info:
        print(f"word: '{word_info.word}', speaker_tag: {word_info.speaker_tag}")
    # [END speech_transcribe_diarization_beta]


def transcribe_file_with_multichannel():
    """Transcribe the given audio file synchronously with
    multi channel."""
    # [START speech_transcribe_multichannel_beta]
    from google.cloud import speech_v1p1beta1 as speech

    client = speech.SpeechClient()

    speech_file = "resources/Google_Gnome.wav"

    with open(speech_file, "rb") as audio_file:
        content = audio_file.read()

    audio = speech.RecognitionAudio(content=content)

    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code="en-US",
        audio_channel_count=1,
        enable_separate_recognition_per_channel=True,
    )

    response = client.recognize(config=config, audio=audio)

    for idx, result in enumerate(response.results):
        top_alternative = result.alternatives[0]
        print("-" * 20)
        print(f"First alternative of result {idx}")
        print(f"Transcript: {top_alternative.transcript}")
        print(f"Channel Tag: {result.channel_tag}")
    # [END speech_transcribe_multichannel_beta]


def transcribe_file_with_multilanguage():
    """Transcribe the given audio file synchronously with
    multi language."""
    # [START speech_transcribe_multilanguage_beta]
    from google.cloud import speech_v1p1beta1 as speech

    client = speech.SpeechClient()

    speech_file = "resources/multi.wav"
    first_lang = "en-US"
    second_lang = "es"

    with open(speech_file, "rb") as audio_file:
        content = audio_file.read()

    audio = speech.RecognitionAudio(content=content)

    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=44100,
        audio_channel_count=2,
        language_code=first_lang,
        alternative_language_codes=[second_lang],
    )

    print("Waiting for operation to complete...")
    response = client.recognize(config=config, audio=audio)

    for idx, result in enumerate(response.results):
        top_alternative = result.alternatives[0]
        print("-" * 20)
        print(f"First alternative of result {idx}: {top_alternative}")
        print(f"Transcript: {top_alternative.transcript}")
    # [END speech_transcribe_multilanguage_beta]


def transcribe_file_with_word_level_confidence():
    """Transcribe the given audio file synchronously with
    word level confidence."""
    # [START speech_transcribe_word_level_confidence_beta]
    from google.cloud import speech_v1p1beta1 as speech

    client = speech.SpeechClient()

    speech_file = "resources/Google_Gnome.wav"

    with open(speech_file, "rb") as audio_file:
        content = audio_file.read()

    audio = speech.RecognitionAudio(content=content)

    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code="en-US",
        enable_word_confidence=True,
    )

    response = client.recognize(config=config, audio=audio)

    for idx, result in enumerate(response.results):
        top_alternative = result.alternatives[0]
        first_word = top_alternative.words[0]
        print("-" * 20)
        print(f"First alternative of result {idx}")
        print(f"Transcript: {top_alternative.transcript}")
        print(f"First Word and Confidence: ({first_word.word}, {first_word.confidence})")
    # [END speech_transcribe_word_level_confidence_beta]


def transcribe_file_with_spoken_punctuation_end_emojis():
    """Transcribe the given audio file with spoken punctuation and emojis enabled."""
    # [START speech_transcribe_spoken_punctuation_emojis_beta]
    from google.cloud import speech_v1p1beta1 as speech
    from google.protobuf import wrappers_pb2

    client = speech.SpeechClient()

    speech_file = "resources/commercial_mono.wav"

    with open(speech_file, "rb") as audio_file:
        content = audio_file.read()

    audio = speech.RecognitionAudio(content=content)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=8000,
        language_code="en-US",
        # Enable spoken punctuation
        enable_spoken_punctuation=wrappers_pb2.BoolValue(value=True),
        # Enable spoken emojis
        enable_spoken_emojis=wrappers_pb2.BoolValue(value=True),
    )

    response = client.recognize(config=config, audio=audio)

    for idx, result in enumerate(response.results):
        top_alternative = result.alternatives[0]
        print("-" * 20)
        print(f"First alternative of result {idx}")
        print(f"Transcript: {top_alternative.transcript}")
    # [END speech_transcribe_spoken_punctuation_emojis_beta]


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
    )
    parser.add_argument("command")

    args = parser.parse_args()

    # Dispatch table replaces the original if/elif chain; an unrecognized
    # command is a no-op, exactly as before.
    handlers = {
        "enhanced-model": transcribe_file_with_enhanced_model,
        "metadata": transcribe_file_with_metadata,
        "punctuation": transcribe_file_with_auto_punctuation,
        "diarization": transcribe_file_with_diarization,
        "multi-channel": transcribe_file_with_multichannel,
        "multi-language": transcribe_file_with_multilanguage,
        "word-level-conf": transcribe_file_with_word_level_confidence,
        "spoken-punctuation-emojis": transcribe_file_with_spoken_punctuation_end_emojis,
    }
    handler = handlers.get(args.command)
    if handler is not None:
        handler()
"word-level-conf": + transcribe_file_with_word_level_confidence() + elif args.command == "spoken-punctuation-emojis": + transcribe_file_with_spoken_punctuation_end_emojis() diff --git a/speech/snippets/beta_snippets_test.py b/speech/snippets/beta_snippets_test.py new file mode 100644 index 00000000000..0f6c50d8f1b --- /dev/null +++ b/speech/snippets/beta_snippets_test.py @@ -0,0 +1,86 @@ +# Copyright 2018 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +from beta_snippets import ( + transcribe_file_with_auto_punctuation, + transcribe_file_with_diarization, + transcribe_file_with_enhanced_model, + transcribe_file_with_metadata, + transcribe_file_with_multichannel, + transcribe_file_with_multilanguage, + transcribe_file_with_spoken_punctuation_end_emojis, + transcribe_file_with_word_level_confidence, +) + +RESOURCES = os.path.join(os.path.dirname(__file__), "resources") + + +def test_transcribe_file_with_enhanced_model(capsys): + transcribe_file_with_enhanced_model() + out, _ = capsys.readouterr() + + assert "Chrome" in out + + +def test_transcribe_file_with_metadata(capsys): + transcribe_file_with_metadata() + out, _ = capsys.readouterr() + + assert "Chrome" in out + + +def test_transcribe_file_with_auto_punctuation(capsys): + transcribe_file_with_auto_punctuation() + out, _ = capsys.readouterr() + + assert "First alternative of result " in out + + +def test_transcribe_diarization(capsys): + transcribe_file_with_diarization() + out, err = 
capsys.readouterr() + + assert "word:" in out + assert "speaker_tag:" in out + + +def test_transcribe_multichannel_file(capsys): + transcribe_file_with_multichannel() + out, err = capsys.readouterr() + + assert "OK Google stream stranger things from Netflix to my TV" in out + + +def test_transcribe_multilanguage_file(capsys): + transcribe_file_with_multilanguage() + out, err = capsys.readouterr() + + assert "First alternative of result" in out + assert "Transcript" in out + + +def test_transcribe_word_level_confidence(capsys): + transcribe_file_with_word_level_confidence() + out, err = capsys.readouterr() + + assert "OK Google stream stranger things from Netflix to my TV" in out + + +def test_transcribe_file_with_spoken_punctuation_end_emojis(capsys): + transcribe_file_with_spoken_punctuation_end_emojis() + out, err = capsys.readouterr() + + assert "First alternative of result " in out diff --git a/speech/snippets/create_recognizer.py b/speech/snippets/create_recognizer.py new file mode 100644 index 00000000000..95fa6caa078 --- /dev/null +++ b/speech/snippets/create_recognizer.py @@ -0,0 +1,44 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
# [START speech_create_recognizer]
from google.cloud.speech_v2 import SpeechClient
from google.cloud.speech_v2.types import cloud_speech


def create_recognizer(project_id, recognizer_id):
    """Create a Speech-to-Text v2 recognizer and wait for it to be ready.

    Args:
        project_id: Google Cloud project ID that will own the recognizer.
        recognizer_id: Resource ID to assign to the new recognizer.

    Returns:
        The created ``cloud_speech.Recognizer`` resource.
    """
    # Instantiates a client
    client = SpeechClient()

    request = cloud_speech.CreateRecognizerRequest(
        parent=f"projects/{project_id}/locations/global",
        recognizer_id=recognizer_id,
        recognizer=cloud_speech.Recognizer(
            language_codes=["en-US"], model="latest_long"
        ),
    )

    # create_recognizer returns a long-running operation; result() blocks
    # until the recognizer has actually been created server-side.
    operation = client.create_recognizer(request=request)
    recognizer = operation.result()

    print("Created Recognizer:", recognizer.name)
    return recognizer


# [END speech_create_recognizer]


if __name__ == "__main__":
    # BUG FIX: the original called create_recognizer() with no arguments,
    # which always raises TypeError (two required parameters). Take the
    # required values from the command line instead.
    import argparse

    parser = argparse.ArgumentParser(description="Create a Speech v2 recognizer")
    parser.add_argument("project_id", help="Google Cloud project ID")
    parser.add_argument("recognizer_id", help="ID for the new recognizer")
    args = parser.parse_args()
    create_recognizer(args.project_id, args.recognizer_id)


# ---- speech/snippets/create_recognizer_test.py ----
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from uuid import uuid4

from google.cloud.speech_v2 import SpeechClient
from google.cloud.speech_v2.types import cloud_speech

import create_recognizer


def delete_recognizer(name):
    """Delete the recognizer with the given fully-qualified resource name."""
    client = SpeechClient()
    request = cloud_speech.DeleteRecognizerRequest(name=name)
    client.delete_recognizer(request=request)


def test_create_recognizer(capsys):
    project_id = os.getenv("GOOGLE_CLOUD_PROJECT")

    recognizer = create_recognizer.create_recognizer(
        project_id, "recognizer-" + str(uuid4())
    )
    # Clean up the resource created by the sample.
    delete_recognizer(recognizer.name)


# ---- speech/snippets/multi_region.py ----
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


def sync_recognize_with_multi_region_gcs():
    """Transcribe a Cloud Storage audio file via the EU regional endpoint."""
    # [START speech_multi_region]

    # Imports the Google Cloud client library
    from google.cloud import speech
    from google.api_core import client_options

    # Instantiates a client

    # [START speech_multi_region_client]

    # Pass an additional argument, ClientOptions, to specify the new endpoint.
    client_options = client_options.ClientOptions(
        api_endpoint="eu-speech.googleapis.com"
    )

    client = speech.SpeechClient(client_options=client_options)
    # [END speech_multi_region_client]

    # The name of the audio file to transcribe
    gcs_uri = "gs://cloud-samples-data/speech/brooklyn_bridge.raw"

    audio = speech.RecognitionAudio(uri=gcs_uri)

    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code="en-US",
    )

    # Detects speech in the audio file
    response = client.recognize(config=config, audio=audio)

    for result in response.results:
        print("Transcript: {}".format(result.alternatives[0].transcript))
    # [END speech_multi_region]


if __name__ == "__main__":
    # BUG FIX: the original invoked the sample unconditionally at module
    # level, so merely importing this module (which multi_region_test.py
    # does) performed a billed network call at import time. Guard it.
    sync_recognize_with_multi_region_gcs()
import multi_region


def test_multi_region(capsys):
    multi_region.sync_recognize_with_multi_region_gcs()
    out, _ = capsys.readouterr()
    assert "Transcript: how old is the Brooklyn Bridge" in out


# ---- speech/snippets/profanity_filter.py ----
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Google Cloud Speech API sample application using the REST API for batch
processing.

Example usage:
    python profanity_filter.py gs://cloud-samples-tests/speech/brooklyn.flac
"""
# DOC FIX: the usage line referenced "transcribe.py", a different sample.


# [START speech_recognize_with_profanity_filter_gcs]
def sync_recognize_with_profanity_filter_gcs(gcs_uri):
    """Transcribe audio from Cloud Storage with the profanity filter enabled.

    Args:
        gcs_uri: URI of a FLAC audio file in Cloud Storage,
            e.g. gs://[BUCKET]/[FILE].
    """
    from google.cloud import speech

    client = speech.SpeechClient()

    audio = {"uri": gcs_uri}

    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.FLAC,
        sample_rate_hertz=16000,
        language_code="en-US",
        profanity_filter=True,
    )

    response = client.recognize(config=config, audio=audio)

    # IDIOM FIX: the index produced by enumerate() was never used.
    for result in response.results:
        alternative = result.alternatives[0]
        print("Transcript: {}".format(alternative.transcript))


# [END speech_recognize_with_profanity_filter_gcs]

if __name__ == "__main__":
    # BUG FIX: the original ran the sample unconditionally at module level,
    # so importing this module (which profanity_filter_test.py does)
    # triggered a billed network call at import time. Guard it.
    sync_recognize_with_profanity_filter_gcs(
        "gs://cloud-samples-tests/speech/brooklyn.flac"
    )
import re

import profanity_filter


def test_profanity_filter(capsys):
    profanity_filter.sync_recognize_with_profanity_filter_gcs(
        "gs://cloud-samples-tests/speech/brooklyn.flac"
    )
    # CONSISTENCY FIX: the captured stderr value was bound to an unused
    # name `err`; the sibling tests use `out, _` — match that style.
    out, _ = capsys.readouterr()
    assert re.search(r"how old is the Brooklyn Bridge", out, re.DOTALL | re.I)


# ---- speech/snippets/quickstart.py ----
# Copyright 2016 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
def run_quickstart():
    """Transcribe a sample Cloud Storage audio file and print the transcript."""
    # [START speech_quickstart]

    # Imports the Google Cloud client library
    # [START speech_python_migration_imports]
    from google.cloud import speech

    # [END speech_python_migration_imports]

    # Instantiates a client
    # [START speech_python_migration_client]
    client = speech.SpeechClient()
    # [END speech_python_migration_client]

    # The name of the audio file to transcribe
    gcs_uri = "gs://cloud-samples-data/speech/brooklyn_bridge.raw"

    recognition_audio = speech.RecognitionAudio(uri=gcs_uri)

    recognition_config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code="en-US",
    )

    # Detects speech in the audio file
    recognize_response = client.recognize(
        config=recognition_config, audio=recognition_audio
    )

    for recognition_result in recognize_response.results:
        top_alternative = recognition_result.alternatives[0]
        print("Transcript: {}".format(top_alternative.transcript))
    # [END speech_quickstart]


if __name__ == "__main__":
    run_quickstart()


# ---- speech/snippets/quickstart_test.py ----
# Copyright 2016 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import quickstart


def test_quickstart(capsys):
    quickstart.run_quickstart()
    out, _ = capsys.readouterr()
    assert "Transcript: how old is the Brooklyn Bridge" in out


# ---- speech/snippets/quickstart_v2.py ----
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


# [START speech_quickstart_v2]
import io

from google.cloud.speech_v2 import SpeechClient
from google.cloud.speech_v2.types import cloud_speech


def quickstart_v2(project_id, recognizer_id, audio_file):
    """Create a recognizer and use it to transcribe a local audio file.

    Args:
        project_id: Google Cloud project ID.
        recognizer_id: Resource ID for the recognizer to create.
        audio_file: Path to a local audio file to transcribe.

    Returns:
        The ``cloud_speech.RecognizeResponse`` from the API.
    """
    # Instantiates a client
    client = SpeechClient()

    request = cloud_speech.CreateRecognizerRequest(
        parent=f"projects/{project_id}/locations/global",
        recognizer_id=recognizer_id,
        recognizer=cloud_speech.Recognizer(
            language_codes=["en-US"], model="latest_long"
        ),
    )

    # Creates a Recognizer (long-running operation; result() blocks)
    operation = client.create_recognizer(request=request)
    recognizer = operation.result()

    # Reads a file as bytes
    with io.open(audio_file, "rb") as f:
        content = f.read()

    # auto_decoding_config lets the service detect the audio encoding.
    config = cloud_speech.RecognitionConfig(auto_decoding_config={})

    request = cloud_speech.RecognizeRequest(
        recognizer=recognizer.name, config=config, content=content
    )

    # Transcribes the audio into text
    response = client.recognize(request=request)

    for result in response.results:
        print("Transcript: {}".format(result.alternatives[0].transcript))

    return response


# [END speech_quickstart_v2]


if __name__ == "__main__":
    # BUG FIX: the original called quickstart_v2() with no arguments, which
    # always raises TypeError (three required parameters). Read them from
    # the command line instead.
    import argparse

    parser = argparse.ArgumentParser(description="Speech-to-Text v2 quickstart")
    parser.add_argument("project_id", help="Google Cloud project ID")
    parser.add_argument("recognizer_id", help="ID for the new recognizer")
    parser.add_argument("audio_file", help="Path to a local audio file")
    args = parser.parse_args()
    quickstart_v2(args.project_id, args.recognizer_id, args.audio_file)


# ---- speech/snippets/quickstart_v2_test.py ----
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import re
from uuid import uuid4

from google.cloud.speech_v2 import SpeechClient
from google.cloud.speech_v2.types import cloud_speech

import quickstart_v2

RESOURCES = os.path.join(os.path.dirname(__file__), "resources")


def delete_recognizer(name):
    client = SpeechClient()
    request = cloud_speech.DeleteRecognizerRequest(name=name)
    client.delete_recognizer(request=request)


def test_quickstart_v2(capsys):
    project_id = os.getenv("GOOGLE_CLOUD_PROJECT")

    recognizer_id = "recognizer-" + str(uuid4())
    response = quickstart_v2.quickstart_v2(
        project_id, recognizer_id, os.path.join(RESOURCES, "audio.wav")
    )

    assert re.search(
        r"how old is the Brooklyn Bridge",
        response.results[0].alternatives[0].transcript,
        re.DOTALL | re.I,
    )

    delete_recognizer(
        f"projects/{project_id}/locations/global/recognizers/{recognizer_id}"
    )
+++ b/speech/snippets/requirements-test.txt @@ -0,0 +1 @@ +pytest==7.2.0 diff --git a/speech/snippets/requirements.txt b/speech/snippets/requirements.txt new file mode 100644 index 00000000000..ea0e8807053 --- /dev/null +++ b/speech/snippets/requirements.txt @@ -0,0 +1,2 @@ +google-cloud-speech==2.16.2 +google-cloud-storage==2.7.0 diff --git a/speech/snippets/resources/Google_Gnome.wav b/speech/snippets/resources/Google_Gnome.wav new file mode 100644 index 00000000000..2f497b7fbe7 Binary files /dev/null and b/speech/snippets/resources/Google_Gnome.wav differ diff --git a/speech/snippets/resources/audio.raw b/speech/snippets/resources/audio.raw new file mode 100644 index 00000000000..5ebf79d3c9c Binary files /dev/null and b/speech/snippets/resources/audio.raw differ diff --git a/speech/snippets/resources/audio.wav b/speech/snippets/resources/audio.wav new file mode 100644 index 00000000000..140a3022e96 Binary files /dev/null and b/speech/snippets/resources/audio.wav differ diff --git a/speech/snippets/resources/audio2.raw b/speech/snippets/resources/audio2.raw new file mode 100644 index 00000000000..35413b78817 Binary files /dev/null and b/speech/snippets/resources/audio2.raw differ diff --git a/speech/snippets/resources/audio_silence_padding.wav b/speech/snippets/resources/audio_silence_padding.wav new file mode 100644 index 00000000000..db883c38634 Binary files /dev/null and b/speech/snippets/resources/audio_silence_padding.wav differ diff --git a/speech/snippets/resources/commercial_mono.wav b/speech/snippets/resources/commercial_mono.wav new file mode 100644 index 00000000000..e6b9ed434f9 Binary files /dev/null and b/speech/snippets/resources/commercial_mono.wav differ diff --git a/speech/snippets/resources/fair.wav b/speech/snippets/resources/fair.wav new file mode 100644 index 00000000000..3eb1144f5cb Binary files /dev/null and b/speech/snippets/resources/fair.wav differ diff --git a/speech/snippets/resources/multi.wav b/speech/snippets/resources/multi.wav new 
# ---- speech/snippets/speech_adaptation_beta.py ----
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# DO NOT EDIT! This is a generated sample ("Request", "speech_adaptation_beta")

# To install the latest published package dependency, execute the following:
#   pip install google-cloud-speech

# sample-metadata
#   title: Speech Adaptation (Cloud Storage)
#   description: Transcribe a short audio file with speech adaptation.
#   usage: python3 samples/v1p1beta1/speech_adaptation_beta.py [--storage_uri "gs://cloud-samples-data/speech/brooklyn_bridge.mp3"] [--phrase "Brooklyn Bridge"]

# [START speech_adaptation_beta]
from google.cloud import speech_v1p1beta1 as speech


def sample_recognize(storage_uri, phrase):
    """
    Transcribe a short audio file with speech adaptation.

    Args:
      storage_uri URI for audio file in Cloud Storage, e.g. gs://[BUCKET]/[FILE]
      phrase Phrase "hints" help recognize the specified phrases from your audio.
    """

    client = speech.SpeechClient()

    # storage_uri = 'gs://cloud-samples-data/speech/brooklyn_bridge.mp3'
    # phrase = 'Brooklyn Bridge'

    # Hint Boost. This value raises the likelihood that the given phrase is
    # recognized over similar-sounding alternatives. The higher the boost,
    # the higher the chance of a false positive as well. Most use cases are
    # best served with values between 0 and 20; a binary search over that
    # range can help locate the optimal value.
    speech_contexts = [{"phrases": [phrase], "boost": 20.0}]

    # Sample rate (Hz) of the audio, its language, and its encoding. The
    # encoding is set explicitly here; it is optional for FLAC and WAV.
    config = {
        "speech_contexts": speech_contexts,
        "sample_rate_hertz": 44100,
        "language_code": "en-US",
        "encoding": speech.RecognitionConfig.AudioEncoding.MP3,
    }
    audio = {"uri": storage_uri}

    response = client.recognize(config=config, audio=audio)

    for result in response.results:
        # First alternative is the most probable result
        alternative = result.alternatives[0]
        print("Transcript: {}".format(alternative.transcript))

    # [END speech_adaptation_beta]
    return response


def main():
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--storage_uri",
        type=str,
        default="gs://cloud-samples-data/speech/brooklyn_bridge.mp3",
    )
    parser.add_argument("--phrase", type=str, default="Brooklyn Bridge")
    args = parser.parse_args()

    sample_recognize(args.storage_uri, args.phrase)


if __name__ == "__main__":
    main()


# ---- speech/snippets/speech_adaptation_beta_test.py ----
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import speech_adaptation_beta


def test_adaptation_beta():
    response = speech_adaptation_beta.sample_recognize(
        "gs://cloud-samples-data/speech/brooklyn_bridge.mp3", "Brooklyn Bridge"
    )
    assert "brooklyn" in response.results[0].alternatives[0].transcript.lower()


# ---- speech/snippets/speech_model_adaptation_beta.py ----
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


# [START speech_transcribe_with_model_adaptation]

from google.cloud import speech_v1p1beta1 as speech


def transcribe_with_model_adaptation(
    project_id, location, storage_uri, custom_class_id, phrase_set_id
):
    """
    Create`PhraseSet` and `CustomClasses` to create custom lists of similar
    items that are likely to occur in your input data.
    """

    # Create the adaptation client
    adaptation_client = speech.AdaptationClient()

    # The parent resource where the custom class and phrase set will be created.
    parent = f"projects/{project_id}/locations/{location}"

    # Create the custom class resource
    custom_class_request = {
        "parent": parent,
        "custom_class_id": custom_class_id,
        "custom_class": {
            "items": [
                {"value": "sushido"},
                {"value": "altura"},
                {"value": "taneda"},
            ]
        },
    }
    adaptation_client.create_custom_class(custom_class_request)

    custom_class_name = (
        f"projects/{project_id}/locations/{location}/customClasses/{custom_class_id}"
    )

    # Create the phrase set resource, boosting phrases that reference the
    # custom class defined above.
    phrase_set_request = {
        "parent": parent,
        "phrase_set_id": phrase_set_id,
        "phrase_set": {
            "boost": 10,
            "phrases": [
                {"value": f"Visit restaurants like ${{{custom_class_name}}}"}
            ],
        },
    }
    phrase_set_response = adaptation_client.create_phrase_set(phrase_set_request)
    phrase_set_name = phrase_set_response.name

    # The next section shows how to use the newly created custom
    # class and phrase set to send a transcription request with speech adaptation

    # Speech adaptation configuration
    speech_adaptation = speech.SpeechAdaptation(phrase_set_references=[phrase_set_name])

    # speech configuration object
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=24000,
        language_code="en-US",
        adaptation=speech_adaptation,
    )

    # The name of the audio file to transcribe
    # storage_uri URI for audio file in Cloud Storage, e.g. gs://[BUCKET]/[FILE]
    audio = speech.RecognitionAudio(uri=storage_uri)

    # Create the speech client
    speech_client = speech.SpeechClient()

    response = speech_client.recognize(config=config, audio=audio)

    for result in response.results:
        print("Transcript: {}".format(result.alternatives[0].transcript))

    # [END speech_transcribe_with_model_adaptation]

    return response.results[0].alternatives[0].transcript


# ---- speech/snippets/speech_model_adaptation_beta_test.py ----
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import uuid

import google.auth

from google.cloud import speech_v1p1beta1 as speech

import pytest

import speech_model_adaptation_beta


STORAGE_URI = "gs://cloud-samples-data/speech/brooklyn_bridge.raw"
_, PROJECT_ID = google.auth.default()
LOCATION = "global"
client = speech.AdaptationClient()


def test_model_adaptation_beta(custom_class_id, phrase_set_id, capsys):
    transcript = speech_model_adaptation_beta.transcribe_with_model_adaptation(
        PROJECT_ID, LOCATION, STORAGE_URI, custom_class_id, phrase_set_id
    )
    assert "how long is the Brooklyn Bridge" in transcript


@pytest.fixture
def custom_class_id():
    # The custom class id can't be too long
    class_id = f"customClassId{str(uuid.uuid4())[:8]}"
    yield class_id
    # clean up resources
    client.delete_custom_class(
        name=f"projects/{PROJECT_ID}/locations/{LOCATION}/customClasses/{class_id}"
    )


@pytest.fixture
def phrase_set_id():
    # The phrase set id can't be too long
    set_id = f"phraseSetId{str(uuid.uuid4())[:8]}"
    yield set_id
    # clean up resources
    client.delete_phrase_set(
        name=f"projects/{PROJECT_ID}/locations/{LOCATION}/phraseSets/{set_id}"
    )


# ---- speech/snippets/speech_quickstart_beta.py ----
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# DO NOT EDIT! This is a generated sample ("Request", "speech_quickstart_beta")

# To install the latest published package dependency, execute the following:
#   pip install google-cloud-speech

# sample-metadata
#   title: Quickstart Beta
#   description: Performs synchronous speech recognition on an audio file
#   usage: python3 samples/v1p1beta1/speech_quickstart_beta.py [--storage_uri "gs://cloud-samples-data/speech/brooklyn_bridge.mp3"]

# [START speech_quickstart_beta]
from google.cloud import speech_v1p1beta1 as speech


def sample_recognize(storage_uri):
    """
    Performs synchronous speech recognition on an audio file

    Args:
      storage_uri URI for audio file in Cloud Storage, e.g. gs://[BUCKET]/[FILE]
    """

    client = speech.SpeechClient()

    # storage_uri = 'gs://cloud-samples-data/speech/brooklyn_bridge.mp3'

    # Language of the supplied audio, its sample rate in Hertz, and its
    # encoding. The encoding is set explicitly here; it is optional for
    # FLAC and WAV audio formats.
    config = {
        "language_code": "en-US",
        "sample_rate_hertz": 44100,
        "encoding": speech.RecognitionConfig.AudioEncoding.MP3,
    }
    audio = {"uri": storage_uri}

    response = client.recognize(config=config, audio=audio)

    for result in response.results:
        # First alternative is the most probable result
        alternative = result.alternatives[0]
        print("Transcript: {}".format(alternative.transcript))

    # [END speech_quickstart_beta]
    return response


def main():
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--storage_uri",
        type=str,
        default="gs://cloud-samples-data/speech/brooklyn_bridge.mp3",
    )
    args = parser.parse_args()

    sample_recognize(args.storage_uri)


if __name__ == "__main__":
    main()


# ---- speech/snippets/speech_quickstart_beta_test.py ----
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+ +import speech_quickstart_beta + + +def test_quickstart_beta(): + response = speech_quickstart_beta.sample_recognize( + "gs://cloud-samples-data/speech/brooklyn_bridge.mp3" + ) + assert "brooklyn" in response.results[0].alternatives[0].transcript.lower() diff --git a/speech/snippets/speech_to_storage_beta.py b/speech/snippets/speech_to_storage_beta.py new file mode 100644 index 00000000000..5b0e4a102d4 --- /dev/null +++ b/speech/snippets/speech_to_storage_beta.py @@ -0,0 +1,85 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# [START speech_transcribe_with_speech_to_storage_beta] + +from google.cloud import speech +from google.cloud import storage +from google.cloud.speech_v1 import types + + +def export_transcript_to_storage_beta( + input_storage_uri, + output_storage_uri, + encoding, + sample_rate_hertz, + language_code, + bucket_name, + object_name, +): + + # input_uri URI for audio file in Cloud Storage, e.g. 
gs://[BUCKET]/[FILE] + audio = speech.RecognitionAudio(uri=input_storage_uri) + + # Pass in the URI of the Cloud Storage bucket to hold the transcription + output_config = speech.TranscriptOutputConfig(gcs_uri=output_storage_uri) + + # Speech configuration object + config = speech.RecognitionConfig( + encoding=encoding, + sample_rate_hertz=sample_rate_hertz, + language_code=language_code, + ) + + # Compose the long-running request + request = speech.LongRunningRecognizeRequest( + audio=audio, config=config, output_config=output_config + ) + + # create the speech client + speech_client = speech.SpeechClient() + + # create the storage client + storage_client = storage.Client() + + # run the recognizer to export transcript + operation = speech_client.long_running_recognize(request=request) + + print("Waiting for operation to complete...") + operation.result(timeout=90) + + # get bucket with name + bucket = storage_client.get_bucket(bucket_name) + + # get blob from bucket + blob = bucket.get_blob(object_name) + + # get content as bytes + results_bytes = blob.download_as_bytes() + + # get transcript exported in storage bucket + storage_transcript = types.LongRunningRecognizeResponse.from_json( + results_bytes, ignore_unknown_fields=True + ) + + # Each result is for a consecutive portion of the audio. Iterate through + # them to get the transcripts for the entire audio file. + for result in storage_transcript.results: + # The first alternative is the most likely one for this portion. 
+ print(f"Transcript: {result.alternatives[0].transcript}") + print(f"Confidence: {result.alternatives[0].confidence}") + + # [END speech_transcribe_with_speech_to_storage_beta] + return storage_transcript.results diff --git a/speech/snippets/speech_to_storage_beta_test.py b/speech/snippets/speech_to_storage_beta_test.py new file mode 100644 index 00000000000..88a8017d98f --- /dev/null +++ b/speech/snippets/speech_to_storage_beta_test.py @@ -0,0 +1,68 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import uuid + +from google.cloud import speech_v1p1beta1 as speech +from google.cloud import storage +import pytest + +import speech_to_storage_beta + +STORAGE_URI = "gs://cloud-samples-data/speech/brooklyn_bridge.raw" + + +storage_client = storage.Client() + +BUCKET_UUID = str(uuid.uuid4())[:8] +BUCKET_NAME = f"speech-{BUCKET_UUID}" +BUCKET_PREFIX = "export-transcript-output-test" +DELIMETER = None + +INPUT_STORAGE_URI = "gs://cloud-samples-data/speech/commercial_mono.wav" +OUTPUT_STORAGE_URI = f"gs://{BUCKET_NAME}/{BUCKET_PREFIX}" +encoding = speech.RecognitionConfig.AudioEncoding.LINEAR16 +sample_rate_hertz = 8000 +language_code = "en-US" + + +def test_export_transcript_to_storage_beta(bucket, capsys): + results = speech_to_storage_beta.export_transcript_to_storage_beta( + INPUT_STORAGE_URI, + OUTPUT_STORAGE_URI, + encoding, + sample_rate_hertz, + language_code, + BUCKET_NAME, + BUCKET_PREFIX, + ) + assert len(results) > 0 + + +@pytest.fixture +def bucket(): + """Yields a bucket that is deleted after the test completes.""" + bucket = None + while bucket is None or bucket.exists(): + bucket = storage_client.bucket(BUCKET_NAME) + bucket.storage_class = "COLDLINE" + storage_client.create_bucket(bucket, location="us") + yield bucket + + blobs = storage_client.list_blobs(BUCKET_NAME, prefix=BUCKET_PREFIX) + + for blob in blobs: + blob.delete() + + bucket.delete(force=True) diff --git a/speech/snippets/transcribe.py b/speech/snippets/transcribe.py new file mode 100644 index 00000000000..e31fe9e4be1 --- /dev/null +++ b/speech/snippets/transcribe.py @@ -0,0 +1,99 @@ +# Copyright 2017 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Google Cloud Speech API sample application using the REST API for batch +processing. + +Example usage: + python transcribe.py resources/audio.raw + python transcribe.py gs://cloud-samples-tests/speech/brooklyn.flac +""" + +import argparse + + +# [START speech_transcribe_sync] +def transcribe_file(speech_file): + """Transcribe the given audio file.""" + from google.cloud import speech + import io + + client = speech.SpeechClient() + + # [START speech_python_migration_sync_request] + # [START speech_python_migration_config] + with io.open(speech_file, "rb") as audio_file: + content = audio_file.read() + + audio = speech.RecognitionAudio(content=content) + config = speech.RecognitionConfig( + encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16, + sample_rate_hertz=16000, + language_code="en-US", + ) + # [END speech_python_migration_config] + + # [START speech_python_migration_sync_response] + response = client.recognize(config=config, audio=audio) + + # [END speech_python_migration_sync_request] + # Each result is for a consecutive portion of the audio. Iterate through + # them to get the transcripts for the entire audio file. + for result in response.results: + # The first alternative is the most likely one for this portion. 
+ print("Transcript: {}".format(result.alternatives[0].transcript)) + # [END speech_python_migration_sync_response] + + +# [END speech_transcribe_sync] + + +# [START speech_transcribe_sync_gcs] +def transcribe_gcs(gcs_uri): + """Transcribes the audio file specified by the gcs_uri.""" + from google.cloud import speech + + client = speech.SpeechClient() + + # [START speech_python_migration_config_gcs] + audio = speech.RecognitionAudio(uri=gcs_uri) + config = speech.RecognitionConfig( + encoding=speech.RecognitionConfig.AudioEncoding.FLAC, + sample_rate_hertz=16000, + language_code="en-US", + ) + # [END speech_python_migration_config_gcs] + + response = client.recognize(config=config, audio=audio) + + # Each result is for a consecutive portion of the audio. Iterate through + # them to get the transcripts for the entire audio file. + for result in response.results: + # The first alternative is the most likely one for this portion. + print("Transcript: {}".format(result.alternatives[0].transcript)) + + +# [END speech_transcribe_sync_gcs] + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter + ) + parser.add_argument("path", help="File or GCS path for audio file to be recognized") + args = parser.parse_args() + if args.path.startswith("gs://"): + transcribe_gcs(args.path) + else: + transcribe_file(args.path) diff --git a/speech/snippets/transcribe_async_file.py b/speech/snippets/transcribe_async_file.py new file mode 100644 index 00000000000..873cf7bbd54 --- /dev/null +++ b/speech/snippets/transcribe_async_file.py @@ -0,0 +1,60 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Google Cloud Speech-to-Text sample application using gRPC for async +batch processing. +""" + + +# [START speech_transcribe_async] +def transcribe_file(speech_file): + """Transcribe the given audio file asynchronously.""" + from google.cloud import speech + + client = speech.SpeechClient() + + # [START speech_python_migration_async_request] + with open(speech_file, "rb") as audio_file: + content = audio_file.read() + + """ + Note that transcription is limited to a 60 seconds audio file. + Use a GCS file for audio longer than 1 minute. + """ + audio = speech.RecognitionAudio(content=content) + + config = speech.RecognitionConfig( + encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16, + sample_rate_hertz=16000, + language_code="en-US", + ) + + # [START speech_python_migration_async_response] + + operation = client.long_running_recognize(config=config, audio=audio) + # [END speech_python_migration_async_request] + + print("Waiting for operation to complete...") + response = operation.result(timeout=90) + + # Each result is for a consecutive portion of the audio. Iterate through + # them to get the transcripts for the entire audio file. + for result in response.results: + # The first alternative is the most likely one for this portion. 
+ print("Transcript: {}".format(result.alternatives[0].transcript)) + print("Confidence: {}".format(result.alternatives[0].confidence)) + # [END speech_python_migration_async_response] + + +# [END speech_transcribe_async] diff --git a/speech/snippets/transcribe_async_file_test.py b/speech/snippets/transcribe_async_file_test.py new file mode 100644 index 00000000000..8659a7b0832 --- /dev/null +++ b/speech/snippets/transcribe_async_file_test.py @@ -0,0 +1,27 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import re + +import transcribe_async_file + +RESOURCES = os.path.join(os.path.dirname(__file__), "resources") + + +def test_transcribe(capsys): + transcribe_async_file.transcribe_file(os.path.join(RESOURCES, "audio.raw")) + out, err = capsys.readouterr() + + assert re.search(r"how old is the Brooklyn Bridge", out, re.DOTALL | re.I) diff --git a/speech/snippets/transcribe_async_gcs.py b/speech/snippets/transcribe_async_gcs.py new file mode 100644 index 00000000000..3a0b5c973c3 --- /dev/null +++ b/speech/snippets/transcribe_async_gcs.py @@ -0,0 +1,47 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Google Cloud Speech-to-Text sample application using the gRPC for async +batch processing. +""" + + +# [START speech_transcribe_async_gcs] +def transcribe_gcs(gcs_uri): + """Asynchronously transcribes the audio file specified by the gcs_uri.""" + from google.cloud import speech + + client = speech.SpeechClient() + + audio = speech.RecognitionAudio(uri=gcs_uri) + config = speech.RecognitionConfig( + encoding=speech.RecognitionConfig.AudioEncoding.FLAC, + sample_rate_hertz=16000, + language_code="en-US", + ) + + operation = client.long_running_recognize(config=config, audio=audio) + + print("Waiting for operation to complete...") + response = operation.result(timeout=90) + + # Each result is for a consecutive portion of the audio. Iterate through + # them to get the transcripts for the entire audio file. + for result in response.results: + # The first alternative is the most likely one for this portion. + print("Transcript: {}".format(result.alternatives[0].transcript)) + print("Confidence: {}".format(result.alternatives[0].confidence)) + + +# [END speech_transcribe_async_gcs] diff --git a/speech/snippets/transcribe_async_gcs_test.py b/speech/snippets/transcribe_async_gcs_test.py new file mode 100644 index 00000000000..bd533addbc2 --- /dev/null +++ b/speech/snippets/transcribe_async_gcs_test.py @@ -0,0 +1,28 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import re + +import transcribe_async_gcs + +RESOURCES = os.path.join(os.path.dirname(__file__), "resources") + + +def test_transcribe_gcs(capsys): + gcs_path = "gs://python-docs-samples-tests/speech/audio.flac" + transcribe_async_gcs.transcribe_gcs(gcs_path) + out, err = capsys.readouterr() + + assert re.search(r"how old is the Brooklyn Bridge", out, re.DOTALL | re.I) diff --git a/speech/snippets/transcribe_auto_punctuation.py b/speech/snippets/transcribe_auto_punctuation.py new file mode 100644 index 00000000000..e3ce0895349 --- /dev/null +++ b/speech/snippets/transcribe_auto_punctuation.py @@ -0,0 +1,64 @@ +# Copyright 2018 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Google Cloud Speech API sample that demonstrates auto punctuation +and recognition metadata. 
+ +Example usage: + python transcribe_auto_punctuation.py resources/commercial_mono.wav +""" + +import argparse +import io + + +def transcribe_file_with_auto_punctuation(path): + """Transcribe the given audio file with auto punctuation enabled.""" + # [START speech_transcribe_auto_punctuation] + from google.cloud import speech + + client = speech.SpeechClient() + + # path = 'resources/commercial_mono.wav' + with io.open(path, "rb") as audio_file: + content = audio_file.read() + + audio = speech.RecognitionAudio(content=content) + config = speech.RecognitionConfig( + encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16, + sample_rate_hertz=8000, + language_code="en-US", + # Enable automatic punctuation + enable_automatic_punctuation=True, + ) + + response = client.recognize(config=config, audio=audio) + + for i, result in enumerate(response.results): + alternative = result.alternatives[0] + print("-" * 20) + print("First alternative of result {}".format(i)) + print("Transcript: {}".format(alternative.transcript)) + # [END speech_transcribe_auto_punctuation] + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter + ) + parser.add_argument("path", help="File to stream to the API") + + args = parser.parse_args() + + transcribe_file_with_auto_punctuation(args.path) diff --git a/speech/snippets/transcribe_auto_punctuation_test.py b/speech/snippets/transcribe_auto_punctuation_test.py new file mode 100644 index 00000000000..fa4714d0aa8 --- /dev/null +++ b/speech/snippets/transcribe_auto_punctuation_test.py @@ -0,0 +1,28 @@ +# Copyright 2018 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import transcribe_auto_punctuation + +RESOURCES = os.path.join(os.path.dirname(__file__), "resources") + + +def test_transcribe_file_with_auto_punctuation(capsys): + transcribe_auto_punctuation.transcribe_file_with_auto_punctuation( + "resources/commercial_mono.wav" + ) + out, _ = capsys.readouterr() + + assert "First alternative of result " in out diff --git a/speech/snippets/transcribe_context_classes.py b/speech/snippets/transcribe_context_classes.py new file mode 100644 index 00000000000..fc22e18404a --- /dev/null +++ b/speech/snippets/transcribe_context_classes.py @@ -0,0 +1,49 @@ +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +def transcribe_context_classes(storage_uri): + """Provides "hints" to the speech recognizer to + favor specific classes of words in the results.""" + # [START speech_context_classes] + from google.cloud import speech + + client = speech.SpeechClient() + + # storage_uri = 'gs://YOUR_BUCKET_ID/path/to/your/file.wav' + audio = speech.RecognitionAudio(uri=storage_uri) + + # SpeechContext: to configure your speech_context see: + # https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#speechcontext + # Full list of supported phrases (class tokens) here: + # https://cloud.google.com/speech-to-text/docs/class-tokens + speech_context = speech.SpeechContext(phrases=["$TIME"]) + + # RecognitionConfig: to configure your encoding and sample_rate_hertz, see: + # https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#recognitionconfig + config = speech.RecognitionConfig( + encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16, + sample_rate_hertz=8000, + language_code="en-US", + speech_contexts=[speech_context], + ) + + response = client.recognize(config=config, audio=audio) + + for i, result in enumerate(response.results): + alternative = result.alternatives[0] + print("-" * 20) + print("First alternative of result {}".format(i)) + print("Transcript: {}".format(alternative.transcript)) + # [END speech_context_classes] diff --git a/speech/snippets/transcribe_context_classes_test.py b/speech/snippets/transcribe_context_classes_test.py new file mode 100644 index 00000000000..1c3e8153d40 --- /dev/null +++ b/speech/snippets/transcribe_context_classes_test.py @@ -0,0 +1,24 @@ +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import transcribe_context_classes + + +def test_transcribe_context_classes(capsys): + transcribe_context_classes.transcribe_context_classes( + "gs://cloud-samples-data/speech/commercial_mono.wav" + ) + out, _ = capsys.readouterr() + + assert "First alternative of result " in out diff --git a/speech/snippets/transcribe_enhanced_model.py b/speech/snippets/transcribe_enhanced_model.py new file mode 100644 index 00000000000..3a80bc2bd91 --- /dev/null +++ b/speech/snippets/transcribe_enhanced_model.py @@ -0,0 +1,66 @@ +# Copyright 2018 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Google Cloud Speech API sample that demonstrates enhanced models +and recognition metadata. 
+ +Example usage: + python transcribe_enhanced_model.py resources/commercial_mono.wav +""" + +import argparse + + +def transcribe_file_with_enhanced_model(path): + """Transcribe the given audio file using an enhanced model.""" + # [START speech_transcribe_enhanced_model] + import io + + from google.cloud import speech + + client = speech.SpeechClient() + + # path = 'resources/commercial_mono.wav' + with io.open(path, "rb") as audio_file: + content = audio_file.read() + + audio = speech.RecognitionAudio(content=content) + config = speech.RecognitionConfig( + encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16, + sample_rate_hertz=8000, + language_code="en-US", + use_enhanced=True, + # A model must be specified to use enhanced model. + model="phone_call", + ) + + response = client.recognize(config=config, audio=audio) + + for i, result in enumerate(response.results): + alternative = result.alternatives[0] + print("-" * 20) + print("First alternative of result {}".format(i)) + print("Transcript: {}".format(alternative.transcript)) + # [END speech_transcribe_enhanced_model] + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter + ) + parser.add_argument("path", help="File to stream to the API") + + args = parser.parse_args() + + transcribe_file_with_enhanced_model(args.path) diff --git a/speech/snippets/transcribe_enhanced_model_test.py b/speech/snippets/transcribe_enhanced_model_test.py new file mode 100644 index 00000000000..eab6397d917 --- /dev/null +++ b/speech/snippets/transcribe_enhanced_model_test.py @@ -0,0 +1,28 @@ +# Copyright 2018 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import transcribe_enhanced_model + +RESOURCES = os.path.join(os.path.dirname(__file__), "resources") + + +def test_transcribe_file_with_enhanced_model(capsys): + transcribe_enhanced_model.transcribe_file_with_enhanced_model( + "resources/commercial_mono.wav" + ) + out, _ = capsys.readouterr() + + assert "Chrome" in out diff --git a/speech/snippets/transcribe_file_v2.py b/speech/snippets/transcribe_file_v2.py new file mode 100644 index 00000000000..1c50c7bf82b --- /dev/null +++ b/speech/snippets/transcribe_file_v2.py @@ -0,0 +1,62 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +# [START speech_transcribe_file_v2] +import io + +from google.cloud.speech_v2 import SpeechClient +from google.cloud.speech_v2.types import cloud_speech + + +def transcribe_file_v2(project_id, recognizer_id, audio_file): + # Instantiates a client + client = SpeechClient() + + request = cloud_speech.CreateRecognizerRequest( + parent=f"projects/{project_id}/locations/global", + recognizer_id=recognizer_id, + recognizer=cloud_speech.Recognizer( + language_codes=["en-US"], model="latest_long" + ), + ) + + # Creates a Recognizer + operation = client.create_recognizer(request=request) + recognizer = operation.result() + + # Reads a file as bytes + with io.open(audio_file, "rb") as f: + content = f.read() + + config = cloud_speech.RecognitionConfig(auto_decoding_config={}) + + request = cloud_speech.RecognizeRequest( + recognizer=recognizer.name, config=config, content=content + ) + + # Transcribes the audio into text + response = client.recognize(request=request) + + for result in response.results: + print("Transcript: {}".format(result.alternatives[0].transcript)) + + return response + + +# [END speech_transcribe_file_v2] + + +if __name__ == "__main__": + transcribe_file_v2() diff --git a/speech/snippets/transcribe_file_v2_test.py b/speech/snippets/transcribe_file_v2_test.py new file mode 100644 index 00000000000..d135c6f71d8 --- /dev/null +++ b/speech/snippets/transcribe_file_v2_test.py @@ -0,0 +1,49 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import re +from uuid import uuid4 + +from google.cloud.speech_v2 import SpeechClient +from google.cloud.speech_v2.types import cloud_speech + +import transcribe_file_v2 + +RESOURCES = os.path.join(os.path.dirname(__file__), "resources") + + +def delete_recognizer(name): + client = SpeechClient() + request = cloud_speech.DeleteRecognizerRequest(name=name) + client.delete_recognizer(request=request) + + +def test_transcribe_file_v2(capsys): + project_id = os.getenv("GOOGLE_CLOUD_PROJECT") + + recognizer_id = "recognizer-" + str(uuid4()) + response = transcribe_file_v2.transcribe_file_v2( + project_id, recognizer_id, os.path.join(RESOURCES, "audio.wav") + ) + + assert re.search( + r"how old is the Brooklyn Bridge", + response.results[0].alternatives[0].transcript, + re.DOTALL | re.I, + ) + + delete_recognizer( + f"projects/{project_id}/locations/global/recognizers/{recognizer_id}" + ) diff --git a/speech/snippets/transcribe_gcs_v2.py b/speech/snippets/transcribe_gcs_v2.py new file mode 100644 index 00000000000..80d67c9a431 --- /dev/null +++ b/speech/snippets/transcribe_gcs_v2.py @@ -0,0 +1,56 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +# [START speech_transcribe_gcs_v2] +from google.cloud.speech_v2 import SpeechClient +from google.cloud.speech_v2.types import cloud_speech + + +def transcribe_gcs_v2(project_id, recognizer_id, gcs_uri): + # Instantiates a client + client = SpeechClient() + + request = cloud_speech.CreateRecognizerRequest( + parent=f"projects/{project_id}/locations/global", + recognizer_id=recognizer_id, + recognizer=cloud_speech.Recognizer( + language_codes=["en-US"], model="latest_long" + ), + ) + + # Creates a Recognizer + operation = client.create_recognizer(request=request) + recognizer = operation.result() + + config = cloud_speech.RecognitionConfig(auto_decoding_config={}) + + request = cloud_speech.RecognizeRequest( + recognizer=recognizer.name, config=config, uri=gcs_uri + ) + + # Transcribes the audio into text + response = client.recognize(request=request) + + for result in response.results: + print("Transcript: {}".format(result.alternatives[0].transcript)) + + return response + + +# [END speech_transcribe_gcs_v2] + + +if __name__ == "__main__": + transcribe_gcs_v2() diff --git a/speech/snippets/transcribe_gcs_v2_test.py b/speech/snippets/transcribe_gcs_v2_test.py new file mode 100644 index 00000000000..cd712baf575 --- /dev/null +++ b/speech/snippets/transcribe_gcs_v2_test.py @@ -0,0 +1,47 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import re +from uuid import uuid4 + +from google.cloud.speech_v2 import SpeechClient +from google.cloud.speech_v2.types import cloud_speech + +import transcribe_gcs_v2 + + +def delete_recognizer(name): + client = SpeechClient() + request = cloud_speech.DeleteRecognizerRequest(name=name) + client.delete_recognizer(request=request) + + +def test_transcribe_gcs_v2(capsys): + project_id = os.getenv("GOOGLE_CLOUD_PROJECT") + + recognizer_id = "recognizer-" + str(uuid4()) + response = transcribe_gcs_v2.transcribe_gcs_v2( + project_id, recognizer_id, "gs://cloud-samples-data/speech/audio.flac" + ) + + assert re.search( + r"how old is the Brooklyn Bridge", + response.results[0].alternatives[0].transcript, + re.DOTALL | re.I, + ) + + delete_recognizer( + f"projects/{project_id}/locations/global/recognizers/{recognizer_id}" + ) diff --git a/speech/snippets/transcribe_model_selection.py b/speech/snippets/transcribe_model_selection.py new file mode 100644 index 00000000000..d682dfcb38d --- /dev/null +++ b/speech/snippets/transcribe_model_selection.py @@ -0,0 +1,109 @@ +# Copyright 2017 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Google Cloud Speech API sample that demonstrates how to select the model +used for speech recognition. 
+ +Example usage: + python transcribe_model_selection.py \ + resources/Google_Gnome.wav --model video + python transcribe_model_selection.py \ + gs://cloud-samples-tests/speech/Google_Gnome.wav --model video +""" + +import argparse + + +# [START speech_transcribe_model_selection] +def transcribe_model_selection(speech_file, model): + """Transcribe the given audio file synchronously with + the selected model.""" + from google.cloud import speech + + client = speech.SpeechClient() + + with open(speech_file, "rb") as audio_file: + content = audio_file.read() + + audio = speech.RecognitionAudio(content=content) + + config = speech.RecognitionConfig( + encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16, + sample_rate_hertz=16000, + language_code="en-US", + model=model, + ) + + response = client.recognize(config=config, audio=audio) + + for i, result in enumerate(response.results): + alternative = result.alternatives[0] + print("-" * 20) + print("First alternative of result {}".format(i)) + print("Transcript: {}".format(alternative.transcript)) + + +# [END speech_transcribe_model_selection] + + +# [START speech_transcribe_model_selection_gcs] +def transcribe_model_selection_gcs(gcs_uri, model): + """Transcribe the given audio file asynchronously with + the selected model.""" + from google.cloud import speech + + client = speech.SpeechClient() + + audio = speech.RecognitionAudio(uri=gcs_uri) + + config = speech.RecognitionConfig( + encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16, + sample_rate_hertz=16000, + language_code="en-US", + model=model, + ) + + operation = client.long_running_recognize(config=config, audio=audio) + + print("Waiting for operation to complete...") + response = operation.result(timeout=90) + + for i, result in enumerate(response.results): + alternative = result.alternatives[0] + print("-" * 20) + print("First alternative of result {}".format(i)) + print("Transcript: {}".format(alternative.transcript)) + + +# [END 
speech_transcribe_model_selection_gcs] + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter + ) + parser.add_argument("path", help="File or GCS path for audio file to be recognized") + parser.add_argument( + "--model", + help="The speech recognition model to use", + choices=["command_and_search", "phone_call", "video", "default"], + default="default", + ) + + args = parser.parse_args() + + if args.path.startswith("gs://"): + transcribe_model_selection_gcs(args.path, args.model) + else: + transcribe_model_selection(args.path, args.model) diff --git a/speech/snippets/transcribe_model_selection_test.py b/speech/snippets/transcribe_model_selection_test.py new file mode 100644 index 00000000000..c3619034963 --- /dev/null +++ b/speech/snippets/transcribe_model_selection_test.py @@ -0,0 +1,38 @@ +# Copyright 2016 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import re + +import transcribe_model_selection + +RESOURCES = os.path.join(os.path.dirname(__file__), "resources") + + +def test_transcribe_model_selection_file(capsys): + transcribe_model_selection.transcribe_model_selection( + os.path.join(RESOURCES, "Google_Gnome.wav"), "video" + ) + out, err = capsys.readouterr() + + assert re.search(r"the weather outside is sunny", out, re.DOTALL | re.I) + + +def test_transcribe_model_selection_gcs(capsys): + transcribe_model_selection.transcribe_model_selection_gcs( + "gs://cloud-samples-tests/speech/Google_Gnome.wav", "video" + ) + out, err = capsys.readouterr() + + assert re.search(r"the weather outside is sunny", out, re.DOTALL | re.I) diff --git a/speech/snippets/transcribe_multichannel.py b/speech/snippets/transcribe_multichannel.py new file mode 100644 index 00000000000..2e6ca5fde9b --- /dev/null +++ b/speech/snippets/transcribe_multichannel.py @@ -0,0 +1,96 @@ +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Google Cloud Speech API sample that demonstrates multichannel recognition. 
+ +Example usage: + python transcribe_multichannel.py resources/multi.wav + python transcribe_multichannel.py \ + gs://cloud-samples-tests/speech/multi.wav +""" + +import argparse + + +def transcribe_file_with_multichannel(speech_file): + """Transcribe the given audio file synchronously with + multi channel.""" + # [START speech_transcribe_multichannel] + from google.cloud import speech + + client = speech.SpeechClient() + + with open(speech_file, "rb") as audio_file: + content = audio_file.read() + + audio = speech.RecognitionAudio(content=content) + + config = speech.RecognitionConfig( + encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16, + sample_rate_hertz=44100, + language_code="en-US", + audio_channel_count=2, + enable_separate_recognition_per_channel=True, + ) + + response = client.recognize(config=config, audio=audio) + + for i, result in enumerate(response.results): + alternative = result.alternatives[0] + print("-" * 20) + print("First alternative of result {}".format(i)) + print("Transcript: {}".format(alternative.transcript)) + print("Channel Tag: {}".format(result.channel_tag)) + # [END speech_transcribe_multichannel] + + +def transcribe_gcs_with_multichannel(gcs_uri): + """Transcribe the given audio file on GCS with + multi channel.""" + # [START speech_transcribe_multichannel_gcs] + from google.cloud import speech + + client = speech.SpeechClient() + + audio = speech.RecognitionAudio(uri=gcs_uri) + + config = speech.RecognitionConfig( + encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16, + sample_rate_hertz=44100, + language_code="en-US", + audio_channel_count=2, + enable_separate_recognition_per_channel=True, + ) + + response = client.recognize(config=config, audio=audio) + + for i, result in enumerate(response.results): + alternative = result.alternatives[0] + print("-" * 20) + print("First alternative of result {}".format(i)) + print("Transcript: {}".format(alternative.transcript)) + print("Channel Tag: {}".format(result.channel_tag)) 
+ # [END speech_transcribe_multichannel_gcs] + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter + ) + parser.add_argument("path", help="File or GCS path for audio file to be recognized") + args = parser.parse_args() + if args.path.startswith("gs://"): + transcribe_gcs_with_multichannel(args.path) + else: + transcribe_file_with_multichannel(args.path) diff --git a/speech/snippets/transcribe_multichannel_test.py b/speech/snippets/transcribe_multichannel_test.py new file mode 100644 index 00000000000..92de5e46037 --- /dev/null +++ b/speech/snippets/transcribe_multichannel_test.py @@ -0,0 +1,36 @@ +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os + +from transcribe_multichannel import ( + transcribe_file_with_multichannel, + transcribe_gcs_with_multichannel, +) + +RESOURCES = os.path.join(os.path.dirname(__file__), "resources") + + +def test_transcribe_multichannel_file(capsys): + transcribe_file_with_multichannel(os.path.join(RESOURCES, "multi.wav")) + out, err = capsys.readouterr() + + assert "how are you doing" in out + + +def test_transcribe_multichannel_gcs(capsys): + transcribe_gcs_with_multichannel("gs://cloud-samples-data/speech/multi.wav") + out, err = capsys.readouterr() + + assert "how are you doing" in out diff --git a/speech/snippets/transcribe_onprem/README.rst b/speech/snippets/transcribe_onprem/README.rst new file mode 100644 index 00000000000..dea6504637f --- /dev/null +++ b/speech/snippets/transcribe_onprem/README.rst @@ -0,0 +1,111 @@ +.. This file is automatically generated. Do not edit this file directly. + +Google Cloud Speech-to-Text On-Prem Python Samples +=============================================================================== + + +.. warning:: This product is only available to customers that have been granted access. Please `contact us`_ to request access to the Speech-to-Text On-Prem feature. + +This directory contains samples for `Google Cloud Speech-to-Text On-Prem`_. Speech-to-Text On-Prem enables easy integration of Google speech recognition technologies into your on-prem solution. + + +.. _Google Cloud Speech-to-Text On-Prem: https://cloud.google.com/speech-to-text/on-prem/priv/docs + +.. _contact us: https://cloud.google.com/contact + +.. _Google Cloud Speech-to-Text On-Prem: https://cloud.google.com/speech-to-text/on-prem/priv/docs + +Setup +------------------------------------------------------------------------------- + + +Prepare and Deploy API ++++++++++++++++++++++++ + +This sample requires you to have a Kubernetes cluster with the Speech-to-Text On-Prem service deployed. Follow the quickstart steps listed below: + +#. 
`Setup IAM, Kubernetes, Billing`_
+
+#. `Deploy the API using the UI or command line`_
+
+#. `Query the API to ensure it's working`_
+
+
+.. _Query the API to ensure it's working:
+   https://cloud.google.com/speech-to-text/on-prem/priv/docs/query
+
+.. _Deploy the API using the UI or command line:
+   https://cloud.google.com/speech-to-text/on-prem/priv/docs/deploy
+
+.. _Setup IAM, Kubernetes, Billing:
+   https://cloud.google.com/speech-to-text/on-prem/priv/docs/before-you-begin
+
+Install Dependencies
+++++++++++++++++++++
+
+#. Clone python-docs-samples and change directory to the sample directory you want to use.
+
+   .. code-block:: bash
+
+      $ git clone https://github.com/GoogleCloudPlatform/python-docs-samples.git
+      $ cd python-docs-samples/speech/snippets/transcribe_onprem
+
+#. Install `pip`_ and `virtualenv`_ if you do not already have them. You may want to refer to the `Python Development Environment Setup Guide`_ for Google Cloud Platform for instructions.
+
+   .. _Python Development Environment Setup Guide:
+      https://cloud.google.com/python/setup
+
+#. Create a virtualenv. Samples are compatible with Python 2.7 and 3.4+.
+
+   .. code-block:: bash
+
+      $ virtualenv env
+      $ source env/bin/activate
+
+#. Install the dependencies needed to run the samples.
+
+   .. code-block:: bash
+
+      $ pip install -r requirements.txt
+
+.. _pip: https://pip.pypa.io/
+.. _virtualenv: https://virtualenv.pypa.io/
+
+Samples
+-------------------------------------------------------------------------------
+
+transcribe_onprem
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+
+
+You can run this sample one of two ways, using a **public IP**:
+
+.. code-block:: bash
+
+    # Using a Public IP
+    $ python transcribe_onprem.py --file_path="../resources/two_channel_16k.wav" --api_endpoint=${PUBLIC_IP}:443
+
+or by using a **cluster level IP**:
+
+.. 
code-block:: bash + + # Using a cluster level IP + $ kubectl port-forward -n $NAMESPACE $POD 10000:443 + $ python transcribe_onprem.py --file_path="../resources/two_channel_16k.wav" --api_endpoint="0.0.0.0:10000" + +The client library +------------------------------------------------------------------------------- + +This sample uses the `Google Cloud Client Library for Python`_. +You can read the documentation for more details on API usage and use GitHub +to `browse the source`_ and `report issues`_. + +.. _Google Cloud Client Library for Python: + https://googlecloudplatform.github.io/google-cloud-python/ +.. _browse the source: + https://github.com/GoogleCloudPlatform/google-cloud-python +.. _report issues: + https://github.com/GoogleCloudPlatform/google-cloud-python/issues + + +.. _Google Cloud SDK: https://cloud.google.com/sdk/ \ No newline at end of file diff --git a/speech/snippets/transcribe_onprem/transcribe_onprem.py b/speech/snippets/transcribe_onprem/transcribe_onprem.py new file mode 100644 index 00000000000..0b057c22a19 --- /dev/null +++ b/speech/snippets/transcribe_onprem/transcribe_onprem.py @@ -0,0 +1,86 @@ +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse + + +# [START speech_transcribe_onprem] +def transcribe_onprem(local_file_path, api_endpoint): + """ + Transcribe a short audio file using synchronous speech recognition on-prem + + Args: + local_file_path: The path to local audio file, e.g. 
/path/audio.wav + api_endpoint: Endpoint to call for speech recognition, e.g. 0.0.0.0:10000 + """ + from google.cloud import speech_v1p1beta1 + import grpc + import io + + # api_endpoint = '0.0.0.0:10000' + # local_file_path = '../resources/two_channel_16k.raw' + + # Create a gRPC channel to your server + channel = grpc.insecure_channel(target=api_endpoint) + transport = speech_v1p1beta1.services.speech.transports.SpeechGrpcTransport( + channel=channel + ) + + client = speech_v1p1beta1.SpeechClient(transport=transport) + + # The language of the supplied audio + language_code = "en-US" + + # Sample rate in Hertz of the audio data sent + sample_rate_hertz = 16000 + + # Encoding of audio data sent. This sample sets this explicitly. + # This field is optional for FLAC and WAV audio formats. + encoding = speech_v1p1beta1.RecognitionConfig.AudioEncoding.LINEAR16 + config = { + "encoding": encoding, + "language_code": language_code, + "sample_rate_hertz": sample_rate_hertz, + } + with io.open(local_file_path, "rb") as f: + content = f.read() + audio = {"content": content} + + response = client.recognize(request={"config": config, "audio": audio}) + for result in response.results: + # First alternative is the most probable result + alternative = result.alternatives[0] + print(f"Transcript: {alternative.transcript}") + + +# [END speech_transcribe_onprem] + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter + ) + parser.add_argument( + "--file_path", + required=True, + help="Path to local audio file to be recognized, e.g. /path/audio.wav", + ) + parser.add_argument( + "--api_endpoint", + required=True, + help="Endpoint to call for speech recognition, e.g. 
0.0.0.0:10000", + ) + + args = parser.parse_args() + transcribe_onprem(local_file_path=args.file_path, api_endpoint=args.api_endpoint) diff --git a/speech/snippets/transcribe_streaming.py b/speech/snippets/transcribe_streaming.py new file mode 100644 index 00000000000..ceae7f3f947 --- /dev/null +++ b/speech/snippets/transcribe_streaming.py @@ -0,0 +1,83 @@ +# Copyright 2017 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Google Cloud Speech API sample application using the streaming API. + +Example usage: + python transcribe_streaming.py resources/audio.raw +""" + +import argparse + + +# [START speech_transcribe_streaming] +def transcribe_streaming(stream_file): + """Streams transcription of the given audio file.""" + import io + from google.cloud import speech + + client = speech.SpeechClient() + + # [START speech_python_migration_streaming_request] + with io.open(stream_file, "rb") as audio_file: + content = audio_file.read() + + # In practice, stream should be a generator yielding chunks of audio data. + stream = [content] + + requests = ( + speech.StreamingRecognizeRequest(audio_content=chunk) for chunk in stream + ) + + config = speech.RecognitionConfig( + encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16, + sample_rate_hertz=16000, + language_code="en-US", + ) + + streaming_config = speech.StreamingRecognitionConfig(config=config) + + # streaming_recognize returns a generator. 
+ # [START speech_python_migration_streaming_response] + responses = client.streaming_recognize( + config=streaming_config, + requests=requests, + ) + # [END speech_python_migration_streaming_request] + + for response in responses: + # Once the transcription has settled, the first result will contain the + # is_final result. The other results will be for subsequent portions of + # the audio. + for result in response.results: + print("Finished: {}".format(result.is_final)) + print("Stability: {}".format(result.stability)) + alternatives = result.alternatives + # The alternatives are ordered from most likely to least. + for alternative in alternatives: + print("Confidence: {}".format(alternative.confidence)) + print("Transcript: {}".format(alternative.transcript)) + # [END speech_python_migration_streaming_response] + + +# [END speech_transcribe_streaming] + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter + ) + parser.add_argument("stream", help="File to stream to the API") + args = parser.parse_args() + transcribe_streaming(args.stream) diff --git a/speech/snippets/transcribe_streaming_test.py b/speech/snippets/transcribe_streaming_test.py new file mode 100644 index 00000000000..13be2579e9a --- /dev/null +++ b/speech/snippets/transcribe_streaming_test.py @@ -0,0 +1,27 @@ +# Copyright 2017 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import re + +import transcribe_streaming + +RESOURCES = os.path.join(os.path.dirname(__file__), "resources") + + +def test_transcribe_streaming(capsys): + transcribe_streaming.transcribe_streaming(os.path.join(RESOURCES, "audio.raw")) + out, err = capsys.readouterr() + + assert re.search(r"how old is the Brooklyn Bridge", out, re.DOTALL | re.I) diff --git a/speech/snippets/transcribe_streaming_v2.py b/speech/snippets/transcribe_streaming_v2.py new file mode 100644 index 00000000000..3042acdb717 --- /dev/null +++ b/speech/snippets/transcribe_streaming_v2.py @@ -0,0 +1,83 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+import sys
+# [START speech_transcribe_streaming_v2]
+import io
+
+from google.cloud.speech_v2 import SpeechClient
+from google.cloud.speech_v2.types import cloud_speech
+
+
+def transcribe_streaming_v2(project_id, recognizer_id, audio_file):
+    # Instantiates a client
+    client = SpeechClient()
+
+    request = cloud_speech.CreateRecognizerRequest(
+        parent=f"projects/{project_id}/locations/global",
+        recognizer_id=recognizer_id,
+        recognizer=cloud_speech.Recognizer(
+            language_codes=["en-US"], model="latest_long"
+        ),
+    )
+
+    # Creates a Recognizer
+    operation = client.create_recognizer(request=request)
+    recognizer = operation.result()
+
+    # Reads a file as bytes
+    with io.open(audio_file, "rb") as f:
+        content = f.read()
+
+    # In practice, stream should be a generator yielding chunks of audio data
+    chunk_length = len(content) // 5
+    stream = [
+        content[start : start + chunk_length]
+        for start in range(0, len(content), chunk_length)
+    ]
+    audio_requests = (
+        cloud_speech.StreamingRecognizeRequest(audio=audio) for audio in stream
+    )
+
+    recognition_config = cloud_speech.RecognitionConfig(auto_decoding_config={})
+    streaming_config = cloud_speech.StreamingRecognitionConfig(
+        config=recognition_config
+    )
+    config_request = cloud_speech.StreamingRecognizeRequest(
+        recognizer=recognizer.name, streaming_config=streaming_config
+    )
+
+    def requests(config, audio):
+        yield config
+        for message in audio:
+            yield message
+
+    # Transcribes the audio into text
+    responses_iterator = client.streaming_recognize(
+        requests=requests(config_request, audio_requests)
+    )
+    responses = []
+    for response in responses_iterator:
+        responses.append(response)
+        for result in response.results:
+            print("Transcript: {}".format(result.alternatives[0].transcript))
+
+    return responses
+
+
+# [END speech_transcribe_streaming_v2]
+
+
+if __name__ == "__main__":
+    transcribe_streaming_v2(*sys.argv[1:4])
diff --git a/speech/snippets/transcribe_streaming_v2_test.py 
b/speech/snippets/transcribe_streaming_v2_test.py new file mode 100644 index 00000000000..8c2ed2ac3cc --- /dev/null +++ b/speech/snippets/transcribe_streaming_v2_test.py @@ -0,0 +1,54 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import re +from uuid import uuid4 + +from google.cloud.speech_v2 import SpeechClient +from google.cloud.speech_v2.types import cloud_speech + +import transcribe_streaming_v2 + +RESOURCES = os.path.join(os.path.dirname(__file__), "resources") + + +def delete_recognizer(name): + client = SpeechClient() + request = cloud_speech.DeleteRecognizerRequest(name=name) + client.delete_recognizer(request=request) + + +def test_transcribe_streaming_v2(capsys): + project_id = os.getenv("GOOGLE_CLOUD_PROJECT") + + recognizer_id = "recognizer-" + str(uuid4()) + responses = transcribe_streaming_v2.transcribe_streaming_v2( + project_id, recognizer_id, os.path.join(RESOURCES, "audio.wav") + ) + + transcript = "" + for response in responses: + for result in response.results: + transcript += result.alternatives[0].transcript + + assert re.search( + r"how old is the Brooklyn Bridge", + transcript, + re.DOTALL | re.I, + ) + + delete_recognizer( + f"projects/{project_id}/locations/global/recognizers/{recognizer_id}" + ) diff --git a/speech/snippets/transcribe_streaming_voice_activity_events.py b/speech/snippets/transcribe_streaming_voice_activity_events.py new file mode 100644 index 00000000000..b16c655c0ef --- 
/dev/null +++ b/speech/snippets/transcribe_streaming_voice_activity_events.py @@ -0,0 +1,110 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import argparse + +# [START speech_transcribe_streaming_voice_activity_events] +import io + +from google.cloud.speech_v2 import SpeechClient +from google.cloud.speech_v2.types import cloud_speech + + +def transcribe_streaming_voice_activity_events(project_id, recognizer_id, audio_file): + # Instantiates a client + client = SpeechClient() + + request = cloud_speech.CreateRecognizerRequest( + parent=f"projects/{project_id}/locations/global", + recognizer_id=recognizer_id, + recognizer=cloud_speech.Recognizer( + language_codes=["en-US"], model="latest_long" + ), + ) + + # Creates a Recognizer + operation = client.create_recognizer(request=request) + recognizer = operation.result() + + # Reads a file as bytes + with io.open(audio_file, "rb") as f: + content = f.read() + + # In practice, stream should be a generator yielding chunks of audio data + chunk_length = len(content) // 5 + stream = [ + content[start : start + chunk_length] + for start in range(0, len(content), chunk_length) + ] + audio_requests = ( + cloud_speech.StreamingRecognizeRequest(audio=audio) for audio in stream + ) + + recognition_config = cloud_speech.RecognitionConfig(auto_decoding_config={}) + + # Sets the flag to enable voice activity events + streaming_features = cloud_speech.StreamingRecognitionFeatures( + 
enable_voice_activity_events=True + ) + streaming_config = cloud_speech.StreamingRecognitionConfig( + config=recognition_config, streaming_features=streaming_features + ) + + config_request = cloud_speech.StreamingRecognizeRequest( + recognizer=recognizer.name, streaming_config=streaming_config + ) + + def requests(config, audio): + yield config + for message in audio: + yield message + + # Transcribes the audio into text + responses_iterator = client.streaming_recognize( + requests=requests(config_request, audio_requests) + ) + responses = [] + for response in responses_iterator: + responses.append(response) + if ( + response.speech_event_type + == cloud_speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_BEGIN + ): + print("Speech started.") + if ( + response.speech_event_type + == cloud_speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_END + ): + print("Speech ended.") + for result in response.results: + print("Transcript: {}".format(result.alternatives[0].transcript)) + + return responses + + +# [END speech_transcribe_streaming_voice_activity_events] + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter + ) + parser.add_argument("project_id", help="project to create recognizer in") + parser.add_argument("recognizer_id", help="name of recognizer to create") + parser.add_argument("audio_file", help="audio file to stream") + args = parser.parse_args() + transcribe_streaming_voice_activity_events( + args.project_id, args.recognizer_id, args.audio_file + ) diff --git a/speech/snippets/transcribe_streaming_voice_activity_events_test.py b/speech/snippets/transcribe_streaming_voice_activity_events_test.py new file mode 100644 index 00000000000..5b81d9e529c --- /dev/null +++ b/speech/snippets/transcribe_streaming_voice_activity_events_test.py @@ -0,0 +1,59 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the 
"License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import re +from uuid import uuid4 + +from google.cloud.speech_v2 import SpeechClient +from google.cloud.speech_v2.types import cloud_speech + +import transcribe_streaming_voice_activity_events + +RESOURCES = os.path.join(os.path.dirname(__file__), "resources") + + +def delete_recognizer(name): + client = SpeechClient() + request = cloud_speech.DeleteRecognizerRequest(name=name) + client.delete_recognizer(request=request) + + +def test_transcribe_streaming_voice_activity_events(capsys): + project_id = os.getenv("GOOGLE_CLOUD_PROJECT") + + recognizer_id = "recognizer-" + str(uuid4()) + responses = transcribe_streaming_voice_activity_events.transcribe_streaming_voice_activity_events( + project_id, recognizer_id, os.path.join(RESOURCES, "audio.wav") + ) + + transcript = "" + for response in responses: + for result in response.results: + transcript += result.alternatives[0].transcript + + assert ( + responses[0].speech_event_type + == cloud_speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_BEGIN + ) + + assert re.search( + r"how old is the Brooklyn Bridge", + transcript, + re.DOTALL | re.I, + ) + + delete_recognizer( + f"projects/{project_id}/locations/global/recognizers/{recognizer_id}" + ) diff --git a/speech/snippets/transcribe_streaming_voice_activity_timeouts.py b/speech/snippets/transcribe_streaming_voice_activity_timeouts.py new file mode 100644 index 00000000000..3050cc4b7e6 --- /dev/null +++ 
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import argparse

# [START speech_transcribe_streaming_voice_activity_timeouts]
from time import sleep

from google.cloud.speech_v2 import SpeechClient
from google.cloud.speech_v2.types import cloud_speech
from google.protobuf import duration_pb2  # type: ignore


def transcribe_streaming_voice_activity_timeouts(
    project_id, recognizer_id, speech_start_timeout, speech_end_timeout, audio_file
):
    """Transcribe ``audio_file`` with voice-activity-driven timeouts.

    Creates a recognizer, then streams the audio in chunks. The service ends
    the stream early when speech does not start within
    ``speech_start_timeout`` seconds, or when it detects
    ``speech_end_timeout`` seconds of trailing silence.

    Args:
        project_id: GCP project to create the recognizer in.
        recognizer_id: ID of the recognizer to create.
        speech_start_timeout: seconds to wait for speech to begin (int or
            numeric string).
        speech_end_timeout: seconds of silence after speech before ending
            (int or numeric string).
        audio_file: path to the local audio file to stream.

    Returns:
        The list of StreamingRecognizeResponse messages received.
    """
    # Instantiates a client
    client = SpeechClient()

    request = cloud_speech.CreateRecognizerRequest(
        parent=f"projects/{project_id}/locations/global",
        recognizer_id=recognizer_id,
        recognizer=cloud_speech.Recognizer(
            language_codes=["en-US"], model="latest_long"
        ),
    )

    # Creates a Recognizer (long-running operation; block until done)
    operation = client.create_recognizer(request=request)
    recognizer = operation.result()

    # Reads a file as bytes
    with open(audio_file, "rb") as f:
        content = f.read()

    # In practice, stream should be a generator yielding chunks of audio data
    # max(..., 1) prevents a zero step (ValueError) for files under 20 bytes.
    chunk_length = max(len(content) // 20, 1)
    stream = [
        content[start : start + chunk_length]
        for start in range(0, len(content), chunk_length)
    ]
    audio_requests = (
        cloud_speech.StreamingRecognizeRequest(audio=audio) for audio in stream
    )

    recognition_config = cloud_speech.RecognitionConfig(auto_decoding_config={})

    # Sets the flag to enable voice activity events and configures timeouts.
    # int(...) accepts values passed as strings (e.g. straight from argparse);
    # Duration rejects non-integer `seconds`.
    speech_start_timeout = duration_pb2.Duration(seconds=int(speech_start_timeout))
    speech_end_timeout = duration_pb2.Duration(seconds=int(speech_end_timeout))
    voice_activity_timeout = (
        cloud_speech.StreamingRecognitionFeatures.VoiceActivityTimeout(
            speech_start_timeout=speech_start_timeout,
            speech_end_timeout=speech_end_timeout,
        )
    )
    streaming_features = cloud_speech.StreamingRecognitionFeatures(
        enable_voice_activity_events=True, voice_activity_timeout=voice_activity_timeout
    )

    streaming_config = cloud_speech.StreamingRecognitionConfig(
        config=recognition_config, streaming_features=streaming_features
    )

    config_request = cloud_speech.StreamingRecognizeRequest(
        recognizer=recognizer.name, streaming_config=streaming_config
    )

    def requests(config, audio):
        # First request carries the config; the rest carry audio. The sleep
        # simulates real-time capture so the server-side timeouts can elapse.
        yield config
        for message in audio:
            sleep(0.5)
            yield message

    # Transcribes the audio into text
    responses_iterator = client.streaming_recognize(
        requests=requests(config_request, audio_requests)
    )

    responses = []
    for response in responses_iterator:
        responses.append(response)
        if (
            response.speech_event_type
            == cloud_speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_BEGIN
        ):
            print("Speech started.")
        if (
            response.speech_event_type
            == cloud_speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_END
        ):
            print("Speech ended.")
        for result in response.results:
            print("Transcript: {}".format(result.alternatives[0].transcript))

    return responses


# [END speech_transcribe_streaming_voice_activity_timeouts]


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
    )
    parser.add_argument("project_id", help="project to create recognizer in")
    parser.add_argument("recognizer_id", help="name of recognizer to create")
    # type=int: Duration(seconds=...) requires an integer, not the str that
    # argparse produces by default.
    parser.add_argument(
        "speech_start_timeout", type=int, help="timeout in seconds for speech start"
    )
    parser.add_argument(
        "speech_end_timeout", type=int, help="timeout in seconds for speech end"
    )
    parser.add_argument("audio_file", help="audio file to stream")
    args = parser.parse_args()
    transcribe_streaming_voice_activity_timeouts(
        args.project_id,
        args.recognizer_id,
        args.speech_start_timeout,
        args.speech_end_timeout,
        args.audio_file,
    )
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import re
from uuid import uuid4

from google.cloud.speech_v2 import SpeechClient
from google.cloud.speech_v2.types import cloud_speech

import transcribe_streaming_voice_activity_timeouts

RESOURCES = os.path.join(os.path.dirname(__file__), "resources")


def delete_recognizer(name):
    """Delete the recognizer with the given fully-qualified resource name."""
    client = SpeechClient()
    request = cloud_speech.DeleteRecognizerRequest(name=name)
    client.delete_recognizer(request=request)


def test_transcribe_streaming_voice_activity_timeouts(capsys):
    """Exercises both timeout directions against silence-padded audio.

    Each phase deletes its recognizer in a ``finally`` so assertion failures
    do not leak cloud resources across test runs.
    """
    project_id = os.getenv("GOOGLE_CLOUD_PROJECT")

    # Phase 1: short start timeout on leading silence -> stream ends with no
    # responses.
    recognizer_id = "recognizer-" + str(uuid4())
    try:
        responses = transcribe_streaming_voice_activity_timeouts.transcribe_streaming_voice_activity_timeouts(
            project_id,
            recognizer_id,
            1,
            5,
            os.path.join(RESOURCES, "audio_silence_padding.wav"),
        )

        # This assert doesn't seem deterministic. We should consider removing
        # or changing.
        assert len(responses) == 0
    finally:
        delete_recognizer(
            f"projects/{project_id}/locations/global/recognizers/{recognizer_id}"
        )

    # Phase 2: generous start timeout -> speech is recognized, with BEGIN and
    # END events followed by the transcript.
    recognizer_id_2 = "recognizer-2-" + str(uuid4())
    try:
        responses = transcribe_streaming_voice_activity_timeouts.transcribe_streaming_voice_activity_timeouts(
            project_id,
            recognizer_id_2,
            5,
            1,
            os.path.join(RESOURCES, "audio_silence_padding.wav"),
        )
        transcript = ""
        for response in responses:
            for result in response.results:
                transcript += result.alternatives[0].transcript

        assert (
            responses[0].speech_event_type
            == cloud_speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_BEGIN
        )

        assert (
            responses[1].speech_event_type
            == cloud_speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_END
        )

        assert re.search(
            r"how old is the Brooklyn Bridge",
            transcript,
            re.DOTALL | re.I,
        )
    finally:
        delete_recognizer(
            f"projects/{project_id}/locations/global/recognizers/{recognizer_id_2}"
        )
# Copyright 2016 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import re

import transcribe

RESOURCES = os.path.join(os.path.dirname(__file__), "resources")

# Phrase spoken in both test audio fixtures.
_EXPECTED_PHRASE = r"how old is the Brooklyn Bridge"


def test_transcribe_file(capsys):
    """The local-file sample prints the expected transcript."""
    transcribe.transcribe_file(os.path.join(RESOURCES, "audio.raw"))
    captured, _ = capsys.readouterr()

    assert re.search(_EXPECTED_PHRASE, captured, re.DOTALL | re.I)


def test_transcribe_gcs(capsys):
    """The GCS-URI sample prints the expected transcript."""
    transcribe.transcribe_gcs("gs://python-docs-samples-tests/speech/audio.flac")
    captured, _ = capsys.readouterr()

    assert re.search(_EXPECTED_PHRASE, captured, re.DOTALL | re.I)
# Copyright 2017 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Google Cloud Speech API sample that demonstrates word time offsets.

Example usage:
    python transcribe_word_time_offsets.py resources/audio.raw
    python transcribe_word_time_offsets.py \
        gs://cloud-samples-tests/speech/vr.flac
"""

import argparse


def transcribe_file_with_word_time_offsets(speech_file):
    """Transcribe the given audio file synchronously and print the word time
    offsets.

    Args:
        speech_file: path to a local LINEAR16 16 kHz audio file.
    """
    from google.cloud import speech

    client = speech.SpeechClient()

    with open(speech_file, "rb") as audio_file:
        content = audio_file.read()

    audio = speech.RecognitionAudio(content=content)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code="en-US",
        # Ask the API to annotate each word with its start/end time.
        enable_word_time_offsets=True,
    )

    response = client.recognize(config=config, audio=audio)

    for result in response.results:
        alternative = result.alternatives[0]
        print("Transcript: {}".format(alternative.transcript))

        for word_info in alternative.words:
            word = word_info.word
            start_time = word_info.start_time
            end_time = word_info.end_time

            print(
                f"Word: {word}, start_time: {start_time.total_seconds()}, end_time: {end_time.total_seconds()}"
            )


# [START speech_transcribe_async_word_time_offsets_gcs]
def transcribe_gcs_with_word_time_offsets(gcs_uri):
    """Transcribe the given audio file asynchronously and print the word time
    offsets.

    Args:
        gcs_uri: ``gs://`` URI of a FLAC 16 kHz audio file.
    """
    from google.cloud import speech

    client = speech.SpeechClient()

    audio = speech.RecognitionAudio(uri=gcs_uri)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.FLAC,
        sample_rate_hertz=16000,
        language_code="en-US",
        enable_word_time_offsets=True,
    )

    operation = client.long_running_recognize(config=config, audio=audio)

    print("Waiting for operation to complete...")
    # NOTE: bound to `response`, not `result`, so the loop variable below does
    # not shadow the long-running-operation result.
    response = operation.result(timeout=90)

    for result in response.results:
        alternative = result.alternatives[0]
        print("Transcript: {}".format(alternative.transcript))
        print("Confidence: {}".format(alternative.confidence))

        for word_info in alternative.words:
            word = word_info.word
            start_time = word_info.start_time
            end_time = word_info.end_time

            print(
                f"Word: {word}, start_time: {start_time.total_seconds()}, end_time: {end_time.total_seconds()}"
            )


# [END speech_transcribe_async_word_time_offsets_gcs]


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
    )
    parser.add_argument("path", help="File or GCS path for audio file to be recognized")
    args = parser.parse_args()
    # GCS URIs go through the asynchronous API; local files are synchronous.
    if args.path.startswith("gs://"):
        transcribe_gcs_with_word_time_offsets(args.path)
    else:
        transcribe_file_with_word_time_offsets(args.path)

import os
import re

import transcribe_word_time_offsets

RESOURCES = os.path.join(os.path.dirname(__file__), "resources")


def test_transcribe_file_with_word_time_offsets(capsys):
    """Local-file sample prints a positive start time for 'Bridge'."""
    transcribe_word_time_offsets.transcribe_file_with_word_time_offsets(
        os.path.join(RESOURCES, "audio.raw")
    )
    out, _ = capsys.readouterr()

    print(out)
    match = re.search(r"Bridge, start_time: ([0-9.]+)", out, re.DOTALL | re.I)
    # Fail with a clear assertion (not AttributeError on None) when the word
    # is missing from the output.
    assert match is not None
    assert float(match.group(1)) > 0


def test_transcribe_gcs_with_word_time_offsets(capsys):
    """GCS sample prints a positive start time for 'Bridge'."""
    transcribe_word_time_offsets.transcribe_gcs_with_word_time_offsets(
        "gs://python-docs-samples-tests/speech/audio.flac"
    )
    out, _ = capsys.readouterr()

    print(out)
    match = re.search(r"Bridge, start_time: ([0-9.]+)", out, re.DOTALL | re.I)
    assert match is not None
    assert float(match.group(1)) > 0