#!/usr/bin/env python
# Copyright 2016 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
def run_quickstart():
    """Transcribe the bundled sample audio and print each transcript.

    Reads ``resources/audio.raw`` from the directory containing this file,
    submits it to the Speech API as 16 kHz LINEAR16 ``en-US`` audio through
    a synchronous ``recognize`` call, and prints the transcript of the first
    alternative of every result in the response.
    """
    # [START speech_quickstart]
    import io
    import os

    # Imports the Google Cloud client library
    # [START migration_import]
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types
    # [END migration_import]

    # Instantiates a client
    # [START migration_client]
    client = speech.SpeechClient()
    # [END migration_client]

    # The name of the audio file to transcribe
    file_name = os.path.join(
        os.path.dirname(__file__),
        'resources',
        'audio.raw')

    # Loads the audio into memory
    with io.open(file_name, 'rb') as audio_file:
        content = audio_file.read()
        audio = types.RecognitionAudio(content=content)

    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code='en-US')

    # Detects speech in the audio file
    response = client.recognize(config, audio)

    for result in response.results:
        # Only the first alternative of each result is printed.
        print('Transcript: {}'.format(result.alternatives[0].transcript))
    # [END speech_quickstart]
# Run the quickstart only when this file is executed as a script.
if __name__ == '__main__':
    run_quickstart()
Authentication and Configuration
SpeechClient
objects provide a means to configure your application. Each instance holds an authenticated connection to the Cloud Speech Service.
For an overview of authentication in google-cloud-python
, see Authentication.
Assuming your environment is set up as described in that document, create an instance of SpeechClient
.
>>> from google.cloud import speech
>>> client = speech.SpeechClient()
Asynchronous Recognition
The long_running_recognize()
method sends audio data to the Speech API and initiates a Long Running Operation.
Using this operation, you can periodically poll for recognition results. Use asynchronous requests for audio data of any duration up to 80 minutes.
See Speech Asynchronous Recognize
>>> from google.cloud import speech
>>> client = speech.SpeechClient()
>>> operation = client.long_running_recognize(
audio = speech.types.RecognitionAudio(
uri = 'gs://my-bucket/recording.flac',
),
config = speech.types.RecognitionConfig(
encoding='LINEAR16',
language_code = 'en-US',
sample_rate_hertz=44100,
),
)
>>> op_result = operation.result()
>>> for result in op_result.results:
for alternative in result.alternatives:
print('=' * 20)
print(alternative.transcript)
print(alternative.confidence)
====================
'how old is the Brooklyn Bridge'
0.98267895
Synchronous Recognition
The recognize()
method converts speech data to text and returns alternative text transcriptions.
This example uses language_code='en-GB'
to better recognize a dialect from Great Britain.
>>> from google.cloud import speech
>>> client = speech.SpeechClient()
>>> results = client.recognize(
audio=speech.types.RecognitionAudio(
uri='gs://my-bucket/recording.flac',
),
config=speech.types.RecognitionConfig(
encoding='LINEAR16',
language_code='en-GB',
sample_rate_hertz=44100,
),
)
>>> for result in results.results:
for alternative in result.alternatives:
print('=' * 20)
print('transcript: ' + alternative.transcript)
print('confidence: ' + str(alternative.confidence))
====================
transcript: Hello, this is a test
confidence: 0.81
====================
transcript: Hello, this is one test
confidence: 0
Example of using the profanity filter
>>> from google.cloud import speech
>>> client = speech.SpeechClient()
>>> results = client.recognize(
audio=speech.types.RecognitionAudio(
uri='gs://my-bucket/recording.flac',
),
config=speech.types.RecognitionConfig(
encoding='LINEAR16',
language_code='en-US',
profanity_filter=True,
sample_rate_hertz=44100,
),
)
>>> for result in results.results:
for alternative in result.alternatives:
print('=' * 20)
print('transcript: ' + alternative.transcript)
print('confidence: ' + str(alternative.confidence))
====================
transcript: Hello, this is a f****** test
confidence: 0.81
Use speech context hints to get better results. Hints can improve recognition accuracy for specific words and phrases, and can also add new words to the recognizer's vocabulary.
>>> from google.cloud import speech
>>> client = speech.SpeechClient()
>>> results = client.recognize(
audio=speech.types.RecognitionAudio(
uri='gs://my-bucket/recording.flac',
),
config=speech.types.RecognitionConfig(
encoding='LINEAR16',
language_code='en-US',
sample_rate_hertz=44100,
speech_contexts=[speech.types.SpeechContext(
phrases=['hi', 'good afternoon'],
)],
),
)
>>> for result in results.results:
for alternative in result.alternatives:
print('=' * 20)
print('transcript: ' + alternative.transcript)
print('confidence: ' + str(alternative.confidence))
====================
transcript: Hello, this is a test
confidence: 0.81
Streaming Recognition
The streaming_recognize()
method converts speech data to possible text alternatives on the fly. Note: Streaming recognition requests are limited to 1 minute of audio. See: https://cloud.google.com/speech/limits#content
>>> import io
>>> from google.cloud import speech
>>> client = speech.SpeechClient()
>>> config = speech.types.RecognitionConfig(
encoding='LINEAR16',
language_code='en-US',
sample_rate_hertz=44100,
)
>>> with io.open('./hello.wav', 'rb') as stream:
requests = [speech.types.StreamingRecognizeRequest(
audio_content=stream.read(),
)]
>>> results = client.streaming_recognize(
config=speech.types.StreamingRecognitionConfig(config=config),
requests=requests,
)
>>> for result in results:
for alternative in result.alternatives:
print('=' * 20)
print('transcript: ' + alternative.transcript)
print('confidence: ' + str(alternative.confidence))
====================
transcript: hello thank you for using Google Cloud platform
confidence: 0.927983105183
By default the API will perform continuous recognition (continuing to process audio even if the speaker in the audio pauses speaking) until the client closes the output stream or until the maximum time limit has been reached.
If you only want to recognize a single utterance you can set single_utterance
to True
and only one result will be returned.
```py
import io from google.cloud import speech client = speech.SpeechClient() config = speech.types.RecognitionConfig( encoding='LINEAR16', language_code='en-US', sample_rate_hertz=44100, ) with io.open('./hello-pause-goodbye.wav', 'rb') as stream: