#!/usr/bin/env python

# Copyright 2016 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


def run_quickstart():
    """Transcribe the bundled sample audio file and print each transcript.

    Reads ``resources/audio.raw`` from the directory containing this file,
    sends its bytes to the Cloud Speech API through
    ``SpeechClient.recognize`` configured as 16000 Hz LINEAR16 audio in
    ``en-US``, and prints the transcript of the first alternative of every
    result in the response.
    """
    # [START speech_quickstart]
    import io
    import os

    # Imports the Google Cloud client library
    # [START migration_import]
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types
    # [END migration_import]

    # Instantiates a client
    # [START migration_client]
    client = speech.SpeechClient()
    # [END migration_client]

    # The name of the audio file to transcribe
    file_name = os.path.join(
        os.path.dirname(__file__),
        'resources',
        'audio.raw')

    # Loads the audio into memory
    with io.open(file_name, 'rb') as audio_file:
        content = audio_file.read()
        audio = types.RecognitionAudio(content=content)

    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code='en-US')

    # Detects speech in the audio file
    response = client.recognize(config, audio)

    for result in response.results:
        # Only the first alternative of each result is printed.
        print('Transcript: {}'.format(result.alternatives[0].transcript))
    # [END speech_quickstart]


# Run the quickstart only when this file is executed directly as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    run_quickstart()

Authentication and Configuration

SpeechClient objects provide a means to configure your application. Each instance holds an authenticated connection to the Cloud Speech Service.

For an overview of authentication in google-cloud-python, see Authentication.

Assuming your environment is set up as described in that document, create an instance of SpeechClient.

>>> from google.cloud import speech
>>> client = speech.SpeechClient()

Asynchronous Recognition

The long_running_recognize() method sends audio data to the Speech API and initiates a Long Running Operation.

Using this operation, you can periodically poll for recognition results. Use asynchronous requests for audio data of any duration up to 80 minutes.

See: Speech Asynchronous Recognize

>>> from google.cloud import speech
>>> client = speech.SpeechClient()
>>> operation = client.long_running_recognize(
        audio = speech.types.RecognitionAudio(
            uri = 'gs://my-bucket/recording.flac',
        ),
        config = speech.types.RecognitionConfig(
            encoding='LINEAR16',
            language_code = 'en-US',
            sample_rate_hertz=44100,
        ),
    )
>>> op_result = operation.result()
>>> for result in op_result.results:
        for alternative in result.alternatives:
            print('=' * 20)
            print(alternative.transcript)
            print(alternative.confidence)
====================
'how old is the Brooklyn Bridge'
0.98267895

Synchronous Recognition

The recognize() method converts speech data to text and returns alternative text transcriptions.

This example uses language_code='en-GB' to better recognize a dialect from Great Britain.

>>> from google.cloud import speech
>>> client = speech.SpeechClient()
>>> results = client.recognize(
        audio=speech.types.RecognitionAudio(
            uri='gs://my-bucket/recording.flac',
        ),
        config=speech.types.RecognitionConfig(
            encoding='LINEAR16',
            language_code='en-GB',
            sample_rate_hertz=44100,
        ),
    )
>>> for result in results.results:
        for alternative in result.alternatives:
            print('=' * 20)
            print('transcript: ' + alternative.transcript)
            print('confidence: ' + str(alternative.confidence))
====================
transcript: Hello, this is a test
confidence: 0.81
====================
transcript: Hello, this is one test
confidence: 0

Example of using the profanity filter

>>> from google.cloud import speech
>>> client = speech.SpeechClient()
>>> results = client.recognize(
        audio=speech.types.RecognitionAudio(
            uri='gs://my-bucket/recording.flac',
        ),
        config=speech.types.RecognitionConfig(
            encoding='LINEAR16',
            language_code='en-US',
            profanity_filter=True,
            sample_rate_hertz=44100,
        ),
    )
>>> for result in results.results:
        for alternative in result.alternatives:
            print('=' * 20)
            print('transcript: ' + alternative.transcript)
            print('confidence: ' + str(alternative.confidence))
====================
transcript: Hello, this is a f****** test
confidence: 0.81

Speech context hints can be used to get better results: they improve recognition accuracy for specific words and phrases, and they can also add new words to the recognizer's vocabulary.

>>> from google.cloud import speech
>>> client = speech.SpeechClient()
>>> results = client.recognize(
        audio=speech.types.RecognitionAudio(
            uri='gs://my-bucket/recording.flac',
        ),
        config=speech.types.RecognitionConfig(
            encoding='LINEAR16',
            language_code='en-US',
            sample_rate_hertz=44100,
            speech_contexts=[speech.types.SpeechContext(
                phrases=['hi', 'good afternoon'],
            )],
        ),
    )
>>> for result in results.results:
        for alternative in result.alternatives:
            print('=' * 20)
            print('transcript: ' + alternative.transcript)
            print('confidence: ' + str(alternative.confidence))
====================
transcript: Hello, this is a test
confidence: 0.81

Streaming Recognition

The streaming_recognize() method converts speech data to possible text alternatives on the fly. Note: Streaming recognition requests are limited to 1 minute of audio. See: https://cloud.google.com/speech/limits#content

>>> import io
>>> from google.cloud import speech
>>> client = speech.SpeechClient()
>>> config = speech.types.RecognitionConfig(
        encoding='LINEAR16',
        language_code='en-US',
        sample_rate_hertz=44100,
    )
>>> with io.open('./hello.wav', 'rb') as stream:
        requests = [speech.types.StreamingRecognizeRequest(
            audio_content=stream.read(),
        )]
>>> results = client.streaming_recognize(
        config=speech.types.StreamingRecognitionConfig(config=config),
        requests=requests,
    )
>>> for result in results:
        for alternative in result.alternatives:
            print('=' * 20)
            print('transcript: ' + alternative.transcript)
            print('confidence: ' + str(alternative.confidence))
====================
transcript: hello thank you for using Google Cloud platform
confidence: 0.927983105183

By default the API will perform continuous recognition (continuing to process audio even if the speaker in the audio pauses speaking) until the client closes the output stream or until the maximum time limit has been reached.

If you only want to recognize a single utterance you can set single_utterance to True and only one result will be returned.

See: Single Utterance

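```py
import io
from google.cloud import speech

client = speech.SpeechClient()
config = speech.types.RecognitionConfig(
    encoding='LINEAR16',
    language_code='en-US',
    sample_rate_hertz=44100,
)
with io.open('./hello-pause-goodbye.wav', 'rb') as stream:
    requests = [speech.types.StreamingRecognizeRequest(
        audio_content=stream.read(),
    )]
results = client.streaming_recognize(
    config=speech.types.StreamingRecognitionConfig(
        config=config,
        single_utterance=True,
    ),
    requests=requests,
)
```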

Google Speech-to-Text API Example Code

Authentication and Configuration

Asynchronous Recognition

Synchronous Recognition

Streaming Recognition

results matching ""

No results matching ""