-
Notifications
You must be signed in to change notification settings - Fork 887
/
transcribe.py
executable file
·56 lines (44 loc) · 1.65 KB
/
transcribe.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
#!/usr/bin/env python
from __future__ import unicode_literals, print_function
from google.cloud import speech
from google.cloud.speech import enums
from google.cloud.speech import types
import argparse
import ffmpeg
import logging
import sys
logging.basicConfig(level=logging.INFO, format='%(message)s')
logger = logging.getLogger(__file__)
logger.setLevel(logging.INFO)
parser = argparse.ArgumentParser(description='Convert speech audio to text using Google Speech API')
parser.add_argument('in_filename', help='Input filename (`-` for stdin)')
def decode_audio(in_filename, **input_kwargs):
try:
out, err = (ffmpeg
.input(in_filename, **input_kwargs)
.output('-', format='s16le', acodec='pcm_s16le', ac=1, ar='16k')
.overwrite_output()
.run(capture_stdout=True, capture_stderr=True)
)
except ffmpeg.Error as e:
print(e.stderr, file=sys.stderr)
sys.exit(1)
return out
def get_transcripts(audio_data):
client = speech.SpeechClient()
audio = types.RecognitionAudio(content=audio_data)
config = types.RecognitionConfig(
encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
sample_rate_hertz=16000,
language_code='en-US'
)
response = client.recognize(config, audio)
return [result.alternatives[0].transcript for result in response.results]
def transcribe(in_filename):
audio_data = decode_audio(in_filename)
transcripts = get_transcripts(audio_data)
for transcript in transcripts:
print(repr(transcript.encode('utf-8')))
if __name__ == '__main__':
args = parser.parse_args()
transcribe(args.in_filename)