-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
executable file
·313 lines (280 loc) · 9.11 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
#!/usr/bin/env python3
from allosaurus.app import read_recognizer
from elevenlabs import voices
from os import path
import csv
import json
import os
import sys
import time
import argparse
import base64
from tabulate import tabulate
from flask import Flask, json, request, abort
from urllib.parse import parse_qs
from audio.generator import ElevenLabsGenerator, FestivalGenerator
# Top-level CLI parser. Subcommands register themselves on `subparsers`
# via the @subcommand decorator defined below; dispatch happens in the
# __main__ guard at the bottom of the file.
cli = argparse.ArgumentParser(
    prog='SkeletonSpeechGenerator',
    description='Generates audio and timestamped phonemes for use in automatically generating animatronic skeleton speech. This can be used in conjunction with the chatainge module to dynamically control skeletons')
subparsers = cli.add_subparsers(dest="subcommand")
# some plumbing so we can create a clean CLI interface
# taken mostly from https://mike.depalatis.net/blog/simplifying-argparse.html
def argument(*name_or_flags, **kwargs):
    """Bundle names/flags and keyword options into the pair consumed by @subcommand."""
    return (list(name_or_flags), kwargs)
def subcommand(args=(), parent=None):
    """Decorator factory that registers a function as a CLI subcommand.

    args: iterable of (name_or_flags, kwargs) pairs as produced by argument().
    parent: the subparsers action to register on; defaults to the module-level
        `subparsers` when omitted (resolved lazily at decoration time).

    The parser is named after the function, its docstring becomes the
    description, and `args.func` is wired to the function itself.
    """
    def decorator(func):
        # Resolve the default lazily so this function does not capture
        # `subparsers` at definition time (also avoids a mutable default).
        target = subparsers if parent is None else parent
        parser = target.add_parser(func.__name__, description=func.__doc__)
        for arg in args:
            parser.add_argument(*arg[0], **arg[1])
        parser.set_defaults(func=func)
        # Bug fix: return the function so the decorated module-level name is
        # not rebound to None (the original decorator returned nothing).
        return func
    return decorator
@subcommand()
def festivaltest(args = [], parent=subparsers):
    """Smoke-test the Festival backend by synthesizing a short fixed phrase."""
    generator = FestivalGenerator()
    generator.generate("test", "", "this is a test")
#############################################
# api will start up an API server which can
# be used to generate metadatas
############################################
@subcommand()
def api(args = [], parent=subparsers):
    """Start a Flask API server exposing speech/phoneme generation.

    Routes:
      POST /last     -- return the metadata of the most recent generation
                        (read back from last.json).
      POST /generate -- synthesize audio + timed phonemes from form-encoded
                        voiceName / voiceID / text / name parameters.
    """
    # TODO: make a way to indicate which generator to use for each request
    elevenLabsGenerator = ElevenLabsGenerator(os.environ.get("ELEVENLABS_TOKEN"))
    festivalGenerator = FestivalGenerator()
    api = Flask(__name__)
    # create a map of the allophones to phonemes (loaded once at startup)
    print("loading phonemes mappings...")
    phonemeMap = alloToPhoneme()
    # load the english model
    print("loading phonemes model...")
    model = read_recognizer('eng2102')

    @api.route('/last', methods=['POST'])
    def test():
        # NOTE(review): the sleep looks like it paces polling clients -- confirm.
        time.sleep(2)
        with open('last.json', 'r') as openfile:
            json_object = json.load(openfile)
        return json_object

    @api.route('/generate', methods=['POST'])
    def generate():
        # The request body is a form-encoded query string.
        args = parse_qs(request.get_data(cache=False, as_text=True))
        print(args)
        voiceName = args['voiceName'][0]
        voiceID = args.get('voiceID', [None])[0]
        text = args['text'][0]
        outputName = args['name'][0]
        emitRatio = 0.7  # TODO: make this an arg too
        # `not text` covers both "" and None (was `text == "" or text == None`)
        if not text:
            print("no text provided, aborting")
            abort(400)
        # generate and return the audio and metadata
        # TODO: parameterize which model to use
        metadata = generateInternal(
            # elevenLabsGenerator,
            festivalGenerator,
            text,
            voiceID,
            voiceName,
            phonemeMap,
            model,
            outputName,
            emitRatio
        )
        # Persist for the /last route before returning to the caller.
        with open("last.json", "w") as outfile:
            json.dump(metadata, outfile)
        return json.dumps(metadata)
    api.run(host='0.0.0.0')
#############################################
# listVoices will just print a table of the available voices
#############################################
@subcommand()
def listVoices(args = [], parent=subparsers):
    """Print a table of the voices available from the ElevenLabs API."""
    # Bug fix: set_api_key was never imported at module level, so the
    # original raised NameError here. Import it locally to keep the fix
    # self-contained (the elevenlabs package is already a file dependency).
    from elevenlabs import set_api_key
    set_api_key(os.environ.get("ELEVENLABS_TOKEN"))
    table = [
        ["category", "id", "name", "labels"],
    ]
    for v in voices():
        table.append([v.category, v.voice_id, v.name, v.labels])
    print(tabulate(table, headers='firstrow'))
#############################################
# generateFull will run the full pipeline:
# 1. generate audio for the given text input and voice
# 2. store the mp3 and convert the audio to a mono wav
# 3. run allosaurus to generate the phonemes
# 4. output a metadata file with the timed phonemes
#############################################
@subcommand([
    argument("text", help="The text to generate"),
    argument("--voice-name", "-v", help="The name of the voice to use", default="gomez-test"),
    argument("--voice-id", help="The ID of the voice to use. If provided, will ignore `voice-name`."),
    argument("--output-name", "-o", help="A stub to use for naming all of the output files. There will be <outputName>.(mp3|wav|json) files created.", default="out"),
    argument("--emit-ratio", "-e", help="The emit value to pass to allosaurus, increasing means more phonemes will be generated", type=float, default=0.7),
])
def generateFull(args = [], parent=subparsers):
    """Run the full pipeline and write <output-name>.json metadata.

    1. generate audio for the given text input and voice
    2. store the mp3 and convert the audio to a mono wav
    3. run allosaurus to generate the phonemes
    4. output a metadata file with the timed phonemes
    """
    elevenLabsGenerator = ElevenLabsGenerator(os.environ.get("ELEVENLABS_TOKEN"))
    text = args.text
    voiceID = args.voice_id
    voiceName = args.voice_name
    emitRatio = args.emit_ratio
    outputName = args.output_name
    metadataFile = outputName + ".json"
    # `not text` covers both "" and None
    if not text:
        print("no text provided, aborting")
        sys.exit(1)
    # create a map of the allophones to phonemes.
    # Bug fix: the original assigned the result to a local named
    # `alloToPhoneme`, shadowing the module-level function and raising
    # UnboundLocalError on the call; use a distinct local name.
    print("loading phonemes mappings...")
    phonemeMap = alloToPhoneme()
    # load the english model
    print("loading phonemes model...")
    model = read_recognizer('eng2102')
    # generate and return the audio and metadata
    metadata = generateInternal(
        elevenLabsGenerator,
        text,
        voiceID,
        voiceName,
        phonemeMap,
        model,
        outputName,
        emitRatio
    )
    # Use a context manager so the file is closed even if serialization fails.
    with open(metadataFile, "w") as f:
        f.write(json.dumps(metadata, sort_keys=True, indent=4))
# abstract the core generation logic to this function to be leveraged by the api or cli
def generateInternal(generator, text, voiceID, voiceName, alloToPhoneme, model, outputName = "out", emitRatio = 0.7):
    """Generate speech audio plus timestamped visemes; return a metadata dict.

    generator:     audio backend exposing getVoiceID(name) and generate(stub, voiceID, text)
    text:          the text to speak
    voiceID:       voice identifier; looked up from voiceName when None
    voiceName:     human-readable voice name (used for lookup and metadata)
    alloToPhoneme: map of allosaurus allophones -> phonemes
    model:         an allosaurus recognizer
    outputName:    stub used for the <stub>.mp3 / <stub>.wav output files
    emitRatio:     allosaurus `emit` value (higher -> more phonemes)

    Raises ValueError when the voice name cannot be resolved to an ID.
    """
    mp3File = outputName + ".mp3"
    wavFile = outputName + ".wav"
    ## find the voice id from the voice names
    if voiceID is None:
        print("locating voice ID...")
        voiceID = generator.getVoiceID(voiceName)
        if voiceID is None:
            # Bug fix: `ValueException` does not exist (it raised NameError),
            # and the message concatenated the None voiceID instead of the name.
            raise ValueError("unable to locate voice " + voiceName)
    print("generating speech audio using Voice ID `" + voiceID + "`...")
    duration = generator.generate(outputName, voiceID, text)
    print("running phoneme model...")
    r = model.recognize(wavFile, timestamp=True, lang_id='eng', topk=1, emit=emitRatio)
    print("outputting metadata...")
    results = []
    for line in r.split('\n'):
        parts = line.split()
        # Guard against blank/truncated lines (e.g. a trailing newline),
        # which previously raised IndexError.
        if len(parts) < 3:
            continue
        time = parts[0]
        allophone = parts[2]
        phoneme = alloToPhoneme.get(allophone, allophone)
        results.append([time, phonemeToViseme.get(phoneme, 'unk')])
    # Bug fix: the original called .hex() on the base64 bytes, producing a
    # hex encoding of the base64 text rather than base64 itself. Decode to
    # ASCII so the "audio" field really is a base64 string.
    with open(mp3File, "rb") as f:
        base64Audio = base64.b64encode(f.read()).decode("ascii")
    metadata = {}
    metadata["outputName"] = outputName
    metadata["prompt"] = text
    metadata["voiceName"] = voiceName
    metadata["voiceID"] = voiceID
    metadata["mp3File"] = os.getcwd() + "/" + mp3File
    metadata["emitRatio"] = emitRatio
    metadata["results"] = results
    metadata["audioLength"] = duration
    metadata["audio"] = base64Audio
    return metadata
def alloToPhoneme(paths=('./inventory-full.csv', './inventory.csv')):
    """Build a map of allosaurus allophones to phonemes from inventory CSVs.

    Each CSV has a header row, the phoneme in column 6 and a space-separated
    allophone list in column 7. Later files override earlier entries, so
    ./inventory.csv takes precedence over ./inventory-full.csv.

    paths: CSV files to read, in increasing order of precedence (the default
        preserves the original behavior).
    """
    mapping = {}
    # The two inventories were previously read with duplicated code blocks;
    # loop over the file list instead.
    for filename in paths:
        with open(filename) as csvfile:
            reader = csv.reader(csvfile, delimiter=',')
            next(reader)  # skip header row
            for row in reader:
                for allophone in row[7].split():
                    mapping[allophone] = row[6]
    return mapping
# convert a voice name to a voice ID using the elevenlabs api
def findVoice(voiceName):
    """Return the ElevenLabs voice_id whose name matches voiceName, or None."""
    return next((v.voice_id for v in voices() if v.name == voiceName), None)
# map phonemes to visemes
# https://docs.aws.amazon.com/polly/latest/dg/ph-table-english-us.html
# Keys are IPA phonemes (as produced by allosaurus / the inventory CSVs);
# values are viseme codes roughly following the AWS Polly viseme table.
# Unknown phonemes fall back to 'unk' at the lookup site.
# NOTE(review): "NM" appears to mark phonemes with no viseme mapping yet
# -- confirm with the animatronic consumer before relying on it.
phonemeToViseme = {
    "e": "e",
    "eː": "e",
    "e̞": "e",
    "kʰ": "k",
    "a": "a",
    "aː": "a",
    "b": "p",
    "d": "t",
    "d̠": "t",
    "f": "f",
    "h": "k",
    "i": "i",
    "iː": "i",
    "j": "i",
    "k": "k",
    "l": "t",
    "m": "p",
    "n": "t",
    "p": "p",
    "ð": "T",
    "θ": "T",
    "pʰ": "p",
    "s": "s",
    "t": "t",
    "tʰ": "t",
    "t̠": "t",
    "u": "u",
    "uː": "u",
    "v": "f",
    "w": "u",
    "x": "k",
    "z": "s",
    "æ": "a",
    "ə": "@",
    "əː": "@",
    "ɛ": "E",
    "ɛː": "E",
    "ɜː": "E",
    "ɡ": "k",
    "ɪ": "i",
    "ɵː": "T",
    "ɹ": "r",
    "ʃ": "S",
    "ʒ": "S",
    "ŋ": "k",
    "ɒ": "O",
    "ɒː": "O",
    "ʊ": "u",
    "ʌ": "E",
    "o": "NM",
    "oː": "NM",
    "r": "NM",
    "øː": "NM",
    "ɐ": "NM",
    "ɐː": "NM",
    "ɑ": "NM",
    "ɑː": "NM",
    "ɔ": "NM",
    "ɔː": "NM",
    "ɘ": "NM",
    "ɪ̯": "NM",
    "ɯ": "NM",
    "ɻ": "NM",
    "ʉ": "NM",
    "ʉː": "NM",
    "ʍ": "NM",
    "ʔ": "NM",
}
if __name__ == "__main__":
    # Dispatch to the selected subcommand, or show usage when none was given.
    parsed = cli.parse_args()
    if parsed.subcommand is not None:
        parsed.func(parsed)
    else:
        cli.print_help()