Skip to content

Commit

Permalink
Merge branch 'mimic3'
Browse files Browse the repository at this point in the history
  • Loading branch information
PeterBowman committed Mar 9, 2023
2 parents 2b205e1 + d7a01ff commit 618cb83
Show file tree
Hide file tree
Showing 5 changed files with 226 additions and 3 deletions.
1 change: 1 addition & 0 deletions programs/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
# Add each program's subdirectory to the build.
add_subdirectory(espeakServer)
add_subdirectory(speechRecognition)
add_subdirectory(speechSynthesis)
6 changes: 3 additions & 3 deletions programs/speechRecognition/README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Speech recognition software
# Speech recognition

### How to launch
## How to launch

1. First, follow the steps described in the [installation instructions](doc/speech-install.md).
2. Be sure you have a microphone connected to your computer.
Expand All @@ -10,7 +10,7 @@ In that case, run `alsamixer` on the bash, press `F6`, select your Sound Card (
5. Say some of the `follow-me` demo commands into the microphone and check whether `speechRecognition` detects the words.
6. The final result, in lower case, is published through a YARP port. You can read from the output port by running `yarp read ... /speechRecognition:o`.

### How to configure it
## How to configure it

Once `speechRecognition.py` has started, connect it to the yarp configuration dictionary port and change the language to use.
For example, if you want to change to waiter Spanish orders, put:
Expand Down
9 changes: 9 additions & 0 deletions programs/speechSynthesis/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Build-time switch (ON by default) that controls whether the speechSynthesis
# program is installed.
option(ENABLE_speechSynthesis "Install speechSynthesis program" ON)

if(ENABLE_speechSynthesis)

# Install the Python script as an executable program, dropping the .py
# extension so that it is invoked as `speechSynthesis`.
install(PROGRAMS speechSynthesis.py
TYPE BIN
RENAME speechSynthesis)

endif()
33 changes: 33 additions & 0 deletions programs/speechSynthesis/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Speech synthesis

## Installation

Through pip:

```bash
pip3 install mycroft-mimic3-tts
```

Alternatively, install from sources: https://github.com/MycroftAI/mimic3

## Download voice models

All voice data is located in a separate repository: https://github.com/MycroftAI/mimic3-voices

To download all Spanish voices manually, run:

```bash
mimic3-download 'es_ES/*'
```

In case the process gets stuck, download and unpack the files into `${HOME}/.local/share/mycroft/mimic3/voices`. However, you'll probably need to download the *generator.onnx* file separately (via GitHub) since it is handled by Git LFS.

## Troubleshooting

Try this:

```bash
mimic3 --voice es_ES/m-ailabs#tux "hola, me llamo teo y tengo 10 años"
```

To enable GPU acceleration, run `pip3 install onnxruntime-gpu` and issue the `mimic3` command with `--cuda`. The `speechSynthesis` app also accepts this parameter.
180 changes: 180 additions & 0 deletions programs/speechSynthesis/speechSynthesis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,180 @@
#!/usr/bin/env python3

# adapted from https://github.com/MycroftAI/mimic3/blob/be72c18/mimic3_tts/__main__.py

import argparse
import queue
import signal
import shlex
import shutil
import subprocess
import tempfile
import threading
import time

import mimic3_tts
import yarp
import roboticslab_speech

# Command lines of the external audio players that can be used for playback.
# Each entry is split with shlex and its first word is looked up on PATH; the
# playback code in TextToSpeechResponder walks this list in REVERSE order and
# uses the first player that is installed.
PLAY_PROGRAMS = ['paplay', 'play -q', 'aplay -q']

class TextToSpeechResponder(roboticslab_speech.TextToSpeechIDL):
    """Text-to-speech RPC responder that drives a Mimic 3 engine.

    ``say()`` synthesizes audio and pushes every result onto a bounded queue.
    A daemon thread pops the results, writes each one to a temporary WAV file
    and plays that file with an external command-line audio player.
    """

    def __init__(self, engine):
        """Store the engine and start the background playback thread.

        Args:
            engine: TTS engine object. This class uses its ``voice``,
                ``speaker`` and ``rate`` attributes and its ``get_voices()``,
                ``begin_utterance()``, ``speak_text()`` and
                ``end_utterance()`` methods.
        """
        super().__init__()
        self.engine = engine
        self.is_playing = False
        self.p = None  # Popen handle of the most recent player process

        # Number of results queued by say() that have not been fully processed
        # by the playback thread yet; guarded by _pending_lock.
        self._pending = 0
        self._pending_lock = threading.Lock()

        self.result_queue = queue.Queue(maxsize=5)
        self.result_thread = threading.Thread(target=self._process_result, daemon=True)
        self.result_thread.start()

    def setLanguage(self, language):
        """Select a voice, or a speaker of the current voice.

        Args:
            language: Voice key to switch to. When it starts with ``#``, the
                remainder is used as the speaker of the current voice.

        Returns:
            True if the resulting voice is listed by getSupportedLangs().
            Otherwise False, and the engine's previous voice and speaker are
            restored.
        """
        previous_voice = self.engine.voice
        previous_speaker = self.engine.speaker

        if language.startswith('#'):
            # same voice, different speaker
            self.engine.speaker = language[1:]
        else:
            # different voice
            self.engine.voice = language

        if self.engine.voice not in list(self.getSupportedLangs()):
            print('Voice not available: %s' % self.engine.voice)
            # Roll back so that a rejected request does not leave the engine
            # configured with a voice that was just reported as unavailable.
            # NOTE(review): voice is restored before speaker in case assigning
            # the voice also touches the speaker - confirm against Mimic 3.
            self.engine.voice = previous_voice
            self.engine.speaker = previous_speaker
            return False

        print('Loaded voice: %s (speaker: %s)' % (self.engine.voice, self.engine.speaker or 'default'))
        return True

    def setSpeed(self, speed):
        """Set the speaking rate from a percentage (100 maps to a rate of 1.0)."""
        self.engine.rate = float(speed) / 100
        return True

    def setPitch(self, pitch):
        """Not handled here; defers to the base class implementation."""
        return super().setPitch(pitch)

    def getSpeed(self):
        """Return the speaking rate as an integer percentage.

        The value is rounded rather than truncated so that it round-trips with
        setSpeed(): e.g. 0.57 * 100 evaluates to 56.99999999999999 in floating
        point, which int() alone would turn into 56.
        """
        return int(round(self.engine.rate * 100))

    def getPitch(self):
        """Not handled here; defers to the base class implementation."""
        return super().getPitch()

    def getSupportedLangs(self):
        """Return the keys of the locally available voices, sorted by key.

        Voices whose location starts with ``http`` are left out.
        """
        all_voices = sorted(self.engine.get_voices(), key=lambda v: v.key)
        available_voices = [v.key for v in all_voices if not v.location.startswith('http')]
        return yarp.SVector(available_voices)

    def say(self, text):
        """Synthesize ``text`` and queue the results for playback.

        Blocks while the queue is full. Playback itself happens on the
        background thread; use checkSayDone() to find out when it finished.

        Returns:
            True once every result has been queued.
        """
        self.engine.begin_utterance()
        self.engine.speak_text(text)

        for result in self.engine.end_utterance():
            # Count the result before queueing it so that checkSayDone() cannot
            # report completion while it is still waiting to be played.
            with self._pending_lock:
                self._pending += 1

            self.result_queue.put(result)

        return True

    def play(self):
        """Not handled here; defers to the base class implementation."""
        return super().play()

    def pause(self):
        """Not handled here; defers to the base class implementation."""
        return super().pause()

    def stop(self):
        """Terminate the most recently started player process, if any."""
        if self.p:
            self.p.terminate()

        return True

    def checkSayDone(self):
        """Return True when nothing is being played and nothing is queued."""
        with self._pending_lock:
            nothing_pending = self._pending <= 0

        return nothing_pending and not self.is_playing

    def _find_player(self):
        """Return the argv list of the first installed audio player, or None.

        Candidates come from PLAY_PROGRAMS and are tried in reverse order.
        """
        for play_program in reversed(PLAY_PROGRAMS):
            play_cmd = shlex.split(play_program)

            if shutil.which(play_cmd[0]):
                return play_cmd

        return None

    def _play_wav(self, wav_bytes):
        """Write ``wav_bytes`` to a temporary file and block until it was played."""
        play_cmd = self._find_player()

        if play_cmd is None:
            print('No audio player found (tried: %s)' % ', '.join(PLAY_PROGRAMS))
            return

        with tempfile.NamedTemporaryFile(mode='wb+', suffix='.wav') as wav_file:
            wav_file.write(wav_bytes)
            # The player opens the file by name, so the data must be flushed
            # out of this process' buffer first.
            wav_file.flush()

            self.is_playing = True

            try:
                with subprocess.Popen(play_cmd + [wav_file.name]) as self.p:
                    try:
                        self.p.wait()
                    except BaseException:
                        # Do not leave the player running if waiting failed.
                        self.p.kill()
                        raise
            finally:
                self.is_playing = False

    def _process_result(self):
        """Playback thread body: play queued results until a None sentinel arrives."""
        while True:
            result = self.result_queue.get()

            if result is None:
                break

            try:
                wav_bytes = result.to_wav_bytes()

                if wav_bytes:
                    self._play_wav(wav_bytes)
            except Exception as exc:
                # Keep the thread alive: if it died, say() would eventually
                # block forever on the bounded queue.
                print('Playback failed: %s' % exc)
            finally:
                with self._pending_lock:
                    self._pending -= 1

# Command-line interface of the service.
parser = argparse.ArgumentParser(
    description='TTS service running a Mimic 3 engine',
    prog='speechSynthesis',
)

# Voice selection: the voice is mandatory, the speaker is optional.
parser.add_argument(
    '--voice', '-v',
    required=True,
    help='Name of voice (expected in <voices-dir>/<language>)',
)
parser.add_argument(
    '--speaker', '-s',
    help='Name or number of speaker (default: first speaker)',
)

# Synthesis tuning: these stay None unless given on the command line.
parser.add_argument(
    '--noise-scale',
    help='Noise scale [0-1], default is 0.667',
    type=float,
)
parser.add_argument(
    '--length-scale',
    help='Length scale (1.0 is default speed, 0.5 is 2x faster)',
    type=float,
)
parser.add_argument(
    '--noise-w',
    help='Variation in cadence [0-1], default is 0.8',
    type=float,
)

# Runtime options.
parser.add_argument(
    '--cuda',
    help='Use Onnx CUDA execution provider (requires onnxruntime-gpu)',
    action='store_true',
)
parser.add_argument(
    '--port', '-p',
    help='YARP port prefix',
    default='/speechSynthesis',
)

# Parse the command line. --voice is declared as required, so argparse exits
# on its own when it is missing.
args = parser.parse_args()

# Build the Mimic 3 engine. Tuning options that were not given on the command
# line are passed through as None.
tts = mimic3_tts.Mimic3TextToSpeechSystem(
mimic3_tts.Mimic3Settings(
length_scale=args.length_scale,
noise_scale=args.noise_scale,
noise_w=args.noise_w,
use_cuda=args.cuda,
)
)

# NOTE(review): the voice is assigned before the speaker. Keep this order
# unless it is confirmed that assigning the voice does not reset the speaker.
tts.voice = args.voice
tts.speaker = args.speaker

# Load the voice before the service starts, presumably so that the first say()
# request does not pay the loading cost - confirm against Mimic 3.
print('Preloading voice: %s' % args.voice)
tts.preload_voice(args.voice)

# Bring up YARP and expose the responder on an RPC server port. Every start-up
# failure terminates the program with a non-zero exit status so that shells
# and process supervisors can tell it apart from a normal exit.
yarp.Network.init()

if not yarp.Network.checkNetwork():
    print('YARP network not found')
    raise SystemExit(1)

rpc = yarp.RpcServer()
processor = TextToSpeechResponder(tts)

# The port name is the --port prefix followed by '/rpc:s'.
if not rpc.open(args.port + '/rpc:s'):
    print('Cannot open port %s' % rpc.getName())
    raise SystemExit(1)

# Route incoming RPC requests to the responder.
processor.yarp().attachAsServer(rpc)

# Shutdown flag: raised by askToStop() and polled by the main wait loop.
quitRequested: bool = False


def askToStop() -> None:
    """Request program termination by setting the module-level quit flag."""
    global quitRequested
    quitRequested = True

# Both SIGINT (Ctrl+C) and SIGTERM only raise the quit flag; the actual
# shutdown is done below, outside of the signal handler. The handler arguments
# are named so that they do not shadow the `signal` module.
signal.signal(signal.SIGINT, lambda signum, frame: askToStop())
signal.signal(signal.SIGTERM, lambda signum, frame: askToStop())

# Idle until a termination signal arrives; RPC requests are served meanwhile.
while not quitRequested:
    time.sleep(0.1)

# Stop accepting requests first.
rpc.interrupt()
rpc.close()

# The None sentinel makes the playback thread leave its loop once it has
# consumed everything queued before it.
processor.result_queue.put(None)
processor.result_thread.join()

# Counterpart of the yarp.Network.init() call made at start-up.
yarp.Network.fini()

0 comments on commit 618cb83

Please sign in to comment.