From afb099cbf2c91d3d2a5c9d82e68fbb9209862247 Mon Sep 17 00:00:00 2001 From: Mathieu Bernard Date: Thu, 26 Jul 2018 17:30:07 +0200 Subject: [PATCH] bugfix in festival backend, improved tests bugfix when empty or invalid input --- phonemizer/festival.py | 10 +++++++--- test/test_festival.py | 42 ++++++++++++++++++------------------------ test/test_main.py | 23 ++++++++++++++++------- 3 files changed, 41 insertions(+), 34 deletions(-) diff --git a/phonemizer/festival.py b/phonemizer/festival.py index f3a510d..469be4b 100644 --- a/phonemizer/festival.py +++ b/phonemizer/festival.py @@ -74,6 +74,8 @@ def phonemize(text, language='en-us', separator=default_separator, logger.debug('loading {}'.format(script)) a = _preprocess(text) + if len(a) == 0: + return [] b = _process(a, script, logger) c = _postprocess(b, separator, strip) @@ -87,7 +89,8 @@ def _double_quoted(line): def _cleaned(line): """Remove 'forbidden' characters from the line""" - return line.replace('"', "'").replace('(', '').replace(')', '') + return line.replace('"', '').replace("'", '').replace( + '(', '').replace(')', '').strip() def _preprocess(text): @@ -98,9 +101,10 @@ def _preprocess(text): a multiline string. Empty lines in inputs are ignored. """ + cleaned_text = ( + _cleaned(line) for line in text.split('\n') if line != '') return '\n'.join( - [_double_quoted(_cleaned(line)) - for line in text.split('\n') if line != '']) + _double_quoted(line) for line in cleaned_text if line != '') def _process(text, script, logger): diff --git a/test/test_festival.py b/test/test_festival.py index fc3df1c..99fb81b 100644 --- a/test/test_festival.py +++ b/test/test_festival.py @@ -15,39 +15,33 @@ """Test of the phonemizer.Phonemizer class""" import pytest -from phonemizer import phonemize, separator +from phonemizer import festival, separator def _test(text): - return phonemize( - text, language='en-us', backend='festival', strip=True, + return festival.phonemize( + text, language='en-us', strip=True, separator=separator.Separator(' ', '|', '-')) +@pytest.mark.skipif( + '2.1' in festival.festival_version(), + reason='festival-2.1 gives different results than further versions ' + 'for syllable boundaries') def test_hello(): - assert _test('hello world') == 'hh-ax-l|ow w-er-l-d' - assert _test('hello\nworld') == 'hh-ax-l|ow\nw-er-l-d' - assert _test('hello\nworld\n') == 'hh-ax-l|ow\nw-er-l-d' - + assert _test('hello world') == ['hh-ax|l-ow w-er-l-d'] + assert _test('hello\nworld') == ['hh-ax|l-ow', 'w-er-l-d'] + assert _test('hello\nworld\n') == ['hh-ax|l-ow', 'w-er-l-d'] @pytest.mark.parametrize('text', ['', ' ', ' ', '(', '()', '"', "'"]) -def test_empty(text): - assert _test(text) == '' +def test_bad_input(text): + assert _test(text) == [] def test_quote(): - assert _test("here a 'quote") == 'hh-ih-r ax k-w-ow-t' - assert _test('here a "quote') == 'hh-ih-r ax k-w-ow-t' + assert _test("here a 'quote") == ['hh-ih-r ax k-w-ow-t'] + assert _test('here a "quote') == ['hh-ih-r ax k-w-ow-t'] def test_its(): - assert _test("it's") == 'ih-t-s' - assert _test("its") == 'ih-t-s' - assert _test("it s") == 'ih-t eh-s' - assert _test('it "s') == 'ih-t eh-s' - -def test_list(): - assert _test(['hello world']) == ['hh-ax-l|ow w-er-l-d'] - assert _test(['hello\nworld']) == ['hh-ax-l|ow', 'w-er-l-d'] - assert _test(['hello', 'world']) == ['hh-ax-l|ow', 'w-er-l-d'] - -def test_tuple(): - # this is out of specifications - assert _test(('hello', 'world')) == ['hh-ax-l|ow', 'w-er-l-d'] + assert _test("it's") == ['ih-t-s'] + assert _test("its") == ['ih-t-s'] + assert _test("it s") == ['ih-t eh-s'] + assert _test('it "s') == ['ih-t eh-s'] diff --git a/test/test_main.py b/test/test_main.py index 72d66f1..50f1591 100644 --- a/test/test_main.py +++ b/test/test_main.py @@ -20,7 +20,7 @@ import tempfile import shlex -from phonemizer.main import main +from phonemizer import main, festival def _test(input, output, args=''): @@ -30,24 +30,33 @@ def _test(input, output, args=''): with tempfile.NamedTemporaryFile('w+', delete=False) as foutput: opts = '{} -o {} {}'.format(finput.name, foutput.name, args) - main(shlex.split(opts)) + main.main(shlex.split(opts)) assert foutput.read() == output + '\n' def test_help(): with pytest.raises(SystemExit): - main('-h'.split()) + main.main('-h'.split()) def test_readme(): _test(u'hello world', u'hhaxlow werld ') _test(u'hello world', u'hhaxlow werld', '--strip') + _test(u'hello world', u'həloʊ wɜːld ', '-l en-us') + _test(u'bonjour le monde', u'bɔ̃ʒuʁ lə- mɔ̃d ', '-l fr-fr') + _test(u'bonjour le monde', u'b ɔ̃ ʒ u ʁ ;eword l ə- ;eword m ɔ̃ d ;eword ', + '-l fr-fr -p " " -w ";eword "') + +@pytest.mark.skipif( + '2.1' in festival.festival_version(), + reason='festival-2.1 gives different results than further versions ' + 'for syllable boundaries') +def test_readme_festival_syll(): _test(u'hello world', - u'hh ax l ;esyll ow ;esyll ;eword w er l d ;esyll ;eword ', + u'hh ax ;esyll l ow ;esyll ;eword w er l d ;esyll ;eword ', u"-p ' ' -s ';esyll ' -w ';eword '") def test_njobs(): for njobs in range(1, 4): _test( u'hello world\ngoodbye\nthird line\nyet another', - u'hh-ax-l|ow w-er-l-d\ng-uh-d|b-ay\nth-er-d l-ay-n\n' - u'y-eh-t ax-n|ah-dh|er', - u'--strip -j {} -p "-" -s "|" -w " "'.format(njobs)) + u'h-ə-l-oʊ w-ɜː-l-d\nɡ-ʊ-d-b-aɪ\nθ-ɜː-d l-aɪ-n\nj-ɛ-t ɐ-n-ʌ-ð-ɚ', + u'--strip -j {} -l en-us -p "-" -s "|" -w " "'.format(njobs))