From aa7d4c24a8b9c074a933ff427904459459016825 Mon Sep 17 00:00:00 2001
From: Corentin Jemine <corentin.jemine@gmail.com>
Date: Wed, 19 Feb 2020 15:24:26 +0100
Subject: [PATCH] Update to 2.1 (#1)

* fixed #26

* phonemizer-2.0.1

* phonemizer-2.0.1

* phonemizer-2.0.1

* CI upload to pypi

* fixed #31

* bugfix in parsing espeak-ng version

* bugfix in parsing espeak-ng version

* update copyright

* WIP

* Allow sampa for espeak

* option to specify an alternative espeak/espeak-ng binary

* deploy only on new tags

* WIP

* Add replacing content

* add PyYaml requirement

* add test and replacement as str

* WIP

* merge PR #34 from @rachine

* can specify an alternative festival executable

* bugfix in setup.py

* fixed sampa mapping for French

* corrected ChangeLog

* CI on multiple versions of espeak

* CI on multiple versions of espeak

* minor improvements

* punctuation processing implemented

* release phonemizer-2.1

* updated README

* updated CHANGELOG

* fixing gitlab CI

* fixing gitlab CI

* fixed issue #39

* pep8

* fixed issue #40

* fixed a test on espeak>=1.50

Co-authored-by: Mathieu Bernard <mathieu.a.bernard@inria.fr>
Co-authored-by: Rachid Riad <riadrachid3@gmail.com>
---
 .gitlab-ci.yml                          |   4 +-
 CHANGELOG.md                            |  74 +--------
 README.md                               | 141 +++++++++++++++-
 phonemizer/__init__.py                  |   4 +-
 phonemizer/backend/__init__.py          |   2 +-
 phonemizer/backend/base.py              |  22 ++-
 phonemizer/backend/espeak.py            | 210 ++++++++++++++----------
 phonemizer/backend/festival.py          |  26 +--
 phonemizer/backend/segments.py          |   2 +-
 phonemizer/lispy.py                     |   2 +-
 phonemizer/logger.py                    |   2 +-
 phonemizer/main.py                      |  27 +--
 phonemizer/phonemize.py                 |  28 ++--
 phonemizer/punctuation.py               |  76 ++++-----
 phonemizer/separator.py                 |   6 +-
 phonemizer/share/espeak/sampa_fr-fr.txt |   5 +
 phonemizer/share/festival/phonemize.scm |   2 +-
 phonemizer/utils.py                     |  51 ++----
 phonemizer/version.py                   |   2 +-
 setup.py                                |   2 +-
 test/test_espeak.py                     | 153 ++++++-----------
 test/test_festival.py                   |   2 +-
 test/test_main.py                       |  17 +-
 test/test_phonemize.py                  |  40 +----
 test/test_punctuation.py                |  96 ++---------
 test/test_segments.py                   |   2 +-
 test/test_separator.py                  |   2 +-
 27 files changed, 457 insertions(+), 543 deletions(-)
 create mode 100644 phonemizer/share/espeak/sampa_fr-fr.txt

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 53b55cc..6a29fcd 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -1,6 +1,6 @@
 before_script:
   # load the requested modules on oberon
-  - module load anaconda/3 festival/2.4 mbrola
+  - module load anaconda/3 festival/2.4
 
 phonemizer-build:
   stage: build
@@ -19,7 +19,7 @@ phonemizer-build:
 # run the unit tests within the CI environment
 - conda activate phonemizer-ci
 - phonemize --version
-- coverage run && coverage report
+- python setup.py test
 
 phonemizer-test-espeak-1-48-04:
   stage: test
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9341e70..93ecddc 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,85 +2,15 @@
 
 Version numbers follow [semantic versioning](https://semver.org)
 
-## not yet released
-
-* **improvements**
-
-  * phonemizer's logger no more conflicts with other loggers when imported from
-    Python (see PR [#61](https://github.com/bootphon/phonemizer/pull/61)).
-
-## phonemizer-2.2.2
-
-* **bugfixes**
-
-  * Fixed installation from source (bug introduced in 2.2.1, see
-    issue [#52](https://github.com/bootphon/phonemizer/issues/52)).
-
-  * Fixed a bug when trying to restore punctuation on an empty text (see issue
-    [#54](https://github.com/bootphon/phonemizer/issues/54)).
 
-  * Fixed an edge case bug when using custom punctuation marks (see issue
-    [#55](https://github.com/bootphon/phonemizer/issues/55)).
-
-  * Fixed regex issue that causes digits to be considered punctuation (see
-    issue [#60](https://github.com/bootphon/phonemizer/pull/60)).
-
-
-## phonemizer-2.2.1
-
-* **improvements**
-
-  From Python import the phonemize function using `from phonemizer import
-  phonemize` instead of `from phonemizer.phonemize import phonemize`. The
-  second import is still available for compatibility.
-
-* **bugfixes**
-
-  * Fixed a minor bug in `utils.chunks`.
-
-  * Fixed warnings on language switching for espeak backend when using parallel
-    jobs (see issue [#50](https://github.com/bootphon/phonemizer/issues/50)).
-
-  * Save file in utf-8 explicitly for Windows compat (see issue
-    [#43](https://github.com/bootphon/phonemizer/issues/43)).
-
-  * Fixed build and tests in Dockerfile (see issue
-    [#45](https://github.com/bootphon/phonemizer/issues/45)).
-
-
-## phonemizer-2.2
-
-* **new features**
-
-  * New option ``--list-languages`` to list the available languages for a given
-    backend from the command line.
-
-  * The ``--sampa`` option of the ``espeak`` backend has been replaced by a new
-    backend ``espeak-mbrola``.
-
-    * The former ``--sampa`` option (introduced in phonemizer-2.0) outputs
-      phones that are not standard SAMPA but are adapted to the espeak TTS
-      front-end.
-
-    * On the other hand the ``espeak-mbrola`` backend allows espeak to output
-      phones in standard SAMPA (adapted to the mbrola TTS front-end). This
-      backend requires mbrola to be installed, as well as additional mbrola
-      voices to support needed languages. **This backend does not support word
-      separation nor punctuation preservation**.
+## not yet released
 
 * **bugfixes**
 
-  * Fixed issues with punctuation processing on some corner cases, see issues
+  * fixed issues with punctuation processing on some corner cases, see issues
     [#39](https://github.com/bootphon/phonemizer/issues/39) and
     [#40](https://github.com/bootphon/phonemizer/issues/40).
 
-  * Improvments and updates in the documentation (Readme, ``phonemize --help``
-    and Python code).
-
-  * Fixed a test when using ``espeak>=1.50``.
-
-  * Empty lines are correctly ignored when reading text from a file.
-
 
 ## phonemizer-2.1
 
diff --git a/README.md b/README.md
index a321334..627d284 100644
--- a/README.md
+++ b/README.md
@@ -9,7 +9,10 @@ https://doi.org/10.5281/zenodo.1045825)
 
 # Phonemizer -- *foʊnmaɪzɚ*
 
-* The phonemizer allows simple phonemization of words and texts in many languages.
+* Simple text to phones converter for multiple languages, based on
+  [festival](http://www.cstr.ed.ac.uk/projects/festival),
+  [espeak-ng](https://github.com/espeak-ng/espeak-ng/)
+  and [segments](https://github.com/cldf/segments).
 
 * Provides both the `phonemize` command-line tool and the Python function
   `phonemizer.phonemize`.
@@ -76,7 +79,7 @@ the phonemizer.
 ### Docker image
 
 Alternatively you can run the phonemizer within docker, using the
-provided `Dockerfile**. To build the docker image, have a:
+provided `Dockerfile`. To build the docker image, have a:
 
     $ git clone https://github.com/bootphon/phonemizer
     $ cd phonemizer
@@ -116,8 +119,8 @@ For a complete list of available options, have a:
 See the installed backends with the `--version` option:
 
     $ phonemize --version
-    phonemizer-2.2
-    available backends: espeak-ng-1.49.3, espeak-mbrola, festival-2.5.0, segments-2.0.1
+    phonemizer-2.0
+    available backends: festival-2.5.0, espeak-ng-1.49.3, segments-2.0.1
 
 
 ### Input/output exemples
@@ -202,8 +205,8 @@ The exhaustive list of supported languages is available with the command
 
 ### Token separators
 
-You can specify separators for phones, syllables (**festival** only) and
-words (excepted **espeak-mbrola**).
+You can specify separators for phones, syllables (festival only) and
+words.
 
     $ echo "hello world" | phonemize -b festival -w ' ' -p ''
     hhaxlow werld
@@ -230,6 +233,18 @@ a space for both phones and words):
 
 ### Punctuation
 
+By default the punctuation is removed in the phonemized output. You can preserve
+it using the ``--preserve-punctuation`` option:
+
+    $ echo "hello, world!" | phonemize --strip
+    həloʊ wɜːld
+
+    $ echo "hello, world!" | phonemize --preserve-punctuation --strip
+    həloʊ, wɜːld!
+
+
+### Options
+
 By default the punctuation is removed in the phonemized output. You can preserve
 it using the ``--preserve-punctuation`` option (not supported by the
 **espeak-mbrola** backend):
@@ -243,7 +258,25 @@ it using the ``--preserve-punctuation`` option (not supported by the
 
 ### Espeak specific options
 
-* The **espeak** backend can output the stresses on phones:
+        $ echo "bonjour le monde" | phonemize -b espeak -l fr-fr -p ' ' -w ';eword '
+        b ɔ̃ ʒ u ʁ ;eword l ə- ;eword m ɔ̃ d ;eword
+
+* In Japanese, using **segments**
+
+        $ echo 'konnichiwa' | phonemize -b segments -l japanese
+        konnitʃiwa
+
+        $ echo 'konnichiwa' | phonemize -b segments -l ./phonemizer/share/japanese.g2p
+        konnitʃiwa
+
+* **Espeak** can output SAMPA phonemes instead of IPA ones (this is only supported
+  by espeak-ng, not by the original espeak)
+
+        $ echo "hello world" | phonemize -l en-us -b espeak --sampa
+        h@loU w3:ld
+
+* **Espeak** can output the stresses on phones (this is not supported by festival
+  or segments backends)
 
         $ echo "hello world" | phonemize -l en-us -b espeak --with-stress
         həlˈoʊ wˈɜːld
@@ -267,9 +300,101 @@ it using the ``--preserve-punctuation`` option (not supported by the
         [WARNING] removed 1 utterances containing language switches (applying "remove-utterance" policy)
 
 
+### Supported languages
+
+* Languages supported by festival are:
+
+        en-us	->	english-us
+
+* Languages supported by the segments backend are:
+
+        chintang  -> ./phonemizer/share/chintang.g2p
+	    cree	  -> ./phonemizer/share/cree.g2p
+	    inuktitut -> ./phonemizer/share/inuktitut.g2p
+	    japanese  -> ./phonemizer/share/japanese.g2p
+	    sesotho	  -> ./phonemizer/share/sesotho.g2p
+	    yucatec	  -> ./phonemizer/share/yucatec.g2p
+
+  Instead of a language you can also provide a file specifying a
+  grapheme to phone mapping (see the files above for examples).
+
+* Languages supported by espeak are (espeak-ng supports even more of
+  them), type `phonemize --help` for an exhaustive list:
+
+        af	->	afrikaans
+        an	->	aragonese
+        bg	->	bulgarian
+        bs	->	bosnian
+        ca	->	catalan
+        cs	->	czech
+        cy	->	welsh
+        da	->	danish
+        de	->	german
+        el	->	greek
+        en	->	default
+        en-gb	->	english
+        en-sc	->	en-scottish
+        en-uk-north	->	english-north
+        en-uk-rp	->	english_rp
+        en-uk-wmids	->	english_wmids
+        en-us	->	english-us
+        en-wi	->	en-westindies
+        eo	->	esperanto
+        es	->	spanish
+        es-la	->	spanish-latin-am
+        et	->	estonian
+        fa	->	persian
+        fa-pin	->	persian-pinglish
+        fi	->	finnish
+        fr-be	->	french-Belgium
+        fr-fr	->	french
+        ga	->	irish-gaeilge
+        grc	->	greek-ancient
+        hi	->	hindi
+        hr	->	croatian
+        hu	->	hungarian
+        hy	->	armenian
+        hy-west	->	armenian-west
+        id	->	indonesian
+        is	->	icelandic
+        it	->	italian
+        jbo	->	lojban
+        ka	->	georgian
+        kn	->	kannada
+        ku	->	kurdish
+        la	->	latin
+        lfn	->	lingua_franca_nova
+        lt	->	lithuanian
+        lv	->	latvian
+        mk	->	macedonian
+        ml	->	malayalam
+        ms	->	malay
+        ne	->	nepali
+        nl	->	dutch
+        no	->	norwegian
+        pa	->	punjabi
+        pl	->	polish
+        pt-br	->	brazil
+        pt-pt	->	portugal
+        ro	->	romanian
+        ru	->	russian
+        sk	->	slovak
+        sq	->	albanian
+        sr	->	serbian
+        sv	->	swedish
+        sw	->	swahili-test
+        ta	->	tamil
+        tr	->	turkish
+        vi	->	vietnam
+        vi-hue	->	vietnam_hue
+        vi-sgn	->	vietnam_sgn
+        zh	->	Mandarin
+        zh-yue	->	cantonese
+
+
 ## Licence
 
-**Copyright 2015-2021 Mathieu Bernard**
+**Copyright 2015-2020 Mathieu Bernard**
 
 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
diff --git a/phonemizer/__init__.py b/phonemizer/__init__.py
index b3cfd70..fb56b00 100644
--- a/phonemizer/__init__.py
+++ b/phonemizer/__init__.py
@@ -1,4 +1,4 @@
-# Copyright 2015-2021 Mathieu Bernard
+# Copyright 2015-2020 Mathieu Bernard
 #
 # This file is part of phonologizer: you can redistribute it and/or
 # modify it under the terms of the GNU General Public License as
@@ -14,4 +14,4 @@
 # along with phonologizer. If not, see <http://www.gnu.org/licenses/>.
 """Multilingual text to phones converter"""
 
-__version__ = '2.0.2-resemble'
+__version__ = '2.1-resemble'
diff --git a/phonemizer/backend/__init__.py b/phonemizer/backend/__init__.py
index c2d7065..8cb66dc 100644
--- a/phonemizer/backend/__init__.py
+++ b/phonemizer/backend/__init__.py
@@ -1,4 +1,4 @@
-# Copyright 2015-2021 Mathieu Bernard
+# Copyright 2015-2020 Mathieu Bernard
 #
 # This file is part of phonologizer: you can redistribute it and/or
 # modify it under the terms of the GNU General Public License as
diff --git a/phonemizer/backend/base.py b/phonemizer/backend/base.py
index e73e53c..db0c6d1 100644
--- a/phonemizer/backend/base.py
+++ b/phonemizer/backend/base.py
@@ -1,4 +1,4 @@
-# Copyright 2015-2021 Mathieu Bernard
+# Copyright 2015-2020 Mathieu Bernard
 #
 # This file is part of phonemizer: you can redistribute it and/or
 # modify it under the terms of the GNU General Public License as
@@ -90,7 +90,16 @@ def is_supported_language(cls, language):
     def phonemize(self, text, separator=default_separator,
                   strip=False, njobs=1):
         """Returns the `text` phonemized for the given language"""
-        text, text_type, punctuation_marks = self._phonemize_preprocess(text)
+        # remember the text type for output (either list or string)
+        text_type = type(text)
+
+        # deals with punctuation: remove it and keep track of it for
+        # restoration at the end if asked for
+        punctuation_marks = []
+        if self.preserve_punctuation:
+            text, punctuation_marks = self._punctuator.preserve(text)
+        else:
+            text = self._punctuator.remove(text)
 
         if njobs == 1:
             # phonemize the text forced as a string
@@ -113,7 +122,14 @@ def phonemize(self, text, separator=default_separator,
             # restore the log as it was before parallel processing
             self.logger = log_storage
 
-        return self._phonemize_postprocess(text, text_type, punctuation_marks)
+        # restore the punctuation if asked for
+        if self.preserve_punctuation:
+            text = self._punctuator.restore(text, punctuation_marks)
+
+        # output the result formatted as a string or a list of strings
+        # according to type(text)
+        return (list2str(text) if text_type in six.string_types
+                else str2list(text))
 
     @abc.abstractmethod
     def _phonemize_aux(self, text, separator, strip):
diff --git a/phonemizer/backend/espeak.py b/phonemizer/backend/espeak.py
index b4712bf..4bf9627 100644
--- a/phonemizer/backend/espeak.py
+++ b/phonemizer/backend/espeak.py
@@ -1,4 +1,4 @@
-# Copyright 2015-2021 Mathieu Bernard
+# Copyright 2015-2020 Mathieu Bernard
 #
 # This file is part of phonemizer: you can redistribute it and/or
 # modify it under the terms of the GNU General Public License as
@@ -28,8 +28,7 @@
 from phonemizer.backend.base import BaseBackend
 from phonemizer.logger import get_logger
 from phonemizer.punctuation import Punctuation
-from phonemizer.separator import default_separator
-from phonemizer.utils import list2str, chunks, cumsum
+from phonemizer.utils import get_package_resource
 
 
 # a regular expression to find language switching flags in espeak output,
@@ -44,19 +43,47 @@
 _ESPEAK_DEFAULT_PATH = None
 
 
-class BaseEspeakBackend(BaseBackend):
-    """Abstract espeak backend for the phonemizer
+class EspeakBackend(BaseBackend):
+    """Espeak backend for the phonemizer"""
+
+    espeak_version_re = r'.*: ([0-9]+(\.[0-9]+)+(\-dev)?)'
+
+    def __init__(self, language,
+                 punctuation_marks=Punctuation.default_marks(),
+                 preserve_punctuation=False,
+                 use_sampa=False,
+                 language_switch='keep-flags', with_stress=False,
+                 logger=get_logger()):
+        super(self.__class__, self).__init__(
+            language, punctuation_marks=punctuation_marks,
+            preserve_punctuation=preserve_punctuation, logger=logger)
+        self.logger.debug(f'espeak is {self.espeak_path()}')
 
     Base class of the concrete backends Espeak and EspeakMbrola. It provides
     facilities to find espeak executable path and read espeak version.
 
-    """
+        self.use_sampa = use_sampa
+        self.sampa_mapping = self._load_sampa_mapping()
+
+        self.sep = '--sep=_'
+        if version == '1.48.03' or version.split('.')[1] <= '47':
+            self.sep = ''  # pragma: nocover
 
     espeak_version_re = r'.*: ([0-9]+(\.[0-9]+)+(\-dev)?)'
 
+        self._with_stress = with_stress
+        if use_sampa is True:
+            self.ipa = '-x --pho'
+
+        if not (os.path.isfile(fpath) and os.access(fpath, os.X_OK)):
+            raise ValueError(
+                f'{fpath} is not an executable file')
+
+        _ESPEAK_DEFAULT_PATH = os.path.abspath(fpath)
+
     @staticmethod
     def set_espeak_path(fpath):
-        """Sets the espeak executable as `fpath`"""
+        """Sets the espeak executable as `fpath`, or resets it if empty"""
         global _ESPEAK_DEFAULT_PATH
         if not fpath:
             _ESPEAK_DEFAULT_PATH = None
@@ -70,7 +97,6 @@ def set_espeak_path(fpath):
 
     @staticmethod
     def espeak_path():
-        """Returns the absolute path to the espeak executable"""
         if 'PHONEMIZER_ESPEAK_PATH' in os.environ:
             espeak = os.environ['PHONEMIZER_ESPEAK_PATH']
             if not (os.path.isfile(espeak) and os.access(espeak, os.X_OK)):
@@ -89,7 +115,7 @@ def espeak_path():
 
     @classmethod
     def is_available(cls):
-        return bool(cls.espeak_path())
+        return True if cls.espeak_path() else False
 
     @classmethod
     def long_version(cls):
@@ -117,22 +143,56 @@ def version(cls, as_tuple=False):
         try:
             version = re.match(cls.espeak_version_re, long_version).group(1)
         except AttributeError:
-            raise RuntimeError(
-                f'cannot extract espeak version from {cls.espeak_path()}')
+            raise RuntimeError(f'cannot extract espeak version from {cls.espeak_path()}')
 
         if as_tuple:
-            # ignore the '-dev' at the end
-            version = version.replace('-dev', '')
             version = tuple(int(v) for v in version.split('.'))
         return version
 
+    @classmethod
+    def supported_languages(cls):
+        # retrieve the languages from a call to 'espeak --voices'
+        voices = subprocess.check_output(shlex.split(
+            '{} --voices'.format(cls.espeak_path()), posix=False)).decode(
+                'utf8').split('\n')[1:-1]
+        voices = [v.split() for v in voices]
+
     @abc.abstractmethod
     def _command(self, fname):
         pass
 
-    @abc.abstractmethod
-    def _postprocess_line(self, line, separator, strip):
-        pass
+    def _load_sampa_mapping(self):
+        """Loads a sampa symbol map from a file in phonemizer/share/espeak
+
+        Returns it as a dictionary. Returns None if such a file does not exist.
+
+        """
+        if not self.use_sampa:
+            return None
+
+        # look for a file with SAMPA conversion mapping
+        filename = os.path.join(
+            get_package_resource('espeak'),
+            'sampa_{}.txt'.format(self.language))
+
+        if not os.path.isfile(filename):
+            return None
+
+        # build the mapping from the file
+        self.logger.debug('loading SAMPA mapping from %s', filename)
+        mapping = {}
+        for line in open(filename, 'r'):
+            symbols = line.strip().split()
+            if len(symbols) != 2:  # pragma: nocover
+                raise ValueError(
+                    'bad format in sampa mapping file {}: {}'
+                    .format(filename, line))
+            mapping[symbols[0]] = symbols[1]
+        return mapping
+
+    def _process_lang_switch(self, n, utt):
+        # look for language switch in the current utterance
+        flags = re.findall(_ESPEAK_FLAGS_RE, utt)
 
 
 class EspeakBackend(BaseEspeakBackend):
@@ -251,85 +311,57 @@ def _phonemize_aux(self, text, separator, strip):
                     data.close()
 
                     # generate the espeak command to run
-                    command = self._command(data.name)
+                    command = '{} -v{} {} -q -f {} {}'.format(
+                        self.espeak_path(), self.language, self.ipa,
+                        data.name, self.sep)
+
                     if self.logger:
                         self.logger.debug('running %s', command)
 
-                    # run the command
-                    completed = subprocess.run(
-                        shlex.split(command, posix=False),
-                        check=False,
-                        stdout=subprocess.PIPE,
-                        stderr=subprocess.PIPE)
-
-                    # retrieve the output line (raw phonemization)
-                    line = completed.stdout.decode('utf8')
-
-                    # ensure all was OK
-                    error = completed.stderr.decode('utf8')
-                    for err_line in error.split('\n'):  # pragma: nocover
-                        err_line = err_line.strip()
-                        if err_line:
-                            self.logger.error(err_line)
-                    if error or completed.returncode:  # pragma: nocover
-                        raise RuntimeError(
-                            f'espeak failed with return code '
-                            f'{completed.returncode}')
+                    line = subprocess.check_output(
+                        shlex.split(command, posix=False)).decode('utf8')
                 finally:
                     os.remove(data.name)
 
-                line, lang_switch = self._postprocess_line(
-                    line, separator, strip)
-                output.append(line)
-
-                if lang_switch:
-                    lang_switch_list.append(num)
-
-        return output, lang_switch_list
-
-    def _postprocess_line(self, line, separator, strip):
-        # espeak can split an utterance into several lines because
-        # of punctuation, here we merge the lines into a single one
-        line = line.strip().replace('\n', ' ').replace('  ', ' ')
-
-        # due to a bug in espeak-ng, some additional separators can be
-        # added at the end of a word. Here a quick fix to solve that
-        # issue. See https://github.com/espeak-ng/espeak-ng/issues/694
-        line = re.sub(r'_+', '_', line)
-        line = re.sub(r'_ ', ' ', line)
-
-        line, lang_switch = self._process_lang_switch(line)
-        if not line:
-            return '', lang_switch
-
-        out_line = ''
-        for word in line.split(u' '):
-            word = word.strip()
-
-            # remove the stresses on phonemes
-            if not self._with_stress:
-                word = word.replace("ˈ", '')
-                word = word.replace('ˌ', '')
-                word = word.replace("'", '')
-                word = word.replace("-", '')
-
-            if not strip:
-                word += '_'
-            word = word.replace('_', separator.phone)
-            out_line += word + separator.word
-
-        if strip and separator.word:
-            out_line = out_line[:-len(separator.word)]
-
-        return out_line, lang_switch
-
-    def _process_lang_switch(self, utt):
-        # look for language swith in the current utterance
-        flags = re.findall(_ESPEAK_FLAGS_RE, utt)
-
-        # no language switch, nothing to do
-        if not flags:
-            return utt, False
+                # espeak can split an utterance into several lines because
+                # of punctuation, here we merge the lines into a single one
+                line = line.strip().replace('\n', ' ').replace('  ', ' ')
+
+                # due to a bug in espeak-ng, some additional separators can be
+                # added at the end of a word. Here a quick fix to solve that
+                # issue. See https://github.com/espeak-ng/espeak-ng/issues/694
+                line = re.sub(r'_+', '_', line)
+                line = re.sub(r'_ ', ' ', line)
+
+                line = self._process_lang_switch(n, line)
+                if not line:
+                    continue
+
+                out_line = ''
+                for word in line.split(u' '):
+                    w = word.strip()
+
+                    # remove the stresses on phonemes
+                    if not self._with_stress:
+                        w = w.replace(u"ˈ", u'')
+                        w = w.replace(u'ˌ', u'')
+                        w = w.replace(u"'", u'')
+                        w = w.replace(u"-", u'')
+
+                    # replace the SAMPA symbols from espeak output to the
+                    # standardized ones
+                    if self.sampa_mapping:
+                        for k, v in self.sampa_mapping.items():
+                            w = w.replace(k, v)
+
+                    if not strip:
+                        w += '_'
+                    w = w.replace('_', separator.phone)
+                    out_line += w + separator.word
+
+                if strip:
+                    out_line = out_line[:-len(separator.word)]
+                output.append(out_line)
 
         # ignore the language switch but warn if one is found
         if self._lang_switch == 'keep-flags':
diff --git a/phonemizer/backend/festival.py b/phonemizer/backend/festival.py
index ea95735..e2de3be 100644
--- a/phonemizer/backend/festival.py
+++ b/phonemizer/backend/festival.py
@@ -1,4 +1,4 @@
-# Copyright 2015-2021 Mathieu Bernard
+# Copyright 2015-2020 Mathieu Bernard
 #
 # This file is part of phonemizer: you can redistribute it and/or
 # modify it under the terms of the GNU General Public License as
@@ -36,17 +36,16 @@
 
 
 class FestivalBackend(BaseBackend):
-    """Festival backend for the phonemizer"""
     def __init__(self, language,
                  punctuation_marks=Punctuation.default_marks(),
                  preserve_punctuation=False,
                  logger=get_logger()):
-        super().__init__(
+        super(self.__class__, self).__init__(
             language, punctuation_marks=punctuation_marks,
             preserve_punctuation=preserve_punctuation, logger=logger)
 
         self.script = get_package_resource('festival/phonemize.scm')
-        self.logger.info('loaded %s', self.script)
+        self.logger.info('loaded {}'.format(self.script))
 
     @staticmethod
     def name():
@@ -54,7 +53,7 @@ def name():
 
     @staticmethod
     def set_festival_path(fpath):
-        """Sets the festival path as `fpath`"""
+        """Sets the festival executable as `fpath`, or resets it if empty"""
         global _FESTIVAL_DEFAULT_PATH
         if not fpath:
             _FESTIVAL_DEFAULT_PATH = None
@@ -68,7 +67,6 @@ def set_festival_path(fpath):
 
     @staticmethod
     def festival_path():
-        """Returns the absolute path to the festival executable"""
         if 'PHONEMIZER_FESTIVAL_PATH' in os.environ:
             festival = os.environ['PHONEMIZER_FESTIVAL_PATH']
             if not (os.path.isfile(festival) and os.access(festival, os.X_OK)):
@@ -84,10 +82,10 @@ def festival_path():
 
     @classmethod
     def is_available(cls):
-        return bool(cls.festival_path())
+        return True if cls.festival_path() else False
 
     @classmethod
-    def version(cls, as_tuple=False):
+    def version(cls):
         # the full version version string includes extra information
         # we don't need
         long_version = subprocess.check_output(
@@ -96,17 +94,9 @@ def version(cls, as_tuple=False):
         # extract the version number with a regular expression
         festival_version_re = r'.* ([0-9\.]+[0-9]):'
         try:
-            version = re.match(festival_version_re, long_version).group(1)
+            return re.match(festival_version_re, long_version).group(1)
         except AttributeError:
-            raise RuntimeError(
-                f'cannot extract festival version from {cls.festival_path()}')
-
-        if as_tuple:
-            # ignore the '-dev' at the end
-            version = version.replace('-dev', '')
-            version = tuple(int(v) for v in version.split('.'))
-        return version
-
+            raise RuntimeError(f'cannot extract festival version from {cls.festival_path()}')
 
     @staticmethod
     def supported_languages():
diff --git a/phonemizer/backend/segments.py b/phonemizer/backend/segments.py
index 1de1921..e7105b5 100644
--- a/phonemizer/backend/segments.py
+++ b/phonemizer/backend/segments.py
@@ -1,4 +1,4 @@
-# Copyright 2015-2021 Mathieu Bernard
+# Copyright 2015-2020 Mathieu Bernard
 #
 # This file is part of phonemizer: you can redistribute it and/or
 # modify it under the terms of the GNU General Public License as
diff --git a/phonemizer/lispy.py b/phonemizer/lispy.py
index 9fe4ad3..4f677bf 100644
--- a/phonemizer/lispy.py
+++ b/phonemizer/lispy.py
@@ -1,4 +1,4 @@
-# Copyright 2015-2021 Mathieu Bernard
+# Copyright 2015-2020 Mathieu Bernard
 #
 # This file is part of phonemizer: you can redistribute it and/or
 # modify it under the terms of the GNU General Public License as
diff --git a/phonemizer/logger.py b/phonemizer/logger.py
index 0c37713..039409f 100644
--- a/phonemizer/logger.py
+++ b/phonemizer/logger.py
@@ -1,4 +1,4 @@
-# Copyright 2015-2021 Mathieu Bernard
+# Copyright 2015-2020 Mathieu Bernard
 #
 # This file is part of phonemizer: you can redistribute it and/or
 # modify it under the terms of the GNU General Public License as
diff --git a/phonemizer/main.py b/phonemizer/main.py
index 82ed031..aa94445 100755
--- a/phonemizer/main.py
+++ b/phonemizer/main.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-# Copyright 2015-2021 Mathieu Bernard
+# Copyright 2015-2020 Mathieu Bernard
 #
 # This file is part of phonemizer: you can redistribute it and/or
 # modify it under the terms of the GNU General Public License as
@@ -19,8 +19,6 @@
 import codecs
 import sys
 
-import pkg_resources
-
 from phonemizer import phonemize, separator, version, logger, punctuation
 from phonemizer.backend import (
     EspeakBackend, EspeakMbrolaBackend, FestivalBackend, SegmentsBackend)
@@ -224,22 +222,18 @@ def parse_args():
         '--espeak-path', default=None, type=str, metavar='<executable>',
         help=f'''the path to the espeak executable to use (useful to overload
         the default espeak/espeak-ng installed on the system).
-        Default to {EspeakBackend.espeak_path()}.
-        This path can also be specified using the
-        $PHONEMIZER_ESPEAK_PATH environment variable.''')
+        Default to {EspeakBackend.espeak_path()}. This path can also be specified
+        using the $PHONEMIZER_ESPEAK_PATH environment variable.''')
 
     group = parser.add_argument_group('specific to festival backend')
     group.add_argument(
         '--festival-path', default=None, type=str, metavar='<executable>',
         help=f'''the path to the festival executable to use (useful to overload
         the default festival installed on the system).
-        Default to {FestivalBackend.festival_path()}.
-        This path can also be specified using the
-        $PHONEMIZER_FESTIVAL_PATH environment variable.''')
+        Default to {FestivalBackend.festival_path()}. This path can also be specified
+        using the $PHONEMIZER_FESTIVAL_PATH environment variable.''')
 
-    group = parser.add_argument_group(
-        'punctuation processing',
-        description='not available for espeak-mbrola backend')
+    group = parser.add_argument_group('punctuation processing')
     group.add_argument(
         '--preserve-punctuation', action='store_true',
         help='''preserve the punctuation marks in the phonemized output,
@@ -250,6 +244,14 @@ def parse_args():
         help='''the marks to consider during punctuation processing (either
         for removal or preservation). Default is %(default)s.''')
 
+    group = parser.add_argument_group('language')
+    group.add_argument(
+        '-l', '--language', metavar='<str|file>', default='en-us',
+        help='''the language code of the input text, see below for a list of
+        supported languages. According to the language code you
+        specify, the appropriate backend (segments, espeak or festival)
+        will be called in background. Default is %(default)s.''')
+
     return parser.parse_args()
 
 
@@ -265,7 +267,6 @@ def main():
     if args.festival_path:
         FestivalBackend.set_festival_path(args.festival_path)
 
-    # display version information and exit
     if args.version:
         print(version.version())
         return
diff --git a/phonemizer/phonemize.py b/phonemizer/phonemize.py
index e949767..dc04610 100644
--- a/phonemizer/phonemize.py
+++ b/phonemizer/phonemize.py
@@ -1,4 +1,4 @@
-# Copyright 2015-2021 Mathieu Bernard
+# Copyright 2015-2020 Mathieu Bernard
 #
 # This file is part of phonemizer: you can redistribute it and/or
 # modify it under the terms of the GNU General Public License as
@@ -25,7 +25,7 @@
 from phonemizer.logger import get_logger
 from phonemizer.separator import default_separator
 from phonemizer.backend import (
-    EspeakBackend, EspeakMbrolaBackend, FestivalBackend, SegmentsBackend)
+    EspeakBackend, FestivalBackend, SegmentsBackend)
 from phonemizer.punctuation import Punctuation
 
 
@@ -38,6 +38,7 @@ def phonemize(
         preserve_punctuation=False,
         punctuation_marks=Punctuation.default_marks(),
         with_stress=False,
+        use_sampa=False,
         language_switch='keep-flags',
         njobs=1,
         logger=get_logger()):
@@ -71,12 +72,14 @@ def phonemize(
       separators of a token, default to False.
 
     preserve_punctuation (bool): When True, will keep the punctuation in the
-        phonemized output. Not supported by the 'espeak-mbrola' backend.
-        Default to False and remove all the punctuation.
+        phonemized output. Default to False and remove all the punctuation.
 
     punctuation_marks (str): The punctuation marks to consider when dealing
-        with punctuation, either for removal or preservation. Default to
-        Punctuation.default_marks().
+        with punctuation. Default to Punctuation.default_marks().
+
+    use_sampa (bool): This option is only valid for the 'espeak' backend.
+      When True the output uses the SAMPA phonetic alphabet instead of IPA.
+      Default to False.
 
     with_stress (bool): This option is only valid for the 'espeak' backend.
       When True the stresses on phonemes are present (stresses characters are
@@ -116,8 +119,13 @@ def phonemize(
     if backend not in ('espeak', 'espeak-mbrola', 'festival', 'segments'):
         raise RuntimeError(
             '{} is not a supported backend, choose in {}.'
-            .format(backend, ', '.join(
-                ('espeak', 'espeak-mbrola', 'festival', 'segments'))))
+            .format(backend, ', '.join(('espeak', 'festival', 'segments'))))
+
+    # ensure the phonetic alphabet is valid
+    if use_sampa is True:
+        if backend != 'espeak':
+            raise RuntimeError(
+                'sampa alphabet is only supported by espeak backend')
 
     # with_stress option only valid for espeak
     if with_stress and backend != 'espeak':
@@ -159,10 +167,6 @@ def phonemize(
             with_stress=with_stress,
             language_switch=language_switch,
             logger=logger)
-    elif backend == 'espeak-mbrola':
-        phonemizer = backends[backend](
-            language,
-            logger=logger)
     else:  # festival or segments
         phonemizer = backends[backend](
             language,
diff --git a/phonemizer/punctuation.py b/phonemizer/punctuation.py
index 62953ea..57f3463 100644
--- a/phonemizer/punctuation.py
+++ b/phonemizer/punctuation.py
@@ -1,4 +1,4 @@
-# Copyright 2015-2021 Mathieu Bernard
+# Copyright 2015-2020 Mathieu Bernard
 #
 # This file is part of phonemizer: you can redistribute it and/or
 # modify it under the terms of the GNU General Public License as
@@ -25,15 +25,15 @@
 _DEFAULT_MARKS = ';:,.!?¡¿—…"«»“”'
 
 
-_MarkIndex = collections.namedtuple(
+_mark_index = collections.namedtuple(
     '_mark_index', ['index', 'mark', 'position'])
 
 
 class Punctuation:
     """Preserve or remove the punctuation during phonemization
 
-    Backends behave differently with punctuation: festival and espeak ignore it
-    and remove it silently whereas segments will raise an error. The
+    Backends behave differently with punctuation: festival and espeak will
+    ignore it and remove it silently whereas segments will raise an error. The
     Punctuation class solves that issue by "hiding" the punctuation to the
     phonemization backend and restoring it afterwards.
 
@@ -45,8 +45,6 @@ class Punctuation:
 
     """
     def __init__(self, marks=_DEFAULT_MARKS):
-        self._marks = None
-        self._marks_re = None
         self.marks = marks
 
     @staticmethod
@@ -56,7 +54,6 @@ def default_marks():
 
     @property
     def marks(self):
-        """The punctuation marks as a string"""
         return self._marks
 
     @marks.setter
@@ -67,7 +64,7 @@ def marks(self, value):
 
         # catching all the marks in one regular expression: zero or more spaces
         # + one or more marks + zero or more spaces.
-        self._marks_re = re.compile(fr'(\s*[{re.escape(self._marks)}]+\s*)+')
+        self._marks_re = re.compile(fr'(\s*[{self._marks}]+\s*)+')
 
     def remove(self, text):
         """Returns the `text` with all punctuation marks replaced by spaces
@@ -96,13 +93,13 @@ def preserve(self, text):
         preserved_text = []
         preserved_marks = []
 
-        for num, line in enumerate(text):
-            line, marks = self._preserve_line(line, num)
+        for n, line in enumerate(text):
+            line, marks = self._preserve_line(line, n)
             preserved_text += line
             preserved_marks += marks
         return [line for line in preserved_text if line], preserved_marks
 
-    def _preserve_line(self, line, num):
+    def _preserve_line(self, line, n):
         """Auxiliary method for Punctuation.preserve()"""
         matches = list(re.finditer(self._marks_re, line))
         if not matches:
@@ -110,25 +107,25 @@ def _preserve_line(self, line, num):
 
         # the line is made only of punctuation marks
         if len(matches) == 1 and matches[0].group() == line:
-            return [], [_MarkIndex(num, line, 'A')]
+            return [], [_mark_index(n, line, 'A')]
 
         # build the list of mark indexes required to restore the punctuation
         marks = []
-        for match in matches:
+        for m in matches:
             # find the position of the punctuation mark in the utterance:
             # begin (B), end (E), in the middle (I) or alone (A)
             position = 'I'
-            if match == matches[0] and line.startswith(match.group()):
+            if m == matches[0] and line.startswith(m.group()):
                 position = 'B'
-            elif match == matches[-1] and line.endswith(match.group()):
+            elif m == matches[-1] and line.endswith(m.group()):
                 position = 'E'
-            marks.append(_MarkIndex(num, match.group(), position))
+            marks.append(_mark_index(n, m.group(), position))
 
         # split the line into sublines, each separated by a punctuation mark
         preserved_line = []
-        for mark in marks:
-            split = line.split(mark.mark)
-            prefix, suffix = split[0], mark.mark.join(split[1:])
+        for m in marks:
+            split = line.split(m.mark)
+            prefix, suffix = split[0], m.mark.join(split[1:])
             preserved_line.append(prefix)
             line = suffix
 
@@ -149,36 +146,27 @@ def restore(cls, text, marks):
         return cls._restore_aux(str2list(text), marks, 0)
 
     @classmethod
-    def _restore_aux(cls, text, marks, num):
+    def _restore_aux(cls, text, marks, n):
         """Auxiliary method for Punctuation.restore()"""
-        if not marks:
+        if len(marks) == 0:
             return text
 
-        # nothing have been phonemized, returns the marks alone
-        if not text:
-            return [''.join(m.mark for m in marks)]
-
-        current = marks[0]
-        if current.index == num:  # place the current mark here
-            if current.position == 'B':
+        m = marks[0]
+        if m.index == n:  # place the current mark here
+            if m.position == 'B':
                 return cls._restore_aux(
-                    [current.mark + text[0]] + text[1:], marks[1:], num)
-            if current.position == 'E':
-                return [text[0] + current.mark] + cls._restore_aux(
-                    text[1:], marks[1:], num + 1)
-            if current.position == 'A':
-                return [current.mark] + cls._restore_aux(
-                    text, marks[1:], num + 1)
+                    [m.mark + text[0]] + text[1:], marks[1:], n)
+            if m.position == 'E':
+                return [text[0] + m.mark] + cls._restore_aux(
+                    text[1:], marks[1:], n+1)
+            if m.position == 'A':
+                return [m.mark] + cls._restore_aux(text, marks[1:], n+1)
             # position == 'I'
             if len(text) == 1:
                 # a corner case where the final part of an intermediate
                 # mark (I) has not been phonemized
-                restored = cls._restore_aux(
-                    [text[0] + current.mark], marks[1:], num)
-            else:
-                restored = cls._restore_aux(
-                    [text[0] + current.mark + text[1]] + text[2:],
-                    marks[1:], num)
-            return restored
-
-        return [text[0]] + cls._restore_aux(text[1:], marks, num + 1)
+                return cls._restore_aux([text[0] + m.mark], marks[1:], n)
+            return cls._restore_aux(
+                [text[0] + m.mark + text[1]] + text[2:], marks[1:], n)
+        else:
+            return [text[0]] + cls._restore_aux(text[1:], marks, n+1)
diff --git a/phonemizer/separator.py b/phonemizer/separator.py
index 768f2e1..ed18429 100644
--- a/phonemizer/separator.py
+++ b/phonemizer/separator.py
@@ -1,4 +1,4 @@
-# Copyright 2015-2021 Mathieu Bernard
+# Copyright 2015-2020 Mathieu Bernard
 #
 # This file is part of phonemizer: you can redistribute it and/or
 # modify it under the terms of the GNU General Public License as
@@ -37,6 +37,10 @@ def __eq__(self, other):
             and self.syllable == other.syllable
             and self.word == other.word)
 
+    def __str__(self):
+        def format(s):
+            return '"{}"'.format(s)
+
     def __str__(self):
         return (
             f'(phone: "{self.phone}", '
diff --git a/phonemizer/share/espeak/sampa_fr-fr.txt b/phonemizer/share/espeak/sampa_fr-fr.txt
new file mode 100644
index 0000000..ed7ad50
--- /dev/null
+++ b/phonemizer/share/espeak/sampa_fr-fr.txt
@@ -0,0 +1,5 @@
+W 9
+A~ a~
+O~ o~
+E~ e~
+^ j
diff --git a/phonemizer/share/festival/phonemize.scm b/phonemizer/share/festival/phonemize.scm
index 4b76828..349fd5e 100644
--- a/phonemizer/share/festival/phonemize.scm
+++ b/phonemizer/share/festival/phonemize.scm
@@ -1,4 +1,4 @@
-;; Copyright 2015-2021 Mathieu Bernard
+;; Copyright 2015-2020 Mathieu Bernard
 ;;
 ;; This file is part of phonemizer: you can redistribute it and/or
 ;; modify it under the terms of the GNU General Public License as
diff --git a/phonemizer/utils.py b/phonemizer/utils.py
index 60ea3bf..add300b 100644
--- a/phonemizer/utils.py
+++ b/phonemizer/utils.py
@@ -1,4 +1,4 @@
-# Copyright 2015-2021 Mathieu Bernard
+# Copyright 2015-2020 Mathieu Bernard
 #
 # This file is part of phonemizer: you can redistribute it and/or
 # modify it under the terms of the GNU General Public License as
@@ -19,58 +19,29 @@
 import six
 
 
-def cumsum(iterable):
-    """Returns the cumulative sum of the `iterable` as a list"""
-    res = []
-    cumulative = 0
-    for value in iterable:
-        cumulative += value
-        res.append(cumulative)
-    return res
-
-
 def str2list(s):
-    """Returns the string `s` as a list of lines, split by \n"""
+    """Returns the string `s` as a list of lines"""
     return s.strip().split('\n') if isinstance(s, six.string_types) else s
 
 
 def list2str(s):
-    """Returns the list of lines `s` as a single string separated by \n"""
+    """Returns the list of lines `s` as a single string"""
     return '\n'.join(s) if not isinstance(s, six.string_types) else s
 
 
-def chunks(text, num):
-    """Return a maximum of `num` equally sized chunks of a `text`
-
-    This method is usefull when phonemizing a single text on multiple jobs.
-
-    The exact number of chunks returned is `m = min(num, len(str2list(text)))`.
-    Only the m-1 first chunks have equal size. The last chunk can be longer.
-    The input `text` can be a list or a string. Return a list of `m` strings.
-
-    Parameters
-    ----------
-    text (str or list) : The text to divide in chunks
+def chunks(text, n):
+    """Return `n` equally sized chunks of a `text`
 
-    num (int) : The number of chunks to build, must be a strictly positive
-    integer.
+    Chunks have max(1, int(len(text)/n)) lines each; the last can be shorter.
+    The input `text` can be a list or a string. Return a list of strings.
 
-    Returns
-    -------
-    The chunked text as a list of str.
+    This method is useful when phonemizing a single text on multiple jobs.
 
     """
     text = str2list(text)
-    size = int(max(1, len(text) / num))
-    nchunks = min(num, len(text))
-
-    result = [list2str(text[i*size:(i+1)*size]) for i in range(nchunks - 1)]
-
-    last = list2str(text[(nchunks - 1)*size:])
-    if last:
-        result.append(last)
-
-    return result
+    size = int(max(1, len(text)/n))
+    return [list2str(text[i:i+size])
+            for i in range(0, len(text), size)]
 
 
 def get_package_resource(path):
diff --git a/phonemizer/version.py b/phonemizer/version.py
index ed06fed..dc24e29 100644
--- a/phonemizer/version.py
+++ b/phonemizer/version.py
@@ -1,4 +1,4 @@
-# Copyright 2015-2021 Mathieu Bernard
+# Copyright 2015-2020 Mathieu Bernard
 #
 # This file is part of phonemizer: you can redistribute it and/or
 # modify it under the terms of the GNU General Public License as
diff --git a/setup.py b/setup.py
index e663bf1..954015a 100755
--- a/setup.py
+++ b/setup.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 #
-# Copyright 2015-2021 Mathieu Bernard
+# Copyright 2015-2020 Mathieu Bernard
 #
 # This file is part of phonemizer: you can redistribute it and/or
 # modify it under the terms of the GNU General Public License as
diff --git a/test/test_espeak.py b/test/test_espeak.py
index ecf55c9..6a1625d 100644
--- a/test/test_espeak.py
+++ b/test/test_espeak.py
@@ -1,4 +1,4 @@
-# Copyright 2015-2021 Mathieu Bernard
+# Copyright 2015-2020 Mathieu Bernard
 #
 # This file is part of phonemizer: you can redistribute it and/or
 # modify it under the terms of the GNU General Public License as
@@ -62,8 +62,8 @@ def test_french():
     backend = EspeakBackend('fr-fr')
     text = u'bonjour le monde'
     sep = separator.Separator(word=';eword ', syllable=None, phone=' ')
-    expected = u'b ɔ̃ ʒ u ʁ ;eword l ə ;eword m ɔ̃ d ;eword '
-    out = backend.phonemize(text, sep, False)
+    expected = [u'b ɔ̃ ʒ u ʁ ;eword l ə ;eword m ɔ̃ d ;eword ']
+    out = backend._phonemize_aux(text, sep, False)
     assert out == expected
 
 
@@ -167,7 +167,6 @@ def test_phone_separator_simple():
     expected = 'ð_ə_ l_aɪə_n_ æ_n_d_ ð_ə_ t_aɪ_ɡ_ɚ_ ɹ_æ_n_ '
     assert expected == output
 
-
 @pytest.mark.parametrize(
     'text, expected',
     (('the hello but the', 'ð_ə h_ə_l_oʊ b_ʌ_t ð_ə'),
@@ -181,15 +180,8 @@ def test_phone_separator(text, expected):
     assert output == expected
 
 
-@pytest.mark.skipif(
-    'PHONEMIZER_ESPEAK_PATH' in os.environ,
-    reason='cannot modify environment')
 def test_path_good():
-    espeak = EspeakBackend.espeak_path()
     try:
-        EspeakBackend.set_espeak_path(None)
-        assert espeak == EspeakBackend.espeak_path()
-
         binary = distutils.spawn.find_executable('espeak')
         EspeakBackend.set_espeak_path(binary)
 
@@ -197,14 +189,10 @@ def test_path_good():
 
     # restore the espeak path to default
     finally:
-        EspeakBackend.set_espeak_path(espeak)
+        EspeakBackend.set_espeak_path(None)
 
 
-@pytest.mark.skipif(
-    'PHONEMIZER_ESPEAK_PATH' in os.environ,
-    reason='cannot modify environment')
 def test_path_bad():
-    espeak = EspeakBackend.espeak_path()
     try:
         # corrupt the default espeak path, try to use python executable instead
         binary = distutils.spawn.find_executable('python')
@@ -220,7 +208,7 @@ def test_path_bad():
 
     # restore the espeak path to default
     finally:
-        EspeakBackend.set_espeak_path(espeak)
+        EspeakBackend.set_espeak_path(None)
 
 
 @pytest.mark.skipif(
@@ -228,8 +216,7 @@ def test_path_bad():
     reason='cannot modify environment')
 def test_path_venv():
     try:
-        os.environ['PHONEMIZER_ESPEAK_PATH'] = (
-            distutils.spawn.find_executable('python'))
+        os.environ['PHONEMIZER_ESPEAK_PATH'] = distutils.spawn.find_executable('python')
         with pytest.raises(RuntimeError):
             EspeakBackend('en-us').phonemize('hello')
         with pytest.raises(RuntimeError):
@@ -246,84 +233,50 @@ def test_path_venv():
             pass
 
 
-@pytest.mark.skipif(
-    not EspeakMbrolaBackend.is_available() or
-    not EspeakMbrolaBackend.is_supported_language('mb-fr1'),
-    reason='mbrola or mb-fr1 voice not installed')
-@pytest.mark.parametrize(
-    'text, expected',
-    [
-        # plosives
-        ('pont', 'po~'),
-        ('bon', 'bo~'),
-        ('temps', 'ta~'),
-        ('dans', 'da~'),
-        ('quand', 'ka~'),
-        ('gant', 'ga~'),
-        # fricatives
-        ('femme', 'fam'),
-        ('vent', 'va~'),
-        ('sans', 'sa~'),
-        ('champ', 'Sa~'),
-        ('gens', 'Za~'),
-        ('ion', 'jo~'),
-        # nasals
-        ('mont', 'mo~'),
-        ('nom', 'no~'),
-        ('oignon', 'onjo~'),
-        ('ping', 'piN'),
-        # liquid glides
-        ('long', 'lo~'),
-        ('rond', 'Ro~'),
-        ('coin', 'kwe~'),
-        ('juin', 'Zye~'),
-        ('pierre', 'pjER'),
-        # vowels
-        ('si', 'si'),
-        ('ses', 'se'),
-        ('seize', 'sEz'),
-        ('patte', 'pat'),
-        ('pâte', 'pat'),
-        ('comme', 'kOm'),
-        ('gros', 'gRo'),
-        ('doux', 'du'),
-        ('du', 'dy'),
-        ('deux', 'd2'),
-        ('neuf', 'n9f'),
-        ('justement', 'Zystma~'),
-        ('vin', 've~'),
-        ('vent', 'va~'),
-        ('bon', 'bo~'),
-        ('brun', 'bR9~')])
-def test_sampa_fr(text, expected):
-    assert expected == EspeakMbrolaBackend('mb-fr1').phonemize(
-
-        text, strip=True, separator=Separator(phone=''))
-
-
-@pytest.mark.skipif(
-    not EspeakMbrolaBackend.is_available() or
-    not EspeakMbrolaBackend.is_supported_language('mb-fr1'),
-    reason='mbrola or mb-fr1 voice not installed')
-def test_french_sampa():
-    text = u'bonjour le monde'
-    backend = EspeakMbrolaBackend('mb-fr1')
-    sep = separator.Separator(word=None, phone=' ')
-
-    expected = 'b o~ Z u R l @ m o~ d '
-    out = backend.phonemize(text, separator=sep, strip=False)
-    assert out == expected
-
-    expected = 'b o~ Z u R l @ m o~ d'
-    out = backend.phonemize(text, separator=sep, strip=True)
-    assert out == expected
-
-    assert '' == backend.phonemize('', separator=sep, strip=True)
-    assert '' == backend.phonemize('"', separator=sep, strip=True)
-
-
-@pytest.mark.skipif(
-    not EspeakMbrolaBackend.is_available(),
-    reason='mbrola not installed')
-def test_mbrola_bad_language():
-    assert not EspeakMbrolaBackend.is_supported_language('foo-bar')
+def test_sampa_fr():
+    list_sampa_examples_plosives = [
+        'pont', 'bon', 'temps', 'dans', 'quand', 'gant']
+    list_sampa_examples_fricatives = [
+        'femme', 'vent', 'sans', 'champ', 'gens', 'ion']
+    list_sampa_examples_nasals = [
+        'mont', 'nom', 'oignon', 'camping']
+    list_sampa_examples_liquids_glides = [
+        'long', 'rond', 'coin', 'juin', 'pierre']
+    list_sampa_examples_vowels = [
+        'si', 'ses', 'seize', 'patte', 'pâte',
+        'comme', 'gros', 'doux', 'du', 'deux',
+        'neuf', 'justement', 'vin', 'vent', 'bon', 'brun']
+    list_sampa = {
+        'plosives': list_sampa_examples_plosives,
+        'fricatives': list_sampa_examples_fricatives,
+        'nasals': list_sampa_examples_nasals,
+        'liquids_glides': list_sampa_examples_liquids_glides,
+        'vowels': list_sampa_examples_vowels}
+    list_sampa_answers = {
+        'fricatives': ['fam', 'va~', 'sa~', 'Sa~', 'Za~', 'jo~'],
+        'liquids_glides': ['lo~', 'ro~', 'kwe~', 'Zye~', 'pjEr'],
+        'nasals': ['mo~', 'no~', 'onjo~', 'kampIN'],
+        'plosives': ['po~', 'bo~', 'ta~', 'da~', 'ka~', 'ga~'],
+        'vowels': ['si',
+                   'se',
+                   'sEz',
+                   'pat',
+                   'pa:t',
+                   'kOm',
+                   'gro',
+                   'du',
+                   'dy',
+                   'dY',
+                   'n9f',
+                   'Zystma~',
+                   've~',
+                   'va~',
+                   'bo~',
+                   'br9~']}
+
+    backend = EspeakBackend(
+        'fr-fr', use_sampa=True, language_switch='remove-flags')
+    for category in list_sampa.keys():
+        for idx, text in enumerate(list_sampa[category]):
+            out = backend.phonemize(text, strip=True)
+            assert out == list_sampa_answers[category][idx]
diff --git a/test/test_festival.py b/test/test_festival.py
index 13a06f4..24b0fd8 100644
--- a/test/test_festival.py
+++ b/test/test_festival.py
@@ -1,4 +1,4 @@
-# Copyright 2015-2021 Thomas Schatz, Xuan Nga Cao, Mathieu Bernard
+# Copyright 2015-2020 Thomas Schatz, Xuan Nga Cao, Mathieu Bernard
 #
 # This file is part of phonemizer: you can redistribute it and/or
 # modify it under the terms of the GNU General Public License as
diff --git a/test/test_main.py b/test/test_main.py
index 2d45cbe..1cae23e 100644
--- a/test/test_main.py
+++ b/test/test_main.py
@@ -1,4 +1,4 @@
-# Copyright 2015-2021 Thomas Schatz, Xuan Nga Cao, Mathieu Bernard
+# Copyright 2015-2020 Thomas Schatz, Xuan Nga Cao, Mathieu Bernard
 #
 # This file is part of phonemizer: you can redistribute it and/or
 # modify it under the terms of the GNU General Public License as
@@ -19,7 +19,7 @@
 import shlex
 import sys
 
-from phonemizer.backend import EspeakBackend, EspeakMbrolaBackend
+from phonemizer.backend import EspeakBackend, FestivalBackend
 from phonemizer import main, backend, logger
 
 
@@ -63,6 +63,7 @@ def test_readme():
     _test(u'hello world', u'həloʊ wɜːld ')
     _test(u'hello world', u'həloʊ wɜːld ', '--verbose')
     _test(u'hello world', u'həloʊ wɜːld ', '--quiet')
+    _test(u'hello world', u'h@loU w3:ld ', '--sampa')
     _test(u'hello world', u'hhaxlow werld', '-b festival --strip')
     _test(u'hello world', u'həloʊ wɜːld ', '-l en-us')
     _test(u'bonjour le monde', u'bɔ̃ʒuʁ lə mɔ̃d ', '-l fr-fr')
@@ -116,15 +117,6 @@ def test_logger():
         logger.get_logger(verbosity=1)
 
 
-@pytest.mark.skipif(
-    not EspeakMbrolaBackend.is_available() or
-    not EspeakMbrolaBackend.is_supported_language('mb-fr1'),
-    reason='mbrola or mb-fr1 voice not installed')
-def test_espeak_mbrola():
-    _test(u'coucou toi!', u'k u k u t w a ',
-          f'-b espeak-mbrola -l mb-fr1 -p" " --preserve-punctuation')
-
-
 def test_espeak_path():
     espeak = backend.EspeakBackend.espeak_path()
     _test(u'hello world', u'həloʊ wɜːld ', f'--espeak-path={espeak}')
@@ -132,5 +124,4 @@ def test_espeak_path():
 
 def test_festival_path():
     festival = backend.FestivalBackend.festival_path()
-    _test(u'hello world', u'hhaxlow werld ',
-          f'--festival-path={festival} -b festival')
+    _test(u'hello world', u'hhaxlow werld ', f'--festival-path={festival} -b festival')
diff --git a/test/test_phonemize.py b/test/test_phonemize.py
index 880deaf..46c4077 100644
--- a/test/test_phonemize.py
+++ b/test/test_phonemize.py
@@ -1,4 +1,4 @@
-# Copyright 2015-2021 Thomas Schatz, Xuan Nga Cao, Mathieu Bernard
+# Copyright 2015-2020 Thomas Schatz, Xuan Nga Cao, Mathieu Bernard
 #
 # This file is part of phonemizer: you can redistribute it and/or
 # modify it under the terms of the GNU General Public License as
@@ -88,43 +88,11 @@ def test_espeak(njobs):
         strip=False, njobs=njobs)
     assert out == '\n'.join(['wʌn tuː ', 'θɹiː ', 'foːɹ faɪv '])
 
-
-@pytest.mark.skipif(
-    not EspeakBackend.is_espeak_ng(),
-    reason='Language switch better supported by espeak-ng')
-@pytest.mark.parametrize('njobs', [1, 2])
-def test_espeak_langswitch(njobs, caplog):
-    text = ["j'aime le football", "moi aussi", "moi aussi j'aime le football"]
-    out = phonemize(
-        text, language='fr-fr', backend='espeak', njobs=njobs, strip=True)
-
-    assert out == [
-        'ʒɛm lə (en)fʊtbɔːl(fr)',
-        'mwa osi',
-        'mwa osi ʒɛm lə (en)fʊtbɔːl(fr)']
-
-    assert (
-        '2 utterances containing language switches on lines 1, 3'
-        in caplog.text)
-
-
-@pytest.mark.skipif(
-    not EspeakMbrolaBackend.is_available() or
-    not EspeakMbrolaBackend.is_supported_language('mb-fr1'),
-    reason='mbrola or mb-fr1 voice not installed')
-@pytest.mark.parametrize('njobs', [2, 4])
-def test_espeak_mbrola(njobs):
-    text = ['un deux', 'trois', 'quatre cinq']
-
+    # if EspeakBackend.is_espeak_ng():
     out = phonemize(
-        text, language='mb-fr1', backend='espeak-mbrola',
+        text, language='en-us', backend='espeak', use_sampa=True,
         strip=True, njobs=njobs)
-    assert out == ['9~d2', 'tRwa', 'katRse~k']
-
-    out = phonemize(
-        text, language='mb-fr1', backend='espeak-mbrola',
-        strip=False, njobs=njobs)
-    assert out == ['9~d2', 'tRwa', 'katRse~k']
+    assert out == ['wVn tu:', 'Tri:', 'fo@ faIv']
 
 
 @pytest.mark.parametrize('njobs', [2, 4])
diff --git a/test/test_punctuation.py b/test/test_punctuation.py
index 6ed642a..4344a27 100644
--- a/test/test_punctuation.py
+++ b/test/test_punctuation.py
@@ -1,4 +1,4 @@
-# Copyright 2015-2021 Mathieu Bernard
+# Copyright 2015-2020 Mathieu Bernard
 #
 # This file is part of phonemizer: you can redistribute it and/or
 # modify it under the terms of the GNU General Public License as
@@ -21,15 +21,9 @@
 from phonemizer.phonemize import phonemize
 
 
-# True if we are using espeak>=1.49.3
-ESPEAK_143 = (EspeakBackend.version(as_tuple=True) >= (1, 49, 3))
-
 # True if we are using espeak>=1.50
 ESPEAK_150 = (EspeakBackend.version(as_tuple=True) >= (1, 50))
 
-# True if we are using festival>=2.5
-FESTIVAL_25 = (FestivalBackend.version(as_tuple=True) >= (2, 5))
-
 
 @pytest.mark.parametrize(
     'inp, out', [
@@ -50,9 +44,7 @@ def test_remove(inp, out):
         ['a, a, a'],
         ['a, a?', 'aaa bb', '.bb, b', 'c', '!d.d. dd??  d!'],
         ['Truly replied, "Yes".'],
-        ['hi; ho,"'],
-        ["!?"],
-        ["!'"]])
+        ['hi; ho,"']])
 def test_preserve(inp):
     p = Punctuation()
     t, m = p.preserve(inp)
@@ -62,8 +54,8 @@ def test_preserve(inp):
 @pytest.mark.parametrize(
     'text, output', [
         (['hi; ho,"'], ['haɪ ; hoʊ ,']),
-        (['hi; "ho,'], ['haɪ ; hoʊ ,'] if ESPEAK_143 else ['haɪ ;  hoʊ ,']),
-        (['"hi; ho,'], ['haɪ ; hoʊ ,'] if ESPEAK_143 else [' haɪ ; hoʊ ,'])])
+        (['hi; "ho,'], ['haɪ ; hoʊ ,'] if ESPEAK_150 else ['haɪ ;  hoʊ ,']),
+        (['"hi; ho,'], ['haɪ ; hoʊ ,'] if ESPEAK_150 else [' haɪ ; hoʊ ,'])])
 def test_preserve_2(text, output):
     marks = ".!;:,?"
     p = Punctuation(marks=marks)
@@ -96,20 +88,16 @@ def test_espeak():
     expected3 = 'həloʊ wɜːld '
     expected4 = 'həloʊ , wɜːld !'
 
-    out1 = EspeakBackend('en-us', preserve_punctuation=False).phonemize(
-        text, strip=True)
+    out1 = EspeakBackend('en-us', preserve_punctuation=False).phonemize(text, strip=True)
     assert out1 == expected1
 
-    out2 = EspeakBackend('en-us', preserve_punctuation=True).phonemize(
-        text, strip=True)
+    out2 = EspeakBackend('en-us', preserve_punctuation=True).phonemize(text, strip=True)
     assert out2 == expected2
 
-    out3 = EspeakBackend('en-us', preserve_punctuation=False).phonemize(
-        text, strip=False)
+    out3 = EspeakBackend('en-us', preserve_punctuation=False).phonemize(text, strip=False)
     assert out3 == expected3
 
-    out4 = EspeakBackend('en-us', preserve_punctuation=True).phonemize(
-        text, strip=False)
+    out4 = EspeakBackend('en-us', preserve_punctuation=True).phonemize(text, strip=False)
     assert out4 == expected4
 
 
@@ -120,20 +108,16 @@ def test_festival():
     expected3 = 'hhaxlow werld '
     expected4 = 'hhaxlow , werld !'
 
-    out1 = FestivalBackend('en-us', preserve_punctuation=False).phonemize(
-        text, strip=True)
+    out1 = FestivalBackend('en-us', preserve_punctuation=False).phonemize(text, strip=True)
     assert out1 == expected1
 
-    out2 = FestivalBackend('en-us', preserve_punctuation=True).phonemize(
-        text, strip=True)
+    out2 = FestivalBackend('en-us', preserve_punctuation=True).phonemize(text, strip=True)
     assert out2 == expected2
 
-    out3 = FestivalBackend('en-us', preserve_punctuation=False).phonemize(
-        text, strip=False)
+    out3 = FestivalBackend('en-us', preserve_punctuation=False).phonemize(text, strip=False)
     assert out3 == expected3
 
-    out4 = FestivalBackend('en-us', preserve_punctuation=True).phonemize(
-        text, strip=False)
+    out4 = FestivalBackend('en-us', preserve_punctuation=True).phonemize(text, strip=False)
     assert out4 == expected4
 
 
@@ -144,62 +128,14 @@ def test_segments():
     expected3 = 'ʌtʃɪ ʌtʃʊ '
     expected4 = 'ʌtʃɪ , ʌtʃʊ !'
 
-    out1 = SegmentsBackend('cree', preserve_punctuation=False).phonemize(
-        text, strip=True)
+    out1 = SegmentsBackend('cree', preserve_punctuation=False).phonemize(text, strip=True)
     assert out1 == expected1
 
-    out2 = SegmentsBackend('cree', preserve_punctuation=True).phonemize(
-        text, strip=True)
+    out2 = SegmentsBackend('cree', preserve_punctuation=True).phonemize(text, strip=True)
     assert out2 == expected2
 
-    out3 = SegmentsBackend('cree', preserve_punctuation=False).phonemize(
-        text, strip=False)
+    out3 = SegmentsBackend('cree', preserve_punctuation=False).phonemize(text, strip=False)
     assert out3 == expected3
 
-    out4 = SegmentsBackend('cree', preserve_punctuation=True).phonemize(
-        text, strip=False)
+    out4 = SegmentsBackend('cree', preserve_punctuation=True).phonemize(text, strip=False)
     assert out4 == expected4
-
-
-# see https://github.com/bootphon/phonemizer/issues/54
-@pytest.mark.parametrize(
-    'text', ["!'", "'!", "!'!", "'!'"])
-def test_issue_54(text):
-    output = phonemize(
-        text, language='en-us', backend='espeak', preserve_punctuation=True)
-    assert text.replace("'", '') == output
-
-
-# see https://github.com/bootphon/phonemizer/issues/55
-@pytest.mark.parametrize(
-    'backend, marks, text, expected', [
-        ('espeak', 'default', ['"Hey! "', '"hey,"'], ['"heɪ ! "', '"heɪ ,"']),
-        ('espeak', '.!;:,?', ['"Hey! "', '"hey,"'],
-         ['heɪ ! ', 'heɪ ,'] if ESPEAK_150 else [' heɪ ! ', ' heɪ ,']),
-        ('espeak', 'default', ['! ?', 'hey!'], ['! ?', 'heɪ !']),
-        ('espeak', '!', ['! ?', 'hey!'], ['! ', 'heɪ !']),
-        ('segments', 'default', ['! ?', 'hey!'], ['! ?', 'heːj !']),
-        ('segments', '!', ['! ?', 'hey!'], ValueError),
-        ('festival', 'default', ['! ?', 'hey!'], ['! ?', 'hhey !']),
-        ('festival', '!', ['! ?', 'hey!'], ['!  ', 'hhey !'])])
-def test_issue55(backend, marks, text, expected):
-    if marks == 'default':
-        marks = Punctuation.default_marks()
-    language = 'cree' if backend == 'segments' else 'en-us'
-
-    try:
-        with pytest.raises(expected):
-            phonemize(
-                text, language=language, backend=backend,
-                preserve_punctuation=True, punctuation_marks=marks)
-    except TypeError:
-        try:
-            assert expected == phonemize(
-                text, language=language, backend=backend,
-                preserve_punctuation=True, punctuation_marks=marks)
-        except RuntimeError:
-            if backend == 'festival':
-                # TODO on some installations festival fails to phonemize "?".
-                # It ends with a segmentation fault. This seems to only appear
-                # with festival-2.5 (but is working on travis and docker image)
-                pass
diff --git a/test/test_segments.py b/test/test_segments.py
index f41d29a..e8bf891 100644
--- a/test/test_segments.py
+++ b/test/test_segments.py
@@ -1,4 +1,4 @@
-# Copyright 2015-2021 Mathieu Bernard
+# Copyright 2015-2020 Mathieu Bernard
 #
 # This file is part of phonemizer: you can redistribute it and/or
 # modify it under the terms of the GNU General Public License as
diff --git a/test/test_separator.py b/test/test_separator.py
index 2394211..4517eb9 100644
--- a/test/test_separator.py
+++ b/test/test_separator.py
@@ -1,4 +1,4 @@
-# Copyright 2015-2021 Thomas Schatz, Xuan Nga Cao, Mathieu Bernard
+# Copyright 2015-2020 Thomas Schatz, Xuan Nga Cao, Mathieu Bernard
 #
 # This file is part of phonemizer: you can redistribute it and/or
 # modify it under the terms of the GNU General Public License as