From 575530b5cbe9c2267e926f69c9c86fc92382d510 Mon Sep 17 00:00:00 2001 From: Onur Solmaz <2453968+osolmaz@users.noreply.github.com> Date: Sun, 25 Feb 2024 18:57:36 +0100 Subject: [PATCH] Add Support for OpenAI TTS Service (#84) * Commit changes from @hdeep03's branch * Update docs * Bump version --- docs/source/api.rst | 4 + docs/source/services.rst | 35 +++++- examples/openai-example.py | 30 ++++++ manim_voiceover/services/elevenlabs.py | 9 +- manim_voiceover/services/openai.py | 118 ++++++++++++++++++++ manim_voiceover/tracker.py | 35 +++++- poetry.lock | 142 ++++++++++++++++++++++++- pyproject.toml | 5 +- 8 files changed, 369 insertions(+), 9 deletions(-) create mode 100644 examples/openai-example.py create mode 100644 manim_voiceover/services/openai.py diff --git a/docs/source/api.rst b/docs/source/api.rst index 9af526d..6dd0598 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -38,6 +38,10 @@ Speech services :members: :show-inheritance: +.. automodule:: manim_voiceover.services.openai + :members: + :show-inheritance: + .. automodule:: manim_voiceover.services.pyttsx3 :members: :show-inheritance: diff --git a/docs/source/services.rst b/docs/source/services.rst index 95193ee..e5e57c6 100644 --- a/docs/source/services.rst +++ b/docs/source/services.rst @@ -36,7 +36,7 @@ Manim Voiceover defines the :py:class:`~~base.SpeechService` class for adding ne - Very good, human-like - No - Yes - - `ElevenLabs `__ develops very advanced voice generative AI models. It has a range of realistic and emotive voices, and also allows you to clone your own voice by uploading a few minutes of your speech. + - Requires ElevenLabs account. Click `here `__ to sign up. * - :py:class:`~coqui.CoquiService` - Good, human-like - Yes @@ -47,6 +47,11 @@ Manim Voiceover defines the :py:class:`~~base.SpeechService` class for adding ne - No - No - It's a free API subsidized by Google, so there is a likelihood it may stop working in the future. + * - :py:class:`~openai.OpenAIService` + - Very good, human-like + - No + - Yes + - Requires OpenAI developer account. See `platform `__ to sign up, and the `pricing page `__ for more details. * - :py:class:`~pyttsx3.PyTTSX3Service` - Bad - Yes @@ -136,6 +141,32 @@ Install Manim Voiceover with the ``gtts`` extra in order to use :py:class:`~gtts Refer to the `example usage `__ to get started. +:py:class:`~openai.OpenAIService` +************************************* +`OpenAI `__ provides a text-to-speech service. It is through an API, so it requires an internet connection to work. It also requires an API key to use. Register for one `here `__. + +Install Manim Voiceover with the ``openai`` extra in order to use :py:class:`~openai.OpenAIService`: + +.. code:: sh + + pip install "manim-voiceover[openai]" + +Then, you need to find out your api key: + +- Sign in to `OpenAI platform `__ and click into Api Keys from the left panel. +- Click create a new secret key and copy it. + +Create a file called ``.env`` that contains your authentication +information in the same directory where you call Manim. + +.. code:: sh + + OPENAI_API_KEY="..." # insert the secret key here. It should start with "sk-" + +Check out `OpenAI docs `__ for more details. + +Refer to the `example usage `__ to get started. + :py:class:`~pyttsx3.PyTTSX3Service` *********************************** @@ -154,7 +185,7 @@ Refer to the `example usage `__ offers one of the most natural sounding speech service APIs. To use it, you will need to create an account at `Eleven Labs `__. +`ElevenLabs `__ offers one of the most natural sounding speech service APIs. It has a range of realistic and emotive voices, and also allows you to clone your own voice by uploading a few minutes of your speech. To use it, you will need to create an account at `Eleven Labs `__. .. tip:: ElevenLabs currently offers free TTS of 10,000 characters/month and up to 3 custom voices. diff --git a/examples/openai-example.py b/examples/openai-example.py new file mode 100644 index 0000000..b87127a --- /dev/null +++ b/examples/openai-example.py @@ -0,0 +1,30 @@ +from manim import * +from manim_voiceover import VoiceoverScene +from manim_voiceover.services.openai import OpenAIService + + +class OpenAIExample(VoiceoverScene): + def construct(self): + self.set_speech_service( + OpenAIService( + voice="fable", + model="tts-1-hd", + ) + ) + + circle = Circle() + square = Square().shift(2 * RIGHT) + + with self.voiceover(text="This circle is drawn as I speak.") as tracker: + self.play(Create(circle), run_time=tracker.duration) + + with self.voiceover(text="Let's shift it to the left 2 units.") as tracker: + self.play(circle.animate.shift(2 * LEFT), run_time=tracker.duration) + + with self.voiceover(text="Now, let's transform it into a square.") as tracker: + self.play(Transform(circle, square), run_time=tracker.duration) + + with self.voiceover(text="Thank you for watching.", speed=0.75): # You can also change the audio speed by specifying the speed argument. + self.play(Uncreate(circle)) + + self.wait() diff --git a/manim_voiceover/services/elevenlabs.py b/manim_voiceover/services/elevenlabs.py index 25a7b0b..222b4ba 100644 --- a/manim_voiceover/services/elevenlabs.py +++ b/manim_voiceover/services/elevenlabs.py @@ -150,8 +150,10 @@ def generate_from_text( input_data = { "input_text": input_text, "service": "elevenlabs", - "model": self.model, - "voice": self.voice.model_dump(exclude_none=True), + "config": { + "model": self.model, + "voice": self.voice.model_dump(exclude_none=True), + }, } # if not config.disable_caching: @@ -164,8 +166,9 @@ def generate_from_text( audio_path = self.get_audio_basename(input_data) + ".mp3" else: audio_path = path + try: - audio = generate(text=text, voice=self.voice, model=self.model) + audio = generate(text=input_text, voice=self.voice, model=self.model) save(audio, str(Path(cache_dir) / audio_path)) # type: ignore except Exception as e: logger.error(e) diff --git a/manim_voiceover/services/openai.py b/manim_voiceover/services/openai.py new file mode 100644 index 0000000..1198a42 --- /dev/null +++ b/manim_voiceover/services/openai.py @@ -0,0 +1,118 @@ +import os +import sys +from pathlib import Path +from manim import logger +from dotenv import load_dotenv, find_dotenv + +from manim_voiceover.helper import ( + create_dotenv_file, + prompt_ask_missing_extras, + remove_bookmarks, +) + +try: + import openai +except ImportError: + logger.error( + "Missing packages. " + 'Run `pip install "manim-voiceover[openai]"` to use OpenAIService.' + ) + +from manim_voiceover.services.base import SpeechService + +load_dotenv(find_dotenv(usecwd=True)) + + +def create_dotenv_openai(): + logger.info( + "Check out https://voiceover.manim.community/en/stable/services.html " + "to learn how to create an account and get your subscription key." + ) + if not create_dotenv_file(["OPENAI_API_KEY"]): + raise ValueError( + "The environment variable OPENAI_API_KEY is not set. Please set it " + "or create a .env file with the variables." + ) + logger.info("The .env file has been created. Please run Manim again.") + sys.exit() + + +class OpenAIService(SpeechService): + """ + Speech service class for OpenAI TTS Service. See the `OpenAI API page + `__ + for more information about voices and models. + """ + + def __init__( + self, + voice: str = "alloy", + model: str = "tts-1-hd", + transcription_model="base", + **kwargs + ): + """ + Args: + voice (str, optional): The voice to use. See the + `API page `__ + for all the available options. Defaults to ``"alloy"``. + model (str, optional): The TTS model to use. + See the `API page `__ + for all the available options. Defaults to ``"tts-1-hd"``. + """ + prompt_ask_missing_extras("openai", "openai", "OpenAIService") + self.voice = voice + self.model = model + + SpeechService.__init__(self, transcription_model=transcription_model, **kwargs) + + def generate_from_text( + self, text: str, cache_dir: str = None, path: str = None, **kwargs + ) -> dict: + """""" + if cache_dir is None: + cache_dir = self.cache_dir + + speed = kwargs.get("speed", 1.0) + + if not (0.25 <= speed <= 4.0): + raise ValueError("The speed must be between 0.25 and 4.0.") + + input_text = remove_bookmarks(text) + input_data = { + "input_text": input_text, + "service": "openai", + "config": { + "voice": self.voice, + "model": self.model, + "speed": speed, + }, + } + + cached_result = self.get_cached_result(input_data, cache_dir) + if cached_result is not None: + return cached_result + + if path is None: + audio_path = self.get_audio_basename(input_data) + ".mp3" + else: + audio_path = path + + if os.getenv("OPENAI_API_KEY") is None: + create_dotenv_openai() + + response = openai.audio.speech.create( + model=self.model, + voice=self.voice, + input=input_text, + speed=speed, + ) + response.stream_to_file(str(Path(cache_dir) / audio_path)) + + json_dict = { + "input_text": text, + "input_data": input_data, + "original_audio": audio_path, + } + + return json_dict diff --git a/manim_voiceover/tracker.py b/manim_voiceover/tracker.py index 7cece45..c138359 100644 --- a/manim_voiceover/tracker.py +++ b/manim_voiceover/tracker.py @@ -57,10 +57,43 @@ def __init__(self, scene: Scene, data: dict, cache_dir: str): if "word_boundaries" in self.data: self._process_bookmarks() + def _get_fallback_word_boundaries(self): + """ + Returns dummy word boundaries assuming a linear mapping between + text and audio. Used when word boundaries are not available. + """ + input_text = remove_bookmarks(self.data["input_text"]) + return [ + { + "audio_offset": 0, + "text_offset": 0, + "word_length": len(input_text), + "text": self.data["input_text"], + "boundary_type": "Word", + }, + { + "audio_offset": self.duration * AUDIO_OFFSET_RESOLUTION, + "text_offset": len(input_text), + "word_length": 1, + "text": ".", + "boundary_type": "Word", + }, + ] + def _process_bookmarks(self) -> None: self.bookmark_times = {} self.bookmark_distances = {} - self.time_interpolator = TimeInterpolator(self.data["word_boundaries"]) + + word_boundaries = self.data["word_boundaries"] + if not word_boundaries or len(word_boundaries) < 2: + logger.warning( + f"Word boundaries for voiceover {self.data['input_text']} are not " + "available or are insufficient. Using fallback word boundaries." + ) + word_boundaries = self._get_fallback_word_boundaries() + + self.time_interpolator = TimeInterpolator(word_boundaries) + net_text_len = len(remove_bookmarks(self.data["input_text"])) if "transcribed_text" in self.data: transcribed_text_len = len(self.data["transcribed_text"].strip()) diff --git a/poetry.lock b/poetry.lock index 2a4dbf7..6905a3d 100644 --- a/poetry.lock +++ b/poetry.lock @@ -25,6 +25,28 @@ files = [ [package.dependencies] typing-extensions = {version = ">=4.0.0", markers = "python_version < \"3.9\""} +[[package]] +name = "anyio" +version = "4.3.0" +description = "High level compatibility layer for multiple asynchronous event loop implementations" +optional = true +python-versions = ">=3.8" +files = [ + {file = "anyio-4.3.0-py3-none-any.whl", hash = "sha256:048e05d0f6caeed70d731f3db756d35dcc1f35747c8c403364a8332c630441b8"}, + {file = "anyio-4.3.0.tar.gz", hash = "sha256:f75253795a87df48568485fd18cdd2a3fa5c4f7c5be8e5e36637733fce06fed6"}, +] + +[package.dependencies] +exceptiongroup = {version = ">=1.0.2", markers = "python_version < \"3.11\""} +idna = ">=2.8" +sniffio = ">=1.1" +typing-extensions = {version = ">=4.1", markers = "python_version < \"3.11\""} + +[package.extras] +doc = ["Sphinx (>=7)", "packaging", "sphinx-autodoc-typehints (>=1.2.0)", "sphinx-rtd-theme"] +test = ["anyio[trio]", "coverage[toml] (>=7)", "exceptiongroup (>=1.2.0)", "hypothesis (>=4.0)", "psutil (>=5.9)", "pytest (>=7.0)", "pytest-mock (>=3.6.1)", "trustme", "uvloop (>=0.17)"] +trio = ["trio (>=0.23)"] + [[package]] name = "appnope" version = "0.1.4" @@ -876,6 +898,17 @@ files = [ {file = "distlib-0.3.8.tar.gz", hash = "sha256:1530ea13e350031b6312d8580ddb6b27a104275a31106523b8f123787f494f64"}, ] +[[package]] +name = "distro" +version = "1.9.0" +description = "Distro - an OS platform information API" +optional = true +python-versions = ">=3.6" +files = [ + {file = "distro-1.9.0-py3-none-any.whl", hash = "sha256:7bffd925d65168f85027d8da9af6bddab658135b840670a223589bc0c8ef02b2"}, + {file = "distro-1.9.0.tar.gz", hash = "sha256:2fa77c6fd8940f116ee1d6b94a2f90b13b5ea8d019b98bc8bafdcabcdd9bdbed"}, +] + [[package]] name = "docutils" version = "0.19" @@ -914,6 +947,20 @@ files = [ {file = "evdev-1.7.0.tar.gz", hash = "sha256:95bd2a1e0c6ce2cd7a2ecc6e6cd9736ff794b3ad5cb54d81d8cbc2e414d0b870"}, ] +[[package]] +name = "exceptiongroup" +version = "1.2.0" +description = "Backport of PEP 654 (exception groups)" +optional = true +python-versions = ">=3.7" +files = [ + {file = "exceptiongroup-1.2.0-py3-none-any.whl", hash = "sha256:4bfd3996ac73b41e9b9628b04e079f193850720ea5945fc96a08633c66912f14"}, + {file = "exceptiongroup-1.2.0.tar.gz", hash = "sha256:91f5c769735f051a4290d52edd0858999b57e5876e9f85937691bd4c9fa3ed68"}, +] + +[package.extras] +test = ["pytest (>=6)"] + [[package]] name = "execnet" version = "2.0.2" @@ -1228,6 +1275,62 @@ requests = ">=2.27,<3" docs = ["sphinx", "sphinx-autobuild", "sphinx-click", "sphinx-mdinclude", "sphinx-rtd-theme"] tests = ["pytest (>=7.1.3,<8.1.0)", "pytest-cov", "testfixtures"] +[[package]] +name = "h11" +version = "0.14.0" +description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1" +optional = true +python-versions = ">=3.7" +files = [ + {file = "h11-0.14.0-py3-none-any.whl", hash = "sha256:e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761"}, + {file = "h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d"}, +] + +[[package]] +name = "httpcore" +version = "1.0.4" +description = "A minimal low-level HTTP client." +optional = true +python-versions = ">=3.8" +files = [ + {file = "httpcore-1.0.4-py3-none-any.whl", hash = "sha256:ac418c1db41bade2ad53ae2f3834a3a0f5ae76b56cf5aa497d2d033384fc7d73"}, + {file = "httpcore-1.0.4.tar.gz", hash = "sha256:cb2839ccfcba0d2d3c1131d3c3e26dfc327326fbe7a5dc0dbfe9f6c9151bb022"}, +] + +[package.dependencies] +certifi = "*" +h11 = ">=0.13,<0.15" + +[package.extras] +asyncio = ["anyio (>=4.0,<5.0)"] +http2 = ["h2 (>=3,<5)"] +socks = ["socksio (==1.*)"] +trio = ["trio (>=0.22.0,<0.25.0)"] + +[[package]] +name = "httpx" +version = "0.27.0" +description = "The next generation HTTP client." +optional = true +python-versions = ">=3.8" +files = [ + {file = "httpx-0.27.0-py3-none-any.whl", hash = "sha256:71d5465162c13681bff01ad59b2cc68dd838ea1f10e51574bac27103f00c91a5"}, + {file = "httpx-0.27.0.tar.gz", hash = "sha256:a0cb88a46f32dc874e04ee956e4c2764aba2aa228f650b06788ba6bda2962ab5"}, +] + +[package.dependencies] +anyio = "*" +certifi = "*" +httpcore = "==1.*" +idna = "*" +sniffio = "*" + +[package.extras] +brotli = ["brotli", "brotlicffi"] +cli = ["click (==8.*)", "pygments (==2.*)", "rich (>=10,<14)"] +http2 = ["h2 (>=3,<5)"] +socks = ["socksio (==1.*)"] + [[package]] name = "huggingface-hub" version = "0.20.3" @@ -2270,6 +2373,29 @@ files = [ {file = "numpy-1.24.4.tar.gz", hash = "sha256:80f5e3a4e498641401868df4208b74581206afbee7cf7b8329daae82676d9463"}, ] +[[package]] +name = "openai" +version = "1.12.0" +description = "The official Python library for the openai API" +optional = true +python-versions = ">=3.7.1" +files = [ + {file = "openai-1.12.0-py3-none-any.whl", hash = "sha256:a54002c814e05222e413664f651b5916714e4700d041d5cf5724d3ae1a3e3481"}, + {file = "openai-1.12.0.tar.gz", hash = "sha256:99c5d257d09ea6533d689d1cc77caa0ac679fa21efef8893d8b0832a86877f1b"}, +] + +[package.dependencies] +anyio = ">=3.5.0,<5" +distro = ">=1.7.0,<2" +httpx = ">=0.23.0,<1" +pydantic = ">=1.9.0,<3" +sniffio = "*" +tqdm = ">4" +typing-extensions = ">=4.7,<5" + +[package.extras] +datalib = ["numpy (>=1)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)"] + [[package]] name = "openai-whisper" version = "20230314" @@ -6351,6 +6477,17 @@ files = [ {file = "smmap-5.0.1.tar.gz", hash = "sha256:dceeb6c0028fdb6734471eb07c0cd2aae706ccaecab45965ee83f11c8d3b1f62"}, ] +[[package]] +name = "sniffio" +version = "1.3.0" +description = "Sniff out which async library your code is running under" +optional = true +python-versions = ">=3.7" +files = [ + {file = "sniffio-1.3.0-py3-none-any.whl", hash = "sha256:eecefdce1e5bbfb7ad2eeaabf7c1eeb404d7757c379bd1f7e5cce9d8bf425384"}, + {file = "sniffio-1.3.0.tar.gz", hash = "sha256:e60305c5e5d314f5389259b7f22aaa33d8f7dee49763119234af3755c55b9101"}, +] + [[package]] name = "snowballstemmer" version = "2.2.0" @@ -7387,11 +7524,12 @@ docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.link testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-ignore-flaky", "pytest-mypy (>=0.9.1)", "pytest-ruff"] [extras] -all = ["PyAudio", "azure-cognitiveservices-speech", "deepl", "elevenlabs", "gTTS", "openai-whisper", "pynput", "pyttsx3", "stable-ts"] +all = ["PyAudio", "azure-cognitiveservices-speech", "deepl", "elevenlabs", "gTTS", "openai", "openai-whisper", "pynput", "pyttsx3", "stable-ts"] azure = ["azure-cognitiveservices-speech"] coqui = [] elevenlabs = ["elevenlabs"] gtts = ["gTTS"] +openai = ["openai"] pyttsx3 = ["pyttsx3"] recorder = ["PyAudio", "pynput"] transcribe = ["openai-whisper", "stable-ts"] @@ -7400,4 +7538,4 @@ translate = ["deepl"] [metadata] lock-version = "2.0" python-versions = ">=3.8,<4" -content-hash = "a9582fee82570558a2ff806a13fcb646f47c0808fec45cd4bcb012f8bca2e8cd" +content-hash = "0050b83c943a22f400ce646092ed65936ef4a55c30c6c8613f0d8c7150f6480b" diff --git a/pyproject.toml b/pyproject.toml index 1c41dce..1808c1c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "manim-voiceover" -version = "0.3.5" +version = "0.3.6" description = "Manim plugin for all things voiceover" authors = ["The Manim Community Developers "] license = "MIT" @@ -51,6 +51,7 @@ azure-cognitiveservices-speech = { version = "^1.24.0", optional = true } PyAudio = { version = "^0.2.12", optional = true } gTTS = { version = "^2.2.4", optional = true } pyttsx3 = { version = "^2.90", optional = true } +openai = { version = "^1.6.1", optional = true } # torch = { version = "*", optional = true } # TTS = { version = "*", optional = true } pynput = { version = "^1.7.6", optional = true } @@ -65,6 +66,7 @@ elevenlabs = {version = "^0.2.27", optional = true} [tool.poetry.extras] azure = ["azure-cognitiveservices-speech"] gtts = ["gTTS"] +openai = ["openai"] pyttsx3 = ["pyttsx3"] # coqui = ["torch", "TTS"] coqui = [] # Removed TTS as deps for now @@ -80,6 +82,7 @@ all = [ "TTS", "PyAudio", "pynput", + "openai", "deepl", "openai-whisper", "stable-ts",