Skip to content

Commit 3628134

Browse files
committed
Commit changes from @hdeep03's branch
1 parent f02a5f8 commit 3628134

File tree

8 files changed

+367
-6
lines changed

8 files changed

+367
-6
lines changed

docs/source/api.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,10 @@ Speech services
3838
:members:
3939
:show-inheritance:
4040

41+
.. automodule:: manim_voiceover.services.openai
42+
:members:
43+
:show-inheritance:
44+
4145
.. automodule:: manim_voiceover.services.pyttsx3
4246
:members:
4347
:show-inheritance:

docs/source/services.rst

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,11 @@ Manim Voiceover defines the :py:class:`~~base.SpeechService` class for adding ne
4747
- No
4848
- No
4949
- It's a free API subsidized by Google, so there is a likelihood it may stop working in the future.
50+
* - :py:class:`~openai.OpenAIService`
51+
- Very good, human-like
52+
- No
53+
- Yes
54+
- Requires OpenAI developer account. See `platform <https://platform.openai.com/signup>`__ to sign up, and the `pricing page <https://openai.com/pricing#:~:text=%24-,0.030,-/%201K%20characters>`__ for more details.
5055
* - :py:class:`~pyttsx3.PyTTSX3Service`
5156
- Bad
5257
- Yes
@@ -136,6 +141,33 @@ Install Manim Voiceover with the ``gtts`` extra in order to use :py:class:`~gtts
136141
137142
Refer to the `example usage <https://github.com/ManimCommunity/manim-voiceover/blob/main/examples/gtts-example.py>`__ to get started.
138143

144+
:py:class:`~openai.OpenAIService`
145+
*************************************
146+
`OpenAI <https://platform.openai.com/docs/api-reference/audio/createSpeech/>`__ provides a text-to-speech service.
147+
Since it makes API requests, it requires an internet connection and an API key to work. Register for one `here <https://platform.openai.com/>`__.
148+
149+
Install Manim Voiceover with the ``openai`` extra in order to use :py:class:`~openai.OpenAIService`:
150+
151+
.. code:: sh
152+
153+
pip install "manim-voiceover[openai]"
154+
155+
Then, you need to obtain your API key:
156+
157+
- Sign in to the `OpenAI platform <https://platform.openai.com/>`__ and open the API keys page from the left panel.
158+
- Click the button to create a new secret key, then copy the key.
159+
160+
Create a file called ``.env`` that contains your authentication
161+
information in the same directory where you call Manim.
162+
163+
.. code:: sh
164+
165+
OPENAI_API_KEY="..." # insert the secret key here. It should start with "sk-"
166+
167+
Check out `OpenAI docs <https://platform.openai.com/docs/guides/text-to-speech/>`__ for more details.
168+
169+
Refer to the `example usage <https://github.com/ManimCommunity/manim-voiceover/blob/main/examples/openai-example.py>`__ to get started.
170+
139171
:py:class:`~pyttsx3.PyTTSX3Service`
140172
***********************************
141173

examples/openai-example.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
from manim import *
2+
from manim_voiceover import VoiceoverScene
3+
from manim_voiceover.services.openai import OpenAIService
4+
5+
6+
class OpenAIExample(VoiceoverScene):
    """Example scene that narrates a few shape animations with OpenAIService."""

    def construct(self):
        """Register the speech service, then play each animation under its voiceover."""
        speech_service = OpenAIService(voice="fable", model="tts-1-hd")
        self.set_speech_service(speech_service)

        circle = Circle()
        square = Square().shift(2 * RIGHT)

        # Each animation is stretched to last as long as its narration.
        with self.voiceover(text="This circle is drawn as I speak.") as tracker:
            self.play(Create(circle), run_time=tracker.duration)

        with self.voiceover(text="Let's shift it to the left 2 units.") as tracker:
            shift_left = circle.animate.shift(2 * LEFT)
            self.play(shift_left, run_time=tracker.duration)

        with self.voiceover(text="Now, let's transform it into a square.") as tracker:
            self.play(Transform(circle, square), run_time=tracker.duration)

        # You can also change the audio speed by specifying the speed argument.
        with self.voiceover(text="Thank you for watching.", speed=0.75):
            self.play(Uncreate(circle))

        self.wait()

manim_voiceover/services/elevenlabs.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -150,8 +150,10 @@ def generate_from_text(
150150
input_data = {
151151
"input_text": input_text,
152152
"service": "elevenlabs",
153-
"model": self.model,
154-
"voice": self.voice.model_dump(exclude_none=True),
153+
"config": {
154+
"model": self.model,
155+
"voice": self.voice.model_dump(exclude_none=True),
156+
},
155157
}
156158

157159
# if not config.disable_caching:
@@ -164,8 +166,9 @@ def generate_from_text(
164166
audio_path = self.get_audio_basename(input_data) + ".mp3"
165167
else:
166168
audio_path = path
169+
167170
try:
168-
audio = generate(text=text, voice=self.voice, model=self.model)
171+
audio = generate(text=input_text, voice=self.voice, model=self.model)
169172
save(audio, str(Path(cache_dir) / audio_path)) # type: ignore
170173
except Exception as e:
171174
logger.error(e)

manim_voiceover/services/openai.py

Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
import os
2+
import sys
3+
from pathlib import Path
4+
from manim import logger
5+
from dotenv import load_dotenv, find_dotenv
6+
7+
from manim_voiceover.helper import (
8+
create_dotenv_file,
9+
prompt_ask_missing_extras,
10+
remove_bookmarks,
11+
)
12+
13+
try:
14+
import openai
15+
except ImportError:
16+
logger.error(
17+
"Missing packages. "
18+
'Run `pip install "manim-voiceover[openai]"` to use OpenAIService.'
19+
)
20+
21+
from manim_voiceover.services.base import SpeechService
22+
23+
load_dotenv(find_dotenv(usecwd=True))
24+
25+
26+
def create_dotenv_openai():
    """
    Set up the ``OPENAI_API_KEY`` entry in a ``.env`` file.

    Logs a pointer to the services documentation and delegates the actual
    file creation to ``create_dotenv_file``. This function never returns
    normally: it either raises or terminates the process via ``sys.exit()``
    after asking the user to run Manim again.

    Raises:
        ValueError: If ``create_dotenv_file`` returns a falsy value.
    """
    docs_hint = (
        "Check out https://voiceover.manim.community/en/stable/services.html "
        "to learn how to create an account and get your subscription key."
    )
    logger.info(docs_hint)

    dotenv_created = create_dotenv_file(["OPENAI_API_KEY"])
    if dotenv_created:
        logger.info("The .env file has been created. Please run Manim again.")
        sys.exit()

    raise ValueError(
        "The environment variable OPENAI_API_KEY is not set. Please set it "
        "or create a .env file with the variables."
    )
38+
39+
40+
class OpenAIService(SpeechService):
    """
    Speech service backed by the OpenAI text-to-speech endpoint.

    Voices and models are described on the `OpenAI API page
    <https://platform.openai.com/docs/api-reference/audio/createSpeech>`__.
    """

    def __init__(
        self,
        voice: str = "alloy",
        model: str = "tts-1-hd",
        transcription_model="base",
        **kwargs
    ):
        """
        Args:
            voice (str, optional): Name of the voice to synthesize with. The
                `API page <https://platform.openai.com/docs/api-reference/audio/createSpeech>`__
                lists the available options. Defaults to ``"alloy"``.
            model (str, optional): Name of the TTS model. The
                `API page <https://platform.openai.com/docs/api-reference/audio/createSpeech>`__
                lists the available options. Defaults to ``"tts-1-hd"``.
            transcription_model (optional): Forwarded unchanged to
                ``SpeechService.__init__``. Defaults to ``"base"``.
            **kwargs: Forwarded unchanged to ``SpeechService.__init__``.
        """
        prompt_ask_missing_extras("openai", "openai", "OpenAIService")

        self.voice = voice
        self.model = model

        SpeechService.__init__(
            self,
            transcription_model=transcription_model,
            **kwargs
        )

    def generate_from_text(
        self, text: str, cache_dir: str = None, path: str = None, **kwargs
    ) -> dict:
        """
        Synthesize ``text`` to an mp3 file and return its metadata.

        Args:
            text (str): Text to speak; bookmark tags are stripped before the
                text is sent to the API.
            cache_dir (str, optional): Directory the audio file is written
                into. Falls back to ``self.cache_dir`` when ``None``.
            path (str, optional): Audio file name relative to ``cache_dir``.
                When ``None``, a name is derived from the request data.
            **kwargs: Only ``speed`` is read (default ``1.0``).

        Returns:
            dict: The cached result if one exists, otherwise a dict with the
            keys ``input_text``, ``input_data`` and ``original_audio``.

        Raises:
            ValueError: If ``speed`` is outside the range 0.25 to 4.0.
        """
        target_dir = self.cache_dir if cache_dir is None else cache_dir

        speed = kwargs.get("speed", 1.0)
        speed_is_valid = 0.25 <= speed <= 4.0
        if not speed_is_valid:
            raise ValueError("The speed must be between 0.25 and 4.0.")

        input_text = remove_bookmarks(text)
        # Everything that influences the generated audio goes into the cache key.
        synthesis_config = {
            "voice": self.voice,
            "model": self.model,
            "speed": speed,
        }
        input_data = {
            "input_text": input_text,
            "service": "openai",
            "config": synthesis_config,
        }

        cached_result = self.get_cached_result(input_data, target_dir)
        if cached_result is not None:
            return cached_result

        if path is not None:
            audio_path = path
        else:
            audio_path = self.get_audio_basename(input_data) + ".mp3"

        # The key is only needed once we know a real API call is required.
        if os.getenv("OPENAI_API_KEY") is None:
            create_dotenv_openai()

        output_file = Path(target_dir) / audio_path
        response = openai.audio.speech.create(
            model=self.model,
            voice=self.voice,
            input=input_text,
            speed=speed,
        )
        response.stream_to_file(str(output_file))

        return {
            "input_text": text,
            "input_data": input_data,
            "original_audio": audio_path,
        }

manim_voiceover/tracker.py

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,10 +57,43 @@ def __init__(self, scene: Scene, data: dict, cache_dir: str):
5757
if "word_boundaries" in self.data:
5858
self._process_bookmarks()
5959

60+
def _get_fallback_word_boundaries(self):
    """
    Build placeholder word boundaries for when real ones are unavailable.

    Two entries are returned: one spanning the whole bookmark-free text at
    audio offset 0, and a closing "." entry positioned at the end of the
    text and at ``self.duration * AUDIO_OFFSET_RESOLUTION``. Together they
    describe a linear mapping between text position and audio position.
    """
    plain_text = remove_bookmarks(self.data["input_text"])

    # NOTE(review): "text" keeps the raw input (bookmark tags included) while
    # "word_length" counts the stripped text -- confirm this is intended.
    start_boundary = {
        "audio_offset": 0,
        "text_offset": 0,
        "word_length": len(plain_text),
        "text": self.data["input_text"],
        "boundary_type": "Word",
    }
    end_boundary = {
        "audio_offset": self.duration * AUDIO_OFFSET_RESOLUTION,
        "text_offset": len(plain_text),
        "word_length": 1,
        "text": ".",
        "boundary_type": "Word",
    }
    return [start_boundary, end_boundary]
82+
6083
def _process_bookmarks(self) -> None:
6184
self.bookmark_times = {}
6285
self.bookmark_distances = {}
63-
self.time_interpolator = TimeInterpolator(self.data["word_boundaries"])
86+
87+
word_boundaries = self.data["word_boundaries"]
88+
if not word_boundaries or len(word_boundaries) < 2:
89+
logger.warning(
90+
f"Word boundaries for voiceover {self.data['input_text']} are not "
91+
"available or are insufficient. Using fallback word boundaries."
92+
)
93+
word_boundaries = self._get_fallback_word_boundaries()
94+
95+
self.time_interpolator = TimeInterpolator(word_boundaries)
96+
6497
net_text_len = len(remove_bookmarks(self.data["input_text"]))
6598
if "transcribed_text" in self.data:
6699
transcribed_text_len = len(self.data["transcribed_text"].strip())

0 commit comments

Comments
 (0)