feat(examples): use mp3 output by default (#449)

2noise · Jun 25, 2024 · 0744bb3 · 0744bb3
1 parent db13e42
commit 0744bb3
Show file tree

Hide file tree

Showing 10 changed files with 63 additions and 15 deletions.
diff --git a/.gitignore b/.gitignore
@@ -169,3 +169,4 @@ cython_debug/
 
 # inferred result
 *.wav
+*.mp3
diff --git a/README.md b/README.md
@@ -11,7 +11,7 @@ A generative speech model for daily dialogue.
 [![Open In Colab](https://img.shields.io/badge/Colab-F9AB00?style=for-the-badge&logo=googlecolab&color=525252)](https://colab.research.google.com/github/2noise/ChatTTS/blob/main/examples/ipynb/colab.ipynb)
 [![Discord](https://img.shields.io/badge/ChatTTS-Discord-7289DA?style=for-the-badge&logo=discord&logoColor=white)](https://discord.gg/Ud5Jxgx5yD)
 
-**English** | [**简体中文**](docs/cn/README.md) | [**日本語**](docs/jp/README.md) | [**Русский**](docs/ru/README.md)
+**English** | [**简体中文**](docs/cn/README.md) | [**日本語**](docs/jp/README.md) | [**Русский**](docs/ru/README.md) | [**Español**](docs/es/README.md)
 
 </div>
 
@@ -93,29 +93,31 @@ pip install -r requirements.txt
 ```
 
 ### Quick Start
+> Make sure you are in the project root directory when you run the commands below.
+
 #### 1. Launch WebUI
 ```bash
 python examples/web/webui.py
 ```
 
 #### 2. Infer by Command Line
-> It will save audio to `./output_audio_xxx.wav`
+> It will save audio to `./output_audio_n.mp3`, where `n` is the index of the input text.
 
 ```bash
-python examples/cmd/run.py "Please input your text."
+python examples/cmd/run.py "Your text 1." "Your text 2."
 ```
 
 ### Basic
 
 ```python
 import ChatTTS
-from IPython.display import Audio
+import torch
 import torchaudio
 
 chat = ChatTTS.Chat()
 chat.load(compile=False) # Set to True for better performance
 
-texts = ["PUT YOUR TEXT HERE",]
+texts = ["PUT YOUR 1st TEXT HERE", "PUT YOUR 2nd TEXT HERE"]
 
 wavs = chat.infer(texts)
 
@@ -154,6 +156,7 @@ wavs = chat.infer(
 
 ###################################
 # For word level manual control.
+
 text = 'What is [uv_break]your favorite english food?[laugh][lbreak]'
 wavs = chat.infer(text, skip_refine_text=True, params_refine_text=params_refine_text,  params_infer_code=params_infer_code)
 torchaudio.save("output2.wav", torch.from_numpy(wavs[0]), 24000)

diff --git a/docs/cn/README.md b/docs/cn/README.md
@@ -10,7 +10,7 @@
 [![Huggingface](https://img.shields.io/badge/🤗%20-Models-yellow.svg?style=for-the-badge)](https://huggingface.co/2Noise/ChatTTS)
 [![Open In Colab](https://img.shields.io/badge/Colab-F9AB00?style=for-the-badge&logo=googlecolab&color=525252)](https://colab.research.google.com/github/2noise/ChatTTS/blob/main/examples/ipynb/colab.ipynb)
 
-[**English**](../../README.md) | **简体中文** | [**日本語**](../jp/README.md) | [**Русский**](../ru/README.md)
+[**English**](../../README.md) | **简体中文** | [**日本語**](../jp/README.md) | [**Русский**](../ru/README.md) | [**Español**](../es/README.md)
 
 </div>
 

diff --git a/docs/jp/README.md b/docs/jp/README.md
@@ -4,7 +4,7 @@
 
 [![Huggingface](https://img.shields.io/badge/🤗%20-Models-yellow.svg?style=for-the-badge)](https://huggingface.co/2Noise/ChatTTS)
 
-[**English**](../../README.md) | [**简体中文**](../cn/README.md) | **日本語** | [**Русский**](../ru/README.md)
+[**English**](../../README.md) | [**简体中文**](../cn/README.md) | **日本語** | [**Русский**](../ru/README.md) | [**Español**](../es/README.md)
 
 ChatTTSは、LLMアシスタントなどの対話シナリオ用に特別に設計されたテキストから音声へのモデルです。英語と中国語の両方をサポートしています。私たちのモデルは、中国語と英語で構成される100,000時間以上でトレーニングされています。**[HuggingFace](https://huggingface.co/2Noise/ChatTTS)**でオープンソース化されているバージョンは、40,000時間の事前トレーニングモデルで、SFTは行われていません。
 

diff --git a/docs/ru/README.md b/docs/ru/README.md
@@ -4,7 +4,7 @@
 
 [![Huggingface](https://img.shields.io/badge/🤗%20-Models-yellow.svg?style=for-the-badge)](https://huggingface.co/2Noise/ChatTTS)
 
-[**English**](../../README.md) | [**简体中文**](../cn/README.md) | [**日本語**](../jp/README.md) | **Русский**
+[**English**](../../README.md) | [**简体中文**](../cn/README.md) | [**日本語**](../jp/README.md) | **Русский** | [**Español**](../es/README.md)
 
 ChatTTS - это модель преобразования текста в речь, специально разработанная для диалоговых сценариев, таких как помощник LLM. Она поддерживает как английский, так и китайский языки. Наша модель обучена на более чем 100 000 часах английского и китайского языков. Открытая версия на **[HuggingFace](https://huggingface.co/2Noise/ChatTTS)** - это предварительно обученная модель с 40 000 часами без SFT.
 

diff --git a/examples/cmd/run.py b/examples/cmd/run.py
@@ -8,23 +8,28 @@
 
 import wave
 import argparse
+from io import BytesIO
 
 import ChatTTS
 
-from tools.audio import unsafe_float_to_int16
+from tools.audio import unsafe_float_to_int16, wav2
 from tools.logger import get_logger
 
 logger = get_logger("Command")
 
 
-def save_wav_file(wav, index):
-    wav_filename = f"output_audio_{index}.wav"
-    with wave.open(wav_filename, "wb") as wf:
+def save_mp3_file(wav, index):
+    buf = BytesIO()
+    with wave.open(buf, "wb") as wf:
         wf.setnchannels(1)  # Mono channel
         wf.setsampwidth(2)  # Sample width in bytes
         wf.setframerate(24000)  # Sample rate in Hz
         wf.writeframes(unsafe_float_to_int16(wav))
-    logger.info(f"Audio saved to {wav_filename}")
+    buf.seek(0, 0)
+    mp3_filename = f"output_audio_{index}.mp3"
+    with open(mp3_filename, "wb") as f:
+        wav2(buf, f, "mp3")
+    logger.info(f"Audio saved to {mp3_filename}")
 
 
 def main(texts: list[str]):
@@ -42,7 +47,7 @@ def main(texts: list[str]):
     logger.info("Inference completed. Audio generation successful.")
     # Save each generated wav file to a local file
     for index, wav in enumerate(wavs):
-        save_wav_file(wav, index)
+        save_mp3_file(wav, index)
 
 
 if __name__ == "__main__":

diff --git a/examples/web/webui.py b/examples/web/webui.py
@@ -78,7 +78,7 @@ def main():
                 "Interrupt", scale=2, variant="stop", visible=False, interactive=False
             )
 
-        text_output = gr.Textbox(label="Output Text", interactive=False)
+        text_output = gr.Textbox(label="Output Text", interactive=False, show_copy_button=True)
 
         # 使用Gradio的回调功能来更新数值输入框
         voice_selection.change(
@@ -117,6 +117,7 @@ def make_audio(autoplay, stream):
                 streaming=stream,
                 interactive=False,
                 show_label=True,
+                format="mp3",
             )
             text_output.change(
                 text_output_listener,

diff --git a/requirements.txt b/requirements.txt
@@ -12,3 +12,4 @@ pybase16384
 pynini==2.1.5; sys_platform == 'linux'
 WeTextProcessing; sys_platform == 'linux'
 nemo_text_processing; sys_platform == 'linux'
+av
diff --git a/tools/audio/__init__.py b/tools/audio/__init__.py
@@ -1 +1,2 @@
 from .np import unsafe_float_to_int16
+from .av import wav2
diff --git a/tools/audio/av.py b/tools/audio/av.py
@@ -0,0 +1,36 @@
+from io import BufferedWriter, BytesIO
+from typing import Dict
+
+import av
+
+
+video_format_dict: Dict[str, str] = {
+    "m4a": "mp4",
+}
+
+audio_format_dict: Dict[str, str] = {
+    "ogg": "libvorbis",
+    "mp4": "aac",
+}
+
+
+def wav2(i: BytesIO, o: BufferedWriter, format: str):
+    """
+    https://github.com/fumiama/Retrieval-based-Voice-Conversion-WebUI/blob/412a9950a1e371a018c381d1bfb8579c4b0de329/infer/lib/audio.py#L20
+    """
+    inp = av.open(i, "r")
+    format = video_format_dict.get(format, format)
+    out = av.open(o, "w", format=format)
+    format = audio_format_dict.get(format, format)
+
+    ostream = out.add_stream(format)
+
+    for frame in inp.decode(audio=0):
+        for p in ostream.encode(frame):
+            out.mux(p)
+
+    for p in ostream.encode(None):
+        out.mux(p)
+
+    out.close()
+    inp.close()
Original file line number	Diff line number	Diff line change
Expand Up		@@ -169,3 +169,4 @@ cython_debug/

		# inferred result
		*.wav
		*.mp3
Original file line number	Diff line number	Diff line change
		@@ -1 +1,2 @@
		from .np import unsafe_float_to_int16
		from .av import wav2