import argparse
import os
import re
import subprocess
import torch
import torchaudio
from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_voice

# Local checkout of the Mangio-RVC-Fork repository, driven below for voice conversion
PATH_TO_RVC = "Mangio-RVC-Fork/"


def clean_text(text: str, target_len: int = 200, max_len: int = 300) -> list[str]:
    # collapse repeated newlines and redundant whitespace, convert curly quotes to ASCII quotes
    text = re.sub(r"\n\n+", r"\n", text)
    text = re.sub(r"\s+", r" ", text)
    text = re.sub(r"[“”]", '"', text)
    # split text into sentences, keeping quoted passages together
    sentences = re.split(r'(?<=[.!?])\s+(?=(?:[^"]*"[^"]*")*[^"]*$)', text)
    # recombine sentences into chunks of roughly target_len characters
    chunks = []
    chunk = ""
    for sentence in sentences:
        if len(chunk) + len(sentence) > target_len:
            chunks.append(chunk)
            chunk = ""
        chunk += sentence + " "
        if len(chunk) > max_len:
            chunks.append(chunk)
            chunk = ""
    if chunk:
        chunks.append(chunk)
    # strip leading/trailing whitespace, drop empty or punctuation-only chunks
    chunks = [s.strip() for s in chunks]
    chunks = [s for s in chunks if s and not re.match(r"^[\s\.,;:!?]*$", s)]
    return chunks
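
# Quick sanity check of the chunking behaviour (illustrative values, not part of
# the pipeline):
#   clean_text("First sentence. Second one! A third?", target_len=20, max_len=40)
#   -> ['First sentence.', 'Second one! A third?']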

def process_textfile(file_path: str) -> list[str]:
    with open(file_path, "r", encoding="utf-8") as f:
        text = " ".join(f.readlines())
    chunks = clean_text(text)
    return chunks

def tts(paper_name: str):
    # load the fine-tuned Tortoise TTS model and the GLaDOS voice conditioning data
    tts_model = TextToSpeech(
        autoregressive_model_path="./ai-voice-cloning/training/GlaDOS/finetune/models/5304_gpt.pth"
    )
    voice = "GlaDOS"
    voice_samples, conditioning_latents = load_voice(
        voice, extra_voice_dirs=["./ai-voice-cloning/voices"]
    )
    # split the text file into chunks the model can handle
    texts = process_textfile(f"./llm/scripts/{paper_name}.txt")
    # generate audio for each chunk of text
    os.makedirs("./audio/raw", exist_ok=True)
    all_audio_chunks = []
    for i, text in enumerate(texts):
        gen = tts_model.tts(
            text=text,
            voice_samples=voice_samples,
            conditioning_latents=conditioning_latents,
        )
        audio = gen.squeeze(0).cpu()
        torchaudio.save(f"./audio/raw/{i}.wav", audio, 24000)
        all_audio_chunks.append(audio)
    # concatenate all audio chunks into one file
    full_audio = torch.cat(all_audio_chunks, dim=-1)
    torchaudio.save(f"./audio/raw/{paper_name}.wav", full_audio, 24000)
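
# Note: the paper name is only used to build file paths. For a hypothetical name
# "my_paper", tts("my_paper") reads ./llm/scripts/my_paper.txt and writes the
# per-chunk files ./audio/raw/0.wav, 1.wav, ... plus ./audio/raw/my_paper.wav.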

def rvc(paper_name: str):
    output_file_name = f"./audio/processed/{paper_name}.wav"
    input_file_name = f"./audio/raw/{paper_name}.wav"
    model_path = "GlaDOS/glados2333333.pth"
    index_path = "./Mangio-RVC-Fork/logs/GlaDOS/added_IVF2170_Flat_nprobe_1.index"
    os.makedirs("./audio/processed", exist_ok=True)
    # drive the interactive Mangio-RVC CLI over stdin: start it, select infer mode,
    # then pass the model, input, output, and index paths on a single line
    process = subprocess.Popen(
        ["make", "run-cli"],
        cwd=PATH_TO_RVC,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    process.stdin.write(b"go infer\n")
    process.stdin.flush()
    args = (
        f"{model_path} {input_file_name} {output_file_name} {index_path}\n"
    ).encode()
    print(args)
    process.stdin.write(args)
    process.stdin.flush()
    stdout, stderr = process.communicate()
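
# Note: communicate() blocks until the RVC CLI exits; its stdout/stderr are
# captured but not inspected here. The "go infer" prompt and the argument order
# above are assumed from the Mangio-RVC-Fork interactive CLI and may vary
# between versions of that fork.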

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--file_path", type=str, required=True)
    args = parser.parse_args()
    file_path = args.file_path
    # get paper name from file path
    paper_name_ext = os.path.basename(file_path)
    paper_name = os.path.splitext(paper_name_ext)[0]
    tts(paper_name)
    rvc(paper_name)


if __name__ == "__main__":
    main()
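
# Example invocation (hypothetical path):
#   python tts_inference.py --file_path ./llm/scripts/my_paper.txt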