def main() -> None:
    """Download a Silero TTS model package into the local data directory.

    The upstream ``models.yml`` index is fetched first, the requested
    language/model pair is looked up in its ``tts_models`` section, and the
    referenced package is saved as
    ``<data dir>/models/<language>/<model>.pt``.

    Command-line arguments:
        --language: Key under ``tts_models`` in the index (required).
        --model: Model key under the chosen language (required).
        --destination: Data directory. The model is stored in its ``models``
            sub-directory, which is the layout ``silero.py`` and
            ``silero_server.py`` read from when given the same
            ``--destination``. Defaults to ``data/tts/silero_tts`` four
            levels above this script's directory.

    Exits with an argparse usage error (status 2) when the language or model
    is not present in the index, or the model has no downloadable package.
    """
    models_index_url = (
        "https://raw.githubusercontent.com/snakers4/silero-models/master/models.yml"
    )

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--language", required=True, help="Voice language to download"
    )
    parser.add_argument("--model", required=True, help="Model to download")
    parser.add_argument(
        "--destination",
        help=(
            "Path to data directory; models are saved to "
            "<destination>/models/<language>/ "
            "(default: data/tts/silero_tts)"
        ),
    )
    args = parser.parse_args()
    logging.basicConfig(level=logging.INFO)

    # The index is only needed long enough to be parsed, so it lives in a
    # temporary file that is removed as soon as the block exits.
    with tempfile.NamedTemporaryFile() as index_file:
        torch.hub.download_url_to_file(
            models_index_url, index_file.name, progress=False
        )
        models = OmegaConf.load(index_file.name)

    # Validate the request up front so a typo produces a readable message
    # listing the valid choices instead of a config lookup traceback.
    tts_models = models.tts_models
    available_languages = list(tts_models.keys())
    if args.language not in available_languages:
        parser.error(
            f"Unknown language '{args.language}'. "
            f"Available languages: {available_languages}"
        )

    language_models = tts_models.get(args.language)
    available_models = list(language_models.keys())
    if args.model not in available_models:
        parser.error(
            f"Unknown model '{args.model}' for language '{args.language}'. "
            f"Available models: {available_models}"
        )

    latest = language_models.get(args.model).get("latest")
    package_url = latest.get("package") if latest is not None else None
    if not package_url:
        parser.error(
            f"Model '{args.model}' has no downloadable package in the index"
        )

    if args.destination:
        data_path = Path(args.destination)
    else:
        data_path = _DIR.parent.parent.parent.parent / "data" / "tts" / "silero_tts"

    # Same layout the synthesis programs load from:
    # <data dir>/models/<language>/<model>.pt
    model_dir = data_path / "models" / args.language
    model_dir.mkdir(parents=True, exist_ok=True)
    model_file = model_dir / f"{args.model}.pt"

    # download_url_to_file documents ``dst`` as a string path.
    torch.hub.download_url_to_file(package_url, str(model_file))
    _LOGGER.info("Saved model '%s' to %s", args.model, model_file)
class StreamToLogger(object):
    """File-like object that forwards everything written to it to a logger.

    ``main()`` installs an instance as ``sys.stdout`` while protocol events
    are written to the saved, real stdout.
    """

    def __init__(self, logger, log_level=logging.INFO):
        """Remember the target logger and the level used for each line."""
        self.logger = logger
        self.log_level = log_level
        # Kept for backward compatibility; nothing in this class reads it.
        self.linebuf = ""

    def write(self, buf):
        """Log every line of ``buf`` and return the number of characters taken.

        Trailing whitespace is stripped first, so a bare newline (as sent by
        ``print`` after its text) produces no log record.
        """
        for line in buf.rstrip().splitlines():
            self.logger.log(self.log_level, line.rstrip())
        return len(buf)

    def flush(self):
        """Do nothing; records are handed to the logger as soon as written."""
        pass


def _str_to_bool(value: str) -> bool:
    """Parse a command-line boolean such as ``true``/``false`` or ``1``/``0``."""
    lowered = value.strip().lower()
    if lowered in ("1", "true", "yes", "on"):
        return True
    if lowered in ("0", "false", "no", "off"):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


def _iter_chunks(data: bytes, chunk_size: int):
    """Yield consecutive ``chunk_size``-byte slices of ``data`` (last may be shorter)."""
    for offset in range(0, len(data), chunk_size):
        yield data[offset : offset + chunk_size]


def main() -> None:
    """Synthesize speech for ``Synthesize`` events read from standard input.

    Loads the Silero model ``<data dir>/models/<language>/<model>.pt`` on the
    CPU, then for each synthesize event writes an ``AudioStart`` event, the
    audio as 16-bit mono ``AudioChunk`` events, and a final ``AudioStop``
    event to the real standard output. Returns when ``read_event`` yields
    ``None`` or on Ctrl-C.

    Exits with an argparse usage error (status 2) if the model file does not
    exist or ``--samples-per-chunk`` is not positive.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--language", required=True, help="Language to use")
    parser.add_argument("--model", required=True, help="Model to use")
    # type=int: without it a value given on the command line stays a string
    # and is passed as such to the model and into the audio events.
    parser.add_argument("--sample_rate", type=int, default=48000, help="Sample rate")
    parser.add_argument("--speaker", help="Voice to use", default="random")
    # Parsed explicitly: any non-empty string (including "false") is truthy,
    # so without a converter these options could never be switched off.
    parser.add_argument(
        "--put_accent", type=_str_to_bool, default=True, help="Add accent"
    )
    parser.add_argument("--put_yo", type=_str_to_bool, default=True, help="Put Yo")
    parser.add_argument(
        "--samples-per-chunk", type=int, default=DEFAULT_SAMPLES_PER_CHUNK
    )
    parser.add_argument("--debug", action="store_true", help="Log DEBUG messages")
    parser.add_argument("--destination", help="Path to destination directory")
    parser.add_argument("--voice", help="Saved voice model")
    args = parser.parse_args()

    if args.samples_per_chunk <= 0:
        # A zero chunk size would never consume the audio buffer.
        parser.error("--samples-per-chunk must be a positive integer")

    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)

    if args.destination:
        data_path = Path(args.destination)
    else:
        data_path = _DIR.parent.parent.parent.parent / "data" / "tts" / "silero_tts"

    model_path = data_path / "models" / args.language / f"{args.model}.pt"
    if not model_path.is_file():
        parser.error(
            f"Model file not found: {model_path} (download it with script/download)"
        )

    model_params = {
        "speaker": args.speaker,
        "sample_rate": args.sample_rate,
        "put_accent": args.put_accent,
        "put_yo": args.put_yo,
    }

    if args.voice:
        voice_path = Path(args.voice)
        # Relative voice names are resolved inside the data directory.
        if not voice_path.is_absolute():
            voice_path = data_path / "voices" / voice_path
        model_params["voice_path"] = voice_path

    # Events go to the real stdout; anything printed through sys.stdout from
    # here on is routed to the logger instead.
    real_stdout = sys.stdout
    sys.stdout = StreamToLogger(_LOGGER, logging.INFO)
    try:
        device = torch.device("cpu")
        model = torch.package.PackageImporter(model_path).load_pickle(
            "tts_models", "model"
        )
        model.to(device)

        width = 2  # bytes per sample (int16)
        channels = 1
        rate = args.sample_rate
        bytes_per_chunk = args.samples_per_chunk * width * channels

        # Listen for events
        try:
            while True:
                event = read_event()
                if event is None:
                    break

                if not Synthesize.is_type(event.type):
                    continue

                synthesize = Synthesize.from_event(event)
                _LOGGER.debug("synthesize: text='%s'", synthesize.text)

                audio = model.apply_tts(text=synthesize.text, **model_params)

                timestamp = 0
                start_event = AudioStart(rate, width, channels, timestamp=timestamp)
                write_event(start_event.event(), real_stdout.buffer)
                _LOGGER.debug(start_event)

                # Scale to 16-bit PCM. Assumes float samples nominally in
                # [-1, 1]; clipping keeps any overshoot from wrapping around
                # in the int16 cast.
                scaled = (32767 * audio).numpy()
                audio_bytes = np.clip(scaled, -32768, 32767).astype(np.int16).tobytes()

                for chunk_bytes in _iter_chunks(audio_bytes, bytes_per_chunk):
                    chunk = AudioChunk(
                        rate,
                        width,
                        channels,
                        chunk_bytes,
                        timestamp=timestamp,
                    )
                    write_event(chunk.event(), real_stdout.buffer)
                    timestamp += chunk.milliseconds

                write_event(AudioStop(timestamp=timestamp).event(), real_stdout.buffer)
        except KeyboardInterrupt:
            pass
    finally:
        sys.stdout = real_stdout
def _str_to_bool(value: str) -> bool:
    """Parse a command-line boolean such as ``true``/``false`` or ``1``/``0``."""
    lowered = value.strip().lower()
    if lowered in ("1", "true", "yes", "on"):
        return True
    if lowered in ("0", "false", "no", "off"):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


def _handle_client(conn_file, model, model_params, rate, bytes_per_chunk) -> None:
    """Answer the first ``synthesize`` event on ``conn_file`` with audio events.

    Reads one JSON object per line and skips events of any other type.
    Returns without writing anything if the client closes the connection
    before sending a synthesize event.
    """
    width = 2  # bytes per sample (int16)
    channels = 1

    while True:
        line = conn_file.readline()
        if not line:
            # EOF: the client went away. This is a normal way for a
            # connection to end, not an error worth a traceback.
            return

        event_info = json.loads(line)
        if event_info["type"] != "synthesize":
            continue

        raw_text = event_info["data"]["text"]
        text = raw_text.strip()
        _LOGGER.debug("synthesize: raw_text=%s, text='%s'", raw_text, text)

        audio = model.apply_tts(text=text, **model_params)

        timestamp = 0
        write_event(
            AudioStart(rate, width, channels, timestamp=timestamp).event(), conn_file
        )

        # Scale to 16-bit PCM. Assumes float samples nominally in [-1, 1];
        # clipping keeps any overshoot from wrapping in the int16 cast.
        scaled = (32767 * audio).numpy()
        audio_bytes = np.clip(scaled, -32768, 32767).astype(np.int16).tobytes()

        for offset in range(0, len(audio_bytes), bytes_per_chunk):
            chunk = AudioChunk(
                rate,
                width,
                channels,
                audio_bytes[offset : offset + bytes_per_chunk],
                timestamp=timestamp,
            )
            write_event(chunk.event(), conn_file)
            timestamp += chunk.milliseconds

        write_event(AudioStop(timestamp=timestamp).event(), conn_file)
        # One synthesize request is served per connection.
        return


def main() -> None:
    """Serve Silero TTS over a Unix domain socket.

    Loads the model ``<data dir>/models/<language>/<model>.pt`` once on the
    CPU, binds ``--socketfile`` and then serves one synthesize request per
    client connection until interrupted with Ctrl-C. The socket file is
    removed on exit.

    Exits with an argparse usage error (status 2) if the model file does not
    exist or ``--samples-per-chunk`` is not positive.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--language", required=True, help="Language to use")
    parser.add_argument("--model", required=True, help="Model to use")
    # type=int: without it a value given on the command line stays a string
    # and is passed as such to the model and into the audio events.
    parser.add_argument("--sample_rate", type=int, default=48000, help="Sample rate")
    parser.add_argument("--speaker", help="Voice to use", default="random")
    # Parsed explicitly: any non-empty string (including "false") is truthy,
    # so without a converter these options could never be switched off.
    parser.add_argument(
        "--put_accent", type=_str_to_bool, default=True, help="Add accent"
    )
    parser.add_argument("--put_yo", type=_str_to_bool, default=True, help="Put Yo")
    parser.add_argument(
        "--socketfile", required=True, help="Path to Unix domain socket file"
    )
    parser.add_argument(
        "--samples-per-chunk", type=int, default=DEFAULT_SAMPLES_PER_CHUNK
    )
    parser.add_argument("--debug", action="store_true", help="Log DEBUG messages")
    parser.add_argument(
        "--destination",
        help="Path to data directory (default: data/tts/silero_tts)",
    )
    parser.add_argument("--voice", help="Saved voice model")
    args = parser.parse_args()

    if args.samples_per_chunk <= 0:
        # A zero chunk size would never consume the audio buffer.
        parser.error("--samples-per-chunk must be a positive integer")

    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)

    if args.destination:
        data_path = Path(args.destination)
    else:
        data_path = _DIR.parent.parent.parent.parent / "data" / "tts" / "silero_tts"

    model_path = data_path / "models" / args.language / f"{args.model}.pt"
    if not model_path.is_file():
        parser.error(
            f"Model file not found: {model_path} (download it with script/download)"
        )

    model_params = {
        "speaker": args.speaker,
        "sample_rate": args.sample_rate,
        "put_accent": args.put_accent,
        "put_yo": args.put_yo,
    }

    if args.voice:
        voice_path = Path(args.voice)
        # Relative voice names are resolved inside the data directory.
        if not voice_path.is_absolute():
            voice_path = data_path / "voices" / voice_path
        model_params["voice_path"] = voice_path

    # Load the model before touching the socket so a load failure leaves
    # nothing behind to clean up.
    device = torch.device("cpu")
    model = torch.package.PackageImporter(model_path).load_pickle(
        "tts_models", "model"
    )
    model.to(device)

    rate = args.sample_rate
    bytes_per_chunk = args.samples_per_chunk * 2  # int16 mono

    # Need to unlink socket if it exists
    try:
        os.unlink(args.socketfile)
    except OSError:
        pass

    try:
        # Create socket server; the context manager closes it on exit.
        with socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) as sock:
            sock.bind(args.socketfile)
            sock.listen()

            # Listen for connections
            while True:
                try:
                    connection, client_address = sock.accept()
                    _LOGGER.debug("Connection from %s", client_address)
                    with connection, connection.makefile(mode="rwb") as conn_file:
                        _handle_client(
                            conn_file, model, model_params, rate, bytes_per_chunk
                        )
                except KeyboardInterrupt:
                    break
                except Exception:
                    # Keep serving other clients after a per-connection failure.
                    _LOGGER.exception("Error communicating with socket client")
    finally:
        # The file may not exist if bind() itself failed; do not let the
        # cleanup raise and hide the original error.
        try:
            os.unlink(args.socketfile)
        except OSError:
            pass
+base_dir="$(realpath "${this_dir}/..")" + +# Path to virtual environment +: "${venv:=${base_dir}/.venv}" + +if [ -d "${venv}" ]; then + source "${venv}/bin/activate" +fi + +python3 "${base_dir}/bin/download.py" "$@" diff --git a/programs/tts/silero_tts/script/list_models b/programs/tts/silero_tts/script/list_models new file mode 100755 index 0000000..c21f267 --- /dev/null +++ b/programs/tts/silero_tts/script/list_models @@ -0,0 +1,17 @@ +#!/usr/bin/env bash +set -eo pipefail + +# Directory of *this* script +this_dir="$( cd "$( dirname "$0" )" && pwd )" + +# Base directory of repo +base_dir="$(realpath "${this_dir}/..")" + +# Path to virtual environment +: "${venv:=${base_dir}/.venv}" + +if [ -d "${venv}" ]; then + source "${venv}/bin/activate" +fi + +python3 "${base_dir}/bin/list_models.py" "$@" diff --git a/programs/tts/silero_tts/script/server b/programs/tts/silero_tts/script/server new file mode 100755 index 0000000..f101459 --- /dev/null +++ b/programs/tts/silero_tts/script/server @@ -0,0 +1,20 @@ +#!/usr/bin/env bash +set -eo pipefail + +# Directory of *this* script +this_dir="$( cd "$( dirname "$0" )" && pwd )" + +# Base directory of repo +base_dir="$(realpath "${this_dir}/..")" + +# Path to virtual environment +: "${venv:=${base_dir}/.venv}" + +if [ -d "${venv}" ]; then + source "${venv}/bin/activate" +fi + +socket_dir="${base_dir}/var/run" +mkdir -p "${socket_dir}" + +python3 "${base_dir}/bin/silero_server.py" --socketfile "${socket_dir}/silero_tts.socket" "$@" diff --git a/programs/tts/silero_tts/script/setup b/programs/tts/silero_tts/script/setup new file mode 100755 index 0000000..0ad8ca3 --- /dev/null +++ b/programs/tts/silero_tts/script/setup @@ -0,0 +1,37 @@ +#!/usr/bin/env bash +set -eo pipefail + +# Directory of *this* script +this_dir="$( cd "$( dirname "$0" )" && pwd )" + +# Base directory of repo +base_dir="$(realpath "${this_dir}/..")" + +# Path to virtual environment +: "${venv:=${base_dir}/.venv}" + +# Python binary to use +: 
"${PYTHON=python3}" + +python_version="$(${PYTHON} --version)" + +if [ ! -d "${venv}" ]; then + # Create virtual environment + echo "Creating virtual environment at ${venv} (${python_version})" + rm -rf "${venv}" + "${PYTHON}" -m venv "${venv}" + source "${venv}/bin/activate" + + pip3 install --upgrade pip + pip3 install --upgrade wheel setuptools +else + source "${venv}/bin/activate" +fi + +# Install Python dependencies +echo 'Installing Python dependencies' +pip3 install -r "${base_dir}/requirements.txt" + +# ----------------------------------------------------------------------------- + +echo "OK" diff --git a/programs/tts/silero_tts/script/silero b/programs/tts/silero_tts/script/silero new file mode 100755 index 0000000..7ab3f70 --- /dev/null +++ b/programs/tts/silero_tts/script/silero @@ -0,0 +1,17 @@ +#!/usr/bin/env bash +set -eo pipefail + +# Directory of *this* script +this_dir="$( cd "$( dirname "$0" )" && pwd )" + +# Base directory of repo +base_dir="$(realpath "${this_dir}/..")" + +# Path to virtual environment +: "${venv:=${base_dir}/.venv}" + +if [ -d "${venv}" ]; then + source "${venv}/bin/activate" +fi + +python3 "${base_dir}/bin/silero.py" "$@" diff --git a/rhasspy3/configuration.yaml b/rhasspy3/configuration.yaml index 73b068b..8d0b023 100644 --- a/rhasspy3/configuration.yaml +++ b/rhasspy3/configuration.yaml @@ -347,6 +347,18 @@ programs: command: | client_unix_socket.py var/run/mimic3.socket + silero_tts: + command: | + script/silero --language "${language}" --model "${model}" + template_args: + language: "ru" + model: "v3_1_ru" + + # Run server: silero_tts + silero_tts.client: + command: | + client_unix_socket.py var/run/silero_tts.socket + # ------------------ # Intent recognition # ------------------ @@ -504,6 +516,13 @@ servers: template_args: model: "${data_dir}/en-us-blizzard_lessac-medium.onnx" + silero_tts: + command: | + script/server --language "${language}" --model "${model}" + template_args: + language: "ru" + model: "v3_1_ru" + 
larynx: command: | script/server --voices-dir "${data_dir}" --host "${host}"