diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..0cdf809 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,3 @@ +.github +docker-compose.yml +Dockerfile \ No newline at end of file diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml new file mode 100644 index 0000000..7585202 --- /dev/null +++ b/.github/workflows/publish.yml @@ -0,0 +1,44 @@ +--- +# This workflow uses actions that are not certified by GitHub. +# They are provided by a third-party and are governed by +# separate terms of service, privacy policy, and support +# documentation. + +# GitHub recommends pinning actions to a commit SHA. +# To get a newer version, you will need to update the SHA. +# You can also reference a tag or branch, but the action may change without warning. + +name: Publish Docker image + +on: + release: + types: [published] + +jobs: + push_to_registry: + name: Push Docker image to Docker Hub + runs-on: ubuntu-latest + steps: + - name: Check out the repo + uses: actions/checkout@v3 + + - name: Log in to Docker Hub + uses: docker/login-action@f4ef78c080cd8ba55a85445d5b36e214a81df20a + with: + username: ${{ secrets.DOCKER_USERNAME }} + password: ${{ secrets.DOCKER_PASSWORD }} + + - name: Extract metadata (tags, labels) for Docker + id: meta + uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7 + with: + images: samboo/wyoming-tts + + - name: Build and push Docker image + uses: docker/build-push-action@3b5e8027fcad23fda98b2e3ac259d8d67585f671 + with: + context: . 
+ file: ./Dockerfile + push: true + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..bd8e516 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,15 @@ +FROM python:3.7 + +WORKDIR /app + +RUN mkdir /data && mkdir -p /root/.local/share && ln -s /data /root/.local/share/tts + +COPY requirements.txt requirements.txt + +RUN pip3 install --no-cache-dir -r requirements.txt + +COPY . . + +VOLUME [ "/data" ] + +ENTRYPOINT ["python3", "wyoming_tts"] \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..19395e2 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2023 Sam Büth + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..ecb6bf8 --- /dev/null +++ b/README.md @@ -0,0 +1,13 @@ +# wyoming TTS + +coqui-ai TTS Wyoming protocol implementation. 
async def main() -> None:
    """Parse the CLI options, load the coqui-ai TTS voice and serve it over Wyoming.

    Behaviour by option:

    * Without ``--voice`` the available model names are logged and the
      function returns without starting a server.
    * If the loaded model is multi-lingual and ``--language`` is missing, or
      multi-speaker and ``--speaker`` is missing, the valid choices are logged
      and the process exits with status 1.
    * Otherwise an ``AsyncServer`` is created from ``--uri`` and run with
      ``PiperEventHandler`` as the per-connection handler.

    Raises:
        SystemExit: with status 1 when a required ``--language`` or
            ``--speaker`` option is missing for the selected model.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--voice",
        default=None,
        help="The Voice to use for TTS",
    )
    parser.add_argument(
        "--speaker",
        help="Set the target speaker",
    )
    parser.add_argument(
        "--language",
        help="Set the target language",
    )
    parser.add_argument("--samples-per-chunk", type=int, default=1024)
    parser.add_argument("--uri", required=True, help="unix:// or tcp://")
    parser.add_argument("--debug", action="store_true", help="Log DEBUG messages")
    args = parser.parse_args()
    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)

    if args.voice is None:
        # Nothing to serve: list the choices and finish normally (exit status 0).
        _LOGGER.info("The following voices are available (specify with --voice [model_name]): ")
        _LOGGER.info("\n".join(TTS.list_models()))
        return

    tts = TTS(args.voice)

    if tts.is_multi_lingual and args.language is None:
        _LOGGER.error("The following languages are available (specify with --language [lang]): ")
        _LOGGER.info("\n".join(tts.languages))
        # Missing mandatory option: report failure to the caller / container runtime.
        raise SystemExit(1)
    if tts.is_multi_speaker and args.speaker is None:
        _LOGGER.error("The following speakers are available (specify with --speaker [speaker]): ")
        _LOGGER.info("\n".join(tts.speakers))
        raise SystemExit(1)

    if tts.is_multi_lingual:
        languages = tts.languages
    else:
        # Single-language models carry the language as the second component of
        # the model name, e.g. "tts_models/de/thorsten/vits" -> "de".
        name_parts = args.voice.split("/")
        if len(name_parts) > 1:
            language = name_parts[1]
            _LOGGER.info("Using language: %s", language)
            languages = [language]
        else:
            # A name without "/" has no language component; advertise none
            # instead of crashing with an IndexError.
            _LOGGER.warning("Could not derive a language from voice: %s", args.voice)
            languages = []

    _LOGGER.info("TTS ready")

    attribution = Attribution(name="coqui-ai", url="https://github.com/coqui-ai/TTS")
    # Preparation for multi speaker support in wyoming event: today exactly one
    # voice entry is advertised, named after --speaker or "Default".
    speaker_names = [args.speaker] if tts.is_multi_speaker else ["Default"]

    wyoming_info = Info(
        tts=[
            TtsProgram(
                name="coqui-ai TTS",
                attribution=attribution,
                installed=True,
                voices=[
                    TtsVoice(
                        name=speaker,
                        attribution=attribution,
                        installed=True,
                        languages=languages,
                    )
                    for speaker in speaker_names
                ],
            )
        ],
    )

    server = AsyncServer.from_uri(args.uri)
    _LOGGER.info("Ready")
    # The server supplies the remaining handler arguments per connection;
    # the info, CLI args and model are bound here.
    await server.run(
        partial(
            PiperEventHandler,
            wyoming_info,
            args,
            tts,
        )
    )
class PiperEventHandler(AsyncEventHandler):
    """Per-connection Wyoming handler that answers Describe and Synthesize events.

    A Describe event is answered with the pre-built info event. A Synthesize
    event is rendered to a WAV file with the shared ``TTS`` instance and
    streamed back as ``AudioStart``, a series of ``AudioChunk`` events and a
    final ``AudioStop``.
    """

    def __init__(
        self,
        wyoming_info: Info,
        cli_args: argparse.Namespace,
        tts: TTS,
        *args,
        **kwargs,
    ) -> None:
        """Store the shared state and forward the remaining arguments to the base class.

        Args:
            wyoming_info: Service description; converted to an event once here
                so it is not rebuilt for every Describe request.
            cli_args: Parsed command line; ``language``, ``speaker`` and
                ``samples_per_chunk`` are read from it while handling events.
            tts: Loaded TTS model used for synthesis.
            *args: Passed through unchanged to ``AsyncEventHandler``.
            **kwargs: Passed through unchanged to ``AsyncEventHandler``.
        """
        super().__init__(*args, **kwargs)
        self.cli_args = cli_args
        self.wyoming_info_event = wyoming_info.event()
        self.tts = tts

    async def handle_event(self, event: Event) -> bool:
        """Handle one client event.

        Args:
            event: The event received from the client.

        Returns:
            Always ``True``. NOTE(review): presumably this tells the server to
            keep the connection open - confirm against ``AsyncEventHandler``.
        """
        # Function-scope imports: only this method needs them.
        import os
        import tempfile

        if Describe.is_type(event.type):
            await self.write_event(self.wyoming_info_event)
            _LOGGER.debug("Sent info")
            return True

        if not Synthesize.is_type(event.type):
            _LOGGER.warning("Unexpected event: %s", event)
            return True

        synthesize = Synthesize.from_event(event)
        text = synthesize.text.strip()
        _LOGGER.debug(event)

        # Only pass language/speaker when the model supports the option.
        tts_args = {}
        if self.tts.is_multi_lingual:
            tts_args["language"] = self.cli_args.language
        if self.tts.is_multi_speaker:
            tts_args["speaker"] = self.cli_args.speaker

        # Use a private temporary directory per request so concurrent clients
        # never overwrite each other's output, and the file is removed afterwards.
        # NOTE(review): tts_to_file is a synchronous call inside a coroutine and
        # presumably blocks the event loop while synthesizing - confirm whether
        # offloading to an executor is wanted.
        with tempfile.TemporaryDirectory() as tmp_dir:
            output_path = os.path.join(tmp_dir, "output.wav")
            self.tts.tts_to_file(text, **tts_args, file_path=output_path)
            with wave.open(output_path, "rb") as wav_file:
                rate = wav_file.getframerate()
                width = wav_file.getsampwidth()
                channels = wav_file.getnchannels()
                audio_bytes = wav_file.readframes(wav_file.getnframes())

        await self.write_event(
            AudioStart(
                rate=rate,
                width=width,
                channels=channels,
            ).event(),
        )

        # Clamp to at least one sample per chunk: zero would divide by zero and
        # a negative value would silently send no audio at all.
        samples_per_chunk = max(1, self.cli_args.samples_per_chunk)
        bytes_per_chunk = width * channels * samples_per_chunk

        # Split into chunks
        for offset in range(0, len(audio_bytes), bytes_per_chunk):
            await self.write_event(
                AudioChunk(
                    audio=audio_bytes[offset : offset + bytes_per_chunk],
                    rate=rate,
                    width=width,
                    channels=channels,
                ).event(),
            )

        await self.write_event(AudioStop().event())
        _LOGGER.debug("Completed request")

        return True