Initial commit

BuethSam · May 11, 2023 · 1f84fe5 · 1f84fe5
commit 1f84fe5
Show file tree

Hide file tree

Showing 10 changed files with 298 additions and 0 deletions.
diff --git a/.dockerignore b/.dockerignore
@@ -0,0 +1,3 @@
+.github
+docker-compose.yml
+Dockerfile
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
@@ -0,0 +1,44 @@
+---
+# This workflow uses actions that are not certified by GitHub.
+# They are provided by a third-party and are governed by
+# separate terms of service, privacy policy, and support
+# documentation.
+
+# GitHub recommends pinning actions to a commit SHA.
+# To get a newer version, you will need to update the SHA.
+# You can also reference a tag or branch, but the action may change without warning.
+
+name: Publish Docker image
+
+on:
+ release:
+ types: [published]
+
+jobs:
+ push_to_registry:
+ name: Push Docker image to Docker Hub
+ runs-on: ubuntu-latest
+ steps:
+ - name: Check out the repo
+ uses: actions/checkout@v3
+
+ - name: Log in to Docker Hub
+ uses: docker/login-action@f4ef78c080cd8ba55a85445d5b36e214a81df20a
+ with:
+ username: ${{ secrets.DOCKER_USERNAME }}
+ password: ${{ secrets.DOCKER_PASSWORD }}
+
+ - name: Extract metadata (tags, labels) for Docker
+ id: meta
+ uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7
+ with:
+ images: samboo/wyoming-tts
+
+ - name: Build and push Docker image
+ uses: docker/build-push-action@3b5e8027fcad23fda98b2e3ac259d8d67585f671
+ with:
+ context: .
+ file: ./Dockerfile
+ push: true
+ tags: ${{ steps.meta.outputs.tags }}
+ labels: ${{ steps.meta.outputs.labels }}
diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,15 @@
+# Image for the Wyoming TTS server; the container runs the wyoming_tts directory as a script.
+FROM python:3.7
+
+WORKDIR /app
+
+# Create /data and make /root/.local/share/tts a symlink to it, so anything written
+# under that path lands in the /data volume declared below.
+# NOTE(review): presumably this is where the TTS library stores downloaded models — confirm.
+RUN mkdir /data && mkdir -p /root/.local/share && ln -s /data /root/.local/share/tts
+
+# Copy only the requirements file first so dependencies are installed before the
+# application source is added.
+COPY requirements.txt requirements.txt
+
+RUN pip3 install --no-cache-dir -r requirements.txt
+
+COPY . .
+
+VOLUME [ "/data" ]
+
+# Arguments passed to the container (e.g. --uri, --voice) are appended to this command.
+ENTRYPOINT ["python3", "wyoming_tts"]
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2023 Sam Büth
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
@@ -0,0 +1,13 @@
+# wyoming TTS
+
+coqui-ai TTS Wyoming protocol implementation. 
+
+## TODO
+
+- [ ] Multi-lingual and multi-speaker selection via wyoming protocol (currently not transmitted by home-assistant)
+
+- [ ] GPU support
+
+## Contributions
+
+Pull requests are very welcome.
diff --git a/docker-compose.yml b/docker-compose.yml
@@ -0,0 +1,13 @@
+version: '3.0'
+
+services:
+ tts:
+ image: samboo/wyoming-tts
+ restart: always
+ command: --uri tcp://0.0.0.0:10201 --voice tts_models/de/thorsten/vits
+ environment:
+ - COQUI_STUDIO_TOKEN= #optional
+ volumes:
+ - ./tts:/data
+ ports:
+ - 10201:10201
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,2 @@
+wyoming==0.0.1
+tts~=0.13.3
diff --git a/wyoming_tts/__init__.py b/wyoming_tts/__init__.py
@@ -0,0 +1 @@
+"""Wyoming server for tts."""
diff --git a/wyoming_tts/__main__.py b/wyoming_tts/__main__.py
@@ -0,0 +1,98 @@
+#!/usr/bin/env python3
+import argparse
+import asyncio
+import logging
+from functools import partial
+
+from TTS.api import TTS
+from wyoming.info import Attribution, Info, TtsProgram, TtsVoice
+from wyoming.server import AsyncServer
+
+from handler import PiperEventHandler
+
+_LOGGER = logging.getLogger(__name__)
+
+async def main() -> None:
+ """Main entry point."""
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--voice",
+ default=None,
+ help="The Voice to use for TTS",
+ )
+ parser.add_argument(
+ "--speaker",
+ help="Set the target speaker",
+ )
+ parser.add_argument(
+ "--language",
+ help="Set the target language",
+ )
+ parser.add_argument("--samples-per-chunk", type=int, default=1024)
+ parser.add_argument("--uri", required=True, help="unix:// or tcp://")
+ parser.add_argument("--debug", action="store_true", help="Log DEBUG messages")
+ args = parser.parse_args()
+ logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)
+
+ if (args.voice == None): 
+ _LOGGER.info("The following voices are available (specify with --voice [model_name]): ")
+ _LOGGER.info("\n".join(TTS.list_models()))
+ exit()
+
+ tts = TTS(args.voice)
+
+ if (tts.is_multi_lingual and args.language is None): 
+ _LOGGER.error("The following languages are available (specify with --language [lang]): ")
+ _LOGGER.info("\n".join(tts.languages))
+ exit()
+ if (tts.is_multi_speaker and args.speaker is None):
+ _LOGGER.error("The following speakers are available (specify with --speakers [speaker]): ")
+ _LOGGER.info("\n".join(tts.speakers))
+ exit()
+
+ language = None
+ if (tts.is_multi_lingual is False):
+ language = args.voice.split("/")[1]
+ _LOGGER.info("Using language: %s", language)
+
+ _LOGGER.info("TTS ready")
+
+ wyoming_info = Info(
+ tts=[
+ TtsProgram(
+ name="coqui-ai TTS",
+ attribution=Attribution(
+ name="coqui-ai", url="https://github.com/coqui-ai/TTS"
+ ),
+ installed=True,
+ voices=[
+ TtsVoice(
+ name=speaker,
+ attribution=Attribution(
+ name="coqui-ai", url="https://github.com/coqui-ai/TTS"
+ ),
+ installed=True,
+ languages=tts.languages if tts.is_multi_lingual else [language],
+ ) for speaker in ([args.speaker] if tts.is_multi_speaker else ["Default"]) # Preparation for multi speaker support in wyoming event
+ ],
+ )
+ ],
+ )
+
+ server = AsyncServer.from_uri(args.uri)
+ _LOGGER.info("Ready")
+ await server.run(
+ partial(
+ PiperEventHandler,
+ wyoming_info,
+ args,
+ tts
+ )
+ )
+
+
+
+# -----------------------------------------------------------------------------
+
+# Start the asyncio event loop only when this file is executed as a script.
+if __name__ == "__main__":
+ asyncio.run(main())
diff --git a/wyoming_tts/handler.py b/wyoming_tts/handler.py
@@ -0,0 +1,88 @@
+"""Event handler for clients of the server."""
+import argparse
+import logging
+import math
+import os
+import tempfile
+import wave
+
+from TTS.api import TTS
+from wyoming.audio import AudioChunk, AudioStart, AudioStop
+from wyoming.event import Event
+from wyoming.info import Describe, Info
+from wyoming.server import AsyncEventHandler
+from wyoming.tts import Synthesize
+
+_LOGGER = logging.getLogger(__name__)
+
+class PiperEventHandler(AsyncEventHandler):
+ def __init__(
+ self,
+ wyoming_info: Info,
+ cli_args: argparse.Namespace,
+ tts: TTS,
+ *args,
+ ) -> None:
+ super().__init__(*args)
+ self.cli_args = cli_args
+ self.wyoming_info_event = wyoming_info.event()
+ self.tts = tts
+
+ async def handle_event(self, event: Event) -> bool:
+ if Describe.is_type(event.type):
+ await self.write_event(self.wyoming_info_event)
+ _LOGGER.debug("Sent info")
+ return True
+
+ if not Synthesize.is_type(event.type):
+ _LOGGER.warning("Unexpected event: %s", event)
+ return True
+ synthesize = Synthesize.from_event(event)
+ raw_text = synthesize.text
+ text = raw_text.strip()
+
+ output_path = "/tmp/output.wav"
+ _LOGGER.debug(event)
+ tts_args = dict()
+ if (self.tts.is_multi_lingual):
+ tts_args["language"] = self.cli_args.language
+
+ if (self.tts.is_multi_speaker):
+ tts_args["speaker"] = self.cli_args.speaker
+ self.tts.tts_to_file(text, **tts_args, file_path=output_path)
+ wav_file: wave.Wave_read = wave.open(output_path, "rb")
+ with wav_file:
+ rate = wav_file.getframerate()
+ width = wav_file.getsampwidth()
+ channels = wav_file.getnchannels()
+
+ await self.write_event(
+ AudioStart(
+ rate=rate,
+ width=width,
+ channels=channels,
+ ).event(),
+ )
+
+ # Audio
+ audio_bytes = wav_file.readframes(wav_file.getnframes())
+ bytes_per_sample = width * channels
+ bytes_per_chunk = bytes_per_sample * self.cli_args.samples_per_chunk
+ num_chunks = int(math.ceil(len(audio_bytes) / bytes_per_chunk))
+
+ # Split into chunks
+ for i in range(num_chunks):
+ offset = i * bytes_per_chunk
+ chunk = audio_bytes[offset : offset + bytes_per_chunk]
+ await self.write_event(
+ AudioChunk(
+ audio=chunk,
+ rate=rate,
+ width=width,
+ channels=channels,
+ ).event(),
+ )
+
+ await self.write_event(AudioStop().event())
+ _LOGGER.debug("Completed request")
+
+
+ return True