Skip to content

Commit

Permalink
Initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
BuethSam committed May 11, 2023
0 parents commit 1f84fe5
Show file tree
Hide file tree
Showing 10 changed files with 298 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
.github
docker-compose.yml
Dockerfile
44 changes: 44 additions & 0 deletions .github/workflows/publish.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
---
# This workflow uses actions that are not certified by GitHub.
# They are provided by a third-party and are governed by
# separate terms of service, privacy policy, and support
# documentation.

# GitHub recommends pinning actions to a commit SHA.
# To get a newer version, you will need to update the SHA.
# You can also reference a tag or branch, but the action may change without warning.

name: Publish Docker image

# Runs once for every release that gets published.
on:
  release:
    types:
      - published

jobs:
  push_to_registry:
    name: Push Docker image to Docker Hub
    runs-on: ubuntu-latest
    steps:
      - name: Check out the repo
        uses: actions/checkout@v3

      # Credentials are read from the repository secrets.
      - name: Log in to Docker Hub
        uses: docker/login-action@f4ef78c080cd8ba55a85445d5b36e214a81df20a
        with:
          username: ${{ secrets.DOCKER_USERNAME }}
          password: ${{ secrets.DOCKER_PASSWORD }}

      # The outputs of this step (id: meta) feed the build step below.
      - name: Extract metadata (tags, labels) for Docker
        id: meta
        uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7
        with:
          images: samboo/wyoming-tts

      - name: Build and push Docker image
        uses: docker/build-push-action@3b5e8027fcad23fda98b2e3ac259d8d67585f671
        with:
          context: .
          file: ./Dockerfile
          push: true
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
15 changes: 15 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Image for the wyoming_tts server, built on the official Python 3.7 image.
FROM python:3.7

WORKDIR /app

# Link /root/.local/share/tts to /data so anything written there ends up on
# the /data volume declared below.
# NOTE(review): presumably this is where coqui-ai TTS stores downloaded
# models - confirm against the TTS documentation.
RUN mkdir /data && mkdir -p /root/.local/share && ln -s /data /root/.local/share/tts

# Copy only the dependency list first so the install layer below can be reused
# from the build cache as long as requirements.txt is unchanged.
COPY requirements.txt requirements.txt

RUN pip3 install --no-cache-dir -r requirements.txt

# Copy the rest of the build context (filtered by .dockerignore) into /app.
COPY . .

VOLUME [ "/data" ]

# Run the wyoming_tts directory as a script, which executes its __main__.py.
# Container arguments (e.g. `command:` in docker-compose.yml) are appended.
ENTRYPOINT ["python3", "wyoming_tts"]
21 changes: 21 additions & 0 deletions LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2023 Sam Büth

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
13 changes: 13 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# wyoming TTS

coqui-ai TTS Wyoming protocol implementation.

## TODO

- [ ] Multi-lingual and multi-speaker selection via wyoming protocol (currently not transmitted by home-assistant)

- [ ] GPU support

## Contributions

Pull requests are very welcome.
13 changes: 13 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
version: "3.0"

services:
  tts:
    image: samboo/wyoming-tts
    restart: always
    # Passed as arguments to the image entrypoint; the port in --uri matches
    # the port published below.
    command: --uri tcp://0.0.0.0:10201 --voice tts_models/de/thorsten/vits
    environment:
      - COQUI_STUDIO_TOKEN=  # optional
    volumes:
      # Host directory mounted at the container's /data path.
      - ./tts:/data
    ports:
      - "10201:10201"
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
wyoming==0.0.1
tts~=0.13.3
1 change: 1 addition & 0 deletions wyoming_tts/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""Wyoming server for tts."""
98 changes: 98 additions & 0 deletions wyoming_tts/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
#!/usr/bin/env python3
import argparse
import asyncio
import logging
from functools import partial

from TTS.api import TTS
from wyoming.info import Attribution, Info, TtsProgram, TtsVoice
from wyoming.server import AsyncServer

from handler import PiperEventHandler

# Module-level logger, named after this module.
_LOGGER = logging.getLogger(__name__)

async def main() -> None:
    """Parse the command line, load the TTS voice and run the Wyoming server.

    Without ``--voice`` the available model names are logged and the function
    returns without starting a server. If the loaded model is multi-lingual or
    multi-speaker and the matching ``--language`` / ``--speaker`` option is
    missing, the valid choices are logged and the process exits with status 1.
    Otherwise a server is created from ``--uri`` and run with
    ``PiperEventHandler`` as the handler factory.

    Raises:
        SystemExit: with code 1 when a required ``--language`` or
            ``--speaker`` option is missing.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--voice",
        default=None,
        help="The Voice to use for TTS",
    )
    parser.add_argument(
        "--speaker",
        help="Set the target speaker",
    )
    parser.add_argument(
        "--language",
        help="Set the target language",
    )
    parser.add_argument("--samples-per-chunk", type=int, default=1024)
    parser.add_argument("--uri", required=True, help="unix:// or tcp://")
    parser.add_argument("--debug", action="store_true", help="Log DEBUG messages")
    args = parser.parse_args()
    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)

    if args.voice is None:
        # Nothing to serve yet: list the choices and finish normally.
        _LOGGER.info("The following voices are available (specify with --voice [model_name]): ")
        _LOGGER.info("\n".join(TTS.list_models()))
        return

    tts = TTS(args.voice)

    # A missing mandatory option is a configuration error, so exit non-zero.
    # SystemExit is raised directly because the ``exit()`` helper is only
    # installed by the ``site`` module and is meant for interactive use.
    if tts.is_multi_lingual and args.language is None:
        _LOGGER.error("The following languages are available (specify with --language [lang]): ")
        _LOGGER.info("\n".join(tts.languages))
        raise SystemExit(1)
    if tts.is_multi_speaker and args.speaker is None:
        _LOGGER.error("The following speakers are available (specify with --speaker [speaker]): ")
        _LOGGER.info("\n".join(tts.speakers))
        raise SystemExit(1)

    language = None
    if not tts.is_multi_lingual:
        # Single-language models: take the second "/"-separated component of
        # the voice name, e.g. "de" in "tts_models/de/thorsten/vits".
        language = args.voice.split("/")[1]
        _LOGGER.info("Using language: %s", language)

    _LOGGER.info("TTS ready")

    attribution = Attribution(name="coqui-ai", url="https://github.com/coqui-ai/TTS")
    # Preparation for multi speaker support in wyoming event
    speaker_names = [args.speaker] if tts.is_multi_speaker else ["Default"]
    voice_languages = tts.languages if tts.is_multi_lingual else [language]

    wyoming_info = Info(
        tts=[
            TtsProgram(
                name="coqui-ai TTS",
                attribution=attribution,
                installed=True,
                voices=[
                    TtsVoice(
                        name=speaker,
                        attribution=attribution,
                        installed=True,
                        languages=voice_languages,
                    )
                    for speaker in speaker_names
                ],
            )
        ],
    )

    server = AsyncServer.from_uri(args.uri)
    _LOGGER.info("Ready")
    # The server calls the partial with its own remaining arguments to build
    # handlers; info, CLI options and the loaded model are bound up front.
    await server.run(
        partial(
            PiperEventHandler,
            wyoming_info,
            args,
            tts,
        )
    )



# -----------------------------------------------------------------------------

if __name__ == "__main__":
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        # Ctrl+C is an expected way to stop the server; exit without a
        # traceback.
        pass
88 changes: 88 additions & 0 deletions wyoming_tts/handler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
"""Event handler for clients of the server."""
import argparse
import logging
import math
import os
import tempfile
import wave

from TTS.api import TTS
from wyoming.audio import AudioChunk, AudioStart, AudioStop
from wyoming.event import Event
from wyoming.info import Describe, Info
from wyoming.server import AsyncEventHandler
from wyoming.tts import Synthesize

# Module-level logger, named after this module.
_LOGGER = logging.getLogger(__name__)

class PiperEventHandler(AsyncEventHandler):
    """Answer Wyoming ``Describe`` and ``Synthesize`` events using coqui-ai TTS.

    ``Describe`` is answered with the pre-built info event. ``Synthesize`` is
    answered with an ``AudioStart`` event, the synthesized audio split into
    ``AudioChunk`` events, and a final ``AudioStop`` event. Any other event is
    logged and ignored.
    """

    def __init__(
        self,
        wyoming_info: Info,
        cli_args: argparse.Namespace,
        tts: TTS,
        *args,
    ) -> None:
        """Store the shared state and forward ``*args`` to the base class.

        Args:
            wyoming_info: Service description sent in reply to ``Describe``.
            cli_args: Parsed command-line options; ``language``, ``speaker``
                and ``samples_per_chunk`` are read from it.
            tts: Loaded TTS model used for synthesis.
            *args: Passed through unchanged to ``AsyncEventHandler``.
        """
        super().__init__(*args)
        self.cli_args = cli_args
        # Build the info event once; it is re-sent for every Describe.
        self.wyoming_info_event = wyoming_info.event()
        self.tts = tts

    async def handle_event(self, event: Event) -> bool:
        """Handle one incoming event.

        Args:
            event: The event received from the client.

        Returns:
            Always ``True``.
        """
        if Describe.is_type(event.type):
            await self.write_event(self.wyoming_info_event)
            _LOGGER.debug("Sent info")
            return True

        if not Synthesize.is_type(event.type):
            _LOGGER.warning("Unexpected event: %s", event)
            return True

        _LOGGER.debug(event)
        synthesize = Synthesize.from_event(event)
        text = synthesize.text.strip()

        # All file access happens inside the helper, before the first await,
        # so another request cannot touch this request's audio in between.
        rate, width, channels, audio_bytes = self._synthesize_to_pcm(text)

        await self.write_event(
            AudioStart(
                rate=rate,
                width=width,
                channels=channels,
            ).event(),
        )

        # Clamp to one sample per chunk so a zero or negative
        # --samples-per-chunk cannot produce an invalid chunk size.
        bytes_per_sample = width * channels
        bytes_per_chunk = bytes_per_sample * max(1, self.cli_args.samples_per_chunk)

        # Split into chunks
        for offset in range(0, len(audio_bytes), bytes_per_chunk):
            await self.write_event(
                AudioChunk(
                    audio=audio_bytes[offset : offset + bytes_per_chunk],
                    rate=rate,
                    width=width,
                    channels=channels,
                ).event(),
            )

        await self.write_event(AudioStop().event())
        _LOGGER.debug("Completed request")

        return True

    def _synthesize_to_pcm(self, text: str) -> tuple:
        """Synthesize ``text`` and return ``(rate, width, channels, frames)``.

        The model writes a WAV file into a temporary directory created for
        this call; the directory is removed when the call returns, so requests
        never share an output file and nothing is left behind on disk.
        """
        tts_args = {}
        if self.tts.is_multi_lingual:
            tts_args["language"] = self.cli_args.language
        if self.tts.is_multi_speaker:
            tts_args["speaker"] = self.cli_args.speaker

        with tempfile.TemporaryDirectory() as temp_dir:
            output_path = os.path.join(temp_dir, "output.wav")
            self.tts.tts_to_file(text, **tts_args, file_path=output_path)
            with wave.open(output_path, "rb") as wav_file:
                return (
                    wav_file.getframerate(),
                    wav_file.getsampwidth(),
                    wav_file.getnchannels(),
                    wav_file.readframes(wav_file.getnframes()),
                )

0 comments on commit 1f84fe5

Please sign in to comment.