diff --git a/deeppavlov/configs/nemo/asr.json b/deeppavlov/configs/nemo/asr.json new file mode 100644 index 0000000000..402d258337 --- /dev/null +++ b/deeppavlov/configs/nemo/asr.json @@ -0,0 +1,29 @@ +{ + "chainer": { + "in": "speech", + "pipe": [ + { + "class_name": "nemo_asr", + "nemo_params_path": "{NEMO_PATH}/quartznet15x5/quartznet15x5.yaml", + "load_path": "{NEMO_PATH}/quartznet15x5", + "in": ["speech"], + "out": ["text"] + } + ], + "out": ["text"] + }, + "metadata": { + "variables": { + "NEMO_PATH": "~/.deeppavlov/models/nemo" + }, + "requirements": [ + "{DEEPPAVLOV_PATH}/requirements/nemo-asr.txt" + ], + "download": [ + { + "url": "http://files.deeppavlov.ai/deeppavlov_data/nemo/quartznet15x5.tar.gz", + "subdir": "{NEMO_PATH}" + } + ] + } +} \ No newline at end of file diff --git a/deeppavlov/configs/nemo/asr_tts.json b/deeppavlov/configs/nemo/asr_tts.json new file mode 100644 index 0000000000..3cb01515cf --- /dev/null +++ b/deeppavlov/configs/nemo/asr_tts.json @@ -0,0 +1,52 @@ +{ + "chainer": { + "in": "speech_in_encoded", + "pipe": [ + { + "class_name": "base64_encode_bytesIO", + "in": ["speech_in_encoded"], + "out": ["speech_in"] + }, + { + "class_name": "nemo_asr", + "nemo_params_path": "{NEMO_PATH}/quartznet15x5/quartznet15x5.yaml", + "load_path": "{NEMO_PATH}/quartznet15x5", + "in": ["speech_in"], + "out": ["text"] + }, + { + "class_name": "nemo_tts", + "nemo_params_path": "{TTS_PATH}/tacotron2_waveglow.yaml", + "load_path": "{TTS_PATH}", + "in": ["text"], + "out": ["speech_out"] + }, + { + "class_name": "bytesIO_decode_base64", + "in": ["speech_out"], + "out": ["speech_out_encoded"] + } + ], + "out": ["text", "speech_out_encoded"] + }, + "metadata": { + "variables": { + "NEMO_PATH": "~/.deeppavlov/models/nemo", + "TTS_PATH": "{NEMO_PATH}/tacotron2_waveglow" + }, + "requirements": [ + "{DEEPPAVLOV_PATH}/requirements/nemo-asr.txt", + "{DEEPPAVLOV_PATH}/requirements/nemo-tts.txt" + ], + "download": [ + { + "url": "http://files.deeppavlov.ai/deeppavlov_data/nemo/quartznet15x5.tar.gz", + "subdir": "{NEMO_PATH}" + }, + { + "url": "http://files.deeppavlov.ai/deeppavlov_data/nemo/tacotron2_waveglow.tar.gz", + "subdir": "{NEMO_PATH}" + } + ] + } +} \ No newline at end of file diff --git a/deeppavlov/configs/nemo/tts.json b/deeppavlov/configs/nemo/tts.json new file mode 100644 index 0000000000..8973604539 --- /dev/null +++ b/deeppavlov/configs/nemo/tts.json @@ -0,0 +1,31 @@ +{ + "chainer": { + "in": ["text", "filepath"], + "pipe": [ + { + "class_name": "nemo_tts", + "nemo_params_path": "{TTS_PATH}/tacotron2_waveglow.yaml", + "load_path": "{TTS_PATH}", + "in": ["text", "filepath"], + "out": ["saved_path"] + } + ], + "out": ["saved_path"] + }, + "metadata": { + "variables": { + "NEMO_PATH": "~/.deeppavlov/models/nemo", + "TTS_PATH": "{NEMO_PATH}/tacotron2_waveglow" + }, + "requirements": [ + "{DEEPPAVLOV_PATH}/requirements/nemo-asr.txt", + "{DEEPPAVLOV_PATH}/requirements/nemo-tts.txt" + ], + "download": [ + { + "url": "http://files.deeppavlov.ai/deeppavlov_data/nemo/tacotron2_waveglow.tar.gz", + "subdir": "{NEMO_PATH}" + } + ] + } +} \ No newline at end of file diff --git a/deeppavlov/core/common/file.py b/deeppavlov/core/common/file.py index e502b6b139..6079c68203 100644 --- a/deeppavlov/core/common/file.py +++ b/deeppavlov/core/common/file.py @@ -19,6 +19,8 @@ from pathlib import Path from typing import Union, Any +from ruamel.yaml import YAML + log = getLogger(__name__) @@ -50,3 +52,9 @@ def save_pickle(data: dict, fpath: Union[str, Path]) -> None: def load_pickle(fpath: Union[str, 
Path]) -> Any: with open(fpath, 'rb') as fin: return pickle.load(fin) + + +def read_yaml(fpath: Union[str, Path]) -> dict: + yaml = YAML(typ="safe") + with open(fpath, encoding='utf8') as fin: + return yaml.load(fin) diff --git a/deeppavlov/core/common/registry.json b/deeppavlov/core/common/registry.json index 042dc5c411..d2c1b14d1b 100644 --- a/deeppavlov/core/common/registry.json +++ b/deeppavlov/core/common/registry.json @@ -6,6 +6,7 @@ "api_router": "deeppavlov.models.api_requester.api_router:ApiRouter", "basic_classification_iterator": "deeppavlov.dataset_iterators.basic_classification_iterator:BasicClassificationDatasetIterator", "basic_classification_reader": "deeppavlov.dataset_readers.basic_classification_reader:BasicClassificationDatasetReader", + "base64_encode_bytesIO": "deeppavlov.models.nemo.common:ascii_to_bytes_io", "bert_as_summarizer": "deeppavlov.models.bert.bert_as_summarizer:BertAsSummarizer", "bert_classifier": "deeppavlov.models.bert.bert_classifier:BertClassifierModel", "bert_ner_preprocessor": "deeppavlov.models.preprocessors.bert_preprocessor:BertNerPreprocessor", @@ -22,6 +23,7 @@ "bilstm_gru_nn": "deeppavlov.models.ranking.bilstm_gru_siamese_network:BiLSTMGRUSiameseNetwork", "bilstm_nn": "deeppavlov.models.ranking.bilstm_siamese_network:BiLSTMSiameseNetwork", "bow": "deeppavlov.models.embedders.bow_embedder:BoWEmbedder", + "bytesIO_decode_base64": "deeppavlov.models.nemo.common:bytes_io_to_ascii", "capitalization_featurizer": "deeppavlov.models.preprocessors.capitalization:CapitalizationPreprocessor", "char_splitter": "deeppavlov.models.preprocessors.char_splitter:CharSplitter", "char_splitting_lowercase_preprocessor": "deeppavlov.models.preprocessors.capitalization:CharSplittingLowercasePreprocessor", @@ -85,6 +87,8 @@ "multi_squad_dataset_reader": "deeppavlov.dataset_readers.squad_dataset_reader:MultiSquadDatasetReader", "multi_squad_iterator": "deeppavlov.dataset_iterators.squad_iterator:MultiSquadIterator", "multi_squad_retr_iterator": "deeppavlov.dataset_iterators.squad_iterator:MultiSquadRetrIterator", + "nemo_asr": "deeppavlov.models.nemo.asr:NeMoASR", + "nemo_tts": "deeppavlov.models.nemo.tts:NeMoTTS", "ner": "deeppavlov.models.ner.network:NerNetwork", "ner_bio_converter": "deeppavlov.models.ner.bio:BIOMarkupRestorer", "ner_few_shot_iterator": "deeppavlov.dataset_iterators.ner_few_shot_iterator:NERFewShotIterator", diff --git a/deeppavlov/models/nemo/__init__.py b/deeppavlov/models/nemo/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/deeppavlov/models/nemo/asr.py b/deeppavlov/models/nemo/asr.py new file mode 100644 index 0000000000..70527adea3 --- /dev/null +++ b/deeppavlov/models/nemo/asr.py @@ -0,0 +1,193 @@ +# Copyright 2020 Neural Networks and Deep Learning lab, MIPT +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
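+ +"""ASR components built on NeMo: a data layer that feeds audio batches to the pipeline and the NeMoASR component that runs Jasper-based speech recognition."""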
+ +import logging +from io import BytesIO +from pathlib import Path +from typing import List, Optional, Tuple, Union, Dict + +import torch +from nemo.collections.asr import AudioToMelSpectrogramPreprocessor, JasperEncoder, JasperDecoderForCTC, GreedyCTCDecoder +from nemo.collections.asr.helpers import post_process_predictions +from nemo.collections.asr.parts.features import WaveformFeaturizer +from nemo.core.neural_types import AudioSignal, NeuralType, LengthsType +from nemo.utils.decorators import add_port_docs +from torch import Tensor +from torch.utils.data import Dataset, DataLoader + +from deeppavlov.core.common.registry import register +from deeppavlov.models.nemo.common import CustomDataLayerBase, NeMoBase + +log = logging.getLogger(__name__) + + +class AudioInferDataset(Dataset): + def __init__(self, audio_batch: List[Union[str, BytesIO]], sample_rate: int, int_values: bool, trim=False) -> None: + """Dataset reader for AudioInferDataLayer. + + Args: + audio_batch: Batch to be read. Elements could be either paths to audio files or Binary I/O objects. + sample_rate: Audio files sample rate. + int_values: If true, load samples as 32-bit integers. + trim: Trim leading and trailing silence from an audio signal if True. + + """ + self.audio_batch = audio_batch + self.featurizer = WaveformFeaturizer(sample_rate=sample_rate, int_values=int_values) + self.trim = trim + + def __getitem__(self, index: int) -> Tuple[Tensor, Tensor]: + """Processes audio batch item and extracts features. + + Args: + index: Audio batch item index. + + Returns: + features: Audio file's extracted features tensor. + features_length: Features length tensor. + + """ + sample = self.audio_batch[index] + features = self.featurizer.process(sample, trim=self.trim) + features_length = torch.tensor(features.shape[0]).long() + + return features, features_length + + def __len__(self) -> int: + return len(self.audio_batch) + + +class AudioInferDataLayer(CustomDataLayerBase): + """Data Layer for ASR pipeline inference.""" + + @property + @add_port_docs() + def output_ports(self) -> Dict[str, NeuralType]: + return { + "audio_signal": NeuralType(('B', 'T'), AudioSignal(freq=self._sample_rate)), + "a_sig_length": NeuralType(tuple('B'), LengthsType()) + } + + def __init__(self, *, + audio_batch: List[Union[str, BytesIO]], + batch_size: int = 32, + sample_rate: int = 16000, + int_values: bool = False, + trim_silence: bool = False, + **kwargs) -> None: + """Initializes Data Loader. + + Args: + audio_batch: Batch to be read. Elements could be either paths to audio files or Binary I/O objects. + batch_size: How many samples per batch to load. + sample_rate: Target sampling rate for data. Audio files will be resampled to sample_rate if + it is not already. + int_values: If true, load data as 32-bit integers. + trim_silence: Trim leading and trailing silence from an audio signal if True. + + """ + self._sample_rate = sample_rate + + dataset = AudioInferDataset(audio_batch=audio_batch, sample_rate=sample_rate, int_values=int_values, + trim=trim_silence) + + dataloader = DataLoader(dataset=dataset, batch_size=batch_size, collate_fn=self.seq_collate_fn) + super(AudioInferDataLayer, self).__init__(dataset, dataloader, **kwargs) + + @staticmethod + def seq_collate_fn(batch: Tuple[Tuple[Tensor], Tuple[Tensor]]) -> Tuple[Optional[Tensor], Optional[Tensor]]: + """Collates batch of audio signal and audio length, zero pads audio signal. + + Args: + batch: A tuple of tuples of audio signals and signal lengths. 
This collate function assumes the signals + are 1d torch tensors (i.e. mono audio). + + Returns: + audio_signal: Zero padded audio signal tensor. + audio_length: Audio signal length tensor. + + """ + _, audio_lengths = zip(*batch) + max_audio_len = 0 + has_audio = audio_lengths[0] is not None + if has_audio: + max_audio_len = max(audio_lengths).item() + + audio_signal = [] + for sig, sig_len in batch: + if has_audio: + sig_len = sig_len.item() + if sig_len < max_audio_len: + pad = (0, max_audio_len - sig_len) + sig = torch.nn.functional.pad(sig, pad) + audio_signal.append(sig) + + if has_audio: + audio_signal = torch.stack(audio_signal) + audio_lengths = torch.stack(audio_lengths) + else: + audio_signal, audio_lengths = None, None + + return audio_signal, audio_lengths + + +@register('nemo_asr') +class NeMoASR(NeMoBase): + """ASR model on NeMo modules.""" + + def __init__(self, load_path: Union[str, Path], nemo_params_path: Union[str, Path], **kwargs) -> None: + """Initializes NeuralModules for ASR. + + Args: + load_path: Path to a directory with pretrained checkpoints for JasperEncoder and JasperDecoderForCTC. + nemo_params_path: Path to a file containing labels and params for AudioToMelSpectrogramPreprocessor, + JasperEncoder, JasperDecoderForCTC and AudioInferDataLayer. + + """ + super(NeMoASR, self).__init__(load_path=load_path, nemo_params_path=nemo_params_path, **kwargs) + + self.labels = self.nemo_params['labels'] + + self.data_preprocessor = AudioToMelSpectrogramPreprocessor( + **self.nemo_params['AudioToMelSpectrogramPreprocessor'] + ) + self.jasper_encoder = JasperEncoder(**self.nemo_params['JasperEncoder']) + self.jasper_decoder = JasperDecoderForCTC(num_classes=len(self.labels), **self.nemo_params['JasperDecoder']) + self.greedy_decoder = GreedyCTCDecoder() + self.modules_to_restore = [self.jasper_encoder, self.jasper_decoder] + + self.load() + + def __call__(self, audio_batch: List[Union[str, BytesIO]]) -> List[str]: + """Transcribes audio batch to text. + + Args: + audio_batch: Batch to be transcribed. Elements could be either paths to audio files or Binary I/O objects. + + Returns: + text_batch: Batch of transcripts. + + """ + data_layer = AudioInferDataLayer(audio_batch=audio_batch, **self.nemo_params['AudioToTextDataLayer']) + audio_signal, audio_signal_len = data_layer() + processed_signal, processed_signal_len = self.data_preprocessor(input_signal=audio_signal, + length=audio_signal_len) + encoded, encoded_len = self.jasper_encoder(audio_signal=processed_signal, length=processed_signal_len) + log_probs = self.jasper_decoder(encoder_output=encoded) + predictions = self.greedy_decoder(log_probs=log_probs) + eval_tensors = [predictions] + tensors = self.neural_factory.infer(tensors=eval_tensors) + text_batch = post_process_predictions(tensors[0], self.labels) + + return text_batch diff --git a/deeppavlov/models/nemo/common.py b/deeppavlov/models/nemo/common.py new file mode 100644 index 0000000000..4fe2898195 --- /dev/null +++ b/deeppavlov/models/nemo/common.py @@ -0,0 +1,117 @@ +# Copyright 2020 Neural Networks and Deep Learning lab, MIPT +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import base64 +from io import BytesIO +from logging import getLogger +from pathlib import Path +from typing import Union + +import nemo +import torch +from nemo.backends.pytorch import DataLayerNM +from torch.utils.data import Dataset, DataLoader + +from deeppavlov.core.commands.utils import expand_path +from deeppavlov.core.common.file import read_yaml +from deeppavlov.core.common.registry import register +from deeppavlov.core.models.component import Component +from deeppavlov.core.models.serializable import Serializable + +log = getLogger(__name__) + + +@register('base64_encode_bytesIO') +def ascii_to_bytes_io(batch: Union[str, list]) -> Union[BytesIO, list]: + """Recursively searches for strings in the input batch and decodes them from base64 into bytes wrapped in + Binary I/O objects. + + Args: + batch: A string or an iterable container with strings at some level of nesting. + + Returns: + The same structure where all strings are decoded from base64 into bytes wrapped in Binary I/O objects. + + """ + if isinstance(batch, str): + return BytesIO(base64.decodebytes(batch.encode())) + + return list(map(ascii_to_bytes_io, batch)) + + +@register('bytesIO_decode_base64') +def bytes_io_to_ascii(batch: Union[BytesIO, list]) -> Union[str, list]: + """Recursively searches for Binary I/O objects in the input batch and converts them into base64-encoded ASCII + strings. + + Args: + batch: A BinaryIO object or an iterable container with BinaryIO objects at some level of nesting. + + Returns: + The same structure where all BinaryIO objects are converted into base64-encoded ASCII strings. + + """ + if isinstance(batch, BytesIO): + return base64.encodebytes(batch.read()).decode('ascii') + + return list(map(bytes_io_to_ascii, batch)) + + +class NeMoBase(Component, Serializable): + """Base class for NeMo Chainer's pipeline components.""" + + def __init__(self, load_path: Union[str, Path], nemo_params_path: Union[str, Path], **kwargs) -> None: + """Initializes NeuralModuleFactory on CPU or GPU and reads NeMo modules params from a YAML file. + + Args: + load_path: Path to a directory with pretrained checkpoints for NeMo modules. + nemo_params_path: Path to a file containing NeMo modules params.
+ + """ + super(NeMoBase, self).__init__(save_path=None, load_path=load_path, **kwargs) + placement = nemo.core.DeviceType.GPU if torch.cuda.is_available() else nemo.core.DeviceType.CPU + self.neural_factory = nemo.core.NeuralModuleFactory(placement=placement) + self.modules_to_restore = [] + self.nemo_params = read_yaml(expand_path(nemo_params_path)) + + def __call__(self, *args, **kwargs): + raise NotImplementedError + + def load(self) -> None: + """Loads pretrained checkpoints for modules from self.modules_to_restore list.""" + module_names = [str(module) for module in self.modules_to_restore] + checkpoints = nemo.utils.get_checkpoint_from_dir(module_names, self.load_path) + for module, checkpoint in zip(self.modules_to_restore, checkpoints): + log.info(f'Restoring {module} from {checkpoint}') + module.restore_from(checkpoint) + + def save(self, *args, **kwargs) -> None: + pass + + +class CustomDataLayerBase(DataLayerNM): + def __init__(self, dataset: Dataset, dataloader: DataLoader, **kwargs) -> None: + super(CustomDataLayerBase, self).__init__() + self._dataset = dataset + self._dataloader = dataloader + + def __len__(self) -> int: + return len(self._dataset) + + @property + def dataset(self) -> None: + return None + + @property + def data_iterator(self) -> torch.utils.data.DataLoader: + return self._dataloader diff --git a/deeppavlov/models/nemo/tts.py b/deeppavlov/models/nemo/tts.py new file mode 100644 index 0000000000..d31fa0bcfb --- /dev/null +++ b/deeppavlov/models/nemo/tts.py @@ -0,0 +1,210 @@ +# Copyright 2020 Neural Networks and Deep Learning lab, MIPT +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import partial +from io import BytesIO +from logging import getLogger +from pathlib import Path +from typing import List, Optional, Tuple, Union, Dict + +import torch +from nemo.collections.asr.parts import collections, parsers +from nemo.collections.asr.parts.dataset import TranscriptDataset +from nemo.collections.tts import TextEmbedding, Tacotron2Encoder, Tacotron2DecoderInfer, Tacotron2Postnet +from nemo.core.neural_types import NeuralType, LabelsType, LengthsType +from nemo.utils.decorators import add_port_docs +from nemo.utils.misc import pad_to +from scipy.io import wavfile +from torch import Tensor + +from deeppavlov.core.commands.utils import expand_path +from deeppavlov.core.common.registry import register +from deeppavlov.models.nemo.common import CustomDataLayerBase, NeMoBase +from deeppavlov.models.nemo.vocoder import WaveGlow, GriffinLim + +log = getLogger(__name__) + + +class TextDataset(TranscriptDataset): + def __init__(self, + text_batch: List[str], + labels: List[str], + bos_id: Optional[int] = None, + eos_id: Optional[int] = None, + lowercase: bool = True) -> None: + """Text dataset reader for TextDataLayer. + + Args: + text_batch: Texts to be used for speech synthesis. + labels: List of string labels to use for str2int translation. + bos_id: Label position of the beginning-of-string symbol. + eos_id: Label position of the end-of-string symbol.
lowercase: Whether to convert all uppercase characters in a text batch into lowercase characters. + + """ + parser = parsers.make_parser(labels, do_lowercase=lowercase) + self.texts = collections.Text(text_batch, parser) + self.bos_id = bos_id + self.eos_id = eos_id + + +class TextDataLayer(CustomDataLayerBase): + @property + @add_port_docs() + def output_ports(self) -> Dict[str, NeuralType]: + return { + 'texts': NeuralType(('B', 'T'), LabelsType()), + "texts_length": NeuralType(tuple('B'), LengthsType()) + } + + def __init__(self, *, + text_batch: List[str], + labels: List[str], + batch_size: int = 32, + bos_id: Optional[int] = None, + eos_id: Optional[int] = None, + pad_id: Optional[int] = None, + **kwargs) -> None: + """A simple Neural Module for loading text data. + + Args: + text_batch: Texts to be used for speech synthesis. + labels: List of string labels to use for str2int translation. + batch_size: How many strings per batch to load. + bos_id: Label position of the beginning-of-string symbol. If None, initialized as `len(labels)`. + eos_id: Label position of the end-of-string symbol. If None, initialized as `len(labels) + 1`. + pad_id: Label position of the pad symbol. If None, initialized as `len(labels) + 2`. + + """ + len_labels = len(labels) + if bos_id is None: + bos_id = len_labels + if eos_id is None: + eos_id = len_labels + 1 + if pad_id is None: + pad_id = len_labels + 2 + + dataset = TextDataset(text_batch=text_batch, labels=labels, bos_id=bos_id, eos_id=eos_id) + + dataloader = torch.utils.data.DataLoader(dataset=dataset, batch_size=batch_size, + collate_fn=partial(self._collate_fn, pad_id=pad_id)) + super(TextDataLayer, self).__init__(dataset, dataloader, **kwargs) + + @staticmethod + def _collate_fn(batch: Tuple[Tuple[Tensor], Tuple[Tensor]], pad_id: int) -> Tuple[Tensor, Tensor]: + """Collates batch of texts. + + Args: + batch: A tuple of tuples of texts and text lengths. + pad_id: Label position of the pad symbol. + + Returns: + texts: Padded texts tensor. + texts_len: Text lengths tensor. + + """ + texts_list, texts_len = zip(*batch) + max_len = max(texts_len) + max_len = pad_to(max_len, 8) + + texts = torch.empty(len(texts_list), max_len, dtype=torch.long) + texts.fill_(pad_id) + + for i, text in enumerate(texts_list): + texts[i].narrow(0, 0, text.size(0)).copy_(text) + + if len(texts.shape) != 2: + raise ValueError(f'Texts in collate function have shape {texts.shape}, should have 2 dimensions.') + + return texts, torch.stack(texts_len) + + +@register('nemo_tts') +class NeMoTTS(NeMoBase): + """TTS model on NeMo modules.""" + def __init__(self, + load_path: Union[str, Path], + nemo_params_path: Union[str, Path], + vocoder: str = 'waveglow', + **kwargs) -> None: + """Initializes NeuralModules for TTS. + + Args: + load_path: Path to a directory with pretrained checkpoints for TextEmbedding, Tacotron2Encoder, + Tacotron2DecoderInfer, Tacotron2Postnet and, if the WaveGlow vocoder is selected, WaveGlowInferNM. + nemo_params_path: Path to a file containing sample_rate, labels and params for TextEmbedding, + Tacotron2Encoder, Tacotron2Decoder, Tacotron2Postnet and TranscriptDataLayer. + vocoder: Vocoder used to convert from spectrograms to audio. Available options: `waveglow` (needs pretrained + checkpoint) and `griffin-lim`.
+ + """ + super(NeMoTTS, self).__init__(load_path=load_path, nemo_params_path=nemo_params_path, **kwargs) + + self.sample_rate = self.nemo_params['sample_rate'] + self.text_embedding = TextEmbedding( + len(self.nemo_params['labels']) + 3, # + 3 special chars + **self.nemo_params['TextEmbedding'] + ) + self.t2_enc = Tacotron2Encoder(**self.nemo_params['Tacotron2Encoder']) + self.t2_dec = Tacotron2DecoderInfer(**self.nemo_params['Tacotron2Decoder']) + self.t2_postnet = Tacotron2Postnet(**self.nemo_params['Tacotron2Postnet']) + self.modules_to_restore = [self.text_embedding, self.t2_enc, self.t2_dec, self.t2_postnet] + + if vocoder == 'waveglow': + self.vocoder = WaveGlow(**self.nemo_params['WaveGlowNM']) + self.modules_to_restore.append(self.vocoder) + elif vocoder == 'griffin-lim': + self.vocoder = GriffinLim(**self.nemo_params['GriffinLim']) + else: + raise ValueError(f'{vocoder} vocoder is not supported.') + + self.load() + + def __call__(self, + text_batch: List[str], + path_batch: Optional[List[str]] = None) -> Union[List[BytesIO], List[str]]: + """Creates WAV files or file objects with synthesized speech. + + Args: + text_batch: Text from which human-audible speech should be generated. + path_batch: i-th element of `path_batch` is the path to save the i-th generated speech file. If the argument + isn't specified, the synthesized speech will be stored in Binary I/O objects. + + Returns: + List of Binary I/O objects with generated speech if `path_batch` was not specified, list of paths to files + with synthesized speech otherwise. + + """ + if path_batch is None: + path_batch = [BytesIO() for _ in text_batch] + elif len(text_batch) != len(path_batch): + raise ValueError('Text batch length differs from path batch length.') + else: + path_batch = [expand_path(path) for path in path_batch] + + data_layer = TextDataLayer(text_batch=text_batch, **self.nemo_params['TranscriptDataLayer']) + transcript, transcript_len = data_layer() + transcript_embedded = self.text_embedding(char_phone=transcript) + transcript_encoded = self.t2_enc(char_phone_embeddings=transcript_embedded, embedding_length=transcript_len) + mel_decoder, gate, alignments, mel_len = self.t2_dec(char_phone_encoded=transcript_encoded, + encoded_length=transcript_len) + mel_postnet = self.t2_postnet(mel_input=mel_decoder) + infer_tensors = [self.vocoder(mel_postnet), mel_len] + evaluated_tensors = self.neural_factory.infer(tensors=infer_tensors) + synthesized_batch = self.vocoder.get_audio(*evaluated_tensors) + + for fout, synthesized_audio in zip(path_batch, synthesized_batch): + wavfile.write(fout, self.sample_rate, synthesized_audio) + + return path_batch diff --git a/deeppavlov/models/nemo/vocoder.py b/deeppavlov/models/nemo/vocoder.py new file mode 100644 index 0000000000..3ec918d266 --- /dev/null +++ b/deeppavlov/models/nemo/vocoder.py @@ -0,0 +1,131 @@ +# Copyright 2020 Neural Networks and Deep Learning lab, MIPT +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
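+ +"""Vocoder wrappers that turn mel spectrograms predicted by Tacotron 2 into audio: WaveGlow (neural, needs a pretrained checkpoint) and Griffin-Lim (iterative, checkpoint-free)."""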
+ +from logging import getLogger +from typing import List + +import librosa +import numpy as np +from nemo.core.neural_types import NmTensor +from nemo.collections.tts import WaveGlowInferNM +from numpy import ndarray + +log = getLogger(__name__) + + +class BaseVocoder: + """This class is used to maintain consistency in the construction of the TTS pipeline based on NeMo modules.""" + + def __call__(self, tensor: NmTensor) -> NmTensor: + """Should return the tensor whose evaluation result is used by the `get_audio` method to synthesize speech.""" + raise NotImplementedError + + def get_audio(self, evaluated_tensor: list, mel_len: list): + """Synthesizes audio from the evaluated tensor constructed by the `__call__` method.""" + raise NotImplementedError + + +class WaveGlow(BaseVocoder): + def __init__(self, *, denoiser_strength: float = 0.0, n_window_stride: int = 160, **kwargs) -> None: + """Wraps WaveGlowInferNM module. + + Args: + denoiser_strength: Denoiser strength for WaveGlow. + n_window_stride: Stride of window for FFT in samples used in model training. + kwargs: Named arguments for WaveGlowInferNM constructor. + + """ + self.waveglow = WaveGlowInferNM(**kwargs) + self.denoiser_strength = denoiser_strength + self.n_window_stride = n_window_stride + + def __call__(self, mel_postnet: NmTensor) -> NmTensor: + return self.waveglow(mel_spectrogram=mel_postnet) + + def __str__(self): + return str(self.waveglow) + + def restore_from(self, path: str) -> None: + """Wraps WaveGlowInferNM restore_from method.""" + self.waveglow.restore_from(path) + if self.denoiser_strength > 0: + log.info('Setup denoiser for WaveGlow') + self.waveglow.setup_denoiser() + + def get_audio(self, evaluated_audio: list, mel_len: list) -> List[ndarray]: + """Unpacks audio data from the evaluated tensor and denoises it if `denoiser_strength` > 0.""" + audios = [] + for i, batch in enumerate(evaluated_audio): + audio = batch.cpu().numpy() + for j, sample in enumerate(audio): + sample_len = mel_len[i][j] * self.n_window_stride + sample = sample[:sample_len] + if self.denoiser_strength > 0: + sample, _ = self.waveglow.denoise(sample, strength=self.denoiser_strength) + audios.append(sample) + return audios + + +class GriffinLim(BaseVocoder): + def __init__(self, *, + sample_rate: float = 16000.0, + n_fft: int = 1024, + mag_scale: float = 2048.0, + power: float = 1.2, + n_iters: int = 50, + **kwargs) -> None: + """Uses the Griffin-Lim algorithm to generate speech from spectrograms. + + Args: + sample_rate: Generated audio data sample rate. + n_fft: The number of points to use for the FFT. + mag_scale: Multiplied with the linear spectrogram to avoid audio sounding muted due to mel filter + normalization. + power: The linear spectrogram is raised to this power prior to running the Griffin-Lim algorithm. A power + greater than 1 has been shown to improve audio quality. + n_iters: Number of iterations for converting magnitude spectrograms to an audio signal.
+ + """ + self.mag_scale = mag_scale + self.power = power + self.n_iters = n_iters + self.n_fft = n_fft + self.filterbank = librosa.filters.mel(sr=sample_rate, n_fft=n_fft, **kwargs) + + def __call__(self, mel_postnet: NmTensor) -> NmTensor: + return mel_postnet + + def get_audio(self, mel_spec: list, mel_len: list) -> List[ndarray]: + audios = [] + for i, batch in enumerate(mel_spec): + log_mel = batch.cpu().numpy().transpose(0, 2, 1) + mel = np.exp(log_mel) + magnitudes = np.dot(mel, self.filterbank) * self.mag_scale + for j, sample in enumerate(magnitudes): + sample = sample[:mel_len[i][j], :] + audio = self.griffin_lim(sample.T ** self.power) + audios.append(audio) + return audios + + def griffin_lim(self, magnitudes): + """Griffin-Lim algorithm to convert magnitude spectrograms to audio signals.""" + phase = np.exp(2j * np.pi * np.random.rand(*magnitudes.shape)) + complex_spec = magnitudes * phase + signal = librosa.istft(complex_spec) + + for _ in range(self.n_iters): + _, phase = librosa.magphase(librosa.stft(signal, n_fft=self.n_fft)) + complex_spec = magnitudes * phase + signal = librosa.istft(complex_spec) + return signal diff --git a/deeppavlov/requirements/nemo-asr.txt b/deeppavlov/requirements/nemo-asr.txt new file mode 100644 index 0000000000..136ecf09c9 --- /dev/null +++ b/deeppavlov/requirements/nemo-asr.txt @@ -0,0 +1,7 @@ +nemo-toolkit==0.10.0 +frozendict==1.2 +kaldi-io==0.9.4 +inflect==4.1.0 +unidecode==1.1.1 +librosa==0.7.2 +torch-stft==0.1.4 \ No newline at end of file diff --git a/deeppavlov/requirements/nemo-tts.txt b/deeppavlov/requirements/nemo-tts.txt new file mode 100644 index 0000000000..80f13f45dd --- /dev/null +++ b/deeppavlov/requirements/nemo-tts.txt @@ -0,0 +1,4 @@ +matplotlib==3.2.1 +sentencepiece==0.1.85 +transformers==2.8.0 +youtokentome==1.0.6 \ No newline at end of file diff --git a/deeppavlov/utils/server/server.py b/deeppavlov/utils/server/server.py index 751bda4814..faedec8b35 100644 --- a/deeppavlov/utils/server/server.py +++ b/deeppavlov/utils/server/server.py @@ -22,7 +22,7 @@ import uvicorn from fastapi import Body, FastAPI, HTTPException from fastapi.utils import generate_operation_id_for_path -from pydantic import BaseConfig, BaseModel, Schema +from pydantic import BaseConfig, BaseModel from pydantic.fields import Field, ModelField from pydantic.main import ModelMetaclass from starlette.middleware.cors import CORSMiddleware diff --git a/docs/apiref/models/nemo.rst b/docs/apiref/models/nemo.rst new file mode 100644 index 0000000000..27c2054336 --- /dev/null +++ b/docs/apiref/models/nemo.rst @@ -0,0 +1,32 @@ +deeppavlov.models.nemo +====================== + +.. autoclass:: deeppavlov.models.nemo.asr.NeMoASR + + .. automethod:: __init__ + .. automethod:: __call__ + +.. autoclass:: deeppavlov.models.nemo.tts.NeMoTTS + + .. automethod:: __init__ + .. automethod:: __call__ + +.. autofunction:: deeppavlov.models.nemo.common.ascii_to_bytes_io + +.. autofunction:: deeppavlov.models.nemo.common.bytes_io_to_ascii + +.. autoclass:: deeppavlov.models.nemo.asr.AudioInferDataLayer + + .. automethod:: __init__ + +.. autoclass:: deeppavlov.models.nemo.tts.TextDataLayer + + .. automethod:: __init__ + +.. autoclass:: deeppavlov.models.nemo.vocoder.WaveGlow + + .. automethod:: __init__ + +.. autoclass:: deeppavlov.models.nemo.vocoder.GriffinLim + + .. 
automethod:: __init__ diff --git a/docs/conf.py b/docs/conf.py index f867223fa3..79efc94ab8 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -190,9 +190,9 @@ # -- Extension configuration ------------------------------------------------- -autodoc_mock_imports = ['tensorflow', 'tensorflow_hub', 'fastText', 'nltk', 'gensim', 'kenlm', 'spacy', 'lxml', 'torch', - 'sortedcontainers', 'russian_tagsets', 'bert_dp', 'aiml', 'rasa', 'fasttext', 'sacremoses', - 'transformers'] +autodoc_mock_imports = ['aiml', 'bert_dp', 'fastText', 'fasttext', 'gensim', 'kenlm', 'librosa', 'lxml', 'nemo', + 'nemo_asr', 'nemo_tts', 'nltk', 'rasa', 'russian_tagsets', 'sacremoses', 'sortedcontainers', + 'spacy', 'tensorflow', 'tensorflow_hub', 'torch', 'transformers'] extlinks = { 'config': (f'https://github.com/deepmipt/DeepPavlov/blob/{release}/deeppavlov/configs/%s', None) diff --git a/docs/features/models/nemo.rst b/docs/features/models/nemo.rst new file mode 100644 index 0000000000..2024ee56e5 --- /dev/null +++ b/docs/features/models/nemo.rst @@ -0,0 +1,164 @@ +Speech recognition and synthesis (ASR and TTS) +============================================== + +DeepPavlov contains models for automatic speech recognition (ASR) and speech synthesis (TTS) based on pre-built modules +from `NeMo `__ (v0.10.0), NVIDIA's toolkit for defining and building +conversational AI applications. Named arguments for module initialization are taken from the NeMo config file (not to +be confused with the DeepPavlov config file that defines the model pipeline). + +Speech recognition +------------------ + +The ASR pipeline is based on Jasper, a CTC-based end-to-end model. The model transcribes speech samples without +any additional alignment information. :class:`~deeppavlov.models.nemo.asr.NeMoASR` contains the following modules: + +- `AudioToMelSpectrogramPreprocessor `_ - uses arguments from ``AudioToMelSpectrogramPreprocessor`` section of the NeMo config file. +- `JasperEncoder `__ - uses arguments from ``JasperEncoder`` section of the NeMo config file. Needs pretrained checkpoint. +- `JasperDecoderForCTC `__ - uses arguments from ``JasperDecoder`` section of the NeMo config file. Needs pretrained checkpoint. +- `GreedyCTCDecoder `__ - doesn't use any arguments. +- :class:`~deeppavlov.models.nemo.asr.AudioInferDataLayer` - uses arguments from ``AudioToTextDataLayer`` section of the NeMo config file. + +The NeMo config file for ASR should contain a ``labels`` argument in addition to the named arguments for the modules +above. ``labels`` is a list of characters that can be output by the ASR model, as used in model training. + +Speech synthesis +---------------- + +The TTS pipeline that creates human-audible speech from text is based on the Tacotron 2 and WaveGlow models. +:class:`~deeppavlov.models.nemo.tts.NeMoTTS` contains the following modules: + +- `TextEmbedding `__ - uses arguments from ``TextEmbedding`` section of the NeMo config file. Needs pretrained checkpoint. +- `Tacotron2Encoder `__ - uses arguments from ``Tacotron2Encoder`` section of the NeMo config file. Needs pretrained checkpoint. +- `Tacotron2DecoderInfer `__ - uses arguments from ``Tacotron2Decoder`` section of the NeMo config file. Needs pretrained checkpoint. +- `Tacotron2Postnet `__ - uses arguments from ``Tacotron2Postnet`` section of the NeMo config file. Needs pretrained checkpoint. +- :class:`~deeppavlov.models.nemo.vocoder.WaveGlow` - uses arguments from ``WaveGlowNM`` section of the NeMo config file. Needs pretrained checkpoint.
+- :class:`~deeppavlov.models.nemo.vocoder.GriffinLim` - uses arguments from ``GriffinLim`` section of the NeMo config file. +- :class:`~deeppavlov.models.nemo.tts.TextDataLayer` - uses arguments from ``TranscriptDataLayer`` section of the NeMo config file. + +The NeMo config file for TTS should contain ``labels`` and ``sample_rate`` arguments in addition to the named arguments +for the modules above. ``labels`` is a list of characters used in TTS model training. + +Audio encoding and decoding +---------------------------- + +:func:`~deeppavlov.models.nemo.common.ascii_to_bytes_io` and :func:`~deeppavlov.models.nemo.common.bytes_io_to_ascii` +were added to the library to achieve uniformity when working with both text and audio data. These functions can be used +to encode binary data into an ASCII string and to decode it back. + +Quick Start +----------- + +Preparation +~~~~~~~~~~~ + +Install requirements and download model files. + +.. code:: bash + + python -m deeppavlov install asr_tts + python -m deeppavlov download asr_tts + +Examples below use the `sounddevice `_ library. Install +it with ``pip install sounddevice==0.3.15``. You may need to install the ``libportaudio2`` package with +``sudo apt-get install libportaudio2`` to make ``sounddevice`` work. + +.. note:: + ASR reads and TTS generates single-channel WAV files. Files transferred to ASR are resampled to the frequency + specified in the NeMo config file (16 kHz for models from DeepPavlov configs). + +Speech recognition +~~~~~~~~~~~~~~~~~~ + +The DeepPavlov :config:`asr ` config contains a minimal pipeline for English speech recognition using the +`QuartzNet15x5En `_ pretrained model. +To record speech on your computer and print its transcription, run the following script: + +.. code:: python + + from io import BytesIO + + import sounddevice as sd + from scipy.io.wavfile import write + + from deeppavlov import build_model, configs + + sr = 16000 + duration = 3 + + print('Recording...') + myrecording = sd.rec(duration*sr, samplerate=sr, channels=1) + sd.wait() + print('done') + + out = BytesIO() + write(out, sr, myrecording) + + model = build_model(configs.nemo.asr) + text_batch = model([out]) + + print(text_batch[0]) + +Speech synthesis +~~~~~~~~~~~~~~~~ + +The DeepPavlov :config:`tts ` config contains a minimal pipeline for speech synthesis using the +`Tacotron2 `_ and +`WaveGlow `_ pretrained models. +To generate an audio file and save it to disk, run the following script: + +.. code:: python + + from deeppavlov import build_model, configs + + model = build_model(configs.nemo.tts) + filepath_batch = model(['Hello world'], ['~/hello_world.wav']) + + print(f'Generated speech has been successfully saved at {filepath_batch[0]}') + +Speech to speech +~~~~~~~~~~~~~~~~ + +The previous examples assume that the files with speech to recognize and the generated audio files are on the same +system where DeepPavlov is running. The DeepPavlov :config:`asr_tts ` config allows sending files with +speech to recognize and receiving files with generated speech from another system. This config recognizes the received +speech and synthesizes it back. + +Run ``asr_tts`` in REST API mode: + +.. code:: bash + + python -m deeppavlov riseapi asr_tts + +The following Python script assumes that you already have a file with speech to recognize. You can use the code from +the speech recognition example to record speech on your system. ``127.0.0.1`` should be replaced with the address of +the system where DeepPavlov was started. + +.. 
code:: python + + from base64 import encodebytes, decodebytes + + from requests import post + + with open('/path/to/wav/file/with/speech', 'rb') as fin: + input_speech = fin.read() + + input_ascii = encodebytes(input_speech).decode('ascii') + + resp = post('http://127.0.0.1:5000/model', json={"speech_in_encoded": [input_ascii]}) + text, generated_speech_ascii = resp.json()[0] + generated_speech = decodebytes(generated_speech_ascii.encode()) + + with open('/path/where/to/save/generated/wav/file', 'wb') as fout: + fout.write(generated_speech) + + print(f'Speech transcription is: {text}') + +.. warning:: + NeMo library v0.10.0 doesn't allow inference with a batch size greater than one without a compatible NVIDIA GPU. + +Model training +--------------- + +To get your own pre-trained checkpoints for NeMo modules, see the `Speech recognition `_ +and `Speech Synthesis `_ tutorials. A list of pre-trained models can be found +`here `_. \ No newline at end of file diff --git a/docs/index.rst b/docs/index.rst index 2f711a01a8..4b2edaec20 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -33,6 +33,7 @@ Welcome to DeepPavlov's documentation! Named Entity Recognition Neural Ranking Slot filling + Speech recognition and synthesis Spelling Correction Syntactic Parser TF-IDF Ranking diff --git a/requirements.txt b/requirements.txt index e291f03eb2..a2682276f0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,6 +13,7 @@ pymorphy2-dicts-ru pyopenssl==19.1.0 pytelegrambotapi==3.6.7 requests==2.22.0 +ruamel.yaml==0.15.100 rusenttokenize==0.0.5 scikit-learn==0.21.2 scipy==1.4.1 diff --git a/tests/test_configs/nemo/tts2asr_test.json b/tests/test_configs/nemo/tts2asr_test.json new file mode 100644 index 0000000000..a054239e8d --- /dev/null +++ b/tests/test_configs/nemo/tts2asr_test.json @@ -0,0 +1,53 @@ +{ + "chainer": { + "in": ["text"], + "pipe": [ + { + "class_name": "nemo_tts", + "nemo_params_path": "{TTS_PATH}/tacotron2_waveglow.yaml", + "load_path": "{TTS_PATH}", + "in": ["text"], + "out": ["speech"] + }, + { + "class_name": "bytesIO_decode_base64", + "in": ["speech"], + "out": ["ascii"] + }, + { + "class_name": "base64_encode_bytesIO", + "in": ["ascii"], + "out": ["speech_restored"] + }, + { + "class_name": "nemo_asr", + "nemo_params_path": "{NEMO_PATH}/quartznet15x5/quartznet15x5.yaml", + "load_path": "{NEMO_PATH}/quartznet15x5", + "in": ["speech_restored"], + "out": ["transcription"] + } + ], + "out": ["transcription"] + }, + "metadata": { + "variables": { + "ROOT_PATH": "~/.deeppavlov", + "NEMO_PATH": "{ROOT_PATH}/models/nemo", + "TTS_PATH": "{NEMO_PATH}/tacotron2_waveglow" + }, + "requirements": [ + "{DEEPPAVLOV_PATH}/requirements/nemo-asr.txt", + "{DEEPPAVLOV_PATH}/requirements/nemo-tts.txt" + ], + "download": [ + { + "url": "http://files.deeppavlov.ai/deeppavlov_data/nemo/tacotron2_waveglow.tar.gz", + "subdir": "{NEMO_PATH}" + }, + { + "url": "http://files.deeppavlov.ai/deeppavlov_data/nemo/quartznet15x5.tar.gz", + "subdir": "{NEMO_PATH}" + } + ] + } +} \ No newline at end of file diff --git a/tests/test_quick_start.py b/tests/test_quick_start.py index ea87cce053..09cc86a2a2 100644 --- a/tests/test_quick_start.py +++ b/tests/test_quick_start.py @@ -257,6 +257,9 @@ "syntax_tagger": { ("syntax/syntax_ru_syntagrus_bert.json", "syntax_ru_bert", ('IP', 'TI')): [ONE_ARGUMENT_INFER_CHECK], ("syntax/ru_syntagrus_joint_parsing.json", "syntax_ru_bert", ('IP',)): [ONE_ARGUMENT_INFER_CHECK] + }, + "nemo": { + ("nemo/tts2asr_test.json", "nemo", ('IP',)): [ONE_ARGUMENT_INFER_CHECK] + } }
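A minimal round-trip sketch for the base64 helpers this PR registers as ``base64_encode_bytesIO`` and ``bytesIO_decode_base64`` (assuming the nemo-asr requirements are installed; the byte string here is an illustrative stand-in for real WAV data, not an actual audio file):

```python
import base64

from deeppavlov.models.nemo.common import ascii_to_bytes_io, bytes_io_to_ascii

raw = b'RIFF\x00\x00\x00\x00WAVE'  # illustrative stand-in for real WAV bytes

# Client side: binary audio is base64-encoded before being sent as "speech_in_encoded".
ascii_payload = base64.encodebytes(raw).decode('ascii')

# Server side: 'base64_encode_bytesIO' decodes the payload into a BytesIO for nemo_asr.
speech_in = ascii_to_bytes_io(ascii_payload)
assert speech_in.read() == raw

# 'bytesIO_decode_base64' converts BytesIO output from nemo_tts back into an ASCII string.
speech_in.seek(0)
assert bytes_io_to_ascii(speech_in) == ascii_payload

# Both helpers recurse through nested lists, so whole batches round-trip as well.
assert ascii_to_bytes_io([ascii_payload])[0].read() == raw
```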