
Commit

feat: add ASR and TTS integration (deeppavlov#1162)
* feat: bare minimum asr model

* feat: added simple asr config file

* feat: minimal asr model

* refactor: asr config + file names

* refactor: tts

* feat: nemo interfaces

* feat: interfaces refactor + full pipeline

* feat: nemo test + api-friendly additional models

* feat: refactor

* refactor: docstrings, type annotations, redundant args

* fix: refactor errors with params

* feat: docs and small fixes

* fix: docs build

* feat: update NeMo to 0.10.0

* fix (fix/nemo_style): fix style (deeppavlov#1174)

* fix (fix/nemo_style): fix style

* feat: undo some changes

Co-authored-by: Fedor Ignatov <[email protected]>

* feat: NeMo quickstart examples

* docs: Added NeMo file format info

* refactor: typos, documentation and variables naming changes

* fix: typo

Co-authored-by: Kuznetsov Denis <[email protected]>
IgnatovFedor and kudep authored Apr 19, 2020
1 parent a797a4a commit 485ea8f
Showing 20 changed files with 1,044 additions and 4 deletions.
29 changes: 29 additions & 0 deletions deeppavlov/configs/nemo/asr.json
@@ -0,0 +1,29 @@
{
  "chainer": {
    "in": "speech",
    "pipe": [
      {
        "class_name": "nemo_asr",
        "nemo_params_path": "{NEMO_PATH}/quartznet15x5/quartznet15x5.yaml",
        "load_path": "{NEMO_PATH}/quartznet15x5",
        "in": ["speech"],
        "out": ["text"]
      }
    ],
    "out": ["text"]
  },
  "metadata": {
    "variables": {
      "NEMO_PATH": "~/.deeppavlov/models/nemo"
    },
    "requirements": [
      "{DEEPPAVLOV_PATH}/requirements/nemo-asr.txt"
    ],
    "download": [
      {
        "url": "http://files.deeppavlov.ai/deeppavlov_data/nemo/quartznet15x5.tar.gz",
        "subdir": "{NEMO_PATH}"
      }
    ]
  }
}
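
For quick reference, a minimal usage sketch for the asr.json config above, assuming the standard deeppavlov.build_model API; the WAV path is hypothetical:

from deeppavlov import build_model, configs

# download=True fetches the QuartzNet checkpoint listed in the config's "download" section.
asr = build_model(configs.nemo.asr, download=True)

# The pipeline takes a batch of audio inputs (file paths or file-like objects) and returns transcripts.
transcripts = asr(['/path/to/utterance.wav'])
print(transcripts[0])
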
52 changes: 52 additions & 0 deletions deeppavlov/configs/nemo/asr_tts.json
@@ -0,0 +1,52 @@
{
  "chainer": {
    "in": "speech_in_encoded",
    "pipe": [
      {
        "class_name": "base64_encode_bytesIO",
        "in": ["speech_in_encoded"],
        "out": ["speech_in"]
      },
      {
        "class_name": "nemo_asr",
        "nemo_params_path": "{NEMO_PATH}/quartznet15x5/quartznet15x5.yaml",
        "load_path": "{NEMO_PATH}/quartznet15x5",
        "in": ["speech_in"],
        "out": ["text"]
      },
      {
        "class_name": "nemo_tts",
        "nemo_params_path": "{TTS_PATH}/tacotron2_waveglow.yaml",
        "load_path": "{TTS_PATH}",
        "in": ["text"],
        "out": ["speech_out"]
      },
      {
        "class_name": "bytesIO_decode_base64",
        "in": ["speech_out"],
        "out": ["speech_out_encoded"]
      }
    ],
    "out": ["text", "speech_out_encoded"]
  },
  "metadata": {
    "variables": {
      "NEMO_PATH": "~/.deeppavlov/models/nemo",
      "TTS_PATH": "{NEMO_PATH}/tacotron2_waveglow"
    },
    "requirements": [
      "{DEEPPAVLOV_PATH}/requirements/nemo-asr.txt",
      "{DEEPPAVLOV_PATH}/requirements/nemo-tts.txt"
    ],
    "download": [
      {
        "url": "http://files.deeppavlov.ai/deeppavlov_data/nemo/quartznet15x5.tar.gz",
        "subdir": "{NEMO_PATH}"
      },
      {
        "url": "http://files.deeppavlov.ai/deeppavlov_data/nemo/tacotron2_waveglow.tar.gz",
        "subdir": "{NEMO_PATH}"
      }
    ]
  }
}
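
The asr_tts pipeline exchanges audio as Base64-encoded ASCII strings, which keeps it usable over a text-based API. A sketch of one round trip, assuming deeppavlov.build_model and the standard-library base64 helpers; the WAV paths are hypothetical:

from base64 import encodebytes, decodebytes

from deeppavlov import build_model, configs

model = build_model(configs.nemo.asr_tts, download=True)

# Encode the input audio to a Base64 ASCII string, as expected by the base64_encode_bytesIO component.
with open('/path/to/question.wav', 'rb') as fin:
    speech_in_encoded = encodebytes(fin.read()).decode('ascii')

# The pipeline returns the recognized text and the re-synthesized speech, again Base64-encoded.
text, speech_out_encoded = model([speech_in_encoded])

with open('/path/to/answer.wav', 'wb') as fout:
    fout.write(decodebytes(speech_out_encoded[0].encode('ascii')))

print(text[0])
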
31 changes: 31 additions & 0 deletions deeppavlov/configs/nemo/tts.json
@@ -0,0 +1,31 @@
{
  "chainer": {
    "in": ["text", "filepath"],
    "pipe": [
      {
        "class_name": "nemo_tts",
        "nemo_params_path": "{TTS_PATH}/tacotron2_waveglow.yaml",
        "load_path": "{TTS_PATH}",
        "in": ["text", "filepath"],
        "out": ["saved_path"]
      }
    ],
    "out": ["saved_path"]
  },
  "metadata": {
    "variables": {
      "NEMO_PATH": "~/.deeppavlov/models/nemo",
      "TTS_PATH": "{NEMO_PATH}/tacotron2_waveglow"
    },
    "requirements": [
      "{DEEPPAVLOV_PATH}/requirements/nemo-asr.txt",
      "{DEEPPAVLOV_PATH}/requirements/nemo-tts.txt"
    ],
    "download": [
      {
        "url": "http://files.deeppavlov.ai/deeppavlov_data/nemo/tacotron2_waveglow.tar.gz",
        "subdir": "{NEMO_PATH}"
      }
    ]
  }
}
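
A minimal sketch of synthesizing speech to a file with the tts.json config; the text and output path are hypothetical:

from deeppavlov import build_model, configs

tts = build_model(configs.nemo.tts, download=True)

# The pipeline takes a batch of texts and a batch of target file paths and returns the saved paths.
saved_paths = tts(['Hello, DeepPavlov!'], ['/tmp/hello.wav'])
print(saved_paths[0])
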
8 changes: 8 additions & 0 deletions deeppavlov/core/common/file.py
@@ -19,6 +19,8 @@
from pathlib import Path
from typing import Union, Any

from ruamel.yaml import YAML

log = getLogger(__name__)


@@ -50,3 +52,9 @@ def save_pickle(data: dict, fpath: Union[str, Path]) -> None:
def load_pickle(fpath: Union[str, Path]) -> Any:
    with open(fpath, 'rb') as fin:
        return pickle.load(fin)


def read_yaml(fpath: Union[str, Path]) -> dict:
    yaml = YAML(typ="safe")
    with open(fpath, encoding='utf8') as fin:
        return yaml.load(fin)
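
An illustrative call to the new read_yaml helper, e.g. for loading the QuartzNet parameters file shipped with the ASR checkpoint (the path below is the default download location and may differ):

from pathlib import Path

from deeppavlov.core.common.file import read_yaml

params_path = Path('~/.deeppavlov/models/nemo/quartznet15x5/quartznet15x5.yaml').expanduser()
nemo_params = read_yaml(params_path)
print(list(nemo_params))  # top-level keys such as 'labels' and per-module configs
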
4 changes: 4 additions & 0 deletions deeppavlov/core/common/registry.json
@@ -6,6 +6,7 @@
"api_router": "deeppavlov.models.api_requester.api_router:ApiRouter",
"basic_classification_iterator": "deeppavlov.dataset_iterators.basic_classification_iterator:BasicClassificationDatasetIterator",
"basic_classification_reader": "deeppavlov.dataset_readers.basic_classification_reader:BasicClassificationDatasetReader",
"base64_encode_bytesIO": "deeppavlov.models.nemo.common:ascii_to_bytes_io",
"bert_as_summarizer": "deeppavlov.models.bert.bert_as_summarizer:BertAsSummarizer",
"bert_classifier": "deeppavlov.models.bert.bert_classifier:BertClassifierModel",
"bert_ner_preprocessor": "deeppavlov.models.preprocessors.bert_preprocessor:BertNerPreprocessor",
@@ -22,6 +23,7 @@
"bilstm_gru_nn": "deeppavlov.models.ranking.bilstm_gru_siamese_network:BiLSTMGRUSiameseNetwork",
"bilstm_nn": "deeppavlov.models.ranking.bilstm_siamese_network:BiLSTMSiameseNetwork",
"bow": "deeppavlov.models.embedders.bow_embedder:BoWEmbedder",
"bytesIO_decode_base64": "deeppavlov.models.nemo.common:bytes_io_to_ascii",
"capitalization_featurizer": "deeppavlov.models.preprocessors.capitalization:CapitalizationPreprocessor",
"char_splitter": "deeppavlov.models.preprocessors.char_splitter:CharSplitter",
"char_splitting_lowercase_preprocessor": "deeppavlov.models.preprocessors.capitalization:CharSplittingLowercasePreprocessor",
@@ -85,6 +87,8 @@
"multi_squad_dataset_reader": "deeppavlov.dataset_readers.squad_dataset_reader:MultiSquadDatasetReader",
"multi_squad_iterator": "deeppavlov.dataset_iterators.squad_iterator:MultiSquadIterator",
"multi_squad_retr_iterator": "deeppavlov.dataset_iterators.squad_iterator:MultiSquadRetrIterator",
"nemo_asr": "deeppavlov.models.nemo.asr:NeMoASR",
"nemo_tts": "deeppavlov.models.nemo.tts:NeMoTTS",
"ner": "deeppavlov.models.ner.network:NerNetwork",
"ner_bio_converter": "deeppavlov.models.ner.bio:BIOMarkupRestorer",
"ner_few_shot_iterator": "deeppavlov.dataset_iterators.ner_few_shot_iterator:NERFewShotIterator",
Empty file.
193 changes: 193 additions & 0 deletions deeppavlov/models/nemo/asr.py
@@ -0,0 +1,193 @@
# Copyright 2020 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
from io import BytesIO
from pathlib import Path
from typing import List, Optional, Tuple, Union, Dict

import torch
from nemo.collections.asr import AudioToMelSpectrogramPreprocessor, JasperEncoder, JasperDecoderForCTC, GreedyCTCDecoder
from nemo.collections.asr.helpers import post_process_predictions
from nemo.collections.asr.parts.features import WaveformFeaturizer
from nemo.core.neural_types import AudioSignal, NeuralType, LengthsType
from nemo.utils.decorators import add_port_docs
from torch import Tensor
from torch.utils.data import Dataset, DataLoader

from deeppavlov.core.common.registry import register
from deeppavlov.models.nemo.common import CustomDataLayerBase, NeMoBase

log = logging.getLogger(__name__)


class AudioInferDataset(Dataset):
    def __init__(self, audio_batch: List[Union[str, BytesIO]], sample_rate: int, int_values: bool, trim=False) -> None:
        """Dataset reader for AudioInferDataLayer.
        Args:
            audio_batch: Batch to be read. Elements could be either paths to audio files or Binary I/O objects.
            sample_rate: Audio files sample rate.
            int_values: If true, load samples as 32-bit integers.
            trim: Trim leading and trailing silence from an audio signal if True.
        """
        self.audio_batch = audio_batch
        self.featurizer = WaveformFeaturizer(sample_rate=sample_rate, int_values=int_values)
        self.trim = trim

    def __getitem__(self, index: int) -> Tuple[Tensor, Tensor]:
        """Processes audio batch item and extracts features.
        Args:
            index: Audio batch item index.
        Returns:
            features: Audio file's extracted features tensor.
            features_length: Features length tensor.
        """
        sample = self.audio_batch[index]
        features = self.featurizer.process(sample, trim=self.trim)
        features_length = torch.tensor(features.shape[0]).long()

        return features, features_length

    def __len__(self) -> int:
        return len(self.audio_batch)


class AudioInferDataLayer(CustomDataLayerBase):
    """Data Layer for ASR pipeline inference."""

    @property
    @add_port_docs()
    def output_ports(self) -> Dict[str, NeuralType]:
        return {
            "audio_signal": NeuralType(('B', 'T'), AudioSignal(freq=self._sample_rate)),
            "a_sig_length": NeuralType(tuple('B'), LengthsType())
        }

    def __init__(self, *,
                 audio_batch: List[Union[str, BytesIO]],
                 batch_size: int = 32,
                 sample_rate: int = 16000,
                 int_values: bool = False,
                 trim_silence: bool = False,
                 **kwargs) -> None:
        """Initializes Data Loader.
        Args:
            audio_batch: Batch to be read. Elements could be either paths to audio files or Binary I/O objects.
            batch_size: How many samples per batch to load.
            sample_rate: Target sampling rate for data. Audio files will be resampled to sample_rate if
                it is not already.
            int_values: If true, load data as 32-bit integers.
            trim_silence: Trim leading and trailing silence from an audio signal if True.
        """
        self._sample_rate = sample_rate

        dataset = AudioInferDataset(audio_batch=audio_batch, sample_rate=sample_rate, int_values=int_values,
                                    trim=trim_silence)

        dataloader = DataLoader(dataset=dataset, batch_size=batch_size, collate_fn=self.seq_collate_fn)
        super(AudioInferDataLayer, self).__init__(dataset, dataloader, **kwargs)

    @staticmethod
    def seq_collate_fn(batch: Tuple[Tuple[Tensor], Tuple[Tensor]]) -> Tuple[Optional[Tensor], Optional[Tensor]]:
"""Collates batch of audio signal and audio length, zero pads audio signal.
        Args:
            batch: A tuple of tuples of audio signals and signal lengths. This collate function assumes the signals
                are 1d torch tensors (i.e. mono audio).
        Returns:
            audio_signal: Zero padded audio signal tensor.
            audio_length: Audio signal length tensor.
        """
        _, audio_lengths = zip(*batch)
        max_audio_len = 0
        has_audio = audio_lengths[0] is not None
        if has_audio:
            max_audio_len = max(audio_lengths).item()

        audio_signal = []
        for sig, sig_len in batch:
            if has_audio:
                sig_len = sig_len.item()
                if sig_len < max_audio_len:
                    pad = (0, max_audio_len - sig_len)
                    sig = torch.nn.functional.pad(sig, pad)
                audio_signal.append(sig)

        if has_audio:
            audio_signal = torch.stack(audio_signal)
            audio_lengths = torch.stack(audio_lengths)
        else:
            audio_signal, audio_lengths = None, None

        return audio_signal, audio_lengths


@register('nemo_asr')
class NeMoASR(NeMoBase):
    """ASR model on NeMo modules."""

    def __init__(self, load_path: Union[str, Path], nemo_params_path: Union[str, Path], **kwargs) -> None:
        """Initializes NeuralModules for ASR.
        Args:
            load_path: Path to a directory with pretrained checkpoints for JasperEncoder and JasperDecoderForCTC.
            nemo_params_path: Path to a file containing labels and params for AudioToMelSpectrogramPreprocessor,
                JasperEncoder, JasperDecoderForCTC and AudioInferDataLayer.
"""
super(NeMoASR, self).__init__(load_path=load_path, nemo_params_path=nemo_params_path, **kwargs)

self.labels = self.nemo_params['labels']

self.data_preprocessor = AudioToMelSpectrogramPreprocessor(
**self.nemo_params['AudioToMelSpectrogramPreprocessor']
)
self.jasper_encoder = JasperEncoder(**self.nemo_params['JasperEncoder'])
self.jasper_decoder = JasperDecoderForCTC(num_classes=len(self.labels), **self.nemo_params['JasperDecoder'])
self.greedy_decoder = GreedyCTCDecoder()
self.modules_to_restore = [self.jasper_encoder, self.jasper_decoder]

self.load()

def __call__(self, audio_batch: List[Union[str, BytesIO]]) -> List[str]:
"""Transcripts audio batch to text.
        Args:
            audio_batch: Batch to be transcribed. Elements could be either paths to audio files or Binary I/O objects.
        Returns:
            text_batch: Batch of transcripts.
        """
        data_layer = AudioInferDataLayer(audio_batch=audio_batch, **self.nemo_params['AudioToTextDataLayer'])
        audio_signal, audio_signal_len = data_layer()
        processed_signal, processed_signal_len = self.data_preprocessor(input_signal=audio_signal,
                                                                        length=audio_signal_len)
        encoded, encoded_len = self.jasper_encoder(audio_signal=processed_signal, length=processed_signal_len)
        log_probs = self.jasper_decoder(encoder_output=encoded)
        predictions = self.greedy_decoder(log_probs=log_probs)
        eval_tensors = [predictions]
        tensors = self.neural_factory.infer(tensors=eval_tensors)
        text_batch = post_process_predictions(tensors[0], self.labels)

        return text_batch
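
A direct-use sketch of NeMoASR outside a config pipeline, assuming the QuartzNet checkpoints and YAML parameters file have already been downloaded and that NeMoBase sets up the NeMo neural factory internally; all paths are hypothetical:

from deeppavlov.models.nemo.asr import NeMoASR

asr = NeMoASR(load_path='/path/to/quartznet15x5',
              nemo_params_path='/path/to/quartznet15x5/quartznet15x5.yaml')

transcripts = asr(['/path/to/utterance.wav'])
print(transcripts[0])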