feat: add ASR and TTS integration (deeppavlov#1162)

* feat: bare minimum asr model * feat: added simple asr config file * feat: minimal asr model * refactor: asr config + file names * refactor: tts * feat: nemo interfaces * feat: interfaces refactor + full pipeline * feat: nemo test + api-friendly additional models * feat: refactor * refactor: docstrings, type annotations, redundant args * fix: refactor errors with params * feat: docs and small fixes * fix: docs build * feat: update NeMo to 0.10.0 * fix (fix/nemo_style): fix style (deeppavlov#1174) * fix (fix/nemo_style): fix style * feat: undone some changes Co-authored-by: Fedor Ignatov <[email protected]> * feat: NeMo quickstart examples * docs: Added NeMo file format info * refactor: typos, documentation and variables naming changes * fix: typo Co-authored-by: Kuznetsov Denis <[email protected]>
molodiuc · Apr 19, 2020 · 485ea8f · 485ea8f
1 parent a797a4a
commit 485ea8f
Show file tree

Hide file tree

Showing 20 changed files with 1,044 additions and 4 deletions.
diff --git a/deeppavlov/configs/nemo/asr.json b/deeppavlov/configs/nemo/asr.json
@@ -0,0 +1,29 @@
+{
+  "chainer": {
+    "in": "speech",
+    "pipe": [
+      {
+        "class_name": "nemo_asr",
+        "nemo_params_path": "{NEMO_PATH}/quartznet15x5/quartznet15x5.yaml",
+        "load_path": "{NEMO_PATH}/quartznet15x5",
+        "in": ["speech"],
+        "out": ["text"]
+      }
+    ],
+    "out": ["text"]
+  },
+  "metadata": {
+    "variables": {
+      "NEMO_PATH": "~/.deeppavlov/models/nemo"
+    },
+    "requirements": [
+      "{DEEPPAVLOV_PATH}/requirements/nemo-asr.txt"
+    ],
+    "download": [
+      {
+        "url": "http://files.deeppavlov.ai/deeppavlov_data/nemo/quartznet15x5.tar.gz",
+        "subdir": "{NEMO_PATH}"
+      }
+    ]
+  }
+}
diff --git a/deeppavlov/configs/nemo/asr_tts.json b/deeppavlov/configs/nemo/asr_tts.json
@@ -0,0 +1,52 @@
+{
+  "chainer": {
+    "in": "speech_in_encoded",
+    "pipe": [
+      {
+        "class_name": "base64_encode_bytesIO",
+        "in": ["speech_in_encoded"],
+        "out": ["speech_in"]
+      },
+      {
+        "class_name": "nemo_asr",
+        "nemo_params_path": "{NEMO_PATH}/quartznet15x5/quartznet15x5.yaml",
+        "load_path": "{NEMO_PATH}/quartznet15x5",
+        "in": ["speech_in"],
+        "out": ["text"]
+      },
+      {
+        "class_name": "nemo_tts",
+        "nemo_params_path": "{TTS_PATH}/tacotron2_waveglow.yaml",
+        "load_path": "{TTS_PATH}",
+        "in": ["text"],
+        "out": ["speech_out"]
+      },
+      {
+        "class_name": "bytesIO_decode_base64",
+        "in": ["speech_out"],
+        "out": ["speech_out_encoded"]
+      }
+    ],
+    "out": ["text", "speech_out_encoded"]
+  },
+  "metadata": {
+    "variables": {
+      "NEMO_PATH": "~/.deeppavlov/models/nemo",
+      "TTS_PATH": "{NEMO_PATH}/tacotron2_waveglow"
+    },
+    "requirements": [
+      "{DEEPPAVLOV_PATH}/requirements/nemo-asr.txt",
+      "{DEEPPAVLOV_PATH}/requirements/nemo-tts.txt"
+    ],
+    "download": [
+      {
+        "url": "http://files.deeppavlov.ai/deeppavlov_data/nemo/quartznet15x5.tar.gz",
+        "subdir": "{NEMO_PATH}"
+      },
+      {
+        "url": "http://files.deeppavlov.ai/deeppavlov_data/nemo/tacotron2_waveglow.tar.gz",
+        "subdir": "{NEMO_PATH}"
+      }
+    ]
+  }
+}
diff --git a/deeppavlov/configs/nemo/tts.json b/deeppavlov/configs/nemo/tts.json
@@ -0,0 +1,31 @@
+{
+  "chainer": {
+    "in": ["text", "filepath"],
+    "pipe": [
+      {
+        "class_name": "nemo_tts",
+        "nemo_params_path": "{TTS_PATH}/tacotron2_waveglow.yaml",
+        "load_path": "{TTS_PATH}",
+        "in": ["text", "filepath"],
+        "out": ["saved_path"]
+      }
+    ],
+    "out": ["saved_path"]
+  },
+  "metadata": {
+    "variables": {
+      "NEMO_PATH": "~/.deeppavlov/models/nemo",
+      "TTS_PATH": "{NEMO_PATH}/tacotron2_waveglow"
+    },
+    "requirements": [
+      "{DEEPPAVLOV_PATH}/requirements/nemo-asr.txt",
+      "{DEEPPAVLOV_PATH}/requirements/nemo-tts.txt"
+    ],
+    "download": [
+      {
+        "url": "http://files.deeppavlov.ai/deeppavlov_data/nemo/tacotron2_waveglow.tar.gz",
+        "subdir": "{NEMO_PATH}"
+      }
+    ]
+  }
+}
diff --git a/deeppavlov/core/common/file.py b/deeppavlov/core/common/file.py
@@ -19,6 +19,8 @@
 from pathlib import Path
 from typing import Union, Any
 
+from ruamel.yaml import YAML
+
 log = getLogger(__name__)
 
 
@@ -50,3 +52,9 @@ def save_pickle(data: dict, fpath: Union[str, Path]) -> None:
 def load_pickle(fpath: Union[str, Path]) -> Any:
     with open(fpath, 'rb') as fin:
         return pickle.load(fin)
+
+
+def read_yaml(fpath: Union[str, Path]) -> dict:
+    """Reads a UTF-8 encoded YAML file and returns its parsed content.
+
+    Args:
+        fpath: Path to the YAML file.
+
+    Returns:
+        Whatever the ``ruamel.yaml`` loader created with ``typ="safe"`` produces for the file.
+
+    """
+    yaml = YAML(typ="safe")
+    with open(fpath, encoding='utf8') as fin:
+        return yaml.load(fin)
diff --git a/deeppavlov/core/common/registry.json b/deeppavlov/core/common/registry.json
@@ -6,6 +6,7 @@
   "api_router": "deeppavlov.models.api_requester.api_router:ApiRouter",
   "basic_classification_iterator": "deeppavlov.dataset_iterators.basic_classification_iterator:BasicClassificationDatasetIterator",
   "basic_classification_reader": "deeppavlov.dataset_readers.basic_classification_reader:BasicClassificationDatasetReader",
+  "base64_encode_bytesIO": "deeppavlov.models.nemo.common:ascii_to_bytes_io",
   "bert_as_summarizer": "deeppavlov.models.bert.bert_as_summarizer:BertAsSummarizer",
   "bert_classifier": "deeppavlov.models.bert.bert_classifier:BertClassifierModel",
   "bert_ner_preprocessor": "deeppavlov.models.preprocessors.bert_preprocessor:BertNerPreprocessor",
@@ -22,6 +23,7 @@
   "bilstm_gru_nn": "deeppavlov.models.ranking.bilstm_gru_siamese_network:BiLSTMGRUSiameseNetwork",
   "bilstm_nn": "deeppavlov.models.ranking.bilstm_siamese_network:BiLSTMSiameseNetwork",
   "bow": "deeppavlov.models.embedders.bow_embedder:BoWEmbedder",
+  "bytesIO_decode_base64": "deeppavlov.models.nemo.common:bytes_io_to_ascii",
   "capitalization_featurizer": "deeppavlov.models.preprocessors.capitalization:CapitalizationPreprocessor",
   "char_splitter": "deeppavlov.models.preprocessors.char_splitter:CharSplitter",
   "char_splitting_lowercase_preprocessor": "deeppavlov.models.preprocessors.capitalization:CharSplittingLowercasePreprocessor",
@@ -85,6 +87,8 @@
   "multi_squad_dataset_reader": "deeppavlov.dataset_readers.squad_dataset_reader:MultiSquadDatasetReader",
   "multi_squad_iterator": "deeppavlov.dataset_iterators.squad_iterator:MultiSquadIterator",
   "multi_squad_retr_iterator": "deeppavlov.dataset_iterators.squad_iterator:MultiSquadRetrIterator",
+  "nemo_asr": "deeppavlov.models.nemo.asr:NeMoASR",
+  "nemo_tts": "deeppavlov.models.nemo.tts:NeMoTTS",
   "ner": "deeppavlov.models.ner.network:NerNetwork",
   "ner_bio_converter": "deeppavlov.models.ner.bio:BIOMarkupRestorer",
   "ner_few_shot_iterator": "deeppavlov.dataset_iterators.ner_few_shot_iterator:NERFewShotIterator",

diff --git a/deeppavlov/models/nemo/__init__.py b/deeppavlov/models/nemo/__init__.py
diff --git a/deeppavlov/models/nemo/asr.py b/deeppavlov/models/nemo/asr.py
@@ -0,0 +1,193 @@
+# Copyright 2020 Neural Networks and Deep Learning lab, MIPT
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+from io import BytesIO
+from pathlib import Path
+from typing import List, Optional, Tuple, Union, Dict
+
+import torch
+from nemo.collections.asr import AudioToMelSpectrogramPreprocessor, JasperEncoder, JasperDecoderForCTC, GreedyCTCDecoder
+from nemo.collections.asr.helpers import post_process_predictions
+from nemo.collections.asr.parts.features import WaveformFeaturizer
+from nemo.core.neural_types import AudioSignal, NeuralType, LengthsType
+from nemo.utils.decorators import add_port_docs
+from torch import Tensor
+from torch.utils.data import Dataset, DataLoader
+
+from deeppavlov.core.common.registry import register
+from deeppavlov.models.nemo.common import CustomDataLayerBase, NeMoBase
+
+log = logging.getLogger(__name__)
+
+
+class AudioInferDataset(Dataset):
+    """Dataset over an in-memory audio batch; each item is a (features, features_length) pair."""
+
+    def __init__(self, audio_batch: List[Union[str, BytesIO]], sample_rate: int, int_values: bool, trim=False) -> None:
+        """Dataset reader for AudioInferDataLayer.
+
+        Args:
+            audio_batch: Batch to be read. Elements could be either paths to audio files or Binary I/O objects.
+            sample_rate: Audio files sample rate. Passed to ``WaveformFeaturizer``.
+            int_values: If true, load samples as 32-bit integers. Passed to ``WaveformFeaturizer``.
+            trim: Trim leading and trailing silence from an audio signal if True.
+
+        """
+        self.audio_batch = audio_batch
+        # A single featurizer is created once and reused for every item of the batch.
+        self.featurizer = WaveformFeaturizer(sample_rate=sample_rate, int_values=int_values)
+        self.trim = trim
+
+    def __getitem__(self, index: int) -> Tuple[Tensor, Tensor]:
+        """Processes audio batch item and extracts features.
+
+        Args:
+            index: Audio batch item index.
+
+        Returns:
+            features: Audio file's extracted features tensor.
+            features_length: Features length tensor (size of the first dimension of ``features``, as int64).
+
+        """
+        sample = self.audio_batch[index]
+        features = self.featurizer.process(sample, trim=self.trim)
+        # The length is taken along dimension 0 and wrapped into a scalar long tensor.
+        features_length = torch.tensor(features.shape[0]).long()
+
+        return features, features_length
+
+    def __len__(self) -> int:
+        """Returns the number of elements in the audio batch."""
+        return len(self.audio_batch)
+
+
+class AudioInferDataLayer(CustomDataLayerBase):
+    """Data Layer for ASR pipeline inference."""
+
+    @property
+    @add_port_docs()
+    def output_ports(self) -> Dict[str, NeuralType]:
+        """Describes the two outputs of the layer: the audio signal batch and the signal lengths."""
+        return {
+            "audio_signal": NeuralType(('B', 'T'), AudioSignal(freq=self._sample_rate)),
+            "a_sig_length": NeuralType(tuple('B'), LengthsType())
+        }
+
+    def __init__(self, *,
+                 audio_batch: List[Union[str, BytesIO]],
+                 batch_size: int = 32,
+                 sample_rate: int = 16000,
+                 int_values: bool = False,
+                 trim_silence: bool = False,
+                 **kwargs) -> None:
+        """Initializes Data Loader.
+
+        Args:
+            audio_batch: Batch to be read. Elements could be either paths to audio files or Binary I/O objects.
+            batch_size: How many samples per batch to load.
+            sample_rate: Target sampling rate for data. Audio files will be resampled to sample_rate if
+                it is not already.
+            int_values: If true, load data as 32-bit integers.
+            trim_silence: Trim leading and trailing silence from an audio signal if True.
+            **kwargs: Extra keyword arguments forwarded to the ``CustomDataLayerBase`` initializer.
+
+        """
+        # Stored before the base initializer runs because ``output_ports`` reads it.
+        self._sample_rate = sample_rate
+
+        dataset = AudioInferDataset(audio_batch=audio_batch, sample_rate=sample_rate, int_values=int_values,
+                                    trim=trim_silence)
+
+        dataloader = DataLoader(dataset=dataset, batch_size=batch_size, collate_fn=self.seq_collate_fn)
+        super(AudioInferDataLayer, self).__init__(dataset, dataloader, **kwargs)
+
+    @staticmethod
+    def seq_collate_fn(batch: Tuple[Tuple[Tensor], Tuple[Tensor]]) -> Tuple[Optional[Tensor], Optional[Tensor]]:
+        """Collates batch of audio signal and audio length, zero pads audio signal.
+
+        Args:
+            batch: A sequence whose elements are (audio signal, signal length) pairs. This collate function
+                assumes the signals are 1d torch tensors (i.e. mono audio).
+
+        Returns:
+            audio_signal: Zero padded audio signal tensor, or None if the first length in the batch is None.
+            audio_length: Audio signal length tensor, or None if the first length in the batch is None.
+
+        """
+        # Transpose the list of pairs; only the lengths are needed at this point.
+        _, audio_lengths = zip(*batch)
+        max_audio_len = 0
+        # Only the first element is inspected to decide whether the batch carries audio at all.
+        has_audio = audio_lengths[0] is not None
+        if has_audio:
+            max_audio_len = max(audio_lengths).item()
+
+        audio_signal = []
+        for sig, sig_len in batch:
+            if has_audio:
+                sig_len = sig_len.item()
+                if sig_len < max_audio_len:
+                    # Pad on the right of the last dimension up to the longest signal in the batch.
+                    pad = (0, max_audio_len - sig_len)
+                    sig = torch.nn.functional.pad(sig, pad)
+                audio_signal.append(sig)
+
+        if has_audio:
+            # All signals now share the same length, so they can be stacked into one tensor.
+            audio_signal = torch.stack(audio_signal)
+            audio_lengths = torch.stack(audio_lengths)
+        else:
+            audio_signal, audio_lengths = None, None
+
+        return audio_signal, audio_lengths
+
+
+@register('nemo_asr')
+class NeMoASR(NeMoBase):
+    """ASR model on NeMo modules."""
+
+    def __init__(self, load_path: Union[str, Path], nemo_params_path: Union[str, Path], **kwargs) -> None:
+        """Initializes NeuralModules for ASR.
+
+        Args:
+            load_path: Path to a directory with pretrained checkpoints for JasperEncoder and JasperDecoderForCTC.
+            nemo_params_path: Path to a file containing labels and params for AudioToMelSpectrogramPreprocessor,
+                JasperEncoder, JasperDecoderForCTC and AudioInferDataLayer. The sections are read under the keys
+                'labels', 'AudioToMelSpectrogramPreprocessor', 'JasperEncoder', 'JasperDecoder' and
+                'AudioToTextDataLayer' respectively.
+            **kwargs: Extra keyword arguments forwarded to the ``NeMoBase`` initializer.
+
+        """
+        super(NeMoASR, self).__init__(load_path=load_path, nemo_params_path=nemo_params_path, **kwargs)
+
+        # NOTE(review): ``self.nemo_params`` is not set in this class; presumably NeMoBase fills it
+        # from ``nemo_params_path`` - confirm against deeppavlov.models.nemo.common.
+        self.labels = self.nemo_params['labels']
+
+        self.data_preprocessor = AudioToMelSpectrogramPreprocessor(
+            **self.nemo_params['AudioToMelSpectrogramPreprocessor']
+        )
+        self.jasper_encoder = JasperEncoder(**self.nemo_params['JasperEncoder'])
+        # The decoder gets one output class per label.
+        self.jasper_decoder = JasperDecoderForCTC(num_classes=len(self.labels), **self.nemo_params['JasperDecoder'])
+        self.greedy_decoder = GreedyCTCDecoder()
+        # Only the encoder and the decoder are listed for restoring from checkpoints.
+        self.modules_to_restore = [self.jasper_encoder, self.jasper_decoder]
+
+        # NOTE(review): ``load`` is inherited; presumably it restores ``modules_to_restore`` from ``load_path``.
+        self.load()
+
+    def __call__(self, audio_batch: List[Union[str, BytesIO]]) -> List[str]:
+        """Transcribes audio batch to text.
+
+        Args:
+            audio_batch: Batch to be transcribed. Elements could be either paths to audio files or Binary I/O objects.
+
+        Returns:
+            text_batch: Batch of transcripts.
+
+        """
+        # A new data layer is built on every call; its params come from the 'AudioToTextDataLayer' section.
+        data_layer = AudioInferDataLayer(audio_batch=audio_batch, **self.nemo_params['AudioToTextDataLayer'])
+        audio_signal, audio_signal_len = data_layer()
+        processed_signal, processed_signal_len = self.data_preprocessor(input_signal=audio_signal,
+                                                                        length=audio_signal_len)
+        encoded, encoded_len = self.jasper_encoder(audio_signal=processed_signal, length=processed_signal_len)
+        log_probs = self.jasper_decoder(encoder_output=encoded)
+        predictions = self.greedy_decoder(log_probs=log_probs)
+        eval_tensors = [predictions]
+        # NOTE(review): ``neural_factory`` is not set in this class; presumably provided by NeMoBase.
+        tensors = self.neural_factory.infer(tensors=eval_tensors)
+        # The first (and only) requested tensor holds the predictions, which are mapped to text via the labels.
+        text_batch = post_process_predictions(tensors[0], self.labels)
+
+        return text_batch