Merge branch 'r2.0.0rc0' of github.com:NVIDIA/NeMo into r2.0.0rc0

ericharper committed Jun 5, 2024
2 parents 8b65e3e + d02bb32 commit 265bd73
Showing 123 changed files with 1,896 additions and 8,913 deletions.
1,114 changes: 45 additions & 1,069 deletions .github/workflows/cicd-main.yml

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions README.rst
@@ -247,7 +247,7 @@ Use this installation mode if you want the latest released version.
     .. code-block:: bash
 
         apt-get update && apt-get install -y libsndfile1 ffmpeg
-        pip install Cython
+        pip install Cython packaging
         pip install nemo_toolkit['all']
 
 Depending on the shell used, you may need to use ``"nemo_toolkit[all]"`` instead in the above command.
@@ -272,7 +272,7 @@ Use this installation mode if you want the version from a particular GitHub branch
     .. code-block:: bash
 
         apt-get update && apt-get install -y libsndfile1 ffmpeg
-        pip install Cython
+        pip install Cython packaging
         python -m pip install git+https://github.com/NVIDIA/NeMo.git@{BRANCH}#egg=nemo_toolkit[all]
@@ -310,7 +310,7 @@ To install NeMo on Mac with Apple M-Series GPU:
     conda install -c conda-forge pynini
 
     # install Cython manually
-    pip install cython
+    pip install cython packaging
 
     # clone the repo and install in development mode
     git clone https://github.com/NVIDIA/NeMo
@@ -80,6 +80,7 @@ model:
     feat_out: -1 # you may set it if you need different output size other than the default d_model
     n_layers: 17
     d_model: 512
+    use_bias: True # whether to apply bias in the feedforward, MHA and convolution modules
 
     # Sub-sampling parameters
     subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding
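The new `use_bias` flag (added here and, with the same one-line change, in the parallel encoder configs below) defaults to `True`, which preserves the previous behaviour; setting it to `False` drops the additive bias terms from the encoder's feed-forward, attention, and convolution layers. A minimal sketch of how such a flag typically threads through a feed-forward block — illustrative only, not NeMo's actual module code:

```python
import torch
import torch.nn as nn


class FeedForward(nn.Module):
    """Conformer-style feed-forward block with a configurable bias switch."""

    def __init__(self, d_model: int = 512, d_ff: int = 2048, use_bias: bool = True):
        super().__init__()
        # `use_bias` is forwarded to every parameterized layer in the block.
        self.linear1 = nn.Linear(d_model, d_ff, bias=use_bias)
        self.activation = nn.SiLU()
        self.linear2 = nn.Linear(d_ff, d_model, bias=use_bias)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.linear2(self.activation(self.linear1(x)))


ff = FeedForward(use_bias=False)
print(any(name.endswith("bias") for name, _ in ff.named_parameters()))  # False
```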
@@ -78,6 +78,7 @@ model:
     feat_out: -1 # you may set it if you need different output size other than the default d_model
     n_layers: 17
     d_model: 512
+    use_bias: True # whether to apply bias in the feedforward, MHA and convolution modules
 
     # Sub-sampling params
     subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding
@@ -85,6 +85,7 @@ model:
     feat_out: -1 # you may set it if you need different output size other than the default d_model
     n_layers: 17
     d_model: 512
+    use_bias: True # whether to apply bias in the feedforward, MHA and convolution modules
 
     # Sub-sampling parameters
     subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding
@@ -84,6 +84,7 @@ model:
     feat_out: -1 # you may set it if you need different output size other than the default d_model
     n_layers: 17
     d_model: 512
+    use_bias: True # whether to apply bias in the feedforward, MHA and convolution modules
 
     # Sub-sampling params
     subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding
@@ -90,6 +90,7 @@ model:
     feat_out: -1 # you may set it if you need different output size other than the default d_model
     n_layers: 17
     d_model: 512
+    use_bias: True # whether to apply bias in the feedforward, MHA and convolution modules
 
     # Sub-sampling parameters
     subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding
@@ -88,6 +88,7 @@ model:
     feat_out: -1 # you may set it if you need different output size other than the default d_model
     n_layers: 17
     d_model: 512
+    use_bias: True # whether to apply bias in the feedforward, MHA and convolution modules
 
     # Sub-sampling params
     subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding
@@ -87,6 +87,7 @@ model:
     feat_out: -1 # you may set it if you need different output size other than the default d_model
     n_layers: 17
     d_model: 512
+    use_bias: True # whether to apply bias in the feedforward, MHA and convolution modules
 
     # Sub-sampling parameters
     subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding
@@ -85,6 +85,7 @@ model:
     feat_out: -1 # you may set it if you need different output size other than the default d_model
     n_layers: 17
     d_model: 512
+    use_bias: True # whether to apply bias in the feedforward, MHA and convolution modules
 
     # Sub-sampling params
     subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding
@@ -88,6 +88,7 @@ model:
     feat_out: -1 # you may set it if you need different output size other than the default d_model
     n_layers: 18
     d_model: 512
+    use_bias: True # whether to apply bias in the feedforward, MHA and convolution modules
 
     # Sub-sampling params
     subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding
@@ -90,6 +90,7 @@ model:
     feat_out: -1 # you may set it if you need different output size other than the default d_model
     n_layers: 17
     d_model: 512
+    use_bias: True # whether to apply bias in the feedforward, MHA and convolution modules
 
     # Sub-sampling parameters
     subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding
1 change: 1 addition & 0 deletions examples/asr/conf/ssl/fastconformer/fast-conformer.yaml
@@ -79,6 +79,7 @@ model:
     feat_out: -1 # you may set it if you need different output size other than the default d_model
     n_layers: 17
     d_model: 512
+    use_bias: True # whether to apply bias in the feedforward, MHA and convolution modules
 
     # Sub-sampling params
     subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding
4 changes: 1 addition & 3 deletions examples/asr/transcribe_speech.py
@@ -163,9 +163,7 @@ class TranscriptionConfig:
 
     # Decoding strategy for RNNT models
     # enable CUDA graphs for transcription
-    rnnt_decoding: RNNTDecodingConfig = RNNTDecodingConfig(
-        fused_batch_size=-1, greedy=GreedyBatchedRNNTInferConfig(use_cuda_graph_decoder=True)
-    )
+    rnnt_decoding: RNNTDecodingConfig = RNNTDecodingConfig(fused_batch_size=-1)
 
     # Decoding strategy for AED models
     multitask_decoding: MultiTaskDecodingConfig = MultiTaskDecodingConfig()
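With this change the CUDA graph decoder is no longer switched on by the script's default config; transcription falls back to whatever `GreedyBatchedRNNTInferConfig` defaults to. If you relied on the old behaviour, it can still be requested explicitly — a hedged sketch (the import paths are assumptions inferred from the identifiers in the diff):

```python
from nemo.collections.asr.parts.submodules.rnnt_decoding import RNNTDecodingConfig
from nemo.collections.asr.parts.submodules.rnnt_greedy_decoding import GreedyBatchedRNNTInferConfig

# Reconstruct the previous default: fused batching off, CUDA graph decoder on.
rnnt_decoding = RNNTDecodingConfig(
    fused_batch_size=-1,
    greedy=GreedyBatchedRNNTInferConfig(use_cuda_graph_decoder=True),
)
```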
6 changes: 2 additions & 4 deletions examples/asr/transcribe_speech_parallel.py
@@ -101,10 +101,8 @@ class ParallelTranscriptionConfig:
     use_cer: bool = False
 
     # decoding strategy for RNNT models
-    # enable CUDA graphs for transcription
-    rnnt_decoding: RNNTDecodingConfig = RNNTDecodingConfig(
-        fused_batch_size=-1, greedy=GreedyBatchedRNNTInferConfig(use_cuda_graph_decoder=True)
-    )
+    # Double check whether fused_batch_size=-1 is right
+    rnnt_decoding: RNNTDecodingConfig = RNNTDecodingConfig(fused_batch_size=-1)
 
     # decoder type: ctc or rnnt, can be used to switch between CTC and RNNT decoder for Hybrid RNNT/CTC models
     decoder_type: Optional[str] = None
8 changes: 4 additions & 4 deletions examples/nlp/language_modeling/conf/megatron_bert_config.yaml
@@ -5,7 +5,7 @@ trainer:
   devices: 1
   num_nodes: 1
   accelerator: gpu
-  precision: 16
+  precision: bf16
   logger: False # logger provided by exp_manager
   enable_checkpointing: False
   use_distributed_sampler: False
@@ -41,7 +41,7 @@ exp_manager:
 
 model:
   # model parallelism
-  mcore_bert: False
+  mcore_bert: True
   micro_batch_size: 4
   global_batch_size: 8
   tensor_model_parallel_size: 1
@@ -85,7 +85,7 @@ model:
   fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16
 
   # Megatron O2-style half-precision
-  megatron_amp_O2: False # Enable O2-level automatic mixed precision using main parameters
+  megatron_amp_O2: True # Enable O2-level automatic mixed precision using main parameters
   grad_allreduce_chunk_size_mb: 125
   grad_div_ar_fusion: False
@@ -158,4 +158,4 @@ model:
       name: CosineAnnealing
       warmup_steps: 500
       constant_steps: 50000
-      min_lr: 2e-5
\ No newline at end of file
+      min_lr: 2e-5
6 changes: 3 additions & 3 deletions examples/nlp/language_modeling/conf/megatron_gpt_config.yaml
@@ -9,7 +9,7 @@ trainer:
   devices: 1
   num_nodes: 1
   accelerator: gpu
-  precision: 16
+  precision: bf16
   logger: False # logger provided by exp_manager
   enable_checkpointing: False
   use_distributed_sampler: False
@@ -55,7 +55,7 @@ exp_manager:
 
 model:
   # use GPTModel from megatron.core
-  mcore_gpt: False
+  mcore_gpt: True
 
   # specify micro_batch_size, global_batch_size, and model parallelism
   # gradient accumulation will be done automatically based on data_parallel_size
@@ -120,7 +120,7 @@ model:
   fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16
 
   # Megatron O2-style half-precision
-  megatron_amp_O2: False # Enable O2-level automatic mixed precision using main parameters
+  megatron_amp_O2: True # Enable O2-level automatic mixed precision using main parameters
   grad_allreduce_chunk_size_mb: 125
 
   # Fusion
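Taken together, the two Megatron config changes flip three defaults: autocast precision 16 → bf16, the legacy model path → megatron.core (`mcore_bert` / `mcore_gpt`), and O1 → O2 mixed precision. If a run needs the old behaviour, overriding at load time is straightforward — a sketch using OmegaConf against the GPT config named above (the BERT config takes the analogous `mcore_bert` override):

```python
from omegaconf import OmegaConf

# Restore the pre-change defaults for a GPT pretraining run.
cfg = OmegaConf.load("examples/nlp/language_modeling/conf/megatron_gpt_config.yaml")
cfg.trainer.precision = 16         # back to fp16 autocast
cfg.model.mcore_gpt = False        # back to the legacy (non-mcore) GPT path
cfg.model.megatron_amp_O2 = False  # back to O1-style mixed precision
```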
23 changes: 15 additions & 8 deletions nemo/collections/asr/losses/ssl_losses/contrastive.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from math import ceil
+
 import torch
 import torch.nn.functional as F
 from torch import nn
@@ -25,8 +27,7 @@
 class ContrastiveLoss(Loss):
     @property
     def input_types(self):
-        """Input types definitions for Contrastive.
-        """
+        """Input types definitions for Contrastive."""
         return {
             "spectrograms": NeuralType(("B", "D", "T"), SpectrogramType()),
             "spec_masks": NeuralType(("B", "D", "T"), SpectrogramType()),
@@ -147,13 +148,17 @@ def sample_negatives(self, y, num):
 
     @typecheck()
     def forward(self, spectrograms, spec_masks, decoder_outputs, decoder_lengths=None):
-        spec_in = spectrograms.transpose(-2, -1)
+        targets = spectrograms.transpose(-2, -1)
         masks = spec_masks.transpose(-2, -1)
-        targets = spec_in
         # BxTxC
+        diff = int(ceil(targets.shape[1] / decoder_outputs.shape[1]) * decoder_outputs.shape[1]) - targets.shape[1]
+
+        if diff > 0:
+            targets = F.pad(targets, (0, 0, 0, diff))
+            masks = F.pad(masks, (0, 0, 0, diff))
+
-        targets = targets.reshape(targets.shape[0], targets.shape[1] // self.combine_time_steps, -1)
-        masks = masks.reshape(targets.shape[0], targets.shape[1], -1)
+        targets = targets.reshape(targets.shape[0], decoder_outputs.shape[1], -1)
+        masks = masks.reshape(targets.shape[0], decoder_outputs.shape[1], -1)
 
         if self.quantized_targets:
             if self.store_ids:
@@ -198,7 +203,8 @@ def forward(self, spectrograms, spec_masks, decoder_outputs, decoder_lengths=None):
             if self.sample_from_non_masked:
                 # sample from all steps in utterance
                 negatives, _ = self.sample_negatives(
-                    targets.transpose(0, 1), targets_masked_only.size(0), # TxBxC # T'
+                    targets.transpose(0, 1),
+                    targets_masked_only.size(0),  # TxBxC # T'
                 )
             else:
                 # only sample from masked steps in utterance
@@ -239,7 +245,8 @@ def forward(self, spectrograms, spec_masks, decoder_outputs, decoder_lengths=None):
             elif self.sample_from_non_masked:
                 # sample from all steps in batch
                 negatives, _ = self.sample_negatives(
-                    targets.reshape(targets.shape[0] * targets.shape[1], -1), targets_masked_only.size(0), # BTxC
+                    targets.reshape(targets.shape[0] * targets.shape[1], -1),
+                    targets_masked_only.size(0),  # BTxC
                 )  # T'
             else:
                 # only sample from masked steps
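The reworked `forward` above stops assuming the target length is divisible by `combine_time_steps`; instead it rounds the time dimension up to a multiple of the decoder's output length and zero-pads the tail before grouping. A standalone toy check of that arithmetic (not NeMo code; shapes chosen for illustration):

```python
from math import ceil

import torch
import torch.nn.functional as F

targets = torch.randn(2, 103, 80)  # B x T x C spectrogram targets
decoder_T = 26                     # decoder output time steps

# Round T up to the nearest multiple of decoder_T, then zero-pad the tail.
diff = int(ceil(targets.shape[1] / decoder_T) * decoder_T) - targets.shape[1]
if diff > 0:
    targets = F.pad(targets, (0, 0, 0, diff))  # pad the time dimension only

# Now T == 104 == 4 * 26, so grouping into decoder_T steps is exact.
grouped = targets.reshape(targets.shape[0], decoder_T, -1)
print(grouped.shape)  # torch.Size([2, 26, 320])
```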
24 changes: 14 additions & 10 deletions nemo/collections/asr/models/clustering_diarizer.py
@@ -74,10 +74,10 @@ def get_available_model_names(class_name):
 
 class ClusteringDiarizer(torch.nn.Module, Model, DiarizationMixin):
     """
-    Inference model Class for offline speaker diarization.
-    This class handles required functionality for diarization : Speech Activity Detection, Segmentation,
-    Extract Embeddings, Clustering, Resegmentation and Scoring.
-    All the parameters are passed through config file
+    Inference model Class for offline speaker diarization.
+    This class handles required functionality for diarization : Speech Activity Detection, Segmentation,
+    Extract Embeddings, Clustering, Resegmentation and Scoring.
+    All the parameters are passed through config file
     """
 
     def __init__(self, cfg: Union[DictConfig, Any], speaker_model=None):
@@ -137,7 +137,10 @@ def _init_speaker_model(self, speaker_model=None):
         Initialize speaker embedding model with model name or path passed through config
         """
         if speaker_model is not None:
-            self._speaker_model = speaker_model
+            if self._cfg.device is None and torch.cuda.is_available():
+                self._speaker_model = speaker_model.to(torch.device('cuda'))
+            else:
+                self._speaker_model = speaker_model
         else:
             model_path = self._cfg.diarizer.speaker_embeddings.model_path
             if model_path is not None and model_path.endswith('.nemo'):
@@ -158,7 +161,6 @@ def _init_speaker_model(self, speaker_model=None):
                 self._speaker_model = EncDecSpeakerLabelModel.from_pretrained(
                     model_name=model_path, map_location=self._cfg.device
                 )
-
         self.multiscale_args_dict = parse_scale_configs(
             self._diarizer_params.speaker_embeddings.parameters.window_length_in_sec,
             self._diarizer_params.speaker_embeddings.parameters.shift_length_in_sec,
@@ -171,7 +173,9 @@ def _setup_vad_test_data(self, manifest_vad_input):
             'sample_rate': self._cfg.sample_rate,
             'batch_size': self._cfg.get('batch_size'),
             'vad_stream': True,
-            'labels': ['infer',],
+            'labels': [
+                'infer',
+            ],
             'window_length_in_sec': self._vad_window_length_in_sec,
             'shift_length_in_sec': self._vad_shift_length_in_sec,
             'trim_silence': False,
@@ -192,8 +196,8 @@ def _setup_spkr_test_data(self, manifest_file):
 
     def _run_vad(self, manifest_file):
         """
-        Run voice activity detection.
-        Get log probability of voice activity detection and smoothes using the post processing parameters.
+        Run voice activity detection.
+        Get log probability of voice activity detection and smoothes using the post processing parameters.
         Using generated frame level predictions generated manifest file for later speaker embedding extraction.
         input:
             manifest_file (str) : Manifest file containing path to audio file and label as infer
@@ -338,7 +342,7 @@ def _perform_speech_activity_detection(self):
     def _extract_embeddings(self, manifest_file: str, scale_idx: int, num_scales: int):
         """
         This method extracts speaker embeddings from segments passed through manifest_file
-        Optionally you may save the intermediate speaker embeddings for debugging or any use.
+        Optionally you may save the intermediate speaker embeddings for debugging or any use.
         """
         logging.info("Extracting embeddings for Diarization")
         self._setup_spkr_test_data(manifest_file)
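One behavioural note on the `_init_speaker_model` change in this file: a speaker model passed in by the caller is now moved to CUDA when no device is pinned in the config and a GPU is available. A hedged usage sketch — the config file name is a hypothetical local copy, and a real diarizer config carries many more fields than shown:

```python
from omegaconf import OmegaConf

from nemo.collections.asr.models import ClusteringDiarizer, EncDecSpeakerLabelModel

cfg = OmegaConf.load("diar_infer_telephonic.yaml")  # hypothetical local config copy
cfg.device = None  # leave the device unset on purpose

speaker_model = EncDecSpeakerLabelModel.from_pretrained("titanet_large")
diarizer = ClusteringDiarizer(cfg=cfg, speaker_model=speaker_model)
# With cfg.device None and CUDA available, the injected model now lands on the GPU.
```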
9 changes: 6 additions & 3 deletions nemo/collections/asr/models/label_models.py
@@ -136,7 +136,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None):
         if 'loss' in cfg:
             cfg_eval_loss = copy.deepcopy(cfg.loss)
 
-            if 'angular' in cfg.loss._target_:
+            if '_target_' in cfg.loss and 'angular' in cfg.loss._target_:
                 OmegaConf.set_struct(cfg, True)
                 with open_dict(cfg):
                     cfg.decoder.angular = True
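The widened condition above guards against loss configs that carry no `_target_` key at all, where the old attribute access could fail before the `angular` check ever ran. A minimal illustration (assumed config shape):

```python
from omegaconf import OmegaConf

loss_cfg = OmegaConf.create({"weight": None})  # a loss config without _target_

# Old check: assumed `_target_` always existed, so this access could raise.
# is_angular = "angular" in loss_cfg._target_

# New check: short-circuits safely when the key is absent.
is_angular = "_target_" in loss_cfg and "angular" in loss_cfg._target_
print(is_angular)  # False
```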
@@ -341,7 +341,8 @@ def forward_for_export(self, processed_signal, processed_signal_len):
     @typecheck()
     def forward(self, input_signal, input_signal_length):
         processed_signal, processed_signal_len = self.preprocessor(
-            input_signal=input_signal, length=input_signal_length,
+            input_signal=input_signal,
+            length=input_signal_length,
         )
 
         if self.spec_augmentation is not None and self.training:
@@ -627,7 +628,9 @@ def batch_inference(self, manifest_filepath, batch_size=32, sample_rate=16000, device='cuda'):
         dataset = AudioToSpeechLabelDataset(manifest_filepath=manifest_filepath, labels=None, featurizer=featurizer)
 
         dataloader = torch.utils.data.DataLoader(
-            dataset=dataset, batch_size=batch_size, collate_fn=dataset.fixed_seq_collate_fn,
+            dataset=dataset,
+            batch_size=batch_size,
+            collate_fn=dataset.fixed_seq_collate_fn,
         )
 
         logits = []