Merge branch 'r2.0.0rc0' of github.com:NVIDIA/NeMo into r2.0.0rc0

ericharper committed Jun 5, 2024
2 parents 8b65e3e + d02bb32 commit 265bd73
Showing 123 changed files with 1,896 additions and 8,913 deletions.
1,114 changes: 45 additions & 1,069 deletions .github/workflows/cicd-main.yml

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions README.rst
@@ -247,7 +247,7 @@ Use this installation mode if you want the latest released version.
     .. code-block:: bash
 
         apt-get update && apt-get install -y libsndfile1 ffmpeg
-        pip install Cython
+        pip install Cython packaging
         pip install nemo_toolkit['all']
 
 Depending on the shell used, you may need to use ``"nemo_toolkit[all]"`` instead in the above command.
@@ -272,7 +272,7 @@ Use this installation mode if you want the version from a particular GitHub branch
     .. code-block:: bash
 
         apt-get update && apt-get install -y libsndfile1 ffmpeg
-        pip install Cython
+        pip install Cython packaging
         python -m pip install git+https://github.com/NVIDIA/NeMo.git@{BRANCH}#egg=nemo_toolkit[all]
@@ -310,7 +310,7 @@ To install NeMo on Mac with Apple M-Series GPU:
     conda install -c conda-forge pynini
 
     # install Cython manually
-    pip install cython
+    pip install cython packaging
 
     # clone the repo and install in development mode
     git clone https://github.com/NVIDIA/NeMo
@@ -80,6 +80,7 @@ model:
     feat_out: -1 # you may set it if you need different output size other than the default d_model
     n_layers: 17
     d_model: 512
+    use_bias: True # whether to apply bias in the feedforward, MHA and convolution modules
 
     # Sub-sampling parameters
     subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding
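The new `use_bias` flag (added here and, with the same one-line change, in the parallel encoder configs below) defaults to `True`, which preserves the previous behaviour; setting it to `False` drops the additive bias terms from the encoder's feed-forward, attention, and convolution layers. A minimal sketch of how such a flag typically threads through a feed-forward block — illustrative only, not NeMo's actual module code:

```python
import torch
import torch.nn as nn


class FeedForward(nn.Module):
    """Conformer-style feed-forward block with a configurable bias switch."""

    def __init__(self, d_model: int = 512, d_ff: int = 2048, use_bias: bool = True):
        super().__init__()
        # `use_bias` is forwarded to every parameterized layer in the block.
        self.linear1 = nn.Linear(d_model, d_ff, bias=use_bias)
        self.activation = nn.SiLU()
        self.linear2 = nn.Linear(d_ff, d_model, bias=use_bias)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.linear2(self.activation(self.linear1(x)))


ff = FeedForward(use_bias=False)
print(any(name.endswith("bias") for name, _ in ff.named_parameters()))  # False
```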
@@ -78,6 +78,7 @@ model:
     feat_out: -1 # you may set it if you need different output size other than the default d_model
     n_layers: 17
     d_model: 512
+    use_bias: True # whether to apply bias in the feedforward, MHA and convolution modules
 
     # Sub-sampling params
     subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding
@@ -85,6 +85,7 @@ model:
     feat_out: -1 # you may set it if you need different output size other than the default d_model
     n_layers: 17
     d_model: 512
+    use_bias: True # whether to apply bias in the feedforward, MHA and convolution modules
 
     # Sub-sampling parameters
     subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding
@@ -84,6 +84,7 @@ model:
     feat_out: -1 # you may set it if you need different output size other than the default d_model
     n_layers: 17
     d_model: 512
+    use_bias: True # whether to apply bias in the feedforward, MHA and convolution modules
 
     # Sub-sampling params
     subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding
@@ -90,6 +90,7 @@ model:
     feat_out: -1 # you may set it if you need different output size other than the default d_model
     n_layers: 17
     d_model: 512
+    use_bias: True # whether to apply bias in the feedforward, MHA and convolution modules
 
     # Sub-sampling parameters
     subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding
@@ -88,6 +88,7 @@ model:
     feat_out: -1 # you may set it if you need different output size other than the default d_model
     n_layers: 17
     d_model: 512
+    use_bias: True # whether to apply bias in the feedforward, MHA and convolution modules
 
     # Sub-sampling params
     subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding
@@ -87,6 +87,7 @@ model:
     feat_out: -1 # you may set it if you need different output size other than the default d_model
     n_layers: 17
     d_model: 512
+    use_bias: True # whether to apply bias in the feedforward, MHA and convolution modules
 
     # Sub-sampling parameters
     subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding
@@ -85,6 +85,7 @@ model:
     feat_out: -1 # you may set it if you need different output size other than the default d_model
     n_layers: 17
     d_model: 512
+    use_bias: True # whether to apply bias in the feedforward, MHA and convolution modules
 
     # Sub-sampling params
     subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding
@@ -88,6 +88,7 @@ model:
     feat_out: -1 # you may set it if you need different output size other than the default d_model
     n_layers: 18
     d_model: 512
+    use_bias: True # whether to apply bias in the feedforward, MHA and convolution modules
 
     # Sub-sampling params
     subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding
@@ -90,6 +90,7 @@ model:
     feat_out: -1 # you may set it if you need different output size other than the default d_model
     n_layers: 17
     d_model: 512
+    use_bias: True # whether to apply bias in the feedforward, MHA and convolution modules
 
     # Sub-sampling parameters
     subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding
1 change: 1 addition & 0 deletions examples/asr/conf/ssl/fastconformer/fast-conformer.yaml
@@ -79,6 +79,7 @@ model:
     feat_out: -1 # you may set it if you need different output size other than the default d_model
     n_layers: 17
     d_model: 512
+    use_bias: True # whether to apply bias in the feedforward, MHA and convolution modules
 
     # Sub-sampling params
     subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding
4 changes: 1 addition & 3 deletions examples/asr/transcribe_speech.py
@@ -163,9 +163,7 @@ class TranscriptionConfig:
 
     # Decoding strategy for RNNT models
     # enable CUDA graphs for transcription
-    rnnt_decoding: RNNTDecodingConfig = RNNTDecodingConfig(
-        fused_batch_size=-1, greedy=GreedyBatchedRNNTInferConfig(use_cuda_graph_decoder=True)
-    )
+    rnnt_decoding: RNNTDecodingConfig = RNNTDecodingConfig(fused_batch_size=-1)
 
     # Decoding strategy for AED models
     multitask_decoding: MultiTaskDecodingConfig = MultiTaskDecodingConfig()
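With this change the CUDA graph decoder is no longer switched on by the script's default config; transcription falls back to whatever `GreedyBatchedRNNTInferConfig` defaults to. If you relied on the old behaviour, it can still be requested explicitly — a hedged sketch (the import paths are assumptions inferred from the identifiers in the diff):

```python
from nemo.collections.asr.parts.submodules.rnnt_decoding import RNNTDecodingConfig
from nemo.collections.asr.parts.submodules.rnnt_greedy_decoding import GreedyBatchedRNNTInferConfig

# Reconstruct the previous default: fused batching off, CUDA graph decoder on.
rnnt_decoding = RNNTDecodingConfig(
    fused_batch_size=-1,
    greedy=GreedyBatchedRNNTInferConfig(use_cuda_graph_decoder=True),
)
```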
6 changes: 2 additions & 4 deletions examples/asr/transcribe_speech_parallel.py
@@ -101,10 +101,8 @@ class ParallelTranscriptionConfig:
     use_cer: bool = False
 
     # decoding strategy for RNNT models
-    # enable CUDA graphs for transcription
-    rnnt_decoding: RNNTDecodingConfig = RNNTDecodingConfig(
-        fused_batch_size=-1, greedy=GreedyBatchedRNNTInferConfig(use_cuda_graph_decoder=True)
-    )
+    # Double check whether fused_batch_size=-1 is right
+    rnnt_decoding: RNNTDecodingConfig = RNNTDecodingConfig(fused_batch_size=-1)
 
     # decoder type: ctc or rnnt, can be used to switch between CTC and RNNT decoder for Hybrid RNNT/CTC models
     decoder_type: Optional[str] = None
8 changes: 4 additions & 4 deletions examples/nlp/language_modeling/conf/megatron_bert_config.yaml
@@ -5,7 +5,7 @@ trainer:
   devices: 1
   num_nodes: 1
   accelerator: gpu
-  precision: 16
+  precision: bf16
   logger: False # logger provided by exp_manager
   enable_checkpointing: False
   use_distributed_sampler: False
@@ -41,7 +41,7 @@ exp_manager:
 
 model:
   # model parallelism
-  mcore_bert: False
+  mcore_bert: True
   micro_batch_size: 4
   global_batch_size: 8
   tensor_model_parallel_size: 1
@@ -85,7 +85,7 @@ model:
   fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16
 
   # Megatron O2-style half-precision
-  megatron_amp_O2: False # Enable O2-level automatic mixed precision using main parameters
+  megatron_amp_O2: True # Enable O2-level automatic mixed precision using main parameters
   grad_allreduce_chunk_size_mb: 125
   grad_div_ar_fusion: False
@@ -158,4 +158,4 @@ model:
       name: CosineAnnealing
       warmup_steps: 500
       constant_steps: 50000
-      min_lr: 2e-5
\ No newline at end of file
+      min_lr: 2e-5
6 changes: 3 additions & 3 deletions examples/nlp/language_modeling/conf/megatron_gpt_config.yaml
@@ -9,7 +9,7 @@ trainer:
   devices: 1
   num_nodes: 1
   accelerator: gpu
-  precision: 16
+  precision: bf16
   logger: False # logger provided by exp_manager
   enable_checkpointing: False
   use_distributed_sampler: False
@@ -55,7 +55,7 @@ exp_manager:
 
 model:
   # use GPTModel from megatron.core
-  mcore_gpt: False
+  mcore_gpt: True
 
   # specify micro_batch_size, global_batch_size, and model parallelism
   # gradient accumulation will be done automatically based on data_parallel_size
@@ -120,7 +120,7 @@ model:
   fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16
 
   # Megatron O2-style half-precision
-  megatron_amp_O2: False # Enable O2-level automatic mixed precision using main parameters
+  megatron_amp_O2: True # Enable O2-level automatic mixed precision using main parameters
   grad_allreduce_chunk_size_mb: 125
 
   # Fusion
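Taken together, the two Megatron config changes flip three defaults: autocast precision 16 → bf16, the legacy model path → megatron.core (`mcore_bert` / `mcore_gpt`), and O1 → O2 mixed precision. If a run needs the old behaviour, overriding at load time is straightforward — a sketch using OmegaConf against the GPT config named above (the BERT config takes the analogous `mcore_bert` override):

```python
from omegaconf import OmegaConf

# Restore the pre-change defaults for a GPT pretraining run.
cfg = OmegaConf.load("examples/nlp/language_modeling/conf/megatron_gpt_config.yaml")
cfg.trainer.precision = 16         # back to fp16 autocast
cfg.model.mcore_gpt = False        # back to the legacy (non-mcore) GPT path
cfg.model.megatron_amp_O2 = False  # back to O1-style mixed precision
```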
23 changes: 15 additions & 8 deletions nemo/collections/asr/losses/ssl_losses/contrastive.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from math import ceil
+
 import torch
 import torch.nn.functional as F
 from torch import nn
@@ -25,8 +27,7 @@
 class ContrastiveLoss(Loss):
     @property
     def input_types(self):
-        """Input types definitions for Contrastive.
-        """
+        """Input types definitions for Contrastive."""
         return {
             "spectrograms": NeuralType(("B", "D", "T"), SpectrogramType()),
             "spec_masks": NeuralType(("B", "D", "T"), SpectrogramType()),
@@ -147,13 +148,17 @@ def sample_negatives(self, y, num):
 
     @typecheck()
     def forward(self, spectrograms, spec_masks, decoder_outputs, decoder_lengths=None):
-        spec_in = spectrograms.transpose(-2, -1)
+        targets = spectrograms.transpose(-2, -1)
         masks = spec_masks.transpose(-2, -1)
-        targets = spec_in
         # BxTxC
+        diff = int(ceil(targets.shape[1] / decoder_outputs.shape[1]) * decoder_outputs.shape[1]) - targets.shape[1]
+
+        if diff > 0:
+            targets = F.pad(targets, (0, 0, 0, diff))
+            masks = F.pad(masks, (0, 0, 0, diff))
+
-        targets = targets.reshape(targets.shape[0], targets.shape[1] // self.combine_time_steps, -1)
-        masks = masks.reshape(targets.shape[0], targets.shape[1], -1)
+        targets = targets.reshape(targets.shape[0], decoder_outputs.shape[1], -1)
+        masks = masks.reshape(targets.shape[0], decoder_outputs.shape[1], -1)
 
         if self.quantized_targets:
             if self.store_ids:
@@ -198,7 +203,8 @@ def forward(self, spectrograms, spec_masks, decoder_outputs, decoder_lengths=None):
             if self.sample_from_non_masked:
                 # sample from all steps in utterance
                 negatives, _ = self.sample_negatives(
-                    targets.transpose(0, 1), targets_masked_only.size(0), # TxBxC # T'
+                    targets.transpose(0, 1),
+                    targets_masked_only.size(0),  # TxBxC # T'
                 )
             else:
                 # only sample from masked steps in utterance
@@ -239,7 +245,8 @@ def forward(self, spectrograms, spec_masks, decoder_outputs, decoder_lengths=None):
             elif self.sample_from_non_masked:
                 # sample from all steps in batch
                 negatives, _ = self.sample_negatives(
-                    targets.reshape(targets.shape[0] * targets.shape[1], -1), targets_masked_only.size(0), # BTxC
+                    targets.reshape(targets.shape[0] * targets.shape[1], -1),
+                    targets_masked_only.size(0),  # BTxC
                 )  # T'
             else:
                 # only sample from masked steps
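The reworked `forward` above stops assuming the target length is divisible by `combine_time_steps`; instead it rounds the time dimension up to a multiple of the decoder's output length and zero-pads the tail before grouping. A standalone toy check of that arithmetic (not NeMo code; shapes chosen for illustration):

```python
from math import ceil

import torch
import torch.nn.functional as F

targets = torch.randn(2, 103, 80)  # B x T x C spectrogram targets
decoder_T = 26                     # decoder output time steps

# Round T up to the nearest multiple of decoder_T, then zero-pad the tail.
diff = int(ceil(targets.shape[1] / decoder_T) * decoder_T) - targets.shape[1]
if diff > 0:
    targets = F.pad(targets, (0, 0, 0, diff))  # pad the time dimension only

# Now T == 104 == 4 * 26, so grouping into decoder_T steps is exact.
grouped = targets.reshape(targets.shape[0], decoder_T, -1)
print(grouped.shape)  # torch.Size([2, 26, 320])
```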
24 changes: 14 additions & 10 deletions nemo/collections/asr/models/clustering_diarizer.py
@@ -74,10 +74,10 @@ def get_available_model_names(class_name):
 
 class ClusteringDiarizer(torch.nn.Module, Model, DiarizationMixin):
     """
-    Inference model Class for offline speaker diarization.
-    This class handles required functionality for diarization : Speech Activity Detection, Segmentation,
-    Extract Embeddings, Clustering, Resegmentation and Scoring.
-    All the parameters are passed through config file
+    Inference model Class for offline speaker diarization.
+    This class handles required functionality for diarization : Speech Activity Detection, Segmentation,
+    Extract Embeddings, Clustering, Resegmentation and Scoring.
+    All the parameters are passed through config file
     """
 
     def __init__(self, cfg: Union[DictConfig, Any], speaker_model=None):
@@ -137,7 +137,10 @@ def _init_speaker_model(self, speaker_model=None):
         Initialize speaker embedding model with model name or path passed through config
         """
         if speaker_model is not None:
-            self._speaker_model = speaker_model
+            if self._cfg.device is None and torch.cuda.is_available():
+                self._speaker_model = speaker_model.to(torch.device('cuda'))
+            else:
+                self._speaker_model = speaker_model
         else:
             model_path = self._cfg.diarizer.speaker_embeddings.model_path
             if model_path is not None and model_path.endswith('.nemo'):
@@ -158,7 +161,6 @@ def _init_speaker_model(self, speaker_model=None):
                 self._speaker_model = EncDecSpeakerLabelModel.from_pretrained(
                     model_name=model_path, map_location=self._cfg.device
                 )
-
         self.multiscale_args_dict = parse_scale_configs(
             self._diarizer_params.speaker_embeddings.parameters.window_length_in_sec,
             self._diarizer_params.speaker_embeddings.parameters.shift_length_in_sec,
@@ -171,7 +173,9 @@ def _setup_vad_test_data(self, manifest_vad_input):
             'sample_rate': self._cfg.sample_rate,
             'batch_size': self._cfg.get('batch_size'),
             'vad_stream': True,
-            'labels': ['infer',],
+            'labels': [
+                'infer',
+            ],
             'window_length_in_sec': self._vad_window_length_in_sec,
             'shift_length_in_sec': self._vad_shift_length_in_sec,
             'trim_silence': False,
@@ -192,8 +196,8 @@ def _setup_spkr_test_data(self, manifest_file):
 
     def _run_vad(self, manifest_file):
         """
-        Run voice activity detection.
-        Get log probability of voice activity detection and smoothes using the post processing parameters.
+        Run voice activity detection.
+        Get log probability of voice activity detection and smoothes using the post processing parameters.
         Using generated frame level predictions generated manifest file for later speaker embedding extraction.
         input:
             manifest_file (str) : Manifest file containing path to audio file and label as infer
@@ -338,7 +342,7 @@ def _perform_speech_activity_detection(self):
     def _extract_embeddings(self, manifest_file: str, scale_idx: int, num_scales: int):
         """
         This method extracts speaker embeddings from segments passed through manifest_file
-        Optionally you may save the intermediate speaker embeddings for debugging or any use.
+        Optionally you may save the intermediate speaker embeddings for debugging or any use.
         """
         logging.info("Extracting embeddings for Diarization")
         self._setup_spkr_test_data(manifest_file)
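One behavioural note on the `_init_speaker_model` change in this file: a speaker model passed in by the caller is now moved to CUDA when no device is pinned in the config and a GPU is available. A hedged usage sketch — the config file name is a hypothetical local copy, and a real diarizer config carries many more fields than shown:

```python
from omegaconf import OmegaConf

from nemo.collections.asr.models import ClusteringDiarizer, EncDecSpeakerLabelModel

cfg = OmegaConf.load("diar_infer_telephonic.yaml")  # hypothetical local config copy
cfg.device = None  # leave the device unset on purpose

speaker_model = EncDecSpeakerLabelModel.from_pretrained("titanet_large")
diarizer = ClusteringDiarizer(cfg=cfg, speaker_model=speaker_model)
# With cfg.device None and CUDA available, the injected model now lands on the GPU.
```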
9 changes: 6 additions & 3 deletions nemo/collections/asr/models/label_models.py
@@ -136,7 +136,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None):
         if 'loss' in cfg:
             cfg_eval_loss = copy.deepcopy(cfg.loss)
 
-            if 'angular' in cfg.loss._target_:
+            if '_target_' in cfg.loss and 'angular' in cfg.loss._target_:
                 OmegaConf.set_struct(cfg, True)
                 with open_dict(cfg):
                     cfg.decoder.angular = True
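The widened condition above guards against loss configs that carry no `_target_` key at all, where the old attribute access could fail before the `angular` check ever ran. A minimal illustration (assumed config shape):

```python
from omegaconf import OmegaConf

loss_cfg = OmegaConf.create({"weight": None})  # a loss config without _target_

# Old check: assumed `_target_` always existed, so this access could raise.
# is_angular = "angular" in loss_cfg._target_

# New check: short-circuits safely when the key is absent.
is_angular = "_target_" in loss_cfg and "angular" in loss_cfg._target_
print(is_angular)  # False
```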
@@ -341,7 +341,8 @@ def forward_for_export(self, processed_signal, processed_signal_len):
     @typecheck()
     def forward(self, input_signal, input_signal_length):
         processed_signal, processed_signal_len = self.preprocessor(
-            input_signal=input_signal, length=input_signal_length,
+            input_signal=input_signal,
+            length=input_signal_length,
         )
 
         if self.spec_augmentation is not None and self.training:
@@ -627,7 +628,9 @@ def batch_inference(self, manifest_filepath, batch_size=32, sample_rate=16000, device='cuda'):
         dataset = AudioToSpeechLabelDataset(manifest_filepath=manifest_filepath, labels=None, featurizer=featurizer)
 
         dataloader = torch.utils.data.DataLoader(
-            dataset=dataset, batch_size=batch_size, collate_fn=dataset.fixed_seq_collate_fn,
+            dataset=dataset,
+            batch_size=batch_size,
+            collate_fn=dataset.fixed_seq_collate_fn,
         )
 
         logits = []