@@ -9,19 +9,20 @@
 from fairseq.criterions import register_criterion
 from fairseq.criterions.label_smoothed_cross_entropy import (
     LabelSmoothedCrossEntropyCriterion,
-    LabelSmoothedCrossEntropyCriterionConfig
+    LabelSmoothedCrossEntropyCriterionConfig,
 )
 
 try:
     from simuleval.metrics.latency import (
         AverageLagging,
         AverageProportion,
-        DifferentiableAverageLagging
+        DifferentiableAverageLagging,
     )
+
     LATENCY_METRICS = {
         "average_lagging": AverageLagging,
         "average_proportion": AverageProportion,
-        "differentiable_average_lagging": DifferentiableAverageLagging,
+        "differentiable_average_lagging": DifferentiableAverageLagging,
     }
 except ImportError:
     LATENCY_METRICS = None
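Beyond the trailing commas, this hunk separates the import from the `LATENCY_METRICS` table with a blank line. The surrounding try/except makes simuleval an optional dependency: if it is not installed, `LATENCY_METRICS` is left as `None` instead of raising at import time. A minimal sketch of the pattern, with a hypothetical `get_latency_metric` helper that is not part of this file:

```python
# Optional-dependency pattern, as in the file above; `get_latency_metric`
# is a hypothetical helper added here only for illustration.
try:
    from simuleval.metrics.latency import AverageLagging

    LATENCY_METRICS = {"average_lagging": AverageLagging}
except ImportError:
    # simuleval is absent: leave a sentinel so callers can fail with a
    # clear message instead of an ImportError at module load.
    LATENCY_METRICS = None


def get_latency_metric(name: str):
    assert LATENCY_METRICS is not None, "latency metrics require simuleval"
    return LATENCY_METRICS[name]
```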
@@ -56,9 +57,10 @@ class LabelSmoothedCrossEntropyCriterionLatencyAugmentConfig(
         metadata={"help": "Add latency loss after certain steps"},
     )
 
+
 @register_criterion(
     "latency_augmented_label_smoothed_cross_entropy",
-    dataclass=LabelSmoothedCrossEntropyCriterionLatencyAugmentConfig
+    dataclass=LabelSmoothedCrossEntropyCriterionLatencyAugmentConfig,
 )
 class LatencyAugmentedLabelSmoothedCrossEntropyCriterion(
     LabelSmoothedCrossEntropyCriterion
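This hunk gains a trailing comma and a second blank line before the decorated class, matching black's module-level spacing. `register_criterion` publishes the class in fairseq's criterion registry under the given string, which is how a training config selects it by name. A simplified sketch of such a name-based registry, not fairseq's actual implementation:

```python
# Toy name-based registry illustrating what @register_criterion does;
# fairseq's real version also records the config dataclass.
CRITERIONS = {}


def register_criterion(name, dataclass=None):
    def wrapper(cls):
        CRITERIONS[name] = cls  # looked up later by the configured name
        return cls

    return wrapper


@register_criterion("toy_criterion")
class ToyCriterion:
    pass


assert CRITERIONS["toy_criterion"] is ToyCriterion
```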
@@ -101,9 +103,9 @@ def forward(self, model, sample, reduce=True):
 
         if self.latency_update_after > 0:
             num_updates = getattr(model.decoder, "num_updates", None)
-            assert num_updates is not None, (
-                "model.decoder doesn't have attribute 'num_updates'"
-            )
+            assert (
+                num_updates is not None
+            ), "model.decoder doesn't have attribute 'num_updates'"
             if num_updates <= self.latency_update_after:
                 latency_loss = 0
 
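This hunk only reflows the assert, but the logic it guards is worth spelling out: the latency term is zeroed until the decoder has seen `latency_update_after` optimizer updates, so early training is driven purely by cross-entropy. A standalone mirror of that gate (the helper name is assumed, for illustration):

```python
def gated_latency_loss(latency_loss, num_updates, latency_update_after):
    """Hypothetical standalone mirror of the warm-up gate in forward()."""
    if latency_update_after > 0:
        assert num_updates is not None, "model.decoder doesn't have attribute 'num_updates'"
        if num_updates <= latency_update_after:
            return 0  # pure cross-entropy during warm-up
    return latency_loss


assert gated_latency_loss(1.5, num_updates=10, latency_update_after=50) == 0
assert gated_latency_loss(1.5, num_updates=100, latency_update_after=50) == 1.5
```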
@@ -134,9 +136,7 @@ def compute_latency_loss(self, model, sample, net_output):
         assert (
             net_output[-1].encoder_padding_mask is None
             or not net_output[-1].encoder_padding_mask[:, 0].any()
-        ), (
-            "Only right padding on source is supported."
-        )
+        ), "Only right padding on source is supported."
         # 1. Obtain the expected alignment
         alpha_list = [item["alpha"] for item in net_output[1].attn_list]
         num_layers = len(alpha_list)
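The condensed assert checks that sources are right-padded: with right padding, position 0 of every sequence is a real token, so the first column of the padding mask must be all False. A small self-contained check of the same invariant (mask contents made up):

```python
import torch

# True marks padding. Right-padded batches have no padding in column 0.
encoder_padding_mask = torch.tensor(
    [
        [False, False, True],   # length 2, padded on the right
        [False, False, False],  # length 3, no padding
    ]
)
assert not encoder_padding_mask[:, 0].any(), "Only right padding on source is supported."
```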
@@ -174,8 +174,7 @@ def compute_latency_loss(self, model, sample, net_output):
             .view(-1)
         )
         expected_latency = LATENCY_METRICS[self.latency_avg_type](
-            expected_delays, src_lengths, None,
-            target_padding_mask=target_padding_mask
+            expected_delays, src_lengths, None, target_padding_mask=target_padding_mask
         )
 
         # 2.1 average expected latency of heads
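The metric call is reflowed onto one line; the arguments are unchanged. `LATENCY_METRICS[self.latency_avg_type]` dispatches by name to one of the simuleval metric classes imported above. A hedged sketch of that dispatch with a stand-in metric, since simuleval's own classes are not reproduced here:

```python
import torch


def fake_average_lagging(delays, src_lens, _unused, target_padding_mask=None):
    # Stand-in for simuleval's AverageLagging, only to show the dispatch;
    # the real metric computes lagging, not a plain mean.
    return delays.mean(dim=1, keepdim=True)


LATENCY_METRICS = {"average_lagging": fake_average_lagging}
latency_avg_type = "average_lagging"

expected_delays = torch.tensor([[1.0, 2.0, 3.0]])  # (batch, tgt_len), illustrative
src_lengths = torch.tensor([[3.0]])
expected_latency = LATENCY_METRICS[latency_avg_type](
    expected_delays, src_lengths, None, target_padding_mask=None
)
```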
@@ -210,24 +209,12 @@ def compute_latency_loss(self, model, sample, net_output):
     @classmethod
     def reduce_metrics(cls, logging_outputs) -> None:
         super().reduce_metrics(logging_outputs)
-        latency = sum(
-            log.get("latency", 0) for log in logging_outputs
-        )
-        delays_var = sum(
-            log.get("delays_var", 0) for log in logging_outputs
-        )
-        latency_loss = sum(
-            log.get("latency_loss", 0) for log in logging_outputs
-        )
+        latency = sum(log.get("latency", 0) for log in logging_outputs)
+        delays_var = sum(log.get("delays_var", 0) for log in logging_outputs)
+        latency_loss = sum(log.get("latency_loss", 0) for log in logging_outputs)
         nsentences = sum(log.get("nsentences", 0) for log in logging_outputs)
+        metrics.log_scalar("latency", latency.float() / nsentences, nsentences, round=3)
+        metrics.log_scalar("delays_var", delays_var / nsentences, nsentences, round=3)
         metrics.log_scalar(
-            "latency", latency.float() / nsentences, nsentences, round=3
-        )
-        metrics.log_scalar(
-            "delays_var", delays_var / nsentences,
-            nsentences, round=3
-        )
-        metrics.log_scalar(
-            "latency_loss", latency_loss / nsentences,
-            nsentences, round=3
+            "latency_loss", latency_loss / nsentences, nsentences, round=3
         )
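The final hunk collapses the per-metric sums and two of the `log_scalar` calls onto single lines; the arithmetic is untouched: each statistic is summed over the per-worker logging dicts, then logged as a per-sentence average. The same aggregation in miniature, with dummy numbers and `print` standing in for `metrics.log_scalar`:

```python
# Dummy per-worker logging outputs; real ones come from forward().
logging_outputs = [
    {"latency": 4.0, "latency_loss": 0.2, "nsentences": 2},
    {"latency": 6.0, "latency_loss": 0.4, "nsentences": 3},
]

latency = sum(log.get("latency", 0) for log in logging_outputs)
latency_loss = sum(log.get("latency_loss", 0) for log in logging_outputs)
nsentences = sum(log.get("nsentences", 0) for log in logging_outputs)

print(round(latency / nsentences, 3))       # 2.0 average latency per sentence
print(round(latency_loss / nsentences, 3))  # 0.12
```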