diff --git a/tests/deepspeed/test_deepspeed.py b/tests/deepspeed/test_deepspeed.py index 85736a9422ab..11bea3c3aa88 100644 --- a/tests/deepspeed/test_deepspeed.py +++ b/tests/deepspeed/test_deepspeed.py @@ -628,7 +628,7 @@ def model_init(): with CaptureStd() as cs: trainer.hyperparameter_search(direction="maximize", n_trials=n_trials) self.assertIn("DeepSpeed info", cl.out, "expected DeepSpeed logger output but got none") - self.assertIn(f"Trial {n_trials-1} finished with value", cs.err, "expected hyperparameter_search output") + self.assertIn(f"Trial {n_trials - 1} finished with value", cs.err, "expected hyperparameter_search output") self.assertIn("Best is trial", cs.err, "expected hyperparameter_search output") # --- These tests need to run on both zero stages --- # diff --git a/tests/models/bart/test_modeling_tf_bart.py b/tests/models/bart/test_modeling_tf_bart.py index 87c3b895f17b..e93ab4479506 100644 --- a/tests/models/bart/test_modeling_tf_bart.py +++ b/tests/models/bart/test_modeling_tf_bart.py @@ -87,7 +87,7 @@ def prepare_config_and_inputs_for_common(self): clip_value_min=self.eos_token_id + 1, clip_value_max=self.vocab_size + 1, ) - # Explicity add "end of sequence" to the inputs + # Explicitly add "end of sequence" to the inputs eos_tensor = tf.expand_dims(tf.constant([self.eos_token_id] * self.batch_size), 1) input_ids = tf.concat([input_ids, eos_tensor], axis=1) @@ -225,7 +225,7 @@ def test_decoder_model_past_large_inputs(self): self.model_tester.check_decoder_model_past_large_inputs(*config_and_inputs) # TODO (Joao): fix me - @unittest.skip("Onnx compliancy broke with TF 2.10") + @unittest.skip("Onnx compliance broke with TF 2.10") def test_onnx_compliancy(self): pass diff --git a/tests/models/bert/test_modeling_tf_bert.py b/tests/models/bert/test_modeling_tf_bert.py index 335a184d2929..a8f8b7347767 100644 --- a/tests/models/bert/test_modeling_tf_bert.py +++ b/tests/models/bert/test_modeling_tf_bert.py @@ -735,7 +735,7 @@ def test_custom_load_tf_weights(self): self.assertTrue(layer.split("_")[0] in ["dropout", "classifier"]) # TODO (Joao): fix me - @unittest.skip("Onnx compliancy broke with TF 2.10") + @unittest.skip("Onnx compliance broke with TF 2.10") def test_onnx_compliancy(self): pass diff --git a/tests/models/blip/test_modeling_blip.py b/tests/models/blip/test_modeling_blip.py index 50cc8baae5b8..3dac349fb4fd 100644 --- a/tests/models/blip/test_modeling_blip.py +++ b/tests/models/blip/test_modeling_blip.py @@ -474,7 +474,7 @@ def test_retain_grad_hidden_states_attentions(self): def test_model_get_set_embeddings(self): pass - # override as the `logit_scale` parameter initilization is different for Blip + # override as the `logit_scale` parameter initialization is different for Blip def test_initialization(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -483,7 +483,7 @@ def test_initialization(self): model = model_class(config=configs_no_init) for name, param in model.named_parameters(): if param.requires_grad: - # check if `logit_scale` is initilized as per the original implementation + # check if `logit_scale` is initialized as per the original implementation if name == "logit_scale": self.assertAlmostEqual( param.data.item(), @@ -988,7 +988,7 @@ def test_training_gradient_checkpointing_use_reentrant(self): def test_training_gradient_checkpointing_use_reentrant_false(self): pass - # override as the `logit_scale` parameter initilization is different for Blip + # override as the `logit_scale` parameter initialization is 
different for Blip def test_initialization(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -997,7 +997,7 @@ def test_initialization(self): model = model_class(config=configs_no_init) for name, param in model.named_parameters(): if param.requires_grad: - # check if `logit_scale` is initilized as per the original implementation + # check if `logit_scale` is initialized as per the original implementation if name == "logit_scale": self.assertAlmostEqual( param.data.item(), @@ -1206,7 +1206,7 @@ def test_training_gradient_checkpointing(self): loss = model(**inputs).loss loss.backward() - # override as the `logit_scale` parameter initilization is different for Blip + # override as the `logit_scale` parameter initialization is different for Blip def test_initialization(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -1215,7 +1215,7 @@ def test_initialization(self): model = model_class(config=configs_no_init) for name, param in model.named_parameters(): if param.requires_grad: - # check if `logit_scale` is initilized as per the original implementation + # check if `logit_scale` is initialized as per the original implementation if name == "logit_scale": self.assertAlmostEqual( param.data.item(), diff --git a/tests/models/blip_2/test_modeling_blip_2.py b/tests/models/blip_2/test_modeling_blip_2.py index 6815b5757115..b55ec4a23c05 100644 --- a/tests/models/blip_2/test_modeling_blip_2.py +++ b/tests/models/blip_2/test_modeling_blip_2.py @@ -521,7 +521,7 @@ def test_save_load_fast_init_to_base(self): def test_sdpa_can_dispatch_composite_models(self): """ Tests if composite models dispatch correctly on SDPA/eager when requested so when loading the model. - This tests only by looking at layer names, as usually SDPA layers are calles "SDPAAttention". + This tests only by looking at layer names, as usually SDPA layers are called "SDPAAttention". In contrast to the above test, this one checks if the "config._attn_implamentation" is a dict after the model is loaded, because we manually replicate requested attn implementation on each sub-config when loading. See https://github.com/huggingface/transformers/pull/32238 for more info @@ -970,7 +970,7 @@ def test_cpu_offload(self): def test_sdpa_can_dispatch_composite_models(self): """ Tests if composite models dispatch correctly on SDPA/eager when requested so when loading the model. - This tests only by looking at layer names, as usually SDPA layers are calles "SDPAAttention". + This tests only by looking at layer names, as usually SDPA layers are called "SDPAAttention". In contrast to the above test, this one checks if the "config._attn_implamentation" is a dict after the model is loaded, because we manually replicate requested attn implementation on each sub-config when loading. 
See https://github.com/huggingface/transformers/pull/32238 for more info @@ -1647,7 +1647,7 @@ def test_initialization(self): model = model_class(config=configs_no_init) for name, param in model.named_parameters(): if param.requires_grad: - # check if `logit_scale` is initilized as per the original implementation + # check if `logit_scale` is initialized as per the original implementation if name == "logit_scale": self.assertAlmostEqual( param.data.item(), diff --git a/tests/models/bloom/test_tokenization_bloom.py b/tests/models/bloom/test_tokenization_bloom.py index 71318d9dd174..6c2fffef64f1 100644 --- a/tests/models/bloom/test_tokenization_bloom.py +++ b/tests/models/bloom/test_tokenization_bloom.py @@ -135,7 +135,7 @@ def test_encodings_from_xnli_dataset(self): @require_jinja def test_tokenization_for_chat(self): tokenizer = self.get_rust_tokenizer() - tokenizer.chat_template = "{% for message in messages %}" "{{ message.content }}{{ eos_token }}" "{% endfor %}" + tokenizer.chat_template = "{% for message in messages %}{{ message.content }}{{ eos_token }}{% endfor %}" test_chats = [ [{"role": "system", "content": "You are a helpful chatbot."}, {"role": "user", "content": "Hello!"}], [ diff --git a/tests/models/canine/test_tokenization_canine.py b/tests/models/canine/test_tokenization_canine.py index e7e19c63ce93..13fcbb049991 100644 --- a/tests/models/canine/test_tokenization_canine.py +++ b/tests/models/canine/test_tokenization_canine.py @@ -64,7 +64,7 @@ def test_prepare_batch_integration(self): @require_torch def test_encoding_keys(self): tokenizer = self.canine_tokenizer - src_text = ["Once there was a man.", "He wrote a test in HuggingFace Tranformers."] + src_text = ["Once there was a man.", "He wrote a test in HuggingFace Transformers."] batch = tokenizer(src_text, padding=True, return_tensors="pt") # check if input_ids, attention_mask and token_type_ids are returned self.assertIn("input_ids", batch) diff --git a/tests/models/chinese_clip/test_modeling_chinese_clip.py b/tests/models/chinese_clip/test_modeling_chinese_clip.py index 4a27e7292ec6..bc14d80524c4 100644 --- a/tests/models/chinese_clip/test_modeling_chinese_clip.py +++ b/tests/models/chinese_clip/test_modeling_chinese_clip.py @@ -596,7 +596,7 @@ def test_retain_grad_hidden_states_attentions(self): def test_model_get_set_embeddings(self): pass - # override as the `logit_scale` parameter initilization is different for CHINESE_CLIP + # override as the `logit_scale` parameter initialization is different for CHINESE_CLIP def test_initialization(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -608,7 +608,7 @@ def test_initialization(self): model = model_class(config=configs_no_init) for name, param in model.named_parameters(): if param.requires_grad: - # check if `logit_scale` is initilized as per the original implementation + # check if `logit_scale` is initialized as per the original implementation if name == "logit_scale": self.assertAlmostEqual( param.data.item(), diff --git a/tests/models/clap/test_modeling_clap.py b/tests/models/clap/test_modeling_clap.py index 559db26206b6..c8250648c670 100644 --- a/tests/models/clap/test_modeling_clap.py +++ b/tests/models/clap/test_modeling_clap.py @@ -543,7 +543,7 @@ def test_retain_grad_hidden_states_attentions(self): def test_model_get_set_embeddings(self): pass - # override as the `logit_scale` parameter initilization is different for CLAP + # override as the `logit_scale` parameter initialization is different for CLAP def 
test_initialization(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -552,7 +552,7 @@ def test_initialization(self): model = model_class(config=configs_no_init) for name, param in model.named_parameters(): if param.requires_grad: - # check if `logit_scale` is initilized as per the original implementation + # check if `logit_scale` is initialized as per the original implementation if name == "logit_scale": self.assertAlmostEqual( param.data.item(), diff --git a/tests/models/clip/test_modeling_clip.py b/tests/models/clip/test_modeling_clip.py index 6b75769b5396..66f741e6f4b2 100644 --- a/tests/models/clip/test_modeling_clip.py +++ b/tests/models/clip/test_modeling_clip.py @@ -761,7 +761,7 @@ def test_retain_grad_hidden_states_attentions(self): def test_model_get_set_embeddings(self): pass - # override as the `logit_scale` parameter initilization is different for CLIP + # override as the `logit_scale` parameter initialization is different for CLIP def test_initialization(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -770,7 +770,7 @@ def test_initialization(self): model = model_class(config=configs_no_init) for name, param in model.named_parameters(): if param.requires_grad: - # check if `logit_scale` is initilized as per the original implementation + # check if `logit_scale` is initialized as per the original implementation if name == "logit_scale": self.assertAlmostEqual( param.data.item(), diff --git a/tests/models/clipseg/test_modeling_clipseg.py b/tests/models/clipseg/test_modeling_clipseg.py index c2f77e30667a..a116b82f5f4e 100644 --- a/tests/models/clipseg/test_modeling_clipseg.py +++ b/tests/models/clipseg/test_modeling_clipseg.py @@ -519,7 +519,7 @@ def test_initialization(self): model = model_class(config=configs_no_init) for name, param in model.named_parameters(): if param.requires_grad: - # check if `logit_scale` is initilized as per the original implementation + # check if `logit_scale` is initialized as per the original implementation if "logit_scale" in name: self.assertAlmostEqual( param.data.item(), diff --git a/tests/models/clvp/test_modeling_clvp.py b/tests/models/clvp/test_modeling_clvp.py index 0f59b91871c9..8064d4059ec6 100644 --- a/tests/models/clvp/test_modeling_clvp.py +++ b/tests/models/clvp/test_modeling_clvp.py @@ -500,7 +500,7 @@ def test_inputs_embeds(self): def test_model_get_set_embeddings(self): pass - # override as the `logit_scale` parameter initilization is different for Clvp + # override as the `logit_scale` parameter initialization is different for Clvp def test_initialization(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -509,7 +509,7 @@ def test_initialization(self): model = model_class(config=configs_no_init) for name, param in model.named_parameters(): if param.requires_grad: - # check if `logit_scale` is initilized as per the original implementation + # check if `logit_scale` is initialized as per the original implementation if name == "logit_scale": expected_value = np.log(1 / 0.07) returned_value = param.data.item() diff --git a/tests/models/conditional_detr/test_modeling_conditional_detr.py b/tests/models/conditional_detr/test_modeling_conditional_detr.py index db5afffe2131..118552fce5fd 100644 --- a/tests/models/conditional_detr/test_modeling_conditional_detr.py +++ b/tests/models/conditional_detr/test_modeling_conditional_detr.py @@ -465,13 +465,13 @@ def test_different_timm_backbone(self): self.model_tester.num_labels, ) 
self.assertEqual(outputs.logits.shape, expected_shape) - # Confirm out_indices was propogated to backbone + # Confirm out_indices was propagated to backbone self.assertEqual(len(model.model.backbone.conv_encoder.intermediate_channel_sizes), 3) elif model_class.__name__ == "ConditionalDetrForSegmentation": - # Confirm out_indices was propogated to backbone + # Confirm out_indices was propagated to backbone self.assertEqual(len(model.conditional_detr.model.backbone.conv_encoder.intermediate_channel_sizes), 3) else: - # Confirm out_indices was propogated to backbone + # Confirm out_indices was propagated to backbone self.assertEqual(len(model.backbone.conv_encoder.intermediate_channel_sizes), 3) self.assertTrue(outputs) @@ -501,13 +501,13 @@ def test_hf_backbone(self): self.model_tester.num_labels, ) self.assertEqual(outputs.logits.shape, expected_shape) - # Confirm out_indices was propogated to backbone + # Confirm out_indices was propagated to backbone self.assertEqual(len(model.model.backbone.conv_encoder.intermediate_channel_sizes), 3) elif model_class.__name__ == "ConditionalDetrForSegmentation": - # Confirm out_indices was propogated to backbone + # Confirm out_indices was propagated to backbone self.assertEqual(len(model.conditional_detr.model.backbone.conv_encoder.intermediate_channel_sizes), 3) else: - # Confirm out_indices was propogated to backbone + # Confirm out_indices was propagated to backbone self.assertEqual(len(model.backbone.conv_encoder.intermediate_channel_sizes), 3) self.assertTrue(outputs) diff --git a/tests/models/cpmant/test_modeling_cpmant.py b/tests/models/cpmant/test_modeling_cpmant.py index d3835ec2374b..4a6c28e1eb91 100644 --- a/tests/models/cpmant/test_modeling_cpmant.py +++ b/tests/models/cpmant/test_modeling_cpmant.py @@ -161,7 +161,7 @@ def test_inputs_embeds(self): def test_retain_grad_hidden_states_attentions(self): unittest.skip( "CPMAnt doesn't support retain grad in hidden_states or attentions, because prompt management will peel off the output.hidden_states from graph.\ - So is attentions. We strongly recommand you use loss to tune model." + So is attentions. We strongly recommend you use loss to tune model." 
)(self.test_retain_grad_hidden_states_attentions) def test_cpmant_model(self): diff --git a/tests/models/deformable_detr/test_modeling_deformable_detr.py b/tests/models/deformable_detr/test_modeling_deformable_detr.py index 1e27aaabf8d8..e3441e606d23 100644 --- a/tests/models/deformable_detr/test_modeling_deformable_detr.py +++ b/tests/models/deformable_detr/test_modeling_deformable_detr.py @@ -542,10 +542,10 @@ def test_different_timm_backbone(self): self.model_tester.num_labels, ) self.assertEqual(outputs.logits.shape, expected_shape) - # Confirm out_indices was propogated to backbone + # Confirm out_indices was propagated to backbone self.assertEqual(len(model.model.backbone.conv_encoder.intermediate_channel_sizes), 4) else: - # Confirm out_indices was propogated to backbone + # Confirm out_indices was propagated to backbone self.assertEqual(len(model.backbone.conv_encoder.intermediate_channel_sizes), 4) self.assertTrue(outputs) @@ -574,10 +574,10 @@ def test_hf_backbone(self): self.model_tester.num_labels, ) self.assertEqual(outputs.logits.shape, expected_shape) - # Confirm out_indices was propogated to backbone + # Confirm out_indices was propagated to backbone self.assertEqual(len(model.model.backbone.conv_encoder.intermediate_channel_sizes), 4) else: - # Confirm out_indices was propogated to backbone + # Confirm out_indices was propagated to backbone self.assertEqual(len(model.backbone.conv_encoder.intermediate_channel_sizes), 4) self.assertTrue(outputs) diff --git a/tests/models/depth_anything/test_modeling_depth_anything.py b/tests/models/depth_anything/test_modeling_depth_anything.py index 95026c1054e9..63d57d671706 100644 --- a/tests/models/depth_anything/test_modeling_depth_anything.py +++ b/tests/models/depth_anything/test_modeling_depth_anything.py @@ -214,7 +214,7 @@ def _validate_backbone_init(): model.to(torch_device) model.eval() - # Confirm out_indices propogated to backbone + # Confirm out_indices propagated to backbone self.assertEqual(len(model.backbone.out_indices), 2) config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/depth_pro/test_modeling_depth_pro.py b/tests/models/depth_pro/test_modeling_depth_pro.py index bd0237370720..cb38380dc740 100644 --- a/tests/models/depth_pro/test_modeling_depth_pro.py +++ b/tests/models/depth_pro/test_modeling_depth_pro.py @@ -323,7 +323,7 @@ def test_initialization(self): msg=f"Parameter {name} of model {model_class} seems not properly initialized", ) - # this started when switched from normal initialization to kaiming_normal intialization + # this started when switched from normal initialization to kaiming_normal initialization # maybe because the magnitude of offset values from ViT-encoders increases when followed by many convolution layers def test_batching_equivalence(self, atol=1e-4, rtol=1e-4): super().test_batching_equivalence(atol=atol, rtol=rtol) diff --git a/tests/models/detr/test_modeling_detr.py b/tests/models/detr/test_modeling_detr.py index 381fa1d7cd23..003deceab762 100644 --- a/tests/models/detr/test_modeling_detr.py +++ b/tests/models/detr/test_modeling_detr.py @@ -465,13 +465,13 @@ def test_different_timm_backbone(self): self.model_tester.num_labels + 1, ) self.assertEqual(outputs.logits.shape, expected_shape) - # Confirm out_indices was propogated to backbone + # Confirm out_indices was propagated to backbone self.assertEqual(len(model.model.backbone.conv_encoder.intermediate_channel_sizes), 3) elif model_class.__name__ == "DetrForSegmentation": - # Confirm 
out_indices was propogated to backbone + # Confirm out_indices was propagated to backbone self.assertEqual(len(model.detr.model.backbone.conv_encoder.intermediate_channel_sizes), 3) else: - # Confirm out_indices was propogated to backbone + # Confirm out_indices was propagated to backbone self.assertEqual(len(model.backbone.conv_encoder.intermediate_channel_sizes), 3) self.assertTrue(outputs) @@ -500,13 +500,13 @@ def test_hf_backbone(self): self.model_tester.num_labels + 1, ) self.assertEqual(outputs.logits.shape, expected_shape) - # Confirm out_indices was propogated to backbone + # Confirm out_indices was propagated to backbone self.assertEqual(len(model.model.backbone.conv_encoder.intermediate_channel_sizes), 3) elif model_class.__name__ == "DetrForSegmentation": - # Confirm out_indices was propogated to backbone + # Confirm out_indices was propagated to backbone self.assertEqual(len(model.detr.model.backbone.conv_encoder.intermediate_channel_sizes), 3) else: - # Confirm out_indices was propogated to backbone + # Confirm out_indices was propagated to backbone self.assertEqual(len(model.backbone.conv_encoder.intermediate_channel_sizes), 3) self.assertTrue(outputs) diff --git a/tests/models/diffllama/test_modeling_diffllama.py b/tests/models/diffllama/test_modeling_diffllama.py index 81b963cfc47f..9864b713a59d 100644 --- a/tests/models/diffllama/test_modeling_diffllama.py +++ b/tests/models/diffllama/test_modeling_diffllama.py @@ -433,7 +433,9 @@ def test_model_rope_scaling(self): long_input_length = int(config.max_position_embeddings * 1.5) # Inputs - x = torch.randn(1, dtype=torch.float32, device=torch_device) # used exlusively to get the dtype and the device + x = torch.randn( + 1, dtype=torch.float32, device=torch_device + ) # used exclusively to get the dtype and the device position_ids_short = torch.arange(short_input_length, dtype=torch.long, device=torch_device) position_ids_short = position_ids_short.unsqueeze(0) position_ids_long = torch.arange(long_input_length, dtype=torch.long, device=torch_device) @@ -553,7 +555,7 @@ def _reinitialize_config(base_config, new_kwargs): @slow def test_flash_attn_2_generate_padding_right(self): """ - Overwritting the common test as the test is flaky on tiny models + Overwriting the common test as the test is flaky on tiny models """ model = DiffLlamaForCausalLM.from_pretrained( "kajuma/DiffLlama-0.3B-handcut", @@ -617,7 +619,7 @@ def test_use_flash_attention_2_true(self): @slow def test_eager_matches_sdpa_generate(self): """ - Overwritting the common test as the test is flaky on tiny models + Overwriting the common test as the test is flaky on tiny models """ max_new_tokens = 30 diff --git a/tests/models/dpt/test_modeling_dpt.py b/tests/models/dpt/test_modeling_dpt.py index da40466d484b..bdfd480a90ab 100644 --- a/tests/models/dpt/test_modeling_dpt.py +++ b/tests/models/dpt/test_modeling_dpt.py @@ -290,7 +290,7 @@ def _validate_backbone_init(): model.eval() if model.__class__.__name__ == "DPTForDepthEstimation": - # Confirm out_indices propogated to backbone + # Confirm out_indices propagated to backbone self.assertEqual(len(model.backbone.out_indices), 2) config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/encodec/test_modeling_encodec.py b/tests/models/encodec/test_modeling_encodec.py index 2d5eca4b83ae..209d02ad19c3 100644 --- a/tests/models/encodec/test_modeling_encodec.py +++ b/tests/models/encodec/test_modeling_encodec.py @@ -492,7 +492,7 @@ def test_integration_24kHz(self): for 
bandwidth, expected_rmse in expected_rmse.items(): with torch.no_grad(): - # use max bandwith for best possible reconstruction + # use max bandwidth for best possible reconstruction encoder_outputs = model.encode(inputs["input_values"], bandwidth=float(bandwidth)) audio_code_sums = [a[0].sum().cpu().item() for a in encoder_outputs[0]] @@ -548,7 +548,7 @@ def test_integration_48kHz(self): for bandwidth, expected_rmse in expected_rmse.items(): with torch.no_grad(): - # use max bandwith for best possible reconstruction + # use max bandwidth for best possible reconstruction encoder_outputs = model.encode( inputs["input_values"], inputs["padding_mask"], bandwidth=float(bandwidth), return_dict=False ) @@ -608,7 +608,7 @@ def test_batch_48kHz(self): input_values = inputs["input_values"].to(torch_device) for bandwidth, expected_rmse in expected_rmse.items(): with torch.no_grad(): - # use max bandwith for best possible reconstruction + # use max bandwidth for best possible reconstruction encoder_outputs = model.encode(input_values, bandwidth=float(bandwidth), return_dict=False) audio_code_sums_0 = [a[0][0].sum().cpu().item() for a in encoder_outputs[0]] audio_code_sums_1 = [a[0][1].sum().cpu().item() for a in encoder_outputs[0]] diff --git a/tests/models/encoder_decoder/test_modeling_encoder_decoder.py b/tests/models/encoder_decoder/test_modeling_encoder_decoder.py index 1c4051f2e264..4f1cb32348d8 100644 --- a/tests/models/encoder_decoder/test_modeling_encoder_decoder.py +++ b/tests/models/encoder_decoder/test_modeling_encoder_decoder.py @@ -179,7 +179,10 @@ def check_encoder_decoder_model_from_pretrained_using_model_paths( **kwargs, ): encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) - with tempfile.TemporaryDirectory() as encoder_tmp_dirname, tempfile.TemporaryDirectory() as decoder_tmp_dirname: + with ( + tempfile.TemporaryDirectory() as encoder_tmp_dirname, + tempfile.TemporaryDirectory() as decoder_tmp_dirname, + ): encoder_model.save_pretrained(encoder_tmp_dirname) decoder_model.save_pretrained(decoder_tmp_dirname) model_kwargs = {"encoder_hidden_dropout_prob": 0.0} @@ -306,7 +309,10 @@ def check_save_and_load_encoder_decoder_model( out_2 = outputs[0].cpu().numpy() out_2[np.isnan(out_2)] = 0 - with tempfile.TemporaryDirectory() as encoder_tmp_dirname, tempfile.TemporaryDirectory() as decoder_tmp_dirname: + with ( + tempfile.TemporaryDirectory() as encoder_tmp_dirname, + tempfile.TemporaryDirectory() as decoder_tmp_dirname, + ): enc_dec_model.encoder.save_pretrained(encoder_tmp_dirname) enc_dec_model.decoder.save_pretrained(decoder_tmp_dirname) enc_dec_model = EncoderDecoderModel.from_encoder_decoder_pretrained( diff --git a/tests/models/falcon/test_modeling_falcon.py b/tests/models/falcon/test_modeling_falcon.py index a024b801e8bc..aec59eca3356 100644 --- a/tests/models/falcon/test_modeling_falcon.py +++ b/tests/models/falcon/test_modeling_falcon.py @@ -460,7 +460,9 @@ def test_model_rope_scaling(self): long_input_length = int(config.max_position_embeddings * 1.5) # Inputs - x = torch.randn(1, dtype=torch.float32, device=torch_device) # used exlusively to get the dtype and the device + x = torch.randn( + 1, dtype=torch.float32, device=torch_device + ) # used exclusively to get the dtype and the device position_ids_short = torch.arange(short_input_length, dtype=torch.long, device=torch_device) position_ids_short = position_ids_short.unsqueeze(0) position_ids_long = torch.arange(long_input_length, dtype=torch.long, device=torch_device) diff --git 
a/tests/models/falcon_mamba/test_modeling_falcon_mamba.py b/tests/models/falcon_mamba/test_modeling_falcon_mamba.py index 971468026d71..a75238265cd4 100644 --- a/tests/models/falcon_mamba/test_modeling_falcon_mamba.py +++ b/tests/models/falcon_mamba/test_modeling_falcon_mamba.py @@ -211,7 +211,7 @@ def create_and_check_state_equivalency(self, config, input_ids, *args): output_two = outputs.last_hidden_state self.parent.assertTrue(torch.allclose(torch.cat([output_one, output_two], dim=1), output_whole, atol=1e-5)) - # TODO the orignal mamba does not support decoding more than 1 token neither do we + # TODO the original mamba does not support decoding more than 1 token neither do we def create_and_check_falcon_mamba_cached_slow_forward_and_backwards( self, config, input_ids, *args, gradient_checkpointing=False diff --git a/tests/models/fastspeech2_conformer/test_modeling_fastspeech2_conformer.py b/tests/models/fastspeech2_conformer/test_modeling_fastspeech2_conformer.py index cc413b94a63e..0480335f05aa 100644 --- a/tests/models/fastspeech2_conformer/test_modeling_fastspeech2_conformer.py +++ b/tests/models/fastspeech2_conformer/test_modeling_fastspeech2_conformer.py @@ -104,7 +104,7 @@ def create_and_check_model(self, config, input_ids, *args): # check batch sizes match for value in result.values(): self.parent.assertEqual(value.size(0), self.batch_size) - # check duration, pitch, and energy have the appopriate shapes + # check duration, pitch, and energy have the appropriate shapes # duration: (batch_size, max_text_length), pitch and energy: (batch_size, max_text_length, 1) self.parent.assertEqual(result["duration_outputs"].shape + (1,), result["pitch_outputs"].shape) self.parent.assertEqual(result["pitch_outputs"].shape, result["energy_outputs"].shape) @@ -527,7 +527,7 @@ def create_and_check_model(self, config, input_ids, *args): # check batch sizes match for value in result.values(): self.parent.assertEqual(value.size(0), self.batch_size) - # check duration, pitch, and energy have the appopriate shapes + # check duration, pitch, and energy have the appropriate shapes # duration: (batch_size, max_text_length), pitch and energy: (batch_size, max_text_length, 1) self.parent.assertEqual(result["duration_outputs"].shape + (1,), result["pitch_outputs"].shape) self.parent.assertEqual(result["pitch_outputs"].shape, result["energy_outputs"].shape) diff --git a/tests/models/fnet/test_modeling_fnet.py b/tests/models/fnet/test_modeling_fnet.py index 9fdb7f240f2d..2aa35863697b 100644 --- a/tests/models/fnet/test_modeling_fnet.py +++ b/tests/models/fnet/test_modeling_fnet.py @@ -326,7 +326,7 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): ) return inputs_dict - # Overriden Tests + # Overridden Tests @unittest.skip def test_attention_outputs(self): pass diff --git a/tests/models/fnet/test_tokenization_fnet.py b/tests/models/fnet/test_tokenization_fnet.py index 16f2e4950ef0..a55c142b25d8 100644 --- a/tests/models/fnet/test_tokenization_fnet.py +++ b/tests/models/fnet/test_tokenization_fnet.py @@ -141,7 +141,7 @@ def test_sequence_builders(self): tokenizer.sep_token_id ] - # Overriden Tests - loading the fast tokenizer from slow just takes too long + # Overridden Tests - loading the fast tokenizer from slow just takes too long def test_special_tokens_initialization(self): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): @@ -190,7 +190,7 @@ def 
test_special_tokens_initialization_from_slow(self): self.assertTrue(special_token_id in p_output) self.assertTrue(special_token_id in cr_output) - # Overriden Tests + # Overridden Tests def test_padding(self, max_length=50): if not self.test_slow_tokenizer: # as we don't have a slow version, we can't compare the outputs between slow and fast versions diff --git a/tests/models/gemma/test_tokenization_gemma.py b/tests/models/gemma/test_tokenization_gemma.py index 3a9e7af4b6f5..0b43fec80556 100644 --- a/tests/models/gemma/test_tokenization_gemma.py +++ b/tests/models/gemma/test_tokenization_gemma.py @@ -342,8 +342,8 @@ def test_integration_test_xnli(self): encoded1, encoded2, msg="Hint: the following tokenization diff were obtained for slow vs fast:\n " - f"elements in slow: {set(pyth_tokenizer.tokenize(string))-set(rust_tokenizer.tokenize(string))} \nvs\n " - f"elements in fast: {set(rust_tokenizer.tokenize(string))-set(pyth_tokenizer.tokenize(string))} \n\n{string}", + f"elements in slow: {set(pyth_tokenizer.tokenize(string)) - set(rust_tokenizer.tokenize(string))} \nvs\n " + f"elements in fast: {set(rust_tokenizer.tokenize(string)) - set(pyth_tokenizer.tokenize(string))} \n\n{string}", ) decoded1 = pyth_tokenizer.decode(encoded1, skip_special_tokens=True) diff --git a/tests/models/gpt2/test_modeling_gpt2.py b/tests/models/gpt2/test_modeling_gpt2.py index 4af2739ff5fb..0f66e1681882 100644 --- a/tests/models/gpt2/test_modeling_gpt2.py +++ b/tests/models/gpt2/test_modeling_gpt2.py @@ -866,7 +866,7 @@ def test_contrastive_search_gpt2(self): @slow def test_flash_attn_2_generate_padding_left(self): """ - Overwritting the common test as the test is flaky on tiny models + Overwriting the common test as the test is flaky on tiny models """ model = GPT2LMHeadModel.from_pretrained("gpt2", torch_dtype=torch.float16).to(0) diff --git a/tests/models/gpt2/test_modeling_tf_gpt2.py b/tests/models/gpt2/test_modeling_tf_gpt2.py index c56d837939c5..ed4c4a2e89b7 100644 --- a/tests/models/gpt2/test_modeling_tf_gpt2.py +++ b/tests/models/gpt2/test_modeling_tf_gpt2.py @@ -451,7 +451,7 @@ def test_onnx_runtime_optimize(self): onnxruntime.InferenceSession(onnx_model_proto.SerializeToString()) # TODO (Joao): fix me - @unittest.skip("Onnx compliancy broke with TF 2.10") + @unittest.skip("Onnx compliance broke with TF 2.10") def test_onnx_compliancy(self): pass @@ -548,7 +548,7 @@ def test_lm_generate_greedy_distilgpt2_beam_search_special(self): @slow def test_lm_generate_distilgpt2_left_padding(self): - """Tests that the generated text is the same, regarless of left padding""" + """Tests that the generated text is the same, regardless of left padding""" model = TFGPT2LMHeadModel.from_pretrained("distilbert/distilgpt2") tokenizer = GPT2Tokenizer.from_pretrained("distilbert/distilgpt2") diff --git a/tests/models/gpt_neo/test_modeling_gpt_neo.py b/tests/models/gpt_neo/test_modeling_gpt_neo.py index 213c3ed497f1..c0c3639781ca 100644 --- a/tests/models/gpt_neo/test_modeling_gpt_neo.py +++ b/tests/models/gpt_neo/test_modeling_gpt_neo.py @@ -479,7 +479,7 @@ def test_local_attn_probs(self): # the last 2 tokens are masked, and should have 0 attn_probs self.assertTrue(torch.all(attn_probs[:, :, -mask_tokens:, -mask_tokens:] == 0)) - # in loacal attention each token can only attend to the previous window_size tokens (inlcuding itself) + # in local attention each token can only attend to the previous window_size tokens (including itself) # here window_size is 4, so a token at index 5 can only attend to indcies [2, 3, 4, 5] #
and the attn_probs should be 0 for token [0, 1] self.assertTrue(torch.all(attn_probs[:, :, 5, 2:6] != 0)) diff --git a/tests/models/gpt_neox/test_modeling_gpt_neox.py b/tests/models/gpt_neox/test_modeling_gpt_neox.py index 34a8e54f700d..874c62f0c8ab 100644 --- a/tests/models/gpt_neox/test_modeling_gpt_neox.py +++ b/tests/models/gpt_neox/test_modeling_gpt_neox.py @@ -381,7 +381,9 @@ def test_model_rope_scaling(self): long_input_length = int(config.max_position_embeddings * 1.5) # Inputs - x = torch.randn(1, dtype=torch.float32, device=torch_device) # used exlusively to get the dtype and the device + x = torch.randn( + 1, dtype=torch.float32, device=torch_device + ) # used exclusively to get the dtype and the device position_ids_short = torch.arange(short_input_length, dtype=torch.long, device=torch_device) position_ids_short = position_ids_short.unsqueeze(0) position_ids_long = torch.arange(long_input_length, dtype=torch.long, device=torch_device) diff --git a/tests/models/granite/test_modeling_granite.py b/tests/models/granite/test_modeling_granite.py index 469e96fd8304..826cda3f67c8 100644 --- a/tests/models/granite/test_modeling_granite.py +++ b/tests/models/granite/test_modeling_granite.py @@ -356,7 +356,9 @@ def test_model_rope_scaling(self): long_input_length = int(config.max_position_embeddings * 1.5) # Inputs - x = torch.randn(1, dtype=torch.float32, device=torch_device) # used exlusively to get the dtype and the device + x = torch.randn( + 1, dtype=torch.float32, device=torch_device + ) # used exclusively to get the dtype and the device position_ids_short = torch.arange(short_input_length, dtype=torch.long, device=torch_device) position_ids_short = position_ids_short.unsqueeze(0) position_ids_long = torch.arange(long_input_length, dtype=torch.long, device=torch_device) diff --git a/tests/models/granitemoe/test_modeling_granitemoe.py b/tests/models/granitemoe/test_modeling_granitemoe.py index 0e64d29c9189..cd2470827b90 100644 --- a/tests/models/granitemoe/test_modeling_granitemoe.py +++ b/tests/models/granitemoe/test_modeling_granitemoe.py @@ -355,7 +355,9 @@ def test_model_rope_scaling(self): long_input_length = int(config.max_position_embeddings * 1.5) # Inputs - x = torch.randn(1, dtype=torch.float32, device=torch_device) # used exlusively to get the dtype and the device + x = torch.randn( + 1, dtype=torch.float32, device=torch_device + ) # used exclusively to get the dtype and the device position_ids_short = torch.arange(short_input_length, dtype=torch.long, device=torch_device) position_ids_short = position_ids_short.unsqueeze(0) position_ids_long = torch.arange(long_input_length, dtype=torch.long, device=torch_device) diff --git a/tests/models/groupvit/test_modeling_groupvit.py b/tests/models/groupvit/test_modeling_groupvit.py index 6eebdbf2e45e..4e836f827e9b 100644 --- a/tests/models/groupvit/test_modeling_groupvit.py +++ b/tests/models/groupvit/test_modeling_groupvit.py @@ -583,7 +583,7 @@ def test_retain_grad_hidden_states_attentions(self): def test_model_get_set_embeddings(self): pass - # override as the `logit_scale` parameter initilization is different for GROUPVIT + # override as the `logit_scale` parameter initialization is different for GROUPVIT def test_initialization(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -592,7 +592,7 @@ def test_initialization(self): model = model_class(config=configs_no_init) for name, param in model.named_parameters(): if param.requires_grad: - # check if `logit_scale` is initilized as per the 
original implementation + # check if `logit_scale` is initialized as per the original implementation if name == "logit_scale": self.assertAlmostEqual( param.data.item(), diff --git a/tests/models/layoutlmv2/test_modeling_layoutlmv2.py b/tests/models/layoutlmv2/test_modeling_layoutlmv2.py index a8b5083ebd51..ee62ed011bcd 100644 --- a/tests/models/layoutlmv2/test_modeling_layoutlmv2.py +++ b/tests/models/layoutlmv2/test_modeling_layoutlmv2.py @@ -497,7 +497,7 @@ def recursive_check(batched_object, single_row_object, model_name, key): single_batch_shape = value.shape[0] // batch_size single_row_input[key] = value[:single_batch_shape] elif hasattr(value, "tensor"): - # layoutlmv2uses ImageList intead of pixel values (needs for torchscript) + # layoutlmv2 uses ImageList instead of pixel values (needs for torchscript) single_row_input[key] = value.tensor[:single_batch_shape] with torch.no_grad(): diff --git a/tests/models/llama/test_modeling_llama.py b/tests/models/llama/test_modeling_llama.py index 01d807fbdba2..319187f11346 100644 --- a/tests/models/llama/test_modeling_llama.py +++ b/tests/models/llama/test_modeling_llama.py @@ -426,7 +426,9 @@ def test_model_rope_scaling(self): long_input_length = int(config.max_position_embeddings * 1.5) # Inputs - x = torch.randn(1, dtype=torch.float32, device=torch_device) # used exlusively to get the dtype and the device + x = torch.randn( + 1, dtype=torch.float32, device=torch_device + ) # used exclusively to get the dtype and the device position_ids_short = torch.arange(short_input_length, dtype=torch.long, device=torch_device) position_ids_short = position_ids_short.unsqueeze(0) position_ids_long = torch.arange(long_input_length, dtype=torch.long, device=torch_device) diff --git a/tests/models/llava_next/test_modeling_llava_next.py b/tests/models/llava_next/test_modeling_llava_next.py index 327e33a9b7f9..8d517c644d1d 100644 --- a/tests/models/llava_next/test_modeling_llava_next.py +++ b/tests/models/llava_next/test_modeling_llava_next.py @@ -298,7 +298,7 @@ def test_mismatching_num_image_tokens(self): config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: model = model_class(config).to(torch_device) - _ = model(**input_dict) # successfull forward with no modifications + _ = model(**input_dict) # successful forward with no modifications # remove one image but leave the image token in text input_dict["pixel_values"] = input_dict["pixel_values"][-1:, ...] diff --git a/tests/models/llava_next_video/test_modeling_llava_next_video.py b/tests/models/llava_next_video/test_modeling_llava_next_video.py index ba7323a07551..ab3a34ab3b12 100644 --- a/tests/models/llava_next_video/test_modeling_llava_next_video.py +++ b/tests/models/llava_next_video/test_modeling_llava_next_video.py @@ -315,7 +315,7 @@ def test_mismatching_num_image_tokens(self): config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: model = model_class(config).to(torch_device) - _ = model(**input_dict) # successfull forward with no modifications + _ = model(**input_dict) # successful forward with no modifications # remove one image but leave the image token in text input_dict["pixel_values"] = input_dict["pixel_values"][-1:, ...]
diff --git a/tests/models/m2m_100/test_modeling_m2m_100.py b/tests/models/m2m_100/test_modeling_m2m_100.py index 60b3d220b153..9cfcbfb778f3 100644 --- a/tests/models/m2m_100/test_modeling_m2m_100.py +++ b/tests/models/m2m_100/test_modeling_m2m_100.py @@ -433,7 +433,7 @@ def test_seq_to_seq_generation(self): @slow def test_flash_attn_2_seq_to_seq_generation(self): """ - Overwritting the common test as the test is flaky on tiny models + Overwriting the common test as the test is flaky on tiny models """ model = M2M100ForConditionalGeneration.from_pretrained( "facebook/m2m100_418M", attn_implementation="flash_attention_2" diff --git a/tests/models/mgp_str/test_modeling_mgp_str.py b/tests/models/mgp_str/test_modeling_mgp_str.py index 465444f6927e..849eb90a75a0 100644 --- a/tests/models/mgp_str/test_modeling_mgp_str.py +++ b/tests/models/mgp_str/test_modeling_mgp_str.py @@ -202,7 +202,7 @@ def check_hidden_states_output(inputs_dict, config, model_class): check_hidden_states_output(inputs_dict, config, model_class) - # override as the `logit_scale` parameter initilization is different for MgpstrModel + # override as the `logit_scale` parameter initialization is different for MgpstrModel def test_initialization(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/mgp_str/test_processor_mgp_str.py b/tests/models/mgp_str/test_processor_mgp_str.py index 783a61ebf144..f42dfb266149 100644 --- a/tests/models/mgp_str/test_processor_mgp_str.py +++ b/tests/models/mgp_str/test_processor_mgp_str.py @@ -70,7 +70,7 @@ def setUp(self): with open(self.image_processor_file, "w", encoding="utf-8") as fp: json.dump(image_processor_map, fp) - # We copy here rather than use the ProcessorTesterMixin as this processor has a `char_tokenizer` instad of a + # We copy here rather than use the ProcessorTesterMixin as this processor has a `char_tokenizer` instead of a # tokenizer attribute, which means all the tests would need to be overridden. @require_vision def prepare_image_inputs(self): diff --git a/tests/models/mixtral/test_modeling_mixtral.py b/tests/models/mixtral/test_modeling_mixtral.py index 7ad879243933..f660171774f0 100644 --- a/tests/models/mixtral/test_modeling_mixtral.py +++ b/tests/models/mixtral/test_modeling_mixtral.py @@ -450,7 +450,7 @@ def test_load_balancing_loss(self): padded_result = model(padded_input_ids, attention_mask=padded_attention_mask) torch.testing.assert_close(result.aux_loss.cpu(), padded_result.aux_loss.cpu(), rtol=1e-4, atol=1e-4) - # We make sure that the loss of includding padding tokens != the loss without padding tokens + # We make sure that the loss of including padding tokens != the loss without padding tokens # if attention_mask=None --> we don't exclude padding tokens include_padding_result = model(padded_input_ids, attention_mask=None) @@ -480,7 +480,7 @@ def test_small_model_logits(self): torch_device ) # TODO: might need to tweak it in case the logits do not match on our daily runners - # these logits have been obtained with the original megablocks impelmentation. + # these logits have been obtained with the original megablocks implementation. # Key 9 for MI300, Key 8 for A100/A10, and Key 7 for T4. 
# # Note: Key 9 is currently set for MI300, but may need potential future adjustments for H100s, diff --git a/tests/models/mllama/test_modeling_mllama.py b/tests/models/mllama/test_modeling_mllama.py index 5c8e5c0e70fa..023dd8ea2be3 100644 --- a/tests/models/mllama/test_modeling_mllama.py +++ b/tests/models/mllama/test_modeling_mllama.py @@ -414,7 +414,7 @@ def test_past_key_values_format(self): embed_dim = getattr(text_config, "d_model", text_config.hidden_size) per_head_embed_dim = embed_dim // num_attention_heads - # some models have diffent num-head for query vs key/value so we need to assign correct value + # some models have different num-head for query vs key/value so we need to assign correct value # BUT only after `per_head_embed_dim` is set num_attention_heads = ( text_config.num_key_value_heads diff --git a/tests/models/mobilevit/test_modeling_tf_mobilevit.py b/tests/models/mobilevit/test_modeling_tf_mobilevit.py index fcad3be021e2..61967ec3414f 100644 --- a/tests/models/mobilevit/test_modeling_tf_mobilevit.py +++ b/tests/models/mobilevit/test_modeling_tf_mobilevit.py @@ -284,7 +284,7 @@ def test_keras_fit(self): super().test_keras_fit() # The default test_loss_computation() uses -100 as a proxy ignore_index - # to test masked losses. Overridding to avoid -100 since semantic segmentation + # to test masked losses. Overriding to avoid -100 since semantic segmentation # models use `semantic_loss_ignore_index` from the config. def test_loss_computation(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/moshi/test_modeling_moshi.py b/tests/models/moshi/test_modeling_moshi.py index e6ad1b639e92..4229410ed383 100644 --- a/tests/models/moshi/test_modeling_moshi.py +++ b/tests/models/moshi/test_modeling_moshi.py @@ -581,7 +581,7 @@ def prepare_config_and_inputs_for_generate(self, batch_size=2): return config, filtered_inputs_dict def _check_generate_outputs(self, output, config, use_cache=False, num_return_sequences=1, num_beams=1): - # Overwrite because the generate method actually alway uses `inputs_embeds` so `use_cache` is always `True` + # Overwrite because the generate method actually always uses `inputs_embeds` so `use_cache` is always `True` super()._check_generate_outputs( output, config, use_cache=True, num_return_sequences=num_return_sequences, num_beams=num_beams ) @@ -618,13 +618,13 @@ def test_contrastive_generate_low_memory(self): pass @unittest.skip( - "Moshi either needs deafult generation config or fix for fullgraph compile because it hardcodes SlidingWindowCache in custom generation loop." + "Moshi either needs default generation config or fix for fullgraph compile because it hardcodes SlidingWindowCache in custom generation loop." ) def test_greedy_generate_dict_outputs_use_cache(self): pass @unittest.skip( - "Moshi either needs deafult generation config or fix for fullgraph compile because it hardcodes SlidingWindowCache in custom generation loop." + "Moshi either needs default generation config or fix for fullgraph compile because it hardcodes SlidingWindowCache in custom generation loop." 
) def test_beam_search_generate_dict_outputs_use_cache(self): pass @@ -849,7 +849,7 @@ def test_generate_from_unconditional(self): **model.get_unconditional_inputs(num_samples=4), max_new_tokens=5, concat_unconditional_inputs=False ) - # check same results from uncondtional or no inputs + # check same results from unconditional or no inputs outputs_from_unconditional = model.generate( **model.get_unconditional_inputs(num_samples=1), max_new_tokens=5, concat_unconditional_inputs=False ) diff --git a/tests/models/moshi/test_tokenization_moshi.py b/tests/models/moshi/test_tokenization_moshi.py index a520cca94bbe..0aaa6295ea66 100644 --- a/tests/models/moshi/test_tokenization_moshi.py +++ b/tests/models/moshi/test_tokenization_moshi.py @@ -289,7 +289,7 @@ def test_training_new_tokenizer_with_special_tokens_change(self): self.assertTrue( find, f"'{special_token.__repr__()}' should appear as an `AddedToken` in the all_special_tokens_extended = " - f"{[k for k in new_tokenizer.all_special_tokens_extended if str(k)==new_special_token_str]} but it is missing" + f"{[k for k in new_tokenizer.all_special_tokens_extended if str(k) == new_special_token_str]} but it is missing" ", this means that the new tokenizers did not keep the `rstrip`, `lstrip`, `normalized` etc attributes.", ) elif special_token not in special_tokens_map: diff --git a/tests/models/musicgen/test_modeling_musicgen.py b/tests/models/musicgen/test_modeling_musicgen.py index c8faac1b7d3c..c14292b093f2 100644 --- a/tests/models/musicgen/test_modeling_musicgen.py +++ b/tests/models/musicgen/test_modeling_musicgen.py @@ -971,7 +971,7 @@ def test_greedy_generate_stereo_outputs(self): self.model_tester.audio_channels = original_audio_channels @unittest.skip( - reason="MusicgenModel is actually not the base of MusicgenForCausalLM as the latter is a composit model" + reason="MusicgenModel is actually not the base of MusicgenForCausalLM as the latter is a composite model" ) def test_save_load_fast_init_from_base(self): pass diff --git a/tests/models/musicgen_melody/test_modeling_musicgen_melody.py b/tests/models/musicgen_melody/test_modeling_musicgen_melody.py index c89471d119a9..0fcfa254afa2 100644 --- a/tests/models/musicgen_melody/test_modeling_musicgen_melody.py +++ b/tests/models/musicgen_melody/test_modeling_musicgen_melody.py @@ -961,7 +961,7 @@ def test_greedy_generate_stereo_outputs(self): self.model_tester.audio_channels = original_audio_channels @unittest.skip( - reason="MusicgenMelodyModel is actually not the base of MusicgenMelodyForCausalLM as the latter is a composit model" + reason="MusicgenMelodyModel is actually not the base of MusicgenMelodyForCausalLM as the latter is a composite model" ) def test_save_load_fast_init_from_base(self): pass diff --git a/tests/models/oneformer/test_modeling_oneformer.py b/tests/models/oneformer/test_modeling_oneformer.py index 8f1df74ea627..c544154b2364 100644 --- a/tests/models/oneformer/test_modeling_oneformer.py +++ b/tests/models/oneformer/test_modeling_oneformer.py @@ -173,7 +173,7 @@ def create_and_check_oneformer_model( output = model(pixel_values=pixel_values, task_inputs=task_inputs, pixel_mask=pixel_mask) output = model(pixel_values, task_inputs=task_inputs, output_hidden_states=True) - # the correct shape of output.transformer_decoder_hidden_states ensure the correcteness of the + # the correct shape of output.transformer_decoder_hidden_states ensure the correctness of the # encoder and pixel decoder self.parent.assertEqual( output.transformer_decoder_object_queries.shape, diff 
--git a/tests/models/owlv2/test_modeling_owlv2.py b/tests/models/owlv2/test_modeling_owlv2.py index e1278d3c937b..dff1cbe8c00c 100644 --- a/tests/models/owlv2/test_modeling_owlv2.py +++ b/tests/models/owlv2/test_modeling_owlv2.py @@ -475,7 +475,7 @@ def test_retain_grad_hidden_states_attentions(self): def test_model_get_set_embeddings(self): pass - # override as the `logit_scale` parameter initilization is different for OWLV2 + # override as the `logit_scale` parameter initialization is different for OWLV2 def test_initialization(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -484,7 +484,7 @@ def test_initialization(self): model = model_class(config=configs_no_init) for name, param in model.named_parameters(): if param.requires_grad: - # check if `logit_scale` is initilized as per the original implementation + # check if `logit_scale` is initialized as per the original implementation if name == "logit_scale": self.assertAlmostEqual( param.data.item(), diff --git a/tests/models/owlvit/test_modeling_owlvit.py b/tests/models/owlvit/test_modeling_owlvit.py index 315cdf813a6c..1ad85cb37919 100644 --- a/tests/models/owlvit/test_modeling_owlvit.py +++ b/tests/models/owlvit/test_modeling_owlvit.py @@ -470,7 +470,7 @@ def test_retain_grad_hidden_states_attentions(self): def test_model_get_set_embeddings(self): pass - # override as the `logit_scale` parameter initilization is different for OWLVIT + # override as the `logit_scale` parameter initialization is different for OWLVIT def test_initialization(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -479,7 +479,7 @@ def test_initialization(self): model = model_class(config=configs_no_init) for name, param in model.named_parameters(): if param.requires_grad: - # check if `logit_scale` is initilized as per the original implementation + # check if `logit_scale` is initialized as per the original implementation if name == "logit_scale": self.assertAlmostEqual( param.data.item(), diff --git a/tests/models/persimmon/test_modeling_persimmon.py b/tests/models/persimmon/test_modeling_persimmon.py index 744788cf6447..4867e38acb68 100644 --- a/tests/models/persimmon/test_modeling_persimmon.py +++ b/tests/models/persimmon/test_modeling_persimmon.py @@ -424,7 +424,9 @@ def test_model_rope_scaling(self): long_input_length = int(config.max_position_embeddings * 1.5) # Inputs - x = torch.randn(1, dtype=torch.float32, device=torch_device) # used exlusively to get the dtype and the device + x = torch.randn( + 1, dtype=torch.float32, device=torch_device + ) # used exclusively to get the dtype and the device position_ids_short = torch.arange(short_input_length, dtype=torch.long, device=torch_device) position_ids_short = position_ids_short.unsqueeze(0) position_ids_long = torch.arange(long_input_length, dtype=torch.long, device=torch_device) diff --git a/tests/models/phi/test_modeling_phi.py b/tests/models/phi/test_modeling_phi.py index 9b7d44ca1cb7..5fe681d99593 100644 --- a/tests/models/phi/test_modeling_phi.py +++ b/tests/models/phi/test_modeling_phi.py @@ -403,7 +403,9 @@ def test_model_rope_scaling(self): long_input_length = int(config.max_position_embeddings * 1.5) # Inputs - x = torch.randn(1, dtype=torch.float32, device=torch_device) # used exlusively to get the dtype and the device + x = torch.randn( + 1, dtype=torch.float32, device=torch_device + ) # used exclusively to get the dtype and the device position_ids_short = torch.arange(short_input_length, dtype=torch.long, 
device=torch_device) position_ids_short = position_ids_short.unsqueeze(0) position_ids_long = torch.arange(long_input_length, dtype=torch.long, device=torch_device) diff --git a/tests/models/qwen2_5_vl/test_modeling_qwen2_5_vl.py b/tests/models/qwen2_5_vl/test_modeling_qwen2_5_vl.py index 4b5bb61eb8f9..3e4f02626d5f 100644 --- a/tests/models/qwen2_5_vl/test_modeling_qwen2_5_vl.py +++ b/tests/models/qwen2_5_vl/test_modeling_qwen2_5_vl.py @@ -263,7 +263,7 @@ def test_mismatching_num_image_tokens(self): config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: model = model_class(config).to(torch_device) - _ = model(**input_dict) # successfull forward with no modifications + _ = model(**input_dict) # successful forward with no modifications # remove one image but leave the image token in text patch_size = config.vision_config.patch_size diff --git a/tests/models/reformer/test_modeling_reformer.py b/tests/models/reformer/test_modeling_reformer.py index 235db79a9dbe..b4e493dd2c1d 100644 --- a/tests/models/reformer/test_modeling_reformer.py +++ b/tests/models/reformer/test_modeling_reformer.py @@ -696,7 +696,7 @@ def test_left_padding_compatibility(self): pass def prepare_config_and_inputs_for_generate(self, *args, **kwargs): - # override because overwise we hit max possible seq length for model (4*8=32) + # override because otherwise we hit max possible seq length for model (4*8=32) # decreasing the seq_length in tester causes errors for "training_tests", those need exactly max seq length # NOTE: seq_length has to be multiple of 4, otherwise it fails for other tests original_sequence_length = self.model_tester.seq_length @@ -887,7 +887,7 @@ def test_left_padding_compatibility(self): @require_tokenizers class ReformerIntegrationTests(unittest.TestCase): """ - These integration tests test the current layer activations and gradients againts the output of the Hugging Face Reformer model at time of integration: 29/06/2020. During integration, the model was tested against the output of the official Trax ReformerLM model for various cases ("lsh" only, "lsh" only, masked / non-masked, different chunk length, ....). In order to recover the original trax integration tests, one should use patrickvonplaten's fork of trax and the code that lives on the branch `reformer_trax_tests`. + These integration tests test the current layer activations and gradients against the output of the Hugging Face Reformer model at time of integration: 29/06/2020. During integration, the model was tested against the output of the official Trax ReformerLM model for various cases ("lsh" only, "lsh" only, masked / non-masked, different chunk length, ....). In order to recover the original trax integration tests, one should use patrickvonplaten's fork of trax and the code that lives on the branch `reformer_trax_tests`. 
""" def _get_basic_config_and_input(self): @@ -1246,7 +1246,7 @@ def test_local_lm_model_grad(self): ) loss.backward() - # check last grads to cover all proable errors + # check last grads to cover all probable errors grad_slice_word = model.reformer.embeddings.word_embeddings.weight.grad[0, :5] expected_grad_slice_word = torch.tensor( [-0.0005, -0.0001, -0.0002, -0.0006, -0.0006], @@ -1287,7 +1287,7 @@ def test_lsh_lm_model_grad(self): loss, torch.tensor(5.7854, dtype=torch.float, device=torch_device), rtol=1e-3, atol=1e-3 ) loss.backward() - # check last grads to cover all proable errors + # check last grads to cover all probable errors grad_slice_word = model.reformer.embeddings.word_embeddings.weight.grad[0, :5] expected_grad_slice_word = torch.tensor( [0.0004, 0.0003, 0.0006, -0.0004, 0.0002], diff --git a/tests/models/rt_detr_v2/test_modeling_rt_detr_v2.py b/tests/models/rt_detr_v2/test_modeling_rt_detr_v2.py index d5388cf41a99..e8af79ca7baf 100644 --- a/tests/models/rt_detr_v2/test_modeling_rt_detr_v2.py +++ b/tests/models/rt_detr_v2/test_modeling_rt_detr_v2.py @@ -545,10 +545,10 @@ def test_different_timm_backbone(self): self.model_tester.num_labels, ) self.assertEqual(outputs.logits.shape, expected_shape) - # Confirm out_indices was propogated to backbone + # Confirm out_indices was propagated to backbone self.assertEqual(len(model.model.backbone.intermediate_channel_sizes), 3) else: - # Confirm out_indices was propogated to backbone + # Confirm out_indices was propagated to backbone self.assertEqual(len(model.backbone.intermediate_channel_sizes), 3) self.assertTrue(outputs) @@ -577,10 +577,10 @@ def test_hf_backbone(self): self.model_tester.num_labels, ) self.assertEqual(outputs.logits.shape, expected_shape) - # Confirm out_indices was propogated to backbone + # Confirm out_indices was propagated to backbone self.assertEqual(len(model.model.backbone.intermediate_channel_sizes), 3) else: - # Confirm out_indices was propogated to backbone + # Confirm out_indices was propagated to backbone self.assertEqual(len(model.backbone.intermediate_channel_sizes), 3) self.assertTrue(outputs) diff --git a/tests/models/seamless_m4t/test_modeling_seamless_m4t.py b/tests/models/seamless_m4t/test_modeling_seamless_m4t.py index 25bbc1c3040d..c30ebcc87fcd 100644 --- a/tests/models/seamless_m4t/test_modeling_seamless_m4t.py +++ b/tests/models/seamless_m4t/test_modeling_seamless_m4t.py @@ -1017,7 +1017,7 @@ def test_to_eng_text(self): output = model.generate(**self.input_text, num_beams=1, tgt_lang="eng", return_intermediate_token_ids=True) self.assertListEqual(expected_text_tokens, output.sequences.squeeze().tolist()) - # FOR NOW, only first units correspondance + # FOR NOW, only first units correspondence self.assertListEqual(expected_unit_tokens[:10], output.unit_sequences.squeeze().tolist()[:10]) self.assertListAlmostEqual(expected_wav_slice, output.waveform.squeeze().tolist()[50:60]) diff --git a/tests/models/speech_encoder_decoder/test_modeling_speech_encoder_decoder.py b/tests/models/speech_encoder_decoder/test_modeling_speech_encoder_decoder.py index 897d4b056f19..399a111530d1 100644 --- a/tests/models/speech_encoder_decoder/test_modeling_speech_encoder_decoder.py +++ b/tests/models/speech_encoder_decoder/test_modeling_speech_encoder_decoder.py @@ -266,7 +266,10 @@ def check_save_and_load_encoder_decoder_model( out_2 = outputs[0].cpu().numpy() out_2[np.isnan(out_2)] = 0 - with tempfile.TemporaryDirectory() as encoder_tmp_dirname, tempfile.TemporaryDirectory() as decoder_tmp_dirname: + with ( + 
tempfile.TemporaryDirectory() as encoder_tmp_dirname, + tempfile.TemporaryDirectory() as decoder_tmp_dirname, + ): enc_dec_model.encoder.save_pretrained(encoder_tmp_dirname) enc_dec_model.decoder.save_pretrained(decoder_tmp_dirname) SpeechEncoderDecoderModel.from_encoder_decoder_pretrained( diff --git a/tests/models/stablelm/test_modeling_stablelm.py b/tests/models/stablelm/test_modeling_stablelm.py index 946b220e0ea9..14af6d5f7275 100644 --- a/tests/models/stablelm/test_modeling_stablelm.py +++ b/tests/models/stablelm/test_modeling_stablelm.py @@ -409,7 +409,9 @@ def test_model_rope_scaling(self): long_input_length = int(config.max_position_embeddings * 1.5) # Inputs - x = torch.randn(1, dtype=torch.float32, device=torch_device) # used exlusively to get the dtype and the device + x = torch.randn( + 1, dtype=torch.float32, device=torch_device + ) # used exclusively to get the dtype and the device position_ids_short = torch.arange(short_input_length, dtype=torch.long, device=torch_device) position_ids_short = position_ids_short.unsqueeze(0) position_ids_long = torch.arange(long_input_length, dtype=torch.long, device=torch_device) diff --git a/tests/models/t5/test_modeling_flax_t5.py b/tests/models/t5/test_modeling_flax_t5.py index dc372c57694d..703b9973cb70 100644 --- a/tests/models/t5/test_modeling_flax_t5.py +++ b/tests/models/t5/test_modeling_flax_t5.py @@ -574,7 +574,7 @@ class FlaxT5ModelIntegrationTests(unittest.TestCase): @slow def test_small_integration_test(self): """ - For comparision run: + For comparison run: >>> import t5 # pip install t5==0.7.1 >>> from t5.data.sentencepiece_vocabulary import SentencePieceVocabulary @@ -604,7 +604,7 @@ def test_small_integration_test(self): @slow def test_small_v1_1_integration_test(self): """ - For comparision run: + For comparison run: >>> import t5 # pip install t5==0.7.1 >>> from t5.data.sentencepiece_vocabulary import SentencePieceVocabulary @@ -634,7 +634,7 @@ def test_small_v1_1_integration_test(self): @slow def test_small_byt5_integration_test(self): """ - For comparision run: + For comparison run: >>> import t5 # pip install t5==0.9.1 >>> path_to_byt5_small_checkpoint = '' diff --git a/tests/models/t5/test_modeling_t5.py b/tests/models/t5/test_modeling_t5.py index 03a6adb1a916..fb807edc0bc7 100644 --- a/tests/models/t5/test_modeling_t5.py +++ b/tests/models/t5/test_modeling_t5.py @@ -1208,7 +1208,7 @@ def test_small_generation(self): @slow def test_small_integration_test(self): """ - For comparision run: + For comparison run: >>> import t5 # pip install t5==0.7.1 >>> from t5.data.sentencepiece_vocabulary import SentencePieceVocabulary @@ -1234,7 +1234,7 @@ def test_small_integration_test(self): @slow def test_small_v1_1_integration_test(self): """ - For comparision run: + For comparison run: >>> import t5 # pip install t5==0.7.1 >>> from t5.data.sentencepiece_vocabulary import SentencePieceVocabulary @@ -1260,7 +1260,7 @@ def test_small_v1_1_integration_test(self): @slow def test_small_byt5_integration_test(self): """ - For comparision run: + For comparison run: >>> import t5 # pip install t5==0.9.1 >>> path_to_byt5_small_checkpoint = '' diff --git a/tests/models/t5/test_modeling_tf_t5.py b/tests/models/t5/test_modeling_tf_t5.py index 037f1b1e2188..7e6367582ead 100644 --- a/tests/models/t5/test_modeling_tf_t5.py +++ b/tests/models/t5/test_modeling_tf_t5.py @@ -618,7 +618,7 @@ def model(self): @slow def test_small_integration_test(self): """ - For comparision run: + For comparison run: >>> import t5 # pip install t5==0.7.1 >>> from 
t5.data.sentencepiece_vocabulary import SentencePieceVocabulary @@ -644,7 +644,7 @@ def test_small_integration_test(self): @slow def test_small_v1_1_integration_test(self): """ - For comparision run: + For comparison run: >>> import t5 # pip install t5==0.7.1 >>> from t5.data.sentencepiece_vocabulary import SentencePieceVocabulary @@ -670,7 +670,7 @@ def test_small_v1_1_integration_test(self): @slow def test_small_byt5_integration_test(self): """ - For comparision run: + For comparison run: >>> import t5 # pip install t5==0.9.1 >>> path_to_byt5_small_checkpoint = '' diff --git a/tests/models/t5/test_tokenization_t5.py b/tests/models/t5/test_tokenization_t5.py index e64882b6d3fc..a7ad5320af2f 100644 --- a/tests/models/t5/test_tokenization_t5.py +++ b/tests/models/t5/test_tokenization_t5.py @@ -498,7 +498,7 @@ def setUpClass(cls): tokenizer.add_special_tokens( {"additional_special_tokens": [AddedToken("", rstrip=False, lstrip=False)]} ) - # TODO ArthurZ the above is necessary as addedTokens / intialization sucks. Trie is not correctly created + # TODO ArthurZ the above is necessary as addedTokens / initialization sucks. Trie is not correctly created # So the extra ids are split.... cls.tokenizer = tokenizer diff --git a/tests/models/table_transformer/test_modeling_table_transformer.py b/tests/models/table_transformer/test_modeling_table_transformer.py index aa4b7131f949..9995aae7d4b4 100644 --- a/tests/models/table_transformer/test_modeling_table_transformer.py +++ b/tests/models/table_transformer/test_modeling_table_transformer.py @@ -477,10 +477,10 @@ def test_different_timm_backbone(self): self.model_tester.num_labels + 1, ) self.assertEqual(outputs.logits.shape, expected_shape) - # Confirm out_indices was propogated to backbone + # Confirm out_indices was propagated to backbone self.assertEqual(len(model.model.backbone.conv_encoder.intermediate_channel_sizes), 3) else: - # Confirm out_indices was propogated to backbone + # Confirm out_indices was propagated to backbone self.assertEqual(len(model.backbone.conv_encoder.intermediate_channel_sizes), 3) self.assertTrue(outputs) @@ -509,10 +509,10 @@ def test_hf_backbone(self): self.model_tester.num_labels + 1, ) self.assertEqual(outputs.logits.shape, expected_shape) - # Confirm out_indices was propogated to backbone + # Confirm out_indices was propagated to backbone self.assertEqual(len(model.model.backbone.conv_encoder.intermediate_channel_sizes), 3) else: - # Confirm out_indices was propogated to backbone + # Confirm out_indices was propagated to backbone self.assertEqual(len(model.backbone.conv_encoder.intermediate_channel_sizes), 3) self.assertTrue(outputs) diff --git a/tests/models/tvp/test_modeling_tvp.py b/tests/models/tvp/test_modeling_tvp.py index 6ab0dffde4eb..3ac01c53d52b 100644 --- a/tests/models/tvp/test_modeling_tvp.py +++ b/tests/models/tvp/test_modeling_tvp.py @@ -194,7 +194,7 @@ def test_inputs_embeds(self): def test_model_get_set_embeddings(self): pass - # override as the `logit_scale` parameter initilization is different for TVP + # override as the `logit_scale` parameter initialization is different for TVP def test_initialization(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -219,7 +219,7 @@ def _validate_backbone_init(): model.to(torch_device) model.eval() - # Confirm out_indices propogated to backbone + # Confirm out_indices propagated to backbone if model.__class__.__name__ == "TvpModel": self.assertEqual(len(model.vision_model.backbone.out_indices), 2) elif 
model.__class__.__name__ == "TvpForVideoGrounding": diff --git a/tests/models/univnet/test_modeling_univnet.py b/tests/models/univnet/test_modeling_univnet.py index 9a7ade715527..003c63a3e640 100644 --- a/tests/models/univnet/test_modeling_univnet.py +++ b/tests/models/univnet/test_modeling_univnet.py @@ -227,7 +227,7 @@ def get_inputs(self, device, num_samples: int = 3, noise_length: int = 10, seed: noise_sequence_shape = (64, noise_length) else: noise_sequence_shape = (num_samples, 64, noise_length) - # Explicity generate noise_sequence on CPU for consistency. + # Explicitly generate noise_sequence on CPU for consistency. noise_sequence = torch.randn(noise_sequence_shape, generator=generator, dtype=torch.float32, device="cpu") # Put noise_sequence on the desired device. noise_sequence = noise_sequence.to(device) diff --git a/tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.py b/tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.py index a680e504cd63..317048550e5c 100644 --- a/tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.py +++ b/tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.py @@ -216,7 +216,10 @@ def check_save_and_load_encoder_decoder_model( out_2 = outputs[0].cpu().numpy() out_2[np.isnan(out_2)] = 0 - with tempfile.TemporaryDirectory() as encoder_tmp_dirname, tempfile.TemporaryDirectory() as decoder_tmp_dirname: + with ( + tempfile.TemporaryDirectory() as encoder_tmp_dirname, + tempfile.TemporaryDirectory() as decoder_tmp_dirname, + ): enc_dec_model.encoder.save_pretrained(encoder_tmp_dirname) enc_dec_model.decoder.save_pretrained(decoder_tmp_dirname) VisionEncoderDecoderModel.from_encoder_decoder_pretrained( diff --git a/tests/models/vitmatte/test_modeling_vitmatte.py b/tests/models/vitmatte/test_modeling_vitmatte.py index 5abbc774135d..b75cfc886c0c 100644 --- a/tests/models/vitmatte/test_modeling_vitmatte.py +++ b/tests/models/vitmatte/test_modeling_vitmatte.py @@ -244,7 +244,7 @@ def _validate_backbone_init(): model.eval() if model.__class__.__name__ == "VitMatteForImageMatting": - # Confirm out_indices propogated to backbone + # Confirm out_indices propagated to backbone self.assertEqual(len(model.backbone.out_indices), 2) config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/wav2vec2/test_modeling_flax_wav2vec2.py b/tests/models/wav2vec2/test_modeling_flax_wav2vec2.py index 3dadab891e52..aa55557691b5 100644 --- a/tests/models/wav2vec2/test_modeling_flax_wav2vec2.py +++ b/tests/models/wav2vec2/test_modeling_flax_wav2vec2.py @@ -616,9 +616,10 @@ def test_wav2vec2_with_lm_pool(self): self.assertEqual(transcription[0], "bien y qué regalo vas a abrir primero") # user-managed pool + num_processes should trigger a warning - with CaptureLogger(processing_wav2vec2_with_lm.logger) as cl, multiprocessing.get_context("fork").Pool( - 2 - ) as pool: + with ( + CaptureLogger(processing_wav2vec2_with_lm.logger) as cl, + multiprocessing.get_context("fork").Pool(2) as pool, + ): transcription = processor.batch_decode(np.array(logits), pool, num_processes=2).text self.assertIn("num_process", cl.out) diff --git a/tests/models/wav2vec2/test_modeling_tf_wav2vec2.py b/tests/models/wav2vec2/test_modeling_tf_wav2vec2.py index 7508e4fc01fb..593d627ccf97 100644 --- a/tests/models/wav2vec2/test_modeling_tf_wav2vec2.py +++ b/tests/models/wav2vec2/test_modeling_tf_wav2vec2.py @@ -712,9 +712,10 @@ def test_wav2vec2_with_lm_pool(self): 
self.assertEqual(transcription[0], "el libro ha sido escrito por cervantes") # user-managed pool + num_processes should trigger a warning - with CaptureLogger(processing_wav2vec2_with_lm.logger) as cl, multiprocessing.get_context("fork").Pool( - 2 - ) as pool: + with ( + CaptureLogger(processing_wav2vec2_with_lm.logger) as cl, + multiprocessing.get_context("fork").Pool(2) as pool, + ): transcription = processor.batch_decode(logits.numpy(), pool, num_processes=2).text self.assertIn("num_process", cl.out) diff --git a/tests/models/wav2vec2/test_modeling_wav2vec2.py b/tests/models/wav2vec2/test_modeling_wav2vec2.py index 8199ba04e1f8..df5fd7c452bf 100644 --- a/tests/models/wav2vec2/test_modeling_wav2vec2.py +++ b/tests/models/wav2vec2/test_modeling_wav2vec2.py @@ -1885,9 +1885,10 @@ def test_wav2vec2_with_lm_pool(self): self.assertEqual(transcription[0], "habitan aguas poco profundas y rocosas") # user-managed pool + num_processes should trigger a warning - with CaptureLogger(processing_wav2vec2_with_lm.logger) as cl, multiprocessing.get_context("fork").Pool( - 2 - ) as pool: + with ( + CaptureLogger(processing_wav2vec2_with_lm.logger) as cl, + multiprocessing.get_context("fork").Pool(2) as pool, + ): transcription = processor.batch_decode(logits.cpu().numpy(), pool, num_processes=2).text self.assertIn("num_process", cl.out) diff --git a/tests/models/x_clip/test_modeling_x_clip.py b/tests/models/x_clip/test_modeling_x_clip.py index 85265589050a..ac402d2ff9ca 100644 --- a/tests/models/x_clip/test_modeling_x_clip.py +++ b/tests/models/x_clip/test_modeling_x_clip.py @@ -588,7 +588,7 @@ def test_initialization(self): model = model_class(config=configs_no_init) for name, param in model.named_parameters(): if param.requires_grad: - # check if `logit_scale` is initilized as per the original implementation + # check if `logit_scale` is initialized as per the original implementation if name == "logit_scale": self.assertAlmostEqual( param.data.item(), diff --git a/tests/models/xglm/test_modeling_tf_xglm.py b/tests/models/xglm/test_modeling_tf_xglm.py index 7ddab81a2611..56094003279e 100644 --- a/tests/models/xglm/test_modeling_tf_xglm.py +++ b/tests/models/xglm/test_modeling_tf_xglm.py @@ -208,7 +208,7 @@ def test_batch_generation(self): # use different length sentences to test batching sentences = [ - "This is an extremelly long sentence that only exists to test the ability of the model to cope with " + "This is an extremely long sentence that only exists to test the ability of the model to cope with " "left-padding, such as in batched generation. The output for the sequence below should be the same " "regardless of whether left padding is applied or not. When", "Hello, my dog is a little", @@ -230,7 +230,7 @@ def test_batch_generation(self): padded_sentence = tokenizer.decode(output_padded[0], skip_special_tokens=True) expected_output_sentence = [ - "This is an extremelly long sentence that only exists to test the ability of the model to cope with " + "This is an extremely long sentence that only exists to test the ability of the model to cope with " "left-padding, such as in batched generation. The output for the sequence below should be the same " "regardless of whether left padding is applied or not. 
When left padding is applied, the sequence will be " "a single", diff --git a/tests/models/xglm/test_modeling_xglm.py b/tests/models/xglm/test_modeling_xglm.py index 88f3c13497eb..31c298132e14 100644 --- a/tests/models/xglm/test_modeling_xglm.py +++ b/tests/models/xglm/test_modeling_xglm.py @@ -371,7 +371,7 @@ def test_batch_generation(self): # use different length sentences to test batching sentences = [ - "This is an extremelly long sentence that only exists to test the ability of the model to cope with " + "This is an extremely long sentence that only exists to test the ability of the model to cope with " "left-padding, such as in batched generation. The output for the sequence below should be the same " "regardless of whether left padding is applied or not. When", "Hello, my dog is a little", @@ -395,7 +395,7 @@ def test_batch_generation(self): padded_sentence = tokenizer.decode(output_padded[0], skip_special_tokens=True) expected_output_sentence = [ - "This is an extremelly long sentence that only exists to test the ability of the model to cope with " + "This is an extremely long sentence that only exists to test the ability of the model to cope with " "left-padding, such as in batched generation. The output for the sequence below should be the same " "regardless of whether left padding is applied or not. When left padding is applied, the sequence will be " "a single", diff --git a/tests/sagemaker/scripts/pytorch/run_ddp.py b/tests/sagemaker/scripts/pytorch/run_ddp.py index 1191caeb96a2..474285841b0d 100644 --- a/tests/sagemaker/scripts/pytorch/run_ddp.py +++ b/tests/sagemaker/scripts/pytorch/run_ddp.py @@ -36,12 +36,12 @@ def main(): --master_addr={hosts[0]} \ --master_port={port} \ ./run_glue.py \ - {"".join([f" --{parameter} {value}" for parameter,value in args.__dict__.items()])}""" + {"".join([f" --{parameter} {value}" for parameter, value in args.__dict__.items()])}""" else: cmd = f"""python -m torch.distributed.launch \ --nproc_per_node={num_gpus} \ ./run_glue.py \ - {"".join([f" --{parameter} {value}" for parameter,value in args.__dict__.items()])}""" + {"".join([f" --{parameter} {value}" for parameter, value in args.__dict__.items()])}""" try: subprocess.run(cmd, shell=True) except Exception as e: diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py index f5723d58320f..eba94a45c927 100644 --- a/tests/test_tokenization_common.py +++ b/tests/test_tokenization_common.py @@ -2065,21 +2065,21 @@ def test_encode_decode_fast_slow_all_tokens(self): for chunk in range(0, len(input_full_vocab_string) - 1024, 1024): string_to_check = input_full_vocab_string[chunk : chunk + 1024] - with self.subTest(f"{(chunk/len(input_full_vocab_string))*100}%"): + with self.subTest(f"{(chunk / len(input_full_vocab_string)) * 100}%"): slow_encode = slow_tokenizer.encode(string_to_check) fast_encode = rust_tokenizer.encode(string_to_check) self.assertEqual( slow_encode, fast_encode, "Hint: the following tokenization diff were obtained for slow vs fast:\n " - f"elements in slow: {set(slow_tokenizer.tokenize(string_to_check))-set(rust_tokenizer.tokenize(string_to_check))} \nvs\n " - f"elements in fast: {set(rust_tokenizer.tokenize(string_to_check))-set(slow_tokenizer.tokenize(string_to_check))} \n" + f"elements in slow: {set(slow_tokenizer.tokenize(string_to_check)) - set(rust_tokenizer.tokenize(string_to_check))} \nvs\n " + f"elements in fast: {set(rust_tokenizer.tokenize(string_to_check)) - set(slow_tokenizer.tokenize(string_to_check))} \n" f"string used : 
{string_to_check}", ) print(f"Length of the input ids that is tested: {len(input_full_vocab_ids)}") for chunk in range(0, len(input_full_vocab_ids) - 100, 100): ids_to_decode = input_full_vocab_ids[chunk : chunk + 100] - with self.subTest(f"{(chunk/len(input_full_vocab_string))*100}%"): + with self.subTest(f"{(chunk / len(input_full_vocab_string)) * 100}%"): self.assertEqual( slow_tokenizer.decode( ids_to_decode, @@ -4423,7 +4423,7 @@ def test_training_new_tokenizer_with_special_tokens_change(self): self.assertTrue( find, f"'{special_token.__repr__()}' should appear as an `AddedToken` in the all_special_tokens_extended = " - f"{[k for k in new_tokenizer.all_special_tokens_extended if str(k)==new_special_token_str]} but it is missing" + f"{[k for k in new_tokenizer.all_special_tokens_extended if str(k) == new_special_token_str]} but it is missing" ", this means that the new tokenizers did not keep the `rstrip`, `lstrip`, `normalized` etc attributes.", ) elif special_token not in special_tokens_map: diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 254202142565..6ea5d785231c 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -3200,7 +3200,7 @@ def test_can_resume_training_lm(self): # Checkpoint at intermediate step enable_full_determinism(0) - checkpoint = os.path.join(tmpdir, f"checkpoint-{resume_from_step+1}") + checkpoint = os.path.join(tmpdir, f"checkpoint-{resume_from_step + 1}") trainer = get_language_model_trainer(**kwargs) trainer.train(resume_from_checkpoint=checkpoint) model_params = torch.cat([p.cpu().flatten() for p in trainer.model.parameters()])