
Commit 77872d7

fix compatibility with higher versions of transformers

1 parent c66c961

3 files changed: +11 −10 lines

delft/sequenceLabelling/preprocess.py

Lines changed: 8 additions & 2 deletions
@@ -321,8 +321,14 @@ def convert_single_text(self, text_tokens, chars_tokens, features_tokens, label_
             chars_tokens.append(self.empty_char_vector)
 
         # sub-tokenization
-        encoded_result = self.tokenizer(text_tokens, add_special_tokens=True, is_split_into_words=True,
-                                        max_length=max_seq_length, truncation=True, return_offsets_mapping=True)
+        encoded_result = self.tokenizer(
+            text_tokens,
+            add_special_tokens=True,
+            is_split_into_words=True,
+            max_length=max_seq_length,
+            truncation=True,
+            return_offsets_mapping=True
+        )
 
         input_ids = encoded_result.input_ids
         offsets = encoded_result.offset_mapping
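
Note: the call above only changes in layout. For reference, a minimal standalone sketch of what this sub-tokenization step does; the model name, sentence, and 512-token limit are illustrative, not from the commit, and return_offsets_mapping requires a fast (Rust-backed) tokenizer:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")  # illustrative model

# the input is already split into words, as in convert_single_text
encoded = tokenizer(
    ["John", "lives", "in", "Singapore"],
    add_special_tokens=True,       # adds [CLS] / [SEP]
    is_split_into_words=True,      # treat the list as pre-tokenized words
    max_length=512,                # illustrative limit
    truncation=True,
    return_offsets_mapping=True    # fast tokenizers only
)
print(encoded.input_ids)        # sub-token ids, special tokens included
print(encoded.offset_mapping)   # (start, end) character offsets per sub-token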

delft/sequenceLabelling/trainer.py

Lines changed: 2 additions & 2 deletions
@@ -168,8 +168,8 @@ def train_model(self, local_model, x_train, y_train, f_train=None,
 
         # multiple workers should work with transformer layers, but not with ELMo due to GPU memory limit (with GTX 1080Ti 11GB)
         if self.model_config.transformer_name is not None or (self.embeddings and self.embeddings.use_ELMo):
-            # worker at 0 means the training will be executed in the main thread
-            nb_workers = 0
+            # worker at 1 means the training will be executed in the main thread
+            nb_workers = 1
             multiprocessing = False
 
         local_model.fit(training_generator,
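
Note: for context, a minimal sketch of how nb_workers and multiprocessing are typically handed to tf.keras Model.fit for generator input; the argument names are from tf.keras 2.x, and the epoch variable is hypothetical since the rest of the fit() call is truncated in the hunk:

# minimal sketch, assuming tf.keras 2.x generator-based training
local_model.fit(
    training_generator,                   # a keras.utils.Sequence or generator
    epochs=max_epoch,                     # hypothetical variable for the epoch count
    workers=nb_workers,                   # with use_multiprocessing=False, 1 = one loader thread
    use_multiprocessing=multiprocessing   # False = thread-based loading
)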

delft/utilities/Transformer.py

Lines changed: 1 addition & 6 deletions
@@ -128,35 +128,30 @@ def init_preprocessor(self, max_sequence_length: int,
                do_lower_case = False
 
        if do_lower_case is not None:
-           if self.auth_token != None:
+           if self.auth_token is not None:
                self.tokenizer = AutoTokenizer.from_pretrained(self.name,
-                                                              add_special_tokens=add_special_tokens,
                                                               max_length=max_sequence_length,
                                                               add_prefix_space=add_prefix_space,
                                                               do_lower_case=do_lower_case,
                                                               use_auth_token=self.auth_token)
            else:
                self.tokenizer = AutoTokenizer.from_pretrained(self.name,
-                                                              add_special_tokens=add_special_tokens,
                                                               max_length=max_sequence_length,
                                                               add_prefix_space=add_prefix_space,
                                                               do_lower_case=do_lower_case)
        else:
            if self.auth_token != None:
                self.tokenizer = AutoTokenizer.from_pretrained(self.name,
-                                                              add_special_tokens=add_special_tokens,
                                                               max_length=max_sequence_length,
                                                               add_prefix_space=add_prefix_space,
                                                               use_auth_token=self.auth_token)
            else:
                self.tokenizer = AutoTokenizer.from_pretrained(self.name,
-                                                              add_special_tokens=add_special_tokens,
                                                               max_length=max_sequence_length,
                                                               add_prefix_space=add_prefix_space)
 
    elif self.loading_method == LOADING_METHOD_LOCAL_MODEL_DIR:
        self.tokenizer = AutoTokenizer.from_pretrained(self.local_dir_path,
-                                                      add_special_tokens=add_special_tokens,
                                                       max_length=max_sequence_length,
                                                       add_prefix_space=add_prefix_space)
    elif self.loading_method == LOADING_METHOD_PLAIN_MODEL:
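
Note: dropping the add_special_tokens keyword from every from_pretrained() call is the compatibility fix the commit message refers to: in transformers, add_special_tokens is an encoding-time argument of the tokenizer call rather than a loading option, and the commit indicates that passing it at load time no longer works with newer releases. A minimal sketch of the call-time usage; the model name and sentence are illustrative:

from transformers import AutoTokenizer

# load without add_special_tokens; it is not a loading option here
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")  # illustrative model

# request (or suppress) special tokens at encoding time instead
with_specials = tokenizer("a sample sentence", add_special_tokens=True)
without_specials = tokenizer("a sample sentence", add_special_tokens=False)
print(len(with_specials.input_ids) - len(without_specials.input_ids))  # 2 for BERT ([CLS], [SEP])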
