fixes

abhishekkrthakur · abhishekkrthakur · commit e13b01cae8d6 · 2024-11-27T10:48:13.000+01:00
diff --git a/configs/token_classification/local_dataset.yml b/configs/token_classification/local_dataset.yml
@@ -7,10 +7,10 @@ backend: local
 data:
   path: data/ # this must be the path to the directory containing the train and valid files
   train_split: train # name of the train split, i.e. train for train.json
-  valid_split: valid # this must be either valid.json, can also be set to null
+  valid_split: test # name of the validation split, i.e. test for test.json; can also be set to null
   column_mapping:
-    text_column: text # this must be the name of the column containing the text
-    target_column: label # this must be the name of the column containing the target
+    tokens_column: tokens # this must be the name of the column containing the tokens
+    tags_column: tags # this must be the name of the column containing the tags
 
 params:
   max_seq_length: 512
diff --git a/docs/source/tasks/token_classification.mdx b/docs/source/tasks/token_classification.mdx
@@ -11,8 +11,8 @@ The data should be in the following CSV format:
 
 ```csv
 tokens,tags
-"['I', 'love', 'Paris']", "['O', 'O', 'B-LOC']"
-"['I', 'live', 'in', 'New', 'York']", "['O', 'O', 'O', 'B-LOC', 'I-LOC']"
+"['I', 'love', 'Paris']","['O', 'O', 'B-LOC']"
+"['I', 'live', 'in', 'New', 'York']","['O', 'O', 'O', 'B-LOC', 'I-LOC']"
 .
 .
 .
@@ -21,8 +21,8 @@ tokens,tags
 or you can also use JSONL format:
 
 ```json
-{"tokens": ["I", "love", "Paris"], "tags": ["O", "O", "B-LOC"]}
-{"tokens": ["I", "live", "in", "New", "York"], "tags": ["O", "O", "O", "B-LOC", "I-LOC"]}
+{"tokens": ["I", "love", "Paris"],"tags": ["O", "O", "B-LOC"]}
+{"tokens": ["I", "live", "in", "New", "York"],"tags": ["O", "O", "O", "B-LOC", "I-LOC"]}
 .
 .
 .
@@ -51,6 +51,10 @@ for chunk in pd.read_csv('example.csv', chunksize=chunk_size):
     i += 1
 ```
 
+
+Sample dataset from the Hugging Face Hub: [conll2003](https://huggingface.co/datasets/eriktks/conll2003)
+
+
 ## Columns
 
 Your CSV/JSONL dataset must have two columns: `tokens` and `tags`.
diff --git a/src/autotrain/app/params.py b/src/autotrain/app/params.py
@@ -397,8 +397,8 @@ def _munge_params_token_clf(self):
             _params["tags_column"] = "autotrain_label"
             _params["valid_split"] = "validation"
         else:
-            _params["tokens_column"] = self.column_mapping.get("text" if not self.api else "tokens_column", "text")
-            _params["tags_column"] = self.column_mapping.get("label" if not self.api else "tags_column", "label")
+            _params["tokens_column"] = self.column_mapping.get("tokens" if not self.api else "tokens_column", "tokens")
+            _params["tags_column"] = self.column_mapping.get("tags" if not self.api else "tags_column", "tags")
             _params["train_split"] = self.train_split
             _params["valid_split"] = self.valid_split
 
diff --git a/src/autotrain/app/templates/index.html b/src/autotrain/app/templates/index.html
@@ -73,7 +73,7 @@
                     fieldNames = ['text', 'target'];
                     break;
                 case 'token-classification':
-                    fields = ['text', 'label'];
+                    fields = ['tokens', 'tags'];
                     fieldNames = ['tokens', 'tags'];
                     break;
                 case 'dreambooth':
diff --git a/src/autotrain/project.py b/src/autotrain/project.py
@@ -262,8 +262,8 @@ def token_clf_munge_data(params, local):
         )
         params.data_path = dset.prepare()
         params.valid_split = "validation"
-        params.text_column = "autotrain_text"
-        params.target_column = "autotrain_label"
+        params.tokens_column = "autotrain_text"
+        params.tags_column = "autotrain_label"
     return params
 
 
diff --git a/src/autotrain/trainers/clm/utils.py b/src/autotrain/trainers/clm/utils.py
@@ -494,12 +494,14 @@ def process_input_data(config):
                 name=dataset_config_name,
                 split=split,
                 token=config.token,
+                trust_remote_code=ALLOW_REMOTE_CODE,
             )
         else:
             train_data = load_dataset(
                 config.data_path,
                 split=config.train_split,
                 token=config.token,
+                trust_remote_code=ALLOW_REMOTE_CODE,
             )
     # rename columns for reward trainer
     if config.trainer in ("dpo", "reward", "orpo"):
@@ -522,12 +524,14 @@ def process_input_data(config):
                     name=dataset_config_name,
                     split=split,
                     token=config.token,
+                    trust_remote_code=ALLOW_REMOTE_CODE,
                 )
             else:
                 valid_data = load_dataset(
                     config.data_path,
                     split=config.valid_split,
                     token=config.token,
+                    trust_remote_code=ALLOW_REMOTE_CODE,
                 )
 
         if config.trainer in ("dpo", "reward", "orpo"):
diff --git a/src/autotrain/trainers/extractive_question_answering/__main__.py b/src/autotrain/trainers/extractive_question_answering/__main__.py
@@ -59,12 +59,14 @@ def train(config):
                     name=dataset_config_name,
                     split=split,
                     token=config.token,
+                    trust_remote_code=ALLOW_REMOTE_CODE,
                 )
             else:
                 train_data = load_dataset(
                     config.data_path,
                     split=config.train_split,
                     token=config.token,
+                    trust_remote_code=ALLOW_REMOTE_CODE,
                 )
 
     if config.valid_split is not None:
@@ -79,12 +81,14 @@ def train(config):
                     name=dataset_config_name,
                     split=split,
                     token=config.token,
+                    trust_remote_code=ALLOW_REMOTE_CODE,
                 )
             else:
                 valid_data = load_dataset(
                     config.data_path,
                     split=config.valid_split,
                     token=config.token,
+                    trust_remote_code=ALLOW_REMOTE_CODE,
                 )
 
     logger.info(train_data)
diff --git a/src/autotrain/trainers/image_classification/__main__.py b/src/autotrain/trainers/image_classification/__main__.py
@@ -52,12 +52,14 @@ def train(config):
                 name=dataset_config_name,
                 split=split,
                 token=config.token,
+                trust_remote_code=ALLOW_REMOTE_CODE,
             )
         else:
             train_data = load_dataset(
                 config.data_path,
                 split=config.train_split,
                 token=config.token,
+                trust_remote_code=ALLOW_REMOTE_CODE,
             )
 
     if config.valid_split is not None:
@@ -71,12 +73,14 @@ def train(config):
                     name=dataset_config_name,
                     split=split,
                     token=config.token,
+                    trust_remote_code=ALLOW_REMOTE_CODE,
                 )
             else:
                 valid_data = load_dataset(
                     config.data_path,
                     split=config.valid_split,
                     token=config.token,
+                    trust_remote_code=ALLOW_REMOTE_CODE,
                 )
 
     logger.info(f"Train data: {train_data}")
diff --git a/src/autotrain/trainers/image_regression/__main__.py b/src/autotrain/trainers/image_regression/__main__.py
@@ -52,12 +52,14 @@ def train(config):
                 name=dataset_config_name,
                 split=split,
                 token=config.token,
+                trust_remote_code=ALLOW_REMOTE_CODE,
             )
         else:
             train_data = load_dataset(
                 config.data_path,
                 split=config.train_split,
                 token=config.token,
+                trust_remote_code=ALLOW_REMOTE_CODE,
             )
 
     if config.valid_split is not None:
@@ -71,12 +73,14 @@ def train(config):
                     name=dataset_config_name,
                     split=split,
                     token=config.token,
+                    trust_remote_code=ALLOW_REMOTE_CODE,
                 )
             else:
                 valid_data = load_dataset(
                     config.data_path,
                     split=config.valid_split,
                     token=config.token,
+                    trust_remote_code=ALLOW_REMOTE_CODE,
                 )
 
     logger.info(f"Train data: {train_data}")
diff --git a/src/autotrain/trainers/object_detection/__main__.py b/src/autotrain/trainers/object_detection/__main__.py
@@ -53,12 +53,14 @@ def train(config):
                 name=dataset_config_name,
                 split=split,
                 token=config.token,
+                trust_remote_code=ALLOW_REMOTE_CODE,
             )
         else:
             train_data = load_dataset(
                 config.data_path,
                 split=config.train_split,
                 token=config.token,
+                trust_remote_code=ALLOW_REMOTE_CODE,
             )
 
     if config.valid_split is not None:
@@ -72,12 +74,14 @@ def train(config):
                     name=dataset_config_name,
                     split=split,
                     token=config.token,
+                    trust_remote_code=ALLOW_REMOTE_CODE,
                 )
             else:
                 valid_data = load_dataset(
                     config.data_path,
                     split=config.valid_split,
                     token=config.token,
+                    trust_remote_code=ALLOW_REMOTE_CODE,
                 )
 
     logger.info(f"Train data: {train_data}")
diff --git a/src/autotrain/trainers/sent_transformers/__main__.py b/src/autotrain/trainers/sent_transformers/__main__.py
@@ -54,12 +54,14 @@ def train(config):
                     name=dataset_config_name,
                     split=split,
                     token=config.token,
+                    trust_remote_code=ALLOW_REMOTE_CODE,
                 )
             else:
                 train_data = load_dataset(
                     config.data_path,
                     split=config.train_split,
                     token=config.token,
+                    trust_remote_code=ALLOW_REMOTE_CODE,
                 )
 
     if config.valid_split is not None:
@@ -74,12 +76,14 @@ def train(config):
                     name=dataset_config_name,
                     split=split,
                     token=config.token,
+                    trust_remote_code=ALLOW_REMOTE_CODE,
                 )
             else:
                 valid_data = load_dataset(
                     config.data_path,
                     split=config.valid_split,
                     token=config.token,
+                    trust_remote_code=ALLOW_REMOTE_CODE,
                 )
 
     num_classes = None
diff --git a/src/autotrain/trainers/seq2seq/__main__.py b/src/autotrain/trainers/seq2seq/__main__.py
@@ -62,12 +62,14 @@ def train(config):
                     name=dataset_config_name,
                     split=split,
                     token=config.token,
+                    trust_remote_code=ALLOW_REMOTE_CODE,
                 )
             else:
                 train_data = load_dataset(
                     config.data_path,
                     split=config.train_split,
                     token=config.token,
+                    trust_remote_code=ALLOW_REMOTE_CODE,
                 )
 
     if config.valid_split is not None:
@@ -82,12 +84,14 @@ def train(config):
                     name=dataset_config_name,
                     split=split,
                     token=config.token,
+                    trust_remote_code=ALLOW_REMOTE_CODE,
                 )
             else:
                 valid_data = load_dataset(
                     config.data_path,
                     split=config.valid_split,
                     token=config.token,
+                    trust_remote_code=ALLOW_REMOTE_CODE,
                 )
 
     tokenizer = AutoTokenizer.from_pretrained(config.model, token=config.token, trust_remote_code=ALLOW_REMOTE_CODE)
diff --git a/src/autotrain/trainers/tabular/__main__.py b/src/autotrain/trainers/tabular/__main__.py
@@ -13,7 +13,13 @@
 from sklearn.compose import ColumnTransformer
 
 from autotrain import logger
-from autotrain.trainers.common import monitor, pause_space, remove_autotrain_data, save_training_params
+from autotrain.trainers.common import (
+    ALLOW_REMOTE_CODE,
+    monitor,
+    pause_space,
+    remove_autotrain_data,
+    save_training_params,
+)
 from autotrain.trainers.tabular import utils
 from autotrain.trainers.tabular.params import TabularParams
 
@@ -187,12 +193,14 @@ def train(config):
                 name=dataset_config_name,
                 split=split,
                 token=config.token,
+                trust_remote_code=ALLOW_REMOTE_CODE,
             )
         else:
             train_data = load_dataset(
                 config.data_path,
                 split=config.train_split,
                 token=config.token,
+                trust_remote_code=ALLOW_REMOTE_CODE,
             )
     train_data = train_data.to_pandas()
 
@@ -208,12 +216,14 @@ def train(config):
                     name=dataset_config_name,
                     split=split,
                     token=config.token,
+                    trust_remote_code=ALLOW_REMOTE_CODE,
                 )
             else:
                 valid_data = load_dataset(
                     config.data_path,
                     split=config.valid_split,
                     token=config.token,
+                    trust_remote_code=ALLOW_REMOTE_CODE,
                 )
         valid_data = valid_data.to_pandas()
 
diff --git a/src/autotrain/trainers/text_classification/__main__.py b/src/autotrain/trainers/text_classification/__main__.py
@@ -57,12 +57,14 @@ def train(config):
                     name=dataset_config_name,
                     split=split,
                     token=config.token,
+                    trust_remote_code=ALLOW_REMOTE_CODE,
                 )
             else:
                 train_data = load_dataset(
                     config.data_path,
                     split=config.train_split,
                     token=config.token,
+                    trust_remote_code=ALLOW_REMOTE_CODE,
                 )
 
     if config.valid_split is not None:
@@ -77,12 +79,14 @@ def train(config):
                     name=dataset_config_name,
                     split=split,
                     token=config.token,
+                    trust_remote_code=ALLOW_REMOTE_CODE,
                 )
             else:
                 valid_data = load_dataset(
                     config.data_path,
                     split=config.valid_split,
                     token=config.token,
+                    trust_remote_code=ALLOW_REMOTE_CODE,
                 )
 
     classes = train_data.features[config.target_column].names
diff --git a/src/autotrain/trainers/text_regression/__main__.py b/src/autotrain/trainers/text_regression/__main__.py
@@ -57,12 +57,14 @@ def train(config):
                     name=dataset_config_name,
                     split=split,
                     token=config.token,
+                    trust_remote_code=ALLOW_REMOTE_CODE,
                 )
             else:
                 train_data = load_dataset(
                     config.data_path,
                     split=config.train_split,
                     token=config.token,
+                    trust_remote_code=ALLOW_REMOTE_CODE,
                 )
 
     if config.valid_split is not None:
@@ -77,12 +79,14 @@ def train(config):
                     name=dataset_config_name,
                     split=split,
                     token=config.token,
+                    trust_remote_code=ALLOW_REMOTE_CODE,
                 )
             else:
                 valid_data = load_dataset(
                     config.data_path,
                     split=config.valid_split,
                     token=config.token,
+                    trust_remote_code=ALLOW_REMOTE_CODE,
                 )
 
     model_config = AutoConfig.from_pretrained(
diff --git a/src/autotrain/trainers/token_classification/__main__.py b/src/autotrain/trainers/token_classification/__main__.py

Original file line number	Diff line number	Diff line change
`@@ -11,8 +11,8 @@ The data should be in the following CSV format:`
`11`	`11`
`12`	`12`	```csv
`13`	`13`	`tokens,tags`
`14`		`-"['I', 'love', 'Paris']", "['O', 'O', 'B-LOC']"`
`15`		`-"['I', 'live', 'in', 'New', 'York']", "['O', 'O', 'O', 'B-LOC', 'I-LOC']"`
	`14`	`+"['I', 'love', 'Paris']","['O', 'O', 'B-LOC']"`
	`15`	`+"['I', 'live', 'in', 'New', 'York']","['O', 'O', 'O', 'B-LOC', 'I-LOC']"`
`16`	`16`	`.`
`17`	`17`	`.`
`18`	`18`	`.`
`@@ -21,8 +21,8 @@ tokens,tags`
`21`	`21`	`or you can also use JSONL format:`
`22`	`22`
`23`	`23`	```json
`24`		`-{"tokens": ["I", "love", "Paris"], "tags": ["O", "O", "B-LOC"]}`
`25`		`-{"tokens": ["I", "live", "in", "New", "York"], "tags": ["O", "O", "O", "B-LOC", "I-LOC"]}`
	`24`	`+{"tokens": ["I", "love", "Paris"],"tags": ["O", "O", "B-LOC"]}`
	`25`	`+{"tokens": ["I", "live", "in", "New", "York"],"tags": ["O", "O", "O", "B-LOC", "I-LOC"]}`
`26`	`26`	`.`
`27`	`27`	`.`
`28`	`28`	`.`
`@@ -51,6 +51,10 @@ for chunk in pd.read_csv('example.csv', chunksize=chunk_size):`
`51`	`51`	`i += 1`
`52`	`52`	```
`53`	`53`
	`54`	`+`
	`55`	`+Sample dataset from the Hugging Face Hub: [conll2003](https://huggingface.co/datasets/eriktks/conll2003)`
	`56`	`+`
	`57`	`+`
`54`	`58`	`## Columns`
`55`	`59`
`56`	`60`	Your CSV/JSONL dataset must have two columns: `tokens` and `tags`.
Original file line number	Diff line number	Diff line change
`@@ -262,8 +262,8 @@ def token_clf_munge_data(params, local):`
`262`	`262`	`)`
`263`	`263`	`params.data_path = dset.prepare()`
`264`	`264`	`params.valid_split = "validation"`
`265`		`- params.text_column = "autotrain_text"`
`266`		`- params.target_column = "autotrain_label"`
	`265`	`+ params.tokens_column = "autotrain_text"`
	`266`	`+ params.tags_column = "autotrain_label"`
`267`	`267`	`return params`
`268`	`268`
`269`	`269`