Skip to content

Commit e13b01c

Browse files
fixes
1 parent e10241b commit e13b01c

File tree

16 files changed

+67
-13
lines changed

16 files changed

+67
-13
lines changed

configs/token_classification/local_dataset.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,10 @@ backend: local
77
data:
88
path: data/ # this must be the path to the directory containing the train and valid files
99
train_split: train # name of the train file without its extension (e.g. train for train.json)
10-
valid_split: valid # this must be either valid.json, can also be set to null
10+
valid_split: test # name of the validation file without its extension (e.g. test for test.json), can also be set to null
1111
column_mapping:
12-
text_column: text # this must be the name of the column containing the text
13-
target_column: label # this must be the name of the column containing the target
12+
tokens_column: tokens # this must be the name of the column containing the tokens
13+
tags_column: tags # this must be the name of the column containing the tags
1414

1515
params:
1616
max_seq_length: 512

docs/source/tasks/token_classification.mdx

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,8 @@ The data should be in the following CSV format:
1111

1212
```csv
1313
tokens,tags
14-
"['I', 'love', 'Paris']", "['O', 'O', 'B-LOC']"
15-
"['I', 'live', 'in', 'New', 'York']", "['O', 'O', 'O', 'B-LOC', 'I-LOC']"
14+
"['I', 'love', 'Paris']","['O', 'O', 'B-LOC']"
15+
"['I', 'live', 'in', 'New', 'York']","['O', 'O', 'O', 'B-LOC', 'I-LOC']"
1616
.
1717
.
1818
.
@@ -21,8 +21,8 @@ tokens,tags
2121
or you can also use JSONL format:
2222

2323
```json
24-
{"tokens": ["I", "love", "Paris"], "tags": ["O", "O", "B-LOC"]}
25-
{"tokens": ["I", "live", "in", "New", "York"], "tags": ["O", "O", "O", "B-LOC", "I-LOC"]}
24+
{"tokens": ["I", "love", "Paris"],"tags": ["O", "O", "B-LOC"]}
25+
{"tokens": ["I", "live", "in", "New", "York"],"tags": ["O", "O", "O", "B-LOC", "I-LOC"]}
2626
.
2727
.
2828
.
@@ -51,6 +51,10 @@ for chunk in pd.read_csv('example.csv', chunksize=chunk_size):
5151
i += 1
5252
```
5353

54+
55+
Sample dataset from HuggingFace Hub: [conll2003](https://huggingface.co/datasets/eriktks/conll2003)
56+
57+
5458
## Columns
5559

5660
Your CSV/JSONL dataset must have two columns: `tokens` and `tags`.

src/autotrain/app/params.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -397,8 +397,8 @@ def _munge_params_token_clf(self):
397397
_params["tags_column"] = "autotrain_label"
398398
_params["valid_split"] = "validation"
399399
else:
400-
_params["tokens_column"] = self.column_mapping.get("text" if not self.api else "tokens_column", "text")
401-
_params["tags_column"] = self.column_mapping.get("label" if not self.api else "tags_column", "label")
400+
_params["tokens_column"] = self.column_mapping.get("tokens" if not self.api else "tokens_column", "tokens")
401+
_params["tags_column"] = self.column_mapping.get("tags" if not self.api else "tags_column", "tags")
402402
_params["train_split"] = self.train_split
403403
_params["valid_split"] = self.valid_split
404404

src/autotrain/app/templates/index.html

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@
7373
fieldNames = ['text', 'target'];
7474
break;
7575
case 'token-classification':
76-
fields = ['text', 'label'];
76+
fields = ['tokens', 'tags'];
7777
fieldNames = ['tokens', 'tags'];
7878
break;
7979
case 'dreambooth':

src/autotrain/project.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -262,8 +262,8 @@ def token_clf_munge_data(params, local):
262262
)
263263
params.data_path = dset.prepare()
264264
params.valid_split = "validation"
265-
params.text_column = "autotrain_text"
266-
params.target_column = "autotrain_label"
265+
params.tokens_column = "autotrain_text"
266+
params.tags_column = "autotrain_label"
267267
return params
268268

269269

src/autotrain/trainers/clm/utils.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -494,12 +494,14 @@ def process_input_data(config):
494494
name=dataset_config_name,
495495
split=split,
496496
token=config.token,
497+
trust_remote_code=ALLOW_REMOTE_CODE,
497498
)
498499
else:
499500
train_data = load_dataset(
500501
config.data_path,
501502
split=config.train_split,
502503
token=config.token,
504+
trust_remote_code=ALLOW_REMOTE_CODE,
503505
)
504506
# rename columns for reward trainer
505507
if config.trainer in ("dpo", "reward", "orpo"):
@@ -522,12 +524,14 @@ def process_input_data(config):
522524
name=dataset_config_name,
523525
split=split,
524526
token=config.token,
527+
trust_remote_code=ALLOW_REMOTE_CODE,
525528
)
526529
else:
527530
valid_data = load_dataset(
528531
config.data_path,
529532
split=config.valid_split,
530533
token=config.token,
534+
trust_remote_code=ALLOW_REMOTE_CODE,
531535
)
532536

533537
if config.trainer in ("dpo", "reward", "orpo"):

src/autotrain/trainers/extractive_question_answering/__main__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,12 +59,14 @@ def train(config):
5959
name=dataset_config_name,
6060
split=split,
6161
token=config.token,
62+
trust_remote_code=ALLOW_REMOTE_CODE,
6263
)
6364
else:
6465
train_data = load_dataset(
6566
config.data_path,
6667
split=config.train_split,
6768
token=config.token,
69+
trust_remote_code=ALLOW_REMOTE_CODE,
6870
)
6971

7072
if config.valid_split is not None:
@@ -79,12 +81,14 @@ def train(config):
7981
name=dataset_config_name,
8082
split=split,
8183
token=config.token,
84+
trust_remote_code=ALLOW_REMOTE_CODE,
8285
)
8386
else:
8487
valid_data = load_dataset(
8588
config.data_path,
8689
split=config.valid_split,
8790
token=config.token,
91+
trust_remote_code=ALLOW_REMOTE_CODE,
8892
)
8993

9094
logger.info(train_data)

src/autotrain/trainers/image_classification/__main__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,12 +52,14 @@ def train(config):
5252
name=dataset_config_name,
5353
split=split,
5454
token=config.token,
55+
trust_remote_code=ALLOW_REMOTE_CODE,
5556
)
5657
else:
5758
train_data = load_dataset(
5859
config.data_path,
5960
split=config.train_split,
6061
token=config.token,
62+
trust_remote_code=ALLOW_REMOTE_CODE,
6163
)
6264

6365
if config.valid_split is not None:
@@ -71,12 +73,14 @@ def train(config):
7173
name=dataset_config_name,
7274
split=split,
7375
token=config.token,
76+
trust_remote_code=ALLOW_REMOTE_CODE,
7477
)
7578
else:
7679
valid_data = load_dataset(
7780
config.data_path,
7881
split=config.valid_split,
7982
token=config.token,
83+
trust_remote_code=ALLOW_REMOTE_CODE,
8084
)
8185

8286
logger.info(f"Train data: {train_data}")

src/autotrain/trainers/image_regression/__main__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,12 +52,14 @@ def train(config):
5252
name=dataset_config_name,
5353
split=split,
5454
token=config.token,
55+
trust_remote_code=ALLOW_REMOTE_CODE,
5556
)
5657
else:
5758
train_data = load_dataset(
5859
config.data_path,
5960
split=config.train_split,
6061
token=config.token,
62+
trust_remote_code=ALLOW_REMOTE_CODE,
6163
)
6264

6365
if config.valid_split is not None:
@@ -71,12 +73,14 @@ def train(config):
7173
name=dataset_config_name,
7274
split=split,
7375
token=config.token,
76+
trust_remote_code=ALLOW_REMOTE_CODE,
7477
)
7578
else:
7679
valid_data = load_dataset(
7780
config.data_path,
7881
split=config.valid_split,
7982
token=config.token,
83+
trust_remote_code=ALLOW_REMOTE_CODE,
8084
)
8185

8286
logger.info(f"Train data: {train_data}")

src/autotrain/trainers/object_detection/__main__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,12 +53,14 @@ def train(config):
5353
name=dataset_config_name,
5454
split=split,
5555
token=config.token,
56+
trust_remote_code=ALLOW_REMOTE_CODE,
5657
)
5758
else:
5859
train_data = load_dataset(
5960
config.data_path,
6061
split=config.train_split,
6162
token=config.token,
63+
trust_remote_code=ALLOW_REMOTE_CODE,
6264
)
6365

6466
if config.valid_split is not None:
@@ -72,12 +74,14 @@ def train(config):
7274
name=dataset_config_name,
7375
split=split,
7476
token=config.token,
77+
trust_remote_code=ALLOW_REMOTE_CODE,
7578
)
7679
else:
7780
valid_data = load_dataset(
7881
config.data_path,
7982
split=config.valid_split,
8083
token=config.token,
84+
trust_remote_code=ALLOW_REMOTE_CODE,
8185
)
8286

8387
logger.info(f"Train data: {train_data}")

0 commit comments

Comments
 (0)