forked from deeppavlov/DeepPavlov
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: add a new ner model trained on English and Vietnamese datasets (d…
…eeppavlov#1155) * New NER model trained on English and Vietnamese datasets (CoNLL2003 and VLSP2016) * Add sentence segmentation model trained on DailyDialog * fix the download path to pretrained word embedding file in vlsp2016_full config file * fixed typos and added missing files * added tests for ner and sentseg models * Fix bugs * fix bug downloading datasets and trained models * style: optimize imports in test_quick_start.py * chore: rebuild registry * feat: remove ner_prf1 in favor of ner_f1 expanded for IOB, BILOU and IOBES * fix: correct arguments for metrics in sentseg_dailydialog.json * documentation, description of word_emb_name * fix typos * Update deeppavlov/models/ner/NER_model.py Co-Authored-By: yurakuratov <[email protected]> * fix docstring Co-authored-by: Aleksei Lymar <[email protected]> Co-authored-by: yurakuratov <[email protected]>
- Loading branch information
1 parent
db31b59
commit a797a4a
Showing
11 changed files
with
1,144 additions
and
27 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,152 @@ | ||
{ | ||
"dataset_reader": { | ||
"class_name": "conll2003_reader", | ||
"data_path": "{DOWNLOADS_PATH}/conll2003/", | ||
"dataset_name": "conll2003", | ||
"provide_pos": true, | ||
"provide_chunk": false, | ||
"iobes": true | ||
}, | ||
"dataset_iterator": { | ||
"class_name": "data_learning_iterator" | ||
}, | ||
"chainer": { | ||
"in": ["x", "pos"], | ||
"in_y": ["y"], | ||
"pipe": [ | ||
{ | ||
"in": ["x"], | ||
"out": ["x_tokens"], | ||
"class_name": "lazy_tokenizer" | ||
}, | ||
{ | ||
"in": ["pos"], | ||
"out": ["pos_tokens"], | ||
"class_name": "lazy_tokenizer" | ||
}, | ||
{ | ||
"in": ["x_tokens"], | ||
"out": ["x_lower", "sent_lengths", "x_tokens_elmo"], | ||
"class_name": "ner_preprocessor", | ||
"get_x_padded_for_elmo": true | ||
}, | ||
{ | ||
"in": ["x_lower"], | ||
"out": ["x_tok_ind"], | ||
"fit_on": ["x_lower"], | ||
"class_name": "ner_vocab", | ||
"id": "word_vocab", | ||
"save_path": "{MODELS_PATH}/word.dict", | ||
"load_path": "{MODELS_PATH}/word.dict" | ||
}, | ||
{ | ||
"in": ["pos_tokens"], | ||
"out": ["pos_ind"], | ||
"fit_on": ["pos_tokens"], | ||
"class_name": "ner_vocab", | ||
"id": "pos_vocab", | ||
"save_path": "{MODELS_PATH}/pos.dict", | ||
"load_path": "{MODELS_PATH}/pos.dict" | ||
}, | ||
{ | ||
"in": ["y"], | ||
"out": ["y_ind"], | ||
"fit_on": ["y"], | ||
"class_name": "ner_vocab", | ||
"id": "tag_vocab", | ||
"save_path": "{MODELS_PATH}/tag.dict", | ||
"load_path": "{MODELS_PATH}/tag.dict" | ||
}, | ||
{ | ||
"in": ["x_tokens"], | ||
"out": ["x_char_ind"], | ||
"fit_on": ["x_tokens"], | ||
"class_name": "ner_vocab", | ||
"char_level": true, | ||
"id": "char_vocab", | ||
"save_path": "{MODELS_PATH}/char.dict", | ||
"load_path": "{MODELS_PATH}/char.dict" | ||
}, | ||
{ | ||
"in":[ | ||
"sent_lengths", | ||
"x_tok_ind", | ||
"pos_ind", | ||
"x_char_ind", | ||
"x_tokens_elmo" | ||
], | ||
"in_y": ["y_ind"], | ||
"out": ["y_predicted"], | ||
"class_name": "hybrid_ner_model", | ||
"n_tags": "#tag_vocab.len", | ||
"word_emb_path": "{DOWNLOADS_PATH}/embeddings/glove.6B.100d.txt", | ||
"word_emb_name": "glove", | ||
"word_dim": 100, | ||
"word_vocab": "#word_vocab", | ||
"char_vocab_size": "#char_vocab.len", | ||
"pos_vocab_size": "#pos_vocab.len", | ||
"pos_dim": 40, | ||
"char_dim": 100, | ||
"elmo_dim": 128, | ||
"lstm_hidden_size": 256, | ||
"save_path": "{MODELS_PATH}/conll2003_m1", | ||
"load_path": "{MODELS_PATH}/conll2003_m1", | ||
"learning_rate": 1e-3, | ||
"learning_rate_drop_patience": 5, | ||
"learning_rate_drop_div": 10, | ||
"dropout_keep_prob": 0.7 | ||
}, | ||
{ | ||
"in": ["y_predicted"], | ||
"out": ["tags"], | ||
"class_name": "convert_ids2tags", | ||
"id2tag": "#tag_vocab.i2t" | ||
} | ||
], | ||
"out": ["x_tokens", "tags"] | ||
}, | ||
"train": { | ||
"epochs": 100, | ||
"batch_size": 64, | ||
"metrics": [ | ||
{ | ||
"name": "ner_f1", | ||
"inputs": ["y", "tags"] | ||
}, | ||
{ | ||
"name": "ner_token_f1", | ||
"inputs": ["y", "tags"] | ||
} | ||
], | ||
"validation_patience": 10, | ||
"val_every_n_epochs": 1, | ||
"log_every_n_epochs": 1, | ||
"show_examples": false, | ||
"class_name": "nn_trainer", | ||
"evaluation_targets": [ | ||
"valid", | ||
"test" | ||
] | ||
}, | ||
"metadata": { | ||
"variables": { | ||
"ROOT_PATH": "~/.deeppavlov", | ||
"DOWNLOADS_PATH": "{ROOT_PATH}/downloads", | ||
"MODELS_PATH": "{ROOT_PATH}/models/conll2003_m1" | ||
}, | ||
"requirements": [ | ||
"{DEEPPAVLOV_PATH}/requirements/gensim.txt", | ||
"{DEEPPAVLOV_PATH}/requirements/tf.txt" | ||
], | ||
"download": [ | ||
{ | ||
"url": "http://files.deeppavlov.ai/deeppavlov_data/ner_conll2003_m1.tar.gz", | ||
"subdir": "{MODELS_PATH}" | ||
}, | ||
{ | ||
"url": "http://files.deeppavlov.ai/embeddings/glove.6B.100d.txt", | ||
"subdir": "{DOWNLOADS_PATH}/embeddings" | ||
} | ||
] | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,174 @@ | ||
{ | ||
"dataset_reader": { | ||
"class_name": "conll2003_reader", | ||
"data_path": "{DOWNLOADS_PATH}/vlsp2016/", | ||
"dataset_name": "vlsp2016", | ||
"provide_pos": true, | ||
"provide_chunk": true, | ||
"iobes": true | ||
}, | ||
"dataset_iterator": { | ||
"class_name": "data_learning_iterator" | ||
}, | ||
"chainer": { | ||
"in": ["x", "pos", "chunk"], | ||
"in_y": ["y"], | ||
"pipe": [ | ||
{ | ||
"in": ["x"], | ||
"out": ["x_tokens"], | ||
"class_name": "lazy_tokenizer" | ||
}, | ||
{ | ||
"in": ["pos"], | ||
"out": ["pos_tokens"], | ||
"class_name": "lazy_tokenizer" | ||
}, | ||
{ | ||
"in": ["chunk"], | ||
"out": ["chunk_tokens"], | ||
"class_name": "lazy_tokenizer" | ||
}, | ||
{ | ||
"in": ["x_tokens"], | ||
"out": ["x_lower", "sent_lengths"], | ||
"class_name": "ner_preprocessor", | ||
"id": "ner_preprocessor", | ||
"get_x_padded_for_elmo": false, | ||
"get_x_cap_padded": false | ||
}, | ||
{ | ||
"in": ["x_lower"], | ||
"out": ["x_tok_ind"], | ||
"fit_on": ["x_lower"], | ||
"class_name": "ner_vocab", | ||
"id": "word_vocab", | ||
"save_path": "{MODELS_PATH}/word.dict", | ||
"load_path": "{MODELS_PATH}/word.dict" | ||
}, | ||
{ | ||
"in": ["pos_tokens"], | ||
"out": ["pos_ind"], | ||
"fit_on": ["pos_tokens"], | ||
"class_name": "ner_vocab", | ||
"id": "pos_vocab", | ||
"save_path": "{MODELS_PATH}/pos.dict", | ||
"load_path": "{MODELS_PATH}/pos.dict" | ||
}, | ||
{ | ||
"in": ["chunk_tokens"], | ||
"out": ["chunk_ind"], | ||
"fit_on": ["chunk_tokens"], | ||
"class_name": "ner_vocab", | ||
"id": "chunk_vocab", | ||
"save_path": "{MODELS_PATH}/chunk.dict", | ||
"load_path": "{MODELS_PATH}/chunk.dict" | ||
}, | ||
{ | ||
"in": ["y"], | ||
"out": ["y_ind"], | ||
"fit_on": ["y"], | ||
"class_name": "ner_vocab", | ||
"id": "tag_vocab", | ||
"save_path": "{MODELS_PATH}/tag.dict", | ||
"load_path": "{MODELS_PATH}/tag.dict" | ||
}, | ||
{ | ||
"in": ["x_tokens"], | ||
"out": ["x_char"], | ||
"class_name": "char_splitter" | ||
}, | ||
{ | ||
"in": ["x_tokens"], | ||
"out": ["x_char_ind"], | ||
"fit_on": ["x_tokens"], | ||
"class_name": "ner_vocab", | ||
"char_level": true, | ||
"id": "char_vocab", | ||
"save_path": "{MODELS_PATH}/char.dict", | ||
"load_path": "{MODELS_PATH}/char.dict" | ||
}, | ||
{ | ||
"in":[ | ||
"sent_lengths", | ||
"x_tok_ind", | ||
"pos_ind", | ||
"chunk_ind", | ||
"x_char_ind" | ||
], | ||
"in_y": ["y_ind"], | ||
"out": ["y_predicted"], | ||
"class_name": "hybrid_ner_model", | ||
"n_tags": "#tag_vocab.len", | ||
"word_emb_path": "{DOWNLOADS_PATH}/embeddings/baomoi.bin", | ||
"word_emb_name": "baomoi", | ||
"word_dim": 300, | ||
"word_vocab": "#word_vocab", | ||
"char_vocab_size": "#char_vocab.len", | ||
"pos_vocab_size": "#pos_vocab.len", | ||
"chunk_vocab_size": "#chunk_vocab.len", | ||
"pos_dim": 40, | ||
"chunk_dim": 40, | ||
"char_dim": 100, | ||
"lstm_hidden_size": 256, | ||
"save_path": "{MODELS_PATH}/vlsp2016_full", | ||
"load_path": "{MODELS_PATH}/vlsp2016_full", | ||
"learning_rate": 1e-3, | ||
"learning_rate_drop_patience": 5, | ||
"learning_rate_drop_div": 10, | ||
"dropout_keep_prob": 0.7 | ||
}, | ||
{ | ||
"in": ["y_predicted"], | ||
"out": ["tags"], | ||
"class_name": "convert_ids2tags", | ||
"id2tag": "#tag_vocab.i2t" | ||
} | ||
], | ||
"out": ["x_tokens", "tags"] | ||
}, | ||
"train": { | ||
"epochs": 100, | ||
"batch_size": 64, | ||
"metrics": [ | ||
{ | ||
"name": "ner_f1", | ||
"inputs": ["y", "tags"] | ||
}, | ||
{ | ||
"name": "ner_token_f1", | ||
"inputs": ["y", "tags"] | ||
} | ||
], | ||
"validation_patience": 10, | ||
"val_every_n_epochs": 1, | ||
"log_every_n_epochs": 1, | ||
"show_examples": false, | ||
"class_name": "nn_trainer", | ||
"evaluation_targets": [ | ||
"valid", | ||
"test" | ||
] | ||
}, | ||
"metadata": { | ||
"variables": { | ||
"ROOT_PATH": "~/.deeppavlov", | ||
"DOWNLOADS_PATH": "{ROOT_PATH}/downloads", | ||
"MODELS_PATH": "{ROOT_PATH}/models/vlsp2016_full" | ||
}, | ||
"requirements": [ | ||
"{DEEPPAVLOV_PATH}/requirements/gensim.txt", | ||
"{DEEPPAVLOV_PATH}/requirements/tf.txt" | ||
], | ||
"download": [ | ||
{ | ||
"url": "http://files.deeppavlov.ai/deeppavlov_data/ner_vlsp2016_full.tar.gz", | ||
"subdir": "{MODELS_PATH}" | ||
}, | ||
{ | ||
"url": "http://files.deeppavlov.ai/embeddings/baomoi.bin", | ||
"subdir": "{DOWNLOADS_PATH}/embeddings" | ||
} | ||
] | ||
} | ||
} |
Oops, something went wrong.