
Commit

feat: add a new ner model trained on English and Vietnamese datasets (deeppavlov#1155)

* New NER model trained on English and Vietnamese datasets (CoNLL2003 and VLSP2016)

* Add sentence segmentation model trained on DailyDialog

* fix the download path to the pretrained word embedding file in the vlsp2016_full config

* fixed typos and added missing files

* added tests for ner and sentseg models

* Fix bugs

* fix a bug in downloading datasets and trained models

* style: optimize imports in test_quick_start.py

* chore: rebuild registry

* feat: remove ner_prf1 in favor of ner_f1 expanded for IOB, BILOU and IOBES

* fix: correct arguments for metrics in sentseg_dailydialog.json

* docs: add a description of word_emb_name

* fix typos

* Update deeppavlov/models/ner/NER_model.py

Co-Authored-By: yurakuratov <[email protected]>

* fix docstring

Co-authored-by: Aleksei Lymar <[email protected]>
Co-authored-by: yurakuratov <[email protected]>
3 people authored Apr 16, 2020
1 parent db31b59 commit a797a4a
Showing 11 changed files with 1,144 additions and 27 deletions.
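
For reference, configs like the two below are normally exercised through DeepPavlov's Python API. The snippet that follows is a minimal sketch, assuming the standard build_model entry point and the config path added by this commit; the sentence and POS tags are only illustrative input:

from deeppavlov import build_model

# Config added in this commit; download=True fetches the pretrained
# weights and GloVe embeddings listed under metadata/download.
ner = build_model('deeppavlov/configs/ner/conll2003_m1.json', download=True)

# The chainer's "in" is ["x", "pos"], so inference needs raw text plus
# POS tags; both pass through lazy_tokenizer, so space-separated strings work.
tokens, tags = ner(['John lives in New York'], ['NNP VBZ IN NNP NNP'])

# The chainer's "out" is ["x_tokens", "tags"]: token lists with IOBES tags.
print(list(zip(tokens[0], tags[0])))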
152 changes: 152 additions & 0 deletions deeppavlov/configs/ner/conll2003_m1.json
@@ -0,0 +1,152 @@
{
"dataset_reader": {
"class_name": "conll2003_reader",
"data_path": "{DOWNLOADS_PATH}/conll2003/",
"dataset_name": "conll2003",
"provide_pos": true,
"provide_chunk": false,
"iobes": true
},
"dataset_iterator": {
"class_name": "data_learning_iterator"
},
"chainer": {
"in": ["x", "pos"],
"in_y": ["y"],
"pipe": [
{
"in": ["x"],
"out": ["x_tokens"],
"class_name": "lazy_tokenizer"
},
{
"in": ["pos"],
"out": ["pos_tokens"],
"class_name": "lazy_tokenizer"
},
{
"in": ["x_tokens"],
"out": ["x_lower", "sent_lengths", "x_tokens_elmo"],
"class_name": "ner_preprocessor",
"get_x_padded_for_elmo": true
},
{
"in": ["x_lower"],
"out": ["x_tok_ind"],
"fit_on": ["x_lower"],
"class_name": "ner_vocab",
"id": "word_vocab",
"save_path": "{MODELS_PATH}/word.dict",
"load_path": "{MODELS_PATH}/word.dict"
},
{
"in": ["pos_tokens"],
"out": ["pos_ind"],
"fit_on": ["pos_tokens"],
"class_name": "ner_vocab",
"id": "pos_vocab",
"save_path": "{MODELS_PATH}/pos.dict",
"load_path": "{MODELS_PATH}/pos.dict"
},
{
"in": ["y"],
"out": ["y_ind"],
"fit_on": ["y"],
"class_name": "ner_vocab",
"id": "tag_vocab",
"save_path": "{MODELS_PATH}/tag.dict",
"load_path": "{MODELS_PATH}/tag.dict"
},
{
"in": ["x_tokens"],
"out": ["x_char_ind"],
"fit_on": ["x_tokens"],
"class_name": "ner_vocab",
"char_level": true,
"id": "char_vocab",
"save_path": "{MODELS_PATH}/char.dict",
"load_path": "{MODELS_PATH}/char.dict"
},
{
"in":[
"sent_lengths",
"x_tok_ind",
"pos_ind",
"x_char_ind",
"x_tokens_elmo"
],
"in_y": ["y_ind"],
"out": ["y_predicted"],
"class_name": "hybrid_ner_model",
"n_tags": "#tag_vocab.len",
"word_emb_path": "{DOWNLOADS_PATH}/embeddings/glove.6B.100d.txt",
"word_emb_name": "glove",
"word_dim": 100,
"word_vocab": "#word_vocab",
"char_vocab_size": "#char_vocab.len",
"pos_vocab_size": "#pos_vocab.len",
"pos_dim": 40,
"char_dim": 100,
"elmo_dim": 128,
"lstm_hidden_size": 256,
"save_path": "{MODELS_PATH}/conll2003_m1",
"load_path": "{MODELS_PATH}/conll2003_m1",
"learning_rate": 1e-3,
"learning_rate_drop_patience": 5,
"learning_rate_drop_div": 10,
"dropout_keep_prob": 0.7
},
{
"in": ["y_predicted"],
"out": ["tags"],
"class_name": "convert_ids2tags",
"id2tag": "#tag_vocab.i2t"
}
],
"out": ["x_tokens", "tags"]
},
"train": {
"epochs": 100,
"batch_size": 64,
"metrics": [
{
"name": "ner_f1",
"inputs": ["y", "tags"]
},
{
"name": "ner_token_f1",
"inputs": ["y", "tags"]
}
],
"validation_patience": 10,
"val_every_n_epochs": 1,
"log_every_n_epochs": 1,
"show_examples": false,
"class_name": "nn_trainer",
"evaluation_targets": [
"valid",
"test"
]
},
"metadata": {
"variables": {
"ROOT_PATH": "~/.deeppavlov",
"DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
"MODELS_PATH": "{ROOT_PATH}/models/conll2003_m1"
},
"requirements": [
"{DEEPPAVLOV_PATH}/requirements/gensim.txt",
"{DEEPPAVLOV_PATH}/requirements/tf.txt"
],
"download": [
{
"url": "http://files.deeppavlov.ai/deeppavlov_data/ner_conll2003_m1.tar.gz",
"subdir": "{MODELS_PATH}"
},
{
"url": "http://files.deeppavlov.ai/embeddings/glove.6B.100d.txt",
"subdir": "{DOWNLOADS_PATH}/embeddings"
}
]
}
}
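
The train block above delegates to nn_trainer with ner_f1 and ner_token_f1 as metrics. A hedged sketch of driving that trainer from Python, assuming DeepPavlov's exported train_model and evaluate_model helpers, might look like:

from deeppavlov import train_model, evaluate_model

CONFIG = 'deeppavlov/configs/ner/conll2003_m1.json'

# Runs the nn_trainer from the "train" section: up to 100 epochs,
# batch size 64, validation every epoch, and early stopping after
# 10 validations without improvement (validation_patience).
model = train_model(CONFIG)

# Scores ner_f1 and ner_token_f1 on the "valid" and "test"
# evaluation_targets and returns the resulting metric values.
metrics = evaluate_model(CONFIG)
print(metrics)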
174 changes: 174 additions & 0 deletions deeppavlov/configs/ner/vlsp2016_full.json
@@ -0,0 +1,174 @@
{
"dataset_reader": {
"class_name": "conll2003_reader",
"data_path": "{DOWNLOADS_PATH}/vlsp2016/",
"dataset_name": "vlsp2016",
"provide_pos": true,
"provide_chunk": true,
"iobes": true
},
"dataset_iterator": {
"class_name": "data_learning_iterator"
},
"chainer": {
"in": ["x", "pos", "chunk"],
"in_y": ["y"],
"pipe": [
{
"in": ["x"],
"out": ["x_tokens"],
"class_name": "lazy_tokenizer"
},
{
"in": ["pos"],
"out": ["pos_tokens"],
"class_name": "lazy_tokenizer"
},
{
"in": ["chunk"],
"out": ["chunk_tokens"],
"class_name": "lazy_tokenizer"
},
{
"in": ["x_tokens"],
"out": ["x_lower", "sent_lengths"],
"class_name": "ner_preprocessor",
"id": "ner_preprocessor",
"get_x_padded_for_elmo": false,
"get_x_cap_padded": false
},
{
"in": ["x_lower"],
"out": ["x_tok_ind"],
"fit_on": ["x_lower"],
"class_name": "ner_vocab",
"id": "word_vocab",
"save_path": "{MODELS_PATH}/word.dict",
"load_path": "{MODELS_PATH}/word.dict"
},
{
"in": ["pos_tokens"],
"out": ["pos_ind"],
"fit_on": ["pos_tokens"],
"class_name": "ner_vocab",
"id": "pos_vocab",
"save_path": "{MODELS_PATH}/pos.dict",
"load_path": "{MODELS_PATH}/pos.dict"
},
{
"in": ["chunk_tokens"],
"out": ["chunk_ind"],
"fit_on": ["chunk_tokens"],
"class_name": "ner_vocab",
"id": "chunk_vocab",
"save_path": "{MODELS_PATH}/chunk.dict",
"load_path": "{MODELS_PATH}/chunk.dict"
},
{
"in": ["y"],
"out": ["y_ind"],
"fit_on": ["y"],
"class_name": "ner_vocab",
"id": "tag_vocab",
"save_path": "{MODELS_PATH}/tag.dict",
"load_path": "{MODELS_PATH}/tag.dict"
},
{
"in": ["x_tokens"],
"out": ["x_char"],
"class_name": "char_splitter"
},
{
"in": ["x_tokens"],
"out": ["x_char_ind"],
"fit_on": ["x_tokens"],
"class_name": "ner_vocab",
"char_level": true,
"id": "char_vocab",
"save_path": "{MODELS_PATH}/char.dict",
"load_path": "{MODELS_PATH}/char.dict"
},
{
"in":[
"sent_lengths",
"x_tok_ind",
"pos_ind",
"chunk_ind",
"x_char_ind"
],
"in_y": ["y_ind"],
"out": ["y_predicted"],
"class_name": "hybrid_ner_model",
"n_tags": "#tag_vocab.len",
"word_emb_path": "{DOWNLOADS_PATH}/embeddings/baomoi.bin",
"word_emb_name": "baomoi",
"word_dim": 300,
"word_vocab": "#word_vocab",
"char_vocab_size": "#char_vocab.len",
"pos_vocab_size": "#pos_vocab.len",
"chunk_vocab_size": "#chunk_vocab.len",
"pos_dim": 40,
"chunk_dim": 40,
"char_dim": 100,
"lstm_hidden_size": 256,
"save_path": "{MODELS_PATH}/vlsp2016_full",
"load_path": "{MODELS_PATH}/vlsp2016_full",
"learning_rate": 1e-3,
"learning_rate_drop_patience": 5,
"learning_rate_drop_div": 10,
"dropout_keep_prob": 0.7
},
{
"in": ["y_predicted"],
"out": ["tags"],
"class_name": "convert_ids2tags",
"id2tag": "#tag_vocab.i2t"
}
],
"out": ["x_tokens", "tags"]
},
"train": {
"epochs": 100,
"batch_size": 64,
"metrics": [
{
"name": "ner_f1",
"inputs": ["y", "tags"]
},
{
"name": "ner_token_f1",
"inputs": ["y", "tags"]
}
],
"validation_patience": 10,
"val_every_n_epochs": 1,
"log_every_n_epochs": 1,
"show_examples": false,
"class_name": "nn_trainer",
"evaluation_targets": [
"valid",
"test"
]
},
"metadata": {
"variables": {
"ROOT_PATH": "~/.deeppavlov",
"DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
"MODELS_PATH": "{ROOT_PATH}/models/vlsp2016_full"
},
"requirements": [
"{DEEPPAVLOV_PATH}/requirements/gensim.txt",
"{DEEPPAVLOV_PATH}/requirements/tf.txt"
],
"download": [
{
"url": "http://files.deeppavlov.ai/deeppavlov_data/ner_vlsp2016_full.tar.gz",
"subdir": "{MODELS_PATH}"
},
{
"url": "http://files.deeppavlov.ai/embeddings/baomoi.bin",
"subdir": "{DOWNLOADS_PATH}/embeddings"
}
]
}
}
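
Both configs set "iobes": true, and the commit replaces ner_prf1 with ner_f1 expanded to IOB, BILOU and IOBES. The helper below is a hypothetical illustration of what span-level F1 over IOBES tags involves, not the library's actual implementation:

from typing import List, Tuple

def iobes_to_spans(tags: List[str]) -> List[Tuple[str, int, int]]:
    """Collect (type, start, end) entity spans from an IOBES tag sequence."""
    spans, start = [], None
    for i, tag in enumerate(tags):
        prefix, _, etype = tag.partition('-')
        if prefix == 'S':                          # single-token entity
            spans.append((etype, i, i))
        elif prefix == 'B':                        # multi-token entity begins
            start = i
        elif prefix == 'E' and start is not None:  # multi-token entity ends
            spans.append((etype, start, i))
            start = None
    return spans

# Span-level F1 counts a prediction as correct only on an exact span match:
gold = iobes_to_spans(['B-PER', 'E-PER', 'O', 'S-LOC'])
pred = iobes_to_spans(['B-PER', 'E-PER', 'O', 'O'])
tp = len(set(gold) & set(pred))
precision, recall = tp / len(pred), tp / len(gold)
print(2 * precision * recall / (precision + recall))  # ~0.67 here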
