
Commit

feat: add a new ner model trained on English and Vietnamese datasets (deeppavlov#1155)

* New NER model trained on English and Vietnamese datasets (CoNLL2003 and VLSP2016)

* Add sentence segmentation model trained on DailyDialog

* fix the download path to the pretrained word embedding file in the vlsp2016_full config

* fixed typos and added missing files

* added tests for ner and sentseg models

* Fix bugs

* fix a bug in downloading datasets and trained models

* style: optimize imports in test_quick_start.py

* chore: rebuild registry

* feat: remove ner_prf1 in favor of ner_f1 expanded for IOB, BILOU and IOBES

* fix: correct arguments for metrics in sentseg_dailydialog.json

* docs: add a description of word_emb_name

* fix typos

* Update deeppavlov/models/ner/NER_model.py

Co-Authored-By: yurakuratov <[email protected]>

* fix docstring

Co-authored-by: Aleksei Lymar <[email protected]>
Co-authored-by: yurakuratov <[email protected]>
3 people authored Apr 16, 2020
1 parent db31b59 commit a797a4a
Showing 11 changed files with 1,144 additions and 27 deletions.
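
For reference, configs like the two below are normally exercised through DeepPavlov's Python API. The snippet that follows is a minimal sketch, assuming the standard build_model entry point and the config path added by this commit; the sentence and POS tags are only illustrative input:

from deeppavlov import build_model

# Config added in this commit; download=True fetches the pretrained
# weights and GloVe embeddings listed under metadata/download.
ner = build_model('deeppavlov/configs/ner/conll2003_m1.json', download=True)

# The chainer's "in" is ["x", "pos"], so inference needs raw text plus
# POS tags; both pass through lazy_tokenizer, so space-separated strings work.
tokens, tags = ner(['John lives in New York'], ['NNP VBZ IN NNP NNP'])

# The chainer's "out" is ["x_tokens", "tags"]: token lists with IOBES tags.
print(list(zip(tokens[0], tags[0])))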
152 changes: 152 additions & 0 deletions deeppavlov/configs/ner/conll2003_m1.json
@@ -0,0 +1,152 @@
{
"dataset_reader": {
"class_name": "conll2003_reader",
"data_path": "{DOWNLOADS_PATH}/conll2003/",
"dataset_name": "conll2003",
"provide_pos": true,
"provide_chunk": false,
"iobes": true
},
"dataset_iterator": {
"class_name": "data_learning_iterator"
},
"chainer": {
"in": ["x", "pos"],
"in_y": ["y"],
"pipe": [
{
"in": ["x"],
"out": ["x_tokens"],
"class_name": "lazy_tokenizer"
},
{
"in": ["pos"],
"out": ["pos_tokens"],
"class_name": "lazy_tokenizer"
},
{
"in": ["x_tokens"],
"out": ["x_lower", "sent_lengths", "x_tokens_elmo"],
"class_name": "ner_preprocessor",
"get_x_padded_for_elmo": true
},
{
"in": ["x_lower"],
"out": ["x_tok_ind"],
"fit_on": ["x_lower"],
"class_name": "ner_vocab",
"id": "word_vocab",
"save_path": "{MODELS_PATH}/word.dict",
"load_path": "{MODELS_PATH}/word.dict"
},
{
"in": ["pos_tokens"],
"out": ["pos_ind"],
"fit_on": ["pos_tokens"],
"class_name": "ner_vocab",
"id": "pos_vocab",
"save_path": "{MODELS_PATH}/pos.dict",
"load_path": "{MODELS_PATH}/pos.dict"
},
{
"in": ["y"],
"out": ["y_ind"],
"fit_on": ["y"],
"class_name": "ner_vocab",
"id": "tag_vocab",
"save_path": "{MODELS_PATH}/tag.dict",
"load_path": "{MODELS_PATH}/tag.dict"
},
{
"in": ["x_tokens"],
"out": ["x_char_ind"],
"fit_on": ["x_tokens"],
"class_name": "ner_vocab",
"char_level": true,
"id": "char_vocab",
"save_path": "{MODELS_PATH}/char.dict",
"load_path": "{MODELS_PATH}/char.dict"
},
{
"in":[
"sent_lengths",
"x_tok_ind",
"pos_ind",
"x_char_ind",
"x_tokens_elmo"
],
"in_y": ["y_ind"],
"out": ["y_predicted"],
"class_name": "hybrid_ner_model",
"n_tags": "#tag_vocab.len",
"word_emb_path": "{DOWNLOADS_PATH}/embeddings/glove.6B.100d.txt",
"word_emb_name": "glove",
"word_dim": 100,
"word_vocab": "#word_vocab",
"char_vocab_size": "#char_vocab.len",
"pos_vocab_size": "#pos_vocab.len",
"pos_dim": 40,
"char_dim": 100,
"elmo_dim": 128,
"lstm_hidden_size": 256,
"save_path": "{MODELS_PATH}/conll2003_m1",
"load_path": "{MODELS_PATH}/conll2003_m1",
"learning_rate": 1e-3,
"learning_rate_drop_patience": 5,
"learning_rate_drop_div": 10,
"dropout_keep_prob": 0.7
},
{
"in": ["y_predicted"],
"out": ["tags"],
"class_name": "convert_ids2tags",
"id2tag": "#tag_vocab.i2t"
}
],
"out": ["x_tokens", "tags"]
},
"train": {
"epochs": 100,
"batch_size": 64,
"metrics": [
{
"name": "ner_f1",
"inputs": ["y", "tags"]
},
{
"name": "ner_token_f1",
"inputs": ["y", "tags"]
}
],
"validation_patience": 10,
"val_every_n_epochs": 1,
"log_every_n_epochs": 1,
"show_examples": false,
"class_name": "nn_trainer",
"evaluation_targets": [
"valid",
"test"
]
},
"metadata": {
"variables": {
"ROOT_PATH": "~/.deeppavlov",
"DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
"MODELS_PATH": "{ROOT_PATH}/models/conll2003_m1"
},
"requirements": [
"{DEEPPAVLOV_PATH}/requirements/gensim.txt",
"{DEEPPAVLOV_PATH}/requirements/tf.txt"
],
"download": [
{
"url": "http://files.deeppavlov.ai/deeppavlov_data/ner_conll2003_m1.tar.gz",
"subdir": "{MODELS_PATH}"
},
{
"url": "http://files.deeppavlov.ai/embeddings/glove.6B.100d.txt",
"subdir": "{DOWNLOADS_PATH}/embeddings"
}
]
}
}
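
The train block above delegates to nn_trainer with ner_f1 and ner_token_f1 as metrics. A hedged sketch of driving that trainer from Python, assuming DeepPavlov's exported train_model and evaluate_model helpers, might look like:

from deeppavlov import train_model, evaluate_model

CONFIG = 'deeppavlov/configs/ner/conll2003_m1.json'

# Runs the nn_trainer from the "train" section: up to 100 epochs,
# batch size 64, validation every epoch, and early stopping after
# 10 validations without improvement (validation_patience).
model = train_model(CONFIG)

# Scores ner_f1 and ner_token_f1 on the "valid" and "test"
# evaluation_targets and returns the resulting metric values.
metrics = evaluate_model(CONFIG)
print(metrics)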
174 changes: 174 additions & 0 deletions deeppavlov/configs/ner/vlsp2016_full.json
@@ -0,0 +1,174 @@
{
"dataset_reader": {
"class_name": "conll2003_reader",
"data_path": "{DOWNLOADS_PATH}/vlsp2016/",
"dataset_name": "vlsp2016",
"provide_pos": true,
"provide_chunk": true,
"iobes": true
},
"dataset_iterator": {
"class_name": "data_learning_iterator"
},
"chainer": {
"in": ["x", "pos", "chunk"],
"in_y": ["y"],
"pipe": [
{
"in": ["x"],
"out": ["x_tokens"],
"class_name": "lazy_tokenizer"
},
{
"in": ["pos"],
"out": ["pos_tokens"],
"class_name": "lazy_tokenizer"
},
{
"in": ["chunk"],
"out": ["chunk_tokens"],
"class_name": "lazy_tokenizer"
},
{
"in": ["x_tokens"],
"out": ["x_lower", "sent_lengths"],
"class_name": "ner_preprocessor",
"id": "ner_preprocessor",
"get_x_padded_for_elmo": false,
"get_x_cap_padded": false
},
{
"in": ["x_lower"],
"out": ["x_tok_ind"],
"fit_on": ["x_lower"],
"class_name": "ner_vocab",
"id": "word_vocab",
"save_path": "{MODELS_PATH}/word.dict",
"load_path": "{MODELS_PATH}/word.dict"
},
{
"in": ["pos_tokens"],
"out": ["pos_ind"],
"fit_on": ["pos_tokens"],
"class_name": "ner_vocab",
"id": "pos_vocab",
"save_path": "{MODELS_PATH}/pos.dict",
"load_path": "{MODELS_PATH}/pos.dict"
},
{
"in": ["chunk_tokens"],
"out": ["chunk_ind"],
"fit_on": ["chunk_tokens"],
"class_name": "ner_vocab",
"id": "chunk_vocab",
"save_path": "{MODELS_PATH}/chunk.dict",
"load_path": "{MODELS_PATH}/chunk.dict"
},
{
"in": ["y"],
"out": ["y_ind"],
"fit_on": ["y"],
"class_name": "ner_vocab",
"id": "tag_vocab",
"save_path": "{MODELS_PATH}/tag.dict",
"load_path": "{MODELS_PATH}/tag.dict"
},
{
"in": ["x_tokens"],
"out": ["x_char"],
"class_name": "char_splitter"
},
{
"in": ["x_tokens"],
"out": ["x_char_ind"],
"fit_on": ["x_tokens"],
"class_name": "ner_vocab",
"char_level": true,
"id": "char_vocab",
"save_path": "{MODELS_PATH}/char.dict",
"load_path": "{MODELS_PATH}/char.dict"
},
{
"in":[
"sent_lengths",
"x_tok_ind",
"pos_ind",
"chunk_ind",
"x_char_ind"
],
"in_y": ["y_ind"],
"out": ["y_predicted"],
"class_name": "hybrid_ner_model",
"n_tags": "#tag_vocab.len",
"word_emb_path": "{DOWNLOADS_PATH}/embeddings/baomoi.bin",
"word_emb_name": "baomoi",
"word_dim": 300,
"word_vocab": "#word_vocab",
"char_vocab_size": "#char_vocab.len",
"pos_vocab_size": "#pos_vocab.len",
"chunk_vocab_size": "#chunk_vocab.len",
"pos_dim": 40,
"chunk_dim": 40,
"char_dim": 100,
"lstm_hidden_size": 256,
"save_path": "{MODELS_PATH}/vlsp2016_full",
"load_path": "{MODELS_PATH}/vlsp2016_full",
"learning_rate": 1e-3,
"learning_rate_drop_patience": 5,
"learning_rate_drop_div": 10,
"dropout_keep_prob": 0.7
},
{
"in": ["y_predicted"],
"out": ["tags"],
"class_name": "convert_ids2tags",
"id2tag": "#tag_vocab.i2t"
}
],
"out": ["x_tokens", "tags"]
},
"train": {
"epochs": 100,
"batch_size": 64,
"metrics": [
{
"name": "ner_f1",
"inputs": ["y", "tags"]
},
{
"name": "ner_token_f1",
"inputs": ["y", "tags"]
}
],
"validation_patience": 10,
"val_every_n_epochs": 1,
"log_every_n_epochs": 1,
"show_examples": false,
"class_name": "nn_trainer",
"evaluation_targets": [
"valid",
"test"
]
},
"metadata": {
"variables": {
"ROOT_PATH": "~/.deeppavlov",
"DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
"MODELS_PATH": "{ROOT_PATH}/models/vlsp2016_full"
},
"requirements": [
"{DEEPPAVLOV_PATH}/requirements/gensim.txt",
"{DEEPPAVLOV_PATH}/requirements/tf.txt"
],
"download": [
{
"url": "http://files.deeppavlov.ai/deeppavlov_data/ner_vlsp2016_full.tar.gz",
"subdir": "{MODELS_PATH}"
},
{
"url": "http://files.deeppavlov.ai/embeddings/baomoi.bin",
"subdir": "{DOWNLOADS_PATH}/embeddings"
}
]
}
}
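
Both configs set "iobes": true, and the commit replaces ner_prf1 with ner_f1 expanded to IOB, BILOU and IOBES. The helper below is a hypothetical illustration of what span-level F1 over IOBES tags involves, not the library's actual implementation:

from typing import List, Tuple

def iobes_to_spans(tags: List[str]) -> List[Tuple[str, int, int]]:
    """Collect (type, start, end) entity spans from an IOBES tag sequence."""
    spans, start = [], None
    for i, tag in enumerate(tags):
        prefix, _, etype = tag.partition('-')
        if prefix == 'S':                          # single-token entity
            spans.append((etype, i, i))
        elif prefix == 'B':                        # multi-token entity begins
            start = i
        elif prefix == 'E' and start is not None:  # multi-token entity ends
            spans.append((etype, start, i))
            start = None
    return spans

# Span-level F1 counts a prediction as correct only on an exact span match:
gold = iobes_to_spans(['B-PER', 'E-PER', 'O', 'S-LOC'])
pred = iobes_to_spans(['B-PER', 'E-PER', 'O', 'O'])
tp = len(set(gold) & set(pred))
precision, recall = tp / len(pred), tp / len(gold)
print(2 * precision * recall / (precision + recall))  # ~0.67 here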
