-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathtest_ner.py
62 lines (47 loc) · 2.25 KB
/
test_ner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
'''
This script runs the trained Named Entity Recognition (NER) model against the unannotated probate data, so that entities can be extracted from the remaining entries that were not annotated.
The inputs to the script are the directory containing the model produced by train_ner.py, a text file containing all the entries to be processed (one entry per line), and a destination directory for the output files.
To run the script:
python3 test_ner.py <path_to_model_directory> <path_to_test_data> <destination_path_for_files>
Dependencies:
1. SpaCy :
pip install -U spacy
Then, download and install a language model so that the semantics for the English language are used while training.
python3 -m spacy download en
'''
import spacy
import sys
import pandas as pd
import os
# ---------------------------------------------------------------------------
# Script body: load a trained spaCy NER model, run it over every line of the
# input text file, and write one table (text + one column per entity label)
# as .tsv, .csv and .xlsx into the output directory.
# ---------------------------------------------------------------------------

# Per-entry results: entities[i] is a list of (entity_text, entity_label)
# tuples for the i-th input line.
entities = []
# Not referenced anywhere else in this script; retained so the module-level
# name still exists.
ocr_data = ""

# Positional command-line arguments (see the usage line in the docstring).
model_name = sys.argv[1]
test_data = sys.argv[2]
output_directory = sys.argv[3]

# Create the destination directory if it is missing. makedirs also creates
# missing parent directories, which a plain mkdir would fail on.
os.makedirs(output_directory, exist_ok=True)

# Normalise both paths to end with "/" so file names can be appended directly.
# Compare by value (endswith), not identity: `is not '/'` only worked by
# accident of string interning and raises SyntaxWarning on Python 3.8+.
if not model_name.endswith('/'):
    model_name = model_name + "/"
if not output_directory.endswith('/'):
    output_directory = output_directory + "/"

# One entry per input line; the trailing newline is replaced by a space.
with open(test_data) as myfile:
    directory_data = [x.replace('\n', ' ') for x in myfile]

print("Loading from", model_name)
nlp = spacy.load(model_name)  # loading the trained model

for entry in directory_data:
    doc = nlp(entry)  # finding the ner entities
    entities.append([(ent.text, ent.label_) for ent in doc.ents])

# Build the table column by column in plain Python first. Each label column
# (lower-cased label name) holds one list per input row; a column is created,
# with an empty list for every row, the first time its label is seen, so
# columns appear in first-seen order after 'text'.
columns = {'text': directory_data}
for row_index, labels in enumerate(entities):
    for entity_text, entity_label in labels:
        column_name = entity_label.lower()
        if column_name not in columns:
            columns[column_name] = [[] for _ in directory_data]
        columns[column_name][row_index].append(entity_text)  # adding entities

df = pd.DataFrame(columns)  # creating a Pandas Dataframe

# saving the files in different formats
df.to_csv(output_directory + 'trained_ner.tsv', sep='\t', index=False)
df.to_csv(output_directory + 'trained_ner.csv', index=False)
df.to_excel(output_directory + 'trained_ner.xlsx', index=False)
# Report success only after every file has actually been written.
print("Saved .tsv, .csv and .xlsx files to ---> " + output_directory)