-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathevaluator.py
139 lines (112 loc) · 4.22 KB
/
evaluator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
from arsel import Arsel
import pandas as pd
from data_utils import *
import csv
from buckwalter import Buckwalter
class Evaluator(object):
    """Evaluate an Arsel lexicon against a set of Buckwalter sentences.

    The evaluator holds the lexicon, the parsed sentences (wrapped in a
    ``Buckwalter`` object), the reference emotion scores keyed by sentence
    id, and a dict of dataset statistics.
    """

    def __init__(self, arsl):
        """Initialize an evaluator with empty data containers.

        Parameters
        ----------
        arsl : Arsel instance
            Lexicon to evaluate; stored as-is on ``self.arsel``.
        """
        self.arsel = arsl
        self.dict_buckwalter = {}
        # sentence id (int) -> list of float scores, filled by load()
        self.dict_scores_true = {}
        self.word_list = []  # used to count the number of unique words
        self.unsupported_madamira_list = []
        # Buckwalter wrapper built by load(); kept private so that it does
        # not shadow the public ``buck_obj`` accessor.
        self._buck_obj = None
        self.dataset_stat = {'verbs_count': 0,
                             'nouns_count': 0,
                             'advs_count': 0,
                             'adjs_count': 0,
                             'words_count': 0,
                             'word_uniq_count': 0,
                             'unsupported_count': 0,
                             'unsupported_pos_count': 0,
                             'pos_uniq': False}

    def load(self, file_sentences, file_scores, pos_uniq=False):
        """Load the Buckwalter sentences and their true scores.

        Parameters
        ----------
        file_sentences : str
            File with one sentence per line in the form
            ``id # sentence in buckwalter form``.
        file_scores : str
            CSV file with a header row followed by rows of the form
            ``id,anger,disgust,fear,joy,sadness,surprise``.
        pos_uniq : bool, optional
            Forwarded unchanged to ``Buckwalter``.

        Raises
        ------
        AssertionError
            If a sentence line does not contain exactly one ``#``.
        """
        self.dataset_stat['pos_uniq'] = pos_uniq
        list_buckwalter = self._parse_sentences(open_file(file_sentences))
        self._buck_obj = Buckwalter(list_buckwalter, pos_uniq)
        # NOTE(review): this replaces the whole stats dict, so the
        # 'pos_uniq' entry written above survives only if sent_stat()
        # reports it itself -- confirm against Buckwalter.sent_stat().
        self.dataset_stat = self._buck_obj.sent_stat()
        self._load_scores(file_scores)

    def _parse_sentences(self, lines):
        """Validate ``id#sentence`` lines and return the sentence parts."""
        sentences = []
        for line in lines:
            self._check_buck(line)
            # Exactly one '#' is guaranteed here; keep the text after it
            # and drop surrounding whitespace (including the newline).
            sentences.append(line.split('#')[1].strip())
        return sentences

    def _load_scores(self, file_scores):
        """Read the score CSV into ``self.dict_scores_true``."""
        with open(file_scores) as csvfile:
            reader = csv.reader(csvfile, delimiter=',')
            next(reader, None)  # skip the header row
            for row in reader:
                if not row:
                    # csv.reader yields [] for blank lines; skip them
                    # instead of failing on row[0].
                    continue
                self.dict_scores_true[int(row[0])] = [float(x) for x in row[1:]]

    @property
    def buck_obj(self):
        """Buckwalter object built by the last ``load()`` call.

        Exposed as a read-only property: previously ``load()`` stored the
        object under the same name as this accessor, which replaced the
        accessor on the instance.

        Returns
        -------
        Buckwalter obj or None
            ``None`` until ``load()`` has been called.
        """
        return self._buck_obj

    def dataset_info(self):
        """Return the statistics dict of the current dataset.

        Returns
        -------
        dict
            Before ``load()`` this is the zero-initialized dict created in
            ``__init__``; afterwards it is whatever
            ``Buckwalter.sent_stat()`` returned. Expected keys:
                uniq/verbs_count
                uniq/nouns_count
                uniq/advs_count
                uniq/adjs_count
                words_count
                word_uniq_count
                unsupported_count
                unsupported_uniq_count
                unsupported_pos_count
        """
        # The unique word count covers everything: supported and
        # unsupported words, across all POS tags.
        return self.dataset_stat

    @staticmethod
    def _dataframe_data(dict_data, columns):
        """Build a DataFrame from a dict, one row per dict key.

        Parameters
        ----------
        dict_data : dict
            Mapping of row label to a sequence of row values.
        columns : list
            Column labels for the resulting frame.

        Returns
        -------
        DataFrame
        """
        return pd.DataFrame.from_dict(dict_data, orient='index', columns=columns)

    def _check_buck(self, sentence):
        """Check that a line follows the ``id#sentence`` format.

        Parameters
        ----------
        sentence : str
            One raw line from the sentences file.

        Raises
        ------
        AssertionError
            If the line does not contain exactly one ``#``. Raised
            explicitly so the check is not stripped under ``python -O``.
        """
        if sentence.count('#') != 1:
            raise AssertionError(
                sentence + ": sentence must contain exactly one '#' separator")