-
Notifications
You must be signed in to change notification settings - Fork 3
/
arff_helper.py
152 lines (127 loc) · 6.52 KB
/
arff_helper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
from collections import OrderedDict
import arff
import warnings
import numpy as np
import numpy.lib.recfunctions as rfn
class ArffHelper(object):
    """
    Helper routines for ARFF objects whose data is kept in a structured numpy array.

    An "arff object" here is a dict-like object with (at least) the keys used by the methods below:
        obj['attributes']: a list of (name, type) pairs, one per data column, in column order.
        obj['data']: the data instances; after convert_data_to_structured_array() this is a
                     structured numpy array with one field per attribute.

    The class is based on a general arff handler with an extra keyword %@METADATA
    (these are comment lines in the description, i.e. lines *before* the
    `relation` keyword).
    Metadata fields contain metadata names and related values (separated by space characters).

    - Lines starting with "%" are comments, except for lines starting with %@METADATA
    - Lines starting with "@" that is followed by a word (without space), are considered
      keywords. The available keywords are the following:
        @RELATION: a string with the name of the data set.
        @ATTRIBUTES: a list of attributes representing names of data columns
                     followed by the types of data. The available data types
                     are 'NUMERIC', 'REAL', 'INTEGER' or a list of strings.
        @DESCRIPTION: a string with the description of the data set.
        @DATA: a list of data instances. The data should follow the order that
               the attributes were presented.
    - Metadata ('%@METADATA <KEY> <VALUE>' lines) can have any keys, but is not currently used
    """
    _METADATA_STRING = '@metadata'
    _METADATA_COLUMNS_COUNT = 3  # @METADATA KEY VALUE
    _METADATA_KEY_COLUMN = 1  # First key,
    _METADATA_VALUE_COLUMN = 2  # then value

    # Mapping from the scalar ARFF type names to the numpy types used for the respective columns.
    _ATTRIBUTES_TYPE = {'NUMERIC': np.float32, 'REAL': np.double, 'INTEGER': np.int64}

    def __init__(self):
        pass

    # Public interface functions
    #
    # Column manipulation and data conversion (all static, no instance state is used)
    #

    @staticmethod
    def add_column(obj, name, dtype, default_value):
        """
        Add a new column to @obj['data'] and a new attribute to @obj['attributes']
        (i.e. the name of the new column and the data type for this column).
        This operation is performed in-place, so the @obj itself is changed.

        :param obj: arff object before adding new column.
        :param name: name of the new column.
        :param dtype: data type of the new column.
                      Available data types:
                      'NUMERIC', 'REAL', 'INTEGER' or a list/tuple of strings (then it's a categorical
                      column with the provided values as options).
        :param default_value: default value of the new column (every row gets this value).
        :return: arff object with an additional column (the same object as @obj).
        :raises ValueError: if a column @name already exists in the data, or @dtype is not supported.
        """
        # The data is extended first: if that fails, @obj['attributes'] is left untouched.
        obj['data'] = ArffHelper.add_column_to_array(obj['data'], name, dtype, default_value)
        obj['attributes'].append((name, dtype))
        return obj

    @staticmethod
    def add_column_to_array(arr, name, dtype, def_value):
        """
        Add a new column to a structured numpy array.

        :param arr: structured numpy array before adding column.
        :param name: name of the new column.
        :param dtype: data type of the new column.
                      Available data types:
                      'NUMERIC', 'REAL', 'INTEGER' or a list/tuple of strings (then it's a categorical
                      column with the provided values as options).
        :param def_value: default value of the new column.
        :return: numpy array with new column (a new array, @arr is not modified).
        :raises ValueError: if @arr already has a field @name, or @dtype is not supported.
        """
        # A string default only makes sense for a categorical column that lists it as an option.
        # (Testing `def_value in dtype` with a string @dtype would be a substring test, hence the
        # explicit list/tuple check.)
        is_categorical = isinstance(dtype, (list, tuple))
        if isinstance(def_value, str) and (not is_categorical or def_value not in dtype):
            warnings.warn("The type of the default value is not the same as type of column data"
                          " or the default value is not in the list (data type provided is {})".format(dtype))

        if name in arr.dtype.names:
            raise ValueError('Array @arr already has a field {}'.format(name))

        numpy_dtype = ArffHelper._convert_dtype_to_numpy(dtype)

        if arr.size != 0:
            return rfn.append_fields(base=arr,
                                     names=name,
                                     data=[def_value] * len(arr),
                                     dtypes=numpy_dtype,
                                     usemask=False)

        if arr.dtype.names:
            # No rows, but there are columns already: keep them, so that the array stays consistent
            # with the attribute list, and just widen the dtype with the new field.
            widened = [(field, arr.dtype[field]) for field in arr.dtype.names]
            widened.append((name, numpy_dtype))
            return np.zeros(arr.shape, dtype=widened)

        # If @arr is empty, it should have been created with ArffHelper.create_empty() method, or in a similar
        # fashion. In that case, it has a length (passed as a parameter at creation), but no elements.
        return np.array([def_value] * len(arr), dtype=[(name, numpy_dtype)])

    @staticmethod
    def remove_column(obj, name):
        """
        Remove a column with respective name from @obj['data'] and its attributes (@obj['attributes']).
        This operation is performed in-place, so the @obj itself is changed.

        :param obj: arff object before removing the column.
        :param name: name of the deleted column.
        :return: arff object without the column @name (the same object as @obj).
        :raises ValueError: if there is no attribute called @name.
        """
        attribute_names = [column_name for column_name, _ in obj['attributes']]
        # list.index() raises ValueError for an unknown name, before anything is modified
        deleted_column_index = attribute_names.index(name)
        obj['attributes'].pop(deleted_column_index)
        # keep just the remaining attributes
        obj['data'] = rfn.drop_fields(base=obj['data'],
                                      drop_names=name,
                                      usemask=False)
        return obj

    @staticmethod
    def convert_data_to_structured_array(obj):
        """
        Convert data in @obj['data'] into a structured numpy array according to the data type in
        @obj['attributes'].
        This operation is performed in-place, so the @obj itself is changed.

        :param obj: arff object before data conversion (@obj['data'] is a sequence of rows).
        :return: arff object after data conversion (the same object as @obj).
        :raises ValueError: if an attribute has an unsupported data type.
        """
        fields = [(str(attribute_name), ArffHelper._convert_dtype_to_numpy(attribute_type))
                  for attribute_name, attribute_type in obj['attributes']]
        # numpy only treats tuples (not lists) as records of a structured array
        rows = [tuple(item) for item in obj['data']]
        obj['data'] = np.array(rows, dtype=np.dtype(fields))
        return obj

    @staticmethod
    def _convert_dtype_to_numpy(data_type):
        """
        Validate input @data_type as ARFF-supported data type and convert to numpy.dtype.

        :param data_type: input data_type.
                          Available data types:
                          'NUMERIC', 'REAL', 'INTEGER' or a list/tuple of strings (then it's a
                          categorical attribute).
        :return: the numpy type for a scalar ARFF type, or a unicode dtype string ('<U{N}') wide enough
                 for the longest option of a categorical attribute.
        :raises ValueError: if @data_type is neither a known type name nor a non-empty list/tuple.
        """
        if isinstance(data_type, (list, tuple)):
            if not data_type:
                raise ValueError("Wrong data type in attributes. "
                                 "A categorical attribute needs at least one value")
            max_length = max(map(len, data_type))
            return '<U{}'.format(max_length)

        try:
            is_known = data_type in ArffHelper._ATTRIBUTES_TYPE
        except TypeError:
            # unhashable objects (e.g. a dict or a set) cannot be type names
            is_known = False
        if is_known:
            return ArffHelper._ATTRIBUTES_TYPE[data_type]

        raise ValueError("Wrong data type in attributes. "
                         "It should be a list of strings or one of the data types in {}".format(
                             ', '.join(ArffHelper._ATTRIBUTES_TYPE.keys())))