-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtables.py
412 lines (324 loc) · 15.6 KB
/
tables.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
"""A module to read and write text data from/to files.
The main functions are `read` and `write`, which take pathnames
for the file to read or write, not file-like objects, and which return and
accept (respectively) a list of dicts with the same keys and whose keys are
the column headings in the file.
The expected file format is that the first line is column names and that each
subsequent line is the entries of a record. Column names and record entries
are delimited by the same delimiter (and can be made to contain the delimiter
by using a text qualifier)."""
def _anneal_by_qualifier(elements, qualifier, delim):
"""Account for delimited text by squishing adjacent elements together.
:param elements: a list of strings
:param qualifier: string, often '"'
:param delim: string, often '\t' or ','
:returns: `elements`, mutated
"""
if qualifier:
while any(delimited_piece.startswith(qualifier)
for delimited_piece in elements):
i = next(iter(i
for i in range(len(elements))
if elements[i].startswith(qualifier)))
j = next(iter(j
for j in range(i, len(elements))
if elements[j].endswith(qualifier)))
elements[i] = elements[i][ len(qualifier):]
elements[j] = elements[j][:-len(qualifier) ]
elements[i:j+1] = [delim.join(elements[k]
for k in range(i, j+1))]
return elements
def _by_lines(line_source, delim, qualifier, translate):
    """Generate one dict per data line, keyed by the header line's entries.

    The first line taken from `line_source` supplies the column names and
    is not yielded. Every later line is split on `delim`, re-joined where
    `qualifier` demands it, and paired up with the column names.

    :param line_source: an iterable of strings
    :param delim: the text delimiter, often a tab or ','
    :param qualifier: string that turns on/off ignoring delimiters
    :param translate: a dict mapping from column name to a function that
        transforms the file-stored version of a value to its runtime form;
        columns without an entry are passed through untouched
    :returns: a generator of dicts
    """
    def _as_is(value):
        return value

    header = None
    for raw_line in line_source:
        # Drop exactly one trailing newline, if there is one.
        text = raw_line[:-1] if raw_line.endswith('\n') else raw_line
        cells = _anneal_by_qualifier(text.split(delim), qualifier, delim)
        if not header:
            header = cells
        else:
            record = {}
            for name, cell in zip(header, cells):
                convert = translate.get(name, _as_is)
                record[name] = convert(cell)
            yield record
def _read(path_name, delim, qualifier, translate):
    """Open the file at `path_name` and yield its records as dicts.

    :param path_name: name of (and path to) the file to read
    :param delim: delimiter to separate entries within a record
    :param qualifier: ignore delimiters inside a pair of these
    :param translate: dict from column name to a callable that transforms
        entries from that column
    :returns: a generator of dicts
    """
    # Delegate from inside the with-clause: handing back the inner generator
    # itself would leave the clause and close the file before any line is read.
    with open(path_name, 'r') as source:
        yield from _by_lines(source, delim, qualifier, translate)
def _check_delim(path_name, kwargs, by_extension={'.txt':'\t', '.csv':','}):
"""Return a delimiter string based on caller input or file extension.
Either extract a delimiter specified in `kwargs` as `delim`, or if
a delimiter isn't specified that way, return a standard delimiter based
on the file extension at the end of `path_name`. If `path_name` does
not match any pre-defined extension-to-delimiter relationship, raise a
ValueError, otherwise return the extracted/assumed delimiter.
:param path_name: a string ending with the name of the file that a
table is read from or written to
:param kwargs: a dict of named parameters specified by the user
calling `read()` or `write()`
:param by_extension: a dict mapping from file extension to a
pre-defined corresponding delimiter"""
try:
return kwargs['delim']
except KeyError:
pass
try:
return next(iter(
delim
for extension, delim in by_extension.items()
if path_name.endswith(extension)))
except StopIteration:
raise ValueError('Delimiter `delim` not specified. Could not '
'assume based on file extension.')
def _check_qualifier(path_name, delim, kwargs):
for q in ('qualifier', 'qual'):
try:
return kwargs['qualifier']
except KeyError:
pass
if path_name.lower().endswith('csv'):
first_record = next(iter(_read(path_name, delim, '', {})))
if all(column.startswith('"') and column.endswith('"')
for column in first_record):
return '"'
return ''
def read(path_name, *args, **kwargs):
    """Read a data table from a file specified by path_name.

    :param path_name: the name of (or path to followed by name of) the
        file to read; may be a string or an `os.PathLike` object
    :param delim: the delimiter used between cell entries in the table.
        Defaults to tab for files ending in .txt and to comma (',')
        for files ending in .csv.
    :param qual: text qualifier, prepended and appended to cell entry text
        to prevent delim instance being used to delimit text
    :param qualifier: alias for qual
    :param translate: a dict mapping from column name to a callable (such
        as `int`, `float`, or `eval`) which translates a value in that column
        into a new form.
    :returns: a list of dicts, one per record, keyed by column heading
    """
    from os import PathLike, fspath
    if isinstance(path_name, PathLike):
        # The helpers below use string methods on the path.
        path_name = fspath(path_name)
    delim = _check_delim(path_name, kwargs)
    qualifier = _check_qualifier(path_name, delim, kwargs)
    translate = kwargs.get('translate', {})
    return list(_read(path_name, delim, qualifier, translate))
def parse(text, *args, **kwargs):
    """Parse a table from text without needing to read a file.

    The first line of `text` holds the column names and each later line
    holds one record. A single newline at the very end of `text` is treated
    as the terminator of the last line rather than as the start of an extra,
    empty record.

    :param text: the whole table as one string, lines separated by newlines
    :param delim: the delimiter between entries; defaults to tab
    :param qualifier: text qualifier; defaults to none
    :param qual: alias for qualifier
    :param translate: a dict mapping from column name to a callable that
        transforms entries from that column
    :returns: a list of dicts, one per record
    """
    delim = kwargs.get('delim', '\t')
    qualifier = kwargs.get('qualifier', kwargs.get('qual', ''))
    translate = kwargs.get('translate', {})
    lines = text.split('\n')
    if lines[-1] == '':
        # Either `text` is empty or it ends with a newline; both leave a
        # phantom empty piece that iterating over a real file would not.
        lines.pop()
    return list(_by_lines(lines, delim, qualifier, translate))
def _maybe_qualify(text, qualifier, delim):
"""Surround `text` with `qualifier`s if and only if `delim` is in text."""
return text if delim not in text else f'{qualifier}{text}{qualifier}'
def write(table, path_name, *args, **kwargs):
    """Save the table to a file.

    The first line written holds the column names, taken from the keys of
    the first record; each later line holds one record's entries converted
    with `str`. Entries containing the delimiter are wrapped in the
    qualifier. An empty table produces an empty file.

    :param table: a list of dicts
    :param path_name: the name of the file where `table` is saved; may be
        a string or an `os.PathLike` object
    :param delim: the delimiter to write between entries. Defaults are
        chosen from the file extension.
    :param qualifier: text qualifier; defaults to none
    :param qual: alias for qualifier
    """
    from os import PathLike, fspath
    if isinstance(path_name, PathLike):
        # The delimiter check uses string methods on the path.
        path_name = fspath(path_name)
    delim = _check_delim(path_name, kwargs)
    qualifier = kwargs.get('qualifier', kwargs.get('qual', ''))

    def _as_line(entries):
        return delim.join(_maybe_qualify(entry, qualifier, delim)
                          for entry in entries) + '\n'

    with open(path_name, 'w') as into:
        if not table:
            # No first record means no column names: leave the file empty.
            return
        columns = list(table[0])
        into.write(_as_line(columns))
        into.writelines(_as_line(str(record[key]) for key in columns)
                        for record in table)
class Table(list):
    """A glorified list of dicts.

    This class extends `list` and supports indexing records by their values
    in a particular column (see `Table.index_by()`) and supports accessing,
    setting, and deleting entire columns. Columns can be accessed by
    subscripting the name of the column or by using the column's name as
    an attribute of the table (except when there's a real attribute with
    that name). Indexed records can be accessed by calling the table with
    the index key (the row's ostensibly unique value in whichever column was
    used to index the table) as the sole parameter and can also be accessed
    by using the index key as an attribute of the table. (Just like accessing
    columns, real attributes take precedence, and also columns take
    precedence over index keys when retrieving a non-existent attribute.)
    Subscripting the table with an int or slice (such as `1:9`) behaves
    just like an ordinary `list`, because `__getitem__` delegates to the
    superclass in those cases.
    """

    @staticmethod
    def read(filename, *args, **kwargs):
        """Read the file and return a Table of its contents.

        :param filename: name of or path to a file
        """
        # Inside a method body, `read` is the module-level function, not
        # this static method.
        return Table(read(filename, *args, **kwargs))

    def __init__(self, records, **formats):
        """Build a table from `records` and apply `formats` to its columns.

        :param records: an iterable of dicts
        :param formats: passed on to `format()`
        """
        super(Table, self).__init__(records)
        self.__index = {}
        self.format(**formats)

    @property
    def columns(self):
        """The column headings, taken from the keys of the first record.

        An empty table has no first record and therefore no columns.
        """
        if not len(self):
            return []
        return list(self[0].keys())

    def format(self, **formats):
        """Transform the types of the entries in the table.

        :param formats: a mapping from column heading to a one-param
            callable that returns a transformed version of an entry from
            the column named by the corresponding key. Records lacking a
            named column are left alone.

        Example:
        >>> table.format(lat=float, long=float, year=int)
        """
        for record in self:
            for column, _format in formats.items():
                if column in record:
                    record[column] = _format(record[column])

    def by(self, column, one_to_one=False, out=None):
        """Return a dict from values in a column to matching records (list).

        The dict has as keys all the values from the column. Each key maps
        onto a list of the records whose value in that column is the key.
        If one_to_one is truthy, then every key must match exactly one
        record, and the resulting dict maps onto that record itself instead
        of onto a one-element list.

        :param column: a column heading in this table
        :param one_to_one: if truthy, map each key onto its single record
            rather than onto a list
        :param out: a column heading in the table. If specified, then
            records in the values of the returned dict are replaced by those
            records' elements from the column named by `out`.
        :raises ValueError: if `one_to_one` is truthy and some value occurs
            in `column` more than once
        """
        result = {}
        for record in self:
            group = result.setdefault(record[column], [])
            group.append(record)
            if one_to_one and len(group) > 1:
                raise ValueError("Couldn't make it one-to-one")
        if out is not None:
            result = {key: [record[out] for record in group]
                      for key, group in result.items()}
        if one_to_one:
            result = {key: group[-1] for key, group in result.items()}
        return result

    def index_by(self, column, wrapper=(lambda x: x)):
        """Index from each value in `column` to the corresponding record.

        Set the internal index dict so that it maps from the values in
        the specified column to the last (and ostensibly sole) record
        with that value in that column. Once this index is established,
        the indexed records can be retrieved by calling the table with
        an index key (a value from the column) as the sole parameter or
        by treating that key as an attribute of the table (notwithstanding
        real attributes and columns).

        :param column: The name of the column whose entries to use as keys
            to access the last/sole record with that value (the key) in that
            column
        :param wrapper: a callable which returns a modified version of a
            newly-computed index dict when such a dict is sent to it as its
            sole parameter. For example, a case-insensitive dict subclass.
        """
        self.__index = wrapper({record[column]: record for record in self})

    def __call__(self, key):
        """Return the record indexed from `key`.

        Example use:
            state_table = Table(states_records)
            state_table.index_by('fips_code')
            nc_record = state_table('037')

        :raises KeyError: if `key` is not in the current index
        """
        return self.__index[key]

    def __getitem__(self, column):
        """Either delegate retrieval of elements/slices to the superclass
        or return a generator that iterates over the values in the specified
        column of the table."""
        # If it's a valid index for a list, send to superclass
        if isinstance(column, (slice, int)):
            return super(Table, self).__getitem__(column)
        # otherwise send back the named column
        return (record[column] for record in self)

    def __getattr__(self, name):
        """Resolve a missing attribute as a column, then as an index key.

        Only called when normal attribute lookup fails. If `name` is an
        existing column name, return the column (as a generator); otherwise
        return the record corresponding to `name` in the index.

        :raises AttributeError: if `name` is neither a column nor an index
            key, so that `hasattr()` and `getattr()` with a default work
        """
        if name in self.columns:
            return self.__getitem__(name)
        # Read the index through the instance dict: touching `self.__index`
        # before `__init__` has set it would re-enter this method forever.
        index = self.__dict__.get('_Table__index', {})
        try:
            return index[name]
        except KeyError:
            raise AttributeError(
                '%r object has no attribute, column, or index key %r'
                % (type(self).__name__, name)) from None

    def __setitem__(self, column, value):
        """Either replace elements/slices like a list or set a whole column.

        :param column: an int index, a slice, or a column name
        :param value: for an int or slice, whatever `list` accepts; for a
            column name, a sized iterable with one element per record
        :raises ValueError: if a column's `value` differs in length from
            the table
        """
        # If it's a valid index for a list, send to superclass
        if isinstance(column, (slice, int)):
            super(Table, self).__setitem__(column, value)
            return
        if len(value) != len(self):
            raise ValueError('length mismatch')
        for record, element in zip(self, value):
            record[column] = element

    def __delitem__(self, item):
        """Delete the column or else delete the index/slice.

        If `item` is an int or slice, delegate to superclass. Otherwise,
        assume `item` is a column heading and remove `item` as a key
        from every record.

        :param item: an int index, a slice, or a column name
        :raises KeyError: if `item` is a column name that some record lacks
        """
        # If it's a valid index for a list, send to superclass
        if isinstance(item, (slice, int)):
            super(Table, self).__delitem__(item)
            return
        for record in self:
            del record[item]

    def __delattr__(self, name):
        """Delegate to `__delitem__`. `del table.a` means `del table['a']`."""
        del self[name]
def _clean_(piece):
piece = piece.strip()
a = 1 if piece.startswith('"') else None
b = -1 if piece.endswith('"') else None
return piece[a:b]
def _split_(line):
    """Split a line on commas that are not inside double quotes.

    Each double quote toggles whether commas are ignored. Every resulting
    piece is passed through `_clean_` before being collected.

    :param line: a string holding one line of comma-separated text
    :returns: a list of cleaned pieces; always at least one element
    """
    inside_quotes = False
    pieces = []
    current = []
    for char in line:
        if char == ',' and not inside_quotes:
            pieces.append(_clean_(''.join(current)))
            current = []
            continue
        if char == '"':
            inside_quotes = not inside_quotes
        current.append(char)
    # Whatever follows the last separator is the final piece.
    pieces.append(_clean_(''.join(current)))
    return pieces
def _to_table_(file):
    """Read a comma-separated file into a list of dicts.

    The first line supplies the column names; each later line becomes one
    dict pairing column names with that line's entries. When a line has
    more or fewer entries than there are columns, the surplus on either
    side is dropped.

    :param file: name of (and path to) the file to read
    :returns: a list of dicts
    """
    table = []
    with open(file) as outof:
        columns = []
        for line in outof:
            if line.endswith('\n'):
                line = line[:-1]
            elements = _split_(line)
            if not columns:
                columns = elements
                continue
            # zip stops at the shorter of the two, so ragged lines are safe.
            table.append(dict(zip(columns, elements)))
    return table
def pretty_print(table):
    """Print the table's entries in left-aligned, space-padded columns.

    Each record is printed on its own line. Every entry is converted with
    `str` and padded with spaces to the width of the widest entry in its
    column plus one. Column headings are not printed. An empty table
    prints nothing.

    :param table: a sequence of dicts, optionally offering a `columns`
        attribute that lists the column headings to print, in order
    """
    if not table:
        return
    try:
        columns = table.columns
    except AttributeError:
        # A plain list of dicts: take the headings from the first record.
        columns = list(table[0])
    else:
        columns = list(columns)
    rows = [[str(record[column]) for column in columns] for record in table]
    widths = [max(len(row[i]) for row in rows) for i in range(len(columns))]
    for row in rows:
        for item, width in zip(row, widths):
            print(item, end=(' ' * (1 + width - len(item))))
        print()