3
3
import functools
4
4
import collections
5
5
import statistics
6
+ import itertools
6
7
7
- from visidata import Progress , Sheet , Column , ColumnsSheet , VisiData
8
+ from visidata import Progress , Sheet , Column , ColumnsSheet , VisiData , SettableColumn
8
9
from visidata import vd , anytype , vlen , asyncthread , wrapply , AttrDict , date , INPROGRESS , stacktrace , TypedExceptionWrapper
9
10
10
11
vd .help_aggregators = '''# Choose Aggregators
@@ -76,7 +77,7 @@ def aggregators_set(col, aggs):
76
77
77
78
78
79
class Aggregator :
79
- def __init__ (self , name , type , funcValues = None , helpstr = 'foo ' ):
80
+ def __init__ (self , name , type , funcValues = None , helpstr = '' ):
80
81
'Define aggregator `name` that calls funcValues(values)'
81
82
self .type = type
82
83
self .funcValues = funcValues # funcValues(values)
@@ -92,13 +93,48 @@ def aggregate(self, col, rows): # wrap builtins so they can have a .type
92
93
return None
93
94
raise e
94
95
96
class ListAggregator(Aggregator):
    '''Aggregator that returns a list of values, generally one value per input row,
    unlike ordinary aggregators, which operate on rows and return only a single value.

    To implement a new list aggregator, subclass ListAggregator,
    and override aggregate() and aggregate_list().'''

    def __init__(self, name, type, helpstr='', listtype=None):
        '''*name*, *type* and *helpstr* are passed through to Aggregator.
        *listtype* determines the type of the column created by addcol_aggregate()
        for list aggregators. If it is None, the new column will match the type of the input column.'''
        super().__init__(name, type, helpstr=helpstr)
        self.listtype = listtype

    def aggregate(self, col, rows) -> list:
        '''Return the values from aggregate_list() minus those rejected by the sheet's null test.
        The result can be shorter than *rows*, because nulls and errors are filtered out.
        Override in subclass.'''
        vals = self.aggregate_list(col, rows)
        # isNullFunc() returns the predicate used to test each value;
        # fetch it once instead of once per value
        is_null = col.sheet.isNullFunc()
        return [v for v in vals if not is_null(v)]

    def aggregate_list(self, col, row_group) -> list:
        '''Return a list of results, one result per input row.
        *row_group* is an iterable that holds a "group" of rows to run the aggregator on.
        Rows in *row_group* are not necessarily in the same order they are in the sheet.
        This base implementation returns the typed value of *col* for each row.
        Override in subclass.'''
        return [col.getTypedValue(r) for r in row_group]
95
123
96
124
@VisiData.api
def aggregator(vd, name, funcValues, helpstr='', *, type=None):
    '''Register a simple aggregator under *name* in ``vd.aggregators``.
    The aggregator calls ``funcValues(values)`` to aggregate *values*.
    Pass *type* to force the type of the aggregated column (default is to use the type of the source column).'''
    simple_aggr = Aggregator(name, type, funcValues=funcValues, helpstr=helpstr)
    vd.aggregators[name] = simple_aggr
101
129
130
@VisiData.api
def aggregator_list(vd, name, helpstr='', type=anytype, listtype=anytype):
    '''Register a list aggregator under *name* in ``vd.aggregators``, as a ListAggregator.
    Use *type* to force the type of the aggregated column.
    Use *listtype* to force the type of the new column created by addcol-aggregate.
    If *listtype* is None, that column will match the type of the source column.'''
    list_aggr = ListAggregator(name, type, helpstr=helpstr, listtype=listtype)
    vd.aggregators[name] = list_aggr
137
+
102
138
## specific aggregator implementations
103
139
104
140
def mean (vals ):
@@ -147,10 +183,49 @@ def __init__(self, pct, helpstr=''):
147
183
def aggregate(self, col, rows):
    '''Return the result of _percentile() over the sorted values of *col* in *rows*,
    at the fraction ``self.pct/100``, with ``key=float``.'''
    ordered = sorted(col.getValues(rows))
    fraction = self.pct/100
    return _percentile(ordered, fraction, key=float)
149
185
150
-
151
186
def quantiles(q, helpstr):
    '''Return a list of ``q-1`` PercentileAggregators, one per cut point ``round(100*i/q)``
    for i in 1..q-1. Every aggregator gets the same *helpstr*.'''
    cut_points = (round(100*i/q) for i in range(1, q))
    return [PercentileAggregator(pct, helpstr) for pct in cut_points]
153
188
189
def aggregate_groups(sheet, col, rows, aggr) -> list:
    '''Return a list containing the result of the aggregator for each row, in the same order as *rows*.

    *sheet* provides the row key (``sheet.rowkey``) that rows are grouped by.
    *col* is the column whose typed values are aggregated within each group.
    *rows* is a list of visidata rows; it is indexed by position, so it must be a sequence.
    *aggr* is an Aggregator object.

    Rows are grouped by their key columns. Null key column cells are considered equal,
    so nulls are grouped together. Cells with exceptions do not group together.
    Each exception cell is grouped by itself, with only one row in the group.

    For a ListAggregator, each row gets its own value from ``aggregate_list()``
    (which is expected to return one value per row of the group).
    For any other aggregator, every row in a group gets the group's single aggregate value.

    Fails via ``vd.fail`` if the row keys or column values cannot be sorted.
    '''
    def _key_progress(prog):
        def identity(val):
            prog.addProgress(1)
            return val
        return identity

    # every row is visited four times (compile, sort, aggregate, re-sort),
    # so the total is based on the rows actually being processed
    with Progress(gerund='aggregating', total=4*len(rows)) as prog:
        p = _key_progress(prog)  # increment progress every time p() is called
        # compile row data, for each row a tuple: (group_key, col_val, rownum)
        rowdata = [(sheet.rowkey(r), col.getTypedValue(r), p(rownum)) for rownum, r in enumerate(rows)]
        # sort by row key and column value to prepare for grouping
        try:
            rowdata.sort(key=p)
        except TypeError as e:
            vd.fail(f'key and column values must be comparable to group rows: {e.args[0]}')
        rowvals = []
        # group by row key
        for _, group in itertools.groupby(rowdata, key=lambda v: v[0]):
            # within a group, the rows have already been sorted by col_val
            group = list(group)
            group_rows = [rows[rownum] for _, _, rownum in group]
            if isinstance(aggr, ListAggregator):  # for list aggregators, each row gets its own value
                aggr_vals = aggr.aggregate_list(col, group_rows)
                rowvals += [(rownum, v) for (_, _, rownum), v in zip(group, aggr_vals)]
            else:  # for normal aggregators, each row in the group gets the same value
                aggr_val = aggr.aggregate(col, group_rows)
                rowvals += [(rownum, aggr_val) for _, _, rownum in group]
            prog.addProgress(len(group))
        # sort by unique rownum, to make the results match the original row order;
        # rownums never tie, so the aggregated values themselves are never compared
        rowvals.sort(key=p)
        return [v for rownum, v in rowvals]
154
229
155
230
vd .aggregator ('min' , min , 'minimum value' )
156
231
vd .aggregator ('max' , max , 'maximum value' )
@@ -161,8 +236,8 @@ def quantiles(q, helpstr):
161
236
vd .aggregator ('sum' , vsum , 'sum of values' )
162
237
vd .aggregator ('distinct' , set , 'distinct values' , type = vlen )
163
238
vd .aggregator ('count' , lambda values : sum (1 for v in values ), 'number of values' , type = int )
164
- vd .aggregator ('list' , list , 'list of values' , type = anytype )
165
- vd .aggregator ('stdev' , stdev , 'standard deviation of values' , type = float )
239
+ vd .aggregator_list ('list' , 'list of values' , type = anytype , listtype = None )
240
+ vd .aggregator ('stdev' , statistics . stdev , 'standard deviation of values' , type = float )
166
241
167
242
vd .aggregators ['q3' ] = quantiles (3 , 'tertiles (33/66th pctile)' )
168
243
vd .aggregators ['q4' ] = quantiles (4 , 'quartiles (25/50/75th pctile)' )
@@ -267,9 +342,8 @@ def aggregator_choices(vd):
267
342
268
343
269
344
@VisiData .api
270
- def chooseAggregators (vd ):
345
+ def chooseAggregators (vd , prompt = 'choose aggregators: ' ):
271
346
'''Return a list of aggregator name strings chosen or entered by the user. User-entered names may be invalid.'''
272
- prompt = 'choose aggregators: '
273
347
def _fmt_aggr_summary (match , row , trigger_key ):
274
348
formatted_aggrname = match .formatted .get ('key' , row .key ) if match else row .key
275
349
r = ' ' * (len (prompt )- 3 )
@@ -296,10 +370,34 @@ def _fmt_aggr_summary(match, row, trigger_key):
296
370
vd .warning (f'aggregator does not exist: { aggr } ' )
297
371
return aggrs
298
372
299
- Sheet .addCommand ('+' , 'aggregate-col' , 'addAggregators([cursorCol], chooseAggregators())' , 'add aggregator to current column' )
373
@Sheet.api
@asyncthread
def addcol_aggregate(sheet, col, aggrnames):
    '''For each aggregator named in *aggrnames*, add a column at the cursor holding,
    for every row of *sheet*, the aggregate of *col* computed by aggregate_groups().

    A name may map to a single aggregator or to a list of aggregators in ``vd.aggregators``;
    one column named ``<col.name>_<aggr.name>`` is added per aggregator.
    Names that are not in ``vd.aggregators`` are skipped.
    The new column's type is the aggregator's *listtype* (for a ListAggregator) or *type*,
    falling back to the type of *col*.'''
    for aggrname in aggrnames:
        aggrs = vd.aggregators.get(aggrname)
        # skip unknown names before wrapping: [None] is truthy and would
        # otherwise be passed on as an aggregator
        if aggrs is None:
            continue
        if not isinstance(aggrs, list):
            aggrs = [aggrs]
        for aggr in aggrs:
            vals = aggregate_groups(sheet, col, sheet.rows, aggr)
            if isinstance(aggr, ListAggregator):
                t = aggr.listtype or col.type
            else:
                t = aggr.type or col.type
            c = SettableColumn(name=f'{col.name}_{aggr.name}', type=t)
            sheet.addColumnAtCursor(c)
            c.setValues(sheet.rows, *vals)
389
+
390
+ Sheet .addCommand ('+' , 'aggregate-col' , 'addAggregators([cursorCol], chooseAggregators())' , 'Add aggregator to current column' )
300
391
Sheet .addCommand ('z+' , 'memo-aggregate' , 'cursorCol.memo_aggregate(chooseAggregators(), selectedRows or rows)' , 'memo result of aggregator over values in selected rows for current column' )
301
392
ColumnsSheet .addCommand ('g+' , 'aggregate-cols' , 'addAggregators(selectedRows or source[0].nonKeyVisibleCols, chooseAggregators())' , 'add aggregators to selected source columns' )
393
+ Sheet .addCommand ('' , 'addcol-aggregate' , 'addcol_aggregate(cursorCol, chooseAggregators(prompt="aggregator for groups: "))' , 'add column(s) with aggregator of rows grouped by key columns' )
394
+
395
+ vd .addGlobals (
396
+ ListAggregator = ListAggregator
397
+ )
302
398
303
399
vd .addMenuItems ('''
304
400
Column > Add aggregator > aggregate-col
401
+ Column > Add column > aggregate > addcol-aggregate
305
402
''' )
403
+
0 commit comments