handle dates and timedeltas in aggregators and DescribeSheet #2380
base: develop
Changes from all commits: 5183411, ef57111, 0ab4426, a8af017, 3cae0f1, edd60cd, 2e3d985, a8f469f, 92e80cf
```diff
@@ -3,6 +3,7 @@
 import functools
 import collections
 import statistics
+import datetime

 from visidata import Progress, Sheet, Column, ColumnsSheet, VisiData
 from visidata import vd, anytype, vlen, asyncthread, wrapply, AttrDict, date
```

```diff
@@ -107,13 +108,48 @@ def _funcRows(col, rows):  # wrap builtins so they can have a .type
 def mean(vals):
     vals = list(vals)
     if vals:
-        return float(sum(vals))/len(vals)
+        if type(vals[0]) is date:
+            vals = [d.timestamp() for d in vals]
+            ans = float(sum(vals))/len(vals)
+            return datetime.date.fromtimestamp(ans)
+        elif isinstance(vals[0], datetime.timedelta):
+            return datetime.timedelta(seconds=vsum(vals)/datetime.timedelta(seconds=len(vals)))
+        else:
+            return float(sum(vals))/len(vals)

 def _vsum(vals):
-    return sum(vals, start=type(vals[0] if len(vals) else 0)())  #1996
+    if vals:
+        if type(vals[0]) is date:
+            vd.error('dates cannot be summed')
```
Review comment: This would be a …
```diff
+            return None
+        return sum(vals, start=type(vals[0])())  #1996
+    else:
+        return 0

 # start parameter in sum() added in Python 3.8
 vsum = _vsum if sys.version_info[:2] >= (3, 8) else sum
+def median(vals):
+    if not vals:
+        return None
+    if type(vals[0]) is date:
+        # when the length is even, statistics.median needs to add
+        # two midpoints to average them, so convert to timestamps
+        vals = [d.timestamp() for d in vals]
+        return datetime.date.fromtimestamp(statistics.median(vals))
+    return statistics.median(vals)
+
+def stdev(vals):
+    if vals and len(vals) >= 2:
+        if type(vals[0]) is date:
+            vals = [d.timestamp() for d in vals]
+            return datetime.timedelta(seconds=statistics.stdev(vals))
```
Review comment: This is very interesting. I wonder how we can codify that the output type of stdev for both date and datedelta is datedelta. It's not quite as simple as making the result type …
```diff
+        elif isinstance(vals[0], datetime.timedelta):
+            vals = [d.total_seconds() for d in vals]
+            return datetime.timedelta(seconds=statistics.stdev(vals))
+        return statistics.stdev(vals)
+    else:
+        vd.error('stdev requires at least two data points')
+        return None

 # http://code.activestate.com/recipes/511478-finding-the-percentile-of-the-values/
 def _percentile(N, percent, key=lambda x:x):
```
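The conversion pattern used throughout these new aggregators can be tried outside VisiData: turn dates into POSIX timestamps, run the numeric statistic, then convert the result back (to a date for mean/median, to a timedelta for stdev). A minimal standard-library sketch of that idea, not the PR's code itself (the helper names and the use of `datetime.datetime.combine` are mine, since a plain `datetime.date` has no `timestamp()` method, unlike VisiData's `date`):

```python
import datetime
import statistics

def mean_of_dates(dates):
    # average dates by averaging their POSIX timestamps
    ts = [datetime.datetime.combine(d, datetime.time()).timestamp() for d in dates]
    return datetime.date.fromtimestamp(sum(ts) / len(ts))

def stdev_of_dates(dates):
    # the spread of a set of dates is naturally a timedelta, not a date
    ts = [datetime.datetime.combine(d, datetime.time()).timestamp() for d in dates]
    return datetime.timedelta(seconds=statistics.stdev(ts))

dates = [datetime.date(2024, 1, 1), datetime.date(2024, 1, 11), datetime.date(2024, 1, 31)]
print(mean_of_dates(dates))    # 2024-01-14
print(stdev_of_dates(dates))   # about 15 days
```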
```diff
@@ -146,17 +182,17 @@ def percentile(pct, helpstr=''):
 def quantiles(q, helpstr):
     return [percentile(round(100*i/q), helpstr) for i in range(1, q)]

-vd.aggregator('min', min, 'minimum value')
-vd.aggregator('max', max, 'maximum value')
-vd.aggregator('avg', mean, 'arithmetic mean of values', type=float)
-vd.aggregator('mean', mean, 'arithmetic mean of values', type=float)
-vd.aggregator('median', statistics.median, 'median of values')
+vd.aggregator('min', min, 'minimum value', type=anytype)
```
Review comment: Why are min/max forcibly set to `anytype`? I understand why avg/mean/median want to be more generic; maybe this is where we have both an 'input type' which remains …
```diff
+vd.aggregator('max', max, 'maximum value', type=anytype)
+vd.aggregator('avg', mean, 'arithmetic mean of values', type=anytype)
+vd.aggregator('mean', mean, 'arithmetic mean of values', type=anytype)
+vd.aggregator('median', median, 'median of values', type=anytype)
 vd.aggregator('mode', statistics.mode, 'mode of values')
-vd.aggregator('sum', vsum, 'sum of values')
+vd.aggregator('sum', vsum, 'sum of values', type=anytype)
 vd.aggregator('distinct', set, 'distinct values', type=vlen)
 vd.aggregator('count', lambda values: sum(1 for v in values), 'number of values', type=int)
 vd.aggregator('list', list, 'list of values', type=anytype)
-vd.aggregator('stdev', statistics.stdev, 'standard deviation of values', type=float)
+vd.aggregator('stdev', stdev, 'standard deviation of values', type=anytype)

 vd.aggregators['q3'] = quantiles(3, 'tertiles (33/66th pctile)')
 vd.aggregators['q4'] = quantiles(4, 'quartiles (25/50/75th pctile)')
```
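The `vd.aggregator(name, func, helpstr, type=...)` calls above are also the pattern a plugin or `~/.visidatarc` would use to register its own aggregator. A hedged sketch of a user-defined aggregator following the same pattern (the `value_range` name is illustrative, not part of this PR; with the PR's changes, a date column would yield a timedelta here, since subtracting dates gives a timedelta):

```python
from visidata import vd, anytype

def value_range(vals):
    vals = list(vals)   # values may arrive as a one-shot iterator, as mean() above assumes
    if not vals:
        return None
    return max(vals) - min(vals)   # spread between the largest and smallest value

vd.aggregator('range', value_range, 'difference between max and min values', type=anytype)
```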
```diff
@@ -218,14 +254,20 @@ def aggname(col, agg):
 @asyncthread
 def memo_aggregate(col, agg_choices, rows):
     'Show aggregated value in status, and add to memory.'
+    if not rows:
+        vd.fail('no rows to aggregate')
     for agg_choice in agg_choices:
         agg = vd.aggregators.get(agg_choice)
         if not agg: continue
         aggs = agg if isinstance(agg, list) else [agg]
         for agg in aggs:
             aggval = agg(col, rows)
             typedval = wrapply(agg.type or col.type, aggval)
-            dispval = col.format(typedval)
+            if agg.name == 'stdev' and (col.type is date):
```
Review comment: We need to figure out how to not have a very special type check here.
```diff
+                # col type is a date, but typedval is a timedelta
+                dispval = str(typedval)
+            else:
+                dispval = col.format(typedval)
             k = col.name+'_'+agg.name
             vd.status(f'{k}={dispval}')
             vd.memory[k] = typedval
```
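The `str(typedval)` fallback exists because a date-typed column formats its values as dates, which cannot apply to the timedelta that the new `stdev` returns for date columns. A small stdlib-only illustration of the mismatch (the `strftime` format string is just an example, not what VisiData uses internally):

```python
import datetime

spread = datetime.timedelta(days=3, hours=7)

try:
    # a date formatter expects something date-like with strftime()
    spread.strftime('%Y-%m-%d')
except AttributeError as e:
    print('cannot format a timedelta as a date:', e)

print(str(spread))   # '3 days, 7:00:00' -- the readable fallback
```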
The second changed file (the DescribeSheet module):

```diff
@@ -1,11 +1,12 @@
 from copy import copy
-from statistics import mode, median, mean, stdev
+from statistics import mode
+import datetime

-from visidata import vd, Column, ColumnAttr, vlen, RowColorizer, asyncthread, Progress, wrapply
+from visidata import vd, Column, ColumnAttr, vlen, RowColorizer, asyncthread, Progress, wrapply, anytype, date
 from visidata import BaseSheet, TableSheet, ColumnsSheet, SheetsSheet


-vd.option('describe_aggrs', 'mean stdev', 'numeric aggregators to calculate on Describe sheet', help=vd.help_aggregators)
+vd.option('describe_aggrs', 'min max sum median mean stdev', 'numeric aggregators to calculate on Describe sheet', help=vd.help_aggregators)


 @Column.api
```
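With the Describe-sheet statistics now driven entirely by `describe_aggrs`, the set of computed columns can be changed without touching the code. A hedged example of overriding the option from a `~/.visidatarc` (the bare `options` global is the conventional visidatarc style; the exact CLI spelling of the flag is an assumption):

```python
# ~/.visidatarc
# compute only the cheaper aggregates on the Describe sheet
options.describe_aggrs = 'min max mean'
```

On the command line this would presumably look something like `vd --describe-aggrs='min max mean' data.csv`.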
```diff
@@ -44,10 +45,6 @@ class DescribeSheet(ColumnsSheet):
         DescribeColumn('nulls', type=vlen),
         DescribeColumn('distinct',type=vlen),
         DescribeColumn('mode', type=str),
-        DescribeColumn('min', type=str),
```
Review comment: Is there any functional reason for moving these into describe_aggrs? I think this must be why min/max types were changed to `anytype` …
```diff
-        DescribeColumn('max', type=str),
-        DescribeColumn('sum'),
-        DescribeColumn('median', type=str),
     ]
     colorizers = [
         RowColorizer(7, 'color_key_col', lambda s,c,r,v: r and r in r.sheet.keyCols),
```
```diff
@@ -61,7 +58,8 @@ def loader(self):
         self.resetCols()

         for aggrname in vd.options.describe_aggrs.split():
-            self.addColumn(DescribeColumn(aggrname, type=float))
+            aggrtype = vd.aggregators[aggrname].type
+            self.addColumn(DescribeColumn(aggrname, type=aggrtype))

         for srccol in Progress(self.rows, 'categorizing'):
             if not srccol.hidden:
```
```diff
@@ -87,12 +85,15 @@ def reloadColumn(self, srccol):
                 d['distinct'].add(v)
             except Exception as e:
                 d['errors'].append(sr)
+        if not vals:
+            return

         d['mode'] = self.calcStatistic(d, mode, vals)
-        if vd.isNumeric(srccol):
-            for func in [min, max, sum, median]:  # use type
-                d[func.__name__] = self.calcStatistic(d, func, vals)
+        if vd.isNumeric(srccol) or \
+                isinstance(vals[0], (datetime.timedelta, datetime.date)):
+            for aggrname in vd.options.describe_aggrs.split():
+                if aggrname == 'sum' and (srccol.type is date or isinstance(vals[0], datetime.date)):
+                    continue
+                aggr = vd.aggregators[aggrname].funcValues
+                d[aggrname] = self.calcStatistic(d, aggr, vals)
```
Review comment: I don't think these special cases belong in the aggregators. The idea is that people can define their own aggregators with 'obvious' implementations and they should work reasonably. (And regardless, wherever these checks are, they should use `isinstance` with the Python stdlib datetime, as you're doing below.)

To that end, the visidata `date` and `datedelta` classes both already have `__float__` methods which convert them to float, as you're doing here. So maybe the aggregator callsite can call `aggregator.intype` on each value, and these 'more numeric' aggregators would set their intype to `float`. Then they would need to convert the float result back into some result type, which as you've noted below can be tricky for date formats.
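To make that suggestion concrete, here is one way an 'input type plus result conversion' aggregator could look, written against plain stdlib types rather than VisiData's actual aggregator machinery. `NumericAggregator`, `intype`, and `outtype` are all hypothetical names sketching the reviewer's idea, not an existing VisiData API (and stdlib `datetime.date` lacks the `__float__` that visidata's `date` provides, hence the explicit lambda):

```python
import datetime
import statistics

class NumericAggregator:
    '''Hypothetical: coerce each value with intype, aggregate, convert the result with outtype.'''
    def __init__(self, func, intype=float, outtype=None):
        self.func = func
        self.intype = intype      # e.g. date -> POSIX timestamp (float)
        self.outtype = outtype    # e.g. seconds (float) -> timedelta

    def __call__(self, vals):
        nums = [self.intype(v) for v in vals]
        result = self.func(nums)
        return self.outtype(result) if self.outtype else result

# stdev of dates: feed timestamps in, read the spread back out as a timedelta
date_stdev = NumericAggregator(
    statistics.stdev,
    intype=lambda d: datetime.datetime.combine(d, datetime.time()).timestamp(),
    outtype=lambda secs: datetime.timedelta(seconds=secs),
)

dates = [datetime.date(2024, 1, 1), datetime.date(2024, 1, 11), datetime.date(2024, 1, 31)]
print(date_stdev(dates))   # roughly 15 days
```

The open question the reviewer flags remains how a generic aggregator would know which output conversion to apply for a given input type.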