-
Notifications
You must be signed in to change notification settings - Fork 12
/
Copy pathdistribution
executable file
·182 lines (169 loc) · 6.83 KB
/
distribution
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
#!/usr/bin/env awk -f
# Comes from https://github.com/mrmanc/log-ninja/blob/master/distribution
#
# Requires GNU awk. Expects a stream of numbers on STDIN, one per line, in the first field.
#
# Supply lines, max and min as variables like this:
#
# cat /usr/share/dict/words | awk '{print length($1)}' | \
# distribution lines=5 max=10 min=3
#
# Should support negative numbers and floats.
# Can specify a bin width of N with option bin=N. Overrides the lines option.
# A header summarizing the histogram is printed by default; stats=true requests it explicitly.
# Line labels in the result refer to the start of that bar.
# For example, a line labelled 4.00 followed by 5.00 will show values where 4.00 <= value < 5.00.
# If you want to output histograms in real time, specify interval=10 for a histogram every 10 records.
# The last 1000 records will be buffered for realtime histograms. Override this by specifying buffer=10000 for example.
# Be aware that a larger buffer will impact performance greatly, as it has to be copied and sorted for each histogram.
# To monitor whether the script is keeping up with realtime STDIN, pass a date string as the second field to be output for each iteration.
# See https://github.com/mrmanc/log-ninja#distribution for more information and examples.
# Defaults applied when the matching command-line variable is not supplied.
BEGIN {
    # Character repeated to draw each bar.
    # For a filled-in solid histogram use "\xDB", but that might not work on your terminal.
    BAR_CHARACTER = "_"

    # Layout defaults: bar width in characters and number of histogram lines.
    WIDTH_DEFAULT = 100
    LINES_DEFAULT = 50

    # Ring buffer size used for realtime (interval=N) histograms.
    REALTIME_RECORDS_DEFAULT = 1000

    # Flag defaults: an empty string means "off", any other value means "on".
    HIDDEN_DEFAULT = ""
    VERBOSE_DEFAULT = "true"
    DONT_ACCUMULATE_DEFAULT = "true"
}
# Main rule: runs for every record whose first field is a plain number
# (optional sign, digits with an optional fraction, optional exponent).
# Records that do not look like a number are skipped instead of being counted as 0.
# The pattern and the opening brace must share a line: a pattern alone on its
# own line is a complete rule of its own, and the braces that follow would then
# form a second, unconditional rule.
$1 ~ /^[-+]?([0-9]+\.?[0-9]*|\.[0-9]+)([eE][-+]?[0-9]+)?$/ {
    # Translate the accumulate=... option into the internal flag:
    # "false" -> drop old values from realtime histograms, any other value -> keep them,
    # unset -> DONT_ACCUMULATE_DEFAULT.
    if (accumulate == "false") dontAccumulateHistogram = "true"
    else if (accumulate != "") dontAccumulateHistogram = ""
    else dontAccumulateHistogram = DONT_ACCUMULATE_DEFAULT

    # Totals for the final histogram printed by the END rule.
    finalValueTotals[$1 + 0] += 1
    records++

    if (interval > 0) {
        # Advance the ring buffer slot, wrapping only after the last slot has been
        # used so that the buffer really holds realtimeRingBufferSize() records.
        realtimeIndex++
        if (realtimeIndex > realtimeRingBufferSize() + 0) {
            realtimeIndex = 1 # reset to wrap around
            iterations++
        }

        # Once we have been around once, the slot about to be overwritten holds the
        # oldest record: take it out of the realtime totals unless we are accumulating.
        # The "in" test keeps an unset slot from driving a count negative.
        if (iterations > 0 && dontAccumulateHistogram && (realtimeIndex in entries)) {
            evictedValue = entries[realtimeIndex] + 0
            realtimeValueTotals[evictedValue] -= 1
            if (realtimeValueTotals[evictedValue] <= 0) delete realtimeValueTotals[evictedValue]
        }

        entries[realtimeIndex] = $1
        realtimeValueTotals[$1 + 0] += 1

        if (records % interval == 0) {
            print "Progress: record number " records " and time (if specified) is: " $2
            printHistogram(realtimeValueTotals)
            # Short pause run through the shell; the trap makes the shell exit non-zero on SIGINT.
            system("trap 'exit 1' 2; sleep 0.05")
        }
    }
}
# After the last record: print the histogram of everything that was counted.
# Nothing is printed when no record was counted; there is nothing to plot and
# the scaling arithmetic is undefined for zero records.
END {
    if (records > 0) printHistogram(finalValueTotals)
}
# Number of records kept for realtime histograms: the buffer=N command-line
# variable when supplied, otherwise REALTIME_RECORDS_DEFAULT.
function realtimeRingBufferSize() {
    if (buffer == "") {
        return REALTIME_RECORDS_DEFAULT
    }
    return buffer
}
# Render and print one histogram frame for the counts held in valueTotals.
#
# valueTotals maps a value (the array index) to the number of times it was seen.
# Layout is controlled by the command-line variables read below:
# stats, hidden, lines, width / max_width, min, max and bin.
# Each line label is the start of its bar: a line labelled 4.00 followed by 5.00
# counts values where 4.00 <= value < 5.00.
#
# Every parameter after valueTotals is a local scratch variable (the same
# extra-parameter convention used by qsort and swap), so nothing is carried
# over from one frame to the next.
function printHistogram(valueTotals,
        verbose, showHidden, numberOfLines, lineWidth,
        values, keys, distinctValues, valIndex,
        firstValue, lastValue, minValue, maxValue, binWidth, window, lineHeight,
        currentValueIndex, lineNumber, runningTotal, mostRecordsInLine,
        lineMin, lineMax, lineRecords, label, quantity, total,
        recordsPerCharacter, recordsBeforeHistogram, recordsAfterHistogram,
        totalRecordsInHistogram, frame) {

    # Resolve display options; an empty string means the option was not supplied.
    if (stats != "") verbose = "true"
    else verbose = VERBOSE_DEFAULT

    if (hidden == "false") showHidden = ""
    else if (hidden != "") showHidden = "true"
    else showHidden = HIDDEN_DEFAULT

    if (lines != "") numberOfLines = lines
    else numberOfLines = LINES_DEFAULT

    if (width != "") lineWidth = width
    else if (max_width != "") lineWidth = max_width
    else lineWidth = WIDTH_DEFAULT

    # Sort the distinct values numerically.
    #distinctValues=asorti(valueTotals, values, "@ind_num_asc") # works in newer versions of Awk
    split("", values) # make sure values is an (empty) array before handing it over
    distinctValues = qsorti(valueTotals, values)
    if (distinctValues == 0) return # empty table: nothing to plot, and the maths below would divide by zero

    # Keep the original index strings for looking counts up, and a numeric copy
    # for comparisons. Looking up by the numeric copy can miss, because a number
    # used as an index is turned back into a string and may not reproduce the key.
    for (valIndex = 1; valIndex <= distinctValues; valIndex++) {
        keys[valIndex] = values[valIndex]
        values[valIndex] = values[valIndex] + 0
    }

    firstValue = values[1]
    lastValue = values[distinctValues]
    if (min != "") minValue = min
    else minValue = firstValue
    if (max != "") maxValue = max
    else maxValue = lastValue

    # A positive bin width fixes the line count; zero, negative or non-numeric bin is ignored.
    binWidth = bin + 0
    window = maxValue - minValue
    if (binWidth > 0) numberOfLines = window / binWidth + 1

    # With neither lines nor bin given, do not use more lines than one per unit of the window.
    # This clamp must not apply when bin is given, or a bin width below 1 would be overridden.
    if (binWidth <= 0 && lines == "" && numberOfLines - 1 > window) {
        numberOfLines = int(window) + 1
    }

    if (numberOfLines > 1) lineHeight = window / (numberOfLines - 1) # last line will start at maxValue, so save that line for that
    else if (window > 0) lineHeight = window
    else lineHeight = 1 # deals with rare case that one distinct value was found

    # Walk the sorted values once, dropping each into the line whose range holds it.
    currentValueIndex = 1
    lineNumber = 0
    runningTotal = 0
    mostRecordsInLine = 0
    while (lineNumber < numberOfLines) {
        lineMin = minValue + (lineNumber * lineHeight)
        lineMax = lineMin + lineHeight
        lineRecords = 0
        while (currentValueIndex <= distinctValues && values[currentValueIndex] < lineMax) {
            # Values below lineMin (only possible below minValue) count towards
            # the running total but not towards the bar.
            if (values[currentValueIndex] >= lineMin) {
                lineRecords += valueTotals[keys[currentValueIndex]]
            }
            runningTotal += valueTotals[keys[currentValueIndex]]
            currentValueIndex++
        }
        if (lineRecords > mostRecordsInLine) mostRecordsInLine = lineRecords
        label[lineNumber] = lineMin
        quantity[lineNumber] = lineRecords
        total[lineNumber] = runningTotal
        lineNumber++
    }

    # Scale so the fullest line spans lineWidth characters; no bars at all when every line is empty.
    if (mostRecordsInLine > 0) recordsPerCharacter = lineWidth / mostRecordsInLine
    else recordsPerCharacter = 0

    # Records that fall outside the plotted range on either side.
    recordsBeforeHistogram = total[0] - quantity[0]
    recordsAfterHistogram = 0
    while (currentValueIndex <= distinctValues) {
        recordsAfterHistogram += valueTotals[keys[currentValueIndex]]
        currentValueIndex++
    }
    totalRecordsInHistogram = total[lineNumber - 1] + recordsAfterHistogram

    # Build the whole frame as one string so it is written with a single print.
    frame = ""
    if (verbose) {
        frame = sprintf("\nFound %s records distributed in %s distinct values between %s and %s\n\n", totalRecordsInHistogram, distinctValues, firstValue, lastValue)
        frame = frame sprintf("%10s %8s %6s %s\n", "Value", "Quant", "%ile", "Histogram")
        frame = frame sprintf("%10s %8s %6s %s\n", "-----", "-----", "----", "---------")
    }

    if (recordsBeforeHistogram > 0 && showHidden) {
        frame = frame sprintf("<%9.2f %8s %6.2f (hidden)\n", minValue, recordsBeforeHistogram, percentile(recordsBeforeHistogram, totalRecordsInHistogram))
    }
    else frame = frame "\n"

    for (lineNumber = 0; lineNumber < numberOfLines; lineNumber++) {
        frame = frame sprintf("%10.2f %8d %6.2f %s\n", label[lineNumber], quantity[lineNumber], percentile(total[lineNumber], totalRecordsInHistogram), bar(quantity[lineNumber] * recordsPerCharacter))
    }

    if (recordsAfterHistogram > 0 && showHidden) frame = frame sprintf(">=%8.2f %8s %6.2f (hidden)\n", maxValue, recordsAfterHistogram, "100")
    else frame = frame "\n"

    print frame
}
# Return runningTotal as a percentage of total.
# Returns 0 when total is zero so that an empty histogram cannot trigger a
# division by zero.
function percentile(runningTotal, total) {
    if (total + 0 == 0) return 0
    return 100 * runningTotal / total
}
# Build a bar made of BAR_CHARACTER repeated once for every step it takes to
# count characters down to zero or below; a fractional length is therefore
# rounded up, and zero or a negative length gives an empty string.
# result is a local (extra-parameter convention, as in qsort and swap) so the
# function does not overwrite a global of the same name.
function bar(characters,    result) {
    result = ""
    for (; characters > 0; characters--) result = result BAR_CHARACTER
    return result
}
# Copy the indices of A into B[1..n], hand B to qsort for ordering, and return n.
# B is emptied first so entries left by an earlier, larger sort cannot linger.
# idx and count are locals (extra-parameter convention, as in qsort and swap)
# so the function does not overwrite globals.
function qsorti(A, B,    idx, count) {
    split("", B)
    count = 0
    for (idx in A) B[++count] = idx
    qsort(B, 1, count)
    return count
}
# Below code taken from http://awk.info/?quicksort to provide a numerical sort due to
# the absence of a three parameter asorti function in older versions of Awk (e.g. v3.1.3).
# Sort A[left..right] in place into ascending numeric order using quicksort
# with a randomly chosen pivot. scan and boundary are locals.
function qsort(A, left, right,    scan, boundary) {
    if (left < right) {
        # Move a random element to the front to act as the pivot.
        swap(A, left, left + int((right - left + 1) * rand()))

        # Gather everything smaller than the pivot directly behind it.
        boundary = left
        for (scan = left + 1; scan <= right; scan++) {
            if (A[scan] + 0 < A[left] + 0) {
                boundary++
                swap(A, boundary, scan)
            }
        }

        # Drop the pivot between the two partitions, then sort each side.
        swap(A, left, boundary)
        qsort(A, left, boundary - 1)
        qsort(A, boundary + 1, right)
    }
}
# Exchange the elements A[i] and A[j] in place. held is a local.
function swap(A, i, j,    held) {
    held = A[j]
    A[j] = A[i]
    A[i] = held
}