-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathcount-unsorted.py
executable file
·70 lines (56 loc) · 2 KB
/
count-unsorted.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
#!/usr/bin/env python3
# SPDX-License-Identifier: WTFPL
# count-unsorted: like uniq(1)'s -c but does not require lines to be sorted
import locale
import signal
import sys
from argparse import ArgumentParser
from collections import Counter
from fileinput import input
# Adopt the user's locale settings.  A misconfigured environment (for example
# LANG naming a locale that is not installed) makes setlocale() raise
# locale.Error; that must not prevent the tool from running, so fall back to
# the locale that is already active.
try:
    locale.setlocale(locale.LC_ALL, "")
except locale.Error:
    pass
# Restore the default signal dispositions so that Ctrl-C and a closed pipe
# (e.g. `count-unsorted | head`) end the process quietly instead of raising
# KeyboardInterrupt / BrokenPipeError with a traceback.
signal.signal(signal.SIGINT, signal.SIG_DFL)
# SIGPIPE is only defined on Unix platforms.
if hasattr(signal, "SIGPIPE"):
    signal.signal(signal.SIGPIPE, signal.SIG_DFL)
def reversed_by_count(iterator):
    """Re-yield ``(key, count)`` pairs, flipping the order inside each run.

    A "run" is a maximal sequence of consecutive pairs sharing the same
    count.  Runs are emitted in the order they arrive, but the pairs inside
    each run come out last-to-first.

    Why this exists: ``Counter.most_common()`` is descending by count, with
    ties in order of first occurrence.  Plainly reversing that list gives
    ascending counts but also turns the ties around; feeding the reversed
    list through this generator puts the ties back in order of occurrence.

    :param iterator: iterable of ``(key, count)`` pairs
    :return: generator of ``(key, count)`` tuples
    """
    run = []
    for key, count in iterator:
        if run and run[0][1] != count:
            # the count changed: drain the finished run, newest entry first
            while run:
                yield run.pop()
        run.append((key, count))
    # drain whatever is left of the final run
    while run:
        yield run.pop()
# Command-line interface.
# The help texts describe what the script actually does: without options the
# output goes from least common to most common, and -r/--reverse (stored as
# `desc`) switches to most common first.
parser = ArgumentParser(
    description=(
        "Count occurrences of each line, like uniq -c but without requiring "
        "the input to be sorted."
    ),
    epilog="By default, sort from least common occurrences to most common.",
)
parser.add_argument(
    "-S", "--no-sort", action="store_false", default=True, dest="sort",
    help="Don't sort lines by number of occurrences, use order of appearance",
)
parser.add_argument(
    "-r", "--reverse", action="store_true", dest="desc",
    help="Sort lines from most common occurrences to least common",
)
parser.add_argument(
    "files", nargs="*",
    help="Files to read; standard input is read if none is given",
)
args = parser.parse_args()
# Count every input line; keys are the raw bytes of the line, terminator
# included.
counter = Counter()

# open in binary because:
# - we don't care about their encoding
# - it avoids errors because of an incorrect setting
# - it's faster if we avoid decoding/encoding
# it won't work with utf-16 though
for line in input(args.files, mode="rb"):
    # The last line of a file may lack its terminator.  Add it, so that the
    # line is counted together with identical terminated lines and so that
    # the next entry is not glued to it in the output.
    if not line.endswith(b"\n"):
        line += b"\n"
    counter[line] += 1

if args.sort:
    # most_common(): highest counts first, ties in order of first appearance
    elems = counter.most_common()
    if not args.desc:
        # lowest counts first, still keeping ties in order of first appearance
        elems = reversed_by_count(reversed(elems))
else:
    # a Counter iterates in order of first appearance
    elems = counter.items()

for line, count in elems:
    # every key ends with a newline, so don't append one
    sys.stdout.buffer.write(f"{count}: ".encode() + line)