-
Notifications
You must be signed in to change notification settings - Fork 22
/
count_continuations.py
executable file
·127 lines (108 loc) · 4.98 KB
/
count_continuations.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
#!/usr/bin/env python3
"""Count the relative frequency of continuations in a corpus (from STDIN)."""
# The MIT License (MIT)
#
# Copyright (c) 2014 Florian Leitner <[email protected]>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
# This command-line tool takes a list of words, namely "continuations".
# Then, it counts how often each word is found in an input text coming from
# STDIN.
#
# Particularly, it counts:
# 1. capitalized occurrences after a dot or as the first word in a section,
# 2. in the input case after a dot, and
# 3. in the input case anywhere in the text (including after a dot in input
# case).
#
# Finally, these counts and a few percentages are reported on STDOUT.
# This makes it possible to measure the frequency of each word as a sentence
# starter and after the abbreviation marker versus its general corpus
# frequency in default cases.
import logging
import os
import sys
from argparse import ArgumentParser
import re
__author__ = 'Florian Leitner <[email protected]>'
__version__ = 1
# parser and arguments
epilog = "(C) 2015 {}. All rights reserved. License: MIT".format(__author__)
parser = ArgumentParser(usage='%(prog)s [options] TOKEN ...',
description=__doc__, epilog=epilog,
prog=os.path.basename(sys.argv[0]))
parser.add_argument('continuations', metavar='TOKEN', nargs='+',
help = 'lower-case continuation token(s)')
# logging/info options
parser.add_argument('--version', action='version',
version='v%s' % __version__)
parser.add_argument('--verbose', '-v', action='count', default=0,
help='increase log level [WARN]')
parser.add_argument('--quiet', '-q', action='count', default=0,
help='decrease log level [WARN]')
parser.add_argument('--abbreviations', action='store_true',
help='only count abbreviation-to-sentence-start usage')
args = parser.parse_args()
# logging setup
log_adjust = max(min(args.quiet - args.verbose, 2), -2) * 10
log_format = '%(levelname)-8s %(module) 10s: %(funcName)s %(message)s'
logging.basicConfig(level=logging.WARNING + log_adjust,
format=log_format)
logging.info('verbosity increased')
logging.debug('verbosity increased')
def sentence_start_pattern(continuation):
capitalized_continuation = continuation.capitalize()
pattern = r'(?:^|>|\t|\.\s)\s*{}\b'
return re.compile(pattern.format(capitalized_continuation))
def abbreviation_pattern(continuation):
pattern = r'\.\s+{}\b'
return re.compile(pattern.format(continuation))
inside_sentence = {}
sentence_start = {}
after_abbreviation = {}
for c in args.continuations:
inside_sentence[c] = [re.compile(r'\b{}\b'.format(c)), 0]
sentence_start[c] = [sentence_start_pattern(c), 0]
after_abbreviation[c] = [abbreviation_pattern(c), 0]
cases = (sentence_start, after_abbreviation) if args.abbreviations else \
(inside_sentence, sentence_start, after_abbreviation)
for line in sys.stdin:
for continuation in args.continuations:
for target in cases:
pattern_count = target[continuation]
pattern_count[1] += len(pattern_count[0].findall(line))
if not args.abbreviations:
print("Freq.SS | Likelih. | N.abbrev. | N.starters | N.inside | Word")
else:
print("Likelih. | N.abbrev. | N.starters | Word")
for continuation in inside_sentence:
starter_count = sentence_start[continuation][1]
inside_count = inside_sentence[continuation][1]
abbrev_count = after_abbreviation[continuation][1]
total = starter_count + inside_count
after_dot = starter_count + abbrev_count
ss_fraction = (starter_count / float(total)) if total > 0 else 0.0
likelihood = (abbrev_count / after_dot) if after_dot > 0 else 0.0
if not args.abbreviations:
print('%.3f' % ss_fraction, end=' | ')
print('%.3f' % likelihood, abbrev_count, starter_count,
sep=' | ', end=' | ')
if not args.abbreviations:
print(inside_count, end=' | ')
print(continuation)