"""
Noun phrase extraction using Python's regular expression library.
Only for the "SimpleNP" grammar.
"""
from pkg_resources import resource_filename
import sys,re,os
from collections import Counter
import cPickle as pickle
def logmsg(s):
    # would be better to use python logger
    print>>sys.stderr, "[phrasemachine] %s" % s
############## SimpleNP
## Uses a five-tag coarse grammar.
## tagset: A D P N O
# Requires conversion from PTB or Petrov/Gimpel tags to our system.
# "Coarse*" indicates petrov/gimpel
# Grammar change from the FST version: can't repeat NUM in both adj and noun.
# OLD COARSEMAP!
# coarsemap = {
# 'A': "JJ JJR JJS CoarseADJ CD CoarseNUM".split(),
# 'D': "DT CoarseDET".split(),
# 'P': "IN TO CoarseADP".split(),
# 'N': "NN NNS NNP NNPS FW CoarseNOUN".split(),
# # all other tags get O
# }
# New Coarsemap:
coarsemap = {
    'A': "JJ JJR JJS CoarseADJ CD CoarseNUM A".split(),
    'D': "DT CoarseDET D".split(),
    'P': "IN TO CoarseADP P".split(),
    'N': "NN NNS NNP NNPS FW CoarseNOUN N S Z ^".split(),
    # all other tags get O
}
## OLDER ATTEMPT: tried to use direct tags as port from foma.
## but this was annoying. have to map back to token positions at the end.
## probably slower too since the python regex compiler is not as smart as foma
# def regex_or(items):
# return '|'.join(re.escape(x) for x in items)
# Adj = regex_or("JJ JJR JJS CD CoarseADJ CoarseNUM".split())
# Det = regex_or("DT CoarseDET".split())
# Prep= regex_or("IN TO CoarseADP".split())
# Noun= regex_or("NN NNS NNP NNPS FW CD CoarseNOUN CoarseNUM".split())
# ## convention: SPACES separate tags.
# BaseNP = "(({Adj}|{Noun}) )*({Noun} )+".format(**globals())
# PP = "{Prep} ({Det} )*{BaseNP}".format(**globals())
# NP = "{BaseNP}({PP} )*".format(**globals())
tag2coarse = {}
for coarsetag, inputtags in coarsemap.items():
    for intag in inputtags:
        assert intag not in tag2coarse
        tag2coarse[intag] = coarsetag
## The grammar!
SimpleNP = "(A|N)*N(PD*(A|N)*N)*"
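# Illustrative matches for the grammar above (assumed tag strings): it accepts a
# base noun phrase, optionally extended by prepositional phrases.
#   "AN"     (red cat)            -> match
#   "ANPDN"  (red cat in the hat) -> match
#   "DAN"    (the red cat)        -> no match (a leading determiner is excluded)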
def coarse_tag_str(pos_seq):
    """Convert POS sequence to our coarse system, formatted as a string."""
    global tag2coarse
    tags = [tag2coarse.get(tag, 'O') for tag in pos_seq]
    return ''.join(tags)
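# Illustrative examples (assumed inputs), following the mapping above; any tag
# not covered by coarsemap becomes 'O':
# >>> coarse_tag_str(["DT","JJ","NN","IN","DT","NN"])
# 'DANPDN'
# >>> coarse_tag_str(["VB","JJ","NN"])
# 'OAN'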
# POS extraction assuming list of POS tags as input.
# >>> pyre.extract_finditer(["VB","JJ","NN","NN","QQ","QQ",])
# [(1, 4)]
# >>> pyre.extract_ngram_filter(["VB","JJ","NN","NN","QQ","QQ",])
# [(1, 3), (1, 4), (2, 3), (2, 4), (3, 4)]
def extract_finditer(pos_seq, regex=SimpleNP):
    """The "GreedyFSA" method in Handler et al. 2016.
    Returns token position spans of valid ngrams."""
    ss = coarse_tag_str(pos_seq)
    def gen():
        for m in re.finditer(regex, ss):
            yield (m.start(), m.end())
    return list(gen())
def extract_ngram_filter(pos_seq, regex=SimpleNP, minlen=1, maxlen=8):
    """The "FilterFSA" method in Handler et al. 2016.
    Returns token position spans of valid ngrams."""
    ss = coarse_tag_str(pos_seq)
    def gen():
        for s in xrange(len(ss)):
            for n in xrange(minlen, 1 + min(maxlen, len(ss) - s)):
                e = s + n
                substr = ss[s:e]
                if re.match(regex + "$", substr):
                    yield (s, e)
    return list(gen())
def extract_JK(pos_seq):
    """The 'JK' method in Handler et al. 2016.
    Returns token positions of valid ngrams."""
    def find_ngrams(input_list, num_):
        '''get ngrams of length num_ from the input list'''
        return zip(*[input_list[i:] for i in range(num_)])
    # patterns copied from Manning and Schutze, chapter 5
    patterns = set(['AN', 'NN', 'AAN', 'ANN', 'NAN', 'NNN', 'NPN'])
    pos_seq = [tag2coarse.get(tag, 'O') for tag in pos_seq]
    pos_seq = [(i, p) for i, p in enumerate(pos_seq)]
    ngrams = [ngram for n in range(1, 4) for ngram in find_ngrams(pos_seq, n)]
    def stringify(s):
        return "".join(a[1] for a in s)
    def positionify(s):
        return tuple(a[0] for a in s)
    ngrams = filter(lambda x: stringify(x) in patterns, ngrams)
    return [set(positionify(n)) for n in ngrams]
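# Illustrative example (assumed input): with coarse tags A N N, the bigrams "AN"
# and "NN" and the trigram "ANN" all match the JK patterns, so three position
# sets come back.
# >>> extract_JK(["JJ","NN","NN"])
# [set([0, 1]), set([1, 2]), set([0, 1, 2])]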
########
def unicodify(s, encoding='utf8', errors='ignore'):
    # Force conversion to unicode
    if isinstance(s, unicode): return s
    if isinstance(s, str): return s.decode(encoding, errors)
    return unicode(s)
def safejoin(list_of_str_or_unicode):
    ## can accept a list of str objects, or a list of unicodes.
    ## safely joins them, returning the same type.
    xx = list_of_str_or_unicode
    if not xx:
        return u""
    if isinstance(xx[0], str):
        return ' '.join(xx)
    if isinstance(xx[0], unicode):
        return u' '.join(xx)
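# Illustrative examples (assumed inputs): safejoin preserves the str/unicode
# type of its input, and unicodify decodes byte strings (UTF-8 by default).
# >>> safejoin(["red", "cat"])
# 'red cat'
# >>> safejoin([u"red", u"cat"])
# u'red cat'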
#########
class NLTKTagger:
    '''
    class that supplies part of speech tags using NLTK
    note: avoids the NLTK downloader (see __init__ method)
    '''
    def __init__(self):
        # importing here, instead of at the top of the file, so that the module
        # still imports for users who don't have nltk installed
        import nltk
        from nltk.tag import PerceptronTagger
        from nltk.tokenize import TreebankWordTokenizer
        tokenizer_fn = os.path.abspath(resource_filename('phrasemachine.data', 'punkt.english.pickle'))
        tagger_fn = os.path.abspath(resource_filename('phrasemachine.data', 'averaged_perceptron_tagger.pickle'))
        # Load the tagger from the bundled pickle (avoids the NLTK downloader)
        self.tagger = PerceptronTagger(load=False)
        self.tagger.load(tagger_fn)
        # note: nltk.word_tokenize calls the TreebankWordTokenizer, but goes through the downloader.
        # Instantiating the TreebankWordTokenizer directly skips the downloader; it uses
        # PTB-style regex tokenization, so no downloads are needed.
        # https://github.com/nltk/nltk/blob/develop/nltk/tokenize/treebank.py#L25
        self.tokenize = TreebankWordTokenizer().tokenize
        self.sent_detector = nltk.data.load(tokenizer_fn)

    # http://www.nltk.org/book/ch05.html
    def tag_text(self, text):
        '''take input text and return tokens with part-of-speech tags using NLTK'''
        sents = self.sent_detector.tokenize(text)  # TODO: this will fail on some unicode chars; it seems to assume ASCII
        word_pos_pairs = []
        all_tokens = []
        for sent in sents:
            tokens = self.tokenize(sent)
            all_tokens = all_tokens + tokens
            word_pos_pairs = word_pos_pairs + self.tagger.tag(tokens)
        return {'tokens': all_tokens, 'pos': [tag for (w, tag) in word_pos_pairs]}

    def tag_tokens(self, tokens):
        word_pos_pairs = self.tagger.tag(tokens)
        return {'tokens': tokens, 'pos': [tag for (w, tag) in word_pos_pairs]}
def get_stdeng_nltk_tagger(suppress_errors=False):
    try:
        tagger = NLTKTagger()
        throw_away = tagger.tag_text("The red cat sat down.")  # smoke test that the models load and run
        return tagger
    except ImportError:
        if not suppress_errors: raise
    except LookupError:
        if not suppress_errors: raise
    return None
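# Example usage (a sketch; requires the pickled models bundled under
# phrasemachine.data, and the exact tags depend on the tagger model):
# >>> tagger = get_stdeng_nltk_tagger()
# >>> tagger.tag_text("The red cat sat down.")
# {'tokens': ['The', 'red', 'cat', 'sat', 'down', '.'],
#  'pos': ['DT', 'JJ', 'NN', 'VBD', 'RB', '.']}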
SPACY_WRAPPER = None
class SpacyTagger:
    # https://spacy.io/
    def __init__(self):
        self.spacy_object = None

    def tag_text(self, text):
        text = unicodify(text)
        doc = self.spacy_object(text)
        return {
            'pos': [token.tag_ for token in doc],
            'tokens': [token.text for token in doc],
        }

    def tag_tokens(self, tokens):
        # tokens: a list of strings
        # todo: would be better to force spacy to use the given tokenization
        newtext = safejoin(tokens)
        newtext = unicodify(newtext)  ## spacy wants unicode objects only. problem if user gave us a string.
        return self.tag_text(newtext)
def get_stdeng_spacy_tagger(suppress_errors=False):
    global SPACY_WRAPPER
    if SPACY_WRAPPER is not None:
        return SPACY_WRAPPER
    try:
        import spacy
        SPACY_WRAPPER = SpacyTagger()
        SPACY_WRAPPER.spacy_object = spacy.load('en', parser=False, entity=False)
        return SPACY_WRAPPER
    except ImportError:
        if not suppress_errors: raise
    except RuntimeError:
        ## this seems to happen if the 'en' model is not installed. it might
        ## look like this:
        # RuntimeError: Model 'en' not installed. Please run 'python -m spacy.en.download' to install latest compatible model.
        if not suppress_errors: raise
    return None
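# Example usage (a sketch; assumes a spaCy 'en' model compatible with the
# spacy.load call above is installed, and the exact PTB tags depend on that model):
# >>> spacy_tagger = get_stdeng_spacy_tagger()
# >>> spacy_tagger.tag_text(u"the red cat")
# {'pos': [u'DT', u'JJ', u'NN'], 'tokens': [u'the', u'red', u'cat']}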
TAGGER_NAMES = {
    'nltk': get_stdeng_nltk_tagger,
    'spacy': get_stdeng_spacy_tagger,
    # 'twitter': None,
}
def get_phrases(text=None, tokens=None, postags=None, tagger='nltk', grammar='SimpleNP', regex=None, minlen=2, maxlen=8, output='counts'):
    """Given a text (or POS tag sequence), return the phrases matching the given
    grammar. Works on documents or sentences.
    Returns a dict with one or more keys with the phrase information.

    text: the text of the document. If supplied, we will try to POS tag it.

    You can also do your own tokenization and/or tagging and supply them as
    'tokens' and/or 'postags', which are lists of strings (of the same length).
        - Must supply both to get phrase counts back.
        - With only postags, can get phrase token spans back.
        - With only tokens, we will try to POS-tag them if possible.

    output: a string, or list of strings, of the information to return. Options include:
        - counts: a Counter with phrase frequencies. (default)
        - token_spans: a list of the token spans of each matched phrase. This is
          a list of (start, end) pairs of integers, which refer to token positions.
        - pos, tokens can be returned too.

    tagger: if you're passing in raw text, you can supply your own tagger, from one
        of the get_*_tagger() functions. If this is not supplied, we will try to load one.

    grammar: the grammar to use. Only one option right now...

    regex: a custom regex to use instead of a premade grammar. Currently, this
        must work on the 5-tag coarse system described near the top of this file.
    """
    global SimpleNP
    ## try to get values for both 'postags' and 'tokens', parallel lists of strings
    if postags is None:
        if isinstance(tagger, (str, unicode)):
            assert tagger in TAGGER_NAMES, "We don't support tagger %s" % tagger
            tagger = TAGGER_NAMES[tagger]()
        # otherwise, assume it's one of our wrapper *Tagger objects
        d = None
        if tokens is not None:
            d = tagger.tag_tokens(tokens)
        elif text is not None:
            d = tagger.tag_text(text)
        else:
            raise Exception("Need to supply text or tokens.")
        postags = d['pos']
        tokens = d['tokens']
    if regex is None:
        if grammar == 'SimpleNP':
            regex = SimpleNP
        else:
            assert False, "Don't know grammar %s" % grammar
    phrase_tokspans = extract_ngram_filter(postags, regex=regex, minlen=minlen, maxlen=maxlen)
    ## Handle multiple possible return info outputs
    if isinstance(output, str):
        output = [output]
    our_options = set()
    def retopt(x):
        our_options.add(x)
        return x in output
    ret = {}
    ret['num_tokens'] = len(postags)
    if retopt('token_spans'):
        ret['token_spans'] = phrase_tokspans
    if retopt('counts'):
        counts = Counter()
        for (start, end) in phrase_tokspans:
            phrase = safejoin([tokens[i] for i in xrange(start, end)])
            phrase = phrase.lower()
            counts[phrase] += 1
        ret['counts'] = counts
    if retopt('pos'):
        ret['pos'] = postags
    if retopt('tokens'):
        ret['tokens'] = tokens
    xx = set(output) - our_options
    if xx:
        raise Exception("Don't know how to handle output options: %s" % list(xx))
    return ret
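# Example usage (illustrative; the exact tags, and therefore the phrases found,
# depend on which tagger is loaded):
# >>> get_phrases("the red cat sat on the large mat")['counts']
# Counter({'red cat': 1, 'large mat': 1})
# >>> get_phrases(postags=["DT","JJ","NN"], output='token_spans')['token_spans']
# [(1, 3)]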
def ark_get_phrases_wrapper(ark_pos_tags_list):
    tokens = []
    pos_tags = []
    for token, tag, confidence in ark_pos_tags_list:
        tokens.append(token)
        pos_tags.append(tag)
    phrases_dict = get_phrases(tokens=tokens, postags=pos_tags, minlen=1)
    phrases = phrases_dict['counts'].keys()
    return phrases
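# Illustrative example (assumed input): the wrapper takes (token, tag, confidence)
# triples, with tags already in the ARK/Gimpel-style coarse tagset, and returns
# the distinct lowercased phrases.
# >>> ark_get_phrases_wrapper([("the", "D", 0.99), ("red", "A", 0.98), ("cat", "N", 0.99)])
# ['red cat', 'cat']   # dict key order is not guaranteed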
if __name__ == "__main__":
    fileName = "HSLLD/HV1/MT/admmt1.cha"
    pos_tags_filename = "pos_tags/" + fileName
    tokens = []
    pos_tags = []
    pos_tags_dict = pickle.load(open(pos_tags_filename))
    for sentence in pos_tags_dict.values():
        for token, tag, confidence in sentence:
            tokens.append(token)
            pos_tags.append(tag)
    phrases_dict = get_phrases(tokens=tokens, postags=pos_tags, minlen=1)
    print(phrases_dict)
    for val in pos_tags_dict.values():
        print(ark_get_phrases_wrapper(val))