-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.py
206 lines (178 loc) · 6.55 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
# utils.py
from typing import List
import numpy as np
class Indexer(object):
"""
Bijection between objects and integers starting at 0. Useful for mapping
labels, features, etc. into coordinates of a vector space.
Attributes:
objs_to_ints
ints_to_objs
"""
def __init__(self):
self.objs_to_ints = {}
self.ints_to_objs = {}
def __repr__(self):
return str([str(self.get_object(i)) for i in range(0, len(self))])
def __str__(self):
return self.__repr__()
def __len__(self):
return len(self.objs_to_ints)
def get_object(self, index):
"""
:param index: integer index to look up
:return: Returns the object corresponding to the particular index or None if not found
"""
if (index not in self.ints_to_objs):
return None
else:
return self.ints_to_objs[index]
def contains(self, object):
"""
:param object: object to look up
:return: Returns True if it is in the Indexer, False otherwise
"""
return self.index_of(object) != -1
def index_of(self, object):
"""
:param object: object to look up
:return: Returns -1 if the object isn't present, index otherwise
"""
if (object not in self.objs_to_ints):
return -1
else:
return self.objs_to_ints[object]
def add_and_get_index(self, object, add=True):
"""
Adds the object to the index if it isn't present, always returns a nonnegative index
:param object: object to look up or add
:param add: True by default, False if we shouldn't add the object. If False, equivalent to index_of.
:return: The index of the object
"""
if not add:
return self.index_of(object)
if (object not in self.objs_to_ints):
new_idx = len(self.objs_to_ints)
self.objs_to_ints[object] = new_idx
self.ints_to_objs[new_idx] = object
return self.objs_to_ints[object]
class Beam(object):
"""
Beam data structure. Maintains a list of scored elements like a Counter, but only keeps the top n
elements after every insertion operation. Insertion is O(n) (list is maintained in
sorted order), access is O(1). Still fast enough for practical purposes for small beams.
"""
def __init__(self, size):
self.size = size
self.elts = []
self.scores = []
def __repr__(self):
return "Beam(" + repr(list(self.get_elts_and_scores())) + ")"
def __str__(self):
return self.__repr__()
def __len__(self):
return len(self.elts)
def add(self, elt, score):
"""
Adds the element to the beam with the given score if the beam has room or if the score
is better than the score of the worst element currently on the beam
:param elt: element to add
:param score: score corresponding to the element
"""
if len(self.elts) == self.size and score < self.scores[-1]:
# Do nothing because this element is the worst
return
# If the list contains the item with a lower score, remove it
i = 0
while i < len(self.elts):
if self.elts[i] == elt and score > self.scores[i]:
del self.elts[i]
del self.scores[i]
i += 1
# If the list is empty, just insert the item
if len(self.elts) == 0:
self.elts.insert(0, elt)
self.scores.insert(0, score)
# Find the insertion point with binary search
else:
lb = 0
ub = len(self.scores) - 1
# We're searching for the index of the first element with score less than score
while lb < ub:
m = (lb + ub) // 2
# Check > because the list is sorted in descending order
if self.scores[m] > score:
# Put the lower bound ahead of m because all elements before this are greater
lb = m + 1
else:
# m could still be the insertion point
ub = m
# lb and ub should be equal and indicate the index of the first element with score less than score.
# Might be necessary to insert at the end of the list.
if self.scores[lb] > score:
self.elts.insert(lb + 1, elt)
self.scores.insert(lb + 1, score)
else:
self.elts.insert(lb, elt)
self.scores.insert(lb, score)
# Drop and item from the beam if necessary
if len(self.scores) > self.size:
self.elts.pop()
self.scores.pop()
def get_elts(self):
return self.elts
def get_elts_and_scores(self):
return zip(self.elts, self.scores)
def head(self):
return self.elts[0]
def maybe_add_feature(feats: List[int], feature_indexer: Indexer, add_to_indexer: bool, feat: str):
"""
:param feats: list[int] features that we've built so far
:param feature_indexer: Indexer object to apply
:param add_to_indexer: True if we should expand the Indexer, false otherwise. If false, we discard feat if it isn't
in the indexer
:param feat: new feature to index and potentially add
:return:
"""
if add_to_indexer:
feats.append(feature_indexer.add_and_get_index(feat))
else:
feat_idx = feature_indexer.index_of(feat)
if feat_idx != -1:
feats.append(feat_idx)
def score_indexed_features(feats, weights: np.ndarray):
"""
Computes the dot product over a list of features (i.e., a sparse feature vector)
and a weight vector (numpy array)
:param feats: List[int] or numpy array of int features
:param weights: numpy array
:return: the score
"""
score = 0.0
for feat in feats:
score += weights[feat]
return score
def test_beam():
print("TESTING BEAM")
beam = Beam(3)
beam.add("a", 5)
beam.add("b", 7)
beam.add("c", 6)
beam.add("d", 4)
print("Should contain b, c, a: %s" % beam)
beam.add("e", 8)
beam.add("f", 6.5)
print("Should contain e, b, f: %s" % beam)
beam.add("f", 9.5)
print("Should contain f, e, b: %s" % beam)
beam = Beam(5)
beam.add("a", 5)
beam.add("b", 7)
beam.add("c", 6)
beam.add("d", 4)
print("Should contain b, c, a, d: %s" % beam)
beam.add("e", 8)
beam.add("f", 6.5)
print("Should contain e, b, f, c, a: %s" % beam)
if __name__ == '__main__':
test_beam()