#!/usr/bin/env python
from scipy import sparse
import numpy
import math
import random
class Counts:
    """
    Counts
    =====
    Class to perform general count operations on featurized instances
    Used as a parent class in several classes
    Parameters
    -----
    instances : scipy.sparse.csr_matrix
        sparse matrix of featurized instances, with a row per document and a column per feature frequency
    labels : list
        list of labels (str) of the instances,
        each index of a label corresponds to the index of the instance
    """
    def __init__(self, instances, labels):
        self.instances = instances
        self.labels = labels
    def count_document_frequency(self, label=False):
        """
        Feature counter
        =====
        Function to return document counts of all features
        Parameters
        -----
        label : str
            Choose to count the frequency with which each feature co-occurs with the given label
            If False, the total document count is returned
        Returns
        -----
        document_frequency : dict
            Counts of the number of documents or labels with which a feature occurs
            key : The feature index (int)
            value : The document / label count of the feature index (int)
        """
        if label:
            # select only the rows whose label matches
            target_instances = self.instances[list(numpy.where(numpy.array(self.labels) == label)[0])]
        else:
            target_instances = self.instances
        feature_indices = range(self.instances.shape[1])
        # sum each feature column; this equals the document count when vectors are binary
        feature_counts = target_instances.sum(axis=0).tolist()[0]
        document_frequency = dict(zip(feature_indices, feature_counts))
        return document_frequency
    def count_label_frequency(self):
        """
        Label counter
        =====
        Function to return counts of all document labels
        Returns
        -----
        label_frequency : dict
            Counts of each label
            key : The label (str)
            value : The count of the label (int)
        """
        label_frequency = {}
        for label in set(self.labels):
            label_frequency[label] = self.labels.count(label)
        return label_frequency
    def count_label_feature_frequency(self):
        """
        Frequency calculator
        =====
        Function to calculate the frequency of each feature in combination with specific labels
        Returns
        -----
        label_feature_frequency : dict of dicts
            key1 : label, str
            key2 : feature index, int
            value : number of times the two co-occur on the document level, int
        """
        label_feature_frequency = {}
        # iterate over the unique labels to avoid recounting per instance
        for label in set(self.labels):
            label_feature_frequency[label] = self.count_document_frequency(label)
        return label_feature_frequency
    def count_idf(self):
        """
        Inverse Document Frequency counter
        =====
        Function to calculate the inverse document frequency of every feature
        Returns
        -----
        idf : dict
            The idf of every feature based on the training documents
            key : The feature index
            value : The idf of the feature index
        """
        idf = dict.fromkeys(range(self.instances.shape[1]), 0)  # initialize for all features
        num_docs = self.instances.shape[0]
        feature_counts = self.count_document_frequency()
        for feature in feature_counts.keys():
            idf[feature] = math.log((num_docs / feature_counts[feature]), 10) if feature_counts[feature] > 0 else 0
        return idf
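
# A minimal usage sketch of Counts (the toy matrix and labels below are
# illustrative assumptions, not data shipped with this module): rows are
# documents, columns are features, and the label list runs parallel to the rows.
def _example_counts():
    docs = sparse.csr_matrix([[1, 0, 2],
                              [0, 1, 1],
                              [1, 1, 0]])
    cnt = Counts(docs, ['pos', 'neg', 'pos'])
    print(cnt.count_document_frequency())  # {0: 2, 1: 2, 2: 3}
    print(cnt.count_label_frequency())     # {'pos': 2, 'neg': 1}
    print(cnt.count_idf())                 # log10(3 / column count) per feature
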
def balance_data(instances, labels):
    # identify the lowest label frequency
    unique_labels = list(set(labels))
    label_count_sorted = sorted([(label, labels.count(label)) for label in unique_labels], key=lambda k: k[1])
    least_frequent_indices = [i for i, label in enumerate(labels) if label == label_count_sorted[0][0]]
    least_frequent_count = label_count_sorted[0][1]
    balanced_instances = instances[least_frequent_indices, :]
    balanced_labels = [label_count_sorted[0][0]] * least_frequent_count
    # impose the lowest frequency on the other labels by random downsampling
    for cursorlabel in [lc[0] for lc in label_count_sorted[1:]]:
        label_indices = [i for i, label in enumerate(labels) if label == cursorlabel]
        samples = random.sample(label_indices, least_frequent_count)
        sampled_instances = instances[samples, :]
        balanced_instances = sparse.vstack((balanced_instances, sampled_instances), format='csr')
        balanced_labels.extend([cursorlabel] * least_frequent_count)
    return balanced_instances, balanced_labels
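
# A minimal sketch of balance_data (toy data is an illustrative assumption):
# every class is randomly downsampled to the size of the least frequent class.
def _example_balance_data():
    docs = sparse.csr_matrix([[1, 0], [0, 1], [1, 1], [0, 2]])
    labels = ['pos', 'pos', 'pos', 'neg']
    balanced_docs, balanced_labels = balance_data(docs, labels)
    print(balanced_docs.shape)  # (2, 2): the single 'neg' row plus one sampled 'pos' row
    print(balanced_labels)      # ['neg', 'pos']
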
def return_document_frequency(instances, labels):
    cnt = Counts(instances, labels)
    document_frequency = cnt.count_document_frequency()
    return document_frequency

def return_idf(instances, labels):
    cnt = Counts(instances, labels)
    idf = cnt.count_idf()
    return idf
def return_infogain(instances, labels):
    """
    Infogain calculator
    =====
    Function to calculate the information gain of each feature
    Returns
    -----
    infogain : dict
        key : feature index, int
        value : information gain, float
    """
    # some initial calculations
    infogain = dict.fromkeys(range(instances.shape[1]), 0)
    cnt = Counts(instances, labels)
    len_instances = instances.shape[0]
    feature_frequency = cnt.count_document_frequency()
    label_frequency = cnt.count_label_frequency()
    label_feature_frequency = cnt.count_label_feature_frequency()
    unique_labels = label_frequency.keys()
    label_probability = [(label_frequency[label] / len_instances) for label in unique_labels]
    initial_entropy = -sum([prob * math.log(prob, 2) for prob in label_probability if prob != 0])
    # assign infogain values to each feature
    for feature in feature_frequency.keys():
        # calculate the entropy among documents that contain the feature
        frequency = feature_frequency[feature]
        feature_probability = frequency / len_instances
        if frequency > 0:
            positive_label_probabilities = [(label_feature_frequency[label][feature] / frequency) for label in unique_labels]
            positive_entropy = -sum([prob * math.log(prob, 2) for prob in positive_label_probabilities if prob != 0])
        else:
            positive_entropy = 0
        # calculate the entropy among documents that lack the feature
        inverse_frequency = len_instances - frequency
        negative_probability = inverse_frequency / len_instances
        if inverse_frequency > 0:
            negative_label_probabilities = [((label_frequency[label] - label_feature_frequency[label][feature]) / inverse_frequency) for label in unique_labels]
            negative_entropy = -sum([prob * math.log(prob, 2) for prob in negative_label_probabilities if prob != 0])
        else:
            negative_entropy = 0
        # weigh the two entropies by the feature probability to obtain the conditional entropy
        final_entropy = (feature_probability * positive_entropy) + (negative_probability * negative_entropy)
        infogain[feature] = initial_entropy - final_entropy
    return infogain
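
# A small check of return_infogain on illustrative toy data: feature 0 below
# perfectly separates the two labels, so its information gain should equal the
# full initial entropy of 1 bit, while the omnipresent feature 1 gains nothing.
def _example_infogain():
    docs = sparse.csr_matrix([[1, 1], [1, 1], [0, 1], [0, 1]])
    labels = ['pos', 'pos', 'neg', 'neg']
    print(return_infogain(docs, labels))  # {0: 1.0, 1: 0.0}
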
def return_binary_vectors(instances):
    # replace every stored value with 1, keeping the sparsity structure intact
    binary_values = numpy.array([1 for cell in instances.data])
    binary_vectors = sparse.csr_matrix((binary_values, instances.indices, instances.indptr), shape=instances.shape)
    return binary_vectors
def return_tfidf_vectors(instances, idfs):
    feature_idf_ordered = sparse.csr_matrix([idfs[feature] for feature in sorted(idfs.keys())])
    tfidf_vectors = instances.multiply(feature_idf_ordered)
    return tfidf_vectors
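
# A minimal sketch of building tf-idf vectors with the helpers above (the toy
# matrix is an illustrative assumption): idfs are derived from the training
# matrix via return_idf, and every term frequency is scaled by its feature's idf.
def _example_tfidf():
    docs = sparse.csr_matrix([[1, 0, 2], [0, 1, 1]])
    idfs = return_idf(docs, ['pos', 'neg'])
    tfidf = return_tfidf_vectors(docs, idfs)
    print(tfidf.toarray())  # feature 2 occurs in every document, so its idf (and column) is 0
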
def return_infogain_vectors(instances, infogain):
    infogain_ordered = sparse.csr_matrix([infogain[feature] for feature in sorted(infogain.keys())])
    instances_binary = return_binary_vectors(instances)
    infogain_vectors = instances_binary.multiply(infogain_ordered)
    return infogain_vectors
def return_top_features(feature_weights, prune):
    top_features = sorted(feature_weights, key=feature_weights.get, reverse=True)[:prune]
    return top_features

def compress_vectors(instances, top_features):
    compressed_vectors = instances[:, top_features]
    return compressed_vectors
def align_vectors(instances, target_vocabulary, source_vocabulary):
    source_feature_indices = dict([(feature, i) for i, feature in enumerate(source_vocabulary)])
    target_feature_indices = dict([(feature, i) for i, feature in enumerate(target_vocabulary)])
    keep_features = list(set(source_vocabulary).intersection(set(target_vocabulary)))
    transform_dict = dict([(target_feature_indices[feature], source_feature_indices[feature]) for feature in keep_features])
    num_instances = instances.shape[0]
    columns = []
    for index in range(len(target_vocabulary)):
        try:
            columns.append(instances.getcol(transform_dict[index]))
        except KeyError:
            # target feature absent from the source vocabulary; insert a zero column
            columns.append(sparse.csr_matrix([[0]] * num_instances))
    aligned_vectors = sparse.hstack(columns).tocsr()
    return aligned_vectors
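
# A minimal end-to-end sketch tying the helpers together; all toy data and the
# vocabularies below are illustrative assumptions, not fixtures of this module.
if __name__ == '__main__':
    train = sparse.csr_matrix([[1, 0, 1], [1, 1, 0], [0, 1, 1], [0, 1, 0]])
    labels = ['pos', 'pos', 'neg', 'neg']
    # weight features by information gain and prune to the two strongest
    infogain = return_infogain(train, labels)
    top = return_top_features(infogain, 2)
    pruned = compress_vectors(train, top)
    print(top, pruned.shape)  # the two most informative feature indices, (4, 2)
    # align test vectors with source vocabulary ['b', 'c'] to the training
    # (target) vocabulary ['a', 'b', 'c']; the unseen feature 'a' becomes a zero column
    test = sparse.csr_matrix([[3, 1]])
    aligned = align_vectors(test, ['a', 'b', 'c'], ['b', 'c'])
    print(aligned.toarray())  # [[0 3 1]]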