-
Notifications
You must be signed in to change notification settings - Fork 0
/
evaluate.py
136 lines (109 loc) · 4.46 KB
/
evaluate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
import argparse
import numpy as np
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.utils import shuffle as skshuffle
from collections import defaultdict
from scipy import sparse
import warnings
warnings.filterwarnings("ignore")
from utils import load_labels, load_embedding, load_labels_youtube
class TopKRanker(OneVsRestClassifier):
    """One-vs-rest classifier whose predict() assigns a caller-chosen number
    of labels to each sample instead of thresholding probabilities."""

    def predict(self, X, top_k_list):
        """Return a sparse 0/1 indicator matrix of predicted labels.

        top_k_list[i] is how many labels to assign to sample i: the
        classes with the i-th row's highest predicted probabilities.
        """
        assert X.shape[0] == len(top_k_list)
        probs = np.asarray(super(TopKRanker, self).predict_proba(X))
        prediction = sparse.lil_matrix(probs.shape)
        for row, num_labels in enumerate(top_k_list):
            # Indices of the num_labels most probable classes for this row.
            top_indices = probs[row, :].argsort()[-num_labels:]
            for cls in self.classes_[top_indices].tolist():
                prediction[row, cls] = 1
        return prediction
def classifier(X_train, y_train, X_test, y_test):
    """Fit a TopKRanker (one-vs-rest logistic regression) on the training
    split and return a dict of micro/macro/samples/weighted F1 scores."""
    model = TopKRanker(LogisticRegression())
    model.fit(X_train, y_train)
    # Each test sample is assigned exactly as many labels as it truly has,
    # so only the ranking of the predicted probabilities is evaluated.
    top_k_list = list(map(int, y_test.sum(axis=1).T.tolist()[0]))
    predictions = model.predict(X_test, top_k_list)
    return {
        avg: f1_score(y_test, predictions, average=avg)
        for avg in ("micro", "macro", "samples", "weighted")
    }
def evaluate(emb, number_shuffles=5, label=None):
    """Evaluate node embeddings via multi-label node classification.

    Parameters
    ----------
    emb : str or array-like
        Path to an embedding file, or the embedding matrix itself.
    number_shuffles : int
        How many independent train/test shuffles to average over.
    label : str or None
        Path to the label file.  When None, falls back to the global CLI
        ``args.label`` (only valid when this module is run as a script).

    Returns
    -------
    dict mapping train-percentage -> mean micro-F1 over the shuffles.
    """
    if isinstance(emb, str):
        # Bug fix: previously read the global args.emb, silently ignoring
        # the `emb` argument when called programmatically with a path.
        features_matrix = load_embedding(emb)
    else:
        features_matrix = emb
    print(features_matrix.shape)
    num_nodes = features_matrix.shape[0]
    # Bug fix: the None check must come first — "youtube" in None raises
    # TypeError, so the original crashed whenever label was omitted.
    if label is None:
        # NOTE(review): relies on the module-level `args` from __main__;
        # only reachable when run as a script.
        label_matrix = load_labels(args.label, num_nodes)
    elif "youtube" in label:
        # YouTube labels cover only a subset of nodes; restrict the
        # embedding matrix to the labeled nodes.
        label_matrix, labeled_nodes = load_labels_youtube(label, num_nodes)
        features_matrix = features_matrix[labeled_nodes]
        num_nodes = len(labeled_nodes)
    else:
        label_matrix = load_labels(label, num_nodes)
    # Bug fix: the original used random_state=0 for every shuffle, making
    # all `number_shuffles` splits identical and the averaging meaningless.
    shuffles = [
        skshuffle(features_matrix, label_matrix, random_state=seed)
        for seed in range(number_shuffles)
    ]
    all_results = defaultdict(list)
    # Large graphs use small training fractions to keep fitting tractable.
    if num_nodes < 20000:
        training_percents = [0.1, 0.3, 0.5, 0.7, 0.9]
    else:
        training_percents = [0.01, 0.03, 0.05, 0.07, 0.09]
    for train_percent in training_percents:
        training_size = int(train_percent * num_nodes)
        for X, y in shuffles:
            # Reuse the shared helper instead of duplicating its
            # fit / top-k predict / F1 logic inline.
            results = classifier(X[:training_size, :], y[:training_size, :],
                                 X[training_size:, :], y[training_size:, :])
            all_results[train_percent].append(results)
    print('Results, using embeddings of dimensionality',
          features_matrix.shape[1])
    print('-------------------')
    print('Train percent:', 'average f1-score')
    fres = dict()
    for train_percent in sorted(all_results.keys()):
        # Original also filled an unused `stder` array here; dropped.
        av = sum(r["micro"] for r in all_results[train_percent]) / number_shuffles
        print(train_percent, ":", av)
        fres[train_percent] = av
    return fres
def parse_args():
    """Build and parse the command-line options for the evaluation script."""
    p = argparse.ArgumentParser(description="Community Discover.")
    p.add_argument('--label', nargs='?', help='Input label file path',
                   default='data/PPI.cmty')
    p.add_argument('--emb', nargs='?', help='embeddings file path',
                   default='emb/PPI.emb')
    p.add_argument('--shuffle', type=int, help='number of shuffle',
                   default=4)
    return p.parse_args()
if __name__ == '__main__':
    # Script entry point: parse CLI options, then run the full evaluation.
    # NOTE: `args` is intentionally module-level — evaluate() falls back to
    # it when its `label` argument is None.
    args = parse_args()
    evaluate(args.emb, args.shuffle, args.label)