forked from google/active-learning
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgraph_density.py
92 lines (81 loc) · 3.57 KB
/
graph_density.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# Copyright 2017 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Diversity promoting sampling method that uses graph density to determine
most representative points.
This is an implementation of the method described in
https://www.mpi-inf.mpg.de/fileadmin/inf/d2/Research_projects_files/EbertCVPR2012.pdf
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import copy
from sklearn.neighbors import kneighbors_graph
from sklearn.metrics import pairwise_distances
import numpy as np
from sampling_methods.sampling_def import SamplingMethod
class GraphDensitySampler(SamplingMethod):
"""Diversity promoting sampling method that uses graph density to determine
most representative points.
"""
def __init__(self, X, y, seed):
self.name = 'graph_density'
self.X = X
self.flat_X = self.flatten_X()
# Set gamma for gaussian kernel to be equal to 1/n_features
self.gamma = 1. / self.X.shape[1]
self.compute_graph_density()
def compute_graph_density(self, n_neighbor=10):
# kneighbors graph is constructed using k=10
connect = kneighbors_graph(self.flat_X, n_neighbor,p=1)
# Make connectivity matrix symmetric, if a point is a k nearest neighbor of
# another point, make it vice versa
neighbors = connect.nonzero()
inds = zip(neighbors[0],neighbors[1])
# Graph edges are weighted by applying gaussian kernel to manhattan dist.
# By default, gamma for rbf kernel is equal to 1/n_features but may
# get better results if gamma is tuned.
for entry in inds:
i = entry[0]
j = entry[1]
distance = pairwise_distances(self.flat_X[[i]],self.flat_X[[j]],metric='manhattan')
distance = distance[0,0]
weight = np.exp(-distance * self.gamma)
connect[i,j] = weight
connect[j,i] = weight
self.connect = connect
# Define graph density for an observation to be sum of weights for all
# edges to the node representing the datapoint. Normalize sum weights
# by total number of neighbors.
self.graph_density = np.zeros(self.X.shape[0])
for i in np.arange(self.X.shape[0]):
self.graph_density[i] = connect[i,:].sum() / (connect[i,:]>0).sum()
self.starting_density = copy.deepcopy(self.graph_density)
def select_batch_(self, N, already_selected, **kwargs):
# If a neighbor has already been sampled, reduce the graph density
# for its direct neighbors to promote diversity.
batch = set()
self.graph_density[already_selected] = min(self.graph_density) - 1
while len(batch) < N:
selected = np.argmax(self.graph_density)
neighbors = (self.connect[selected,:] > 0).nonzero()[1]
self.graph_density[neighbors] = self.graph_density[neighbors] - self.graph_density[selected]
batch.add(selected)
self.graph_density[already_selected] = min(self.graph_density) - 1
self.graph_density[list(batch)] = min(self.graph_density) - 1
return list(batch)
def to_dict(self):
output = {}
output['connectivity'] = self.connect
output['graph_density'] = self.starting_density
return output