Skip to content

Commit

Permalink
Add comment for sketch
Browse files Browse the repository at this point in the history
  • Loading branch information
dom-lee committed Dec 12, 2022
1 parent 14495ab commit 0ecdd24
Show file tree
Hide file tree
Showing 5 changed files with 236 additions and 7 deletions.
20 changes: 13 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,25 @@ This repository is for the reproducibility project of the EECS553 (Machine Learning)
course. We verified the paper "A Tighter Analysis of Spectral Clustering, and
Beyond", published in ICML 2022.

## Additional Test that we excecuted
1. **Less-separated Synthetic Dataset**: run
`python experiments.py complete`
## Additional Experiments
1. **Less-separated Synthetic Dataset**:
run `python experiments.py complete`
- Change 'r' value at https://github.com/dom-lee/EECS553-Reproducibility-Spectral-Clustering/blob/06541f4ec59481cefdc981144c0b80f75715a451/pysc/datasets.py#L379

2. **Test on BSDS dataset with different standard deviation**: run
`python experiments.py bsds`
2. **Test on BSDS dataset with different parameters**:
run `python experiments.py bsds`
- We set a break condition so that only the first 25 images are clustered
-

3. **Test on MNIST dataset with different number of eigenvector for embedding**:
run
3. **Test on MNIST dataset with different parameters**:
run `python experiments.py mnist`
- Change parameter k to construct different K-NN graph
https://github.com/dom-lee/EECS553-Reproducibility-Spectral-Clustering/blob/14495ab4c592ec1349e059b60f3e594dd612fda3/experiments.py#L95
- Test different number of eigenvectors

4. **Check the performance of Spectral Clustering with fewer eigenvectors after
reducing dimensionality through Sketching**
-


# Beyond Spectral Clustering
Expand Down
18 changes: 18 additions & 0 deletions experiments.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,20 @@
import pysc.objfunc
from pysc.sclogging import logger

### Function added by EECS 553- Group 17
def sub_process(dataset, k, num_eigenvalues: int, q):
    """
    Cluster one dataset with a fixed number of eigenvectors and report metrics.

    Spectral clustering is run on ``dataset.graph`` and timed with wall-clock
    time. The found clusters are then scored against ``dataset.gt_labels`` and
    a single result tuple is pushed onto ``q``:

        (num_eigenvalues, rand_score, mutual_info, conductance, seconds)

    :param dataset: object exposing ``graph`` and ``gt_labels``
    :param k: number of clusters to find
    :param num_eigenvalues: number of eigenvectors used by the clustering call
    :param q: queue-like object; only ``q.put`` is used
              (presumably a multiprocessing queue - confirm against the caller)
    """
    logger.info(f"Starting clustering: {dataset} with {num_eigenvalues} eigenvalues.")

    # Only the clustering call itself is timed, not the evaluation below.
    clock_start = time.time()
    labels = sgtl.clustering.spectral_clustering(
        dataset.graph,
        num_clusters=k,
        num_eigenvectors=num_eigenvalues,
    )
    elapsed = time.time() - clock_start

    logger.info(f"Finished clustering: {dataset} with {num_eigenvalues} eigenvalues.")

    # Tuple elements are evaluated left to right, i.e. rand index, then mutual
    # information, then conductance.
    outcome = (
        num_eigenvalues,
        pysc.evaluation.adjusted_rand_index(dataset.gt_labels, labels),
        pysc.evaluation.mutual_information(dataset.gt_labels, labels),
        pysc.objfunc.KWayExpansion.apply(dataset.graph, labels),
        elapsed,
    )
    q.put(outcome)


def basic_experiment(dataset, k):
"""
Expand Down Expand Up @@ -386,6 +400,10 @@ def run_bsds_experiment(image_id=None):
for i, file in enumerate(image_files):
id = file.split(".")[0]

# [Jeongtaek Chang] Break condition
if i >= 25:
break

# Ignore any images we've already tried.
if os.path.exists(output_directory + id + ".mat"):
logger.debug(f"Skipping image {file} - output already exists.")
Expand Down
16 changes: 16 additions & 0 deletions pysc/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,10 @@ def load_graph(self, graph_file=None, graph_type="knn10"):
self.graph = sgtl.graph.knn_graph(self.raw_data, k)
elif graph_type[:3] == "rbf":
logger.info(f"Constructing the RBF graph for {self}...")

# [Jeongtaek Chang] Change variance to construct graph with
# different edge weights
# Variance = {10, 20, 40}
self.graph = sgtl.graph.rbf_graph(self.raw_data, variance=20)
else:
logger.debug(f"Skipping constructing graph for the {self.__class__.__name__}.")
Expand Down Expand Up @@ -184,6 +188,12 @@ def load_data(self, data_file):
# Normalise each number to be between 0 and 1 by dividing through by 255.
self.raw_data = numpy.reshape(train_x, (len(train_x), -1))
self.raw_data = self.raw_data / 255

# [Sachin Garg] Change alpha for different sketch sizes
# Uncomment for sketching
print("Old sketch size, ", np.shape(self.raw_data))
# self.raw_data = Sketch(self.raw_data.T, _type_="SHRT", alpha=0.2).T
print("Sketch Size: ", np.shape(self.raw_data))

# Set the total number of data points.
self.num_data_points = len(train_x)
Expand Down Expand Up @@ -269,6 +279,12 @@ def load_data(self, data_file):
self.raw_data = numpy.reshape(self.raw_data, (self.num_data_points, -1))
self.raw_data = self.raw_data / 255

# [Sachin Garg] Change alpha for different sketch sizes
# Uncomment for sketching
print("Old sketch size, ", np.shape(self.raw_data))
# self.raw_data = Sketch(self.raw_data.T, _type_="SHRT", alpha=0.2).T
print("Sketch Size: ", np.shape(self.raw_data))

def load_gt_clusters(self, gt_clusters_file):
"""
Load the ground truth clusters.
Expand Down
45 changes: 45 additions & 0 deletions results/sbm/complete_results.csv
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,48 @@ k, n, p, q, poverq, eigenvectors, conductance, rand
5, 1000, 0.01, 0.001, 10.0, 3, 0.3756434337075715, 0.9512978275655131
5, 1000, 0.01, 0.001, 10.0, 4, 0.31325660254706367, 0.9812917143428687
5, 1000, 0.01, 0.001, 10.0, 5, 0.28187226965267426, 0.995807473494699
5, 1000, 0.01, 0.002, 5.0, 1, 0.9383981744585708, 0.6591401560312063
5, 1000, 0.01, 0.002, 5.0, 2, 0.6877845807595184, 0.7650776635327065
5, 1000, 0.01, 0.002, 5.0, 3, 0.5192923484282195, 0.898663524704941
5, 1000, 0.01, 0.002, 5.0, 4, 0.4791219129494072, 0.9446712942588519
5, 1000, 0.01, 0.002, 5.0, 5, 0.43874760722567696, 0.9718757831566315
5, 1000, 0.01, 0.003, 3.3333333333333335, 1, 0.9502757724799562, 0.6588865773154631
5, 1000, 0.01, 0.003, 3.3333333333333335, 2, 0.7048159214521287, 0.7299415723144629
5, 1000, 0.01, 0.003, 3.3333333333333335, 3, 0.6286981036835309, 0.8202439847969594
5, 1000, 0.01, 0.003, 3.3333333333333335, 4, 0.5975725707841676, 0.8679531426285256
5, 1000, 0.01, 0.003, 3.3333333333333335, 5, 0.5459199432815701, 0.9050356551310262
5, 1000, 0.01, 0.004, 2.5, 1, 0.9349548847051299, 0.6606854410882177
5, 1000, 0.01, 0.004, 2.5, 2, 0.7266654672489465, 0.6978840088017602
5, 1000, 0.01, 0.004, 2.5, 3, 0.6970885216997054, 0.7453336427285457
5, 1000, 0.01, 0.004, 2.5, 4, 0.6628749799009077, 0.7694075455091018
5, 1000, 0.01, 0.004, 2.5, 5, 0.6331291033774379, 0.7759519103820766
5, 1000, 0.01, 0.005, 2.0, 1, 0.9296243271802715, 0.6596564272854571
5, 1000, 0.01, 0.005, 2.0, 2, 0.7577375942814981, 0.6692942028405682
5, 1000, 0.01, 0.005, 2.0, 3, 0.7166792627143006, 0.690493098619724
5, 1000, 0.01, 0.005, 2.0, 4, 0.6930270440575861, 0.6929926545309061
5, 1000, 0.01, 0.005, 2.0, 5, 0.6654327749601362, 0.6928433526705342
5, 1000, 0.01, 0.006, 1.6666666666666667, 1, 0.9404708367757729, 0.6566355671134227
5, 1000, 0.01, 0.006, 1.6666666666666667, 2, 0.7645322535995012, 0.6622097699539908
5, 1000, 0.01, 0.006, 1.6666666666666667, 3, 0.7202440715821494, 0.6792497459491899
5, 1000, 0.01, 0.006, 1.6666666666666667, 4, 0.7071413975011028, 0.6801579275855171
5, 1000, 0.01, 0.006, 1.6666666666666667, 5, 0.6748454734392261, 0.6818525945189038
5, 1000, 0.01, 0.007, 1.4285714285714286, 1, 0.9336636906362165, 0.6593897419483896
5, 1000, 0.01, 0.007, 1.4285714285714286, 2, 0.7723030312731063, 0.6622703740748149
5, 1000, 0.01, 0.007, 1.4285714285714286, 3, 0.7208346907474154, 0.6768851450290058
5, 1000, 0.01, 0.007, 1.4285714285714286, 4, 0.713082312748873, 0.6792471854370874
5, 1000, 0.01, 0.007, 1.4285714285714286, 5, 0.6833318718989513, 0.6804167553510703
5, 1000, 0.01, 0.008, 1.25, 1, 0.9322523923335652, 0.6589191598319664
5, 1000, 0.01, 0.008, 1.25, 2, 0.7859044036048213, 0.6611811962392478
5, 1000, 0.01, 0.008, 1.25, 3, 0.730609254726948, 0.6769326425285056
5, 1000, 0.01, 0.008, 1.25, 4, 0.7144300788979128, 0.6791316343268655
5, 1000, 0.01, 0.008, 1.25, 5, 0.6877165841068491, 0.6801530786157232
5, 1000, 0.01, 0.009000000000000001, 1.111111111111111, 1, 0.9330740351905227, 0.6602054330866174
5, 1000, 0.01, 0.009000000000000001, 1.111111111111111, 2, 0.7911247469503422, 0.6610277815563113
5, 1000, 0.01, 0.009000000000000001, 1.111111111111111, 3, 0.7252710713699497, 0.6772872734546909
5, 1000, 0.01, 0.009000000000000001, 1.111111111111111, 4, 0.7221732830050449, 0.6786906981396278
5, 1000, 0.01, 0.009000000000000001, 1.111111111111111, 5, 0.6982608708352067, 0.6800013362672536
5, 1000, 0.01, 0.01, 1.0, 1, 0.9240893086504034, 0.6606136587317464
5, 1000, 0.01, 0.01, 1.0, 2, 0.7978545056830212, 0.6615838447689538
5, 1000, 0.01, 0.01, 1.0, 3, 0.7307034561720516, 0.6769653290658132
5, 1000, 0.01, 0.01, 1.0, 4, 0.7277987489284804, 0.6787506861372273
5, 1000, 0.01, 0.01, 1.0, 5, 0.7004551280972275, 0.6800592838567713
144 changes: 144 additions & 0 deletions sketches.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
# -*- coding: utf-8 -*-
"""
Created on Fri Oct 21 18:03:16 2022
@author: sachg
"""


### A Rademacher variable is sub-gaussian; Sub_Gaussian_JL below uses the constant 1/sqrt(log 2)


#### Code requirement: a global (d, m) matrix A must be defined in this module
### d is the dimension of every image
### m is the number of images

import numpy as np
import math
from sympy.stats import Rademacher

def Sketch(_type_="GA"):
    """
    Sketch the module-level data matrix ``A`` with the requested transform.

    ``A`` is read from module scope. Its first axis is treated as the
    dimensionality ``d`` of each node (image) and its second axis as the
    number of nodes ``m``.

    Parameters
    ----------
    _type_ : str
        Which sketch to apply:
        "GA"      -> Gaussian_JL
        "SG"      -> Sub_Gaussian_JL
        "SRHT"    -> zero_padding followed by SRHT
        "Sp_SRHT" -> zero_padding followed by Sparse_RHT

    Returns
    -------
    Whatever the selected sketch routine returns.

    Raises
    ------
    ValueError
        If ``_type_`` is not one of the four names above. Previously an
        unknown name (e.g. the easy typo "SHRT") silently returned ``None``.
    """
    supported = ("GA", "SG", "SRHT", "Sp_SRHT")
    # Fail fast, before touching A, so a misspelled type is reported as such.
    if _type_ not in supported:
        raise ValueError(
            "Unknown sketch type {!r}; expected one of {}".format(_type_, supported)
        )

    d = np.shape(A)[0]  # dimensionality of each node (image)
    m = np.shape(A)[1]  # number of nodes (images)
    delta = 0.1         # sketch failure probability
    epsilon = 0.4       # should be less than 0.5
    epsilon_2 = epsilon ** 2

    if _type_ == "GA":
        return Gaussian_JL(alpha=1, delta=delta, d=d, m=m, eps_2=epsilon_2)
    if _type_ == "SG":
        return Sub_Gaussian_JL(alpha=12, delta=delta, d=d, m=m, eps_2=epsilon_2)

    # Both Hadamard-based sketches work on the zero-padded matrix.
    Z_A = zero_padding(d=d, m=m)
    d_new = np.shape(Z_A)[0]
    if _type_ == "SRHT":
        return SRHT(Z_A=Z_A, alpha=20, delta=delta, d=d_new, m=m, eps_2=epsilon_2)

    # Only "Sp_SRHT" remains.
    sparsity_alpha = 2
    return Sparse_RHT(Z_A=Z_A, alpha=20, delta=delta,
                      sparsity_alpha=sparsity_alpha, d=d_new, m=m,
                      eps_2=epsilon_2)


### Make Data suitable for Hadamard transformation
def zero_padding(d, m):
    """
    Append zero rows to the module-level matrix ``A`` up to a power-of-two height.

    The Hadamard routines in this module halve their input repeatedly, so the
    row count is rounded up to the next power of two.

    Parameters
    ----------
    d : int
        Current number of rows of ``A``.
    m : int
        Number of columns of ``A``; the zero block gets the same width.

    Returns
    -------
    numpy.ndarray
        ``A`` with ``2**ceil(log2(d)) - d`` zero rows appended.

    Raises
    ------
    ValueError
        If ``d`` is smaller than 1.
    """
    d = int(d)
    if d < 1:
        raise ValueError("zero_padding requires d >= 1, got {}".format(d))
    # Exact integer arithmetic: math.log(d, 2) is a float and may land just
    # above an integer when d is already a power of two, which would make
    # ceil() overshoot and double the padded size.
    padded_rows = 1 << (d - 1).bit_length()
    zero_pad = np.zeros((padded_rows - d, m))
    t_A = np.concatenate((A, zero_pad), axis=0)
    return t_A


## Gaussian Johnson Lindenstrauss Sketch
def Gaussian_JL(alpha, delta, d, m, eps_2):
    """
    Gaussian Johnson-Lindenstrauss sketch of the module-level matrix ``A``.

    Parameters
    ----------
    alpha : float
        Multiplier applied to the sketch size.
    delta : float
        Sketch failure probability.
    d : int
        Number of rows of ``A`` (dimension being reduced).
    m : int
        Number of columns of ``A`` (number of points).
    eps_2 : float
        Squared distortion parameter.

    Returns
    -------
    numpy.ndarray
        ``S.dot(A)`` where ``S`` has ``int(alpha*12*log(m/delta)/eps_2)`` rows
        and ``d`` columns.
    """
    sketch_size = int(alpha * 12 * math.log(m / delta) / eps_2)
    # Entries must be zero-mean standard normal draws. np.random.random()
    # returns uniform samples on [0, 1), which are not Gaussian and have mean
    # 0.5, so the projection would not be a JL sketch at all.
    S = np.random.standard_normal((sketch_size, d)) / math.sqrt(sketch_size)
    GA = S.dot(A)
    return GA



### Sub_Gaussian Johnson Lindenstrauss Sketch with Rademacher random variables
def Sub_Gaussian_JL(alpha, delta, d, m, eps_2):
    """
    Sub-gaussian JL sketch of the module-level matrix ``A`` using random signs.

    A (sketch_size, d) matrix of +1/-1 entries, each sign drawn with
    probability 0.5, is scaled by 1/sqrt(sketch_size) and multiplied with ``A``.

    Parameters
    ----------
    alpha : float
        Multiplier applied to the sketch size.
    delta : float
        Sketch failure probability.
    d : int
        Number of rows of ``A``.
    m : int
        Number of columns of ``A``.
    eps_2 : float
        Squared distortion parameter.

    Returns
    -------
    numpy.ndarray
        The sketched matrix with ``sketch_size`` rows.
    """
    # The sketch size uses the constant 1/sqrt(log 2), raised to the 4th power.
    subgaussian_factor = (1/math.sqrt(np.log(2)))**4
    sketch_size = int(alpha*subgaussian_factor*math.log(m/delta)/eps_2)

    # Map Bernoulli {0, 1} draws onto {-1, +1}.
    coin_flips = np.random.binomial(1, 0.5, size=(sketch_size, d))
    signs = 2 * coin_flips - 1

    projection = signs / math.sqrt(sketch_size)
    return projection.dot(A)

### Subsampled Randomized Hadamard Transformation
def SRHT(Z_A, alpha, delta, d, m, eps_2):
    """
    Subsampled Randomized Hadamard Transform of the zero-padded matrix ``Z_A``.

    Each column is sign-flipped by a random +1/-1 diagonal, passed through
    ``Hadamard`` and then a random subset of rows is kept and rescaled.

    Parameters
    ----------
    Z_A : numpy.ndarray
        2-D data matrix whose row count has been padded for ``Hadamard``.
    alpha : float
        Multiplier applied to the sketch size.
    delta : float
        Sketch failure probability.
    d : int
        Unused; the row count is re-read from ``Z_A``.
    m : int
        Number of columns of ``Z_A`` to transform.
    eps_2 : float
        Squared distortion parameter.

    Returns
    -------
    numpy.ndarray
        Matrix with ``sketch_size`` rows and the same columns as ``Z_A``.

    Raises
    ------
    ValueError
        If the computed sketch size is smaller than 1.
    """
    ### orthogonally randomize the subspace
    padded_dim = np.shape(Z_A)[0]
    sketch_size = int(alpha*(np.log(padded_dim/2)**2)*np.log(m/delta)/eps_2)
    if sketch_size < 1:
        raise ValueError(
            "SRHT sketch size must be at least 1, got {}".format(sketch_size)
        )
    diag = np.random.binomial(1, 0.5, size=padded_dim)
    diag[diag == 0] = -1
    RHT = np.zeros(np.shape(Z_A))

    ### Rotate and get randomized directions to flatten leverage scores
    for i in range(m):
        RHT[:, i] = Hadamard(diag*Z_A[:, i])

    ### Uniform subsampling on constant coherence subspace
    # np.random.choice samples with replacement by default, so sketch_size may
    # exceed padded_dim.
    Sub_sample = np.random.choice(np.arange(padded_dim), sketch_size)

    # Hadamard() is the unnormalised transform (entries +1/-1), so every
    # coordinate of RHT already carries the full column norm on average.
    # Scaling by 1/sqrt(sketch_size) therefore preserves squared norms in
    # expectation; the former sqrt(padded_dim/sketch_size) factor is only
    # correct for a Hadamard matrix normalised by 1/sqrt(padded_dim) and
    # inflated every norm by sqrt(padded_dim).
    SRHT_A = RHT[Sub_sample, :] / np.sqrt(sketch_size)
    return SRHT_A


### Fast JLT with Sparse Randomized Hadamard Transformation
def Sparse_RHT(Z_A,alpha,delta,sparsity_alpha,d,m,eps_2):
    """
    Placeholder for the sparse randomized Hadamard sketch (not implemented).

    Prints "Still to be coded" and returns ``None`` immediately; none of the
    arguments are used on the executed path.
    """

    print("Still to be coded")
    return
    # NOTE(review): everything below is unreachable because of the bare
    # ``return`` above; it reads as a draft of the intended algorithm.
    ### orthogonally randomize the subspace
    padded_dim =np.shape(Z_A)[0]
    # NOTE(review): sketch_size and sparsity are left as floats here, while
    # the other sketch routines cast their sizes with int() - confirm before
    # enabling this code.
    sketch_size =alpha*np.log(m/delta)/eps_2
    sparsity = sparsity_alpha*(np.log(padded_dim/delta))**2
    diag = np.random.binomial(1,0.5,size=padded_dim)
    diag[diag==0]=-1
    RHT = np.zeros(np.shape(Z_A))

    ### Rotate and get randomized directions to flatten leverage scores
    for i in range(m):
        RHT[:,i] = Hadamard(diag*Z_A[:,i])

    # NOTE(review): Sparse_Sub_gaussian currently returns None, so the
    # S.dot(...) call below would fail if this code were reached.
    S = Sparse_Sub_gaussian(r=sketch_size,c=padded_dim,sparsity = sparsity)
    ### Sparse Sub-gaussian Sketching

    Sp_RHT_A = S.dot(RHT)

    return Sp_RHT_A

### Hadamard transformation
def Hadamard(x):
    """
    Recursive, unnormalised Hadamard transform of a 1-D array.

    The input is split in half; the transform of (first half + second half)
    is followed by the transform of (first half - second half). Inputs with at
    most one element are returned unchanged.
    """
    length = np.size(x)
    # Base case: nothing left to combine.
    if length <= 1:
        return x

    # Divide and conquer: both combinations are formed before recursing.
    half = length // 2
    head, tail = x[:half], x[half:]
    sums = head + tail
    diffs = head - tail
    return np.concatenate((Hadamard(sums), Hadamard(diffs)), axis=0)

def Hadamard_2(x):
    """
    Recursive, unnormalised Hadamard transform applied along the rows of a 2-D array.

    Same recursion as ``Hadamard`` but the split is on the first axis, so all
    columns are transformed at once. Inputs with at most one row are returned
    unchanged.
    """
    rows = np.shape(x)[0]
    # Base case: a single row (or none) is its own transform.
    if rows <= 1:
        return x

    # Divide and conquer: both combinations are formed before recursing.
    half = rows // 2
    upper, lower = x[:half, :], x[half:, :]
    sums = upper + lower
    diffs = upper - lower
    return np.concatenate((Hadamard_2(sums), Hadamard_2(diffs)), axis=0)


### Sparse Sub-gaussian Matrix-- complete this function later.
def Sparse_Sub_gaussian(r,c,sparsity):
    """
    Unfinished stub for building a sparse sub-gaussian sketching matrix.

    Allocates an (r, c) zero matrix and, for every row, draws ``sparsity``
    indices from ``0 .. c-1``. Neither the matrix nor the indices are used
    afterwards, and the function returns ``None``.
    """
    S = np.zeros((r,c))
    for i in range(r):
        # The drawn indices are discarded; the only effect of this loop is
        # advancing NumPy's global random state.
        idx = np.random.choice(np.arange(c),sparsity)
    # NOTE(review): presumably S should be filled at ``idx`` and returned -
    # confirm the intended construction before completing this.
    return

0 comments on commit 0ecdd24

Please sign in to comment.