From 0ecdd24d48188c51d7a5d71b165d14e00f70e609 Mon Sep 17 00:00:00 2001 From: Dongmyeong Lee Date: Sun, 11 Dec 2022 21:03:22 -0500 Subject: [PATCH] Add comment for sketch --- README.md | 20 +++-- experiments.py | 18 ++++ pysc/datasets.py | 16 ++++ results/sbm/complete_results.csv | 45 ++++++++++ sketches.py | 144 +++++++++++++++++++++++++++++++ 5 files changed, 236 insertions(+), 7 deletions(-) create mode 100644 sketches.py diff --git a/README.md b/README.md index 9501e09..51dd093 100644 --- a/README.md +++ b/README.md @@ -3,19 +3,25 @@ This repository is for reproducibility project from EECS553 (Machine Leaning) Course. We verified the paper "A Tighter Analysis of Spectral Clustering, and Beyond", published in ICML 2022. -## Additional Test that we excecuted -1. **Less-separated Synthetic Dataset**: run -`python experiments.py complete` +## Additional Experiments +1. **Less-separated Synthetic Dataset**: +run `python experiments.py complete` - Change 'r' value at https://github.com/dom-lee/EECS553-Reproducibility-Spectral-Clustering/blob/06541f4ec59481cefdc981144c0b80f75715a451/pysc/datasets.py#L379 -2. **Test on BSDS dataset with different standard deviation**: run -`python experiments.py bsds` +2. **Test on BSDS dataset with different parameters**: +run `python experiments.py bsds` - We set break condition to cluster only 25 images - -3. **Test on MNIST dataset with different number of eigenvector for embedding**: - run +3. **Test on MNIST dataset with different parameters**: +run `python experiments.py mnist` +- Change parameter k to construct different K-NN graph +https://github.com/dom-lee/EECS553-Reproducibility-Spectral-Clustering/blob/14495ab4c592ec1349e059b60f3e594dd612fda3/experiments.py#L95 +- Test different number of eigenvectors +4. **Check the performance of Spectral Clustering with fewer eigenvectors after + reducing dimensionality through Sketching +- # Beyond Spectral Clustering diff --git a/experiments.py b/experiments.py index 3b86ffe..0ae14e2 100644 --- a/experiments.py +++ b/experiments.py @@ -21,6 +21,20 @@ import pysc.objfunc from pysc.sclogging import logger +### Function added by EECS 553- Group 17 +def sub_process(dataset,k,num_eigenvalues: int, q): + logger.info(f"Starting clustering: {dataset} with {num_eigenvalues} eigenvalues.") + start_time = time.time() + found_clusters = sgtl.clustering.spectral_clustering(dataset.graph, num_clusters=k, + num_eigenvectors=num_eigenvalues) + end_time = time.time() + total_time = end_time - start_time + logger.info(f"Finished clustering: {dataset} with {num_eigenvalues} eigenvalues.") + this_rand_score = pysc.evaluation.adjusted_rand_index(dataset.gt_labels, found_clusters) + this_mutual_info = pysc.evaluation.mutual_information(dataset.gt_labels, found_clusters) + this_conductance = pysc.objfunc.KWayExpansion.apply(dataset.graph, found_clusters) + q.put((num_eigenvalues, this_rand_score, this_mutual_info, this_conductance, total_time)) + def basic_experiment(dataset, k): """ @@ -386,6 +400,10 @@ def run_bsds_experiment(image_id=None): for i, file in enumerate(image_files): id = file.split(".")[0] + # [Jeongtaek Chang] Break condition + if i >= 25: + break + # Ignore any images we've already tried. if os.path.exists(output_directory + id + ".mat"): logger.debug(f"Skipping image {file} - output already exists.") diff --git a/pysc/datasets.py b/pysc/datasets.py index 9968b36..ce6bac2 100644 --- a/pysc/datasets.py +++ b/pysc/datasets.py @@ -107,6 +107,10 @@ def load_graph(self, graph_file=None, graph_type="knn10"): self.graph = sgtl.graph.knn_graph(self.raw_data, k) elif graph_type[:3] == "rbf": logger.info(f"Constructing the RBF graph for {self}...") + + # [Jeongtaek Chang] Change variance to construct graph with + # different edge weights + # Variance = {10, 20, 40} self.graph = sgtl.graph.rbf_graph(self.raw_data, variance=20) else: logger.debug(f"Skipping constructing graph for the {self.__class__.__name__}.") @@ -184,6 +188,12 @@ def load_data(self, data_file): # Normalise each number to be between 0 and 1 by dividing through by 255. self.raw_data = numpy.reshape(train_x, (len(train_x), -1)) self.raw_data = self.raw_data / 255 + + # [Sachin Garg] Change alpha for different sketch sizes + # Uncomment for sketching + print("Old sketch size, ", np.shape(self.raw_data)) + # self.raw_data = Sketch(self.raw_data.T, _type_="SHRT", alpha=0.2).T + print("Sketch Size: ", np.shape(self.raw_data)) # Set the total number of data points. self.num_data_points = len(train_x) @@ -269,6 +279,12 @@ def load_data(self, data_file): self.raw_data = numpy.reshape(self.raw_data, (self.num_data_points, -1)) self.raw_data = self.raw_data / 255 + # [Sachin Garg] Change alpha for different sketch sizes + # Uncomment for sketching + print("Old sketch size, ", np.shape(self.raw_data)) + # self.raw_data = Sketch(self.raw_data.T, _type_="SHRT", alpha=0.2).T + print("Sketch Size: ", np.shape(self.raw_data)) + def load_gt_clusters(self, gt_clusters_file): """ Load the ground truth clusters. diff --git a/results/sbm/complete_results.csv b/results/sbm/complete_results.csv index b758351..62e9f21 100644 --- a/results/sbm/complete_results.csv +++ b/results/sbm/complete_results.csv @@ -4,3 +4,48 @@ k, n, p, q, poverq, eigenvectors, conductance, rand 5, 1000, 0.01, 0.001, 10.0, 3, 0.3756434337075715, 0.9512978275655131 5, 1000, 0.01, 0.001, 10.0, 4, 0.31325660254706367, 0.9812917143428687 5, 1000, 0.01, 0.001, 10.0, 5, 0.28187226965267426, 0.995807473494699 +5, 1000, 0.01, 0.002, 5.0, 1, 0.9383981744585708, 0.6591401560312063 +5, 1000, 0.01, 0.002, 5.0, 2, 0.6877845807595184, 0.7650776635327065 +5, 1000, 0.01, 0.002, 5.0, 3, 0.5192923484282195, 0.898663524704941 +5, 1000, 0.01, 0.002, 5.0, 4, 0.4791219129494072, 0.9446712942588519 +5, 1000, 0.01, 0.002, 5.0, 5, 0.43874760722567696, 0.9718757831566315 +5, 1000, 0.01, 0.003, 3.3333333333333335, 1, 0.9502757724799562, 0.6588865773154631 +5, 1000, 0.01, 0.003, 3.3333333333333335, 2, 0.7048159214521287, 0.7299415723144629 +5, 1000, 0.01, 0.003, 3.3333333333333335, 3, 0.6286981036835309, 0.8202439847969594 +5, 1000, 0.01, 0.003, 3.3333333333333335, 4, 0.5975725707841676, 0.8679531426285256 +5, 1000, 0.01, 0.003, 3.3333333333333335, 5, 0.5459199432815701, 0.9050356551310262 +5, 1000, 0.01, 0.004, 2.5, 1, 0.9349548847051299, 0.6606854410882177 +5, 1000, 0.01, 0.004, 2.5, 2, 0.7266654672489465, 0.6978840088017602 +5, 1000, 0.01, 0.004, 2.5, 3, 0.6970885216997054, 0.7453336427285457 +5, 1000, 0.01, 0.004, 2.5, 4, 0.6628749799009077, 0.7694075455091018 +5, 1000, 0.01, 0.004, 2.5, 5, 0.6331291033774379, 0.7759519103820766 +5, 1000, 0.01, 0.005, 2.0, 1, 0.9296243271802715, 0.6596564272854571 +5, 1000, 0.01, 0.005, 2.0, 2, 0.7577375942814981, 0.6692942028405682 +5, 1000, 0.01, 0.005, 2.0, 3, 0.7166792627143006, 0.690493098619724 +5, 1000, 0.01, 0.005, 2.0, 4, 0.6930270440575861, 0.6929926545309061 +5, 1000, 0.01, 0.005, 2.0, 5, 0.6654327749601362, 0.6928433526705342 +5, 1000, 0.01, 0.006, 1.6666666666666667, 1, 0.9404708367757729, 0.6566355671134227 +5, 1000, 0.01, 0.006, 1.6666666666666667, 2, 0.7645322535995012, 0.6622097699539908 +5, 1000, 0.01, 0.006, 1.6666666666666667, 3, 0.7202440715821494, 0.6792497459491899 +5, 1000, 0.01, 0.006, 1.6666666666666667, 4, 0.7071413975011028, 0.6801579275855171 +5, 1000, 0.01, 0.006, 1.6666666666666667, 5, 0.6748454734392261, 0.6818525945189038 +5, 1000, 0.01, 0.007, 1.4285714285714286, 1, 0.9336636906362165, 0.6593897419483896 +5, 1000, 0.01, 0.007, 1.4285714285714286, 2, 0.7723030312731063, 0.6622703740748149 +5, 1000, 0.01, 0.007, 1.4285714285714286, 3, 0.7208346907474154, 0.6768851450290058 +5, 1000, 0.01, 0.007, 1.4285714285714286, 4, 0.713082312748873, 0.6792471854370874 +5, 1000, 0.01, 0.007, 1.4285714285714286, 5, 0.6833318718989513, 0.6804167553510703 +5, 1000, 0.01, 0.008, 1.25, 1, 0.9322523923335652, 0.6589191598319664 +5, 1000, 0.01, 0.008, 1.25, 2, 0.7859044036048213, 0.6611811962392478 +5, 1000, 0.01, 0.008, 1.25, 3, 0.730609254726948, 0.6769326425285056 +5, 1000, 0.01, 0.008, 1.25, 4, 0.7144300788979128, 0.6791316343268655 +5, 1000, 0.01, 0.008, 1.25, 5, 0.6877165841068491, 0.6801530786157232 +5, 1000, 0.01, 0.009000000000000001, 1.111111111111111, 1, 0.9330740351905227, 0.6602054330866174 +5, 1000, 0.01, 0.009000000000000001, 1.111111111111111, 2, 0.7911247469503422, 0.6610277815563113 +5, 1000, 0.01, 0.009000000000000001, 1.111111111111111, 3, 0.7252710713699497, 0.6772872734546909 +5, 1000, 0.01, 0.009000000000000001, 1.111111111111111, 4, 0.7221732830050449, 0.6786906981396278 +5, 1000, 0.01, 0.009000000000000001, 1.111111111111111, 5, 0.6982608708352067, 0.6800013362672536 +5, 1000, 0.01, 0.01, 1.0, 1, 0.9240893086504034, 0.6606136587317464 +5, 1000, 0.01, 0.01, 1.0, 2, 0.7978545056830212, 0.6615838447689538 +5, 1000, 0.01, 0.01, 1.0, 3, 0.7307034561720516, 0.6769653290658132 +5, 1000, 0.01, 0.01, 1.0, 4, 0.7277987489284804, 0.6787506861372273 +5, 1000, 0.01, 0.01, 1.0, 5, 0.7004551280972275, 0.6800592838567713 diff --git a/sketches.py b/sketches.py new file mode 100644 index 0000000..6cff251 --- /dev/null +++ b/sketches.py @@ -0,0 +1,144 @@ +# -*- coding: utf-8 -*- +""" +Created on Fri Oct 21 18:03:16 2022 + +@author: sachg +""" + + +### As Rademacher variable is 1/sqrt{2} sub-gaussian + + +#### Code requirement , require a global (d,n) matrix A +### d is the dimension of every image +### m is the number of images + +import numpy as np +import math +from sympy.stats import Rademacher + +def Sketch(_type_="GA"): + d= np.shape(A)[0] ### dimensionality of each node(image) + m= np.shape(A)[1] #### number of nodes(images) + delta=0.1 ## Sketch failure probability + epsilon=0.4 ## should be less than 0.5 + epsilon_2 =epsilon**2 + if _type_ =="GA": + GA = Gaussian_JL(alpha=1,delta=delta,d=d,m=m,eps_2=epsilon_2) + return GA + if _type_ =="SG": + SG_A = Sub_Gaussian_JL(alpha=12,delta=delta,d=d,m=m,eps_2=epsilon_2) + return SG_A + if _type_ == "SRHT": + Z_A =zero_padding(d=d,m=m) + d_new= np.shape(Z_A)[0] + SRHT_A = SRHT(Z_A=Z_A,alpha=20,delta=delta,d=d_new,m=m,eps_2=epsilon_2) + return SRHT_A + if _type_ =="Sp_SRHT": + Z_A =zero_padding(d=d,m=m) + d_new= np.shape(Z_A)[0] + sparsity_alpha=2 + Sp_SRHT_A = Sparse_RHT(Z_A=Z_A,alpha=20,delta=delta,sparsity_alpha=sparsity_alpha,d=d_new,m=m,eps_2=epsilon_2) + return Sp_SRHT_A + return + + +### Make Data suitable for Hadamard transformation +def zero_padding(d,m): + degree = int(2**(np.ceil(math.log(d,2))) -d) + zero_pad = np.zeros((degree,m)) + t_A = np.concatenate((A,zero_pad),axis=0) + return t_A + + +## Gaussian Johnson Lindenstrauss Sketch +def Gaussian_JL(alpha,delta,d,m,eps_2): + sketch_size = int(alpha*12*math.log(m/delta)/eps_2) ### Gives the smallest sketch size + S = np.random.random((sketch_size,d))/math.sqrt(sketch_size) + GA = S.dot(A) + return GA + + + +### Sub_Gaussian Johnson Lindenstrauss Sketch with Rademacher rabdom variable +def Sub_Gaussian_JL(alpha, delta,d,m,eps_2): + ### Rademacher vector is 1/sqrt(log2) sub-gaussian + sketch_size = int(alpha*((1/math.sqrt(np.log(2)))**4)*math.log(m/delta)/eps_2) + S=np.random.binomial(1, 0.5, size=(sketch_size,d)) + S[S==0]=-1 + S=S/math.sqrt(sketch_size) + SG_A = S.dot(A) + return SG_A + +### Subsampled Randomized Hadamard Transformation +def SRHT(Z_A,alpha,delta,d,m,eps_2): + + ### orthogonally randomize the subspace + padded_dim =np.shape(Z_A)[0] + sketch_size = int(alpha*(np.log(padded_dim/2)**2)*np.log(m/delta)/eps_2) + diag = np.random.binomial(1,0.5,size=padded_dim) + diag[diag==0]=-1 + RHT = np.zeros(np.shape(Z_A)) + + ### Rotate and get randomized directions to flatten leverage scores + for i in range(m): + RHT[:,i] = Hadamard(diag*Z_A[:,i]) + + ### Uniform subsampling on constant coherence subspace + Sub_sample = np.random.choice(np.arange(padded_dim),sketch_size) + SRHT_A = np.sqrt(padded_dim/sketch_size)*RHT[Sub_sample,:] + return SRHT_A + + +### Fast JLT with Sparse Randomized Hadamard Transformation +def Sparse_RHT(Z_A,alpha,delta,sparsity_alpha,d,m,eps_2): + + print("Still to be coded") + return + ### orthogonally randomize the subspace + padded_dim =np.shape(Z_A)[0] + sketch_size =alpha*np.log(m/delta)/eps_2 + sparsity = sparsity_alpha*(np.log(padded_dim/delta))**2 + diag = np.random.binomial(1,0.5,size=padded_dim) + diag[diag==0]=-1 + RHT = np.zeros(np.shape(Z_A)) + + ### Rotate and get randomized directions to flatten leverage scores + for i in range(m): + RHT[:,i] = Hadamard(diag*Z_A[:,i]) + + S = Sparse_Sub_gaussian(r=sketch_size,c=padded_dim,sparsity = sparsity) + ### Sparse Sub-gaussian Sketching + + Sp_RHT_A = S.dot(RHT) + + return Sp_RHT_A + +### Hadamard transformation +def Hadamard(x): + ### Base-case + if np.size(x)<=1: + return x + ### Divide-conquer + else: + a= x[0:int(np.size(x)/2)]+ x[int(np.size(x)/2):] + b= x[0:int(np.size(x)/2)]- x[int(np.size(x)/2):] + return np.concatenate((Hadamard(a), Hadamard(b)),axis=0) + +def Hadamard_2(x): + ### Base-case + if np.shape(x)[0]<=1: + return x + ### Divide-conquer + else: + a= x[0:int(np.shape(x)[0]/2),:]+ x[int(np.shape(x)[0]/2):,:] + b= x[0:int(np.shape(x)[0]/2),:]- x[int(np.shape(x)[0]/2):,:] + return np.concatenate((Hadamard_2(a), Hadamard_2(b)),axis=0) + + +### Sparse Sub-gaussian Matrix-- complete this function later. +def Sparse_Sub_gaussian(r,c,sparsity): + S = np.zeros((r,c)) + for i in range(r): + idx = np.random.choice(np.arange(c),sparsity) + return