From 0ecdd24d48188c51d7a5d71b165d14e00f70e609 Mon Sep 17 00:00:00 2001
From: Dongmyeong Lee <dom9316@gmail.com>
Date: Sun, 11 Dec 2022 21:03:22 -0500
Subject: [PATCH] Add comment for sketch

---
 README.md                        |  20 +++--
 experiments.py                   |  18 ++++
 pysc/datasets.py                 |  16 ++++
 results/sbm/complete_results.csv |  45 ++++++++++
 sketches.py                      | 144 +++++++++++++++++++++++++++++++
 5 files changed, 236 insertions(+), 7 deletions(-)
 create mode 100644 sketches.py

diff --git a/README.md b/README.md
index 9501e09..51dd093 100644
--- a/README.md
+++ b/README.md
@@ -3,19 +3,25 @@ This repository is for reproducibility project from EECS553 (Machine Leaning)
 Course. We verified the paper "A Tighter Analysis of Spectral Clustering, and
 Beyond", published in ICML 2022.
 
-## Additional Test that we excecuted
-1. **Less-separated Synthetic Dataset**: run
-`python experiments.py complete`
+## Additional Experiments
+1. **Less-separated Synthetic Dataset**:
+run `python experiments.py complete`
 - Change 'r' value at https://github.com/dom-lee/EECS553-Reproducibility-Spectral-Clustering/blob/06541f4ec59481cefdc981144c0b80f75715a451/pysc/datasets.py#L379
 
-2. **Test on BSDS dataset with different standard deviation**: run
-`python experiments.py bsds`
+2. **Test on BSDS dataset with different parameters**:
+run `python experiments.py bsds`
 - We set break condition to cluster only 25 images
 - 
 
-3. **Test on MNIST dataset with different number of eigenvector for embedding**:
-   run
+3. **Test on MNIST dataset with different parameters**:
+run `python experiments.py mnist`
+- Change parameter k to construct different K-NN graph
+https://github.com/dom-lee/EECS553-Reproducibility-Spectral-Clustering/blob/14495ab4c592ec1349e059b60f3e594dd612fda3/experiments.py#L95
+- Test different number of eigenvectors 
 
+4. **Check the performance of Spectral Clustering with fewer eigenvectors after
+   reducing dimensionality through Sketching
+- 
 
 
 # Beyond Spectral Clustering
diff --git a/experiments.py b/experiments.py
index 3b86ffe..0ae14e2 100644
--- a/experiments.py
+++ b/experiments.py
@@ -21,6 +21,20 @@
 import pysc.objfunc
 from pysc.sclogging import logger
 
+### Function added by EECS 553- Group 17
+def sub_process(dataset,k,num_eigenvalues: int, q):
+    logger.info(f"Starting clustering: {dataset} with {num_eigenvalues} eigenvalues.")
+    start_time = time.time()
+    found_clusters = sgtl.clustering.spectral_clustering(dataset.graph, num_clusters=k,
+                                                              num_eigenvectors=num_eigenvalues)
+    end_time = time.time()
+    total_time = end_time - start_time
+    logger.info(f"Finished clustering: {dataset} with {num_eigenvalues} eigenvalues.")
+    this_rand_score = pysc.evaluation.adjusted_rand_index(dataset.gt_labels, found_clusters)
+    this_mutual_info = pysc.evaluation.mutual_information(dataset.gt_labels, found_clusters)
+    this_conductance = pysc.objfunc.KWayExpansion.apply(dataset.graph, found_clusters)
+    q.put((num_eigenvalues, this_rand_score, this_mutual_info, this_conductance, total_time))
+
 
 def basic_experiment(dataset, k):
     """
@@ -386,6 +400,10 @@ def run_bsds_experiment(image_id=None):
     for i, file in enumerate(image_files):
         id = file.split(".")[0]
 
+        # [Jeongtaek Chang] Break condition
+        if i >= 25:
+            break
+        
         # Ignore any images we've already tried.
         if os.path.exists(output_directory + id + ".mat"):
             logger.debug(f"Skipping image {file} - output already exists.")
diff --git a/pysc/datasets.py b/pysc/datasets.py
index 9968b36..ce6bac2 100644
--- a/pysc/datasets.py
+++ b/pysc/datasets.py
@@ -107,6 +107,10 @@ def load_graph(self, graph_file=None, graph_type="knn10"):
                 self.graph = sgtl.graph.knn_graph(self.raw_data, k)
             elif graph_type[:3] == "rbf":
                 logger.info(f"Constructing the RBF graph for {self}...")
+
+                # [Jeongtaek Chang] Change variance to construct graph with
+                # different edge weights 
+                # Variance = {10, 20, 40}
                 self.graph = sgtl.graph.rbf_graph(self.raw_data, variance=20)
         else:
             logger.debug(f"Skipping constructing graph for the {self.__class__.__name__}.")
@@ -184,6 +188,12 @@ def load_data(self, data_file):
                 # Normalise each number to be between 0 and 1 by dividing through by 255.
                 self.raw_data = numpy.reshape(train_x, (len(train_x), -1))
                 self.raw_data = self.raw_data / 255
+                
+                # [Sachin Garg] Change alpha for different sketch sizes
+                # Uncomment for sketching
+                print("Old sketch size, ", np.shape(self.raw_data))
+                # self.raw_data = Sketch(self.raw_data.T, _type_="SHRT", alpha=0.2).T
+                print("Sketch Size: ", np.shape(self.raw_data))
 
             # Set the total number of data points.
             self.num_data_points = len(train_x)
@@ -269,6 +279,12 @@ def load_data(self, data_file):
                 self.raw_data = numpy.reshape(self.raw_data, (self.num_data_points, -1))
                 self.raw_data = self.raw_data / 255
 
+                # [Sachin Garg] Change alpha for different sketch sizes
+                # Uncomment for sketching
+                print("Old sketch size, ", np.shape(self.raw_data))
+                # self.raw_data = Sketch(self.raw_data.T, _type_="SHRT", alpha=0.2).T
+                print("Sketch Size: ", np.shape(self.raw_data))
+
     def load_gt_clusters(self, gt_clusters_file):
         """
         Load the ground truth clusters.
diff --git a/results/sbm/complete_results.csv b/results/sbm/complete_results.csv
index b758351..62e9f21 100644
--- a/results/sbm/complete_results.csv
+++ b/results/sbm/complete_results.csv
@@ -4,3 +4,48 @@ k, n, p, q, poverq, eigenvectors, conductance, rand
 5, 1000, 0.01, 0.001, 10.0, 3, 0.3756434337075715, 0.9512978275655131
 5, 1000, 0.01, 0.001, 10.0, 4, 0.31325660254706367, 0.9812917143428687
 5, 1000, 0.01, 0.001, 10.0, 5, 0.28187226965267426, 0.995807473494699
+5, 1000, 0.01, 0.002, 5.0, 1, 0.9383981744585708, 0.6591401560312063
+5, 1000, 0.01, 0.002, 5.0, 2, 0.6877845807595184, 0.7650776635327065
+5, 1000, 0.01, 0.002, 5.0, 3, 0.5192923484282195, 0.898663524704941
+5, 1000, 0.01, 0.002, 5.0, 4, 0.4791219129494072, 0.9446712942588519
+5, 1000, 0.01, 0.002, 5.0, 5, 0.43874760722567696, 0.9718757831566315
+5, 1000, 0.01, 0.003, 3.3333333333333335, 1, 0.9502757724799562, 0.6588865773154631
+5, 1000, 0.01, 0.003, 3.3333333333333335, 2, 0.7048159214521287, 0.7299415723144629
+5, 1000, 0.01, 0.003, 3.3333333333333335, 3, 0.6286981036835309, 0.8202439847969594
+5, 1000, 0.01, 0.003, 3.3333333333333335, 4, 0.5975725707841676, 0.8679531426285256
+5, 1000, 0.01, 0.003, 3.3333333333333335, 5, 0.5459199432815701, 0.9050356551310262
+5, 1000, 0.01, 0.004, 2.5, 1, 0.9349548847051299, 0.6606854410882177
+5, 1000, 0.01, 0.004, 2.5, 2, 0.7266654672489465, 0.6978840088017602
+5, 1000, 0.01, 0.004, 2.5, 3, 0.6970885216997054, 0.7453336427285457
+5, 1000, 0.01, 0.004, 2.5, 4, 0.6628749799009077, 0.7694075455091018
+5, 1000, 0.01, 0.004, 2.5, 5, 0.6331291033774379, 0.7759519103820766
+5, 1000, 0.01, 0.005, 2.0, 1, 0.9296243271802715, 0.6596564272854571
+5, 1000, 0.01, 0.005, 2.0, 2, 0.7577375942814981, 0.6692942028405682
+5, 1000, 0.01, 0.005, 2.0, 3, 0.7166792627143006, 0.690493098619724
+5, 1000, 0.01, 0.005, 2.0, 4, 0.6930270440575861, 0.6929926545309061
+5, 1000, 0.01, 0.005, 2.0, 5, 0.6654327749601362, 0.6928433526705342
+5, 1000, 0.01, 0.006, 1.6666666666666667, 1, 0.9404708367757729, 0.6566355671134227
+5, 1000, 0.01, 0.006, 1.6666666666666667, 2, 0.7645322535995012, 0.6622097699539908
+5, 1000, 0.01, 0.006, 1.6666666666666667, 3, 0.7202440715821494, 0.6792497459491899
+5, 1000, 0.01, 0.006, 1.6666666666666667, 4, 0.7071413975011028, 0.6801579275855171
+5, 1000, 0.01, 0.006, 1.6666666666666667, 5, 0.6748454734392261, 0.6818525945189038
+5, 1000, 0.01, 0.007, 1.4285714285714286, 1, 0.9336636906362165, 0.6593897419483896
+5, 1000, 0.01, 0.007, 1.4285714285714286, 2, 0.7723030312731063, 0.6622703740748149
+5, 1000, 0.01, 0.007, 1.4285714285714286, 3, 0.7208346907474154, 0.6768851450290058
+5, 1000, 0.01, 0.007, 1.4285714285714286, 4, 0.713082312748873, 0.6792471854370874
+5, 1000, 0.01, 0.007, 1.4285714285714286, 5, 0.6833318718989513, 0.6804167553510703
+5, 1000, 0.01, 0.008, 1.25, 1, 0.9322523923335652, 0.6589191598319664
+5, 1000, 0.01, 0.008, 1.25, 2, 0.7859044036048213, 0.6611811962392478
+5, 1000, 0.01, 0.008, 1.25, 3, 0.730609254726948, 0.6769326425285056
+5, 1000, 0.01, 0.008, 1.25, 4, 0.7144300788979128, 0.6791316343268655
+5, 1000, 0.01, 0.008, 1.25, 5, 0.6877165841068491, 0.6801530786157232
+5, 1000, 0.01, 0.009000000000000001, 1.111111111111111, 1, 0.9330740351905227, 0.6602054330866174
+5, 1000, 0.01, 0.009000000000000001, 1.111111111111111, 2, 0.7911247469503422, 0.6610277815563113
+5, 1000, 0.01, 0.009000000000000001, 1.111111111111111, 3, 0.7252710713699497, 0.6772872734546909
+5, 1000, 0.01, 0.009000000000000001, 1.111111111111111, 4, 0.7221732830050449, 0.6786906981396278
+5, 1000, 0.01, 0.009000000000000001, 1.111111111111111, 5, 0.6982608708352067, 0.6800013362672536
+5, 1000, 0.01, 0.01, 1.0, 1, 0.9240893086504034, 0.6606136587317464
+5, 1000, 0.01, 0.01, 1.0, 2, 0.7978545056830212, 0.6615838447689538
+5, 1000, 0.01, 0.01, 1.0, 3, 0.7307034561720516, 0.6769653290658132
+5, 1000, 0.01, 0.01, 1.0, 4, 0.7277987489284804, 0.6787506861372273
+5, 1000, 0.01, 0.01, 1.0, 5, 0.7004551280972275, 0.6800592838567713
diff --git a/sketches.py b/sketches.py
new file mode 100644
index 0000000..6cff251
--- /dev/null
+++ b/sketches.py
@@ -0,0 +1,144 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Oct 21 18:03:16 2022
+
+@author: sachg
+"""
+
+
+### As Rademacher variable is 1/sqrt{2} sub-gaussian
+
+
+#### Code requirement , require a global (d,n) matrix A
+### d is the dimension of every image
+### m is the number of images
+
+import numpy as np
+import math
+from sympy.stats import Rademacher
+
+def Sketch(_type_="GA"):
+    d= np.shape(A)[0] ### dimensionality of each node(image)
+    m= np.shape(A)[1] #### number of nodes(images)
+    delta=0.1 ## Sketch failure probability
+    epsilon=0.4 ## should be less than 0.5
+    epsilon_2 =epsilon**2 
+    if _type_ =="GA":
+        GA = Gaussian_JL(alpha=1,delta=delta,d=d,m=m,eps_2=epsilon_2)
+        return GA
+    if _type_ =="SG":
+        SG_A = Sub_Gaussian_JL(alpha=12,delta=delta,d=d,m=m,eps_2=epsilon_2)
+        return SG_A
+    if _type_ == "SRHT":
+        Z_A =zero_padding(d=d,m=m)
+        d_new= np.shape(Z_A)[0]
+        SRHT_A = SRHT(Z_A=Z_A,alpha=20,delta=delta,d=d_new,m=m,eps_2=epsilon_2)
+        return SRHT_A
+    if _type_ =="Sp_SRHT":
+        Z_A =zero_padding(d=d,m=m)
+        d_new= np.shape(Z_A)[0]
+        sparsity_alpha=2
+        Sp_SRHT_A = Sparse_RHT(Z_A=Z_A,alpha=20,delta=delta,sparsity_alpha=sparsity_alpha,d=d_new,m=m,eps_2=epsilon_2)
+        return Sp_SRHT_A
+    return
+        
+        
+### Make Data suitable for Hadamard transformation
+def zero_padding(d,m):
+    degree = int(2**(np.ceil(math.log(d,2))) -d)
+    zero_pad = np.zeros((degree,m))
+    t_A = np.concatenate((A,zero_pad),axis=0)
+    return t_A
+
+
+## Gaussian Johnson Lindenstrauss Sketch
+def Gaussian_JL(alpha,delta,d,m,eps_2):
+    sketch_size = int(alpha*12*math.log(m/delta)/eps_2) ### Gives the smallest sketch size
+    S = np.random.random((sketch_size,d))/math.sqrt(sketch_size)
+    GA = S.dot(A)
+    return GA
+
+
+
+### Sub_Gaussian Johnson Lindenstrauss Sketch with Rademacher rabdom variable
+def Sub_Gaussian_JL(alpha, delta,d,m,eps_2):
+    ### Rademacher vector is 1/sqrt(log2) sub-gaussian
+    sketch_size = int(alpha*((1/math.sqrt(np.log(2)))**4)*math.log(m/delta)/eps_2)
+    S=np.random.binomial(1, 0.5, size=(sketch_size,d))
+    S[S==0]=-1
+    S=S/math.sqrt(sketch_size)
+    SG_A = S.dot(A)
+    return SG_A
+
+### Subsampled Randomized Hadamard Transformation
+def SRHT(Z_A,alpha,delta,d,m,eps_2):
+    
+    ### orthogonally randomize the subspace
+    padded_dim =np.shape(Z_A)[0]
+    sketch_size = int(alpha*(np.log(padded_dim/2)**2)*np.log(m/delta)/eps_2)
+    diag = np.random.binomial(1,0.5,size=padded_dim)
+    diag[diag==0]=-1
+    RHT = np.zeros(np.shape(Z_A))
+    
+    ### Rotate and get randomized directions to flatten leverage scores
+    for i in range(m):
+        RHT[:,i] = Hadamard(diag*Z_A[:,i])
+        
+    ### Uniform subsampling on constant coherence subspace
+    Sub_sample = np.random.choice(np.arange(padded_dim),sketch_size)
+    SRHT_A = np.sqrt(padded_dim/sketch_size)*RHT[Sub_sample,:]
+    return SRHT_A
+
+
+### Fast JLT with Sparse Randomized Hadamard Transformation
+def Sparse_RHT(Z_A,alpha,delta,sparsity_alpha,d,m,eps_2):
+    
+    print("Still to be coded")
+    return
+    ### orthogonally randomize the subspace
+    padded_dim =np.shape(Z_A)[0]
+    sketch_size =alpha*np.log(m/delta)/eps_2
+    sparsity = sparsity_alpha*(np.log(padded_dim/delta))**2
+    diag = np.random.binomial(1,0.5,size=padded_dim)
+    diag[diag==0]=-1
+    RHT = np.zeros(np.shape(Z_A))
+    
+    ### Rotate and get randomized directions to flatten leverage scores
+    for i in range(m):
+        RHT[:,i] = Hadamard(diag*Z_A[:,i])
+    
+    S = Sparse_Sub_gaussian(r=sketch_size,c=padded_dim,sparsity = sparsity)
+    ### Sparse Sub-gaussian Sketching
+    
+    Sp_RHT_A = S.dot(RHT)
+    
+    return Sp_RHT_A
+
+### Hadamard transformation
+def Hadamard(x):
+    ### Base-case
+    if np.size(x)<=1:
+        return x
+    ### Divide-conquer
+    else:
+        a= x[0:int(np.size(x)/2)]+ x[int(np.size(x)/2):]
+        b= x[0:int(np.size(x)/2)]- x[int(np.size(x)/2):]
+        return np.concatenate((Hadamard(a), Hadamard(b)),axis=0)
+
+def Hadamard_2(x):
+    ### Base-case
+    if np.shape(x)[0]<=1:
+        return x
+    ### Divide-conquer
+    else:
+        a= x[0:int(np.shape(x)[0]/2),:]+ x[int(np.shape(x)[0]/2):,:]
+        b= x[0:int(np.shape(x)[0]/2),:]- x[int(np.shape(x)[0]/2):,:]
+        return np.concatenate((Hadamard_2(a), Hadamard_2(b)),axis=0)
+    
+
+### Sparse Sub-gaussian Matrix-- complete this function later.
+def Sparse_Sub_gaussian(r,c,sparsity):
+    S = np.zeros((r,c))
+    for i in range(r):
+        idx = np.random.choice(np.arange(c),sparsity)   
+    return