hosseinmoein · hosseinmoein · Jan 2, 2025 · Dec 30, 2024 · Dec 31, 2024 · Dec 31, 2024
diff --git a/README.md b/README.md
@@ -64,7 +64,7 @@ Each program has three identical parts. First it generates and populates 3 colum
 The maximum dataset I could load into Polars was 300m rows per column. Any bigger dataset blew up the memory and caused OS to kill it. I ran C++ DataFrame with 10b rows per column and I am sure it would have run with bigger datasets too. So, I was forced to run both with 300m rows to compare.
 I ran each test 4 times and took the best time. Polars numbers varied a lot from one run to another, especially calculation and selection times. C++ DataFrame numbers were significantly more consistent.
 
-|                          | [<B>C++ DataFrame</B>](https://github.com/hosseinmoein/DataFrame/blob/master/benchmarks/dataframe_performance.cc) | [<B>Polars&nbsp;&nbsp;&nbsp;</B>](https://github.com/hosseinmoein/DataFrame/blob/master/benchmarks/polars_performance.py) | [<B>Pandas&nbsp;&nbsp;&nbsp;</B>](https://github.com/hosseinmoein/DataFrame/blob/master/benchmarks/pandas_performance.py) |
+|                          | [<B>C++ DataFrame</B>](https://github.com/hosseinmoein/DataFrame/blob/master/benchmarks/dataframe_performance.cc) | [<B>Polars</B>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;](https://github.com/hosseinmoein/DataFrame/blob/master/benchmarks/polars_performance.py) | [<B>Pandas</B>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;](https://github.com/hosseinmoein/DataFrame/blob/master/benchmarks/pandas_performance.py) |
 |            :--           |        ---:         |     ---:      |     ---:      |
 | Data generation/load time | 26.9459 secs | 28.4686 secs | 36.6799 secs |
 | Calculation time | 1.2602 secs | 4.8766 secs | 40.3264 secs |

diff --git a/docs/HTML/DataFrame.html b/docs/HTML/DataFrame.html
@@ -1115,6 +1115,10 @@ <H2 ID="2"><font color="blue">API Reference with code samples <font size="+4">&#
       <td title="Performs mean-shift clustring">struct <a href="https://htmlpreview.github.io/?https://github.com/hosseinmoein/DataFrame/blob/master/docs/HTML/MeanShiftVisitor.html">MeanShiftVisitor</a>{}</td>
     </tr>
 
+    <tr class="item" onmouseover="this.style.backgroundColor='#ffff66';" onmouseout="this.style.backgroundColor='#d4e3e5';">
+      <td title="Performs spectral clustring">struct <a href="https://htmlpreview.github.io/?https://github.com/hosseinmoein/DataFrame/blob/master/docs/HTML/SpectralClusteringVisitor.html">SpectralClusteringVisitor</a>{}</td>
+    </tr>
+
   </table>
   </div>
 

diff --git a/docs/HTML/SpectralClusteringVisitor.html b/docs/HTML/SpectralClusteringVisitor.html
diff --git a/docs/HTML/shuffle.html b/docs/HTML/shuffle.html
@@ -55,7 +55,6 @@
       </td>
       <td width = "33.3%">
         <B>also_shuffle_index</B>: If true, it shuffles the named column(s) and the index column. Otherwise, index is not shuffled.<BR>
-        <B>N</B>: Number of named columns<BR>
         <B>Ts</B>: The list of types for all columns. A type should be specified only once.
       </td>
     </tr>

diff --git a/include/DataFrame/DataFrameFinancialVisitors.h b/include/DataFrame/DataFrameFinancialVisitors.h
@@ -1,5 +1,4 @@
 // Hossein Moein
-
 // January 08, 2020
 /*
 Copyright (c) 2019-2026, Hossein Moein

diff --git a/include/DataFrame/DataFrameMLVisitors.h b/include/DataFrame/DataFrameMLVisitors.h
@@ -30,6 +30,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #pragma once
 
 #include <DataFrame/DataFrameStatsVisitors.h>
+#include <DataFrame/Utils/Matrix.h>
 #include <DataFrame/Vectors/VectorPtrView.h>
 
 #include <algorithm>
@@ -2699,6 +2700,243 @@ struct  VectorSimilarityVisitor  {
 template<vector_sim_type TYP, typename T, typename I = unsigned long>
 using vs_v = VectorSimilarityVisitor<TYP, T, I>;
 
+// ----------------------------------------------------------------------------
+
+template<std::size_t K,
+         typename T, typename I = unsigned long, std::size_t A = 0>
+struct  SpectralClusteringVisitor  {
+
+public:
+
+    DEFINE_VISIT_BASIC_TYPES
+
+    using similarity_func =
+        std::function<double(const value_type &x,
+                             const value_type &y,
+                             double sigma)>;
+    using seed_t = std::random_device::result_type;
+    using cluster_type = std::array<VectorConstPtrView<value_type, A>, K>;
+    using order_type =
+        std::array<std::vector<
+                       size_type,
+                       typename allocator_declare<size_type, A>::type>, K>;
+
+private:
+
+    using sym_mat_t = Matrix<T, matrix_orient::row_major, true>;
+    using mat_t = Matrix<T, matrix_orient::row_major, false>;
+    template<typename U>
+    using vec_t = std::vector<U, typename allocator_declare<U, A>::type>;
+
+    const size_type iter_num_;
+    const seed_t    seed_;
+    const double    sigma_;
+    similarity_func sfunc_;
+    cluster_type    clusters_ { };       // K Clusters
+    order_type      clusters_idxs_ { };  // K Clusters indices
+
+    template<typename H>
+    inline sym_mat_t
+    calc_similarity_(const H &column_begin, size_type col_s)  {
+
+        sym_mat_t   sim_mat(col_s, col_s);
+
+        for (long r = 0; r < sim_mat.rows(); ++r)
+            for (long c = r; c < sim_mat.cols(); ++c)
+                sim_mat(r, c) =
+                    sfunc_(*(column_begin + c), *(column_begin + r), sigma_);
+
+        return (sim_mat);
+    }
+
+    inline vec_t<T>
+    calc_degree_(const sym_mat_t &sim_mat)  {
+
+        vec_t<T>    deg_mat(sim_mat.rows());
+
+        for (long r = 0; r < sim_mat.rows(); ++r)  {
+            value_type  sum { 0 };
+
+            for (long c = 0; c < sim_mat.cols(); ++c)
+                sum += sim_mat(r, c);
+            deg_mat[r] = sum;
+            // deg_mat[r] = T(1) / std::sqrt(sum); // needs I - D * W * D
+        }
+
+        return (deg_mat);
+    }
+
+    inline mat_t
+    calc_laplacian_(const vec_t<T> &deg_mat, const sym_mat_t &sim_mat)  {
+
+        mat_t   lap_mat(sim_mat.rows(), sim_mat.cols());
+
+        for (long r = 0; r < sim_mat.rows(); ++r)
+            for (long c = 0; c < sim_mat.cols(); ++c)
+                if (r == c)
+                    lap_mat(r, r) = deg_mat[r] - sim_mat(r, r);
+                else
+                    lap_mat(r, c) = -sim_mat(r, c);
+
+        return (lap_mat);
+    }
+
+    inline double
+    distance_func_(const mat_t &x, const mat_t &y,
+                   long xr, long yr, long cols)  {
+
+        value_type  val { 0 };
+
+        for (long c = 0; c < cols; ++c) {
+            const value_type    diff = x(xr, c) - y(yr, c);
+
+            val += diff * diff;
+        }
+        return (val);
+    }
+
+    inline vec_t<long>
+    do_kmeans_(const mat_t &eigenvecs)  {
+
+        const long      rows = eigenvecs.rows();  // Samples
+        const long      cols = eigenvecs.cols();  // dimensions
+        vec_t<long>     cluster_idxs (rows, -1L);
+        constexpr long  k = long(K);
+
+        std::random_device                  rd;
+        std::mt19937                        gen (
+            (seed_ != seed_t(-1)) ? seed_ : rd());
+        std::uniform_int_distribution<long> rd_gen (0, rows - 1);
+
+        // Copy the top k rows of eigen vector.
+        //
+        mat_t   means { k, cols };
+
+        for (long r = 0; r < k; ++r)
+            for (long c = 0; c < cols; ++c)
+                means(r, c) = eigenvecs(r, c);
+
+        for (size_type iter = 0; iter < iter_num_; ++iter)  {
+             // Assign cluster_idxs based on closest means
+             //
+             for (long r = 0; r < rows; ++r) {
+                 double best_distance = std::numeric_limits<double>::max();
+
+                 for (long rr = 0; rr < k; ++rr) {
+                     const double   distance =
+                         distance_func_(eigenvecs, means, r, rr, cols);
+
+                     if (distance < best_distance) {
+                         best_distance = distance;
+                         cluster_idxs[r] = rr;
+                     }
+                 }
+             }
+
+             // Update means
+             //
+             mat_t          new_means { k, cols };
+             vec_t<long>    counts (k, 0L);
+
+             for (long r = 0; r < rows; ++r) {
+                 for (long c = 0; c < cols; ++c)
+                     new_means(cluster_idxs[r], c) += eigenvecs(r, c);
+                 counts[cluster_idxs[r]]++;
+             }
+
+             for (int r = 0; r < k; ++r) {
+                 if (counts[r] > 0)  {
+                     for (long c = 0; c < cols; ++c)
+                         new_means(r, c) /= T(counts[r]);
+                 }
+                 else  { // Reinitialize centroid if no points assigned
+                     const auto rr = rd_gen(gen);
+
+                     for (long c = 0; c < cols; ++c)
+                         new_means(r, c) = eigenvecs(rr, c);
+                 }
+             }
+             if ((means - new_means).norm() <= 0.0000001)  break;
+
+             means = new_means;
+        }
+
+        return (cluster_idxs);
+    }
+
+public:
+
+    template<typename IV, typename H>
+    inline void
+    operator() (const IV &idx_begin, const IV &idx_end,
+                const H &column_begin, const H &column_end)  {
+
+        GET_COL_SIZE
+
+#ifdef HMDF_SANITY_EXCEPTIONS
+        if (col_s <= K)
+           throw DataFrameError("SpectralClusteringVisitor: "
+                                "Data size must be bigger than K");
+#endif // HMDF_SANITY_EXCEPTIONS
+
+        mat_t   eigenvecs;
+
+        {
+            const auto  sim_mat = calc_similarity_(column_begin, col_s);
+            const auto  deg_mat = calc_degree_(sim_mat);
+            const auto  lap_mat = calc_laplacian_(deg_mat, sim_mat);
+            mat_t       eigenvals;
+
+            lap_mat.eigen_space(eigenvals, eigenvecs, true);
+        }  // Getting rid of the big things we don't need anymore
+
+        const auto  cluster_idxs = do_kmeans_(eigenvecs);
+
+        // Update the accessible data
+        //
+        for (size_type i = 0; i < K; ++i)  {
+            const auto  val  = col_s / K + 10;
+
+            clusters_[i].reserve(val);
+            clusters_idxs_[i].reserve(val);
+        }
+        for (size_type i = 0; i < cluster_idxs.size(); ++i)  {
+            clusters_[cluster_idxs[i]].push_back(&(*(column_begin + i)));
+            clusters_idxs_[cluster_idxs[i]].push_back(i);
+        }
+    }
+
+    inline void pre()  {
+
+        for (auto &iter : clusters_) iter.clear();
+        for (auto &iter : clusters_idxs_) iter.clear();
+    }
+    inline void post()  {  }
+    inline const cluster_type &get_result() const  { return (clusters_); }
+    inline cluster_type &get_result()  { return (clusters_); }
+    inline const order_type &
+    get_clusters_idxs() const  { return (clusters_idxs_); }
+
+    SpectralClusteringVisitor(
+        size_type num_of_iter,
+        double sigma,
+        similarity_func sf =
+            [](const value_type &x,
+               const value_type &y,
+               double sigma) -> double  {
+                return (std::exp(-((x - y) * (x - y)) / (2 * sigma * sigma)));
+            },
+        seed_t seed = seed_t(-1))
+        : iter_num_(num_of_iter),
+          seed_(seed),
+          sigma_(sigma),
+          sfunc_(sf)  {  }
+};
+
+template<std::size_t K, typename T,
+         typename I = unsigned long, std::size_t A = 0>
+using spect_v = SpectralClusteringVisitor<K, T, I, A>;
+
 } // namespace hmdf
 
 // ----------------------------------------------------------------------------

diff --git a/include/DataFrame/Utils/Matrix.h b/include/DataFrame/Utils/Matrix.h
@@ -63,9 +63,10 @@ class   Matrix  {
     using self_t = Matrix<value_type, MO>;
 
     using trans_result_t =
-        typename std::conditional<MO == matrix_orient::column_major,
-                                  Matrix<T, matrix_orient::row_major>,
-                                  Matrix<T, matrix_orient::column_major>>::type;
+        typename std::conditional<
+            MO == matrix_orient::column_major,
+            Matrix<T, matrix_orient::row_major>,
+            Matrix<T, matrix_orient::column_major>>::type;
 
     Matrix() = default;
     Matrix(size_type rows, size_type cols, const_reference def_v = T());
@@ -123,6 +124,12 @@ class   Matrix  {
     //
     Matrix inverse() const;
 
+    // Frobenius Norm:
+    // The Frobenius norm of a matrix is the square root of the sum of
+    // the squares of the values of the elements of the matrix.
+    //
+    value_type norm() const noexcept;
+
     // Degree matrix is a square diagonal matrix where each diagonal value
     // represents the number of connections in a row of an adjacency matrix.
     //

diff --git a/include/DataFrame/Utils/Matrix.tcc b/include/DataFrame/Utils/Matrix.tcc
@@ -404,6 +404,65 @@ Matrix<T, MO, IS_SYM>::inverse() const  {
 
 // ----------------------------------------------------------------------------
 
+template<typename T,  matrix_orient MO, bool IS_SYM>
+Matrix<T, MO, IS_SYM>::value_type
+Matrix<T, MO, IS_SYM>::norm() const noexcept  {
+
+    value_type  result { 0 };
+
+    if constexpr (IS_SYM)  {
+        if constexpr (MO == matrix_orient::column_major)  {
+            for (size_type c = 0; c < cols(); ++c)  {
+                for (size_type r = c + 1; r < rows(); ++r)  {
+                    const auto  val = at(r, c);
+
+                    result += val * val;
+                }
+            }
+        }
+        else  {
+            for (size_type r = 0; r < rows(); ++r)  {
+                for (size_type c = r + 1; c < cols(); ++c)  {
+                    const auto  val = at(r, c);
+
+                    result += val * val;
+                }
+            }
+        }
+
+        result *= T(2);
+        for (size_type c = 0; c < cols(); ++c)  {
+            const auto  val = at(c, c);
+
+            result += val * val;
+        }
+    }
+    else  {
+        if constexpr (MO == matrix_orient::column_major)  {
+            for (size_type c = 0; c < cols(); ++c)  {
+                for (size_type r = 0; r < rows(); ++r)  {
+                    const auto  val = at(r, c);
+
+                    result += val * val;
+                }
+            }
+        }
+        else  {
+            for (size_type r = 0; r < rows(); ++r)  {
+                for (size_type c = 0; c < cols(); ++c)  {
+                    const auto  val = at(r, c);
+
+                    result += val * val;
+                }
+            }
+        }
+    }
+
+    return (std::sqrt(result));
+}
+
+// ----------------------------------------------------------------------------
+
 template<typename T,  matrix_orient MO, bool IS_SYM>
 Matrix<T, MO, IS_SYM>
 Matrix<T, MO, IS_SYM>::degree_matrix() const  {