Skip to content

Commit

Permalink
Merge pull request #343 from hosseinmoein/Hossein/Spectral
Browse files Browse the repository at this point in the history
Implementing SpectralClusteringVisitor visitor
  • Loading branch information
hosseinmoein authored Jan 2, 2025
2 parents ce4d64b + 7f9c1ef commit 0d937aa
Show file tree
Hide file tree
Showing 11 changed files with 667 additions and 24 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ Each program has three identical parts. First it generates and populates 3 colum
The maximum dataset I could load into Polars was 300m rows per column. Any bigger dataset blew up the memory and caused OS to kill it. I ran C++ DataFrame with 10b rows per column and I am sure it would have run with bigger datasets too. So, I was forced to run both with 300m rows to compare.
I ran each test 4 times and took the best time. Polars numbers varied a lot from one run to another, especially calculation and selection times. C++ DataFrame numbers were significantly more consistent.

| | [<B>C++ DataFrame</B>](https://github.com/hosseinmoein/DataFrame/blob/master/benchmarks/dataframe_performance.cc) | [<B>Polars&nbsp;&nbsp;&nbsp;</B>](https://github.com/hosseinmoein/DataFrame/blob/master/benchmarks/polars_performance.py) | [<B>Pandas&nbsp;&nbsp;&nbsp;</B>](https://github.com/hosseinmoein/DataFrame/blob/master/benchmarks/pandas_performance.py) |
| | [<B>C++ DataFrame</B>](https://github.com/hosseinmoein/DataFrame/blob/master/benchmarks/dataframe_performance.cc) | [<B>Polars</B>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;](https://github.com/hosseinmoein/DataFrame/blob/master/benchmarks/polars_performance.py) | [<B>Pandas</B>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;](https://github.com/hosseinmoein/DataFrame/blob/master/benchmarks/pandas_performance.py) |
| :-- | ---: | ---: | ---: |
| Data generation/load time | 26.9459 secs | 28.4686 secs | 36.6799 secs |
| Calculation time | 1.2602 secs | 4.8766 secs | 40.3264 secs |
Expand Down
4 changes: 4 additions & 0 deletions docs/HTML/DataFrame.html
Original file line number Diff line number Diff line change
Expand Up @@ -1115,6 +1115,10 @@ <H2 ID="2"><font color="blue">API Reference with code samples <font size="+4">&#
<td title="Performs mean-shift clustring">struct <a href="https://htmlpreview.github.io/?https://github.com/hosseinmoein/DataFrame/blob/master/docs/HTML/MeanShiftVisitor.html">MeanShiftVisitor</a>{}</td>
</tr>

<tr class="item" onmouseover="this.style.backgroundColor='#ffff66';" onmouseout="this.style.backgroundColor='#d4e3e5';">
<td title="Performs spectral clustring">struct <a href="https://htmlpreview.github.io/?https://github.com/hosseinmoein/DataFrame/blob/master/docs/HTML/SpectralClusteringVisitor.html">SpectralClusteringVisitor</a>{}</td>
</tr>

</table>
</div>

Expand Down
154 changes: 154 additions & 0 deletions docs/HTML/SpectralClusteringVisitor.html

Large diffs are not rendered by default.

1 change: 0 additions & 1 deletion docs/HTML/shuffle.html
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,6 @@
</td>
<td width = "33.3%">
<B>also_shuffle_index</B>: If true, it shuffles the named column(s) and the index column. Otherwise, index is not shuffled.<BR>
<B>N</B>: Number of named columns<BR>
<B>Ts</B>: The list of types for all columns. A type should be specified only once.
</td>
</tr>
Expand Down
1 change: 0 additions & 1 deletion include/DataFrame/DataFrameFinancialVisitors.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
// Hossein Moein

// January 08, 2020
/*
Copyright (c) 2019-2026, Hossein Moein
Expand Down
238 changes: 238 additions & 0 deletions include/DataFrame/DataFrameMLVisitors.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once

#include <DataFrame/DataFrameStatsVisitors.h>
#include <DataFrame/Utils/Matrix.h>
#include <DataFrame/Vectors/VectorPtrView.h>

#include <algorithm>
Expand Down Expand Up @@ -2699,6 +2700,243 @@ struct VectorSimilarityVisitor {
template<vector_sim_type TYP, typename T, typename I = unsigned long>
using vs_v = VectorSimilarityVisitor<TYP, T, I>;

// ----------------------------------------------------------------------------

template<std::size_t K,
typename T, typename I = unsigned long, std::size_t A = 0>
struct SpectralClusteringVisitor {

public:

DEFINE_VISIT_BASIC_TYPES

using similarity_func =
std::function<double(const value_type &x,
const value_type &y,
double sigma)>;
using seed_t = std::random_device::result_type;
using cluster_type = std::array<VectorConstPtrView<value_type, A>, K>;
using order_type =
std::array<std::vector<
size_type,
typename allocator_declare<size_type, A>::type>, K>;

private:

using sym_mat_t = Matrix<T, matrix_orient::row_major, true>;
using mat_t = Matrix<T, matrix_orient::row_major, false>;
template<typename U>
using vec_t = std::vector<U, typename allocator_declare<U, A>::type>;

const size_type iter_num_;
const seed_t seed_;
const double sigma_;
similarity_func sfunc_;
cluster_type clusters_ { }; // K Clusters
order_type clusters_idxs_ { }; // K Clusters indices

template<typename H>
inline sym_mat_t
calc_similarity_(const H &column_begin, size_type col_s) {

sym_mat_t sim_mat(col_s, col_s);

for (long r = 0; r < sim_mat.rows(); ++r)
for (long c = r; c < sim_mat.cols(); ++c)
sim_mat(r, c) =
sfunc_(*(column_begin + c), *(column_begin + r), sigma_);

return (sim_mat);
}

inline vec_t<T>
calc_degree_(const sym_mat_t &sim_mat) {

vec_t<T> deg_mat(sim_mat.rows());

for (long r = 0; r < sim_mat.rows(); ++r) {
value_type sum { 0 };

for (long c = 0; c < sim_mat.cols(); ++c)
sum += sim_mat(r, c);
deg_mat[r] = sum;
// deg_mat[r] = T(1) / std::sqrt(sum); // needs I - D * W * D
}

return (deg_mat);
}

inline mat_t
calc_laplacian_(const vec_t<T> &deg_mat, const sym_mat_t &sim_mat) {

mat_t lap_mat(sim_mat.rows(), sim_mat.cols());

for (long r = 0; r < sim_mat.rows(); ++r)
for (long c = 0; c < sim_mat.cols(); ++c)
if (r == c)
lap_mat(r, r) = deg_mat[r] - sim_mat(r, r);
else
lap_mat(r, c) = -sim_mat(r, c);

return (lap_mat);
}

inline double
distance_func_(const mat_t &x, const mat_t &y,
long xr, long yr, long cols) {

value_type val { 0 };

for (long c = 0; c < cols; ++c) {
const value_type diff = x(xr, c) - y(yr, c);

val += diff * diff;
}
return (val);
}

inline vec_t<long>
do_kmeans_(const mat_t &eigenvecs) {

const long rows = eigenvecs.rows(); // Samples
const long cols = eigenvecs.cols(); // dimensions
vec_t<long> cluster_idxs (rows, -1L);
constexpr long k = long(K);

std::random_device rd;
std::mt19937 gen (
(seed_ != seed_t(-1)) ? seed_ : rd());
std::uniform_int_distribution<long> rd_gen (0, rows - 1);

// Copy the top k rows of eigen vector.
//
mat_t means { k, cols };

for (long r = 0; r < k; ++r)
for (long c = 0; c < cols; ++c)
means(r, c) = eigenvecs(r, c);

for (size_type iter = 0; iter < iter_num_; ++iter) {
// Assign cluster_idxs based on closest means
//
for (long r = 0; r < rows; ++r) {
double best_distance = std::numeric_limits<double>::max();

for (long rr = 0; rr < k; ++rr) {
const double distance =
distance_func_(eigenvecs, means, r, rr, cols);

if (distance < best_distance) {
best_distance = distance;
cluster_idxs[r] = rr;
}
}
}

// Update means
//
mat_t new_means { k, cols };
vec_t<long> counts (k, 0L);

for (long r = 0; r < rows; ++r) {
for (long c = 0; c < cols; ++c)
new_means(cluster_idxs[r], c) += eigenvecs(r, c);
counts[cluster_idxs[r]]++;
}

for (int r = 0; r < k; ++r) {
if (counts[r] > 0) {
for (long c = 0; c < cols; ++c)
new_means(r, c) /= T(counts[r]);
}
else { // Reinitialize centroid if no points assigned
const auto rr = rd_gen(gen);

for (long c = 0; c < cols; ++c)
new_means(r, c) = eigenvecs(rr, c);
}
}
if ((means - new_means).norm() <= 0.0000001) break;

means = new_means;
}

return (cluster_idxs);
}

public:

template<typename IV, typename H>
inline void
operator() (const IV &idx_begin, const IV &idx_end,
const H &column_begin, const H &column_end) {

GET_COL_SIZE

#ifdef HMDF_SANITY_EXCEPTIONS
if (col_s <= K)
throw DataFrameError("SpectralClusteringVisitor: "
"Data size must be bigger than K");
#endif // HMDF_SANITY_EXCEPTIONS

mat_t eigenvecs;

{
const auto sim_mat = calc_similarity_(column_begin, col_s);
const auto deg_mat = calc_degree_(sim_mat);
const auto lap_mat = calc_laplacian_(deg_mat, sim_mat);
mat_t eigenvals;

lap_mat.eigen_space(eigenvals, eigenvecs, true);
} // Getting rid of the big things we don't need anymore

const auto cluster_idxs = do_kmeans_(eigenvecs);

// Update the accessible data
//
for (size_type i = 0; i < K; ++i) {
const auto val = col_s / K + 10;

clusters_[i].reserve(val);
clusters_idxs_[i].reserve(val);
}
for (size_type i = 0; i < cluster_idxs.size(); ++i) {
clusters_[cluster_idxs[i]].push_back(&(*(column_begin + i)));
clusters_idxs_[cluster_idxs[i]].push_back(i);
}
}

inline void pre() {

for (auto &iter : clusters_) iter.clear();
for (auto &iter : clusters_idxs_) iter.clear();
}
inline void post() { }
inline const cluster_type &get_result() const { return (clusters_); }
inline cluster_type &get_result() { return (clusters_); }
inline const order_type &
get_clusters_idxs() const { return (clusters_idxs_); }

SpectralClusteringVisitor(
size_type num_of_iter,
double sigma,
similarity_func sf =
[](const value_type &x,
const value_type &y,
double sigma) -> double {
return (std::exp(-((x - y) * (x - y)) / (2 * sigma * sigma)));
},
seed_t seed = seed_t(-1))
: iter_num_(num_of_iter),
seed_(seed),
sigma_(sigma),
sfunc_(sf) { }
};

template<std::size_t K, typename T,
typename I = unsigned long, std::size_t A = 0>
using spect_v = SpectralClusteringVisitor<K, T, I, A>;

} // namespace hmdf

// ----------------------------------------------------------------------------
Expand Down
13 changes: 10 additions & 3 deletions include/DataFrame/Utils/Matrix.h
Original file line number Diff line number Diff line change
Expand Up @@ -63,9 +63,10 @@ class Matrix {
using self_t = Matrix<value_type, MO>;

using trans_result_t =
typename std::conditional<MO == matrix_orient::column_major,
Matrix<T, matrix_orient::row_major>,
Matrix<T, matrix_orient::column_major>>::type;
typename std::conditional<
MO == matrix_orient::column_major,
Matrix<T, matrix_orient::row_major>,
Matrix<T, matrix_orient::column_major>>::type;

Matrix() = default;
Matrix(size_type rows, size_type cols, const_reference def_v = T());
Expand Down Expand Up @@ -123,6 +124,12 @@ class Matrix {
//
Matrix inverse() const;

// Frobenius Norm:
// The Frobenius norm of a matrix is the square root of the sum of
// the squares of the values of the elements of the matrix.
//
value_type norm() const noexcept;

// Degree matrix is a square diagonal matrix where each diagonal value
// represents the number of connections in a row of an adjacency matrix.
//
Expand Down
59 changes: 59 additions & 0 deletions include/DataFrame/Utils/Matrix.tcc
Original file line number Diff line number Diff line change
Expand Up @@ -404,6 +404,65 @@ Matrix<T, MO, IS_SYM>::inverse() const {

// ----------------------------------------------------------------------------

template<typename T, matrix_orient MO, bool IS_SYM>
Matrix<T, MO, IS_SYM>::value_type
Matrix<T, MO, IS_SYM>::norm() const noexcept {

value_type result { 0 };

if constexpr (IS_SYM) {
if constexpr (MO == matrix_orient::column_major) {
for (size_type c = 0; c < cols(); ++c) {
for (size_type r = c + 1; r < rows(); ++r) {
const auto val = at(r, c);

result += val * val;
}
}
}
else {
for (size_type r = 0; r < rows(); ++r) {
for (size_type c = r + 1; c < cols(); ++c) {
const auto val = at(r, c);

result += val * val;
}
}
}

result *= T(2);
for (size_type c = 0; c < cols(); ++c) {
const auto val = at(c, c);

result += val * val;
}
}
else {
if constexpr (MO == matrix_orient::column_major) {
for (size_type c = 0; c < cols(); ++c) {
for (size_type r = 0; r < rows(); ++r) {
const auto val = at(r, c);

result += val * val;
}
}
}
else {
for (size_type r = 0; r < rows(); ++r) {
for (size_type c = 0; c < cols(); ++c) {
const auto val = at(r, c);

result += val * val;
}
}
}
}

return (std::sqrt(result));
}

// ----------------------------------------------------------------------------

template<typename T, matrix_orient MO, bool IS_SYM>
Matrix<T, MO, IS_SYM>
Matrix<T, MO, IS_SYM>::degree_matrix() const {
Expand Down
Loading

0 comments on commit 0d937aa

Please sign in to comment.