Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implementing SpectralClusteringVisitor visitor #343

Merged
merged 7 commits into from
Jan 2, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ Each program has three identical parts. First it generates and populates 3 colum
The maximum dataset I could load into Polars was 300m rows per column. Any bigger dataset blew up the memory and caused OS to kill it. I ran C++ DataFrame with 10b rows per column and I am sure it would have run with bigger datasets too. So, I was forced to run both with 300m rows to compare.
I ran each test 4 times and took the best time. Polars numbers varied a lot from one run to another, especially calculation and selection times. C++ DataFrame numbers were significantly more consistent.

| | [<B>C++ DataFrame</B>](https://github.com/hosseinmoein/DataFrame/blob/master/benchmarks/dataframe_performance.cc) | [<B>Polars&nbsp;&nbsp;&nbsp;</B>](https://github.com/hosseinmoein/DataFrame/blob/master/benchmarks/polars_performance.py) | [<B>Pandas&nbsp;&nbsp;&nbsp;</B>](https://github.com/hosseinmoein/DataFrame/blob/master/benchmarks/pandas_performance.py) |
| | [<B>C++ DataFrame</B>](https://github.com/hosseinmoein/DataFrame/blob/master/benchmarks/dataframe_performance.cc) | [<B>Polars</B>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;](https://github.com/hosseinmoein/DataFrame/blob/master/benchmarks/polars_performance.py) | [<B>Pandas</B>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;](https://github.com/hosseinmoein/DataFrame/blob/master/benchmarks/pandas_performance.py) |
| :-- | ---: | ---: | ---: |
| Data generation/load time | 26.9459 secs | 28.4686 secs | 36.6799 secs |
| Calculation time | 1.2602 secs | 4.8766 secs | 40.3264 secs |
Expand Down
4 changes: 4 additions & 0 deletions docs/HTML/DataFrame.html
Original file line number Diff line number Diff line change
Expand Up @@ -1115,6 +1115,10 @@ <H2 ID="2"><font color="blue">API Reference with code samples <font size="+4">&#
<td title="Performs mean-shift clustring">struct <a href="https://htmlpreview.github.io/?https://github.com/hosseinmoein/DataFrame/blob/master/docs/HTML/MeanShiftVisitor.html">MeanShiftVisitor</a>{}</td>
</tr>

<tr class="item" onmouseover="this.style.backgroundColor='#ffff66';" onmouseout="this.style.backgroundColor='#d4e3e5';">
<td title="Performs spectral clustring">struct <a href="https://htmlpreview.github.io/?https://github.com/hosseinmoein/DataFrame/blob/master/docs/HTML/SpectralClusteringVisitor.html">SpectralClusteringVisitor</a>{}</td>
</tr>

</table>
</div>

Expand Down
154 changes: 154 additions & 0 deletions docs/HTML/SpectralClusteringVisitor.html

Large diffs are not rendered by default.

1 change: 0 additions & 1 deletion docs/HTML/shuffle.html
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,6 @@
</td>
<td width = "33.3%">
<B>also_shuffle_index</B>: If true, it shuffles the named column(s) and the index column. Otherwise, index is not shuffled.<BR>
<B>N</B>: Number of named columns<BR>
<B>Ts</B>: The list of types for all columns. A type should be specified only once.
</td>
</tr>
Expand Down
1 change: 0 additions & 1 deletion include/DataFrame/DataFrameFinancialVisitors.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
// Hossein Moein

// January 08, 2020
/*
Copyright (c) 2019-2026, Hossein Moein
Expand Down
238 changes: 238 additions & 0 deletions include/DataFrame/DataFrameMLVisitors.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once

#include <DataFrame/DataFrameStatsVisitors.h>
#include <DataFrame/Utils/Matrix.h>
#include <DataFrame/Vectors/VectorPtrView.h>

#include <algorithm>
Expand Down Expand Up @@ -2699,6 +2700,243 @@ struct VectorSimilarityVisitor {
template<vector_sim_type TYP, typename T, typename I = unsigned long>
using vs_v = VectorSimilarityVisitor<TYP, T, I>;

// ----------------------------------------------------------------------------

template<std::size_t K,
typename T, typename I = unsigned long, std::size_t A = 0>
struct SpectralClusteringVisitor {

public:

DEFINE_VISIT_BASIC_TYPES

using similarity_func =
std::function<double(const value_type &x,
const value_type &y,
double sigma)>;
using seed_t = std::random_device::result_type;
using cluster_type = std::array<VectorConstPtrView<value_type, A>, K>;
using order_type =
std::array<std::vector<
size_type,
typename allocator_declare<size_type, A>::type>, K>;

private:

using sym_mat_t = Matrix<T, matrix_orient::row_major, true>;
using mat_t = Matrix<T, matrix_orient::row_major, false>;
template<typename U>
using vec_t = std::vector<U, typename allocator_declare<U, A>::type>;

const size_type iter_num_;
const seed_t seed_;
const double sigma_;
similarity_func sfunc_;
cluster_type clusters_ { }; // K Clusters
order_type clusters_idxs_ { }; // K Clusters indices

template<typename H>
inline sym_mat_t
calc_similarity_(const H &column_begin, size_type col_s) {

sym_mat_t sim_mat(col_s, col_s);

for (long r = 0; r < sim_mat.rows(); ++r)
for (long c = r; c < sim_mat.cols(); ++c)
sim_mat(r, c) =
sfunc_(*(column_begin + c), *(column_begin + r), sigma_);

return (sim_mat);
}

inline vec_t<T>
calc_degree_(const sym_mat_t &sim_mat) {

vec_t<T> deg_mat(sim_mat.rows());

for (long r = 0; r < sim_mat.rows(); ++r) {
value_type sum { 0 };

for (long c = 0; c < sim_mat.cols(); ++c)
sum += sim_mat(r, c);
deg_mat[r] = sum;
// deg_mat[r] = T(1) / std::sqrt(sum); // needs I - D * W * D
}

return (deg_mat);
}

inline mat_t
calc_laplacian_(const vec_t<T> &deg_mat, const sym_mat_t &sim_mat) {

mat_t lap_mat(sim_mat.rows(), sim_mat.cols());

for (long r = 0; r < sim_mat.rows(); ++r)
for (long c = 0; c < sim_mat.cols(); ++c)
if (r == c)
lap_mat(r, r) = deg_mat[r] - sim_mat(r, r);
else
lap_mat(r, c) = -sim_mat(r, c);

return (lap_mat);
}

inline double
distance_func_(const mat_t &x, const mat_t &y,
long xr, long yr, long cols) {

value_type val { 0 };

for (long c = 0; c < cols; ++c) {
const value_type diff = x(xr, c) - y(yr, c);

val += diff * diff;
}
return (val);
}

inline vec_t<long>
do_kmeans_(const mat_t &eigenvecs) {

const long rows = eigenvecs.rows(); // Samples
const long cols = eigenvecs.cols(); // dimensions
vec_t<long> cluster_idxs (rows, -1L);
constexpr long k = long(K);

std::random_device rd;
std::mt19937 gen (
(seed_ != seed_t(-1)) ? seed_ : rd());
std::uniform_int_distribution<long> rd_gen (0, rows - 1);

// Copy the top k rows of eigen vector.
//
mat_t means { k, cols };

for (long r = 0; r < k; ++r)
for (long c = 0; c < cols; ++c)
means(r, c) = eigenvecs(r, c);

for (size_type iter = 0; iter < iter_num_; ++iter) {
// Assign cluster_idxs based on closest means
//
for (long r = 0; r < rows; ++r) {
double best_distance = std::numeric_limits<double>::max();

for (long rr = 0; rr < k; ++rr) {
const double distance =
distance_func_(eigenvecs, means, r, rr, cols);

if (distance < best_distance) {
best_distance = distance;
cluster_idxs[r] = rr;
}
}
}

// Update means
//
mat_t new_means { k, cols };
vec_t<long> counts (k, 0L);

for (long r = 0; r < rows; ++r) {
for (long c = 0; c < cols; ++c)
new_means(cluster_idxs[r], c) += eigenvecs(r, c);
counts[cluster_idxs[r]]++;
}

for (int r = 0; r < k; ++r) {
if (counts[r] > 0) {
for (long c = 0; c < cols; ++c)
new_means(r, c) /= T(counts[r]);
}
else { // Reinitialize centroid if no points assigned
const auto rr = rd_gen(gen);

for (long c = 0; c < cols; ++c)
new_means(r, c) = eigenvecs(rr, c);
}
}
if ((means - new_means).norm() <= 0.0000001) break;

means = new_means;
}

return (cluster_idxs);
}

public:

template<typename IV, typename H>
inline void
operator() (const IV &idx_begin, const IV &idx_end,
const H &column_begin, const H &column_end) {

GET_COL_SIZE

#ifdef HMDF_SANITY_EXCEPTIONS
if (col_s <= K)
throw DataFrameError("SpectralClusteringVisitor: "
"Data size must be bigger than K");
#endif // HMDF_SANITY_EXCEPTIONS

mat_t eigenvecs;

{
const auto sim_mat = calc_similarity_(column_begin, col_s);
const auto deg_mat = calc_degree_(sim_mat);
const auto lap_mat = calc_laplacian_(deg_mat, sim_mat);
mat_t eigenvals;

lap_mat.eigen_space(eigenvals, eigenvecs, true);
} // Getting rid of the big things we don't need anymore

const auto cluster_idxs = do_kmeans_(eigenvecs);

// Update the accessible data
//
for (size_type i = 0; i < K; ++i) {
const auto val = col_s / K + 10;

clusters_[i].reserve(val);
clusters_idxs_[i].reserve(val);
}
for (size_type i = 0; i < cluster_idxs.size(); ++i) {
clusters_[cluster_idxs[i]].push_back(&(*(column_begin + i)));
clusters_idxs_[cluster_idxs[i]].push_back(i);
}
}

inline void pre() {

for (auto &iter : clusters_) iter.clear();
for (auto &iter : clusters_idxs_) iter.clear();
}
inline void post() { }
inline const cluster_type &get_result() const { return (clusters_); }
inline cluster_type &get_result() { return (clusters_); }
inline const order_type &
get_clusters_idxs() const { return (clusters_idxs_); }

SpectralClusteringVisitor(
size_type num_of_iter,
double sigma,
similarity_func sf =
[](const value_type &x,
const value_type &y,
double sigma) -> double {
return (std::exp(-((x - y) * (x - y)) / (2 * sigma * sigma)));
},
seed_t seed = seed_t(-1))
: iter_num_(num_of_iter),
seed_(seed),
sigma_(sigma),
sfunc_(sf) { }
};

template<std::size_t K, typename T,
typename I = unsigned long, std::size_t A = 0>
using spect_v = SpectralClusteringVisitor<K, T, I, A>;

} // namespace hmdf

// ----------------------------------------------------------------------------
Expand Down
13 changes: 10 additions & 3 deletions include/DataFrame/Utils/Matrix.h
Original file line number Diff line number Diff line change
Expand Up @@ -63,9 +63,10 @@ class Matrix {
using self_t = Matrix<value_type, MO>;

using trans_result_t =
typename std::conditional<MO == matrix_orient::column_major,
Matrix<T, matrix_orient::row_major>,
Matrix<T, matrix_orient::column_major>>::type;
typename std::conditional<
MO == matrix_orient::column_major,
Matrix<T, matrix_orient::row_major>,
Matrix<T, matrix_orient::column_major>>::type;

Matrix() = default;
Matrix(size_type rows, size_type cols, const_reference def_v = T());
Expand Down Expand Up @@ -123,6 +124,12 @@ class Matrix {
//
Matrix inverse() const;

// Frobenius Norm:
// The Frobenius norm of a matrix is the square root of the sum of
// the squares of the values of the elements of the matrix.
//
value_type norm() const noexcept;

// Degree matrix is a square diagonal matrix where each diagonal value
// represents the number of connections in a row of an adjacency matrix.
//
Expand Down
59 changes: 59 additions & 0 deletions include/DataFrame/Utils/Matrix.tcc
Original file line number Diff line number Diff line change
Expand Up @@ -404,6 +404,65 @@ Matrix<T, MO, IS_SYM>::inverse() const {

// ----------------------------------------------------------------------------

template<typename T, matrix_orient MO, bool IS_SYM>
Matrix<T, MO, IS_SYM>::value_type
Matrix<T, MO, IS_SYM>::norm() const noexcept {

value_type result { 0 };

if constexpr (IS_SYM) {
if constexpr (MO == matrix_orient::column_major) {
for (size_type c = 0; c < cols(); ++c) {
for (size_type r = c + 1; r < rows(); ++r) {
const auto val = at(r, c);

result += val * val;
}
}
}
else {
for (size_type r = 0; r < rows(); ++r) {
for (size_type c = r + 1; c < cols(); ++c) {
const auto val = at(r, c);

result += val * val;
}
}
}

result *= T(2);
for (size_type c = 0; c < cols(); ++c) {
const auto val = at(c, c);

result += val * val;
}
}
else {
if constexpr (MO == matrix_orient::column_major) {
for (size_type c = 0; c < cols(); ++c) {
for (size_type r = 0; r < rows(); ++r) {
const auto val = at(r, c);

result += val * val;
}
}
}
else {
for (size_type r = 0; r < rows(); ++r) {
for (size_type c = 0; c < cols(); ++c) {
const auto val = at(r, c);

result += val * val;
}
}
}
}

return (std::sqrt(result));
}

// ----------------------------------------------------------------------------

template<typename T, matrix_orient MO, bool IS_SYM>
Matrix<T, MO, IS_SYM>
Matrix<T, MO, IS_SYM>::degree_matrix() const {
Expand Down
Loading
Loading