From c2a5916379503b20532555046d1059aa213dbc75 Mon Sep 17 00:00:00 2001 From: Hossein Moein Date: Mon, 16 Dec 2024 11:00:31 -0500 Subject: [PATCH] Implemented pca_by_eigen() --- README.md | 2 +- docs/HTML/DataFrame.html | 4 + docs/HTML/covariance_matrix.html | 2 +- docs/HTML/pca_by_eigen.html | 182 ++++++++++++++++++ include/DataFrame/DataFrame.h | 55 +++--- include/DataFrame/DataFrameTypes.h | 16 +- include/DataFrame/Internals/DataFrame_get.tcc | 99 ++++++++++ include/DataFrame/Utils/Matrix.h | 9 - include/DataFrame/Utils/Matrix.tcc | 4 +- test/dataframe_tester_4.cc | 72 +++++++ test/matrix_tester.cc | 25 +-- 11 files changed, 407 insertions(+), 63 deletions(-) create mode 100644 docs/HTML/pca_by_eigen.html diff --git a/README.md b/README.md index 8325c819..f24a0390 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ## DataFrame documentation with code samples This is a C++ analytical library designed for data analysis similar to libraries in Python and R. For example, you would compare this to [Pandas](https://pandas.pydata.org) or [R data.frame](https://www.w3schools.com/r/r_data_frames.asp)
You can slice the data in many different ways. You can join, merge, group-by the data. You can run various statistical, summarization, financial, and ML algorithms on the data. You can add your custom algorithms easily. You can multi-column sort, custom pick and delete the data. And more …
-DataFrame also includes a large collection of analytical algorithms in form of visitors. These are from basic stats such as Mean, Std Deviation, Return, … to more involved analysis such as Affinity Propagation, Polynomial Fit, Fast Fourier transform of arbitrary length … including a good collection of trading indicators. You can also easily add your own algorithms.
+DataFrame also includes a large collection of analytical algorithms in the form of visitors. These range from basic stats such as Mean, Std Deviation, Return, … to more involved analyses such as PCA, Polynomial Fit, Fast Fourier transform of arbitrary length … including a good collection of trading indicators. You can also easily add your own algorithms.
DataFrame also employs extensive multithreading in almost all of its APIs when dealing with large datasets. That makes DataFrame especially suitable for analyzing large amounts of data.
For basic operations to start you off, see [Hello World](examples/hello_world.cc). For a complete list of features with code samples, see documentation. diff --git a/docs/HTML/DataFrame.html b/docs/HTML/DataFrame.html index 9ac3a0e9..677e9059 100644 --- a/docs/HTML/DataFrame.html +++ b/docs/HTML/DataFrame.html @@ -325,6 +325,10 @@

API Reference with code samples pattern_match() + + pca_by_eigen() + + peaks() diff --git a/docs/HTML/covariance_matrix.html b/docs/HTML/covariance_matrix.html index 0dd3b729..8087e87e 100644 --- a/docs/HTML/covariance_matrix.html +++ b/docs/HTML/covariance_matrix.html @@ -49,7 +49,7 @@

 template<typename T>
 Matrix<T, matrix_orient::column_major>
-covariance_matrix(std::vector &&col_names,
+covariance_matrix(std::vector<const char *> &&col_names,
                   normalization_type norm_type =
                       normalization_type::none) const;
 
diff --git a/docs/HTML/pca_by_eigen.html b/docs/HTML/pca_by_eigen.html new file mode 100644 index 00000000..4fbe4fdf --- /dev/null +++ b/docs/HTML/pca_by_eigen.html @@ -0,0 +1,182 @@ + + + + + + + + + + Back to Documentations

+ + + + + + + + + + + +
Signature Description
+

+struct  PCAParams  {
+
+    normalization_type  norm_type { normalization_type::z_score };
+
+    // If populated (set above zero), number of top eigenvalues to keep.
+    //
+    long                num_comp_to_keep { 0 };
+
+    // If num_comp_to_keep is 0, the percentage of eigenvalues to keep.
+    // 0.9 means 90%.
+    //
+    double              pct_comp_to_keep { 0.9 };
+};
+        
+        
+
+ A structure containing the parameters to the pca_by_eigen() call
+
+ +
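As a minimal usage sketch (not part of the patch): PCAParams can be filled with C++20 designated initializers, since its fields have sensible defaults. The snippet below assumes the library's headers and the IBM.csv test file used elsewhere in this patch; StdDataFrame<std::string> matches the StrDataFrame alias used in the tests.

#include <DataFrame/DataFrame.h>

using namespace hmdf;

int main()  {

    // Keep exactly the 2 most significant eigen components ...
    const PCAParams by_count { .norm_type = normalization_type::z_score,
                               .num_comp_to_keep = 2 };

    // ... or leave num_comp_to_keep at 0 so pct_comp_to_keep applies:
    // keep as many components as needed to cover 95% of the (absolute)
    // eigenvalue mass.
    const PCAParams by_pct { .pct_comp_to_keep = 0.95 };

    StdDataFrame<std::string>   df;

    df.read("IBM.csv", io_format::csv2);

    const auto  pca = df.pca_by_eigen<double>(
        { "IBM_Close", "IBM_Open", "IBM_High", "IBM_Low" }, by_pct);

    return (0);
}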
+ + + + + + + + + + + + +
Signature Description Parameters
+

+template<typename T>
+Matrix<T, matrix_orient::column_major>
+pca_by_eigen(std::vector<const char *> &&col_names,
+             const PCAParams params = { }) const;
+
+
+ This uses eigenspace evaluation to calculate Principal Component Analysis (PCA). It returns a matrix whose columns are the reduced dimensions with the most significant information.

+ PCA is a dimensionality reduction method that is often applied to large data sets, transforming a large set of variables into a smaller one that still contains most of the information in the original set.
+ Reducing the number of variables of a data set naturally comes at the expense of accuracy, but the trick in dimensionality reduction is to trade a little accuracy for simplicity. Smaller data sets are easier to explore and visualize, and they make analyzing data points much easier and faster for machine learning algorithms, since there are no extraneous variables to process. +
+ T: Type of the named columns
+ col_names: Vector of column names
+ params: Parameters necessary for this operation
+
+ +
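To make the pct_comp_to_keep selection rule concrete before the example below, here is a small self-contained sketch of the logic the implementation uses: absolute eigenvalues are summed, then components are kept from the most significant end until the kept fraction reaches the threshold. The function name and sample values are illustrative only, not part of the library.

#include <cassert>
#include <cmath>
#include <cstddef>
#include <vector>

// Given eigenvalues sorted by ascending absolute value (the order
// eigen_space() produces with sort_values == true), count how many of
// the top components cover pct of the total absolute eigenvalue mass.
std::size_t
count_components(const std::vector<double> &eigenvals, double pct)  {

    double  total { 0 };

    for (const auto val : eigenvals)  total += std::fabs(val);

    double       kept { 0 };
    std::size_t  count { 0 };

    // Walk from the largest (last) eigenvalue backwards
    //
    for (auto it = eigenvals.rbegin(); it != eigenvals.rend(); ++it)  {
        kept += std::fabs(*it);
        count += 1;
        if ((kept / total) >= pct)  break;
    }
    return (count);
}

int main()  {

    // The last eigenvalue alone carries ~97% of the mass, so one
    // component suffices for a 0.9 threshold.
    assert(count_components({ 0.01, -0.05, 0.2, 9.7 }, 0.9) == 1);
    return (0);
}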
static void test_pca_by_eigen()  {
+
+    std::cout << "\nTesting pca_by_eigen( ) ..." << std::endl;
+
+    StrDataFrame    df;
+
+    try  {
+        df.read("IBM.csv", io_format::csv2);
+    }
+    catch (const DataFrameError &ex)  {
+        std::cout << ex.what() << std::endl;
+    }
+
+    const auto  pca_mat = df.pca_by_eigen<double>({ "IBM_Close", "IBM_Open", "IBM_High", "IBM_Low" });
+
+    // Dimensions were reduced to 1 containing at least 90% of the information.
+    // This makes sense, since these 4 columns are highly correlated.
+    //
+    assert(pca_mat.cols() == 1);
+    assert(pca_mat.rows() == 5031);
+    assert(std::fabs(pca_mat(0, 0) - 197.063) < 0.001);
+    assert(std::fabs(pca_mat(1, 0) - 200.875) < 0.001);
+    assert(std::fabs(pca_mat(491, 0) - 149.02) < 0.01);
+    assert(std::fabs(pca_mat(1348, 0) - 166.44) < 0.01);
+    assert(std::fabs(pca_mat(2677, 0) - 333.405) < 0.001);
+    assert(std::fabs(pca_mat(5029, 0) - 216.175) < 0.001);
+    assert(std::fabs(pca_mat(5030, 0) - 219.555) < 0.001);
+
+    const auto  pca_mat2 = df.pca_by_eigen<double>({ "IBM_Close", "IBM_Open", "IBM_High", "IBM_Low" },
+                                                   { .num_comp_to_keep = 3 });
+
+    // 3 most significant dimensions are kept.
+    // As you can see the first column is unchanged and clearly contains
+    // almost all of the information.
+    //
+    assert(pca_mat2.cols() == 3);
+    assert(pca_mat2.rows() == 5031);
+
+    assert(std::fabs(pca_mat2(0, 0) - 197.063) < 0.001);
+    assert(std::fabs(pca_mat2(0, 1) - -0.0951913) < 0.001);
+    assert(std::fabs(pca_mat2(0, 2) - 1.85473) < 0.001);
+
+    assert(std::fabs(pca_mat2(1, 0) - 200.875) < 0.001);
+    assert(std::fabs(pca_mat2(1, 1) - -2.08604) < 0.001);
+    assert(std::fabs(pca_mat2(1, 2) - 2.68895) < 0.001);
+
+    assert(std::fabs(pca_mat2(491, 0) - 149.02) < 0.01);
+    assert(std::fabs(pca_mat2(491, 1) - -1.34957) < 0.01);
+    assert(std::fabs(pca_mat2(491, 2) - 2.09026) < 0.01);
+
+    assert(std::fabs(pca_mat2(1348, 0) - 166.44) < 0.01);
+    assert(std::fabs(pca_mat2(1348, 1) - 0.0354559) < 0.01);
+    assert(std::fabs(pca_mat2(1348, 2) - 0.41972) < 0.01);
+
+    assert(std::fabs(pca_mat2(2677, 0) - 333.405) < 0.001);
+    assert(std::fabs(pca_mat2(2677, 1) - -1.33686) < 0.001);
+    assert(std::fabs(pca_mat2(2677, 2) - 2.13684) < 0.001);
+
+    assert(std::fabs(pca_mat2(5029, 0) - 216.175) < 0.001);
+    assert(std::fabs(pca_mat2(5029, 1) - -1.18141) < 0.001);
+    assert(std::fabs(pca_mat2(5029, 2) - 2.18029) < 0.001);
+
+    assert(std::fabs(pca_mat2(5030, 0) - 219.555) < 0.001);
+    assert(std::fabs(pca_mat2(5030, 1) - -2.66858) < 0.001);
+    assert(std::fabs(pca_mat2(5030, 2) - 2.85412) < 0.001);
+}
+
+ +
C++ DataFrame + + + + + diff --git a/include/DataFrame/DataFrame.h b/include/DataFrame/DataFrame.h index 685f18f9..b39117f9 100644 --- a/include/DataFrame/DataFrame.h +++ b/include/DataFrame/DataFrame.h @@ -3739,7 +3739,7 @@ class DataFrame : public ThreadGranularity { // Name of the column // template> - size_type + [[nodiscard]] size_type inversion_count(const char *col_name) const; // This calculates and returns the variance/covariance matrix of the @@ -3748,44 +3748,43 @@ class DataFrame : public ThreadGranularity { // T: // Type of the named columns // col_names: - // Vector of column names + // Vector of column names // norm_type: // The method to normalize the columns first before calculations. // Default is not normalizing // template<typename T> - Matrix<T, matrix_orient::column_major> + [[nodiscard]] Matrix<T, matrix_orient::column_major> covariance_matrix( std::vector<const char *> &&col_names, normalization_type norm_type = normalization_type::none) const; - - - - - - - - - - - // Principal Component Analysis (PCA) + // This uses eigenspace evaluation to calculate Principal Component + // Analysis (PCA). + // It returns a matrix whose columns are the reduced dimensions with the + // most significant information. + // PCA is a dimensionality reduction method that is often applied to + // large data sets, transforming a large set of variables into a smaller + // one that still contains most of the information in the original set. + // Reducing the number of variables of a data set naturally comes at the + // expense of accuracy, but the trick in dimensionality reduction is to + // trade a little accuracy for simplicity. Smaller data sets are easier + // to explore and visualize, and they make analyzing data points much + // easier and faster for machine learning algorithms, since there are no + // extraneous variables to process. + // + // T: + // Type of the named columns + // col_names: + // Vector of column names + // params: + // Parameters necessary for this operation // template<typename T> - EigenSpace<T> - prin_comp_analysis(std::vector<const char *> &&col_names, - const PCAParams params = { }) const; - - - - - - - - - - + [[nodiscard]] Matrix<T, matrix_orient::column_major> + pca_by_eigen(std::vector<const char *> &&col_names, + const PCAParams params = { }) const; // This function returns a DataFrame indexed by std::string that provides // a few statistics about the columns of the calling DataFrame. diff --git a/include/DataFrame/DataFrameTypes.h b/include/DataFrame/DataFrameTypes.h index 8c4b9b93..5ec0d834 100644 --- a/include/DataFrame/DataFrameTypes.h +++ b/include/DataFrame/DataFrameTypes.h @@ -717,24 +717,18 @@ struct StationaryTestParams { // ---------------------------------------------------------------------------- -enum class pca_method : unsigned char { - - eigen = 1, // Eigen decomposition of the covariance matrix - svd = 2, // Singular Value Decomposition of the data matrix -}; - struct PCAParams { - pca_method method { pca_method::eigen }; normalization_type norm_type { normalization_type::z_score }; - // if populated, number of eigen components kept. + // If populated (set above zero), number of top eigenvalues to keep. + // - std::size_t num_comp_kept { 0 }; + long num_comp_to_keep { 0 }; - // if populated, percentage of eigen components kept -- 0.9 means 90%. + // If num_comp_to_keep is 0, the percentage of eigenvalues to keep. + // 0.9 means 90%. 
// - double pct_comp_kept { 0.9 }; + double pct_comp_to_keep { 0.9 }; }; // ---------------------------------------------------------------------------- diff --git a/include/DataFrame/Internals/DataFrame_get.tcc b/include/DataFrame/Internals/DataFrame_get.tcc index 040be20e..dd2764a2 100644 --- a/include/DataFrame/Internals/DataFrame_get.tcc +++ b/include/DataFrame/Internals/DataFrame_get.tcc @@ -979,6 +979,105 @@ covariance_matrix(std::vector<const char *> &&col_names, return (data_mat.covariance()); } +// ---------------------------------------------------------------------------- + +template<typename I, typename H> +template<typename T> +Matrix<T, matrix_orient::column_major> DataFrame<I, H>:: +pca_by_eigen(std::vector<const char *> &&col_names, + const PCAParams params) const { + +#ifdef HMDF_SANITY_EXCEPTIONS + if (params.num_comp_to_keep == 0 && params.pct_comp_to_keep < 0.01) + throw NotFeasible("pca_by_eigen(): Parameters don't make sense"); + if (params.num_comp_to_keep > long(col_names.size())) + throw NotFeasible("pca_by_eigen(): num_comp_to_keep > #input columns"); +#endif // HMDF_SANITY_EXCEPTIONS + + // Get the covariance matrix of normalized data + // + const auto var_cov = + covariance_matrix<T>( + std::forward<std::vector<const char *>>(col_names), + params.norm_type); + + // Calculate Eigen space + // + Matrix<T, matrix_orient::column_major> eigenvals; + Matrix<T, matrix_orient::column_major> eigenvecs; + + var_cov.eigen_space(eigenvals, eigenvecs, true); + + // Keep the most significant columns + // + Matrix<T, matrix_orient::column_major> mod_evecs { }; + long col_count { 0 }; + + if (params.num_comp_to_keep > 0) { + col_count = params.num_comp_to_keep; + } + else { + T ev_sum { 0 }; + + for (long c = 0; c < eigenvals.cols(); ++c) + ev_sum += std::fabs(eigenvals(0, c)); + + T kept_sum { 0 }; + + for (long c = eigenvals.cols() - 1; c >= 0; --c) { + kept_sum += std::fabs(eigenvals(0, c)); + col_count += 1; + if ((kept_sum / ev_sum) >= params.pct_comp_to_keep) + break; + } + } + mod_evecs.resize(eigenvecs.rows(), col_count); + for (long c = 0; c < col_count; ++c) { + const long col = eigenvecs.cols() - c - 1; + + for (long r = 0; r < eigenvecs.rows(); ++r) + mod_evecs(r, c) = eigenvecs(r, col); + } + + // Copy the data matrix + // + const size_type col_num = col_names.size(); + size_type min_col_s { indices_.size() }; + std::vector<const ColumnVecType<T> *> columns(col_num, nullptr); + SpinGuard guard { lock_ }; + + for (size_type i { 0 }; i < col_num; ++i) { + columns[i] = &get_column<T>(col_names[i], false); + if (columns[i]->size() < min_col_s) + min_col_s = columns[i]->size(); + } + guard.release(); + + Matrix<T, matrix_orient::column_major> data_mat { + long(min_col_s), long(col_num) }; + auto lbd = + [&data_mat, &columns = std::as_const(columns)] + (auto begin, auto end) -> void { + for (auto i { begin }; i < end; ++i) + data_mat.set_column(columns[i]->begin(), long(i)); + }; + const auto thread_level = + (min_col_s >= ThreadPool::MUL_THR_THHOLD || col_num >= 20) + ? 
get_thread_level() : 0L; + + if (thread_level > 2) { + auto futures = + thr_pool_.parallel_loop(size_type(0), col_num, std::move(lbd)); + + for (auto &fut : futures) fut.get(); + } + else lbd(size_type(0), col_num); + + // Return PCA + // + return (data_mat * mod_evecs); +} + } // namespace hmdf // ---------------------------------------------------------------------------- diff --git a/include/DataFrame/Utils/Matrix.h b/include/DataFrame/Utils/Matrix.h index 78645b29..66b5497f 100644 --- a/include/DataFrame/Utils/Matrix.h +++ b/include/DataFrame/Utils/Matrix.h @@ -1487,15 +1487,6 @@ operator * (const Matrix &lhs, const Matrix &rhs) { return (result); } -// ---------------------------------------------------------------------------- - -template<typename T> -struct EigenSpace { - - Matrix eigen_vals { }; - Matrix eigen_vecs { }; -}; - } // namespace hmdf // ---------------------------------------------------------------------------- diff --git a/include/DataFrame/Utils/Matrix.tcc b/include/DataFrame/Utils/Matrix.tcc index 1d31ee90..da552261 100644 --- a/include/DataFrame/Utils/Matrix.tcc +++ b/include/DataFrame/Utils/Matrix.tcc @@ -1190,11 +1190,13 @@ eigen_space(MA1 &eigenvalues, MA2 &eigenvectors, bool sort_values) const { if (sort_values) { for (size_type c = 0; c < cols() - 1; ++c) { size_type min_col { c }; + value_type abs_min_val { std::fabs(tmp_evals(0, c)) }; value_type min_val { tmp_evals(0, c) }; for (size_type cc = c + 1; cc < cols(); ++cc) - if (tmp_evals(0, cc) < min_val) { + if (std::fabs(tmp_evals(0, cc)) < abs_min_val) { min_col = cc; + abs_min_val = std::fabs(tmp_evals(0, cc)); min_val = tmp_evals(0, cc); } diff --git a/test/dataframe_tester_4.cc b/test/dataframe_tester_4.cc index 8bb03058..b0f4640c 100644 --- a/test/dataframe_tester_4.cc +++ b/test/dataframe_tester_4.cc @@ -2269,6 +2269,77 @@ static void test_covariance_matrix() { // ---------------------------------------------------------------------------- +static void test_pca_by_eigen() { + + std::cout << "\nTesting pca_by_eigen( ) ..." << std::endl; + + StrDataFrame df; + + try { + df.read("IBM.csv", io_format::csv2); + } + catch (const DataFrameError &ex) { + std::cout << ex.what() << std::endl; + } + + const auto pca_mat = df.pca_by_eigen<double>( + { "IBM_Close", "IBM_Open", "IBM_High", "IBM_Low" }); + + // Dimensions were reduced to 1 containing at least 90% of the information. + // This makes sense, since these 4 columns are highly correlated. + // + assert(pca_mat.cols() == 1); + assert(pca_mat.rows() == 5031); + assert(std::fabs(pca_mat(0, 0) - 197.063) < 0.001); + assert(std::fabs(pca_mat(1, 0) - 200.875) < 0.001); + assert(std::fabs(pca_mat(491, 0) - 149.02) < 0.01); + assert(std::fabs(pca_mat(1348, 0) - 166.44) < 0.01); + assert(std::fabs(pca_mat(2677, 0) - 333.405) < 0.001); + assert(std::fabs(pca_mat(5029, 0) - 216.175) < 0.001); + assert(std::fabs(pca_mat(5030, 0) - 219.555) < 0.001); + + const auto pca_mat2 = df.pca_by_eigen<double>( + { "IBM_Close", "IBM_Open", "IBM_High", "IBM_Low" }, + { .num_comp_to_keep = 3 }); + + // 3 most significant dimensions are kept. + // As you can see the first column is unchanged and clearly contains + // almost all of the information. 
+ // + assert(pca_mat2.cols() == 3); + assert(pca_mat2.rows() == 5031); + + assert(std::fabs(pca_mat2(0, 0) - 197.063) < 0.001); + assert(std::fabs(pca_mat2(0, 1) - -0.0951913) < 0.001); + assert(std::fabs(pca_mat2(0, 2) - 1.85473) < 0.001); + + assert(std::fabs(pca_mat2(1, 0) - 200.875) < 0.001); + assert(std::fabs(pca_mat2(1, 1) - -2.08604) < 0.001); + assert(std::fabs(pca_mat2(1, 2) - 2.68895) < 0.001); + + assert(std::fabs(pca_mat2(491, 0) - 149.02) < 0.01); + assert(std::fabs(pca_mat2(491, 1) - -1.34957) < 0.01); + assert(std::fabs(pca_mat2(491, 2) - 2.09026) < 0.01); + + assert(std::fabs(pca_mat2(1348, 0) - 166.44) < 0.01); + assert(std::fabs(pca_mat2(1348, 1) - 0.0354559) < 0.01); + assert(std::fabs(pca_mat2(1348, 2) - 0.41972) < 0.01); + + assert(std::fabs(pca_mat2(2677, 0) - 333.405) < 0.001); + assert(std::fabs(pca_mat2(2677, 1) - -1.33686) < 0.001); + assert(std::fabs(pca_mat2(2677, 2) - 2.13684) < 0.001); + + assert(std::fabs(pca_mat2(5029, 0) - 216.175) < 0.001); + assert(std::fabs(pca_mat2(5029, 1) - -1.18141) < 0.001); + assert(std::fabs(pca_mat2(5029, 2) - 2.18029) < 0.001); + + assert(std::fabs(pca_mat2(5030, 0) - 219.555) < 0.001); + assert(std::fabs(pca_mat2(5030, 1) - -2.66858) < 0.001); + assert(std::fabs(pca_mat2(5030, 2) - 2.85412) < 0.001); +} + +// ---------------------------------------------------------------------------- + int main(int, char *[]) { MyDataFrame::set_optimum_thread_level(); @@ -2310,6 +2381,7 @@ int main(int, char *[]) { test_make_stationary(); test_StationaryCheckVisitor(); test_covariance_matrix(); + test_pca_by_eigen(); return (0); } diff --git a/test/matrix_tester.cc b/test/matrix_tester.cc index 3255b544..bb1609c9 100644 --- a/test/matrix_tester.cc +++ b/test/matrix_tester.cc @@ -261,17 +261,18 @@ int main(int, char *[]) { assert(eigenvals.cols() == 10); assert(eigenvals.rows() == 1); - assert((std::fabs(eigenvals(0, 0) - -124.177) < 0.001)); - assert((std::fabs(eigenvals(0, 1) - -24.5492) < 0.0001)); - assert((std::fabs(eigenvals(0, 5) - -3.4454) < 0.0001)); + + assert((std::fabs(eigenvals(0, 0) - -2.30681) < 0.001)); + assert((std::fabs(eigenvals(0, 1) - -2.48865) < 0.0001)); + assert((std::fabs(eigenvals(0, 5) - -6.56515) < 0.0001)); assert((std::fabs(eigenvals(0, 9) - 687.09) < 0.01)); assert(eigenvecs.cols() == 10); assert(eigenvecs.rows() == 10); - assert((std::fabs(eigenvecs(0, 0) - -0.477637) < 0.000001)); + assert((std::fabs(eigenvecs(0, 0) - 0.0698916) < 0.000001)); assert((std::fabs(eigenvecs(2, 4) - -0.320417) < 0.000001)); - assert((std::fabs(eigenvecs(5, 6) - 0.396209) < 0.000001)); - assert((std::fabs(eigenvecs(8, 2) - -0.027937) < 0.000001)); + assert((std::fabs(eigenvecs(5, 6) - 0.181206) < 0.000001)); + assert((std::fabs(eigenvecs(8, 2) - 0.44074) < 0.00001)); assert((std::fabs(eigenvecs(9, 9) - 0.432927) < 0.000001)); // non-symmetric matrix @@ -285,16 +286,16 @@ int main(int, char *[]) { assert(eigenvals.cols() == 10); assert(eigenvals.rows() == 1); - assert((std::fabs(eigenvals(0, 0) - -15.8398) < 0.0001)); - assert(eigenvals(0, 1) > -0.00000000001); // -8.34036e-15 - assert(eigenvals(0, 5) < 0.00000000001); // 4.83968e-15 + assert(eigenvals(0, 0) > -0.00000000001); // -2.87473e-15 + assert(eigenvals(0, 1) > -0.00000000001); // -2.87473e-15 + assert(eigenvals(0, 5) < 0.00000000001); // 6.12134e-15 assert((std::fabs(eigenvals(0, 9) - 520.84) < 0.01)); assert(eigenvecs.cols() == 10); assert(eigenvecs.rows() == 10); - assert((std::fabs(eigenvecs(0, 0) - 0.568403) < 0.000001)); - assert((std::fabs(eigenvecs(2, 4) - 0.088418) 
< 0.000001)); - assert((std::fabs(eigenvecs(5, 6) - 0.199127) < 0.000001)); + assert((std::fabs(eigenvecs(0, 0) - -0.0833988) < 0.000001)); + assert((std::fabs(eigenvecs(2, 4) - 0.32935) < 0.00001)); + assert((std::fabs(eigenvecs(5, 6) - -0.410279) < 0.000001)); assert((std::fabs(eigenvecs(8, 2) - 9.34286) < 0.00001)); assert((std::fabs(eigenvecs(9, 9) - -0.51616) < 0.00001)); }
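As a footnote to the Matrix.tcc change above: eigen_space() now orders eigenpairs by absolute eigenvalue rather than signed value, so the most significant components always land in the last columns regardless of sign (which is what pca_by_eigen() relies on when it walks the columns backwards). Below is a standalone sketch of that ordering on plain vectors; the names and values are illustrative, not the library's API.

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <iostream>
#include <numeric>
#include <vector>

int main()  {

    const std::vector<double>   eigenvals { -124.177, 3.2, -0.5, 687.09 };
    std::vector<std::size_t>    order(eigenvals.size());

    std::iota(order.begin(), order.end(), 0);

    // Ascending by |eigenvalue|: a signed comparison would wrongly rank
    // -124.177 as the least significant component.
    std::sort(order.begin(), order.end(),
              [&eigenvals](std::size_t lhs, std::size_t rhs) -> bool  {
                  return (std::fabs(eigenvals[lhs]) <
                          std::fabs(eigenvals[rhs]));
              });

    for (const auto idx : order)  // prints: -0.5 3.2 -124.177 687.09
        std::cout << eigenvals[idx] << ' ';
    std::cout << std::endl;
    return (0);
}

In the library the same permutation is applied to the eigenvector columns, keeping eigenvalues and eigenvectors aligned.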