Implemented pca_by_eigen()
hosseinmoein committed Dec 16, 2024
1 parent e574261 commit c2a5916
Showing 11 changed files with 407 additions and 63 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -37,7 +37,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
## <a href="https://htmlpreview.github.io/?https://github.com/hosseinmoein/DataFrame/blob/master/docs/HTML/DataFrame.html" target="_blank"><B>DataFrame documentation with code samples</B></a>
This is a C++ analytical library for data analysis, similar to libraries in Python and R such as [Pandas](https://pandas.pydata.org) or [R data.frame](https://www.w3schools.com/r/r_data_frames.asp)<BR>
You can slice the data in many different ways. You can join, merge, group-by the data. You can run various statistical, summarization, financial, and ML algorithms on the data. You can add your custom algorithms easily. You can multi-column sort, custom pick and delete the data. And more …<BR>
DataFrame also includes a large collection of analytical algorithms in form of visitors. These are from basic stats such as <I>Mean</I>, <I>Std Deviation</I>, <I>Return</I>, … to more involved analysis such as <I>Affinity Propagation</I>, <I>Polynomial Fit</I>, <I>Fast Fourier transform of arbitrary length</I> … including a good collection of trading indicators. You can also easily add your own algorithms.<BR>
DataFrame also includes a large collection of analytical algorithms in form of visitors. These are from basic stats such as <I>Mean</I>, <I>Std Deviation</I>, <I>Return</I>, … to more involved analysis such as <I>PCA</I>, <I>Polynomial Fit</I>, <I>Fast Fourier transform of arbitrary length</I> … including a good collection of trading indicators. You can also easily add your own algorithms.<BR>
DataFrame also employs extensive multithreading in almost all of its APIs, which makes it especially suitable for analyzing large datasets.<BR>
For basic operations to start you off, see [Hello World](examples/hello_world.cc). For a complete list of features with code samples, see <a href="https://htmlpreview.github.io/?https://github.com/hosseinmoein/DataFrame/blob/master/docs/HTML/DataFrame.html" target="_blank">documentation</a>.

4 changes: 4 additions & 0 deletions docs/HTML/DataFrame.html
@@ -325,6 +325,10 @@ <H2 ID="2"><font color="blue">API Reference with code samples <font size="+4">&#
<td title="True, if matches a statistical pattern"><a href="https://htmlpreview.github.io/?https://github.com/hosseinmoein/DataFrame/blob/master/docs/HTML/pattern_spec.html">pattern_match</a>()</td>
</tr>

<tr class="item" onmouseover="this.style.backgroundColor='#ffff66';" onmouseout="this.style.backgroundColor='#d4e3e5';">
<td title="Calculates Principal Component Analysis (PCA)."><a href="https://htmlpreview.github.io/?https://github.com/hosseinmoein/DataFrame/blob/master/docs/HTML/pca_by_eigen.html">pca_by_eigen</a>()</td>
</tr>

<tr class="item" onmouseover="this.style.backgroundColor='#ffff66';" onmouseout="this.style.backgroundColor='#d4e3e5';">
<td title="Returns a mask vector of peaks"><a href="https://htmlpreview.github.io/?https://github.com/hosseinmoein/DataFrame/blob/master/docs/HTML/peaks.html">peaks</a>()</td>
</tr>
2 changes: 1 addition & 1 deletion docs/HTML/covariance_matrix.html
@@ -49,7 +49,7 @@
<PRE><B>
template&lt;typename T&gt;
<a href="https://htmlpreview.github.io/?https://github.com/hosseinmoein/DataFrame/blob/master/docs/HTML/Matrix.html">Matrix</a>&lt;T, <a href="https://htmlpreview.github.io/?https://github.com/hosseinmoein/DataFrame/blob/master/docs/HTML/Matrix.html">matrix_orient</a>::column_major&gt;
covariance_matrix(std::vector<const char *> &&col_names,
covariance_matrix(std::vector<const char *> &amp;&amp;col_names,
<a href="https://htmlpreview.github.io/?https://github.com/hosseinmoein/DataFrame/blob/master/docs/HTML/NormalizeVisitor.html">normalization_type</a> norm_type =
normalization_type::none) const;
</B></PRE></font>
182 changes: 182 additions & 0 deletions docs/HTML/pca_by_eigen.html

Large diffs are not rendered by default.

55 changes: 27 additions & 28 deletions include/DataFrame/DataFrame.h
@@ -3739,7 +3739,7 @@ class DataFrame : public ThreadGranularity {
// Name of the column
//
template<typename T, typename C = std::less<T>>
size_type
[[nodiscard]] size_type
inversion_count(const char *col_name) const;

// This calculates and returns the variance/covariance matrix of the
@@ -3748,44 +3748,43 @@ class DataFrame : public ThreadGranularity {
// T:
// Type of the named columns
// col_names:
// Vector of column names
// Vector of column names
// norm_type:
// The method to normalize the columns first before calculations.
// Default is not normalizing
//
template<typename T>
Matrix<T, matrix_orient::column_major>
[[nodiscard]] Matrix<T, matrix_orient::column_major>
covariance_matrix(
std::vector<const char *> &&col_names,
normalization_type norm_type = normalization_type::none) const;











// Principal Component Analysis (PCA)
// This uses Eigenspace evaluation to calculate Principal Component
// Analysis (PCA).
// It returns a matrix whose columns are the reduced dimensions with most
// significant information.
// PCA is a dimensionality reduction method that is often used to reduce
// the dimensionality of large data sets, by transforming a large set of
// variables into a smaller one that still contains most of the information
// in the large set.
// Reducing the number of variables of a data set naturally comes at the
// expense of accuracy, but the trick in dimensionality reduction is to
// trade a little accuracy for simplicity, because smaller data sets are
// easier to explore and visualize, and make analyzing data points much
// easier and faster for machine learning algorithms without extraneous
// variables to process.
//
// T:
// Type of the named columns
// col_names:
// Vector of column names
// params:
// Parameters necessary for this operation
//
template<typename T>
EigenSpace<T>
prin_comp_analysis(std::vector<const char *> &&col_names,
const PCAParams params = { }) const;











[[nodiscard]] Matrix<T, matrix_orient::column_major>
pca_by_eigen(std::vector<const char *> &&col_names,
const PCAParams params = { }) const;

// This function returns a DataFrame indexed by std::string that provides
// a few statistics about the columns of the calling DataFrame.
16 changes: 5 additions & 11 deletions include/DataFrame/DataFrameTypes.h
@@ -717,24 +717,18 @@ struct StationaryTestParams {

// ----------------------------------------------------------------------------

enum class pca_method : unsigned char {

eigen = 1, // Eigen decomposition of the covariance matrix
svd = 2, // Singular Value Decomposition of the data matrix
};

struct PCAParams {

pca_method method { pca_method::eigen };
normalization_type norm_type { normalization_type::z_score };

// if populated, number of eigen components kept.
// If populated (set above zero), number of top eigen values to keep.
//
std::size_t num_comp_kept { 0 };
long num_comp_to_keep { 0 };

// if populated, percentage of eigen components kept -- 0.9 means 90%.
// If populated (num_comp_to_keep is 0), percentage of eigen values to keep.
// 0.9 means 90%.
//
double pct_comp_kept { 0.9 };
double pct_comp_to_keep { 0.9 };
};

// ----------------------------------------------------------------------------
99 changes: 99 additions & 0 deletions include/DataFrame/Internals/DataFrame_get.tcc
@@ -979,6 +979,105 @@ covariance_matrix(std::vector<const char *> &&col_names,
return (data_mat.covariance());
}

// ----------------------------------------------------------------------------

template<typename I, typename H>
template<typename T>
Matrix<T, matrix_orient::column_major> DataFrame<I, H>::
pca_by_eigen(std::vector<const char *> &&col_names,
const PCAParams params) const {

#ifdef HMDF_SANITY_EXCEPTIONS
if (params.num_comp_to_keep == 0 && params.pct_comp_to_keep < 0.01)
throw NotFeasible("pca_by_eigen(): Parameters don't make sense");
if (params.num_comp_to_keep > long(col_names.size()))
throw NotFeasible("pca_by_eigen(): num_comp_to_keep > #input columns");
#endif // HMDF_SANITY_EXCEPTIONS

// Get the covariance matrix of normalized data
//
const auto var_cov =
covariance_matrix<T>(
std::forward<std::vector<const char *>>(col_names),
params.norm_type);

// Calculate Eigen space
//
Matrix<T, matrix_orient::row_major> eigenvals;
Matrix<T, matrix_orient::column_major> eigenvecs;

var_cov.eigen_space(eigenvals, eigenvecs, true);

// Keep the most significant columns
//
Matrix<T, matrix_orient::column_major> mod_evecs { };
long col_count { 0 };

if (params.num_comp_to_keep > 0) {
col_count = params.num_comp_to_keep;
}
else {
T ev_sum { 0 };

for (long c = 0; c < eigenvals.cols(); ++c)
ev_sum += std::fabs(eigenvals(0, c));

T kept_sum { 0 };

for (long c = eigenvals.cols() - 1; c >= 0; --c) {
kept_sum += std::fabs(eigenvals(0, c));
col_count += 1;
if ((kept_sum / ev_sum) >= params.pct_comp_to_keep)
break;
}
}
mod_evecs.resize(eigenvecs.rows(), col_count);
for (long c = 0; c < col_count; ++c) {
const long col = eigenvecs.cols() - c - 1;

for (long r = 0; r < eigenvecs.rows(); ++r)
mod_evecs(r, c) = eigenvecs(r, col);
}

// Copy the data matrix
//
const size_type col_num = col_names.size();
size_type min_col_s { indices_.size() };
std::vector<const ColumnVecType<T> *> columns(col_num, nullptr);
SpinGuard guard { lock_ };

for (size_type i { 0 }; i < col_num; ++i) {
columns[i] = &get_column<T>(col_names[i], false);
if (columns[i]->size() < min_col_s)
min_col_s = columns[i]->size();
}
guard.release();

Matrix<T, matrix_orient::column_major> data_mat {
long(min_col_s), long(col_num) };
auto lbd =
[&data_mat, &columns = std::as_const(columns)]
(auto begin, auto end) -> void {
for (auto i { begin }; i < end; ++i)
data_mat.set_column(columns[i]->begin(), long(i));
};
const auto thread_level =
(min_col_s >= ThreadPool::MUL_THR_THHOLD || col_num >= 20)
? get_thread_level() : 0L;

if (thread_level > 2) {
auto futures =
thr_pool_.parallel_loop(size_type(0), col_num, std::move(lbd));

for (auto &fut : futures) fut.get();
}
else lbd(size_type(0), col_num);

// Return PCA
//
return (data_mat * mod_evecs);
}

} // namespace hmdf

// ----------------------------------------------------------------------------
9 changes: 0 additions & 9 deletions include/DataFrame/Utils/Matrix.h
@@ -1487,15 +1487,6 @@ operator * (const Matrix<T, MO1> &lhs, const Matrix<T, MO2> &rhs) {
return (result);
}

// ----------------------------------------------------------------------------

template<typename T>
struct EigenSpace {

Matrix<T, matrix_orient::row_major> eigen_vals { };
Matrix<T, matrix_orient::column_major> eigen_vecs { };
};

} // namespace hmdf

// ----------------------------------------------------------------------------
4 changes: 3 additions & 1 deletion include/DataFrame/Utils/Matrix.tcc
@@ -1190,11 +1190,13 @@ eigen_space(MA1 &eigenvalues, MA2 &eigenvectors, bool sort_values) const {
if (sort_values) {
for (size_type c = 0; c < cols() - 1; ++c) {
size_type min_col { c };
value_type abs_min_val { std::fabs(tmp_evals(0, c)) };
value_type min_val { tmp_evals(0, c) };

for (size_type cc = c + 1; cc < cols(); ++cc)
if (tmp_evals(0, cc) < min_val) {
if (std::fabs(tmp_evals(0, cc)) < abs_min_val) {
min_col = cc;
abs_min_val = std::fabs(tmp_evals(0, cc));
min_val = tmp_evals(0, cc);
}

72 changes: 72 additions & 0 deletions test/dataframe_tester_4.cc
@@ -2269,6 +2269,77 @@ static void test_covariance_matrix() {

// ----------------------------------------------------------------------------

static void test_pca_by_eigen() {

std::cout << "\nTesting pca_by_eigen( ) ..." << std::endl;

StrDataFrame df;

try {
df.read("IBM.csv", io_format::csv2);
}
catch (const DataFrameError &ex) {
std::cout << ex.what() << std::endl;
}

const auto pca_mat = df.pca_by_eigen<double>(
{ "IBM_Close", "IBM_Open", "IBM_High", "IBM_Low" });

// Dimensions were reduced to 1, containing at least 90% of the information.
// This makes sense, since these 4 columns are highly correlated.
//
assert(pca_mat.cols() == 1);
assert(pca_mat.rows() == 5031);
assert(std::fabs(pca_mat(0, 0) - 197.063) < 0.001);
assert(std::fabs(pca_mat(1, 0) - 200.875) < 0.001);
assert(std::fabs(pca_mat(491, 0) - 149.02) < 0.01);
assert(std::fabs(pca_mat(1348, 0) - 166.44) < 0.01);
assert(std::fabs(pca_mat(2677, 0) - 333.405) < 0.001);
assert(std::fabs(pca_mat(5029, 0) - 216.175) < 0.001);
assert(std::fabs(pca_mat(5030, 0) - 219.555) < 0.001);

const auto pca_mat2 = df.pca_by_eigen<double>(
{ "IBM_Close", "IBM_Open", "IBM_High", "IBM_Low" },
{ .num_comp_to_keep = 3 });

// The 3 most significant dimensions are kept.
// As you can see, the first column is unchanged and clearly contains
// almost all of the information.
//
assert(pca_mat2.cols() == 3);
assert(pca_mat2.rows() == 5031);

assert(std::fabs(pca_mat2(0, 0) - 197.063) < 0.001);
assert(std::fabs(pca_mat2(0, 1) - -0.0951913) < 0.001);
assert(std::fabs(pca_mat2(0, 2) - 1.85473) < 0.001);

assert(std::fabs(pca_mat2(1, 0) - 200.875) < 0.001);
assert(std::fabs(pca_mat2(1, 1) - -2.08604) < 0.001);
assert(std::fabs(pca_mat2(1, 2) - 2.68895) < 0.001);

assert(std::fabs(pca_mat2(491, 0) - 149.02) < 0.01);
assert(std::fabs(pca_mat2(491, 1) - -1.34957) < 0.01);
assert(std::fabs(pca_mat2(491, 2) - 2.09026) < 0.01);

assert(std::fabs(pca_mat2(1348, 0) - 166.44) < 0.01);
assert(std::fabs(pca_mat2(1348, 1) - 0.0354559) < 0.01);
assert(std::fabs(pca_mat2(1348, 2) - 0.41972) < 0.01);

assert(std::fabs(pca_mat2(2677, 0) - 333.405) < 0.001);
assert(std::fabs(pca_mat2(2677, 1) - -1.33686) < 0.001);
assert(std::fabs(pca_mat2(2677, 2) - 2.13684) < 0.001);

assert(std::fabs(pca_mat2(5029, 0) - 216.175) < 0.001);
assert(std::fabs(pca_mat2(5029, 1) - -1.18141) < 0.001);
assert(std::fabs(pca_mat2(5029, 2) - 2.18029) < 0.001);

assert(std::fabs(pca_mat2(5030, 0) - 219.555) < 0.001);
assert(std::fabs(pca_mat2(5030, 1) - -2.66858) < 0.001);
assert(std::fabs(pca_mat2(5030, 2) - 2.85412) < 0.001);
}

// ----------------------------------------------------------------------------

int main(int, char *[]) {

MyDataFrame::set_optimum_thread_level();
@@ -2310,6 +2381,7 @@ int main(int, char *[]) {
test_make_stationary();
test_StationaryCheckVisitor();
test_covariance_matrix();
test_pca_by_eigen();

return (0);
}