From a120a5123d760b5dc3c6aaaedb4d58d2e8c20b28 Mon Sep 17 00:00:00 2001 From: Hossein Moein Date: Fri, 13 Dec 2024 10:29:54 -0500 Subject: [PATCH] Implemented covariance_matrix() --- docs/HTML/DataFrame.html | 4 + docs/HTML/Matrix.html | 4 +- docs/HTML/NormalizeVisitor.html | 2 + docs/HTML/covariance_matrix.html | 119 ++++++++++++++++++ include/DataFrame/DataFrame.h | 21 +++- include/DataFrame/DataFrameTypes.h | 1 + include/DataFrame/Internals/DataFrame_get.tcc | 51 +++++++- include/DataFrame/Utils/Matrix.tcc | 22 ++-- test/dataframe_tester_4.cc | 44 +++++++ 9 files changed, 253 insertions(+), 15 deletions(-) create mode 100644 docs/HTML/covariance_matrix.html diff --git a/docs/HTML/DataFrame.html b/docs/HTML/DataFrame.html index fc62ef46..9ac3a0e9 100644 --- a/docs/HTML/DataFrame.html +++ b/docs/HTML/DataFrame.html @@ -261,6 +261,10 @@

API Reference with code samples &# col_idx_to_name() + + covariance_matrix() + + describe() diff --git a/docs/HTML/Matrix.html b/docs/HTML/Matrix.html index c8b6a80b..71da2c9b 100644 --- a/docs/HTML/Matrix.html +++ b/docs/HTML/Matrix.html @@ -46,8 +46,8 @@

 enum class  matrix_orient : unsigned char  {
 
-    column_major = 1,
-    row_major = 2,
+    column_major = 1,  // Data is laid out column by column
+    row_major = 2,     // Data is laid out row by row
 };
 
 // -----------------------
diff --git a/docs/HTML/NormalizeVisitor.html b/docs/HTML/NormalizeVisitor.html
index b5678f58..13ca2bf0 100644
--- a/docs/HTML/NormalizeVisitor.html
+++ b/docs/HTML/NormalizeVisitor.html
@@ -93,6 +93,8 @@
        
         

 enum class  normalization_type : unsigned char  {
+
+    none = 0,
     simple = 1,           // V / ∑ xi
     euclidean = 2,        // V / sqrt(∑ xi^2)
     maxi = 3,             // V / MAX(xi)
diff --git a/docs/HTML/covariance_matrix.html b/docs/HTML/covariance_matrix.html new file mode 100644 index 00000000..0dd3b729 --- /dev/null +++ b/docs/HTML/covariance_matrix.html @@ -0,0 +1,119 @@ + + + + + + + + + + Back to Documentations

+ + + + + + + + + + + + + +
Signature Description Parameters
+

+template<typename T>
+Matrix<T, matrix_orient::column_major>
+covariance_matrix(std::vector<const char *> &&col_names,
+                  normalization_type norm_type =
+                      normalization_type::none) const;
+
+
+ This calculates and returns the variance/covariance matrix of the specified columns, optionally normalizing the columns first.
+
+ T: Type of the named columns
+ col_names: Vector of column names
+ norm_type: The method used to normalize the columns before the calculation. The default is no normalization
+
+ +
static void test_covariance_matrix()  {  // Sample/test: covariance matrix of four IBM price columns, raw and z-score normalized
+
+    std::cout << "\nTesting covariance_matrix( ) ..." << std::endl;
+
+    StrDataFrame    df;
+
+    try  {
+        df.read("IBM.csv", io_format::csv2);  // Load the sample data set from IBM.csv
+    }
+    catch (const DataFrameError &ex)  {
+        std::cout << ex.what() << std::endl;  // A failed read is only printed; execution continues to the checks below
+    }
+
+    const auto  cov_mat = df.covariance_matrix<double>({ "IBM_Close", "IBM_Open", "IBM_High", "IBM_Low" });
+
+    assert(cov_mat.rows() == 4);  // Four columns were requested, so a 4 x 4 result is expected
+    assert(cov_mat.cols() == 4);
+    assert(std::fabs(cov_mat(0, 0) - 1467.58) < 0.01);  // Spot checks of selected entries, tolerance 0.01
+    assert(std::fabs(cov_mat(0, 2) - 1469.69) < 0.01);
+    assert(std::fabs(cov_mat(2, 1) - 1469.48) < 0.01);
+    assert(std::fabs(cov_mat(2, 2) - 1472.86) < 0.01);
+    assert(std::fabs(cov_mat(3, 2) - 1466.15) < 0.01);
+    assert(std::fabs(cov_mat(3, 3) - 1461.0) < 0.01);
+
+    const auto  cov_mat2 = df.covariance_matrix<double>({ "IBM_Close", "IBM_Open", "IBM_High", "IBM_Low" },
+                                                        normalization_type::z_score);  // Same columns, this time with z_score normalization requested
+
+    assert(cov_mat2.rows() == 4);
+    assert(cov_mat2.cols() == 4);
+    assert(std::fabs(cov_mat2(0, 0) - 1.0) < 0.01);  // Checked diagonal entries are expected to be ~1 here
+    assert(std::fabs(cov_mat2(0, 2) - 0.99964) < 0.00001);  // Off-diagonal entries are checked with a tighter 0.00001 tolerance
+    assert(std::fabs(cov_mat2(2, 1) - 0.99963) < 0.00001);
+    assert(std::fabs(cov_mat2(2, 2) - 1.0) < 0.01);
+    assert(std::fabs(cov_mat2(3, 2) - 0.99948) < 0.00001);
+    assert(std::fabs(cov_mat2(3, 3) - 1.0) < 0.01);
+}
+
+ +
C++ DataFrame + + + + + diff --git a/include/DataFrame/DataFrame.h b/include/DataFrame/DataFrame.h index 7bd65f84..b5c0619b 100644 --- a/include/DataFrame/DataFrame.h +++ b/include/DataFrame/DataFrame.h @@ -35,6 +35,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include #include @@ -3738,7 +3739,25 @@ class DataFrame : public ThreadGranularity { // Name of the column // template> - size_type inversion_count(const char *col_name) const; + size_type + inversion_count(const char *col_name) const; + + // This calculates and returns the variance/covariance matrix of the + // specified columns, optionally normalizing the columns first. + // + // T: + // Type of the named columns + // col_names: + // Vector of column names + // norm_type: + // The method to normalize the columns first before calculations. + // Default is not normalizing + // + template + Matrix + covariance_matrix( + std::vector &&col_names, + normalization_type norm_type = normalization_type::none) const; // This function returns a DataFrame indexed by std::string that provides // a few statistics about the columns of the calling DataFrame. 
diff --git a/include/DataFrame/DataFrameTypes.h b/include/DataFrame/DataFrameTypes.h index 918baab7..e284e6b8 100644 --- a/include/DataFrame/DataFrameTypes.h +++ b/include/DataFrame/DataFrameTypes.h @@ -530,6 +530,7 @@ enum class prob_dist_type : unsigned char { enum class normalization_type : unsigned char { + none = 0, simple = 1, // V / sum(xi) euclidean = 2, // V / sqrt(sum(xi^2)) maxi = 3, // V / max(xi) diff --git a/include/DataFrame/Internals/DataFrame_get.tcc b/include/DataFrame/Internals/DataFrame_get.tcc index 36bf1178..040be20e 100644 --- a/include/DataFrame/Internals/DataFrame_get.tcc +++ b/include/DataFrame/Internals/DataFrame_get.tcc @@ -177,7 +177,7 @@ DataFrame::get_column (const char *name, bool do_lock) const { template template const typename DataFrame::template ColumnVecType & -DataFrame::get_column () const { +DataFrame::get_column() const { return (const_cast(this)->get_column( T::name)); @@ -930,6 +930,55 @@ DataFrame::difference(const DataFrame &other) const { return (result); } +// ---------------------------------------------------------------------------- + +template +template +Matrix DataFrame:: +covariance_matrix(std::vector &&col_names, + normalization_type norm_type) const { + + const size_type col_num = col_names.size(); + +#ifdef HMDF_SANITY_EXCEPTIONS + if (col_num < 2) + throw NotFeasible("covariance_matrix(): " + "You must specify at least two columns"); +#endif // HMDF_SANITY_EXCEPTIONS + + size_type min_col_s { indices_.size() }; + std::vector *> columns(col_num, nullptr); + SpinGuard guard (lock_); + + for (size_type i { 0 }; i < col_num; ++i) { + columns[i] = &get_column(col_names[i], false); + if (columns[i]->size() < min_col_s) + min_col_s = columns[i]->size(); + } + guard.release(); + + Matrix data_mat { + long(min_col_s), long(col_num) }; + + if (norm_type > normalization_type::none) { + for (size_type i { 0 }; i < col_num; ++i) { + NormalizeVisitor norm_v { norm_type }; + + norm_v.pre(); + norm_v(indices_.begin(), 
indices_.end(), + columns[i]->begin(), columns[i]->end()); + norm_v.post(); + data_mat.set_column(norm_v.get_result().begin(), i); + } + } + else { + for (size_type i { 0 }; i < col_num; ++i) + data_mat.set_column(columns[i]->begin(), i); + } + + return (data_mat.covariance()); +} + } // namespace hmdf // ---------------------------------------------------------------------------- diff --git a/include/DataFrame/Utils/Matrix.tcc b/include/DataFrame/Utils/Matrix.tcc index c0385763..4b31667c 100644 --- a/include/DataFrame/Utils/Matrix.tcc +++ b/include/DataFrame/Utils/Matrix.tcc @@ -1189,22 +1189,22 @@ eigen_space(MA1 &eigenvalues, MA2 &eigenvectors, bool sort_values) const { if (sort_values) { for (size_type c = 0; c < cols() - 1; ++c) { - size_type tmp_c { c }; - value_type p { tmp_evals(0, c) }; + size_type min_col { c }; + value_type min_val { tmp_evals(0, c) }; for (size_type cc = c + 1; cc < cols(); ++cc) - if (tmp_evals(0, cc) < p) { - tmp_c = cc; - p = tmp_evals(0, cc); + if (tmp_evals(0, cc) < min_val) { + min_col = cc; + min_val = tmp_evals(0, cc); } - if (tmp_c != c) { - tmp_evals(0, tmp_c) = tmp_evals(0, c); - tmp_evals(0, c) = p; + if (min_col != c) { + tmp_evals(0, min_col) = tmp_evals(0, c); + tmp_evals(0, c) = min_val; for (size_type r = 0; r < rows(); ++r) { - p = tmp_evecs(r, c); - tmp_evecs(r, c) = tmp_evecs(r, tmp_c); - tmp_evecs(r, tmp_c) = p; + min_val = tmp_evecs(r, c); + tmp_evecs(r, c) = tmp_evecs(r, min_col); + tmp_evecs(r, min_col) = min_val; } } } diff --git a/test/dataframe_tester_4.cc b/test/dataframe_tester_4.cc index 2e98bdb4..8bb03058 100644 --- a/test/dataframe_tester_4.cc +++ b/test/dataframe_tester_4.cc @@ -2226,6 +2226,49 @@ static void test_StationaryCheckVisitor() { // ---------------------------------------------------------------------------- +static void test_covariance_matrix() { + + std::cout << "\nTesting covariance_matrix( ) ..." 
<< std::endl; + + StrDataFrame df; + + try { + df.read("IBM.csv", io_format::csv2); + } + catch (const DataFrameError &ex) { + std::cout << ex.what() << std::endl; + } + + const auto cov_mat = + df.covariance_matrix({ "IBM_Close", "IBM_Open", + "IBM_High", "IBM_Low" }); + + assert(cov_mat.rows() == 4); + assert(cov_mat.cols() == 4); + assert(std::fabs(cov_mat(0, 0) - 1467.58) < 0.01); + assert(std::fabs(cov_mat(0, 2) - 1469.69) < 0.01); + assert(std::fabs(cov_mat(2, 1) - 1469.48) < 0.01); + assert(std::fabs(cov_mat(2, 2) - 1472.86) < 0.01); + assert(std::fabs(cov_mat(3, 2) - 1466.15) < 0.01); + assert(std::fabs(cov_mat(3, 3) - 1461.0) < 0.01); + + const auto cov_mat2 = + df.covariance_matrix({ "IBM_Close", "IBM_Open", + "IBM_High", "IBM_Low" }, + normalization_type::z_score); + + assert(cov_mat2.rows() == 4); + assert(cov_mat2.cols() == 4); + assert(std::fabs(cov_mat2(0, 0) - 1.0) < 0.01); + assert(std::fabs(cov_mat2(0, 2) - 0.99964) < 0.00001); + assert(std::fabs(cov_mat2(2, 1) - 0.99963) < 0.00001); + assert(std::fabs(cov_mat2(2, 2) - 1.0) < 0.01); + assert(std::fabs(cov_mat2(3, 2) - 0.99948) < 0.00001); + assert(std::fabs(cov_mat2(3, 3) - 1.0) < 0.01); +} + +// ---------------------------------------------------------------------------- + int main(int, char *[]) { MyDataFrame::set_optimum_thread_level(); @@ -2266,6 +2309,7 @@ int main(int, char *[]) { test_PartialAutoCorrVisitor(); test_make_stationary(); test_StationaryCheckVisitor(); + test_covariance_matrix(); return (0); }