diff --git a/include/DataFrame/Utils/Matrix.h b/include/DataFrame/Utils/Matrix.h index 0835a6f3..bb805af2 100644 --- a/include/DataFrame/Utils/Matrix.h +++ b/include/DataFrame/Utils/Matrix.h @@ -118,6 +118,26 @@ class Matrix { // Matrix inverse() const; + // Variance/Covariance matrix. + // The columns of the matrix are assumed to be observations of some + // random variable. So the covariance matrix is a square matrix + // containing the covariances of the above random variables. + // + // If it is unbiased the estimate is divided by n - 1, otherwise by n, + // n being the number of rows. + // + // The reason for dividing by n - 1 is because we are dividing by + // degrees of freedom (i.e. number of independent observations). + // For example, if we have two observations, when calculating the mean + // we have two independent observations; however, when calculating the + // variance, we have only one independent observation, since the two + // observations are equally distant from the mean. + // + // For a nXm matrix, you will get a mXm covariance matrix + // + Matrix + covariance(bool is_unbiased = true) const; + // Let A be an nXn matrix. The number l is an eigenvalue of A if there // exists a non-zero vector v such that // Av = lv @@ -195,7 +215,7 @@ class Matrix { using storage_t = std::vector; - inline static constexpr value_type EPSILON_ { 2.220446e-16 }; + inline static constexpr value_type EPSILON_ { value_type(2.220446e-16) }; // Partial pivoting for Gaussian elimination: // diff --git a/include/DataFrame/Utils/Matrix.tcc b/include/DataFrame/Utils/Matrix.tcc index 0b71f283..49e48a93 100644 --- a/include/DataFrame/Utils/Matrix.tcc +++ b/include/DataFrame/Utils/Matrix.tcc @@ -1215,6 +1215,46 @@ eigen_space(MA1 &eigenvalues, MA2 &eigenvectors, bool sort_values) const { return; } + +// ---------------------------------------------------------------------------- + +template +Matrix Matrix:: +covariance(bool is_unbiased) const { + + const value_type denom = is_unbiased ? rows() - 1 : rows(); + + if (denom <= value_type(0)) + throw DataFrameError("Matrix::covariance(): Not solvable"); + + Matrix result (cols(), cols()); + row_iterator res_iter = result.row_begin (); + + // I don't know how I can take advantage of the fact this is a + // symmetric matrix by definition + // + for (size_type c = 0; c < cols(); ++c) { + value_type col_mean { 0 }; + size_type counter { 0 }; + + for (auto cciter = col_begin() + c * rows(); + counter != rows(); ++cciter, ++counter) + col_mean += *cciter; + + col_mean /= value_type(rows()); + + for (size_type cc = 0; cc < cols(); ++cc) { + value_type var_covar { 0 }; + + for (size_type r = 0; r < rows(); ++r) + var_covar += (at(r, c) - col_mean) * (at(r, cc) - col_mean); + + *res_iter++ = var_covar / denom; + } + } + + return (result); +} } // namespace hmdf