Skip to content

Commit

Permalink
Implemented covariance_matrix()
Browse files Browse the repository at this point in the history
  • Loading branch information
hosseinmoein committed Dec 13, 2024
1 parent d6fa594 commit a120a51
Show file tree
Hide file tree
Showing 9 changed files with 253 additions and 15 deletions.
4 changes: 4 additions & 0 deletions docs/HTML/DataFrame.html
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,10 @@ <H2 ID="2"><font color="blue">API Reference with code samples <font size="+4">&#
<td title="Get column name for the given column index"><a href="https://htmlpreview.github.io/?https://github.com/hosseinmoein/DataFrame/blob/master/docs/HTML/col_name_to_idx.html">col_idx_to_name</a>()</td>
</tr>

<tr class="item" onmouseover="this.style.backgroundColor='#ffff66';" onmouseout="this.style.backgroundColor='#d4e3e5';">
<td title="Calculates and returns the variance/covariance matrix"><a href="https://htmlpreview.github.io/?https://github.com/hosseinmoein/DataFrame/blob/master/docs/HTML/covariance_matrix.html">covariance_matrix</a>()</td>
</tr>

<tr class="item" onmouseover="this.style.backgroundColor='#ffff66';" onmouseout="this.style.backgroundColor='#d4e3e5';">
<td title="Returns a DataFrame describing the columns"><a href="https://htmlpreview.github.io/?https://github.com/hosseinmoein/DataFrame/blob/master/docs/HTML/describe.html">describe</a>()</td>
</tr>
Expand Down
4 changes: 2 additions & 2 deletions docs/HTML/Matrix.html
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,8 @@
<PRE><B>
enum class matrix_orient : unsigned char {

column_major = 1,
row_major = 2,
column_major = 1, // Data is laid out column by column
row_major = 2, // Data is laid out row by row
};

// -----------------------
Expand Down
2 changes: 2 additions & 0 deletions docs/HTML/NormalizeVisitor.html
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,8 @@
<td bgcolor="blue"> <font color="white">
<PRE><B>
enum class normalization_type : unsigned char {

none = 0,
simple = 1, // <div class="frac"> <span>V</span> <span class="symbol">/</span> <span class="bottom">&sum; x<sub>i</sub></span> </div>
euclidean = 2, // <div class="frac"> <span>V</span> <span class="symbol">/</span> <span class="bottom"><span>&radic;<span style="text-decoration:overline;">&sum; x<sub>i</sub><sup>2</sup></span></span> </div>
maxi = 3, // <div class="frac"> <span>V</span> <span class="symbol">/</span> <span class="bottom">MAX(x<sub>i</sub>)</span> </div>
Expand Down
119 changes: 119 additions & 0 deletions docs/HTML/covariance_matrix.html

Large diffs are not rendered by default.

21 changes: 20 additions & 1 deletion include/DataFrame/DataFrame.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <DataFrame/Utils/Concepts.h>
#include <DataFrame/Utils/DateTime.h>
#include <DataFrame/Utils/FixedSizeString.h>
#include <DataFrame/Utils/Matrix.h>
#include <DataFrame/Utils/Threads/ThreadGranularity.h>
#include <DataFrame/Utils/Utils.h>

Expand Down Expand Up @@ -3738,7 +3739,25 @@ class DataFrame : public ThreadGranularity {
// Name of the column
//
template<typename T, typename C = std::less<T>>
size_type inversion_count(const char *col_name) const;
size_type
inversion_count(const char *col_name) const;

// This calculates and returns the variance/covariance matrix of the
// specified columns, optionally normalizing the columns first.
// The returned matrix is square, with one row and one column per entry
// in col_names, in the order the names were given.
// If the named columns (or the index) differ in length, the working data
// matrix is sized to the shortest of them.
// If HMDF_SANITY_EXCEPTIONS is defined, NotFeasible is thrown when fewer
// than two column names are given.
//
// T:
// Type of the named columns
// col_names:
// Vector of column names
// norm_type:
// The method to normalize the columns first before calculations.
// Default is not normalizing
//
template<typename T>
Matrix<T, matrix_orient::column_major>
covariance_matrix(
std::vector<const char *> &&col_names,
normalization_type norm_type = normalization_type::none) const;

// This function returns a DataFrame indexed by std::string that provides
// a few statistics about the columns of the calling DataFrame.
Expand Down
1 change: 1 addition & 0 deletions include/DataFrame/DataFrameTypes.h
Original file line number Diff line number Diff line change
Expand Up @@ -530,6 +530,7 @@ enum class prob_dist_type : unsigned char {

enum class normalization_type : unsigned char {

none = 0,
simple = 1, // V / sum(xi)
euclidean = 2, // V / sqrt(sum(xi^2))
maxi = 3, // V / max(xi)
Expand Down
51 changes: 50 additions & 1 deletion include/DataFrame/Internals/DataFrame_get.tcc
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,7 @@ DataFrame<I, H>::get_column (const char *name, bool do_lock) const {
template<typename I, typename H>
template<typename T>
const typename DataFrame<I, H>::template ColumnVecType<typename T::type> &
DataFrame<I, H>::get_column () const {
DataFrame<I, H>::get_column() const {

return (const_cast<DataFrame *>(this)->get_column<typename T::type>(
T::name));
Expand Down Expand Up @@ -930,6 +930,55 @@ DataFrame<I, H>::difference(const DataFrame &other) const {
return (result);
}

// ----------------------------------------------------------------------------

// Copies the named columns (optionally normalized) into a column-major
// matrix, one DataFrame column per matrix column, and returns the
// covariance of that matrix.
//
// col_names:
//   Names of the columns of type T to include
// norm_type:
//   Normalization applied to every column before it is copied into the
//   matrix; normalization_type::none copies the raw column values
//
// Throws NotFeasible for fewer than two columns, but only when
// HMDF_SANITY_EXCEPTIONS is defined.
//
// NOTE(review): normalization runs over each full column, while the matrix
// only has as many rows as the shortest column/index. Confirm this is the
// intended behavior when column lengths differ.
//
template<typename I, typename H>
template<typename T>
Matrix<T, matrix_orient::column_major> DataFrame<I, H>::
covariance_matrix(std::vector<const char *> &&col_names,
                  normalization_type norm_type) const  {

    using matrix_t = Matrix<T, matrix_orient::column_major>;

    const size_type num_cols = col_names.size();

#ifdef HMDF_SANITY_EXCEPTIONS
    if (num_cols < 2)
        throw NotFeasible("covariance_matrix(): "
                          "You must specify at least two columns");
#endif // HMDF_SANITY_EXCEPTIONS

    // Row count starts at the index length and shrinks to the shortest
    // column seen below
    size_type                               num_rows { indices_.size() };
    std::vector<const ColumnVecType<T> *>   col_ptrs(num_cols, nullptr);
    SpinGuard                               guard (lock_);

    // Columns are looked up with do_lock == false because the guard above
    // is already in place
    for (size_type c { 0 }; c < num_cols; ++c)  {
        const ColumnVecType<T>  &col = get_column<T>(col_names[c], false);

        col_ptrs[c] = &col;
        if (col.size() < num_rows)
            num_rows = col.size();
    }
    guard.release();

    matrix_t    data_mat { static_cast<long>(num_rows),
                           static_cast<long>(num_cols) };

    if (norm_type == normalization_type::none)  {
        // No normalization requested; copy the raw column data
        for (size_type c { 0 }; c < num_cols; ++c)
            data_mat.set_column(col_ptrs[c]->begin(), c);
    }
    else  {
        for (size_type c { 0 }; c < num_cols; ++c)  {
            const ColumnVecType<T>  &col = *(col_ptrs[c]);
            NormalizeVisitor<T, I>  normalizer { norm_type };

            normalizer.pre();
            normalizer(indices_.begin(), indices_.end(),
                       col.begin(), col.end());
            normalizer.post();
            data_mat.set_column(normalizer.get_result().begin(), c);
        }
    }

    return (data_mat.covariance());
}

} // namespace hmdf

// ----------------------------------------------------------------------------
Expand Down
22 changes: 11 additions & 11 deletions include/DataFrame/Utils/Matrix.tcc
Original file line number Diff line number Diff line change
Expand Up @@ -1189,22 +1189,22 @@ eigen_space(MA1 &eigenvalues, MA2 &eigenvectors, bool sort_values) const {

if (sort_values) {
for (size_type c = 0; c < cols() - 1; ++c) {
size_type tmp_c { c };
value_type p { tmp_evals(0, c) };
size_type min_col { c };
value_type min_val { tmp_evals(0, c) };

for (size_type cc = c + 1; cc < cols(); ++cc)
if (tmp_evals(0, cc) < p) {
tmp_c = cc;
p = tmp_evals(0, cc);
if (tmp_evals(0, cc) < min_val) {
min_col = cc;
min_val = tmp_evals(0, cc);
}

if (tmp_c != c) {
tmp_evals(0, tmp_c) = tmp_evals(0, c);
tmp_evals(0, c) = p;
if (min_col != c) {
tmp_evals(0, min_col) = tmp_evals(0, c);
tmp_evals(0, c) = min_val;
for (size_type r = 0; r < rows(); ++r) {
p = tmp_evecs(r, c);
tmp_evecs(r, c) = tmp_evecs(r, tmp_c);
tmp_evecs(r, tmp_c) = p;
min_val = tmp_evecs(r, c);
tmp_evecs(r, c) = tmp_evecs(r, min_col);
tmp_evecs(r, min_col) = min_val;
}
}
}
Expand Down
44 changes: 44 additions & 0 deletions test/dataframe_tester_4.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2226,6 +2226,49 @@ static void test_StationaryCheckVisitor() {

// ----------------------------------------------------------------------------

static void test_covariance_matrix() {

std::cout << "\nTesting covariance_matrix( ) ..." << std::endl;

StrDataFrame df;

try {
df.read("IBM.csv", io_format::csv2);
}
catch (const DataFrameError &ex) {
std::cout << ex.what() << std::endl;
}

const auto cov_mat =
df.covariance_matrix<double>({ "IBM_Close", "IBM_Open",
"IBM_High", "IBM_Low" });

assert(cov_mat.rows() == 4);
assert(cov_mat.cols() == 4);
assert(std::fabs(cov_mat(0, 0) - 1467.58) < 0.01);
assert(std::fabs(cov_mat(0, 2) - 1469.69) < 0.01);
assert(std::fabs(cov_mat(2, 1) - 1469.48) < 0.01);
assert(std::fabs(cov_mat(2, 2) - 1472.86) < 0.01);
assert(std::fabs(cov_mat(3, 2) - 1466.15) < 0.01);
assert(std::fabs(cov_mat(3, 3) - 1461.0) < 0.01);

const auto cov_mat2 =
df.covariance_matrix<double>({ "IBM_Close", "IBM_Open",
"IBM_High", "IBM_Low" },
normalization_type::z_score);

assert(cov_mat2.rows() == 4);
assert(cov_mat2.cols() == 4);
assert(std::fabs(cov_mat2(0, 0) - 1.0) < 0.01);
assert(std::fabs(cov_mat2(0, 2) - 0.99964) < 0.00001);
assert(std::fabs(cov_mat2(2, 1) - 0.99963) < 0.00001);
assert(std::fabs(cov_mat2(2, 2) - 1.0) < 0.01);
assert(std::fabs(cov_mat2(3, 2) - 0.99948) < 0.00001);
assert(std::fabs(cov_mat2(3, 3) - 1.0) < 0.01);
}

// ----------------------------------------------------------------------------

int main(int, char *[]) {

MyDataFrame::set_optimum_thread_level();
Expand Down Expand Up @@ -2266,6 +2309,7 @@ int main(int, char *[]) {
test_PartialAutoCorrVisitor();
test_make_stationary();
test_StationaryCheckVisitor();
test_covariance_matrix();

return (0);
}
Expand Down

0 comments on commit a120a51

Please sign in to comment.