Skip to content

Commit

Permalink
Implemented get_above_quantile_[data|view]()
Browse files Browse the repository at this point in the history
  • Loading branch information
hosseinmoein committed Jul 19, 2024
1 parent 8ee8698 commit ce13e44
Show file tree
Hide file tree
Showing 8 changed files with 488 additions and 21 deletions.
4 changes: 4 additions & 0 deletions docs/HTML/DataFrame.html
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,10 @@ <H2><font color="blue">API Reference with code samples</font></H2>
<td title="Generates sequenced index values"><a href="https://htmlpreview.github.io/?https://github.com/hosseinmoein/DataFrame/blob/master/docs/HTML/gen_sequence_index.html">static<BR>gen_sequence_index</a>()</td>
</tr>

<tr class="item" onmouseover="this.style.backgroundColor='#ffff66';" onmouseout="this.style.backgroundColor='#d4e3e5';">
<td title="Gets data or view of above quantile rows"><a href="https://htmlpreview.github.io/?https://github.com/hosseinmoein/DataFrame/blob/master/docs/HTML/get_above_quantile_data.html">get_above_quantile_data()<BR>get_above_quantile_view()</a></td>
</tr>

<tr class="item" onmouseover="this.style.backgroundColor='#ffff66';" onmouseout="this.style.backgroundColor='#d4e3e5';">
<td title="Gets data or view of n bottom rows"><a href="https://htmlpreview.github.io/?https://github.com/hosseinmoein/DataFrame/blob/master/docs/HTML/get_top_n_data.html">get_bottom_n_data()<BR>get_bottom_n_view()</a></td>
</tr>
Expand Down
197 changes: 197 additions & 0 deletions docs/HTML/get_above_quantile_data.html

Large diffs are not rendered by default.

12 changes: 6 additions & 6 deletions docs/HTML/get_top_n_data.html
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@
<tr bgcolor="Azure">
<td bgcolor="blue"> <font color="white">
<PRE><B>
template&lt;typename T, typename F, typename ... Ts&gt;
template&lt;typename T, typename ... Ts&gt;
DataFrame
get_top_n_data(const char *col_name, size_type n) const;
</B></PRE></font>
Expand All @@ -64,7 +64,7 @@
<tr bgcolor="Azure">
<td bgcolor="blue"> <font color="white">
<PRE><B>
template&lt;typename T, typename F, typename ... Ts&gt;
template&lt;typename T, typename ... Ts&gt;
PtrView
get_top_n_view(const char *col_name, size_type n);
</B></PRE></font>
Expand All @@ -89,7 +89,7 @@
<tr bgcolor="Azure">
<td bgcolor="blue"> <font color="white">
<PRE><B>
template&lt;typename T, typename F, typename ... Ts&gt;
template&lt;typename T, typename ... Ts&gt;
ConstPtrView
get_top_n_view(const char *col_name, size_type n) const;
</B></PRE></font>
Expand All @@ -109,7 +109,7 @@
<tr bgcolor="Azure">
<td bgcolor="blue"> <font color="white">
<PRE><B>
template&lt;typename T, typename F, typename ... Ts&gt;
template&lt;typename T, typename ... Ts&gt;
DataFrame
get_bottom_n_data(const char *col_name, size_type n) const;
</B></PRE></font>
Expand All @@ -130,7 +130,7 @@
<tr bgcolor="Azure">
<td bgcolor="blue"> <font color="white">
<PRE><B>
template&lt;typename T, typename F, typename ... Ts&gt;
template&lt;typename T, typename ... Ts&gt;
PtrView
get_bottom_n_view(const char *col_name, size_type n);
</B></PRE></font>
Expand All @@ -155,7 +155,7 @@
<tr bgcolor="Azure">
<td bgcolor="blue"> <font color="white">
<PRE><B>
template&lt;typename T, typename F, typename ... Ts&gt;
template&lt;typename T, typename ... Ts&gt;
ConstPtrView
get_bottom_n_view(const char *col_name, size_type n) const;
</B></PRE></font>
Expand Down
35 changes: 34 additions & 1 deletion include/DataFrame/DataFrame.h
Original file line number Diff line number Diff line change
Expand Up @@ -2837,7 +2837,7 @@ class DataFrame : public ThreadGranularity {
[[nodiscard]] DataFrame<I, HeteroVector<std::size_t(H::align_value)>>
get_bottom_n_data(const char *col_name, size_type n) const;

// Smae as above but it returns a View with the n bottom rows of
// Same as above but it returns a View with the n bottom rows of
// the given column.
//
template<typename T, typename ... Ts>
Expand All @@ -2851,6 +2851,39 @@ class DataFrame : public ThreadGranularity {
[[nodiscard]] ConstPtrView
get_bottom_n_view(const char *col_name, size_type n) const;

// This returns a new DataFrame with rows greater than the specified
// quantile of the given column. The row equal to the quantile is also
// included.
// The returned DataFrame rows will be in the same order as self.
//
// NOTE: Comparison operators (<, >, ==) must be well defined for type T.
//
// T:
// Type of the named column
// Ts:
// List all the types of all data columns. A type should be specified in
// the list only once.
// col_name:
// Name of the given column
// quantile:
// quantile specified as fraction. For example, 0.35 for 35% quantile.
//
template<typename T, typename ... Ts>
[[nodiscard]] DataFrame<I, HeteroVector<std::size_t(H::align_value)>>
get_above_quantile_data(const char *col_name, double quantile) const;

// Same as above but it returns a View with above quantile rows
//
template<typename T, typename ... Ts>
[[nodiscard]] PtrView
get_above_quantile_view(const char *col_name, double quantile);

// Same as above but it returns a const View with above quantile rows
//
template<typename T, typename ... Ts>
[[nodiscard]] ConstPtrView
get_above_quantile_view(const char *col_name, double quantile) const;

// This returns a new DataFrame with the same index column as self and an
// integer column with the same name for each column in self.
// The integer columns in returned DataFrame show a duplication mask for
Expand Down
53 changes: 52 additions & 1 deletion include/DataFrame/Internals/DataFrame_get.tcc
Original file line number Diff line number Diff line change
Expand Up @@ -2635,7 +2635,58 @@ get_bottom_n_view(const char *name, size_type n) const {
}

// ----------------------------------------------------------------------------


template<typename I, typename H>
template<typename T, typename ... Ts>
DataFrame<I, HeteroVector<std::size_t(H::align_value)>> DataFrame<I, H>::
get_above_quantile_data(const char *col_name, double quantile) const {

    // Builds a new DataFrame by delegating to above_quantile_common_()
    // with std::greater_equal<T> as the comparison between each column
    // value and the computed quantile.
    //
    using owning_df_t = DataFrame<I, HeteroVector<align_value>>;
    using ge_comp_t = std::greater_equal<T>;

    owning_df_t selected;

    above_quantile_common_<T, ge_comp_t, owning_df_t, Ts ...>(
        col_name, quantile, ge_comp_t { }, selected);
    return (selected);
}

// ----------------------------------------------------------------------------

template<typename I, typename H>
template<typename T, typename ... Ts>
typename DataFrame<I, H>::PtrView DataFrame<I, H>::
get_above_quantile_view(const char *col_name, double quantile) {

    // Non-const overload: fills and returns a PtrView by delegating to
    // above_quantile_common_() with std::greater_equal<T> as the
    // comparison between each column value and the computed quantile.
    //
    using view_t = PtrView;
    using ge_comp_t = std::greater_equal<T>;

    view_t  selected;

    above_quantile_common_<T, ge_comp_t, view_t, Ts ...>(
        col_name, quantile, ge_comp_t { }, selected);
    return (selected);
}

// ----------------------------------------------------------------------------

template<typename I, typename H>
template<typename T, typename ... Ts>
typename DataFrame<I, H>::ConstPtrView DataFrame<I, H>::
get_above_quantile_view(const char *col_name, double quantile) const {

    // Const overload: fills and returns a ConstPtrView by delegating to
    // above_quantile_common_() with std::greater_equal<T> as the
    // comparison between each column value and the computed quantile.
    //
    using const_view_t = ConstPtrView;
    using ge_comp_t = std::greater_equal<T>;

    const_view_t    selected;

    above_quantile_common_<T, ge_comp_t, const_view_t, Ts ...>(
        col_name, quantile, ge_comp_t { }, selected);
    return (selected);
}

// ----------------------------------------------------------------------------

template<typename I, typename H>
template<hashable_equal ... Ts>
DataFrame<I, HeteroVector<std::size_t(H::align_value)>> DataFrame<I, H>::
Expand Down
64 changes: 64 additions & 0 deletions include/DataFrame/Internals/DataFrame_private_decl.h
Original file line number Diff line number Diff line change
Expand Up @@ -968,6 +968,70 @@ void top_n_common_(const char *col_name, V &&visitor, R &result) const {

// ----------------------------------------------------------------------------

template<typename T, typename C, typename R, typename ... Ts>
void
above_quantile_common_(const char *col_name,
double quantile,
C &&comp_func,
R &result) const {

using res_t = R;

const ColumnVecType<T> *vec { nullptr };

if (! ::strcmp(col_name, DF_INDEX_COL_NAME))
vec = (const ColumnVecType<T> *) &(get_index());
else
vec = (const ColumnVecType<T> *) &(get_column<T>(col_name));

QuantileVisitor<T, I> quant { quantile };

quant.pre();
quant(indices_.begin(), indices_.end(), vec->begin(), vec->end());
quant.post();

typename res_t::IndexVecType new_index;
StlVecType<size_type> idxs;

new_index.reserve(vec->size() / 2);
idxs.reserve(vec->size() / 2);
for (size_type i { 0 }; i < vec->size(); ++i) {
if (comp_func((*vec)[i], quant.get_result())) {
if constexpr (std::is_same_v<
res_t,
DataFrame<I, HeteroVector<align_value>>>)
new_index.push_back(indices_[i]);
else // Views
new_index.push_back(
&(const_cast<DataFrame *>(this)->indices_[i]));
idxs.push_back(i);
}
}
result.indices_ = std::move(new_index);

const SpinGuard guard(lock_);

if constexpr (std::is_same_v<res_t,
DataFrame<I, HeteroVector<align_value>>>) {
for (const auto &[name, idx] : column_list_) [[likely]] {
sel_load_functor_<res_t, size_type, Ts ...> functor(
name.c_str(), idxs, 0, result);

data_[idx].change(functor);
}
}
else { // Views
for (const auto &[name, idx] : column_list_) [[likely]] {
sel_load_view_functor_<size_type, res_t, Ts ...> functor(
name.c_str(), idxs, 0, result);

data_[idx].change(functor);
}
}
}

// ----------------------------------------------------------------------------

template<typename V, typename T>
inline static void
replace_vector_vals_(V &data_vec,
Expand Down
104 changes: 104 additions & 0 deletions test/dataframe_tester_3.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4657,6 +4657,109 @@ static void test_get_bottom_n_data() {

// -----------------------------------------------------------------------------

// Exercises get_above_quantile_data() and get_above_quantile_view() on both
// a DataFrame and a view created from it. The data variants are checked
// with asserts; the view variants are only written to stdout as CSV.
//
static void test_get_above_quantile_data() {

    std::cout << "\nTesting get_above_quantile_data( ) ..." << std::endl;

    StlVecType<unsigned long>   idx =
        { 123450, 123451, 123452, 123453, 123454, 123455, 123456,
          123457, 123458, 123459, 123460, 123461, 123462, 123466 };
    StlVecType<double>          d1 =
        { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 };
    StlVecType<double>          d2 =
        { 8, 9, 10, 11, 12, 13, 14, 20, 22, 23, 30, 31, 32, 1.89 };
    StlVecType<double>          d3 =
        { 15, 16, 15, 18, 19, 16, 21, 0.34, 1.56, 0.34, 2.3, 0.34, 19.0, 10 };
    // Deliberately shorter than the other columns
    StlVecType<int>             i1 = { 22, 23, 24, 25, 99 };
    MyDataFrame                 df;

    df.load_data(std::move(idx),
                 std::make_pair("col_1", d1),
                 std::make_pair("col_2", d2),
                 std::make_pair("col_3", d3),
                 std::make_pair("col_4", i1));

    // Every col_1 value is below 100, so the predicate accepts all rows
    auto    lbd =
        [](const unsigned long &, const double &val) -> bool {
            return (val < 100.0);
        };
    auto    view =
        df.get_view_by_sel<double, decltype(lbd), double, int, std::string>
            ("col_1", lbd);

    auto    res1 =
        df.get_above_quantile_data<double, int, double, std::string>
            ("col_3", 0.45);
    auto    res2 =
        view.get_above_quantile_data<double, int, double, std::string>
            ("col_3", 0.45);
    auto    res3 =
        df.get_above_quantile_view<double, int, double, std::string>
            ("col_3", 0.45);
    auto    res4 =
        view.get_above_quantile_view<double, int, double, std::string>
            ("col_3", 0.45);
    // The column type must be the index type (unsigned long here) when
    // selecting on the index column.
    auto    res5 =
        view.get_above_quantile_data<unsigned long, int, double, std::string>
            (DF_INDEX_COL_NAME, 0.45);

    // Data result from the DataFrame, selected on col_3
    {
        StlVecType<unsigned long>   out_idx =
            { 123450, 123451, 123452, 123453, 123454, 123455, 123456, 123462 };
        StlVecType<double>          out_col_2 =
            { 8, 9, 10, 11, 12, 13, 14, 32 };
        StlVecType<double>          out_col_3 =
            { 15, 16, 15, 18, 19, 16, 21, 19 };
        StlVecType<int>             out_col_4 = { 22, 23, 24, 25, 99, 0, 0, 0 };

        assert(res1.get_index() == out_idx);
        assert(res1.get_column<double>("col_2") == out_col_2);
        assert(res1.get_column<double>("col_3") == out_col_3);
        assert(res1.get_column<int>("col_4") == out_col_4);
    }
    // Data result from the view, selected on col_3; same expectation
    {
        StlVecType<unsigned long>   out_idx =
            { 123450, 123451, 123452, 123453, 123454, 123455, 123456, 123462 };
        StlVecType<double>          out_col_2 =
            { 8, 9, 10, 11, 12, 13, 14, 32 };
        StlVecType<double>          out_col_3 =
            { 15, 16, 15, 18, 19, 16, 21, 19 };
        StlVecType<int>             out_col_4 = { 22, 23, 24, 25, 99, 0, 0, 0 };

        assert(res2.get_index() == out_idx);
        assert(res2.get_column<double>("col_2") == out_col_2);
        assert(res2.get_column<double>("col_3") == out_col_3);
        assert(res2.get_column<int>("col_4") == out_col_4);
    }

    // View results are printed rather than asserted
    res3.write<std::ostream, double, int, std::string>
        (std::cout, io_format::csv);
    std::cout << std::endl;

    res4.write<std::ostream, double, int, std::string>
        (std::cout, io_format::csv);
    std::cout << std::endl;

    // Data result from the view, selected on the index column
    {
        StlVecType<unsigned long>   out_idx =
            { 123455, 123456, 123457, 123458, 123459, 123460, 123461,
              123462, 123466 };
        StlVecType<double>          out_col_2 =
            { 13, 14, 20, 22, 23, 30, 31, 32, 1.89 };
        StlVecType<double>          out_col_3 =
            { 16, 21, 0.34, 1.56, 0.34, 2.3, 0.34, 19, 10 };
        StlVecType<int>             out_col_4 =
            { 0, 0, 0, 0, 0, 0, 0, 0, 0 };

        assert(res5.get_index() == out_idx);
        assert(res5.get_column<double>("col_2") == out_col_2);
        assert(res5.get_column<double>("col_3") == out_col_3);
        assert(res5.get_column<int>("col_4") == out_col_4);
    }
}

// -----------------------------------------------------------------------------

int main(int, char *[]) {

MyDataFrame::set_optimum_thread_level();
Expand Down Expand Up @@ -4747,6 +4850,7 @@ int main(int, char *[]) {
test_duplication_mask();
test_get_top_n_data();
test_get_bottom_n_data();
test_get_above_quantile_data();

return (0);
}
Expand Down
Loading

0 comments on commit ce13e44

Please sign in to comment.