Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding more container types to CSV2 reading/writing format #261

Merged
merged 3 commits into from
Oct 9, 2023
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Added seed option to all random algos
hosseinmoein committed Oct 9, 2023
commit d9d82b3a0aa1f50369c8203e766a97973c2727fc
12 changes: 6 additions & 6 deletions docs/HTML/get_data_by_rand.html
Original file line number Diff line number Diff line change
@@ -66,7 +66,7 @@
DataFrame<I, H>
get_data_by_rand(random_policy spec,
double n,
std::size_t seed = 0) const;
seed_t seed = 0) const;
</B></PRE></font>
</td>
<td>
@@ -78,7 +78,7 @@
<B>Ts</b>: The list of types for all columns. A type should be specified only once.<BR>
<B>random_policy</b>: Please see random_policy in DataFrameTypes.h. It specifies how this function should proceed.<BR>
<B>n</b>: Depending on the random policy, it is either the number of rows to sample or a fraction of rows to sample. In case of fraction, for example 0.4 means 40% of rows.<BR>
<B>seed</b>: Depending on the random policy, user could specify a seed. The same seed should always produce the same random selection.<BR>
<B>seed</b>: (unsigned int) Depending on the random policy, the user can specify a seed. The same seed should always produce the same random selection.<BR>
</td>
</tr>

@@ -89,7 +89,7 @@
PtrView
get_view_by_rand(random_policy spec,
double n,
std::size_t seed = 0);
seed_t seed = 0);
</B></PRE></font>
</td>
<td>
@@ -101,7 +101,7 @@
<B>Ts</b>: The list of types for all columns. A type should be specified only once.<BR>
<B>random_policy</b>: Please see random_policy in DataFrameTypes.h. It specifies how this function should proceed.<BR>
<B>n</b>: Depending on the random policy, it is either the number of rows to sample or a fraction of rows to sample. In case of fraction, for example 0.4 means 40% of rows.<BR>
<B>seed</b>: Depending on the random policy, user could specify a seed. The same seed should always produce the same random selection.<BR>
<B>seed</b>: (unsigned int) Depending on the random policy, the user can specify a seed. The same seed should always produce the same random selection.<BR>
</td>
</tr>

@@ -112,7 +112,7 @@
ConstPtrView
get_view_by_rand(random_policy spec,
double n,
std::size_t seed = 0) const;
seed_t seed = 0) const;
</B></PRE></font>
</td>
<td>
@@ -122,7 +122,7 @@
<B>Ts</b>: The list of types for all columns. A type should be specified only once.<BR>
<B>random_policy</b>: Please see random_policy in DataFrameTypes.h. It specifies how this function should proceed.<BR>
<B>n</b>: Depending on the random policy, it is either the number of rows to sample or a fraction of rows to sample. In case of fraction, for example 0.4 means 40% of rows.<BR>
<B>seed</b>: Depending on the random policy, user could specify a seed. The same seed should always produce the same random selection.<BR>
<B>seed</b>: (unsigned int) Depending on the random policy, the user can specify a seed. The same seed should always produce the same random selection.<BR>
</td>
</tr>

11 changes: 7 additions & 4 deletions include/DataFrame/DataFrame.h
Original file line number Diff line number Diff line change
@@ -115,6 +115,8 @@ class DataFrame : public ThreadGranularity {
template<typename T>
using StlVecType = std::vector<T, AllocatorType<T>>;

using seed_t = std::random_device::result_type;

DataFrame() = default;

// Because of thread safety, these need tender loving care
@@ -937,7 +939,8 @@ class DataFrame : public ThreadGranularity {
template<typename ... Ts>
void
shuffle(const StlVecType<const char *> &col_names,
bool also_shuffle_index);
bool also_shuffle_index,
seed_t seed = seed_t(-1));

// It fills all the "missing values" with the given values, and/or using
// the given method.
@@ -2492,7 +2495,7 @@ class DataFrame : public ThreadGranularity {
//
template<typename ... Ts>
[[nodiscard]] DataFrame
get_data_by_rand(random_policy spec, double n, size_type seed = 0) const;
get_data_by_rand(random_policy spec, double n, seed_t seed = 0) const;

// It behaves like get_data_by_rand(), but it returns a PtrView.
// A view is a DataFrame that is a reference to the original DataFrame.
@@ -2521,11 +2524,11 @@ class DataFrame : public ThreadGranularity {
//
template<typename ... Ts>
[[nodiscard]] PtrView
get_view_by_rand(random_policy spec, double n, size_type seed = 0);
get_view_by_rand(random_policy spec, double n, seed_t seed = 0);

template<typename ... Ts>
[[nodiscard]] ConstPtrView
get_view_by_rand(random_policy spec, double n, size_type seed = 0) const;
get_view_by_rand(random_policy spec, double n, seed_t seed = 0) const;

// This returns a DataFrame with index and col_names copied from the
// original DataFrame
15 changes: 12 additions & 3 deletions include/DataFrame/DataFrameMLVisitors.h
Original file line number Diff line number Diff line change
@@ -134,11 +134,13 @@ struct KMeansVisitor {
using cluster_type = std::array<VectorConstPtrView<value_type, A>, K>;
using distance_func =
std::function<double(const value_type &x, const value_type &y)>;
using seed_t = std::random_device::result_type;

private:

const size_type iter_num_;
const bool cc_;
const seed_t seed_;
distance_func dfunc_;
result_type result_ { }; // K Means
cluster_type clusters_ { }; // K Clusters
@@ -147,10 +149,12 @@ struct KMeansVisitor {
inline void calc_k_means_(const H &column_begin, size_type col_s) {

std::random_device rd;
std::mt19937 gen(rd());
std::mt19937 gen(
(seed_ != seed_t(-1)) ? seed_ : rd());
std::uniform_int_distribution<size_type> rd_gen(0, col_s - 1);

// Pick centroids as random points from the col.
//
for (auto &k_mean : result_) [[likely]] {
const value_type &value = *(column_begin + rd_gen(gen));

@@ -280,8 +284,9 @@ struct KMeansVisitor {
distance_func f =
[](const value_type &x, const value_type &y) -> double {
return ((x - y) * (x - y));
})
: iter_num_(num_of_iter), cc_(calc_clusters), dfunc_(f) { }
},
seed_t seed = seed_t(-1))
: iter_num_(num_of_iter), cc_(calc_clusters), seed_(seed), dfunc_(f) { }
};

// ----------------------------------------------------------------------------
@@ -316,6 +321,7 @@ struct AffinityPropVisitor {
double min_dist = std::numeric_limits<double>::max();

// Compute similarity between distinct data points i and j
//
for (size_type i = 0; i < csize - 1; ++i) [[likely]] {
const value_type &i_val = *(column_begin + i);

@@ -328,6 +334,7 @@ struct AffinityPropVisitor {
}

// Assign min to diagonals
//
for (size_type i = 0; i < csize; ++i)
simil[(i * csize) + i - ((i * (i + 1)) >> 1)] = min_dist;

@@ -345,6 +352,7 @@ struct AffinityPropVisitor {

for (size_type m = 0; m < iter_num_; ++m) [[likely]] {
// Update responsibility
//
for (size_type i = 0; i < csize; ++i) [[likely]] {
for (size_type j = 0; j < csize; ++j) [[likely]] {
double max_diff = -std::numeric_limits<double>::max();
@@ -370,6 +378,7 @@ struct AffinityPropVisitor {

// Update availability
// Do diagonals first
//
for (size_type i = 0; i < csize; ++i) [[likely]] {
const size_type s1 = i * csize;
double sum = 0.0;
14 changes: 7 additions & 7 deletions include/DataFrame/Internals/DataFrame.tcc
Original file line number Diff line number Diff line change
@@ -143,17 +143,17 @@ template<typename I, typename H>
template<typename ... Ts>
void
DataFrame<I, H>::shuffle(const StlVecType<const char *> &col_names,
bool also_shuffle_index) {
bool also_shuffle_index,
seed_t seed) {

if (also_shuffle_index) {
std::random_device rd;
std::mt19937 g(rd());
std::random_device rd;
std::mt19937 g ((seed != seed_t(-1)) ? seed : rd());

if (also_shuffle_index)
std::shuffle(indices_.begin(), indices_.end(), g);
}

shuffle_functor_<Ts ...> functor;
const SpinGuard guard(lock_);
shuffle_functor_<Ts ...> functor (g);
const SpinGuard guard (lock_);

for (const auto &name_citer : col_names) [[likely]] {
const auto citer = column_tb_.find (name_citer);
4 changes: 3 additions & 1 deletion include/DataFrame/Internals/DataFrame_functors.h
Original file line number Diff line number Diff line change
@@ -526,7 +526,9 @@ struct sel_remove_functor_ : DataVec::template visitor_base<Ts ...> {
template<typename ... Ts>
struct shuffle_functor_ : DataVec::template visitor_base<Ts ...> {

inline shuffle_functor_ () { }
inline shuffle_functor_ (std::mt19937 &g) : g_(g) { }

std::mt19937 &g_;

template<typename T>
void operator() (T &vec) const;
39 changes: 15 additions & 24 deletions include/DataFrame/Internals/DataFrame_get.tcc
Original file line number Diff line number Diff line change
@@ -2128,7 +2128,7 @@ get_view_by_sel(const char *name1,
template<typename I, typename H>
template<typename ... Ts>
DataFrame<I, H> DataFrame<I, H>::
get_data_by_rand(random_policy spec, double n, size_type seed) const {
get_data_by_rand(random_policy spec, double n, seed_t seed) const {

bool use_seed = false;
size_type n_rows = static_cast<size_type>(n);
@@ -2146,13 +2146,10 @@ get_data_by_rand(random_policy spec, double n, size_type seed) const {
}

if (index_s > 0 && n_rows <= index_s) [[likely]] {
std::random_device rd;
std::mt19937 gen(rd());

if (use_seed) gen.seed(static_cast<unsigned int>(seed));

std::uniform_int_distribution<size_type> dis(0, index_s - 1);
StlVecType<size_type> rand_indices(n_rows);
std::random_device rd;
std::mt19937 gen(use_seed ? seed : rd());
std::uniform_int_distribution<size_type> dis(0, index_s - 1);
StlVecType<size_type> rand_indices(n_rows);

for (size_type i = 0; i < n_rows; ++i)
rand_indices[i] = dis(gen);
@@ -2204,7 +2201,7 @@ get_data_by_rand(random_policy spec, double n, size_type seed) const {
template<typename I, typename H>
template<typename ... Ts>
typename DataFrame<I, H>::PtrView DataFrame<I, H>::
get_view_by_rand (random_policy spec, double n, size_type seed) {
get_view_by_rand (random_policy spec, double n, seed_t seed) {

bool use_seed = false;
size_type n_rows = static_cast<size_type>(n);
@@ -2222,13 +2219,10 @@ get_view_by_rand (random_policy spec, double n, size_type seed) {
}

if (index_s > 0 && n_rows <= index_s) [[likely]] {
std::random_device rd;
std::mt19937 gen(rd());

if (use_seed) gen.seed(static_cast<unsigned int>(seed));

std::uniform_int_distribution<size_type> dis(0, index_s - 1);
StlVecType<size_type> rand_indices(n_rows);
std::random_device rd;
std::mt19937 gen(use_seed ? seed : rd());
std::uniform_int_distribution<size_type> dis(0, index_s - 1);
StlVecType<size_type> rand_indices(n_rows);

for (size_type i = 0; i < n_rows; ++i) [[likely]]
rand_indices[i] = dis(gen);
@@ -2281,7 +2275,7 @@ template<typename I, typename H>
template<typename ... Ts>
typename DataFrame<I, H>::ConstPtrView
DataFrame<I, H>::
get_view_by_rand (random_policy spec, double n, size_type seed) const {
get_view_by_rand (random_policy spec, double n, seed_t seed) const {

bool use_seed = false;
size_type n_rows = static_cast<size_type>(n);
@@ -2299,13 +2293,10 @@ get_view_by_rand (random_policy spec, double n, size_type seed) const {
}

if (index_s > 0 && n_rows <= index_s) [[likely]] {
std::random_device rd;
std::mt19937 gen(rd());

if (use_seed) gen.seed(static_cast<unsigned int>(seed));

std::uniform_int_distribution<size_type> dis(0, index_s - 1);
StlVecType<size_type> rand_indices(n_rows);
std::random_device rd;
std::mt19937 gen(use_seed ? seed : rd());
std::uniform_int_distribution<size_type> dis(0, index_s - 1);
StlVecType<size_type> rand_indices(n_rows);

for (size_type i = 0; i < n_rows; ++i) [[likely]]
rand_indices[i] = dis(gen);
5 changes: 1 addition & 4 deletions include/DataFrame/Internals/DataFrame_misc.tcc
Original file line number Diff line number Diff line change
@@ -684,10 +684,7 @@ DataFrame<I, H>::
shuffle_functor_<Ts ...>::
operator() (T &vec) const {

std::random_device rd;
std::mt19937 g(rd());

std::shuffle(vec.begin(), vec.end(), g);
std::shuffle(vec.begin(), vec.end(), g_);
return;
}

24 changes: 19 additions & 5 deletions test/dataframe_tester.cc
Original file line number Diff line number Diff line change
@@ -3719,7 +3719,14 @@ static void test_k_means() {
std::make_pair("col1",
gen_lognormal_dist<double, 128>(item_cnt, p)));

KMeansVisitor<5, double, unsigned long, 128> km_visitor(1000);
KMeansVisitor<5,
double,
unsigned long,
128> km_visitor(1000, true,
[](const double &x, const double &y) {
return ((x - y) * (x - y));
},
10);

df.single_act_visit<double>("col1", km_visitor);
std::cout << "Means of clusters are: ";
@@ -3797,7 +3804,7 @@ static void test_k_means() {
KMeansVisitor<5,
Point,
unsigned long,
128> km_visitor2(1000, true, point_distance);
128> km_visitor2(1000, true, point_distance, 10);

df.single_act_visit<Point>("point_col", km_visitor2);

@@ -3883,7 +3890,7 @@ static void test_affinity_propagation() {
StlVecType<double> final_col;
StlVecType<double> col_data;

p.seed = 3575984165U;
p.seed = 10U;

p.min_value = 0;
p.max_value = 10;
@@ -3912,9 +3919,16 @@ static void test_affinity_propagation() {

df.load_data(MyDataFrame::gen_sequence_index(0, item_cnt * 5, 1),
std::make_pair("col1", final_col));
df.shuffle<double>({"col1"}, false);
df.shuffle<double>({"col1"}, false, 10);

KMeansVisitor<5, double, unsigned long, 128> km_visitor(1000);
KMeansVisitor<5,
double,
unsigned long,
128> km_visitor(1000, true,
[](const double &x, const double &y) {
return ((x - y) * (x - y));
},
10);
AffinityPropVisitor<double, unsigned long, 128> ap_visitor(50);

df.single_act_visit<double>("col1", km_visitor);
32 changes: 16 additions & 16 deletions test/dataframe_tester_output.txt

Large diffs are not rendered by default.