Skip to content

Commit

Permalink
Added seed option to all random algos
Browse files Browse the repository at this point in the history
  • Loading branch information
hosseinmoein committed Oct 9, 2023
1 parent 91210a2 commit d9d82b3
Show file tree
Hide file tree
Showing 9 changed files with 86 additions and 70 deletions.
12 changes: 6 additions & 6 deletions docs/HTML/get_data_by_rand.html
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@
DataFrame<I, H>
get_data_by_rand(random_policy spec,
double n,
std::size_t seed = 0) const;
seed_t seed = 0) const;
</B></PRE></font>
</td>
<td>
Expand All @@ -78,7 +78,7 @@
<B>Ts</b>: The list of types for all columns. A type should be specified only once.<BR>
<B>random_policy</b>: Please see random_policy in DataFrameTypes.h. It specifies how this function should proceed.<BR>
<B>n</b>: Depending on the random policy, it is either the number of rows to sample or a fraction of rows to sample. In case of fraction, for example 0.4 means 40% of rows.<BR>
<B>seed</b>: Depending on the random policy, user could specify a seed. The same seed should always produce the same random selection.<BR>
<B>seed</b>: (unsigned int) Depending on the random policy, the user can specify a seed. The same seed should always produce the same random selection.<BR>
</td>
</tr>

Expand All @@ -89,7 +89,7 @@
PtrView
get_view_by_rand(random_policy spec,
double n,
std::size_t seed = 0);
seed_t seed = 0);
</B></PRE></font>
</td>
<td>
Expand All @@ -101,7 +101,7 @@
<B>Ts</b>: The list of types for all columns. A type should be specified only once.<BR>
<B>random_policy</b>: Please see random_policy in DataFrameTypes.h. It specifies how this function should proceed.<BR>
<B>n</b>: Depending on the random policy, it is either the number of rows to sample or a fraction of rows to sample. In case of fraction, for example 0.4 means 40% of rows.<BR>
<B>seed</b>: Depending on the random policy, user could specify a seed. The same seed should always produce the same random selection.<BR>
<B>seed</b>: (unsigned int) Depending on the random policy, the user can specify a seed. The same seed should always produce the same random selection.<BR>
</td>
</tr>

Expand All @@ -112,7 +112,7 @@
ConstPtrView
get_view_by_rand(random_policy spec,
double n,
std::size_t seed = 0) const;
seed_t seed = 0) const;
</B></PRE></font>
</td>
<td>
Expand All @@ -122,7 +122,7 @@
<B>Ts</b>: The list of types for all columns. A type should be specified only once.<BR>
<B>random_policy</b>: Please see random_policy in DataFrameTypes.h. It specifies how this function should proceed.<BR>
<B>n</b>: Depending on the random policy, it is either the number of rows to sample or a fraction of rows to sample. In case of fraction, for example 0.4 means 40% of rows.<BR>
<B>seed</b>: Depending on the random policy, user could specify a seed. The same seed should always produce the same random selection.<BR>
<B>seed</b>: (unsigned int) Depending on the random policy, the user can specify a seed. The same seed should always produce the same random selection.<BR>
</td>
</tr>

Expand Down
11 changes: 7 additions & 4 deletions include/DataFrame/DataFrame.h
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,8 @@ class DataFrame : public ThreadGranularity {
template<typename T>
using StlVecType = std::vector<T, AllocatorType<T>>;

using seed_t = std::random_device::result_type;

DataFrame() = default;

// Because of thread safety, these need tender loving care
Expand Down Expand Up @@ -937,7 +939,8 @@ class DataFrame : public ThreadGranularity {
template<typename ... Ts>
void
shuffle(const StlVecType<const char *> &col_names,
bool also_shuffle_index);
bool also_shuffle_index,
seed_t seed = seed_t(-1));

// It fills all the "missing values" with the given values, and/or using
// the given method.
Expand Down Expand Up @@ -2492,7 +2495,7 @@ class DataFrame : public ThreadGranularity {
//
template<typename ... Ts>
[[nodiscard]] DataFrame
get_data_by_rand(random_policy spec, double n, size_type seed = 0) const;
get_data_by_rand(random_policy spec, double n, seed_t seed = 0) const;

// It behaves like get_data_by_rand(), but it returns a PtrView.
// A view is a DataFrame that is a reference to the original DataFrame.
Expand Down Expand Up @@ -2521,11 +2524,11 @@ class DataFrame : public ThreadGranularity {
//
template<typename ... Ts>
[[nodiscard]] PtrView
get_view_by_rand(random_policy spec, double n, size_type seed = 0);
get_view_by_rand(random_policy spec, double n, seed_t seed = 0);

template<typename ... Ts>
[[nodiscard]] ConstPtrView
get_view_by_rand(random_policy spec, double n, size_type seed = 0) const;
get_view_by_rand(random_policy spec, double n, seed_t seed = 0) const;

// This returns a DataFrame with index and col_names copied from the
// original DataFrame
Expand Down
15 changes: 12 additions & 3 deletions include/DataFrame/DataFrameMLVisitors.h
Original file line number Diff line number Diff line change
Expand Up @@ -134,11 +134,13 @@ struct KMeansVisitor {
using cluster_type = std::array<VectorConstPtrView<value_type, A>, K>;
using distance_func =
std::function<double(const value_type &x, const value_type &y)>;
using seed_t = std::random_device::result_type;

private:

const size_type iter_num_;
const bool cc_;
const seed_t seed_;
distance_func dfunc_;
result_type result_ { }; // K Means
cluster_type clusters_ { }; // K Clusters
Expand All @@ -147,10 +149,12 @@ struct KMeansVisitor {
inline void calc_k_means_(const H &column_begin, size_type col_s) {

std::random_device rd;
std::mt19937 gen(rd());
std::mt19937 gen(
(seed_ != seed_t(-1)) ? seed_ : rd());
std::uniform_int_distribution<size_type> rd_gen(0, col_s - 1);

// Pick centroids as random points from the col.
//
for (auto &k_mean : result_) [[likely]] {
const value_type &value = *(column_begin + rd_gen(gen));

Expand Down Expand Up @@ -280,8 +284,9 @@ struct KMeansVisitor {
distance_func f =
[](const value_type &x, const value_type &y) -> double {
return ((x - y) * (x - y));
})
: iter_num_(num_of_iter), cc_(calc_clusters), dfunc_(f) { }
},
seed_t seed = seed_t(-1))
: iter_num_(num_of_iter), cc_(calc_clusters), seed_(seed), dfunc_(f) { }
};

// ----------------------------------------------------------------------------
Expand Down Expand Up @@ -316,6 +321,7 @@ struct AffinityPropVisitor {
double min_dist = std::numeric_limits<double>::max();

// Compute similarity between distinct data points i and j
//
for (size_type i = 0; i < csize - 1; ++i) [[likely]] {
const value_type &i_val = *(column_begin + i);

Expand All @@ -328,6 +334,7 @@ struct AffinityPropVisitor {
}

// Assign min to diagonals
//
for (size_type i = 0; i < csize; ++i)
simil[(i * csize) + i - ((i * (i + 1)) >> 1)] = min_dist;

Expand All @@ -345,6 +352,7 @@ struct AffinityPropVisitor {

for (size_type m = 0; m < iter_num_; ++m) [[likely]] {
// Update responsibility
//
for (size_type i = 0; i < csize; ++i) [[likely]] {
for (size_type j = 0; j < csize; ++j) [[likely]] {
double max_diff = -std::numeric_limits<double>::max();
Expand All @@ -370,6 +378,7 @@ struct AffinityPropVisitor {

// Update availability
// Do diagonals first
//
for (size_type i = 0; i < csize; ++i) [[likely]] {
const size_type s1 = i * csize;
double sum = 0.0;
Expand Down
14 changes: 7 additions & 7 deletions include/DataFrame/Internals/DataFrame.tcc
Original file line number Diff line number Diff line change
Expand Up @@ -143,17 +143,17 @@ template<typename I, typename H>
template<typename ... Ts>
void
DataFrame<I, H>::shuffle(const StlVecType<const char *> &col_names,
bool also_shuffle_index) {
bool also_shuffle_index,
seed_t seed) {

if (also_shuffle_index) {
std::random_device rd;
std::mt19937 g(rd());
std::random_device rd;
std::mt19937 g ((seed != seed_t(-1)) ? seed : rd());

if (also_shuffle_index)
std::shuffle(indices_.begin(), indices_.end(), g);
}

shuffle_functor_<Ts ...> functor;
const SpinGuard guard(lock_);
shuffle_functor_<Ts ...> functor (g);
const SpinGuard guard (lock_);

for (const auto &name_citer : col_names) [[likely]] {
const auto citer = column_tb_.find (name_citer);
Expand Down
4 changes: 3 additions & 1 deletion include/DataFrame/Internals/DataFrame_functors.h
Original file line number Diff line number Diff line change
Expand Up @@ -526,7 +526,9 @@ struct sel_remove_functor_ : DataVec::template visitor_base<Ts ...> {
template<typename ... Ts>
struct shuffle_functor_ : DataVec::template visitor_base<Ts ...> {

inline shuffle_functor_ () { }
inline shuffle_functor_ (std::mt19937 &g) : g_(g) { }

std::mt19937 &g_;

template<typename T>
void operator() (T &vec) const;
Expand Down
39 changes: 15 additions & 24 deletions include/DataFrame/Internals/DataFrame_get.tcc
Original file line number Diff line number Diff line change
Expand Up @@ -2128,7 +2128,7 @@ get_view_by_sel(const char *name1,
template<typename I, typename H>
template<typename ... Ts>
DataFrame<I, H> DataFrame<I, H>::
get_data_by_rand(random_policy spec, double n, size_type seed) const {
get_data_by_rand(random_policy spec, double n, seed_t seed) const {

bool use_seed = false;
size_type n_rows = static_cast<size_type>(n);
Expand All @@ -2146,13 +2146,10 @@ get_data_by_rand(random_policy spec, double n, size_type seed) const {
}

if (index_s > 0 && n_rows <= index_s) [[likely]] {
std::random_device rd;
std::mt19937 gen(rd());

if (use_seed) gen.seed(static_cast<unsigned int>(seed));

std::uniform_int_distribution<size_type> dis(0, index_s - 1);
StlVecType<size_type> rand_indices(n_rows);
std::random_device rd;
std::mt19937 gen(use_seed ? seed : rd());
std::uniform_int_distribution<size_type> dis(0, index_s - 1);
StlVecType<size_type> rand_indices(n_rows);

for (size_type i = 0; i < n_rows; ++i)
rand_indices[i] = dis(gen);
Expand Down Expand Up @@ -2204,7 +2201,7 @@ get_data_by_rand(random_policy spec, double n, size_type seed) const {
template<typename I, typename H>
template<typename ... Ts>
typename DataFrame<I, H>::PtrView DataFrame<I, H>::
get_view_by_rand (random_policy spec, double n, size_type seed) {
get_view_by_rand (random_policy spec, double n, seed_t seed) {

bool use_seed = false;
size_type n_rows = static_cast<size_type>(n);
Expand All @@ -2222,13 +2219,10 @@ get_view_by_rand (random_policy spec, double n, size_type seed) {
}

if (index_s > 0 && n_rows <= index_s) [[likely]] {
std::random_device rd;
std::mt19937 gen(rd());

if (use_seed) gen.seed(static_cast<unsigned int>(seed));

std::uniform_int_distribution<size_type> dis(0, index_s - 1);
StlVecType<size_type> rand_indices(n_rows);
std::random_device rd;
std::mt19937 gen(use_seed ? seed : rd());
std::uniform_int_distribution<size_type> dis(0, index_s - 1);
StlVecType<size_type> rand_indices(n_rows);

for (size_type i = 0; i < n_rows; ++i) [[likely]]
rand_indices[i] = dis(gen);
Expand Down Expand Up @@ -2281,7 +2275,7 @@ template<typename I, typename H>
template<typename ... Ts>
typename DataFrame<I, H>::ConstPtrView
DataFrame<I, H>::
get_view_by_rand (random_policy spec, double n, size_type seed) const {
get_view_by_rand (random_policy spec, double n, seed_t seed) const {

bool use_seed = false;
size_type n_rows = static_cast<size_type>(n);
Expand All @@ -2299,13 +2293,10 @@ get_view_by_rand (random_policy spec, double n, size_type seed) const {
}

if (index_s > 0 && n_rows <= index_s) [[likely]] {
std::random_device rd;
std::mt19937 gen(rd());

if (use_seed) gen.seed(static_cast<unsigned int>(seed));

std::uniform_int_distribution<size_type> dis(0, index_s - 1);
StlVecType<size_type> rand_indices(n_rows);
std::random_device rd;
std::mt19937 gen(use_seed ? seed : rd());
std::uniform_int_distribution<size_type> dis(0, index_s - 1);
StlVecType<size_type> rand_indices(n_rows);

for (size_type i = 0; i < n_rows; ++i) [[likely]]
rand_indices[i] = dis(gen);
Expand Down
5 changes: 1 addition & 4 deletions include/DataFrame/Internals/DataFrame_misc.tcc
Original file line number Diff line number Diff line change
Expand Up @@ -684,10 +684,7 @@ DataFrame<I, H>::
shuffle_functor_<Ts ...>::
operator() (T &vec) const {

std::random_device rd;
std::mt19937 g(rd());

std::shuffle(vec.begin(), vec.end(), g);
std::shuffle(vec.begin(), vec.end(), g_);
return;
}

Expand Down
24 changes: 19 additions & 5 deletions test/dataframe_tester.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3719,7 +3719,14 @@ static void test_k_means() {
std::make_pair("col1",
gen_lognormal_dist<double, 128>(item_cnt, p)));

KMeansVisitor<5, double, unsigned long, 128> km_visitor(1000);
KMeansVisitor<5,
double,
unsigned long,
128> km_visitor(1000, true,
[](const double &x, const double &y) {
return ((x - y) * (x - y));
},
10);

df.single_act_visit<double>("col1", km_visitor);
std::cout << "Means of clusters are: ";
Expand Down Expand Up @@ -3797,7 +3804,7 @@ static void test_k_means() {
KMeansVisitor<5,
Point,
unsigned long,
128> km_visitor2(1000, true, point_distance);
128> km_visitor2(1000, true, point_distance, 10);

df.single_act_visit<Point>("point_col", km_visitor2);

Expand Down Expand Up @@ -3883,7 +3890,7 @@ static void test_affinity_propagation() {
StlVecType<double> final_col;
StlVecType<double> col_data;

p.seed = 3575984165U;
p.seed = 10U;

p.min_value = 0;
p.max_value = 10;
Expand Down Expand Up @@ -3912,9 +3919,16 @@ static void test_affinity_propagation() {

df.load_data(MyDataFrame::gen_sequence_index(0, item_cnt * 5, 1),
std::make_pair("col1", final_col));
df.shuffle<double>({"col1"}, false);
df.shuffle<double>({"col1"}, false, 10);

KMeansVisitor<5, double, unsigned long, 128> km_visitor(1000);
KMeansVisitor<5,
double,
unsigned long,
128> km_visitor(1000, true,
[](const double &x, const double &y) {
return ((x - y) * (x - y));
},
10);
AffinityPropVisitor<double, unsigned long, 128> ap_visitor(50);

df.single_act_visit<double>("col1", km_visitor);
Expand Down
32 changes: 16 additions & 16 deletions test/dataframe_tester_output.txt

Large diffs are not rendered by default.

0 comments on commit d9d82b3

Please sign in to comment.