Added seed option to all random algos

hosseinmoein · hosseinmoein · Oct 9, 2023 · Oct 8, 2023 · Oct 9, 2023 · Oct 9, 2023
commit d9d82b3a0aa1f50369c8203e766a97973c2727fc
diff --git a/docs/HTML/get_data_by_rand.html b/docs/HTML/get_data_by_rand.html
@@ -66,7 +66,7 @@
 DataFrame&lt;I, H&gt;
 get_data_by_rand(random_policy spec,
                  double n,
-                 std::size_t seed = 0) const;
+                 seed_t seed = 0) const;
         </B></PRE></font>
       </td>
       <td>
@@ -78,7 +78,7 @@
         <B>Ts</b>: The list of types for all columns. A type should be specified only once.<BR>
         <B>random_policy</b>: Please see random_policy in DataFrameTypes.h. It specifies how this function should proceed.<BR>
         <B>n</b>: Depending on the random policy, it is either the number of rows to sample or a fraction of rows to sample. In case of fraction, for example 0.4 means 40% of rows.<BR>
-        <B>seed</b>: Depending on the random policy, user could specify a seed. The same seed should always produce the same random selection.<BR>
+        <B>seed</b>: (unsigned int) Depending on the random policy, the user can specify a seed. The same seed should always produce the same random selection.<BR>
       </td>
     </tr>
 
@@ -89,7 +89,7 @@
 PtrView
 get_view_by_rand(random_policy spec,
                  double n,
-                 std::size_t seed = 0);
+                 seed_t seed = 0);
         </B></PRE></font>
       </td>
       <td>
@@ -101,7 +101,7 @@
         <B>Ts</b>: The list of types for all columns. A type should be specified only once.<BR>
         <B>random_policy</b>: Please see random_policy in DataFrameTypes.h. It specifies how this function should proceed.<BR>
         <B>n</b>: Depending on the random policy, it is either the number of rows to sample or a fraction of rows to sample. In case of fraction, for example 0.4 means 40% of rows.<BR>
-        <B>seed</b>: Depending on the random policy, user could specify a seed. The same seed should always produce the same random selection.<BR>
+        <B>seed</b>: (unsigned int) Depending on the random policy, the user can specify a seed. The same seed should always produce the same random selection.<BR>
       </td>
     </tr>
 
@@ -112,7 +112,7 @@
 ConstPtrView
 get_view_by_rand(random_policy spec,
                  double n,
-                 std::size_t seed = 0) const;
+                 seed_t seed = 0) const;
         </B></PRE></font>
       </td>
       <td>
@@ -122,7 +122,7 @@
         <B>Ts</b>: The list of types for all columns. A type should be specified only once.<BR>
         <B>random_policy</b>: Please see random_policy in DataFrameTypes.h. It specifies how this function should proceed.<BR>
         <B>n</b>: Depending on the random policy, it is either the number of rows to sample or a fraction of rows to sample. In case of fraction, for example 0.4 means 40% of rows.<BR>
-        <B>seed</b>: Depending on the random policy, user could specify a seed. The same seed should always produce the same random selection.<BR>
+        <B>seed</b>: (unsigned int) Depending on the random policy, the user can specify a seed. The same seed should always produce the same random selection.<BR>
       </td>
     </tr>
 

diff --git a/include/DataFrame/DataFrame.h b/include/DataFrame/DataFrame.h
@@ -115,6 +115,8 @@ class   DataFrame : public ThreadGranularity {
     template<typename T>
     using StlVecType = std::vector<T, AllocatorType<T>>;
 
+    using seed_t = std::random_device::result_type;
+
     DataFrame() = default;
 
     // Because of thread safety, these need tender loving care
@@ -937,7 +939,8 @@ class   DataFrame : public ThreadGranularity {
     template<typename ... Ts>
     void
     shuffle(const StlVecType<const char *> &col_names,
-            bool also_shuffle_index);
+            bool also_shuffle_index,
+            seed_t seed = seed_t(-1));
 
     // It fills all the "missing values" with the given values, and/or using
     // the given method.
@@ -2492,7 +2495,7 @@ class   DataFrame : public ThreadGranularity {
     //
     template<typename ... Ts>
     [[nodiscard]] DataFrame
-    get_data_by_rand(random_policy spec, double n, size_type seed = 0) const;
+    get_data_by_rand(random_policy spec, double n, seed_t seed = 0) const;
 
     // It behaves like get_data_by_rand(), but it returns a PtrView.
     // A view is a DataFrame that is a reference to the original DataFrame.
@@ -2521,11 +2524,11 @@ class   DataFrame : public ThreadGranularity {
     //
     template<typename ... Ts>
     [[nodiscard]] PtrView
-    get_view_by_rand(random_policy spec, double n, size_type seed = 0);
+    get_view_by_rand(random_policy spec, double n, seed_t seed = 0);
 
     template<typename ... Ts>
     [[nodiscard]] ConstPtrView
-    get_view_by_rand(random_policy spec, double n, size_type seed = 0) const;
+    get_view_by_rand(random_policy spec, double n, seed_t seed = 0) const;
 
     // This returns a DataFrame with index and col_names copied from the
     // original DataFrame

diff --git a/include/DataFrame/DataFrameMLVisitors.h b/include/DataFrame/DataFrameMLVisitors.h
@@ -134,11 +134,13 @@ struct  KMeansVisitor  {
     using cluster_type = std::array<VectorConstPtrView<value_type, A>, K>;
     using distance_func =
         std::function<double(const value_type &x, const value_type &y)>;
+    using seed_t = std::random_device::result_type;
 
 private:
 
     const size_type iter_num_;
     const bool      cc_;
+    const seed_t    seed_;
     distance_func   dfunc_;
     result_type     result_ { };    // K Means
     cluster_type    clusters_ { };  // K Clusters
@@ -147,10 +149,12 @@ struct  KMeansVisitor  {
     inline void calc_k_means_(const H &column_begin, size_type col_s)  {
 
         std::random_device                          rd;
-        std::mt19937                                gen(rd());
+        std::mt19937                                gen(
+            (seed_ != seed_t(-1)) ? seed_ : rd());
         std::uniform_int_distribution<size_type>    rd_gen(0, col_s - 1);
 
         // Pick centroids as random points from the col.
+        //
         for (auto &k_mean : result_) [[likely]]  {
             const value_type    &value = *(column_begin + rd_gen(gen));
 
@@ -280,8 +284,9 @@ struct  KMeansVisitor  {
         distance_func f =
             [](const value_type &x, const value_type &y) -> double  {
                 return ((x - y) * (x - y));
-            })
-        : iter_num_(num_of_iter), cc_(calc_clusters), dfunc_(f)  {   }
+            },
+        seed_t seed = seed_t(-1))
+        : iter_num_(num_of_iter), cc_(calc_clusters), seed_(seed), dfunc_(f) {  }
 };
 
 // ----------------------------------------------------------------------------
@@ -316,6 +321,7 @@ struct  AffinityPropVisitor  {
         double          min_dist = std::numeric_limits<double>::max();
 
         // Compute similarity between distinct data points i and j
+        //
         for (size_type i = 0; i < csize - 1; ++i) [[likely]]  {
             const value_type    &i_val = *(column_begin + i);
 
@@ -328,6 +334,7 @@ struct  AffinityPropVisitor  {
         }
 
         // Assign min to diagonals
+        //
         for (size_type i = 0; i < csize; ++i)
             simil[(i * csize) + i - ((i * (i + 1)) >> 1)] = min_dist;
 
@@ -345,6 +352,7 @@ struct  AffinityPropVisitor  {
 
         for (size_type m = 0; m < iter_num_; ++m) [[likely]]  {
             // Update responsibility
+            //
             for (size_type i = 0; i < csize; ++i) [[likely]]  {
                 for (size_type j = 0; j < csize; ++j) [[likely]]  {
                     double  max_diff = -std::numeric_limits<double>::max();
@@ -370,6 +378,7 @@ struct  AffinityPropVisitor  {
 
             // Update availability
             // Do diagonals first
+            //
             for (size_type i = 0; i < csize; ++i) [[likely]]  {
                 const size_type s1 = i * csize;
                 double          sum = 0.0;

diff --git a/include/DataFrame/Internals/DataFrame.tcc b/include/DataFrame/Internals/DataFrame.tcc
@@ -143,17 +143,17 @@ template<typename I, typename H>
 template<typename ... Ts>
 void
 DataFrame<I, H>::shuffle(const StlVecType<const char *> &col_names,
-                         bool also_shuffle_index)  {
+                         bool also_shuffle_index,
+                         seed_t seed)  {
 
-    if (also_shuffle_index)  {
-        std::random_device  rd;
-        std::mt19937        g(rd());
+    std::random_device  rd;
+    std::mt19937        g ((seed != seed_t(-1)) ? seed : rd());
 
+    if (also_shuffle_index)
         std::shuffle(indices_.begin(), indices_.end(), g);
-    }
 
-    shuffle_functor_<Ts ...>    functor;
-    const SpinGuard             guard(lock_);
+    shuffle_functor_<Ts ...>    functor (g);
+    const SpinGuard             guard (lock_);
 
     for (const auto &name_citer : col_names) [[likely]]  {
         const auto  citer = column_tb_.find (name_citer);

diff --git a/include/DataFrame/Internals/DataFrame_functors.h b/include/DataFrame/Internals/DataFrame_functors.h
@@ -526,7 +526,9 @@ struct sel_remove_functor_ : DataVec::template visitor_base<Ts ...>  {
 template<typename ... Ts>
 struct shuffle_functor_ : DataVec::template visitor_base<Ts ...>  {
 
-    inline shuffle_functor_ ()  {  }
+    inline shuffle_functor_ (std::mt19937 &g) : g_(g)  {  }
+
+    std::mt19937    &g_;
 
     template<typename T>
     void operator() (T &vec) const;

diff --git a/include/DataFrame/Internals/DataFrame_get.tcc b/include/DataFrame/Internals/DataFrame_get.tcc
@@ -2128,7 +2128,7 @@ get_view_by_sel(const char *name1,
 template<typename I, typename H>
 template<typename ... Ts>
 DataFrame<I, H> DataFrame<I, H>::
-get_data_by_rand(random_policy spec, double n, size_type seed) const  {
+get_data_by_rand(random_policy spec, double n, seed_t seed) const  {
 
     bool            use_seed = false;
     size_type       n_rows = static_cast<size_type>(n);
@@ -2146,13 +2146,10 @@ get_data_by_rand(random_policy spec, double n, size_type seed) const  {
     }
 
     if (index_s > 0 && n_rows <= index_s) [[likely]]  {
-        std::random_device  rd;
-        std::mt19937        gen(rd());
-
-        if (use_seed)  gen.seed(static_cast<unsigned int>(seed));
-
-        std::uniform_int_distribution<size_type>    dis(0, index_s - 1);
-        StlVecType<size_type>                       rand_indices(n_rows);
+        std::random_device                        rd;
+        std::mt19937                              gen(use_seed ? seed : rd());
+        std::uniform_int_distribution<size_type>  dis(0, index_s - 1);
+        StlVecType<size_type>                     rand_indices(n_rows);
 
         for (size_type i = 0; i < n_rows; ++i)
             rand_indices[i] = dis(gen);
@@ -2204,7 +2201,7 @@ get_data_by_rand(random_policy spec, double n, size_type seed) const  {
 template<typename I, typename H>
 template<typename ... Ts>
 typename DataFrame<I, H>::PtrView DataFrame<I, H>::
-get_view_by_rand (random_policy spec, double n, size_type seed)  {
+get_view_by_rand (random_policy spec, double n, seed_t seed)  {
 
     bool            use_seed = false;
     size_type       n_rows = static_cast<size_type>(n);
@@ -2222,13 +2219,10 @@ get_view_by_rand (random_policy spec, double n, size_type seed)  {
     }
 
     if (index_s > 0 && n_rows <= index_s) [[likely]]  {
-        std::random_device  rd;
-        std::mt19937        gen(rd());
-
-        if (use_seed)  gen.seed(static_cast<unsigned int>(seed));
-
-        std::uniform_int_distribution<size_type>    dis(0, index_s - 1);
-        StlVecType<size_type>                       rand_indices(n_rows);
+        std::random_device                        rd;
+        std::mt19937                              gen(use_seed ? seed : rd());
+        std::uniform_int_distribution<size_type>  dis(0, index_s - 1);
+        StlVecType<size_type>                     rand_indices(n_rows);
 
         for (size_type i = 0; i < n_rows; ++i) [[likely]]
             rand_indices[i] = dis(gen);
@@ -2281,7 +2275,7 @@ template<typename I, typename H>
 template<typename ... Ts>
 typename DataFrame<I, H>::ConstPtrView
 DataFrame<I, H>::
-get_view_by_rand (random_policy spec, double n, size_type seed) const  {
+get_view_by_rand (random_policy spec, double n, seed_t seed) const  {
 
     bool            use_seed = false;
     size_type       n_rows = static_cast<size_type>(n);
@@ -2299,13 +2293,10 @@ get_view_by_rand (random_policy spec, double n, size_type seed) const  {
     }
 
     if (index_s > 0 && n_rows <= index_s) [[likely]]  {
-        std::random_device  rd;
-        std::mt19937        gen(rd());
-
-        if (use_seed)  gen.seed(static_cast<unsigned int>(seed));
-
-        std::uniform_int_distribution<size_type>    dis(0, index_s - 1);
-        StlVecType<size_type>                       rand_indices(n_rows);
+        std::random_device                        rd;
+        std::mt19937                              gen(use_seed ? seed : rd());
+        std::uniform_int_distribution<size_type>  dis(0, index_s - 1);
+        StlVecType<size_type>                     rand_indices(n_rows);
 
         for (size_type i = 0; i < n_rows; ++i) [[likely]]
             rand_indices[i] = dis(gen);

diff --git a/include/DataFrame/Internals/DataFrame_misc.tcc b/include/DataFrame/Internals/DataFrame_misc.tcc
@@ -684,10 +684,7 @@ DataFrame<I, H>::
 shuffle_functor_<Ts ...>::
 operator() (T &vec) const  {
 
-    std::random_device  rd;
-    std::mt19937        g(rd());
-
-    std::shuffle(vec.begin(), vec.end(), g);
+    std::shuffle(vec.begin(), vec.end(), g_);
     return;
 }
 

diff --git a/test/dataframe_tester.cc b/test/dataframe_tester.cc
@@ -3719,7 +3719,14 @@ static void test_k_means()  {
                  std::make_pair("col1",
                                 gen_lognormal_dist<double, 128>(item_cnt, p)));
 
-    KMeansVisitor<5, double, unsigned long, 128>    km_visitor(1000);
+    KMeansVisitor<5,
+                  double,
+                  unsigned long,
+                  128>  km_visitor(1000, true,
+                                   [](const double &x, const double &y)  {
+                                       return ((x - y) * (x - y));
+                                   },
+                                   10);
 
     df.single_act_visit<double>("col1", km_visitor);
     std::cout << "Means of clusters are: ";
@@ -3797,7 +3804,7 @@ static void test_k_means()  {
     KMeansVisitor<5,
                   Point,
                   unsigned long,
-                  128> km_visitor2(1000, true, point_distance);
+                  128> km_visitor2(1000, true, point_distance, 10);
 
     df.single_act_visit<Point>("point_col", km_visitor2);
 
@@ -3883,7 +3890,7 @@ static void test_affinity_propagation()  {
     StlVecType<double>     final_col;
     StlVecType<double>     col_data;
 
-    p.seed = 3575984165U;
+    p.seed = 10U;
 
     p.min_value = 0;
     p.max_value = 10;
@@ -3912,9 +3919,16 @@ static void test_affinity_propagation()  {
 
     df.load_data(MyDataFrame::gen_sequence_index(0, item_cnt * 5, 1),
                  std::make_pair("col1", final_col));
-    df.shuffle<double>({"col1"}, false);
+    df.shuffle<double>({"col1"}, false, 10);
 
-    KMeansVisitor<5, double, unsigned long, 128>    km_visitor(1000);
+    KMeansVisitor<5,
+                  double,
+                  unsigned long,
+                  128>  km_visitor(1000, true,
+                                   [](const double &x, const double &y)  {
+                                       return ((x - y) * (x - y));
+                                   },
+                                   10);
     AffinityPropVisitor<double, unsigned long, 128> ap_visitor(50);
 
     df.single_act_visit<double>("col1", km_visitor);

diff --git a/test/dataframe_tester_output.txt b/test/dataframe_tester_output.txt