From 4b63bbd5019cf7cde63dad233f8c37faa16cf086 Mon Sep 17 00:00:00 2001 From: Hossein Moein Date: Wed, 13 Dec 2023 13:23:37 -0500 Subject: [PATCH 1/7] Added parallel computing logic to the following visitors: Sum, Mean, Prod, Extremum --- docs/HTML/DataFrame.html | 6 +- include/DataFrame/DataFrameStatsVisitors.h | 168 ++++++++++++++++++- include/DataFrame/Utils/Threads/ThreadPool.h | 2 +- test/dataframe_tester.cc | 4 +- 4 files changed, 166 insertions(+), 14 deletions(-) diff --git a/docs/HTML/DataFrame.html b/docs/HTML/DataFrame.html index 14a1670f..9019310e 100644 --- a/docs/HTML/DataFrame.html +++ b/docs/HTML/DataFrame.html @@ -1441,16 +1441,16 @@

API Reference with code samples

Multithreading

In general, multithreading can be very tricky. Often you think that by using multithreading you are enhancing the performance of your program when, in fact, you are hindering it. It requires measurement and careful adjustment. It is recommended to start with a single-threaded version; once that works correctly, take measurements and adjust as you move to a multithreaded version.
-DataFrame uses multithreading extensively and provides granular tools to adjust your program. Let’s divide the multithreading subject in DataFrame into two categories:
+DataFrame uses multithreading extensively and provides granular tools to adjust your environment. Let’s divide the multithreading subject in DataFrame into two categories:
-

1. User Utilizing Multithreading

+

1. User Multithreading

So, in general, if you as the user of DataFrame utilize multithreading, you must protect the DataFrame with a synchronization tool (e.g., SpinLock)

2. DataFrame Internal Multithreading

- Whether or not you, as the user, use multithreading, DataFrame utilizes a versatile thread-pool to employ parallel computing extensively in almost all its functionalities, when appropriate. DataFrame also gives you the interface to control and tweak that. You do not need to worry about synchronization for DataFrame internal multithreading.
+ Whether or not you, as the user, use multithreading, DataFrame utilizes a versatile thread-pool to employ parallel computing extensively in almost all its functionalities, when appropriate -- currently, most parallel algorithms trigger when the number of items exceeds 150k and the number of threads exceeds 2. DataFrame also gives you the interface to control and tweak that. You do not need to worry about synchronization for DataFrame internal multithreading.
So, in general, if you as the user of DataFrame utilize multithreading, you must protect the DataFrame with a synchronization tool (e.g., SpinLock)

2. DataFrame Internal Multithreading

- Whether or not you, as the user, use multithreading, DataFrame utilizes a versatile thread-pool to employ parallel computing extensively in almost all its functionalities, when appropriate -- currently, most parallel algorithms trigger when the number of items exceeds 150k and the number of threads exceeds 2. DataFrame also gives you the interface to control and tweak that. You do not need to worry about synchronization for DataFrame internal multithreading.
+ Whether or not you, as the user, use multithreading, DataFrame utilizes a versatile thread-pool to employ parallel computing extensively in almost all its functionalities, when appropriate -- currently, most parallel algorithms trigger when the number of items exceeds 250k and the number of threads exceeds 2. DataFrame also gives you the interface to control and tweak that. You do not need to worry about synchronization for DataFrame internal multithreading.
  • There are asynchronous versions of some methods. For example, you have sort()/sort_async(), visit()/visit_async(), ... The latter versions return a std::future and execute in parallel.
    If you choose to use DataFrame async interfaces, it is highly recommended to call ThreadGranularity::set_optimum_thread_level() first, so your thread-pool is populated with the optimal number of threads. Otherwise, if the thread-pool is empty, async interfaces will add one thread to it. Having only one thread in the thread-pool could be suboptimal and hinder performance.
  • As mentioned above, DataFrame uses parallel computing extensively. But by default, DataFrame is single-threaded, because by default its thread-pool is empty. If you want to take full advantage of DataFrame parallel computing, it is recommended to call ThreadGranularity::set_optimum_thread_level() at the beginning of your program (see the sketch after this list). Alternatively, you could call ThreadGranularity::set_thread_level(n) to add a custom number of threads to the thread-pool, but you should have a good reason for doing so.
    Thread-pool and thread level are static properties of DataFrame. Once the thread level is set, it applies to all DataFrame instances.
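    For instance, here is a minimal setup sketch -- the index range, column name, and data below are made up for illustration, so check the exact method signatures against your version of the library:

#include <DataFrame/DataFrame.h>

#include <iostream>
#include <vector>

using namespace hmdf;
using MyDataFrame = StdDataFrame<unsigned long>;

int main(int, char *[]) {

    // Populate the thread-pool once, at program start, so the parallel
    // code paths described above can actually run in parallel
    //
    ThreadGranularity::set_optimum_thread_level();

    MyDataFrame df;

    df.load_data(MyDataFrame::gen_sequence_index(0, 1000000, 1),
                 std::make_pair("col_1", std::vector<double>(1000000, 1.0)));

    // The async variant returns a std::future and runs on the thread-pool
    //
    MeanVisitor<double>  mean_v;
    auto                 fut = df.visit_async<double>("col_1", mean_v);

    fut.get();
    std::cout << mean_v.get_result() << std::endl;
    return (0);
}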
  • diff --git a/include/DataFrame/DataFrameStatsVisitors.h b/include/DataFrame/DataFrameStatsVisitors.h index de89a776..fe28a4c9 100644 --- a/include/DataFrame/DataFrameStatsVisitors.h +++ b/include/DataFrame/DataFrameStatsVisitors.h @@ -3123,8 +3123,8 @@ struct KthValueVisitor { const bool skip_nan_; template - inline size_type - parttition_ (V &vec, size_type begin, size_type end) const { + inline static size_type + parttition_(V &vec, size_type begin, size_type end) { const value_type x = vec[end]; size_type i = begin; @@ -3141,11 +3141,8 @@ struct KthValueVisitor { } template - inline value_type - find_kth_element_ (V &vec, - size_type begin, - size_type end, - size_type k) const { + inline static value_type + find_kth_element_(V &vec, size_type begin, size_type end, size_type k) { // If k is smaller than number of elements in array // @@ -3431,10 +3428,20 @@ struct ModeVisitor { val_vec.push_back(map_pair.second); }); - std::sort(val_vec.begin(), val_vec.end(), - [](const DataItem &lhs, const DataItem &rhs) -> bool { - return (lhs.repeat_count() > rhs.repeat_count()); // dec - }); + if (ThreadGranularity::get_thread_level() > 2 && + val_vec.size() >= ThreadPool::MUL_THR_THHOLD) { + ThreadGranularity::thr_pool_.parallel_sort( + val_vec.begin(), val_vec.end(), + [](const DataItem &lhs, const DataItem &rhs) -> bool { + return (lhs.repeat_count() > rhs.repeat_count()); + }); // Descending + } + else { + std::sort(val_vec.begin(), val_vec.end(), + [](const DataItem &lhs, const DataItem &rhs) -> bool { + return (lhs.repeat_count() > rhs.repeat_count()); + }); // Descending + } for (size_type i = 0; i < N && i < val_vec.size(); ++i) result_[i] = val_vec[i]; } @@ -3454,17 +3461,33 @@ struct ModeVisitor { inline void sort_by_repeat_count() { - std::sort(result_.begin(), result_.end(), - [](const DataItem &lhs, const DataItem &rhs) -> bool { - return (lhs.repeat_count() < rhs.repeat_count()); - }); + if (ThreadGranularity::get_thread_level() > 2 && + result_.size() >= ThreadPool::MUL_THR_THHOLD) + ThreadGranularity::thr_pool_.parallel_sort( + result_.begin(), result_.end(), + [](const DataItem &lhs, const DataItem &rhs) -> bool { + return (lhs.repeat_count() < rhs.repeat_count()); + }); + else + std::sort(result_.begin(), result_.end(), + [](const DataItem &lhs, const DataItem &rhs) -> bool { + return (lhs.repeat_count() < rhs.repeat_count()); + }); } inline void sort_by_value() { - std::sort(result_.begin(), result_.end(), - [](const DataItem &lhs, const DataItem &rhs) -> bool { - return (*(lhs.value) < *(rhs.value)); - }); + if (ThreadGranularity::get_thread_level() > 2 && + result_.size() >= ThreadPool::MUL_THR_THHOLD) + ThreadGranularity::thr_pool_.parallel_sort( + result_.begin(), result_.end(), + [](const DataItem &lhs, const DataItem &rhs) -> bool { + return (*(lhs.value) < *(rhs.value)); + }); + else + std::sort(result_.begin(), result_.end(), + [](const DataItem &lhs, const DataItem &rhs) -> bool { + return (*(lhs.value) < *(rhs.value)); + }); } private: @@ -3493,19 +3516,17 @@ struct MADVisitor { template inline void - calc_mean_abs_dev_around_mean_(const K &, - const K &, + calc_mean_abs_dev_around_mean_(const K &idx_begin, + const K &idx_end, const H &column_begin, const H &column_end) { GET_COL_SIZE2 MeanVisitor mean_visitor(skip_nan_); - const index_type idx_value { }; mean_visitor.pre(); - for (std::size_t i = 0; i < col_s; ++i) [[likely]] - mean_visitor(idx_value, *(column_begin + i)); + mean_visitor(idx_begin, idx_end, column_begin, column_end); mean_visitor.post(); 
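// The parallel/serial forks above all follow one gating rule: take the
// parallel path only when the pool has more than two threads and the data
// holds at least MUL_THR_THHOLD elements; otherwise the thread-management
// overhead outweighs any gain. A standalone sketch of that rule, with
// std::execution::par standing in for the library's parallel_sort() and an
// illustrative threshold value:

#include <algorithm>
#include <execution>
#include <thread>
#include <vector>

template<typename T, typename C>
void gated_sort(std::vector<T> &vec, C compare,
                std::size_t threshold = 250'000)  {

    if (std::thread::hardware_concurrency() > 2 && vec.size() >= threshold)
        std::sort(std::execution::par, vec.begin(), vec.end(), compare);
    else  // Not worth the thread-management overhead
        std::sort(vec.begin(), vec.end(), compare);
}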
MeanVisitor mean_mean_visitor(skip_nan_); @@ -3515,8 +3536,9 @@ struct MADVisitor { const value_type value = *(column_begin + i); if (! is_nan__(value) || ! skip_nan_) [[likely]] - mean_mean_visitor(idx_value, - std::fabs(value - mean_visitor.get_result())); + mean_mean_visitor( + *idx_begin, + std::fabs(value - mean_visitor.get_result())); } mean_mean_visitor.post(); @@ -3539,15 +3561,15 @@ struct MADVisitor { GET_COL_SIZE2 MeanVisitor mean_median_visitor(skip_nan_); - const index_type idx_value { }; mean_median_visitor.pre(); for (std::size_t i = 0; i < col_s; ++i) [[likely]] { const value_type value = *(column_begin + i); if (skip_nan_ && is_nan__(value)) [[unlikely]] continue; - mean_median_visitor(idx_value, - std::fabs(value - median_visitor.get_result())); + mean_median_visitor( + *idx_begin, + std::fabs(value - median_visitor.get_result())); } mean_median_visitor.post(); @@ -3561,18 +3583,18 @@ struct MADVisitor { const H &column_begin, const H &column_end) { + using vec_t = std::vector::type>; + GET_COL_SIZE2 MeanVisitor mean_visitor(skip_nan_); - const index_type idx_value { }; mean_visitor.pre(); - for (std::size_t i = 0; i < col_s; ++i) [[likely]] - mean_visitor(idx_value, *(column_begin + i)); + mean_visitor(idx_begin, idx_end, column_begin, column_end); mean_visitor.post(); - MedianVisitor median_mean_visitor; - std::vector::type> mean_dists; + MedianVisitor median_mean_visitor; + vec_t mean_dists; mean_dists.reserve(col_s); for (std::size_t i = 0; i < col_s; ++i) [[likely]] @@ -3593,6 +3615,8 @@ struct MADVisitor { const H &column_begin, const H &column_end) { + using vec_t = std::vector::type>; + MedianVisitor median_visitor; median_visitor.pre(); @@ -3601,8 +3625,8 @@ struct MADVisitor { GET_COL_SIZE2 - MedianVisitor median_median_visitor; - std::vector::type> median_dists; + MedianVisitor median_median_visitor; + vec_t median_dists; median_dists.reserve(col_s); for (std::size_t i = 0; i < col_s; ++i) [[likely]] @@ -3789,27 +3813,42 @@ struct ZScoreVisitor { template inline void - operator() (const K &, const K &, + operator() (const K &idx_begin, const K &idx_end, const H &column_begin, const H &column_end) { GET_COL_SIZE2 - MeanVisitor mvisit; + MeanVisitor mvisit { skip_nan_ }; StdVisitor svisit; - - // None of these visitors look at the index value - // - const index_type idx_value { }; + const auto thread_level { + ThreadGranularity::get_thread_level() }; mvisit.pre(); svisit.pre(); - for (size_type i = 0; i < col_s; ++i) [[likely]] { - const value_type value = *(column_begin + i); + if (thread_level > 2 && col_s >= ThreadPool::MUL_THR_THHOLD) { + auto fut1 = + ThreadGranularity::thr_pool_.dispatch( + false, + [&svisit, + &idx_begin, &idx_end, + &column_begin, &column_end]() -> void { + svisit(idx_begin, idx_end, column_begin, column_end); + }); + auto fut2 = + ThreadGranularity::thr_pool_.dispatch( + false, + [&mvisit, + &idx_begin, &idx_end, + &column_begin, &column_end]() -> void { + mvisit(idx_begin, idx_end, column_begin, column_end); + }); - if (! skip_nan_ || ! 
is_nan__(value)) [[likely]] { - mvisit(idx_value, value); - svisit(idx_value, value); - } + fut1.get(); + fut2.get(); + } + else { + mvisit(idx_begin, idx_end, column_begin, column_end); + svisit(idx_begin, idx_end, column_begin, column_end); } mvisit.post(); svisit.post(); @@ -3818,12 +3857,29 @@ struct ZScoreVisitor { const value_type s = svisit.get_result(); result_type result; - result.reserve(col_s); - std::transform(column_begin, column_end, - std::back_inserter(result), - [m, s](const auto &val) -> value_type { - return ((val - m) / s); - }); + if (thread_level > 2 && col_s >= ThreadPool::MUL_THR_THHOLD) { + result.resize(col_s); + + auto futures = + ThreadGranularity::thr_pool_.parallel_loop( + size_type(0), + col_s, + [m, s, &result, &column_begin] + (auto begin, auto end) -> void { + for (size_type i = begin; i < end; ++i) + result[i] = (*(column_begin + i) - m) / s; + }); + + for (auto &fut : futures) fut.get(); + } + else { + result.reserve(col_s); + std::transform(column_begin, column_end, + std::back_inserter(result), + [m, s](const auto &val) -> value_type { + return ((val - m) / s); + }); + } result_.swap(result); } @@ -3860,46 +3916,61 @@ struct SampleZScoreVisitor { template inline void - operator() (const K &, const K &, + operator() (const K &idx_begin, const K &idx_end, const H &population_begin, const H &population_end, const H &sample_begin, const H &sample_end) { - MeanVisitor p_mvisit; + MeanVisitor p_mvisit { skip_nan_ }; StdVisitor p_svisit; - MeanVisitor s_mvisit; - const size_type p_col_s = - std::distance(population_begin, population_end); - const size_type s_col_s = std::distance(sample_begin, sample_end); - const size_type max_s = std::max(p_col_s, s_col_s); - - // None of these visitors look at the index value - // - const index_type idx_value { }; + MeanVisitor s_mvisit { skip_nan_ }; p_mvisit.pre(); p_svisit.pre(); s_mvisit.pre(); - for (size_type i = 0; i < max_s; ++i) [[likely]] { - if (i < p_col_s) { - const value_type value = *(population_begin + i); - - if (! skip_nan_ || ! is_nan__(value)) [[likely]] { - p_mvisit(idx_value, value); - p_svisit(idx_value, value); - } - } - if (i < s_col_s) { - const value_type value = *(sample_begin + i); + if (ThreadGranularity::get_thread_level() > 3) { + auto fut1 = + ThreadGranularity::thr_pool_.dispatch( + false, + [&p_svisit, + &idx_begin, &idx_end, + &population_begin, &population_end]() -> void { + p_svisit(idx_begin, idx_end, + population_begin, population_end); + }); + auto fut2 = + ThreadGranularity::thr_pool_.dispatch( + false, + [&p_mvisit, + &idx_begin, &idx_end, + &population_begin, &population_end]() -> void { + p_mvisit(idx_begin, idx_end, + population_begin, population_end); + }); + auto fut3 = + ThreadGranularity::thr_pool_.dispatch( + false, + [&s_mvisit, + &idx_begin, &idx_end, + &sample_begin, &sample_end]() -> void { + s_mvisit(idx_begin, idx_end, + sample_begin, sample_end); + }); - if (! skip_nan_ || ! 
is_nan__(value)) { - s_mvisit(idx_value, value); - } - } + fut1.get(); + fut2.get(); + fut3.get(); + } + else { + p_mvisit(idx_begin, idx_end, population_begin, population_end); + p_svisit(idx_begin, idx_end, population_begin, population_end); + s_mvisit(idx_begin, idx_end, sample_begin, sample_end); } p_mvisit.post(); p_svisit.post(); s_mvisit.post(); + const size_type s_col_s = std::distance(sample_begin, sample_end); + result_ = (s_mvisit.get_result() - p_mvisit.get_result()) / (p_svisit.get_result() / ::sqrt(s_col_s)); } @@ -3929,74 +4000,209 @@ struct BoxCoxVisitor { private: template - inline void modulus_(const H &column_begin, const H &column_end) { + inline void modulus_(const H &column_begin, const H &column_end, + size_type col_s, size_type thread_level) { if (lambda_ != 0) { - std::transform( - column_begin, column_end, - std::back_inserter(result_), - [this](const auto &val) -> value_type { - const value_type sign = std::signbit(val) ? -1 : 1; - const value_type v = - (std::pow(std::fabs(val) + (1), this->lambda_) - - T(1)) / lambda_; - - return (sign * v); - }); + if (thread_level > 2 && col_s >= ThreadPool::MUL_THR_THHOLD) { + result_.resize(col_s); + auto futures = + ThreadGranularity::thr_pool_.parallel_loop( + size_type(0), + col_s, + [this, &column_begin] + (auto begin, auto end) -> void { + for (auto i = begin; i < end; ++i) { + const auto val = *(column_begin + i); + const value_type sign = + std::signbit(val) ? -1 : 1; + const value_type v = + (std::pow(std::fabs(val) + (1), + this->lambda_) - + T(1)) / this->lambda_; + + this->result_[i] = sign * v; + } + }); + + for (auto &fut : futures) fut.get(); + } + else { + result_.reserve(col_s); + std::transform( + column_begin, column_end, + std::back_inserter(result_), + [this](const auto &val) -> value_type { + const value_type sign = std::signbit(val) ? -1 : 1; + const value_type v = + (std::pow(std::fabs(val) + (1), this->lambda_) - + T(1)) / this->lambda_; + + return (sign * v); + }); + } } else { - std::transform( - column_begin, column_end, - std::back_inserter(result_), - [](const auto &val) -> value_type { - const value_type sign = std::signbit(val) ? -1 : 1; + if (thread_level > 2 && col_s >= ThreadPool::MUL_THR_THHOLD) { + result_.resize(col_s); + auto futures = + ThreadGranularity::thr_pool_.parallel_loop( + size_type(0), + col_s, + [this, &column_begin] + (auto begin, auto end) -> void { + for (auto i = begin; i < end; ++i) { + const auto val = *(column_begin + i); + const value_type sign = + std::signbit(val) ? -1 : 1; + + result_[i] = + sign * std::log(std::fabs(val) + T(1)); + } + }); - return (sign * std::log(std::fabs(val) + T(1))); - }); + for (auto &fut : futures) fut.get(); + } + else { + result_.reserve(col_s); + std::transform( + column_begin, column_end, + std::back_inserter(result_), + [](const auto &val) -> value_type { + const value_type sign = std::signbit(val) ? 
-1 : 1; + + return (sign * std::log(std::fabs(val) + T(1))); + }); + } } } template - inline void exponential_(const H &column_begin, const H &column_end) { + inline void exponential_(const H &column_begin, const H &column_end, + size_type col_s, size_type thread_level) { if (lambda_ != 0) { - std::transform( - column_begin, column_end, - std::back_inserter(result_), - [this](const auto &val) -> value_type { - return ((std::exp(this->lambda_ * val) - T(1)) / - this->lambda_); - }); + if (thread_level > 2 && col_s >= ThreadPool::MUL_THR_THHOLD) { + result_.resize(col_s); + auto futures = + ThreadGranularity::thr_pool_.parallel_loop( + size_type(0), + col_s, + [this, &column_begin] + (auto begin, auto end) -> void { + for (auto i = begin; i < end; ++i) { + const auto val = *(column_begin + i); + + this->result_[i] = + (std::exp(this->lambda_ * val) - T(1)) / + this->lambda_; + } + }); + + for (auto &fut : futures) fut.get(); + } + else { + result_.reserve(col_s); + std::transform( + column_begin, column_end, + std::back_inserter(result_), + [this](const auto &val) -> value_type { + return ((std::exp(this->lambda_ * val) - T(1)) / + this->lambda_); + }); + } } else { - std::transform(column_begin, column_end, - std::back_inserter(result_), - [](const auto &val) -> value_type { - return (val); - }); + if (thread_level > 2 && col_s >= ThreadPool::MUL_THR_THHOLD) { + result_.resize(col_s); + auto futures = + ThreadGranularity::thr_pool_.parallel_loop( + size_type(0), + col_s, + [this, &column_begin] + (auto begin, auto end) -> void { + for (auto i = begin; i < end; ++i) { + this->result_[i] = *(column_begin + i); + } + }); + + for (auto &fut : futures) fut.get(); + } + else { + result_.reserve(col_s); + std::transform(column_begin, column_end, + std::back_inserter(result_), + [](const auto &val) -> value_type { + return (val); + }); + } } } template inline void original_(const H &column_begin, const H &column_end, - value_type shift) { + value_type shift, + size_type col_s, + size_type thread_level) { if (lambda_ != 0) { - std::transform( - column_begin, column_end, - std::back_inserter(result_), - [this, shift](const auto &val) -> value_type { - return ((std::pow(val + shift, this->lambda_) - - T(1)) / this->lambda_); - }); + if (thread_level > 2 && col_s >= ThreadPool::MUL_THR_THHOLD) { + result_.resize(col_s); + auto futures = + ThreadGranularity::thr_pool_.parallel_loop( + size_type(0), + col_s, + [this, shift, &column_begin] + (auto begin, auto end) -> void { + for (auto i = begin; i < end; ++i) { + const auto val = *(column_begin + i); + + this->result_[i] = + (std::pow(val + shift, this->lambda_) - + T(1)) / this->lambda_; + } + }); + + for (auto &fut : futures) fut.get(); + } + else { + result_.reserve(col_s); + std::transform( + column_begin, column_end, + std::back_inserter(result_), + [this, shift](const auto &val) -> value_type { + return ((std::pow(val + shift, this->lambda_) - + T(1)) / this->lambda_); + }); + } } else { - std::transform(column_begin, column_end, - std::back_inserter(result_), - [shift](const auto &val) -> value_type { - return (std::log(val + shift)); - }); + if (thread_level > 2 && col_s >= ThreadPool::MUL_THR_THHOLD) { + result_.resize(col_s); + auto futures = + ThreadGranularity::thr_pool_.parallel_loop( + size_type(0), + col_s, + [this, shift, &column_begin] + (auto begin, auto end) -> void { + for (auto i = begin; i < end; ++i) { + const auto val = *(column_begin + i); + + this->result_[i] = std::log(val + shift); + } + }); + + for (auto &fut : futures) 
fut.get(); + } + else { + result_.reserve(col_s); + std::transform(column_begin, column_end, + std::back_inserter(result_), + [shift](const auto &val) -> value_type { + return (std::log(val + shift)); + }); + } } } @@ -4004,7 +4210,8 @@ struct BoxCoxVisitor { inline void geometric_mean_(const K &dummy, const H &column_begin, const H &column_end, - value_type shift) { + value_type shift, + size_type col_s, size_type thread_level) { H citer = column_begin; GeometricMeanVisitor gm; @@ -4015,29 +4222,74 @@ struct BoxCoxVisitor { gm(dummy, *citer++ + shift); gm.post(); - std::transform( - column_begin, column_end, - std::back_inserter(result_), - [this, shift, gm = gm.get_result()] - (const auto &val) -> value_type { - const value_type raw_v = val + shift; + if (thread_level > 2 && col_s >= ThreadPool::MUL_THR_THHOLD) { + result_.resize(col_s); + auto futures = + ThreadGranularity::thr_pool_.parallel_loop( + size_type(0), + col_s, + [this, shift, gm = gm.get_result(), &column_begin] + (auto begin, auto end) -> void { + for (auto i = begin; i < end; ++i) { + const auto val = *(column_begin + i); + const value_type raw_v = val + shift; + + this->result_[i] = + (std::pow(raw_v, this->lambda_) - T(1)) / + (this->lambda_ * + std::pow(gm, this->lambda_ - T(1))); + + } + }); - return ((std::pow(raw_v, this->lambda_) - T(1)) / - (this->lambda_ * - std::pow(gm, this->lambda_ - T(1)))); - }); + for (auto &fut : futures) fut.get(); + } + else { + result_.reserve(col_s); + std::transform( + column_begin, column_end, + std::back_inserter(result_), + [this, shift, gm = gm.get_result()] + (const auto &val) -> value_type { + const value_type raw_v = val + shift; + + return ((std::pow(raw_v, this->lambda_) - T(1)) / + (this->lambda_ * + std::pow(gm, this->lambda_ - T(1)))); + }); + } } else { while (citer < column_end) [[likely]] gm(dummy, std::log(*citer++ + shift)); gm.post(); - std::transform(column_begin, column_end, - std::back_inserter(result_), - [shift, gm = gm.get_result()] - (const auto &val) -> value_type { - return ((val + shift) * gm); - }); + if (thread_level > 2 && col_s >= ThreadPool::MUL_THR_THHOLD) { + result_.resize(col_s); + auto futures = + ThreadGranularity::thr_pool_.parallel_loop( + size_type(0), + col_s, + [this, shift, gm = gm.get_result(), &column_begin] + (auto begin, auto end) -> void { + for (auto i = begin; i < end; ++i) { + const auto val = *(column_begin + i); + + this->result_[i] = (val + shift) * gm; + } + }); + + for (auto &fut : futures) fut.get(); + } + else { + result_.reserve(col_s); + std::transform(column_begin, column_end, + std::back_inserter(result_), + [shift, gm = gm.get_result()] + (const auto &val) -> value_type { + return ((val + shift) * gm); + }); + } } } @@ -4062,15 +4314,18 @@ struct BoxCoxVisitor { shift = std::fabs(mv.get_result()) + value_type(0.0000001); } - result_.reserve(std::distance(column_begin, column_end)); + const size_type col_s = std::distance(column_begin, column_end); + const size_type thread_level = ThreadGranularity::get_thread_level(); + if (box_cox_type_ == box_cox_type::original) - original_(column_begin, column_end, shift); + original_(column_begin, column_end, shift, col_s, thread_level); else if (box_cox_type_ == box_cox_type::geometric_mean) - geometric_mean_(*idx_begin, column_begin, column_end, shift); + geometric_mean_(*idx_begin, column_begin, column_end, + shift, col_s, thread_level); else if (box_cox_type_ == box_cox_type::modulus) - modulus_(column_begin, column_end); + modulus_(column_begin, column_end, col_s, thread_level); 
else if (box_cox_type_ == box_cox_type::exponential) - exponential_(column_begin, column_end); + exponential_(column_begin, column_end, col_s, thread_level); } DEFINE_PRE_POST @@ -4109,54 +4364,200 @@ struct ProbabilityDistVisitor { result_type result; value_type sum { 0 }; - result.reserve(col_s); - if (pdtype_ == prob_dist_type::arithmetic) { - std::for_each(column_begin, column_end, - [&sum](const auto &v) -> void { sum += v; }); - std::for_each(column_begin, column_end, - [&sum, &result](const auto &v) -> void { - result.push_back(v / sum); - }); - } - else if (pdtype_ == prob_dist_type::log) { - std::for_each(column_begin, column_end, - [&sum](const auto &v) -> void { - sum += std::log(v); - }); - std::for_each(column_begin, column_end, - [&sum, &result](const auto &v) -> void { - result.push_back(std::log(v) / sum); - }); - } - else if (pdtype_ == prob_dist_type::softmax) { - std::for_each(column_begin, column_end, - [&sum](const auto &v) -> void { - sum += std::exp(v); - }); - std::for_each(column_begin, column_end, - [&sum, &result](const auto &v) -> void { - result.push_back(std::exp(v) / sum); - }); - } - else if (pdtype_ == prob_dist_type::pow2) { - std::for_each(column_begin, column_end, - [&sum](const auto &v) -> void { - sum += std::pow(T(2), v); - }); - std::for_each(column_begin, column_end, - [&sum, &result](const auto &v) -> void { - result.push_back(std::pow(T(2), v) / sum); - }); + if (ThreadGranularity::get_thread_level() > 2 && + std::distance(column_begin, column_end) >= + ThreadPool::MUL_THR_THHOLD) { + result.resize(col_s); + if (pdtype_ == prob_dist_type::arithmetic) { + auto futures = + ThreadGranularity::thr_pool_.parallel_loop( + column_begin, + column_end, + [] + (const auto &begin, const auto &end) -> value_type { + value_type sum { 0 }; + + for (auto citer = begin; citer < end; ++citer) + sum += *citer; + return (sum); + }); + + for (auto &fut : futures) sum += fut.get(); + futures = + ThreadGranularity::thr_pool_.parallel_loop( + size_type(0), + col_s, + [&column_begin, &result, sum] + (auto begin, auto end) -> value_type { + for (auto i = begin; i < end; ++i) + result[i] = *(column_begin + i) / sum; + return (0); + }); + for (auto &fut : futures) fut.get(); + } + else if (pdtype_ == prob_dist_type::log) { + auto futures = + ThreadGranularity::thr_pool_.parallel_loop( + column_begin, + column_end, + [] + (const auto &begin, const auto &end) -> value_type { + value_type sum { 0 }; + + for (auto citer = begin; citer < end; ++citer) + sum += std::log(*citer); + return (sum); + }); + + for (auto &fut : futures) sum += fut.get(); + futures = + ThreadGranularity::thr_pool_.parallel_loop( + size_type(0), + col_s, + [&column_begin, &result, sum] + (auto begin, auto end) -> value_type { + for (auto i = begin; i < end; ++i) + result[i] = + std::log(*(column_begin + i)) / sum; + return (0); + }); + for (auto &fut : futures) fut.get(); + } + else if (pdtype_ == prob_dist_type::softmax) { + auto futures = + ThreadGranularity::thr_pool_.parallel_loop( + column_begin, + column_end, + [] + (const auto &begin, const auto &end) -> value_type { + value_type sum { 0 }; + + for (auto citer = begin; citer < end; ++citer) + sum += std::exp(*citer); + return (sum); + }); + + for (auto &fut : futures) sum += fut.get(); + futures = + ThreadGranularity::thr_pool_.parallel_loop( + size_type(0), + col_s, + [&column_begin, &result, sum] + (auto begin, auto end) -> value_type { + for (auto i = begin; i < end; ++i) + result[i] = + std::exp(*(column_begin + i)) / sum; + return (0); + }); + 
for (auto &fut : futures) fut.get(); + } + else if (pdtype_ == prob_dist_type::pow2) { + auto futures = + ThreadGranularity::thr_pool_.parallel_loop( + column_begin, + column_end, + [] + (const auto &begin, const auto &end) -> value_type { + value_type sum { 0 }; + + for (auto citer = begin; citer < end; ++citer) + sum += std::pow(T(2), *citer); + return (sum); + }); + + for (auto &fut : futures) sum += fut.get(); + futures = + ThreadGranularity::thr_pool_.parallel_loop( + size_type(0), + col_s, + [&column_begin, &result, sum] + (auto begin, auto end) -> value_type { + for (auto i = begin; i < end; ++i) + result[i] = + std::pow(T(2), *(column_begin + i)) / sum; + return (0); + }); + for (auto &fut : futures) fut.get(); + } + else if (pdtype_ == prob_dist_type::pow10) { + auto futures = + ThreadGranularity::thr_pool_.parallel_loop( + column_begin, + column_end, + [] + (const auto &begin, const auto &end) -> value_type { + value_type sum { 0 }; + + for (auto citer = begin; citer < end; ++citer) + sum += std::pow(T(10), *citer); + return (sum); + }); + + for (auto &fut : futures) sum += fut.get(); + futures = + ThreadGranularity::thr_pool_.parallel_loop( + size_type(0), + col_s, + [&column_begin, &result, sum] + (auto begin, auto end) -> value_type { + for (auto i = begin; i < end; ++i) + result[i] = + std::pow(T(10), *(column_begin + i)) / sum; + return (0); + }); + for (auto &fut : futures) fut.get(); + } } - else if (pdtype_ == prob_dist_type::pow10) { - std::for_each(column_begin, column_end, - [&sum](const auto &v) -> void { - sum += std::pow(T(10), v); - }); - std::for_each(column_begin, column_end, - [&sum, &result](const auto &v) -> void { - result.push_back(std::pow(T(10), v) / sum); - }); + else { + result.reserve(col_s); + if (pdtype_ == prob_dist_type::arithmetic) { + std::for_each(column_begin, column_end, + [&sum](const auto &v) -> void { sum += v; }); + std::for_each(column_begin, column_end, + [&sum, &result](const auto &v) -> void { + result.push_back(v / sum); + }); + } + else if (pdtype_ == prob_dist_type::log) { + std::for_each(column_begin, column_end, + [&sum](const auto &v) -> void { + sum += std::log(v); + }); + std::for_each(column_begin, column_end, + [&sum, &result](const auto &v) -> void { + result.push_back(std::log(v) / sum); + }); + } + else if (pdtype_ == prob_dist_type::softmax) { + std::for_each(column_begin, column_end, + [&sum](const auto &v) -> void { + sum += std::exp(v); + }); + std::for_each(column_begin, column_end, + [&sum, &result](const auto &v) -> void { + result.push_back(std::exp(v) / sum); + }); + } + else if (pdtype_ == prob_dist_type::pow2) { + std::for_each(column_begin, column_end, + [&sum](const auto &v) -> void { + sum += std::pow(T(2), v); + }); + std::for_each(column_begin, column_end, + [&sum, &result](const auto &v) -> void { + result.push_back(std::pow(T(2), v) / sum); + }); + } + else if (pdtype_ == prob_dist_type::pow10) { + std::for_each(column_begin, column_end, + [&sum](const auto &v) -> void { + sum += std::pow(T(10), v); + }); + std::for_each(column_begin, column_end, + [&sum, &result](const auto &v) -> void { + result.push_back(std::pow(T(10), v) / sum); + }); + } } result_.swap(result); @@ -4232,14 +4633,34 @@ struct NormalizeVisitor { maxv.post(); const value_type diff = maxv.get_result() - minv.get_result(); + const size_type col_s = std::distance(column_begin, column_end); - result_.reserve(std::distance(column_begin, column_end)); - std::transform(column_begin, column_end, - std::back_inserter(result_), - [minv = 
minv.get_result(), diff] - (const auto &val) -> value_type { - return ((val - minv) / diff); - }); + if (ThreadGranularity::get_thread_level() > 2 && + col_s >= ThreadPool::MUL_THR_THHOLD) { + result_.resize(col_s); + + auto futures = + ThreadGranularity::thr_pool_.parallel_loop( + size_type(0), + col_s, + [minv = minv.get_result(), &column_begin, diff, this] + (auto begin, auto end) -> void { + for (size_type i = begin; i < end; ++i) + this->result_[i] = + (*(column_begin + i) - minv) / diff; + }); + + for (auto &fut : futures) fut.get(); + } + else { + result_.reserve(col_s); + std::transform(column_begin, column_end, + std::back_inserter(result_), + [minv = minv.get_result(), diff] + (const auto &val) -> value_type { + return ((val - minv) / diff); + }); + } } template inline void @@ -4252,13 +4673,33 @@ struct NormalizeVisitor { sumv(idx_begin, idx_end, column_begin, column_end); sumv.post(); - result_.reserve(std::distance(column_begin, column_end)); - std::transform(column_begin, column_end, - std::back_inserter(result_), - [sumv = sumv.get_result()] - (const auto &val) -> value_type { - return (val / sumv); - }); + const size_type col_s = std::distance(column_begin, column_end); + + if (ThreadGranularity::get_thread_level() > 2 && + col_s >= ThreadPool::MUL_THR_THHOLD) { + result_.resize(col_s); + + auto futures = + ThreadGranularity::thr_pool_.parallel_loop( + size_type(0), + col_s, + [sumv = sumv.get_result(), &column_begin, this] + (auto begin, auto end) -> void { + for (size_type i = begin; i < end; ++i) + this->result_[i] = *(column_begin + i) / sumv; + }); + + for (auto &fut : futures) fut.get(); + } + else { + result_.reserve(col_s); + std::transform(column_begin, column_end, + std::back_inserter(result_), + [sumv = sumv.get_result()] + (const auto &val) -> value_type { + return (val / sumv); + }); + } } template inline void @@ -4271,13 +4712,33 @@ struct NormalizeVisitor { eucliv(idx_begin, idx_end, column_begin, column_end); eucliv.post(); - result_.reserve(std::distance(column_begin, column_end)); - std::transform(column_begin, column_end, - std::back_inserter(result_), - [eucli = eucliv.get_euclidean_norm()] - (const auto &val) -> value_type { - return (val / eucli); - }); + const size_type col_s = std::distance(column_begin, column_end); + + if (ThreadGranularity::get_thread_level() > 2 && + col_s >= ThreadPool::MUL_THR_THHOLD) { + result_.resize(col_s); + + auto futures = + ThreadGranularity::thr_pool_.parallel_loop( + size_type(0), + col_s, + [eucli = eucliv.get_euclidean_norm(), &column_begin, this] + (auto begin, auto end) -> void { + for (size_type i = begin; i < end; ++i) + this->result_[i] = *(column_begin + i) / eucli; + }); + + for (auto &fut : futures) fut.get(); + } + else { + result_.reserve(col_s); + std::transform(column_begin, column_end, + std::back_inserter(result_), + [eucli = eucliv.get_euclidean_norm()] + (const auto &val) -> value_type { + return (val / eucli); + }); + } } template inline void @@ -4290,13 +4751,33 @@ struct NormalizeVisitor { maxv(idx_begin, idx_end, column_begin, column_end); maxv.post(); - result_.reserve(std::distance(column_begin, column_end)); - std::transform(column_begin, column_end, - std::back_inserter(result_), - [maxv = maxv.get_result()] - (const auto &val) -> value_type { - return (val / maxv); - }); + const size_type col_s = std::distance(column_begin, column_end); + + if (ThreadGranularity::get_thread_level() > 2 && + col_s >= ThreadPool::MUL_THR_THHOLD) { + result_.resize(col_s); + + auto futures = + 
ThreadGranularity::thr_pool_.parallel_loop( + size_type(0), + col_s, + [maxv = maxv.get_result(), &column_begin, this] + (auto begin, auto end) -> void { + for (size_type i = begin; i < end; ++i) + this->result_[i] = *(column_begin + i) / maxv; + }); + + for (auto &fut : futures) fut.get(); + } + else { + result_.reserve(col_s); + std::transform(column_begin, column_end, + std::back_inserter(result_), + [maxv = maxv.get_result()] + (const auto &val) -> value_type { + return (val / maxv); + }); + } } template inline void @@ -4313,13 +4794,36 @@ struct NormalizeVisitor { meanv.post(); stdv.post(); - result_.reserve(std::distance(column_begin, column_end)); - std::transform(column_begin, column_end, - std::back_inserter(result_), - [meanv = meanv.get_result(), stdv = stdv.get_result()] - (const auto &val) -> value_type { - return ((val - meanv) / stdv); - }); + const size_type col_s = std::distance(column_begin, column_end); + + if (ThreadGranularity::get_thread_level() > 2 && + col_s >= ThreadPool::MUL_THR_THHOLD) { + result_.resize(col_s); + + auto futures = + ThreadGranularity::thr_pool_.parallel_loop( + size_type(0), + col_s, + [meanv = meanv.get_result(), stdv = stdv.get_result(), + &column_begin, this] + (auto begin, auto end) -> void { + for (size_type i = begin; i < end; ++i) + this->result_[i] = + (*(column_begin + i) - meanv) / stdv; + }); + + for (auto &fut : futures) fut.get(); + } + else { + result_.reserve(col_s); + std::transform(column_begin, column_end, + std::back_inserter(result_), + [meanv = meanv.get_result(), + stdv = stdv.get_result()] + (const auto &val) -> value_type { + return ((val - meanv) / stdv); + }); + } } template inline void @@ -4336,35 +4840,92 @@ struct NormalizeVisitor { // const value_type scale = std::pow(10, std::log10(maxv.get_result()) + 1); + const size_type col_s = std::distance(column_begin, column_end); - result_.reserve(std::distance(column_begin, column_end)); - std::transform(column_begin, column_end, - std::back_inserter(result_), - [scale](const auto &val) -> value_type { - return (val / scale); - }); + if (ThreadGranularity::get_thread_level() > 2 && + col_s >= ThreadPool::MUL_THR_THHOLD) { + result_.resize(col_s); + + auto futures = + ThreadGranularity::thr_pool_.parallel_loop( + size_type(0), + col_s, + [scale, &column_begin, this] + (auto begin, auto end) -> void { + for (size_type i = begin; i < end; ++i) + this->result_[i] = *(column_begin + i) / scale; + }); + + for (auto &fut : futures) fut.get(); + } + else { + result_.reserve(col_s); + std::transform(column_begin, column_end, + std::back_inserter(result_), + [scale](const auto &val) -> value_type { + return (val / scale); + }); + } } template inline void log_transform_(const H &column_begin, const H &column_end) { - result_.reserve(std::distance(column_begin, column_end)); - std::transform(column_begin, column_end, - std::back_inserter(result_), - [](const auto &val) -> value_type { - return (std::log(val)); - }); + const size_type col_s = std::distance(column_begin, column_end); + + if (ThreadGranularity::get_thread_level() > 2 && + col_s >= ThreadPool::MUL_THR_THHOLD) { + result_.resize(col_s); + + auto futures = + ThreadGranularity::thr_pool_.parallel_loop( + size_type(0), + col_s, + [&column_begin, this](auto begin, auto end) -> void { + for (size_type i = begin; i < end; ++i) + this->result_[i] = std::log(*(column_begin + i)); + }); + + for (auto &fut : futures) fut.get(); + } + else { + result_.reserve(std::distance(column_begin, column_end)); + std::transform(column_begin, 
column_end, + std::back_inserter(result_), + [](const auto &val) -> value_type { + return (std::log(val)); + }); + } } template inline void root_transform_(const H &column_begin, const H &column_end) { - result_.reserve(std::distance(column_begin, column_end)); - std::transform(column_begin, column_end, - std::back_inserter(result_), - [](const auto &val) -> value_type { - return (std::sqrt(val)); - }); + const size_type col_s = std::distance(column_begin, column_end); + + if (ThreadGranularity::get_thread_level() > 2 && + col_s >= ThreadPool::MUL_THR_THHOLD) { + result_.resize(col_s); + + auto futures = + ThreadGranularity::thr_pool_.parallel_loop( + size_type(0), + col_s, + [&column_begin, this](auto begin, auto end) -> void { + for (size_type i = begin; i < end; ++i) + this->result_[i] = std::sqrt(*(column_begin + i)); + }); + + for (auto &fut : futures) fut.get(); + } + else { + result_.reserve(col_s); + std::transform(column_begin, column_end, + std::back_inserter(result_), + [](const auto &val) -> value_type { + return (std::sqrt(val)); + }); + } } result_type result_ { }; // Normalized @@ -4396,13 +4957,34 @@ struct StandardizeVisitor { mv.post(); sv.post(); - result_.reserve(std::distance(column_begin, column_end)); - std::transform(column_begin, column_end, - std::back_inserter(result_), - [mv = mv.get_result(), sv = sv.get_result()] - (const auto &val) -> value_type { - return ((val - mv) / sv); - }); + const size_type col_s = std::distance(column_begin, column_end); + + if (ThreadGranularity::get_thread_level() > 2 && + col_s >= ThreadPool::MUL_THR_THHOLD) { + result_.resize(col_s); + + auto futures = + ThreadGranularity::thr_pool_.parallel_loop( + size_type(0), + col_s, + [mv = mv.get_result(), sv = sv.get_result(), + &column_begin, this] + (auto begin, auto end) -> void { + for (size_type i = begin; i < end; ++i) + this->result_[i] = (*(column_begin + i) - mv) / sv; + }); + + for (auto &fut : futures) fut.get(); + } + else { + result_.reserve(col_s); + std::transform(column_begin, column_end, + std::back_inserter(result_), + [mv = mv.get_result(), sv = sv.get_result()] + (const auto &val) -> value_type { + return ((val - mv) / sv); + }); + } } DEFINE_PRE_POST diff --git a/include/DataFrame/Utils/Threads/SharedQueue.h b/include/DataFrame/Utils/Threads/SharedQueue.h index 814e8c85..17f51dab 100644 --- a/include/DataFrame/Utils/Threads/SharedQueue.h +++ b/include/DataFrame/Utils/Threads/SharedQueue.h @@ -40,10 +40,6 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. namespace hmdf { -class SQEmpty { public: inline SQEmpty () noexcept { } }; - -// ---------------------------------------------------------------------------- - template class SharedQueue { @@ -61,11 +57,6 @@ class SharedQueue { inline void push(const value_type &element) noexcept; - inline const value_type & - front(bool wait_on_front = true) const; // throw (SQEmpty); - inline value_type & - front(bool wait_on_front = true); // throw (SQEmpty); - // NOTE: The following method returns the data by value. // Therefore it is not as efficient as front(). // Use it only if you have to. 
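// The queue interface above is being reduced to a single pop_front() that
// returns the element by value wrapped in an optional: one locked
// operation, so no consumer can race between peeking at front() and
// popping, and the SQEmpty exception becomes unnecessary. A simplified
// stand-in for that shape (not the library's actual SharedQueue):

#include <condition_variable>
#include <mutex>
#include <optional>
#include <queue>
#include <utility>

template<typename T>
class MiniSharedQueue  {

public:

    void push(const T &element)  {

        {
            const std::lock_guard<std::mutex> guard { mutex_ };

            queue_.push(element);
        }
        cvx_.notify_one();
    }

    std::optional<T> pop_front(bool wait_on_front = true)  {

        std::unique_lock<std::mutex> ul { mutex_ };

        if (wait_on_front)
            cvx_.wait(ul, [this]() { return (! queue_.empty()); });
        if (queue_.empty()) return (std::nullopt);

        T value = std::move(queue_.front());

        queue_.pop();
        return (value);
    }

private:

    std::queue<T>           queue_ { };
    std::mutex              mutex_ { };
    std::condition_variable cvx_ { };
};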
@@ -73,7 +64,6 @@ class SharedQueue { inline optional_ret pop_front(bool wait_on_front = true) noexcept; - void pop() noexcept; bool empty() const noexcept; size_type size() const noexcept; diff --git a/include/DataFrame/Utils/Threads/SharedQueue.tcc b/include/DataFrame/Utils/Threads/SharedQueue.tcc index 9813af8c..556e8209 100644 --- a/include/DataFrame/Utils/Threads/SharedQueue.tcc +++ b/include/DataFrame/Utils/Threads/SharedQueue.tcc @@ -51,24 +51,6 @@ SharedQueue::push(const value_type &element) noexcept { // ---------------------------------------------------------------------------- -template -inline const typename SharedQueue::value_type & -SharedQueue::front(bool wait_on_front) const { // throw (SQEmpty) - - std::unique_lock ul { mutex_ }; - - if (queue_.empty()) { - if (wait_on_front) - while (queue_.empty()) cvx_.wait(ul); - else - throw SQEmpty { }; - } - - return (queue_.front()); -} - -// ---------------------------------------------------------------------------- - template inline typename SharedQueue::optional_ret SharedQueue::pop_front(bool wait_on_front) noexcept { @@ -88,34 +70,6 @@ SharedQueue::pop_front(bool wait_on_front) noexcept { // ---------------------------------------------------------------------------- -template -inline typename SharedQueue::value_type & -SharedQueue::front(bool wait_on_front) { // throw (SQEmpty) - - std::unique_lock ul { mutex_ }; - - if (queue_.empty()) { - if (wait_on_front) - while (queue_.empty()) cvx_.wait(ul); - else - throw SQEmpty { }; - } - - return (queue_.front()); -} - -// ---------------------------------------------------------------------------- - -template -void SharedQueue::pop() noexcept { - - const AutoLockable lock { mutex_ }; - - queue_.pop(); -} - -// ---------------------------------------------------------------------------- - template bool SharedQueue::empty() const noexcept { diff --git a/include/DataFrame/Utils/Threads/ThreadPool.h b/include/DataFrame/Utils/Threads/ThreadPool.h index ab8f367a..aacb828f 100644 --- a/include/DataFrame/Utils/Threads/ThreadPool.h +++ b/include/DataFrame/Utils/Threads/ThreadPool.h @@ -50,53 +50,22 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. namespace hmdf { -struct Conditioner { - - template - requires std::invocable - explicit Conditioner(F &&routine, As && ... args); - - Conditioner() = default; - Conditioner(const Conditioner &) = default; - Conditioner(Conditioner &&) = default; - ~Conditioner() = default; - - void execute(); - -private: - - using routine_type = std::function; - - routine_type func_ { [] () -> void { } }; -}; - -// ---------------------------------------------------------------------------- - class ThreadPool { public: using size_type = long; - using time_type = time_t; using thread_type = std::thread; - inline static constexpr size_type MUL_THR_THHOLD = 150'000L; + inline static constexpr size_type MUL_THR_THHOLD = 250'000L; ThreadPool(const ThreadPool &) = delete; ThreadPool &operator = (const ThreadPool &) = delete; - // Conditioner(s) are a handy interface, if threads need to be initialized - // before doing anything. And/or they need a clean up before exiting. 
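// The pool's dispatch() hands a callable to the pool and returns a
// std::future for its result. A condensed sketch of the heart of that
// mechanism using std::packaged_task -- the real pool also routes work
// units through per-thread local queues and a global queue, omitted here:

#include <functional>
#include <future>
#include <memory>
#include <queue>
#include <type_traits>
#include <utility>

template<typename F>
std::future<std::invoke_result_t<F>>
enqueue_task(std::queue<std::function<void()>> &work_q, F &&routine)  {

    using result_t = std::invoke_result_t<F>;

    auto task =
        std::make_shared<std::packaged_task<result_t()>>(
            std::forward<F>(routine));
    auto ret_fut = task->get_future();

    // A worker thread later pops this wrapper and invokes it
    //
    work_q.push([task]() { (*task)(); });
    return (ret_fut);
}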
- // For example, see Windows CoInitializeEx function in COM library - // explicit - ThreadPool(size_type thr_num = std::thread::hardware_concurrency(), - Conditioner pre_conditioner = Conditioner { }, - Conditioner post_conditioner = Conditioner { }); + ThreadPool(size_type thr_num = std::thread::hardware_concurrency()); ~ThreadPool(); - void set_timeout(bool timeout_flag, time_type timeout_time = 30 * 60); - template requires std::invocable using dispatch_res_t = @@ -145,14 +114,6 @@ class ThreadPool { long TH = MUL_THR_THHOLD> void parallel_sort(const I begin, const I end, P compare); - - // It attaches the current thread to the pool so that it may be used for - // executing submitted tasks. It blocks the calling thread until the pool - // is shutdown or the thread is timed-out. - // This is handy, if you already have thread(s), and want to repurpose them - // - void attach(thread_type &&this_thr); - // If the pool is not shutdown and there is a pending task, run the one // task on the calling thread. // Return true, if a task was executed, otherwise false. @@ -176,7 +137,6 @@ class ThreadPool { _undefined_ = 0, _client_service_ = 1, _terminate_ = 2, - _timeout_ = 3, }; struct WorkUnit { @@ -197,7 +157,6 @@ class ThreadPool { }; bool thread_routine_(size_type local_q_idx) noexcept; // Engine routine - void queue_timed_outs_() noexcept; WorkUnit get_one_local_task_() noexcept; using guard_type = std::lock_guard; @@ -216,12 +175,7 @@ class ThreadPool { std::atomic available_threads_ { 0 }; std::atomic capacity_threads_ { 0 }; std::atomic_bool shutdown_flag_ { false }; - time_type timeout_time_ { 30 * 60 }; mutable std::mutex state_ { }; - bool timeout_flag_ { false }; - - Conditioner pre_conditioner_ { }; - Conditioner post_conditioner_ { }; }; } // namespace hmdf diff --git a/include/DataFrame/Utils/Threads/ThreadPool.tcc b/include/DataFrame/Utils/Threads/ThreadPool.tcc index 46120fa9..0532540f 100644 --- a/include/DataFrame/Utils/Threads/ThreadPool.tcc +++ b/include/DataFrame/Utils/Threads/ThreadPool.tcc @@ -31,7 +31,6 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include -#include #include #include #include @@ -42,21 +41,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. namespace hmdf { -template -requires std::invocable -Conditioner::Conditioner(F &&routine, As && ... 
args) - : func_([&] () -> void { routine(std::forward(args) ...); }) { } - -// ---------------------------------------------------------------------------- - -void Conditioner::execute() { func_(); } - -// ---------------------------------------------------------------------------- - -ThreadPool::ThreadPool(size_type thr_num, - Conditioner pre_conditioner, - Conditioner post_conditioner) - : pre_conditioner_(pre_conditioner), post_conditioner_(post_conditioner) { +ThreadPool::ThreadPool(size_type thr_num) { threads_.reserve(thr_num * 2); for (size_type i = 0; i < thr_num; ++i) { @@ -82,31 +67,6 @@ ThreadPool::~ThreadPool() { // ---------------------------------------------------------------------------- -void -ThreadPool::set_timeout(bool timeout_flag, time_type timeout_time) { - - timeout_flag_ = timeout_flag; - timeout_time_ = timeout_time; -} - -// ---------------------------------------------------------------------------- - -void -ThreadPool::queue_timed_outs_() noexcept { - - const size_type timeys { capacity_threads() }; - - for (size_type i = 0; i < timeys; ++i) { - const WorkUnit work_unit { WORK_TYPE::_timeout_ }; - - global_queue_.push(work_unit); - } - - return; -} - -// ---------------------------------------------------------------------------- - bool ThreadPool::add_thread(size_type thr_num) { @@ -183,8 +143,6 @@ ThreadPool::dispatch(bool immediately, F &&routine, As && ... args) { else global_queue_.push(work_unit); - if (timeout_flag_) - queue_timed_outs_(); return (return_fut); } @@ -388,27 +346,6 @@ ThreadPool::parallel_sort(const I begin, const I end, P compare) { // ---------------------------------------------------------------------------- -void -ThreadPool::attach(thread_type &&this_thr) { - - if (is_shutdown()) - throw std::runtime_error("ThreadPool::attach(): " - "Thread pool is shutdown."); - - size_type local_size { 0 }; - - { - const guard_type guard { state_ }; - - local_size = size_type(threads_.size()); - local_queues_.push_back(LocalQueueType { }); - threads_.push_back(std::move(this_thr)); - } - thread_routine_(local_size); -} - -// ---------------------------------------------------------------------------- - ThreadPool::size_type ThreadPool::available_threads() const noexcept { @@ -514,10 +451,7 @@ ThreadPool::thread_routine_(size_type local_q_idx) noexcept { if (is_shutdown()) return (false); - pre_conditioner_.execute(); - - time_type last_busy_time { timeout_flag_ ? 
::time(nullptr) : 0 }; - auto iter = local_queues_.begin(); + auto iter = local_queues_.begin(); std::advance(iter, local_q_idx); local_queue_ = &(*iter); @@ -537,23 +471,13 @@ ThreadPool::thread_routine_(size_type local_q_idx) noexcept { --available_threads_; - if (work_unit.work_type == WORK_TYPE::_terminate_) { - break; - } - else if (work_unit.work_type == WORK_TYPE::_timeout_) { - if (timeout_flag_ && - ((::time(nullptr) - last_busy_time) >= timeout_time_)) - break; - } - else if (work_unit.work_type == WORK_TYPE::_client_service_) { - if (timeout_flag_) - last_busy_time = ::time(nullptr); + if (work_unit.work_type == WORK_TYPE::_client_service_) (work_unit.func)(); // Execute the callable - } + else if (work_unit.work_type == WORK_TYPE::_terminate_) + break; } --capacity_threads_; local_queue_ = nullptr; - post_conditioner_.execute(); return (true); } From 678120d4ebbbe1e977e3fa7c102483b62ba82b8e Mon Sep 17 00:00:00 2001 From: Hossein Moein Date: Tue, 19 Dec 2023 11:35:16 -0500 Subject: [PATCH 7/7] Added parallel computing logic to the following visitors: PolyFit, LogFit, ExponentialFit, LinearFit, CubicSplineFit, Lowess, Decompose, Bias, NonZeroRange --- include/DataFrame/DataFrameStatsVisitors.h | 753 +++++++++++++++++---- 1 file changed, 615 insertions(+), 138 deletions(-) diff --git a/include/DataFrame/DataFrameStatsVisitors.h b/include/DataFrame/DataFrameStatsVisitors.h index fe28a4c9..3d3adf2f 100644 --- a/include/DataFrame/DataFrameStatsVisitors.h +++ b/include/DataFrame/DataFrameStatsVisitors.h @@ -54,6 +54,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include #include #include @@ -5025,6 +5026,7 @@ struct PolyFitVisitor { const Hy &y_begin, const Hy &y_end) { const size_type col_s = std::distance(x_begin, x_end); + const size_type thread_level = ThreadGranularity::get_thread_level(); assert((col_s == size_type(std::distance(y_begin, y_end)))); @@ -5042,10 +5044,32 @@ struct PolyFitVisitor { // consecutive positions of the array will store // col_s, sigma(xi), sigma(xi^2), sigma(xi^3) ... sigma(xi^2n) // - for (size_type j = 0; j < col_s; ++j) [[likely]] { - const value_type w = weights_(*(idx_begin + j), j); + if (thread_level > 2 && col_s >= ThreadPool::MUL_THR_THHOLD) { + auto futures = + ThreadGranularity::thr_pool_.parallel_loop( + size_type(0), + col_s, + [&x_begin, &idx_begin, i, this] + (auto begin, auto end) -> value_type { + value_type sum { 0 }; - sigma_x[i] += std::pow(*(x_begin + j), i) * w; + for (auto j = begin; j < end; ++j) { + const value_type w = + this->weights_(*(idx_begin + j), j); + + sum += std::pow(*(x_begin + j), i) * w; + } + return (sum); + }); + + for (auto &fut : futures) sigma_x[i] += fut.get(); + } + else { + for (size_type j = 0; j < col_s; ++j) [[likely]] { + const value_type w = weights_(*(idx_begin + j), j); + + sigma_x[i] += std::pow(*(x_begin + j), i) * w; + } } } @@ -5072,10 +5096,34 @@ struct PolyFitVisitor { // consecutive positions will store // sigma(yi), sigma(xi * yi), sigma(xi^2 * yi) ...
sigma(xi^n * yi) // - for (size_type j = 0; j < col_s; ++j) { - const value_type w = weights_(*(idx_begin + j), j); + if (thread_level > 2 && col_s >= ThreadPool::MUL_THR_THHOLD) { + auto futures = + ThreadGranularity::thr_pool_.parallel_loop( + size_type(0), + col_s, + [&x_begin, &y_begin, &idx_begin, i, this] + (auto begin, auto end) -> value_type { + value_type sum { 0 }; + + for (auto j = begin; j < end; ++j) { + const value_type w = + this->weights_(*(idx_begin + j), j); + + sum += std::pow(*(x_begin + j), i) * + *(y_begin + j) * w; + } + return (sum); + }); - sigma_y[i] += std::pow(*(x_begin + j), i) * *(y_begin + j) * w; + for (auto &fut : futures) sigma_y[i] += fut.get(); + } + else { + for (size_type j = 0; j < col_s; ++j) { + const value_type w = weights_(*(idx_begin + j), j); + + sigma_y[i] += + std::pow(*(x_begin + j), i) * *(y_begin + j) * w; + } } } @@ -5151,12 +5199,12 @@ struct PolyFitVisitor { for (size_type j = 0; j < deg; ++j) pred += coeffs_[j] * std::pow(*(x_begin + i), j); + y_fits_.push_back(pred); const value_type w = weights_(*(idx_begin + i), i); // y fits at given x points // - y_fits_.push_back(pred); residual_ += ((*(y_begin + i) - pred) * w) * ((*(y_begin + i) - pred) * w); } @@ -5205,26 +5253,74 @@ struct LogFitVisitor { const H &x_begin, const H &x_end, const H &y_begin, const H &y_end) { + const size_type col_s = std::distance(x_begin, x_end); + const size_type thread_level = ThreadGranularity::get_thread_level(); + result_type logx (x_begin, x_end); - std::transform(logx.begin(), logx.end(), logx.begin(), - (value_type(*)(value_type)) std::log); - poly_fit_(idx_begin, idx_end, logx.begin(), logx.end(), y_begin, y_end); + if (thread_level > 2 && col_s >= ThreadPool::MUL_THR_THHOLD) { + auto futures = + ThreadGranularity::thr_pool_.parallel_loop( + size_type(0), + col_s, + [&logx](auto begin, auto end) -> void { + for (auto i = begin; i < end; ++i) + logx[i] = std::log(logx[i]); + }); - const size_type col_s = std::distance(x_begin, x_end); + for (auto &fut : futures) fut.get(); + } + else { + std::transform(logx.begin(), logx.end(), logx.begin(), + (value_type(*)(value_type)) std::log); + } - y_fits_.reserve(col_s); - for (size_type i = 0; i < col_s; ++i) [[likely]] { - const value_type pred = - poly_fit_.get_result()[0] + - poly_fit_.get_result()[1] * std::log(*(x_begin + i)); - const value_type w = weights_(*(idx_begin + i), i); + poly_fit_(idx_begin, idx_end, + logx.begin(), logx.end(), + y_begin, y_end); - // y fits at given x points - // - y_fits_.push_back(pred); - residual_ += ((*(y_begin + i) - pred) * w) * - ((*(y_begin + i) - pred) * w); + y_fits_.resize(col_s); + if (thread_level > 2 && col_s >= ThreadPool::MUL_THR_THHOLD) { + auto futures = + ThreadGranularity::thr_pool_.parallel_loop( + size_type(0), + col_s, + [&x_begin, &y_begin, &idx_begin, this] + (auto begin, auto end) -> value_type { + value_type residual { 0 }; + + for (auto i = begin; i < end; ++i) { + const value_type pred = + this->poly_fit_.get_result()[0] + + this->poly_fit_.get_result()[1] * + std::log(*(x_begin + i)); + const value_type w = + this->weights_(*(idx_begin + i), i); + + // y fits at given x points + // + this->y_fits_[i] = pred; + residual += ((*(y_begin + i) - pred) * w) * + ((*(y_begin + i) - pred) * w); + } + return (residual); + }); + + for (auto &fut : futures) residual_ += fut.get(); + } + else { + for (size_type i = 0; i < col_s; ++i) [[likely]] { + const value_type pred = + poly_fit_.get_result()[0] + + poly_fit_.get_result()[1] * std::log(*(x_begin + i)); + 
const value_type w = weights_(*(idx_begin + i), i); + + // y fits at given x points + // + y_fits_[i] = pred; + residual_ += ((*(y_begin + i) - pred) * w) * + ((*(y_begin + i) - pred) * w); + } } } @@ -5268,28 +5364,67 @@ struct ExponentialFitVisitor { const H &y_begin, const H &y_end) { const size_type col_s = std::distance(x_begin, x_end); + const size_type thread_level = ThreadGranularity::get_thread_level(); assert((col_s == size_type(std::distance(y_begin, y_end)))); - value_type sum_x = 0; // Sum of all observed x - value_type sum_y = 0; // Sum of all observed y - value_type sum_x2 = 0; // Sum of all observed x squared - value_type sum_xy = 0; // Sum of all x times sum of all observed y + value_type sum_x { 0 }; // Sum of all observed x + value_type sum_y { 0 }; // Sum of all observed y + value_type sum_x2 { 0 }; // Sum of all observed x squared + value_type sum_xy { 0 }; // Sum of all x times sum of all observed y - for (size_type i = 0; i < col_s; ++i) [[likely]] { - const value_type x = *(x_begin + i); - const value_type log_y = std::log(*(y_begin + i)); + if (thread_level > 2 && col_s >= ThreadPool::MUL_THR_THHOLD) { + using sum_t = + std::tuple; - sum_x += x; - sum_y += log_y; - sum_xy += x * log_y; - sum_x2 += x * x; + auto futures = + ThreadGranularity::thr_pool_.parallel_loop( + size_type(0), + col_s, + [&x_begin, &y_begin](auto begin, auto end) -> sum_t { + value_type sum_x { 0 }; + value_type sum_y { 0 }; + value_type sum_x2 { 0 }; + value_type sum_xy { 0 }; + + for (auto i = begin; i < end; ++i) { + const value_type x = *(x_begin + i); + const value_type log_y = + std::log(*(y_begin + i)); + + sum_x += x; + sum_y += log_y; + sum_xy += x * log_y; + sum_x2 += x * x; + } + return (std::make_tuple(sum_x, sum_y, sum_xy, sum_x2)); + }); + + for (auto &fut : futures) { + const auto &sums = fut.get(); + + sum_x += std::get<0>(sums); + sum_y += std::get<1>(sums); + sum_xy += std::get<2>(sums); + sum_x2 += std::get<3>(sums); + } + } + else { + for (size_type i = 0; i < col_s; ++i) [[likely]] { + const value_type x = *(x_begin + i); + const value_type log_y = std::log(*(y_begin + i)); + + sum_x += x; + sum_y += log_y; + sum_xy += x * log_y; + sum_x2 += x * x; + } } // The slope (the the power of exp) of best fit line // - slope_ = - (col_s * sum_xy - sum_x * sum_y) / (col_s * sum_x2 - sum_x * sum_x); + slope_ = (col_s * sum_xy - sum_x * sum_y) / + (col_s * sum_x2 - sum_x * sum_x); // The intercept of best fit line // @@ -5297,18 +5432,47 @@ struct ExponentialFitVisitor { const value_type prefactor = std::exp(intercept_); - y_fits_.reserve(col_s); - for (size_type i = 0; i < col_s; ++i) [[likely]] { - const value_type x = *(x_begin + i); - const value_type pred = prefactor * std::exp(x * slope_); + y_fits_.resize(col_s); + if (thread_level > 2 && col_s >= ThreadPool::MUL_THR_THHOLD) { + auto futures = + ThreadGranularity::thr_pool_.parallel_loop( + size_type(0), + col_s, + [&x_begin, &y_begin, prefactor, this] + (auto begin, auto end) -> value_type { + value_type residual { 0 }; - // y fits at given x points - // - y_fits_.push_back(pred); + for (auto i = begin; i < end; ++i) { + const value_type x = *(x_begin + i); + const value_type pred = + prefactor * std::exp(x * this->slope_); + + // y fits at given x points + // + this->y_fits_[i] = pred; + + const value_type r = *(y_begin + i) - pred; + + residual += r * r; + } + return (residual); + }); - const value_type r = *(y_begin + i) - pred; + for (auto &fut : futures) residual_ += fut.get(); + } + else { + for (size_type i = 0; i < 
+                const value_type x = *(x_begin + i);
+                const value_type pred = prefactor * std::exp(x * slope_);
+
+                // y fits at given x points
+                //
+                y_fits_[i] = pred;
 
-            residual_ += r * r;
+                const value_type r = *(y_begin + i) - pred;
+
+                residual_ += r * r;
+            }
         }
     }
 
@@ -5353,22 +5517,60 @@ struct LinearFitVisitor {
                             const H &y_begin, const H &y_end) {
 
         const size_type col_s = std::distance(x_begin, x_end);
+        const size_type thread_level = ThreadGranularity::get_thread_level();
 
         assert((col_s == size_type(std::distance(y_begin, y_end))));
 
-        value_type sum_x = 0;   // Sum of all observed x
-        value_type sum_y = 0;   // Sum of all observed y
-        value_type sum_x2 = 0;  // Sum of all observed x squared
-        value_type sum_xy = 0;  // Sum of all x times sum of all observed y
+        value_type sum_x { 0 };   // Sum of all observed x
+        value_type sum_y { 0 };   // Sum of all observed y
+        value_type sum_x2 { 0 };  // Sum of all observed x squared
+        value_type sum_xy { 0 };  // Sum of all observed x times y
 
-        for (size_type i = 0; i < col_s; ++i) {
-            const value_type x = *(x_begin + i);
-            const value_type y = *(y_begin + i);
+        if (thread_level > 2 && col_s >= ThreadPool::MUL_THR_THHOLD) {
+            using sum_t = std::tuple<value_type, value_type,
+                                     value_type, value_type>;
 
-            sum_x += x;
-            sum_y += y;
-            sum_xy += x * y;
-            sum_x2 += x * x;
+            auto futures =
+                ThreadGranularity::thr_pool_.parallel_loop(
+                    size_type(0),
+                    col_s,
+                    [&x_begin, &y_begin](auto begin, auto end) -> sum_t {
+                        value_type sum_x { 0 };
+                        value_type sum_y { 0 };
+                        value_type sum_x2 { 0 };
+                        value_type sum_xy { 0 };
+
+                        for (auto i = begin; i < end; ++i) {
+                            const value_type x = *(x_begin + i);
+                            const value_type y = *(y_begin + i);
+
+                            sum_x += x;
+                            sum_y += y;
+                            sum_xy += x * y;
+                            sum_x2 += x * x;
+                        }
+                        return (std::make_tuple(sum_x, sum_y, sum_xy, sum_x2));
+                    });
+
+            for (auto &fut : futures) {
+                const auto &sums = fut.get();
+
+                sum_x += std::get<0>(sums);
+                sum_y += std::get<1>(sums);
+                sum_xy += std::get<2>(sums);
+                sum_x2 += std::get<3>(sums);
+            }
+        }
+        else {
+            for (size_type i = 0; i < col_s; ++i) {
+                const value_type x = *(x_begin + i);
+                const value_type y = *(y_begin + i);
+
+                sum_x += x;
+                sum_y += y;
+                sum_xy += x * y;
+                sum_x2 += x * x;
+            }
         }
 
         const value_type divisor = sum_x2 * col_s - sum_x * sum_x;
@@ -5381,18 +5583,49 @@ struct LinearFitVisitor {
         //
         intercept_ = (sum_x2 * sum_y - sum_x * sum_xy) / divisor;
 
-        y_fits_.reserve(col_s);
-        std::transform(x_begin, x_end,
-                       y_begin,
-                       std::back_inserter(y_fits_),
-                       [this](const auto &x, const auto &y) -> value_type {
-                           const value_type pred =
-                               this->slope_ * x + this->intercept_;
-                           const value_type r = y - pred;
-
-                           this->residual_ += r * r;
-                           return (pred);  // y fits at given x points
-                       });
+        if (thread_level > 2 && col_s >= ThreadPool::MUL_THR_THHOLD) {
+            y_fits_.resize(col_s);
+
+            auto futures =
+                ThreadGranularity::thr_pool_.parallel_loop(
+                    size_type(0),
+                    col_s,
+                    [&x_begin, &y_begin, this]
+                    (auto begin, auto end) -> value_type {
+                        value_type residual { 0 };
+
+                        for (auto i = begin; i < end; ++i) {
+                            const value_type x = *(x_begin + i);
+                            const value_type y = *(y_begin + i);
+                            const value_type pred =
+                                this->slope_ * x + this->intercept_;
+                            const value_type r = y - pred;
+
+                            // y fits at given x points
+                            //
+                            this->y_fits_[i] = pred;
+                            residual += r * r;
+                        }
+                        return (residual);
+                    });
+
+            for (auto &fut : futures) residual_ += fut.get();
+        }
+        else {
+            y_fits_.reserve(col_s);
+            std::transform(x_begin, x_end,
+                           y_begin,
+                           std::back_inserter(y_fits_),
+                           [this]
+                           (const auto &x, const auto &y) -> value_type {
+                               const value_type pred =
+                                   this->slope_ * x + this->intercept_;
+                               const value_type r = y - pred;
+
+                               this->residual_ += r * r;
+                               return (pred);  // y fits at given x points
+                           });
+        }
     }
 
     inline void pre () {
@@ -5441,15 +5674,32 @@ struct CubicSplineFitVisitor {
                              const H &y_begin, const H &y_end) {
 
         const size_type col_s = std::distance(x_begin, x_end);
+        const size_type thread_level = ThreadGranularity::get_thread_level();
 
         assert(col_s > 3);
         assert((col_s == size_type(std::distance(y_begin, y_end))));
 
         result_type h;
 
-        h.reserve(col_s - 1);
-        for (size_type i = 0; i < col_s - 1; ++i) [[likely]]
-            h.push_back (*(x_begin + (i + 1)) - *(x_begin + i));
+        if (thread_level > 2 && col_s >= ThreadPool::MUL_THR_THHOLD) {
+            h.resize(col_s - 1);
+
+            auto futures =
+                ThreadGranularity::thr_pool_.parallel_loop(
+                    size_type(0),
+                    col_s - 1,
+                    [&x_begin, &h](auto begin, auto end) -> void {
+                        for (auto i = begin; i < end; ++i)
+                            h[i] = *(x_begin + (i + 1)) - *(x_begin + i);
+                    });
+
+            for (auto &fut : futures) fut.get();
+        }
+        else {
+            h.reserve(col_s - 1);
+            for (size_type i = 0; i < col_s - 1; ++i) [[likely]]
+                h.push_back (*(x_begin + (i + 1)) - *(x_begin + i));
+        }
 
         result_type mu (col_s, 0);
         result_type z (col_s, 0);
@@ -5552,28 +5802,68 @@ struct LowessVisitor {
     // function.
     //
     template<typename X>
-    inline static void bi_square_(X x_begin, X x_end) {
+    inline static
+    void bi_square_(X x_begin, X x_end, long thread_level) {
 
-        std::for_each(x_begin, x_end,
-                      [](auto &x) -> void {
-                          const value_type val = T(1) - x * x;
+        if (thread_level > 2 &&
+            std::distance(x_begin, x_end) >= ThreadPool::MUL_THR_THHOLD) {
+            auto futures =
+                ThreadGranularity::thr_pool_.parallel_loop(
+                    x_begin,
+                    x_end,
+                    [](const auto &begin, const auto &end) -> void {
+                        for (auto citer = begin; citer < end; ++citer) {
+                            value_type &x = *citer;
+                            const value_type val = T(1) - x * x;
+
+                            x = val * val;
+                        }
+                    });
 
-                          x = val * val;
-                      });
+            for (auto &fut : futures) fut.get();
+        }
+        else {
+            std::for_each(x_begin, x_end,
+                          [](auto &x) -> void {
+                              const value_type val = T(1) - x * x;
+
+                              x = val * val;
+                          });
+        }
     }
 
     // The tri-cubic function (1 - x^3)^3. Used to weight neighboring points
     // along the x-axis based on their distance to the current point.
     //
    template<typename X>
-    inline static void tri_cube_(X x_begin, X x_end) {
+    inline static
+    void tri_cube_(X x_begin, X x_end, long thread_level) {
 
-        std::for_each(x_begin, x_end,
-                      [](auto &x) -> void {
-                          const value_type val = T(1) - x * x * x;
+        if (thread_level > 2 &&
+            std::distance(x_begin, x_end) >= ThreadPool::MUL_THR_THHOLD) {
+            auto futures =
+                ThreadGranularity::thr_pool_.parallel_loop(
+                    x_begin,
+                    x_end,
+                    [](const auto &begin, const auto &end) -> void {
+                        for (auto citer = begin; citer < end; ++citer) {
+                            value_type &x = *citer;
+                            const value_type val = T(1) - x * x * x;
+
+                            x = val * val * val;
+                        }
+                    });
 
-                          x = val * val * val;
-                      });
+            for (auto &fut : futures) fut.get();
+        }
+        else {
+            std::for_each(x_begin, x_end,
+                          [](auto &x) -> void {
+                              const value_type val = T(1) - x * x * x;
+
+                              x = val * val * val;
+                          });
+        }
     }
 
     // Calculate residual weights for the next robustifying iteration.
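Every parallel branch added in this patch has the same gated map-reduce shape: if the thread level and the input size clear the gate, hand chunk lambdas to ThreadGranularity::thr_pool_.parallel_loop, each returning a partial result, then reduce over the futures; otherwise fall back to the serial loop. The sketch below shows that shape in isolation using only the standard library. std::async stands in for the pool's parallel_loop, and the gate value and chunking are illustrative assumptions, not the library's actual tuning.

// Standalone sketch of the gated map-reduce used by the visitors above.
// std::async stands in for ThreadGranularity::thr_pool_.parallel_loop;
// min_size is a hypothetical gate, not the library's constant.
#include <algorithm>
#include <cstddef>
#include <future>
#include <numeric>
#include <thread>
#include <vector>

template<typename T>
T parallel_sum(const std::vector<T> &data, std::size_t min_size = 250000) {

    const std::size_t n = data.size();
    const std::size_t n_thr = std::thread::hardware_concurrency();

    // Small inputs or few cores: the serial loop wins, exactly like the
    // else branches in the visitors.
    if (n_thr <= 2 || n < min_size)
        return (std::accumulate(data.begin(), data.end(), T(0)));

    const std::size_t chunk = n / n_thr + 1;
    std::vector<std::future<T>> futures;

    for (std::size_t b = 0; b < n; b += chunk) {
        const std::size_t e = std::min(b + chunk, n);

        // Map step: each task returns its local sum, like the lambdas
        // handed to parallel_loop.
        futures.push_back(
            std::async(std::launch::async,
                       [&data, b, e]() -> T {
                           T sum { 0 };

                           for (std::size_t i = b; i < e; ++i)
                               sum += data[i];
                           return (sum);
                       }));
    }

    T total { 0 };

    // Reduce step: combine the partial results.
    for (auto &fut : futures) total += fut.get();
    return (total);
}

The same shape covers the tuple-returning sums in ExponentialFitVisitor and LinearFitVisitor; only the per-chunk accumulator type changes.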
@@ -5582,14 +5872,35 @@ struct LowessVisitor {
     inline void
     calc_residual_weights_(const IDX &idx_begin, const IDX &idx_end,
                            const Y &y_begin, const Y &y_end,
-                           const K &y_fits_begin, const K & /*y_fits_end*/) {
+                           const K &y_fits_begin, const K &y_fits_end) {
 
-        std::transform(y_begin, y_end,
-                       y_fits_begin,
-                       resid_weights_.begin(),
-                       [](auto y, auto y_fit) -> value_type {
-                           return (std::fabs(y - y_fit));
-                       });
+        const size_type col_s = std::distance(y_begin, y_end);
+
+        if (thread_level_ > 2 && col_s >= ThreadPool::MUL_THR_THHOLD) {
+            auto futures =
+                ThreadGranularity::thr_pool_.parallel_loop2(
+                    size_type(0),
+                    col_s,
+                    size_type(0),
+                    size_type(std::distance(y_fits_begin, y_fits_end)),
+                    [&y_begin, &y_fits_begin, this]
+                    (auto begin, auto end, auto) -> void {
+                        for (size_type i = begin; i < end; ++i) [[likely]]
+                            this->resid_weights_[i] =
+                                std::fabs(*(y_begin + i) -
                                          *(y_fits_begin + i));
+                    });
+
+            for (auto &fut : futures) fut.get();
+        }
+        else {
+            std::transform(y_begin, y_end,
+                           y_fits_begin,
+                           resid_weights_.begin(),
+                           [](auto y, auto y_fit) -> value_type {
+                               return (std::fabs(y - y_fit));
+                           });
+        }
 
         MedianVisitor<T, I> median_v;
 
@@ -5607,9 +5918,25 @@ struct LowessVisitor {
         else {
             const value_type val = T(6) * median_v.get_result();
 
-            std::transform(resid_weights_.begin(), resid_weights_.end(),
-                           resid_weights_.begin(),
-                           [val](auto c) -> value_type { return (c / val); });
+            if (thread_level_ > 2 && col_s >= ThreadPool::MUL_THR_THHOLD) {
+                auto futures =
+                    ThreadGranularity::thr_pool_.parallel_loop(
+                        resid_weights_.begin(),
+                        resid_weights_.end(),
+                        [val](const auto &begin, const auto &end) -> void {
+                            for (auto citer = begin; citer < end; ++citer)
+                                *citer /= val;
+                        });
+
+                for (auto &fut : futures) fut.get();
+            }
+            else {
+                std::transform(resid_weights_.begin(), resid_weights_.end(),
+                               resid_weights_.begin(),
+                               [val](auto c) -> value_type {
+                                   return (c / val);
+                               });
+            }
         }
 
         // Some trimming of outlier residuals.
@@ -5628,7 +5955,8 @@ struct LowessVisitor {
         //                  std::placeholders::_1, value_type(0.001)),
         //        0);
 
-        bi_square_(resid_weights_.begin(), resid_weights_.end());
+        bi_square_(resid_weights_.begin(), resid_weights_.end(),
+                   thread_level_);
     }
 
     // Update the counters of the local regression.
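Taken together, calc_residual_weights_ and bi_square_ implement one robustifying pass of lowess: take absolute residuals, scale them by six times their median, and push them through the bi-square kernel. Below is a minimal serial sketch of that pass, assuming a floating-point T; std::nth_element stands in for the library's MedianVisitor, and the near-zero-median case that the visitor handles in its own if branch is reduced to an early return. Names are illustrative, not the library's API.

// Serial sketch of one lowess robustifying pass:
// |residual| -> scale by 6 * median -> bi-square weight.
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

template<typename T>  // T assumed floating-point
void residual_weights(const std::vector<T> &y,
                      const std::vector<T> &y_fit,
                      std::vector<T> &weights) {

    const std::size_t n = y.size();

    if (n == 0) return;

    weights.resize(n);
    for (std::size_t i = 0; i < n; ++i)  // absolute residuals
        weights[i] = std::fabs(y[i] - y_fit[i]);

    std::vector<T> tmp = weights;  // rough median via nth_element

    std::nth_element(tmp.begin(), tmp.begin() + n / 2, tmp.end());

    // The visitor branches on a near-zero median; reduced to a bail-out.
    const T scale = T(6) * tmp[n / 2];

    if (scale <= T(0)) return;

    for (auto &w : weights) {
        w /= scale;                  // normalize residual

        const T val = T(1) - w * w;  // bi-square: (1 - x^2)^2

        w = val * val;
    }
}

The source also carries a commented-out trimming step for outlier residuals, which this sketch omits just as the visitor currently does.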
@@ -5803,7 +6131,7 @@ struct LowessVisitor {
                 *(w_begin + j) = dist_i_j_.back();
             }
 
-            tri_cube_(w_begin + left_end, w_begin + right_end);
+            tri_cube_(w_begin + left_end, w_begin + right_end, thread_level_);
 
             for (size_type j = left_end; j < right_end; ++j) [[likely]]
                 *(w_begin + j) *= resid_weights_[j];
@@ -6030,7 +6358,8 @@ struct LowessVisitor {
         : frac_(frac),
           loop_n_(loop_n + 1),
           delta_(delta),
-          sorted_(sorted) { }
+          sorted_(sorted),
+          thread_level_(ThreadGranularity::get_thread_level()) { }
 
 private:
 
@@ -6049,6 +6378,8 @@ struct LowessVisitor {
     //
     const bool sorted_;
 
+    const long thread_level_;
+
     result_type y_fits_ { };
     result_type resid_weights_ { };
 
@@ -6078,7 +6409,10 @@ struct DecomposeVisitor {
 
         std::iota(xvals.begin(), xvals.end(), 0);
 
-        LowessVisitor<T, I> l_v (3, frac_, delta_ * value_type(col_s), true);
+        LowessVisitor<T, I> l_v (3,
+                                 frac_,
+                                 delta_ * value_type(col_s),
+                                 true);
 
         // Calculate trend
         //
@@ -6138,16 +6472,51 @@ struct DecomposeVisitor {
         // What is left is residual
         //
         residual_.resize(col_s, 0);
-        if (type_ == decompose_type::additive)
-            std::transform(detrended.begin(), detrended.end(),
-                           seasonal_.begin(),
-                           residual_.begin(),
-                           std::minus<T>());
-        else
-            std::transform(detrended.begin(), detrended.end(),
-                           seasonal_.begin(),
-                           residual_.begin(),
-                           std::divides<T>());
+        if (thread_level_ > 2 &&
+            detrended.size() >= ThreadPool::MUL_THR_THHOLD) {
+            std::vector<std::future<void>> futures;
+
+            if (type_ == decompose_type::additive)
+                futures =
+                    ThreadGranularity::thr_pool_.parallel_loop2(
+                        size_type(0),
+                        detrended.size(),
+                        size_type(0),
+                        seasonal_.size(),
+                        [this, &detrended]
+                        (auto begin, auto end, auto) -> void {
+                            for (size_type i = begin; i < end; ++i) [[likely]]
+                                this->residual_[i] =
+                                    detrended[i] - this->seasonal_[i];
+                        });
+            else
+                futures =
+                    ThreadGranularity::thr_pool_.parallel_loop2(
+                        size_type(0),
+                        detrended.size(),
+                        size_type(0),
+                        seasonal_.size(),
+                        [this, &detrended]
+                        (auto begin, auto end, auto) -> void {
+                            for (size_type i = begin; i < end; ++i) [[likely]]
+                                this->residual_[i] =
+                                    detrended[i] / this->seasonal_[i];
+                        });
+
+            for (auto &fut : futures) fut.get();
+        }
+        else {
+            if (type_ == decompose_type::additive)
+                std::transform(detrended.begin(), detrended.end(),
+                               seasonal_.begin(),
+                               residual_.begin(),
+                               std::minus<T>());
+            else
+                std::transform(detrended.begin(), detrended.end(),
+                               seasonal_.begin(),
+                               residual_.begin(),
+                               std::divides<T>());
+        }
     }
 
 public:
@@ -6172,16 +6541,50 @@ struct DecomposeVisitor {
 
         // Remove trend from observations in y
         //
-        if (type_ == decompose_type::additive)
-            std::transform(y_begin, y_end,
-                           trend_.begin(),
-                           detrended.begin(),
-                           std::minus<T>());
-        else
-            std::transform(y_begin, y_end,
-                           trend_.begin(),
-                           detrended.begin(),
-                           std::divides<T>());
+        if (thread_level_ > 2 && col_s >= ThreadPool::MUL_THR_THHOLD) {
+            std::vector<std::future<void>> futures;
+
+            if (type_ == decompose_type::additive)
+                futures =
+                    ThreadGranularity::thr_pool_.parallel_loop2(
+                        size_type(0),
+                        col_s,
+                        size_type(0),
+                        trend_.size(),
+                        [this, &detrended, &y_begin]
+                        (auto begin, auto end, auto) -> void {
+                            for (size_type i = begin; i < end; ++i) [[likely]]
+                                detrended[i] =
+                                    *(y_begin + i) - this->trend_[i];
+                        });
+            else
+                futures =
+                    ThreadGranularity::thr_pool_.parallel_loop2(
+                        size_type(0),
+                        col_s,
+                        size_type(0),
+                        trend_.size(),
+                        [this, &detrended, &y_begin]
+                        (auto begin, auto end, auto) -> void {
+                            for (size_type i = begin; i < end; ++i) [[likely]]
+                                detrended[i] =
+                                    *(y_begin + i) / this->trend_[i];
+                        });
+
+            for (auto &fut : futures) fut.get();
+        }
+        else {
+            if (type_ == decompose_type::additive)
+                std::transform(y_begin, y_end,
+                               trend_.begin(),
+                               detrended.begin(),
+                               std::minus<T>());
+            else
+                std::transform(y_begin, y_end,
+                               trend_.begin(),
+                               detrended.begin(),
+                               std::divides<T>());
+        }
 
         if (type_ == decompose_type::additive)
             do_seasonal_<std::plus<T>>
@@ -6213,7 +6616,11 @@ struct DecomposeVisitor {
                      value_type frac,
                      value_type delta,
                      decompose_type t = decompose_type::additive)
-        : frac_(frac), s_period_(s_period), delta_(delta), type_(t) { }
+        : frac_(frac),
+          s_period_(s_period),
+          delta_(delta),
+          type_(t),
+          thread_level_(ThreadGranularity::get_thread_level()) { }
 
 private:
 
@@ -6231,6 +6638,8 @@ struct DecomposeVisitor {
     const value_type delta_;
     const decompose_type type_;
 
+    const long thread_level_;
+
     result_type trend_ { };
     result_type seasonal_ { };
     result_type residual_ { };
@@ -6260,10 +6669,12 @@
 is_normal(const V &column, double epsl, bool check_for_standard) {
 
     const value_type high_band_1 = static_cast<value_type>(mean + std);
     const value_type low_band_1 = static_cast<value_type>(mean - std);
     double count_1 = 0.0;
-    const value_type high_band_2 = static_cast<value_type>(mean + std * 2.0);
+    const value_type high_band_2 =
+        static_cast<value_type>(mean + std * 2.0);
     const value_type low_band_2 = static_cast<value_type>(mean - std * 2.0);
     double count_2 = 0.0;
-    const value_type high_band_3 = static_cast<value_type>(mean + std * 3.0);
+    const value_type high_band_3 =
+        static_cast<value_type>(mean + std * 3.0);
     const value_type low_band_3 = static_cast<value_type>(mean - std * 3.0);
     double count_3 = 0.0;
 
@@ -6319,10 +6730,12 @@
 is_lognormal(const V &column, double epsl) {
 
     const value_type high_band_1 = static_cast<value_type>(mean + std);
     const value_type low_band_1 = static_cast<value_type>(mean - std);
     double count_1 = 0.0;
-    const value_type high_band_2 = static_cast<value_type>(mean + std * 2.0);
+    const value_type high_band_2 =
+        static_cast<value_type>(mean + std * 2.0);
    const value_type low_band_2 = static_cast<value_type>(mean - std * 2.0);
     double count_2 = 0.0;
-    const value_type high_band_3 = static_cast<value_type>(mean + std * 3.0);
+    const value_type high_band_3 =
+        static_cast<value_type>(mean + std * 3.0);
     const value_type low_band_3 = static_cast<value_type>(mean - std * 3.0);
     double count_3 = 0.0;
 
@@ -6380,12 +6793,33 @@ struct BiasVisitor {
 
         avger.post();
         result_ = std::move(avger.get_result());
 
-        std::transform(column_begin + (roll_period_ - 1), column_end,
-                       result_.begin() + (roll_period_ - 1),
-                       result_.begin() + (roll_period_ - 1),
-                       [](auto val, auto result) -> value_type {
-                           return (val / result - T(1));
-                       });
+        if (ThreadGranularity::get_thread_level() > 2 &&
+            col_s >= ThreadPool::MUL_THR_THHOLD) {
+            auto futures =
+                ThreadGranularity::thr_pool_.parallel_loop2(
+                    roll_period_ - 1,
+                    col_s,
+                    roll_period_ - 1,
+                    col_s,
+                    [this, &column_begin]
+                    (auto begin, auto end, auto) -> void {
+                        for (size_type i = begin; i < end; ++i) [[likely]] {
+                            value_type &re = this->result_[i];
+
+                            re = *(column_begin + i) / re - T(1);
+                        }
+                    });
+
+            for (auto &fut : futures) fut.get();
+        }
+        else {
+            std::transform(column_begin + (roll_period_ - 1), column_end,
+                           result_.begin() + (roll_period_ - 1),
+                           result_.begin() + (roll_period_ - 1),
+                           [](auto val, auto result) -> value_type {
+                               return (val / result - T(1));
+                           });
+        }
     }
 
     DEFINE_PRE_POST
@@ -6402,7 +6836,8 @@ struct BiasVisitor {
     result_type result_ { };
 };
 
-template<typename T, typename I = unsigned long>
+template<typename T,
+         typename I = unsigned long>
 using bias_v = BiasVisitor<T, I>;
 
 // ----------------------------------------------------------------------------
 
@@ -6426,18 +6861,60 @@ struct NonZeroRangeVisitor {
 
         bool there_is_zero = false;
         result_type result;
 
-        result.reserve(col_s);
-        for (size_type i = 0; i < col_s; ++i) [[likely]] {
-            const value_type v = *(column1_begin + i) - *(column2_begin + i);
+        if (ThreadGranularity::get_thread_level() > 2 &&
+            col_s >= ThreadPool::MUL_THR_THHOLD) {
+            result.resize(col_s);
+
+            auto futures =
+                ThreadGranularity::thr_pool_.parallel_loop(
+                    size_type(0),
+                    col_s,
+                    [&result, &column1_begin, &column2_begin]
+                    (auto begin, auto end) -> bool {
+                        bool there_is_zero = false;
+
+                        for (size_type i = begin; i < end; ++i) {
+                            const value_type v =
+                                *(column1_begin + i) - *(column2_begin + i);
+
+                            result[i] = v;
+                            if (v == 0) there_is_zero = true;
+                        }
+                        return (there_is_zero);
+                    });
+
+            for (auto &fut : futures) there_is_zero |= fut.get();
+            if (there_is_zero) {
+                auto futures =
+                    ThreadGranularity::thr_pool_.parallel_loop(
+                        size_type(0),
+                        col_s,
+                        [&result]
+                        (auto begin, auto end) -> void {
+                            for (size_type i = begin; i < end; ++i)
+                                result[i] +=
+                                    std::numeric_limits<value_type>::epsilon();
+                        });
 
-            result.push_back(v);
-            if (v == 0) there_is_zero = true;
+                for (auto &fut : futures) fut.get();
+            }
+        }
+        else {
+            result.reserve(col_s);
+            for (size_type i = 0; i < col_s; ++i) [[likely]] {
+                const value_type v =
+                    *(column1_begin + i) - *(column2_begin + i);
+
+                result.push_back(v);
+                if (v == 0) there_is_zero = true;
+            }
+            if (there_is_zero)
+                std::for_each(result.begin(), result.end(),
+                              [](value_type &v) -> void {
+                                  v += std::numeric_limits<value_type>
+                                           ::epsilon();
+                              });
         }
-        if (there_is_zero)
-            std::for_each(result.begin(), result.end(),
-                          [](value_type &v) -> void {
-                              v += std::numeric_limits<value_type>::epsilon();
-                          });
 
         result_.swap(result);
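For reference, here is the serial core of NonZeroRangeVisitor distilled from the else branch above: difference two columns, flag any exact zero, and nudge every entry by machine epsilon when one is found, so downstream divisions stay safe. In the parallel path the flag is OR-reduced across the chunk futures before the second, fix-up loop runs. A minimal sketch with illustrative names:

// Serial distillation of NonZeroRangeVisitor's logic.
#include <cstddef>
#include <limits>
#include <vector>

template<typename T>
std::vector<T> non_zero_range(const std::vector<T> &col1,
                              const std::vector<T> &col2) {

    const std::size_t n = col1.size();
    std::vector<T> result;
    bool there_is_zero = false;

    result.reserve(n);
    for (std::size_t i = 0; i < n; ++i) {
        const T v = col1[i] - col2[i];

        result.push_back(v);
        if (v == 0) there_is_zero = true;
    }

    // Fix-up pass only when needed; the parallel version above runs this
    // as its own parallel_loop, gated on the OR-reduced flags.
    if (there_is_zero)
        for (auto &v : result)
            v += std::numeric_limits<T>::epsilon();
    return (result);
}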