From 67a1db829faee60a713b0d3386b32d30e231065f Mon Sep 17 00:00:00 2001 From: Hossein Moein Date: Thu, 4 Jan 2024 14:28:24 -0500 Subject: [PATCH 01/13] Implemented inversion_count() --- docs/HTML/DataFrame.html | 2 +- include/DataFrame/DataFrame.h | 15 +++ include/DataFrame/Internals/DataFrame_get.tcc | 59 ++++++--- .../Internals/DataFrame_standalone.tcc | 120 +++++++++++++++++- 4 files changed, 178 insertions(+), 18 deletions(-) diff --git a/docs/HTML/DataFrame.html b/docs/HTML/DataFrame.html index cf9c5152..6b16a76e 100644 --- a/docs/HTML/DataFrame.html +++ b/docs/HTML/DataFrame.html @@ -88,7 +88,7 @@
  • To start off on basic operations, see Hello World
  • Also, see DataFrame Library Types
  • DataFrame has both sync and async interfaces, latter returning C++ std::futures
  • -
  • Read views, visitors, multithreading, and memory alignment sections below, before getting serious about this library
  • +
  • Read multithreading, views, visitors, and memory alignment sections below, before getting serious about this library


  • diff --git a/include/DataFrame/DataFrame.h b/include/DataFrame/DataFrame.h index cbee0995..f4da36d9 100644 --- a/include/DataFrame/DataFrame.h +++ b/include/DataFrame/DataFrame.h @@ -2665,6 +2665,21 @@ class DataFrame : public ThreadGranularity { [[nodiscard]] StringStats get_str_col_stats(const char *col_name) const; + // This retunrs the number of inversions in the named column. For example, + // in a column that is already sorted, the number of inversions is zero. + // In a column that is sorted in reverse, the number of inversions is + // n(n - 1) / 2. + // + // T: + // Data type of the named column + // C: + // Type of the comparison functor defaulted to std::less + // col_name: + // Name of the column + // + template> + size_type inversion_count(const char *col_name) const; + // This function returns a DataFrame indexed by std::string that provides // a few statistics about the columns of the calling DataFrame. // The statistics are: diff --git a/include/DataFrame/Internals/DataFrame_get.tcc b/include/DataFrame/Internals/DataFrame_get.tcc index d87250f7..ea8fd91d 100644 --- a/include/DataFrame/Internals/DataFrame_get.tcc +++ b/include/DataFrame/Internals/DataFrame_get.tcc @@ -433,7 +433,8 @@ get_data_by_idx(const StlVecType &values) const { const SpinGuard guard(lock_); for (const auto &citer : column_list_) [[likely]] { - create_col_functor_ functor(citer.first.c_str(), df); + create_col_functor_ functor(citer.first.c_str(), + df); data_[citer.second].change(functor); } @@ -553,10 +554,8 @@ DataFrame::get_view_by_idx (Index2D range) const { const SpinGuard guard(lock_); for (const auto &iter : column_list_) [[likely]] { - view_setup_functor_ functor (iter.first.c_str(), - b_dist, - e_dist, - dfcv); + view_setup_functor_ functor ( + iter.first.c_str(), b_dist, e_dist, dfcv); data_[iter.second].change(functor); } @@ -759,7 +758,8 @@ get_data_by_loc (const StlVecType &locations) const { const SpinGuard guard(lock_); for (const auto &citer : column_list_) 
[[likely]] { - create_col_functor_ functor(citer.first.c_str(), df); + create_col_functor_ functor(citer.first.c_str(), + df); data_[citer.second].change(functor); } @@ -1013,7 +1013,8 @@ get_data_by_sel (const char *name, F &sel_functor) const { const SpinGuard guard(lock_); for (const auto &citer : column_list_) [[likely]] { - create_col_functor_ functor(citer.first.c_str(), df); + create_col_functor_ functor(citer.first.c_str(), + df); data_[citer.second].change(functor); } @@ -1184,7 +1185,8 @@ get_data_by_sel (const char *name1, const char *name2, F &sel_functor) const { df.load_index(std::move(new_index)); for (const auto &citer : column_list_) [[likely]] { - create_col_functor_ functor(citer.first.c_str(), df); + create_col_functor_ functor(citer.first.c_str(), + df); data_[citer.second].change(functor); } @@ -1375,7 +1377,8 @@ get_data_by_sel (const char *name1, df.load_index(std::move(new_index)); for (const auto &citer : column_list_) [[likely]] { - create_col_functor_ functor(citer.first.c_str(), df); + create_col_functor_ functor(citer.first.c_str(), + df); data_[citer.second].change(functor); } @@ -1779,7 +1782,8 @@ get_data_by_sel(const char *name1, df.load_index(std::move(new_index)); for (const auto &citer : column_list_) [[likely]] { - create_col_functor_ functor(citer.first.c_str(), df); + create_col_functor_ functor(citer.first.c_str(), + df); data_[citer.second].change(functor); } @@ -2008,7 +2012,8 @@ get_data_by_sel(const char *name1, df.load_index(std::move(new_index)); for (const auto &citer : column_list_) [[likely]] { - create_col_functor_ functor(citer.first.c_str(), df); + create_col_functor_ functor(citer.first.c_str(), + df); data_[citer.second].change(functor); } @@ -2135,7 +2140,8 @@ get_data_by_sel(const char *name1, df.load_index(std::move(new_index)); for (const auto &citer : column_list_) [[likely]] { - create_col_functor_ functor(citer.first.c_str(), df); + create_col_functor_ functor(citer.first.c_str(), + df); 
data_[citer.second].change(functor); } @@ -2266,7 +2272,8 @@ get_data_by_sel(const char *name1, df.load_index(std::move(new_index)); for (const auto &citer : column_list_) [[likely]] { - create_col_functor_ functor(citer.first.c_str(), df); + create_col_functor_ functor(citer.first.c_str(), + df); data_[citer.second].change(functor); } @@ -2401,7 +2408,8 @@ get_data_by_sel(const char *name1, df.load_index(std::move(new_index)); for (const auto &citer : column_list_) [[likely]] { - create_col_functor_ functor(citer.first.c_str(), df); + create_col_functor_ functor(citer.first.c_str(), + df); data_[citer.second].change(functor); } @@ -3250,7 +3258,8 @@ combine(const char *col_name, guard.release(); const size_type col_s = - std::min({ lhs_col.size(), df1_col.size(), df2_col.size() }); + std::min({ lhs_col.size(), df1_col.size(), + df2_col.size() }); StlVecType result; result.reserve(col_s); @@ -3287,7 +3296,8 @@ combine(const char *col_name, result.reserve(col_s); for (size_type i = 0; i < col_s; ++i) [[likely]] result.push_back( - std::move(functor(lhs_col[i], df1_col[i], df2_col[i], df3_col[i]))); + std::move(functor(lhs_col[i], df1_col[i], df2_col[i], + df3_col[i]))); return (result); } @@ -3360,6 +3370,23 @@ DataFrame::get_str_col_stats(const char *col_name) const { return (result); } +// ---------------------------------------------------------------------------- + +template +template +typename DataFrame::size_type +DataFrame::inversion_count(const char *col_name) const { + + const auto &col = get_column(col_name); + const auto col_s = col.size(); + StlVecType original = col; + StlVecType temp(col_s); + const auto thread_level = get_thread_level(); + + return (_inv_merge_sort_(original, temp, 0, col_s - 1, C{ }, + thread_level)); +} + } // namespace hmdf // ---------------------------------------------------------------------------- diff --git a/include/DataFrame/Internals/DataFrame_standalone.tcc b/include/DataFrame/Internals/DataFrame_standalone.tcc index 
ed164f4f..1f5d3022 100644 --- a/include/DataFrame/Internals/DataFrame_standalone.tcc +++ b/include/DataFrame/Internals/DataFrame_standalone.tcc @@ -277,7 +277,8 @@ _load_groupby_data_2_( std::size_t marker = 0; auto &dst_idx = dest.get_index(); - const std::size_t vec_size = std::min(input_col1.size(), input_col2.size()); + const std::size_t vec_size = + std::min(input_col1.size(), input_col2.size()); const auto &src_idx = source.get_index(); if (dst_idx.empty()) { @@ -935,6 +936,123 @@ inline static T _string_to_(const char *value) { return (ret); } +// ---------------------------------------------------------------------------- + +template +static std::size_t +_inv_merge_(Con &original, + Con &temp, + std::size_t left, + std::size_t mid, + std::size_t right, + Comp &&comp) { + + std::size_t i { left }; + std::size_t j { mid }; + std::size_t k { left }; + std::size_t inv_count { 0 }; + + while ((i <= mid - 1) && (j <= right)) { + if (comp (original[i], original[j])) { + temp[k++] = original[i++]; + } + else { + temp[k++] = original[j++]; + inv_count += mid - i; + } + } + + // Copy the remaining elements of left sub-original (if there are any) + // to temp + // + while (i <= mid - 1) + temp[k++] = original[i++]; + + // Copy the remaining elements of right sub-original (if there are any) + // to temp + // + while (j <= right) + temp[k++] = original[j++]; + + // Copy back the merged elements to original original + // + for (i = left; i <= right; i++) + original[i] = temp[i]; + + return (inv_count); +} + +// ---------------------------------------------------------------------------- + +template +static std::size_t +_inv_merge_sort_(Con &original, + Con &temp, + std::size_t left, + std::size_t right, + Comp comp, + long thread_level) { + + using fut_type = std::future; + + std::size_t mid { 0 }; + std::size_t inv_count { 0 }; + + if (right > left) { + const auto thr_lvl = + ((right - left) < (ThreadPool::MUL_THR_THHOLD / 2)) + ? 
0L : thread_level; + + // Divide the original into two parts and call _inv_merge_sort_() + // for each of the parts + // + mid = (right + left) / 2; + + // Inversion count will be sum of inversions in left-part, right-part + // and number of inversions in merging + // + if (thr_lvl > 2) { + fut_type left_fut = + ThreadGranularity::thr_pool_.dispatch( + false, + _inv_merge_sort_, + std::ref(original), + std::ref(temp), + left, + mid, + comp, + thread_level); + fut_type right_fut = + ThreadGranularity::thr_pool_.dispatch( + false, + _inv_merge_sort_, + std::ref(original), + std::ref(temp), + mid + 1, + right, + comp, + thread_level); + + ThreadGranularity::thr_pool_.run_task(); + ThreadGranularity::thr_pool_.run_task(); + inv_count += left_fut.get() + right_fut.get(); + } + else { + inv_count += + _inv_merge_sort_(original, temp, left, mid, comp, thread_level); + inv_count += + _inv_merge_sort_(original, temp, mid + 1, right, comp, + thread_level); + } + + // Merge the two parts + // + inv_count += _inv_merge_(original, temp, left, mid + 1, right, comp); + } + + return (inv_count); +} + } // namespace hmdf // ---------------------------------------------------------------------------- From cc7dca8afa6f31def542306cda798b8f7ee97bce Mon Sep 17 00:00:00 2001 From: Hossein Moein Date: Fri, 5 Jan 2024 11:26:38 -0500 Subject: [PATCH 02/13] Added tests and docs for inversion_count() --- docs/HTML/DataFrame.html | 4 ++ docs/HTML/inversion_count.html | 115 +++++++++++++++++++++++++++++++++ test/dataframe_tester_3.cc | 58 ++++++++++++++++- 3 files changed, 175 insertions(+), 2 deletions(-) create mode 100644 docs/HTML/inversion_count.html diff --git a/docs/HTML/DataFrame.html b/docs/HTML/DataFrame.html index 6b16a76e..8e2ded44 100644 --- a/docs/HTML/DataFrame.html +++ b/docs/HTML/DataFrame.html @@ -349,6 +349,10 @@

    API Reference with code samples

    has_column( 2 ) + + inversion_count( ) + + is_equal( ) diff --git a/docs/HTML/inversion_count.html b/docs/HTML/inversion_count.html new file mode 100644 index 00000000..4a2b5f9d --- /dev/null +++ b/docs/HTML/inversion_count.html @@ -0,0 +1,115 @@ + + + + + + + + + + + + + + + +
    Signature Description Parameters
    +
    
    +template<typename T, typename C = std::less<T>>
    +size_type
    +inversion_count(const char *col_name) const;
    +        
    +
    + This returns the number of inversions in the named column.
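    + For example, in a column that is already sorted and has no duplicate values, the number of inversions is zero. In a column that is sorted in reverse, the number of inversions is n(n - 1) / 2. Note that with the default std::less comparison, pairs of equal values are also counted as inversions.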
    +
    + T: Data type of the named column
    + C: Type of the comparison functor defaulted to std::less
    + col_name: Name of the column
    +
    + +
    static void test_inversion_count()  {  // Checks inversion_count() on hand-built 18-element int columns
    +
    +    std::cout << "\nTesting inversion_count(  ) ..." << std::endl;
    +
    +    using IntDataFrame = StdDataFrame<int>;
    +
    +    std::vector<int>    idx = { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17 };
    +    std::vector<int>    i1 = { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17 };  // strictly ascending
    +    std::vector<int>    i2 = { 17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 };  // strictly descending
    +    std::vector<int>    i3 = { 1,0,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17 };  // first pair swapped
    +    std::vector<int>    i4 = { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,17,16 };  // last pair swapped
    +    std::vector<int>    i5 = { 1,0,2,3,4,5,6,7,8,9,10,11,12,13,14,15,17,16 };  // first and last pairs swapped
    +    std::vector<int>    i6 = { 17,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,0 };  // 0 and 17 exchanged
    +    std::vector<int>    i7 = { 0,1,2,3,4,5,6,10,8,9,7,11,12,13,14,15,16,17 };  // 7 and 10 exchanged
    +    std::vector<int>    i8 = { 0,1,2,15,4,5,6,7,8,9,10,11,12,13,14,3,16,17 };  // 3 and 15 exchanged
    +    std::vector<int>    i9 = { 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2 };  // all values equal
    +    std::vector<int>    i10 = { 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,3 };  // seventeen equal values, then a larger one
    +    std::vector<int>    i11 = { 2,2,2,2,3,2,2,2,2,4,2,2,2,5,2,2,2,6 };  // fourteen 2s with 3, 4, 5, 6 interspersed
    +    IntDataFrame        df;
    +
    +    df.load_data(std::move(idx),
    +                 std::make_pair("i1", i1),
    +                 std::make_pair("i2", i2),
    +                 std::make_pair("i3", i3),
    +                 std::make_pair("i4", i4),
    +                 std::make_pair("i5", i5),
    +                 std::make_pair("i6", i6),
    +                 std::make_pair("i7", i7),
    +                 std::make_pair("i8", i8),
    +                 std::make_pair("i9", i9),
    +                 std::make_pair("i10", i10),
    +                 std::make_pair("i11", i11));
    +
    +    assert(df.inversion_count<int>("i1") == 0);  // strictly sorted: no inversions
    +    assert(df.inversion_count<int>("i2") == 153);  // fully reversed: n(n - 1) / 2 = 18 * 17 / 2
    +    assert(df.inversion_count<int>("i3") == 1);  // one adjacent swap
    +    assert(df.inversion_count<int>("i4") == 1);  // one adjacent swap
    +    assert(df.inversion_count<int>("i5") == 2);  // two independent adjacent swaps
    +    assert(df.inversion_count<int>("i6") == 33);  // 17 precedes 17 smaller values; 0 follows 16 more larger ones
    +    assert(df.inversion_count<int>("i7") == 5);  // (10,8) (10,9) (10,7) (8,7) (9,7)
    +    assert(df.inversion_count<int>("i8") == 23);  // 11 values between the pair, counted twice, plus (15,3)
    +    assert(df.inversion_count<int>("i9") == 153);  // every pair is a tie, and ties are counted as inversions
    +    assert(df.inversion_count<int>("i10") == 136);  // 17 * 16 / 2 tied pairs; the trailing 3 adds none
    +    assert(df.inversion_count<int>("i11") == 110);  // 91 tied pairs of 2s + 19 pairs where a larger value precedes a 2
    +
    +    assert((df.inversion_count<int, std::greater<int>>("i1") == 153));  // ascending data is fully inverted under std::greater
    +    assert((df.inversion_count<int, std::greater<int>>("i2") == 0));  // descending data is in order under std::greater
    +}
    + + C++ DataFrame + + + + + diff --git a/test/dataframe_tester_3.cc b/test/dataframe_tester_3.cc index 272252d6..a05659fd 100644 --- a/test/dataframe_tester_3.cc +++ b/test/dataframe_tester_3.cc @@ -2211,11 +2211,13 @@ static void test_read_csv_with_maps() { // df.write // (std::cout, io_format::csv2); assert(df.get_index().size() == 4); - assert((std::fabs(df.get_column("Close")[3] - 1.0234) < 0.0001)); + assert((std::fabs(df.get_column("Close")[3] - 1.0234) < + 0.0001)); assert((df.get_column("Volume")[3] == 3605190400)); assert((std::fabs( - df.get_column("Map 1")[3]["label four 2"] - -782.5) < 0.001)); + df.get_column("Map 1")[3]["label four 2"] - -782.5) < + 0.001)); assert((std::fabs( df.get_column("Map 1")[0]["label one 1"] - 123.0) < 0.001)); assert((std::fabs( @@ -2479,6 +2481,57 @@ static void test_get_str_col_stats() { // ----------------------------------------------------------------------------- +static void test_inversion_count() { + + std::cout << "\nTesting inversion_count( ) ..." 
<< std::endl; + + using IntDataFrame = StdDataFrame; + + std::vector idx = { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17 }; + std::vector i1 = { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17 }; + std::vector i2 = { 17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 }; + std::vector i3 = { 1,0,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17 }; + std::vector i4 = { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,17,16 }; + std::vector i5 = { 1,0,2,3,4,5,6,7,8,9,10,11,12,13,14,15,17,16 }; + std::vector i6 = { 17,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,0 }; + std::vector i7 = { 0,1,2,3,4,5,6,10,8,9,7,11,12,13,14,15,16,17 }; + std::vector i8 = { 0,1,2,15,4,5,6,7,8,9,10,11,12,13,14,3,16,17 }; + std::vector i9 = { 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2 }; + std::vector i10 = { 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,3 }; + std::vector i11 = { 2,2,2,2,3,2,2,2,2,4,2,2,2,5,2,2,2,6 }; + IntDataFrame df; + + df.load_data(std::move(idx), + std::make_pair("i1", i1), + std::make_pair("i2", i2), + std::make_pair("i3", i3), + std::make_pair("i4", i4), + std::make_pair("i5", i5), + std::make_pair("i6", i6), + std::make_pair("i7", i7), + std::make_pair("i8", i8), + std::make_pair("i9", i9), + std::make_pair("i10", i10), + std::make_pair("i11", i11)); + + assert(df.inversion_count("i1") == 0); + assert(df.inversion_count("i2") == 153); + assert(df.inversion_count("i3") == 1); + assert(df.inversion_count("i4") == 1); + assert(df.inversion_count("i5") == 2); + assert(df.inversion_count("i6") == 33); + assert(df.inversion_count("i7") == 5); + assert(df.inversion_count("i8") == 23); + assert(df.inversion_count("i9") == 153); + assert(df.inversion_count("i10") == 136); + assert(df.inversion_count("i11") == 110); + + assert((df.inversion_count>("i1") == 153)); + assert((df.inversion_count>("i2") == 0)); +} + +// ----------------------------------------------------------------------------- + int main(int, char *[]) { MyDataFrame::set_optimum_thread_level(); @@ -2533,6 +2586,7 @@ int main(int, char *[]) { 
test_PriceVolumeTrendVisitor(); test_QuantQualEstimationVisitor(); test_get_str_col_stats(); + test_inversion_count(); return (0); } From 3215a9f018629efe6cbc78d949c5adc0c47eb800 Mon Sep 17 00:00:00 2001 From: Hossein Moein Date: Wed, 10 Jan 2024 11:00:16 -0500 Subject: [PATCH 03/13] Added backend plumbing for get_[data|view]_by_like() --- .../Internals/DataFrame_standalone.tcc | 225 ++++++++++++++++++ test/dataframe_tester_3.cc | 23 ++ 2 files changed, 248 insertions(+) diff --git a/include/DataFrame/Internals/DataFrame_standalone.tcc b/include/DataFrame/Internals/DataFrame_standalone.tcc index 1f5d3022..eda49b38 100644 --- a/include/DataFrame/Internals/DataFrame_standalone.tcc +++ b/include/DataFrame/Internals/DataFrame_standalone.tcc @@ -1053,6 +1053,231 @@ _inv_merge_sort_(Con &original, return (inv_count); } +// ---------------------------------------------------------------------------- + +struct _LikeClauseUtil_ { + + // This lookup table is used to help decode the first byte of + // a multi-byte UTF8 character. 
+ // + inline static const unsigned char CHAR_TRANS1[] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, + 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x00, 0x01, 0x02, 0x03, + 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, + 0x00, 0x01, 0x00, 0x00, + }; + + inline static const unsigned char UPPER_TO_LOWER[] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, + 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, 64, 97, 98, 99, 100, 101, 102, 103, 104, + 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, + 119, 120, 121, 122, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, + 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, + 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, + 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, + 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, + 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, + 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, + 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, + 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, + 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, + 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, + 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, + }; + + static unsigned int char_read(const unsigned char **str_ptr_ptr) { + + unsigned int c = *((*str_ptr_ptr)++); + + // For this routine, we assume the char string is always + // zero-terminated. 
+ // + if (c >= 0xc0) { + c = CHAR_TRANS1[c - 0xc0]; + while ((*(*str_ptr_ptr) & 0xc0) == 0x80) + c = (c << 6) + (0x3f & *((*str_ptr_ptr)++)); + if (c < 0x80 || + (c & 0xFFFFF800) == 0xD800 || (c & 0xFFFFFFFE) == 0xFFFE) + c = 0xFFFD; + } + + return (c); + } + + static inline void upper_to_lower(unsigned int &val) { + + if (! (val & ~0x7f)) + val = UPPER_TO_LOWER[val]; + } +}; + +// ---------------------------------------------------------------------------- + +// This compares two null-terminated strings for equality where the first +// string can potentially be a "glob" expression (forget about regular +// expressions). +// It returns true if they are matched-equal and false if they are not +// matched-equal. +// +// Globbing rules: +// +// '*' Matches any sequence of zero or more characters. +// +// '?' Matches exactly one character. +// +// [...] Matches one character from the enclosed list of +// characters. +// +// [^...] Matches one character not in the enclosed list. +// +// With the [...] and [^...] matching, a ']' character can be included +// in the list by making it the first character after '[' or '^'. A +// range of characters can be specified using '-'. Example: +// "[a-z]" matches any single lower-case letter. To match a '-', make +// it the last character in the list. +// +// Hints: to match '*' or '?', put them in "[]". Like this: +// abc[*]xyz matches "abc*xyz" only +// +// NOTE: This could be, in some cases, n-squared. But it is pretty fast with +// moderately sized strings. I have not tested this with huge/massive +// strings. +// +static bool +_like_clause_compare_(const char *pattern, + const char *input_str, + bool case_insensitive = false, + unsigned int esc_char = '\\') { + + const unsigned char *upattern = + reinterpret_cast(pattern); + const unsigned char *uinput_str = + reinterpret_cast(input_str); + unsigned int c, c2; + const unsigned char match_one { '?' 
}; + const unsigned char match_all { '*' }; + const unsigned char match_set { '[' }; + // True if the previous character was escape + // + bool prev_escape { false }; + + while ((c = _LikeClauseUtil_::char_read(&upattern)) != 0) { + if (c == match_all && ! prev_escape) { + while ((c = _LikeClauseUtil_::char_read(&upattern)) == match_all || + c == match_one) { + if (c == match_one && + _LikeClauseUtil_::char_read(&uinput_str) == 0) + return (false); + } + if (c == 0) { + return (true); + } + else if (c == esc_char) { + c = _LikeClauseUtil_::char_read(&upattern); + if (c == 0) + return (false); + } + else if (c == match_set) { + while (*uinput_str && + _like_clause_compare_( + reinterpret_cast(&upattern[-1]), + reinterpret_cast(uinput_str), + case_insensitive, + esc_char) == 0) { + if ((*(uinput_str++)) >= 0xc0) + while ((*uinput_str & 0xc0) == 0x80) + uinput_str++; + } + return (*uinput_str != 0); + } + while ((c2 = _LikeClauseUtil_::char_read(&uinput_str)) != 0) { + if (case_insensitive) { + _LikeClauseUtil_::upper_to_lower(c2); + _LikeClauseUtil_::upper_to_lower(c); + while (c2 != 0 && c2 != c) { + c2 = _LikeClauseUtil_::char_read(&uinput_str); + _LikeClauseUtil_::upper_to_lower(c2); + } + } + else { + while (c2 != 0 && c2 != c) + c2 = _LikeClauseUtil_::char_read(&uinput_str); + } + if (c2 == 0) + return (false); + if (_like_clause_compare_( + reinterpret_cast(upattern), + reinterpret_cast(uinput_str), + case_insensitive, + esc_char)) + return (true); + } + return (false); + } + else if (c == match_one && ! 
prev_escape) { + if (_LikeClauseUtil_::char_read(&uinput_str) == 0) + return (false); + } + else if (c == match_set) { + unsigned int prior_c { 0 }; + int seen { 0 }; + int invert { 0 }; + + c = _LikeClauseUtil_::char_read(&uinput_str); + if (c == 0) + return (false); + c2 = _LikeClauseUtil_::char_read(&upattern); + if (c2 == '^') { + invert = 1; + c2 = _LikeClauseUtil_::char_read(&upattern); + } + if (c2 == ']') { + if (c == ']') + seen = 1; + c2 = _LikeClauseUtil_::char_read(&upattern); + } + while (c2 && c2 != ']') { + if (c2 == '-' && + upattern[0] != ']' && + upattern[0] != 0 && + prior_c > 0) { + c2 = _LikeClauseUtil_::char_read(&upattern); + if (c >= prior_c && c <= c2) + seen = 1; + prior_c = 0; + } + else { + if (c == c2) + seen = 1; + prior_c = c2; + } + c2 = _LikeClauseUtil_::char_read(&upattern); + } + if (c2 == 0 || (seen ^ invert) == 0) + return (false); + } + else if (esc_char == c && ! prev_escape) { + prev_escape = true; + } + else { + c2 = _LikeClauseUtil_::char_read(&uinput_str); + if (case_insensitive) { + _LikeClauseUtil_::upper_to_lower(c); + _LikeClauseUtil_::upper_to_lower(c2); + } + if (c != c2) + return (false); + prev_escape = false; + } + } + + return (*uinput_str == 0); +} + } // namespace hmdf // ---------------------------------------------------------------------------- diff --git a/test/dataframe_tester_3.cc b/test/dataframe_tester_3.cc index a05659fd..b2ed8294 100644 --- a/test/dataframe_tester_3.cc +++ b/test/dataframe_tester_3.cc @@ -2532,6 +2532,28 @@ static void test_inversion_count() { // ----------------------------------------------------------------------------- +static void test__like_clause_compare_() { + + std::cout << "\nTesting _like_clause_compare_( ) ..." 
<< std::endl; + + const std::string str1("345&%$abcM"); + const std::string str2("!@#$0987^HGTtiff\""); + const std::string str3("ABFDTiy"); + + assert(_like_clause_compare_("345*", str1.c_str())); + assert(_like_clause_compare_("345*M", str1.c_str())); + assert(_like_clause_compare_("345*m", str1.c_str()) == false); + assert(_like_clause_compare_("345*m", str1.c_str(), true)); + assert(_like_clause_compare_("?*[0-9][0-9][0-9][0-9]?*", str2.c_str())); + assert((_like_clause_compare_("?*[0-9][0-9][0-9][0-9][0-9]?*", + str2.c_str()) == false)); + assert(_like_clause_compare_("?*\"", str2.c_str())); + assert(_like_clause_compare_("?*[^ABFDTiy]?*", str3.c_str()) == false); + assert(_like_clause_compare_("*[^WdrhID]*", str3.c_str())); +} + +// ----------------------------------------------------------------------------- + int main(int, char *[]) { MyDataFrame::set_optimum_thread_level(); @@ -2587,6 +2609,7 @@ int main(int, char *[]) { test_QuantQualEstimationVisitor(); test_get_str_col_stats(); test_inversion_count(); + test__like_clause_compare_(); return (0); } From 512e71316e9c8f239ef171bf7bab0ddfba44238b Mon Sep 17 00:00:00 2001 From: Hossein Moein Date: Fri, 12 Jan 2024 10:06:33 -0500 Subject: [PATCH 04/13] Implemented get_[data|view]_by_like() --- include/DataFrame/DataFrame.h | 127 ++++- include/DataFrame/Internals/DataFrame_get.tcc | 460 ++++++++++++++++++ .../Internals/DataFrame_standalone.tcc | 2 +- test/dataframe_tester_3.cc | 94 ++++ 4 files changed, 679 insertions(+), 4 deletions(-) diff --git a/include/DataFrame/DataFrame.h b/include/DataFrame/DataFrame.h index f4da36d9..346d3f59 100644 --- a/include/DataFrame/DataFrame.h +++ b/include/DataFrame/DataFrame.h @@ -1987,9 +1987,6 @@ class DataFrame : public ThreadGranularity { // 2) Since the result is a view, you cannot call make_consistent() on // the result. // - // NOTE: Although this is a const method, it returns a view. 
So, the data - // could still be modified through the returned view - // // T: // Type of the named column // F: @@ -2470,6 +2467,130 @@ class DataFrame : public ThreadGranularity { [[nodiscard]] DataFrame get_data_by_sel(F &sel_functor, FilterCols&&... filter_cols) const; + // This method does a basic Glob-like pattern matching (also similar to + // SQL like clause) to filter data in the named column. + // It returns a new DataFrame. Each element of the named column is checked + // against a Glob-like matching logic + // + // Globbing rules: + // + // '*' Matches any sequence of zero or more characters. + // + // '?' Matches exactly one character. + // + // [...] Matches one character from the enclosed list of + // characters. + // + // [^...] Matches one character not in the enclosed list. + // + // With the [...] and [^...] matching, a ']' character can be included + // in the list by making it the first character after '[' or '^'. A + // range of characters can be specified using '-'. Example: + // "[a-z]" matches any single lower-case letter. To match a '-', make + // it the last character in the list. + // + // Hints: to match '*' or '?', put them in "[]". Like this: + // abc[*]xyz matches "abc*xyz" only + // + // NOTE: This could be, in some cases, n-squared. But it is pretty fast with + // moderately sized strings. I have not tested this with huge/massive + // strings. + // + // T: + // Type of the named column. Based on the concept, it can only be either + // of these types: std::string, VirtualString, const char *, char * + // Ts: + // List all the types of all data columns. A type should be specified in + // the list only once. 
+ // name: + // Name of the data column + // pattern: + // Glob like pattern to use for matching strings + // case_insensitive: + // If true, matching logic ignores case + // esc_char: + // Character used for escape + // + template + [[nodiscard]] DataFrame + get_data_by_like(const char *name, + const char *pattern, + bool case_insensitive = false, + char esc_char = '\\') const; + + // This is identical with above get_data_by_like(), but: + // 1) The result is a view + // 2) Since the result is a view, you cannot call make_consistent() on + // the result. + // + template + [[nodiscard]] PtrView + get_view_by_like(const char *name, + const char *pattern, + bool case_insensitive = false, + char esc_char = '\\'); + + template + [[nodiscard]] ConstPtrView + get_view_by_like(const char *name, + const char *pattern, + bool case_insensitive = false, + char esc_char = '\\') const; + + // This does the same function as above get_data_by_like() but operating + // on two columns. + // + // T: + // Type of both named columns. Based on the concept, it can only be + // either of these types: std::string, VirtualString, const char *, char * + // Ts: + // List all the types of all data columns. A type should be specified in + // the list only once. 
+ // name1: + // Name of the first data column + // name2: + // Name of the second data column + // pattern1 + // Glob like pattern to use for matching strings for the first column + // pattern2 + // Glob like pattern to use for matching strings for the second column + // case_insensitive: + // If true, matching logic ignores case + // esc_char: + // Character used for escape + // + template + [[nodiscard]] DataFrame + get_data_by_like(const char *name1, + const char *name2, + const char *pattern1, + const char *pattern2, + bool case_insensitive = false, + char esc_char = '\\') const; + + // This is identical with above get_data_by_like(), but: + // 1) The result is a view + // 2) Since the result is a view, you cannot call make_consistent() on + // the result. + // + template + [[nodiscard]] PtrView + get_view_by_like(const char *name1, + const char *name2, + const char *pattern1, + const char *pattern2, + bool case_insensitive = false, + char esc_char = '\\'); + + template + [[nodiscard]] ConstPtrView + get_view_by_like(const char *name1, + const char *name2, + const char *pattern1, + const char *pattern2, + bool case_insensitive = false, + char esc_char = '\\') const; + // It returns a DataFrame (including the index and data columns) // containing the data from uniform random selection. // random_policy determines the behavior of method. 
diff --git a/include/DataFrame/Internals/DataFrame_get.tcc b/include/DataFrame/Internals/DataFrame_get.tcc index ea8fd91d..b7d2aaa0 100644 --- a/include/DataFrame/Internals/DataFrame_get.tcc +++ b/include/DataFrame/Internals/DataFrame_get.tcc @@ -2595,6 +2595,466 @@ get_view_by_sel(const char *name1, // ---------------------------------------------------------------------------- +template +template +DataFrame DataFrame:: +get_data_by_like(const char *name, + const char *pattern, + bool case_insensitive, + char esc_char) const { + + const ColumnVecType &vec = get_column(name); + const size_type col_s = vec.size(); + StlVecType col_indices; + + col_indices.reserve(col_s / 2); + for (size_type i = 0; i < col_s; ++i) { + if constexpr (std::is_same_v || + std::is_same_v) { + if (_like_clause_compare_(pattern, + vec[i].c_str(), + case_insensitive, + esc_char)) + col_indices.push_back(i); + } + else { + if (_like_clause_compare_(pattern, + vec[i], + case_insensitive, + esc_char)) + col_indices.push_back(i); + } + } + + DataFrame df; + IndexVecType new_index; + + new_index.reserve(col_indices.size()); + for (const auto &citer: col_indices) [[likely]] + new_index.push_back(indices_[citer]); + df.load_index(std::move(new_index)); + + const SpinGuard guard(lock_); + + for (const auto &citer : column_list_) [[likely]] { + create_col_functor_ functor( + citer.first.c_str(), df); + + data_[citer.second].change(functor); + } + + const size_type idx_s = indices_.size(); + const auto thread_level = + (idx_s < ThreadPool::MUL_THR_THHOLD) ? 
0L : get_thread_level(); + + if (thread_level > 2) { + auto lbd = + [&col_indices = std::as_const(col_indices), idx_s, &df, this] + (const auto &begin, const auto &end) -> void { + for (auto citer = begin; citer < end; ++citer) { + sel_load_functor_ functor ( + citer->first.c_str(), + col_indices, + idx_s, + df); + + this->data_[citer->second].change(functor); + } + }; + + auto futuers = + thr_pool_.parallel_loop(column_list_.begin(), + column_list_.end(), + std::move(lbd)); + + for (auto &fut : futuers) fut.get(); + } + else { + for (const auto &citer : column_list_) [[likely]] { + sel_load_functor_ functor ( + citer.first.c_str(), + col_indices, + idx_s, + df); + + data_[citer.second].change(functor); + } + } + + return (df); +} + +// ---------------------------------------------------------------------------- + +template +template +typename DataFrame::PtrView DataFrame:: +get_view_by_like(const char *name, + const char *pattern, + bool case_insensitive, + char esc_char) { + + static_assert(std::is_base_of, H>::value, + "Only a StdDataFrame can call get_view_by_like()"); + + const ColumnVecType &vec = get_column(name); + const size_type col_s = vec.size(); + StlVecType col_indices; + + col_indices.reserve(col_s / 2); + for (size_type i = 0; i < col_s; ++i) [[likely]] { + if constexpr (std::is_same_v || + std::is_same_v) { + if (_like_clause_compare_(pattern, + vec[i].c_str(), + case_insensitive, + esc_char)) + col_indices.push_back(i); + } + else { + if (_like_clause_compare_(pattern, + vec[i], + case_insensitive, + esc_char)) + col_indices.push_back(i); + } + } + + using TheView = PtrView; + + TheView dfv; + typename TheView::IndexVecType new_index; + + new_index.reserve(col_indices.size()); + for (const auto &citer: col_indices) [[likely]] + new_index.push_back(&(indices_[citer])); + dfv.indices_ = std::move(new_index); + + const size_type idx_s = indices_.size(); + const SpinGuard guard(lock_); + + for (const auto &col_citer : column_list_) [[likely]] { + 
sel_load_view_functor_ functor ( + col_citer.first.c_str(), + col_indices, + idx_s, + dfv); + + data_[col_citer.second].change(functor); + } + + return (dfv); +} + +// ---------------------------------------------------------------------------- + +template +template +typename DataFrame::ConstPtrView DataFrame:: +get_view_by_like(const char *name, + const char *pattern, + bool case_insensitive, + char esc_char) const { + + static_assert(std::is_base_of, H>::value, + "Only a StdDataFrame can call get_view_by_like()"); + + const ColumnVecType &vec = get_column(name); + const size_type col_s = vec.size(); + StlVecType col_indices; + + col_indices.reserve(col_s / 2); + for (size_type i = 0; i < col_s; ++i) [[likely]] { + if constexpr (std::is_same_v || + std::is_same_v) { + if (_like_clause_compare_(pattern, + vec[i].c_str(), + case_insensitive, + esc_char)) + col_indices.push_back(i); + } + else { + if (_like_clause_compare_(pattern, + vec[i], + case_insensitive, + esc_char)) + col_indices.push_back(i); + } + } + + using TheView = ConstPtrView; + + TheView dfv; + typename TheView::IndexVecType new_index; + + new_index.reserve(col_indices.size()); + for (const auto &citer: col_indices) [[likely]] + new_index.push_back(&(indices_[citer])); + dfv.indices_ = std::move(new_index); + + const size_type idx_s = indices_.size(); + const SpinGuard guard(lock_); + + for (const auto &col_citer : column_list_) [[likely]] { + sel_load_view_functor_ functor ( + col_citer.first.c_str(), + col_indices, + idx_s, + dfv); + + data_[col_citer.second].change(functor); + } + + return (dfv); +} + +// ---------------------------------------------------------------------------- + +template +template +DataFrame DataFrame:: +get_data_by_like(const char *name1, + const char *name2, + const char *pattern1, + const char *pattern2, + bool case_insensitive, + char esc_char) const { + + const size_type idx_s = indices_.size(); + const SpinGuard guard (lock_); + const ColumnVecType &vec1 = 
get_column(name1, false); + const ColumnVecType &vec2 = get_column(name2, false); + const size_type min_col_s = std::min(vec1.size(), vec2.size()); + StlVecType col_indices; + + col_indices.reserve(idx_s / 2); + for (size_type i = 0; i < min_col_s; ++i) [[likely]] { + if constexpr (std::is_same_v || + std::is_same_v) { + if (_like_clause_compare_(pattern1, + vec1[i].c_str(), + case_insensitive, + esc_char) && + _like_clause_compare_(pattern2, + vec2[i].c_str(), + case_insensitive, + esc_char)) + col_indices.push_back(i); + } + else { + if (_like_clause_compare_(pattern1, + vec1[i], + case_insensitive, + esc_char) && + _like_clause_compare_(pattern2, + vec2[i], + case_insensitive, + esc_char)) + col_indices.push_back(i); + } + } + + DataFrame df; + IndexVecType new_index; + + new_index.reserve(col_indices.size()); + for (const auto &citer: col_indices) + new_index.push_back(indices_[citer]); + df.load_index(std::move(new_index)); + + for (const auto &citer : column_list_) [[likely]] { + create_col_functor_ functor( + citer.first.c_str(), df); + + data_[citer.second].change(functor); + } + + const auto thread_level = + (idx_s < ThreadPool::MUL_THR_THHOLD) ? 
0L : get_thread_level(); + + if (thread_level > 2) { + auto lbd = + [&col_indices = std::as_const(col_indices), idx_s, &df, this] + (const auto &begin, const auto &end) -> void { + for (auto citer = begin; citer < end; ++citer) { + sel_load_functor_ functor ( + citer->first.c_str(), + col_indices, + idx_s, + df); + + this->data_[citer->second].change(functor); + } + }; + + auto futuers = + thr_pool_.parallel_loop(column_list_.begin(), + column_list_.end(), + std::move(lbd)); + + for (auto &fut : futuers) fut.get(); + } + else { + for (const auto &citer : column_list_) [[likely]] { + sel_load_functor_ functor ( + citer.first.c_str(), + col_indices, + idx_s, + df); + + data_[citer.second].change(functor); + } + } + + return (df); +} + +// ---------------------------------------------------------------------------- + +template +template +typename DataFrame::PtrView DataFrame:: +get_view_by_like(const char *name1, + const char *name2, + const char *pattern1, + const char *pattern2, + bool case_insensitive, + char esc_char) { + + static_assert(std::is_base_of, H>::value, + "Only a StdDataFrame can call get_view_by_like()"); + + const SpinGuard guard (lock_); + const ColumnVecType &vec1 = get_column(name1, false); + const ColumnVecType &vec2 = get_column(name2, false); + const size_type idx_s = indices_.size(); + const size_type min_col_s = std::min(vec1.size(), vec2.size()); + StlVecType col_indices; + + col_indices.reserve(idx_s / 2); + for (size_type i = 0; i < min_col_s; ++i) [[likely]] { + if constexpr (std::is_same_v || + std::is_same_v) { + if (_like_clause_compare_(pattern1, + vec1[i].c_str(), + case_insensitive, + esc_char) && + _like_clause_compare_(pattern2, + vec2[i].c_str(), + case_insensitive, + esc_char)) + col_indices.push_back(i); + } + else { + if (_like_clause_compare_(pattern1, + vec1[i], + case_insensitive, + esc_char) && + _like_clause_compare_(pattern2, + vec2[i], + case_insensitive, + esc_char)) + col_indices.push_back(i); + } + } + + using 
TheView = PtrView; + + TheView dfv; + typename TheView::IndexVecType new_index; + + new_index.reserve(col_indices.size()); + for (const auto &citer: col_indices) + new_index.push_back(&(indices_[citer])); + dfv.indices_ = std::move(new_index); + + for (const auto &col_citer : column_list_) [[likely]] { + sel_load_view_functor_ functor ( + col_citer.first.c_str(), + col_indices, + idx_s, + dfv); + + data_[col_citer.second].change(functor); + } + + return (dfv); +} + +// ---------------------------------------------------------------------------- + +template +template +typename DataFrame::ConstPtrView DataFrame:: +get_view_by_like(const char *name1, + const char *name2, + const char *pattern1, + const char *pattern2, + bool case_insensitive, + char esc_char) const { + + static_assert(std::is_base_of, H>::value, + "Only a StdDataFrame can call get_view_by_like()"); + + const SpinGuard guard (lock_); + const ColumnVecType &vec1 = get_column(name1, false); + const ColumnVecType &vec2 = get_column(name2, false); + const size_type idx_s = indices_.size(); + const size_type min_col_s = std::min(vec1.size(), vec2.size()); + StlVecType col_indices; + + col_indices.reserve(idx_s / 2); + for (size_type i = 0; i < min_col_s; ++i) [[likely]] { + if constexpr (std::is_same_v || + std::is_same_v) { + if (_like_clause_compare_(pattern1, + vec1[i].c_str(), + case_insensitive, + esc_char) && + _like_clause_compare_(pattern2, + vec2[i].c_str(), + case_insensitive, + esc_char)) + col_indices.push_back(i); + } + else { + if (_like_clause_compare_(pattern1, + vec1[i], + case_insensitive, + esc_char) && + _like_clause_compare_(pattern2, + vec2[i], + case_insensitive, + esc_char)) + col_indices.push_back(i); + } + } + + using TheView = ConstPtrView; + + TheView dfv; + typename TheView::IndexVecType new_index; + + new_index.reserve(col_indices.size()); + for (const auto &citer: col_indices) + new_index.push_back(&(indices_[citer])); + dfv.indices_ = std::move(new_index); + + for (const auto 
&col_citer : column_list_) [[likely]] { + sel_load_view_functor_ functor ( + col_citer.first.c_str(), + col_indices, + idx_s, + dfv); + + data_[col_citer.second].change(functor); + } + + return (dfv); +} + +// ---------------------------------------------------------------------------- + template template DataFrame DataFrame:: diff --git a/include/DataFrame/Internals/DataFrame_standalone.tcc b/include/DataFrame/Internals/DataFrame_standalone.tcc index eda49b38..9decdff5 100644 --- a/include/DataFrame/Internals/DataFrame_standalone.tcc +++ b/include/DataFrame/Internals/DataFrame_standalone.tcc @@ -1147,7 +1147,7 @@ struct _LikeClauseUtil_ { // moderately sized strings. I have not tested this with huge/massive // strings. // -static bool +static inline bool _like_clause_compare_(const char *pattern, const char *input_str, bool case_insensitive = false, diff --git a/test/dataframe_tester_3.cc b/test/dataframe_tester_3.cc index b2ed8294..45d78f2f 100644 --- a/test/dataframe_tester_3.cc +++ b/test/dataframe_tester_3.cc @@ -2554,6 +2554,99 @@ static void test__like_clause_compare_() { // ----------------------------------------------------------------------------- +static void test_get_data_by_like() { + + std::cout << "\nTesting get_data_by_like( ) ..." 
<< std::endl; + + StlVecType idxvec = + { 1UL, 2UL, 3UL, 10UL, 5UL, 7UL, 8UL, 12UL, 9UL, 12UL, 10UL, 13UL, + 10UL, 15UL, 14UL }; + StlVecType strvec1 = + { "345&%$abcM", "!@#$0987^HGTtiff\"", "ABFDTiy", "345&%$abcM", + "!@#$0987^HGTtiff\"", "!@#$0987^HGTtiff\"", "345&%$abcM", + "!@#$0987^HGTtiff\"", "ABFDTiy", "345&%$abcM", "!@#$0987^HGTtiff\"", + "ABFDTiy", "345&%$abcM", "ABFDTiy", "ABFDTiy" }; + StlVecType strvec2 = + { "ABFDTiy", "!@#$0987^HGTtiff\"", "ABFDTiy", "345&%$abcM", + "!@#$0987^HGTtiff\"", "ABFDTiy", "!@#$0987^HGTtiff\"", + "!@#$0987^HGTtiff\"", "ABFDTiy", "345&%$abcM", "!@#$0987^HGTtiff\"", + "ABFDTiy", "345&%$abcM", "ABFDTiy", "ABFDTiy" }; + StlVecType intvec = + { 1, 2, 3, 10, 5, 7, 8, 12, 9, 12, 10, 13, 10, 15, 14 }; + MyDataFrame df; + + df.load_data(std::move(idxvec), + std::make_pair("str column 1", strvec1), + std::make_pair("str column 2", strvec2), + std::make_pair("int column", intvec)); + + auto df_like2 = + df.get_data_by_like( + "str column 1", + "str column 2", + "?*[0-9][0-9][0-9][0-9]?*", + "?*[0-9][0-9][0-9][0-9]?*"); + + assert(df_like2.get_index().size() == 4); + assert(df_like2.get_index()[2] == 12); + assert(df_like2.get_column("int column")[2] == 12); + assert(df_like2.get_column("str column 1").size() == 4); + assert(df_like2.get_column("str column 2").size() == 4); + assert((df_like2.get_column("str column 1")[0] == + "!@#$0987^HGTtiff\"")); + assert((df_like2.get_column("str column 1")[2] == + "!@#$0987^HGTtiff\"")); + assert((df_like2.get_column("str column 2")[0] == + "!@#$0987^HGTtiff\"")); + assert((df_like2.get_column("str column 2")[2] == + "!@#$0987^HGTtiff\"")); + + auto dfv_like2 = + df.get_view_by_like( + "str column 1", + "str column 2", + "?*[0-9][0-9][0-9][0-9]?*", + "?*[0-9][0-9][0-9][0-9]?*"); + + assert(dfv_like2.get_index().size() == 4); + assert(dfv_like2.get_index()[2] == 12); + assert(dfv_like2.get_column("int column")[2] == 12); + assert(dfv_like2.get_column("str column 1").size() == 4); + 
assert(dfv_like2.get_column("str column 2").size() == 4); + assert((dfv_like2.get_column("str column 1")[0] == + "!@#$0987^HGTtiff\"")); + assert((dfv_like2.get_column("str column 1")[2] == + "!@#$0987^HGTtiff\"")); + assert((dfv_like2.get_column("str column 2")[0] == + "!@#$0987^HGTtiff\"")); + assert((dfv_like2.get_column("str column 2")[2] == + "!@#$0987^HGTtiff\"")); + + dfv_like2.get_column("str column 2")[3] = "ABC"; + assert(dfv_like2.get_column("str column 2")[3] == "ABC"); + assert(df.get_column("str column 2")[10] == "ABC"); + + auto df_like1 = + df.get_data_by_like( + "str column 1", + "?*&%?*"); + + assert(df_like1.get_index().size() == 5); + assert(df_like1.get_index()[2] == 8); + assert(df_like1.get_column("int column")[2] == 8); + assert(df_like1.get_column("str column 1").size() == 5); + assert(df_like1.get_column("str column 2").size() == 5); + assert((df_like1.get_column("str column 1")[0] == + "345&%$abcM")); + assert((df_like1.get_column("str column 1")[2] == + "345&%$abcM")); + assert((df_like1.get_column("str column 2")[0] == "ABFDTiy")); + assert((df_like1.get_column("str column 2")[2] == + "!@#$0987^HGTtiff\"")); +} + +// ----------------------------------------------------------------------------- + int main(int, char *[]) { MyDataFrame::set_optimum_thread_level(); @@ -2610,6 +2703,7 @@ int main(int, char *[]) { test_get_str_col_stats(); test_inversion_count(); test__like_clause_compare_(); + test_get_data_by_like(); return (0); } From 8fa8acc7049f2dcd4966f2108bf2841c6b60a4e8 Mon Sep 17 00:00:00 2001 From: Hossein Moein Date: Sat, 13 Jan 2024 11:12:47 -0500 Subject: [PATCH 05/13] Added docs for get_[data|view]_by_like() --- docs/HTML/DataFrame.html | 8 + docs/HTML/get_data_by_like.html | 297 ++++++++++++++++++++++++++++++++ include/DataFrame/DataFrame.h | 4 +- 3 files changed, 307 insertions(+), 2 deletions(-) create mode 100644 docs/HTML/get_data_by_like.html diff --git a/docs/HTML/DataFrame.html b/docs/HTML/DataFrame.html index 
8e2ded44..73b92a5e 100644 --- a/docs/HTML/DataFrame.html +++ b/docs/HTML/DataFrame.html @@ -265,6 +265,10 @@

    API Reference with code samples

    get_data_by_idx( 2 ) + + get_data_by_like( 2 ) + + get_data_by_loc( 2 ) @@ -309,6 +313,10 @@

    API Reference with code samples

    get_view_by_idx( 2 ) + + get_view_by_like( 2 ) + + get_view_by_loc( 2 ) diff --git a/docs/HTML/get_data_by_like.html b/docs/HTML/get_data_by_like.html new file mode 100644 index 00000000..cecbd60d --- /dev/null +++ b/docs/HTML/get_data_by_like.html @@ -0,0 +1,297 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    Signature Description Parameters
    +
    
    +template<StringOnly T, typename ... Ts>
    +DataFrame
    +get_data_by_like(const char *name,
    +                 const char *pattern,
    +                 bool case_insensitive = false,
    +                 char esc_char = '\\') const;
    +        
    +
    + This method does a basic Glob-like pattern matching (also similar to SQL like clause) to filter data in the named column. It returns a new DataFrame. Each element of the named column is checked against a Glob-like matching logic

    + + Globbing rules:
    +  '*' Matches any sequence of zero or more characters.
    +  '?' Matches exactly one character.
    +  [...] Matches one character from the enclosed list of characters.
    +  [^...] Matches one character not in the enclosed list.

    + + With the [...] and [^...] matching, a ']' character can be included in the list by making it the first character after '[' or '^'. A range of characters can be specified using '-'. Example: "[a-z]" matches any single lower-case letter. To match a '-', make it the last character in the list.

    + + Hint: To match '*' or '?', put them in "[]". Like this: abc[*]xyz matches "abc*xyz" only
    + NOTE: This could be, in some cases, n-squared. But it is pretty fast with moderately sized strings. I have not tested this with huge/massive strings.
    +
    + T: Type of the named column. Based on the concept, it can only be either of these types: std::string, VirtualString, const char *, char *
    + Ts: List all the types of all data columns. A type should be specified in the list only once.
    + name: Name of the data column
    + pattern: Glob like pattern to use for matching strings
    + case_insensitive: If true, matching logic ignores case
    + esc_char: Character used for escape
    +
    +
    
    +template<StringOnly T, typename ... Ts>
    +PtrView
    +get_view_by_like(const char *name,
    +                 const char *pattern,
    +                 bool case_insensitive = false,
    +                 char esc_char = '\\');
    +        
    +
    + This is identical with above get_data_by_like(), but:
    +
      +
    1. The result is a view
    2. +
    3. Since the result is a view, you cannot call make_consistent() on the result.
    4. +
    + NOTE: There are certain operations that you cannot do with a view. For example, you cannot add/delete columns, etc.
    +
    + T: Type of the named column. Based on the concept, it can only be either of these types: std::string, VirtualString, const char *, char *
    + Ts: List all the types of all data columns. A type should be specified in the list only once.
    + name: Name of the data column
    + pattern: Glob like pattern to use for matching strings
    + case_insensitive: If true, matching logic ignores case
    + esc_char: Character used for escape
    +
    +
    
    +template<StringOnly T, typename ... Ts>
    +ConstPtrView
    +get_view_by_like(const char *name,
    +                 const char *pattern,
    +                 bool case_insensitive = false,
    +                 char esc_char = '\\') const;
    +        
    +
    + Same as above view, but it returns a const view. You cannot change data in const views. But if the data is changed in the original DataFrame or through another view, it is reflected in the const view. + + T: Type of the named column. Based on the concept, it can only be either of these types: std::string, VirtualString, const char *, char *
    + Ts: List all the types of all data columns. A type should be specified in the list only once.
    + name: Name of the data column
    + pattern: Glob like pattern to use for matching strings
    + case_insensitive: If true, matching logic ignores case
    + esc_char: Character used for escape
    +
    +
    
    +template<StringOnly T, typename ... Ts>
    +DataFrame
    +get_data_by_like(const char *name1,
    +                 const char *name2,
    +                 const char *pattern1,
    +                 const char *pattern2,
    +                 bool case_insensitive = false,
    +                 char esc_char = '\\') const;
    +        
    +
    + This performs the same function as the above get_data_by_like(), but it operates on two columns.
    +
    + T: Type of both named columns. Based on the concept, it can only be either of these types: std::string, VirtualString, const char *, char *
    + Ts: List all the types of all data columns. A type should be specified in the list only once.
    + name1: Name of the first data column
    + name2: Name of the second data column
    + pattern1: Glob like pattern to use for matching strings for the first column
    + pattern2: Glob like pattern to use for matching strings for the second column
    + case_insensitive: If true, matching logic ignores case
    + esc_char: Character used for escape
    +
    +
    
    +template<StringOnly T, typename ... Ts>
    +PtrView
    +get_view_by_like(const char *name1,
    +                 const char *name2,
    +                 const char *pattern1,
    +                 const char *pattern2,
    +                 bool case_insensitive = false,
    +                 char esc_char = '\\');
    +        
    +
    + This is identical with above get_data_by_like(), but:
    +
      +
    1. The result is a view
    2. +
    3. Since the result is a view, you cannot call make_consistent() on the result.
    4. +
    + NOTE: There are certain operations that you cannot do with a view. For example, you cannot add/delete columns, etc.
    +
    + T: Type of both named columns. Based on the concept, it can only be either of these types: std::string, VirtualString, const char *, char *
    + Ts: List all the types of all data columns. A type should be specified in the list only once.
    + name1: Name of the first data column
    + name2: Name of the second data column
    + pattern1: Glob like pattern to use for matching strings for the first column
    + pattern2: Glob like pattern to use for matching strings for the second column
    + case_insensitive: If true, matching logic ignores case
    + esc_char: Character used for escape
    +
    +
    
    +template<StringOnly T, typename ... Ts>
    +ConstPtrView
    +get_view_by_like(const char *name1,
    +                 const char *name2,
    +                 const char *pattern1,
    +                 const char *pattern2,
    +                 bool case_insensitive = false,
    +                 char esc_char = '\\') const;
    +        
    +
    + Same as above view, but it returns a const view. You cannot change data in const views. But if the data is changed in the original DataFrame or through another view, it is reflected in the const view. + + T: Type of both named columns. Based on the concept, it can only be either of these types: std::string, VirtualString, const char *, char *
    + Ts: List all the types of all data columns. A type should be specified in the list only once.
    + name1: Name of the first data column
    + name2: Name of the second data column
    + pattern1: Glob like pattern to use for matching strings for the first column
    + pattern2: Glob like pattern to use for matching strings for the second column
    + case_insensitive: If true, matching logic ignores case
    + esc_char: Character used for escape
    +
    + +
    static void test_get_data_by_like()  {
    +
    +    std::cout << "\nTesting get_data_by_like( ) ..." << std::endl;
    +
    +    StlVecType<unsigned long>  idxvec = { 1UL, 2UL, 3UL, 10UL, 5UL, 7UL, 8UL, 12UL, 9UL, 12UL, 10UL, 13UL, 10UL, 15UL, 14UL };
    +    StlVecType<std::string>    strvec1 =
    +        { "345&%$abcM", "!@#$0987^HGTtiff\"", "ABFDTiy", "345&%$abcM", "!@#$0987^HGTtiff\"", "!@#$0987^HGTtiff\"", "345&%$abcM",
    +          "!@#$0987^HGTtiff\"", "ABFDTiy", "345&%$abcM", "!@#$0987^HGTtiff\"", "ABFDTiy", "345&%$abcM", "ABFDTiy", "ABFDTiy" };
    +    StlVecType<std::string>    strvec2 =
    +        { "ABFDTiy", "!@#$0987^HGTtiff\"", "ABFDTiy", "345&%$abcM", "!@#$0987^HGTtiff\"", "ABFDTiy", "!@#$0987^HGTtiff\"",
    +          "!@#$0987^HGTtiff\"", "ABFDTiy", "345&%$abcM", "!@#$0987^HGTtiff\"", "ABFDTiy", "345&%$abcM", "ABFDTiy", "ABFDTiy" };
    +    StlVecType<int>            intvec = { 1, 2, 3, 10, 5, 7, 8, 12, 9, 12, 10, 13, 10, 15, 14 };
    +    MyDataFrame                df;
    +
    +    df.load_data(std::move(idxvec),
    +                 std::make_pair("str column 1", strvec1),
    +                 std::make_pair("str column 2", strvec2),
    +                 std::make_pair("int column", intvec));
    +
    +    auto    df_like2 =
    +        df.get_data_by_like<std::string, std::string, int>("str column 1",
    +                                                           "str column 2",
    +                                                           "?*[0-9][0-9][0-9][0-9]?*",
    +                                                           "?*[0-9][0-9][0-9][0-9]?*");
    +
    +    assert(df_like2.get_index().size() == 4);
    +    assert(df_like2.get_index()[2] == 12);
    +    assert(df_like2.get_column<int>("int column")[2] == 12);
    +    assert(df_like2.get_column<std::string>("str column 1").size() == 4);
    +    assert(df_like2.get_column<std::string>("str column 2").size() == 4);
    +    assert((df_like2.get_column<std::string>("str column 1")[0] == "!@#$0987^HGTtiff\""));
    +    assert((df_like2.get_column<std::string>("str column 1")[2] == "!@#$0987^HGTtiff\""));
    +    assert((df_like2.get_column<std::string>("str column 2")[0] == "!@#$0987^HGTtiff\""));
    +    assert((df_like2.get_column<std::string>("str column 2")[2] == "!@#$0987^HGTtiff\""));
    +
    +    auto    dfv_like2 =
    +        df.get_view_by_like<std::string, std::string, int>("str column 1",
    +                                                           "str column 2",
    +                                                           "?*[0-9][0-9][0-9][0-9]?*",
    +                                                           "?*[0-9][0-9][0-9][0-9]?*");
    +
    +    assert(dfv_like2.get_index().size() == 4);
    +    assert(dfv_like2.get_index()[2] == 12);
    +    assert(dfv_like2.get_column<int>("int column")[2] == 12);
    +    assert(dfv_like2.get_column<std::string>("str column 1").size() == 4);
    +    assert(dfv_like2.get_column<std::string>("str column 2").size() == 4);
    +    assert((dfv_like2.get_column<std::string>("str column 1")[0] == "!@#$0987^HGTtiff\""));
    +    assert((dfv_like2.get_column<std::string>("str column 1")[2] == "!@#$0987^HGTtiff\""));
    +    assert((dfv_like2.get_column<std::string>("str column 2")[0] == "!@#$0987^HGTtiff\""));
    +    assert((dfv_like2.get_column<std::string>("str column 2")[2] == "!@#$0987^HGTtiff\""));
    +
    +    dfv_like2.get_column<std::string>("str column 2")[3] = "ABC";
    +    assert(dfv_like2.get_column<std::string>("str column 2")[3] == "ABC");
    +    assert(df.get_column<std::string>("str column 2")[10] == "ABC");
    +
    +    auto    df_like1 =
    +        df.get_data_by_like<std::string, std::string, int>("str column 1", "?*&%?*");
    +
    +    assert(df_like1.get_index().size() == 5);
    +    assert(df_like1.get_index()[2] == 8);
    +    assert(df_like1.get_column<int>("int column")[2] == 8);
    +    assert(df_like1.get_column<std::string>("str column 1").size() == 5);
    +    assert(df_like1.get_column<std::string>("str column 2").size() == 5);
    +    assert((df_like1.get_column<std::string>("str column 1")[0] == "345&%$abcM"));
    +    assert((df_like1.get_column<std::string>("str column 1")[2] == "345&%$abcM"));
    +    assert((df_like1.get_column<std::string>("str column 2")[0] == "ABFDTiy"));
    +    assert((df_like1.get_column<std::string>("str column 2")[2] == "!@#$0987^HGTtiff\""));
    +}
    +
    + + C++ DataFrame + + + + + diff --git a/include/DataFrame/DataFrame.h b/include/DataFrame/DataFrame.h index 346d3f59..959ca03e 100644 --- a/include/DataFrame/DataFrame.h +++ b/include/DataFrame/DataFrame.h @@ -2550,9 +2550,9 @@ class DataFrame : public ThreadGranularity { // Name of the first data column // name2: // Name of the second data column - // pattern1 + // pattern1: // Glob like pattern to use for matching strings for the first column - // pattern2 + // pattern2: // Glob like pattern to use for matching strings for the second column // case_insensitive: // If true, matching logic ignores case From e8d21a121486817f49797bf704b3fbe02ea77927 Mon Sep 17 00:00:00 2001 From: Hossein Moein Date: Mon, 15 Jan 2024 09:31:06 -0500 Subject: [PATCH 06/13] Factored out a bunch of code from various get_[data|view]_by_sel()'s --- include/DataFrame/Internals/DataFrame_get.tcc | 952 ++---------------- .../DataFrame/Internals/DataFrame_join.tcc | 134 --- .../Internals/DataFrame_private_decl.h | 435 +++++++- include/DataFrame/Internals/DataFrame_set.tcc | 157 --- test/dataframe_tester_output.txt | 23 +- 5 files changed, 498 insertions(+), 1203 deletions(-) diff --git a/include/DataFrame/Internals/DataFrame_get.tcc b/include/DataFrame/Internals/DataFrame_get.tcc index b7d2aaa0..1cc2e4f4 100644 --- a/include/DataFrame/Internals/DataFrame_get.tcc +++ b/include/DataFrame/Internals/DataFrame_get.tcc @@ -1002,61 +1002,7 @@ get_data_by_sel (const char *name, F &sel_functor) const { if (sel_functor (indices_[i], vec[i])) col_indices.push_back(i); - DataFrame df; - IndexVecType new_index; - - new_index.reserve(col_indices.size()); - for (const auto &citer: col_indices) [[likely]] - new_index.push_back(indices_[citer]); - df.load_index(std::move(new_index)); - - const SpinGuard guard(lock_); - - for (const auto &citer : column_list_) [[likely]] { - create_col_functor_ functor(citer.first.c_str(), - df); - - data_[citer.second].change(functor); - } - - const auto thread_level = - 
(idx_s < ThreadPool::MUL_THR_THHOLD) ? 0L : get_thread_level(); - - if (thread_level > 2) { - auto lbd = - [&col_indices = std::as_const(col_indices), idx_s, &df, this] - (const auto &begin, const auto &end) -> void { - for (auto citer = begin; citer < end; ++citer) { - sel_load_functor_ functor ( - citer->first.c_str(), - col_indices, - idx_s, - df); - - this->data_[citer->second].change(functor); - } - }; - - auto futuers = - thr_pool_.parallel_loop(column_list_.begin(), - column_list_.end(), - std::move(lbd)); - - for (auto &fut : futuers) fut.get(); - } - else { - for (const auto &citer : column_list_) [[likely]] { - sel_load_functor_ functor ( - citer.first.c_str(), - col_indices, - idx_s, - df); - - data_[citer.second].change(functor); - } - } - - return (df); + return (data_by_sel_common_(col_indices, idx_s)); } // ---------------------------------------------------------------------------- @@ -1079,37 +1025,14 @@ get_view_by_sel (const char *name, F &sel_functor) { if (sel_functor (indices_[i], vec[i])) [[unlikely]] col_indices.push_back(i); - using TheView = PtrView; - - TheView dfv; - typename TheView::IndexVecType new_index; - - new_index.reserve(col_indices.size()); - for (const auto &citer: col_indices) [[likely]] - new_index.push_back(&(indices_[citer])); - dfv.indices_ = std::move(new_index); - - const SpinGuard guard(lock_); - - for (const auto &col_citer : column_list_) [[likely]] { - sel_load_view_functor_ functor ( - col_citer.first.c_str(), - col_indices, - idx_s, - dfv); - - data_[col_citer.second].change(functor); - } - - return (dfv); + return (view_by_sel_common_(col_indices, idx_s)); } // ---------------------------------------------------------------------------- template template -typename DataFrame::ConstPtrView -DataFrame:: +typename DataFrame::ConstPtrView DataFrame:: get_view_by_sel (const char *name, F &sel_functor) const { static_assert(std::is_base_of, H>::value, @@ -1125,29 +1048,7 @@ get_view_by_sel (const char *name, F 
&sel_functor) const { if (sel_functor (indices_[i], vec[i])) [[unlikely]] col_indices.push_back(i); - using TheView = ConstPtrView; - - TheView dfv; - typename TheView::IndexVecType new_index; - - new_index.reserve(col_indices.size()); - for (const auto &citer: col_indices) [[likely]] - new_index.push_back(&(indices_[citer])); - dfv.indices_ = std::move(new_index); - - const SpinGuard guard(lock_); - - for (const auto &col_citer : column_list_) [[likely]] { - sel_load_view_functor_ functor ( - col_citer.first.c_str(), - col_indices, - idx_s, - dfv); - - data_[col_citer.second].change(functor); - } - - return (dfv); + return (view_by_sel_common_(col_indices, idx_s)); } // ---------------------------------------------------------------------------- @@ -1176,59 +1077,7 @@ get_data_by_sel (const char *name1, const char *name2, F &sel_functor) const { i < col_s2 ? vec2[i] : get_nan())) col_indices.push_back(i); - DataFrame df; - IndexVecType new_index; - - new_index.reserve(col_indices.size()); - for (const auto &citer: col_indices) - new_index.push_back(indices_[citer]); - df.load_index(std::move(new_index)); - - for (const auto &citer : column_list_) [[likely]] { - create_col_functor_ functor(citer.first.c_str(), - df); - - data_[citer.second].change(functor); - } - - const auto thread_level = - (idx_s < ThreadPool::MUL_THR_THHOLD) ? 
0L : get_thread_level(); - - if (thread_level > 2) { - auto lbd = - [&col_indices = std::as_const(col_indices), idx_s, &df, this] - (const auto &begin, const auto &end) -> void { - for (auto citer = begin; citer < end; ++citer) { - sel_load_functor_ functor ( - citer->first.c_str(), - col_indices, - idx_s, - df); - - this->data_[citer->second].change(functor); - } - }; - - auto futuers = - thr_pool_.parallel_loop(column_list_.begin(), - column_list_.end(), - std::move(lbd)); - - for (auto &fut : futuers) fut.get(); - } - else { - for (const auto &citer : column_list_) [[likely]] { - sel_load_functor_ functor ( - citer.first.c_str(), - col_indices, - idx_s, - df); - - data_[citer.second].change(functor); - } - } - - return (df); + return (data_by_sel_common_(col_indices, idx_s)); } // ---------------------------------------------------------------------------- @@ -1260,27 +1109,7 @@ get_view_by_sel (const char *name1, const char *name2, F &sel_functor) { i < col_s2 ? vec2[i] : get_nan())) col_indices.push_back(i); - using TheView = PtrView; - - TheView dfv; - typename TheView::IndexVecType new_index; - - new_index.reserve(col_indices.size()); - for (const auto &citer: col_indices) - new_index.push_back(&(indices_[citer])); - dfv.indices_ = std::move(new_index); - - for (const auto &col_citer : column_list_) [[likely]] { - sel_load_view_functor_ functor ( - col_citer.first.c_str(), - col_indices, - idx_s, - dfv); - - data_[col_citer.second].change(functor); - } - - return (dfv); + return (view_by_sel_common_(col_indices, idx_s)); } // ---------------------------------------------------------------------------- @@ -1313,27 +1142,7 @@ get_view_by_sel (const char *name1, const char *name2, F &sel_functor) const { i < col_s2 ? 
vec2[i] : get_nan())) col_indices.push_back(i); - using TheView = ConstPtrView; - - TheView dfv; - typename TheView::IndexVecType new_index; - - new_index.reserve(col_indices.size()); - for (const auto &citer: col_indices) [[likely]] - new_index.push_back(&(indices_[citer])); - dfv.indices_ = std::move(new_index); - - for (const auto &col_citer : column_list_) [[likely]] { - sel_load_view_functor_ functor ( - col_citer.first.c_str(), - col_indices, - idx_s, - dfv); - - data_[col_citer.second].change(functor); - } - - return (dfv); + return (view_by_sel_common_(col_indices, idx_s)); } // ---------------------------------------------------------------------------- @@ -1368,59 +1177,7 @@ get_data_by_sel (const char *name1, i < col_s3 ? vec3[i] : get_nan())) col_indices.push_back(i); - DataFrame df; - IndexVecType new_index; - - new_index.reserve(col_indices.size()); - for (const auto &citer: col_indices) [[likely]] - new_index.push_back(indices_[citer]); - df.load_index(std::move(new_index)); - - for (const auto &citer : column_list_) [[likely]] { - create_col_functor_ functor(citer.first.c_str(), - df); - - data_[citer.second].change(functor); - } - - const auto thread_level = - (idx_s < ThreadPool::MUL_THR_THHOLD) ? 
0L : get_thread_level(); - - if (thread_level > 2) { - auto lbd = - [&col_indices = std::as_const(col_indices), idx_s, &df, this] - (const auto &begin, const auto &end) -> void { - for (auto citer = begin; citer < end; ++citer) { - sel_load_functor_ functor ( - citer->first.c_str(), - col_indices, - idx_s, - df); - - this->data_[citer->second].change(functor); - } - }; - - auto futuers = - thr_pool_.parallel_loop(column_list_.begin(), - column_list_.end(), - std::move(lbd)); - - for (auto &fut : futuers) fut.get(); - } - else { - for (const auto &citer : column_list_) [[likely]] { - sel_load_functor_ functor ( - citer.first.c_str(), - col_indices, - idx_s, - df); - - data_[citer.second].change(functor); - } - } - - return (df); + return (data_by_sel_common_(col_indices, idx_s)); } // ---------------------------------------------------------------------------- @@ -1652,27 +1409,7 @@ get_view_by_sel (const char *name1, i < col_s3 ? vec3[i] : get_nan())) col_indices.push_back(i); - using TheView = PtrView; - - TheView dfv; - typename TheView::IndexVecType new_index; - - new_index.reserve(col_indices.size()); - for (const auto &citer: col_indices) [[likely]] - new_index.push_back(&(indices_[citer])); - dfv.indices_ = std::move(new_index); - - for (const auto &col_citer : column_list_) [[likely]] { - sel_load_view_functor_ functor ( - col_citer.first.c_str(), - col_indices, - idx_s, - dfv); - - data_[col_citer.second].change(functor); - } - - return (dfv); + return (view_by_sel_common_(col_indices, idx_s)); } // ---------------------------------------------------------------------------- @@ -1711,30 +1448,10 @@ get_view_by_sel (const char *name1, i < col_s3 ? 
vec3[i] : get_nan())) col_indices.push_back(i); - using TheView = ConstPtrView; + return (view_by_sel_common_(col_indices, idx_s)); +} - TheView dfv; - typename TheView::IndexVecType new_index; - - new_index.reserve(col_indices.size()); - for (const auto &citer: col_indices) [[likely]] - new_index.push_back(&(indices_[citer])); - dfv.indices_ = std::move(new_index); - - for (const auto &col_citer : column_list_) [[likely]] { - sel_load_view_functor_ functor ( - col_citer.first.c_str(), - col_indices, - idx_s, - dfv); - - data_[col_citer.second].change(functor); - } - - return (dfv); -} - -// ---------------------------------------------------------------------------- +// ---------------------------------------------------------------------------- template template())) col_indices.push_back(i); - DataFrame df; - IndexVecType new_index; - - new_index.reserve(col_indices.size()); - for (const auto &citer: col_indices) [[likely]] - new_index.push_back(indices_[citer]); - df.load_index(std::move(new_index)); - - for (const auto &citer : column_list_) [[likely]] { - create_col_functor_ functor(citer.first.c_str(), - df); - - data_[citer.second].change(functor); - } - - const auto thread_level = - (idx_s < ThreadPool::MUL_THR_THHOLD) ? 
0L : get_thread_level(); - - if (thread_level > 2) { - auto lbd = - [&col_indices = std::as_const(col_indices), idx_s, &df, this] - (const auto &begin, const auto &end) -> void { - for (auto citer = begin; citer < end; ++citer) { - sel_load_functor_ functor ( - citer->first.c_str(), - col_indices, - idx_s, - df); - - this->data_[citer->second].change(functor); - } - }; - - auto futuers = - thr_pool_.parallel_loop(column_list_.begin(), - column_list_.end(), - std::move(lbd)); - - for (auto &fut : futuers) fut.get(); - } - else { - for (const auto &citer : column_list_) [[likely]] { - sel_load_functor_ functor ( - citer.first.c_str(), - col_indices, - idx_s, - df); - - data_[citer.second].change(functor); - } - } - - return (df); + return (data_by_sel_common_(col_indices, idx_s)); } // ---------------------------------------------------------------------------- @@ -1870,27 +1535,7 @@ get_view_by_sel(const char *name1, i < col_s4 ? vec4[i] : get_nan())) col_indices.push_back(i); - using TheView = PtrView; - - TheView dfv; - typename TheView::IndexVecType new_index; - - new_index.reserve(col_indices.size()); - for (const auto &citer: col_indices) [[likely]] - new_index.push_back(&(indices_[citer])); - dfv.indices_ = std::move(new_index); - - for (const auto &col_citer : column_list_) [[likely]] { - sel_load_view_functor_ functor ( - col_citer.first.c_str(), - col_indices, - idx_s, - dfv); - - data_[col_citer.second].change(functor); - } - - return (dfv); + return (view_by_sel_common_(col_indices, idx_s)); } // ---------------------------------------------------------------------------- @@ -1936,27 +1581,7 @@ get_view_by_sel(const char *name1, i < col_s4 ? 
vec4[i] : get_nan())) col_indices.push_back(i); - using TheView = ConstPtrView; - - TheView dfv; - typename TheView::IndexVecType new_index; - - new_index.reserve(col_indices.size()); - for (const auto &citer: col_indices) [[likely]] - new_index.push_back(&(indices_[citer])); - dfv.indices_ = std::move(new_index); - - for (const auto &col_citer : column_list_) [[likely]] { - sel_load_view_functor_ functor ( - col_citer.first.c_str(), - col_indices, - idx_s, - dfv); - - data_[col_citer.second].change(functor); - } - - return (dfv); + return (view_by_sel_common_(col_indices, idx_s)); } // ---------------------------------------------------------------------------- @@ -2003,59 +1628,7 @@ get_data_by_sel(const char *name1, i < col_s5 ? vec5[i] : get_nan())) col_indices.push_back(i); - DataFrame df; - IndexVecType new_index; - - new_index.reserve(col_indices.size()); - for (const auto &citer: col_indices) [[likely]] - new_index.push_back(indices_[citer]); - df.load_index(std::move(new_index)); - - for (const auto &citer : column_list_) [[likely]] { - create_col_functor_ functor(citer.first.c_str(), - df); - - data_[citer.second].change(functor); - } - - const auto thread_level = - (idx_s < ThreadPool::MUL_THR_THHOLD) ? 
0L : get_thread_level(); - - if (thread_level > 2) { - auto lbd = - [&col_indices = std::as_const(col_indices), idx_s, &df, this] - (const auto &begin, const auto &end) -> void { - for (auto citer = begin; citer < end; ++citer) { - sel_load_functor_ functor ( - citer->first.c_str(), - col_indices, - idx_s, - df); - - this->data_[citer->second].change(functor); - } - }; - - auto futuers = - thr_pool_.parallel_loop(column_list_.begin(), - column_list_.end(), - std::move(lbd)); - - for (auto &fut : futuers) fut.get(); - } - else { - for (const auto &citer : column_list_) [[likely]] { - sel_load_functor_ functor ( - citer.first.c_str(), - col_indices, - idx_s, - df); - - data_[citer.second].change(functor); - } - } - - return (df); + return (data_by_sel_common_(col_indices, idx_s)); } // ---------------------------------------------------------------------------- @@ -2131,59 +1704,7 @@ get_data_by_sel(const char *name1, i < col_s11 ? vec11[i] : get_nan())) col_indices.push_back(i); - DataFrame df; - IndexVecType new_index; - - new_index.reserve(col_indices.size()); - for (const auto &citer: col_indices) [[likely]] - new_index.push_back(indices_[citer]); - df.load_index(std::move(new_index)); - - for (const auto &citer : column_list_) [[likely]] { - create_col_functor_ functor(citer.first.c_str(), - df); - - data_[citer.second].change(functor); - } - - const auto thread_level = - (idx_s < ThreadPool::MUL_THR_THHOLD) ? 
0L : get_thread_level(); - - if (thread_level > 2) { - auto lbd = - [&col_indices = std::as_const(col_indices), idx_s, &df, this] - (const auto &begin, const auto &end) -> void { - for (auto citer = begin; citer < end; ++citer) { - sel_load_functor_ functor ( - citer->first.c_str(), - col_indices, - idx_s, - df); - - this->data_[citer->second].change(functor); - } - }; - - auto futuers = - thr_pool_.parallel_loop(column_list_.begin(), - column_list_.end(), - std::move(lbd)); - - for (auto &fut : futuers) fut.get(); - } - else { - for (const auto &citer : column_list_) [[likely]] { - sel_load_functor_ functor ( - citer.first.c_str(), - col_indices, - idx_s, - df); - - data_[citer.second].change(functor); - } - } - - return (df); + return (data_by_sel_common_(col_indices, idx_s)); } // ---------------------------------------------------------------------------- @@ -2263,59 +1784,7 @@ get_data_by_sel(const char *name1, i < col_s12 ? vec12[i] : get_nan())) col_indices.push_back(i); - DataFrame df; - IndexVecType new_index; - - new_index.reserve(col_indices.size()); - for (const auto &citer: col_indices) - new_index.push_back(indices_[citer]); - df.load_index(std::move(new_index)); - - for (const auto &citer : column_list_) [[likely]] { - create_col_functor_ functor(citer.first.c_str(), - df); - - data_[citer.second].change(functor); - } - - const auto thread_level = - (idx_s < ThreadPool::MUL_THR_THHOLD) ? 
0L : get_thread_level(); - - if (thread_level > 2) { - auto lbd = - [&col_indices = std::as_const(col_indices), idx_s, &df, this] - (const auto &begin, const auto &end) -> void { - for (auto citer = begin; citer < end; ++citer) { - sel_load_functor_ functor ( - citer->first.c_str(), - col_indices, - idx_s, - df); - - this->data_[citer->second].change(functor); - } - }; - - auto futuers = - thr_pool_.parallel_loop(column_list_.begin(), - column_list_.end(), - std::move(lbd)); - - for (auto &fut : futuers) fut.get(); - } - else { - for (const auto &citer : column_list_) [[likely]] { - sel_load_functor_ functor ( - citer.first.c_str(), - col_indices, - idx_s, - df); - - data_[citer.second].change(functor); - } - } - - return (df); + return (data_by_sel_common_(col_indices, idx_s)); } // ---------------------------------------------------------------------------- @@ -2356,102 +1825,50 @@ get_data_by_sel(const char *name1, const ColumnVecType &vec12 = get_column(name12, false); const ColumnVecType &vec13 = get_column(name13, false); const size_type idx_s = indices_.size(); - const size_type col_s1 = vec1.size(); - const size_type col_s2 = vec2.size(); - const size_type col_s3 = vec3.size(); - const size_type col_s4 = vec4.size(); - const size_type col_s5 = vec5.size(); - const size_type col_s6 = vec6.size(); - const size_type col_s7 = vec7.size(); - const size_type col_s8 = vec8.size(); - const size_type col_s9 = vec9.size(); - const size_type col_s10 = vec10.size(); - const size_type col_s11 = vec11.size(); - const size_type col_s12 = vec12.size(); - const size_type col_s13 = vec13.size(); - const size_type min_col_s = - std::min({ col_s1, col_s2, col_s3, col_s4, col_s5, - col_s6, col_s7, col_s8, col_s9, col_s10, - col_s11, col_s12, col_s13 }); - StlVecType col_indices; - - col_indices.reserve(idx_s / 2); - for (size_type i = 0; i < min_col_s; ++i) - if (sel_functor(indices_[i], - vec1[i], vec2[i], vec3[i], vec4[i], vec5[i], - vec6[i], vec7[i], vec8[i], vec9[i], 
vec10[i], - vec11[i], vec12[i], vec13[i])) - col_indices.push_back(i); - for (size_type i = min_col_s; i < idx_s; ++i) - if (sel_functor(indices_[i], - i < col_s1 ? vec1[i] : get_nan(), - i < col_s2 ? vec2[i] : get_nan(), - i < col_s3 ? vec3[i] : get_nan(), - i < col_s4 ? vec4[i] : get_nan(), - i < col_s5 ? vec5[i] : get_nan(), - i < col_s6 ? vec6[i] : get_nan(), - i < col_s7 ? vec7[i] : get_nan(), - i < col_s8 ? vec8[i] : get_nan(), - i < col_s9 ? vec9[i] : get_nan(), - i < col_s10 ? vec10[i] : get_nan(), - i < col_s11 ? vec11[i] : get_nan(), - i < col_s12 ? vec12[i] : get_nan(), - i < col_s13 ? vec13[i] : get_nan())) - col_indices.push_back(i); - - DataFrame df; - IndexVecType new_index; - - new_index.reserve(col_indices.size()); - for (const auto &citer: col_indices) - new_index.push_back(indices_[citer]); - df.load_index(std::move(new_index)); - - for (const auto &citer : column_list_) [[likely]] { - create_col_functor_ functor(citer.first.c_str(), - df); - - data_[citer.second].change(functor); - } - - const auto thread_level = - (idx_s < ThreadPool::MUL_THR_THHOLD) ? 
0L : get_thread_level(); - - if (thread_level > 2) { - auto lbd = - [&col_indices = std::as_const(col_indices), idx_s, &df, this] - (const auto &begin, const auto &end) -> void { - for (auto citer = begin; citer < end; ++citer) { - sel_load_functor_ functor ( - citer->first.c_str(), - col_indices, - idx_s, - df); - - this->data_[citer->second].change(functor); - } - }; - - auto futuers = - thr_pool_.parallel_loop(column_list_.begin(), - column_list_.end(), - std::move(lbd)); - - for (auto &fut : futuers) fut.get(); - } - else { - for (const auto &citer : column_list_) [[likely]] { - sel_load_functor_ functor ( - citer.first.c_str(), - col_indices, - idx_s, - df); + const size_type col_s1 = vec1.size(); + const size_type col_s2 = vec2.size(); + const size_type col_s3 = vec3.size(); + const size_type col_s4 = vec4.size(); + const size_type col_s5 = vec5.size(); + const size_type col_s6 = vec6.size(); + const size_type col_s7 = vec7.size(); + const size_type col_s8 = vec8.size(); + const size_type col_s9 = vec9.size(); + const size_type col_s10 = vec10.size(); + const size_type col_s11 = vec11.size(); + const size_type col_s12 = vec12.size(); + const size_type col_s13 = vec13.size(); + const size_type min_col_s = + std::min({ col_s1, col_s2, col_s3, col_s4, col_s5, + col_s6, col_s7, col_s8, col_s9, col_s10, + col_s11, col_s12, col_s13 }); + StlVecType col_indices; - data_[citer.second].change(functor); - } - } + col_indices.reserve(idx_s / 2); + for (size_type i = 0; i < min_col_s; ++i) + if (sel_functor(indices_[i], + vec1[i], vec2[i], vec3[i], vec4[i], vec5[i], + vec6[i], vec7[i], vec8[i], vec9[i], vec10[i], + vec11[i], vec12[i], vec13[i])) + col_indices.push_back(i); + for (size_type i = min_col_s; i < idx_s; ++i) + if (sel_functor(indices_[i], + i < col_s1 ? vec1[i] : get_nan(), + i < col_s2 ? vec2[i] : get_nan(), + i < col_s3 ? vec3[i] : get_nan(), + i < col_s4 ? vec4[i] : get_nan(), + i < col_s5 ? vec5[i] : get_nan(), + i < col_s6 ? 
vec6[i] : get_nan(), + i < col_s7 ? vec7[i] : get_nan(), + i < col_s8 ? vec8[i] : get_nan(), + i < col_s9 ? vec9[i] : get_nan(), + i < col_s10 ? vec10[i] : get_nan(), + i < col_s11 ? vec11[i] : get_nan(), + i < col_s12 ? vec12[i] : get_nan(), + i < col_s13 ? vec13[i] : get_nan())) + col_indices.push_back(i); - return (df); + return (data_by_sel_common_(col_indices, idx_s)); } // ---------------------------------------------------------------------------- @@ -2500,27 +1917,7 @@ get_view_by_sel(const char *name1, i < col_s5 ? vec5[i] : get_nan())) col_indices.push_back(i); - using TheView = PtrView; - - TheView dfv; - typename TheView::IndexVecType new_index; - - new_index.reserve(col_indices.size()); - for (const auto &citer: col_indices) - new_index.push_back(&(indices_[citer])); - dfv.indices_ = std::move(new_index); - - for (const auto &col_citer : column_list_) { - sel_load_view_functor_ functor ( - col_citer.first.c_str(), - col_indices, - idx_s, - dfv); - - data_[col_citer.second].change(functor); - } - - return (dfv); + return (view_by_sel_common_(col_indices, idx_s)); } // ---------------------------------------------------------------------------- @@ -2570,27 +1967,7 @@ get_view_by_sel(const char *name1, i < col_s5 ? 
vec5[i] : get_nan())) col_indices.push_back(i); - using TheView = ConstPtrView; - - TheView dfv; - typename TheView::IndexVecType new_index; - - new_index.reserve(col_indices.size()); - for (const auto &citer: col_indices) - new_index.push_back(&(indices_[citer])); - dfv.indices_ = std::move(new_index); - - for (const auto &col_citer : column_list_) { - sel_load_view_functor_ functor ( - col_citer.first.c_str(), - col_indices, - idx_s, - dfv); - - data_[col_citer.second].change(functor); - } - - return (dfv); + return (view_by_sel_common_(col_indices, idx_s)); } // ---------------------------------------------------------------------------- @@ -2626,62 +2003,7 @@ get_data_by_like(const char *name, } } - DataFrame df; - IndexVecType new_index; - - new_index.reserve(col_indices.size()); - for (const auto &citer: col_indices) [[likely]] - new_index.push_back(indices_[citer]); - df.load_index(std::move(new_index)); - - const SpinGuard guard(lock_); - - for (const auto &citer : column_list_) [[likely]] { - create_col_functor_ functor( - citer.first.c_str(), df); - - data_[citer.second].change(functor); - } - - const size_type idx_s = indices_.size(); - const auto thread_level = - (idx_s < ThreadPool::MUL_THR_THHOLD) ? 
0L : get_thread_level(); - - if (thread_level > 2) { - auto lbd = - [&col_indices = std::as_const(col_indices), idx_s, &df, this] - (const auto &begin, const auto &end) -> void { - for (auto citer = begin; citer < end; ++citer) { - sel_load_functor_ functor ( - citer->first.c_str(), - col_indices, - idx_s, - df); - - this->data_[citer->second].change(functor); - } - }; - - auto futuers = - thr_pool_.parallel_loop(column_list_.begin(), - column_list_.end(), - std::move(lbd)); - - for (auto &fut : futuers) fut.get(); - } - else { - for (const auto &citer : column_list_) [[likely]] { - sel_load_functor_ functor ( - citer.first.c_str(), - col_indices, - idx_s, - df); - - data_[citer.second].change(functor); - } - } - - return (df); + return (data_by_sel_common_(col_indices, indices_.size())); } // ---------------------------------------------------------------------------- @@ -2720,30 +2042,7 @@ get_view_by_like(const char *name, } } - using TheView = PtrView; - - TheView dfv; - typename TheView::IndexVecType new_index; - - new_index.reserve(col_indices.size()); - for (const auto &citer: col_indices) [[likely]] - new_index.push_back(&(indices_[citer])); - dfv.indices_ = std::move(new_index); - - const size_type idx_s = indices_.size(); - const SpinGuard guard(lock_); - - for (const auto &col_citer : column_list_) [[likely]] { - sel_load_view_functor_ functor ( - col_citer.first.c_str(), - col_indices, - idx_s, - dfv); - - data_[col_citer.second].change(functor); - } - - return (dfv); + return (view_by_sel_common_(col_indices, indices_.size())); } // ---------------------------------------------------------------------------- @@ -2782,30 +2081,7 @@ get_view_by_like(const char *name, } } - using TheView = ConstPtrView; - - TheView dfv; - typename TheView::IndexVecType new_index; - - new_index.reserve(col_indices.size()); - for (const auto &citer: col_indices) [[likely]] - new_index.push_back(&(indices_[citer])); - dfv.indices_ = std::move(new_index); - - const size_type 
idx_s = indices_.size(); - const SpinGuard guard(lock_); - - for (const auto &col_citer : column_list_) [[likely]] { - sel_load_view_functor_ functor ( - col_citer.first.c_str(), - col_indices, - idx_s, - dfv); - - data_[col_citer.second].change(functor); - } - - return (dfv); + return (view_by_sel_common_(col_indices, indices_.size())); } // ---------------------------------------------------------------------------- @@ -2854,59 +2130,7 @@ get_data_by_like(const char *name1, } } - DataFrame df; - IndexVecType new_index; - - new_index.reserve(col_indices.size()); - for (const auto &citer: col_indices) - new_index.push_back(indices_[citer]); - df.load_index(std::move(new_index)); - - for (const auto &citer : column_list_) [[likely]] { - create_col_functor_ functor( - citer.first.c_str(), df); - - data_[citer.second].change(functor); - } - - const auto thread_level = - (idx_s < ThreadPool::MUL_THR_THHOLD) ? 0L : get_thread_level(); - - if (thread_level > 2) { - auto lbd = - [&col_indices = std::as_const(col_indices), idx_s, &df, this] - (const auto &begin, const auto &end) -> void { - for (auto citer = begin; citer < end; ++citer) { - sel_load_functor_ functor ( - citer->first.c_str(), - col_indices, - idx_s, - df); - - this->data_[citer->second].change(functor); - } - }; - - auto futuers = - thr_pool_.parallel_loop(column_list_.begin(), - column_list_.end(), - std::move(lbd)); - - for (auto &fut : futuers) fut.get(); - } - else { - for (const auto &citer : column_list_) [[likely]] { - sel_load_functor_ functor ( - citer.first.c_str(), - col_indices, - idx_s, - df); - - data_[citer.second].change(functor); - } - } - - return (df); + return (data_by_sel_common_(col_indices, idx_s)); } // ---------------------------------------------------------------------------- @@ -2956,29 +2180,9 @@ get_view_by_like(const char *name1, esc_char)) col_indices.push_back(i); } - } - - using TheView = PtrView; - - TheView dfv; - typename TheView::IndexVecType new_index; - - 
new_index.reserve(col_indices.size()); - for (const auto &citer: col_indices) - new_index.push_back(&(indices_[citer])); - dfv.indices_ = std::move(new_index); - - for (const auto &col_citer : column_list_) [[likely]] { - sel_load_view_functor_ functor ( - col_citer.first.c_str(), - col_indices, - idx_s, - dfv); - - data_[col_citer.second].change(functor); } - return (dfv); + return (view_by_sel_common_(col_indices, idx_s)); } // ---------------------------------------------------------------------------- @@ -3028,29 +2232,9 @@ get_view_by_like(const char *name1, esc_char)) col_indices.push_back(i); } - } - - using TheView = ConstPtrView; - - TheView dfv; - typename TheView::IndexVecType new_index; - - new_index.reserve(col_indices.size()); - for (const auto &citer: col_indices) - new_index.push_back(&(indices_[citer])); - dfv.indices_ = std::move(new_index); - - for (const auto &col_citer : column_list_) [[likely]] { - sel_load_view_functor_ functor ( - col_citer.first.c_str(), - col_indices, - idx_s, - dfv); - - data_[col_citer.second].change(functor); } - return (dfv); + return (view_by_sel_common_(col_indices, idx_s)); } // ---------------------------------------------------------------------------- diff --git a/include/DataFrame/Internals/DataFrame_join.tcc b/include/DataFrame/Internals/DataFrame_join.tcc index 42b6a20b..76795f7a 100644 --- a/include/DataFrame/Internals/DataFrame_join.tcc +++ b/include/DataFrame/Internals/DataFrame_join.tcc @@ -195,140 +195,6 @@ join_by_column (const RHS_T &rhs, const char *name, join_policy mp) const { // ---------------------------------------------------------------------------- -template -template -void DataFrame:: -join_helper_common_( - const LHS_T &lhs, - const RHS_T &rhs, - const IndexIdxVector &joined_index_idx, - DataFrame> &result, - const char *skip_col_name) { - - using res_t = decltype(result); - - std::vector> futures; - const auto thread_level = - (lhs.indices_.size() < ThreadPool::MUL_THR_THHOLD) - ? 
0L : get_thread_level(); - - - if (thread_level > 2) - futures.reserve(lhs.column_list_.size() + rhs.column_list_.size()); - - const SpinGuard guard(lock_); - - // NOTE: I had to do this in two separate loops. Otherwise, it would - // occasionally crash in multithreaded mode under MacOS. - // - for (const auto &citer : lhs.column_list_) [[likely]] { - if (skip_col_name && citer.first == skip_col_name) continue; - - if (rhs.column_tb_.find(citer.first) != rhs.column_tb_.end()) { - create_join_common_col_functor_ create_f( - citer.first.c_str(), result); - - lhs.data_[citer.second].change(create_f); - } - else { - create_col_functor_ create_f( - citer.first.c_str(), result); - - lhs.data_[citer.second].change(create_f); - } - } - - // Load the common and lhs columns - // - for (const auto &citer : lhs.column_list_) [[likely]] { - if (skip_col_name && citer.first == skip_col_name) continue; - - // Common column between two frames - // - if (rhs.column_tb_.find(citer.first) != rhs.column_tb_.end()) { - auto jcomm_lbd = - [&citer = std::as_const(citer), - &lhs = std::as_const(lhs), - &rhs = std::as_const(rhs), - &joined_index_idx = std::as_const(joined_index_idx), - &result] () -> void { - index_join_functor_common_ functor( - citer.first.c_str(), - rhs, - joined_index_idx, - result); - - lhs.data_[citer.second].change(functor); - }; - - if (thread_level > 2) - futures.emplace_back(thr_pool_.dispatch(false, jcomm_lbd)); - else - jcomm_lbd(); - } - else { // lhs only column - auto jlhs_lbd = - [&citer = std::as_const(citer), - &lhs = std::as_const(lhs), - &joined_index_idx = std::as_const(joined_index_idx), - &result] () -> void { - // 0 = Left - index_join_functor_oneside_<0, res_t, Ts ...> functor ( - citer.first.c_str(), - joined_index_idx, - result); - - lhs.data_[citer.second].change(functor); - }; - - if (thread_level > 2) - futures.emplace_back(thr_pool_.dispatch(false, jlhs_lbd)); - else - jlhs_lbd(); - } - } - - // Load the rhs columns - // - for (const auto 
&citer : rhs.column_list_) [[likely]] { - const auto lhs_citer = lhs.column_tb_.find(citer.first); - - if (skip_col_name && citer.first == skip_col_name) continue; - - if (lhs_citer == lhs.column_tb_.end()) { // rhs only column - create_col_functor_ create_f( - citer.first.c_str(), result); - - rhs.data_[citer.second].change(create_f); - } - - if (lhs_citer == lhs.column_tb_.end()) { // rhs only column - auto jrhs_lbd = - [&citer = std::as_const(citer), - &rhs = std::as_const(rhs), - &joined_index_idx = std::as_const(joined_index_idx), - &result] () -> void { - // 1 = Right - index_join_functor_oneside_<1, res_t, Ts ...> functor ( - citer.first.c_str(), - joined_index_idx, - result); - - rhs.data_[citer.second].change(functor); - }; - - if (thread_level > 2) - futures.emplace_back(thr_pool_.dispatch(false, jrhs_lbd)); - else - jrhs_lbd(); - } - } - - for (auto &fut : futures) fut.get(); -} - -// ---------------------------------------------------------------------------- - template template DataFrame> DataFrame:: diff --git a/include/DataFrame/Internals/DataFrame_private_decl.h b/include/DataFrame/Internals/DataFrame_private_decl.h index 06ccfcd5..8812f57d 100644 --- a/include/DataFrame/Internals/DataFrame_private_decl.h +++ b/include/DataFrame/Internals/DataFrame_private_decl.h @@ -56,9 +56,6 @@ void read_csv2_(std::istream &file, size_type starting_row, size_type num_rows); -template -void remove_data_by_sel_common_(const StlVecType &col_indices); - template static void fill_missing_value_(ColumnVecType &vec, @@ -124,16 +121,6 @@ using IndexIdxVector = StlVecType>; template using JoinSortingPair = std::pair; -template -static void -join_helper_common_(const LHS_T &lhs, - const RHS_T &rhs, - const IndexIdxVector &joined_index_idx, - DataFrame< - IDX_T, - HeteroVector> &result, - const char *skip_col_name = nullptr); - template static DataFrame> index_join_helper_(const LHS_T &lhs, @@ -210,13 +197,6 @@ column_right_join_(const LHS_T &lhs, const StlVecType> 
&col_vec_lhs, const StlVecType> &col_vec_rhs); -template -static DataFrame -remove_dups_common_(const DataFrame &s_df, - remove_dup_spec rds, - const MAP &row_table, - const IndexVecType &index); - template static void concat_helper_(LHS_T &lhs, const RHS_T &rhs, bool add_new_columns); @@ -245,6 +225,421 @@ column_left_right_join_( // ---------------------------------------------------------------------------- +template +void remove_data_by_sel_common_(const StlVecType &col_indices) { + + const auto thread_level = + (indices_.size() < ThreadPool::MUL_THR_THHOLD) + ? 0L : get_thread_level(); + SpinGuard guard (lock_); + + if (thread_level > 2) { + auto lbd = + [&col_indices = std::as_const(col_indices), this] + (const auto &begin, const auto &end) -> void { + const sel_remove_functor_ functor (col_indices); + + for (auto citer = begin; citer < end; ++citer) + this->data_[citer->second].change(functor); + }; + auto lbd_idx = + [&col_indices = std::as_const(col_indices), this] () -> void { + const size_type col_indices_s = col_indices.size(); + size_type del_count = 0; + + for (size_type i = 0; i < col_indices_s; ++i) [[likely]] + this->indices_.erase(this->indices_.begin() + + (col_indices[i] - del_count++)); + }; + auto future_idx = thr_pool_.dispatch(false, lbd_idx); + auto futures = + thr_pool_.parallel_loop(column_list_.begin(), + column_list_.end(), + std::move(lbd)); + + future_idx.get(); + for (auto &fut : futures) fut.get(); + } + else { + const sel_remove_functor_ functor (col_indices); + + for (const auto &citer : column_list_) + data_[citer.second].change(functor); + guard.release(); + + const size_type col_indices_s = col_indices.size(); + size_type del_count = 0; + + for (size_type i = 0; i < col_indices_s; ++i) [[likely]] + indices_.erase(indices_.begin() + (col_indices[i] - del_count++)); + } + + return; +} + +// ---------------------------------------------------------------------------- + +template +static void +join_helper_common_( + const LHS_T 
&lhs, + const RHS_T &rhs, + const IndexIdxVector &joined_index_idx, + DataFrame> &result, + const char *skip_col_name = nullptr) { + + using res_t = decltype(result); + + std::vector> futures; + const auto thread_level = + (lhs.indices_.size() < ThreadPool::MUL_THR_THHOLD) + ? 0L : get_thread_level(); + + + if (thread_level > 2) + futures.reserve(lhs.column_list_.size() + rhs.column_list_.size()); + + const SpinGuard guard(lock_); + + // NOTE: I had to do this in two separate loops. Otherwise, it would + // occasionally crash in multithreaded mode under MacOS. + // + for (const auto &citer : lhs.column_list_) [[likely]] { + if (skip_col_name && citer.first == skip_col_name) continue; + + if (rhs.column_tb_.find(citer.first) != rhs.column_tb_.end()) { + create_join_common_col_functor_ create_f( + citer.first.c_str(), result); + + lhs.data_[citer.second].change(create_f); + } + else { + create_col_functor_ create_f( + citer.first.c_str(), result); + + lhs.data_[citer.second].change(create_f); + } + } + + // Load the common and lhs columns + // + for (const auto &citer : lhs.column_list_) [[likely]] { + if (skip_col_name && citer.first == skip_col_name) continue; + + // Common column between two frames + // + if (rhs.column_tb_.find(citer.first) != rhs.column_tb_.end()) { + auto jcomm_lbd = + [&citer = std::as_const(citer), + &lhs = std::as_const(lhs), + &rhs = std::as_const(rhs), + &joined_index_idx = std::as_const(joined_index_idx), + &result] () -> void { + index_join_functor_common_ functor( + citer.first.c_str(), + rhs, + joined_index_idx, + result); + + lhs.data_[citer.second].change(functor); + }; + + if (thread_level > 2) + futures.emplace_back(thr_pool_.dispatch(false, jcomm_lbd)); + else + jcomm_lbd(); + } + else { // lhs only column + auto jlhs_lbd = + [&citer = std::as_const(citer), + &lhs = std::as_const(lhs), + &joined_index_idx = std::as_const(joined_index_idx), + &result] () -> void { + // 0 = Left + index_join_functor_oneside_<0, res_t, Ts ...> 
functor ( + citer.first.c_str(), + joined_index_idx, + result); + + lhs.data_[citer.second].change(functor); + }; + + if (thread_level > 2) + futures.emplace_back(thr_pool_.dispatch(false, jlhs_lbd)); + else + jlhs_lbd(); + } + } + + // Load the rhs columns + // + for (const auto &citer : rhs.column_list_) [[likely]] { + const auto lhs_citer = lhs.column_tb_.find(citer.first); + + if (skip_col_name && citer.first == skip_col_name) continue; + + if (lhs_citer == lhs.column_tb_.end()) { // rhs only column + create_col_functor_ create_f( + citer.first.c_str(), result); + + rhs.data_[citer.second].change(create_f); + } + + if (lhs_citer == lhs.column_tb_.end()) { // rhs only column + auto jrhs_lbd = + [&citer = std::as_const(citer), + &rhs = std::as_const(rhs), + &joined_index_idx = std::as_const(joined_index_idx), + &result] () -> void { + // 1 = Right + index_join_functor_oneside_<1, res_t, Ts ...> functor ( + citer.first.c_str(), + joined_index_idx, + result); + + rhs.data_[citer.second].change(functor); + }; + + if (thread_level > 2) + futures.emplace_back(thr_pool_.dispatch(false, jrhs_lbd)); + else + jrhs_lbd(); + } + } + + for (auto &fut : futures) fut.get(); +} + +// ---------------------------------------------------------------------------- + +template +static DataFrame +remove_dups_common_(const DataFrame &s_df, + remove_dup_spec rds, + const MAP &row_table, + const IndexVecType &index) { + + using count_vec = StlVecType; + + count_vec rows_to_del; + + rows_to_del.reserve(8); + if (rds == remove_dup_spec::keep_first) { + for (const auto &citer : row_table) { + if (citer.second.size() > 1) { + for (size_type i = 1; i < citer.second.size(); ++i) + rows_to_del.push_back(citer.second[i]); + } + } + } + else if (rds == remove_dup_spec::keep_last) { + for (const auto &citer : row_table) { + if (citer.second.size() > 1) { + for (size_type i = 0; i < citer.second.size() - 1; ++i) + rows_to_del.push_back(citer.second[i]); + } + } + } + else { // 
remove_dup_spec::keep_none + for (const auto &citer : row_table) { + if (citer.second.size() > 1) { + for (size_type i = 0; i < citer.second.size(); ++i) + rows_to_del.push_back(citer.second[i]); + } + } + } + + DataFrame new_df; + IndexVecType new_index (index.size() - rows_to_del.size()); + const SpinGuard guard(lock_); + + // Load the index + // + _remove_copy_if_(index.begin(), index.end(), new_index.begin(), + [&rows_to_del = std::as_const(rows_to_del)] + (std::size_t n) -> bool { + return (std::find(rows_to_del.begin(), + rows_to_del.end(), + n) != rows_to_del.end()); + }); + new_df.load_index(std::move(new_index)); + + // Create the columns, so loading can proceed in parallel + // + for (const auto &citer : s_df.column_list_) { + create_col_functor_ functor( + citer.first.c_str(), new_df); + + s_df.data_[citer.second].change(functor); + } + + const auto thread_level = + (new_df.get_index().size() < ThreadPool::MUL_THR_THHOLD) + ? 0L : get_thread_level(); + + if (thread_level > 2) { + auto lbd = + [&rows_to_del = std::as_const(rows_to_del), + &s_df = std::as_const(s_df), + &new_df] + (const auto &begin, const auto &end) -> void { + for (auto citer = begin; citer < end; ++citer) { + copy_remove_functor_ functor (citer->first.c_str(), + rows_to_del, + new_df); + + s_df.data_[citer->second].change(functor); + } + }; + auto futures = + thr_pool_.parallel_loop(s_df.column_list_.begin(), + s_df.column_list_.end(), + std::move(lbd)); + + for (auto &fut : futures) fut.get(); + } + else { + for (const auto &citer : s_df.column_list_) { + copy_remove_functor_ functor (citer.first.c_str(), + rows_to_del, + new_df); + + s_df.data_[citer.second].change(functor); + } + } + return (new_df); +} + +// ---------------------------------------------------------------------------- + +template +DataFrame +data_by_sel_common_(const StlVecType &col_indices, + size_type idx_s) const { + + DataFrame ret_df; + IndexVecType new_index; + + new_index.reserve(col_indices.size()); + for 
(const auto &citer: col_indices) [[likely]] + new_index.push_back(indices_[citer]); + ret_df.load_index(std::move(new_index)); + + const SpinGuard guard(lock_); + + for (const auto &citer : column_list_) [[likely]] { + create_col_functor_ functor(citer.first.c_str(), + ret_df); + + data_[citer.second].change(functor); + } + + const auto thread_level = + (idx_s < ThreadPool::MUL_THR_THHOLD) ? 0L : get_thread_level(); + + if (thread_level > 2) { + auto lbd = + [&col_indices = std::as_const(col_indices), idx_s, &ret_df, this] + (const auto &begin, const auto &end) -> void { + for (auto citer = begin; citer < end; ++citer) { + sel_load_functor_ functor ( + citer->first.c_str(), + col_indices, + idx_s, + ret_df); + + this->data_[citer->second].change(functor); + } + }; + + auto futuers = + thr_pool_.parallel_loop(column_list_.begin(), + column_list_.end(), + std::move(lbd)); + + for (auto &fut : futuers) fut.get(); + } + else { + for (const auto &citer : column_list_) [[likely]] { + sel_load_functor_ functor ( + citer.first.c_str(), + col_indices, + idx_s, + ret_df); + + data_[citer.second].change(functor); + } + } + + return (ret_df); +} + +// ---------------------------------------------------------------------------- + +template +PtrView +view_by_sel_common_(const StlVecType &col_indices, + size_type idx_s) { + + using TheView = PtrView; + + TheView ret_dfv; + typename TheView::IndexVecType new_index; + + new_index.reserve(col_indices.size()); + for (const auto &citer: col_indices) [[likely]] + new_index.push_back(&(indices_[citer])); + ret_dfv.indices_ = std::move(new_index); + + const SpinGuard guard(lock_); + + for (const auto &col_citer : column_list_) [[likely]] { + sel_load_view_functor_ functor ( + col_citer.first.c_str(), + col_indices, + idx_s, + ret_dfv); + + data_[col_citer.second].change(functor); + } + + return (ret_dfv); +} + +// ---------------------------------------------------------------------------- + +template +ConstPtrView 
+view_by_sel_common_(const StlVecType &col_indices, + size_type idx_s) const { + + using TheView = ConstPtrView; + + TheView ret_dfv; + typename TheView::IndexVecType new_index; + + new_index.reserve(col_indices.size()); + for (const auto &citer: col_indices) [[likely]] + new_index.push_back(&(indices_[citer])); + ret_dfv.indices_ = std::move(new_index); + + const SpinGuard guard(lock_); + + for (const auto &col_citer : column_list_) [[likely]] { + sel_load_view_functor_ functor ( + col_citer.first.c_str(), + col_indices, + idx_s, + ret_dfv); + + data_[col_citer.second].change(functor); + } + + return (ret_dfv); +} + +// ---------------------------------------------------------------------------- + template inline static void replace_vector_vals_(V &data_vec, diff --git a/include/DataFrame/Internals/DataFrame_set.tcc b/include/DataFrame/Internals/DataFrame_set.tcc index 7b95aa50..4eb44f13 100644 --- a/include/DataFrame/Internals/DataFrame_set.tcc +++ b/include/DataFrame/Internals/DataFrame_set.tcc @@ -1026,62 +1026,6 @@ void DataFrame::remove_data_by_loc (Index2D range) { // ---------------------------------------------------------------------------- -template -template -void DataFrame:: -remove_data_by_sel_common_(const StlVecType &col_indices) { - - const auto thread_level = - (indices_.size() < ThreadPool::MUL_THR_THHOLD) - ? 
0L : get_thread_level(); - SpinGuard guard (lock_); - - if (thread_level > 2) { - auto lbd = - [&col_indices = std::as_const(col_indices), this] - (const auto &begin, const auto &end) -> void { - const sel_remove_functor_ functor (col_indices); - - for (auto citer = begin; citer < end; ++citer) - this->data_[citer->second].change(functor); - }; - auto lbd_idx = - [&col_indices = std::as_const(col_indices), this] () -> void { - const size_type col_indices_s = col_indices.size(); - size_type del_count = 0; - - for (size_type i = 0; i < col_indices_s; ++i) [[likely]] - this->indices_.erase(this->indices_.begin() + - (col_indices[i] - del_count++)); - }; - auto future_idx = thr_pool_.dispatch(false, lbd_idx); - auto futures = - thr_pool_.parallel_loop(column_list_.begin(), - column_list_.end(), - std::move(lbd)); - - future_idx.get(); - for (auto &fut : futures) fut.get(); - } - else { - const sel_remove_functor_ functor (col_indices); - - for (const auto &citer : column_list_) - data_[citer.second].change(functor); - guard.release(); - - const size_type col_indices_s = col_indices.size(); - size_type del_count = 0; - - for (size_type i = 0; i < col_indices_s; ++i) [[likely]] - indices_.erase(indices_.begin() + (col_indices[i] - del_count++)); - } - - return; -} - -// ---------------------------------------------------------------------------- - template template void DataFrame::remove_data_by_sel (const char *name, F &sel_functor) { @@ -1170,107 +1114,6 @@ remove_data_by_sel (const char *name1, // ---------------------------------------------------------------------------- -template -template -DataFrame DataFrame:: -remove_dups_common_(const DataFrame &s_df, - remove_dup_spec rds, - const MAP &row_table, - const IndexVecType &index) { - - using count_vec = StlVecType; - - count_vec rows_to_del; - - rows_to_del.reserve(8); - if (rds == remove_dup_spec::keep_first) { - for (const auto &citer : row_table) { - if (citer.second.size() > 1) { - for (size_type i = 1; i < 
citer.second.size(); ++i) - rows_to_del.push_back(citer.second[i]); - } - } - } - else if (rds == remove_dup_spec::keep_last) { - for (const auto &citer : row_table) { - if (citer.second.size() > 1) { - for (size_type i = 0; i < citer.second.size() - 1; ++i) - rows_to_del.push_back(citer.second[i]); - } - } - } - else { // remove_dup_spec::keep_none - for (const auto &citer : row_table) { - if (citer.second.size() > 1) { - for (size_type i = 0; i < citer.second.size(); ++i) - rows_to_del.push_back(citer.second[i]); - } - } - } - - DataFrame new_df; - IndexVecType new_index (index.size() - rows_to_del.size()); - const SpinGuard guard(lock_); - - // Load the index - // - _remove_copy_if_(index.begin(), index.end(), new_index.begin(), - [&rows_to_del = std::as_const(rows_to_del)] - (std::size_t n) -> bool { - return (std::find(rows_to_del.begin(), - rows_to_del.end(), - n) != rows_to_del.end()); - }); - new_df.load_index(std::move(new_index)); - - // Create the columns, so loading can proceed in parallel - // - for (const auto &citer : s_df.column_list_) { - create_col_functor_ functor( - citer.first.c_str(), new_df); - - s_df.data_[citer.second].change(functor); - } - - const auto thread_level = - (new_df.get_index().size() < ThreadPool::MUL_THR_THHOLD) - ? 
0L : get_thread_level(); - - if (thread_level > 2) { - auto lbd = - [&rows_to_del = std::as_const(rows_to_del), - &s_df = std::as_const(s_df), - &new_df] - (const auto &begin, const auto &end) -> void { - for (auto citer = begin; citer < end; ++citer) { - copy_remove_functor_ functor (citer->first.c_str(), - rows_to_del, - new_df); - - s_df.data_[citer->second].change(functor); - } - }; - auto futures = - thr_pool_.parallel_loop(s_df.column_list_.begin(), - s_df.column_list_.end(), - std::move(lbd)); - - for (auto &fut : futures) fut.get(); - } - else { - for (const auto &citer : s_df.column_list_) { - copy_remove_functor_ functor (citer.first.c_str(), - rows_to_del, - new_df); - - s_df.data_[citer.second].change(functor); - } - } - return (new_df); -} - -// ---------------------------------------------------------------------------- - template template DataFrame DataFrame:: diff --git a/test/dataframe_tester_output.txt b/test/dataframe_tester_output.txt index 4c2b5e11..c90c456c 100644 --- a/test/dataframe_tester_output.txt +++ b/test/dataframe_tester_output.txt @@ -744,13 +744,6 @@ col_3:1::15, col_str:1::11, col_4:1::22, -INDEX:1::123450, -col_1:1::1, -col_2:1::8, -col_3:1::15, -col_str:1::11, -col_4:1::22, - INDEX:1::123451, col_1:1::2, col_2:1::9, @@ -758,6 +751,13 @@ col_3:1::16, col_str:1::22, col_4:1::23, +INDEX:1::123452, +col_1:1::3, +col_2:1::10, +col_3:1::17, +col_str:1::33, +col_4:1::24, + Testing write(json) ... Writing in JSON: @@ -1237,6 +1237,7 @@ Testing GarmanKlassVolVisitor{ } ... Testing YangZhangVolVisitor{ } ... + Testing no_index_writes ... 
INDEX:28::123450,123451,123452,123450,123455,123450,123449,123450,123451,123450,123452,123450,123455,123450,123454,123450,123450,123457,123458,123459,123450,123441,123442,123432,123450,123450,123435,123450, ul_col:28::123450,123451,123452,123450,123455,123450,123449,123450,123451,123450,123452,123450,123455,123450,123454,123450,123450,123457,123458,123459,123450,123441,123442,123432,123450,123450,123435,123450, @@ -1999,6 +2000,12 @@ Testing PriceVolumeTrendVisitor{ } ... Testing QuantQualEstimationVisitor{ } ... Testing get_str_col_stats( ) ... + +Testing inversion_count( ) ... + +Testing _like_clause_compare_( ) ... + +Testing get_data_by_like( ) ... Hello World! Str Column = A, B, C, D, E, F, G, H, I, J, There are 5031 IBM close prices @@ -2018,7 +2025,7 @@ INDEX:10:,string col:10:,Cool Column:10:,numbers:10: Date: Tue, 16 Jan 2024 08:01:26 -0500 Subject: [PATCH 07/13] Rearrange a few code segments --- include/DataFrame/Internals/DataFrame.tcc | 250 ----------- .../DataFrame/Internals/DataFrame_join.tcc | 40 +- .../Internals/DataFrame_private_decl.h | 391 +++++++++++++----- include/DataFrame/Internals/DataFrame_set.tcc | 70 ++-- 4 files changed, 337 insertions(+), 414 deletions(-) diff --git a/include/DataFrame/Internals/DataFrame.tcc b/include/DataFrame/Internals/DataFrame.tcc index 16680517..cc5d186a 100644 --- a/include/DataFrame/Internals/DataFrame.tcc +++ b/include/DataFrame/Internals/DataFrame.tcc @@ -182,222 +182,6 @@ DataFrame::shuffle(const StlVecType &col_names, // ---------------------------------------------------------------------------- -template -template -void DataFrame:: -fill_missing_value_(ColumnVecType &vec, - const T &value, - int limit, - size_type col_num) { - - const size_type vec_size = vec.size(); - int count = 0; - - if (limit < 0) - vec.reserve(col_num); - for (size_type i = 0; i < col_num; ++i) { - if (limit >= 0 && count >= limit) break; - if (i >= vec_size) { - vec.push_back(value); - count += 1; - } - else if (is_nan(vec[i])) { - 
vec[i] = value; - count += 1; - } - } - return; -} - -// ---------------------------------------------------------------------------- - -template -template -void DataFrame:: -fill_missing_ffill_(ColumnVecType &vec, int limit, size_type col_num) { - - const size_type vec_size = vec.size(); - - if (vec_size == 0) return; - - int count = 0; - T last_value = vec[0]; - - for (size_type i = 1; i < col_num; ++i) { - if (limit >= 0 && count >= limit) break; - if (i >= vec_size) { - if (! is_nan(last_value)) { - vec.reserve(col_num); - vec.push_back(last_value); - count += 1; - } - else break; - } - else { - if (! is_nan(vec[i])) - last_value = vec[i]; - else if (! is_nan(last_value)) { - vec[i] = last_value; - count += 1; - } - } - } - return; -} - -// ---------------------------------------------------------------------------- - -template -template::value || - ! supports_arithmetic::value>::type*> -void DataFrame:: -fill_missing_midpoint_(ColumnVecType &, int, size_type) { - - throw NotFeasible("fill_missing_midpoint_(): ERROR: Mid-point filling is " - "not feasible on non-arithmetic types"); -} - -// ---------------------------------------------------------------------------- - -template -template::value && - supports_arithmetic::value>::type*> -void DataFrame:: -fill_missing_midpoint_(ColumnVecType &vec, int limit, size_type) { - - const size_type vec_size = vec.size(); - - if (vec_size < 3) [[unlikely]] return; - - int count = 0; - T last_value = vec[0]; - - for (size_type i = 1; i < vec_size - 1; ++i) { - if (limit >= 0 && count >= limit) break; - - if (! is_nan(vec[i])) - last_value = vec[i]; - else if (! is_nan(last_value)) { - for (size_type j = i + 1; j < vec_size; ++j) { - if (! 
is_nan(vec[j])) { - vec[i] = (last_value + vec[j]) / T(2); - last_value = vec[i]; - count += 1; - break; - } - } - } - } - return; -} - -// ---------------------------------------------------------------------------- - -template -template -void DataFrame:: -fill_missing_bfill_(ColumnVecType &vec, int limit) { - - const long vec_size = static_cast(vec.size()); - - if (vec_size == 0) return; - - int count = 0; - T last_value = vec[vec_size - 1]; - - for (long i = vec_size - 1; i >= 0; --i) { - if (limit >= 0 && count >= limit) break; - if (! is_nan(vec[i])) last_value = vec[i]; - if (is_nan(vec[i]) && ! is_nan(last_value)) { - vec[i] = last_value; - count += 1; - } - } - return; -} - -// ---------------------------------------------------------------------------- - -template -template::value || - ! supports_arithmetic::value>::type*> -void DataFrame:: -fill_missing_linter_(ColumnVecType &, const IndexVecType &, int) { - - throw NotFeasible("fill_missing_linter_(): ERROR: Interpolation is " - "not feasible on non-arithmetic types"); -} - -// ---------------------------------------------------------------------------- - -template -template::value && - supports_arithmetic::value>::type*> -void DataFrame:: -fill_missing_linter_(ColumnVecType &vec, - const IndexVecType &index, - int limit) { - - const long vec_size = static_cast(vec.size()); - - if (vec_size < 3) return; - - int count = 0; - const T *y1 = &(vec[0]); - const T *y2 = &(vec[2]); - const IndexType *x = &(index[1]); - const IndexType *x1 = &(index[0]); - const IndexType *x2 = &(index[2]); - - for (long i = 1; i < vec_size - 1; ++i) { - if (limit >= 0 && count >= limit) break; - if (is_nan(vec[i])) { - if (is_nan(*y2)) { - bool found = false; - - for (long j = i + 1; j < vec_size; ++j) { - if (! is_nan(vec[j])) { - y2 = &(vec[j]); - x2 = &(index[j]); - found = true; - break; - } - } - if (! found) break; - } - if (is_nan(*y1)) { - for (long j = i - 1; j >= 0; --j) { - if (! 
is_nan(vec[j])) { - y1 = &(vec[j]); - x1 = &(index[j]); - break; - } - } - } - vec[i] = - *y1 + - (static_cast(*x - *x1) / static_cast(*x2 - *x1)) * - (*y2 - *y1); - count += 1; - } - if (i < (vec_size - 2)) { - y1 = &(vec[i]); - y2 = &(vec[i + 2]); - x = &(index[i + 1]); - x1 = &(index[i]); - x2 = &(index[i + 2]); - } - } - - return; -} - -// ---------------------------------------------------------------------------- - template template void DataFrame:: @@ -503,40 +287,6 @@ void DataFrame::fill_missing (const DF &rhs) { // ---------------------------------------------------------------------------- -template -template -void DataFrame:: -drop_missing_rows_(T &vec, - const DropRowMap &missing_row_map, - drop_policy policy, - size_type threshold, - size_type col_num) { - - size_type erase_count = 0; - auto dropper = - [&vec, &erase_count](const auto &iter) -> void { - vec.erase(vec.begin() + (iter.first - erase_count)); - erase_count += 1; - }; - - if (policy == drop_policy::all) { - for (const auto &iter : missing_row_map) - if (iter.second == col_num) dropper(iter); - } - else if (policy == drop_policy::any) { - for (const auto &iter : missing_row_map) - if (iter.second > 0) dropper(iter); - } - else if (policy == drop_policy::threshold) { - for (const auto &iter : missing_row_map) - if (iter.second > threshold) dropper(iter); - } - - return; -} - -// ---------------------------------------------------------------------------- - template template void DataFrame:: diff --git a/include/DataFrame/Internals/DataFrame_join.tcc b/include/DataFrame/Internals/DataFrame_join.tcc index 76795f7a..e8956f51 100644 --- a/include/DataFrame/Internals/DataFrame_join.tcc +++ b/include/DataFrame/Internals/DataFrame_join.tcc @@ -96,22 +96,18 @@ join_by_index (const RHS_T &rhs, join_policy mp) const { switch(mp) { case join_policy::inner_join: - return (index_inner_join_ - - (*this, rhs, idx_vec_lhs, idx_vec_rhs)); + return (index_inner_join_ + (*this, rhs, idx_vec_lhs, idx_vec_rhs)); 
case join_policy::left_join: - return (index_left_join_ - - (*this, rhs, idx_vec_lhs, idx_vec_rhs)); + return (index_left_join_ + (*this, rhs, idx_vec_lhs, idx_vec_rhs)); case join_policy::right_join: - return (index_right_join_ - - (*this, rhs, idx_vec_lhs, idx_vec_rhs)); + return (index_right_join_ + (*this, rhs, idx_vec_lhs, idx_vec_rhs)); case join_policy::left_right_join: default: - return (index_left_right_join_ - - (*this, rhs, idx_vec_lhs, idx_vec_rhs)); + return (index_left_right_join_ + (*this, rhs, idx_vec_lhs, idx_vec_rhs)); } } @@ -174,22 +170,18 @@ join_by_column (const RHS_T &rhs, const char *name, join_policy mp) const { switch(mp) { case join_policy::inner_join: - return (column_inner_join_ - - (*this, rhs, name, col_vec_lhs, col_vec_rhs)); + return (column_inner_join_ + (*this, rhs, name, col_vec_lhs, col_vec_rhs)); case join_policy::left_join: - return (column_left_join_ - - (*this, rhs, name, col_vec_lhs, col_vec_rhs)); + return (column_left_join_ + (*this, rhs, name, col_vec_lhs, col_vec_rhs)); case join_policy::right_join: - return (column_right_join_ - - (*this, rhs, name, col_vec_lhs, col_vec_rhs)); + return (column_right_join_ + (*this, rhs, name, col_vec_lhs, col_vec_rhs)); case join_policy::left_right_join: default: - return (column_left_right_join_ - - (*this, rhs, name, col_vec_lhs, col_vec_rhs)); + return (column_left_right_join_ + (*this, rhs, name, col_vec_lhs, col_vec_rhs)); } } diff --git a/include/DataFrame/Internals/DataFrame_private_decl.h b/include/DataFrame/Internals/DataFrame_private_decl.h index 8812f57d..88c423c7 100644 --- a/include/DataFrame/Internals/DataFrame_private_decl.h +++ b/include/DataFrame/Internals/DataFrame_private_decl.h @@ -41,13 +41,17 @@ template class OPT, typename ... 
Ts> friend DF binary_operation(const DF &lhs, const DF &rhs); -template -size_type -load_pair_(std::pair &col_name_data, bool do_lock = true); +// ---------------------------------------------------------------------------- + +// Maps row number -> number of missing column(s) +// +using DropRowMap = DFMap; +using IndexIdxVector = StlVecType>; template -size_type -append_row_(std::pair &row_name_data); +using JoinSortingPair = std::pair; + +// ---------------------------------------------------------------------------- void read_json_(std::istream &file, bool columns_only); void read_csv_(std::istream &file, bool columns_only); @@ -56,71 +60,10 @@ void read_csv2_(std::istream &file, size_type starting_row, size_type num_rows); -template -static void -fill_missing_value_(ColumnVecType &vec, - const T &value, - int limit, - size_type col_num); - -template -static void -fill_missing_ffill_(ColumnVecType &vec, int limit, size_type col_num); - -template::value && - supports_arithmetic::value>::type* = nullptr> -static void -fill_missing_midpoint_(ColumnVecType &vec, int limit, size_type col_num); - -template::value || - ! supports_arithmetic::value>::type* = nullptr> -static void -fill_missing_midpoint_(ColumnVecType &vec, int limit, size_type col_num); - -template -static void -fill_missing_bfill_(ColumnVecType &vec, int limit); - -template::value && - supports_arithmetic::value>::type* = nullptr> -static void -fill_missing_linter_(ColumnVecType &vec, - const IndexVecType &index, - int limit); - -template::value || - ! 
supports_arithmetic::value>::type* = nullptr> -static void -fill_missing_linter_(ColumnVecType &, const IndexVecType &, int); - -// Maps row number -> number of missing column(s) -using DropRowMap = DFMap; - - -template -static void -drop_missing_rows_(T &vec, - const DropRowMap &missing_row_map, - drop_policy policy, - size_type threshold, - size_type col_num); - template void setup_view_column_(const char *name, Index2D range); -using IndexIdxVector = StlVecType>; -template -using JoinSortingPair = std::pair; - template static DataFrame> index_join_helper_(const LHS_T &lhs, @@ -136,16 +79,14 @@ column_join_helper_(const LHS_T &lhs, template static IndexIdxVector -get_inner_index_idx_vector_( - const StlVecType> &col_vec_lhs, - const StlVecType> &col_vec_rhs); +get_inner_index_idx_vector_(const StlVecType> &col_vec_lhs, + const StlVecType> &col_vec_rhs); template static DataFrame> -index_inner_join_( - const LHS_T &lhs, const RHS_T &rhs, - const StlVecType> &col_vec_lhs, - const StlVecType> &col_vec_rhs); +index_inner_join_(const LHS_T &lhs, const RHS_T &rhs, + const StlVecType> &col_vec_lhs, + const StlVecType> &col_vec_rhs); template static DataFrame> @@ -157,16 +98,14 @@ column_inner_join_(const LHS_T &lhs, template static IndexIdxVector -get_left_index_idx_vector_( - const StlVecType> &col_vec_lhs, - const StlVecType> &col_vec_rhs); +get_left_index_idx_vector_(const StlVecType> &col_vec_lhs, + const StlVecType> &col_vec_rhs); template static DataFrame> -index_left_join_( - const LHS_T &lhs, const RHS_T &rhs, - const StlVecType> &col_vec_lhs, - const StlVecType> &col_vec_rhs); +index_left_join_(const LHS_T &lhs, const RHS_T &rhs, + const StlVecType> &col_vec_lhs, + const StlVecType> &col_vec_rhs); template static DataFrame> @@ -178,16 +117,14 @@ column_left_join_(const LHS_T &lhs, template static IndexIdxVector -get_right_index_idx_vector_( - const StlVecType> &col_vec_lhs, - const StlVecType> &col_vec_rhs); +get_right_index_idx_vector_(const StlVecType> 
&col_vec_lhs, + const StlVecType> &col_vec_rhs); template static DataFrame> -index_right_join_( - const LHS_T &lhs, const RHS_T &rhs, - const StlVecType> &col_vec_lhs, - const StlVecType> &col_vec_rhs); +index_right_join_(const LHS_T &lhs, const RHS_T &rhs, + const StlVecType> &col_vec_lhs, + const StlVecType> &col_vec_rhs); template static DataFrame> @@ -216,12 +153,280 @@ index_left_right_join_( template static DataFrame> -column_left_right_join_( - const LHS_T &lhs, - const RHS_T &rhs, - const char *col_name, - const StlVecType> &col_vec_lhs, - const StlVecType> &col_vec_rhs); +column_left_right_join_(const LHS_T &lhs, + const RHS_T &rhs, + const char *col_name, + const StlVecType> &col_vec_lhs, + const StlVecType> &col_vec_rhs); + +// ---------------------------------------------------------------------------- + +template +size_type +load_pair_(std::pair &col_name_data, bool do_lock = true) { + + return (load_column( + col_name_data.first, // column name + std::forward(col_name_data.second), + nan_policy::pad_with_nans, + do_lock)); +} + +// ---------------------------------------------------------------------------- + +template +size_type +append_row_(std::pair &row_name_data) { + + return (append_column(row_name_data.first, // column name + std::forward(row_name_data.second), + nan_policy::dont_pad_with_nans)); +} + +// ---------------------------------------------------------------------------- + +template +static void +drop_missing_rows_(T &vec, + const DropRowMap &missing_row_map, + drop_policy policy, + size_type threshold, + size_type col_num) { + + size_type erase_count = 0; + auto dropper = + [&vec, &erase_count](const auto &iter) -> void { + vec.erase(vec.begin() + (iter.first - erase_count)); + erase_count += 1; + }; + + if (policy == drop_policy::all) { + for (const auto &iter : missing_row_map) + if (iter.second == col_num) dropper(iter); + } + else if (policy == drop_policy::any) { + for (const auto &iter : missing_row_map) + if (iter.second > 0) 
dropper(iter); + } + else if (policy == drop_policy::threshold) { + for (const auto &iter : missing_row_map) + if (iter.second > threshold) dropper(iter); + } + + return; +} + +// ---------------------------------------------------------------------------- + +template +static void +fill_missing_value_(ColumnVecType &vec, + const T &value, + int limit, + size_type col_num) { + + const size_type vec_size = vec.size(); + int count = 0; + + if (limit < 0) + vec.reserve(col_num); + for (size_type i = 0; i < col_num; ++i) { + if (limit >= 0 && count >= limit) break; + if (i >= vec_size) { + vec.push_back(value); + count += 1; + } + else if (is_nan(vec[i])) { + vec[i] = value; + count += 1; + } + } + return; +} + +// ---------------------------------------------------------------------------- + +template +static void +fill_missing_ffill_(ColumnVecType &vec, int limit, size_type col_num) { + + const size_type vec_size = vec.size(); + + if (vec_size == 0) return; + + int count = 0; + T last_value = vec[0]; + + for (size_type i = 1; i < col_num; ++i) { + if (limit >= 0 && count >= limit) break; + if (i >= vec_size) { + if (! is_nan(last_value)) { + vec.reserve(col_num); + vec.push_back(last_value); + count += 1; + } + else break; + } + else { + if (! is_nan(vec[i])) + last_value = vec[i]; + else if (! is_nan(last_value)) { + vec[i] = last_value; + count += 1; + } + } + } + return; +} + +// ---------------------------------------------------------------------------- + +template::value && + supports_arithmetic::value>::type* = nullptr> +static void +fill_missing_midpoint_(ColumnVecType &vec, int limit, size_type) { + + const size_type vec_size = vec.size(); + + if (vec_size < 3) [[unlikely]] return; + + int count = 0; + T last_value = vec[0]; + + for (size_type i = 1; i < vec_size - 1; ++i) { + if (limit >= 0 && count >= limit) break; + + if (! is_nan(vec[i])) + last_value = vec[i]; + else if (! is_nan(last_value)) { + for (size_type j = i + 1; j < vec_size; ++j) { + if (! 
is_nan(vec[j])) { + vec[i] = (last_value + vec[j]) / T(2); + last_value = vec[i]; + count += 1; + break; + } + } + } + } + return; +} + +// ---------------------------------------------------------------------------- + +template::value || + ! supports_arithmetic::value>::type* = nullptr> +static void +fill_missing_midpoint_(ColumnVecType &, int, size_type) { + + throw NotFeasible("fill_missing_midpoint_(): ERROR: Mid-point filling is " + "not feasible on non-arithmetic types"); +} + +// ---------------------------------------------------------------------------- + +template +static void +fill_missing_bfill_(ColumnVecType &vec, int limit) { + + const long vec_size = static_cast(vec.size()); + + if (vec_size == 0) return; + + int count = 0; + T last_value = vec[vec_size - 1]; + + for (long i = vec_size - 1; i >= 0; --i) { + if (limit >= 0 && count >= limit) break; + if (! is_nan(vec[i])) last_value = vec[i]; + if (is_nan(vec[i]) && ! is_nan(last_value)) { + vec[i] = last_value; + count += 1; + } + } + return; +} + +// ---------------------------------------------------------------------------- + +template::value && + supports_arithmetic::value>::type* = nullptr> +static void +fill_missing_linter_(ColumnVecType &vec, + const IndexVecType &index, + int limit) { + + const long vec_size = static_cast(vec.size()); + + if (vec_size < 3) return; + + int count = 0; + const T *y1 = &(vec[0]); + const T *y2 = &(vec[2]); + const IndexType *x = &(index[1]); + const IndexType *x1 = &(index[0]); + const IndexType *x2 = &(index[2]); + + for (long i = 1; i < vec_size - 1; ++i) { + if (limit >= 0 && count >= limit) break; + if (is_nan(vec[i])) { + if (is_nan(*y2)) { + bool found = false; + + for (long j = i + 1; j < vec_size; ++j) { + if (! is_nan(vec[j])) { + y2 = &(vec[j]); + x2 = &(index[j]); + found = true; + break; + } + } + if (! found) break; + } + if (is_nan(*y1)) { + for (long j = i - 1; j >= 0; --j) { + if (! 
is_nan(vec[j])) { + y1 = &(vec[j]); + x1 = &(index[j]); + break; + } + } + } + vec[i] = + *y1 + + (static_cast(*x - *x1) / static_cast(*x2 - *x1)) * + (*y2 - *y1); + count += 1; + } + if (i < (vec_size - 2)) { + y1 = &(vec[i]); + y2 = &(vec[i + 2]); + x = &(index[i + 1]); + x1 = &(index[i]); + x2 = &(index[i + 2]); + } + } + return; +} + +// ---------------------------------------------------------------------------- + +template::value || + ! supports_arithmetic::value>::type* = nullptr> +static void +fill_missing_linter_(ColumnVecType &, const IndexVecType &, int) { + + throw NotFeasible("fill_missing_linter_(): ERROR: Interpolation is " + "not feasible on non-arithmetic types"); +} // ---------------------------------------------------------------------------- diff --git a/include/DataFrame/Internals/DataFrame_set.tcc b/include/DataFrame/Internals/DataFrame_set.tcc index 4eb44f13..fa1eeec2 100644 --- a/include/DataFrame/Internals/DataFrame_set.tcc +++ b/include/DataFrame/Internals/DataFrame_set.tcc @@ -431,11 +431,12 @@ load_result_as_column(V &visitor, size_type ret_cnt = data_s; - if (padding == nan_policy::pad_with_nans && data_s < idx_s) + if (padding == nan_policy::pad_with_nans && data_s < idx_s) { for (size_type i = 0; i < idx_s - data_s; ++i) { new_col.push_back (std::move(get_nan())); ret_cnt += 1; } + } const auto iter = column_tb_.find (new_col_name); StlVecType *vec_ptr = nullptr; @@ -595,13 +596,15 @@ from_indicators(const StlVecType &ind_col_names, guard.release(); new_col.reserve(col_s); - for (size_type i = 0; i < col_s; ++i) [[likely]] - for (size_type j = 0; j < ind_col_s; ++j) [[likely]] + for (size_type i = 0; i < col_s; ++i) [[likely]] { + for (size_type j = 0; j < ind_col_s; ++j) [[likely]] { if (ind_cols[j]->at(i)) { new_col.push_back( _string_to_(ind_col_names[j] + pre_offset)); break; } + } + } return (col_s); } @@ -770,21 +773,6 @@ load_column (const char *name, // 
---------------------------------------------------------------------------- -template -template -typename DataFrame::size_type -DataFrame:: -load_pair_(std::pair &col_name_data, bool do_lock) { - - return (load_column( - col_name_data.first, // column name - std::forward(col_name_data.second), - nan_policy::pad_with_nans, - do_lock)); -} - -// ---------------------------------------------------------------------------- - template template typename DataFrame::size_type @@ -872,18 +860,6 @@ append_column (const char *name, const T &val, nan_policy padding) { // ---------------------------------------------------------------------------- -template -template -typename DataFrame::size_type -DataFrame::append_row_(std::pair &row_name_data) { - - return (append_column(row_name_data.first, // column name - std::forward(row_name_data.second), - nan_policy::dont_pad_with_nans)); -} - -// ---------------------------------------------------------------------------- - template template typename DataFrame::size_type @@ -1167,12 +1143,12 @@ remove_duplicates (const char *name1, guard.release(); - const auto &index = get_index(); - const size_type col_s = + const auto &index = get_index(); + const size_type col_s = std::min({ vec1.size(), vec2.size(), index.size() }); - map_t row_table; - count_vec dummy_vec; - const IndexType dummy_idx { }; + map_t row_table; + count_vec dummy_vec; + const IndexType dummy_idx { }; for (size_type i = 0; i < col_s; ++i) [[likely]] { const auto insert_res = @@ -1217,9 +1193,9 @@ remove_duplicates (const char *name1, const size_type col_s = std::min( { vec1.size(), vec2.size(), vec3.size(), index.size() }); - map_t row_table; - count_vec dummy_vec; - const IndexType dummy_idx { }; + map_t row_table; + count_vec dummy_vec; + const IndexType dummy_idx { }; for (size_type i = 0; i < col_s; ++i) [[likely]] { const auto insert_res = @@ -1269,9 +1245,9 @@ remove_duplicates (const char *name1, std::min( { vec1.size(), vec2.size(), vec3.size(), vec4.size(), 
index.size() }); - map_t row_table; - count_vec dummy_vec; - const IndexType dummy_idx { }; + map_t row_table; + count_vec dummy_vec; + const IndexType dummy_idx { }; for (size_type i = 0; i < col_s; ++i) [[likely]] { const auto insert_res = @@ -1323,9 +1299,9 @@ remove_duplicates (const char *name1, std::min( { vec1.size(), vec2.size(), vec3.size(), vec4.size(), vec5.size(), index.size() }); - map_t row_table; - count_vec dummy_vec; - const IndexType dummy_idx { }; + map_t row_table; + count_vec dummy_vec; + const IndexType dummy_idx { }; for (size_type i = 0; i < col_s; ++i) [[likely]] { const auto insert_res = @@ -1382,9 +1358,9 @@ remove_duplicates (const char *name1, { vec1.size(), vec2.size(), vec3.size(), vec4.size(), vec5.size(), vec6.size(), index.size() }); - map_t row_table; - count_vec dummy_vec; - const IndexType dummy_idx { }; + map_t row_table; + count_vec dummy_vec; + const IndexType dummy_idx { }; for (size_type i = 0; i < col_s; ++i) [[likely]] { const auto insert_res = From ca58afc2796e15a99f052514f565d49ef9468be9 Mon Sep 17 00:00:00 2001 From: Hossein Moein Date: Sun, 21 Jan 2024 11:03:28 -0500 Subject: [PATCH 08/13] Added char and uchar type to types read/written from/to files --- data/csv2_format_data.csv | 58 ++--- data/sample_data.json | 4 +- data/sample_data_2.json | 4 +- data/sample_data_no_index.csv | 2 + data/sample_data_no_index.json | 4 +- data/sample_data_string_index.csv | 2 + .../DataFrame/Internals/DataFrame_read.tcc | 113 ++++++++- .../Internals/DataFrame_standalone.tcc | 6 +- test/dataframe_tester.cc | 27 +++ test/dataframe_tester_2.cc | 6 + test/dataframe_tester_output.txt | 227 +++++++++--------- 11 files changed, 304 insertions(+), 149 deletions(-) diff --git a/data/csv2_format_data.csv b/data/csv2_format_data.csv index ab32a867..54cc37eb 100644 --- a/data/csv2_format_data.csv +++ b/data/csv2_format_data.csv @@ -1,29 +1,29 @@ -INDEX:28:,ul_col:28:,dbl_col_2:26:,bool_col:6:,str_col:25:,xint_col:28:,dbl_col:27: 
-123450,123450,0.998,1,4% of something,1,1.2345 -123451,123451,0.3456,1,Description 4/5,2,2.2345 -123452,123452,0.056,1,This is bad,3,3.2345 -123450,123450,0.15678,0,3.4% of GDP,4,4.2345 -123455,123455,0.00345,0,Market drops,5,5.2345 -123450,123450,0.923,1,Market pulls back,3,3 -123449,123449,0.06743,,$15 increase,7,0.9999 -123450,123450,0.1,,Running fast,3,10 -123451,123451,0.0056,,C++14 development,9,4.25 -123450,123450,0.07865,,Some explanation,10,0.009 -123452,123452,0.0111,,More strings,3,8 -123450,123450,0.1002,,Bonds vs. Equities,2,2.2222 -123455,123455,-0.8888,,Almost done,3,3.3333 -123450,123450,0.14,,XXXX04,14,11 -123454,123454,0.0456,,XXXX2,2,5.25 -123450,123450,0.078654,,XXXX3,2,1.009 -123450,123450,-0.8999,,XXXX4,2,2.111 -123457,123457,0.8002,,XXXX4,3,9 -123458,123458,-0.9888,,XXXX5,2,3.2222 -123459,123459,0.2,,XXXX6,3,4.3333 -123450,123450,0.1056,,XXXX7,3,12 -123441,123441,0.87865,,XXXX10,3,6.25 -123442,123442,-0.6999,,XXXX11,3,2.009 -123432,123432,0.4111,,XXXX02,3,3.111 -123450,123450,0.1902,,XXXX03,36,10 -123450,123450,-0.4888,,,2,4.2222 -123435,123435,,,,45,5.3333 -123450,123450,,,,2, +INDEX:28:,ul_col:28:,dbl_col_2:26:,bool_col:6:,str_col:25:,xint_col:28:,dbl_col:27:,char_col:27:,uchar_col:27: +123450,123450,0.998,1,4% of something,1,1.2345,C,B +123451,123451,0.3456,1,Description 4/5,2,2.2345,23,B +123452,123452,0.056,1,This is bad,3,3.2345,^,& +123450,123450,0.15678,0,3.4% of GDP,4,4.2345,F,250 +123455,123455,0.00345,0,Market drops,5,5.2345,120,B +123450,123450,0.923,1,Market pulls back,3,3,78,B +123449,123449,0.06743,,$15 increase,7,0.9999,,B +123450,123450,0.1,,Running fast,3,10,C,B +123451,123451,0.0056,,C++14 development,9,4.25,C, +123450,123450,0.07865,,Some explanation,10,0.009,C,B +123452,123452,0.0111,,More strings,3,8,C,B +123450,123450,0.1002,,Bonds vs. 
Equities,2,2.2222,C,40 +123455,123455,-0.8888,,Almost done,3,3.3333,C,B +123450,123450,0.14,,XXXX04,14,11,C,B +123454,123454,0.0456,,XXXX2,2,5.25,C,B +123450,123450,0.078654,,XXXX3,2,1.009,, +123450,123450,-0.8999,,XXXX4,2,2.111,C, +123457,123457,0.8002,,XXXX4,3,9,C,B +123458,123458,-0.9888,,XXXX5,2,3.2222,,B +123459,123459,0.2,,XXXX6,3,4.3333,C,B +123450,123450,0.1056,,XXXX7,3,12,C,B +123441,123441,0.87865,,XXXX10,3,6.25,C,B +123442,123442,-0.6999,,XXXX11,3,2.009,C,B +123432,123432,0.4111,,XXXX02,3,3.111,C,B +123450,123450,0.1902,,XXXX03,36,10,C,B +123450,123450,-0.4888,,,2,4.2222,C,B +123435,123435,,,,45,5.3333,C,B +123450,123450,,,,2,,C,B diff --git a/data/sample_data.json b/data/sample_data.json index 6c4b0467..dd77b71f 100644 --- a/data/sample_data.json +++ b/data/sample_data.json @@ -6,5 +6,7 @@ "This is a test"]}, "col_2":{"N":12,"T":"double","D":[8,9,10,11,12,13,14,15,16,17,18, 777.78]}, "col_1":{"N":12,"T":"double","D":[1,2,3,4,5,6,7,8,9,10,11, - 55.55]} + 55.55]}, +"col_char":{"N":12,"T":"char","D":[8,C,F,$,,8,120,88,h,u,18, 78]}, +"col_uchar":{"N":12,"T":"uchar","D":[8,C,F,$,,8,220,88,h,&,18, 255]} } diff --git a/data/sample_data_2.json b/data/sample_data_2.json index e41a462d..7973f0b8 100644 --- a/data/sample_data_2.json +++ b/data/sample_data_2.json @@ -6,5 +6,7 @@ "This is a test"]}, "col_2_2":{"N":12,"T":"double","D":[8,9,10,11,12,13,14,15,16,17,18, 777.78]}, "col_1_2":{"N":12,"T":"double","D":[1,2,3,4,5,6,7,8,9,10,11, - 55.55]} + 55.55]}, +"col_char":{"N":12,"T":"char","D":[8,C,F,$,,8,120,88,h,u,18, 78]}, +"col_uchar":{"N":12,"T":"uchar","D":[8,C,F,$,,8,220,88,h,&,18, 255]} } diff --git a/data/sample_data_no_index.csv b/data/sample_data_no_index.csv index d3af792b..f75ff652 100644 --- a/data/sample_data_no_index.csv +++ b/data/sample_data_no_index.csv @@ -7,3 +7,5 @@ str_col_no_idx:28::XXXX10,XXXX11,XXXX01,XXXX02,XXXX03,XXXX6,XXXX7,Runnin 
dbl_col_no_idx:28::2.009,3.111,10,4.2222,5.3333,12,6.25,10,0.9999,1.2345,4.2345,3,8,3.3333,2.2345,4.25,3.2345,0.009,1.111,5.25,11,5.2345,2.2222,1.009,2.111,9,3.2222,4.3333, dbl_col_2_no_idx:28::0.87865,-0.6999,0.4111,0.1902,-0.4888,0.2,0.1056,0.1,0.06743,0.998,0.15678,0.923,0.0111,-0.8888,0.3456,0.0056,0.056,0.07865,-0.9999,0.0456,0.14,0.00345,0.1002,0.078654,-0.8999,0.01119,0.8002,-0.9888, bool_col_no_idx:28::0,1,1,1,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0, +char_col:6::C,%,120,,65,! +uchar_col:6::C,%,250,,65,! diff --git a/data/sample_data_no_index.json b/data/sample_data_no_index.json index dd25c366..2a0aecfc 100644 --- a/data/sample_data_no_index.json +++ b/data/sample_data_no_index.json @@ -5,5 +5,7 @@ "This is a test"]}, "col_2_no_idx":{"N":12,"T":"double","D":[8,9,10,11,12,13,14,15,16,17,18, 777.78]}, "col_1_no_idx":{"N":12,"T":"double","D":[1,2,3,4,5,6,7,8,9,10,11, - 55.55]} + 55.55]}, +"col_char_no_idx":{"N":12,"T":"char","D":[8,C,F,$,,8,120,88,h,u,18, 78]}, +"col_uchar_no_idx":{"N":12,"T":"uchar","D":[8,C,F,$,,8,220,88,h,&,18, 255]} } diff --git a/data/sample_data_string_index.csv b/data/sample_data_string_index.csv index 67417729..f583b23d 100644 --- a/data/sample_data_string_index.csv +++ b/data/sample_data_string_index.csv @@ -8,3 +8,5 @@ str_col:28::XXXX10,XXXX11,XXXX01,XXXX02,XXXX03,XXXX6,XXXX7,Running fast, dbl_col:28::2.009,3.111,10,4.2222,5.3333,12,6.25,10,0.9999,1.2345,4.2345,3,8,3.3333,2.2345,4.25,3.2345,0.009,1.111,5.25,11,5.2345,2.2222,1.009,2.111,9,3.2222,4.3333, dbl_col_2:28::0.87865,-0.6999,0.4111,0.1902,-0.4888,0.2,0.1056,0.1,0.06743,0.998,0.15678,0.923,0.0111,-0.8888,0.3456,0.0056,0.056,0.07865,-0.9999,0.0456,0.14,0.00345,0.1002,0.078654,-0.8999,0.01119,0.8002,-0.9888, bool_col:28::0,1,1,1,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0, +char_col:6::C,%,120,,65,! +uchar_col:6::C,%,250,,65,! 
diff --git a/include/DataFrame/Internals/DataFrame_read.tcc b/include/DataFrame/Internals/DataFrame_read.tcc index 8871d7f3..c4eba1f6 100644 --- a/include/DataFrame/Internals/DataFrame_read.tcc +++ b/include/DataFrame/Internals/DataFrame_read.tcc @@ -212,9 +212,42 @@ void DataFrame::read_json_(std::istream &stream, bool columns_only) { vec.reserve(col_size); col_vector_push_back_func_(vec, - stream, - &::strtoul, - io_format::json); + stream, + &::strtoul, + io_format::json); + } + else if (col_type == "char") { + StlVecType &vec = + create_column(col_name.c_str(), false); + + vec.reserve(col_size); + col_vector_push_back_func_>( + vec, + stream, + [](const char *tok, char **, int) -> char { + if (tok[0] == '\0' || tok[1] == '\0') + return (static_cast(int(tok[0]))); + else + return (static_cast(atoi(tok))); + }, + io_format::json); + } + else if (col_type == "uchar") { + StlVecType &vec = + create_column(col_name.c_str(), false); + + vec.reserve(col_size); + col_vector_push_back_func_>( + vec, + stream, + [](const char *tok, char **, int) -> unsigned char { + if (tok[0] == '\0' || tok[1] == '\0') + return (static_cast(int(tok[0]))); + else + return (static_cast(atoi(tok))); + }, + io_format::json); } else if (col_type == "long") { StlVecType &vec = @@ -396,6 +429,39 @@ void DataFrame::read_csv_(std::istream &stream, bool columns_only) { vec.reserve(::atoi(value.c_str())); col_vector_push_back_func_(vec, stream, &::strtoul); } + else if (type_str == "char") { + StlVecType &vec = + create_column(col_name.c_str(), false); + + vec.reserve(::atoi(value.c_str())); + col_vector_push_back_func_(vec, stream, &::strtol); + col_vector_push_back_func_>( + vec, + stream, + [](const char *tok, char **, int) -> char { + if (tok[0] == '\0' || tok[1] == '\0') + return (static_cast(int(tok[0]))); + else + return (static_cast(atoi(tok))); + }); + } + else if (type_str == "uchar") { + StlVecType &vec = + create_column(col_name.c_str(), false); + + vec.reserve(::atoi(value.c_str())); 
+ col_vector_push_back_func_(vec, stream, &::strtoul); + col_vector_push_back_func_>( + vec, + stream, + [](const char *tok, char **, int) -> unsigned char { + if (tok[0] == '\0' || tok[1] == '\0') + return (static_cast(int(tok[0]))); + else + return (static_cast(atoi(tok))); + }); + } else if (type_str == "long") { StlVecType &vec = create_column(col_name.c_str(), false); @@ -568,6 +634,16 @@ read_csv2_(std::istream &stream, type_str.c_str(), col_name.c_str(), nrows); + else if (type_str == "char") + spec_vec.emplace_back(StlVecType(), + type_str.c_str(), + col_name.c_str(), + nrows); + else if (type_str == "uchar") + spec_vec.emplace_back(StlVecType(), + type_str.c_str(), + col_name.c_str(), + nrows); else if (type_str == "long") spec_vec.emplace_back(StlVecType(), type_str.c_str(), @@ -676,6 +752,25 @@ read_csv2_(std::istream &stream, (col_spec.col_vec).push_back( (unsigned int) strtoul(value.c_str(), nullptr, 0)); } + else if (col_spec.type_spec == "char") { + if (value.size() > 1) + std::any_cast &> + (col_spec.col_vec).push_back( + static_cast(atoi(value.c_str()))); + else if (! value.empty()) + std::any_cast &> + (col_spec.col_vec).push_back(value[0]); + } + else if (col_spec.type_spec == "uchar") { + if (value.size() > 1) + std::any_cast &> + (col_spec.col_vec).push_back( + static_cast(atoi(value.c_str()))); + else if (! value.empty()) + std::any_cast &> + (col_spec.col_vec).push_back( + static_cast(value[0])); + } else if (col_spec.type_spec == "long") { if (! 
value.empty()) std::any_cast &> @@ -867,6 +962,18 @@ read_csv2_(std::istream &stream, std::move(std::any_cast &> (col_spec.col_vec)), nan_policy::dont_pad_with_nans); + else if (col_spec.type_spec == "char") + load_column( + col_spec.col_name.c_str(), + std::move(std::any_cast &> + (col_spec.col_vec)), + nan_policy::dont_pad_with_nans); + else if (col_spec.type_spec == "uchar") + load_column( + col_spec.col_name.c_str(), + std::move(std::any_cast &> + (col_spec.col_vec)), + nan_policy::dont_pad_with_nans); else if (col_spec.type_spec == "long") load_column( col_spec.col_name.c_str(), diff --git a/include/DataFrame/Internals/DataFrame_standalone.tcc b/include/DataFrame/Internals/DataFrame_standalone.tcc index 9decdff5..beb49249 100644 --- a/include/DataFrame/Internals/DataFrame_standalone.tcc +++ b/include/DataFrame/Internals/DataFrame_standalone.tcc @@ -70,7 +70,7 @@ static const std::unordered_map<_TypeInfoRef_, const char *const, _TypeinfoHasher_, - _TypeinfoEqualTo_> _typeinfo_name_ { + _TypeinfoEqualTo_> _typeinfo_name_ { { typeid(float), "float" }, { typeid(double), "double" }, { typeid(long double), "longdouble" }, @@ -82,7 +82,11 @@ std::unordered_map<_TypeInfoRef_, { typeid(unsigned long int), "ulong" }, { typeid(long long int), "longlong" }, { typeid(unsigned long long int), "ulonglong" }, + { typeid(char), "char" }, + { typeid(unsigned char), "uchar" }, { typeid(std::string), "string" }, + { typeid(const char *), "string" }, + { typeid(char *), "string" }, { typeid(bool), "bool" }, { typeid(DateTime), "DateTime" }, { typeid(std::vector), "dbl_vec" }, diff --git a/test/dataframe_tester.cc b/test/dataframe_tester.cc index 70f8561b..dd409de2 100644 --- a/test/dataframe_tester.cc +++ b/test/dataframe_tester.cc @@ -517,6 +517,8 @@ static void test_read() { unsigned long, double, std::string, + char, + unsigned char, bool>(std::cout); StdDataFrame df_read_str; @@ -532,6 +534,8 @@ static void test_read() { unsigned long, double, std::string, + char, + unsigned 
char, bool>(std::cout); StdDataFrame df_read_dt; @@ -547,6 +551,8 @@ static void test_read() { unsigned long, double, std::string, + char, + unsigned char, bool>(std::cout); } @@ -1262,6 +1268,8 @@ static void test_dataframe_friend_plus_operator() { int, double, std::string, + char, + unsigned char, bool>(df1, df2); std::cout << "Original DF1:" << std::endl; @@ -1270,6 +1278,8 @@ static void test_dataframe_friend_plus_operator() { unsigned long, double, std::string, + char, + unsigned char, bool>(std::cout); std::cout << "Original DF2:" << std::endl; df2.write(std::cout); std::cout << "Result DF:" << std::endl; result.write(std::cout); } @@ -1311,6 +1325,8 @@ static void test_dataframe_friend_minus_operator() { unsigned long, int, double, + char, + unsigned char, bool>(df1, df2); std::cout << "Original DF1:" << std::endl; @@ -1319,6 +1335,8 @@ static void test_dataframe_friend_minus_operator() { unsigned long, double, std::string, + char, + unsigned char, bool>(std::cout); std::cout << "Original DF2:" << std::endl; df2.write(std::cout); } @@ -3333,6 +3353,13 @@ static void test_reading_writing_json() { assert(df.get_column("col_3").size() == 12); assert(df.get_column("col_3")[3] == 18.0); assert(df.get_column("col_3")[11] == 555.543); + + assert(df.get_column("col_char")[11] == 78); + assert(df.get_column("col_char")[2] == 'F'); + assert(df.get_column("col_char")[8] == 'h'); + assert(df.get_column("col_uchar")[11] == 255); + assert(df.get_column("col_uchar")[7] == 88); + assert(df.get_column("col_uchar")[9] == '&'); } catch (const DataFrameError &ex) { std::cout << ex.what() << std::endl; diff --git a/test/dataframe_tester_2.cc b/test/dataframe_tester_2.cc index 73b30126..d875a2f0 100644 --- a/test/dataframe_tester_2.cc +++ b/test/dataframe_tester_2.cc @@ -3105,6 +3105,8 @@ static void test_no_index_reads() { unsigned long, double, bool, + char, + unsigned char, std::string>(std::cout, io_format::csv2); std::cout << '\n' << std::endl; @@ -3116,6 +3118,8 @@ static 
void test_no_index_reads() { unsigned long, double, bool, + char, + unsigned char, std::string>(std::cout, io_format::csv2); std::cout << '\n' << std::endl; @@ -3127,6 +3131,8 @@ static void test_no_index_reads() { unsigned long, double, bool, + char, + unsigned char, std::string>(std::cout, io_format::csv2); } catch (const DataFrameError &ex) { diff --git a/test/dataframe_tester_output.txt b/test/dataframe_tester_output.txt index c90c456c..f6986af6 100644 --- a/test/dataframe_tester_output.txt +++ b/test/dataframe_tester_output.txt @@ -93,6 +93,7 @@ str_col:28::XXXX10,XXXX11,XXXX01,XXXX02,XXXX03,XXXX6,XXXX7,Running fast, dbl_col:28::2.009,3.111,10,4.2222,5.3333,12,6.25,10,0.9999,1.2345,4.2345,3,8,3.3333,2.2345,4.25,3.2345,0.009,1.111,5.25,11,5.2345,2.2222,1.009,2.111,9,3.2222,4.3333, dbl_col_2:28::0.87865,-0.6999,0.4111,0.1902,-0.4888,0.2,0.1056,0.1,0.06743,0.998,0.15678,0.923,0.0111,-0.8888,0.3456,0.0056,0.056,0.07865,-0.9999,0.0456,0.14,0.00345,0.1002,0.078654,-0.8999,0.01119,0.8002,-0.9888, bool_col:28::0,1,1,1,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0, +char_col:11::,,x,,A,,,ú,,A,, INDEX:28::1547825036.3,1516179600.874123908,1516093200.234,1516006800.234098,1515920400.2309,1515834000.89,1515747600.123456789,1515661200.12309,1515574800.4562387,1515488400.2345609,1515402000.78,1515315600.340987645,1515229200.309812765,1515142800.93451984,1515056400.671092346,1514970000.450137234,1514883600.91256923,1514797200.67,1514624400.4562,1514538000.5,1514451600.0,1514365200.896120945,1514278800.783452098,378205200000.561209834,409741200000.346,441277200000.340987,472899600.0,504435600.871234561, ul_col:28::123450,123451,123452,123450,123455,123450,123449,123448,123451,123452,123452,123450,123455,123450,123454,123453,123456,123457,123458,123459,123460,123441,123442,123432,123433,123434,123435,123436, @@ -744,12 +745,12 @@ col_3:1::15, col_str:1::11, col_4:1::22, -INDEX:1::123451, -col_1:1::2, -col_2:1::9, -col_3:1::16, -col_str:1::22, -col_4:1::23, +INDEX:1::123450, 
+col_1:1::1, +col_2:1::8, +col_3:1::15, +col_str:1::11, +col_4:1::22, INDEX:1::123452, col_1:1::3, @@ -1141,35 +1142,35 @@ INDEX:28:,ul_col:28:,xint_col:28:,str_col:25:,dbl_col 123435,123435,45,,5.3333,, 123450,123450,2,,,, -INDEX:28:,ul_col:28:,dbl_col_2:26:,bool_col:6:,str_col:28:,xint_col:28:,dbl_col:27: -123450,123450,0.998,1,4% of something,1,1.2345 -123451,123451,0.3456,1,Description 4/5,2,2.2345 -123452,123452,0.056,1,This is bad,3,3.2345 -123450,123450,0.15678,0,3.4% of GDP,4,4.2345 -123455,123455,0.00345,0,Market drops,5,5.2345 -123450,123450,0.923,1,Market pulls back,3,3 -123449,123449,0.06743,,$15 increase,7,0.9999 -123450,123450,0.1,,Running fast,3,10 -123451,123451,0.0056,,C++14 development,9,4.25 -123450,123450,0.07865,,Some explanation,10,0.009 -123452,123452,0.0111,,More strings,3,8 -123450,123450,0.1002,,Bonds vs. Equities,2,2.2222 -123455,123455,-0.8888,,Almost done,3,3.3333 -123450,123450,0.14,,XXXX04,14,11 -123454,123454,0.0456,,XXXX2,2,5.25 -123450,123450,0.078654,,XXXX3,2,1.009 -123450,123450,-0.8999,,XXXX4,2,2.111 -123457,123457,0.8002,,XXXX4,3,9 -123458,123458,-0.9888,,XXXX5,2,3.2222 -123459,123459,0.2,,XXXX6,3,4.3333 -123450,123450,0.1056,,XXXX7,3,12 -123441,123441,0.87865,,XXXX10,3,6.25 -123442,123442,-0.6999,,XXXX11,3,2.009 -123432,123432,0.4111,,XXXX02,3,3.111 -123450,123450,0.1902,,XXXX03,36,10 -123450,123450,-0.4888,,,2,4.2222 -123435,123435,,,,45,5.3333 -123450,123450,,,,2, +INDEX:28:,ul_col:28:,dbl_col_2:26:,bool_col:6:,str_col:28:,xint_col:28:,dbl_col:27:,, +123450,123450,0.998,1,4% of something,1,1.2345,, +123451,123451,0.3456,1,Description 4/5,2,2.2345,, +123452,123452,0.056,1,This is bad,3,3.2345,, +123450,123450,0.15678,0,3.4% of GDP,4,4.2345,, +123455,123455,0.00345,0,Market drops,5,5.2345,, +123450,123450,0.923,1,Market pulls back,3,3,, +123449,123449,0.06743,,$15 increase,7,0.9999,, +123450,123450,0.1,,Running fast,3,10,, +123451,123451,0.0056,,C++14 development,9,4.25,, +123450,123450,0.07865,,Some explanation,10,0.009,, 
+123452,123452,0.0111,,More strings,3,8,, +123450,123450,0.1002,,Bonds vs. Equities,2,2.2222,, +123455,123455,-0.8888,,Almost done,3,3.3333,, +123450,123450,0.14,,XXXX04,14,11,, +123454,123454,0.0456,,XXXX2,2,5.25,, +123450,123450,0.078654,,XXXX3,2,1.009,, +123450,123450,-0.8999,,XXXX4,2,2.111,, +123457,123457,0.8002,,XXXX4,3,9,, +123458,123458,-0.9888,,XXXX5,2,3.2222,, +123459,123459,0.2,,XXXX6,3,4.3333,, +123450,123450,0.1056,,XXXX7,3,12,, +123441,123441,0.87865,,XXXX10,3,6.25,, +123442,123442,-0.6999,,XXXX11,3,2.009,, +123432,123432,0.4111,,XXXX02,3,3.111,, +123450,123450,0.1902,,XXXX03,36,10,, +123450,123450,-0.4888,,,2,4.2222,, +123435,123435,,,,45,5.3333,, +123450,123450,,,,2,,, Testing BoxCoxVisitor{ } ... @@ -1340,83 +1341,83 @@ ul_col:28:,xint_col:28:,str_col:25:,dbl_col:27:,dbl_ } Testing no_index_reads ... -INDEX:28:,ul_col:28:,dbl_col_2:26:,bool_col:6:,str_col:28:,xint_col:28:,dbl_col:27:,ul_col_2:28:,dbl_col_2_2:26:,bool_col_2:6:,str_col_2:28:,xint_col_2:28:,dbl_col_3_2:27:,ul_col_no_idx:28:,dbl_col_2_no_idx:26:,bool_col_no_idx:6:,str_col_no_idx:28:,xint_col_no_idx:28:,dbl_col_no_idx:27: -123450,123450,0.998,1,4% of something,1,1.2345,123450,0.998,1,4% of something,1,1.2345,123450,0.998,1,4% of something,1,1.2345 -123451,123451,0.3456,1,Description 4/5,2,2.2345,123451,0.3456,1,Description 4/5,2,2.2345,123451,0.3456,1,Description 4/5,2,2.2345 -123452,123452,0.056,1,This is bad,3,3.2345,123452,0.056,1,This is bad,3,3.2345,123452,0.056,1,This is bad,3,3.2345 -123450,123450,0.15678,0,3.4% of GDP,4,4.2345,123450,0.15678,0,3.4% of GDP,4,4.2345,123450,0.15678,0,3.4% of GDP,4,4.2345 -123455,123455,0.00345,0,Market drops,5,5.2345,123455,0.00345,0,Market drops,5,5.2345,123455,0.00345,0,Market drops,5,5.2345 -123450,123450,0.923,1,Market pulls back,3,3,123450,0.923,1,Market pulls back,3,3,123450,0.923,1,Market pulls back,3,3 -123449,123449,0.06743,,$15 increase,7,0.9999,123449,0.06743,,$15 increase,7,0.9999,123449,0.06743,,$15 increase,7,0.9999 
-123450,123450,0.1,,Running fast,3,10,123450,0.1,,Running fast,3,10,123450,0.1,,Running fast,3,10 -123451,123451,0.0056,,C++14 development,9,4.25,123451,0.0056,,C++14 development,9,4.25,123451,0.0056,,C++14 development,9,4.25 -123450,123450,0.07865,,Some explanation,10,0.009,123450,0.07865,,Some explanation,10,0.009,123450,0.07865,,Some explanation,10,0.009 -123452,123452,0.0111,,More strings,3,8,123452,0.0111,,More strings,3,8,123452,0.0111,,More strings,3,8 -123450,123450,0.1002,,Bonds vs. Equities,2,2.2222,123450,0.1002,,Bonds vs. Equities,2,2.2222,123450,0.1002,,Bonds vs. Equities,2,2.2222 -123455,123455,-0.8888,,Almost done,3,3.3333,123455,-0.8888,,Almost done,3,3.3333,123455,-0.8888,,Almost done,3,3.3333 -123450,123450,0.14,,XXXX04,14,11,123450,0.14,,XXXX04,14,11,123450,0.14,,XXXX04,14,11 -123454,123454,0.0456,,XXXX2,2,5.25,123454,0.0456,,XXXX2,2,5.25,123454,0.0456,,XXXX2,2,5.25 -123450,123450,0.078654,,XXXX3,2,1.009,123450,0.078654,,XXXX3,2,1.009,123450,0.078654,,XXXX3,2,1.009 -123450,123450,-0.8999,,XXXX4,2,2.111,123450,-0.8999,,XXXX4,2,2.111,123450,-0.8999,,XXXX4,2,2.111 -123457,123457,0.8002,,XXXX4,3,9,123457,0.8002,,XXXX4,3,9,123457,0.8002,,XXXX4,3,9 -123458,123458,-0.9888,,XXXX5,2,3.2222,123458,-0.9888,,XXXX5,2,3.2222,123458,-0.9888,,XXXX5,2,3.2222 -123459,123459,0.2,,XXXX6,3,4.3333,123459,0.2,,XXXX6,3,4.3333,123459,0.2,,XXXX6,3,4.3333 -123450,123450,0.1056,,XXXX7,3,12,123450,0.1056,,XXXX7,3,12,123450,0.1056,,XXXX7,3,12 -123441,123441,0.87865,,XXXX10,3,6.25,123441,0.87865,,XXXX10,3,6.25,123441,0.87865,,XXXX10,3,6.25 -123442,123442,-0.6999,,XXXX11,3,2.009,123442,-0.6999,,XXXX11,3,2.009,123442,-0.6999,,XXXX11,3,2.009 -123432,123432,0.4111,,XXXX02,3,3.111,123432,0.4111,,XXXX02,3,3.111,123432,0.4111,,XXXX02,3,3.111 -123450,123450,0.1902,,XXXX03,36,10,123450,0.1902,,XXXX03,36,10,123450,0.1902,,XXXX03,36,10 -123450,123450,-0.4888,,,2,4.2222,123450,-0.4888,,,2,4.2222,123450,-0.4888,,,2,4.2222 -123435,123435,,,,45,5.3333,123435,,,,45,5.3333,123435,,,,45,5.3333 
-123450,123450,,,,2,,123450,,,,2,,123450,,,,2, - - - -INDEX:28:,ul_col:28:,xint_col:28:,str_col:28:,dbl_col:28:,dbl_col_2:28:,bool_col:28:,ul_col_2:28:,xint_col_2:28:,str_col_2:28:,dbl_col_3:28:,dbl_col_2_2:28:,bool_col_2:28:,ul_col_no_idx:28:,xint_col_no_idx:28:,str_col_no_idx:28:,dbl_col_no_idx:28:,dbl_col_2_no_idx:28:,bool_col_no_idx:28: -123432,123450,35,XXXX10,2.009,0.87865,0,123450,35,XXXX10,2.009,0.87865,0,123450,35,XXXX10,2.009,0.87865,0 -123433,123451,36,XXXX11,3.111,-0.6999,1,123451,36,XXXX11,3.111,-0.6999,1,123451,36,XXXX11,3.111,-0.6999,1 -123434,123452,40,XXXX01,10,0.4111,1,123452,40,XXXX01,10,0.4111,1,123452,40,XXXX01,10,0.4111,1 -123435,123450,45,XXXX02,4.2222,0.1902,1,123450,45,XXXX02,4.2222,0.1902,1,123450,45,XXXX02,4.2222,0.1902,1 -123436,123455,46,XXXX03,5.3333,-0.4888,0,123455,46,XXXX03,5.3333,-0.4888,0,123455,46,XXXX03,5.3333,-0.4888,0 -123441,123450,33,XXXX6,12,0.2,0,123450,33,XXXX6,12,0.2,0,123450,33,XXXX6,12,0.2,0 -123442,123449,34,XXXX7,6.25,0.1056,1,123449,34,XXXX7,6.25,0.1056,1,123449,34,XXXX7,6.25,0.1056,1 -123448,123448,8,Running fast,10,0.1,0,123448,8,Running fast,10,0.1,0,123448,8,Running fast,10,0.1,0 -123449,123451,7,$15 increase,0.9999,0.06743,0,123451,7,$15 increase,0.9999,0.06743,0,123451,7,$15 increase,0.9999,0.06743,0 -123450,123452,1,4% of something,1.2345,0.998,0,123452,1,4% of something,1.2345,0.998,0,123452,1,4% of something,1.2345,0.998,0 -123450,123452,4,3.4% of GDP,4.2345,0.15678,1,123452,4,3.4% of GDP,4.2345,0.15678,1,123452,4,3.4% of GDP,4.2345,0.15678,1 -123450,123450,6,Market pulls back,3,0.923,0,123450,6,Market pulls back,3,0.923,0,123450,6,Market pulls back,3,0.923,0 -123450,123455,12,Bonds vs. Equities,8,0.0111,1,123455,12,Bonds vs. Equities,8,0.0111,1,123455,12,Bonds vs. 
Equities,8,0.0111,1 -123450,123450,14,Here comes the sun,3.3333,-0.8888,0,123450,14,Here comes the sun,3.3333,-0.8888,0,123450,14,Here comes the sun,3.3333,-0.8888,0 -123451,123454,2,Description 4/5,2.2345,0.3456,0,123454,2,Description 4/5,2.2345,0.3456,0,123454,2,Description 4/5,2.2345,0.3456,0 -123451,123453,9,C++14 development,4.25,0.0056,0,123453,9,C++14 development,4.25,0.0056,0,123453,9,C++14 development,4.25,0.0056,0 -123452,123456,3,This is bad,3.2345,0.056,0,123456,3,This is bad,3.2345,0.056,0,123456,3,This is bad,3.2345,0.056,0 -123452,123457,10,Some explanation,0.009,0.07865,0,123457,10,Some explanation,0.009,0.07865,0,123457,10,Some explanation,0.009,0.07865,0 -123452,123458,11,More strings,1.111,-0.9999,0,123458,11,More strings,1.111,-0.9999,0,123458,11,More strings,1.111,-0.9999,0 -123453,123459,20,XXXX04,5.25,0.0456,0,123459,20,XXXX04,5.25,0.0456,0,123459,20,XXXX04,5.25,0.0456,0 -123454,123460,15,XXXX1,11,0.14,1,123460,15,XXXX1,11,0.14,1,123460,15,XXXX1,11,0.14,1 -123455,123441,5,Market drops,5.2345,0.00345,0,123441,5,Market drops,5.2345,0.00345,0,123441,5,Market drops,5.2345,0.00345,0 -123455,123442,13,Almost done,2.2222,0.1002,1,123442,13,Almost done,2.2222,0.1002,1,123442,13,Almost done,2.2222,0.1002,1 -123456,123432,22,XXXX2,1.009,0.078654,0,123432,22,XXXX2,1.009,0.078654,0,123432,22,XXXX2,1.009,0.078654,0 -123457,123433,23,XXXX3,2.111,-0.8999,0,123433,23,XXXX3,2.111,-0.8999,0,123433,23,XXXX3,2.111,-0.8999,0 -123458,123434,24,XXXX4,9,0.01119,1,123434,24,XXXX4,9,0.01119,1,123434,24,XXXX4,9,0.01119,1 -123459,123435,25,XXXX4,3.2222,0.8002,0,123435,25,XXXX4,3.2222,0.8002,0,123435,25,XXXX4,3.2222,0.8002,0 -123460,123436,30,XXXX5,4.3333,-0.9888,0,123436,30,XXXX5,4.3333,-0.9888,0,123436,30,XXXX5,4.3333,-0.9888,0 - - - -INDEX:12:,col_3:12:,col_4:6:,col_str:12:,col_2:12:,col_1:12:,col_3_2:12:,col_4_2:6:,col_str_2:12:,col_2_2:12:,col_1_2:12:,col_3_no_idx:12:,col_4_no_idx:6:,col_str_no_idx:12:,col_2_no_idx:12:,col_1_no_idx:12: 
-123450,15,22,11,8,1,15,22,11,8,1,15,22,11,8,1 -123451,16,23,22,9,2,16,23,22,9,2,16,23,22,9,2 -123452,17,24,33,10,3,17,24,33,10,3,17,24,33,10,3 -123453,18,25,aa,11,4,18,25,aa,11,4,18,25,aa,11,4 -123454,19,26,bb,12,5,19,26,bb,12,5,19,26,bb,12,5 -123455,20,27,cc,13,6,20,27,cc,13,6,20,27,cc,13,6 -123456,21,,dd,14,7,21,,dd,14,7,21,,dd,14,7 -123457,22,,tt,15,8,22,,tt,15,8,22,,tt,15,8 -123458,23,,uu,16,9,23,,uu,16,9,23,,uu,16,9 -123459,24,,ii,17,10,24,,ii,17,10,24,,ii,17,10 -123460,25,,88,18,11,25,,88,18,11,25,,88,18,11 -555555,555.543,,This is a test,777.78,55.55,555.543,,This is a test,777.78,55.55,555.543,,This is a test,777.78,55.55 +INDEX:28:,ul_col:28:,dbl_col_2:26:,bool_col:6:,str_col:28:,xint_col:28:,dbl_col:27:,char_col:25:,uchar_col:25:,ul_col_2:28:,dbl_col_2_2:26:,bool_col_2:6:,str_col_2:28:,xint_col_2:28:,dbl_col_3_2:27:,ul_col_no_idx:28:,dbl_col_2_no_idx:26:,bool_col_no_idx:6:,str_col_no_idx:28:,xint_col_no_idx:28:,dbl_col_no_idx:27: +123450,123450,0.998,1,4% of something,1,1.2345,C,B,123450,0.998,1,4% of something,1,1.2345,123450,0.998,1,4% of something,1,1.2345 +123451,123451,0.3456,1,Description 4/5,2,2.2345,,B,123451,0.3456,1,Description 4/5,2,2.2345,123451,0.3456,1,Description 4/5,2,2.2345 +123452,123452,0.056,1,This is bad,3,3.2345,^,&,123452,0.056,1,This is bad,3,3.2345,123452,0.056,1,This is bad,3,3.2345 +123450,123450,0.15678,0,3.4% of GDP,4,4.2345,F,ú,123450,0.15678,0,3.4% of GDP,4,4.2345,123450,0.15678,0,3.4% of GDP,4,4.2345 +123455,123455,0.00345,0,Market drops,5,5.2345,x,B,123455,0.00345,0,Market drops,5,5.2345,123455,0.00345,0,Market drops,5,5.2345 +123450,123450,0.923,1,Market pulls back,3,3,N,B,123450,0.923,1,Market pulls back,3,3,123450,0.923,1,Market pulls back,3,3 +123449,123449,0.06743,,$15 increase,7,0.9999,C,B,123449,0.06743,,$15 increase,7,0.9999,123449,0.06743,,$15 increase,7,0.9999 +123450,123450,0.1,,Running fast,3,10,C,B,123450,0.1,,Running fast,3,10,123450,0.1,,Running fast,3,10 +123451,123451,0.0056,,C++14 
development,9,4.25,C,B,123451,0.0056,,C++14 development,9,4.25,123451,0.0056,,C++14 development,9,4.25 +123450,123450,0.07865,,Some explanation,10,0.009,C,B,123450,0.07865,,Some explanation,10,0.009,123450,0.07865,,Some explanation,10,0.009 +123452,123452,0.0111,,More strings,3,8,C,(,123452,0.0111,,More strings,3,8,123452,0.0111,,More strings,3,8 +123450,123450,0.1002,,Bonds vs. Equities,2,2.2222,C,B,123450,0.1002,,Bonds vs. Equities,2,2.2222,123450,0.1002,,Bonds vs. Equities,2,2.2222 +123455,123455,-0.8888,,Almost done,3,3.3333,C,B,123455,-0.8888,,Almost done,3,3.3333,123455,-0.8888,,Almost done,3,3.3333 +123450,123450,0.14,,XXXX04,14,11,C,B,123450,0.14,,XXXX04,14,11,123450,0.14,,XXXX04,14,11 +123454,123454,0.0456,,XXXX2,2,5.25,C,B,123454,0.0456,,XXXX2,2,5.25,123454,0.0456,,XXXX2,2,5.25 +123450,123450,0.078654,,XXXX3,2,1.009,C,B,123450,0.078654,,XXXX3,2,1.009,123450,0.078654,,XXXX3,2,1.009 +123450,123450,-0.8999,,XXXX4,2,2.111,C,B,123450,-0.8999,,XXXX4,2,2.111,123450,-0.8999,,XXXX4,2,2.111 +123457,123457,0.8002,,XXXX4,3,9,C,B,123457,0.8002,,XXXX4,3,9,123457,0.8002,,XXXX4,3,9 +123458,123458,-0.9888,,XXXX5,2,3.2222,C,B,123458,-0.9888,,XXXX5,2,3.2222,123458,-0.9888,,XXXX5,2,3.2222 +123459,123459,0.2,,XXXX6,3,4.3333,C,B,123459,0.2,,XXXX6,3,4.3333,123459,0.2,,XXXX6,3,4.3333 +123450,123450,0.1056,,XXXX7,3,12,C,B,123450,0.1056,,XXXX7,3,12,123450,0.1056,,XXXX7,3,12 +123441,123441,0.87865,,XXXX10,3,6.25,C,B,123441,0.87865,,XXXX10,3,6.25,123441,0.87865,,XXXX10,3,6.25 +123442,123442,-0.6999,,XXXX11,3,2.009,C,B,123442,-0.6999,,XXXX11,3,2.009,123442,-0.6999,,XXXX11,3,2.009 +123432,123432,0.4111,,XXXX02,3,3.111,C,B,123432,0.4111,,XXXX02,3,3.111,123432,0.4111,,XXXX02,3,3.111 +123450,123450,0.1902,,XXXX03,36,10,C,B,123450,0.1902,,XXXX03,36,10,123450,0.1902,,XXXX03,36,10 +123450,123450,-0.4888,,,2,4.2222,,,123450,-0.4888,,,2,4.2222,123450,-0.4888,,,2,4.2222 +123435,123435,,,,45,5.3333,,,123435,,,,45,5.3333,123435,,,,45,5.3333 +123450,123450,,,,2,,,,123450,,,,2,,123450,,,,2, + + + 
+INDEX:28:,ul_col:28:,xint_col:28:,str_col:28:,dbl_col:28:,dbl_col_2:28:,bool_col:28:,ul_col_2:28:,xint_col_2:28:,str_col_2:28:,dbl_col_3:28:,dbl_col_2_2:28:,bool_col_2:28:,ul_col_no_idx:28:,xint_col_no_idx:28:,str_col_no_idx:28:,dbl_col_no_idx:28:,dbl_col_2_no_idx:28:,bool_col_no_idx:28:,char_col:11: +123432,123450,35,XXXX10,2.009,0.87865,0,123450,35,XXXX10,2.009,0.87865,0,123450,35,XXXX10,2.009,0.87865,0, +123433,123451,36,XXXX11,3.111,-0.6999,1,123451,36,XXXX11,3.111,-0.6999,1,123451,36,XXXX11,3.111,-0.6999,1, +123434,123452,40,XXXX01,10,0.4111,1,123452,40,XXXX01,10,0.4111,1,123452,40,XXXX01,10,0.4111,1,x +123435,123450,45,XXXX02,4.2222,0.1902,1,123450,45,XXXX02,4.2222,0.1902,1,123450,45,XXXX02,4.2222,0.1902,1, +123436,123455,46,XXXX03,5.3333,-0.4888,0,123455,46,XXXX03,5.3333,-0.4888,0,123455,46,XXXX03,5.3333,-0.4888,0,A +123441,123450,33,XXXX6,12,0.2,0,123450,33,XXXX6,12,0.2,0,123450,33,XXXX6,12,0.2,0, +123442,123449,34,XXXX7,6.25,0.1056,1,123449,34,XXXX7,6.25,0.1056,1,123449,34,XXXX7,6.25,0.1056,1, +123448,123448,8,Running fast,10,0.1,0,123448,8,Running fast,10,0.1,0,123448,8,Running fast,10,0.1,0,ú +123449,123451,7,$15 increase,0.9999,0.06743,0,123451,7,$15 increase,0.9999,0.06743,0,123451,7,$15 increase,0.9999,0.06743,0, +123450,123452,1,4% of something,1.2345,0.998,0,123452,1,4% of something,1.2345,0.998,0,123452,1,4% of something,1.2345,0.998,0,A +123450,123452,4,3.4% of GDP,4.2345,0.15678,1,123452,4,3.4% of GDP,4.2345,0.15678,1,123452,4,3.4% of GDP,4.2345,0.15678,1, +123450,123450,6,Market pulls back,3,0.923,0,123450,6,Market pulls back,3,0.923,0,123450,6,Market pulls back,3,0.923,0, +123450,123455,12,Bonds vs. Equities,8,0.0111,1,123455,12,Bonds vs. Equities,8,0.0111,1,123455,12,Bonds vs. 
Equities,8,0.0111,1, +123450,123450,14,Here comes the sun,3.3333,-0.8888,0,123450,14,Here comes the sun,3.3333,-0.8888,0,123450,14,Here comes the sun,3.3333,-0.8888,0, +123451,123454,2,Description 4/5,2.2345,0.3456,0,123454,2,Description 4/5,2.2345,0.3456,0,123454,2,Description 4/5,2.2345,0.3456,0, +123451,123453,9,C++14 development,4.25,0.0056,0,123453,9,C++14 development,4.25,0.0056,0,123453,9,C++14 development,4.25,0.0056,0, +123452,123456,3,This is bad,3.2345,0.056,0,123456,3,This is bad,3.2345,0.056,0,123456,3,This is bad,3.2345,0.056,0, +123452,123457,10,Some explanation,0.009,0.07865,0,123457,10,Some explanation,0.009,0.07865,0,123457,10,Some explanation,0.009,0.07865,0, +123452,123458,11,More strings,1.111,-0.9999,0,123458,11,More strings,1.111,-0.9999,0,123458,11,More strings,1.111,-0.9999,0, +123453,123459,20,XXXX04,5.25,0.0456,0,123459,20,XXXX04,5.25,0.0456,0,123459,20,XXXX04,5.25,0.0456,0, +123454,123460,15,XXXX1,11,0.14,1,123460,15,XXXX1,11,0.14,1,123460,15,XXXX1,11,0.14,1, +123455,123441,5,Market drops,5.2345,0.00345,0,123441,5,Market drops,5.2345,0.00345,0,123441,5,Market drops,5.2345,0.00345,0, +123455,123442,13,Almost done,2.2222,0.1002,1,123442,13,Almost done,2.2222,0.1002,1,123442,13,Almost done,2.2222,0.1002,1, +123456,123432,22,XXXX2,1.009,0.078654,0,123432,22,XXXX2,1.009,0.078654,0,123432,22,XXXX2,1.009,0.078654,0, +123457,123433,23,XXXX3,2.111,-0.8999,0,123433,23,XXXX3,2.111,-0.8999,0,123433,23,XXXX3,2.111,-0.8999,0, +123458,123434,24,XXXX4,9,0.01119,1,123434,24,XXXX4,9,0.01119,1,123434,24,XXXX4,9,0.01119,1, +123459,123435,25,XXXX4,3.2222,0.8002,0,123435,25,XXXX4,3.2222,0.8002,0,123435,25,XXXX4,3.2222,0.8002,0, +123460,123436,30,XXXX5,4.3333,-0.9888,0,123436,30,XXXX5,4.3333,-0.9888,0,123436,30,XXXX5,4.3333,-0.9888,0, + + + 
+INDEX:12:,col_3:12:,col_4:6:,col_str:12:,col_2:12:,col_1:12:,col_char:12:,col_uchar:12:,col_3_2:12:,col_4_2:6:,col_str_2:12:,col_2_2:12:,col_1_2:12:,col_3_no_idx:12:,col_4_no_idx:6:,col_str_no_idx:12:,col_2_no_idx:12:,col_1_no_idx:12:,col_char_no_idx:12:,col_uchar_no_idx:12: +123450,15,22,11,8,1,8,8,15,22,11,8,1,15,22,11,8,1,8,8 +123451,16,23,22,9,2,C,C,16,23,22,9,2,16,23,22,9,2,C,C +123452,17,24,33,10,3,F,F,17,24,33,10,3,17,24,33,10,3,F,F +123453,18,25,aa,11,4,$,$,18,25,aa,11,4,18,25,aa,11,4,$,$ +123454,19,26,bb,12,5,,,19,26,bb,12,5,19,26,bb,12,5,, +123455,20,27,cc,13,6,8,8,20,27,cc,13,6,20,27,cc,13,6,8,8 +123456,21,,dd,14,7,x,Ü,21,,dd,14,7,21,,dd,14,7,x,Ü +123457,22,,tt,15,8,X,X,22,,tt,15,8,22,,tt,15,8,X,X +123458,23,,uu,16,9,h,h,23,,uu,16,9,23,,uu,16,9,h,h +123459,24,,ii,17,10,u,&,24,,ii,17,10,24,,ii,17,10,u,& +123460,25,,88,18,11,,,25,,88,18,11,25,,88,18,11,, +555555,555.543,,This is a test,777.78,55.55,N,ÿ,555.543,,This is a test,777.78,55.55,555.543,,This is a test,777.78,55.55,N,ÿ Testing KamaVisitor{ } ... @@ -2025,7 +2026,7 @@ INDEX:10:,string col:10:,Cool Column:10:,numbers:10: Date: Mon, 22 Jan 2024 13:56:04 -0500 Subject: [PATCH 09/13] Added docs for read/write of char from/to files --- docs/HTML/read.html | 20 ++++++--- docs/HTML/write.html | 20 ++++++--- .../DataFrame/Internals/DataFrame_misc.tcc | 4 +- .../Internals/DataFrame_standalone.tcc | 45 +++++++++++++++++++ test/dataframe_tester_output.txt | 44 +++++++++--------- 5 files changed, 95 insertions(+), 38 deletions(-) diff --git a/docs/HTML/read.html b/docs/HTML/read.html index cc806e23..8b195352 100644 --- a/docs/HTML/read.html +++ b/docs/HTML/read.html @@ -85,17 +85,23 @@
    In all formats the following data types are supported:
    -          float
    -          double
    +          float       -- float
    +          double      -- double
               longdouble  -- long double
    -          int
    +          short       -- short int
    +          ushort      -- unsigned short int
    +          int         -- int
               uint        -- unsigned int
    -          long
    +          long        -- long int
               longlong    -- long long int
    -          ulong       -- unsigned long
    +          ulong       -- unsigned long int
               ulonglong   -- unsigned long long int
    -          string
    -          bool
    +          char        -- char
    +          uchar       -- unsigned char
    +          string      -- std::string
    +          string      -- const char *
    +          string      -- char *
    +          bool        -- bool
               DateTime    -- DateTime data in format of <Epoch seconds>.<nanoseconds> (1516179600.874123908)
             
    In case of io_format::csv2 the following additional types are also supported: diff --git a/docs/HTML/write.html b/docs/HTML/write.html index 86f607ef..4d334593 100644 --- a/docs/HTML/write.html +++ b/docs/HTML/write.html @@ -86,17 +86,23 @@
    In all formats the following data types are supported:
    -          float
    -          double
    +          float       -- float
    +          double      -- double
               longdouble  -- long double
    -          int
    +          short       -- short int
    +          ushort      -- unsigned short int
    +          int         -- int
               uint        -- unsigned int
    -          long
    +          long        -- long int
               longlong    -- long long int
    -          ulong       -- unsigned long
    +          ulong       -- unsigned long int
               ulonglong   -- unsigned long long int
    -          string
    -          bool
    +          char        -- char
    +          uchar       -- unsigned char
    +          string      -- std::string
    +          string      -- const char *
    +          string      -- char *
    +          bool        -- bool
               DateTime    -- DateTime data in format of <Epoch seconds>.<nanoseconds> (1516179600.874123908)
             
    In case of io_format::csv2 the following additional types are also supported: diff --git a/include/DataFrame/Internals/DataFrame_misc.tcc b/include/DataFrame/Internals/DataFrame_misc.tcc index 4b7b8a10..7359227c 100644 --- a/include/DataFrame/Internals/DataFrame_misc.tcc +++ b/include/DataFrame/Internals/DataFrame_misc.tcc @@ -200,7 +200,7 @@ DataFrame::print_csv_functor_::operator() (const T &vec) { if (vec_size > 0) { for (long i = sr; i < er; ++i) - os << vec[i] << ','; + _write_csv_df_index_(os, vec[i]) << ','; } os << '\n'; @@ -266,7 +266,7 @@ template void DataFrame:: print_csv2_data_functor_::operator() (const T &vec) { - if (vec.size() > index) os << vec[index]; + if (vec.size() > index) _write_csv_df_index_(os, vec[index]); return; } diff --git a/include/DataFrame/Internals/DataFrame_standalone.tcc b/include/DataFrame/Internals/DataFrame_standalone.tcc index beb49249..441b9d70 100644 --- a/include/DataFrame/Internals/DataFrame_standalone.tcc +++ b/include/DataFrame/Internals/DataFrame_standalone.tcc @@ -32,6 +32,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include +#include #include #include #include @@ -582,6 +583,28 @@ inline static S &_write_json_df_index_(S &o, const std::string &value) { // ---------------------------------------------------------------------------- +template +inline static S &_write_json_df_index_(S &o, char value) { + + if (std::isprint(value)) + return (o << value); + else + return (o << static_cast(value)); +} + +// ---------------------------------------------------------------------------- + +template +inline static S &_write_json_df_index_(S &o, unsigned char value) { + + if (std::isprint(value)) + return (o << value); + else + return (o << static_cast(value)); +} + +// ---------------------------------------------------------------------------- + inline static void _get_token_from_file_ (std::istream &file, char delim, @@ -830,6 +853,28 @@ inline static S &_write_csv_df_index_(S &o, const DateTime &value) { // ---------------------------------------------------------------------------- +template +inline static S &_write_csv_df_index_(S &o, char value) { + + if (std::isprint(value)) + return (o << value); + else + return (o << static_cast(value)); +} + +// ---------------------------------------------------------------------------- + +template +inline static S &_write_csv_df_index_(S &o, unsigned char value) { + + if (std::isprint(value)) + return (o << value); + else + return (o << static_cast(value)); +} + +// ---------------------------------------------------------------------------- + // // Specializing std::hash for tuples // diff --git a/test/dataframe_tester_output.txt b/test/dataframe_tester_output.txt index f6986af6..e0bf38a1 100644 --- a/test/dataframe_tester_output.txt +++ b/test/dataframe_tester_output.txt @@ -93,7 +93,7 @@ str_col:28::XXXX10,XXXX11,XXXX01,XXXX02,XXXX03,XXXX6,XXXX7,Running fast, dbl_col:28::2.009,3.111,10,4.2222,5.3333,12,6.25,10,0.9999,1.2345,4.2345,3,8,3.3333,2.2345,4.25,3.2345,0.009,1.111,5.25,11,5.2345,2.2222,1.009,2.111,9,3.2222,4.3333, 
dbl_col_2:28::0.87865,-0.6999,0.4111,0.1902,-0.4888,0.2,0.1056,0.1,0.06743,0.998,0.15678,0.923,0.0111,-0.8888,0.3456,0.0056,0.056,0.07865,-0.9999,0.0456,0.14,0.00345,0.1002,0.078654,-0.8999,0.01119,0.8002,-0.9888, bool_col:28::0,1,1,1,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0, -char_col:11::,,x,,A,,,ú,,A,, +char_col:11::0,0,x,0,A,0,0,-6,0,A,0, INDEX:28::1547825036.3,1516179600.874123908,1516093200.234,1516006800.234098,1515920400.2309,1515834000.89,1515747600.123456789,1515661200.12309,1515574800.4562387,1515488400.2345609,1515402000.78,1515315600.340987645,1515229200.309812765,1515142800.93451984,1515056400.671092346,1514970000.450137234,1514883600.91256923,1514797200.67,1514624400.4562,1514538000.5,1514451600.0,1514365200.896120945,1514278800.783452098,378205200000.561209834,409741200000.346,441277200000.340987,472899600.0,504435600.871234561, ul_col:28::123450,123451,123452,123450,123455,123450,123449,123448,123451,123452,123452,123450,123455,123450,123454,123453,123456,123457,123458,123459,123460,123441,123442,123432,123433,123434,123435,123436, @@ -752,12 +752,12 @@ col_3:1::15, col_str:1::11, col_4:1::22, -INDEX:1::123452, -col_1:1::3, -col_2:1::10, -col_3:1::17, -col_str:1::33, -col_4:1::24, +INDEX:1::123451, +col_1:1::2, +col_2:1::9, +col_3:1::16, +col_str:1::22, +col_4:1::23, Testing write(json) ... @@ -1343,9 +1343,9 @@ ul_col:28:,xint_col:28:,str_col:25:,dbl_col:27:,dbl_ Testing no_index_reads ... 
INDEX:28:,ul_col:28:,dbl_col_2:26:,bool_col:6:,str_col:28:,xint_col:28:,dbl_col:27:,char_col:25:,uchar_col:25:,ul_col_2:28:,dbl_col_2_2:26:,bool_col_2:6:,str_col_2:28:,xint_col_2:28:,dbl_col_3_2:27:,ul_col_no_idx:28:,dbl_col_2_no_idx:26:,bool_col_no_idx:6:,str_col_no_idx:28:,xint_col_no_idx:28:,dbl_col_no_idx:27: 123450,123450,0.998,1,4% of something,1,1.2345,C,B,123450,0.998,1,4% of something,1,1.2345,123450,0.998,1,4% of something,1,1.2345 -123451,123451,0.3456,1,Description 4/5,2,2.2345,,B,123451,0.3456,1,Description 4/5,2,2.2345,123451,0.3456,1,Description 4/5,2,2.2345 +123451,123451,0.3456,1,Description 4/5,2,2.2345,23,B,123451,0.3456,1,Description 4/5,2,2.2345,123451,0.3456,1,Description 4/5,2,2.2345 123452,123452,0.056,1,This is bad,3,3.2345,^,&,123452,0.056,1,This is bad,3,3.2345,123452,0.056,1,This is bad,3,3.2345 -123450,123450,0.15678,0,3.4% of GDP,4,4.2345,F,ú,123450,0.15678,0,3.4% of GDP,4,4.2345,123450,0.15678,0,3.4% of GDP,4,4.2345 +123450,123450,0.15678,0,3.4% of GDP,4,4.2345,F,250,123450,0.15678,0,3.4% of GDP,4,4.2345,123450,0.15678,0,3.4% of GDP,4,4.2345 123455,123455,0.00345,0,Market drops,5,5.2345,x,B,123455,0.00345,0,Market drops,5,5.2345,123455,0.00345,0,Market drops,5,5.2345 123450,123450,0.923,1,Market pulls back,3,3,N,B,123450,0.923,1,Market pulls back,3,3,123450,0.923,1,Market pulls back,3,3 123449,123449,0.06743,,$15 increase,7,0.9999,C,B,123449,0.06743,,$15 increase,7,0.9999,123449,0.06743,,$15 increase,7,0.9999 @@ -1374,17 +1374,17 @@ INDEX:28:,ul_col:28:,dbl_col_2:26:,bool_col:6:,str_c INDEX:28:,ul_col:28:,xint_col:28:,str_col:28:,dbl_col:28:,dbl_col_2:28:,bool_col:28:,ul_col_2:28:,xint_col_2:28:,str_col_2:28:,dbl_col_3:28:,dbl_col_2_2:28:,bool_col_2:28:,ul_col_no_idx:28:,xint_col_no_idx:28:,str_col_no_idx:28:,dbl_col_no_idx:28:,dbl_col_2_no_idx:28:,bool_col_no_idx:28:,char_col:11: -123432,123450,35,XXXX10,2.009,0.87865,0,123450,35,XXXX10,2.009,0.87865,0,123450,35,XXXX10,2.009,0.87865,0, 
-123433,123451,36,XXXX11,3.111,-0.6999,1,123451,36,XXXX11,3.111,-0.6999,1,123451,36,XXXX11,3.111,-0.6999,1, +123432,123450,35,XXXX10,2.009,0.87865,0,123450,35,XXXX10,2.009,0.87865,0,123450,35,XXXX10,2.009,0.87865,0,0 +123433,123451,36,XXXX11,3.111,-0.6999,1,123451,36,XXXX11,3.111,-0.6999,1,123451,36,XXXX11,3.111,-0.6999,1,0 123434,123452,40,XXXX01,10,0.4111,1,123452,40,XXXX01,10,0.4111,1,123452,40,XXXX01,10,0.4111,1,x -123435,123450,45,XXXX02,4.2222,0.1902,1,123450,45,XXXX02,4.2222,0.1902,1,123450,45,XXXX02,4.2222,0.1902,1, +123435,123450,45,XXXX02,4.2222,0.1902,1,123450,45,XXXX02,4.2222,0.1902,1,123450,45,XXXX02,4.2222,0.1902,1,0 123436,123455,46,XXXX03,5.3333,-0.4888,0,123455,46,XXXX03,5.3333,-0.4888,0,123455,46,XXXX03,5.3333,-0.4888,0,A -123441,123450,33,XXXX6,12,0.2,0,123450,33,XXXX6,12,0.2,0,123450,33,XXXX6,12,0.2,0, -123442,123449,34,XXXX7,6.25,0.1056,1,123449,34,XXXX7,6.25,0.1056,1,123449,34,XXXX7,6.25,0.1056,1, -123448,123448,8,Running fast,10,0.1,0,123448,8,Running fast,10,0.1,0,123448,8,Running fast,10,0.1,0,ú -123449,123451,7,$15 increase,0.9999,0.06743,0,123451,7,$15 increase,0.9999,0.06743,0,123451,7,$15 increase,0.9999,0.06743,0, +123441,123450,33,XXXX6,12,0.2,0,123450,33,XXXX6,12,0.2,0,123450,33,XXXX6,12,0.2,0,0 +123442,123449,34,XXXX7,6.25,0.1056,1,123449,34,XXXX7,6.25,0.1056,1,123449,34,XXXX7,6.25,0.1056,1,0 +123448,123448,8,Running fast,10,0.1,0,123448,8,Running fast,10,0.1,0,123448,8,Running fast,10,0.1,0,-6 +123449,123451,7,$15 increase,0.9999,0.06743,0,123451,7,$15 increase,0.9999,0.06743,0,123451,7,$15 increase,0.9999,0.06743,0,0 123450,123452,1,4% of something,1.2345,0.998,0,123452,1,4% of something,1.2345,0.998,0,123452,1,4% of something,1.2345,0.998,0,A -123450,123452,4,3.4% of GDP,4.2345,0.15678,1,123452,4,3.4% of GDP,4.2345,0.15678,1,123452,4,3.4% of GDP,4.2345,0.15678,1, +123450,123452,4,3.4% of GDP,4.2345,0.15678,1,123452,4,3.4% of GDP,4.2345,0.15678,1,123452,4,3.4% of GDP,4.2345,0.15678,1,0 123450,123450,6,Market pulls 
back,3,0.923,0,123450,6,Market pulls back,3,0.923,0,123450,6,Market pulls back,3,0.923,0, 123450,123455,12,Bonds vs. Equities,8,0.0111,1,123455,12,Bonds vs. Equities,8,0.0111,1,123455,12,Bonds vs. Equities,8,0.0111,1, 123450,123450,14,Here comes the sun,3.3333,-0.8888,0,123450,14,Here comes the sun,3.3333,-0.8888,0,123450,14,Here comes the sun,3.3333,-0.8888,0, @@ -1410,14 +1410,14 @@ INDEX:12:,col_3:12:,col_4:6:,col_str:12:,col_2:12 123451,16,23,22,9,2,C,C,16,23,22,9,2,16,23,22,9,2,C,C 123452,17,24,33,10,3,F,F,17,24,33,10,3,17,24,33,10,3,F,F 123453,18,25,aa,11,4,$,$,18,25,aa,11,4,18,25,aa,11,4,$,$ -123454,19,26,bb,12,5,,,19,26,bb,12,5,19,26,bb,12,5,, +123454,19,26,bb,12,5,0,0,19,26,bb,12,5,19,26,bb,12,5,0,0 123455,20,27,cc,13,6,8,8,20,27,cc,13,6,20,27,cc,13,6,8,8 -123456,21,,dd,14,7,x,Ü,21,,dd,14,7,21,,dd,14,7,x,Ü +123456,21,,dd,14,7,x,220,21,,dd,14,7,21,,dd,14,7,x,220 123457,22,,tt,15,8,X,X,22,,tt,15,8,22,,tt,15,8,X,X 123458,23,,uu,16,9,h,h,23,,uu,16,9,23,,uu,16,9,h,h 123459,24,,ii,17,10,u,&,24,,ii,17,10,24,,ii,17,10,u,& -123460,25,,88,18,11,,,25,,88,18,11,25,,88,18,11,, -555555,555.543,,This is a test,777.78,55.55,N,ÿ,555.543,,This is a test,777.78,55.55,555.543,,This is a test,777.78,55.55,N,ÿ +123460,25,,88,18,11,18,18,25,,88,18,11,25,,88,18,11,18,18 +555555,555.543,,This is a test,777.78,55.55,N,255,555.543,,This is a test,777.78,55.55,555.543,,This is a test,777.78,55.55,N,255 Testing KamaVisitor{ } ... 
@@ -2026,7 +2026,7 @@ INDEX:10:,string col:10:,Cool Column:10:,numbers:10: Date: Thu, 25 Jan 2024 12:21:43 -0500 Subject: [PATCH 10/13] Now we are able to read/write containers in csv format from/to files --- data/sample_data.csv | 6 + data/sample_data_string_index.csv | 11 +- docs/HTML/read.html | 54 ++++--- docs/HTML/write.html | 51 ++++--- .../Internals/DataFrame_private_decl.h | 21 +++ .../DataFrame/Internals/DataFrame_read.tcc | 143 ++++++++++++++---- .../Internals/DataFrame_standalone.tcc | 13 +- test/dataframe_tester.cc | 16 +- test/dataframe_tester_2.cc | 9 +- test/dataframe_tester_output.txt | 87 ++++++----- 10 files changed, 288 insertions(+), 123 deletions(-) diff --git a/data/sample_data.csv b/data/sample_data.csv index 2cfa271a..51cbe221 100644 --- a/data/sample_data.csv +++ b/data/sample_data.csv @@ -8,3 +8,9 @@ str_col:28::XXXX10,XXXX11,XXXX01,XXXX02,XXXX03,XXXX6,XXXX7,Running fast, dbl_col:28::2.009,3.111,10,4.2222,5.3333,12,6.25,10,0.9999,1.2345,4.2345,3,8,3.3333,2.2345,4.25,3.2345,0.009,1.111,5.25,11,5.2345,2.2222,1.009,2.111,9,3.2222,4.3333, dbl_col_2:28::0.87865,-0.6999,0.4111,0.1902,-0.4888,0.2,0.1056,0.1,0.06743,0.998,0.15678,0.923,0.0111,-0.8888,0.3456,0.0056,0.056,0.07865,-0.9999,0.0456,0.14,0.00345,0.1002,0.078654,-0.8999,0.01119,0.8002,-0.9888, bool_col:28::0,1,1,1,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0, +Map 1:4::3{label one 1:123.0|label one 2:-782.5|label one 3:444.44},3{label two 1:123.0|label two 2:-782.5|label two 3:444.44},3{label three 1:123.0|label three 2:-782.5|label three 3:444.44},3{label four 1:123.0|label four 2:-782.5|label four 3:444.44}, +Unordered Map:4::3{Key one 1:123.0|Key one 2:-782.5|Key one 3:444.44},3{Key two 1:123.0|Key two 2:-782.5|Key two 3:444.44},3{Key three 1:123.0|Key three 2:-782.5|Key three 3:444.44},3{Key four 1:123.0|Key four 2:-782.5|Key four 3:444.44}, +Str Vec:4::4[bbb|aaa|zzz|ddd],4[aaa|bbb|ccc|www],4[123|abc|345|list],3[bbb|aaa|zzz], +Double 
Set:4::3[123.0|-782.5|444.44],3[1:123.0|-782.5|:444.44],3[123.0|-782.5|444.44],4[123.0|-782.5|444.44|100.5], +Str Set:4::3[123.0|-782.5|444.44],3[1:123.0|-782.5|:444.44],3[123.0|-782.5|444.44],4[123.0|-782.5|444.44|100.5], +Z Score:4::10[1.95474040557|0.552535091086|0.775388936446|-0.561817339812|0.106794118727|-0.153218675013|-0.896114748672|-1.72258101434|-0.301804546072|0.246077772077],10[-0.985180680575|-0.338649566179|1.37000434149|0.831246802651|-0.415610988193|1.06213106869|0.554158098662|0.507981245453|-1.55472278822|-1.03135753378],10[-1.39575784008|-1.62506351709|-0.907239380237|-0.159508137551|0.807541881212|0.937157205458|0.578236204203|0.717820563726|-0.0398797142361|1.0866927346],10[1.94246107491|-0.062340594565|0.246115232403|-1.24462409799|-0.190844664632|0.8115331407|0.381021476571|-1.60448155299|-0.422151990754|0.143311976349], diff --git a/data/sample_data_string_index.csv b/data/sample_data_string_index.csv index f583b23d..d486a7ab 100644 --- a/data/sample_data_string_index.csv +++ b/data/sample_data_string_index.csv @@ -8,5 +8,12 @@ str_col:28::XXXX10,XXXX11,XXXX01,XXXX02,XXXX03,XXXX6,XXXX7,Running fast, dbl_col:28::2.009,3.111,10,4.2222,5.3333,12,6.25,10,0.9999,1.2345,4.2345,3,8,3.3333,2.2345,4.25,3.2345,0.009,1.111,5.25,11,5.2345,2.2222,1.009,2.111,9,3.2222,4.3333, dbl_col_2:28::0.87865,-0.6999,0.4111,0.1902,-0.4888,0.2,0.1056,0.1,0.06743,0.998,0.15678,0.923,0.0111,-0.8888,0.3456,0.0056,0.056,0.07865,-0.9999,0.0456,0.14,0.00345,0.1002,0.078654,-0.8999,0.01119,0.8002,-0.9888, bool_col:28::0,1,1,1,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0, -char_col:6::C,%,120,,65,! -uchar_col:6::C,%,250,,65,! 
+char_col:6::C,%,120,,65,!, +uchar_col:6::C,%,250,,65,!, +Map 1:4::3{label one 1:123.0|label one 2:-782.5|label one 3:444.44},3{label two 1:123.0|label two 2:-782.5|label two 3:444.44},3{label three 1:123.0|label three 2:-782.5|label three 3:444.44},3{label four 1:123.0|label four 2:-782.5|label four 3:444.44}, +Unordered Map:4::3{Key one 1:123.0|Key one 2:-782.5|Key one 3:444.44},3{Key two 1:123.0|Key two 2:-782.5|Key two 3:444.44},3{Key three 1:123.0|Key three 2:-782.5|Key three 3:444.44},3{Key four 1:123.0|Key four 2:-782.5|Key four 3:444.44}, +Str Vec:4::4[bbb|aaa|zzz|ddd],4[aaa|bbb|ccc|www],4[123|abc|345|list],3[bbb|aaa|zzz], +Double Set:4::3[123.0|-782.5|444.44],3[1:123.0|-782.5|:444.44],3[123.0|-782.5|444.44],4[123.0|-782.5|444.44|100.5], +Str Set:4::3[123.0|-782.5|444.44],3[1:123.0|-782.5|:444.44],3[123.0|-782.5|444.44],4[123.0|-782.5|444.44|100.5], +Z Score:4::10[1.95474040557|0.552535091086|0.775388936446|-0.561817339812|0.106794118727|-0.153218675013|-0.896114748672|-1.72258101434|-0.301804546072|0.246077772077],10[-0.985180680575|-0.338649566179|1.37000434149|0.831246802651|-0.415610988193|1.06213106869|0.554158098662|0.507981245453|-1.55472278822|-1.03135753378],10[-1.39575784008|-1.62506351709|-0.907239380237|-0.159508137551|0.807541881212|0.937157205458|0.578236204203|0.717820563726|-0.0398797142361|1.0866927346],10[1.94246107491|-0.062340594565|0.246115232403|-1.24462409799|-0.190844664632|0.8115331407|0.381021476571|-1.60448155299|-0.422151990754|0.143311976349], + diff --git a/docs/HTML/read.html b/docs/HTML/read.html index 8b195352..78cde5ee 100644 --- a/docs/HTML/read.html +++ b/docs/HTML/read.html @@ -44,6 +44,12 @@ It inputs the contents of a text file/stream into itself (i.e. DataFrame). Currently 3 formats (i.e. csv, csv2, json) are supported. See io_format documentation page
    +
    +  NOTE: If the DataFrame that is reading the file already has existing data columns, the file data will be added to the existing DataFrame columns.
    +        If the file has a data column with the same name and type as a column in the DataFrame, the file data will replace the existing data column in the DataFrame.
    +        If the file has a data column with the same name as a column in the DataFrame but a different type, the behavior is undefined.
    +        Obviously, if the DataFrame is empty, none of this matters.
    +

    CSV file format must be:
       INDEX:<Number of data points>:<Comma delimited list of values>
    @@ -85,30 +91,27 @@
             
    In all formats the following data types are supported:
    -          float       -- float
    -          double      -- double
    -          longdouble  -- long double
    -          short       -- short int
    -          ushort      -- unsigned short int
    -          int         -- int
    -          uint        -- unsigned int
    -          long        -- long int
    -          longlong    -- long long int
    -          ulong       -- unsigned long int
    -          ulonglong   -- unsigned long long int
    -          char        -- char
    -          uchar       -- unsigned char
    -          string      -- std::string
    -          string      -- const char *
    -          string      -- char *
    -          bool        -- bool
    -          DateTime    -- DateTime data in format of <Epoch seconds>.<nanoseconds> (1516179600.874123908)
    +          float      -- float
    +          double     -- double
    +          longdouble -- long double
    +          short      -- short int
    +          ushort     -- unsigned short int
    +          int        -- int
    +          uint       -- unsigned int
    +          long       -- long int
    +          longlong   -- long long int
    +          ulong      -- unsigned long int
    +          ulonglong  -- unsigned long long int
    +          char       -- char
    +          uchar      -- unsigned char
    +          string     -- std::string
    +          string     -- const char *
    +          string     -- char *
    +          bool       -- bool
    +          DateTime   -- DateTime data in format of <Epoch seconds>.<nanoseconds> (1516179600.874123908)
             
    - In case of io_format::csv2 the following additional types are also supported: + In case of io_format::csv2 and io_format::csv the following additional types are also supported:
    -          DateTimeAME    -- DateTime string printed in American style (MM/DD/YYYY HH:MM:SS.mmm)
    -          DateTimeEUR    -- DateTime string printed in European style (YYYY/MM/DD HH:MM:SS.mmm)
    -          DateTimeISO    -- DateTime string printed in ISO style (YYYY-MM-DD HH:MM:SS.mmm)
               dbl_vec        -- A vector of double precision values, The vector is printed as "s[d1|d2|...]"
                                 where s is the size of the vector and d's are the double values.
               str_vec        -- A vector of std::string values, The vector is printed as "s[str1|str2|...]"
    @@ -123,6 +126,13 @@
                                 where s is the size of the map and k's and v's are keys and values.
             
    + In case of io_format::csv2 the following additional types are also supported: +
    +          DateTimeAME -- DateTime string printed in American style (MM/DD/YYYY HH:MM:SS.mmm)
    +          DateTimeEUR -- DateTime string printed in European style (YYYY/MM/DD HH:MM:SS.mmm)
    +          DateTimeISO -- DateTime string printed in ISO style (YYYY-MM-DD HH:MM:SS.mmm)
    +        
    + NOTE:: This version of read() can be substantially faster, especially for larger files, than if you open the file yourself and use the read() version below. diff --git a/docs/HTML/write.html b/docs/HTML/write.html index 4d334593..143f31e6 100644 --- a/docs/HTML/write.html +++ b/docs/HTML/write.html @@ -44,7 +44,7 @@
    - It outputs the content of DataFrame into the stream o. Currently 3 formats (i.e. csv, csv2, json) are supported specified by the iof parameter.
    + It outputs the content of DataFrame into the stream o. Currently 3 formats (i.e. csv, csv2, json) are supported specified by the iof parameter.


    The CSV file format is written:
       INDEX:<Number of data points>:<Comma delimited list of values>
    @@ -86,30 +86,27 @@
             
    In all formats the following data types are supported:
    -          float       -- float
    -          double      -- double
    -          longdouble  -- long double
    -          short       -- short int
    -          ushort      -- unsigned short int
    -          int         -- int
    -          uint        -- unsigned int
    -          long        -- long int
    -          longlong    -- long long int
    -          ulong       -- unsigned long int
    -          ulonglong   -- unsigned long long int
    -          char        -- char
    -          uchar       -- unsigned char
    -          string      -- std::string
    -          string      -- const char *
    -          string      -- char *
    -          bool        -- bool
    -          DateTime    -- DateTime data in format of <Epoch seconds>.<nanoseconds> (1516179600.874123908)
    +          float      -- float
    +          double     -- double
    +          longdouble -- long double
    +          short      -- short int
    +          ushort     -- unsigned short int
    +          int        -- int
    +          uint       -- unsigned int
    +          long       -- long int
    +          longlong   -- long long int
    +          ulong      -- unsigned long int
    +          ulonglong  -- unsigned long long int
    +          char       -- char
    +          uchar      -- unsigned char
    +          string     -- std::string
    +          string     -- const char *
    +          string     -- char *
    +          bool       -- bool
    +          DateTime   -- DateTime data in format of <Epoch seconds>.<nanoseconds> (1516179600.874123908)
             
    - In case of io_format::csv2 the following additional types are also supported: + In case of io_format::csv2 and io_format::csv the following additional types are also supported:
    -          DateTimeAME    -- DateTime string printed in American style (MM/DD/YYYY HH:MM:SS.mmm)
    -          DateTimeEUR    -- DateTime string printed in European style (YYYY/MM/DD HH:MM:SS.mmm)
    -          DateTimeISO    -- DateTime string printed in ISO style (YYYY-MM-DD HH:MM:SS.mmm)
               dbl_vec        -- A vector of double precision values, The vector is printed as "s[d1|d2|...]"
                                 where s is the size of the vector and d's are the double values.
               str_vec        -- A vector of std::string values, The vector is printed as "s[str1|str2|...]"
    @@ -123,6 +120,14 @@
               str_dbl_unomap -- An unordered map of string keys to double precision values, The map is printed as "s{k1:v1|k2:v2|...}"
                                 where s is the size of the map and k's and v's are keys and values.
             
    + + In case of io_format::csv2 the following additional types are also supported: +
    +          DateTimeAME -- DateTime string printed in American style (MM/DD/YYYY HH:MM:SS.mmm)
    +          DateTimeEUR -- DateTime string printed in European style (YYYY/MM/DD HH:MM:SS.mmm)
    +          DateTimeISO -- DateTime string printed in ISO style (YYYY-MM-DD HH:MM:SS.mmm)
    +        
    +
    diff --git a/include/DataFrame/Internals/DataFrame_private_decl.h b/include/DataFrame/Internals/DataFrame_private_decl.h
    index 88c423c7..bbc5a1d3 100644
    --- a/include/DataFrame/Internals/DataFrame_private_decl.h
    +++ b/include/DataFrame/Internals/DataFrame_private_decl.h
    @@ -897,6 +897,27 @@ col_vector_push_back_func_(V &vec,
     
     // ----------------------------------------------------------------------------
     
    +template
    +inline static void
    +col_vector_push_back_cont_func_(V &vec,
    +                                std::istream &file,
    +                                T (*converter)(const char *))  {
    +    // Appends converter(token) to vec for every token pulled from file.
    +    std::string value;
    +    char        c = 0;
    +    // Loop ends on a '\n' seen at a token start, or when file.get() fails.
    +    value.reserve(2048);  // Pre-size the token buffer before the loop
    +    while (file.get(c)) [[likely]] {
    +        value.clear();
    +        if (c == '\n')  break;
    +        file.unget();  // Not a newline: put c back for the token read
    +        _get_token_from_file_(file, ',', value, '\0');  // ',' presumably the delimiter -- confirm
    +        vec.push_back(converter(value.c_str()));
    +    }
    +}
    +
    +// ----------------------------------------------------------------------------
    +
     template
     struct  ColVectorPushBack_  {
     
    diff --git a/include/DataFrame/Internals/DataFrame_read.tcc b/include/DataFrame/Internals/DataFrame_read.tcc
    index c4eba1f6..f869ffeb 100644
    --- a/include/DataFrame/Internals/DataFrame_read.tcc
    +++ b/include/DataFrame/Internals/DataFrame_read.tcc
    @@ -225,7 +225,9 @@ void DataFrame::read_json_(std::istream &stream, bool columns_only)  {
                         vec,
                         stream,
                         [](const char *tok, char **, int) -> char  {
    -                        if (tok[0] == '\0' || tok[1] == '\0')
    +                        if (tok[0] == '\0')
    +                            return ('\0');
    +                        else if (tok[1] == '\0')
                                 return (static_cast(int(tok[0])));
                             else
                                 return (static_cast(atoi(tok)));
    @@ -242,7 +244,9 @@ void DataFrame::read_json_(std::istream &stream, bool columns_only)  {
                         vec,
                         stream,
                         [](const char *tok, char **, int) -> unsigned char  {
    -                        if (tok[0] == '\0' || tok[1] == '\0')
    +                        if (tok[0] == '\0')
    +                            return ('\0');
    +                        else if (tok[1] == '\0')
                                 return (static_cast(int(tok[0])));
                             else
                                 return (static_cast(atoi(tok)));
    @@ -300,7 +304,9 @@ void DataFrame::read_json_(std::istream &stream, bool columns_only)  {
                     StlVecType   &vec =
                         create_column(col_name.c_str(), false);
                     auto                    converter =
    -                    [](const char *, char **)-> DateTime { return DateTime(); };
    +                    [](const char *, char **) -> DateTime {
    +                        return DateTime();
    +                    };
                     const ColVectorPushBack_>  slug;
     
                     vec.reserve(col_size);
    @@ -434,12 +440,13 @@ void DataFrame::read_csv_(std::istream &stream, bool columns_only)  {
                         create_column(col_name.c_str(), false);
     
                     vec.reserve(::atoi(value.c_str()));
    -                col_vector_push_back_func_(vec, stream, &::strtol);
                     col_vector_push_back_func_>(
                         vec,
                         stream,
                         [](const char *tok, char **, int) -> char  {
    -                        if (tok[0] == '\0' || tok[1] == '\0')
    +                        if (tok[0] == '\0')
    +                            return ('\0');
    +                        else if (tok[1] == '\0')
                                 return (static_cast(int(tok[0])));
                             else
                                 return (static_cast(atoi(tok)));
    @@ -450,13 +457,14 @@ void DataFrame::read_csv_(std::istream &stream, bool columns_only)  {
                         create_column(col_name.c_str(), false);
     
                     vec.reserve(::atoi(value.c_str()));
    -                col_vector_push_back_func_(vec, stream, &::strtoul);
                     col_vector_push_back_func_>(
                         vec,
                         stream,
                         [](const char *tok, char **, int) -> unsigned char  {
    -                        if (tok[0] == '\0' || tok[1] == '\0')
    +                        if (tok[0] == '\0')
    +                            return ('\0');
    +                        else if (tok[1] == '\0')
                                 return (static_cast(int(tok[0])));
                             else
                                 return (static_cast(atoi(tok)));
    @@ -505,7 +513,9 @@ void DataFrame::read_csv_(std::istream &stream, bool columns_only)  {
                     StlVecType   &vec =
                         create_column(col_name.c_str(), false);
                     auto                    converter =
    -                    [](const char *, char **)-> DateTime { return DateTime(); };
    +                    [](const char *, char **) -> DateTime {
    +                        return DateTime();
    +                    };
                     const ColVectorPushBack_>  slug;
     
                     vec.reserve(::atoi(value.c_str()));
    @@ -518,6 +528,79 @@ void DataFrame::read_csv_(std::istream &stream, bool columns_only)  {
                     vec.reserve(::atoi(value.c_str()));
                     col_vector_push_back_func_(vec, stream, &::strtol);
                 }
    +
    +            // Containers
    +            //
    +            else if (type_str == "dbl_vec")  {
    +                using vec_t = std::vector;
    +
    +                StlVecType   &vec =
    +                    create_column(col_name.c_str(), false);
    +
    +                vec.reserve(::atoi(value.c_str()));
    +                col_vector_push_back_cont_func_(
    +                    vec,
    +                    stream,
    +                    &_get_dbl_vec_from_value_);
    +            }
    +            else if (type_str == "str_vec")  {
    +                using vec_t = std::vector;
    +
    +                StlVecType   &vec =
    +                    create_column(col_name.c_str(), false);
    +
    +                vec.reserve(::atoi(value.c_str()));
    +                col_vector_push_back_cont_func_(
    +                    vec,
    +                    stream,
    +                    &_get_str_vec_from_value_);
    +            }
    +            else if (type_str == "dbl_set")  {
    +                using set_t = std::set;
    +
    +                StlVecType   &vec =
    +                    create_column(col_name.c_str(), false);
    +
    +                vec.reserve(::atoi(value.c_str()));
    +                col_vector_push_back_cont_func_(vec,
    +                                                stream,
    +                                                &_get_dbl_set_from_value_);
    +            }
    +            else if (type_str == "str_set")  {
    +                using set_t = std::set;
    +
    +                StlVecType   &vec =
    +                    create_column(col_name.c_str(), false);
    +
    +                vec.reserve(::atoi(value.c_str()));
    +                col_vector_push_back_cont_func_(vec,
    +                                                stream,
    +                                                &_get_str_set_from_value_);
    +            }
    +            else if (type_str == "str_dbl_map")  {
    +                using map_t = std::map;
    +
    +                StlVecType   &vec =
    +                    create_column(col_name.c_str(), false);
    +
    +                vec.reserve(::atoi(value.c_str()));
    +                col_vector_push_back_cont_func_(
    +                    vec,
    +                    stream,
    +                    &_get_str_dbl_map_from_value_);
    +            }
    +            else if (type_str == "str_dbl_unomap")  {
    +                using map_t = std::unordered_map;
    +
    +                StlVecType   &vec =
    +                    create_column(col_name.c_str(), false);
    +
    +                vec.reserve(::atoi(value.c_str()));
    +                col_vector_push_back_cont_func_(
    +                    vec,
    +                    stream,
    +                    &_get_str_dbl_map_from_value_);
    +            }
                 else [[unlikely]]
                     throw DataFrameError("DataFrame::read_csv_(): ERROR: Unknown "
                                          "column type");
    @@ -681,13 +764,15 @@ read_csv2_(std::istream &stream,
                                           type_str.c_str(),
                                           col_name.c_str(),
                                           nrows);
    +            // Containers
    +            //
                 else if (type_str == "dbl_vec")
    -                spec_vec.emplace_back(StlVecType>{ },
    +                spec_vec.emplace_back(StlVecType>{ },
                                           type_str.c_str(),
                                           col_name.c_str(),
                                           nrows);
                 else if (type_str == "str_vec")
    -                spec_vec.emplace_back(StlVecType>{ },
    +                spec_vec.emplace_back(StlVecType>{ },
                                           type_str.c_str(),
                                           col_name.c_str(),
                                           nrows);
    @@ -845,26 +930,27 @@ read_csv2_(std::istream &stream,
                         vec.push_back(v);
                     }
                 }
    +
    +            // Containers
    +            //
                 else if (col_spec.type_spec == "dbl_vec")  {
                     if (! value.empty())  {
    -                    StlVecType>  &vec =
    -                        std::any_cast> &>
    +                    StlVecType>  &vec =
    +                        std::any_cast> &>
                                 (col_spec.col_vec);
     
                         vec.push_back(
    -                        std::move(_get_dbl_vec_from_value_>(
    -                                      value.c_str())));
    +                        std::move(_get_dbl_vec_from_value_(value.c_str())));
                     }
                 }
                 else if (col_spec.type_spec == "str_vec")  {
                     if (! value.empty())  {
    -                    StlVecType> &vec =
    -                        std::any_cast> &>
    +                    StlVecType> &vec =
    +                        std::any_cast> &>
                                 (col_spec.col_vec);
     
                         vec.push_back(
    -                        std::move(_get_str_vec_from_value_>(
    -                                      value.c_str())));
    +                        std::move(_get_str_vec_from_value_(value.c_str())));
                     }
                 }
                 else if (col_spec.type_spec == "dbl_set")  {
    @@ -896,8 +982,9 @@ read_csv2_(std::istream &stream,
                         StlVecType   &vec =
                             std::any_cast &>(col_spec.col_vec);
     
    -                    vec.push_back(std::move(_get_str_dbl_map_from_value_(
    -                                      value.c_str())));
    +                    vec.push_back(
    +                        std::move(_get_str_dbl_map_from_value_(
    +                        value.c_str())));
                     }
                 }
                 else if (col_spec.type_spec == "str_dbl_unomap")  {
    @@ -907,8 +994,9 @@ read_csv2_(std::istream &stream,
                         StlVecType   &vec =
                             std::any_cast &> (col_spec.col_vec);
     
    -                    vec.push_back(std::move(_get_str_dbl_map_from_value_(
    -                                      value.c_str())));
    +                    vec.push_back(
    +                        std::move(_get_str_dbl_map_from_value_(
    +                        value.c_str())));
                     }
                 }
                 col_index += 1;
    @@ -1016,17 +1104,20 @@ read_csv2_(std::istream &stream,
                         std::move(std::any_cast &>
                             (col_spec.col_vec)),
                         nan_policy::dont_pad_with_nans);
    +
    +            // Containers
    +            //
                 else if (col_spec.type_spec == "dbl_vec")
    -                load_column>(
    +                load_column>(
                         col_spec.col_name.c_str(),
    -                    std::move(std::any_cast> &>
    +                    std::move(std::any_cast> &>
                             (col_spec.col_vec)),
                         nan_policy::dont_pad_with_nans);
                 else if (col_spec.type_spec == "str_vec")
    -                load_column>(
    +                load_column>(
                         col_spec.col_name.c_str(),
                         std::move(
    -                        std::any_cast> &>
    +                        std::any_cast> &>
                             (col_spec.col_vec)),
                         nan_policy::dont_pad_with_nans);
                 else if (col_spec.type_spec == "dbl_set")  {
    diff --git a/include/DataFrame/Internals/DataFrame_standalone.tcc b/include/DataFrame/Internals/DataFrame_standalone.tcc
    index 441b9d70..a95290a1 100644
    --- a/include/DataFrame/Internals/DataFrame_standalone.tcc
    +++ b/include/DataFrame/Internals/DataFrame_standalone.tcc
    @@ -90,6 +90,9 @@ std::unordered_map<_TypeInfoRef_,
         { typeid(char *), "string" },
         { typeid(bool), "bool" },
         { typeid(DateTime), "DateTime" },
    +
    +    // Containers
    +    //
         { typeid(std::vector), "dbl_vec" },
         { typeid(std::vector), "str_vec" },
         { typeid(std::set), "dbl_set" },
    @@ -629,11 +632,10 @@ _get_token_from_file_ (std::istream &file,
     
     // ----------------------------------------------------------------------------
     
    -template
    -inline static typename DF::template StlVecType
    +inline static std::vector
     _get_dbl_vec_from_value_(const char *value)  {
     
    -    using vec_t = typename DF::template StlVecType;
    +    using vec_t = std::vector;
     
         std::size_t vcnt = 0;
         char        buffer[128];
    @@ -662,11 +664,10 @@ _get_dbl_vec_from_value_(const char *value)  {
     
     // ----------------------------------------------------------------------------
     
    -template
    -inline static typename DF::template StlVecType
    +inline static std::vector
     _get_str_vec_from_value_(const char *value)  {
     
    -    using vec_t = typename DF::template StlVecType;
    +    using vec_t = std::vector;
     
         std::size_t vcnt { 0 };
         char        buffer[2048];
    diff --git a/test/dataframe_tester.cc b/test/dataframe_tester.cc
    index dd409de2..339fe268 100644
    --- a/test/dataframe_tester.cc
    +++ b/test/dataframe_tester.cc
    @@ -516,9 +516,13 @@ static void test_read()  {
                       int,
                       unsigned long,
                       double,
    +                  std::map,
    +                  std::unordered_map,
    +                  std::vector,
    +                  std::set,
    +                  std::set,
    +                  std::vector,
                       std::string,
    -                  char,
    -                  unsigned char,
                       bool>(std::cout);
     
         StdDataFrame   df_read_str;
    @@ -536,6 +540,12 @@ static void test_read()  {
                           std::string,
                           char,
                           unsigned char,
    +                      std::map,
    +                      std::unordered_map,
    +                      std::vector,
    +                      std::set,
    +                      std::set,
    +                      std::vector,
                           bool>(std::cout);
     
         StdDataFrame  df_read_dt;
    @@ -551,8 +561,6 @@ static void test_read()  {
                          unsigned long,
                          double,
                          std::string,
    -                     char,
    -                     unsigned char,
                          bool>(std::cout);
     }
     
    diff --git a/test/dataframe_tester_2.cc b/test/dataframe_tester_2.cc
    index d875a2f0..e72021c5 100644
    --- a/test/dataframe_tester_2.cc
    +++ b/test/dataframe_tester_2.cc
    @@ -3120,6 +3120,12 @@ static void test_no_index_reads()  {
                       bool,
                       char,
                       unsigned char,
    +                  std::map,
    +                  std::unordered_map,
    +                  std::vector,
    +                  std::set,
    +                  std::set,
    +                  std::vector,
                       std::string>(std::cout, io_format::csv2);
     
             std::cout << '\n' << std::endl;
    @@ -3127,10 +3133,7 @@ static void test_no_index_reads()  {
             df3.read("data/sample_data_2.json", io_format::json, true);
             df3.read("data/sample_data_no_index.json", io_format::json, true);
             df3.write(std::cout, io_format::csv2);
    diff --git a/test/dataframe_tester_output.txt b/test/dataframe_tester_output.txt
    index e0bf38a1..059df055 100644
    --- a/test/dataframe_tester_output.txt
    +++ b/test/dataframe_tester_output.txt
    @@ -85,6 +85,12 @@ str_col:28::XXXX10,XXXX11,XXXX01,XXXX02,XXXX03,XXXX6,XXXX7,Running fast,
     dbl_col:28::2.009,3.111,10,4.2222,5.3333,12,6.25,10,0.9999,1.2345,4.2345,3,8,3.3333,2.2345,4.25,3.2345,0.009,1.111,5.25,11,5.2345,2.2222,1.009,2.111,9,3.2222,4.3333,
     dbl_col_2:28::0.87865,-0.6999,0.4111,0.1902,-0.4888,0.2,0.1056,0.1,0.06743,0.998,0.15678,0.923,0.0111,-0.8888,0.3456,0.0056,0.056,0.07865,-0.9999,0.0456,0.14,0.00345,0.1002,0.078654,-0.8999,0.01119,0.8002,-0.9888,
     bool_col:28::0,1,1,1,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,
    +Map 1:4::3{label one 1:123|label one 2:-782.5|label one 3:444.44},3{label two 1:123|label two 2:-782.5|label two 3:444.44},3{label three 1:123|label three 2:-782.5|label three 3:444.44},3{label four 1:123|label four 2:-782.5|label four 3:444.44},
    +Unordered Map:4::3{Key one 3:444.44|Key one 2:-782.5|Key one 1:123},3{Key two 3:444.44|Key two 2:-782.5|Key two 1:123},3{Key three 3:444.44|Key three 2:-782.5|Key three 1:123},3{Key four 2:-782.5|Key four 3:444.44|Key four 1:123},
    +Str Vec:4::4[bbb|aaa|zzz|ddd],4[aaa|bbb|ccc|www],4[123|abc|345|list],3[bbb|aaa|zzz],
    +Double Set:4::3[-782.5|123|444.44],3[-782.5|0|1],3[-782.5|123|444.44],4[-782.5|100.5|123|444.44],
    +Str Set:4::3[-782.5|123.0|444.44],3[-782.5|1:123.0|:444.44],3[-782.5|123.0|444.44],4[-782.5|100.5|123.0|444.44],
    +Z Score:4::10[1.95474040557|0.552535091086|0.775388936446|-0.561817339812|0.106794118727|-0.153218675013|-0.896114748672|-1.72258101434|-0.301804546072|0.246077772077],10[-0.985180680575|-0.338649566179|1.37000434149|0.831246802651|-0.415610988193|1.06213106869|0.554158098662|0.507981245453|-1.55472278822|-1.03135753378],10[-1.39575784008|-1.62506351709|-0.907239380237|-0.159508137551|0.807541881212|0.937157205458|0.578236204203|0.717820563726|-0.0398797142361|1.0866927346],10[1.94246107491|-0.062340594565|0.246115232403|-1.24462409799|-0.190844664632|0.8115331407|0.381021476571|-1.60448155299|-0.422151990754|0.143311976349],
     
     INDEX:28::string_index_1,string_index_2,string_index_3,string_index_4,string_index_5,string_index_6,string_index_7,string_index_8,string_index_9,string_index_10,string_index_11,string_index_12,string_index_13,string_index_14,string_index_15,string_index_16,string_index_17,string_index_18,string_index_19,string_index_20,string_index_21,string_index_22,string_index_23,string_index_24,string_index_25,string_index_26,string_index_27,string_index_28,
     ul_col:28::123450,123451,123452,123450,123455,123450,123449,123448,123451,123452,123452,123450,123455,123450,123454,123453,123456,123457,123458,123459,123460,123441,123442,123432,123433,123434,123435,123436,
    @@ -93,7 +99,14 @@ str_col:28::XXXX10,XXXX11,XXXX01,XXXX02,XXXX03,XXXX6,XXXX7,Running fast,
     dbl_col:28::2.009,3.111,10,4.2222,5.3333,12,6.25,10,0.9999,1.2345,4.2345,3,8,3.3333,2.2345,4.25,3.2345,0.009,1.111,5.25,11,5.2345,2.2222,1.009,2.111,9,3.2222,4.3333,
     dbl_col_2:28::0.87865,-0.6999,0.4111,0.1902,-0.4888,0.2,0.1056,0.1,0.06743,0.998,0.15678,0.923,0.0111,-0.8888,0.3456,0.0056,0.056,0.07865,-0.9999,0.0456,0.14,0.00345,0.1002,0.078654,-0.8999,0.01119,0.8002,-0.9888,
     bool_col:28::0,1,1,1,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,
    -char_col:11::0,0,x,0,A,0,0,-6,0,A,0,
    +char_col:6::C,%,x,0,A,!,
    +uchar_col:6::C,%,250,0,A,!,
    +Map 1:4::3{label one 1:123|label one 2:-782.5|label one 3:444.44},3{label two 1:123|label two 2:-782.5|label two 3:444.44},3{label three 1:123|label three 2:-782.5|label three 3:444.44},3{label four 1:123|label four 2:-782.5|label four 3:444.44},
    +Unordered Map:4::3{Key one 3:444.44|Key one 2:-782.5|Key one 1:123},3{Key two 3:444.44|Key two 2:-782.5|Key two 1:123},3{Key three 3:444.44|Key three 2:-782.5|Key three 1:123},3{Key four 2:-782.5|Key four 3:444.44|Key four 1:123},
    +Str Vec:4::4[bbb|aaa|zzz|ddd],4[aaa|bbb|ccc|www],4[123|abc|345|list],3[bbb|aaa|zzz],
    +Double Set:4::3[-782.5|123|444.44],3[-782.5|0|1],3[-782.5|123|444.44],4[-782.5|100.5|123|444.44],
    +Str Set:4::3[-782.5|123.0|444.44],3[-782.5|1:123.0|:444.44],3[-782.5|123.0|444.44],4[-782.5|100.5|123.0|444.44],
    +Z Score:4::10[1.95474040557|0.552535091086|0.775388936446|-0.561817339812|0.106794118727|-0.153218675013|-0.896114748672|-1.72258101434|-0.301804546072|0.246077772077],10[-0.985180680575|-0.338649566179|1.37000434149|0.831246802651|-0.415610988193|1.06213106869|0.554158098662|0.507981245453|-1.55472278822|-1.03135753378],10[-1.39575784008|-1.62506351709|-0.907239380237|-0.159508137551|0.807541881212|0.937157205458|0.578236204203|0.717820563726|-0.0398797142361|1.0866927346],10[1.94246107491|-0.062340594565|0.246115232403|-1.24462409799|-0.190844664632|0.8115331407|0.381021476571|-1.60448155299|-0.422151990754|0.143311976349],
     
     INDEX:28::1547825036.3,1516179600.874123908,1516093200.234,1516006800.234098,1515920400.2309,1515834000.89,1515747600.123456789,1515661200.12309,1515574800.4562387,1515488400.2345609,1515402000.78,1515315600.340987645,1515229200.309812765,1515142800.93451984,1515056400.671092346,1514970000.450137234,1514883600.91256923,1514797200.67,1514624400.4562,1514538000.5,1514451600.0,1514365200.896120945,1514278800.783452098,378205200000.561209834,409741200000.346,441277200000.340987,472899600.0,504435600.871234561,
     ul_col:28::123450,123451,123452,123450,123455,123450,123449,123448,123451,123452,123452,123450,123455,123450,123454,123453,123456,123457,123458,123459,123460,123441,123442,123432,123433,123434,123435,123436,
    @@ -752,12 +765,12 @@ col_3:1::15,
     col_str:1::11,
     col_4:1::22,
     
    -INDEX:1::123451,
    -col_1:1::2,
    -col_2:1::9,
    -col_3:1::16,
    -col_str:1::22,
    -col_4:1::23,
    +INDEX:1::123452,
    +col_1:1::3,
    +col_2:1::10,
    +col_3:1::17,
    +col_str:1::33,
    +col_4:1::24,
     
     
     Testing write(json) ...
    @@ -1373,35 +1386,35 @@ INDEX:28:,ul_col:28:,dbl_col_2:26:,bool_col:6:,str_c
     
     
     
    -INDEX:28:,ul_col:28:,xint_col:28:,str_col:28:,dbl_col:28:,dbl_col_2:28:,bool_col:28:,ul_col_2:28:,xint_col_2:28:,str_col_2:28:,dbl_col_3:28:,dbl_col_2_2:28:,bool_col_2:28:,ul_col_no_idx:28:,xint_col_no_idx:28:,str_col_no_idx:28:,dbl_col_no_idx:28:,dbl_col_2_no_idx:28:,bool_col_no_idx:28:,char_col:11:
    -123432,123450,35,XXXX10,2.009,0.87865,0,123450,35,XXXX10,2.009,0.87865,0,123450,35,XXXX10,2.009,0.87865,0,0
    -123433,123451,36,XXXX11,3.111,-0.6999,1,123451,36,XXXX11,3.111,-0.6999,1,123451,36,XXXX11,3.111,-0.6999,1,0
    -123434,123452,40,XXXX01,10,0.4111,1,123452,40,XXXX01,10,0.4111,1,123452,40,XXXX01,10,0.4111,1,x
    -123435,123450,45,XXXX02,4.2222,0.1902,1,123450,45,XXXX02,4.2222,0.1902,1,123450,45,XXXX02,4.2222,0.1902,1,0
    -123436,123455,46,XXXX03,5.3333,-0.4888,0,123455,46,XXXX03,5.3333,-0.4888,0,123455,46,XXXX03,5.3333,-0.4888,0,A
    -123441,123450,33,XXXX6,12,0.2,0,123450,33,XXXX6,12,0.2,0,123450,33,XXXX6,12,0.2,0,0
    -123442,123449,34,XXXX7,6.25,0.1056,1,123449,34,XXXX7,6.25,0.1056,1,123449,34,XXXX7,6.25,0.1056,1,0
    -123448,123448,8,Running fast,10,0.1,0,123448,8,Running fast,10,0.1,0,123448,8,Running fast,10,0.1,0,-6
    -123449,123451,7,$15 increase,0.9999,0.06743,0,123451,7,$15 increase,0.9999,0.06743,0,123451,7,$15 increase,0.9999,0.06743,0,0
    -123450,123452,1,4% of something,1.2345,0.998,0,123452,1,4% of something,1.2345,0.998,0,123452,1,4% of something,1.2345,0.998,0,A
    -123450,123452,4,3.4% of GDP,4.2345,0.15678,1,123452,4,3.4% of GDP,4.2345,0.15678,1,123452,4,3.4% of GDP,4.2345,0.15678,1,0
    -123450,123450,6,Market pulls back,3,0.923,0,123450,6,Market pulls back,3,0.923,0,123450,6,Market pulls back,3,0.923,0,
    -123450,123455,12,Bonds vs. Equities,8,0.0111,1,123455,12,Bonds vs. Equities,8,0.0111,1,123455,12,Bonds vs. Equities,8,0.0111,1,
    -123450,123450,14,Here comes the sun,3.3333,-0.8888,0,123450,14,Here comes the sun,3.3333,-0.8888,0,123450,14,Here comes the sun,3.3333,-0.8888,0,
    -123451,123454,2,Description 4/5,2.2345,0.3456,0,123454,2,Description 4/5,2.2345,0.3456,0,123454,2,Description 4/5,2.2345,0.3456,0,
    -123451,123453,9,C++14 development,4.25,0.0056,0,123453,9,C++14 development,4.25,0.0056,0,123453,9,C++14 development,4.25,0.0056,0,
    -123452,123456,3,This is bad,3.2345,0.056,0,123456,3,This is bad,3.2345,0.056,0,123456,3,This is bad,3.2345,0.056,0,
    -123452,123457,10,Some explanation,0.009,0.07865,0,123457,10,Some explanation,0.009,0.07865,0,123457,10,Some explanation,0.009,0.07865,0,
    -123452,123458,11,More strings,1.111,-0.9999,0,123458,11,More strings,1.111,-0.9999,0,123458,11,More strings,1.111,-0.9999,0,
    -123453,123459,20,XXXX04,5.25,0.0456,0,123459,20,XXXX04,5.25,0.0456,0,123459,20,XXXX04,5.25,0.0456,0,
    -123454,123460,15,XXXX1,11,0.14,1,123460,15,XXXX1,11,0.14,1,123460,15,XXXX1,11,0.14,1,
    -123455,123441,5,Market drops,5.2345,0.00345,0,123441,5,Market drops,5.2345,0.00345,0,123441,5,Market drops,5.2345,0.00345,0,
    -123455,123442,13,Almost done,2.2222,0.1002,1,123442,13,Almost done,2.2222,0.1002,1,123442,13,Almost done,2.2222,0.1002,1,
    -123456,123432,22,XXXX2,1.009,0.078654,0,123432,22,XXXX2,1.009,0.078654,0,123432,22,XXXX2,1.009,0.078654,0,
    -123457,123433,23,XXXX3,2.111,-0.8999,0,123433,23,XXXX3,2.111,-0.8999,0,123433,23,XXXX3,2.111,-0.8999,0,
    -123458,123434,24,XXXX4,9,0.01119,1,123434,24,XXXX4,9,0.01119,1,123434,24,XXXX4,9,0.01119,1,
    -123459,123435,25,XXXX4,3.2222,0.8002,0,123435,25,XXXX4,3.2222,0.8002,0,123435,25,XXXX4,3.2222,0.8002,0,
    -123460,123436,30,XXXX5,4.3333,-0.9888,0,123436,30,XXXX5,4.3333,-0.9888,0,123436,30,XXXX5,4.3333,-0.9888,0,
    +INDEX:28:,ul_col:28:,xint_col:28:,str_col:28:,dbl_col:28:,dbl_col_2:28:,bool_col:28:,Map 1:4:,Unordered Map:4:,Str Vec:4:,Double Set:4:,Str Set:4:,Z Score:4:,ul_col_2:28:,xint_col_2:28:,str_col_2:28:,dbl_col_3:28:,dbl_col_2_2:28:,bool_col_2:28:,ul_col_no_idx:28:,xint_col_no_idx:28:,str_col_no_idx:28:,dbl_col_no_idx:28:,dbl_col_2_no_idx:28:,bool_col_no_idx:28:,char_col:11:
    +123432,123450,35,XXXX10,2.009,0.87865,0,3{label one 1:123|label one 2:-782.5|label one 3:444.44},3{Key one 3:444.44|Key one 2:-782.5|Key one 1:123},4[bbb|aaa|zzz|ddd],3[-782.5|123|444.44],3[-782.5|123.0|444.44],10[1.95474040557|0.552535091086|0.775388936446|-0.561817339812|0.106794118727|-0.153218675013|-0.896114748672|-1.72258101434|-0.301804546072|0.246077772077],123450,35,XXXX10,2.009,0.87865,0,123450,35,XXXX10,2.009,0.87865,0,C
    +123433,123451,36,XXXX11,3.111,-0.6999,1,3{label two 1:123|label two 2:-782.5|label two 3:444.44},3{Key two 3:444.44|Key two 2:-782.5|Key two 1:123},4[aaa|bbb|ccc|www],3[-782.5|0|1],3[-782.5|1:123.0|:444.44],10[-0.985180680575|-0.338649566179|1.37000434149|0.831246802651|-0.415610988193|1.06213106869|0.554158098662|0.507981245453|-1.55472278822|-1.03135753378],123451,36,XXXX11,3.111,-0.6999,1,123451,36,XXXX11,3.111,-0.6999,1,%
    +123434,123452,40,XXXX01,10,0.4111,1,3{label three 1:123|label three 2:-782.5|label three 3:444.44},3{Key three 3:444.44|Key three 2:-782.5|Key three 1:123},4[123|abc|345|list],3[-782.5|123|444.44],3[-782.5|123.0|444.44],10[-1.39575784008|-1.62506351709|-0.907239380237|-0.159508137551|0.807541881212|0.937157205458|0.578236204203|0.717820563726|-0.0398797142361|1.0866927346],123452,40,XXXX01,10,0.4111,1,123452,40,XXXX01,10,0.4111,1,x
    +123435,123450,45,XXXX02,4.2222,0.1902,1,3{label four 1:123|label four 2:-782.5|label four 3:444.44},3{Key four 2:-782.5|Key four 3:444.44|Key four 1:123},3[bbb|aaa|zzz],4[-782.5|100.5|123|444.44],4[-782.5|100.5|123.0|444.44],10[1.94246107491|-0.062340594565|0.246115232403|-1.24462409799|-0.190844664632|0.8115331407|0.381021476571|-1.60448155299|-0.422151990754|0.143311976349],123450,45,XXXX02,4.2222,0.1902,1,123450,45,XXXX02,4.2222,0.1902,1,0
    +123436,123455,46,XXXX03,5.3333,-0.4888,0,,,,,,,123455,46,XXXX03,5.3333,-0.4888,0,123455,46,XXXX03,5.3333,-0.4888,0,A
    +123441,123450,33,XXXX6,12,0.2,0,,,,,,,123450,33,XXXX6,12,0.2,0,123450,33,XXXX6,12,0.2,0,0
    +123442,123449,34,XXXX7,6.25,0.1056,1,,,,,,,123449,34,XXXX7,6.25,0.1056,1,123449,34,XXXX7,6.25,0.1056,1,%
    +123448,123448,8,Running fast,10,0.1,0,,,,,,,123448,8,Running fast,10,0.1,0,123448,8,Running fast,10,0.1,0,-6
    +123449,123451,7,$15 increase,0.9999,0.06743,0,,,,,,,123451,7,$15 increase,0.9999,0.06743,0,123451,7,$15 increase,0.9999,0.06743,0,0
    +123450,123452,1,4% of something,1.2345,0.998,0,,,,,,,123452,1,4% of something,1.2345,0.998,0,123452,1,4% of something,1.2345,0.998,0,A
    +123450,123452,4,3.4% of GDP,4.2345,0.15678,1,,,,,,,123452,4,3.4% of GDP,4.2345,0.15678,1,123452,4,3.4% of GDP,4.2345,0.15678,1,0
    +123450,123450,6,Market pulls back,3,0.923,0,,,,,,,123450,6,Market pulls back,3,0.923,0,123450,6,Market pulls back,3,0.923,0,
    +123450,123455,12,Bonds vs. Equities,8,0.0111,1,,,,,,,123455,12,Bonds vs. Equities,8,0.0111,1,123455,12,Bonds vs. Equities,8,0.0111,1,
    +123450,123450,14,Here comes the sun,3.3333,-0.8888,0,,,,,,,123450,14,Here comes the sun,3.3333,-0.8888,0,123450,14,Here comes the sun,3.3333,-0.8888,0,
    +123451,123454,2,Description 4/5,2.2345,0.3456,0,,,,,,,123454,2,Description 4/5,2.2345,0.3456,0,123454,2,Description 4/5,2.2345,0.3456,0,
    +123451,123453,9,C++14 development,4.25,0.0056,0,,,,,,,123453,9,C++14 development,4.25,0.0056,0,123453,9,C++14 development,4.25,0.0056,0,
    +123452,123456,3,This is bad,3.2345,0.056,0,,,,,,,123456,3,This is bad,3.2345,0.056,0,123456,3,This is bad,3.2345,0.056,0,
    +123452,123457,10,Some explanation,0.009,0.07865,0,,,,,,,123457,10,Some explanation,0.009,0.07865,0,123457,10,Some explanation,0.009,0.07865,0,
    +123452,123458,11,More strings,1.111,-0.9999,0,,,,,,,123458,11,More strings,1.111,-0.9999,0,123458,11,More strings,1.111,-0.9999,0,
    +123453,123459,20,XXXX04,5.25,0.0456,0,,,,,,,123459,20,XXXX04,5.25,0.0456,0,123459,20,XXXX04,5.25,0.0456,0,
    +123454,123460,15,XXXX1,11,0.14,1,,,,,,,123460,15,XXXX1,11,0.14,1,123460,15,XXXX1,11,0.14,1,
    +123455,123441,5,Market drops,5.2345,0.00345,0,,,,,,,123441,5,Market drops,5.2345,0.00345,0,123441,5,Market drops,5.2345,0.00345,0,
    +123455,123442,13,Almost done,2.2222,0.1002,1,,,,,,,123442,13,Almost done,2.2222,0.1002,1,123442,13,Almost done,2.2222,0.1002,1,
    +123456,123432,22,XXXX2,1.009,0.078654,0,,,,,,,123432,22,XXXX2,1.009,0.078654,0,123432,22,XXXX2,1.009,0.078654,0,
    +123457,123433,23,XXXX3,2.111,-0.8999,0,,,,,,,123433,23,XXXX3,2.111,-0.8999,0,123433,23,XXXX3,2.111,-0.8999,0,
    +123458,123434,24,XXXX4,9,0.01119,1,,,,,,,123434,24,XXXX4,9,0.01119,1,123434,24,XXXX4,9,0.01119,1,
    +123459,123435,25,XXXX4,3.2222,0.8002,0,,,,,,,123435,25,XXXX4,3.2222,0.8002,0,123435,25,XXXX4,3.2222,0.8002,0,
    +123460,123436,30,XXXX5,4.3333,-0.9888,0,,,,,,,123436,30,XXXX5,4.3333,-0.9888,0,123436,30,XXXX5,4.3333,-0.9888,0,
     
     
     
    @@ -2026,7 +2039,7 @@ INDEX:10:,string col:10:,Cool Column:10:,numbers:10:
    Date: Sun, 28 Jan 2024 10:48:09 -0500
    Subject: [PATCH 11/13] remove_column() now requires a template parameter. It
     actually frees up the memory space now
    
    ---
     docs/HTML/remove_column.html                  | 84 ++++++++++---------
     include/DataFrame/DataFrame.h                 | 23 +++--
     .../DataFrame/Internals/DataFrame_functors.h  | 17 +++-
     .../DataFrame/Internals/DataFrame_misc.tcc    | 13 +++
     include/DataFrame/Internals/DataFrame_set.tcc | 61 +++++++-------
     .../DataFrame/Internals/DataFrame_shift.tcc   | 24 ++++--
     test/dataframe_tester.cc                      |  6 +-
     7 files changed, 138 insertions(+), 90 deletions(-)
    
    diff --git a/docs/HTML/remove_column.html b/docs/HTML/remove_column.html
    index f36f170c..0faa7d27 100644
    --- a/docs/HTML/remove_column.html
    +++ b/docs/HTML/remove_column.html
    @@ -33,16 +33,17 @@
         
            
             
    
    +template<typename T>
     void
     remove_column(const char *name);
             
    - It removes a column named name.
    - The actual data vector is not deleted, but the column is dropped from DataFrame + It removes the named column and frees up the memory space.
    - name: Column name + T: Type of the named column
    + name: Column name
    @@ -50,56 +51,57 @@
    
    +template<typename T>
     void
     remove_column(std::size_t index);
             
    - It removes a column based on index.
    - The actual data vector is not deleted, but the column is dropped from DataFrame + It removes a column based on index and frees up the memory space.
    - index: Column index + T: Type of the indexed column
    + index: Column index
    -
    static void test_remove_column()  {
    -
    -    std::cout << "\nTesting remove_column() ..." << std::endl;
    -
    -    std::vector<unsigned long>  idx = { 123450, 123451, 123452, 123450, 123455, 123450, 123449 };
    -    std::vector<double> d1 = { 1, 2, 3, 4, 5, 6, 7 };
    -    std::vector<double> d2 = { 8, 9, 10, 11, 12, 13, 14 };
    -    std::vector<double> d3 = { 15, 16, 17, 18, 19, 20, 21 };
    -    std::vector<int>    i1 = { 22, 23, 24, 25 };
    -    std::vector<std::string> s1 = { "11", "22", "33", "xx", "yy", "gg", "string" };
    -    MyDataFrame         df;
    -
    -    df.load_data(std::move(idx),
    -                 std::make_pair("col_1", d1),
    -                 std::make_pair("col_2", d2),
    -                 std::make_pair("col_3", d3),
    -                 std::make_pair("col_int", i1),
    -                 std::make_pair("col_str", s1));
    -
    -    df.write<std::ostream, double, int, std::string>(std::cout);
    -    df.remove_column("col_2");
    -    std::cout << "After removing column `col_2`" << std::endl;
    -    df.write<std::ostream, double, int, std::string>(std::cout);
    -    df.remove_column("col_str");
    -    std::cout << "After removing column `col_str`" << std::endl;
    -    df.write<std::ostream, double, int, std::string>(std::cout);
    -
    -    std::vector<double> d22 = { 8, 9, 10, 11, 12, 13, 14 };
    -
    -    df.load_column("col_2", std::move(d22));
    -    std::cout << "After adding back column `col_2`" << std::endl;
    -    df.write<std::ostream, double, int, std::string>(std::cout);
    -}
    -
    - +
    static void test_remove_column()  {
    +
    +    std::cout << "\nTesting remove_column() ..." << std::endl;
    +
    +    StlVecType<unsigned long>  idx =
    +        { 123450, 123451, 123452, 123450, 123455, 123450, 123449 };
    +    StlVecType<double>         d1 = { 1, 2, 3, 4, 5, 6, 7 };
    +    StlVecType<double>         d2 = { 8, 9, 10, 11, 12, 13, 14 };
    +    StlVecType<double>         d3 = { 15, 16, 17, 18, 19, 20, 21 };
    +    StlVecType<int>            i1 = { 22, 23, 24, 25 };
    +    StlVecType<std::string>    s1 = { "11", "22", "33", "xx", "yy", "gg", "string" };
    +    MyDataFrame                df;
    +
    +    df.load_data(std::move(idx),
    +                 std::make_pair("col_1", d1),
    +                 std::make_pair("col_2", d2),
    +                 std::make_pair("col_3", d3),
    +                 std::make_pair("col_int", i1),
    +                 std::make_pair("col_str", s1));
    +
    +    df.write<std::ostream, double, int, std::string>(std::cout);
    +    df.remove_column<double>("col_2");
    +    std::cout << "After removing column `col_2`" << std::endl;
    +    df.write<std::ostream, double, int, std::string>(std::cout);
    +    df.remove_column<std::string>("col_str");
    +    std::cout << "After removing column `col_str`" << std::endl;
    +    df.write<std::ostream, double, int, std::string>(std::cout);
    +
    +    StlVecType<double> d22 = { 8, 9, 10, 11, 12, 13, 14 };
    +
    +    df.load_column<double>("col_2", std::move(d22));
    +    std::cout << "After adding back column `col_2`" << std::endl;
    +    df.write<std::ostream, double, int, std::string>(std::cout);
    +}
    +
    C++ DataFrame diff --git a/include/DataFrame/DataFrame.h b/include/DataFrame/DataFrame.h index 959ca03e..1d82363b 100644 --- a/include/DataFrame/DataFrame.h +++ b/include/DataFrame/DataFrame.h @@ -150,12 +150,22 @@ class DataFrame : public ThreadGranularity { create_column(const char *name, bool do_lock = true); // It removes a column named name. - // The actual data vector is not deleted, but the column is dropped from - // DataFrame // + // T: + // Type of the named column + // name: + // Name of the column + // + template void remove_column(const char *name); + // T: + // Type of the indexed column + // index: + // Index of the column + // + template void remove_column(size_type index); @@ -1598,7 +1608,8 @@ class DataFrame : public ThreadGranularity { [[nodiscard]] DataFrame shift(size_type periods, shift_policy sp) const; - // This copies the named column into another vector and shifts it up or down + // This copies the named column into another vector and shifts it up + // or down // and returns it. // It is handy to create columns of shifted data in the dataframe for // machine-learning analysis @@ -2492,9 +2503,9 @@ class DataFrame : public ThreadGranularity { // Hints: to match '*' or '?', put them in "[]". Like this: // abc[*]xyz matches "abc*xyz" only // - // NOTE: This could be, in some cases, n-squared. But it is pretty fast with - // moderately sized strings. I have not tested this with huge/massive - // strings. + // NOTE: This could be, in some cases, n-squared. But it is pretty fast + // with moderately sized strings. I have not tested this with + // huge/massive strings. // // T: // Type of the named column. 
Based on the concept, it can only be either diff --git a/include/DataFrame/Internals/DataFrame_functors.h b/include/DataFrame/Internals/DataFrame_functors.h index 53488d77..34d89eb4 100644 --- a/include/DataFrame/Internals/DataFrame_functors.h +++ b/include/DataFrame/Internals/DataFrame_functors.h @@ -52,7 +52,22 @@ struct consistent_functor_ : DataVec::template visitor_base { template struct shrink_to_fit_functor_ : DataVec::template visitor_base { - inline shrink_to_fit_functor_ () { } + inline shrink_to_fit_functor_() { } + + template + void operator() (T &vec) const; +}; + +// ---------------------------------------------------------------------------- + +template +struct remove_column_functor_ : DataVec::template visitor_base { + + inline remove_column_functor_ (const char *cn, DataFrame &d) + : col_name(cn), df(d) { } + + const char *col_name; + DataFrame &df; template void operator() (T &vec) const; diff --git a/include/DataFrame/Internals/DataFrame_misc.tcc b/include/DataFrame/Internals/DataFrame_misc.tcc index 7359227c..a6e3d16c 100644 --- a/include/DataFrame/Internals/DataFrame_misc.tcc +++ b/include/DataFrame/Internals/DataFrame_misc.tcc @@ -71,6 +71,19 @@ DataFrame::shrink_to_fit_functor_::operator() (T &vec) const { // ---------------------------------------------------------------------------- +template +template +template +void +DataFrame::remove_column_functor_::operator() (T &) const { + + using value_type = typename T::value_type; + + df.remove_column(col_name); +} + +// ---------------------------------------------------------------------------- + template template template diff --git a/include/DataFrame/Internals/DataFrame_set.tcc b/include/DataFrame/Internals/DataFrame_set.tcc index fa1eeec2..268527cc 100644 --- a/include/DataFrame/Internals/DataFrame_set.tcc +++ b/include/DataFrame/Internals/DataFrame_set.tcc @@ -70,30 +70,22 @@ DataFrame::create_column (const char *name, bool do_lock) { // 
---------------------------------------------------------------------------- template +template void DataFrame::remove_column (const char *name) { static_assert(std::is_base_of, DataVec>::value, "Only a StdDataFrame can call remove_column()"); - if (! ::strcmp(name, DF_INDEX_COL_NAME)) - throw DataFrameError ("DataFrame::remove_column(): ERROR: " - "Data column name cannot be 'INDEX'"); - - const auto iter = column_tb_.find (name); + ColumnVecType &vec = get_column(name); - if (iter == column_tb_.end()) { - char buffer [512]; - - snprintf (buffer, sizeof(buffer) - 1, - "DataFrame::remove_column(): ERROR: Cannot find column '%s'", - name); - throw ColNotFound (buffer); - } + // Free the memory space + // + vec = std::move(ColumnVecType{ }); // I do not erase the column from the data_ vector, because it will mess up // indices in the hash table column_tb_ /* data_.erase (data_.begin() + iter->second); */ - column_tb_.erase (iter); + column_tb_.erase (name); for (size_type i = 0; i < column_list_.size(); ++i) { if (column_list_[i].first == name) { column_list_.erase(column_list_.begin() + i); @@ -107,9 +99,10 @@ void DataFrame::remove_column (const char *name) { // ---------------------------------------------------------------------------- template +template void DataFrame::remove_column(size_type index) { - return (remove_column(column_list_[index].first.c_str())); + return (remove_column(column_list_[index].first.c_str())); } // ---------------------------------------------------------------------------- @@ -147,7 +140,13 @@ void DataFrame::rename_column (const char *from, const char *to) { column_tb_.emplace (to, from_iter->second); column_list_.emplace_back (to, from_iter->second); - remove_column(from); + column_tb_.erase (from); + for (size_type i = 0; i < column_list_.size(); ++i) { + if (column_list_[i].first == from) { + column_list_.erase(column_list_.begin() + i); + break; + } + } return; } @@ -172,7 +171,7 @@ retype_column (const char *name, 
new_vec.reserve(old_vec.size()); for (const auto &citer : old_vec) new_vec.push_back(std::move(convert_func(citer))); - remove_column(name); + remove_column(name); load_column(name, std::move(new_vec)); return; } @@ -1406,8 +1405,8 @@ consolidate(const char *old_col_name1, false); guard.release(); if (delete_old_cols) { - remove_column(old_col_name1); - remove_column(old_col_name2); + remove_column(old_col_name1); + remove_column(old_col_name2); } return; } @@ -1445,9 +1444,9 @@ consolidate(const char *old_col_name1, false); guard.release(); if (delete_old_cols) { - remove_column(old_col_name1); - remove_column(old_col_name2); - remove_column(old_col_name3); + remove_column(old_col_name1); + remove_column(old_col_name2); + remove_column(old_col_name3); } return; } @@ -1489,10 +1488,10 @@ consolidate(const char *old_col_name1, false); guard.release(); if (delete_old_cols) { - remove_column(old_col_name1); - remove_column(old_col_name2); - remove_column(old_col_name3); - remove_column(old_col_name4); + remove_column(old_col_name1); + remove_column(old_col_name2); + remove_column(old_col_name3); + remove_column(old_col_name4); } return; } @@ -1539,11 +1538,11 @@ consolidate(const char *old_col_name1, false); guard.release(); if (delete_old_cols) { - remove_column(old_col_name1); - remove_column(old_col_name2); - remove_column(old_col_name3); - remove_column(old_col_name4); - remove_column(old_col_name5); + remove_column(old_col_name1); + remove_column(old_col_name2); + remove_column(old_col_name3); + remove_column(old_col_name4); + remove_column(old_col_name5); } return; } diff --git a/include/DataFrame/Internals/DataFrame_shift.tcc b/include/DataFrame/Internals/DataFrame_shift.tcc index 7cf308b9..bcaa2520 100644 --- a/include/DataFrame/Internals/DataFrame_shift.tcc +++ b/include/DataFrame/Internals/DataFrame_shift.tcc @@ -65,18 +65,26 @@ void DataFrame::self_shift(size_type periods, shift_policy sp) { for (auto &fut : futuers) fut.get(); } - else { + else { for 
(size_type idx = 0; idx < num_cols; ++idx) [[likely]] data_[idx].change(functor); } } - else if (sp == shift_policy::left) { - while (periods-- > 0) - remove_column(column_list_.front().first.c_str()); - } - else if (sp == shift_policy::right) { - while (periods-- > 0) - remove_column(column_list_.back().first.c_str()); + else { + while (periods-- > 0) { + const char *col_name = + (sp == shift_policy::left) + ? column_list_.front().first.c_str() + : column_list_.back().first.c_str(); + remove_column_functor_ functor (col_name, *this); + + for (const auto &citer : column_list_) { + if (citer.first == col_name) { + data_[citer.second].change(functor); + break; + } + } + } } } } diff --git a/test/dataframe_tester.cc b/test/dataframe_tester.cc index 339fe268..79393e06 100644 --- a/test/dataframe_tester.cc +++ b/test/dataframe_tester.cc @@ -656,7 +656,7 @@ static void test_remove_column() { StlVecType i1 = { 22, 23, 24, 25 }; StlVecType s1 = { "11", "22", "33", "xx", "yy", "gg", "string" }; - MyDataFrame df; + MyDataFrame df; df.load_data(std::move(idx), std::make_pair("col_1", d1), @@ -666,10 +666,10 @@ static void test_remove_column() { std::make_pair("col_str", s1)); df.write(std::cout); - df.remove_column("col_2"); + df.remove_column("col_2"); std::cout << "After removing column `col_2`" << std::endl; df.write(std::cout); - df.remove_column("col_str"); + df.remove_column("col_str"); std::cout << "After removing column `col_str`" << std::endl; df.write(std::cout); From d0e695993809641af5abd6e41cb2715aa24602fc Mon Sep 17 00:00:00 2001 From: Hossein Moein Date: Tue, 30 Jan 2024 09:09:55 -0500 Subject: [PATCH 12/13] Implemented clear() --- docs/HTML/DataFrame.html | 4 ++ docs/HTML/remove_column.html | 72 +++++++++++++++++++ include/DataFrame/DataFrame.h | 10 ++- include/DataFrame/Internals/DataFrame_set.tcc | 16 +++++ test/dataframe_tester_3.cc | 62 ++++++++++++++++ 5 files changed, 163 insertions(+), 1 deletion(-) diff --git a/docs/HTML/DataFrame.html 
b/docs/HTML/DataFrame.html index 73b92a5e..80dc804c 100644 --- a/docs/HTML/DataFrame.html +++ b/docs/HTML/DataFrame.html @@ -181,6 +181,10 @@

    API Reference with code samples

    bucketize_async( ) + + clear( ) + + col_name_to_idx( ) diff --git a/docs/HTML/remove_column.html b/docs/HTML/remove_column.html index 0faa7d27..1a888c8b 100644 --- a/docs/HTML/remove_column.html +++ b/docs/HTML/remove_column.html @@ -65,6 +65,26 @@ + + + + + + + +
    
    +void
    +clear();
    +        
    + + + This removes all the index and data columns but doesn't necessarily free memory space of underlying containers. After this call DataFrame will be empty.
    + It is very similar to std::vector clear()
    + + + + +
    static void test_remove_column()  {
    @@ -101,6 +121,58 @@
         std::cout << "After adding back column `col_2`" << std::endl;
         df.write<std::ostream, double, int, std::string>(std::cout);
     }
    +
    +
    +// -----------------------------------------------------------------------------
    +
    +static void test_clear()  {
    +
    +    std::cout << "\nTesting clear( ) ..." << std::endl;
    +
    +    StlVecType<unsigned long>  idx =
    +        { 123450, 123451, 123452, 123453, 123454, 123455, 123456, 123457, 123458, 123459, 123460, 123461, 123462, 123466 };
    +    StlVecType<double> d1 = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 };
    +    StlVecType<double> d2 = { 8, 9, 10, 11, 12, 13, 14, 20, 22, 23, 30, 31, 32, 1.89 };
    +    StlVecType<double> d3 = { 15, 16, 17, 18, 19, 20, 21, 0.34, 1.56, 0.34, 2.3, 0.1, 0.89, 0.45 };
    +    StlVecType<int>    i1 = { 22, 23, 24, 25, 99, 100, 101, 3, 2 };
    +    StlVecType<std::string>    strvec = { "zz", "bb", "cc", "ww", "ee", "ff", "gg", "hh", "ii", "jj", "kk", "ll", "mm", "nn" };
    +    MyDataFrame        df1;
    +
    +    df1.load_data(std::move(idx),
    +                  std::make_pair("col_1", d1),
    +                  std::make_pair("col_2", d2),
    +                  std::make_pair("col_3", d3),
    +                  std::make_pair("col_4", i1),
    +                  std::make_pair("str_col", strvec));
    +
    +    StlVecType<unsigned long>  idx2 =
    +        { 123450, 123451, 123452, 123453, 123454, 123455, 123456, 123457, 123458, 123459, 123460, 123461, 123462, 123466 };
    +    StlVecType<double> d12 = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 };
    +    StlVecType<double> d22 = { 8, 9, 10, 11, 12, 13, 14, 20, 22, 23, 30, 31, 32, 1.89 };
    +    StlVecType<double> d32 = { 15, 16, 17, 18, 19, 20, 21, 0.34, 1.56, 0.34, 2.3, 0.1, 0.89, 0.45 };
    +    StlVecType<int>    i12 = { 22, 23, 24, 25, 99, 100, 101, 3, 2 };
    +    StlVecType<std::string>    strvec2 = { "zz", "bb", "cc", "ww", "ee", "ff", "gg", "hh", "ii", "jj", "kk", "ll", "mm", "nn" };
    +    MyDataFrame        df2;
    +
    +    df2.load_data(std::move(idx2),
    +                  std::make_pair("col_1", d12),
    +                  std::make_pair("col_2", d22),
    +                  std::make_pair("col_3", d32),
    +                  std::make_pair("col_4", i12),
    +                  std::make_pair("str_col", strvec2));
    +
    +    df1.clear();
    +    assert(df1.empty());
    +    assert(df1.shapeless());
    +    assert(df2.get_index()[4] == 123454);
    +    assert(df2.get_column<int>("col_4")[7] == 3);
    +    assert(df2.get_column<std::string>("str_col")[5] == "ff");
    +
    +    df1 = df2;
    +    assert(df1.get_index()[4] == 123454);
    +    assert(df1.get_column<int>("col_4")[7] == 3);
    +    assert(df1.get_column<std::string>("str_col")[5] == "ff");
    +}
     
    C++ DataFrame using AllocatorType = typename allocator_declare::type; @@ -169,6 +169,14 @@ class DataFrame : public ThreadGranularity { void remove_column(size_type index); + // This removes all the index and data columns but doesn't necessarily + // free memeory space of underlying containers. After this call DataFrame + // will be empty. + // It is very similar to std::vector clear() + // + void + clear(); + // It renames column named from to to. If column from does not exist, // it throws an exception // diff --git a/include/DataFrame/Internals/DataFrame_set.tcc b/include/DataFrame/Internals/DataFrame_set.tcc index 268527cc..2c3bdf65 100644 --- a/include/DataFrame/Internals/DataFrame_set.tcc +++ b/include/DataFrame/Internals/DataFrame_set.tcc @@ -107,6 +107,22 @@ void DataFrame::remove_column(size_type index) { // ---------------------------------------------------------------------------- +template +void DataFrame::clear() { + + { + const SpinGuard guard(lock_); + + data_.clear(); + } + indices_.clear(); + column_tb_.clear(); + column_list_.clear(); + return; +} + +// ---------------------------------------------------------------------------- + template void DataFrame::rename_column (const char *from, const char *to) { diff --git a/test/dataframe_tester_3.cc b/test/dataframe_tester_3.cc index 45d78f2f..93343689 100644 --- a/test/dataframe_tester_3.cc +++ b/test/dataframe_tester_3.cc @@ -2647,6 +2647,67 @@ static void test_get_data_by_like() { // ----------------------------------------------------------------------------- +static void test_clear() { + + std::cout << "\nTesting clear( ) ..." 
<< std::endl; + + StlVecType idx = + { 123450, 123451, 123452, 123453, 123454, 123455, 123456, + 123457, 123458, 123459, 123460, 123461, 123462, 123466 }; + StlVecType d1 = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 }; + StlVecType d2 = { 8, 9, 10, 11, 12, 13, 14, 20, 22, 23, + 30, 31, 32, 1.89 }; + StlVecType d3 = { 15, 16, 17, 18, 19, 20, 21, + 0.34, 1.56, 0.34, 2.3, 0.1, 0.89, 0.45 }; + StlVecType i1 = { 22, 23, 24, 25, 99, 100, 101, 3, 2 }; + StlVecType strvec = + { "zz", "bb", "cc", "ww", "ee", "ff", "gg", "hh", "ii", "jj", "kk", + "ll", "mm", "nn" }; + MyDataFrame df1; + + df1.load_data(std::move(idx), + std::make_pair("col_1", d1), + std::make_pair("col_2", d2), + std::make_pair("col_3", d3), + std::make_pair("col_4", i1), + std::make_pair("str_col", strvec)); + + StlVecType idx2 = + { 123450, 123451, 123452, 123453, 123454, 123455, 123456, + 123457, 123458, 123459, 123460, 123461, 123462, 123466 }; + StlVecType d12 = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 }; + StlVecType d22 = { 8, 9, 10, 11, 12, 13, 14, 20, 22, 23, + 30, 31, 32, 1.89 }; + StlVecType d32 = { 15, 16, 17, 18, 19, 20, 21, + 0.34, 1.56, 0.34, 2.3, 0.1, 0.89, 0.45 }; + StlVecType i12 = { 22, 23, 24, 25, 99, 100, 101, 3, 2 }; + StlVecType strvec2 = + { "zz", "bb", "cc", "ww", "ee", "ff", "gg", "hh", "ii", "jj", "kk", + "ll", "mm", "nn" }; + MyDataFrame df2; + + df2.load_data(std::move(idx2), + std::make_pair("col_1", d12), + std::make_pair("col_2", d22), + std::make_pair("col_3", d32), + std::make_pair("col_4", i12), + std::make_pair("str_col", strvec2)); + + df1.clear(); + assert(df1.empty()); + assert(df1.shapeless()); + assert(df2.get_index()[4] == 123454); + assert(df2.get_column("col_4")[7] == 3); + assert(df2.get_column("str_col")[5] == "ff"); + + df1 = df2; + assert(df1.get_index()[4] == 123454); + assert(df1.get_column("col_4")[7] == 3); + assert(df1.get_column("str_col")[5] == "ff"); +} + +// ----------------------------------------------------------------------------- + 
int main(int, char *[]) { MyDataFrame::set_optimum_thread_level(); @@ -2704,6 +2765,7 @@ int main(int, char *[]) { test_inversion_count(); test__like_clause_compare_(); test_get_data_by_like(); + test_clear(); return (0); } From 1bf3e3928eb996574d32f1e727b9d818337300bf Mon Sep 17 00:00:00 2001 From: Hossein Moein Date: Wed, 31 Jan 2024 10:18:56 -0500 Subject: [PATCH 13/13] Implemented swap() --- docs/HTML/DataFrame.html | 4 + docs/HTML/remove_column.html | 58 ++++++++- include/DataFrame/DataFrame.h | 9 ++ include/DataFrame/Internals/DataFrame_set.tcc | 16 +++ test/dataframe_tester_3.cc | 118 +++++++++++++----- 5 files changed, 171 insertions(+), 34 deletions(-) diff --git a/docs/HTML/DataFrame.html b/docs/HTML/DataFrame.html index 80dc804c..d124ca2a 100644 --- a/docs/HTML/DataFrame.html +++ b/docs/HTML/DataFrame.html @@ -525,6 +525,10 @@

    API Reference with code samples

    sort_async( 5 ) + + swap( ) + + to_string( ) diff --git a/docs/HTML/remove_column.html b/docs/HTML/remove_column.html index 1a888c8b..a75118f0 100644 --- a/docs/HTML/remove_column.html +++ b/docs/HTML/remove_column.html @@ -65,10 +65,6 @@ - - - - @@ -85,6 +81,23 @@ + + + +
    
    +void
    +swap(DataFrame &other);
    +        
    + + + This swaps all self's index and data columns with the ones in other
    + It is very similar to std::vector swap()
    + + + other: Another DataFrame of the same type
    + + +
    static void test_remove_column()  {
    @@ -173,6 +186,43 @@
         assert(df1.get_column<int>("col_4")[7] == 3);
         assert(df1.get_column<std::string>("str_col")[5] == "ff");
     }
    +
    +
    // -----------------------------------------------------------------------------
    +
    +static void test_swap()  {  // Sample: swap() exchanges the index and data columns of two DataFrames
    +
    +    std::cout << "\nTesting swap( ) ..." << std::endl;
    +
    +    StlVecType<unsigned long>  idx =
    +        { 123450, 123451, 123452, 123453, 123454, 123455, 123456, 123457, 123458, 123459, 123460, 123461, 123462, 123466 };
    +    StlVecType<double> d1 = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 };
    +    StlVecType<double> d2 = { 8, 9, 10, 11, 12, 13, 14, 20, 22, 23, 30, 31, 32, 1.89 };
    +    StlVecType<double> d3 = { 15, 16, 17, 18, 19, 20, 21, 0.34, 1.56, 0.34, 2.3, 0.1, 0.89, 0.45 };
    +    StlVecType<int>    i1 = { 22, 23, 24, 25, 99, 100, 101, 3, 2 };  // 9 values (the index has 14); element 7 is the 3 asserted below
    +    StlVecType<std::string>    strvec = { "zz", "bb", "cc", "ww", "ee", "ff", "gg", "hh", "ii", "jj", "kk", "ll", "mm", "nn" };
    +    MyDataFrame        df1;
    +    MyDataFrame        df2;  // never loaded; it is the other side of the swap
    +
    +    df1.load_data(std::move(idx),
    +                  std::make_pair("col_1", d1),
    +                  std::make_pair("col_2", d2),
    +                  std::make_pair("col_3", d3),
    +                  std::make_pair("col_4", i1),
    +                  std::make_pair("str_col", strvec));  // only df1 receives data
    +
    +    assert(df2.empty());  // precondition: nothing has been loaded into df2
    +    assert(df2.shapeless());
    +    assert(df1.get_index()[4] == 123454);  // precondition: df1 holds the loaded index and columns
    +    assert(df1.get_column<int>("col_4")[7] == 3);
    +    assert(df1.get_column<std::string>("str_col")[5] == "ff");
    +
    +    df1.swap(df2);  // exchange everything between df1 and df2
    +    assert(df1.empty());  // df1 now has the state df2 had before the swap
    +    assert(df1.shapeless());
    +    assert(df2.get_index()[4] == 123454);  // df2 now holds what df1 had before the swap
    +    assert(df2.get_column<int>("col_4")[7] == 3);
    +    assert(df2.get_column<std::string>("str_col")[5] == "ff");
    +}
     
    C++ DataFrame::clear() { // ---------------------------------------------------------------------------- +template +void DataFrame::swap(DataFrame &other) { + + { + const SpinGuard guard(lock_); + + data_.swap(other.data_); + } + indices_.swap(other.indices_); + column_tb_.swap(other.column_tb_); + column_list_.swap(other.column_list_); + return; +} + +// ---------------------------------------------------------------------------- + template void DataFrame::rename_column (const char *from, const char *to) { diff --git a/test/dataframe_tester_3.cc b/test/dataframe_tester_3.cc index 93343689..08e73477 100644 --- a/test/dataframe_tester_3.cc +++ b/test/dataframe_tester_3.cc @@ -293,14 +293,14 @@ static void test_to_from_string() { 123457, 123458, 123459, 123460, 123461, 123462, 123466 }; StlVecType d1 = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 }; StlVecType d2 = { 8, 9, 10, 11, 12, 13, 14, 20, 22, 23, - 30, 31, 32, 1.89 }; + 30, 31, 32, 1.89 }; StlVecType d3 = { 15, 16, 17, 18, 19, 20, 21, - 0.34, 1.56, 0.34, 2.3, 0.1, 0.89, 0.45 }; + 0.34, 1.56, 0.34, 2.3, 0.1, 0.89, 0.45 }; StlVecType i1 = { 22, 23, 24, 25, 99, 100, 101, 3, 2 }; StlVecType strvec = { "zz", "bb", "cc", "ww", "ee", "ff", "gg", "hh", "ii", "jj", "kk", "ll", "mm", "nn" }; - MyDataFrame df; + MyDataFrame df; df.load_data(std::move(idx), std::make_pair("col_1", d1), @@ -365,7 +365,8 @@ static void test_BiasVisitor() { df.remove_data_by_loc({ 0, 1500 }); using avg1 = MeanVisitor; - avg1 avg1_v; + + avg1 avg1_v; bias_v bias1 (avg1_v); df.single_act_visit("IBM_Close", bias1); @@ -381,7 +382,8 @@ static void test_BiasVisitor() { assert(std::abs(bias1.get_result()[210] - 0.0242) < 0.0001); using s_avg1 = StableMeanVisitor; - s_avg1 s_avg1_v; + + s_avg1 s_avg1_v; bias_v s_bias1 (s_avg1_v); df.single_act_visit("IBM_Close", s_bias1); @@ -397,7 +399,8 @@ static void test_BiasVisitor() { assert(std::abs(s_bias1.get_result()[210] - 0.0242) < 0.0001); using avg2 = WeightedMeanVisitor; - avg2 avg2_v; + + 
avg2 avg2_v; bias_v bias2 (avg2_v); df.single_act_visit("IBM_Close", bias2); @@ -413,7 +416,8 @@ static void test_BiasVisitor() { assert(std::abs(bias2.get_result()[210] - 0.0168) < 0.0001); using avg3 = GeometricMeanVisitor; - avg3 avg3_v; + + avg3 avg3_v; bias_v bias3 (avg3_v); df.single_act_visit("IBM_Close", bias3); @@ -429,7 +433,8 @@ static void test_BiasVisitor() { assert(std::abs(bias3.get_result()[210] - 0.0245) < 0.0001); using avg4 = HarmonicMeanVisitor; - avg4 avg4_v; + + avg4 avg4_v; bias_v bias4 (avg4_v); df.single_act_visit("IBM_Close", bias4); @@ -1081,7 +1086,7 @@ static void test_TreynorRatioVisitor() { { 0.2, 0.58, -0.60, -0.08, 0.05, 0.87, 0.2, 0.4, 0.5, 0.06, 0.3, -0.34, -0.9, 0.8, -0.4, 0.86, 0.01, 1.02, -0.02, -1.5, 0.2 }; StlVecType i1 = { 22, 23, 24, 25, 99 }; - MyDataFrame df; + MyDataFrame df; df.load_data(std::move(idx), std::make_pair("asset", d1), @@ -2174,14 +2179,12 @@ static void test_read_csv_with_vector() { assert(df.get_index().size() == 564); assert((std::fabs( - df.get_column("Close")[4] - 1.0201) < 0.0001)); + df.get_column("Close")[4] - 1.0201) < 0.0001)); assert((df.get_column("Volume")[4] == 3724291200)); - assert((std::fabs( - df.get_column> - ("Z Score")[4][1] - -0.329) < 0.0001)); - assert((std::fabs( - df.get_column> - ("Return Vector")[4][3] - -0.0182) < 0.0001)); + assert((std::fabs(df.get_column> + ("Z Score")[4][1] - -0.329) < 0.0001)); + assert((std::fabs(df.get_column> + ("Return Vector")[4][3] - -0.0182) < 0.0001)); assert((std::isnan( df.get_column>("Return Vector")[10][0]))); } @@ -2487,18 +2490,30 @@ static void test_inversion_count() { using IntDataFrame = StdDataFrame; - std::vector idx = { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17 }; - std::vector i1 = { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17 }; - std::vector i2 = { 17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 }; - std::vector i3 = { 1,0,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17 }; - std::vector i4 = { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,17,16 }; - 
std::vector i5 = { 1,0,2,3,4,5,6,7,8,9,10,11,12,13,14,15,17,16 }; - std::vector i6 = { 17,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,0 }; - std::vector i7 = { 0,1,2,3,4,5,6,10,8,9,7,11,12,13,14,15,16,17 }; - std::vector i8 = { 0,1,2,15,4,5,6,7,8,9,10,11,12,13,14,3,16,17 }; - std::vector i9 = { 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2 }; - std::vector i10 = { 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,3 }; - std::vector i11 = { 2,2,2,2,3,2,2,2,2,4,2,2,2,5,2,2,2,6 }; + std::vector idx = + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 }; + std::vector i1 = + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 }; + std::vector i2 = + { 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; + std::vector i3 = + { 1, 0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 }; + std::vector i4 = + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17, 16 }; + std::vector i5 = + { 1, 0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17, 16 }; + std::vector i6 = + { 17, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0 }; + std::vector i7 = + { 0, 1, 2, 3, 4, 5, 6, 10, 8, 9, 7, 11, 12, 13, 14, 15, 16, 17 }; + std::vector i8 = + { 0, 1, 2, 15, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 3, 16, 17 }; + std::vector i9 = + { 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 }; + std::vector i10 = + { 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3 }; + std::vector i11 = + { 2, 2, 2, 2, 3, 2, 2, 2, 2, 4, 2, 2, 2, 5, 2, 2, 2, 6 }; IntDataFrame df; df.load_data(std::move(idx), @@ -2656,9 +2671,9 @@ static void test_clear() { 123457, 123458, 123459, 123460, 123461, 123462, 123466 }; StlVecType d1 = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 }; StlVecType d2 = { 8, 9, 10, 11, 12, 13, 14, 20, 22, 23, - 30, 31, 32, 1.89 }; + 30, 31, 32, 1.89 }; StlVecType d3 = { 15, 16, 17, 18, 19, 20, 21, - 0.34, 1.56, 0.34, 2.3, 0.1, 0.89, 0.45 }; + 0.34, 1.56, 0.34, 2.3, 0.1, 0.89, 0.45 }; StlVecType i1 = { 22, 23, 24, 25, 99, 100, 101, 3, 2 }; StlVecType 
strvec = { "zz", "bb", "cc", "ww", "ee", "ff", "gg", "hh", "ii", "jj", "kk", @@ -2708,6 +2723,48 @@ static void test_clear() { // ----------------------------------------------------------------------------- +static void test_swap() { + + std::cout << "\nTesting swap( ) ..." << std::endl; + + StlVecType idx = + { 123450, 123451, 123452, 123453, 123454, 123455, 123456, + 123457, 123458, 123459, 123460, 123461, 123462, 123466 }; + StlVecType d1 = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 }; + StlVecType d2 = { 8, 9, 10, 11, 12, 13, 14, 20, 22, 23, + 30, 31, 32, 1.89 }; + StlVecType d3 = { 15, 16, 17, 18, 19, 20, 21, + 0.34, 1.56, 0.34, 2.3, 0.1, 0.89, 0.45 }; + StlVecType i1 = { 22, 23, 24, 25, 99, 100, 101, 3, 2 }; + StlVecType strvec = + { "zz", "bb", "cc", "ww", "ee", "ff", "gg", "hh", "ii", "jj", "kk", + "ll", "mm", "nn" }; + MyDataFrame df1; + MyDataFrame df2; + + df1.load_data(std::move(idx), + std::make_pair("col_1", d1), + std::make_pair("col_2", d2), + std::make_pair("col_3", d3), + std::make_pair("col_4", i1), + std::make_pair("str_col", strvec)); + + assert(df2.empty()); + assert(df2.shapeless()); + assert(df1.get_index()[4] == 123454); + assert(df1.get_column("col_4")[7] == 3); + assert(df1.get_column("str_col")[5] == "ff"); + + df1.swap(df2); + assert(df1.empty()); + assert(df1.shapeless()); + assert(df2.get_index()[4] == 123454); + assert(df2.get_column("col_4")[7] == 3); + assert(df2.get_column("str_col")[5] == "ff"); +} + +// ----------------------------------------------------------------------------- + int main(int, char *[]) { MyDataFrame::set_optimum_thread_level(); @@ -2766,6 +2823,7 @@ int main(int, char *[]) { test__like_clause_compare_(); test_get_data_by_like(); test_clear(); + test_swap(); return (0); }