From c7d1c7b09115b045c5d35f0ca3bf4a8749a38180 Mon Sep 17 00:00:00 2001 From: Hossein Moein Date: Mon, 9 Oct 2023 14:02:42 -0400 Subject: [PATCH] Added str_vec, dbl_set, and str_set to read/write --- data/AAPL_10dBucketWithMaps.csv | 11 +- docs/HTML/read.html | 8 +- docs/HTML/write.html | 8 +- examples/hello_world.cc | 2 +- .../DataFrame/Internals/DataFrame_read.tcc | 76 ++++++++++++ .../Internals/DataFrame_standalone.tcc | 109 ++++++++++++++++++ test/dataframe_tester_3.cc | 20 ++++ 7 files changed, 225 insertions(+), 9 deletions(-) diff --git a/data/AAPL_10dBucketWithMaps.csv b/data/AAPL_10dBucketWithMaps.csv index 7e70d05f..b96984d1 100644 --- a/data/AAPL_10dBucketWithMaps.csv +++ b/data/AAPL_10dBucketWithMaps.csv @@ -1,6 +1,5 @@ -INDEX:4:,Open:4:,High:4:,Low:4:,Close:4:,Mean:4:,Median:4:,25% Quantile:4:,Std:4:,MAD:4:,Map 1:4:,Unordered Map:4:,Volume:4: -01/14/2000 00:00:00.000,0.999442,0.999442,0.77846,0.896763,0.8819754,0.8805805,0.8560265,0.0600931968588,0.0436942,3{label one 1:123.0|label one 2:-782.5|label one 3:444.44},3{Key one 1:123.0|Key one 2:-782.5|Key one 3:444.44},6400945600 -01/31/2000 00:00:00.000,0.928013,1.013393,0.907366,0.926339,0.9637277,0.966797,0.938337,0.0362519289143,0.0313617,3{label two 1:123.0|label two 2:-782.5|label two 3:444.44},3{Key two 1:123.0|Key two 2:-782.5|Key two 3:444.44},6154232000 -02/14/2000 00:00:00.000,0.895089,1.03404,0.882254,1.03404,0.9732142,0.988281,0.9433595,0.0559733198384,0.0462054,3{label three 1:123.0|label three 2:-782.5|label three 3:444.44},3{Key three 1:123.0|Key three 2:-782.5|Key three 3:444.44},3714592000 -02/29/2000 00:00:00.000,1.0625,1.0625,0.985491,1.023438,1.0203265,1.0212055,1.013672,0.0217113745778,0.0153041,3{label four 1:123.0|label four 2:-782.5|label four 3:444.44},3{Key four 1:123.0|Key four 2:-782.5|Key four 3:444.44},3605190400 - +INDEX:4:,Open:4:,High:4:,Low:4:,Close:4:,Mean:4:,Median:4:,25% Quantile:4:,Std:4:,MAD:4:,Map 1:4:,Unordered Map:4:,Str Vec:4,Double Set:4:,Str Set:4:,Volume:4: 
+01/14/2000 00:00:00.000,0.999442,0.999442,0.77846,0.896763,0.8819754,0.8805805,0.8560265,0.060093197,0.0436942,3{label one 1:123.0|label one 2:-782.5|label one 3:444.44},3{Key one 1:123.0|Key one 2:-782.5|Key one 3:444.44},4[bbb|aaa|zzz|ddd],3[123.0|-782.5|444.44],3[123.0|-782.5|444.44],6400945600 +01/31/2000 00:00:00.000,0.928013,1.013393,0.907366,0.926339,0.9637277,0.966797,0.938337,0.036251929,0.0313617,3{label two 1:123.0|label two 2:-782.5|label two 3:444.44},3{Key two 1:123.0|Key two 2:-782.5|Key two 3:444.44},4[aaa|bbb|ccc|www],3[1:123.0|-782.5|:444.44],3[1:123.0|-782.5|:444.44],6154232000 +02/14/2000 00:00:00.000,0.895089,1.03404,0.882254,1.03404,0.9732142,0.988281,0.9433595,0.05597332,0.0462054,3{label three 1:123.0|label three 2:-782.5|label three 3:444.44},3{Key three 1:123.0|Key three 2:-782.5|Key three 3:444.44},4[123|abc|345|list],3[123.0|-782.5|444.44],3[123.0|-782.5|444.44],3714592000 +02/29/2000 00:00:00.000,1.0625,1.0625,0.985491,1.023438,1.0203265,1.0212055,1.013672,0.021711375,0.0153041,3{label four 1:123.0|label four 2:-782.5|label four 3:444.44},3{Key four 1:123.0|Key four 2:-782.5|Key four 3:444.44},3[bbb|aaa|zzz],4[123.0|-782.5|444.44|100.5],4[123.0|-782.5|444.44|100.5],3605190400 \ No newline at end of file diff --git a/docs/HTML/read.html b/docs/HTML/read.html index 0c39cd1c..cc806e23 100644 --- a/docs/HTML/read.html +++ b/docs/HTML/read.html @@ -103,8 +103,14 @@ DateTimeAME -- DateTime string printed in American style (MM/DD/YYYY HH:MM:SS.mmm) DateTimeEUR -- DateTime string printed in European style (YYYY/MM/DD HH:MM:SS.mmm) DateTimeISO -- DateTime string printed in ISO style (YYYY-MM-DD HH:MM:SS.mmm) - dbl_vector -- A vector of double precision values, The vector is printed as "s[d1|d2|...]" + dbl_vec -- A vector of double precision values, The vector is printed as "s[d1|d2|...]" where s is the size of the vector and d's are the double values. 
+ str_vec -- A vector of std::string values, The vector is printed as "s[str1|str2|...]" + where s is the size of the vector and str's are the strings. + dbl_set -- A set of double precision values, The set is printed as "s[d1|d2|...]" + where s is the size of the set and d's are the double values. + str_set -- A set of std::string values, The set is printed as "s[str1|str2|...]" + where s is the size of the set and str's are the strings. str_dbl_map -- A map of string keys to double precision values, The map is printed as "s{k1:v1|k2:v2|...}" where s is the size of the map and k's and v's are keys and values. str_dbl_unomap -- An unordered map of string keys to double precision values, The map is printed as "s{k1:v1|k2:v2|...}" diff --git a/docs/HTML/write.html b/docs/HTML/write.html index 4740b216..86f607ef 100644 --- a/docs/HTML/write.html +++ b/docs/HTML/write.html @@ -104,8 +104,14 @@ DateTimeAME -- DateTime string printed in American style (MM/DD/YYYY HH:MM:SS.mmm) DateTimeEUR -- DateTime string printed in European style (YYYY/MM/DD HH:MM:SS.mmm) DateTimeISO -- DateTime string printed in ISO style (YYYY-MM-DD HH:MM:SS.mmm) - dbl_vector -- A vector of double precision values, The vector is printed as "s[d1|d2|...]" + dbl_vec -- A vector of double precision values, The vector is printed as "s[d1|d2|...]" where s is the size of the vector and d's are the double values. + str_vec -- A vector of std::string values, The vector is printed as "s[str1|str2|...]" + where s is the size of the vector and str's are the strings. + dbl_set -- A set of double precision values, The set is printed as "s[d1|d2|...]" + where s is the size of the set and d's are the double values. + str_set -- A set of std::string values, The set is printed as "s[str1|str2|...]" + where s is the size of the set and str's are the strings. 
str_dbl_map -- A map of string keys to double precision values, The map is printed as "s{k1:v1|k2:v2|...}" where s is the size of the map and k's and v's are keys and values. str_dbl_unomap -- An unordered map of string keys to double precision values, The map is printed as "s{k1:v1|k2:v2|...}" diff --git a/examples/hello_world.cc b/examples/hello_world.cc index 36fada9f..44235040 100644 --- a/examples/hello_world.cc +++ b/examples/hello_world.cc @@ -51,7 +51,7 @@ using StrDataFrame = StdDataFrame; // using DTDataFrame = StdDataFrame; -// This is just some arbitrary type to show how any type could be in DataFrame +// This is just some arbitrary type to show how any type, including the DataFrame itself, could be in DataFrame // struct MyData { int i { 10 }; diff --git a/include/DataFrame/Internals/DataFrame_read.tcc b/include/DataFrame/Internals/DataFrame_read.tcc index 78a1e33d..e2c72f2c 100644 --- a/include/DataFrame/Internals/DataFrame_read.tcc +++ b/include/DataFrame/Internals/DataFrame_read.tcc @@ -514,6 +514,7 @@ read_csv2_(std::istream &stream, stream.unget(); // First get the header which is column names, sizes and types + // if (! header_read) [[unlikely]] { col_name.clear(); type_str.clear(); @@ -532,6 +533,7 @@ read_csv2_(std::istream &stream, size_type row_cnt = 0; // Jump to the starting row + // while (row_cnt < starting_row && stream.get(c)) if (c == '\r' || c == '\n') row_cnt += 1; @@ -592,6 +594,7 @@ read_csv2_(std::istream &stream, col_name.c_str(), nrows); // This includes DateTime, DateTimeAME, DateTimeEUR, DateTimeISO + // else if (! 
::strncmp(type_str.c_str(), "DateTime", 8)) spec_vec.emplace_back(StlVecType(), type_str.c_str(), @@ -607,6 +610,21 @@ read_csv2_(std::istream &stream, type_str.c_str(), col_name.c_str(), nrows); + else if (type_str == "str_vec") + spec_vec.emplace_back(StlVecType>{ }, + type_str.c_str(), + col_name.c_str(), + nrows); + else if (type_str == "dbl_set") + spec_vec.emplace_back(StlVecType>{ }, + type_str.c_str(), + col_name.c_str(), + nrows); + else if (type_str == "str_set") + spec_vec.emplace_back(StlVecType>{ }, + type_str.c_str(), + col_name.c_str(), + nrows); else if (type_str == "str_dbl_map") spec_vec.emplace_back( StlVecType>{ }, @@ -743,6 +761,39 @@ read_csv2_(std::istream &stream, value.c_str()))); } } + else if (col_spec.type_spec == "str_vec") { + if (! value.empty()) { + StlVecType> &vec = + std::any_cast> &> + (col_spec.col_vec); + + vec.push_back( + std::move(_get_str_vec_from_value_>( + value.c_str()))); + } + } + else if (col_spec.type_spec == "dbl_set") { + using set_t = std::set; + + if (! value.empty()) { + StlVecType &vec = + std::any_cast &>(col_spec.col_vec); + + vec.push_back(std::move(_get_dbl_set_from_value_( + value.c_str()))); + } + } + else if (col_spec.type_spec == "str_set") { + using set_t = std::set; + + if (! 
value.empty()) { + StlVecType &vec = + std::any_cast &>(col_spec.col_vec); + + vec.push_back(std::move(_get_str_set_from_value_( + value.c_str()))); + } + } else if (col_spec.type_spec == "str_dbl_map") { using map_t = std::map; @@ -864,6 +915,31 @@ read_csv2_(std::istream &stream, std::move(std::any_cast> &> (col_spec.col_vec)), nan_policy::dont_pad_with_nans); + else if (col_spec.type_spec == "str_vec") + load_column>( + col_spec.col_name.c_str(), + std::move( + std::any_cast> &> + (col_spec.col_vec)), + nan_policy::dont_pad_with_nans); + else if (col_spec.type_spec == "dbl_set") { + using set_t = std::set; + + load_column( + col_spec.col_name.c_str(), + std::move(std::any_cast &> + (col_spec.col_vec)), + nan_policy::dont_pad_with_nans); + } + else if (col_spec.type_spec == "str_set") { + using set_t = std::set; + + load_column( + col_spec.col_name.c_str(), + std::move(std::any_cast &> + (col_spec.col_vec)), + nan_policy::dont_pad_with_nans); + } else if (col_spec.type_spec == "str_dbl_map") { using map_t = std::map; diff --git a/include/DataFrame/Internals/DataFrame_standalone.tcc b/include/DataFrame/Internals/DataFrame_standalone.tcc index ecda2ce7..e655e485 100644 --- a/include/DataFrame/Internals/DataFrame_standalone.tcc +++ b/include/DataFrame/Internals/DataFrame_standalone.tcc @@ -106,6 +106,20 @@ static S &operator << (S &stream, const std::vector &data) { // ---------------------------------------------------------------------------- +template +static S &operator << (S &stream, const std::set &data) { + + if (! 
data.empty()) { + stream << data.size() << '[' << *(data.cbegin()); + for (auto citer = ++(data.cbegin()); citer != data.cend(); ++citer) + stream << '|' << *citer; + stream << ']'; + } + return (stream); +} + +// ---------------------------------------------------------------------------- + template static S &operator << (S &stream, const std::array &data) { @@ -596,6 +610,101 @@ _get_dbl_vec_from_value_(const char *value) { // ---------------------------------------------------------------------------- +template +inline static typename DF::template StlVecType +_get_str_vec_from_value_(const char *value) { + + using vec_t = typename DF::template StlVecType; + + std::size_t vcnt { 0 }; + char buffer[2048]; + + while (value[vcnt] && value[vcnt] != '[') { // stop at NUL when '[' is missing + if (vcnt < sizeof(buffer) - 1) buffer[vcnt] = value[vcnt]; // keep the size prefix inside buffer + vcnt += 1; + } + buffer[vcnt < sizeof(buffer) ? vcnt : sizeof(buffer) - 1] = '\0'; + + vec_t data; + std::size_t bcnt; + + data.reserve(std::strtol(buffer, nullptr, 10)); + if (value[vcnt]) vcnt += 1; // skip [ + while (value[vcnt] && value[vcnt] != ']') { + bcnt = 0; + while (value[vcnt] && value[vcnt] != '|' && value[vcnt] != ']') + if (bcnt < sizeof(buffer) - 1) buffer[bcnt++] = value[vcnt++]; else ++vcnt; // truncate an overlong item + buffer[bcnt] = '\0'; + data.push_back(buffer); + if (value[vcnt]) vcnt += 1; // skip separator + } + return (data); +} + +// ---------------------------------------------------------------------------- + +inline static std::set +_get_dbl_set_from_value_(const char *value) { + + using set_t = typename std::set; + + std::size_t vcnt = 0; + char buffer[128]; + + while (value[vcnt] && value[vcnt] != '[') { // stop at NUL when '[' is missing + if (vcnt < sizeof(buffer) - 1) buffer[vcnt] = value[vcnt]; // keep the size prefix inside buffer + vcnt += 1; + } + buffer[vcnt < sizeof(buffer) ? vcnt : sizeof(buffer) - 1] = '\0'; // That is the count which is useless for sets + + set_t data; + std::size_t bcnt; + + if (value[vcnt]) vcnt += 1; // skip [ + while (value[vcnt] && value[vcnt] != ']') { + bcnt = 0; + while (value[vcnt] && value[vcnt] != '|' && value[vcnt] != ']') + if (bcnt < sizeof(buffer) - 1) buffer[bcnt++] = value[vcnt++]; else ++vcnt; // truncate an overlong item + buffer[bcnt] = '\0'; + data.insert(std::strtod(buffer, nullptr)); + if (value[vcnt]) vcnt += 1; // skip separator + } + return (data); +} + +// ---------------------------------------------------------------------------- + +inline static 
std::set +_get_str_set_from_value_(const char *value) { + + using set_t = typename std::set; + + std::size_t vcnt = 0; + char buffer[2048]; + + while (value[vcnt] && value[vcnt] != '[') { // stop at NUL when '[' is missing + if (vcnt < sizeof(buffer) - 1) buffer[vcnt] = value[vcnt]; // keep the size prefix inside buffer + vcnt += 1; + } + buffer[vcnt < sizeof(buffer) ? vcnt : sizeof(buffer) - 1] = '\0'; // That is the count which is useless for sets + + set_t data; + std::size_t bcnt; + + if (value[vcnt]) vcnt += 1; // skip [ + while (value[vcnt] && value[vcnt] != ']') { + bcnt = 0; + while (value[vcnt] && value[vcnt] != '|' && value[vcnt] != ']') + if (bcnt < sizeof(buffer) - 1) buffer[bcnt++] = value[vcnt++]; else ++vcnt; // truncate an overlong item + buffer[bcnt] = '\0'; + data.insert(buffer); + if (value[vcnt]) vcnt += 1; // skip separator + } + return (data); +} + +// ---------------------------------------------------------------------------- + template inline static MAP _get_str_dbl_map_from_value_(const char *value) { diff --git a/test/dataframe_tester_3.cc b/test/dataframe_tester_3.cc index 36699b05..91e9375b 100644 --- a/test/dataframe_tester_3.cc +++ b/test/dataframe_tester_3.cc @@ -2199,6 +2199,9 @@ static void test_read_csv_with_maps() { using DT_DataFrame = StdDataFrame; using map_t = std::map; using unomap_t = std::unordered_map; + using str_vec_t = std::vector; + using str_set_t = std::set; + using dbl_set_t = std::set; DT_DataFrame df; @@ -2221,6 +2224,23 @@ static void test_read_csv_with_maps() { assert((std::fabs( df.get_column ("Unordered Map")[0]["Key one 2"] - -782.5) < 0.001)); + + assert((df.get_column("Str Vec").size() == 4)); + assert((df.get_column("Str Vec")[1].size() == 4)); + assert((df.get_column("Str Vec")[3].size() == 3)); + assert((df.get_column("Str Vec")[2][2] == "345")); + + assert((df.get_column("Double Set").size() == 4)); + assert((df.get_column("Double Set")[1].size() == 3)); + assert((df.get_column("Double Set")[3].size() == 4)); + assert((*(df.get_column("Double Set")[2].find(444.44)) == + 444.44)); + + assert((df.get_column("Str Set").size() == 4)); + assert((df.get_column("Str Set")[1].size() == 3)); + assert((df.get_column("Str Set")[3].size() == 4)); + assert((*(df.get_column("Str 
Set")[0].find("123.0")) == + "123.0")); } catch (const DataFrameError &ex) { std::cout << ex.what() << std::endl;