From 89459fb71a9df59013a3b98126ac82c9e61b43d9 Mon Sep 17 00:00:00 2001 From: Hossein Moein Date: Mon, 27 May 2024 11:39:04 -0400 Subject: [PATCH 1/4] Implemented serialize() and deserialize() --- README.md | 2 +- examples/hello_world.cc | 2 +- include/DataFrame/DataFrame.h | 73 +++++++++++++++++-- .../DataFrame/Internals/DataFrame_read.tcc | 15 ++++ .../DataFrame/Internals/DataFrame_write.tcc | 16 +++- include/DataFrame/Utils/Utils.h | 2 +- 6 files changed, 101 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 182a8bd5..75b199ae 100644 --- a/README.md +++ b/README.md @@ -50,7 +50,7 @@ I have followed a few principles in this library:
5. [Avoid copying data as much as possible](https://htmlpreview.github.io/?https://github.com/hosseinmoein/DataFrame/blob/master/docs/HTML/copying_data.html) 6. [Use multi-threading but only when it makes sense](https://htmlpreview.github.io/?https://github.com/hosseinmoein/DataFrame/blob/master/docs/HTML/multithreading.html) 7. [Do not attempt to protect the user against _garbage in_, _garbage out_](https://htmlpreview.github.io/?https://github.com/hosseinmoein/DataFrame/blob/master/docs/HTML/garbage_in_garbage_out.html) -8. DataFrame library is self-contained, meaning DataFrame only depends on _C++ language_ and its _standard library_ +8. Keep DataFrame library self-contained, meaning DataFrame must only depend on _C++ language_ and its _standard library_ --- diff --git a/examples/hello_world.cc b/examples/hello_world.cc index 889ffc36..232ee481 100644 --- a/examples/hello_world.cc +++ b/examples/hello_world.cc @@ -109,7 +109,7 @@ int main(int, char *[]) { StrDataFrame ibm_df; // Also, you can load data into a DataFrame from a file, supporting a few different formats. If the file cannot be found, - // an exception will be thrown. If the DataFrame root directory is your current directory when running this, it should + // an exception will be thrown. If the DataFrame data directory is your current directory when running this, it should // work fine. 
// ibm_df.read("IBM.csv", io_format::csv2); diff --git a/include/DataFrame/DataFrame.h b/include/DataFrame/DataFrame.h index 1c428b6d..c05be53b 100644 --- a/include/DataFrame/DataFrame.h +++ b/include/DataFrame/DataFrame.h @@ -127,7 +127,7 @@ class DataFrame : public ThreadGranularity { // Any version of DataFrame should be assignable to any other version // template - DataFrame &assign(const OTHER &rhs); + DataFrame &assign(const OTHER &rhs); public: // Load/append/remove interfaces @@ -4157,11 +4157,17 @@ class DataFrame : public ThreadGranularity { // These functions could be used to transmit a DataFrame from one place to // another or store a DataFrame in databases, caches, … // + // NOTE: The choice between to_string() and serialize() depends on the + // dataset. Some datasets (i.e. US Options market data) mostly + // contain small floating-point/integer numbers such as '.5', '.75' + // or '123' and so on. These set of numbers will produce a smaller + // buffer size in string form compared with binary form, especially + // if there are millions of them. But generally, in most cases + // binary form is more efficient. + // // Ts: // List all the types of all data columns. A type should be specified in // the list only once. - // iof: - // Specifies the I/O format. The default is CSV // precision: // Specifies the precision for floating point numbers // @@ -4175,6 +4181,34 @@ class DataFrame : public ThreadGranularity { [[nodiscard]] std::future to_string_async(std::streamsize precision = 12) const; + // This is similar to to_string() to serialize a DataFrame into a binary + // buffer that could be restored later by calling deserialize(). It + // utilizes the write() member function of DataFrame. + // These functions could be used to transmit a DataFrame from one place to + // another or store a DataFrame in databases, caches, … + // + // NOTE: Although this returns a std::string, the string contains binary + // data including potentially many null chars. 
The best way to read + // the string is by using .data() and .size() methods on std::string + // + // NOTE: The choice between to_string() and serialize() depends on the + // dataset. Some datasets (i.e. US Options market data) mostly + // contain small floating-point/integer numbers such as '.5', '.75' + // or '123' and so on. These set of numbers will produce a smaller + // buffer size in string form compared with binary form, especially + // if there are millions of them. But generally, in most cases + // binary form is more efficient. + // + // Ts: + // List all the types of all data columns. A type should be specified in + // the list only once. + // precision: + // Specifies the precision for floating point numbers + // + template + [[nodiscard]] std::string + serialize() const; + // It inputs the contents of a text file into itself (i.e. DataFrame). // Currently two formats (i.e. csv, json) are supported specified by // the iof parameter. @@ -4253,19 +4287,48 @@ class DataFrame : public ThreadGranularity { // These functions could be used to transmit a DataFrame from one place to // another or store a DataFrame in databases, caches, … // + // NOTE: The choice between to_string() and serialize() depends on the + // dataset. Some datasets (i.e. US Options market data) mostly + // contain small floating-point/integer numbers such as '.5', '.75' + // or '123' and so on. These set of numbers will produce a smaller + // buffer size in string form compared with binary form, especially + // if there are millions of them. But generally, in most cases + // binary form is more efficient. + // // data_frame: // A null terminated string that was generated by calling to_string(). // It must contain a complete DataFrame - // iof: - // Specifies the I/O format. 
The default is CSV // bool from_string(const char *data_frame); // Same as from_string() above, but executed asynchronously + // [[nodiscard]] std::future from_string_async(const char *data_frame); + // This is a convenient function (conceptually similar to from_string()) + // to restore a DataFrame from a binary buffer that was previously + // generated by calling serialize(). It utilizes the read() member function + // of DataFrame. + // These functions could be used to transmit a DataFrame from one place to + // another or store a DataFrame in databases, caches, … + // + // NOTE: The choice between to_string() and serialize() depends on the + // dataset. Some datasets (i.e. US Options market data) mostly + // contain small floating-point/integer numbers such as '.5', '.75' + // or '123' and so on. These set of numbers will produce a smaller + // buffer size in string form compared with binary form, especially + // if there are millions of them. But generally, in most cases + // binary form is more efficient. + // + // data_frame: + // A std::string that was generated by calling serialize(). 
+ // It must contain a complete DataFrame in binary format + // + bool + deserialize(const std::string &data_frame); + private: // Internally used containers aligned with DataFrame alignment diff --git a/include/DataFrame/Internals/DataFrame_read.tcc b/include/DataFrame/Internals/DataFrame_read.tcc index 1220432e..2557c79a 100644 --- a/include/DataFrame/Internals/DataFrame_read.tcc +++ b/include/DataFrame/Internals/DataFrame_read.tcc @@ -1583,6 +1583,21 @@ DataFrame::from_string (const char *data_frame) { // ---------------------------------------------------------------------------- +template +bool +DataFrame::deserialize (const std::string &data_frame) { + + static_assert(std::is_base_of, H>::value, + "Only a StdDataFrame can call deserialize()"); + + std::stringstream ss (data_frame, std::ios_base::in); + + read(ss, io_format::binary, false); + return (true); +} + +// ---------------------------------------------------------------------------- + template std::future DataFrame:: read_async(const char *file_name, diff --git a/include/DataFrame/Internals/DataFrame_write.tcc b/include/DataFrame/Internals/DataFrame_write.tcc index d337f311..d94fdd31 100644 --- a/include/DataFrame/Internals/DataFrame_write.tcc +++ b/include/DataFrame/Internals/DataFrame_write.tcc @@ -74,6 +74,19 @@ DataFrame::to_string(std::streamsize precision) const { // ---------------------------------------------------------------------------- +template +template +std::string +DataFrame::serialize() const { + + std::stringstream ss (std::ios_base::out); + + write(ss, io_format::binary); + return (ss.str()); +} + +// ---------------------------------------------------------------------------- + template template bool DataFrame:: @@ -98,7 +111,8 @@ write(S &o, else start_row = std::max(long(0), end_row + max_recs); - o.precision(precision); + if (iof != io_format::binary) o.precision(precision); + if (iof == io_format::json) { o << "{\n"; if (! 
columns_only) [[likely]] { diff --git a/include/DataFrame/Utils/Utils.h b/include/DataFrame/Utils/Utils.h index 19bb977f..98218de8 100644 --- a/include/DataFrame/Utils/Utils.h +++ b/include/DataFrame/Utils/Utils.h @@ -347,7 +347,7 @@ shift_left(V &vec, std::size_t n) { // ---------------------------------------------------------------------------- -template +template struct IOStreamOpti { IOStreamOpti (STR &stream, const char *file_name, bool binary = false) From 774a86b285d9f84f86277efc51e2ddab3ea0b2cc Mon Sep 17 00:00:00 2001 From: Hossein Moein Date: Tue, 28 May 2024 10:18:20 -0400 Subject: [PATCH 2/4] Added tests for serialize()/deserialize() --- include/DataFrame/DataFrame.h | 14 +++++- .../DataFrame/Internals/DataFrame_read.tcc | 16 ++++++- .../DataFrame/Internals/DataFrame_write.tcc | 10 +++++ test/dataframe_tester_3.cc | 44 +++++++++++++++++-- 4 files changed, 77 insertions(+), 7 deletions(-) diff --git a/include/DataFrame/DataFrame.h b/include/DataFrame/DataFrame.h index c05be53b..550aa5e2 100644 --- a/include/DataFrame/DataFrame.h +++ b/include/DataFrame/DataFrame.h @@ -4202,13 +4202,18 @@ class DataFrame : public ThreadGranularity { // Ts: // List all the types of all data columns. A type should be specified in // the list only once. - // precision: - // Specifies the precision for floating point numbers // template [[nodiscard]] std::string serialize() const; + // Same as serialize() above, but executed asynchronously + // + template + [[nodiscard]] std::future + serialize_async() const; + + // This is similar to to_string() to serialize a DataFrame into a binary // It inputs the contents of a text file into itself (i.e. DataFrame). // Currently two formats (i.e. csv, json) are supported specified by // the iof parameter. 
@@ -4329,6 +4334,11 @@ class DataFrame : public ThreadGranularity { bool deserialize(const std::string &data_frame); + // Same as deserialize() above, but executed asynchronously + // + [[nodiscard]] std::future + deserialize_async(const std::string &data_frame); + private: // Internally used containers aligned with DataFrame alignment diff --git a/include/DataFrame/Internals/DataFrame_read.tcc b/include/DataFrame/Internals/DataFrame_read.tcc index 2557c79a..f121672c 100644 --- a/include/DataFrame/Internals/DataFrame_read.tcc +++ b/include/DataFrame/Internals/DataFrame_read.tcc @@ -1658,7 +1658,21 @@ DataFrame::from_string_async(const char *data_frame) { return (thr_pool_.dispatch(true, &DataFrame::from_string, this, - data_frame)); + data_frame) + ); +} + +// ---------------------------------------------------------------------------- + +template +std::future +DataFrame::deserialize_async(const std::string &data_frame) { + + return (thr_pool_.dispatch(true, + &DataFrame::deserialize, + this, + std::forward(data_frame) + )); } } // namespace hmdf diff --git a/include/DataFrame/Internals/DataFrame_write.tcc b/include/DataFrame/Internals/DataFrame_write.tcc index d94fdd31..31cf4969 100644 --- a/include/DataFrame/Internals/DataFrame_write.tcc +++ b/include/DataFrame/Internals/DataFrame_write.tcc @@ -312,6 +312,16 @@ to_string_async (std::streamsize precision) const { precision)); } +// ---------------------------------------------------------------------------- + +template +template +std::future DataFrame:: +serialize_async () const { + + return (thr_pool_.dispatch(true, &DataFrame::serialize, this)); +} + } // namespace hmdf // ---------------------------------------------------------------------------- diff --git a/test/dataframe_tester_3.cc b/test/dataframe_tester_3.cc index ff3f4719..0e984bb7 100644 --- a/test/dataframe_tester_3.cc +++ b/test/dataframe_tester_3.cc @@ -265,19 +265,54 @@ static void test_to_from_string() { const std::string str_dump_from_vw = 
vw.to_string(); - // std::cout << str_dump << std::endl; - MyDataFrame df2; df2.from_string(str_dump.c_str()); - // std::cout << '\n' << std::endl; - // df2.write(std::cout); assert((df.is_equal(df2))); assert(str_dump == str_dump_from_vw); } // ---------------------------------------------------------------------------- +static void test_serialize() { + + std::cout << "\nTesting test_serialize() ..." << std::endl; + + StlVecType idx = + { 123450, 123451, 123452, 123453, 123454, 123455, 123456, + 123457, 123458, 123459, 123460, 123461, 123462, 123466 }; + StlVecType d1 = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 }; + StlVecType d2 = { 8, 9, 10, 11, 12, 13, 14, 20, 22, 23, + 30, 31, 32, 1.89 }; + StlVecType d3 = { 15, 16, 17, 18, 19, 20, 21, + 0.34, 1.56, 0.34, 2.3, 0.1, 0.89, 0.45 }; + StlVecType i1 = { 22, 23, 24, 25, 99, 100, 101, 3, 2 }; + StlVecType strvec = + { "zz", "bb", "cc", "ww", "ee", "ff", "gg", "hh", "ii", "jj", "kk", + "ll", "mm", "nn" }; + MyDataFrame df; + + df.load_data(std::move(idx), + std::make_pair("col_1", d1), + std::make_pair("col_2", d2), + std::make_pair("col_3", d3), + std::make_pair("col_4", i1), + std::make_pair("str_col", strvec)); + + std::future ser_fut = + df.serialize_async(); + const std::string ser = ser_fut.get(); + + MyDataFrame df2; + + std::future deser_fut = df2.deserialize_async(ser); + + deser_fut.get(); + assert((df.is_equal(df2))); +} + +// ---------------------------------------------------------------------------- + static void test_CoppockCurveVisitor() { std::cout << "\nTesting CoppockCurveVisitor{ } ..." 
<< std::endl; @@ -4064,6 +4099,7 @@ int main(int, char *[]) { test_groupby_edge(); test_concat_view(); test_to_from_string(); + test_serialize(); test_CoppockCurveVisitor(); test_BiasVisitor(); test_BalanceOfPowerVisitor(); From fb652a2b9bfa632556148cfc641e0f21f2b598ca Mon Sep 17 00:00:00 2001 From: Hossein Moein Date: Wed, 29 May 2024 09:59:04 -0400 Subject: [PATCH 3/4] Added docs for serialize()/deserialize() --- docs/HTML/DataFrame.html | 14 ++- docs/HTML/from_string.html | 184 ++++++++++++++++++++++++++++++++ docs/HTML/read.html | 78 +------------- docs/HTML/to_string.html | 192 ++++++++++++++++++++++++++++++++++ docs/HTML/write.html | 80 -------------- include/DataFrame/DataFrame.h | 2 +- 6 files changed, 389 insertions(+), 161 deletions(-) create mode 100644 docs/HTML/from_string.html create mode 100644 docs/HTML/to_string.html diff --git a/docs/HTML/DataFrame.html b/docs/HTML/DataFrame.html index 4002d67e..272ecee5 100644 --- a/docs/HTML/DataFrame.html +++ b/docs/HTML/DataFrame.html @@ -216,6 +216,10 @@

API Reference with code samples

describe() + + deserialize()
deserialize_async()
+ + drop_missing() @@ -233,7 +237,7 @@

API Reference with code samples

- from_string()
from_string_async()
+ from_string()
from_string_async()
@@ -409,13 +413,17 @@

API Reference with code samples

- shape() + serialize()
serialize_async()
static
set_lock
() + + shape() + + shapeless() @@ -445,7 +453,7 @@

API Reference with code samples

- to_string()
to_string_async()
+ to_string()
to_string_async()
diff --git a/docs/HTML/from_string.html b/docs/HTML/from_string.html new file mode 100644 index 00000000..b35dff89 --- /dev/null +++ b/docs/HTML/from_string.html @@ -0,0 +1,184 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Signature Description Parameters
+

+bool
+from_string(const char *data_frame); 
+        
+
+ This is a convenient function (simple implementation) to restore a DataFrame from a string that was previously generated by calling to_string(). It utilizes the read() member function of DataFrame. These functions could be used to transmit a DataFrame from one place to another or store a DataFrame in databases, caches, ...

+ + NOTE: The choice between to_string() and serialize() depends on the dataset. Some datasets (e.g. US Options market data) mostly contain small floating-point/integer numbers such as '.5', '.75' or '123' and so on. Such a set of numbers will produce a smaller buffer size in string form compared with binary form, especially if there are billions of them. But generally, in most cases binary form is more efficient.
+
+ data_frame: A null terminated string that was generated by calling to_string(). It must contain a complete DataFrame
+
+

+std::future<bool>
+from_string_async(const char *data_frame); 
+        
+
+ Same as from_string() above, but executed asynchronously + +
+

+bool
+deserialize(const std::string &data_frame); 
+        
+
+ This is a convenient function (conceptually similar to from_string()) to restore a DataFrame from a binary buffer that was previously generated by calling serialize(). It utilizes the read() member function of DataFrame. These functions could be used to transmit a DataFrame from one place to another or store a DataFrame in databases, caches, ...

+ + NOTE: The choice between to_string() and serialize() depends on the dataset. Some datasets (e.g. US Options market data) mostly contain small floating-point/integer numbers such as '.5', '.75' or '123' and so on. Such a set of numbers will produce a smaller buffer size in string form compared with binary form, especially if there are billions of them. But generally, in most cases binary form is more efficient.
+
+ data_frame: A std::string that was generated by calling serialize(). It must contain a complete DataFrame in binary format
+
+

+std::future<bool>
+deserialize_async(const std::string &data_frame); 
+        
+
+ Same as deserialize() above, but executed asynchronously + +
+ +
static void test_to_from_string()  {
+
+    std::cout << "\nTesting to_from_string() ..." << std::endl;
+
+    StlVecType<unsigned long>  idx = { 123450, 123451, 123452, 123453, 123454, 123455, 123456, 123457, 123458, 123459, 123460, 123461, 123462, 123466 };
+    StlVecType<double>         d1 = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 };
+    StlVecType<double>         d2 = { 8, 9, 10, 11, 12, 13, 14, 20, 22, 23, 30, 31, 32, 1.89 };
+    StlVecType<double>         d3 = { 15, 16, 17, 18, 19, 20, 21, 0.34, 1.56, 0.34, 2.3, 0.1, 0.89, 0.45 };
+    StlVecType<int>            i1 = { 22, 23, 24, 25, 99, 100, 101, 3, 2 };
+    StlVecType<std::string>    strvec = { "zz", "bb", "cc", "ww", "ee", "ff", "gg", "hh", "ii", "jj", "kk", "ll", "mm", "nn" };
+    MyDataFrame                df;
+
+    df.load_data(std::move(idx),
+                 std::make_pair("col_1", d1),
+                 std::make_pair("col_2", d2),
+                 std::make_pair("col_3", d3),
+                 std::make_pair("col_4", i1),
+                 std::make_pair("str_col", strvec));
+
+    auto    vw = df.get_view<double, int, std::string>({ "col_1", "col_2", "col_3", "col_4", "str_col" });
+
+    std::future<std::string>    f = df.to_string_async<double, int, std::string>();
+    const std::string           str_dump = f.get();
+    const std::string           str_dump_from_vw = vw.to_string<double, int, std::string>();
+
+    MyDataFrame df2;
+
+    df2.from_string(str_dump.c_str());
+    assert((df.is_equal<double, int, std::string>(df2)));
+    assert(str_dump == str_dump_from_vw);
+}
+
+// ----------------------------------------------------------------------------
+
+static void test_serialize()  {
+
+    std::cout << "\nTesting test_serialize() ..." << std::endl;
+
+    StlVecType<unsigned long>  idx = { 123450, 123451, 123452, 123453, 123454, 123455, 123456, 123457, 123458, 123459, 123460, 123461, 123462, 123466 };
+    StlVecType&lt;double&gt;         d1 = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 };
+    StlVecType<double>         d2 = { 8, 9, 10, 11, 12, 13, 14, 20, 22, 23, 30, 31, 32, 1.89 };
+    StlVecType<double>         d3 = { 15, 16, 17, 18, 19, 20, 21, 0.34, 1.56, 0.34, 2.3, 0.1, 0.89, 0.45 };
+    StlVecType<int>            i1 = { 22, 23, 24, 25, 99, 100, 101, 3, 2 };
+    StlVecType<std::string>    strvec = { "zz", "bb", "cc", "ww", "ee", "ff", "gg", "hh", "ii", "jj", "kk", "ll", "mm", "nn" };
+    MyDataFrame                df;
+
+    df.load_data(std::move(idx),
+                 std::make_pair("col_1", d1),
+                 std::make_pair("col_2", d2),
+                 std::make_pair("col_3", d3),
+                 std::make_pair("col_4", i1),
+                 std::make_pair("str_col", strvec));
+
+    std::future<std::string>    ser_fut = df.serialize_async<double, int, std::string>();
+    const std::string           ser = ser_fut.get();
+
+    MyDataFrame df2;
+
+    std::future<bool>   deser_fut = df2.deserialize_async(ser);
+
+    deser_fut.get();
+    assert((df.is_equal<double, int, std::string>(df2)));
+}
+
+ +
C++ DataFrame + + + + + diff --git a/docs/HTML/read.html b/docs/HTML/read.html index 1f6f95c7..fbc1a6f4 100644 --- a/docs/HTML/read.html +++ b/docs/HTML/read.html @@ -221,44 +221,6 @@ - - -

-bool
-from_string(const char *data_frame); 
-        
- - - This is a convenient function (simple implementation) to restore a DataFrame from a string that was previously generated by calling to_string(). It utilizes the read() member function of DataFrame. - These functions could be used to transmit a DataFrame from one place to another or store a DataFrame in databases, caches, ...

- - I have been asked why I implemented from_string instead of/before doing "from binary format"
- Implementing a binary format as a form of serialization is a legit ask and I will add that option when I find time to implement it. But implementing a binary format is more involved. And binary format is not always more efficient than string format. Two issues stand out
-
    -
  1. Consider Options market data. Options' prices and sizes are usually smaller numbers. For example, consider the number 0.5. In string format that is 3 bytes ".5|". In binary format it is always 8 bytes. So, if you have a dataset with millions/billions of this kind of numbers, it makes a significant difference
  2. -
  3. In binary format you must deal with big-endian vs. little-endian. It is a pain in the neck and affects efficiency
  4. -
- - - - data_frame: A null terminated string that was generated by calling to_string(). It must contain a complete DataFrame
- - - - - -

-std::future<bool>
-from_string_async(const char *data_frame); 
-        
- - - Same as from_string() above, but executed asynchronously - - - - - @@ -368,45 +330,7 @@ -
-// -----------------------------------------------------------------------------
-
-static void test_to_from_string()  {
-
-    std::cout << "\nTesting to_from_string() ..." << std::endl;
-
-    std::vector<unsigned long>  idx =
-        { 123450, 123451, 123452, 123453, 123454, 123455, 123456, 123457, 123458, 123459, 123460, 123461, 123462, 123466 };
-    std::vector<double> d1 = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 };
-    std::vector<double> d2 = { 8, 9, 10, 11, 12, 13, 14, 20, 22, 23, 30, 31, 32, 1.89 };
-    std::vector<double> d3 = { 15, 16, 17, 18, 19, 20, 21, 0.34, 1.56, 0.34, 2.3, 0.1, 0.89, 0.45 };
-    std::vector<int>    i1 = { 22, 23, 24, 25, 99, 100, 101, 3, 2 };
-    std::vector<std::string>    strvec =
-        { "zz", "bb", "cc", "ww", "ee", "ff", "gg", "hh", "ii", "jj", "kk", "ll", "mm", "nn" };
-    MyDataFrame         df;
-
-    df.load_data(std::move(idx),
-                 std::make_pair("col_1", d1),
-                 std::make_pair("col_2", d2),
-                 std::make_pair("col_3", d3),
-                 std::make_pair("col_4", i1),
-                 std::make_pair("str_col", strvec));
-
-    std::future<std::string>    f = df.to_string_async<double, int, std::string>();
-    const std::string           str_dump = f.get();
-
-    // std::cout << str_dump << std::endl;
-
-    MyDataFrame df2;
-
-    df2.from_string(str_dump.c_str());
-    // std::cout << '\n' << std::endl;
-    // df2.write<std::ostream, double, int, std::string>(std::cout);
-    assert((df.is_equal<double, int, std::string>(df2)));
-}
-
- -

+
 // -----------------------------------------------------------------------------
 
 static void test_reading_in_chunks()  {
diff --git a/docs/HTML/to_string.html b/docs/HTML/to_string.html
new file mode 100644
index 00000000..58099c4c
--- /dev/null
+++ b/docs/HTML/to_string.html
@@ -0,0 +1,192 @@
+
+
+
+
+
+
+
+
+
+  
+
+    
+        
+    
+
+    
+      
+      
+      
+    
+
+    
+      
+      
+      
+    
+
+    
+      
+      
+      
+    
+
+    
+      
+      
+      
+    
+
+  
Signature Description Parameters
+

+template<typename ... Ts>
+std::string
+to_string(std::streamsize precision = 12) const; 
+        
+
+ This is a convenient function (simple implementation) to convert a DataFrame into a string that could be restored later by calling from_string(). It utilizes the write() member function of DataFrame.
+ These functions could be used to transmit a DataFrame from one place to another or store a DataFrame in databases, caches, ...

+ + NOTE: The choice between to_string() and serialize() depends on the dataset. Some datasets (e.g. US Options market data) mostly contain small floating-point/integer numbers such as '.5', '.75' or '123' and so on. Such a set of numbers will produce a smaller buffer size in string form compared with binary form, especially if there are billions of them. But generally, in most cases binary form is more efficient.
+
+ Ts: The list of types for all columns. A type should be specified only once
+ precision: Specifies the precision for floating point numbers
+
+

+template<typename ... Ts>
+std::future&lt;std::string&gt;
+to_string_async(std::streamsize precision = 12) const; 
+        
+
+ Same as to_string() above, but executed asynchronously + +
+

+template<typename ... Ts>
+std::string
+serialize() const; 
+        
+
+ This is similar to to_string(), but it serializes a DataFrame into a binary buffer that could be restored later by calling deserialize(). It utilizes the write() member function of DataFrame. These functions could be used to transmit a DataFrame from one place to another or store a DataFrame in databases, caches, ...

+ + NOTE: Although this returns a std::string, the string contains binary data including potentially many null chars. The best way to read the string is by using the .data() and .size() methods of std::string

+ NOTE: The choice between to_string() and serialize() depends on the dataset. Some datasets (e.g. US Options market data) mostly contain small floating-point/integer numbers such as '.5', '.75' or '123' and so on. Such a set of numbers will produce a smaller buffer size in string form compared with binary form, especially if there are billions of them. But generally, in most cases binary form is more efficient.
+
+ Ts: The list of types for all columns. A type should be specified only once
+
+

+template<typename ... Ts>
+std::future&lt;std::string&gt;
+serialize_async() const; 
+        
+
+ Same as serialize() above, but executed asynchronously + +
+ +
static void test_to_from_string()  {
+
+    std::cout << "\nTesting to_from_string() ..." << std::endl;
+
+    StlVecType<unsigned long>  idx = { 123450, 123451, 123452, 123453, 123454, 123455, 123456, 123457, 123458, 123459, 123460, 123461, 123462, 123466 };
+    StlVecType<double>         d1 = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 };
+    StlVecType<double>         d2 = { 8, 9, 10, 11, 12, 13, 14, 20, 22, 23, 30, 31, 32, 1.89 };
+    StlVecType<double>         d3 = { 15, 16, 17, 18, 19, 20, 21, 0.34, 1.56, 0.34, 2.3, 0.1, 0.89, 0.45 };
+    StlVecType<int>            i1 = { 22, 23, 24, 25, 99, 100, 101, 3, 2 };
+    StlVecType<std::string>    strvec = { "zz", "bb", "cc", "ww", "ee", "ff", "gg", "hh", "ii", "jj", "kk", "ll", "mm", "nn" };
+    MyDataFrame                df;
+
+    df.load_data(std::move(idx),
+                 std::make_pair("col_1", d1),
+                 std::make_pair("col_2", d2),
+                 std::make_pair("col_3", d3),
+                 std::make_pair("col_4", i1),
+                 std::make_pair("str_col", strvec));
+
+    auto    vw = df.get_view<double, int, std::string>({ "col_1", "col_2", "col_3", "col_4", "str_col" });
+
+    std::future<std::string>    f = df.to_string_async<double, int, std::string>();
+    const std::string           str_dump = f.get();
+    const std::string           str_dump_from_vw = vw.to_string<double, int, std::string>();
+
+    MyDataFrame df2;
+
+    df2.from_string(str_dump.c_str());
+    assert((df.is_equal<double, int, std::string>(df2)));
+    assert(str_dump == str_dump_from_vw);
+}
+
+// ----------------------------------------------------------------------------
+
+static void test_serialize()  {
+
+    std::cout << "\nTesting test_serialize() ..." << std::endl;
+
+    StlVecType<unsigned long>  idx = { 123450, 123451, 123452, 123453, 123454, 123455, 123456, 123457, 123458, 123459, 123460, 123461, 123462, 123466 };
+    StlVecType&lt;double&gt;         d1 = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 };
+    StlVecType<double>         d2 = { 8, 9, 10, 11, 12, 13, 14, 20, 22, 23, 30, 31, 32, 1.89 };
+    StlVecType<double>         d3 = { 15, 16, 17, 18, 19, 20, 21, 0.34, 1.56, 0.34, 2.3, 0.1, 0.89, 0.45 };
+    StlVecType<int>            i1 = { 22, 23, 24, 25, 99, 100, 101, 3, 2 };
+    StlVecType<std::string>    strvec = { "zz", "bb", "cc", "ww", "ee", "ff", "gg", "hh", "ii", "jj", "kk", "ll", "mm", "nn" };
+    MyDataFrame                df;
+
+    df.load_data(std::move(idx),
+                 std::make_pair("col_1", d1),
+                 std::make_pair("col_2", d2),
+                 std::make_pair("col_3", d3),
+                 std::make_pair("col_4", i1),
+                 std::make_pair("str_col", strvec));
+
+    std::future<std::string>    ser_fut = df.serialize_async<double, int, std::string>();
+    const std::string           ser = ser_fut.get();
+
+    MyDataFrame df2;
+
+    std::future<bool>   deser_fut = df2.deserialize_async(ser);
+
+    deser_fut.get();
+    assert((df.is_equal<double, int, std::string>(df2)));
+}
+
+ +
C++ DataFrame + + + + + + diff --git a/docs/HTML/write.html b/docs/HTML/write.html index 1b0b1a5d..fc5936a6 100644 --- a/docs/HTML/write.html +++ b/docs/HTML/write.html @@ -220,47 +220,6 @@ - - -

-template<typename ... Ts>
-std::string
-to_string(std::streamsize precision = 12) const; 
-        
- - - This is a convenient function (simple implementation) to convert a DataFrame into a string that could be restored later by calling from_string(). It utilizes the write() member function of DataFrame.
- These functions could be used to transmit a DataFrame from one place to another or store a DataFrame in databases, caches, ...

- - I have been asked why I implemented to_string instead of/before doing "to binary format"
- Implementing a binary format as a form of serialization is a legit ask and I will add that option when I find time to implement it. But implementing a binary format is more involved. And binary format is not always more efficient than string format. Two issues stand out
-
    -
  1. Consider Options market data. Options' prices and sizes are usually smaller numbers. For example, consider the number 0.5. In string format that is 3 bytes ".5|". In binary format it is always 8 bytes. So, if you have a dataset with millions/billions of this kind of numbers, it makes a significant difference
  2. -
  3. In binary format you must deal with big-endian vs. little-endian. It is a pain in the neck and affects efficiency
  4. -
- - - - Ts: The list of types for all columns. A type should be specified only once
- precision: Specifies the precision for floating point numbers
- - - - - -

-template<typename ... Ts>
-std::fututre<std::string>
-to_string_async(std::streamsize precision = 12) const; 
-        
- - - Same as to_string() above, but executed asynchronously - - - - -
static void test_write_json()  {
@@ -336,45 +295,6 @@
 
-
-// -----------------------------------------------------------------------------
-
-static void test_to_from_string()  {
-
-    std::cout << "\nTesting to_from_string() ..." << std::endl;
-
-    std::vector<unsigned long>  idx =
-        { 123450, 123451, 123452, 123453, 123454, 123455, 123456, 123457, 123458, 123459, 123460, 123461, 123462, 123466 };
-    std::vector<double> d1 = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 };
-    std::vector<double> d2 = { 8, 9, 10, 11, 12, 13, 14, 20, 22, 23, 30, 31, 32, 1.89 };
-    std::vector<double> d3 = { 15, 16, 17, 18, 19, 20, 21, 0.34, 1.56, 0.34, 2.3, 0.1, 0.89, 0.45 };
-    std::vector<int>    i1 = { 22, 23, 24, 25, 99, 100, 101, 3, 2 };
-    std::vector<std::string>    strvec =
-        { "zz", "bb", "cc", "ww", "ee", "ff", "gg", "hh", "ii", "jj", "kk", "ll", "mm", "nn" };
-    MyDataFrame         df;
-
-    df.load_data(std::move(idx),
-                 std::make_pair("col_1", d1),
-                 std::make_pair("col_2", d2),
-                 std::make_pair("col_3", d3),
-                 std::make_pair("col_4", i1),
-                 std::make_pair("str_col", strvec));
-
-    std::future<std::string>    f = df.to_string_async<double, int, std::string>();
-    const std::string           str_dump = f.get();
-
-    // std::cout << str_dump << std::endl;
-
-    MyDataFrame df2;
-
-    df2.from_string(str_dump.c_str());
-    // std::cout << '\n' << std::endl;
-    // df2.write<std::ostream, double, int, std::string>(std::cout);
-    assert((df.is_equal<double, int, std::string>(df2)));
-}
-
- -
C++ DataFrame diff --git a/include/DataFrame/DataFrame.h b/include/DataFrame/DataFrame.h index 550aa5e2..46e4054f 100644 --- a/include/DataFrame/DataFrame.h +++ b/include/DataFrame/DataFrame.h @@ -4163,7 +4163,7 @@ class DataFrame : public ThreadGranularity { // or '123' and so on. These set of numbers will produce a smaller // buffer size in string form compared with binary form, especially // if there are millions of them. But generally, in most cases - // binary form is more efficient. + // binary form is more efficient. // // Ts: // List all the types of all data columns. A type should be specified in From ac3df3b4f90fa4bb8b3a2869eb321a03bc2a313f Mon Sep 17 00:00:00 2001 From: Hossein Moein Date: Wed, 29 May 2024 11:01:54 -0400 Subject: [PATCH 4/4] Adjusted hello world with serialize() --- examples/hello_world.cc | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/examples/hello_world.cc b/examples/hello_world.cc index 232ee481..a7c0ddf6 100644 --- a/examples/hello_world.cc +++ b/examples/hello_world.cc @@ -138,17 +138,13 @@ int main(int, char *[]) { ul_df2.write(std::cout, io_format::csv2); ibm_df.write("/tmp/test.json", io_format::json); - // You can convert a DataFrame to a string and from a string back into a DataFrame. This could be used to transmit a - // DataFrame from one place to another or store a DataFrame in databases, caches, ... + // You can serialize and deserialize the DataFrame both in string and binary formats. + // This could be used to transmit a DataFrame from one node to another or store a DataFrame in databases, caches, ... // - const std::string ibm_df_as_str = ibm_df.to_string(); + const std::string ibm_df_serialized = ibm_df.serialize(); StrDataFrame ibm_df_2; - // Since we convert from native type to string and back, if you have floating point numbers with long precisions, you may - // run into precision mismatches. to_string() has a precision parameter you can adjust. 
The default is 12 which is a - // relatively high precision. - // - ibm_df_2.from_string(ibm_df_as_str.c_str()); + ibm_df_2.deserialize(ibm_df_serialized); // std::cout << ibm_df_as_str << std::endl; // Large output using ul_idx_t = ULDataFrame::IndexType; // This is just unsigned long.