diff --git a/README.md b/README.md
index 182a8bd5..75b199ae 100644
--- a/README.md
+++ b/README.md
@@ -50,7 +50,7 @@ I have followed a few principles in this library:
5. [Avoid copying data as much as possible](https://htmlpreview.github.io/?https://github.com/hosseinmoein/DataFrame/blob/master/docs/HTML/copying_data.html)
6. [Use multi-threading but only when it makes sense](https://htmlpreview.github.io/?https://github.com/hosseinmoein/DataFrame/blob/master/docs/HTML/multithreading.html)
7. [Do not attempt to protect the user against _garbage in_, _garbage out_](https://htmlpreview.github.io/?https://github.com/hosseinmoein/DataFrame/blob/master/docs/HTML/garbage_in_garbage_out.html)
-8. DataFrame library is self-contained, meaning DataFrame only depends on _C++ language_ and its _standard library_
+8. Keep DataFrame library self-contained, meaning DataFrame must only depend on _C++ language_ and its _standard library_
---
diff --git a/docs/HTML/DataFrame.html b/docs/HTML/DataFrame.html
index 4002d67e..272ecee5 100644
--- a/docs/HTML/DataFrame.html
+++ b/docs/HTML/DataFrame.html
@@ -216,6 +216,10 @@
API Reference with code samples
describe() |
+
+ deserialize() deserialize_async() |
+
+
drop_missing() |
@@ -233,7 +237,7 @@ API Reference with code samples
- from_string() from_string_async() |
+ from_string() from_string_async() |
@@ -409,13 +413,17 @@ API Reference with code samples
- shape() |
+ serialize() serialize_async() |
static set_lock() |
+
+ shape() |
+
+
shapeless() |
@@ -445,7 +453,7 @@ API Reference with code samples
- to_string() to_string_async() |
+ to_string() to_string_async() |
diff --git a/docs/HTML/from_string.html b/docs/HTML/from_string.html
new file mode 100644
index 00000000..b35dff89
--- /dev/null
+++ b/docs/HTML/from_string.html
@@ -0,0 +1,184 @@
+
+
+
+
+
+
+
+
+
+
+
+
+ Signature | Description | Parameters |
+
+
+
+
+
+bool
+from_string(const char *data_frame);
+
+ |
+
+ This is a convenient function (simple implementation) to restore a DataFrame from a string that was previously generated by calling to_string(). It utilizes the read() member function of DataFrame. These functions could be used to transmit a DataFrame from one place to another or store a DataFrame in databases, caches, ...
+
+ NOTE: The choice between to_string() and serialize() depends on the dataset. Some datasets (e.g. US Options market data) mostly contain small floating-point/integer numbers such as '.5', '.75' or '123' and so on. Such numbers will produce a smaller buffer size in string form compared with binary form, especially if there are billions of them. But generally, in most cases binary form is more efficient.
+ |
+
+ data_frame: A null terminated string that was generated by calling to_string(). It must contain a complete DataFrame
+ |
+
+
+
+
+
+std::future<bool>
+from_string_async(const char *data_frame);
+
+ |
+
+ Same as from_string() above, but executed asynchronously
+ |
+
+ |
+
+
+
+
+
+bool
+deserialize(const std::string &data_frame);
+
+ |
+
+ This is a convenient function (conceptually similar to from_string()) to restore a DataFrame from a binary buffer that was previously generated by calling serialize(). It utilizes the read() member function of DataFrame. These functions could be used to transmit a DataFrame from one place to another or store a DataFrame in databases, caches, ...
+
+ NOTE: The choice between to_string() and serialize() depends on the dataset. Some datasets (e.g. US Options market data) mostly contain small floating-point/integer numbers such as '.5', '.75' or '123' and so on. Such numbers will produce a smaller buffer size in string form compared with binary form, especially if there are billions of them. But generally, in most cases binary form is more efficient.
+ |
+
+ data_frame: A std::string that was generated by calling serialize(). It must contain a complete DataFrame in binary format
+ |
+
+
+
+
+
+std::future<bool>
+deserialize_async(const std::string &data_frame);
+
+ |
+
+ Same as deserialize() above, but executed asynchronously
+ |
+
+ |
+
+
+
+
+static void test_to_from_string() {
+ // Round-trips a DataFrame through to_string_async()/from_string() and checks a view over it dumps the same text.
+ std::cout << "\nTesting to_from_string() ..." << std::endl;
+
+ StlVecType<unsigned long> idx = { 123450, 123451, 123452, 123453, 123454, 123455, 123456, 123457, 123458, 123459, 123460, 123461, 123462, 123466 };
+ StlVecType<double> d1 = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 };
+ StlVecType<double> d2 = { 8, 9, 10, 11, 12, 13, 14, 20, 22, 23, 30, 31, 32, 1.89 };
+ StlVecType<double> d3 = { 15, 16, 17, 18, 19, 20, 21, 0.34, 1.56, 0.34, 2.3, 0.1, 0.89, 0.45 };
+ StlVecType<int> i1 = { 22, 23, 24, 25, 99, 100, 101, 3, 2 };  // 9 values; every other column has 14
+ StlVecType<std::string> strvec = { "zz", "bb", "cc", "ww", "ee", "ff", "gg", "hh", "ii", "jj", "kk", "ll", "mm", "nn" };
+ MyDataFrame df;
+
+ df.load_data(std::move(idx),
+ std::make_pair("col_1", d1),
+ std::make_pair("col_2", d2),
+ std::make_pair("col_3", d3),
+ std::make_pair("col_4", i1),
+ std::make_pair("str_col", strvec));
+ // View over the same five columns; its string dump is compared with the DataFrame's own dump below.
+ auto vw = df.get_view<double, int, std::string>({ "col_1", "col_2", "col_3", "col_4", "str_col" });
+ // The asynchronous variant returns a future; get() waits for the resulting string.
+ std::future<std::string> f = df.to_string_async<double, int, std::string>();
+ const std::string str_dump = f.get();
+ const std::string str_dump_from_vw = vw.to_string<double, int, std::string>();
+
+ MyDataFrame df2;
+ // Rebuild a second DataFrame from the dumped string, then compare it with the original.
+ df2.from_string(str_dump.c_str());
+ assert((df.is_equal<double, int, std::string>(df2)));
+ assert(str_dump == str_dump_from_vw);
+}
+
+// ----------------------------------------------------------------------------
+
+static void test_serialize() {
+ // Round-trips a DataFrame through serialize_async()/deserialize_async() and checks the restored copy is equal.
+ std::cout << "\nTesting test_serialize() ..." << std::endl;
+
+ StlVecType<unsigned long> idx = { 123450, 123451, 123452, 123453, 123454, 123455, 123456, 123457, 123458, 123459, 123460, 123461, 123462, 123466 };
+ StlVecType<double> d1 = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 };
+ StlVecType<double> d2 = { 8, 9, 10, 11, 12, 13, 14, 20, 22, 23, 30, 31, 32, 1.89 };
+ StlVecType<double> d3 = { 15, 16, 17, 18, 19, 20, 21, 0.34, 1.56, 0.34, 2.3, 0.1, 0.89, 0.45 };
+ StlVecType<int> i1 = { 22, 23, 24, 25, 99, 100, 101, 3, 2 };  // 9 values; every other column has 14
+ StlVecType<std::string> strvec = { "zz", "bb", "cc", "ww", "ee", "ff", "gg", "hh", "ii", "jj", "kk", "ll", "mm", "nn" };
+ MyDataFrame df;
+
+ df.load_data(std::move(idx),
+ std::make_pair("col_1", d1),
+ std::make_pair("col_2", d2),
+ std::make_pair("col_3", d3),
+ std::make_pair("col_4", i1),
+ std::make_pair("str_col", strvec));
+ // Serialize asynchronously; get() waits for the binary buffer, which is carried in a std::string.
+ std::future<std::string> ser_fut = df.serialize_async<double, int, std::string>();
+ const std::string ser = ser_fut.get();
+
+ MyDataFrame df2;
+ // Restore into a fresh DataFrame; ser stays in scope until the future has completed.
+ std::future<bool> deser_fut = df2.deserialize_async(ser);
+
+ deser_fut.get();
+ assert((df.is_equal<double, int, std::string>(df2)));
+}
+
+
+
+
+
+
+
+
diff --git a/docs/HTML/read.html b/docs/HTML/read.html
index 1f6f95c7..fbc1a6f4 100644
--- a/docs/HTML/read.html
+++ b/docs/HTML/read.html
@@ -221,44 +221,6 @@
-
-
-
-bool
-from_string(const char *data_frame);
-
- |
-
- This is a convenient function (simple implementation) to restore a DataFrame from a string that was previously generated by calling to_string(). It utilizes the read() member function of DataFrame.
- These functions could be used to transmit a DataFrame from one place to another or store a DataFrame in databases, caches, ...
-
- I have been asked why I implemented from_string instead of/before doing "from binary format"
- Implementing a binary format as a form of serialization is a legit ask and I will add that option when I find time to implement it. But implementing a binary format is more involved. And binary format is not always more efficient than string format. Two issues stand out
-
- - Consider Options market data. Options' prices and sizes are usually smaller numbers. For example, consider the number 0.5. In string format that is 3 bytes ".5|". In binary format it is always 8 bytes. So, if you have a dataset with millions/billions of this kind of numbers, it makes a significant difference
- - In binary format you must deal with big-endian vs. little-endian. It is a pain in the neck and affects efficiency
-
-
- |
-
- data_frame: A null terminated string that was generated by calling to_string(). It must contain a complete DataFrame
- |
-
-
-
-
-
-std::future<bool>
-from_string_async(const char *data_frame);
-
- |
-
- Same as from_string() above, but executed asynchronously
- |
-
- |
-
-
@@ -368,45 +330,7 @@
-
-// -----------------------------------------------------------------------------
-
-static void test_to_from_string() {
-
- std::cout << "\nTesting to_from_string() ..." << std::endl;
-
- std::vector<unsigned long> idx =
- { 123450, 123451, 123452, 123453, 123454, 123455, 123456, 123457, 123458, 123459, 123460, 123461, 123462, 123466 };
- std::vector<double> d1 = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 };
- std::vector<double> d2 = { 8, 9, 10, 11, 12, 13, 14, 20, 22, 23, 30, 31, 32, 1.89 };
- std::vector<double> d3 = { 15, 16, 17, 18, 19, 20, 21, 0.34, 1.56, 0.34, 2.3, 0.1, 0.89, 0.45 };
- std::vector<int> i1 = { 22, 23, 24, 25, 99, 100, 101, 3, 2 };
- std::vector<std::string> strvec =
- { "zz", "bb", "cc", "ww", "ee", "ff", "gg", "hh", "ii", "jj", "kk", "ll", "mm", "nn" };
- MyDataFrame df;
-
- df.load_data(std::move(idx),
- std::make_pair("col_1", d1),
- std::make_pair("col_2", d2),
- std::make_pair("col_3", d3),
- std::make_pair("col_4", i1),
- std::make_pair("str_col", strvec));
-
- std::future<std::string> f = df.to_string_async<double, int, std::string>();
- const std::string str_dump = f.get();
-
- // std::cout << str_dump << std::endl;
-
- MyDataFrame df2;
-
- df2.from_string(str_dump.c_str());
- // std::cout << '\n' << std::endl;
- // df2.write<std::ostream, double, int, std::string>(std::cout);
- assert((df.is_equal<double, int, std::string>(df2)));
-}
-
-
-
+
// -----------------------------------------------------------------------------
static void test_reading_in_chunks() {
diff --git a/docs/HTML/to_string.html b/docs/HTML/to_string.html
new file mode 100644
index 00000000..58099c4c
--- /dev/null
+++ b/docs/HTML/to_string.html
@@ -0,0 +1,192 @@
+
+
+
+
+
+
+
+
+
+
+
+
+ Signature | Description | Parameters |
+
+
+
+
+
+template<typename ... Ts>
+std::string
+to_string(std::streamsize precision = 12) const;
+
+ |
+
+ This is a convenient function (simple implementation) to convert a DataFrame into a string that could be restored later by calling from_string(). It utilizes the write() member function of DataFrame.
+ These functions could be used to transmit a DataFrame from one place to another or store a DataFrame in databases, caches, ...
+
+ NOTE: The choice between to_string() and serialize() depends on the dataset. Some datasets (e.g. US Options market data) mostly contain small floating-point/integer numbers such as '.5', '.75' or '123' and so on. Such numbers will produce a smaller buffer size in string form compared with binary form, especially if there are billions of them. But generally, in most cases binary form is more efficient.
+ |
+
+ Ts: The list of types for all columns. A type should be specified only once
+ precision: Specifies the precision for floating point numbers
+ |
+
+
+
+
+
+template<typename ... Ts>
+std::future<std::string>
+to_string_async(std::streamsize precision = 12) const;
+
+ |
+
+ Same as to_string() above, but executed asynchronously
+ |
+
+ |
+
+
+
+
+
+template<typename ... Ts>
+std::string
+serialize() const;
+
+ |
+
+ This is similar to to_string() to serialize a DataFrame into a binary buffer that could be restored later by calling deserialize(). It utilizes the write() member function of DataFrame. These functions could be used to transmit a DataFrame from one place to another or store a DataFrame in databases, caches, ...
+
+ NOTE: Although this returns a std::string, the string contains binary data including potentially many null chars. The best way to read the string is by using .data() and .size() methods on std::string
+ NOTE: The choice between to_string() and serialize() depends on the dataset. Some datasets (e.g. US Options market data) mostly contain small floating-point/integer numbers such as '.5', '.75' or '123' and so on. Such numbers will produce a smaller buffer size in string form compared with binary form, especially if there are billions of them. But generally, in most cases binary form is more efficient.
+ |
+
+ Ts: The list of types for all columns. A type should be specified only once
+ |
+
+
+
+
+
+template<typename ... Ts>
+std::future<std::string>
+serialize_async() const;
+
+ |
+
+ Same as serialize() above, but executed asynchronously
+ |
+
+ |
+
+
+
+
+static void test_to_from_string() {
+ // Round-trips a DataFrame through to_string_async()/from_string() and checks a view over it dumps the same text.
+ std::cout << "\nTesting to_from_string() ..." << std::endl;
+
+ StlVecType<unsigned long> idx = { 123450, 123451, 123452, 123453, 123454, 123455, 123456, 123457, 123458, 123459, 123460, 123461, 123462, 123466 };
+ StlVecType<double> d1 = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 };
+ StlVecType<double> d2 = { 8, 9, 10, 11, 12, 13, 14, 20, 22, 23, 30, 31, 32, 1.89 };
+ StlVecType<double> d3 = { 15, 16, 17, 18, 19, 20, 21, 0.34, 1.56, 0.34, 2.3, 0.1, 0.89, 0.45 };
+ StlVecType<int> i1 = { 22, 23, 24, 25, 99, 100, 101, 3, 2 };  // 9 values; every other column has 14
+ StlVecType<std::string> strvec = { "zz", "bb", "cc", "ww", "ee", "ff", "gg", "hh", "ii", "jj", "kk", "ll", "mm", "nn" };
+ MyDataFrame df;
+
+ df.load_data(std::move(idx),
+ std::make_pair("col_1", d1),
+ std::make_pair("col_2", d2),
+ std::make_pair("col_3", d3),
+ std::make_pair("col_4", i1),
+ std::make_pair("str_col", strvec));
+ // View over the same five columns; its string dump is compared with the DataFrame's own dump below.
+ auto vw = df.get_view<double, int, std::string>({ "col_1", "col_2", "col_3", "col_4", "str_col" });
+ // The asynchronous variant returns a future; get() waits for the resulting string.
+ std::future<std::string> f = df.to_string_async<double, int, std::string>();
+ const std::string str_dump = f.get();
+ const std::string str_dump_from_vw = vw.to_string<double, int, std::string>();
+
+ MyDataFrame df2;
+ // Rebuild a second DataFrame from the dumped string, then compare it with the original.
+ df2.from_string(str_dump.c_str());
+ assert((df.is_equal<double, int, std::string>(df2)));
+ assert(str_dump == str_dump_from_vw);
+}
+
+// ----------------------------------------------------------------------------
+
+static void test_serialize() {
+ // Round-trips a DataFrame through serialize_async()/deserialize_async() and checks the restored copy is equal.
+ std::cout << "\nTesting test_serialize() ..." << std::endl;
+
+ StlVecType<unsigned long> idx = { 123450, 123451, 123452, 123453, 123454, 123455, 123456, 123457, 123458, 123459, 123460, 123461, 123462, 123466 };
+ StlVecType<double> d1 = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 };
+ StlVecType<double> d2 = { 8, 9, 10, 11, 12, 13, 14, 20, 22, 23, 30, 31, 32, 1.89 };
+ StlVecType<double> d3 = { 15, 16, 17, 18, 19, 20, 21, 0.34, 1.56, 0.34, 2.3, 0.1, 0.89, 0.45 };
+ StlVecType<int> i1 = { 22, 23, 24, 25, 99, 100, 101, 3, 2 };  // 9 values; every other column has 14
+ StlVecType<std::string> strvec = { "zz", "bb", "cc", "ww", "ee", "ff", "gg", "hh", "ii", "jj", "kk", "ll", "mm", "nn" };
+ MyDataFrame df;
+
+ df.load_data(std::move(idx),
+ std::make_pair("col_1", d1),
+ std::make_pair("col_2", d2),
+ std::make_pair("col_3", d3),
+ std::make_pair("col_4", i1),
+ std::make_pair("str_col", strvec));
+ // Serialize asynchronously; get() waits for the binary buffer, which is carried in a std::string.
+ std::future<std::string> ser_fut = df.serialize_async<double, int, std::string>();
+ const std::string ser = ser_fut.get();
+
+ MyDataFrame df2;
+ // Restore into a fresh DataFrame; ser stays in scope until the future has completed.
+ std::future<bool> deser_fut = df2.deserialize_async(ser);
+
+ deser_fut.get();
+ assert((df.is_equal<double, int, std::string>(df2)));
+}
+
+
+
+
+
+
+
+
+
diff --git a/docs/HTML/write.html b/docs/HTML/write.html
index 1b0b1a5d..fc5936a6 100644
--- a/docs/HTML/write.html
+++ b/docs/HTML/write.html
@@ -220,47 +220,6 @@
-
-
-
-template<typename ... Ts>
-std::string
-to_string(std::streamsize precision = 12) const;
-
- |
-
- This is a convenient function (simple implementation) to convert a DataFrame into a string that could be restored later by calling from_string(). It utilizes the write() member function of DataFrame.
- These functions could be used to transmit a DataFrame from one place to another or store a DataFrame in databases, caches, ...
-
- I have been asked why I implemented to_string instead of/before doing "to binary format"
- Implementing a binary format as a form of serialization is a legit ask and I will add that option when I find time to implement it. But implementing a binary format is more involved. And binary format is not always more efficient than string format. Two issues stand out
-
- - Consider Options market data. Options' prices and sizes are usually smaller numbers. For example, consider the number 0.5. In string format that is 3 bytes ".5|". In binary format it is always 8 bytes. So, if you have a dataset with millions/billions of this kind of numbers, it makes a significant difference
- - In binary format you must deal with big-endian vs. little-endian. It is a pain in the neck and affects efficiency
-
-
- |
-
- Ts: The list of types for all columns. A type should be specified only once
- precision: Specifies the precision for floating point numbers
- |
-
-
-
-
-
-template<typename ... Ts>
-std::fututre<std::string>
-to_string_async(std::streamsize precision = 12) const;
-
- |
-
- Same as to_string() above, but executed asynchronously
- |
-
- |
-
-
static void test_write_json() {
@@ -336,45 +295,6 @@
-
-// -----------------------------------------------------------------------------
-
-static void test_to_from_string() {
-
- std::cout << "\nTesting to_from_string() ..." << std::endl;
-
- std::vector<unsigned long> idx =
- { 123450, 123451, 123452, 123453, 123454, 123455, 123456, 123457, 123458, 123459, 123460, 123461, 123462, 123466 };
- std::vector<double> d1 = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 };
- std::vector<double> d2 = { 8, 9, 10, 11, 12, 13, 14, 20, 22, 23, 30, 31, 32, 1.89 };
- std::vector<double> d3 = { 15, 16, 17, 18, 19, 20, 21, 0.34, 1.56, 0.34, 2.3, 0.1, 0.89, 0.45 };
- std::vector<int> i1 = { 22, 23, 24, 25, 99, 100, 101, 3, 2 };
- std::vector<std::string> strvec =
- { "zz", "bb", "cc", "ww", "ee", "ff", "gg", "hh", "ii", "jj", "kk", "ll", "mm", "nn" };
- MyDataFrame df;
-
- df.load_data(std::move(idx),
- std::make_pair("col_1", d1),
- std::make_pair("col_2", d2),
- std::make_pair("col_3", d3),
- std::make_pair("col_4", i1),
- std::make_pair("str_col", strvec));
-
- std::future<std::string> f = df.to_string_async<double, int, std::string>();
- const std::string str_dump = f.get();
-
- // std::cout << str_dump << std::endl;
-
- MyDataFrame df2;
-
- df2.from_string(str_dump.c_str());
- // std::cout << '\n' << std::endl;
- // df2.write<std::ostream, double, int, std::string>(std::cout);
- assert((df.is_equal<double, int, std::string>(df2)));
-}
-
-
-
diff --git a/examples/hello_world.cc b/examples/hello_world.cc
index 889ffc36..a7c0ddf6 100644
--- a/examples/hello_world.cc
+++ b/examples/hello_world.cc
@@ -109,7 +109,7 @@ int main(int, char *[]) {
StrDataFrame ibm_df;
// Also, you can load data into a DataFrame from a file, supporting a few different formats. If the file cannot be found,
- // an exception will be thrown. If the DataFrame root directory is your current directory when running this, it should
+ // an exception will be thrown. If the DataFrame data directory is your current directory when running this, it should
// work fine.
//
ibm_df.read("IBM.csv", io_format::csv2);
@@ -138,17 +138,13 @@ int main(int, char *[]) {
ul_df2.write(std::cout, io_format::csv2);
ibm_df.write("/tmp/test.json", io_format::json);
- // You can convert a DataFrame to a string and from a string back into a DataFrame. This could be used to transmit a
- // DataFrame from one place to another or store a DataFrame in databases, caches, ...
+ // You can serialize and deserialize the DataFrame both in string and binary formats.
+ // This could be used to transmit a DataFrame from one node to another or store a DataFrame in databases, caches, ...
//
- const std::string ibm_df_as_str = ibm_df.to_string();
+ const std::string ibm_df_serialized = ibm_df.serialize();
StrDataFrame ibm_df_2;
- // Since we convert from native type to string and back, if you have floating point numbers with long precisions, you may
- // run into precision mismatches. to_string() has a precision parameter you can adjust. The default is 12 which is a
- // relatively high precision.
- //
- ibm_df_2.from_string(ibm_df_as_str.c_str());
+ ibm_df_2.deserialize(ibm_df_serialized);
// std::cout << ibm_df_as_str << std::endl; // Large output
using ul_idx_t = ULDataFrame::IndexType; // This is just unsigned long.
diff --git a/include/DataFrame/DataFrame.h b/include/DataFrame/DataFrame.h
index 1c428b6d..46e4054f 100644
--- a/include/DataFrame/DataFrame.h
+++ b/include/DataFrame/DataFrame.h
@@ -127,7 +127,7 @@ class DataFrame : public ThreadGranularity {
// Any version of DataFrame should be assignable to any other version
//
template
- DataFrame &assign(const OTHER &rhs);
+ DataFrame &assign(const OTHER &rhs);
public: // Load/append/remove interfaces
@@ -4157,11 +4157,17 @@ class DataFrame : public ThreadGranularity {
// These functions could be used to transmit a DataFrame from one place to
// another or store a DataFrame in databases, caches, …
//
+ // NOTE: The choice between to_string() and serialize() depends on the
+ // dataset. Some datasets (e.g. US Options market data) mostly
+ // contain small floating-point/integer numbers such as '.5', '.75'
+ // or '123' and so on. Such numbers will produce a smaller
+ // buffer size in string form compared with binary form, especially
+ // if there are millions of them. But generally, in most cases
+ // binary form is more efficient.
+ //
// Ts:
// List all the types of all data columns. A type should be specified in
// the list only once.
- // iof:
- // Specifies the I/O format. The default is CSV
// precision:
// Specifies the precision for floating point numbers
//
@@ -4175,6 +4181,39 @@ class DataFrame : public ThreadGranularity {
[[nodiscard]] std::future
to_string_async(std::streamsize precision = 12) const;
+ // This is similar to to_string() to serialize a DataFrame into a binary
+ // buffer that could be restored later by calling deserialize(). It
+ // utilizes the write() member function of DataFrame.
+ // These functions could be used to transmit a DataFrame from one place to
+ // another or store a DataFrame in databases, caches, …
+ //
+ // NOTE: Although this returns a std::string, the string contains binary
+ // data including potentially many null chars. The best way to read
+ // the string is by using .data() and .size() methods on std::string
+ //
+ // NOTE: The choice between to_string() and serialize() depends on the
+ // dataset. Some datasets (e.g. US Options market data) mostly
+ // contain small floating-point/integer numbers such as '.5', '.75'
+ // or '123' and so on. Such numbers will produce a smaller
+ // buffer size in string form compared with binary form, especially
+ // if there are millions of them. But generally, in most cases
+ // binary form is more efficient.
+ //
+ // Ts:
+ // List all the types of all data columns. A type should be specified in
+ // the list only once.
+ //
+ template
+ [[nodiscard]] std::string
+ serialize() const;
+
+ // Same as serialize() above, but executed asynchronously
+ //
+ template
+ [[nodiscard]] std::future
+ serialize_async() const;
+
+
// It inputs the contents of a text file into itself (i.e. DataFrame).
// Currently two formats (i.e. csv, json) are supported specified by
// the iof parameter.
@@ -4253,19 +4292,53 @@ class DataFrame : public ThreadGranularity {
// These functions could be used to transmit a DataFrame from one place to
// another or store a DataFrame in databases, caches, …
//
+ // NOTE: The choice between to_string() and serialize() depends on the
+ // dataset. Some datasets (e.g. US Options market data) mostly
+ // contain small floating-point/integer numbers such as '.5', '.75'
+ // or '123' and so on. Such numbers will produce a smaller
+ // buffer size in string form compared with binary form, especially
+ // if there are millions of them. But generally, in most cases
+ // binary form is more efficient.
+ //
// data_frame:
// A null terminated string that was generated by calling to_string().
// It must contain a complete DataFrame
- // iof:
- // Specifies the I/O format. The default is CSV
//
bool
from_string(const char *data_frame);
// Same as from_string() above, but executed asynchronously
+ //
[[nodiscard]] std::future
from_string_async(const char *data_frame);
+ // This is a convenient function (conceptually similar to from_string())
+ // to restore a DataFrame from a binary buffer that was previously
+ // generated by calling serialize(). It utilizes the read() member function
+ // of DataFrame.
+ // These functions could be used to transmit a DataFrame from one place to
+ // another or store a DataFrame in databases, caches, …
+ //
+ // NOTE: The choice between to_string() and serialize() depends on the
+ // dataset. Some datasets (e.g. US Options market data) mostly
+ // contain small floating-point/integer numbers such as '.5', '.75'
+ // or '123' and so on. Such numbers will produce a smaller
+ // buffer size in string form compared with binary form, especially
+ // if there are millions of them. But generally, in most cases
+ // binary form is more efficient.
+ //
+ // data_frame:
+ // A std::string that was generated by calling serialize().
+ // It must contain a complete DataFrame in binary format
+ //
+ bool
+ deserialize(const std::string &data_frame);
+
+ // Same as deserialize() above, but executed asynchronously
+ //
+ [[nodiscard]] std::future
+ deserialize_async(const std::string &data_frame);
+
private:
// Internally used containers aligned with DataFrame alignment
diff --git a/include/DataFrame/Internals/DataFrame_read.tcc b/include/DataFrame/Internals/DataFrame_read.tcc
index 1220432e..f121672c 100644
--- a/include/DataFrame/Internals/DataFrame_read.tcc
+++ b/include/DataFrame/Internals/DataFrame_read.tcc
@@ -1583,6 +1583,21 @@ DataFrame::from_string (const char *data_frame) {
// ----------------------------------------------------------------------------
+template
+bool
+DataFrame::deserialize (const std::string &data_frame) {
+ // Loads this DataFrame from the data_frame buffer by calling read() with io_format::binary.
+ static_assert(std::is_base_of, H>::value,
+ "Only a StdDataFrame can call deserialize()");
+ // Input-only string stream initialised from the buffer, so read() can consume it as a stream.
+ std::stringstream ss (data_frame, std::ios_base::in);
+ // Always returns true: nothing that read() may return is checked here.
+ read(ss, io_format::binary, false);
+ return (true);
+}
+
+// ----------------------------------------------------------------------------
+
template
std::future DataFrame::
read_async(const char *file_name,
@@ -1643,7 +1658,21 @@ DataFrame::from_string_async(const char *data_frame) {
return (thr_pool_.dispatch(true,
&DataFrame::from_string,
this,
- data_frame));
+ data_frame)
+ );
+}
+
+// ----------------------------------------------------------------------------
+
+template
+std::future
+DataFrame::deserialize_async(const std::string &data_frame) {
+ // Hands deserialize() to thr_pool_.dispatch() and returns whatever future dispatch() yields.
+ return (thr_pool_.dispatch(true,
+ &DataFrame::deserialize,
+ this,
+ std::forward(data_frame)  // NOTE(review): received by reference; presumably the caller must keep it alive until the future is ready -- confirm how dispatch() stores its arguments
+ ));
}
} // namespace hmdf
diff --git a/include/DataFrame/Internals/DataFrame_write.tcc b/include/DataFrame/Internals/DataFrame_write.tcc
index d337f311..31cf4969 100644
--- a/include/DataFrame/Internals/DataFrame_write.tcc
+++ b/include/DataFrame/Internals/DataFrame_write.tcc
@@ -74,6 +74,19 @@ DataFrame::to_string(std::streamsize precision) const {
// ----------------------------------------------------------------------------
+template
+template
+std::string
+DataFrame::serialize() const {
+ // Writes this DataFrame with io_format::binary into an in-memory stream and returns its contents.
+ std::stringstream ss (std::ios_base::out);
+ // The result is binary data carried in a std::string; callers should use .data()/.size() on it.
+ write(ss, io_format::binary);
+ return (ss.str());
+}
+
+// ----------------------------------------------------------------------------
+
template
template
bool DataFrame::
@@ -98,7 +111,8 @@ write(S &o,
else
start_row = std::max(long(0), end_row + max_recs);
- o.precision(precision);
+ if (iof != io_format::binary) o.precision(precision);
+
if (iof == io_format::json) {
o << "{\n";
if (! columns_only) [[likely]] {
@@ -298,6 +312,16 @@ to_string_async (std::streamsize precision) const {
precision));
}
+// ----------------------------------------------------------------------------
+
+template
+template
+std::future DataFrame::
+serialize_async () const {
+ // Hands serialize() to thr_pool_.dispatch() and returns whatever future dispatch() yields.
+ return (thr_pool_.dispatch(true, &DataFrame::serialize, this));
+}
+
} // namespace hmdf
// ----------------------------------------------------------------------------
diff --git a/include/DataFrame/Utils/Utils.h b/include/DataFrame/Utils/Utils.h
index 19bb977f..98218de8 100644
--- a/include/DataFrame/Utils/Utils.h
+++ b/include/DataFrame/Utils/Utils.h
@@ -347,7 +347,7 @@ shift_left(V &vec, std::size_t n) {
// ----------------------------------------------------------------------------
-template
+template
struct IOStreamOpti {
IOStreamOpti (STR &stream, const char *file_name, bool binary = false)
diff --git a/test/dataframe_tester_3.cc b/test/dataframe_tester_3.cc
index ff3f4719..0e984bb7 100644
--- a/test/dataframe_tester_3.cc
+++ b/test/dataframe_tester_3.cc
@@ -265,19 +265,54 @@ static void test_to_from_string() {
const std::string str_dump_from_vw =
vw.to_string();
- // std::cout << str_dump << std::endl;
-
MyDataFrame df2;
df2.from_string(str_dump.c_str());
- // std::cout << '\n' << std::endl;
- // df2.write(std::cout);
assert((df.is_equal(df2)));
assert(str_dump == str_dump_from_vw);
}
// ----------------------------------------------------------------------------
+static void test_serialize() {
+ // Round-trips a DataFrame through serialize_async()/deserialize_async() and checks the restored copy is equal.
+ std::cout << "\nTesting test_serialize() ..." << std::endl;
+
+ StlVecType idx =
+ { 123450, 123451, 123452, 123453, 123454, 123455, 123456,
+ 123457, 123458, 123459, 123460, 123461, 123462, 123466 };
+ StlVecType d1 = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 };
+ StlVecType d2 = { 8, 9, 10, 11, 12, 13, 14, 20, 22, 23,
+ 30, 31, 32, 1.89 };
+ StlVecType d3 = { 15, 16, 17, 18, 19, 20, 21,
+ 0.34, 1.56, 0.34, 2.3, 0.1, 0.89, 0.45 };
+ StlVecType i1 = { 22, 23, 24, 25, 99, 100, 101, 3, 2 };  // 9 values; every other column has 14
+ StlVecType strvec =
+ { "zz", "bb", "cc", "ww", "ee", "ff", "gg", "hh", "ii", "jj", "kk",
+ "ll", "mm", "nn" };
+ MyDataFrame df;
+
+ df.load_data(std::move(idx),
+ std::make_pair("col_1", d1),
+ std::make_pair("col_2", d2),
+ std::make_pair("col_3", d3),
+ std::make_pair("col_4", i1),
+ std::make_pair("str_col", strvec));
+ // Serialize asynchronously; get() waits for the binary buffer, which is carried in a std::string.
+ std::future ser_fut =
+ df.serialize_async();
+ const std::string ser = ser_fut.get();
+
+ MyDataFrame df2;
+ // Restore into a fresh DataFrame; ser stays in scope until the future has completed.
+ std::future deser_fut = df2.deserialize_async(ser);
+
+ deser_fut.get();
+ assert((df.is_equal(df2)));
+}
+
+// ----------------------------------------------------------------------------
+
static void test_CoppockCurveVisitor() {
std::cout << "\nTesting CoppockCurveVisitor{ } ..." << std::endl;
@@ -4064,6 +4099,7 @@ int main(int, char *[]) {
test_groupby_edge();
test_concat_view();
test_to_from_string();
+ test_serialize();
test_CoppockCurveVisitor();
test_BiasVisitor();
test_BalanceOfPowerVisitor();