Implemented serialize() and deserialize()

hosseinmoein · May 27, 2024 · 89459fb · 89459fb
1 parent 4c042f1
commit 89459fb
Show file tree

Hide file tree

Showing 6 changed files with 101 additions and 9 deletions.
diff --git a/README.md b/README.md
@@ -50,7 +50,7 @@ I have followed a few <B>principles in this library</B>:<BR>
 5. [Avoid copying data as much as possible](https://htmlpreview.github.io/?https://github.com/hosseinmoein/DataFrame/blob/master/docs/HTML/copying_data.html)
 6. [Use multi-threading but only when it makes sense](https://htmlpreview.github.io/?https://github.com/hosseinmoein/DataFrame/blob/master/docs/HTML/multithreading.html)
 7. [Do not attempt to protect the user against _garbage in_, _garbage out_](https://htmlpreview.github.io/?https://github.com/hosseinmoein/DataFrame/blob/master/docs/HTML/garbage_in_garbage_out.html)
-8. DataFrame library is self-contained, meaning DataFrame only depends on _C++ language_ and its _standard library_
+8. Keep the DataFrame library self-contained, meaning DataFrame must depend only on the _C++ language_ and its _standard library_
 
 ---
 

diff --git a/examples/hello_world.cc b/examples/hello_world.cc
@@ -109,7 +109,7 @@ int main(int, char *[])  {
     StrDataFrame    ibm_df;
 
     // Also, you can load data into a DataFrame from a file, supporting a few different formats. If the file cannot be found,
-    // an exception will be thrown. If the DataFrame root directory is your current directory when running this, it should
+    // an exception will be thrown. If the DataFrame data directory is your current directory when running this, it should
     // work fine.
     //
     ibm_df.read("IBM.csv", io_format::csv2);

diff --git a/include/DataFrame/DataFrame.h b/include/DataFrame/DataFrame.h
@@ -127,7 +127,7 @@ class   DataFrame : public ThreadGranularity {
     // Any version of DataFrame should be assignable to any other version
     //
     template<typename OTHER, typename ... Ts>
-	DataFrame &assign(const OTHER &rhs);
+    DataFrame &assign(const OTHER &rhs);
 
 public:  // Load/append/remove interfaces
 
@@ -4157,11 +4157,17 @@ class   DataFrame : public ThreadGranularity {
     // These functions could be used to transmit a DataFrame from one place to
     // another or store a DataFrame in databases, caches, …
     //
+    // NOTE: The choice between to_string() and serialize() depends on the
+    //       dataset. Some datasets (e.g. US Options market data) mostly
+    //       contain small floating-point/integer numbers such as '.5', '.75'
+    //       or '123' and so on. Such numbers will produce a smaller
+    //       buffer size in string form compared with binary form, especially
+    //       if there are millions of them. But generally, in most cases
+    //       binary form is more efficient.
+    //
     // Ts:
     //   List all the types of all data columns. A type should be specified in
     //   the list only once.
-    // iof:
-    //   Specifies the I/O format. The default is CSV
     // precision:
     //   Specifies the precision for floating point numbers
     //
@@ -4175,6 +4181,34 @@ class   DataFrame : public ThreadGranularity {
     [[nodiscard]] std::future<std::string>
     to_string_async(std::streamsize precision = 12) const;
 
+    // This is similar to to_string(), but it serializes a DataFrame into a
+    // binary buffer that could be restored later by calling deserialize().
+    // It utilizes the write() member function of DataFrame.
+    // These functions could be used to transmit a DataFrame from one place to
+    // another or store a DataFrame in databases, caches, …
+    //
+    // NOTE: Although this returns a std::string, the string contains binary
+    //       data including potentially many null chars. The best way to read
+    //       the string is by using .data() and .size() methods on std::string
+    //
+    // NOTE: The choice between to_string() and serialize() depends on the
+    //       dataset. Some datasets (e.g. US Options market data) mostly
+    //       contain small floating-point/integer numbers such as '.5', '.75'
+    //       or '123' and so on. Such numbers will produce a smaller
+    //       buffer size in string form compared with binary form, especially
+    //       if there are millions of them. But generally, in most cases
+    //       binary form is more efficient.
+    //
+    // Ts:
+    //   List all the types of all data columns. A type should be specified in
+    //   the list only once.
+    // NOTE: Unlike to_string(), serialize() takes no precision parameter,
+    //       because precision is not applied to the binary format
+    //
+    template<typename ... Ts>
+    [[nodiscard]] std::string
+    serialize() const;
+
     // It inputs the contents of a text file into itself (i.e. DataFrame).
     // Currently two formats (i.e. csv, json) are supported specified by
     // the iof parameter.
@@ -4253,19 +4287,48 @@ class   DataFrame : public ThreadGranularity {
     // These functions could be used to transmit a DataFrame from one place to
     // another or store a DataFrame in databases, caches, …
     //
+    // NOTE: The choice between to_string() and serialize() depends on the
+    //       dataset. Some datasets (e.g. US Options market data) mostly
+    //       contain small floating-point/integer numbers such as '.5', '.75'
+    //       or '123' and so on. Such numbers will produce a smaller
+    //       buffer size in string form compared with binary form, especially
+    //       if there are millions of them. But generally, in most cases
+    //       binary form is more efficient.
+    //
     // data_frame:
     //   A null terminated string that was generated by calling to_string().
     //   It must contain a complete DataFrame
-    // iof:
-    //   Specifies the I/O format. The default is CSV
     //
     bool
     from_string(const char *data_frame);
 
     // Same as from_string() above, but executed asynchronously
+    //
     [[nodiscard]] std::future<bool>
     from_string_async(const char *data_frame);
 
+    // This is a convenience function (conceptually similar to from_string())
+    // to restore a DataFrame from a binary buffer that was previously
+    // generated by calling serialize(). It utilizes the read() member function
+    // of DataFrame.
+    // These functions could be used to transmit a DataFrame from one place to
+    // another or store a DataFrame in databases, caches, …
+    //
+    // NOTE: The choice between to_string() and serialize() depends on the
+    //       dataset. Some datasets (e.g. US Options market data) mostly
+    //       contain small floating-point/integer numbers such as '.5', '.75'
+    //       or '123' and so on. Such numbers will produce a smaller
+    //       buffer size in string form compared with binary form, especially
+    //       if there are millions of them. But generally, in most cases
+    //       binary form is more efficient.
+    //
+    // data_frame:
+    //   A std::string that was generated by calling serialize().
+    //   It must contain a complete DataFrame in binary format
+    //
+    bool
+    deserialize(const std::string &data_frame);
+
 private:
 
     // Internally used containers aligned with DataFrame alignment

diff --git a/include/DataFrame/Internals/DataFrame_read.tcc b/include/DataFrame/Internals/DataFrame_read.tcc
@@ -1583,6 +1583,21 @@ DataFrame<I, H>::from_string (const char *data_frame)  {
 
 // ----------------------------------------------------------------------------
 
+// Restores this DataFrame from a binary buffer that was previously produced
+// by serialize(). The buffer is wrapped in an input-only string stream and
+// handed to read() with io_format::binary.
+//
+// data_frame:
+//   A std::string generated by calling serialize(). It is consumed as a
+//   std::string (not as a C-string), since binary data may hold null chars
+//
+// Returns the result reported by read(), so that a failure signalled by
+// read() is not masked by an unconditional true.
+//
+template<typename I, typename H>
+bool
+DataFrame<I, H>::deserialize (const std::string &data_frame)  {
+
+    static_assert(std::is_base_of<HeteroVector<align_value>, H>::value,
+                  "Only a StdDataFrame can call deserialize()");
+
+    // std::istringstream opens in std::ios_base::in mode by default
+    std::istringstream  in_buf (data_frame);
+
+    return (read<std::istream>(in_buf, io_format::binary, false));
+}
+
+// ----------------------------------------------------------------------------
+
 template<typename I, typename H>
 std::future<bool> DataFrame<I, H>::
 read_async(const char *file_name,

diff --git a/include/DataFrame/Internals/DataFrame_write.tcc b/include/DataFrame/Internals/DataFrame_write.tcc
@@ -74,6 +74,19 @@ DataFrame<I, H>::to_string(std::streamsize precision) const  {
 
 // ----------------------------------------------------------------------------
 
+// Writes this DataFrame in io_format::binary into an in-memory output
+// stream and returns the accumulated bytes as a std::string.
+//
+// Ts:
+//   The list of data column types; it is forwarded to write()
+//
+template<typename I, typename H>
+template<typename ... Ts>
+std::string
+DataFrame<I, H>::serialize() const  {
+
+    // std::ostringstream opens in std::ios_base::out mode by default
+    std::ostringstream  out_buf;
+
+    write<std::ostream, Ts ...>(out_buf, io_format::binary);
+    return (out_buf.str());
+}
+
+// ----------------------------------------------------------------------------
+
 template<typename I, typename H>
 template<typename S, typename ... Ts>
 bool DataFrame<I, H>::
@@ -98,7 +111,8 @@ write(S &o,
     else
         start_row = std::max(long(0), end_row + max_recs);
 
-    o.precision(precision);
+    if (iof != io_format::binary)  o.precision(precision);
+
     if (iof == io_format::json)  {
         o << "{\n";
         if (! columns_only) [[likely]]  {

diff --git a/include/DataFrame/Utils/Utils.h b/include/DataFrame/Utils/Utils.h
@@ -347,7 +347,7 @@ shift_left(V &vec, std::size_t n)  {
 
 // ----------------------------------------------------------------------------
 
-template<typename STR, std::size_t SIZ = 128 * 1024>
+template<typename STR, std::size_t SIZ = 64 * 1024>
 struct  IOStreamOpti  {
 
     IOStreamOpti (STR &stream, const char *file_name, bool binary = false)