diff --git a/README.md b/README.md
index 182a8bd5..75b199ae 100644
--- a/README.md
+++ b/README.md
@@ -50,7 +50,7 @@ I have followed a few principles in this library:
5. [Avoid copying data as much as possible](https://htmlpreview.github.io/?https://github.com/hosseinmoein/DataFrame/blob/master/docs/HTML/copying_data.html)
6. [Use multi-threading but only when it makes sense](https://htmlpreview.github.io/?https://github.com/hosseinmoein/DataFrame/blob/master/docs/HTML/multithreading.html)
7. [Do not attempt to protect the user against _garbage in_, _garbage out_](https://htmlpreview.github.io/?https://github.com/hosseinmoein/DataFrame/blob/master/docs/HTML/garbage_in_garbage_out.html)
-8. DataFrame library is self-contained, meaning DataFrame only depends on _C++ language_ and its _standard library_
+8. Keep the DataFrame library self-contained, meaning DataFrame must depend only on the _C++ language_ and its _standard library_
---
diff --git a/examples/hello_world.cc b/examples/hello_world.cc
index 889ffc36..232ee481 100644
--- a/examples/hello_world.cc
+++ b/examples/hello_world.cc
@@ -109,7 +109,7 @@ int main(int, char *[]) {
StrDataFrame ibm_df;
// Also, you can load data into a DataFrame from a file, supporting a few different formats. If the file cannot be found,
- // an exception will be thrown. If the DataFrame root directory is your current directory when running this, it should
+ // an exception will be thrown. If the DataFrame data directory is your current directory when running this, it should
// work fine.
//
ibm_df.read("IBM.csv", io_format::csv2);
diff --git a/include/DataFrame/DataFrame.h b/include/DataFrame/DataFrame.h
index 1c428b6d..c05be53b 100644
--- a/include/DataFrame/DataFrame.h
+++ b/include/DataFrame/DataFrame.h
@@ -127,7 +127,7 @@ class DataFrame : public ThreadGranularity {
// Any version of DataFrame should be assignable to any other version
//
template
- DataFrame &assign(const OTHER &rhs);
+ DataFrame &assign(const OTHER &rhs);
public: // Load/append/remove interfaces
@@ -4157,11 +4157,17 @@ class DataFrame : public ThreadGranularity {
// These functions could be used to transmit a DataFrame from one place to
// another or store a DataFrame in databases, caches, …
//
+ // NOTE: The choice between to_string() and serialize() depends on the
+ // dataset. Some datasets (e.g. US Options market data) mostly
+ // contain small floating-point/integer numbers such as '.5', '.75'
+ // or '123' and so on. Such a set of numbers will produce a smaller
+ // buffer size in string form compared with binary form, especially
+ // if there are millions of them. But generally, in most cases
+ // binary form is more efficient.
+ //
// Ts:
// List all the types of all data columns. A type should be specified in
// the list only once.
- // iof:
- // Specifies the I/O format. The default is CSV
// precision:
// Specifies the precision for floating point numbers
//
@@ -4175,6 +4181,34 @@ class DataFrame : public ThreadGranularity {
[[nodiscard]] std::future
to_string_async(std::streamsize precision = 12) const;
+ // This is similar to to_string(), but it serializes a DataFrame into a
+ // binary buffer that could be restored later by calling deserialize(). It
+ // utilizes the write() member function of DataFrame.
+ // These functions could be used to transmit a DataFrame from one place to
+ // another or store a DataFrame in databases, caches, …
+ //
+ // NOTE: Although this returns a std::string, the string contains binary
+ // data including potentially many null chars. The best way to read
+ // the string is by using .data() and .size() methods on std::string
+ //
+ // NOTE: The choice between to_string() and serialize() depends on the
+ // dataset. Some datasets (e.g. US Options market data) mostly
+ // contain small floating-point/integer numbers such as '.5', '.75'
+ // or '123' and so on. Such a set of numbers will produce a smaller
+ // buffer size in string form compared with binary form, especially
+ // if there are millions of them. But generally, in most cases
+ // binary form is more efficient.
+ //
+ // Ts:
+ // List all the types of all data columns. A type should be specified in
+ // the list only once.
+ // Unlike to_string(), serialize() takes no arguments; there is no
+ // precision to specify
+ //
+ template
+ [[nodiscard]] std::string
+ serialize() const;
+
// It inputs the contents of a text file into itself (i.e. DataFrame).
// Currently two formats (i.e. csv, json) are supported specified by
// the iof parameter.
@@ -4253,19 +4287,48 @@ class DataFrame : public ThreadGranularity {
// These functions could be used to transmit a DataFrame from one place to
// another or store a DataFrame in databases, caches, …
//
+ // NOTE: The choice between to_string() and serialize() depends on the
+ // dataset. Some datasets (e.g. US Options market data) mostly
+ // contain small floating-point/integer numbers such as '.5', '.75'
+ // or '123' and so on. Such a set of numbers will produce a smaller
+ // buffer size in string form compared with binary form, especially
+ // if there are millions of them. But generally, in most cases
+ // binary form is more efficient.
+ //
// data_frame:
// A null terminated string that was generated by calling to_string().
// It must contain a complete DataFrame
- // iof:
- // Specifies the I/O format. The default is CSV
//
bool
from_string(const char *data_frame);
// Same as from_string() above, but executed asynchronously
+ //
[[nodiscard]] std::future
from_string_async(const char *data_frame);
+ // This is a convenience function (conceptually similar to from_string())
+ // to restore a DataFrame from a binary buffer that was previously
+ // generated by calling serialize(). It utilizes the read() member function
+ // of DataFrame.
+ // These functions could be used to transmit a DataFrame from one place to
+ // another or store a DataFrame in databases, caches, …
+ //
+ // NOTE: The choice between to_string() and serialize() depends on the
+ // dataset. Some datasets (e.g. US Options market data) mostly
+ // contain small floating-point/integer numbers such as '.5', '.75'
+ // or '123' and so on. Such a set of numbers will produce a smaller
+ // buffer size in string form compared with binary form, especially
+ // if there are millions of them. But generally, in most cases
+ // binary form is more efficient.
+ //
+ // data_frame:
+ // A std::string that was generated by calling serialize().
+ // It must contain a complete DataFrame in binary format
+ //
+ bool
+ deserialize(const std::string &data_frame);
+
private:
// Internally used containers aligned with DataFrame alignment
diff --git a/include/DataFrame/Internals/DataFrame_read.tcc b/include/DataFrame/Internals/DataFrame_read.tcc
index 1220432e..2557c79a 100644
--- a/include/DataFrame/Internals/DataFrame_read.tcc
+++ b/include/DataFrame/Internals/DataFrame_read.tcc
@@ -1583,6 +1583,21 @@ DataFrame::from_string (const char *data_frame) {
// ----------------------------------------------------------------------------
+template
+bool
+DataFrame::deserialize (const std::string &data_frame) {
+
+ static_assert(std::is_base_of, H>::value,
+ "Only a StdDataFrame can call deserialize()");
+
+ std::stringstream ss (data_frame, std::ios_base::in);  // input-only stream
+
+ read(ss, io_format::binary, false);  // whatever read() yields is not checked
+ return (true);  // always true; NOTE(review): confirm read() reports failure by throwing
+}
+
+// ----------------------------------------------------------------------------
+
template
std::future DataFrame::
read_async(const char *file_name,
diff --git a/include/DataFrame/Internals/DataFrame_write.tcc b/include/DataFrame/Internals/DataFrame_write.tcc
index d337f311..d94fdd31 100644
--- a/include/DataFrame/Internals/DataFrame_write.tcc
+++ b/include/DataFrame/Internals/DataFrame_write.tcc
@@ -74,6 +74,19 @@ DataFrame::to_string(std::streamsize precision) const {
// ----------------------------------------------------------------------------
+template
+template
+std::string
+DataFrame::serialize() const {
+
+ std::stringstream ss (std::ios_base::out);  // output-only stream
+
+ write(ss, io_format::binary);  // whatever write() yields is not checked
+ return (ss.str());  // binary bytes held in a std::string
+}
+
+// ----------------------------------------------------------------------------
+
template
template
bool DataFrame::
@@ -98,7 +111,8 @@ write(S &o,
else
start_row = std::max(long(0), end_row + max_recs);
- o.precision(precision);
+ if (iof != io_format::binary) o.precision(precision);
+
if (iof == io_format::json) {
o << "{\n";
if (! columns_only) [[likely]] {
diff --git a/include/DataFrame/Utils/Utils.h b/include/DataFrame/Utils/Utils.h
index 19bb977f..98218de8 100644
--- a/include/DataFrame/Utils/Utils.h
+++ b/include/DataFrame/Utils/Utils.h
@@ -347,7 +347,7 @@ shift_left(V &vec, std::size_t n) {
// ----------------------------------------------------------------------------
-template
+template
struct IOStreamOpti {
IOStreamOpti (STR &stream, const char *file_name, bool binary = false)