diff --git a/README.md b/README.md index 182a8bd5..75b199ae 100644 --- a/README.md +++ b/README.md @@ -50,7 +50,7 @@ I have followed a few principles in this library:
5. [Avoid copying data as much as possible](https://htmlpreview.github.io/?https://github.com/hosseinmoein/DataFrame/blob/master/docs/HTML/copying_data.html) 6. [Use multi-threading but only when it makes sense](https://htmlpreview.github.io/?https://github.com/hosseinmoein/DataFrame/blob/master/docs/HTML/multithreading.html) 7. [Do not attempt to protect the user against _garbage in_, _garbage out_](https://htmlpreview.github.io/?https://github.com/hosseinmoein/DataFrame/blob/master/docs/HTML/garbage_in_garbage_out.html) -8. DataFrame library is self-contained, meaning DataFrame only depends on _C++ language_ and its _standard library_ +8. Keep the DataFrame library self-contained, meaning DataFrame must depend only on the _C++ language_ and its _standard library_ --- diff --git a/examples/hello_world.cc b/examples/hello_world.cc index 889ffc36..232ee481 100644 --- a/examples/hello_world.cc +++ b/examples/hello_world.cc @@ -109,7 +109,7 @@ int main(int, char *[]) { StrDataFrame ibm_df; // Also, you can load data into a DataFrame from a file, supporting a few different formats. If the file cannot be found, - // an exception will be thrown. If the DataFrame root directory is your current directory when running this, it should + // an exception will be thrown. If the DataFrame data directory is your current directory when running this, it should // work fine. 
// ibm_df.read("IBM.csv", io_format::csv2); diff --git a/include/DataFrame/DataFrame.h b/include/DataFrame/DataFrame.h index 1c428b6d..c05be53b 100644 --- a/include/DataFrame/DataFrame.h +++ b/include/DataFrame/DataFrame.h @@ -127,7 +127,7 @@ class DataFrame : public ThreadGranularity { // Any version of DataFrame should be assignable to any other version // template - DataFrame &assign(const OTHER &rhs); + DataFrame &assign(const OTHER &rhs); public: // Load/append/remove interfaces @@ -4157,11 +4157,17 @@ class DataFrame : public ThreadGranularity { // These functions could be used to transmit a DataFrame from one place to // another or store a DataFrame in databases, caches, … // + // NOTE: The choice between to_string() and serialize() depends on the + // dataset. Some datasets (e.g. US Options market data) mostly + // contain small floating-point/integer numbers such as '.5', '.75' + // or '123' and so on. Such numbers will produce a smaller + // buffer size in string form compared with binary form, especially + // if there are millions of them. But generally, in most cases + // binary form is more efficient. + // // Ts: // List all the types of all data columns. A type should be specified in // the list only once. - // iof: - // Specifies the I/O format. The default is CSV // precision: // Specifies the precision for floating point numbers // @@ -4175,6 +4181,34 @@ class DataFrame : public ThreadGranularity { [[nodiscard]] std::future to_string_async(std::streamsize precision = 12) const; + // This is similar to to_string(), but it serializes a DataFrame into a + // binary buffer that could be restored later by calling deserialize(). It + // utilizes the write() member function of DataFrame. + // These functions could be used to transmit a DataFrame from one place to + // another or store a DataFrame in databases, caches, … + // + // NOTE: Although this returns a std::string, the string contains binary + // data including potentially many null chars. 
The best way to read + // the string is by using .data() and .size() methods on std::string + // + // NOTE: The choice between to_string() and serialize() depends on the + // dataset. Some datasets (e.g. US Options market data) mostly + // contain small floating-point/integer numbers such as '.5', '.75' + // or '123' and so on. Such numbers will produce a smaller + // buffer size in string form compared with binary form, especially + // if there are millions of them. But generally, in most cases + // binary form is more efficient. + // + // Ts: + // List all the types of all data columns. A type should be specified in + // the list only once. + // NOTE: There is no precision parameter, since stream precision is not + // applied to the binary format + // + template + [[nodiscard]] std::string + serialize() const; + // It inputs the contents of a text file into itself (i.e. DataFrame). // Currently two formats (i.e. csv, json) are supported specified by // the iof parameter. @@ -4253,19 +4287,48 @@ class DataFrame : public ThreadGranularity { // These functions could be used to transmit a DataFrame from one place to // another or store a DataFrame in databases, caches, … // + // NOTE: The choice between to_string() and serialize() depends on the + // dataset. Some datasets (e.g. US Options market data) mostly + // contain small floating-point/integer numbers such as '.5', '.75' + // or '123' and so on. Such numbers will produce a smaller + // buffer size in string form compared with binary form, especially + // if there are millions of them. But generally, in most cases + // binary form is more efficient. + // // data_frame: // A null terminated string that was generated by calling to_string(). // It must contain a complete DataFrame - // iof: - // Specifies the I/O format. 
The default is CSV // bool from_string(const char *data_frame); // Same as from_string() above, but executed asynchronously + // [[nodiscard]] std::future from_string_async(const char *data_frame); + // This is a convenience function (conceptually similar to from_string()) + // to restore a DataFrame from a binary buffer that was previously + // generated by calling serialize(). It utilizes the read() member function + // of DataFrame. + // These functions could be used to transmit a DataFrame from one place to + // another or store a DataFrame in databases, caches, … + // + // NOTE: The choice between to_string() and serialize() depends on the + // dataset. Some datasets (e.g. US Options market data) mostly + // contain small floating-point/integer numbers such as '.5', '.75' + // or '123' and so on. Such numbers will produce a smaller + // buffer size in string form compared with binary form, especially + // if there are millions of them. But generally, in most cases + // binary form is more efficient. + // + // data_frame: + // A std::string that was generated by calling serialize(). 
+ // It must contain a complete DataFrame in binary format + // + bool + deserialize(const std::string &data_frame); + private: // Internally used containers aligned with DataFrame alignment diff --git a/include/DataFrame/Internals/DataFrame_read.tcc b/include/DataFrame/Internals/DataFrame_read.tcc index 1220432e..2557c79a 100644 --- a/include/DataFrame/Internals/DataFrame_read.tcc +++ b/include/DataFrame/Internals/DataFrame_read.tcc @@ -1583,6 +1583,21 @@ DataFrame::from_string (const char *data_frame) { // ---------------------------------------------------------------------------- +template +bool +DataFrame::deserialize (const std::string &data_frame) { + + static_assert(std::is_base_of, H>::value, + "Only a StdDataFrame can call deserialize()"); + + std::stringstream ss (data_frame, std::ios_base::in); + + read(ss, io_format::binary, false); + return (true); +} + +// ---------------------------------------------------------------------------- + template std::future DataFrame:: read_async(const char *file_name, diff --git a/include/DataFrame/Internals/DataFrame_write.tcc b/include/DataFrame/Internals/DataFrame_write.tcc index d337f311..d94fdd31 100644 --- a/include/DataFrame/Internals/DataFrame_write.tcc +++ b/include/DataFrame/Internals/DataFrame_write.tcc @@ -74,6 +74,19 @@ DataFrame::to_string(std::streamsize precision) const { // ---------------------------------------------------------------------------- +template +template +std::string +DataFrame::serialize() const { + + std::stringstream ss (std::ios_base::out); + + write(ss, io_format::binary); + return (ss.str()); +} + +// ---------------------------------------------------------------------------- + template template bool DataFrame:: @@ -98,7 +111,8 @@ write(S &o, else start_row = std::max(long(0), end_row + max_recs); - o.precision(precision); + if (iof != io_format::binary) o.precision(precision); + if (iof == io_format::json) { o << "{\n"; if (! 
columns_only) [[likely]] { diff --git a/include/DataFrame/Utils/Utils.h b/include/DataFrame/Utils/Utils.h index 19bb977f..98218de8 100644 --- a/include/DataFrame/Utils/Utils.h +++ b/include/DataFrame/Utils/Utils.h @@ -347,7 +347,7 @@ shift_left(V &vec, std::size_t n) { // ---------------------------------------------------------------------------- -template +template struct IOStreamOpti { IOStreamOpti (STR &stream, const char *file_name, bool binary = false)