From f7f635862b1419b1019caf554591665d03f71354 Mon Sep 17 00:00:00 2001 From: Hossein Moein Date: Mon, 20 May 2024 13:23:40 -0400 Subject: [PATCH] Added docs for binary read/write --- docs/HTML/read.html | 8 ++- docs/HTML/write.html | 155 +++++++++++++++++++++++-------------------- 2 files changed, 88 insertions(+), 75 deletions(-) diff --git a/docs/HTML/read.html b/docs/HTML/read.html index e7511a04..b56e1534 100644 --- a/docs/HTML/read.html +++ b/docs/HTML/read.html @@ -95,7 +95,9 @@
  • Column "INDEX" must be the first column, if it exists
  • Fields in column dictionaries must be in N (name), T (type), D (data) order
  • -
    + -----------------------------------------------
    + Binary format is a proprietary format that is optimized for compression algorithms. It also takes care of different endianness. The file is always written with the same endianness as the writing host, but it will be adjusted accordingly when it is read on a host with a different endianness.

    + -----------------------------------------------
    In all formats the following data types are supported:
    @@ -117,8 +119,8 @@
     string     -- char *
     bool       -- bool
     DateTime   -- DateTime data in format of
    -                        <Epoch seconds>.<nanoseconds>
    -                        (1516179600.874123908)
    +    <Epoch seconds>.<nanoseconds>
    +    (1516179600.874123908)
             
    In case of io_format::csv2 and io_format::csv the following additional types are also supported:
    diff --git a/docs/HTML/write.html b/docs/HTML/write.html
    index 0820eb15..b0b00c4e 100644
    --- a/docs/HTML/write.html
    +++ b/docs/HTML/write.html
    @@ -41,7 +41,7 @@
         
     
         
    -       
    +       
             
    
     template<typename S, typename ... Ts>
     bool
    @@ -52,39 +52,41 @@
           long max_recs = std::numeric_limits<long>::max()) const; 
             
    - - It outputs the content of DataFrame into the stream o. Currently 3 formats (i.e. csv, csv2, json) are supported specified by the iof parameter.


    + + It outputs the content of DataFrame into the stream o. Currently 4 formats (i.e. csv, csv2, json, binary) are supported; the format is specified by the iof parameter.


    The CSV file format is written:
    -  INDEX:<Number of data points>:<Comma delimited list of values>
    -  <Column1 name>:<Number of data points>:<Column1 type>:<Comma delimited list of values>
    -  <Column2 name>:<Number of data points>:<Column2 type>:<Comma delimited list of values>
    -      .
    -      .
    -      .
    +INDEX:<Number of data points>:<Comma delimited list of values>
    +<Col1 name>:<Number of data points>:<Col1 type>:<Comma delimited list of values>
    +<Col2 name>:<Number of data points>:<Col2 type>:<Comma delimited list of values>
    +    .
    +    .
    +    .
             
    All empty lines or lines starting with # will be skipped. For examples, see files in test directory

    The CSV2 file format must be (this is similar to Pandas csv format):
    -  INDEX:<Number of data points>:<Index type>:,<Column1 name>:<Number of data points>:<Column1 type>,<Column2 name>:<Number of data points>:<Column2 type>, . . .
    -  Comma delimited rows of values
    -      .
    -      .
    -      .
    +INDEX:<Number of data points>:<Index type>:,<Column1 name>:
    +<Number of data points>:<Column1 type>,<Column2 name>:
    +<Number of data points>:<Column2 type>, . . .
    +Comma delimited rows of values
    +    .
    +    .
    +    .
             
    All empty lines or lines starting with # will be skipped. For examples, see IBM and FORD files in test directory

    The JSON file format looks like this:
    -  {
    -    "INDEX":{"N":3,"T":"ulong","D":[123450,123451,123452]},
    -    "col_3":{"N":3,"T":"double","D":[15.2,16.34,17.764]},
    -    "col_4":{"N":3,"T":"int","D":[22,23,24]},
    -    "col_str":{"N":3,"T":"string","D":["11","22","33"]},
    -    "col_2":{"N":3,"T":"double","D":[8,9.001,10]},
    -    "col_1":{"N":3,"T":"double","D":[1,2,3.456]}
    -  }
    +{
    +  "INDEX":{"N":3,"T":"ulong","D":[123450,123451,123452]},
    +  "col_3":{"N":3,"T":"double","D":[15.2,16.34,17.764]},
    +  "col_4":{"N":3,"T":"int","D":[22,23,24]},
    +  "col_str":{"N":3,"T":"string","D":["11","22","33"]},
    +  "col_2":{"N":3,"T":"double","D":[8,9.001,10]},
    +  "col_1":{"N":3,"T":"double","D":[1,2,3.456]}
    +}
             
    Please note DataFrame json does not follow json spec 100%. In json, there is no particular order in dictionary fields. But in DataFrame json:
      @@ -93,66 +95,75 @@


    + + Binary format is a proprietary format that is optimized for compression algorithms. It also takes care of different endianness. The file is always written with the same endianness as the writing host, but it will be adjusted accordingly when it is read on a host with a different endianness.


    + In all formats the following data types are supported:
    -          float      -- float
    -          double     -- double
    -          longdouble -- long double
    -          short      -- short int
    -          ushort     -- unsigned short int
    -          int        -- int
    -          uint       -- unsigned int
    -          long       -- long int
    -          longlong   -- long long int
    -          ulong      -- unsigned long int
    -          ulonglong  -- unsigned long long int
    -          char       -- char
    -          uchar      -- unsigned char
    -          string     -- std::string
    -          string     -- const char *
    -          string     -- char *
    -          bool       -- bool
    -          DateTime   -- DateTime data in format of <Epoch seconds>.<nanoseconds> (1516179600.874123908)
    +float      -- float
    +double     -- double
    +longdouble -- long double
    +short      -- short int
    +ushort     -- unsigned short int
    +int        -- int
    +uint       -- unsigned int
    +long       -- long int
    +longlong   -- long long int
    +ulong      -- unsigned long int
    +ulonglong  -- unsigned long long int
    +char       -- char
    +uchar      -- unsigned char
    +string     -- std::string
    +string     -- const char *
    +string     -- char *
    +bool       -- bool
    +DateTime   -- DateTime data in format of
    +    <Epoch seconds>.<nanoseconds>
    +    (1516179600.874123908)
             
    In case of io_format::csv2 and io_format::csv the following additional types are also supported:
    -          dbl_vec        -- A vector of double precision values, The vector is printed as "s[d1|d2|...]"
    -                            where s is the size of the vector and d's are the double values.
    -          str_vec        -- A vector of std::string values, The vector is printed as "s[str1|str2|...]"
    -                            where s is the size of the vector and str's are the strings.
    -          dbl_set        -- A set of double precision values, The set is printed as "s[d1|d2|...]"
    -                            where s is the size of the set and d's are the double values.
    -          str_set        -- A set of std::string values, The set is printed as "s[str1|str2|...]"
    -                            where s is the size of the set and str's are the strings.
    -          str_dbl_map    -- A map of string keys to double precision values, The map is printed as "s{k1:v1|k2:v2|...}"
    -                            where s is the size of the map and k's and v's are keys and values.
    -          str_dbl_unomap -- An unordered map of string keys to double precision values, The map is printed as "s{k1:v1|k2:v2|...}"
    -                            where s is the size of the map and k's and v's are keys and values.
    +dbl_vec        -- A vector of double precision values. The vector is printed
    +                  as "s[d1|d2|...]" where s is the size of the vector and d's
    +                  are the double values.
    +str_vec        -- A vector of std::string values. The vector is printed as
    +                  "s[str1|str2|...]" where s is the size of the vector and
    +                  str's are the strings.
    +dbl_set        -- A set of double precision values. The set is printed as
    +                  "s[d1|d2|...]" where s is the size of the set and d's
    +                  are the double values.
    +str_set        -- A set of std::string values. The set is printed as
    +                  "s[str1|str2|...]" where s is the size of the set and
    +                  str's are the strings.
    +str_dbl_map    -- A map of string keys to double precision values. The map is
    +                  printed as "s{k1:v1|k2:v2|...}" where s is the size of
    +                  the map and k's and v's are keys and values.
    +str_dbl_unomap -- An unordered map of string keys to double precision values.
    +                  The map is printed as "s{k1:v1|k2:v2|...}" where s is the
    +                  size of the map and k's and v's are keys and values.
             
    In case of io_format::csv2 the following additional types are also supported:
    -          DateTimeAME -- DateTime string printed in American style (MM/DD/YYYY HH:MM:SS.mmm)
    -          DateTimeEUR -- DateTime string printed in European style (YYYY/MM/DD HH:MM:SS.mmm)
    -          DateTimeISO -- DateTime string printed in ISO style (YYYY-MM-DD HH:MM:SS.mmm)
    +DateTimeAME -- American style (MM/DD/YYYY HH:MM:SS.mmm)
    +DateTimeEUR -- European style (YYYY/MM/DD HH:MM:SS.mmm)
    +DateTimeISO -- ISO style (YYYY-MM-DD HH:MM:SS.mmm)
             
    - -
    - S: Output stream type
    - Ts: The list of types for all columns. A type should be specified only once
    - o: Reference to an streamable object (e.g. cout, file, ...)
    - iof: Specifies the I/O format. The default is CSV
    - precision: Specifies the precision for floating point numbers
    - columns_only: If true, the index columns is not written into the stream
    - max_recs: Max number of rows to write. If it is positive, it will write max_recs from the beginning of DataFrame. If it is negative, it will write max_recs from the end of DataFrame
    -        
    + + S: Output stream type
    + Ts: The list of types for all columns. A type should be specified only once
    + o: Reference to a streamable object (e.g. cout, file, ...)
    + iof: Specifies the I/O format. The default is CSV
    + precision: Specifies the precision for floating point numbers
    + columns_only: If true, the index column is not written into the stream
    + max_recs: Max number of rows to write. If it is positive, it will write max_recs from the beginning of DataFrame. If it is negative, it will write max_recs from the end of DataFrame
    - +
    
     template<typename ... Ts>
     std::future<bool>
    @@ -163,7 +174,7 @@
           long max_recs = std::numeric_limits<long>::max()) const; 
             
    - + Same as write() above, but it takes a file name

    NOTE: This version of write() can be substantially faster, especially for larger files, than if you open the file yourself and use the write() version above. @@ -172,7 +183,7 @@ - +
    
     template<typename S, typename ... Ts>
     std::future<bool>
    @@ -191,7 +202,7 @@
         
     
         
    -       
    +       
             
    
     template<typename ... Ts>
     std::future<bool>
    @@ -210,14 +221,14 @@
         
     
         
    -       
    +       
             
    
     template<typename ... Ts>
     std::string
     to_string(std::streamsize precision = 12) const; 
             
    - + This is a convenience function (simple implementation) to convert a DataFrame into a string that could be restored later by calling from_string(). It utilizes the write() member function of DataFrame.
    These functions could be used to transmit a DataFrame from one place to another or store a DataFrame in databases, caches, ...

    @@ -229,7 +240,7 @@ - + Ts: The list of types for all columns. A type should be specified only once
    precision: Specifies the precision for floating point numbers