Now you can call read() in csv2 format with a stream reference
hosseinmoein committed Feb 1, 2025
1 parent be9d7bd commit 4a9e568
Showing 6 changed files with 38 additions and 14 deletions.
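A minimal sketch of what this change enables, based on the test updates further below. The index type, file name, and main() wrapper are illustrative assumptions, not part of the commit:

#include <DataFrame/DataFrame.h>
#include <fstream>

using namespace hmdf;

int main()  {
    // Index type is an assumption; the tests below read FORD.csv into an unsigned long index
    StdDataFrame<unsigned long>  df;
    std::ifstream                stream;

    stream.open("FORD.csv");
    // Before this commit, csv2 with a stream reference threw NotImplemented;
    // now it dispatches to the templated read_csv2_() shown below.
    df.read(stream, io_format::csv2);
    stream.close();
    return (0);
}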
2 changes: 1 addition & 1 deletion docs/HTML/DataFrame.html
@@ -1978,7 +1978,7 @@ <H2 ID="9"><font color="blue">Build Instructions <font size="+4">&#x1F6E0;</font
When building your application with DataFrame, if you define <I>HMDF_SANITY_EXCEPTIONS=1</I> on the compile line, DataFrame algorithms perform runtime checks on the dimensionality of your data and other sanity conditions, throwing exceptions when a check fails. If this is not defined, there are no checks. For example, suppose you pass a column with 3 values in it and ask DataFrame to find peaks that are higher than the 5 data points around them. Without a sanity check you get a crash or garbage.<BR>
<P>

<font size="+1"><B>Using plain make and make-files:</B></font><BR>
<font size="+1"><B>Using plain make and make-files</B></font> (Not Recommended):<BR>
Go to the <I>src</I> subdirectory and execute build_all.sh. This will build the library and test executables for <I>Linux/Unix flavors only</I>.<BR><BR>

<font size="+1"><B>Using CMake:</B></font><BR>
3 changes: 2 additions & 1 deletion docs/HTML/read.html
@@ -77,7 +77,8 @@
.<BR>
.<BR>
.<BR>
All empty lines or lines starting with # will be skipped.<BR>
All empty lines or lines starting with # will be skipped.<BR><BR>
In CSV2 format it is more efficient to call <I>read()</I> with a file name instead of opening the file yourself and passing a stream reference. Given a file name, DataFrame opens the file and sets up its read buffers in the most efficient way.<BR><BR>
<B>NOTE:</B> Only in CSV2 and binary formats can you specify <I>starting_row</I> and <I>num_rows</I>. This lets you read very large files (that don't fit into memory) in chunks and process them. In this case reading starts at <I>starting_row</I> and continues until either <I>num_rows</I> rows are read or EOF is reached.<BR><BR>
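To illustrate the chunked-reading note above, a minimal sketch; the file name is hypothetical and the parameter order follows the test call shown further below, read(file_name, format, columns_only, starting_row, num_rows):

#include <DataFrame/DataFrame.h>
#include <string>

using namespace hmdf;

int main()  {
    StdDataFrame<std::string>  chunk1;
    StdDataFrame<std::string>  chunk2;

    // Rows 0 .. 9,999 of a file too large to fit into memory
    chunk1.read("very_large_file.csv2", io_format::csv2, false, 0, 10000);
    // The next 10,000 rows, read and processed independently
    chunk2.read("very_large_file.csv2", io_format::csv2, false, 10000, 10000);
    return (0);
}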

-----------------------------------------------<BR>
4 changes: 3 additions & 1 deletion include/DataFrame/Internals/DataFrame_private_decl.h
@@ -67,7 +67,9 @@ void read_binary_(std::istream &file,
size_type starting_row,
size_type num_rows);
void read_csv_(std::istream &file, bool columns_only);
void read_csv2_(std::FILE *stream,

template<typename S>
void read_csv2_(S &stream,
bool columns_only,
size_type starting_row,
size_type num_rows);
32 changes: 23 additions & 9 deletions include/DataFrame/Internals/DataFrame_read.tcc
@@ -768,8 +768,9 @@ struct _col_data_spec_ {
// --------------------------------------

template<typename I, typename H>
template<typename S>
void DataFrame<I, H>::
read_csv2_(std::FILE *stream,
read_csv2_(S &stream,
bool columns_only,
size_type starting_row,
size_type num_rows) {
@@ -789,10 +790,24 @@ read_csv2_(std::FILE *stream,

value.reserve(64);
spec_vec.reserve(32);
while (! std::feof(stream)) {
while (true) {
if constexpr (std::same_as<S, std::FILE *>) {
if (std::feof(stream)) break;
}
else {
if (stream.eof()) break;
}

line[0] = '\0';
if (std::fgets(line, sizeof(line) - 1, stream) == nullptr)
continue;
if constexpr (std::same_as<S, std::FILE *>) {
if (std::fgets(line, sizeof(line) - 1, stream) == nullptr)
continue;
}
else {
stream.getline(line, sizeof(line) - 1);
if (stream.fail())
continue;
}

if (line[0] == '\0' || line[0] == '#') [[unlikely]] continue;
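
The hunk above selects between the C FILE interface and the C++ stream interface at compile time with if constexpr. A standalone sketch of the same technique; the names here are illustrative and not part of the DataFrame API:

#include <concepts>
#include <cstdio>
#include <fstream>
#include <iostream>

// Read one line from either a std::FILE * or a C++ input stream.
template<typename S>
bool get_one_line(S &stream, char *buffer, std::size_t buf_size)  {
    if constexpr (std::same_as<S, std::FILE *>)  {
        if (std::feof(stream))  return (false);
        return (std::fgets(buffer, int(buf_size) - 1, stream) != nullptr);
    }
    else  {
        if (stream.eof())  return (false);
        stream.getline(buffer, std::streamsize(buf_size) - 1);
        return (! stream.fail());
    }
}

int main()  {
    char           line[1024];
    std::ifstream  cpp_stream { "FORD.csv" };  // C++ stream flavor

    while (get_one_line(cpp_stream, line, sizeof(line)))
        std::cout << line << '\n';

    std::FILE  *c_stream = std::fopen("FORD.csv", "r");  // C FILE flavor

    if (c_stream)  {
        while (get_one_line(c_stream, line, sizeof(line)))
            std::cout << line << '\n';
        std::fclose(c_stream);
    }
    return (0);
}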

@@ -2085,22 +2100,21 @@ read (S &in_s,
throw NotImplemented("read(): Reading files in chunks is currently"
" only implemented for io_format::csv2");

read_csv_ (in_s, columns_only);
read_csv_(in_s, columns_only);
}
else if (iof == io_format::csv2) {
throw NotImplemented("read(): You can read a file in io_format::csv2 "
"format only by calling read() with file name");
read_csv2_(in_s, columns_only, starting_row, num_rows);
}
else if (iof == io_format::json) {
if (starting_row != 0 ||
num_rows != std::numeric_limits<size_type>::max()) [[unlikely]]
throw NotImplemented("read(): Reading files in chunks is currently"
" only implemented for io_format::csv2");

read_json_ (in_s, columns_only);
read_json_(in_s, columns_only);
}
else if (iof == io_format::binary) {
read_binary_ (in_s, columns_only, starting_row, num_rows);
read_binary_(in_s, columns_only, starting_row, num_rows);
}
else
throw NotImplemented("read(): This io_format is not implemented");
5 changes: 4 additions & 1 deletion test/dataframe_tester_2.cc
@@ -3149,7 +3149,10 @@ static void test_YangZhangVolVisitor() {
MyDataFrame df;

try {
df.read("FORD.csv", io_format::csv2);
std::ifstream stream;

stream.open("FORD.csv");
df.read(stream, io_format::csv2);

YangZhangVolVisitor<double, unsigned long, 64> yz_v;

6 changes: 5 additions & 1 deletion test/dataframe_tester_3.cc
@@ -3865,7 +3865,10 @@ static void test_writing_binary() {
StrDataFrame ibm_vw_json;

try {
ibm.read("SHORT_IBM.csv", io_format::csv2);
std::ifstream stream;

stream.open("SHORT_IBM.csv");
ibm.read(stream, io_format::csv2);

ibm.write<double, long>("./SHORT_IBM_dup.csv", io_format::csv);
ibm.write<double, long>("./SHORT_IBM_dup.csv2", io_format::csv2);
@@ -4102,6 +4105,7 @@ static void test_reading_in_binary_chunks() {
StrDataFrame df1;

df1.read("SHORT_IBM.dat", io_format::binary, false, 0, 10);

assert(df1.get_index().size() == 10);
assert(df1.get_column<double>("IBM_Close").size() == 10);
assert(df1.get_index()[0] == "2014-01-02");
