Add load tests

hi-primus · Sep 10, 2021 · 6199e79 · 6199e79
1 parent 91cf4f5
commit 6199e79
Show file tree

Hide file tree

Showing 2 changed files with 44 additions and 21 deletions.
diff --git a/tests/test_load.py b/tests/test_load.py
@@ -4,109 +4,115 @@
 class TestLoadPandas(TestBase):
 
     def test_json(self):
-        df = self.load_dataframe("examples/data/foo.json", type="json", multiline=True)
+        df = self.load_dataframe("../../examples/data/foo.json", type="json", multiline=True)
         self.assertEqual(df.rows.count(), 19)
         self.assertEqual(df.cols.names(), ["id", "firstName", "lastName", "billingId", "product", "price", "birth", "dummyCol"])
         self.assertEqual(self.config.get("n_partitions", 1), df.partitions())
 
     def test_json_less_rows(self):
-        df = self.load_dataframe("examples/data/foo.json", type="json", n_rows=13)
+        df = self.load_dataframe("../../examples/data/foo.json", type="json", n_rows=13)
         self.assertEqual(df.rows.count(), 13)
         self.assertEqual(df.cols.names(), ["id", "firstName", "lastName", "billingId", "product", "price", "birth", "dummyCol"])
         self.assertEqual(self.config.get("n_partitions", 1), df.partitions())
 
     def test_json_more_rows(self):
-        df = self.load_dataframe("examples/data/foo.json", type="json", n_rows=50)
+        df = self.load_dataframe("../../examples/data/foo.json", type="json", n_rows=50)
         self.assertLess(df.rows.count(), 50)
         self.assertEqual(df.cols.names(), ["id", "firstName", "lastName", "billingId", "product", "price", "birth", "dummyCol"])
         self.assertEqual(self.config.get("n_partitions", 1), df.partitions())
 
+    def test_json_multiline(self):  # NOTE(review): same call and assertions as test_json above — confirm the duplicate is intended
+        df = self.load_dataframe("../../examples/data/foo.json", type="json", multiline=True)
+        self.assertEqual(df.rows.count(), 19)
+        self.assertEqual(df.cols.names(), ["id", "firstName", "lastName", "billingId", "product", "price", "birth", "dummyCol"])
+        self.assertEqual(self.config.get("n_partitions", 1), df.partitions())
+
     def test_xml(self):
-        df = self.load_dataframe("examples/data/foo.xml", type="xml")
+        df = self.load_dataframe("../../examples/data/foo.xml", type="xml")
         self.assertEqual(df.rows.count(), 19)
         self.assertEqual(df.cols.names(), ["id", "firstName", "lastName", "billingId", "product", "price", "birth", "dummyCol"])
         self.assertEqual(self.config.get("n_partitions", 1), df.partitions())
 
     def test_xml_less_rows(self):
-        df = self.load_dataframe("examples/data/foo.xml", type="xml", n_rows=13)
+        df = self.load_dataframe("../../examples/data/foo.xml", type="xml", n_rows=13)
         self.assertEqual(df.rows.count(), 13)
         self.assertEqual(df.cols.names(), ["id", "firstName", "lastName", "billingId", "product", "price", "birth", "dummyCol"])
         self.assertEqual(self.config.get("n_partitions", 1), df.partitions())
 
     def test_xml_more_rows(self):
-        df = self.load_dataframe("examples/data/foo.xml", type="xml", n_rows=50)
+        df = self.load_dataframe("../../examples/data/foo.xml", type="xml", n_rows=50)
         self.assertLess(df.rows.count(), 50)
         self.assertEqual(df.cols.names(), ["id", "firstName", "lastName", "billingId", "product", "price", "birth", "dummyCol"])
         self.assertEqual(self.config.get("n_partitions", 1), df.partitions())
 
     def test_parquet(self):
-        df = self.load_dataframe("examples/data/foo.parquet", type="parquet")
+        df = self.load_dataframe("../../examples/data/foo.parquet", type="parquet")
         self.assertEqual(df.rows.count(), 19)
         self.assertEqual(df.cols.names(), ["id", "firstName", "lastName", "billingId", "product", "price", "birth", "dummyCol"])
         self.assertEqual(self.config.get("n_partitions", 1), df.partitions())
 
     def test_parquet_less_rows(self):
-        df = self.load_dataframe("examples/data/foo.parquet", type="parquet", n_rows=13)
+        df = self.load_dataframe("../../examples/data/foo.parquet", type="parquet", n_rows=13)
         self.assertEqual(df.rows.count(), 13)
         self.assertEqual(df.cols.names(), ["id", "firstName", "lastName", "billingId", "product", "price", "birth", "dummyCol"])
         self.assertEqual(self.config.get("n_partitions", 1), df.partitions())
 
     def test_parquet_more_rows(self):
-        df = self.load_dataframe("examples/data/foo.parquet", type="parquet", n_rows=50)
+        df = self.load_dataframe("../../examples/data/foo.parquet", type="parquet", n_rows=50)
         self.assertLess(df.rows.count(), 50)
         self.assertEqual(df.cols.names(), ["id", "firstName", "lastName", "billingId", "product", "price", "birth", "dummyCol"])
         self.assertEqual(self.config.get("n_partitions", 1), df.partitions())
 
     def test_avro(self):
-        df = self.load_dataframe("examples/data/foo.avro", type="avro")
+        df = self.load_dataframe("../../examples/data/foo.avro", type="avro")
         self.assertEqual(df.rows.count(), 19)
         self.assertEqual(df.cols.names(), ["id", "firstName", "lastName", "billingId", "product", "price", "birth", "dummyCol"])
         self.assertEqual(self.config.get("n_partitions", 1), df.partitions())
 
     def test_avro_less_rows(self):
-        df = self.load_dataframe("examples/data/foo.avro", type="avro", n_rows=13)
+        df = self.load_dataframe("../../examples/data/foo.avro", type="avro", n_rows=13)
         self.assertEqual(df.rows.count(), 13)
         self.assertEqual(df.cols.names(), ["id", "firstName", "lastName", "billingId", "product", "price", "birth", "dummyCol"])
         self.assertEqual(self.config.get("n_partitions", 1), df.partitions())
 
     def test_avro_more_rows(self):
-        df = self.load_dataframe("examples/data/foo.avro", type="avro", n_rows=50)
+        df = self.load_dataframe("../../examples/data/foo.avro", type="avro", n_rows=50)
         self.assertLess(df.rows.count(), 50)
         self.assertEqual(df.cols.names(), ["id", "firstName", "lastName", "billingId", "product", "price", "birth", "dummyCol"])
         self.assertEqual(self.config.get("n_partitions", 1), df.partitions())
 
     def test_tsv(self):
-        df = self.load_dataframe("examples/data/foo.tsv", type="tsv")
+        df = self.load_dataframe("../../examples/data/foo.tsv", type="tsv")
         self.assertEqual(df.rows.count(), 5)
         self.assertEqual(df.cols.names(), ["Sepal length", "Sepal width", "Petal length", "Petal width", "Species"])
         self.assertEqual(self.config.get("n_partitions", 1), df.partitions())
 
     def test_tsv_less_rows(self):
-        df = self.load_dataframe("examples/data/foo.tsv", type="tsv", n_rows=3)
+        df = self.load_dataframe("../../examples/data/foo.tsv", type="tsv", n_rows=3)
         self.assertEqual(df.rows.count(), 3)
         self.assertEqual(df.cols.names(), ["Sepal length", "Sepal width", "Petal length", "Petal width", "Species"])        
         self.assertEqual(self.config.get("n_partitions", 1), df.partitions())
 
     def test_tsv_more_rows(self):
-        df = self.load_dataframe("examples/data/foo.tsv", type="tsv", n_rows=50)
+        df = self.load_dataframe("../../examples/data/foo.tsv", type="tsv", n_rows=50)
         self.assertLess(df.rows.count(), 50)
         self.assertEqual(df.cols.names(), ["Sepal length", "Sepal width", "Petal length", "Petal width", "Species"])        
         self.assertEqual(self.config.get("n_partitions", 1), df.partitions())
 
     def test_xls(self):
-        df = self.load_dataframe("examples/data/titanic3.xls", type="excel")
+        df = self.load_dataframe("../../examples/data/titanic3.xls", type="excel")
         self.assertEqual(df.rows.count(), 1309)
         self.assertEqual(df.cols.names(), ["pclass", "survived", "name", "sex", "age", "sibsp", "parch", "ticket", "fare", "cabin", "embarked", "boat", "body", "home.dest"])
         self.assertEqual(self.config.get("n_partitions", 1), df.partitions())
 
     def test_xls_less_rows(self):
-        df = self.load_dataframe("examples/data/titanic3.xls", type="excel", n_rows=13)
+        df = self.load_dataframe("../../examples/data/titanic3.xls", type="excel", n_rows=13)
         self.assertEqual(df.rows.count(), 13)
         self.assertEqual(df.cols.names(), ["pclass", "survived", "name", "sex", "age", "sibsp", "parch", "ticket", "fare", "cabin", "embarked", "boat", "body", "home.dest"])
         self.assertEqual(self.config.get("n_partitions", 1), df.partitions())
 
     def test_xls_more_rows(self):
-        df = self.load_dataframe("examples/data/titanic3.xls", type="excel", n_rows=50)
+        df = self.load_dataframe("../../examples/data/titanic3.xls", type="excel", n_rows=50)
         self.assertLess(df.rows.count(), 5000)
         self.assertEqual(df.cols.names(), ["pclass", "survived", "name", "sex", "age", "sibsp", "parch", "ticket", "fare", "cabin", "embarked", "boat", "body", "home.dest"])
         self.assertEqual(self.config.get("n_partitions", 1), df.partitions())

diff --git a/tests/test_load_csv.py b/tests/test_load_csv.py
@@ -4,19 +4,36 @@
 class TestCSVPandas(TestBase):
 
     def test_csv(self):
-        df = self.load_dataframe("examples/data/foo.csv")
+        df = self.load_dataframe("../../examples/data/foo.csv")
         self.assertEqual(df.rows.count(), 19)
         self.assertEqual(df.cols.names(), ["id", "firstName", "lastName", "billingId", "product", "price", "birth", "dummyCol"])
 
     def test_csv_less_rows(self):
-        df = self.load_dataframe("examples/data/foo.csv", n_rows=13)
+        df = self.load_dataframe("../../examples/data/foo.csv", n_rows=13)
         self.assertEqual(df.rows.count(), 13)
         self.assertEqual(df.cols.names(), ["id", "firstName", "lastName", "billingId", "product", "price", "birth", "dummyCol"])
 
     def test_csv_more_rows(self):
-        df = self.load_dataframe("examples/data/foo.csv", n_rows=50)
+        df = self.load_dataframe("../../examples/data/foo.csv", n_rows=50)
         self.assertLess(df.rows.count(), 50)
         self.assertEqual(df.cols.names(), ["id", "firstName", "lastName", "billingId", "product", "price", "birth", "dummyCol"])
+
+    def test_csv_no_header(self):  # header=None: columns are expected to be named by position, 0 through 7
+        frame = self.load_dataframe("../../examples/data/foo.csv", header=None)
+        self.assertEqual(frame.cols.names(), list(range(8)))
+
+    def test_csv_null(self):  # expects the null mask on 'product' to report 1 True and 18 False
+        frame = self.load_dataframe("../../examples/data/foo.csv", null_value="null")
+        null_counts = [{'value': False, 'count': 18}, {'value': True, 'count': 1}]
+        self.assertEqual(frame.mask.null(cols='product').cols.frequency(), {'frequency': {'product': {'values': null_counts}}})
+
+    def test_csv_semicolon(self):  # splitting on ";" is expected to leave the comma-separated header as one column name
+        df = self.load_dataframe("../../examples/data/foo.csv", sep=";")
+        self.assertEqual(df.cols.names(), ['id,firstName,lastName,billingId,product,price,birth,dummyCol'])
+
+    def test_csv_coma(self):  # an explicit "," separator is expected to yield the eight individual column names
+        df = self.load_dataframe("../../examples/data/foo.csv", sep=",")
+        self.assertEqual(df.cols.names(), ["id", "firstName", "lastName", "billingId", "product", "price", "birth", "dummyCol"])
 
 class TestCSVDask(TestCSVPandas):
     config = {'engine': 'dask', 'n_partitions': 1}