diff --git a/tests/test_load.py b/tests/test_load.py index 9b069b26d..a14fc709a 100644 --- a/tests/test_load.py +++ b/tests/test_load.py @@ -4,109 +4,115 @@ class TestLoadPandas(TestBase): def test_json(self): - df = self.load_dataframe("examples/data/foo.json", type="json", multiline=True) + df = self.load_dataframe("../../examples/data/foo.json", type="json", multiline=True) self.assertEqual(df.rows.count(), 19) self.assertEqual(df.cols.names(), ["id", "firstName", "lastName", "billingId", "product", "price", "birth", "dummyCol"]) self.assertEqual(self.config.get("n_partitions", 1), df.partitions()) def test_json_less_rows(self): - df = self.load_dataframe("examples/data/foo.json", type="json", n_rows=13) + df = self.load_dataframe("../../examples/data/foo.json", type="json", n_rows=13) self.assertEqual(df.rows.count(), 13) self.assertEqual(df.cols.names(), ["id", "firstName", "lastName", "billingId", "product", "price", "birth", "dummyCol"]) self.assertEqual(self.config.get("n_partitions", 1), df.partitions()) def test_json_more_rows(self): - df = self.load_dataframe("examples/data/foo.json", type="json", n_rows=50) + df = self.load_dataframe("../../examples/data/foo.json", type="json", n_rows=50) self.assertLess(df.rows.count(), 50) self.assertEqual(df.cols.names(), ["id", "firstName", "lastName", "billingId", "product", "price", "birth", "dummyCol"]) self.assertEqual(self.config.get("n_partitions", 1), df.partitions()) + def test_json_multiline(self): + df = self.load_dataframe("../../examples/data/foo.json", type="json", multiline=True) + self.assertEqual(df.rows.count(), 19) + self.assertEqual(df.cols.names(), ["id", "firstName", "lastName", "billingId", "product", "price", "birth", "dummyCol"]) + self.assertEqual(self.config.get("n_partitions", 1), df.partitions()) + def test_xml(self): - df = self.load_dataframe("examples/data/foo.xml", type="xml") + df = self.load_dataframe("../../examples/data/foo.xml", type="xml") self.assertEqual(df.rows.count(), 19) self.assertEqual(df.cols.names(), ["id", "firstName", "lastName", "billingId", "product", "price", "birth", "dummyCol"]) self.assertEqual(self.config.get("n_partitions", 1), df.partitions()) def test_xml_less_rows(self): - df = self.load_dataframe("examples/data/foo.xml", type="xml", n_rows=13) + df = self.load_dataframe("../../examples/data/foo.xml", type="xml", n_rows=13) self.assertEqual(df.rows.count(), 13) self.assertEqual(df.cols.names(), ["id", "firstName", "lastName", "billingId", "product", "price", "birth", "dummyCol"]) self.assertEqual(self.config.get("n_partitions", 1), df.partitions()) def test_xml_more_rows(self): - df = self.load_dataframe("examples/data/foo.xml", type="xml", n_rows=50) + df = self.load_dataframe("../../examples/data/foo.xml", type="xml", n_rows=50) self.assertLess(df.rows.count(), 50) self.assertEqual(df.cols.names(), ["id", "firstName", "lastName", "billingId", "product", "price", "birth", "dummyCol"]) self.assertEqual(self.config.get("n_partitions", 1), df.partitions()) def test_parquet(self): - df = self.load_dataframe("examples/data/foo.parquet", type="parquet") + df = self.load_dataframe("../../examples/data/foo.parquet", type="parquet") self.assertEqual(df.rows.count(), 19) self.assertEqual(df.cols.names(), ["id", "firstName", "lastName", "billingId", "product", "price", "birth", "dummyCol"]) self.assertEqual(self.config.get("n_partitions", 1), df.partitions()) def test_parquet_less_rows(self): - df = self.load_dataframe("examples/data/foo.parquet", type="parquet", n_rows=13) + df = self.load_dataframe("../../examples/data/foo.parquet", type="parquet", n_rows=13) self.assertEqual(df.rows.count(), 13) self.assertEqual(df.cols.names(), ["id", "firstName", "lastName", "billingId", "product", "price", "birth", "dummyCol"]) self.assertEqual(self.config.get("n_partitions", 1), df.partitions()) def test_parquet_more_rows(self): - df = self.load_dataframe("examples/data/foo.parquet", type="parquet", n_rows=50) + df = self.load_dataframe("../../examples/data/foo.parquet", type="parquet", n_rows=50) self.assertLess(df.rows.count(), 50) self.assertEqual(df.cols.names(), ["id", "firstName", "lastName", "billingId", "product", "price", "birth", "dummyCol"]) self.assertEqual(self.config.get("n_partitions", 1), df.partitions()) def test_avro(self): - df = self.load_dataframe("examples/data/foo.avro", type="avro") + df = self.load_dataframe("../../examples/data/foo.avro", type="avro") self.assertEqual(df.rows.count(), 19) self.assertEqual(df.cols.names(), ["id", "firstName", "lastName", "billingId", "product", "price", "birth", "dummyCol"]) self.assertEqual(self.config.get("n_partitions", 1), df.partitions()) def test_avro_less_rows(self): - df = self.load_dataframe("examples/data/foo.avro", type="avro", n_rows=13) + df = self.load_dataframe("../../examples/data/foo.avro", type="avro", n_rows=13) self.assertEqual(df.rows.count(), 13) self.assertEqual(df.cols.names(), ["id", "firstName", "lastName", "billingId", "product", "price", "birth", "dummyCol"]) self.assertEqual(self.config.get("n_partitions", 1), df.partitions()) def test_avro_more_rows(self): - df = self.load_dataframe("examples/data/foo.avro", type="avro", n_rows=50) + df = self.load_dataframe("../../examples/data/foo.avro", type="avro", n_rows=50) self.assertLess(df.rows.count(), 50) self.assertEqual(df.cols.names(), ["id", "firstName", "lastName", "billingId", "product", "price", "birth", "dummyCol"]) self.assertEqual(self.config.get("n_partitions", 1), df.partitions()) def test_tsv(self): - df = self.load_dataframe("examples/data/foo.tsv", type="tsv") + df = self.load_dataframe("../../examples/data/foo.tsv", type="tsv") self.assertEqual(df.rows.count(), 5) self.assertEqual(df.cols.names(), ["Sepal length", "Sepal width", "Petal length", "Petal width", "Species"]) self.assertEqual(self.config.get("n_partitions", 1), df.partitions()) def test_tsv_less_rows(self): - df = self.load_dataframe("examples/data/foo.tsv", type="tsv", n_rows=3) + df = self.load_dataframe("../../examples/data/foo.tsv", type="tsv", n_rows=3) self.assertEqual(df.rows.count(), 3) self.assertEqual(df.cols.names(), ["Sepal length", "Sepal width", "Petal length", "Petal width", "Species"]) self.assertEqual(self.config.get("n_partitions", 1), df.partitions()) def test_tsv_more_rows(self): - df = self.load_dataframe("examples/data/foo.tsv", type="tsv", n_rows=50) + df = self.load_dataframe("../../examples/data/foo.tsv", type="tsv", n_rows=50) self.assertLess(df.rows.count(), 50) self.assertEqual(df.cols.names(), ["Sepal length", "Sepal width", "Petal length", "Petal width", "Species"]) self.assertEqual(self.config.get("n_partitions", 1), df.partitions()) def test_xls(self): - df = self.load_dataframe("examples/data/titanic3.xls", type="excel") + df = self.load_dataframe("../../examples/data/titanic3.xls", type="excel") self.assertEqual(df.rows.count(), 1309) self.assertEqual(df.cols.names(), ["pclass", "survived", "name", "sex", "age", "sibsp", "parch", "ticket", "fare", "cabin", "embarked", "boat", "body", "home.dest"]) self.assertEqual(self.config.get("n_partitions", 1), df.partitions()) def test_xls_less_rows(self): - df = self.load_dataframe("examples/data/titanic3.xls", type="excel", n_rows=13) + df = self.load_dataframe("../../examples/data/titanic3.xls", type="excel", n_rows=13) self.assertEqual(df.rows.count(), 13) self.assertEqual(df.cols.names(), ["pclass", "survived", "name", "sex", "age", "sibsp", "parch", "ticket", "fare", "cabin", "embarked", "boat", "body", "home.dest"]) self.assertEqual(self.config.get("n_partitions", 1), df.partitions()) def test_xls_more_rows(self): - df = self.load_dataframe("examples/data/titanic3.xls", type="excel", n_rows=50) + df = self.load_dataframe("../../examples/data/titanic3.xls", type="excel", n_rows=50) self.assertLess(df.rows.count(), 5000) self.assertEqual(df.cols.names(), ["pclass", "survived", "name", "sex", "age", "sibsp", "parch", "ticket", "fare", "cabin", "embarked", "boat", "body", "home.dest"]) self.assertEqual(self.config.get("n_partitions", 1), df.partitions()) diff --git a/tests/test_load_csv.py b/tests/test_load_csv.py index 55f337aec..26b78da5c 100644 --- a/tests/test_load_csv.py +++ b/tests/test_load_csv.py @@ -4,19 +4,36 @@ class TestCSVPandas(TestBase): def test_csv(self): - df = self.load_dataframe("examples/data/foo.csv") + df = self.load_dataframe("../../examples/data/foo.csv") self.assertEqual(df.rows.count(), 19) self.assertEqual(df.cols.names(), ["id", "firstName", "lastName", "billingId", "product", "price", "birth", "dummyCol"]) def test_csv_less_rows(self): - df = self.load_dataframe("examples/data/foo.csv", n_rows=13) + df = self.load_dataframe("../../examples/data/foo.csv", n_rows=13) self.assertEqual(df.rows.count(), 13) self.assertEqual(df.cols.names(), ["id", "firstName", "lastName", "billingId", "product", "price", "birth", "dummyCol"]) def test_csv_more_rows(self): - df = self.load_dataframe("examples/data/foo.csv", n_rows=50) + df = self.load_dataframe("../../examples/data/foo.csv", n_rows=50) self.assertLess(df.rows.count(), 50) self.assertEqual(df.cols.names(), ["id", "firstName", "lastName", "billingId", "product", "price", "birth", "dummyCol"]) + + def test_csv_no_header(self): + df = self.load_dataframe("../../examples/data/foo.csv", header=None) + self.assertEqual(df.cols.names(), [0,1,2,3,4,5,6,7]) + + def test_csv_null(self): + df = self.load_dataframe("../../examples/data/foo.csv", null_value="null") + self.assertEqual(df.mask.null(cols='product').cols.frequency(), {'frequency': {'product': {'values': [{'value': False, 'count': 18}, + {'value': True, 'count': 1}]}}}) + + def test_csv_semicolon(self): + df = self.load_dataframe("../../exmaples/data/foo.csv", sep=";") + self.assertEqual(df.cols.names(), ['id,firstName,lastName,billingId,product,price,birth,dummyCol']) + + def test_csv_coma(self): + df = self.load_dataframe("../../exmaples/data/foo.csv", sep=",") + self.assertEqual(df.cols.names(), ["id", "firstName", "lastName", "billingId", "product", "price", "birth", "dummyCol"]) class TestCSVDask(TestCSVPandas): config = {'engine': 'dask', 'n_partitions': 1}