Do not filter out .zip extensions from no-script datasets (#6208)

* Rename zip_csv_path fixture dirname and filename
* Test load no-script dataset with ZIP file
* Fix style
* Avoid filtering out .zip extension
huggingface · Sep 6, 2023 · cc66766 · cc66766 · github-actions · Sep 6, 2023
1 parent 682d21e
commit cc66766
Show file tree

Hide file tree

Showing 3 changed files with 12 additions and 3 deletions.
diff --git a/src/datasets/packaged_modules/__init__.py b/src/datasets/packaged_modules/__init__.py
@@ -60,5 +60,5 @@ def _hash_python_lines(lines: List[str]) -> str:
 for _ext, (_module, _) in _EXTENSION_TO_MODULE.items():
     _MODULE_TO_EXTENSIONS.setdefault(_module, []).append(_ext)
 
-_MODULE_TO_EXTENSIONS["imagefolder"].append(".zip")
-_MODULE_TO_EXTENSIONS["audiofolder"].append(".zip")
+for _module in _MODULE_TO_EXTENSIONS:
+    _MODULE_TO_EXTENSIONS[_module].append(".zip")
diff --git a/tests/fixtures/files.py b/tests/fixtures/files.py
@@ -289,7 +289,7 @@ def bz2_csv_path(csv_path, tmp_path_factory):
 
 @pytest.fixture(scope="session")
 def zip_csv_path(csv_path, csv2_path, tmp_path_factory):
-    path = tmp_path_factory.mktemp("data") / "dataset.csv.zip"
+    path = tmp_path_factory.mktemp("zip_csv_path") / "csv-dataset.zip"
     with zipfile.ZipFile(path, "w") as f:
         f.write(csv_path, arcname=os.path.basename(csv_path))
         f.write(csv2_path, arcname=os.path.basename(csv2_path))

diff --git a/tests/test_load.py b/tests/test_load.py
@@ -1069,3 +1069,12 @@ def test_load_dataset_distributed(tmp_path, csv_path):
         assert all(len(dataset) == len(datasets[0]) > 0 for dataset in datasets)
         assert len(datasets[0].cache_files) > 0
         assert all(dataset.cache_files == datasets[0].cache_files for dataset in datasets)
+
+
+def test_load_dataset_without_script_with_zip(zip_csv_path):
+    path = str(zip_csv_path.parent)
+    ds = load_dataset(path)
+    assert list(ds.keys()) == ["train"]
+    assert ds["train"].column_names == ["col_1", "col_2", "col_3"]
+    assert ds["train"].num_rows == 8
+    assert ds["train"][0] == {"col_1": 0, "col_2": 0, "col_3": 0.0}