Allow every packaged module to pick up ".zip" data files.

  src/datasets/packaged_modules/__init__.py
      Append ".zip" to the extension list of every entry in
      _MODULE_TO_EXTENSIONS, instead of only the "imagefolder" and
      "audiofolder" entries.
  tests/fixtures/files.py
      The zip_csv_path fixture now writes "csv-dataset.zip" into its own
      "zip_csv_path" temporary directory.
  tests/test_load.py
      Add test_load_dataset_without_script_with_zip, which calls
      load_dataset on the fixture archive's parent directory and checks the
      split names, column names, row count and first row.

NOTE(review): this patch was restored from a copy whose line breaks had been
collapsed. The blank context lines and all leading indentation were inferred
from the hunk line counts and from Python syntax; confirm them against the
target files (or apply with a whitespace-tolerant option) before relying on
an exact context match.

diff --git a/src/datasets/packaged_modules/__init__.py b/src/datasets/packaged_modules/__init__.py
index c79b3762b38..dc9409040b0 100644
--- a/src/datasets/packaged_modules/__init__.py
+++ b/src/datasets/packaged_modules/__init__.py
@@ -60,5 +60,5 @@ def _hash_python_lines(lines: List[str]) -> str:
 for _ext, (_module, _) in _EXTENSION_TO_MODULE.items():
     _MODULE_TO_EXTENSIONS.setdefault(_module, []).append(_ext)
 
-_MODULE_TO_EXTENSIONS["imagefolder"].append(".zip")
-_MODULE_TO_EXTENSIONS["audiofolder"].append(".zip")
+for _module in _MODULE_TO_EXTENSIONS:
+    _MODULE_TO_EXTENSIONS[_module].append(".zip")
diff --git a/tests/fixtures/files.py b/tests/fixtures/files.py
index 3aebcff3bb7..da8d3efee48 100644
--- a/tests/fixtures/files.py
+++ b/tests/fixtures/files.py
@@ -289,7 +289,7 @@ def bz2_csv_path(csv_path, tmp_path_factory):
 
 @pytest.fixture(scope="session")
 def zip_csv_path(csv_path, csv2_path, tmp_path_factory):
-    path = tmp_path_factory.mktemp("data") / "dataset.csv.zip"
+    path = tmp_path_factory.mktemp("zip_csv_path") / "csv-dataset.zip"
     with zipfile.ZipFile(path, "w") as f:
         f.write(csv_path, arcname=os.path.basename(csv_path))
         f.write(csv2_path, arcname=os.path.basename(csv2_path))
diff --git a/tests/test_load.py b/tests/test_load.py
index a4202e523a9..c80a66a5824 100644
--- a/tests/test_load.py
+++ b/tests/test_load.py
@@ -1069,3 +1069,12 @@ def test_load_dataset_distributed(tmp_path, csv_path):
     assert all(len(dataset) == len(datasets[0]) > 0 for dataset in datasets)
     assert len(datasets[0].cache_files) > 0
     assert all(dataset.cache_files == datasets[0].cache_files for dataset in datasets)
+
+
+def test_load_dataset_without_script_with_zip(zip_csv_path):
+    path = str(zip_csv_path.parent)
+    ds = load_dataset(path)
+    assert list(ds.keys()) == ["train"]
+    assert ds["train"].column_names == ["col_1", "col_2", "col_3"]
+    assert ds["train"].num_rows == 8
+    assert ds["train"][0] == {"col_1": 0, "col_2": 0, "col_3": 0.0}