Skip to content

Commit

Permalink
Do not filter out .zip extensions from no-script datasets (#6208)
Browse files (browse the repository at this point in the history)
* Rename zip_csv_path fixture dirname and filename

* Test load no-script dataset with ZIP file

* Fix style

* Avoid filtering out .zip extension
  • Loading branch information
albertvillanova committed Sep 6, 2023
1 parent 682d21e commit cc66766
Show file tree
Hide file tree
Showing 3 changed files with 12 additions and 3 deletions.
4 changes: 2 additions & 2 deletions src/datasets/packaged_modules/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,5 +60,5 @@ def _hash_python_lines(lines: List[str]) -> str:
for _ext, (_module, _) in _EXTENSION_TO_MODULE.items():
_MODULE_TO_EXTENSIONS.setdefault(_module, []).append(_ext)

_MODULE_TO_EXTENSIONS["imagefolder"].append(".zip")
_MODULE_TO_EXTENSIONS["audiofolder"].append(".zip")
for _module in _MODULE_TO_EXTENSIONS:
_MODULE_TO_EXTENSIONS[_module].append(".zip")
2 changes: 1 addition & 1 deletion tests/fixtures/files.py
Original file line number Diff line number Diff line change
Expand Up @@ -289,7 +289,7 @@ def bz2_csv_path(csv_path, tmp_path_factory):

@pytest.fixture(scope="session")
def zip_csv_path(csv_path, csv2_path, tmp_path_factory):
path = tmp_path_factory.mktemp("data") / "dataset.csv.zip"
path = tmp_path_factory.mktemp("zip_csv_path") / "csv-dataset.zip"
with zipfile.ZipFile(path, "w") as f:
f.write(csv_path, arcname=os.path.basename(csv_path))
f.write(csv2_path, arcname=os.path.basename(csv2_path))
Expand Down
9 changes: 9 additions & 0 deletions tests/test_load.py
Original file line number Diff line number Diff line change
Expand Up @@ -1069,3 +1069,12 @@ def test_load_dataset_distributed(tmp_path, csv_path):
assert all(len(dataset) == len(datasets[0]) > 0 for dataset in datasets)
assert len(datasets[0].cache_files) > 0
assert all(dataset.cache_files == datasets[0].cache_files for dataset in datasets)


def test_load_dataset_without_script_with_zip(zip_csv_path):
    """Check that a no-script dataset directory containing a ZIP archive loads.

    ``load_dataset`` is pointed at the parent directory of the archive provided
    by the ``zip_csv_path`` fixture, not at the archive file itself, so the
    ``.zip`` file has to be picked up as a data file of the directory.
    """
    dataset_dir = str(zip_csv_path.parent)
    loaded = load_dataset(dataset_dir)

    # Only the default split is expected; check this before indexing into it.
    assert list(loaded.keys()) == ["train"]

    train_split = loaded["train"]
    assert train_split.column_names == ["col_1", "col_2", "col_3"]
    assert train_split.num_rows == 8
    assert train_split[0] == {"col_1": 0, "col_2": 0, "col_3": 0.0}

1 comment on commit cc66766

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Show benchmarks

PyArrow==8.0.0

Show updated benchmarks!

Benchmark: benchmark_array_xd.json

metric read_batch_formatted_as_numpy after write_array2d read_batch_formatted_as_numpy after write_flattened_sequence read_batch_formatted_as_numpy after write_nested_sequence read_batch_unformated after write_array2d read_batch_unformated after write_flattened_sequence read_batch_unformated after write_nested_sequence read_col_formatted_as_numpy after write_array2d read_col_formatted_as_numpy after write_flattened_sequence read_col_formatted_as_numpy after write_nested_sequence read_col_unformated after write_array2d read_col_unformated after write_flattened_sequence read_col_unformated after write_nested_sequence read_formatted_as_numpy after write_array2d read_formatted_as_numpy after write_flattened_sequence read_formatted_as_numpy after write_nested_sequence read_unformated after write_array2d read_unformated after write_flattened_sequence read_unformated after write_nested_sequence write_array2d write_flattened_sequence write_nested_sequence
new / old (diff) 0.008475 / 0.011353 (-0.002878) 0.004634 / 0.011008 (-0.006374) 0.111832 / 0.038508 (0.073323) 0.045193 / 0.023109 (0.022083) 0.351206 / 0.275898 (0.075308) 0.452893 / 0.323480 (0.129413) 0.006472 / 0.007986 (-0.001513) 0.004183 / 0.004328 (-0.000146) 0.084734 / 0.004250 (0.080483) 0.066715 / 0.037052 (0.029662) 0.392155 / 0.258489 (0.133666) 0.446548 / 0.293841 (0.152707) 0.042793 / 0.128546 (-0.085753) 0.012879 / 0.075646 (-0.062768) 0.419476 / 0.419271 (0.000204) 0.063603 / 0.043533 (0.020070) 0.350976 / 0.255139 (0.095837) 0.424094 / 0.283200 (0.140894) 0.042482 / 0.141683 (-0.099201) 1.727237 / 1.452155 (0.275082) 1.841518 / 1.492716 (0.348802)

Benchmark: benchmark_getitem_100B.json

metric get_batch_of_1024_random_rows get_batch_of_1024_rows get_first_row get_last_row
new / old (diff) 0.286248 / 0.018006 (0.268242) 0.578870 / 0.000490 (0.578380) 0.014968 / 0.000200 (0.014768) 0.000692 / 0.000054 (0.000637)

Benchmark: benchmark_indices_mapping.json

metric select shard shuffle sort train_test_split
new / old (diff) 0.028588 / 0.037411 (-0.008823) 0.114317 / 0.014526 (0.099791) 0.119468 / 0.176557 (-0.057088) 0.192022 / 0.737135 (-0.545114) 0.122646 / 0.296338 (-0.173692)

Benchmark: benchmark_iterating.json

metric read 5000 read 50000 read_batch 50000 10 read_batch 50000 100 read_batch 50000 1000 read_formatted numpy 5000 read_formatted pandas 5000 read_formatted tensorflow 5000 read_formatted torch 5000 read_formatted_batch numpy 5000 10 read_formatted_batch numpy 5000 1000 shuffled read 5000 shuffled read 50000 shuffled read_batch 50000 10 shuffled read_batch 50000 100 shuffled read_batch 50000 1000 shuffled read_formatted numpy 5000 shuffled read_formatted_batch numpy 5000 10 shuffled read_formatted_batch numpy 5000 1000
new / old (diff) 0.590114 / 0.215209 (0.374905) 5.302025 / 2.077655 (3.224370) 2.231253 / 1.504120 (0.727133) 2.056217 / 1.541195 (0.515022) 2.103325 / 1.468490 (0.634834) 0.836559 / 4.584777 (-3.748218) 5.086835 / 3.745712 (1.341123) 2.510750 / 5.269862 (-2.759111) 1.571609 / 4.565676 (-2.994067) 0.101849 / 0.424275 (-0.322426) 0.013127 / 0.007607 (0.005520) 0.710150 / 0.226044 (0.484105) 7.645609 / 2.268929 (5.376681) 3.334376 / 55.444624 (-52.110248) 2.658922 / 6.876477 (-4.217554) 2.701702 / 2.142072 (0.559630) 0.995303 / 4.805227 (-3.809925) 0.211654 / 6.500664 (-6.289010) 0.080419 / 0.075469 (0.004950)

Benchmark: benchmark_map_filter.json

metric filter map fast-tokenizer batched map identity map identity batched map no-op batched map no-op batched numpy map no-op batched pandas map no-op batched pytorch map no-op batched tensorflow
new / old (diff) 1.329726 / 1.841788 (-0.512062) 16.575982 / 8.074308 (8.501673) 19.893004 / 10.191392 (9.701612) 0.208938 / 0.680424 (-0.471486) 0.028698 / 0.534201 (-0.505503) 0.479845 / 0.579283 (-0.099438) 0.557845 / 0.434364 (0.123481) 0.539234 / 0.540337 (-0.001103) 0.688392 / 1.386936 (-0.698544)

PyArrow==latest

Show updated benchmarks!

Benchmark: benchmark_array_xd.json

metric read_batch_formatted_as_numpy after write_array2d read_batch_formatted_as_numpy after write_flattened_sequence read_batch_formatted_as_numpy after write_nested_sequence read_batch_unformated after write_array2d read_batch_unformated after write_flattened_sequence read_batch_unformated after write_nested_sequence read_col_formatted_as_numpy after write_array2d read_col_formatted_as_numpy after write_flattened_sequence read_col_formatted_as_numpy after write_nested_sequence read_col_unformated after write_array2d read_col_unformated after write_flattened_sequence read_col_unformated after write_nested_sequence read_formatted_as_numpy after write_array2d read_formatted_as_numpy after write_flattened_sequence read_formatted_as_numpy after write_nested_sequence read_unformated after write_array2d read_unformated after write_flattened_sequence read_unformated after write_nested_sequence write_array2d write_flattened_sequence write_nested_sequence
new / old (diff) 0.007829 / 0.011353 (-0.003524) 0.004374 / 0.011008 (-0.006634) 0.084843 / 0.038508 (0.046335) 0.038837 / 0.023109 (0.015728) 0.459015 / 0.275898 (0.183117) 0.518431 / 0.323480 (0.194952) 0.006146 / 0.007986 (-0.001839) 0.004288 / 0.004328 (-0.000041) 0.082256 / 0.004250 (0.078006) 0.068487 / 0.037052 (0.031435) 0.454099 / 0.258489 (0.195610) 0.535419 / 0.293841 (0.241578) 0.042136 / 0.128546 (-0.086410) 0.013005 / 0.075646 (-0.062642) 0.095646 / 0.419271 (-0.323625) 0.052133 / 0.043533 (0.008600) 0.434080 / 0.255139 (0.178941) 0.465170 / 0.283200 (0.181971) 0.031143 / 0.141683 (-0.110540) 1.579785 / 1.452155 (0.127630) 1.688110 / 1.492716 (0.195393)

Benchmark: benchmark_getitem_100B.json

metric get_batch_of_1024_random_rows get_batch_of_1024_rows get_first_row get_last_row
new / old (diff) 0.258421 / 0.018006 (0.240414) 0.559970 / 0.000490 (0.559480) 0.006079 / 0.000200 (0.005879) 0.000110 / 0.000054 (0.000055)

Benchmark: benchmark_indices_mapping.json

metric select shard shuffle sort train_test_split
new / old (diff) 0.030441 / 0.037411 (-0.006970) 0.121782 / 0.014526 (0.107256) 0.114306 / 0.176557 (-0.062250) 0.174604 / 0.737135 (-0.562531) 0.126961 / 0.296338 (-0.169378)

Benchmark: benchmark_iterating.json

metric read 5000 read 50000 read_batch 50000 10 read_batch 50000 100 read_batch 50000 1000 read_formatted numpy 5000 read_formatted pandas 5000 read_formatted tensorflow 5000 read_formatted torch 5000 read_formatted_batch numpy 5000 10 read_formatted_batch numpy 5000 1000 shuffled read 5000 shuffled read 50000 shuffled read_batch 50000 10 shuffled read_batch 50000 100 shuffled read_batch 50000 1000 shuffled read_formatted numpy 5000 shuffled read_formatted_batch numpy 5000 10 shuffled read_formatted_batch numpy 5000 1000
new / old (diff) 0.659709 / 0.215209 (0.444500) 6.317704 / 2.077655 (4.240049) 2.900124 / 1.504120 (1.396004) 2.659364 / 1.541195 (1.118169) 2.637738 / 1.468490 (1.169248) 0.842799 / 4.584777 (-3.741978) 5.155350 / 3.745712 (1.409638) 2.364958 / 5.269862 (-2.904904) 1.632895 / 4.565676 (-2.932781) 0.090667 / 0.424275 (-0.333608) 0.011931 / 0.007607 (0.004324) 0.672989 / 0.226044 (0.446945) 7.226345 / 2.268929 (4.957417) 3.452495 / 55.444624 (-51.992130) 2.793593 / 6.876477 (-4.082884) 2.863319 / 2.142072 (0.721247) 0.978634 / 4.805227 (-3.826594) 0.203592 / 6.500664 (-6.297072) 0.068043 / 0.075469 (-0.007426)

Benchmark: benchmark_map_filter.json

metric filter map fast-tokenizer batched map identity map identity batched map no-op batched map no-op batched numpy map no-op batched pandas map no-op batched pytorch map no-op batched tensorflow
new / old (diff) 1.604086 / 1.841788 (-0.237702) 17.654606 / 8.074308 (9.580298) 19.719442 / 10.191392 (9.528050) 0.235258 / 0.680424 (-0.445165) 0.031293 / 0.534201 (-0.502908) 0.478702 / 0.579283 (-0.100581) 0.621006 / 0.434364 (0.186642) 0.561770 / 0.540337 (0.021433) 0.657141 / 1.386936 (-0.729795)

Please sign in to comment.