Skip to content

Commit

Permalink
Fix unused arguments (#1962)
Browse files Browse the repository at this point in the history
* Fix unused arguments

* Re-add the deep arg to PandasArrayExtensionArray.copy

Co-authored-by: Quentin Lhoest <[email protected]>
  • Loading branch information
mariosasko and lhoestq committed Mar 3, 2021
1 parent 96acc04 commit a100f35
Show file tree
Hide file tree
Showing 5 changed files with 6 additions and 5 deletions.
2 changes: 2 additions & 0 deletions src/datasets/arrow_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -1536,6 +1536,7 @@ def apply_function_on_filtered_inputs(inputs, indices, check_same_num_examples=F
writer_batch_size=writer_batch_size,
update_features=update_features,
fingerprint=new_fingerprint,
+ disable_nullable=disable_nullable,
)
else:
buf_writer = None
Expand All @@ -1547,6 +1548,7 @@ def apply_function_on_filtered_inputs(inputs, indices, check_same_num_examples=F
writer_batch_size=writer_batch_size,
update_features=update_features,
fingerprint=new_fingerprint,
+ disable_nullable=disable_nullable,
)

try:
Expand Down
2 changes: 1 addition & 1 deletion src/datasets/arrow_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -274,7 +274,7 @@ def write_batch(
typed_sequence = TypedSequence(batch_examples[col], type=col_type, try_type=col_try_type)
typed_sequence_examples[col] = typed_sequence
pa_table = pa.Table.from_pydict(typed_sequence_examples)
- self.write_table(pa_table)
+ self.write_table(pa_table, writer_batch_size)

def write_table(self, pa_table: pa.Table, writer_batch_size: Optional[int] = None):
"""Write a batch of Example to file.
Expand Down
2 changes: 1 addition & 1 deletion src/datasets/inspect.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def list_datasets(with_community_datasets=True, with_details=False):
return api.dataset_list(with_community_datasets=with_community_datasets, id_only=bool(not with_details))


- def list_metrics(with_community_metrics=True, id_only=False, with_details=False):
+ def list_metrics(with_community_metrics=True, with_details=False):
"""List all the metrics script available on HuggingFace AWS bucket
Args:
Expand Down
1 change: 0 additions & 1 deletion src/datasets/metric.py
Original file line number Diff line number Diff line change
Expand Up @@ -509,7 +509,6 @@ def download_and_prepare(
self,
download_config: Optional[DownloadConfig] = None,
dl_manager: Optional[DownloadManager] = None,
- **download_and_prepare_kwargs,
):
"""Downloads and prepares dataset for reading.
Expand Down
4 changes: 2 additions & 2 deletions src/datasets/utils/file_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -449,7 +449,7 @@ def ftp_head(url, timeout=10.0):
return True


- def ftp_get(url, temp_file, proxies=None, resume_size=0, headers=None, cookies=None, timeout=10.0):
+ def ftp_get(url, temp_file, timeout=10.0):
_raise_if_offline_mode_is_enabled(f"Tried to reach {url}")
try:
logger.info(f"Getting through FTP {url} into {temp_file.name}")
Expand Down Expand Up @@ -651,7 +651,7 @@ def _resumable_file_manager():

# GET file object
if url.startswith("ftp://"):
- ftp_get(url, temp_file, proxies=proxies, resume_size=resume_size, headers=headers, cookies=cookies)
+ ftp_get(url, temp_file)
else:
http_get(
url,
Expand Down

1 comment on commit a100f35

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Show benchmarks

PyArrow==0.17.1

Show updated benchmarks!

Benchmark: benchmark_array_xd.json

metric read_batch_formatted_as_numpy after write_array2d read_batch_formatted_as_numpy after write_flattened_sequence read_batch_formatted_as_numpy after write_nested_sequence read_batch_unformated after write_array2d read_batch_unformated after write_flattened_sequence read_batch_unformated after write_nested_sequence read_col_formatted_as_numpy after write_array2d read_col_formatted_as_numpy after write_flattened_sequence read_col_formatted_as_numpy after write_nested_sequence read_col_unformated after write_array2d read_col_unformated after write_flattened_sequence read_col_unformated after write_nested_sequence read_formatted_as_numpy after write_array2d read_formatted_as_numpy after write_flattened_sequence read_formatted_as_numpy after write_nested_sequence read_unformated after write_array2d read_unformated after write_flattened_sequence read_unformated after write_nested_sequence write_array2d write_flattened_sequence write_nested_sequence
new / old (diff) 0.019872 / 0.011353 (0.008519) 0.016624 / 0.011008 (0.005615) 0.047800 / 0.038508 (0.009292) 0.035245 / 0.023109 (0.012135) 0.217864 / 0.275898 (-0.058034) 0.247528 / 0.323480 (-0.075952) 0.006319 / 0.007986 (-0.001667) 0.006073 / 0.004328 (0.001744) 0.007532 / 0.004250 (0.003282) 0.050059 / 0.037052 (0.013007) 0.222679 / 0.258489 (-0.035811) 0.261685 / 0.293841 (-0.032156) 0.168587 / 0.128546 (0.040041) 0.136470 / 0.075646 (0.060824) 0.456729 / 0.419271 (0.037457) 0.480255 / 0.043533 (0.436722) 0.224077 / 0.255139 (-0.031062) 0.237716 / 0.283200 (-0.045483) 1.856013 / 0.141683 (1.714330) 1.909542 / 1.452155 (0.457387) 1.980087 / 1.492716 (0.487371)

Benchmark: benchmark_indices_mapping.json

metric select shard shuffle sort train_test_split
new / old (diff) 0.041936 / 0.037411 (0.004524) 0.022424 / 0.014526 (0.007898) 0.038423 / 0.176557 (-0.138134) 0.049038 / 0.737135 (-0.688097) 0.049436 / 0.296338 (-0.246902)

Benchmark: benchmark_iterating.json

metric read 5000 read 50000 read_batch 50000 10 read_batch 50000 100 read_batch 50000 1000 read_formatted numpy 5000 read_formatted pandas 5000 read_formatted tensorflow 5000 read_formatted torch 5000 read_formatted_batch numpy 5000 10 read_formatted_batch numpy 5000 1000 shuffled read 5000 shuffled read 50000 shuffled read_batch 50000 10 shuffled read_batch 50000 100 shuffled read_batch 50000 1000 shuffled read_formatted numpy 5000 shuffled read_formatted_batch numpy 5000 10 shuffled read_formatted_batch numpy 5000 1000
new / old (diff) 0.283923 / 0.215209 (0.068714) 2.819780 / 2.077655 (0.742125) 1.397362 / 1.504120 (-0.106758) 1.207771 / 1.541195 (-0.333424) 1.219503 / 1.468490 (-0.248987) 7.190491 / 4.584777 (2.605714) 6.477028 / 3.745712 (2.731316) 8.813609 / 5.269862 (3.543748) 7.958696 / 4.565676 (3.393019) 0.702504 / 0.424275 (0.278229) 0.010962 / 0.007607 (0.003354) 0.312236 / 0.226044 (0.086191) 3.367275 / 2.268929 (1.098347) 1.995155 / 55.444624 (-53.449469) 1.653377 / 6.876477 (-5.223099) 1.651327 / 2.142072 (-0.490745) 7.292546 / 4.805227 (2.487319) 6.761345 / 6.500664 (0.260681) 8.962133 / 0.075469 (8.886664)

Benchmark: benchmark_map_filter.json

metric filter map fast-tokenizer batched map identity map identity batched map no-op batched map no-op batched numpy map no-op batched pandas map no-op batched pytorch map no-op batched tensorflow
new / old (diff) 11.837016 / 1.841788 (9.995229) 16.174446 / 8.074308 (8.100137) 23.134736 / 10.191392 (12.943344) 0.510678 / 0.680424 (-0.169746) 0.315710 / 0.534201 (-0.218491) 0.858264 / 0.579283 (0.278981) 0.671603 / 0.434364 (0.237240) 0.757416 / 0.540337 (0.217078) 1.679489 / 1.386936 (0.292553)
PyArrow==1.0
Show updated benchmarks!

Benchmark: benchmark_array_xd.json

metric read_batch_formatted_as_numpy after write_array2d read_batch_formatted_as_numpy after write_flattened_sequence read_batch_formatted_as_numpy after write_nested_sequence read_batch_unformated after write_array2d read_batch_unformated after write_flattened_sequence read_batch_unformated after write_nested_sequence read_col_formatted_as_numpy after write_array2d read_col_formatted_as_numpy after write_flattened_sequence read_col_formatted_as_numpy after write_nested_sequence read_col_unformated after write_array2d read_col_unformated after write_flattened_sequence read_col_unformated after write_nested_sequence read_formatted_as_numpy after write_array2d read_formatted_as_numpy after write_flattened_sequence read_formatted_as_numpy after write_nested_sequence read_unformated after write_array2d read_unformated after write_flattened_sequence read_unformated after write_nested_sequence write_array2d write_flattened_sequence write_nested_sequence
new / old (diff) 0.019828 / 0.011353 (0.008475) 0.016383 / 0.011008 (0.005374) 0.047436 / 0.038508 (0.008928) 0.035268 / 0.023109 (0.012159) 0.381419 / 0.275898 (0.105521) 0.424639 / 0.323480 (0.101159) 0.008116 / 0.007986 (0.000130) 0.006728 / 0.004328 (0.002400) 0.007190 / 0.004250 (0.002939) 0.062676 / 0.037052 (0.025624) 0.381237 / 0.258489 (0.122748) 0.417837 / 0.293841 (0.123996) 0.170198 / 0.128546 (0.041652) 0.130980 / 0.075646 (0.055333) 0.440784 / 0.419271 (0.021513) 0.484100 / 0.043533 (0.440567) 0.389773 / 0.255139 (0.134634) 0.407516 / 0.283200 (0.124317) 1.888453 / 0.141683 (1.746770) 1.955385 / 1.452155 (0.503231) 2.030300 / 1.492716 (0.537583)

Benchmark: benchmark_indices_mapping.json

metric select shard shuffle sort train_test_split
new / old (diff) 0.046269 / 0.037411 (0.008858) 0.022811 / 0.014526 (0.008286) 0.041173 / 0.176557 (-0.135384) 0.048886 / 0.737135 (-0.688250) 0.050240 / 0.296338 (-0.246098)

Benchmark: benchmark_iterating.json

metric read 5000 read 50000 read_batch 50000 10 read_batch 50000 100 read_batch 50000 1000 read_formatted numpy 5000 read_formatted pandas 5000 read_formatted tensorflow 5000 read_formatted torch 5000 read_formatted_batch numpy 5000 10 read_formatted_batch numpy 5000 1000 shuffled read 5000 shuffled read 50000 shuffled read_batch 50000 10 shuffled read_batch 50000 100 shuffled read_batch 50000 1000 shuffled read_formatted numpy 5000 shuffled read_formatted_batch numpy 5000 10 shuffled read_formatted_batch numpy 5000 1000
new / old (diff) 0.335936 / 0.215209 (0.120727) 3.361144 / 2.077655 (1.283489) 2.097649 / 1.504120 (0.593530) 1.905815 / 1.541195 (0.364620) 1.908957 / 1.468490 (0.440467) 7.193630 / 4.584777 (2.608853) 6.271213 / 3.745712 (2.525501) 8.868508 / 5.269862 (3.598647) 7.824582 / 4.565676 (3.258905) 0.730293 / 0.424275 (0.306018) 0.010955 / 0.007607 (0.003348) 0.366918 / 0.226044 (0.140873) 3.771872 / 2.268929 (1.502943) 2.503461 / 55.444624 (-52.941163) 2.224772 / 6.876477 (-4.651704) 2.273505 / 2.142072 (0.131432) 7.157116 / 4.805227 (2.351889) 7.126146 / 6.500664 (0.625482) 8.435586 / 0.075469 (8.360117)

Benchmark: benchmark_map_filter.json

metric filter map fast-tokenizer batched map identity map identity batched map no-op batched map no-op batched numpy map no-op batched pandas map no-op batched pytorch map no-op batched tensorflow
new / old (diff) 12.083722 / 1.841788 (10.241935) 16.956898 / 8.074308 (8.882590) 23.699050 / 10.191392 (13.507658) 1.170141 / 0.680424 (0.489717) 0.618735 / 0.534201 (0.084535) 0.811199 / 0.579283 (0.231915) 0.628919 / 0.434364 (0.194555) 0.723326 / 0.540337 (0.182988) 1.638509 / 1.386936 (0.251573)

CML watermark

Please sign in to comment.