Commit

Fix authentication issues (#6127)
* Fix hf_token fixture

* Do not store token but pass it explicitly

* Fix test with no token

* Fix style

* Test private load_dataset_builder and get_dataset_config_info

* Fix DownloadConfig to pass token to storage_options

* Set config HUB_DATASETS_HFFS_URL

* Use HUB_DATASETS_HFFS_URL in Audio/Image decode_example

* Pass download_config to create_builder_configs_from_metadata_configs (overall flow sketched after this list)
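Taken together, these changes make explicit token passing the single path for private-repo access. A minimal sketch of the resulting user-facing flow, with a hypothetical repo name and token value:

from datasets import load_dataset

# The token is passed explicitly; nothing is read from or written to HfFolder.
ds = load_dataset("user/private_dataset", streaming=True, token="hf_xxx")
print(next(iter(ds["train"])))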
albertvillanova committed Aug 8, 2023
1 parent 33f736e commit 3648752
Showing 9 changed files with 43 additions and 17 deletions.
1 change: 1 addition & 0 deletions src/datasets/config.py
@@ -24,6 +24,7 @@
# Hub
HF_ENDPOINT = os.environ.get("HF_ENDPOINT", "https://huggingface.co")
HUB_DATASETS_URL = HF_ENDPOINT + "/datasets/{repo_id}/resolve/{revision}/{path}"
HUB_DATASETS_HFFS_URL = "hf://datasets/{repo_id}@{revision}/{path}"
HUB_DEFAULT_VERSION = "main"

PY_VERSION = version.parse(platform.python_version())
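For orientation, the two templates point at the same repo file through different protocols; a quick sketch with hypothetical repo_id/revision/path values (with the default HF_ENDPOINT, HUB_DATASETS_URL expands to the https form shown):

repo = {"repo_id": "user/repo", "revision": "main", "path": "data/train.csv"}  # hypothetical
print("https://huggingface.co/datasets/{repo_id}/resolve/{revision}/{path}".format(**repo))
# -> https://huggingface.co/datasets/user/repo/resolve/main/data/train.csv
print("hf://datasets/{repo_id}@{revision}/{path}".format(**repo))
# -> hf://datasets/user/repo@main/data/train.csv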
8 changes: 8 additions & 0 deletions src/datasets/download/download_config.py
@@ -92,3 +92,11 @@ def __post_init__(self, use_auth_token):

def copy(self) -> "DownloadConfig":
return self.__class__(**{k: copy.deepcopy(v) for k, v in self.__dict__.items()})

def __setattr__(self, name, value):
if name == "token" and getattr(self, "storage_options", None) is not None:
if "hf" not in self.storage_options:
self.storage_options["hf"] = {"token": value, "endpoint": config.HF_ENDPOINT}
elif getattr(self.storage_options["hf"], "token", None) is None:
self.storage_options["hf"]["token"] = value
super().__setattr__(name, value)
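This override mirrors a token assigned after construction into the "hf" entry of storage_options, so a filesystem built from those options authenticates as well. A minimal sketch of the behavior, assuming the default HF_ENDPOINT and a hypothetical token value:

from datasets import DownloadConfig

dc = DownloadConfig(storage_options={})
dc.token = "hf_xxx"  # hypothetical; routed through __setattr__ above
assert dc.storage_options["hf"]["token"] == "hf_xxx"
assert dc.storage_options["hf"]["endpoint"] == "https://huggingface.co"  # config.HF_ENDPOINT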
5 changes: 4 additions & 1 deletion src/datasets/features/audio.py
@@ -173,8 +173,11 @@ def decode_example(
if file is None:
token_per_repo_id = token_per_repo_id or {}
source_url = path.split("::")[-1]
pattern = (
config.HUB_DATASETS_URL if source_url.startswith(config.HF_ENDPOINT) else config.HUB_DATASETS_HFFS_URL
)
try:
repo_id = string_to_dict(source_url, config.HUB_DATASETS_URL)["repo_id"]
repo_id = string_to_dict(source_url, pattern)["repo_id"]
token = token_per_repo_id[repo_id]
except (ValueError, KeyError):
token = None
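The same pattern selection is applied in both Audio.decode_example here and Image.decode_example below: streamed paths can now be hf:// URLs, so repo_id extraction needs the HFFS template. A minimal sketch with a hypothetical URL and token, assuming string_to_dict from datasets.utils.py_utils:

from datasets import config
from datasets.utils.py_utils import string_to_dict

token_per_repo_id = {"user/private_dataset": "hf_xxx"}  # hypothetical per-repo tokens
source_url = "hf://datasets/user/private_dataset@main/audio/sample.wav"
pattern = (
    config.HUB_DATASETS_URL if source_url.startswith(config.HF_ENDPOINT) else config.HUB_DATASETS_HFFS_URL
)
try:
    repo_id = string_to_dict(source_url, pattern)["repo_id"]  # expected: "user/private_dataset"
    token = token_per_repo_id[repo_id]
except (ValueError, KeyError):
    token = None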
7 changes: 6 additions & 1 deletion src/datasets/features/image.py
@@ -166,8 +166,13 @@ def decode_example(self, value: dict, token_per_repo_id=None) -> "PIL.Image.Image":
image = PIL.Image.open(path)
else:
source_url = path.split("::")[-1]
pattern = (
config.HUB_DATASETS_URL
if source_url.startswith(config.HF_ENDPOINT)
else config.HUB_DATASETS_HFFS_URL
)
try:
repo_id = string_to_dict(source_url, config.HUB_DATASETS_URL)["repo_id"]
repo_id = string_to_dict(source_url, pattern)["repo_id"]
token = token_per_repo_id.get(repo_id)
except ValueError:
token = None
3 changes: 3 additions & 0 deletions src/datasets/load.py
@@ -539,6 +539,7 @@ def create_builder_configs_from_metadata_configs(
base_path: Optional[str] = None,
default_builder_kwargs: Dict[str, Any] = None,
allowed_extensions: Optional[List[str]] = None,
download_config: Optional[DownloadConfig] = None,
) -> Tuple[List[BuilderConfig], str]:
builder_cls = import_main_class(module_path)
builder_config_cls = builder_cls.BUILDER_CONFIG_CLASS
@@ -560,6 +561,7 @@
config_patterns,
base_path=config_base_path,
allowed_extensions=ALL_ALLOWED_EXTENSIONS,
download_config=download_config,
)
except EmptyDatasetError as e:
raise EmptyDatasetError(
@@ -1070,6 +1072,7 @@ def get_module(self) -> DatasetModule:
base_path=base_path,
supports_metadata=supports_metadata,
default_builder_kwargs=default_builder_kwargs,
download_config=self.download_config,
)
else:
builder_configs, default_config_name = None, None
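Threading download_config through here is what lets the data files of a private no-script dataset be resolved with the user's token; the user-facing call is simply (hypothetical repo and token):

from datasets import load_dataset_builder

builder = load_dataset_builder("user/private_dataset", token="hf_xxx")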
6 changes: 1 addition & 5 deletions tests/fixtures/hub.py
@@ -48,12 +48,8 @@ def hf_api():


@pytest.fixture(scope="session")
def hf_token(hf_api: HfApi):
previous_token = HfFolder.get_token()
HfFolder.save_token(CI_HUB_USER_TOKEN)
def hf_token():
yield CI_HUB_USER_TOKEN
if previous_token is not None:
HfFolder.save_token(previous_token)


@pytest.fixture
5 changes: 5 additions & 0 deletions tests/test_inspect.py
@@ -47,6 +47,11 @@ def test_get_dataset_config_info(path, config_name, expected_splits):
assert list(info.splits.keys()) == expected_splits


def test_get_dataset_config_info_private(hf_token, hf_private_dataset_repo_txt_data):
info = get_dataset_config_info(hf_private_dataset_repo_txt_data, config_name="default", token=hf_token)
assert list(info.splits.keys()) == ["train"]


@pytest.mark.parametrize(
"path, config_name, expected_exception",
[
19 changes: 11 additions & 8 deletions tests/test_load.py
@@ -38,6 +38,7 @@
PackagedDatasetModuleFactory,
infer_module_for_data_files_list,
infer_module_for_data_files_list_in_archives,
load_dataset_builder,
)
from datasets.packaged_modules.audiofolder.audiofolder import AudioFolder, AudioFolderConfig
from datasets.packaged_modules.imagefolder.imagefolder import ImageFolder, ImageFolderConfig
@@ -1223,13 +1224,19 @@ def assert_auth(method, url, *args, headers, **kwargs):

@pytest.mark.integration
def test_load_streaming_private_dataset(hf_token, hf_private_dataset_repo_txt_data):
ds = load_dataset(hf_private_dataset_repo_txt_data, streaming=True)
ds = load_dataset(hf_private_dataset_repo_txt_data, streaming=True, token=hf_token)
assert next(iter(ds)) is not None


@pytest.mark.integration
def test_load_dataset_builder_private_dataset(hf_token, hf_private_dataset_repo_txt_data):
builder = load_dataset_builder(hf_private_dataset_repo_txt_data, token=hf_token)
assert isinstance(builder, DatasetBuilder)


@pytest.mark.integration
def test_load_streaming_private_dataset_with_zipped_data(hf_token, hf_private_dataset_repo_zipped_txt_data):
ds = load_dataset(hf_private_dataset_repo_zipped_txt_data, streaming=True)
ds = load_dataset(hf_private_dataset_repo_zipped_txt_data, streaming=True, token=hf_token)
assert next(iter(ds)) is not None


@@ -1309,13 +1316,9 @@ def test_load_hub_dataset_without_script_with_metadata_config_in_parallel():

@require_pil
@pytest.mark.integration
@pytest.mark.parametrize("implicit_token", [True])
@pytest.mark.parametrize("streaming", [True])
def test_load_dataset_private_zipped_images(
hf_private_dataset_repo_zipped_img_data, hf_token, streaming, implicit_token
):
token = None if implicit_token else hf_token
ds = load_dataset(hf_private_dataset_repo_zipped_img_data, split="train", streaming=streaming, token=token)
def test_load_dataset_private_zipped_images(hf_private_dataset_repo_zipped_img_data, hf_token, streaming):
ds = load_dataset(hf_private_dataset_repo_zipped_img_data, split="train", streaming=streaming, token=hf_token)
assert isinstance(ds, IterableDataset if streaming else Dataset)
ds_items = list(ds)
assert len(ds_items) == 2
6 changes: 4 additions & 2 deletions tests/test_upstream_hub.py
@@ -33,12 +33,12 @@


@for_all_test_methods(xfail_if_500_502_http_error)
@pytest.mark.usefixtures("set_ci_hub_access_token", "ci_hfh_hf_hub_url")
@pytest.mark.usefixtures("ci_hub_config", "ci_hfh_hf_hub_url")
class TestPushToHub:
_api = HfApi(endpoint=CI_HUB_ENDPOINT)
_token = CI_HUB_USER_TOKEN

def test_push_dataset_dict_to_hub_no_token(self, temporary_repo):
def test_push_dataset_dict_to_hub_no_token(self, temporary_repo, set_ci_hub_access_token):
ds = Dataset.from_dict({"x": [1, 2, 3], "y": [4, 5, 6]})

local_ds = DatasetDict({"train": ds})
@@ -778,6 +778,7 @@ def test_push_dataset_to_hub_with_config_no_metadata_configs(self, temporary_repo):
path_in_repo="data/train-00000-of-00001.parquet",
repo_id=ds_name,
repo_type="dataset",
token=self._token,
)
ds_another_config.push_to_hub(ds_name, "another_config", token=self._token)
ds_builder = load_dataset_builder(ds_name, download_mode="force_redownload")
@@ -811,6 +812,7 @@ def test_push_dataset_dict_to_hub_with_config_no_metadata_configs(self, temporary_repo):
path_in_repo="data/random-00000-of-00001.parquet",
repo_id=ds_name,
repo_type="dataset",
token=self._token,
)
local_ds_another_config.push_to_hub(ds_name, "another_config", token=self._token)
ds_builder = load_dataset_builder(ds_name, download_mode="force_redownload")
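Since the CI token is no longer saved via HfFolder, direct HfApi calls in these tests now authenticate per call; a minimal sketch (endpoint, repo, and token are hypothetical):

from huggingface_hub import HfApi

api = HfApi(endpoint="https://hub-ci.huggingface.co")
api.upload_file(
    path_or_fileobj=b"some parquet bytes",  # bytes or a file path both work
    path_in_repo="data/train-00000-of-00001.parquet",
    repo_id="user/some_dataset",
    repo_type="dataset",
    token="hf_xxx",  # explicit, instead of a token stored on disk
)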

1 comment on commit 3648752

@github-actions

PyArrow==8.0.0


Benchmark: benchmark_array_xd.json

metric: new / old (diff)
read_batch_formatted_as_numpy after write_array2d: 0.009041 / 0.011353 (-0.002312)
read_batch_formatted_as_numpy after write_flattened_sequence: 0.005613 / 0.011008 (-0.005395)
read_batch_formatted_as_numpy after write_nested_sequence: 0.106383 / 0.038508 (0.067875)
read_batch_unformated after write_array2d: 0.081061 / 0.023109 (0.057952)
read_batch_unformated after write_flattened_sequence: 0.376688 / 0.275898 (0.100790)
read_batch_unformated after write_nested_sequence: 0.424798 / 0.323480 (0.101318)
read_col_formatted_as_numpy after write_array2d: 0.006139 / 0.007986 (-0.001846)
read_col_formatted_as_numpy after write_flattened_sequence: 0.005255 / 0.004328 (0.000927)
read_col_formatted_as_numpy after write_nested_sequence: 0.080095 / 0.004250 (0.075845)
read_col_unformated after write_array2d: 0.062810 / 0.037052 (0.025757)
read_col_unformated after write_flattened_sequence: 0.347418 / 0.258489 (0.088928)
read_col_unformated after write_nested_sequence: 0.446638 / 0.293841 (0.152797)
read_formatted_as_numpy after write_array2d: 0.049621 / 0.128546 (-0.078925)
read_formatted_as_numpy after write_flattened_sequence: 0.014292 / 0.075646 (-0.061355)
read_formatted_as_numpy after write_nested_sequence: 0.423418 / 0.419271 (0.004146)
read_unformated after write_array2d: 0.067224 / 0.043533 (0.023691)
read_unformated after write_flattened_sequence: 0.331408 / 0.255139 (0.076269)
read_unformated after write_nested_sequence: 0.466297 / 0.283200 (0.183098)
write_array2d: 0.036476 / 0.141683 (-0.105207)
write_flattened_sequence: 1.510923 / 1.452155 (0.058768)
write_nested_sequence: 1.958414 / 1.492716 (0.465698)

Benchmark: benchmark_getitem_100B.json

metric: new / old (diff)
get_batch_of_1024_random_rows: 0.268225 / 0.018006 (0.250219)
get_batch_of_1024_rows: 0.608598 / 0.000490 (0.608108)
get_first_row: 0.004762 / 0.000200 (0.004562)
get_last_row: 0.000109 / 0.000054 (0.000054)

Benchmark: benchmark_indices_mapping.json

metric: new / old (diff)
select: 0.024859 / 0.037411 (-0.012553)
shard: 0.082691 / 0.014526 (0.068165)
shuffle: 0.100662 / 0.176557 (-0.075895)
sort: 0.156559 / 0.737135 (-0.580576)
train_test_split: 0.098756 / 0.296338 (-0.197582)

Benchmark: benchmark_iterating.json

metric: new / old (diff)
read 5000: 0.515386 / 0.215209 (0.300177)
read 50000: 5.340352 / 2.077655 (3.262698)
read_batch 50000 10: 2.274894 / 1.504120 (0.770774)
read_batch 50000 100: 2.000514 / 1.541195 (0.459319)
read_batch 50000 1000: 2.087043 / 1.468490 (0.618553)
read_formatted numpy 5000: 0.770453 / 4.584777 (-3.814324)
read_formatted pandas 5000: 5.084419 / 3.745712 (1.338706)
read_formatted tensorflow 5000: 4.370562 / 5.269862 (-0.899299)
read_formatted torch 5000: 2.724469 / 4.565676 (-1.841208)
read_formatted_batch numpy 5000 10: 0.096262 / 0.424275 (-0.328013)
read_formatted_batch numpy 5000 1000: 0.010212 / 0.007607 (0.002605)
shuffled read 5000: 0.689586 / 0.226044 (0.463542)
shuffled read 50000: 6.592824 / 2.268929 (4.323896)
shuffled read_batch 50000 10: 3.133477 / 55.444624 (-52.311147)
shuffled read_batch 50000 100: 2.443015 / 6.876477 (-4.433462)
shuffled read_batch 50000 1000: 2.541050 / 2.142072 (0.398978)
shuffled read_formatted numpy 5000: 0.997895 / 4.805227 (-3.807332)
shuffled read_formatted_batch numpy 5000 10: 0.195846 / 6.500664 (-6.304818)
shuffled read_formatted_batch numpy 5000 1000: 0.063015 / 0.075469 (-0.012454)

Benchmark: benchmark_map_filter.json

metric: new / old (diff)
filter: 1.454647 / 1.841788 (-0.387141)
map fast-tokenizer batched: 22.186846 / 8.074308 (14.112538)
map identity: 19.935059 / 10.191392 (9.743667)
map identity batched: 0.239253 / 0.680424 (-0.441171)
map no-op batched: 0.031393 / 0.534201 (-0.502808)
map no-op batched numpy: 0.430039 / 0.579283 (-0.149244)
map no-op batched pandas: 0.575082 / 0.434364 (0.140718)
map no-op batched pytorch: 0.481053 / 0.540337 (-0.059285)
map no-op batched tensorflow: 0.685276 / 1.386936 (-0.701660)
PyArrow==latest

Benchmark: benchmark_array_xd.json

metric: new / old (diff)
read_batch_formatted_as_numpy after write_array2d: 0.008766 / 0.011353 (-0.002587)
read_batch_formatted_as_numpy after write_flattened_sequence: 0.005890 / 0.011008 (-0.005118)
read_batch_formatted_as_numpy after write_nested_sequence: 0.066816 / 0.038508 (0.028308)
read_batch_unformated after write_array2d: 0.065145 / 0.023109 (0.042035)
read_batch_unformated after write_flattened_sequence: 0.421891 / 0.275898 (0.145993)
read_batch_unformated after write_nested_sequence: 0.469148 / 0.323480 (0.145668)
read_col_formatted_as_numpy after write_array2d: 0.006317 / 0.007986 (-0.001669)
read_col_formatted_as_numpy after write_flattened_sequence: 0.003802 / 0.004328 (-0.000526)
read_col_formatted_as_numpy after write_nested_sequence: 0.073717 / 0.004250 (0.069466)
read_col_unformated after write_array2d: 0.054283 / 0.037052 (0.017231)
read_col_unformated after write_flattened_sequence: 0.435984 / 0.258489 (0.177495)
read_col_unformated after write_nested_sequence: 0.450415 / 0.293841 (0.156574)
read_formatted_as_numpy after write_array2d: 0.042501 / 0.128546 (-0.086045)
read_formatted_as_numpy after write_flattened_sequence: 0.013831 / 0.075646 (-0.061815)
read_formatted_as_numpy after write_nested_sequence: 0.081742 / 0.419271 (-0.337529)
read_unformated after write_array2d: 0.051874 / 0.043533 (0.008342)
read_unformated after write_flattened_sequence: 0.452484 / 0.255139 (0.197345)
read_unformated after write_nested_sequence: 0.457015 / 0.283200 (0.173816)
write_array2d: 0.033152 / 0.141683 (-0.108530)
write_flattened_sequence: 1.584332 / 1.452155 (0.132177)
write_nested_sequence: 1.666457 / 1.492716 (0.173741)

Benchmark: benchmark_getitem_100B.json

metric: new / old (diff)
get_batch_of_1024_random_rows: 0.368046 / 0.018006 (0.350040)
get_batch_of_1024_rows: 0.580072 / 0.000490 (0.579583)
get_first_row: 0.046921 / 0.000200 (0.046721)
get_last_row: 0.000362 / 0.000054 (0.000307)

Benchmark: benchmark_indices_mapping.json

metric: new / old (diff)
select: 0.028610 / 0.037411 (-0.008801)
shard: 0.086941 / 0.014526 (0.072415)
shuffle: 0.105326 / 0.176557 (-0.071231)
sort: 0.148655 / 0.737135 (-0.588481)
train_test_split: 0.106945 / 0.296338 (-0.189393)

Benchmark: benchmark_iterating.json

metric: new / old (diff)
read 5000: 0.571672 / 0.215209 (0.356463)
read 50000: 5.931025 / 2.077655 (3.853370)
read_batch 50000 10: 2.937442 / 1.504120 (1.433323)
read_batch 50000 100: 2.600986 / 1.541195 (1.059791)
read_batch 50000 1000: 2.751132 / 1.468490 (1.282642)
read_formatted numpy 5000: 0.843249 / 4.584777 (-3.741528)
read_formatted pandas 5000: 5.203854 / 3.745712 (1.458142)
read_formatted tensorflow 5000: 4.727529 / 5.269862 (-0.542333)
read_formatted torch 5000: 2.949137 / 4.565676 (-1.616539)
read_formatted_batch numpy 5000 10: 0.099321 / 0.424275 (-0.324954)
read_formatted_batch numpy 5000 1000: 0.009123 / 0.007607 (0.001516)
shuffled read 5000: 0.760042 / 0.226044 (0.533997)
shuffled read 50000: 7.317381 / 2.268929 (5.048452)
shuffled read_batch 50000 10: 3.574934 / 55.444624 (-51.869690)
shuffled read_batch 50000 100: 2.932134 / 6.876477 (-3.944343)
shuffled read_batch 50000 1000: 3.088345 / 2.142072 (0.946273)
shuffled read_formatted numpy 5000: 0.924011 / 4.805227 (-3.881216)
shuffled read_formatted_batch numpy 5000 10: 0.206834 / 6.500664 (-6.293830)
shuffled read_formatted_batch numpy 5000 1000: 0.074735 / 0.075469 (-0.000734)

Benchmark: benchmark_map_filter.json

metric: new / old (diff)
filter: 1.580871 / 1.841788 (-0.260916)
map fast-tokenizer batched: 22.277535 / 8.074308 (14.203227)
map identity: 18.299240 / 10.191392 (8.107848)
map identity batched: 0.211985 / 0.680424 (-0.468439)
map no-op batched: 0.031200 / 0.534201 (-0.503001)
map no-op batched numpy: 0.426379 / 0.579283 (-0.152905)
map no-op batched pandas: 0.546094 / 0.434364 (0.111730)
map no-op batched pytorch: 0.519339 / 0.540337 (-0.020999)
map no-op batched tensorflow: 0.681742 / 1.386936 (-0.705194)
