From ba9fd295f84562a941ca18a3d05ebee248d4e59e Mon Sep 17 00:00:00 2001 From: cl <44345856+chrislee973@users.noreply.github.com> Date: Sat, 25 May 2024 05:38:49 +0000 Subject: [PATCH 1/3] use the ds hash as the dataset's config_name --- src/axolotl/utils/data/sft.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/axolotl/utils/data/sft.py b/src/axolotl/utils/data/sft.py index dbc4172b4..e861a0adf 100644 --- a/src/axolotl/utils/data/sft.py +++ b/src/axolotl/utils/data/sft.py @@ -161,7 +161,8 @@ def load_tokenized_prepared_datasets( try: if cfg.push_dataset_to_hub: dataset = load_dataset( - f"{cfg.push_dataset_to_hub}/{ds_hash}", + cfg.push_dataset_to_hub, + ds_hash, token=use_auth_token, ) dataset = dataset[split] @@ -424,10 +425,12 @@ def for_d_in_datasets(dataset_configs): dataset.save_to_disk(str(prepared_ds_path)) if cfg.push_dataset_to_hub: LOG.info( - f"Saving merged prepared dataset with push_to_hub... {cfg.push_dataset_to_hub}/{ds_hash}" + f"Saving merged prepared dataset with push_to_hub... {cfg.push_dataset_to_hub}" ) dataset.push_to_hub( - f"{cfg.push_dataset_to_hub}/{ds_hash}", private=True + cfg.push_dataset_to_hub, + ds_hash, + private=True, ) return dataset, prompters From d78566da0865bf84673d87ddb592ecedeb6aefdd Mon Sep 17 00:00:00 2001 From: cl <44345856+chrislee973@users.noreply.github.com> Date: Sat, 25 May 2024 05:57:34 +0000 Subject: [PATCH 2/3] improve logging for loading/pushing ds to hub --- src/axolotl/utils/data/sft.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/axolotl/utils/data/sft.py b/src/axolotl/utils/data/sft.py index e861a0adf..86a9a0c65 100644 --- a/src/axolotl/utils/data/sft.py +++ b/src/axolotl/utils/data/sft.py @@ -160,6 +160,9 @@ def load_tokenized_prepared_datasets( use_auth_token = cfg.hf_use_auth_token try: if cfg.push_dataset_to_hub: + LOG.info( + f"Attempting to load prepared dataset from Huggingface hub at {cfg.push_dataset_to_hub} with configuration name {ds_hash}..." + ) dataset = load_dataset( cfg.push_dataset_to_hub, ds_hash, @@ -181,7 +184,9 @@ def load_tokenized_prepared_datasets( dataset = load_from_disk(str(prepared_ds_path)) LOG.info("Prepared dataset loaded from disk...") else: - LOG.info(f"Unable to find prepared dataset in {prepared_ds_path}") + if cfg.push_dataset_to_hub: + LOG.info("Unable to find prepared dataset in Huggingface hub") + LOG.info("Unable to find prepared dataset in {prepared_ds_path}") LOG.info("Loading raw datasets...") if not cfg.is_preprocess: LOG.warning( @@ -425,7 +430,7 @@ def for_d_in_datasets(dataset_configs): dataset.save_to_disk(str(prepared_ds_path)) if cfg.push_dataset_to_hub: LOG.info( - f"Saving merged prepared dataset with push_to_hub... {cfg.push_dataset_to_hub}" + f"Pushing merged prepared dataset to Huggingface hub at {cfg.push_dataset_to_hub} with configuration name {ds_hash}..." ) dataset.push_to_hub( cfg.push_dataset_to_hub, From a7d4bfe54d8c51390f2ba5eda132aae284a81386 Mon Sep 17 00:00:00 2001 From: cl <44345856+chrislee973@users.noreply.github.com> Date: Sat, 25 May 2024 07:35:30 +0000 Subject: [PATCH 3/3] fix missing f string --- src/axolotl/utils/data/sft.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/axolotl/utils/data/sft.py b/src/axolotl/utils/data/sft.py index 86a9a0c65..41351e4ae 100644 --- a/src/axolotl/utils/data/sft.py +++ b/src/axolotl/utils/data/sft.py @@ -161,7 +161,7 @@ def load_tokenized_prepared_datasets( try: if cfg.push_dataset_to_hub: LOG.info( - f"Attempting to load prepared dataset from Huggingface hub at {cfg.push_dataset_to_hub} with configuration name {ds_hash}..." + f"Attempting to load prepared dataset from Huggingface hub at {cfg.push_dataset_to_hub} (version {ds_hash})..." ) dataset = load_dataset( cfg.push_dataset_to_hub, @@ -186,7 +186,7 @@ def load_tokenized_prepared_datasets( else: if cfg.push_dataset_to_hub: LOG.info("Unable to find prepared dataset in Huggingface hub") - LOG.info("Unable to find prepared dataset in {prepared_ds_path}") + LOG.info(f"Unable to find prepared dataset in {prepared_ds_path}") LOG.info("Loading raw datasets...") if not cfg.is_preprocess: LOG.warning( @@ -430,7 +430,7 @@ def for_d_in_datasets(dataset_configs): dataset.save_to_disk(str(prepared_ds_path)) if cfg.push_dataset_to_hub: LOG.info( - f"Pushing merged prepared dataset to Huggingface hub at {cfg.push_dataset_to_hub} with configuration name {ds_hash}..." + f"Pushing merged prepared dataset to Huggingface hub at {cfg.push_dataset_to_hub} (version {ds_hash})..." ) dataset.push_to_hub( cfg.push_dataset_to_hub,