You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
0%| | 0/600000 [00:00<?, ?it/s][rank0]: Traceback (most recent call last):
[rank0]: File "/Workspace/Shared/Groups/a100-shared-group/ALMA/run_llmmt.py", line 225, in
[rank0]: main()
[rank0]: File "/Workspace/Shared/Groups/a100-shared-group/ALMA/run_llmmt.py", line 174, in main
[rank0]: train_result = trainer.train(resume_from_checkpoint=checkpoint)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.11/site-packages/transformers/trainer.py", line 1938, in train
[rank0]: return inner_training_loop(
[rank0]: ^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.11/site-packages/transformers/trainer.py", line 2236, in _inner_training_loop
[rank0]: for step, inputs in enumerate(epoch_iterator):
[rank0]: File "/local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.11/site-packages/accelerate/data_loader.py", line 677, in iter
[rank0]: next_batch, next_batch_info = self._fetch_batches(main_iterator)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.11/site-packages/accelerate/data_loader.py", line 631, in _fetch_batches
[rank0]: batches.append(next(iterator))
[rank0]: ^^^^^^^^^^^^^^
[rank0]: File "/databricks/python/lib/python3.11/site-packages/torch/utils/data/dataloader.py", line 631, in next
[rank0]: data = self._next_data()
[rank0]: ^^^^^^^^^^^^^^^^^
[rank0]: File "/databricks/python/lib/python3.11/site-packages/torch/utils/data/dataloader.py", line 675, in _next_data
[rank0]: data = self._dataset_fetcher.fetch(index) # may raise StopIteration
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/databricks/python/lib/python3.11/site-packages/torch/utils/data/_utils/fetch.py", line 42, in fetch
[rank0]: return self.collate_fn(data)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.11/site-packages/transformers/trainer_utils.py", line 814, in call
[rank0]: return self.data_collator(features)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.11/site-packages/transformers/data/data_collator.py", line 92, in default_data_collator
[rank0]: return torch_default_data_collator(features)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.11/site-packages/transformers/data/data_collator.py", line 158, in torch_default_data_collator
[rank0]: batch[k] = torch.tensor([f[k] for f in features])
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: RuntimeError: Could not infer dtype of NoneType
0%| | 0/600000 [00:00<?, ?it/s][rank0]: Traceback (most recent call last):
[rank0]: File "/Workspace/Shared/Groups/a100-shared-group/ALMA/run_llmmt.py", line 225, in
[rank0]: main()
[rank0]: File "/Workspace/Shared/Groups/a100-shared-group/ALMA/run_llmmt.py", line 174, in main
[rank0]: train_result = trainer.train(resume_from_checkpoint=checkpoint)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.11/site-packages/transformers/trainer.py", line 1938, in train
[rank0]: return inner_training_loop(
[rank0]: ^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.11/site-packages/transformers/trainer.py", line 2236, in _inner_training_loop
[rank0]: for step, inputs in enumerate(epoch_iterator):
[rank0]: File "/local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.11/site-packages/accelerate/data_loader.py", line 677, in iter
[rank0]: next_batch, next_batch_info = self._fetch_batches(main_iterator)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.11/site-packages/accelerate/data_loader.py", line 631, in _fetch_batches
[rank0]: batches.append(next(iterator))
[rank0]: ^^^^^^^^^^^^^^
[rank0]: File "/databricks/python/lib/python3.11/site-packages/torch/utils/data/dataloader.py", line 631, in next
[rank0]: data = self._next_data()
[rank0]: ^^^^^^^^^^^^^^^^^
[rank0]: File "/databricks/python/lib/python3.11/site-packages/torch/utils/data/dataloader.py", line 675, in _next_data
[rank0]: data = self._dataset_fetcher.fetch(index) # may raise StopIteration
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/databricks/python/lib/python3.11/site-packages/torch/utils/data/_utils/fetch.py", line 42, in fetch
[rank0]: return self.collate_fn(data)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.11/site-packages/transformers/trainer_utils.py", line 814, in call
[rank0]: return self.data_collator(features)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.11/site-packages/transformers/data/data_collator.py", line 92, in default_data_collator
[rank0]: return torch_default_data_collator(features)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.11/site-packages/transformers/data/data_collator.py", line 158, in torch_default_data_collator
[rank0]: batch[k] = torch.tensor([f[k] for f in features])
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: RuntimeError: Could not infer dtype of NoneType
#!/bin/bash
# Launch ALMA monolingual fine-tuning of Llama-3.1-8B-Instruct via accelerate + DeepSpeed.
# Usage: ./train.sh [OUTPUT_DIR]
#   OUTPUT_DIR — checkpoint/output directory (defaults to the shared volume path).

OUTPUT_DIR=${1:-"/Volumes/main/default/default_volume/llama-3.1-8B-mono/"}

# Pick a random main-process port between 30000 and 50000 so that concurrent
# launches on the same host do not collide on the default rendezvous port.
port=$(( RANDOM % (50000 - 30000 + 1) + 30000 ))

# NOTE: the trailing backslashes are required — without them each --flag line
# is executed as a separate shell command and the launch fails.
accelerate launch --main_process_port ${port} --config_file configs/deepspeed_train_config.yaml \
    run_llmmt.py \
    --model_name_or_path meta-llama/Meta-Llama-3.1-8B-Instruct \
    --tokenizer_name meta-llama/Meta-Llama-3.1-8B-Instruct \
    --oscar_data_path oscar-corpus/OSCAR-2301 \
    --oscar_data_lang en,ru,cs,zh,is,de \
    --interleave_probs "0.17,0.22,0.14,0.19,0.08,0.2" \
    --streaming \
    --max_steps 600000 \
    --do_train \
    --low_cpu_mem_usage \
    --fp16 \
    --learning_rate 2e-5 \
    --weight_decay 0.01 \
    --gradient_accumulation_steps 4 \
    --lr_scheduler_type cosine \
    --warmup_ratio 0.01 \
    --ignore_pad_token_for_loss \
    --ignore_prompt_token_for_loss \
    --per_device_train_batch_size 2 \
    --per_device_eval_batch_size 2 \
    --save_strategy steps \
    --save_steps 2000 \
    --save_total_limit 1 \
    --logging_strategy steps \
    --logging_steps 1 \
    --output_dir "${OUTPUT_DIR}" \
    --max_new_tokens 256 \
    --max_source_length 256 \
    --seed 42 \
    --overwrite_output_dir \
    --report_to none
I didn't change anything except the model path, but I got this error. It seems some field in the collated batch is None — does that mean some data in the dataset is None?
The text was updated successfully, but these errors were encountered: