[Question] Deepspeed inference stage 3 + quantization #5398

Open
ocesp98 opened this issue Apr 11, 2024 · 1 comment · May be fixed by #5624
Labels: bug, inference

ocesp98 commented Apr 11, 2024

I'm trying to set up DeepSpeed inference with ZeRO stage 3 as follows:

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch 
from transformers.integrations import HfDeepSpeedConfig


with torch.no_grad():
    hfds_config = HfDeepSpeedConfig(config_file_or_dict="config.json")
    # Now model is on-the-fly quantized.
    tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b-it")
    model = AutoModelForCausalLM.from_pretrained("google/gemma-2b-it")

input_text = "Query"
input_ids = tokenizer(input_text, return_tensors="pt")

outputs = model.generate(**input_ids, max_length=200)
print(tokenizer.decode(outputs[0]))

The DeepSpeed config file, config.json, is as follows:

{
    "weight_quantization": {
        "quantized_initialization": {
            "num_bits": 4,
            "group_size": 64,
            "group_dim": 1,
            "symmetric": false
        }
    },
    "zero_optimization": {
        "stage": 3,
        "offload_optimizer": {
            "device": "cpu",
            "pin_memory": true
        },
        "offload_param": {
            "device": "cpu",
            "pin_memory": true
        },
        "overlap_comm": true,
        "contiguous_gradients": true,
        "sub_group_size": 1e9,
        "reduce_bucket_size": "auto",
        "stage3_prefetch_bucket_size": "auto",
        "stage3_param_persistence_threshold": "auto",
        "stage3_max_live_parameters": 1e9,
        "stage3_max_reuse_distance": 1e9,
        "stage3_gather_fp16_weights_on_model_save": true
    },
    "train_batch_size": 32,
    "torch_dtype": "float32"
}

and the error is as follows:

[2024-03-08 15:12:19,262] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2024-03-08 15:12:19,600] [WARNING] [config_utils.py:69:_process_deprecated_field] Config parameter stage3_gather_fp16_weights_on_model_save is deprecated use gather_16bit_weights_on_model_save instead
[2024-03-08 15:12:19,601] [INFO] [comm.py:637:init_distributed] cdb=None
[2024-03-08 15:12:19,602] [INFO] [comm.py:652:init_distributed] Not using the DeepSpeed or dist launchers, attempting to detect MPI environment…
[2024-03-08 15:12:19,624] [INFO] [comm.py:702:mpi_discovery] Discovered MPI settings of world_rank=0, local_rank=0, world_size=1, master_addr=10.0.0.4, master_port=29500
[2024-03-08 15:12:19,626] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
[2024-03-08 15:12:19,635] [INFO] [partition_parameters.py:559:patch_init_and_builtins] Enable Zero3 engine with INT4 quantization.
[2024-03-08 15:12:22,156] [INFO] [partition_parameters.py:343:__exit__] finished initializing model - num_params = 165, num_elems = 3.03B
{
"name": "RuntimeError",
"message": "self.size(-1) must be divisible by 2 to view BFloat16 as Float (different element sizes), but got 1",
"stack": "---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
Cell In[1], line 10
8 # Now model is on-the-fly quantized.
9 tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b-it")
—> 10 model = AutoModelForCausalLM.from_pretrained("google/gemma-2b-it")
12 input_text = "Query"
13 input_ids = tokenizer(input_text, return_tensors="pt")

File ~/.local/lib/python3.8/site-packages/transformers/models/auto/auto_factory.py:561, in _BaseAutoModelClass.from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs)
559 elif type(config) in cls._model_mapping.keys():
560 model_class = _get_model_class(config, cls._model_mapping)
→ 561 return model_class.from_pretrained(
562 pretrained_model_name_or_path, *model_args, config=config, **hub_kwargs, **kwargs
563 )
564 raise ValueError(
565 f"Unrecognized configuration class {config.class} for this kind of AutoModel: {cls.name}.
"
566 f"Model type should be one of {', '.join(c.name for c in cls._model_mapping.keys())}."
567 )

File ~/.local/lib/python3.8/site-packages/transformers/modeling_utils.py:3502, in PreTrainedModel.from_pretrained(cls, pretrained_model_name_or_path, config, cache_dir, ignore_mismatched_sizes, force_download, local_files_only, token, revision, use_safetensors, *model_args, **kwargs)
3493 if dtype_orig is not None:
3494 torch.set_default_dtype(dtype_orig)
3495 (
3496 model,
3497 missing_keys,
3498 unexpected_keys,
3499 mismatched_keys,
3500 offload_index,
3501 error_msgs,
→ 3502 ) = cls._load_pretrained_model(
3503 model,
3504 state_dict,
3505 loaded_state_dict_keys, # XXX: rename?
3506 resolved_archive_file,
3507 pretrained_model_name_or_path,
3508 ignore_mismatched_sizes=ignore_mismatched_sizes,
3509 sharded_metadata=sharded_metadata,
3510 _fast_init=_fast_init,
3511 low_cpu_mem_usage=low_cpu_mem_usage,
3512 device_map=device_map,
3513 offload_folder=offload_folder,
3514 offload_state_dict=offload_state_dict,
3515 dtype=torch_dtype,
3516 hf_quantizer=hf_quantizer,
3517 keep_in_fp32_modules=keep_in_fp32_modules,
3518 )
3520 # make sure token embedding weights are still tied if needed
3521 model.tie_weights()

File ~/.local/lib/python3.8/site-packages/transformers/modeling_utils.py:3945, in PreTrainedModel._load_pretrained_model(cls, model, state_dict, loaded_keys, resolved_archive_file, pretrained_model_name_or_path, ignore_mismatched_sizes, sharded_metadata, _fast_init, low_cpu_mem_usage, device_map, offload_folder, offload_state_dict, dtype, hf_quantizer, keep_in_fp32_modules)
3943 error_msgs += new_error_msgs
3944 else:
→ 3945 error_msgs += _load_state_dict_into_model(model_to_load, state_dict, start_prefix)
3947 # force memory release
3948 del state_dict

File ~/.local/lib/python3.8/site-packages/transformers/modeling_utils.py:626, in _load_state_dict_into_model(model_to_load, state_dict, start_prefix)
623 if child is not None:
624 load(child, state_dict, prefix + name + ".")
→ 626 load(model_to_load, state_dict, prefix=start_prefix)
627 # Delete state_dict so it could be collected by GC earlier. Note that state_dict is a copy of the argument, so
628 # it’s safe to delete it.
629 del state_dict

File ~/.local/lib/python3.8/site-packages/transformers/modeling_utils.py:624, in _load_state_dict_into_model.<locals>.load(module, state_dict, prefix)
622 for name, child in module._modules.items():
623 if child is not None:
→ 624 load(child, state_dict, prefix + name + ".")

File ~/.local/lib/python3.8/site-packages/transformers/modeling_utils.py:624, in _load_state_dict_into_model.<locals>.load(module, state_dict, prefix)
622 for name, child in module._modules.items():
623 if child is not None:
→ 624 load(child, state_dict, prefix + name + ".")

File ~/.local/lib/python3.8/site-packages/transformers/modeling_utils.py:618, in _load_state_dict_into_model.<locals>.load(module, state_dict, prefix)
616 with deepspeed.zero.GatheredParameters(params_to_gather, modifier_rank=0):
617 if torch.distributed.get_rank() == 0:
→ 618 module._load_from_state_dict(*args)
619 else:
620 module._load_from_state_dict(*args)

File ~/.local/lib/python3.8/site-packages/deepspeed/inference/quantization/utils.py:269, in wrap_load_from_state_dict.<locals>.wrapper(model, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs)
267 quantized_weight, quant_scale, quant_min = model.weight.quantizer.quantize(state_dict[key])
268 quantized_weight = quantized_weight.view(model.weight.dtype)
→ 269 quant_scale = quant_scale.view(model.weight.dtype)
270 quant_min = quant_min.view(model.weight.dtype)
272 replaced_old_value = state_dict[key]

RuntimeError: self.size(-1) must be divisible by 2 to view BFloat16 as Float (different element sizes), but got 1"
}
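If I read the traceback correctly, the failing call is quant_scale.view(model.weight.dtype) in deepspeed/inference/quantization/utils.py: a bfloat16 tensor whose last dimension is 1 is being reinterpreted as float32, which torch refuses because the element sizes differ (2 bytes vs 4 bytes). A minimal standalone reproduction, based on my own guess at the shapes involved and independent of DeepSpeed, would be:

import torch

# Assumption: quant_scale has a trailing dimension of 1 (one scale per
# quantization group) and is stored as bfloat16, while model.weight.dtype
# is float32 here. Viewing 2-byte elements as 4-byte elements requires
# size(-1) to be divisible by 2, and it is 1.
quant_scale = torch.zeros(64, 1, dtype=torch.bfloat16)
quant_scale.view(torch.float32)
# RuntimeError: self.size(-1) must be divisible by 2 to view BFloat16 as Float
# (different element sizes), but got 1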

I tried setting group_dim to 2 in config.json, but that gave a tuple index out of range error.

My GPU doesn't support the bfloat16 format. I thought it was disabled by default, but I also explicitly set the dtype to fp32 (which didn't change anything). How can I fix this issue? The reason I want to use ZeRO stage 3 is to go from a 2B model to a 7B model by offloading to CPU (and also to try this feature out a bit).
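One thing I have not tried yet is forcing the checkpoint itself to load in fp32, so the quantizer never sees bfloat16 tensors. This is only my assumption that it would sidestep the view error above, not something I have verified; it is the same from_pretrained call as before, just with the dtype pinned:

import torch
from transformers import AutoModelForCausalLM

# Load the weights as float32 instead of the checkpoint's bfloat16 default
# (assumption: the DeepSpeed quantized init would then only see 4-byte tensors).
model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-2b-it",
    torch_dtype=torch.float32,
)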

@sevenandseven
(This comment quotes the original issue above, then adds:)

Hello, I successfully used ZeRO stage 3 acceleration for DeepSpeed inference with two GPUs, but I found that using two GPUs did not give data parallelism or model parallelism: each GPU was asked the same question. How should I solve this problem? Thank you for your reply.

This is the code:
import gc
import math
from time import perf_counter

import deepspeed
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.integrations import HfDeepSpeedConfig

# model_id, args, ds_config, input_instance, total_token and time_consu_all
# are defined elsewhere in the script.
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, trust_remote_code=True)

if args.cpu_offload and args.nvme_offload_path:
    raise ValueError("Use one of --cpu_offload or --nvme_offload_path and not both")

if args.cpu_offload:
    ds_config["zero_optimization"]["offload_param"] = dict(device="cpu", pin_memory=True)

dschf = HfDeepSpeedConfig(ds_config)  # this tells from_pretrained to instantiate directly on gpus
ds_engine = deepspeed.initialize(model=model, config_params=ds_config)[0]
ds_engine.module.eval()
model = ds_engine.module

for i in range(0, int(len(input_instance) // args.batch_size)):
    if args.batch_size > len(input_instance):
        # dynamically extend to support larger bs by repetition
        input_instance *= math.ceil(args.batch_size / len(input_instance))
    inputs = input_instance[: i + args.batch_size]
    input_instance = input_instance[args.batch_size - 1:]
    generate_kwargs = dict(do_sample=False, max_new_tokens=1024)
    print("input is:", inputs)
    start_time = perf_counter()
    input_tokens = tokenizer.batch_encode_plus(inputs, return_tensors="pt", padding=True)
    for t in input_tokens:
        if torch.is_tensor(input_tokens[t]):
            input_tokens[t] = input_tokens[t].to(torch.cuda.current_device())
    outputs = model.generate(**input_tokens, **generate_kwargs)
    torch.cuda.empty_cache()
    gc.collect()
    input_tokens_lengths = [x.shape[0] for x in input_tokens.input_ids]
    output_tokens_lengths = [x.shape[0] for x in outputs]
    outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    time_consu = perf_counter() - start_time
    total_new_tokens = [o - i for i, o in zip(input_tokens_lengths, output_tokens_lengths)]
    print("output is:", outputs)
    total_token += sum(total_new_tokens)
    time_consu_all += time_consu
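
My guess is that I need to split the inputs across ranks myself so that each GPU answers different questions, something like the sketch below (my own assumption, not an official DeepSpeed pattern), but I am not sure whether this is the intended way:

import torch.distributed as dist

# After deepspeed.initialize(), the process group is set up, so each rank can
# take its own slice of the prompts instead of every GPU seeing the full list.
rank = dist.get_rank()
world_size = dist.get_world_size()

# Round-robin shard of the prompt list by rank.
my_inputs = input_instance[rank::world_size]

# ...then run the batching / generate loop above on my_inputs instead of
# input_instance, and handle the per-rank outputs separately.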
