[Question] Deepspeed inference stage 3 + quantization #5398

Open
ocesp98 opened this issue Apr 11, 2024 · 1 comment · May be fixed by #5624
Labels: bug, inference

ocesp98 commented Apr 11, 2024

I'm trying to set up DeepSpeed inference with ZeRO stage 3 as follows:

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch 
from transformers.integrations import HfDeepSpeedConfig


with torch.no_grad():
    hfds_config = HfDeepSpeedConfig(config_file_or_dict="config.json")
    # Now model is on-the-fly quantized.
    tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b-it")
    model = AutoModelForCausalLM.from_pretrained("google/gemma-2b-it")

input_text = "Query"
input_ids = tokenizer(input_text, return_tensors="pt")

outputs = model.generate(**input_ids, max_length=200)
print(tokenizer.decode(outputs[0]))

The DeepSpeed config file, config.json, is as follows:

{
    "weight_quantization": {
        "quantized_initialization": {
            "num_bits": 4,
            "group_size": 64,
            "group_dim": 1,
            "symmetric": false
        }
    },
    "zero_optimization": {
        "stage": 3,
        "offload_optimizer": {
            "device": "cpu",
            "pin_memory": true
        },
        "offload_param": {
            "device": "cpu",
            "pin_memory": true
        },
        "overlap_comm": true,
        "contiguous_gradients": true,
        "sub_group_size": 1e9,
        "reduce_bucket_size": "auto",
        "stage3_prefetch_bucket_size": "auto",
        "stage3_param_persistence_threshold": "auto",
        "stage3_max_live_parameters": 1e9,
        "stage3_max_reuse_distance": 1e9,
        "stage3_gather_fp16_weights_on_model_save": true
    },
    "train_batch_size": 32,
    "torch_dtype": "float32"
}

and the error is as follows:

[2024-03-08 15:12:19,262] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2024-03-08 15:12:19,600] [WARNING] [config_utils.py:69:_process_deprecated_field] Config parameter stage3_gather_fp16_weights_on_model_save is deprecated use gather_16bit_weights_on_model_save instead
[2024-03-08 15:12:19,601] [INFO] [comm.py:637:init_distributed] cdb=None
[2024-03-08 15:12:19,602] [INFO] [comm.py:652:init_distributed] Not using the DeepSpeed or dist launchers, attempting to detect MPI environment…
[2024-03-08 15:12:19,624] [INFO] [comm.py:702:mpi_discovery] Discovered MPI settings of world_rank=0, local_rank=0, world_size=1, master_addr=10.0.0.4, master_port=29500
[2024-03-08 15:12:19,626] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
[2024-03-08 15:12:19,635] [INFO] [partition_parameters.py:559:patch_init_and_builtins] Enable Zero3 engine with INT4 quantization.
[2024-03-08 15:12:22,156] [INFO] [partition_parameters.py:343:__exit__] finished initializing model - num_params = 165, num_elems = 3.03B
{
"name": "RuntimeError",
"message": "self.size(-1) must be divisible by 2 to view BFloat16 as Float (different element sizes), but got 1",
"stack": "---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
Cell In[1], line 10
8 # Now model is on-the-fly quantized.
9 tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b-it")
—> 10 model = AutoModelForCausalLM.from_pretrained("google/gemma-2b-it")
12 input_text = "Query"
13 input_ids = tokenizer(input_text, return_tensors="pt")

File ~/.local/lib/python3.8/site-packages/transformers/models/auto/auto_factory.py:561, in _BaseAutoModelClass.from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs)
559 elif type(config) in cls._model_mapping.keys():
560 model_class = _get_model_class(config, cls._model_mapping)
→ 561 return model_class.from_pretrained(
562 pretrained_model_name_or_path, *model_args, config=config, **hub_kwargs, **kwargs
563 )
564 raise ValueError(
565 f"Unrecognized configuration class {config.class} for this kind of AutoModel: {cls.name}.
"
566 f"Model type should be one of {', '.join(c.name for c in cls._model_mapping.keys())}."
567 )

File ~/.local/lib/python3.8/site-packages/transformers/modeling_utils.py:3502, in PreTrainedModel.from_pretrained(cls, pretrained_model_name_or_path, config, cache_dir, ignore_mismatched_sizes, force_download, local_files_only, token, revision, use_safetensors, *model_args, **kwargs)
3493 if dtype_orig is not None:
3494 torch.set_default_dtype(dtype_orig)
3495 (
3496 model,
3497 missing_keys,
3498 unexpected_keys,
3499 mismatched_keys,
3500 offload_index,
3501 error_msgs,
→ 3502 ) = cls._load_pretrained_model(
3503 model,
3504 state_dict,
3505 loaded_state_dict_keys, # XXX: rename?
3506 resolved_archive_file,
3507 pretrained_model_name_or_path,
3508 ignore_mismatched_sizes=ignore_mismatched_sizes,
3509 sharded_metadata=sharded_metadata,
3510 _fast_init=_fast_init,
3511 low_cpu_mem_usage=low_cpu_mem_usage,
3512 device_map=device_map,
3513 offload_folder=offload_folder,
3514 offload_state_dict=offload_state_dict,
3515 dtype=torch_dtype,
3516 hf_quantizer=hf_quantizer,
3517 keep_in_fp32_modules=keep_in_fp32_modules,
3518 )
3520 # make sure token embedding weights are still tied if needed
3521 model.tie_weights()

File ~/.local/lib/python3.8/site-packages/transformers/modeling_utils.py:3945, in PreTrainedModel._load_pretrained_model(cls, model, state_dict, loaded_keys, resolved_archive_file, pretrained_model_name_or_path, ignore_mismatched_sizes, sharded_metadata, _fast_init, low_cpu_mem_usage, device_map, offload_folder, offload_state_dict, dtype, hf_quantizer, keep_in_fp32_modules)
3943 error_msgs += new_error_msgs
3944 else:
→ 3945 error_msgs += _load_state_dict_into_model(model_to_load, state_dict, start_prefix)
3947 # force memory release
3948 del state_dict

File ~/.local/lib/python3.8/site-packages/transformers/modeling_utils.py:626, in _load_state_dict_into_model(model_to_load, state_dict, start_prefix)
623 if child is not None:
624 load(child, state_dict, prefix + name + ".")
→ 626 load(model_to_load, state_dict, prefix=start_prefix)
627 # Delete state_dict so it could be collected by GC earlier. Note that state_dict is a copy of the argument, so
628 # it’s safe to delete it.
629 del state_dict

File ~/.local/lib/python3.8/site-packages/transformers/modeling_utils.py:624, in _load_state_dict_into_model.<locals>.load(module, state_dict, prefix)
622 for name, child in module._modules.items():
623 if child is not None:
→ 624 load(child, state_dict, prefix + name + ".")

File ~/.local/lib/python3.8/site-packages/transformers/modeling_utils.py:624, in _load_state_dict_into_model.<locals>.load(module, state_dict, prefix)
622 for name, child in module._modules.items():
623 if child is not None:
→ 624 load(child, state_dict, prefix + name + ".")

File ~/.local/lib/python3.8/site-packages/transformers/modeling_utils.py:618, in _load_state_dict_into_model.<locals>.load(module, state_dict, prefix)
616 with deepspeed.zero.GatheredParameters(params_to_gather, modifier_rank=0):
617 if torch.distributed.get_rank() == 0:
→ 618 module._load_from_state_dict(*args)
619 else:
620 module._load_from_state_dict(*args)

File ~/.local/lib/python3.8/site-packages/deepspeed/inference/quantization/utils.py:269, in wrap_load_from_state_dict.<locals>.wrapper(model, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs)
267 quantized_weight, quant_scale, quant_min = model.weight.quantizer.quantize(state_dict[key])
268 quantized_weight = quantized_weight.view(model.weight.dtype)
→ 269 quant_scale = quant_scale.view(model.weight.dtype)
270 quant_min = quant_min.view(model.weight.dtype)
272 replaced_old_value = state_dict[key]

RuntimeError: self.size(-1) must be divisible by 2 to view BFloat16 as Float (different element sizes), but got 1"
}
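If I read the traceback correctly, the failing call is quant_scale.view(model.weight.dtype) in deepspeed/inference/quantization/utils.py: a bfloat16 tensor whose last dimension is 1 is being reinterpreted as float32, which torch refuses because the element sizes differ (2 bytes vs 4 bytes). A minimal standalone reproduction, based on my own guess at the shapes involved and independent of DeepSpeed, would be:

import torch

# Assumption: quant_scale has a trailing dimension of 1 (one scale per
# quantization group) and is stored as bfloat16, while model.weight.dtype
# is float32 here. Viewing 2-byte elements as 4-byte elements requires
# size(-1) to be divisible by 2, and it is 1.
quant_scale = torch.zeros(64, 1, dtype=torch.bfloat16)
quant_scale.view(torch.float32)
# RuntimeError: self.size(-1) must be divisible by 2 to view BFloat16 as Float
# (different element sizes), but got 1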

I tried setting group_dim to 2 in config.json, but that gave a tuple index out of range error.

My GPU doesn't support the bfloat16 format. I thought it was disabled by default, but I also explicitly set the dtype to fp32 (which didn't change anything). How can I fix this issue? The reason I want to use ZeRO stage 3 is to go from a 2B model to a 7B model by offloading to CPU (and also to try this feature out a bit).
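One thing I have not tried yet is forcing the checkpoint itself to load in fp32, so the quantizer never sees bfloat16 tensors. This is only my assumption that it would sidestep the view error above, not something I have verified; it is the same from_pretrained call as before, just with the dtype pinned:

import torch
from transformers import AutoModelForCausalLM

# Load the weights as float32 instead of the checkpoint's bfloat16 default
# (assumption: the DeepSpeed quantized init would then only see 4-byte tensors).
model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-2b-it",
    torch_dtype=torch.float32,
)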

@sevenandseven
(This comment quotes the original issue above, then adds:)

Hello, I successfully used ZeRO stage 3 acceleration for DeepSpeed inference with two GPUs, but I found that using two GPUs did not give data parallelism or model parallelism: each GPU was asked the same question. How should I solve this problem? Thank you for your reply.

This is the code:
import gc
import math
from time import perf_counter

import deepspeed
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.integrations import HfDeepSpeedConfig

# model_id, args, ds_config, input_instance, total_token and time_consu_all
# are defined elsewhere in the script.
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, trust_remote_code=True)

if args.cpu_offload and args.nvme_offload_path:
    raise ValueError("Use one of --cpu_offload or --nvme_offload_path and not both")

if args.cpu_offload:
    ds_config["zero_optimization"]["offload_param"] = dict(device="cpu", pin_memory=True)

dschf = HfDeepSpeedConfig(ds_config)  # this tells from_pretrained to instantiate directly on gpus
ds_engine = deepspeed.initialize(model=model, config_params=ds_config)[0]
ds_engine.module.eval()
model = ds_engine.module

for i in range(0, int(len(input_instance) // args.batch_size)):
    if args.batch_size > len(input_instance):
        # dynamically extend to support larger bs by repetition
        input_instance *= math.ceil(args.batch_size / len(input_instance))
    inputs = input_instance[: i + args.batch_size]
    input_instance = input_instance[args.batch_size - 1:]
    generate_kwargs = dict(do_sample=False, max_new_tokens=1024)
    print("input is:", inputs)
    start_time = perf_counter()
    input_tokens = tokenizer.batch_encode_plus(inputs, return_tensors="pt", padding=True)
    for t in input_tokens:
        if torch.is_tensor(input_tokens[t]):
            input_tokens[t] = input_tokens[t].to(torch.cuda.current_device())
    outputs = model.generate(**input_tokens, **generate_kwargs)
    torch.cuda.empty_cache()
    gc.collect()
    input_tokens_lengths = [x.shape[0] for x in input_tokens.input_ids]
    output_tokens_lengths = [x.shape[0] for x in outputs]
    outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    time_consu = perf_counter() - start_time
    total_new_tokens = [o - i for i, o in zip(input_tokens_lengths, output_tokens_lengths)]
    print("output is:", outputs)
    total_token += sum(total_new_tokens)
    time_consu_all += time_consu
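
My guess is that I need to split the inputs across ranks myself so that each GPU answers different questions, something like the sketch below (my own assumption, not an official DeepSpeed pattern), but I am not sure whether this is the intended way:

import torch.distributed as dist

# After deepspeed.initialize(), the process group is set up, so each rank can
# take its own slice of the prompts instead of every GPU seeing the full list.
rank = dist.get_rank()
world_size = dist.get_world_size()

# Round-robin shard of the prompt list by rank.
my_inputs = input_instance[rank::world_size]

# ...then run the batching / generate loop above on my_inputs instead of
# input_instance, and handle the per-rank outputs separately.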
