diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index d9f1e6054..ccb12cf42 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -26,7 +26,7 @@ import accelerate import torch from accelerate.big_modeling import dispatch_model, infer_auto_device_map -from accelerate.utils import get_max_memory +from accelerate.utils import get_balanced_memory, get_max_memory from torch import autocast from tqdm import tqdm from transformers import set_seed @@ -1770,7 +1770,7 @@ def calib(self, nsamples, bs): data_new[key] = data[key].to(self.model.device) input_ids = data_new["input_ids"] elif isinstance(data, tuple) or isinstance(data, list): - data_new = to_device(data) + data_new = to_device(data, self.model.device) input_ids = data_new[0] else: data_new = {} @@ -1904,6 +1904,7 @@ def try_cache_inter_data_gpucpu(self, block_names, nsamples, layer_names=None, l if str(self.model.device) == "cpu" and (not self.device.startswith("hpu")): no_split_modules = getattr(self.model, "_no_split_modules", []) devices = parse_available_devices(self.device_map) + max_memory = get_max_memory() new_max_memory = {} if "cpu" not in devices: @@ -1915,13 +1916,21 @@ def try_cache_inter_data_gpucpu(self, block_names, nsamples, layer_names=None, l device = "cpu" else: raise ValueError(f"Unsupported device {device} in device_map: {self.device_map}") - new_max_memory[device] = max_memory[device] + # Use 90% of the reported max memory to leave headroom for activations, + # temporary tensors, other processes, and allocator fragmentation, reducing + # the chance of runtime OOM while still utilizing most available memory. + new_max_memory[device] = max_memory[device] * 0.9 + new_max_memory = get_balanced_memory( + self.model, + max_memory=new_max_memory, + no_split_module_classes=no_split_modules, + ) device_map = infer_auto_device_map( self.model, max_memory=new_max_memory, no_split_module_classes=no_split_modules ) if len(devices) > 1 and "cpu" in device_map.values(): logger.warning( - "Not enough vram cause the ram to be used, which may severely impact speed." + "Some layers are offloaded to cpu, which may severely impact calibration speed." " Please consider using more cards." ) diff --git a/auto_round/compressors/mllm/compressor.py b/auto_round/compressors/mllm/compressor.py index 31f97cbe3..67180555a 100644 --- a/auto_round/compressors/mllm/compressor.py +++ b/auto_round/compressors/mllm/compressor.py @@ -345,7 +345,7 @@ def calib(self, nsamples, bs): pbar.update(1) continue if isinstance(data, torch.Tensor): - input_ids = data.to(self.device) + input_ids = data.to(self.model.device) data_new = input_ids elif isinstance(data, str): if self.tokenizer is None: @@ -360,7 +360,7 @@ def calib(self, nsamples, bs): ) data_new = {} for key in data.keys(): - data_new[key] = data[key].to(self.device) + data_new[key] = data[key].to(self.model.device) input_ids = data_new["input_ids"] elif isinstance(data, dict) and "text" in data.keys(): text = data["text"] @@ -381,7 +381,7 @@ def calib(self, nsamples, bs): data_new[key] = to_dtype(data_new[key], self.model.dtype) input_ids = data_new["input_ids"] elif isinstance(data, tuple) or isinstance(data, list): - data_new = data + data_new = to_device(data, self.model.device) input_ids = data_new[0] else: data_new = {} diff --git a/auto_round/modelling/qwen3_vl_moe.py b/auto_round/modelling/qwen3_vl_moe.py index dbed42b1a..2e38bb740 100644 --- a/auto_round/modelling/qwen3_vl_moe.py +++ b/auto_round/modelling/qwen3_vl_moe.py @@ -67,8 +67,8 @@ def __init__( self.gate = original.gate self.calibrate_all_experts = calibrate_all_experts self.experts = SequentialQwen3VLMoeTextExperts(text_config, original.experts) - if not transformers_version <= version.parse( - "4.57.3" + if not transformers_version < version.parse( + "5.0" ): # remove conversion_mapping for qwen3_vl_moe when transformers>=5.0 from transformers.conversion_mapping import register_checkpoint_conversion_mapping