Commit 7cfff96

qwen2_bugfix, add adamround vision UT (#281)
Signed-off-by: Zhang, Weiwei1 <[email protected]>
1 parent afa9e26 commit 7cfff96

20 files changed (+97 −18 lines)

auto_round/autoround.py

Lines changed: 4 additions & 3 deletions

```diff
@@ -720,8 +720,8 @@ def forward(m, hidden_states, *positional_args, **kwargs):
                     self.inputs[name][key].extend(list(torch.split(alibi.to("cpu"), 1, dim=0)))
                 else:
                     self.inputs[name][key] = list(torch.split(alibi.to("cpu"), 1, dim=0))
-            elif "position_ids" in key or 'cache_position' in key:
-                if self.train_bs == 1 and self.not_share_rotary_pos_emb_flag:
+            elif "position_ids" in key or 'cache_position' in key or 'position_embeddings' in key:
+                if self.train_bs == 1 and self.not_share_position_ids_flag:
                     if key not in self.inputs[name].keys():
                         self.inputs[name][key] = [to_device(kwargs[key], device=torch.device("cpu"))]
                     else:
@@ -1104,7 +1104,7 @@ def quant_blocks(
                 input_others[key] = input_others[key].to(tmp_dtype)
             elif isinstance(input_others[key], list):
                 for i in range(len(input_others[key])):
-                    input_others[key][i].to(tmp_dtype)
+                    to_dtype(input_others[key][i], tmp_dtype)
         pbar = tqdm(range(0, len(block_names), nblocks))
         for i in pbar:
             if nblocks == 1:
@@ -1621,3 +1621,4 @@ def __init__(
         )
 
 
+
```

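The `quant_blocks` hunk above fixes a silent no-op: `Tensor.to()` returns a new tensor rather than converting in place, so the old loop left every list entry at its original dtype. The commit routes each element through auto_round's `to_dtype` helper; the sketch below only illustrates the underlying pitfall with plain tensors and is not the project's code.

```python
import torch

batch = [torch.zeros(2, 4), torch.zeros(2, 4)]  # float32 by default

# Buggy pattern: the converted tensor returned by .to() is discarded,
# so the list still holds float32 tensors afterwards.
for i in range(len(batch)):
    batch[i].to(torch.bfloat16)
print(batch[0].dtype)  # torch.float32

# One straightforward fix for plain tensors: keep the converted tensor.
for i in range(len(batch)):
    batch[i] = batch[i].to(torch.bfloat16)
print(batch[0].dtype)  # torch.bfloat16
```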
auto_round/export/export_to_autogptq/export.py

Lines changed: 5 additions & 1 deletion

```diff
@@ -113,13 +113,16 @@ def save_quantized_as_autogptq(output_dir, inplace=True, backend="auto_gptq:exll
     """Export the model to autogptq format to easily leverage cuda kernel."""
 
     model = kwargs["model"]
-    tokenizer = kwargs["tokenizer"]
     supported_types = kwargs["supported_types"]
     safe_serialization = True if 'safe_serialization' not in kwargs.keys() else kwargs["safe_serialization"]
     quant_block_list = kwargs["quant_block_list"]
     logger.info("Saving quantized model to autogptq format, this may take a while...")
+    tokenizer = kwargs.get("tokenizer", None)
+    processor = kwargs.get("processor", None)
     if tokenizer is not None:
         tokenizer.save_pretrained(output_dir)
+    if processor is not None:
+        processor.save_pretrained(output_dir)
     ##check module quantized in block, this may have bug for mixed precision quantization
     if bool(quant_block_list):
         all_blocks = quant_block_list
@@ -200,3 +203,4 @@ def save(model: torch.nn.Module, save_dir: str, max_shard_size: str = "5GB", saf
     if hasattr(model, "config") and hasattr(model.config, "quantization_config"):
         with open(os.path.join(save_dir, config_file), "w", encoding="utf-8") as f:
             json.dump(model.config.quantization_config, f, indent=2)
+
```

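All of the exporters touched by this commit follow the same pattern: `tokenizer` and `processor` are now read with `kwargs.get(..., None)` instead of a hard `kwargs["tokenizer"]` lookup, so text-only callers keep working while vision-language callers can pass a processor to be stored next to the quantized weights. A minimal standalone sketch of that pattern (`_save_companions` is a hypothetical name used only for illustration):

```python
def _save_companions(output_dir, **kwargs):
    # Hypothetical helper for illustration; mirrors the kwargs.get() pattern
    # added to each exporter in this commit.
    tokenizer = kwargs.get("tokenizer", None)   # optional: may be absent
    processor = kwargs.get("processor", None)   # optional: set for vision-language models
    if tokenizer is not None:
        tokenizer.save_pretrained(output_dir)
    if processor is not None:
        processor.save_pretrained(output_dir)
```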
auto_round/export/export_to_autoround/export.py

Lines changed: 6 additions & 1 deletion

```diff
@@ -198,6 +198,8 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="auto_round:ex
     layer_config = kwargs["layer_config"]
     quantization_config = kwargs["serialization_dict"]
     quantization_config["quant_method"] = "intel/auto-round"
+    tokenizer = kwargs.get("tokenizer", None)
+    processor = kwargs.get("processor", None)
     if "awq" not in backend:
         quantization_config["backend"] = backend
     extra_config = {}
@@ -235,12 +237,14 @@ def wrapper(name):
     model.config.quantization_config = quantization_config
     if output_dir is None:
         return model
-    tokenizer = kwargs["tokenizer"]
+
     if output_dir is None:
         model.tokenizer = tokenizer
         return model
     if tokenizer is not None:
         tokenizer.save_pretrained(output_dir)
+    if processor is not None:
+        processor.save_pretrained(output_dir)
     modules_to_not_convert = []
     if "awq" not in backend:
         save(model, output_dir, safe_serialization=safe_serialization)
@@ -317,3 +321,4 @@ def save_awq(
     if hasattr(model, "config") and hasattr(model.config, "quantization_config"):
         with open(os.path.join(save_dir, config_file), "w", encoding="utf-8") as f:
             json.dump(quantization_config, f, indent=2)
+
```

auto_round/export/export_to_awq/export.py

Lines changed: 5 additions & 1 deletion

```diff
@@ -91,11 +91,14 @@ def save_quantized_as_autoawq(output_dir, inplace=True, **kwargs):
     enable_minmax_tuning = kwargs["enable_minmax_tuning"]
     enable_quanted_input = kwargs["enable_quanted_input"]
     scale_dtype = kwargs["scale_dtype"]
-    tokenizer = kwargs["tokenizer"]
+    tokenizer = kwargs.get("tokenizer", None)
+    processor = kwargs.get("processor", None)
 
     logger.info("Saving quantized model to auto_awq format")
     if tokenizer is not None:
         tokenizer.save_pretrained(output_dir)
+    if processor is not None:
+        processor.save_pretrained(output_dir)
     ##check module quantized in block, this may have bug for mixed precision quantization
     modules_to_not_convert = []
     if inplace:
@@ -250,3 +253,4 @@ def get_module_name(model, module_to_find):
         if module is module_to_find:
             return name
     return None
+
```

auto_round/export/export_to_itrex/export.py

Lines changed: 5 additions & 1 deletion

```diff
@@ -121,7 +121,8 @@ def save_quantized_as_itrex_xpu(output_dir, inplace=True, **kwargs):
     enable_minmax_tuning = kwargs["enable_minmax_tuning"]
     enable_quanted_input = kwargs["enable_quanted_input"]
     scale_dtype = kwargs["scale_dtype"]
-    tokenizer = kwargs["tokenizer"]
+    tokenizer = kwargs.get("tokenizer", None)
+    processor = kwargs.get("processor", None)
 
     compressed_model = pack_model(inplace=inplace, **kwargs)
     if output_dir is None:
@@ -149,6 +150,8 @@ def save_quantized_as_itrex_xpu(output_dir, inplace=True, **kwargs):
         compressed_model.save_pretrained(output_dir, safe_serialization=True)
         if tokenizer is not None:
             tokenizer.save_pretrained(output_dir)
+        if processor is not None:
+            processor.save_pretrained(output_dir)
         logger.info("Saved config file and weights of quantized model to {}.".format(output_dir))
     except IOError as e:  # pragma: no cover
         logger.error("Fail to save configure file and weights due to {}.".format(e))
@@ -252,3 +255,4 @@ def pack_model(
     return compressed_model
 
 
+
```

auto_round/utils.py

Lines changed: 7 additions & 2 deletions

```diff
@@ -366,14 +366,18 @@ def sampling_inputs(input_ids, input_others, indices, seqlen,
     current_input_others = {"positional_inputs": input_others["positional_inputs"]}
     for key in input_others.keys():
         if not share_attention_mask_flag and ("attention_mask" in key or "alibi" in key) \
-                or (not_share_position_ids_flag and ("position_ids" in key or "cache_position" in key)) \
+                or (not_share_position_ids_flag and ("position_ids" in key or \
+                    "cache_position" in key or "position_embeddings" in key)) \
                 or (not_share_rotary_pos_emb_flag and ("rotary_pos_emb" in key or 'cu_seqlens' in key)) \
                 or "cross_attention_states" in key:
             current_input_others[key] = None
             if input_others[key] is not None:
                 current_input_others[key] = [input_others[key][i] for i in indices]
                 if not isinstance(current_input_others[key], torch.Tensor):
-                    current_input_others[key] = torch.cat(current_input_others[key], dim=0)
+                    if len(current_input_others[key]) == 1:
+                        current_input_others[key] = current_input_others[key][0]
+                    else:
+                        current_input_others[key] = torch.cat(current_input_others[key], dim=0)
         else:
             current_input_others[key] = input_others[key]
 
@@ -973,3 +977,4 @@ def get_autogptq_packing_qlinear(backend, bits=4, group_size=128, sym=False):
     return QuantLinear
 
 
+
```

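The `sampling_inputs` hunk does two things: `position_embeddings` joins the keys that are cached per sample, and a cached list with a single entry is now returned as-is rather than concatenated, presumably because such an entry need not be a plain tensor (recent models can cache rotary `position_embeddings` as a (cos, sin) tuple, which `torch.cat` cannot handle). A hedged sketch of the new selection behaviour (`gather_per_sample` is a hypothetical helper, not part of auto_round):

```python
import torch

def gather_per_sample(values, indices):
    # Hypothetical illustration of the updated selection logic: pick the cached
    # per-sample entries for this mini-batch, return a lone entry unchanged
    # (it may be a tuple such as (cos, sin)), and concatenate otherwise.
    picked = [values[i] for i in indices]
    if len(picked) == 1:
        return picked[0]
    return torch.cat(picked, dim=0)
```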
examples/multimodal-modeling/Qwen-VL/README.md renamed to examples/multimodal-modeling/Common_model/README.md

Lines changed: 32 additions & 0 deletions

````diff
@@ -193,6 +193,37 @@ print(output_text)
 ```
 
 
+- Llama-3.2-11B-Vision-Instruct inference
+
+```python
+import requests
+import torch
+from PIL import Image
+from transformers import MllamaForConditionalGeneration, AutoProcessor
+from auto_round.auto_quantizer import AutoHfQuantizer
+quantized_model_path="./tmp_autoround"
+model = MllamaForConditionalGeneration.from_pretrained(
+    quantized_model_path, torch_dtype=torch.bfloat16, device_map="auto", trust_remote_code=True)
+processor = AutoProcessor.from_pretrained(quantized_model_path)
+
+url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg"
+image = Image.open(requests.get(url, stream=True).raw)
+
+prompt = "<|image|><|begin_of_text|>If I had to write a haiku for this one"
+inputs = processor(image, prompt, return_tensors="pt", truncation=True).to(model.device)
+
+output = model.generate(**inputs, max_new_tokens=30)
+print(processor.decode(output[0]))
+
+# <|begin_of_text|><|image|><|begin_of_text|>If I had to write a haiku for this one, it would be:
+
+# Rabbit in a coat
+# Dressed up in style for the day
+# Country charm abounds
+
+# The image depicts a rabbit
+```
+
 ## 4. Results
 Using [COCO 2017](https://cocodataset.org/) and [LLaVA-Instruct-150K](https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K) datasets for quantization calibration, and TextVQA dataset for evaluation. please follow the [recipe](./run_autoround.sh) and [evaluate script](./run_eval.sh). The results for Qwen-VL are as follows:
 | Metric | bf16 | INT4 |
@@ -241,3 +272,4 @@ If you find SignRound useful for your research, please cite our paper:
 
 
 
+
````

examples/multimodal-modeling/Qwen-VL/main.py renamed to examples/multimodal-modeling/Common_model/main.py

Lines changed: 7 additions & 6 deletions

```diff
@@ -34,7 +34,7 @@ def DataFormating(raw_data, image_folder=None, model_type='qwen'):
             sentence['value'] = sentence['value'].strip()
             if 'qwen2' in model_type:  # for Qwen2-vl
                 replace_token = '<|vision_start|><|image_pad|><|vision_end|>'
-            if 'mllama' in model_type:
+            elif 'mllama' in model_type:
                 replace_token = '<|image|>'
             else:
                 replace_img = os.path.join(image_folder, os.path.basename(source["image"]))
@@ -422,7 +422,7 @@ def get_train_dataloader(train_dataset, model, data_collator=default_data_collat
     model_type = config.model_type
     if "mllama" in model_type:
         from transformers import MllamaForConditionalGeneration
-        model = MllamaForConditionalGeneration.from_pretrained(args.model_name,
+        model = MllamaForConditionalGeneration.from_pretrained(args.model_name, attn_implementation="eager",
                                                                trust_remote_code=not args.disable_trust_remote_code)  # torch_dtype=torch.bfloat16
         processor = AutoProcessor.from_pretrained(args.model_name)
         tokenizer.processor = processor
@@ -534,17 +534,17 @@ def get_train_dataloader(train_dataset, model, data_collator=default_data_collat
         for gpu_format in gpu_formats:
             if "round" in gpu_format:
                 eval_folder = f'{export_dir}-round'
-                autoround.save_quantized(eval_folder, format=gpu_format, use_triton=False, inplace=inplace)
+                autoround.save_quantized(eval_folder, format=gpu_format, use_triton=False, inplace=inplace, processor=processor)
             elif "gptq" in gpu_format:
                 eval_folder = f'{export_dir}-gpu'
-                autoround.save_quantized(eval_folder, format=gpu_format, use_triton=False, inplace=inplace)
+                autoround.save_quantized(eval_folder, format=gpu_format, use_triton=False, inplace=inplace, processor=processor)
 
     if 'xpu' in deployment_device:
         autoround.save_quantized(f'{export_dir}-xpu', format="itrex_xpu", use_triton=True, inplace=inplace,
                                  compression_dtype=torch.int8, compression_dim=0, use_optimum_format=False,
-                                 device="xpu")
+                                 device="xpu", processor=processor)
     if "cpu" in deployment_device:
-        autoround.save_quantized(output_dir=f'{export_dir}-cpu', format='itrex', inplace=inplace)
+        autoround.save_quantized(output_dir=f'{export_dir}-cpu', format='itrex', inplace=inplace, processor=processor)
     if "fake" in deployment_device:
         model = model.to("cpu")
         model.save_pretrained(output_dir)
@@ -580,3 +580,4 @@ def get_train_dataloader(train_dataset, model, data_collator=default_data_collat
 
 
 
+
```
