Skip to content
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
194 changes: 97 additions & 97 deletions auto_round/inference/backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -439,27 +439,11 @@ def fp8_static_scheme_checker(
requirements=["autoawq", "transformers"],
)

# BackendInfos["auto_round_kernel"] = BackendInfo(
# device=["cpu"],
# sym=[True, False],
# packing_format=GPTQ_FORMAT_NO_ZP,
# bits=[2, 4, 8],
# group_size=None,
# priority=6,
# checkers=[ark_feature_checker],
# alias=["ark"],
# compute_dtype=["float32", "float16"],
# data_type=["int"],
# act_bits=WOQ_DEFAULT_ACT_BITS,
# requirements=["torch>=2.9.0", "auto_round_kernel"],
# systems=["linux"],
# )

BackendInfos["auto_round_kernel_xpu"] = BackendInfo(
device=["xpu"],
sym=[True],
BackendInfos["auto_round_kernel"] = BackendInfo(
device=["cpu"],
sym=[True, False],
packing_format=GPTQ_FORMAT_NO_ZP,
bits=[4, 8],
bits=[2, 4, 8],
group_size=None,
priority=6,
checkers=[ark_feature_checker],
Expand All @@ -471,26 +455,10 @@ def fp8_static_scheme_checker(
systems=["linux"],
)

# BackendInfos["auto_round_kernel_zp"] = BackendInfo(
# device=["cpu"],
# sym=[True, False],
# packing_format=GPTQ_FORMAT,
# bits=[2, 4, 8],
# group_size=None,
# priority=6,
# checkers=[ark_feature_checker],
# alias=["ark"],
# compute_dtype=["float32", "float16"],
# data_type=["int"],
# act_bits=WOQ_DEFAULT_ACT_BITS,
# requirements=["torch>=2.9.0", "auto_round_kernel"],
# systems=["linux"],
# )

BackendInfos["auto_round_kernel_zp_xpu"] = BackendInfo(
BackendInfos["auto_round_kernel_xpu"] = BackendInfo(
device=["xpu"],
sym=[True],
packing_format=GPTQ_FORMAT,
packing_format=GPTQ_FORMAT_NO_ZP,
bits=[4, 8],
group_size=None,
priority=6,
Expand All @@ -499,31 +467,15 @@ def fp8_static_scheme_checker(
compute_dtype=["float32", "float16"],
data_type=["int"],
act_bits=WOQ_DEFAULT_ACT_BITS,
requirements=["torch>=2.9.0", "auto_round_kernel"],
requirements=["torch>=2.8.0", "auto_round_kernel"],
systems=["linux"],
)

# BackendInfos["auto_round_kernel_awq"] = BackendInfo(
# device=["cpu"],
# sym=[True, False],
# packing_format=AWQ_FORMAT,
# bits=[2, 4, 8],
# group_size=None,
# priority=6,
# checkers=[ark_feature_checker],
# alias=["ark"],
# compute_dtype=["float32", "float16"],
# data_type=["int"],
# act_bits=WOQ_DEFAULT_ACT_BITS,
# requirements=["torch>=2.9.0", "auto_round_kernel"],
# systems=["linux"],
# )

BackendInfos["auto_round_kernel_awq_xpu"] = BackendInfo(
device=["xpu"],
sym=[True],
packing_format=AWQ_FORMAT,
bits=[4, 8],
BackendInfos["auto_round_kernel_zp"] = BackendInfo(
device=["cpu"],
sym=[True, False],
packing_format=GPTQ_FORMAT,
bits=[2, 4, 8],
group_size=None,
priority=6,
checkers=[ark_feature_checker],
Expand All @@ -535,66 +487,114 @@ def fp8_static_scheme_checker(
systems=["linux"],
)

BackendInfos["ipex_gptq_cpu"] = BackendInfo(
device=["cpu"],
sym=[True, False],
packing_format=GPTQ_FORMAT,
bits=[4],
group_size=None,
priority=5,
checkers=[],
compute_dtype=["float16", "bfloat16"],
data_type=["int"],
act_bits=WOQ_DEFAULT_ACT_BITS,
alias=["ipex"],
requirements=["torch<2.9", "intel-extension-for-pytorch>=2.5"],
)

BackendInfos["ipex_gptq"] = BackendInfo(
BackendInfos["auto_round_kernel_zp_xpu"] = BackendInfo(
device=["xpu"],
sym=[True, False],
sym=[True],
packing_format=GPTQ_FORMAT,
bits=[4],
bits=[4, 8],
group_size=None,
priority=5,
checkers=[],
compute_dtype=["float16", "bfloat16"],
priority=6,
checkers=[ark_feature_checker],
alias=["ark"],
compute_dtype=["float32", "float16"],
data_type=["int"],
act_bits=WOQ_DEFAULT_ACT_BITS,
alias=["ipex"],
requirements=["intel-extension-for-pytorch>=2.5"],
requirements=["torch>=2.8.0", "auto_round_kernel"],
systems=["linux"],
)

BackendInfos["ipex_awq_cpu"] = BackendInfo(
BackendInfos["auto_round_kernel_awq"] = BackendInfo(
device=["cpu"],
sym=[True, False],
packing_format=AWQ_FORMAT,
bits=[4],
group_size=None,
priority=5,
checkers=[],
compute_dtype=["float16", "bfloat16"],
priority=6,
checkers=[ark_feature_checker],
alias=["ark"],
compute_dtype=["float32", "float16"],
data_type=["int"],
act_bits=WOQ_DEFAULT_ACT_BITS,
alias=["ipex"],
requirements=["torch<2.9", "intel-extension-for-pytorch>=2.5"],
requirements=["torch>=2.9.0", "auto_round_kernel"],
systems=["linux"],
)


BackendInfos["ipex_awq"] = BackendInfo(
BackendInfos["auto_round_kernel_awq_xpu"] = BackendInfo(
device=["xpu"],
sym=[True, False],
sym=[True],
packing_format=AWQ_FORMAT,
bits=[4],
group_size=None,
priority=5,
checkers=[],
compute_dtype=["float16", "bfloat16"],
priority=6,
checkers=[ark_feature_checker],
alias=["ark"],
compute_dtype=["float32", "float16"],
data_type=["int"],
act_bits=WOQ_DEFAULT_ACT_BITS,
alias=["ipex"],
requirements=["intel-extension-for-pytorch>=2.5"],
requirements=["torch>=2.8.0", "auto_round_kernel"],
systems=["linux"],
)

# BackendInfos["ipex_gptq_cpu"] = BackendInfo(
# device=["cpu"],
# sym=[True, False],
# packing_format=GPTQ_FORMAT,
# bits=[4],
# group_size=None,
# priority=5,
# checkers=[],
# compute_dtype=["float16", "bfloat16"],
# data_type=["int"],
# act_bits=WOQ_DEFAULT_ACT_BITS,
# alias=["ipex"],
# requirements=["torch<2.9", "intel-extension-for-pytorch>=2.5"],
# )

# BackendInfos["ipex_gptq"] = BackendInfo(
# device=["xpu"],
# sym=[True, False],
# packing_format=GPTQ_FORMAT,
# bits=[4],
# group_size=None,
# priority=5,
# checkers=[],
# compute_dtype=["float16", "bfloat16"],
# data_type=["int"],
# act_bits=WOQ_DEFAULT_ACT_BITS,
# alias=["ipex"],
# requirements=["intel-extension-for-pytorch>=2.5"],
# )

# BackendInfos["ipex_awq_cpu"] = BackendInfo(
# device=["cpu"],
# sym=[True, False],
# packing_format=AWQ_FORMAT,
# bits=[4],
# group_size=None,
# priority=5,
# checkers=[],
# compute_dtype=["float16", "bfloat16"],
# data_type=["int"],
# act_bits=WOQ_DEFAULT_ACT_BITS,
# alias=["ipex"],
# requirements=["torch<2.9", "intel-extension-for-pytorch>=2.5"],
# )


# BackendInfos["ipex_awq"] = BackendInfo(
# device=["xpu"],
# sym=[True, False],
# packing_format=AWQ_FORMAT,
# bits=[4],
# group_size=None,
# priority=5,
# checkers=[],
# compute_dtype=["float16", "bfloat16"],
# data_type=["int"],
# act_bits=WOQ_DEFAULT_ACT_BITS,
# alias=["ipex"],
# requirements=["intel-extension-for-pytorch>=2.5"],
# )
BackendInfos["hpu"] = BackendInfo(
device=["hpu"],
sym=[True, False],
Expand Down
58 changes: 58 additions & 0 deletions auto_round_extension/ark/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
## What is AutoRound Kernel?
[TODO]

## Key Features
[TODO]

## Installation
### Install via pip
```bash
# install the latest auto-round-kernel version and this cmd will update your local pytorch version if needed
pip install auto-round-kernel
# or install together with a specific pytorch version to install the corresponding auto-round-kernel version, e.g., for torch 2.8.x
pip install auto-round-kernel torch~=2.8.0
```

<details>
<summary>Other Installation Methods</summary>

### Install via Script
```bash
curl -fsSL https://raw.githubusercontent.com/intel/auto-round/main/auto_round_extension/ark/install_kernel.py -o install_kernel.py
python3 install_kernel.py
```
**Notes:**
We recommend this method if you want to keep your current PyTorch and auto-round versions.
This installation script will detect the current environment and install the corresponding auto-round-kernel version.

### Install via auto_round
```bash
pip install auto-round
auto-round-kernel-install
```

</details>

### Versioning Scheme
The version number of auto-round-kernel follows the format:
`{auto-round major version}.{auto-round minor version}.{oneAPI version}.{kernel version}`

**For example: v0.9.1.1**
- The first two digits (0.9) correspond to the major and minor version of the auto_round framework.
- The third digit (1) encodes the supported Intel oneAPI and PyTorch versions: `1` indicates support for oneAPI 2025.1 and torch 2.8, while `2` indicates support for oneAPI 2025.2 and torch 2.9.
- The final digit (1) is the patch version of auto-round-kernel, reflecting updates, bug fixes, or improvements to the kernel package itself.

**Version mapping table**

| auto-round-kernel Version | auto-round Version | oneAPI Version | Supported PyTorch Version |
|:-------------------------:|:------------------:|:--------------:|:-------------------------:|
| 0.9.1.x | 0.9.x | 2025.1 | 2.8.x |
| 0.9.2.x | 0.9.x | 2025.2 | 2.9.x |

**Notes:** The oneAPI version is aligned with the PyTorch version when the auto-round-kernel binary is built, but oneAPI is not required at runtime.

### Validated Hardware Environment
#### CPU based on [Intel 64 architecture or compatible processors](https://en.wikipedia.org/wiki/X86-64):
* Intel Xeon Scalable processor (Granite Rapids)
#### GPU built on Intel's Xe architecture:
* Intel Arc B-Series Graphics (Battlemage)
60 changes: 60 additions & 0 deletions auto_round_extension/ark/install_kernel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# Copyright (c) 2026 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re
import subprocess
import sys


def get_torch_minor():
    """Return the installed torch version as "major.minor" (e.g. "2.8"), or None.

    Returns None when torch is not importable or its version string does not
    start with "<major>.<minor>".
    """
    try:
        import torch
    except ImportError:
        # torch is optional here; the caller falls back to a generic install.
        return None
    match = re.match(r"^(\d+)\.(\d+)", torch.__version__)
    # group(0) is the full "major.minor" prefix matched by the pattern.
    return match.group(0) if match else None


def get_auto_round_minor():
    """Return the installed auto_round version as "major.minor", or None.

    Returns None when auto_round is not importable or its version string does
    not start with "<major>.<minor>".
    """
    try:
        import auto_round
    except ImportError:
        # auto_round may not be installed yet; callers substitute a default.
        return None
    match = re.match(r"^(\d+)\.(\d+)", auto_round.__version__)
    # group(0) is the full "major.minor" prefix matched by the pattern.
    return match.group(0) if match else None


# Resolve the auto_round "major.minor" version, falling back to "0.9" when
# auto_round is not importable in this environment.
auto_round_minor = get_auto_round_minor()
if auto_round_minor is None:
    auto_round_minor = "0.9"

# Map a torch minor release to the pip requirement of the matching kernel
# build (third version digit encodes the oneAPI/torch pairing — see README).
KERNEL_MAP = {
    "2.8": f"auto-round-kernel~={auto_round_minor}.1.0",
    "2.9": f"auto-round-kernel~={auto_round_minor}.2.0",
}


def main():
    """Install the auto-round-kernel wheel matching the local torch build.

    Looks up the detected torch "major.minor" in KERNEL_MAP; on a hit, pins the
    matching kernel version (without upgrading already-satisfied deps such as
    torch itself). Otherwise installs the latest auto-round-kernel.
    """
    torch_minor = get_torch_minor()
    pkg = KERNEL_MAP.get(torch_minor) if torch_minor else None
    if pkg is None:
        print("torch not found or no mapping for your version. Installing the latest auto-round-kernel ...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", "auto-round-kernel"])
    else:
        print(f"Detected torch {torch_minor}, installing {pkg} ...")
        # --upgrade-strategy only-if-needed keeps the existing torch untouched.
        subprocess.check_call(
            [sys.executable, "-m", "pip", "install", pkg, "--upgrade-strategy", "only-if-needed"]
        )


# Entry point for direct invocation: `python install_kernel.py`.
if __name__ == "__main__":
    main()
Loading
Loading