3 changes: 1 addition & 2 deletions .azure-pipelines/scripts/ut/run_ut_xpu.sh
@@ -4,8 +4,7 @@ set -xe
 # install requirements
 echo "##[group]set up UT env..."
 uv pip install pytest-cov pytest-html
-uv pip install -r /auto-round/test/test_ark/requirements.txt \
-    --extra-index-url https://download.pytorch.org/whl/xpu
+uv pip install -r /auto-round/test/test_ark/requirements.txt

 cd /auto-round && uv pip install .
 echo "##[endgroup]"
98 changes: 46 additions & 52 deletions auto_round/inference/backend.py
@@ -439,21 +439,20 @@ def fp8_static_scheme_checker(
     requirements=["autoawq", "transformers"],
 )
 
-# BackendInfos["auto_round_kernel"] = BackendInfo(
-#     device=["cpu"],
-#     sym=[True, False],
-#     packing_format=GPTQ_FORMAT_NO_ZP,
-#     bits=[2, 4, 8],
-#     group_size=None,
-#     priority=6,
-#     checkers=[ark_feature_checker],
-#     alias=["ark"],
-#     compute_dtype=["float32", "float16"],
-#     data_type=["int"],
-#     act_bits=WOQ_DEFAULT_ACT_BITS,
-#     requirements=["torch>=2.9.0", "auto_round_kernel"],
-#     systems=["linux"],
-# )
+BackendInfos["auto_round_kernel"] = BackendInfo(
+    device=["cpu"],
+    sym=[True, False],
+    packing_format=GPTQ_FORMAT_NO_ZP,
+    bits=[2, 4, 8],
+    group_size=None,
+    priority=6,
+    checkers=[ark_feature_checker],
+    alias=["ark"],
+    compute_dtype=["float32", "float16"],
+    data_type=["int"],
+    act_bits=WOQ_DEFAULT_ACT_BITS,
+    requirements=["torch>=2.8.0", "auto_round_kernel"],
+)
 
 BackendInfos["auto_round_kernel_xpu"] = BackendInfo(
     device=["xpu"],
@@ -467,25 +466,23 @@ def fp8_static_scheme_checker(
     compute_dtype=["float32", "float16"],
     data_type=["int"],
     act_bits=WOQ_DEFAULT_ACT_BITS,
-    requirements=["torch>=2.9.0", "auto_round_kernel"],
-    systems=["linux"],
+    requirements=["torch>=2.8.0", "auto_round_kernel"],
 )
 
-# BackendInfos["auto_round_kernel_zp"] = BackendInfo(
-#     device=["cpu"],
-#     sym=[True, False],
-#     packing_format=GPTQ_FORMAT,
-#     bits=[2, 4, 8],
-#     group_size=None,
-#     priority=6,
-#     checkers=[ark_feature_checker],
-#     alias=["ark"],
-#     compute_dtype=["float32", "float16"],
-#     data_type=["int"],
-#     act_bits=WOQ_DEFAULT_ACT_BITS,
-#     requirements=["torch>=2.9.0", "auto_round_kernel"],
-#     systems=["linux"],
-# )
+BackendInfos["auto_round_kernel_zp"] = BackendInfo(
+    device=["cpu"],
+    sym=[True, False],
+    packing_format=GPTQ_FORMAT,
+    bits=[2, 4, 8],
+    group_size=None,
+    priority=6,
+    checkers=[ark_feature_checker],
+    alias=["ark"],
+    compute_dtype=["float32", "float16"],
+    data_type=["int"],
+    act_bits=WOQ_DEFAULT_ACT_BITS,
+    requirements=["torch>=2.8.0", "auto_round_kernel"],
+)
 
 BackendInfos["auto_round_kernel_zp_xpu"] = BackendInfo(
     device=["xpu"],
@@ -499,40 +496,37 @@ def fp8_static_scheme_checker(
     compute_dtype=["float32", "float16"],
     data_type=["int"],
     act_bits=WOQ_DEFAULT_ACT_BITS,
-    requirements=["torch>=2.9.0", "auto_round_kernel"],
-    systems=["linux"],
+    requirements=["torch>=2.8.0", "auto_round_kernel"],
 )
 
-# BackendInfos["auto_round_kernel_awq"] = BackendInfo(
-#     device=["cpu"],
-#     sym=[True, False],
-#     packing_format=AWQ_FORMAT,
-#     bits=[2, 4, 8],
-#     group_size=None,
-#     priority=6,
-#     checkers=[ark_feature_checker],
-#     alias=["ark"],
-#     compute_dtype=["float32", "float16"],
-#     data_type=["int"],
-#     act_bits=WOQ_DEFAULT_ACT_BITS,
-#     requirements=["torch>=2.9.0", "auto_round_kernel"],
-#     systems=["linux"],
-# )
+BackendInfos["auto_round_kernel_awq"] = BackendInfo(
+    device=["cpu"],
+    sym=[True, False],
+    packing_format=AWQ_FORMAT,
+    bits=[4],
+    group_size=None,
+    priority=6,
+    checkers=[ark_feature_checker],
+    alias=["ark"],
+    compute_dtype=["float32", "float16"],
+    data_type=["int"],
+    act_bits=WOQ_DEFAULT_ACT_BITS,
+    requirements=["torch>=2.8.0", "auto_round_kernel"],
+)
 
 BackendInfos["auto_round_kernel_awq_xpu"] = BackendInfo(
     device=["xpu"],
     sym=[True],
     packing_format=AWQ_FORMAT,
-    bits=[4, 8],
+    bits=[4],
     group_size=None,
     priority=6,
     checkers=[ark_feature_checker],
     alias=["ark"],
    compute_dtype=["float32", "float16"],
     data_type=["int"],
     act_bits=WOQ_DEFAULT_ACT_BITS,
-    requirements=["torch>=2.9.0", "auto_round_kernel"],
-    systems=["linux"],
+    requirements=["torch>=2.8.0", "auto_round_kernel"],
 )
 
 BackendInfos["ipex_gptq_cpu"] = BackendInfo(
90 changes: 90 additions & 0 deletions auto_round_extension/ark/README.md
@@ -0,0 +1,90 @@
## What is AutoRound Kernel?
AutoRound Kernel is a low-bit acceleration library for Intel platforms.

The kernels are optimized for the following CPUs:
* Intel Xeon Scalable processors (codenamed Sapphire Rapids and Emerald Rapids)
* Intel Xeon 6 processors (codenamed Sierra Forest and Granite Rapids)

The kernels are optimized for the following GPUs:
* Intel Arc B-Series Graphics and Intel Arc Pro B-Series Graphics (codenamed Battlemage)

## Key Features
AutoRound Kernel provides weight-only quantized linear computation for LLM inference. The supported weight-only-quantization configurations are listed in the tables below:
### CPU
| Weight dtype | Compute dtype | Scale dtype | Algorithm<sup>[1]</sup> |
| ---------------------- | :----------------: | :---------------: | :--------: |
| INT8 | INT8<sup>[2]</sup> / BF16 / FP32 | BF16 / FP32 | sym / asym |
| INT7 | INT8 / BF16 / FP32 | BF16 / FP32 | sym / asym |
| INT6 | INT8 / BF16 / FP32 | BF16 / FP32 | sym / asym |
| INT5 | INT8 / BF16 / FP32 | BF16 / FP32 | sym / asym |
| INT4 | INT8 / BF16 / FP32 | BF16 / FP32 | sym / asym |
| INT3 | INT8 / BF16 / FP32 | BF16 / FP32 | sym / asym |
| INT2 | INT8 / BF16 / FP32 | BF16 / FP32 | sym / asym |
| INT1 | INT8 / BF16 / FP32 | BF16 / FP32 | sym / asym |
| FP8 (E4M3, E5M2) | BF16 / FP32 | FP32 / FP8 (E8M0) | NA |
| FP4 (E2M1) | BF16 / FP32 | BF16 / FP32 | NA |

### XPU
| Weight dtype | Compute dtype | Scale dtype | Algorithm |
| ---------------------- | :----------------: | :---------------: | :--------: |
| INT8 | INT8 / FP16 | FP16 | sym |
| INT4 | INT8 / FP16 | FP16 | sym |
| FP8 (E4M3, E5M2) | FP16 | FP16 / FP8 (E8M0) | NA |

<sup>[1]</sup> Quantization algorithm for integer types: symmetric or asymmetric.

<sup>[2]</sup> Includes dynamic activation quantization; results are dequantized to floating-point formats.
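
As a usage sketch: once a model has been quantized and exported by auto-round in one of the supported formats, it can be served through these kernels by selecting the `ark` backend alias at load time. The snippet below assumes the standard auto-round + transformers inference flow; the model id is a placeholder, not a published checkpoint:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from auto_round import AutoRoundConfig  # registers AutoRound quantized-model loading

# Route quantized linear layers through AutoRound Kernel via the "ark" alias.
quantization_config = AutoRoundConfig(backend="ark")

model_id = "your-org/Llama-3.2-1B-int4-autoround"  # placeholder: any auto-round INT4 checkpoint
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="cpu",  # or "xpu" on Intel Arc B-Series Graphics
    quantization_config=quantization_config,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

inputs = tokenizer("AutoRound Kernel is", return_tensors="pt").to(model.device)
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=32)[0]))
```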


## Installation
### Install via pip
```bash
# Install the latest auto-round-kernel; this may automatically upgrade your PyTorch version
pip install auto-round-kernel
# Install auto-round-kernel pinned to a specific PyTorch version (e.g., v2.8.x)
pip install auto-round-kernel torch~=2.8.0
```

<details>
<summary>Other Installation Methods</summary>

### Install via Script
```bash
curl -fsSL https://raw.githubusercontent.com/intel/auto-round/main/auto_round_extension/ark/install_kernel.py -o install_kernel.py
python3 install_kernel.py
```
**Notes:**
This method is recommended if you want to keep your current PyTorch and auto-round versions.
The script detects the current environment and installs the matching auto-round-kernel version.

### Install via auto_round
```bash
pip install auto-round
auto-round-kernel-install
```

</details>

### Versioning Scheme
The version number of auto-round-kernel follows the format:
`{auto-round major version}.{auto-round minor version}.{oneAPI version}.{kernel version}`

**For example: v0.9.1.1**
- The first two fields (0.9) correspond to the major and minor version of the auto-round framework.
- The third field (1) encodes the Intel oneAPI release: `1` indicates support for oneAPI 2025.1 (typically Torch 2.8), `2` indicates support for oneAPI 2025.2 (typically Torch 2.9).
- The final field (1) is the patch version of auto-round-kernel, reflecting updates, bug fixes, or improvements to the kernel package itself.

**Version mapping table**

| auto-round-kernel Version | auto-round Version | oneAPI Version | Typical PyTorch Version |
|:-------------------------:|:------------------:|:--------------:|:-------------------------:|
| 0.9.1.x | 0.9.x | 2025.1 | 2.8.x |
| 0.9.2.x | 0.9.x | 2025.2 | 2.9.x |

**Notes:** the oneAPI version is aligned with the PyTorch version when the auto-round-kernel binaries are built, but the oneAPI toolkit is not required at runtime.
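
Reading the table concretely: an environment on auto-round 0.9.x with Torch 2.8.x should use a 0.9.1.x kernel build. A minimal sketch of pinning that combination by hand (the install script referenced above derives the same constraint automatically):

```bash
# auto-round 0.9.x + Torch 2.8.x (oneAPI 2025.1) -> auto-round-kernel 0.9.1.x
pip install "auto-round-kernel~=0.9.1.0" "torch~=2.8.0"
```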

### Validated Hardware Environment
#### CPU based on [Intel 64 architecture or compatible processors](https://en.wikipedia.org/wiki/X86-64):
* Intel Xeon 6 processors (Granite Rapids)
#### GPU built on Intel's Xe architecture:
* Intel Arc B-Series Graphics (Battlemage)
60 changes: 60 additions & 0 deletions auto_round_extension/ark/install_kernel.py
@@ -0,0 +1,60 @@
# Copyright (c) 2026 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re
import subprocess
import sys


def get_torch_minor():
    """Return the installed torch version as "major.minor", or None if torch is missing."""
    try:
        import torch

        m = re.match(r"^(\d+)\.(\d+)", torch.__version__)
        return f"{m.group(1)}.{m.group(2)}" if m else None
    except ImportError:
        return None


def get_auto_round_minor():
    """Return the installed auto_round version as "major.minor", or None if it is missing."""
    try:
        import auto_round

        m = re.match(r"^(\d+)\.(\d+)", auto_round.__version__)
        return f"{m.group(1)}.{m.group(2)}" if m else None
    except ImportError:
        return None


# Map the torch minor version to a compatible kernel version constraint;
# fall back to auto-round 0.9 when auto_round is not installed.
auto_round_minor = "0.9" if get_auto_round_minor() is None else get_auto_round_minor()
KERNEL_MAP = {
    "2.8": f"auto-round-kernel~={auto_round_minor}.1.0",  # oneAPI 2025.1 builds
    "2.9": f"auto-round-kernel~={auto_round_minor}.2.0",  # oneAPI 2025.2 builds
}


def main():
    torch_minor = get_torch_minor()
    if torch_minor and torch_minor in KERNEL_MAP:
        pkg = KERNEL_MAP[torch_minor]
        print(f"Detected torch {torch_minor}, installing {pkg} ...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", pkg, "--upgrade-strategy", "only-if-needed"])
    else:
        print("torch not found or no mapping for your version. Installing the latest auto-round-kernel ...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", "auto-round-kernel"])


if __name__ == "__main__":
    main()