3 changes: 1 addition & 2 deletions .azure-pipelines/scripts/ut/run_ut_xpu.sh
@@ -4,8 +4,7 @@ set -xe
 # install requirements
 echo "##[group]set up UT env..."
 uv pip install pytest-cov pytest-html
-uv pip install -r /auto-round/test/test_ark/requirements.txt \
-    --extra-index-url https://download.pytorch.org/whl/xpu
+uv pip install -r /auto-round/test/test_ark/requirements.txt

 cd /auto-round && uv pip install .
 echo "##[endgroup]"
98 changes: 46 additions & 52 deletions auto_round/inference/backend.py
@@ -439,21 +439,20 @@ def fp8_static_scheme_checker(
     requirements=["autoawq", "transformers"],
 )
 
-# BackendInfos["auto_round_kernel"] = BackendInfo(
-#     device=["cpu"],
-#     sym=[True, False],
-#     packing_format=GPTQ_FORMAT_NO_ZP,
-#     bits=[2, 4, 8],
-#     group_size=None,
-#     priority=6,
-#     checkers=[ark_feature_checker],
-#     alias=["ark"],
-#     compute_dtype=["float32", "float16"],
-#     data_type=["int"],
-#     act_bits=WOQ_DEFAULT_ACT_BITS,
-#     requirements=["torch>=2.9.0", "auto_round_kernel"],
-#     systems=["linux"],
-# )
+BackendInfos["auto_round_kernel"] = BackendInfo(
+    device=["cpu"],
+    sym=[True, False],
+    packing_format=GPTQ_FORMAT_NO_ZP,
+    bits=[2, 4, 8],
+    group_size=None,
+    priority=6,
+    checkers=[ark_feature_checker],
+    alias=["ark"],
+    compute_dtype=["float32", "float16"],
+    data_type=["int"],
+    act_bits=WOQ_DEFAULT_ACT_BITS,
+    requirements=["torch>=2.8.0", "auto_round_kernel"],
+)
 
 BackendInfos["auto_round_kernel_xpu"] = BackendInfo(
     device=["xpu"],
@@ -467,25 +466,23 @@ def fp8_static_scheme_checker(
     compute_dtype=["float32", "float16"],
     data_type=["int"],
     act_bits=WOQ_DEFAULT_ACT_BITS,
-    requirements=["torch>=2.9.0", "auto_round_kernel"],
-    systems=["linux"],
+    requirements=["torch>=2.8.0", "auto_round_kernel"],
 )
 
-# BackendInfos["auto_round_kernel_zp"] = BackendInfo(
-#     device=["cpu"],
-#     sym=[True, False],
-#     packing_format=GPTQ_FORMAT,
-#     bits=[2, 4, 8],
-#     group_size=None,
-#     priority=6,
-#     checkers=[ark_feature_checker],
-#     alias=["ark"],
-#     compute_dtype=["float32", "float16"],
-#     data_type=["int"],
-#     act_bits=WOQ_DEFAULT_ACT_BITS,
-#     requirements=["torch>=2.9.0", "auto_round_kernel"],
-#     systems=["linux"],
-# )
+BackendInfos["auto_round_kernel_zp"] = BackendInfo(
+    device=["cpu"],
+    sym=[True, False],
+    packing_format=GPTQ_FORMAT,
+    bits=[2, 4, 8],
+    group_size=None,
+    priority=6,
+    checkers=[ark_feature_checker],
+    alias=["ark"],
+    compute_dtype=["float32", "float16"],
+    data_type=["int"],
+    act_bits=WOQ_DEFAULT_ACT_BITS,
+    requirements=["torch>=2.8.0", "auto_round_kernel"],
+)
 
 BackendInfos["auto_round_kernel_zp_xpu"] = BackendInfo(
     device=["xpu"],
@@ -499,40 +496,37 @@ def fp8_static_scheme_checker(
     compute_dtype=["float32", "float16"],
     data_type=["int"],
     act_bits=WOQ_DEFAULT_ACT_BITS,
-    requirements=["torch>=2.9.0", "auto_round_kernel"],
-    systems=["linux"],
+    requirements=["torch>=2.8.0", "auto_round_kernel"],
 )
 
-# BackendInfos["auto_round_kernel_awq"] = BackendInfo(
-#     device=["cpu"],
-#     sym=[True, False],
-#     packing_format=AWQ_FORMAT,
-#     bits=[2, 4, 8],
-#     group_size=None,
-#     priority=6,
-#     checkers=[ark_feature_checker],
-#     alias=["ark"],
-#     compute_dtype=["float32", "float16"],
-#     data_type=["int"],
-#     act_bits=WOQ_DEFAULT_ACT_BITS,
-#     requirements=["torch>=2.9.0", "auto_round_kernel"],
-#     systems=["linux"],
-# )
+BackendInfos["auto_round_kernel_awq"] = BackendInfo(
+    device=["cpu"],
+    sym=[True, False],
+    packing_format=AWQ_FORMAT,
+    bits=[4],
+    group_size=None,
+    priority=6,
+    checkers=[ark_feature_checker],
+    alias=["ark"],
+    compute_dtype=["float32", "float16"],
+    data_type=["int"],
+    act_bits=WOQ_DEFAULT_ACT_BITS,
+    requirements=["torch>=2.8.0", "auto_round_kernel"],
+)
 
 BackendInfos["auto_round_kernel_awq_xpu"] = BackendInfo(
     device=["xpu"],
     sym=[True],
     packing_format=AWQ_FORMAT,
-    bits=[4, 8],
+    bits=[4],
     group_size=None,
     priority=6,
     checkers=[ark_feature_checker],
     alias=["ark"],
    compute_dtype=["float32", "float16"],
     data_type=["int"],
     act_bits=WOQ_DEFAULT_ACT_BITS,
-    requirements=["torch>=2.9.0", "auto_round_kernel"],
-    systems=["linux"],
+    requirements=["torch>=2.8.0", "auto_round_kernel"],
 )
 
 BackendInfos["ipex_gptq_cpu"] = BackendInfo(
90 changes: 90 additions & 0 deletions auto_round_extension/ark/README.md
@@ -0,0 +1,90 @@
## What is AutoRound Kernel?
AutoRound Kernel is a low-bit acceleration library for Intel platforms.

The kernels are optimized for the following CPUs:
* Intel Xeon Scalable processors (codenamed Sapphire Rapids and Emerald Rapids)
* Intel Xeon 6 processors (codenamed Sierra Forest and Granite Rapids)

The kernels are optimized for the following GPUs:
* Intel Arc B-Series Graphics and Intel Arc Pro B-Series Graphics (codenamed Battlemage)

## Key Features
AutoRound Kernel provides weight-only quantized linear computation for LLM inference. The supported weight-only-quantization configurations are listed in the tables below:
### CPU
| Weight dtype | Compute dtype | Scale dtype | Algorithm<sup>[1]</sup> |
| ---------------------- | :----------------: | :---------------: | :--------: |
| INT8 | INT8<sup>[2]</sup> / BF16 / FP32 | BF16 / FP32 | sym / asym |
| INT7 | INT8 / BF16 / FP32 | BF16 / FP32 | sym / asym |
| INT6 | INT8 / BF16 / FP32 | BF16 / FP32 | sym / asym |
| INT5 | INT8 / BF16 / FP32 | BF16 / FP32 | sym / asym |
| INT4 | INT8 / BF16 / FP32 | BF16 / FP32 | sym / asym |
| INT3 | INT8 / BF16 / FP32 | BF16 / FP32 | sym / asym |
| INT2 | INT8 / BF16 / FP32 | BF16 / FP32 | sym / asym |
| INT1 | INT8 / BF16 / FP32 | BF16 / FP32 | sym / asym |
| FP8 (E4M3, E5M2) | BF16 / FP32 | FP32 / FP8 (E8M0) | NA |
| FP4 (E2M1) | BF16 / FP32 | BF16 / FP32 | NA |

### XPU
| Weight dtype | Compute dtype | Scale dtype | Algorithm |
| ---------------------- | :----------------: | :---------------: | :--------: |
| INT8 | INT8 / FP16 | FP16 | sym |
| INT4 | INT8 / FP16 | FP16 | sym |
| FP8 (E4M3, E5M2) | FP16 | FP16 / FP8 (E8M0) | NA |

<sup>[1]</sup> Quantization algorithm for integer types: symmetric or asymmetric.

<sup>[2]</sup> Includes dynamic activation quantization; results are dequantized to floating-point formats.
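
As a usage sketch: once a model has been quantized and exported by auto-round in one of the supported formats, it can be served through these kernels by selecting the `ark` backend alias at load time. The snippet below assumes the standard auto-round + transformers inference flow; the model id is a placeholder, not a published checkpoint:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from auto_round import AutoRoundConfig  # registers AutoRound quantized-model loading

# Route quantized linear layers through AutoRound Kernel via the "ark" alias.
quantization_config = AutoRoundConfig(backend="ark")

model_id = "your-org/Llama-3.2-1B-int4-autoround"  # placeholder: any auto-round INT4 checkpoint
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="cpu",  # or "xpu" on Intel Arc B-Series Graphics
    quantization_config=quantization_config,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

inputs = tokenizer("AutoRound Kernel is", return_tensors="pt").to(model.device)
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=32)[0]))
```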


## Installation
### Install via pip
```bash
# Install the latest auto-round-kernel; this may automatically upgrade your PyTorch version
pip install auto-round-kernel
# Install auto-round-kernel pinned to a specific PyTorch version (e.g., v2.8.x)
pip install auto-round-kernel torch~=2.8.0
```

<details>
<summary>Other Installation Methods</summary>

### Install via Script
```bash
curl -fsSL https://raw.githubusercontent.com/intel/auto-round/main/auto_round_extension/ark/install_kernel.py -o install_kernel.py
python3 install_kernel.py
```
**Notes:**
This method is recommended if you want to keep your current PyTorch and auto-round versions.
The script detects the current environment and installs the matching auto-round-kernel version.

### Install via auto_round
```bash
pip install auto-round
auto-round-kernel-install
```

</details>

### Versioning Scheme
The version number of auto-round-kernel follows the format:
`{auto-round major version}.{auto-round minor version}.{oneAPI version}.{kernel version}`

**For example: v0.9.1.1**
- The first two fields (0.9) correspond to the major and minor version of the auto-round framework.
- The third field (1) encodes the Intel oneAPI release: `1` indicates support for oneAPI 2025.1 (typically Torch 2.8), `2` indicates support for oneAPI 2025.2 (typically Torch 2.9).
- The final field (1) is the patch version of auto-round-kernel, reflecting updates, bug fixes, or improvements to the kernel package itself.

**Version mapping table**

| auto-round-kernel Version | auto-round Version | oneAPI Version | Typical PyTorch Version |
|:-------------------------:|:------------------:|:--------------:|:-------------------------:|
| 0.9.1.x | 0.9.x | 2025.1 | 2.8.x |
| 0.9.2.x | 0.9.x | 2025.2 | 2.9.x |

**Notes:** the oneAPI version is aligned with the PyTorch version when the auto-round-kernel binaries are built, but the oneAPI toolkit is not required at runtime.
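
Reading the table concretely: an environment on auto-round 0.9.x with Torch 2.8.x should use a 0.9.1.x kernel build. A minimal sketch of pinning that combination by hand (the install script referenced above derives the same constraint automatically):

```bash
# auto-round 0.9.x + Torch 2.8.x (oneAPI 2025.1) -> auto-round-kernel 0.9.1.x
pip install "auto-round-kernel~=0.9.1.0" "torch~=2.8.0"
```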

### Validated Hardware Environment
#### CPU based on [Intel 64 architecture or compatible processors](https://en.wikipedia.org/wiki/X86-64):
* Intel Xeon 6 processors (Granite Rapids)
#### GPU built on Intel's Xe architecture:
* Intel Arc B-Series Graphics (Battlemage)
60 changes: 60 additions & 0 deletions auto_round_extension/ark/install_kernel.py
@@ -0,0 +1,60 @@
# Copyright (c) 2026 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re
import subprocess
import sys


def get_torch_minor():
    """Return the installed torch version as "major.minor", or None if torch is missing."""
    try:
        import torch

        m = re.match(r"^(\d+)\.(\d+)", torch.__version__)
        return f"{m.group(1)}.{m.group(2)}" if m else None
    except ImportError:
        return None


def get_auto_round_minor():
    """Return the installed auto_round version as "major.minor", or None if it is missing."""
    try:
        import auto_round

        m = re.match(r"^(\d+)\.(\d+)", auto_round.__version__)
        return f"{m.group(1)}.{m.group(2)}" if m else None
    except ImportError:
        return None


# Map the torch minor version to a compatible kernel version constraint;
# fall back to auto-round 0.9 when auto_round is not installed.
auto_round_minor = "0.9" if get_auto_round_minor() is None else get_auto_round_minor()
KERNEL_MAP = {
    "2.8": f"auto-round-kernel~={auto_round_minor}.1.0",  # oneAPI 2025.1 builds
    "2.9": f"auto-round-kernel~={auto_round_minor}.2.0",  # oneAPI 2025.2 builds
}


def main():
    torch_minor = get_torch_minor()
    if torch_minor and torch_minor in KERNEL_MAP:
        pkg = KERNEL_MAP[torch_minor]
        print(f"Detected torch {torch_minor}, installing {pkg} ...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", pkg, "--upgrade-strategy", "only-if-needed"])
    else:
        print("torch not found or no mapping for your version. Installing the latest auto-round-kernel ...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", "auto-round-kernel"])


if __name__ == "__main__":
    main()