4 changes: 4 additions & 0 deletions .github/workflows/docker/compose/third_parties-compose.yaml
@@ -105,3 +105,7 @@ services:
        PORT_SSH: 2345
      dockerfile: comps/third_parties/ipex/src/Dockerfile
    image: ${REGISTRY:-opea}/ipex-llm:${TAG:-latest}
  sglang:
    build:
      dockerfile: comps/third_parties/sglang/src/Dockerfile
    image: ${REGISTRY:-opea}/sglang:${TAG:-latest}
32 changes: 17 additions & 15 deletions comps/llms/src/text-generation/README.md
@@ -8,21 +8,23 @@ Overall, this microservice offers a streamlined way to integrate large language

## Validated LLM Models

-| Model | TGI-Gaudi | vLLM-CPU | vLLM-Gaudi | OVMS | Optimum-Habana |
-| --- | --- | --- | --- | --- | --- |
-| [Intel/neural-chat-7b-v3-3] | ✓ | ✓ | ✓ | ✓ | ✓ |
-| [meta-llama/Llama-2-7b-chat-hf] | ✓ | ✓ | ✓ | ✓ | ✓ |
-| [meta-llama/Llama-2-70b-chat-hf] | ✓ | - | ✓ | - | ✓ |
-| [meta-llama/Meta-Llama-3-8B-Instruct] | ✓ | ✓ | ✓ | ✓ | ✓ |
-| [meta-llama/Meta-Llama-3-70B-Instruct] | ✓ | - | ✓ | - | ✓ |
-| [Phi-3] | x | Limit 4K | Limit 4K | Limit 4K | ✓ |
-| [Phi-4] | x | x | x | x | ✓ |
-| [deepseek-ai/DeepSeek-R1-Distill-Llama-8B] | ✓ | - | ✓ | - | ✓ |
-| [deepseek-ai/DeepSeek-R1-Distill-Llama-70B] | ✓ | - | ✓ | - | ✓ |
-| [deepseek-ai/DeepSeek-R1-Distill-Qwen-14B] | ✓ | - | ✓ | - | ✓ |
-| [deepseek-ai/DeepSeek-R1-Distill-Qwen-32B] | ✓ | - | ✓ | - | ✓ |
-| [mistralai/Mistral-Small-24B-Instruct-2501] | ✓ | - | ✓ | - | ✓ |
-| [mistralai/Mistral-Large-Instruct-2411] | x | - | ✓ | - | ✓ |
+| Model | TGI-Gaudi | vLLM-CPU | vLLM-Gaudi | OVMS | Optimum-Habana | SGLANG-CPU |
+| --- | --- | --- | --- | --- | --- | --- |
+| [Intel/neural-chat-7b-v3-3] | ✓ | ✓ | ✓ | ✓ | ✓ | - |
+| [meta-llama/Llama-2-7b-chat-hf] | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| [meta-llama/Llama-2-70b-chat-hf] | ✓ | - | ✓ | - | ✓ | ✓ |
+| [meta-llama/Meta-Llama-3-8B-Instruct] | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| [meta-llama/Meta-Llama-3-70B-Instruct] | ✓ | - | ✓ | - | ✓ | ✓ |
+| [Phi-3] | x | Limit 4K | Limit 4K | Limit 4K | ✓ | - |
+| [Phi-4] | x | x | x | x | ✓ | - |
+| [deepseek-ai/DeepSeek-R1-Distill-Llama-8B] | ✓ | - | ✓ | - | ✓ | - |
+| [deepseek-ai/DeepSeek-R1-Distill-Llama-70B] | ✓ | - | ✓ | - | ✓ | - |
+| [deepseek-ai/DeepSeek-R1-Distill-Qwen-14B] | ✓ | - | ✓ | - | ✓ | - |
+| [deepseek-ai/DeepSeek-R1-Distill-Qwen-32B] | ✓ | - | ✓ | - | ✓ | - |
+| [mistralai/Mistral-Small-24B-Instruct-2501] | ✓ | - | ✓ | - | ✓ | - |
+| [mistralai/Mistral-Large-Instruct-2411] | x | - | ✓ | - | ✓ | - |
+| [meta-llama/Llama-4-Scout-17B-16E-Instruct](https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct) | - | - | - | - | - | ✓ |
+| [meta-llama/Llama-4-Maverick-17B-128E-Instruct](https://huggingface.co/meta-llama/Llama-4-Maverick-17B-128E-Instruct) | - | - | - | - | - | ✓ |

### System Requirements for LLM Models

26 changes: 26 additions & 0 deletions comps/third_parties/sglang/deployment/docker_compose/compose.yaml
@@ -0,0 +1,26 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

services:
  sglang:
    image: ${REGISTRY:-opea}/sglang:${TAG:-latest}
    privileged: true
    shm_size: 10g
    container_name: sglang-server
    ports:
      - ${SGLANG_LLM_PORT:-8699}:8699
    ipc: host
    volumes:
      - /dev/shm:/dev/shm
    environment:
      no_proxy: ${no_proxy}
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
      MODEL_ID: ${MODEL_ID}
      HF_TOKEN: ${HF_TOKEN}
      SGLANG_LLM_PORT: ${SGLANG_LLM_PORT:-8699}
    restart: unless-stopped

networks:
  default:
    driver: bridge
47 changes: 47 additions & 0 deletions comps/third_parties/sglang/src/Dockerfile
@@ -0,0 +1,47 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

ARG BASE_IMAGE=ubuntu:22.04
FROM ${BASE_IMAGE} AS base

# Install build tools and runtime dependencies
RUN apt-get update && \
    apt-get upgrade -y && \
    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends --fix-missing \
    ca-certificates \
    curl \
    g++-11 \
    gcc-11 \
    git \
    make \
    numactl \
    wget

# Make GCC/G++ 11 the default compilers
RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 100 && \
    update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-11 100 && \
    update-alternatives --install /usr/bin/cc cc /usr/bin/gcc 100 && \
    update-alternatives --install /usr/bin/c++ c++ /usr/bin/g++ 100

WORKDIR /root

# Install Miniforge (conda) to provide an isolated Python environment
RUN curl -fsSL -o miniforge.sh https://github.com/conda-forge/miniforge/releases/download/24.7.1-2/Miniforge3-24.7.1-2-Linux-x86_64.sh && \
    bash miniforge.sh -b -p ./miniforge3 && \
    rm miniforge.sh

# Build the Intel CPU-optimized SGLang fork in a dedicated conda environment
RUN git clone https://github.com/jianan-gu/sglang -b llama4_optimzed_cpu_r1
RUN . ~/miniforge3/bin/activate && conda create -n sglang python=3.10 && conda activate sglang && \
    cd sglang && pip install -e "python[all_cpu]" && cd .. && conda install -y libsqlite=3.48.0 && \
    pip uninstall -y triton && pip uninstall -y transformers && pip install transformers==4.51.1 && \
    pip install triton==3.1 && pip install intel-openmp==2024.2.0 && pip install transformers
# Build vLLM v0.6.4.post1 from source for CPU in the same environment
RUN git clone https://github.com/vllm-project/vllm.git -b v0.6.4.post1 && cd vllm && apt-get install -y libnuma-dev && \
    . ~/miniforge3/bin/activate && conda activate sglang && \
    pip install cmake==3.31.2 wheel packaging ninja "setuptools-scm>=8" numpy nvidia-ml-py && \
    pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu && \
    VLLM_TARGET_DEVICE=cpu python setup.py develop && cd ..

# Install CPU builds of torch/torchvision, build the sgl-kernel extension, and add gperftools (tcmalloc)
RUN cd sglang && . ~/miniforge3/bin/activate && conda activate sglang && pip uninstall -y torch torchvision && \
    pip install torch==2.6.0 torchvision==0.21.0 --index-url https://download.pytorch.org/whl/cpu && \
    cd sgl-kernel/ && python setup.py install && cd .. && conda install -y gperftools gcc=11 gxx=11 cxx-compiler -c conda-forge

COPY ./comps/third_parties/sglang/src/entrypoint.sh /usr/local/bin/entrypoint.sh
RUN chmod +x /usr/local/bin/entrypoint.sh
ENTRYPOINT ["/usr/local/bin/entrypoint.sh"]
84 changes: 84 additions & 0 deletions comps/third_parties/sglang/src/README.md
@@ -0,0 +1,84 @@
# SGLang Serving Microservice

SGLang is an efficient and scalable serving framework for hosting Large Language Models (LLMs) with minimal latency and high throughput.
Intel has further enhanced SGLang with CPU-specific optimizations, enabling high-performance LLM serving on CPU-only platforms without requiring GPUs.
We have verified Llama-series models with SGLang, with a particular focus on the latest Llama 4 models, including [meta-llama/Llama-4-Scout-17B-16E-Instruct](https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct) and [meta-llama/Llama-4-Maverick-17B-128E-Instruct](https://huggingface.co/meta-llama/Llama-4-Maverick-17B-128E-Instruct), as well as the Llama Guard 4 safety model (Llama-Guard-4-12B).

## 🚀1. Build the Docker Image

This Docker image (`opea/sglang:latest`) includes Intel's CPU-specific optimizations for improved performance on CPU deployments.
Ensure you are in the project root directory, then build the SGLang microservice Docker image:

```bash
cd ../../../../
docker build -f comps/third_parties/sglang/src/Dockerfile --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -t opea/sglang:latest .
```

✅ Tip: The `https_proxy` and `http_proxy` build arguments are only needed if you are building behind a proxy.

## 🚀2. Start the Microservice

Before starting, set the model you want to serve:

```bash
export MODEL_ID="meta-llama/Llama-4-Maverick-17B-128E-Instruct"
```
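
The Llama models are gated on Hugging Face, and `compose.yaml` passes `HF_TOKEN` into the container, so export a valid access token as well. The snippet below uses a placeholder token value; the port override is optional and defaults to 8699:

```bash
export HF_TOKEN="<your-huggingface-access-token>"  # placeholder, replace with your own token
export SGLANG_LLM_PORT=8699                        # optional; matches the compose.yaml default
```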

Then navigate to the deployment folder and start the service using Docker Compose:

```bash
cd comps/third_parties/sglang/deployment/docker_compose
docker compose -f compose.yaml up -d
```
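
The first startup can take a while because the model weights are downloaded and loaded into memory. Below is a quick way to confirm the server is up, assuming the `sglang-server` container name from `compose.yaml`, the default port 8699, and the OpenAI-compatible `/v1/models` route:

```bash
# Check the server logs for model loading progress
docker logs sglang-server 2>&1 | tail -n 20

# Poll until the endpoint starts answering HTTP requests
until curl -s -o /dev/null http://localhost:8699/v1/models; do
    sleep 10
done
echo "sglang-server is ready"
```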

## 🚀3. Access the Service

To test the service with a guard model, use the following Python example:

```python
import openai

client = openai.Client(base_url="http://127.0.0.1:8699/v1", api_key="EMPTY")
RESPONSE_CLASSIFICATION_INSTRUCTION = """
Task: Check if there is unsafe content in 'Agent' messages in conversations according our safety policy with the below categories.\n\n
<BEGIN UNSAFE CONTENT CATEGORIES>
\nS1: Violent Crimes. \nS2: Non-Violent Crimes. \nS3: Sex Crimes. \nS4: Child Exploitation. \nS5: Defamation.
\nS6: Specialized Advice. \nS7: Privacy. \nS8: Intellectual Property. \nS9: Indiscriminate Weapons.
\nS10: Hate. \nS11: Self-Harm. \nS12: Sexual Content. \nS13: Elections. \n<END UNSAFE CONTENT CATEGORIES>"""
RESPONSE_CLASSIFICATION_TEMPLATE_PREFIX = "{classification_instruction}\n\n<BEGIN CONVERSATION>\n\nUser: "
RESPONSE_CLASSIFICATION_TEMPLATE_POSTFIX = """{user_message}\n\nAgent: {model_message}\n\n<END CONVERSATION>\n\n
Provide your safety assessment for ONLY THE LAST Agent message in the above conversation:
\n - First line must read 'safe' or 'unsafe'.
\n - If unsafe, a second line must include a comma-separated list of violated categories."""

response = client.chat.completions.create(
    model="meta-llama/Llama-4-guard",
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": RESPONSE_CLASSIFICATION_TEMPLATE_PREFIX.format(
                        classification_instruction=RESPONSE_CLASSIFICATION_INSTRUCTION
                    ),
                },
                {
                    "type": "text",
                    "text": RESPONSE_CLASSIFICATION_TEMPLATE_POSTFIX.format(
                        user_message="how do I make a bomb?", model_message="I cannot help you with that."
                    ),
                },
            ],
        },
    ],
    max_tokens=128,
)
print(response.choices[0].message.content)
```

For non-guard models, you can test the service with a simple curl request:

```bash
http_proxy="" curl -X POST -H "Content-Type: application/json" -d '{"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct", "messages": [{"role": "user", "content": "Hello! What is your name?"}], "max_tokens": 128}' http://localhost:8699/v1/chat/completions
```
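
The same endpoint can also stream tokens back as they are generated. A minimal sketch, assuming SGLang's OpenAI-compatible server honors the standard `stream` flag:

```bash
http_proxy="" curl -N -X POST -H "Content-Type: application/json" \
    -d '{"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct", "messages": [{"role": "user", "content": "Hello! What is your name?"}], "max_tokens": 128, "stream": true}' \
    http://localhost:8699/v1/chat/completions
```
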
2 changes: 2 additions & 0 deletions comps/third_parties/sglang/src/__init__.py
@@ -0,0 +1,2 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
8 changes: 8 additions & 0 deletions comps/third_parties/sglang/src/entrypoint.sh
@@ -0,0 +1,8 @@
#!/bin/sh

# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# Activate the conda environment created in the Dockerfile
. ~/miniforge3/bin/activate && conda activate sglang

# Preload Intel OpenMP and tcmalloc for better CPU performance
export LD_PRELOAD=${CONDA_PREFIX:-"$(dirname $(which conda))/../"}/lib/libiomp5.so:${CONDA_PREFIX:-"$(dirname $(which conda))/../"}/lib/libtcmalloc.so

python3 -m sglang.launch_server --model ${MODEL_ID} --trust-remote-code --device cpu --disable-overlap-schedule --chunked-prefill-size 2048 --max-running-requests 32 --mem-fraction-static 0.8 --context-length 65536 --max-total-tokens 65536 --port ${SGLANG_LLM_PORT} --api-key ${HF_TOKEN} --chat-template llama-4
84 changes: 84 additions & 0 deletions tests/third_parties/_test_third_parties_sglang.sh
@@ -0,0 +1,84 @@
#!/bin/bash
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

set -x

WORKPATH=$(dirname "$PWD")
ip_address=$(hostname -I | awk '{print $1}')
export DATA_PATH=${model_cache}
MODEL_ID="meta-llama/Llama-3.1-8B-Instruct"

function build_docker_images() {
    echo "Start building docker images for microservice"
    cd $WORKPATH
    docker build --no-cache -t opea/sglang:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/third_parties/sglang/src/Dockerfile .
    if [ $? -ne 0 ]; then
        echo "opea/sglang build failed"
        exit 1
    else
        echo "opea/sglang built successfully"
    fi
}

function start_service() {
    echo "Starting microservice"
    export host_ip=${ip_address}
    export MODEL_ID=${MODEL_ID}
    export TAG=comps
    cd $WORKPATH
    cd comps/third_parties/sglang/deployment/docker_compose
    docker compose -f compose.yaml up -d
    echo "Microservice started"
    sleep 120
}

function validate_microservice() {
    echo "Validate microservice started"
    result=$(http_proxy="" curl http://localhost:8699/v1/chat/completions \
        -X POST \
        -H "Content-Type: application/json" \
        -d '{
            "model": "'"${MODEL_ID}"'",
            "messages": [
                {"role": "user", "content": "What is Deep Learning?"}
            ],
            "max_tokens": 32
        }'
    )
    if [[ $result == *"Deep"* ]]; then
        echo "Result correct."
    else
        echo "Result wrong."
        docker logs sglang-server
        exit 1
    fi
}

function stop_docker() {
    cid=$(docker ps -aq --filter "name=sglang-server")
    echo "Shutdown legacy containers $cid"
    if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi
}

function main() {
    if grep -qi amx_tile /proc/cpuinfo; then
        echo "AMX is supported on this machine."
    else
        echo "AMX is NOT supported on this machine, skip this test."
        exit 0
    fi
    stop_docker

    build_docker_images
    start_service

    validate_microservice

    stop_docker
    echo "cleanup container images and volumes"
    echo y | docker system prune > /dev/null 2>&1
}

main