Add sglang microservice for supporting llama4 model #1629
Merged

Changes from all commits (21 commits):
- 5fd486d: added sglang microservice for llama4 (XinyuYe-Intel)
- c074645: [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
- 77c94ff: Update third_parties-compose.yaml (lvliang-intel)
- b3ba3ae: Update README.md (lvliang-intel)
- e36df17: [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
- 74c8134: Update README.md (lvliang-intel)
- 5090d05: [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
- fe25088: Update README.md (lvliang-intel)
- 332f798: [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
- f7f2fff: Merge branch 'main' into sglang (lvliang-intel)
- 5cbf75e: Update README.md (lvliang-intel)
- dc56528: Update README.md (lvliang-intel)
- 0315f70: Update README.md (lvliang-intel)
- 38826c2: added test. (XinyuYe-Intel)
- 783bd6e: Update test_third_parties_sglang_on_intel_cpu_bare.sh (lvliang-intel)
- 3f359af: Update test_third_parties_sglang_on_intel_cpu_bare.sh (lvliang-intel)
- fe1bc7d: rename test. (XinyuYe-Intel)
- 7079c33: Update compose.yaml (lvliang-intel)
- 72e8965: Merge branch 'main' into sglang (lvliang-intel)
- c7d7c96: Update compose.yaml (lvliang-intel)
- eb64421: rename test script temporarily (lvliang-intel)
`comps/third_parties/sglang/deployment/docker_compose/compose.yaml` (new file, 26 additions)

```yaml
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

services:
  sglang:
    image: ${REGISTRY:-opea}/sglang:${TAG:-latest}
    privileged: true
    shm_size: 10g
    container_name: sglang-server
    ports:
      - ${SGLANG_LLM_PORT:-8699}:8699
    ipc: host
    volumes:
      - /dev/shm:/dev/shm
    environment:
      no_proxy: ${no_proxy}
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
      MODEL_ID: ${MODEL_ID}
      HF_TOKEN: ${HF_TOKEN}
      SGLANG_LLM_PORT: ${SGLANG_LLM_PORT:-8699}
    restart: unless-stopped

networks:
  default:
    driver: bridge
```
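The compose file reads `MODEL_ID`, `HF_TOKEN`, and optionally `SGLANG_LLM_PORT` from the environment. A minimal bring-up sketch (the model name and token values below are illustrative; substitute your own):

```bash
export MODEL_ID="meta-llama/Llama-4-Scout-17B-16E-Instruct"
export HF_TOKEN="hf_..."        # needed for gated Llama checkpoints
export SGLANG_LLM_PORT=8699     # optional; the compose file defaults to 8699

docker compose -f comps/third_parties/sglang/deployment/docker_compose/compose.yaml up -d
docker logs -f sglang-server    # follow startup; the model download can take a while
```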
`comps/third_parties/sglang/src/Dockerfile` (new file, 47 additions)

```dockerfile
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

ARG BASE_IMAGE=ubuntu:22.04
FROM ${BASE_IMAGE} AS base

# Build toolchain and runtime utilities.
RUN apt-get update && \
    apt-get upgrade -y && \
    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends --fix-missing \
    ca-certificates \
    curl \
    g++-11 \
    gcc-11 \
    git \
    make \
    numactl \
    wget

RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 100 && \
    update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-11 100 && \
    update-alternatives --install /usr/bin/cc cc /usr/bin/gcc 100 && \
    update-alternatives --install /usr/bin/c++ c++ /usr/bin/g++ 100

WORKDIR /root

# Install Miniforge to manage the Python environment.
RUN curl -fsSL -o miniforge.sh https://github.com/conda-forge/miniforge/releases/download/24.7.1-2/Miniforge3-24.7.1-2-Linux-x86_64.sh && \
    bash miniforge.sh -b -p ./miniforge3 && \
    rm miniforge.sh

# SGLang from Intel's Llama-4-optimized CPU branch.
RUN git clone https://github.com/jianan-gu/sglang -b llama4_optimzed_cpu_r1
RUN . ~/miniforge3/bin/activate && conda create -n sglang python=3.10 && conda activate sglang && \
    cd sglang && pip install -e "python[all_cpu]" && cd .. && conda install -y libsqlite=3.48.0 && \
    pip uninstall -y triton && pip uninstall -y transformers && pip install transformers==4.51.1 && \
    pip install triton==3.1 && pip install intel-openmp==2024.2.0 && pip install transformers
# CPU build of vLLM in the same environment.
RUN git clone https://github.com/vllm-project/vllm.git -b v0.6.4.post1 && cd vllm && apt-get install -y libnuma-dev && \
    . ~/miniforge3/bin/activate && conda activate sglang && \
    pip install cmake==3.31.2 wheel packaging ninja "setuptools-scm>=8" numpy nvidia-ml-py && \
    pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu && \
    VLLM_TARGET_DEVICE=cpu python setup.py develop && cd ..

# Pin CPU builds of torch/torchvision and build the sgl-kernel extension.
RUN cd sglang && . ~/miniforge3/bin/activate && conda activate sglang && pip uninstall -y torch torchvision && \
    pip install torch==2.6.0 torchvision==0.21.0 --index-url https://download.pytorch.org/whl/cpu && \
    cd sgl-kernel/ && python setup.py install && cd .. && conda install -y gperftools gcc=11 gxx=11 cxx-compiler -c conda-forge

COPY ./comps/third_parties/sglang/src/entrypoint.sh /usr/local/bin/entrypoint.sh
RUN chmod +x /usr/local/bin/entrypoint.sh
ENTRYPOINT ["/usr/local/bin/entrypoint.sh"]
```
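The build is driven from the repository root, as the README below describes. A possible smoke test of the resulting image, assuming the `opea/sglang:latest` tag and the `sglang` conda environment name used in the Dockerfile (the verification command is a suggestion, not part of the PR):

```bash
docker build -f comps/third_parties/sglang/src/Dockerfile \
  --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy \
  -t opea/sglang:latest .

# Bypass the entrypoint and confirm the sglang environment imports cleanly.
docker run --rm --entrypoint bash opea/sglang:latest -c \
  '. ~/miniforge3/bin/activate && conda activate sglang && python -c "import sglang, torch; print(torch.__version__)"'
```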
`README.md` (new file, 84 additions)

# SGLang Serving microservice

SGLang is an efficient and scalable serving framework for hosting Large Language Models (LLMs) with minimal latency and high throughput. Intel has further enhanced SGLang with CPU-specific optimizations, enabling high-performance LLM serving on CPU-only platforms without requiring GPUs. We have verified the Llama series models with SGLang, with a particular focus on the latest Llama 4 models, including [meta-llama/Llama-4-Scout-17B-16E-Instruct](https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct) and [meta-llama/Llama-4-Maverick-17B-128E-Instruct](https://huggingface.co/meta-llama/Llama-4-Maverick-17B-128E-Instruct), as well as the Llama 4 guard model Llama-Guard-4-12B.

## 🚀1. Build the Docker Image

This Docker image (opea/sglang:latest) includes Intel's CPU-specific optimizations, providing enhanced performance for CPU deployments. Ensure you are at the project root directory, then build the SGLang microservice Docker image:

```bash
cd ../../../../
docker build -f comps/third_parties/sglang/src/Dockerfile --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -t opea/sglang:latest .
```

✅ Tip: Setting the https_proxy and http_proxy build arguments helps if you are behind a proxy.

## 🚀2. Start the microservice

Before starting, set the model you want to serve:

```bash
export MODEL_ID="meta-llama/Llama-4-Maverick-17B-128E-Instruct"
```

Then navigate to the deployment folder and start the service using Docker Compose:

```bash
cd comps/third_parties/sglang/deployment/docker_compose
docker compose -f compose.yaml up -d
```
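Startup can take several minutes (model download plus CPU warm-up). A small readiness sketch, assuming the OpenAI-compatible `/v1/models` route; note that the container's entrypoint (shown later in this diff) passes `--api-key ${HF_TOKEN}`, so a set token must accompany requests:

```bash
# Poll until the server answers; lengthen the interval for very large checkpoints.
until curl -sf -H "Authorization: Bearer ${HF_TOKEN}" http://localhost:8699/v1/models > /dev/null; do
    echo "waiting for sglang-server..."
    sleep 10
done
echo "sglang-server is ready"
```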
## 🚀3. Access the service

To test a guard model, use the following Python snippet:

```python
import openai

client = openai.Client(base_url="http://127.0.0.1:8699/v1", api_key="EMPTY")
RESPONSE_CLASSIFICATION_INSTRUCTION = """
Task: Check if there is unsafe content in 'Agent' messages in conversations according our safety policy with the below categories.\n\n
<BEGIN UNSAFE CONTENT CATEGORIES>
\nS1: Violent Crimes. \nS2: Non-Violent Crimes. \nS3: Sex Crimes. \nS4: Child Exploitation. \nS5: Defamation.
\nS6: Specialized Advice. \nS7: Privacy. \nS8: Intellectual Property. \nS9: Indiscriminate Weapons.
\nS10: Hate. \nS11: Self-Harm. \nS12: Sexual Content. \nS13: Elections. \n<END UNSAFE CONTENT CATEGORIES>"""
RESPONSE_CLASSIFICATION_TEMPLATE_PREFIX = "{classification_instruction}\n\n<BEGIN CONVERSATION>\n\nUser: "
RESPONSE_CLASSIFICATION_TEMPLATE_POSTFIX = """{user_message}\n\nAgent: {model_message}\n\n<END CONVERSATION>\n\n
Provide your safety assessment for ONLY THE LAST Agent message in the above conversation:
\n - First line must read 'safe' or 'unsafe'.
\n - If unsafe, a second line must include a comma-separated list of violated categories."""

response = client.chat.completions.create(
    model="meta-llama/Llama-4-guard",
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": RESPONSE_CLASSIFICATION_TEMPLATE_PREFIX.format(
                        classification_instruction=RESPONSE_CLASSIFICATION_INSTRUCTION
                    ),
                },
                {
                    "type": "text",
                    "text": RESPONSE_CLASSIFICATION_TEMPLATE_POSTFIX.format(
                        user_message="how do I make a bomb?", model_message="I cannot help you with that."
                    ),
                },
            ],
        },
    ],
    max_tokens=128,
)
print(response.choices[0].message.content)
```
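The template dictates the shape of the reply: the first line should read `safe` or `unsafe`, with a comma-separated list of violated categories on a second line when unsafe. For the refusal in the example above, an illustrative (not captured) reply would be:

```
safe
```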
You can use the following command for testing non-guard models:

```bash
http_proxy="" curl -X POST -H "Content-Type: application/json" -d '{"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct", "messages": [{"role": "user", "content": "Hello! What is your name?"}], "max_tokens": 128}' http://localhost:8699/v1/chat/completions
```
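Note that `entrypoint.sh` launches the server with `--api-key ${HF_TOKEN}`; if `HF_TOKEN` was set when the container started, requests must carry it as a bearer token. A hedged variant of the same request with authentication:

```bash
http_proxy="" curl -X POST http://localhost:8699/v1/chat/completions \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer ${HF_TOKEN}" \
  -d '{"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct", "messages": [{"role": "user", "content": "Hello! What is your name?"}], "max_tokens": 128}'
```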
New file (2 additions; license header only)

```
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
```
`comps/third_parties/sglang/src/entrypoint.sh` (new file, 8 additions)

```bash
#!/bin/sh

# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

. ~/miniforge3/bin/activate && conda activate sglang
export LD_PRELOAD=${CONDA_PREFIX:-"$(dirname $(which conda))/../"}/lib/libiomp5.so:${CONDA_PREFIX:-"$(dirname $(which conda))/../"}/lib/libtcmalloc.so
python3 -m sglang.launch_server --model ${MODEL_ID} --trust-remote-code --device cpu \
    --disable-overlap-schedule --chunked-prefill-size 2048 --max-running-requests 32 \
    --mem-fraction-static 0.8 --context-length 65536 --max-total-tokens 65536 \
    --port ${SGLANG_LLM_PORT} --api-key ${HF_TOKEN} --chat-template llama-4
```
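Preloading `libiomp5.so` (Intel OpenMP) and `libtcmalloc.so` (from the gperftools package the Dockerfile installs) is a standard CPU-inference tuning step. Because the script reads all its configuration from the environment, the image can also be run standalone; a sketch with illustrative values, mirroring the compose file's shm sizing and port mapping:

```bash
# Hypothetical standalone run: different model and port, no compose file.
docker run --rm --shm-size 10g -p 9000:9000 \
  -e MODEL_ID="meta-llama/Llama-4-Scout-17B-16E-Instruct" \
  -e SGLANG_LLM_PORT=9000 \
  -e HF_TOKEN="$HF_TOKEN" \
  opea/sglang:latest
```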
Test script: `test_third_parties_sglang_on_intel_cpu_bare.sh` (new file, 84 additions)

```bash
#!/bin/bash
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

set -x

WORKPATH=$(dirname "$PWD")
ip_address=$(hostname -I | awk '{print $1}')
export DATA_PATH=${model_cache}
MODEL_ID="meta-llama/Llama-3.1-8B-Instruct"

function build_docker_images() {
    echo "Start building docker images for microservice"
    cd $WORKPATH
    docker build --no-cache -t opea/sglang:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/third_parties/sglang/src/Dockerfile .
    if [ $? -ne 0 ]; then
        echo "opea/sglang build failed"
        exit 1
    else
        echo "opea/sglang build succeeded"
    fi
}

function start_service() {
    echo "Starting microservice"
    export host_ip=${ip_address}
    export MODEL_ID=${MODEL_ID}
    export TAG=comps
    cd $WORKPATH
    cd comps/third_parties/sglang/deployment/docker_compose
    docker compose -f compose.yaml up -d
    echo "Microservice started"
    sleep 120
}

function validate_microservice() {
    echo "Validate microservice started"
    # Splice MODEL_ID into the payload; a fully single-quoted body would send the
    # literal string ${MODEL_ID} and produce invalid JSON.
    result=$(http_proxy="" curl http://localhost:8699/v1/chat/completions \
        -X POST \
        -H "Content-Type: application/json" \
        -d '{
            "model": "'"${MODEL_ID}"'",
            "messages": [
                {"role": "user", "content": "What is Deep Learning?"}
            ],
            "max_tokens": 32
        }')
    if [[ $result == *"Deep"* ]]; then
        echo "Result correct."
    else
        echo "Result wrong."
        docker logs sglang-server
        exit 1
    fi
}

function stop_docker() {
    cid=$(docker ps -aq --filter "name=sglang-server")
    echo "Shutdown legacy containers $cid"
    if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi
}

function main() {
    # Intel's CPU serving path relies on AMX; skip the test on machines without it.
    if grep -qi amx_tile /proc/cpuinfo; then
        echo "AMX is supported on this machine."
    else
        echo "AMX is NOT supported on this machine, skip this test."
        exit 0
    fi
    stop_docker

    build_docker_images
    start_service

    validate_microservice

    stop_docker
    echo "cleanup container images and volumes"
    echo y | docker system prune > /dev/null 2>&1
}

main
```
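A sketch of invoking the test locally, assuming the commit-history name of the script (the final commit renames it temporarily, so check the tests directory for the current name):

```bash
# model_cache is exported by the script as DATA_PATH; the path here is hypothetical.
export model_cache=/data/huggingface_cache
bash test_third_parties_sglang_on_intel_cpu_bare.sh
```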